marc2solr 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.md +29 -0
- data/Rakefile +56 -0
- data/bin/marc2solr +247 -0
- data/bin/solrmarc_to_marc2solr +260 -0
- data/lib/marc2solr/marc2solr_custom.rb +194 -0
- data/lib/marc2solr.rb +452 -0
- data/spec/marc2solr_spec.rb +7 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- metadata +190 -0
@@ -0,0 +1,194 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'logback-simple'
|
3
|
+
|
4
|
+
module MARC2Solr
|
5
|
+
module Custom
|
6
|
+
LOG = Logback::Simple::Logger.singleton
|
7
|
+
|
8
|
+
# Custom routines are defined as module methods that take at least two arguments: the
# document built so far and a MARC4J4R record, followed by any (optional) extra arguments passed in.
|
10
|
+
#
|
11
|
+
# They don't need to live in the MARC2Solr::Custom namespace, but it's not a bad idea to use, e.g.,
|
12
|
+
# MARC2Solr::Custom::UMich, or maybe MARC2Solr::Custom::DateStuff
|
13
|
+
#
|
14
|
+
# You can return multiple values in an array
|
15
|
+
|
16
|
+
# The simplest possible custom routine: hand the work straight to a method
# on the underlying MARC4J4R record. The doc argument goes unused here, but
# the signature still has to accept it.
#
# @param [#[]] doc A hashlike (responds to #[]) that holds the computed values for fields "so far"
# @param [MARC4J4R::Record] r A MARC4J4R record
# @return [String] The XML representation of the record
def self.asXML(doc, r) # a module function, so it is defined as "def self.methodName"
  r.to_xml
end
|
28
|
+
|
29
|
+
# The MARC-binary counterpart: returns whatever the record's #to_marc gives back.
#
# @param [#[]] doc The document built so far (unused)
# @param [MARC4J4R::Record] r A MARC4J4R record
# @return The result of r.to_marc (presumably the binary MARC serialization)
def self.asMARC(doc, r)
  r.to_marc
end
|
33
|
+
|
34
|
+
|
35
|
+
# The marc-in-json counterpart: returns whatever the record's #to_marc_in_json gives back.
#
# @param [#[]] doc The document built so far (unused)
# @param [MARC4J4R::Record] r A MARC4J4R record
# @return The result of r.to_marc_in_json
def self.as_marc_in_json(doc, r)
  r.to_marc_in_json
end
|
40
|
+
|
41
|
+
# Collect the text of every field whose tag lies between two tags (inclusive).
#
# @param [hashlike] doc The document object being added to (unused here)
# @param [MARC4J4R::Record] r A MARC4J4R record
# @param [String] lower The lowest tag to include
# @param [String] upper The highest tag to include
# @return [String] One string holding the values of all included fields, joined by single spaces
def self.getAllSearchableFields(doc, r, lower, upper)
  text = []
  r.each do |fld|
    tag = fld.tag
    text.push(fld.value) if tag <= upper && tag >= lower
  end
  text.join(' ')
end
|
56
|
+
|
57
|
+
# A generic routine for sorting out, say, the 035s: look at the values of the
# given subfields in every field with the given tag, and keep only those that
# match a pattern.
#
# See the use of this in the simple_sample/simple_index.rb file for field 'oclc'
#
# @param [hashlike] doc The document object being added to (unused here)
# @param [MARC4J4R::Record] r A MARC4J4R record
# @param [String] tag A tag string (e.g., '035')
# @param [String, Array<String>] codes A subfield code ('a') or array of them (['a', 'c'])
# @param [Regexp] pattern A pattern that must match for the value to be included
# @param [Fixnum] matchindex The number of the paren-captured substring to return.
#   The default is zero, which means "the whole match"
# @return [Array<String>] a (possibly empty) array of found values, duplicates removed
def self.valsByPattern(doc, r, tag, codes, pattern, matchindex=0)
  found = []
  r.find_by_tag(tag).each do |fld|
    fld.sub_values(codes).each do |val|
      md = pattern.match(val)
      found << md[matchindex] if md
    end
  end
  found.uniq
end
|
83
|
+
|
84
|
+
|
85
|
+
# An example of a DateOfPublication implementation.
#
# Characters 7..10 of the 008 are tried first; if they do not yield four
# digits, the first run of four digits in the 260$c is used instead.
#
# @param [hashlike] doc The document object being added to (unused here)
# @param [MARC4J4R::Record] r A MARC4J4R record
# @return [String] the found date, or nil if not found.
def self.getDate(doc, r)
  begin
    fixed    = r['008'].value
    year     = fixed[7..10].downcase
    datetype = fixed[6..6]
    year = if %w[n u b].include?(datetype)
             # these date-type codes are treated as "no usable date"
             ''
           else
             # unknown digits ('u') become zeros, fill characters ('|') become blanks
             cleaned = year.gsub('u', '0').gsub('|', ' ')
             cleaned == '0000' ? '' : cleaned
           end

    md = /^\d\d\d\d$/.match(year)
    return md[0] if md
  rescue
    # any problem reading the 008 just means we go on to the 260c
  end

  # No good? Fall back on the 260c
  begin
    md = /\d\d\d\d/.match(r['260']['c'])
    return md[0] if md
  rescue
    LOG.debug "Record #{r['001']} has no valid date"
    return nil
  end
end
|
121
|
+
|
122
|
+
# Pull the non-indexing characters off the front of each field with the given
# tag, using the second indicator as the number of characters to drop.
# Fields whose second indicator is not a positive number are left out of the
# result entirely.
#
# @param [hashlike] doc The document object being added to (unused here)
# @param [MARC4J4R::Record] r A MARC4J4R record
# @param [String] tag The tag of the fields to look at
# @return [Array<String>] the shortened field values
def self.fieldWithoutIndexingChars(doc, r, tag)
  stripped = []
  r.find_by_tag(tag).each do |field|
    skip = field.ind2.to_i
    next unless skip > 0
    stripped << field.value[skip..-1]
  end
  stripped
end
|
134
|
+
|
135
|
+
|
136
|
+
# A helper function -- take in a year, and return a date category.
#
# Years that sort before "1500" give "Pre-1500"; 1500 through 1800 give the
# enclosing century (e.g. "1600-1699"); 1801 through 2100 give the enclosing
# decade (e.g. "1950-1959"). Anything else gives nil.
#
# @param [String] date A year as a string (the slicing assumes four digits)
# @param [MARC4J4R::Record] r The record; not used, and therefore optional
# @return [String, nil] the date category, or nil if the year is out of range
def self.getDateRange(date, r = nil)
  return "Pre-1500" if date < "1500"

  case date.to_i
  when 1500..1800
    century = date[0..1]
    # hyphenated so that it has the same "start-end" shape as the decade buckets
    "#{century}00-#{century}99"
  when 1801..2100
    decade = date[0..2]
    "#{decade}0-#{decade}9"
  end
end
|
153
|
+
|
154
|
+
|
155
|
+
# Get the date range, based on the previously-computed pubdate.
#
# @param [#[]] doc The document built so far; doc[wherePubdateIsStored] is
#   expected to hold the already-computed pubdate as its first element
# @param [MARC4J4R::Record] r A MARC4J4R record (passed through to getDateRange)
# @param wherePubdateIsStored The key in doc under which the pubdate was stored
# @return [Array] a one-element array holding the date range, or an empty
#   array when no pubdate has been stored
def self.pubDateRange(doc, r, wherePubdateIsStored)
  stored = doc[wherePubdateIsStored]
  previouslyComputedPubdate = stored && stored[0]
  return [] unless previouslyComputedPubdate

  # getDateRange is declared with (date, r), so the record has to be passed along
  [self.getDateRange(previouslyComputedPubdate, r)]
end
|
160
|
+
|
161
|
+
|
162
|
+
# We can do the same thing as a multi-return function -- compute the pubdate and
# the pubdaterange in one fell swoop.
#
# In this case, we *could* just use self.pubDateRange. However, there are times
# when several fields are based on intermediate values that you don't want to
# actually store in the solr document itself (e.g., a set of call numbers that
# you want to normalize or translate in a few different ways, without actually
# wanting to store the raw call numbers in their own field). You may also need
# access to more metadata as you're constructing the data (e.g., you may want to
# store titles and titles-without-non-filing-characters in different fields, but
# you can't compute one from the other without access to the associated
# indicator-2 value).
#
# So, in this case, we'll get the pubDate and the pubDateRange all at once, just
# as an example, and put it in the custom spec as:
#
#   {
#     :solrField => ['pubDate', 'pubDateRange'],
#     :module => MARC2Solr::Custom,
#     :functionSymbol => :pubDateAndRange
#   }
#
# @param [hashlike] doc The document object being added to
# @param [MARC4J4R::Record] r A MARC4J4R record
# @return [Array] [date, range], or [nil, nil] when no date could be found
def self.pubDateAndRange(doc, r)
  pubdate = getDate(doc, r)
  pubdate ? [pubdate, getDateRange(pubdate, r)] : [nil, nil]
end
|
190
|
+
|
191
|
+
|
192
|
+
end # close the inner module Custom
|
193
|
+
end # close the module MARC2Solr
|
194
|
+
|
data/lib/marc2solr.rb
ADDED
@@ -0,0 +1,452 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
|
3
|
+
require 'logback-simple'
|
4
|
+
require 'trollop'
|
5
|
+
require 'ftools'
|
6
|
+
require 'jruby_streaming_update_solr_server'
|
7
|
+
require 'marc4j4r'
|
8
|
+
|
9
|
+
module MARC2Solr
|
10
|
+
|
11
|
+
class Conf
|
12
|
+
include Logback::Simple
|
13
|
+
|
14
|
+
# The subcommands accepted as the first command-line argument.
SUB_COMMANDS = %w(index delete commit help)

# Every configuration option, in the order it is shown in help output, as
# [name, settings] pairs. Besides the description and the settings handed on
# to the option parser (:type, :short, :default, :multi), these keys are
# interpreted by this class:
#   :only      -- the subcommands the option applies to (absent => all of them)
#   :valid     -- the list of acceptable values
#   :takesNone -- a value of 'none' (any case) clears the setting
#   :isOutfile -- the value names an output stream/file and is opened for writing
# Options named NOxxx switch off the corresponding xxx option.
OPTIONSCONFIG = [
  [:config, {:desc => "Configuation file specifying options. Repeatable. Command-line arguments always override the config file(s)",
             :type => :io,
             :multi => true}],
  [:benchmark, {:desc => "Benchmark production of each solr field",
                :only => [:index],
                :short => '-B'}],
  # This used to carry a copy of the :benchmark description
  [:NObenchmark, {:desc => "Disable a previous 'benchmark' directive",
                  :only => [:index]}],
  [:dryrun, {:desc => "Don't send anything to solr"}],
  [:NOdryrun, {:desc => "Disable a previous 'dryrun' directive"}],

  [:printmarc, {:desc => "Print MARC Record (as text) to --debugfile",
                :only => [:index],
                :short => '-r'}],
  [:NOprintmarc, {:desc => "Turn off printing MARC Record (as text) to --debugfile",
                  :only => [:index]}],
  [:printdoc, {:desc => "Print each completed document to --debugfile",
               :only => [:index],
               :short => '-d'}],
  [:NOprintdoc, {:desc => "Turn off printing each completed document to --debugfile",
                 :only => [:index]}],
  [:debugfile, {:desc => "Where to send output from --printmarc and --printdoc (takes filename, 'STDERR', 'STDOUT', or 'NONE') (repeatable)",
                :default => "STDOUT",
                :isOutfile => true,
                :takesNone => true,
                :type => String,
                :only => [:delete, :index]}],
  [:clearsolr, {:desc => "Clean out Solr by deleting everything in it (DANGEROUS)",
                :only => [:index]}],
  [:NOclearsolr, {:desc => "Disable a previous --clearsolr command",
                  :only => [:index]}],
  [:skipcommit, {:desc => "DON'T send solr a 'commit' afterwards",
                 :short => '-C',
                 :only => [:delete, :index]}],
  [:threads, {:desc => "Number of threads to use to process MARC records (>1 => use 'threach')",
              :type => :int,
              :default => 1,
              :only => [:index]}],
  [:sussthreads, {:desc => "Number of threads to send completed docs to Solr",
                  :type => :int,
                  :default => 1}],
  [:susssize, {:desc => "Size of the documente queue for sending to Solr",
               :short => '-S',
               :default => 128}],
  [:machine, {:desc => "Name of solr machine (e.g., solr.myplace.org)",
              :short => '-m',
              # :required => [:index, :commit, :delete],
              :type => String}],
  [:port, {:desc => "Port of solr machine (e.g., '8088')",
           :short => '-p',
           :type => :int}],
  [:solrpath, {:desc => "URL path to solr",
               :short => '-P'}],
  [:javabin, {:desc => "Use javabin (presumes /update/bin is configured in schema.xml)"}],
  [:NOjavabin, {:desc => "Don't use javabin"}],
  [:logfile, {:desc => "Name of the logfile (filename, 'STDERR', 'DEFAULT', or 'NONE'). 'DEFAULT' is a file based on input file name",
              :default => "DEFAULT",
              :takesNone => true,
              :type => String}],
  [:loglevel, {:desc => "Level at which to log (DEBUG, INFO, WARN, ERROR, OFF)",
               :short => '-L',
               :takesNone => true,
               :valid => %w{OFF DEBUG INFO WARN ERROR },
               :default => 'INFO'}],
  [:logbatchsize, {:desc => "Write progress information to logfile after every N records",
                   :default => 25000,
                   :only => [:delete, :index],
                   :short => '-b'}],
  [:indexfile, {:desc => "The index file describing your specset (usually index.dsl)",
                :type => String,
                :only => [:index]}],
  [:tmapdir, {:desc => "Directory that contains any translation maps",
              :type => String,
              :only => [:index]}],
  [:customdir, {:desc => "The directory containging custom routine libraries (usually the 'lib' next to index.rb). Repeatable",
                :only => [:index],
                :multi => true,
                :takesNone => true,
                :type => String}],
  [:marctype, {:desc => "Type of marc file ('bestguess', 'strictmarc'. 'marcxml', 'alephsequential', 'permissivemarc')",
               :only => [:index],
               :short => '-t',
               :valid => %w{bestguess strictmarc permissivemarc marcxml alephsequential },
               :default => 'bestguess'}],
  [:encoding, {:desc => "Encoding of the MARC file ('bestguess', 'utf8', 'marc8', 'iso')",
               :valid => %w{bestguess utf8 marc8 iso},
               :only => [:index],
               :default => 'bestguess'}],
  [:gzipped, {:desc => "Is the input gzipped? An extenstion of .gz will always force this to true",
              :default => false,
              :only => [:index, :delete]}]
]

# The same settings keyed by option name. The settings hashes are shared with
# OPTIONSCONFIG, not copied.
VALIDOPTIONS = {}
OPTIONSCONFIG.each { |name, settings| VALIDOPTIONS[name] = settings }

# The banner text shown for each subcommand's help.
HELPTEXT = {
  'help' => "Get help on a command\nmarc2solr help <cmd> where <cmd> is index, delete, or commit",
  'index' => "Index the given MARC file\nmarc2solr index --config <file> --override <marcfile> <marcfile2...>",
  'delete' => "Delete based on ID\nmarc2solr delete --config <file> --override <file_of_ids_to_delete> <another_file...>",
  'commit' => "Send a commit to the specified Solr\nmarc2solr commit --config <file> --override",
}
|
143
|
+
|
144
|
+
attr_accessor :config, :cmdline, :rest, :command
|
145
|
+
# Build the configuration: parse the command line, evaluate any config files
# in the order given, then lay the command-line values on top.
#
# Values explicitly given on the command line always win; command-line
# defaults are applied only for settings no config file has set.
def initialize
  @config  = {}
  @cmdline = command_line_opts

  # Each config file is evaluated in the context of this object
  (@cmdline[:config] || []).each do |cfgfile|
    log.info "Reading config-file '#{cfgfile.path}'"
    instance_eval(cfgfile.read)
  end

  # The config files themselves are not settings
  [:config, :config_given].each { |key| @cmdline.delete(key) }

  # Remove any "help" stuff
  @cmdline.delete_if { |key, _| key.to_s =~ /^help/ }

  # Keep track of what was actually passed on the command line: the
  # "<name>_given" markers are moved out of @cmdline into @cmdline_given
  @cmdline_given = {}
  @cmdline.keys.each do |key|
    next unless key.to_s =~ /^(.+?)_given$/
    @cmdline_given[$1.to_sym] = true
    @cmdline.delete(key)
  end

  # Explicit values override; defaults only fill in what is still unset
  @cmdline.each_pair do |key, value|
    send(key, value) if @cmdline_given[key] || !@config.has_key?(key)
  end

  @rest = ARGV
end
|
189
|
+
|
190
|
+
# Look up a configuration value.
#
# @param arg The key of the setting
# @return The stored value, or nil if nothing is stored under that key
def [](arg)
  @config[arg]
end
|
193
|
+
|
194
|
+
# Take the subcommand off the front of ARGV, deal with the help situations,
# and parse the remaining arguments with Trollop.
#
# Sets @command. Basic help is printed when the subcommand is missing or
# unknown, or when nothing follows it; "help <cmd>" prints the help for <cmd>.
#
# @return The value of Trollop::options for the options that apply to the subcommand
def command_line_opts
  @command = ARGV.shift # get the subcommand

  # First, deal with the help situations
  unless SUB_COMMANDS.include? @command
    puts "Unknown command '#{@command}'" if @command
    print_basic_help
  end

  print_basic_help if ARGV.size == 0

  if @command == 'help'
    # The command we want help on is the next argument
    @command = ARGV.shift
    if SUB_COMMANDS.include? @command
      print_command_help @command
    else
      print_basic_help
    end
  end

  # OK. Now let's actually get and return the args
  #
  # Trollop is a DSL and doesn't see our local instance variable, so I
  # need to alias @command to cmd

  cmd = @command
  return Trollop::options do
    OPTIONSCONFIG.each do |name, settings|
      next if settings[:only] and not settings[:only].include? cmd.to_sym
      # Work on a copy so the shared OPTIONSCONFIG/VALIDOPTIONS hashes keep
      # their :desc entry
      settings = settings.dup
      desc = settings.delete(:desc)
      opt name, desc, settings
    end
  end
end
|
232
|
+
|
233
|
+
|
234
|
+
# Print the general usage message to standard output and exit the process.
def print_basic_help
  usage = <<END_OF_HELP

marc2solr: get MARC data into Solr

USAGE
marc2solr index (index MARC records into Solr)
marc2solr delete (delete by ID from Solr)
marc2solr commit (send a 'commit' to a solr install)

Use "marc2solr <cmd> --help" for more help

END_OF_HELP
  puts usage
  Process.exit
end
|
248
|
+
|
249
|
+
# Print the help for one subcommand and exit the process.
#
# '--help' is pushed onto the front of ARGV before the options are declared
# to Trollop (presumably so that Trollop prints its option listing -- confirm
# against the Trollop version in use).
#
# @param [String] cmd The subcommand to describe; must be a key of HELPTEXT
def print_command_help(cmd)
  ARGV.unshift '--help'
  Trollop::options do
    puts "\n\n" + HELPTEXT[cmd] + "\n\n"
    puts "You may specify multiple configuration files and they will be loaded in"
    puts "the order given."
    puts ""
    puts "Command line arguments always override configuration file settings\n\n"

    OPTIONSCONFIG.each do |name, settings|
      only = settings[:only]
      next if only && !only.include?(cmd.to_sym)
      # note: this removes :desc from the shared settings hash
      opt name, settings.delete(:desc), settings
    end
  end
  print "\n\n"
  Process.exit
end
|
270
|
+
|
271
|
+
|
272
|
+
# Pretty-printing hook: hands the configuration hash to the given printer.
#
# @param [#pp] pp The printer object; its #pp is called with @config
def pretty_print(pp)
  pp.pp(@config)
end
|
275
|
+
|
276
|
+
def method_missing(methodSymbol, arg=:notgiven, fromCmdline = false)
|
277
|
+
return @config[methodSymbol] if arg == :notgiven
|
278
|
+
methodSymbol = methodSymbol.to_s.gsub(/=$/, '').to_sym
|
279
|
+
|
280
|
+
# Deal with negatives. We only want them if the argument is true
|
281
|
+
if methodSymbol.to_s =~ /^NO(.*)/
|
282
|
+
if arg == true
|
283
|
+
methodSymbol = $1.to_sym
|
284
|
+
arg = false
|
285
|
+
else
|
286
|
+
# puts "Ignoring false-valued #{methodSymbol}"
|
287
|
+
return # do nothing
|
288
|
+
end
|
289
|
+
end
|
290
|
+
|
291
|
+
# puts " Setting #{methodSymbol} to #{arg}"
|
292
|
+
if VALIDOPTIONS.has_key? methodSymbol
|
293
|
+
conf = VALIDOPTIONS[methodSymbol]
|
294
|
+
# Zero it out?
|
295
|
+
if conf[:takesNone] and arg.to_a.map{|a| a.downcase}.include? 'none'
|
296
|
+
@config[methodSymbol] = nil
|
297
|
+
return nil
|
298
|
+
end
|
299
|
+
|
300
|
+
|
301
|
+
# Check for a valid value
|
302
|
+
if conf[:valid]
|
303
|
+
unless conf[:valid].include? arg
|
304
|
+
raise ArgumentError "'#{arg}' is not a valid value for #{methodSymbol}"
|
305
|
+
end
|
306
|
+
end
|
307
|
+
|
308
|
+
# Make it a file?
|
309
|
+
|
310
|
+
if conf[:isOutfile]
|
311
|
+
# If it's an IO object, just take it
|
312
|
+
break if arg.is_a? IO or arg.is_a? StringIO
|
313
|
+
|
314
|
+
# Otherwise...
|
315
|
+
case arg.downcase
|
316
|
+
when "stdin"
|
317
|
+
arg = STDIN
|
318
|
+
when "stdout"
|
319
|
+
arg = STDOUT
|
320
|
+
when "stderr"
|
321
|
+
arg = STDERR
|
322
|
+
else
|
323
|
+
arg = File.new(arg, 'w')
|
324
|
+
Trollop.die "Can't open '#{arg}' for writing in argument #{methodSymbol}" unless arg
|
325
|
+
end
|
326
|
+
end
|
327
|
+
|
328
|
+
|
329
|
+
if conf[:multi]
|
330
|
+
@config[methodSymbol] ||= []
|
331
|
+
@config[methodSymbol] << arg
|
332
|
+
@config[methodSymbol].flatten!
|
333
|
+
else
|
334
|
+
@config[methodSymbol] = arg
|
335
|
+
end
|
336
|
+
# puts "Set #{methodSymbol} to #{arg}"
|
337
|
+
return @config[methodSymbol]
|
338
|
+
else
|
339
|
+
raise NoMethodError, "'#{methodSymbol} is not a valid MARC2Solr configuration option for #{@cmd}"
|
340
|
+
end
|
341
|
+
end
|
342
|
+
|
343
|
+
|
344
|
+
# Build the solr URL from the :machine, :port, and :solrpath settings.
# A leading slash on the path is dropped so the result has exactly one slash
# between the port and the path.
#
# @return [String] e.g. "http://solr.myplace.org:8088/solr"
# @raise [ArgumentError] if any of the three settings is missing (the problem
#   is also logged at error level)
def sussURL
  required = [[:machine, 'machine name'], [:port, 'port'], [:solrpath, 'path']]
  machine, port, path = required.map do |key, label|
    value = self[key]
    unless value
      msg = "Need solr #{label} (--#{key})"
      log.error msg
      raise ArgumentError, msg
    end
    value
  end

  # Interpolate rather than concatenate: the port is declared as an integer
  # option, and String#+ refuses an Integer
  "http://#{machine}:#{port}/#{path.gsub(/^\//, '')}"
end
|
367
|
+
|
368
|
+
# Create a SUSS (StreamingUpdateSolrServer) from the configured settings:
# the URL from sussURL, plus the :susssize and :sussthreads values. When
# :javabin is set, a BinaryRequestWriter is installed as its request writer.
#
# @return [StreamingUpdateSolrServer] the newly created server object
def suss
  url = sussURL
  log.debug "Set suss url to #{url}"

  server = StreamingUpdateSolrServer.new(url, @config[:susssize], @config[:sussthreads])
  if self[:javabin]
    server.setRequestWriter Java::org.apache.solr.client.solrj.impl.BinaryRequestWriter.new
    log.debug "Using javabin"
  end
  server
end
|
379
|
+
|
380
|
+
# Get the logger named after the subcommand, set the log level, and start
# logging to the place named by the :logfile setting:
# 'STDERR' => the console logger, 'DEFAULT' => a file named after the first
# input file (or the subcommand) plus a timestamp, 'NONE'/nil => nothing,
# anything else => a file of that name.
#
# @return The logger obtained from Logback::Simple::Logger.singleton
def masterLogger
  mlog  = Logback::Simple::Logger.singleton(command)
  level = @config[:loglevel].downcase.to_sym
  mlog.loglevel = level

  firstfile = rest[0] || command
  # drop everything from the first dot of the base name onward
  stem        = File.basename(firstfile).gsub(/\..*$/, '')
  stamp       = Time.new.strftime('%Y%m%d-%H%M%S')
  logfilename = stem + '-' + stamp + '.log'

  Logback::Simple.loglevel = level
  destination = @config[:logfile]
  case destination
  when "STDERR"
    Logback::Simple.startConsoleLogger
  when "DEFAULT"
    Logback::Simple.startFileLogger(logfilename)
  when 'NONE', nil
    # do nothing
  else
    Logback::Simple.startFileLogger(destination)
  end
  mlog
end
|
401
|
+
|
402
|
+
|
403
|
+
# Create a MARC reader for the given file, using the :marctype, :encoding,
# and :gzipped settings.
#
# A file name ending in .gz is always read through a gzip stream, whatever
# the configured marctype. When the marctype is 'bestguess', the type is
# sniffed from the extension (ignoring a trailing .gz): anything containing
# 'xml' is marcxml, anything containing 'seq' or 'aleph' is alephsequential,
# and everything else is permissivemarc. An encoding of 'bestguess' is
# passed on as nil.
#
# @param [String] filename The file to read, or "STDIN" for standard input
# @return [MARC4J4R::Reader] a reader on the (possibly gunzipped) source
def reader filename
  configuredType = @config[:marctype].downcase.to_sym
  encoding = @config[:encoding].downcase.to_sym
  encoding = nil if encoding == :bestguess

  # The extensions of the file name proper, lowercased, without the base name
  extensions = File.basename(filename).downcase.split('.')
  extensions.shift

  # Look for .gz no matter how the type was specified
  gzipped = (extensions.last == 'gz')
  extensions.pop if gzipped

  if configuredType == :bestguess
    ext = extensions.last
    log.info "Sniffed marc file type as #{ext}" if ext
    type = case ext
           when /xml/           then :marcxml
           when /seq/, /aleph/  then :alephsequential
           else                      :permissivemarc
           end
  else
    type = configuredType
  end

  source = filename
  source = STDIN if source == "STDIN"

  if gzipped or @config[:gzipped]
    source = Java::java.util.zip.GZIPInputStream.new(IOConvert.byteinstream(source))
  end

  return MARC4J4R::Reader.new(source, type, encoding)
end
|
447
|
+
|
448
|
+
|
449
|
+
end
|
450
|
+
end
|
451
|
+
|
452
|
+
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|