marc2solr 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.md +29 -0
- data/Rakefile +56 -0
- data/bin/marc2solr +247 -0
- data/bin/solrmarc_to_marc2solr +260 -0
- data/lib/marc2solr/marc2solr_custom.rb +194 -0
- data/lib/marc2solr.rb +452 -0
- data/spec/marc2solr_spec.rb +7 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- metadata +190 -0
@@ -0,0 +1,194 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'logback-simple'
|
3
|
+
|
4
|
+
module MARC2Solr
|
5
|
+
module Custom
|
6
|
+
LOG = Logback::Simple::Logger.singleton
|
7
|
+
|
8
|
+
# Custom routines are defined as module methods that take two arguments: a MARC4J4R record,
|
9
|
+
# and an (optional) array of other arguments passed in.
|
10
|
+
#
|
11
|
+
# They don't need to live in the MARC2Solr::Custom namespace, but it's not a bad idea to use, e.g.,
|
12
|
+
# MARC2Solr::Custom::UMich, or maybe MARC2Solr::Custom::DateStuff
|
13
|
+
#
|
14
|
+
# You can return multiple values in an array
|
15
|
+
|
16
|
+
# The simplest possible example; just call a method on the underlying MARC4J4R record
|
17
|
+
# Note that even though we don't use the arguments, the method signature has to
|
18
|
+
# support it
|
19
|
+
#
|
20
|
+
# @param [hashlike] doc The document object being added to; allows you to leverage already-done work
|
21
|
+
# @param [MARC4J4R::Record] r A MARC4J4R record
|
22
|
+
# @param [#[]] doc A hashlike (responds to #[]) that holds the computed values for fields "so far"
|
23
|
+
# @return [String] The XML representation of the record
|
24
|
+
|
25
|
+
# Return the record serialized as MARC-XML.
#
# Module function (defined with "def self."). `doc` is not used here, but the
# custom-routine calling convention always passes it.
#
# @param [#[]] doc Values computed so far for this document (unused)
# @param [MARC4J4R::Record] r The record to serialize
# @return [String] whatever r.to_xml produces
def self.asXML(doc, r)
  r.to_xml
end
|
28
|
+
|
29
|
+
# Another for marc binary
|
30
|
+
# Return the record serialized as binary MARC.
#
# @param [#[]] doc Values computed so far for this document (unused)
# @param [MARC4J4R::Record] r The record to serialize
# @return whatever r.to_marc produces
def self.asMARC(doc, r)
  r.to_marc
end
|
33
|
+
|
34
|
+
|
35
|
+
# And another for marc-in-json
|
36
|
+
|
37
|
+
# Return the record serialized as marc-in-json.
#
# @param [#[]] doc Values computed so far for this document (unused)
# @param [MARC4J4R::Record] r The record to serialize
# @return whatever r.to_marc_in_json produces
def self.as_marc_in_json(doc, r)
  r.to_marc_in_json
end
|
40
|
+
|
41
|
+
# Here we get all the text from fields whose tags fall between (inclusive) the two given tag strings.
|
42
|
+
#
|
43
|
+
# @param [hashlike] doc The document object being added to; allows you to leverage already-done work
|
44
|
+
# @param [MARC4J4R::Record] r A MARC4J4R record
|
45
|
+
# @param [String] lower The lowest tag you want to include (inclusive)
|
46
|
+
# @param [String] upper The highest tag you want to include (inclusive)
|
47
|
+
# @return [String] A single string with all the text from included fields
|
48
|
+
# Concatenate the text of every field whose tag lies in lower..upper
# (both ends inclusive, compared with the tags' own <= / >=).
#
# @param [#[]] doc Values computed so far for this document (unused)
# @param [#each] r The record; yields objects responding to #tag and #value
# @param [String] lower Lowest tag to include
# @param [String] upper Highest tag to include
# @return [String] The selected field values joined by single spaces
def self.getAllSearchableFields(doc, r, lower, upper)
  texts = []
  r.each do |field|
    in_range = (field.tag <= upper) && (field.tag >= lower)
    texts << field.value if in_range
  end
  texts.join(' ')
end
|
56
|
+
|
57
|
+
# How about one to sort out, say, the 035s? We'll make a generic routine
|
58
|
+
# that looks for specified values in specified subfields of variable
|
59
|
+
# fields, and then make sure they match before returning them.
|
60
|
+
#
|
61
|
+
# See the use of this in the simple_sample/simple_index.rb file for field 'oclc'
|
62
|
+
#
|
63
|
+
# @param [hashlike] doc The document object being added to; allows you to leverage already-done work
|
64
|
+
# @param [MARC4J4R::Record] r A MARC4J4R record
|
65
|
+
# @param [String] tag A tag string (e.g., '035')
|
66
|
+
# @param [String, Array<String>] codes A subfield code ('a') or array of them (['a', 'c'])
|
67
|
+
# @param [Regexp] pattern A pattern that must match for the value to be included
|
68
|
+
# @param [Fixnum] matchindex The number of the substring captured by parens in the pattern to return
|
69
|
+
# The default is zero, which means "the whole string"
|
70
|
+
# @return [Array<String>] a (possibly empty) array of found values
|
71
|
+
# Collect the distinct subfield values of `tag` that match `pattern`.
#
# For every field returned by r.find_by_tag(tag), each value returned by
# sub_values(codes) is matched against the pattern; on a match the capture
# numbered `matchindex` (0 = the whole match) is kept.
#
# @return [Array<String>] unique matches in first-seen order (possibly empty)
def self.valsByPattern(doc, r, tag, codes, pattern, matchindex=0)
  found = []
  r.find_by_tag(tag).each do |field|
    field.sub_values(codes).each do |value|
      md = pattern.match(value)
      found << md[matchindex] if md
    end
  end
  found.uniq
end
|
83
|
+
|
84
|
+
|
85
|
+
# An example of a DateOfPublication implementation
|
86
|
+
# @param [hashlike] doc The document object being added to; allows you to leverage already-done work
|
87
|
+
# @param [MARC4J4R::Record] r A MARC4J4R record
|
88
|
+
# @return [String] the found date, or nil if not found.
|
89
|
+
|
90
|
+
# Find a four-digit publication year for the record.
#
# Tries, in order:
#   1. Date1 from the 008 (characters 7..10), unless the date-type byte
#      (character 6) is 'n', 'u' or 'b'. Any 'u' in the date is read as '0',
#      and an all-zero date is rejected.
#   2. The first run of four digits found in 260$c.
#
# A debug message is logged whenever no date can be found, whether the
# fields were missing entirely or merely unusable.
#
# @param [#[]] doc Values computed so far for this document (unused)
# @param [MARC4J4R::Record] r A MARC4J4R record
# @return [String, nil] the four-digit year, or nil if none was found
def self.getDate(doc, r)
  begin
    ohoh8    = r['008'].value
    datetype = ohoh8[6..6]
    unless ['n', 'u', 'b'].include? datetype
      date1 = ohoh8[7..10].downcase.gsub('u', '0')
      return date1 if date1 != '0000' && date1 =~ /\A\d\d\d\d\z/
    end
  rescue StandardError
    # Missing or truncated 008: best effort, go on to the 260c
  end

  # No good? Fall back on the 260c
  begin
    m = /\d\d\d\d/.match(r['260']['c'])
    return m[0] if m
  rescue StandardError
    # No 260 at all; handled by the common "not found" path below
  end

  LOG.debug "Record #{r['001']} has no valid date"
  nil
end
|
121
|
+
|
122
|
+
# A simple function to pull the non-indexing characters off the front of a field
|
123
|
+
# based on the second indicator
|
124
|
+
# Return the value of every `tag` field with its leading non-indexing
# (non-filing) characters removed, as counted by the second indicator.
#
# A field whose second indicator is 0 or non-numeric has nothing to strip and
# is returned unchanged (previously such fields were silently dropped, so the
# result was missing every title without a leading article).
#
# @param [#[]] doc Values computed so far for this document (unused)
# @param [MARC4J4R::Record] r A MARC4J4R record
# @param [String] tag The tag of the fields to process (e.g., '245')
# @return [Array<String>] one value per matching field
def self.fieldWithoutIndexingChars(doc, r, tag)
  vals = []
  r.find_by_tag(tag).each do |df|
    skip     = df.ind2.to_i
    stripped = df.value[skip..-1]
    # The slice is nil when the indicator exceeds the value's length
    vals << stripped unless stripped.nil?
  end
  vals
end
|
134
|
+
|
135
|
+
|
136
|
+
# A helper function -- take in a year, and return a date category
|
137
|
+
# A helper function -- take in a year, and return a date category.
#
#   before "1500"  => "Pre-1500"
#   1500..1800     => the century,  e.g. "1600-1699"
#   1801..2100     => the decade,   e.g. "1980-1989"
#   anything else  => nil
#
# The century label now contains the hyphen the decade label always had
# (it used to come out as "16001699").
#
# @param [String, nil] date A year as a string (normally four digits)
# @param r The record; unused, and optional so one-argument callers work
# @return [String, nil] the category label, or nil if date is nil or out of range
def self.getDateRange(date, r = nil)
  return nil if date.nil?
  return "Pre-1500" if date < "1500"

  case date.to_i
  when 1500..1800
    century = date[0..1]
    century + '00-' + century + '99'
  when 1801..2100
    decade = date[0..2]
    decade + "0-" + decade + "9"
  else
    nil
  end
end
|
153
|
+
|
154
|
+
|
155
|
+
# Get the date range, based on the previously-computed pubdate
|
156
|
+
# Get the date range, based on the previously-computed pubdate.
#
# Reads the first value stored in doc[wherePubdateIsStored] and maps it
# through getDateRange. The record is now passed along, because getDateRange
# is declared with two parameters and the old one-argument call raised
# ArgumentError.
#
# @param [#[]] doc Values computed so far for this document
# @param [MARC4J4R::Record] r A MARC4J4R record
# @param wherePubdateIsStored Key under which the pubdate was stored in doc
# @return [Array] one-element array with the range, or [] if no pubdate is stored
def self.pubDateRange(doc, r, wherePubdateIsStored)
  previouslyComputedPubdate = Array(doc[wherePubdateIsStored]).first
  return [] if previouslyComputedPubdate.nil?
  [self.getDateRange(previouslyComputedPubdate, r)]
end
|
160
|
+
|
161
|
+
|
162
|
+
# We can do the same thing as a multi-return function -- compute the pubdate and
|
163
|
+
# the pubdaterange in one fell swoop.
|
164
|
+
#
|
165
|
+
# In this case, we *could* just use the above self.pubDateRange. However, there
|
166
|
+
# are times when several fields are based on intermediate values that you
|
167
|
+
# don't want to actually store in the solr document itself (e.g., a set of call number
|
168
|
+
# that you want to normalize or translate in a few different ways, without actually wanting
|
169
|
+
# to store the raw callnumbers in their own field). You may also need access to more metadata
|
170
|
+
# as you're constructing the data (e.g., you may want to store titles and titles-without-non-filing-
|
171
|
+
# character in different fields, but you can't compute one from the other without access to the
|
172
|
+
# associated indicator-2 value).
|
173
|
+
#
|
174
|
+
# So, in this case, we'll get the pubDate and the pubDateRange all at once, just as an example,
|
175
|
+
# and put in the custom spec as:
|
176
|
+
#
|
177
|
+
# {
|
178
|
+
# :solrField => ['pubDate', 'pubDateRange'],
|
179
|
+
# :module => MARC2Solr::Custom,
|
180
|
+
# :functionSymbol => :pubDateAndRange
|
181
|
+
# }
|
182
|
+
|
183
|
+
|
184
|
+
# Multi-return routine: compute the pubdate and its range in one call.
#
# @param [#[]] doc Values computed so far for this document
# @param [MARC4J4R::Record] r A MARC4J4R record
# @return [Array] [date, range], or [nil, nil] when getDate finds no date
def self.pubDateAndRange(doc, r)
  date = self.getDate(doc, r)
  if date
    [date, self.getDateRange(date, r)]
  else
    [nil, nil]
  end
end
|
190
|
+
|
191
|
+
|
192
|
+
end # close the inner module Custom
|
193
|
+
end # close the module MARC2Solr
|
194
|
+
|
data/lib/marc2solr.rb
ADDED
@@ -0,0 +1,452 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
|
3
|
+
require 'logback-simple'
|
4
|
+
require 'trollop'
|
5
|
+
require 'ftools'
|
6
|
+
require 'jruby_streaming_update_solr_server'
|
7
|
+
require 'marc4j4r'
|
8
|
+
|
9
|
+
module MARC2Solr
|
10
|
+
|
11
|
+
class Conf
|
12
|
+
include Logback::Simple
|
13
|
+
|
14
|
+
SUB_COMMANDS = %w(index delete commit help)
|
15
|
+
|
16
|
+
|
17
|
+
# Every option marc2solr understands, as [name, settings] pairs.
#
# The settings hash is handed to Trollop's `opt` (after :desc is pulled out)
# and is also consulted by method_missing. Keys beyond Trollop's own:
#   :only      -- subcommands (symbols) for which the option is offered
#   :valid     -- list of acceptable values
#   :takesNone -- a value of 'none' (any case) clears the option
#   :isOutfile -- the value names an output stream or a file to open for writing
#
# :solrpath declares :type => String; without a type or default Trollop treats
# an option as a boolean flag, so the path could never be given on the
# command line.
OPTIONSCONFIG = [
  [:config, {:desc => "Configuation file specifying options. Repeatable. Command-line arguments always override the config file(s)",
             :type => :io,
             :multi => true}],
  [:benchmark, {:desc => "Benchmark production of each solr field",
                :only => [:index],
                :short => '-B'
               }],
  [:NObenchmark, {:desc => "Disable a previous 'benchmark' directive",
                  :only => [:index],
                 }],
  [:dryrun, {:desc => "Don't send anything to solr",
            }],
  [:NOdryrun, {:desc => "Disable a previous 'dryrun' directive",
              }],

  [:printmarc, {:desc => "Print MARC Record (as text) to --debugfile",
                :only => [:index],
                :short => '-r'
               }],
  [:NOprintmarc, {:desc => "Turn off printing MARC Record (as text) to --debugfile",
                  :only => [:index],
                 }],
  [:printdoc, {:desc => "Print each completed document to --debugfile",
               :only => [:index],
               :short => '-d'}
  ],
  [:NOprintdoc, {:desc => "Turn off printing each completed document to --debugfile",
                 :only => [:index],
                }],
  [:debugfile, {:desc => "Where to send output from --printmarc and --printdoc (takes filename, 'STDERR', 'STDOUT', or 'NONE') (repeatable)",
                :default => "STDOUT",
                :isOutfile => true,
                :takesNone => true,
                :type => String,
                :only => [:delete, :index],
               }],
  [:clearsolr, {:desc => "Clean out Solr by deleting everything in it (DANGEROUS)",
                :only => [:index]
               }],
  [:NOclearsolr, {:desc => "Disable a previous --clearsolr command",
                  :only => [:index]
                 }],
  [:skipcommit, {:desc => "DON'T send solr a 'commit' afterwards",
                 :short => '-C',
                 :only => [:delete, :index],
                }],
  [:threads, {:desc => "Number of threads to use to process MARC records (>1 => use 'threach')",
              :type => :int,
              :default => 1,
              :only => [:index]
             }],
  [:sussthreads, {:desc => "Number of threads to send completed docs to Solr",
                  :type => :int,
                  :default => 1}],
  [:susssize, {:desc => "Size of the documente queue for sending to Solr",
               :short => '-S',
               :default => 128}],
  [:machine, {:desc => "Name of solr machine (e.g., solr.myplace.org)",
              :short => '-m',
              # :required => [:index, :commit, :delete],
              :type => String}],
  [:port, {:desc => "Port of solr machine (e.g., '8088')",
           :short => '-p',
           :type => :int}],
  [:solrpath, {:desc => "URL path to solr",
               :short => '-P',
               :type => String,
              }],
  [:javabin, {:desc => "Use javabin (presumes /update/bin is configured in schema.xml)",
             }],
  [:NOjavabin, {:desc => "Don't use javabin",
               }],
  [:logfile, {:desc => "Name of the logfile (filename, 'STDERR', 'DEFAULT', or 'NONE'). 'DEFAULT' is a file based on input file name",
              :default => "DEFAULT",
              :takesNone => true,
              :type => String}],
  [:loglevel, {:desc => "Level at which to log (DEBUG, INFO, WARN, ERROR, OFF)",
               :short => '-L',
               :takesNone => true,
               :valid => %w{OFF DEBUG INFO WARN ERROR },
               :default => 'INFO'}],
  [:logbatchsize, {:desc => "Write progress information to logfile after every N records",
                   :default => 25000,
                   :only => [:delete, :index],
                   :short => '-b'}],
  [:indexfile, {:desc => "The index file describing your specset (usually index.dsl)",
                :type => String,
                :only => [:index],
               }],
  [:tmapdir, {:desc => "Directory that contains any translation maps",
              :type => String,
              :only => [:index]
             }],
  [:customdir, {:desc => "The directory containging custom routine libraries (usually the 'lib' next to index.rb). Repeatable",
                :only => [:index],
                :multi => true,
                :takesNone => true,
                :type => String
               }],
  [:marctype, {:desc => "Type of marc file ('bestguess', 'strictmarc'. 'marcxml', 'alephsequential', 'permissivemarc')",
               :only => [:index],
               :short => '-t',
               :valid => %w{bestguess strictmarc permissivemarc marcxml alephsequential },
               :default => 'bestguess'
              }],
  [:encoding, {:desc => "Encoding of the MARC file ('bestguess', 'utf8', 'marc8', 'iso')",
               :valid => %w{bestguess utf8 marc8 iso},
               :only => [:index],
               :default => 'bestguess'}],
  [:gzipped, {:desc => "Is the input gzipped? An extenstion of .gz will always force this to true",
              :default => false,
              :only => [:index, :delete],
             }]

]
|
132
|
+
|
133
|
+
# Option name => settings hash, for direct lookup by name. The values are the
# very same hash objects that OPTIONSCONFIG holds.
VALIDOPTIONS = {}
OPTIONSCONFIG.each do |name, settings|
  VALIDOPTIONS[name] = settings
end
|
135
|
+
|
136
|
+
|
137
|
+
# Per-subcommand help: a one-line summary followed by a usage line.
HELPTEXT = {
  'help'   => ["Get help on a command",
               "marc2solr help <cmd> where <cmd> is index, delete, or commit"].join("\n"),
  'index'  => ["Index the given MARC file",
               "marc2solr index --config <file> --override <marcfile> <marcfile2...>"].join("\n"),
  'delete' => ["Delete based on ID",
               "marc2solr delete --config <file> --override <file_of_ids_to_delete> <another_file...>"].join("\n"),
  'commit' => ["Send a commit to the specified Solr",
               "marc2solr commit --config <file> --override"].join("\n"),
}
|
143
|
+
|
144
|
+
attr_accessor :config, :cmdline, :rest, :command
|
145
|
+
# Build the configuration from the command line and any --config files.
#
# Order of events:
#   1. Parse the command line (this also consumes the subcommand from ARGV).
#   2. instance_eval each config file, in the order given, so its option
#      calls land in method_missing and fill @config.
#   3. Apply command-line values: options actually given on the command line
#      always win; Trollop defaults are applied only where no config file
#      has already set the option.
#   4. Whatever is left in ARGV becomes @rest.
def initialize
  @config  = {}
  @cmdline = command_line_opts

  # Load the config files
  config_files = @cmdline[:config]
  if config_files
    config_files.each do |f|
      log.info "Reading config-file '#{f.path}'"
      self.instance_eval(f.read)
    end
  end

  # The config entries and any "help" keys are not options to apply
  @cmdline.delete :config
  @cmdline.delete :config_given
  @cmdline.delete_if {|k, v| k.to_s =~ /^help/}

  # Trollop marks explicitly-passed options with a "<name>_given" key;
  # move those markers into their own lookup table
  @cmdline_given = {}
  @cmdline.keys.each do |k|
    next unless k.to_s =~ /^(.+?)_given$/
    @cmdline_given[$1.to_sym] = true
    @cmdline.delete(k)
  end

  # Explicit command-line values override; defaults only fill gaps
  @cmdline.each_pair do |k, v|
    self.send(k, v) if @cmdline_given[k] || !@config.has_key?(k)
  end

  @rest = ARGV
end
|
189
|
+
|
190
|
+
# Hash-style read access to a configuration value.
#
# @param arg The option name (the keys of @config)
# @return the stored value, or nil if the option is not set
def [](arg)
  @config[arg]
end
|
193
|
+
|
194
|
+
# Parse ARGV: pull off the subcommand, handle the help cases, and let Trollop
# parse the options that apply to that subcommand.
#
# Side effects: shifts the subcommand (and, for "help <cmd>", the help target)
# off ARGV and stores it in @command. The help paths print and exit the
# process.
#
# @return [Hash] the option hash produced by Trollop::options
def command_line_opts
  @command = ARGV.shift # get the subcommand

  # First, deal with the help situations
  unless SUB_COMMANDS.include? @command
    puts "Unknown command '#{@command}'" if @command
    print_basic_help
  end

  print_basic_help if ARGV.size == 0

  if @command == 'help'
    # "help <cmd>": the next word is the command to describe. (This used to
    # test the never-assigned @cmd, so command help was unreachable.)
    @command = ARGV.shift
    if SUB_COMMANDS.include? @command
      print_command_help @command
    else
      print_basic_help
    end
  end

  # OK. Now let's actually get and return the args
  #
  # Trollop is a DSL and doesn't see our local instance variable, so
  # @command is aliased to the local cmd
  cmd = @command
  return Trollop::options do
    OPTIONSCONFIG.each do |optspec|
      k = optspec[0]
      # Work on a copy so removing :desc does not alter the shared constants
      d = optspec[1].dup
      next if d[:only] and not d[:only].include? cmd.to_sym
      desc = d.delete(:desc)
      opt k, desc, d
    end
  end
end
|
232
|
+
|
233
|
+
|
234
|
+
# Print the top-level usage summary to standard output, then exit the process.
def print_basic_help
  puts <<HELP

marc2solr: get MARC data into Solr

USAGE
marc2solr index (index MARC records into Solr)
marc2solr delete (delete by ID from Solr)
marc2solr commit (send a 'commit' to a solr install)

Use "marc2solr <cmd> --help" for more help

HELP
  Process.exit
end
|
248
|
+
|
249
|
+
# Print help for one subcommand: its HELPTEXT entry, a note about config
# files, and (via Trollop, by pushing '--help' onto ARGV) the options that
# apply to it. Finishes by exiting the process.
#
# @param [String] cmd The subcommand name; must be a key of HELPTEXT
def print_command_help cmd
  ARGV.unshift '--help'
  Trollop::options do
    puts "\n\n" + HELPTEXT[cmd] + "\n\n"
    puts "You may specify multiple configuration files and they will be loaded in"
    puts "the order given."
    puts ""
    puts "Command line arguments always override configuration file settings\n\n"

    OPTIONSCONFIG.each do |name, settings|
      next if settings[:only] and not settings[:only].include? cmd.to_sym
      desc = settings.delete(:desc)
      opt name, desc, settings
    end
  end
  print "\n\n"
  Process.exit
end
|
270
|
+
|
271
|
+
|
272
|
+
# Pretty-printing hook: show this object as its @config hash.
#
# @param pp The printer object; its #pp method is given @config
def pretty_print(pp)
  pp.pp(@config)
end
|
275
|
+
|
276
|
+
def method_missing(methodSymbol, arg=:notgiven, fromCmdline = false)
|
277
|
+
return @config[methodSymbol] if arg == :notgiven
|
278
|
+
methodSymbol = methodSymbol.to_s.gsub(/=$/, '').to_sym
|
279
|
+
|
280
|
+
# Deal with negatives. We only want them if the argument is true
|
281
|
+
if methodSymbol.to_s =~ /^NO(.*)/
|
282
|
+
if arg == true
|
283
|
+
methodSymbol = $1.to_sym
|
284
|
+
arg = false
|
285
|
+
else
|
286
|
+
# puts "Ignoring false-valued #{methodSymbol}"
|
287
|
+
return # do nothing
|
288
|
+
end
|
289
|
+
end
|
290
|
+
|
291
|
+
# puts " Setting #{methodSymbol} to #{arg}"
|
292
|
+
if VALIDOPTIONS.has_key? methodSymbol
|
293
|
+
conf = VALIDOPTIONS[methodSymbol]
|
294
|
+
# Zero it out?
|
295
|
+
if conf[:takesNone] and arg.to_a.map{|a| a.downcase}.include? 'none'
|
296
|
+
@config[methodSymbol] = nil
|
297
|
+
return nil
|
298
|
+
end
|
299
|
+
|
300
|
+
|
301
|
+
# Check for a valid value
|
302
|
+
if conf[:valid]
|
303
|
+
unless conf[:valid].include? arg
|
304
|
+
raise ArgumentError "'#{arg}' is not a valid value for #{methodSymbol}"
|
305
|
+
end
|
306
|
+
end
|
307
|
+
|
308
|
+
# Make it a file?
|
309
|
+
|
310
|
+
if conf[:isOutfile]
|
311
|
+
# If it's an IO object, just take it
|
312
|
+
break if arg.is_a? IO or arg.is_a? StringIO
|
313
|
+
|
314
|
+
# Otherwise...
|
315
|
+
case arg.downcase
|
316
|
+
when "stdin"
|
317
|
+
arg = STDIN
|
318
|
+
when "stdout"
|
319
|
+
arg = STDOUT
|
320
|
+
when "stderr"
|
321
|
+
arg = STDERR
|
322
|
+
else
|
323
|
+
arg = File.new(arg, 'w')
|
324
|
+
Trollop.die "Can't open '#{arg}' for writing in argument #{methodSymbol}" unless arg
|
325
|
+
end
|
326
|
+
end
|
327
|
+
|
328
|
+
|
329
|
+
if conf[:multi]
|
330
|
+
@config[methodSymbol] ||= []
|
331
|
+
@config[methodSymbol] << arg
|
332
|
+
@config[methodSymbol].flatten!
|
333
|
+
else
|
334
|
+
@config[methodSymbol] = arg
|
335
|
+
end
|
336
|
+
# puts "Set #{methodSymbol} to #{arg}"
|
337
|
+
return @config[methodSymbol]
|
338
|
+
else
|
339
|
+
raise NoMethodError, "'#{methodSymbol} is not a valid MARC2Solr configuration option for #{@cmd}"
|
340
|
+
end
|
341
|
+
end
|
342
|
+
|
343
|
+
|
344
|
+
# Create a SUSS from the given arguments
|
345
|
+
|
346
|
+
# Build the Solr URL from the :machine, :port and :solrpath settings.
#
# The port is interpolated rather than concatenated: --port is declared as
# an integer option, and String + Integer raises TypeError.
#
# @return [String] "http://<machine>:<port>/<solrpath without its leading slash>"
# @raise [ArgumentError] if machine, port or solrpath is not set (the same
#   message is also written to the log at error level)
def sussURL
  machine = self[:machine]
  unless machine
    log.error "Need solr machine name (--machine)"
    raise ArgumentError, "Need solr machine name (--machine)"
  end

  port = self[:port]
  unless port
    log.error "Need solr port (--port)"
    raise ArgumentError, "Need solr port (--port)"
  end

  path = self[:solrpath]
  unless path
    log.error "Need solr path (--solrpath)"
    raise ArgumentError, "Need solr path (--solrpath)"
  end

  "http://#{machine}:#{port}/#{path.gsub(/^\//, '')}"
end
|
367
|
+
|
368
|
+
# Create a StreamingUpdateSolrServer for the configured URL, passing it the
# :susssize and :sussthreads settings. When :javabin is set, the server is
# given a BinaryRequestWriter.
#
# @return the new StreamingUpdateSolrServer
def suss
  url = self.sussURL
  log.debug "Set suss url to #{url}"

  server = StreamingUpdateSolrServer.new(url, @config[:susssize], @config[:sussthreads])
  if self[:javabin]
    server.setRequestWriter Java::org.apache.solr.client.solrj.impl.BinaryRequestWriter.new
    log.debug "Using javabin"
  end
  server
end
|
379
|
+
|
380
|
+
# Fetch the logger named after the current command, set the log level, and
# start the appender selected by :logfile:
#
#   'STDERR'      -- console logger
#   'DEFAULT'     -- file named "<input>-<YYYYmmdd-HHMMSS>.log", where <input>
#                    is the basename of the first remaining argument (or the
#                    command name) cut off at its first dot
#   'NONE' / nil  -- no appender is started
#   anything else -- used as the log file name
#
# @return the logger
def masterLogger
  mlog = Logback::Simple::Logger.singleton(self.command)
  mlog.loglevel = @config[:loglevel].downcase.to_sym

  firstfile   = self.rest[0] || self.command
  stem        = File.basename(firstfile).gsub(/\..*$/, '') # drop everything from the first dot on
  logfilename = stem + '-' + Time.new.strftime('%Y%m%d-%H%M%S') + '.log'

  Logback::Simple.loglevel = @config[:loglevel].downcase.to_sym

  destination = @config[:logfile]
  case destination
  when "STDERR"
    Logback::Simple.startConsoleLogger
  when "DEFAULT"
    Logback::Simple.startFileLogger(logfilename)
  when 'NONE', nil
    # do nothing
  else
    Logback::Simple.startFileLogger(destination)
  end
  mlog
end
|
401
|
+
|
402
|
+
|
403
|
+
# Build a MARC4J4R::Reader for the given input.
#
# * Encoding: the :encoding setting, with 'bestguess' passed on as nil.
# * Type: the :marctype setting; for 'bestguess' the file extension is
#   sniffed (containing "xml" => :marcxml, containing "seq" or "aleph" =>
#   :alephsequential, anything else or no extension => :permissivemarc).
#   For a ".gz" file the extension before ".gz" is used.
# * Gzip: the input is wrapped in a GZIPInputStream when :gzipped is set or
#   the file name ends in ".gz". The ".gz" check now applies whatever the
#   configured marctype is, as the --gzipped help text promises; it used to
#   run only while sniffing.
# * A filename of "STDIN" reads from standard input.
#
# @param [String] filename Path of the MARC file, or "STDIN"
# @return [MARC4J4R::Reader]
def reader filename
  configuredType = @config[:marctype].downcase.to_sym
  encoding       = @config[:encoding].downcase.to_sym
  encoding       = nil if encoding == :bestguess

  parts   = File.basename(filename).split(/\./)
  has_ext = (filename =~ /\.(.+)$/) ? true : false # if there's an extension
  gzipped = has_ext && parts[-1].to_s.downcase == 'gz'

  if configuredType == :bestguess
    if has_ext
      # For "name.ext.gz", sniff on the extension before the ".gz"
      ext = (gzipped ? parts[-2] : parts[-1]).to_s.downcase

      log.info "Sniffed marc file type as #{ext}"
      case ext
      when /xml/, /marcxml/
        type = :marcxml
      when /seq/, /aleph/
        type = :alephsequential
      else
        type = :permissivemarc
      end
    else
      type = :permissivemarc
    end
  else
    type = configuredType
  end

  source = filename
  source = STDIN if source == "STDIN"

  if gzipped or @config[:gzipped]
    source = Java::java.util.zip.GZIPInputStream.new(IOConvert.byteinstream(source))
  end

  return MARC4J4R::Reader.new(source, type, encoding)
end
|
447
|
+
|
448
|
+
|
449
|
+
end
|
450
|
+
end
|
451
|
+
|
452
|
+
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|