traject 0.0.2 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -0
- data/README.md +85 -61
- data/Rakefile +5 -0
- data/bin/traject +31 -3
- data/doc/settings.md +74 -13
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject/indexer/settings.rb +75 -0
- data/lib/traject/indexer.rb +255 -45
- data/lib/traject/json_writer.rb +4 -2
- data/lib/traject/macros/marc21.rb +18 -6
- data/lib/traject/macros/marc21_semantics.rb +405 -0
- data/lib/traject/macros/marc_format_classifier.rb +180 -0
- data/lib/traject/marc4j_reader.rb +160 -0
- data/lib/traject/marc_extractor.rb +33 -17
- data/lib/traject/marc_reader.rb +14 -11
- data/lib/traject/solrj_writer.rb +247 -9
- data/lib/traject/thread_pool.rb +154 -0
- data/lib/traject/translation_map.rb +46 -4
- data/lib/traject/util.rb +30 -0
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/indexer/each_record_test.rb +34 -0
- data/test/indexer/macros_marc21_semantics_test.rb +206 -0
- data/test/indexer/macros_marc21_test.rb +10 -1
- data/test/indexer/map_record_test.rb +78 -8
- data/test/indexer/read_write_test.rb +43 -10
- data/test/indexer/settings_test.rb +60 -4
- data/test/indexer/to_field_test.rb +39 -0
- data/test/marc4j_reader_test.rb +75 -0
- data/test/marc_extractor_test.rb +62 -0
- data/test/marc_format_classifier_test.rb +91 -0
- data/test/marc_reader_test.rb +12 -0
- data/test/solrj_writer_test.rb +146 -43
- data/test/test_helper.rb +50 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +153 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +8 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/traject.gemspec +1 -1
- data/vendor/marc4j/README.md +17 -0
- data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
- metadata +81 -2
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -9,16 +9,16 @@ them somewhere.
|
|
9
9
|
|
10
10
|
## Background/Goals
|
11
11
|
|
12
|
-
|
12
|
+
Tools for indexing Marc to Solr already exist, and have served many of us for many years. But I was having more and more difficulty working with the existing tools, and difficulty providing the custom logic I needed in a maintainable way. I realized that for me, to create a tool with the flexibility, maintainability, and performance I wanted, I would need to do it in jruby (ruby on the JVM).
|
13
13
|
|
14
|
-
|
14
|
+
Some goals:
|
15
15
|
|
16
|
-
*
|
17
|
-
*
|
18
|
-
|
19
|
-
|
20
|
-
*
|
21
|
-
*
|
16
|
+
* Aim to be accessible even to non-rubyists
|
17
|
+
* Concise and maintainable local configuration -- including an only gradual increase in difficulty to write your own simple logic.
|
18
|
+
* Support reusable and shareable mapping logic routines.
|
19
|
+
* Built of modular and composable elements: If you want to change part of what traject does, you should be able to do so without having to reimplement other things you don't want to change.
|
20
|
+
* A maintainable internal architecture, well-factored with separated concerns and DRY logic. Aim to be comprehensible to newcomer developers, and well-covered by tests.
|
21
|
+
* High performance, using multi-threaded concurrency where appropriate to maximize throughput. Actual throughput can depend on complexity of your mapping rules and capacity of your server(s), but I am getting throughput 2-5x greater than previous solutions.
|
22
22
|
|
23
23
|
|
24
24
|
## Installation
|
@@ -57,26 +57,42 @@ in a config file:
|
|
57
57
|
|
58
58
|
settings do
|
59
59
|
# Where to find solr server to write to
|
60
|
-
|
60
|
+
provide "solr.url", "http://example.org/solr"
|
61
|
+
|
62
|
+
# If you are connecting to Solr 1.x, you need to set
|
63
|
+
# for SolrJ compatibility:
|
64
|
+
# provide "solrj_writer.parser_class_name", "XMLResponseParser"
|
61
65
|
|
62
66
|
# solr.version doesn't currently do anything, but set it
|
63
67
|
# anyway, in the future it will warn you if you have settings
|
64
68
|
# that may not work with your version.
|
65
|
-
|
69
|
+
provide "solr.version", "4.3.0"
|
66
70
|
|
67
71
|
# default source type is binary, traject can't guess
|
68
72
|
# you have to tell it.
|
69
|
-
|
73
|
+
provide "marc_source.type", "xml"
|
70
74
|
|
71
75
|
# settings can be set on command line instead of
|
72
76
|
# config file too.
|
73
77
|
|
74
78
|
# various others...
|
75
|
-
|
79
|
+
provide "solrj_writer.commit_on_close", "true"
|
80
|
+
|
81
|
+
# By default, we use the Traject::Marc4JReader, which
|
82
|
+
# can read marc8 and ISO8859_1 -- if your records are all in UTF8,
|
83
|
+
# the pure-ruby MarcReader may be faster...
|
84
|
+
# provide "reader_class_name", "Traject::MarcReader"
|
85
|
+
# If you ARE using the Marc4JReader, it defaults to "BESTGUESS"
|
86
|
+
# as to encoding when reading binary, you may want to tell it instead
|
87
|
+
provide "marc4j_reader.source_encoding", "MARC8" # or UTF-8 or ISO8859_1
|
76
88
|
end
|
77
89
|
~~~
|
78
90
|
|
79
|
-
|
91
|
+
`provide` will only set the key if it was previously unset, so first
|
92
|
+
setting wins, and command-line comes first of all and overrides everything.
|
93
|
+
You can also use `store` if you want to force-set, last set wins.
|
94
|
+
|
95
|
+
See the docs page on [Settings](./doc/settings.md) for a list
|
80
96
|
of all standardized settings.
|
81
97
|
|
82
98
|
### Indexing Rules
|
@@ -158,7 +174,7 @@ for mapping form MARC codes to user-displayable strings. See Traject::Translatio
|
|
158
174
|
|
159
175
|
#### Direct indexing logic vs. Macros
|
160
176
|
|
161
|
-
It turns out all those functions we saw above used with `to_field` -- `literal`, `serialized_marc`, `extract_all_marc_values
|
177
|
+
It turns out all those functions we saw above used with `to_field` -- `literal`, `serialized_marc`, `extract_all_marc_values`, and `extract_marc` -- are what Traject calls 'macros'.
|
162
178
|
|
163
179
|
They are all actually built based upon a more basic element of
|
164
180
|
indexing functionality, which you can always drop down to, and
|
@@ -178,7 +194,8 @@ used to define a block of logic that can be stored and executed later. When the
|
|
178
194
|
|
179
195
|
The third argument is a `Traject::Indexer::Context` object that can
|
180
196
|
be used for more advanced functionality, including caching expensive
|
181
|
-
per-record calculations, writing out to more than one output field at a time
|
197
|
+
per-record calculations, writing out to more than one output field at a time, or taking account of current Traject Settings in your logic. The third argument is optional, you can supply
|
198
|
+
a two-argument block too.
|
182
199
|
|
183
200
|
You can always drop out to this basic direct use whenever you need
|
184
201
|
special purpose logic, directly in the config file, writing in
|
@@ -197,12 +214,12 @@ end
|
|
197
214
|
# marc_extract does, you may want to use the Traject::MarcExtractor
|
198
215
|
# class
|
199
216
|
to_field "weirdo" do |record, accumulator, context|
|
200
|
-
list = MarcExtractor.
|
217
|
+
list = MarcExtractor.extract_by_spec(record, "700a")
|
201
218
|
# combine all the 700a's in ONE string, cause we're weird
|
202
219
|
list = list.join(" ")
|
203
220
|
accumulator << list
|
204
221
|
end
|
205
|
-
|
222
|
+
~~~
|
206
223
|
|
207
224
|
You can also *combine* a macro and a direct block for some
|
208
225
|
post-processing. In this case, the `accumulator` parameter
|
@@ -220,6 +237,54 @@ If you find yourself repeating code a lot in direct blocks, you
|
|
220
237
|
can supply your _own_ macros, for local use, or even to share
|
221
238
|
with others in a ruby gem. See docs [Macros](./doc/macros.md)
|
222
239
|
|
240
|
+
#### each_record
|
241
|
+
|
242
|
+
There is also a method `each_record`, which is like `to_field`, but without
|
243
|
+
a specific field. It can be used for other side-effects of your choice, or
|
244
|
+
even for writing to multiple fields.
|
245
|
+
|
246
|
+
~~~ruby
|
247
|
+
each_record do |record, context|
|
248
|
+
# example of writing to two fields at once.
|
249
|
+
(x, y) = Something.do_stuff
|
250
|
+
(context["one_field"] ||= []) << x
|
251
|
+
(context["another_field"] ||= []) << y
|
252
|
+
end
|
253
|
+
~~~
|
254
|
+
|
255
|
+
You could write or use macros for `each_record` too. It's suggested that
|
256
|
+
such a macro take the field names it will affect as arguments (example?)
|
257
|
+
|
258
|
+
`each_record` and `to_field` calls will be processed in one big order, guaranteed
|
259
|
+
in order.
|
260
|
+
|
261
|
+
~~~ruby
|
262
|
+
to_field("foo") {...} # will be called first on each record
|
263
|
+
each_record {...} # will always be called AFTER above has potentially added values
|
264
|
+
to_field("foo") {...} # and will be called after each of the preceding for each record
|
265
|
+
~~~
|
266
|
+
|
267
|
+
#### Built-in MARC21 Semantics
|
268
|
+
|
269
|
+
There is another package of 'macros' that comes with Traject for extracting semantics
|
270
|
+
from Marc21. These are sometimes 'opinionated', using heuristics or algorithms
|
271
|
+
that are not inherently part of Marc21, but have proven useful in actual practice.
|
272
|
+
|
273
|
+
It's not loaded by default, you can use straight ruby `require` and `extend`
|
274
|
+
to load the macros into the indexer.
|
275
|
+
|
276
|
+
~~~ruby
|
277
|
+
# in a traject config file, extend so we can use methods from...
|
278
|
+
require 'traject/macros/marc21_semantics'
|
279
|
+
extend Traject::Macros::Marc21Semantics
|
280
|
+
|
281
|
+
to_field "date", marc_publication_date
|
282
|
+
to_field "author_sort", marc_sortable_author
|
283
|
+
to_field "inst_facet", marc_instrumentation_humanized
|
284
|
+
~~~
|
285
|
+
|
286
|
+
See documented list of macros available in [Marc21Semantics](./lib/traject/macros/marc21_semantics.rb)
|
287
|
+
|
223
288
|
## Command Line
|
224
289
|
|
225
290
|
The simplest invocation is:
|
@@ -241,8 +306,7 @@ If you leave off the marc_file, traject will try to read from stdin. You can onl
|
|
241
306
|
cat some/dir/*.marc | traject -c conf_file.rb
|
242
307
|
|
243
308
|
You can set any setting on the command line with `-s key=value`.
|
244
|
-
This will over-ride any settings
|
245
|
-
think over-riding works, it's actually a bit tricky)
|
309
|
+
This will over-ride any settings set with `provide` in conf files.
|
246
310
|
|
247
311
|
traject -c conf_file.rb marc_file -s solr.url=http://somehere/solr -s solr.url=http://example.com/solr -s solrj_writer.commit_on_close=true
|
248
312
|
|
@@ -292,46 +356,6 @@ and/or extra files in ./docs -- as appropriate for what needs to be docs.
|
|
292
356
|
|
293
357
|
## TODO
|
294
358
|
|
295
|
-
* Logging
|
296
|
-
* it's doing no logging of it's own
|
297
|
-
* It's not properly setting up the solrj logging
|
298
|
-
* Making solrj and it's own logging go to same place, accross jruby bridge, not sure
|
299
|
-
(I want all of this code BUT the Solr writing stuff to be usable under MRI too,
|
300
|
-
I want to repurpose the mapping code for DISPLAY too)
|
301
|
-
|
302
|
-
* Error handling. Related to logging. Catch errors indexing
|
303
|
-
particular records, make
|
304
|
-
sure they are logged in an obvious place, make sure processing proceeds with other
|
305
|
-
records (if it should!) etc.
|
306
|
-
|
307
|
-
* Distro and the SolrJ jars. Right now the SolrJ jars are included in the gem (although they
|
308
|
-
aren't actually loaded until you try to use the SolrJWriter). This is not neccesarily
|
309
|
-
best. other possibilities:
|
310
|
-
* Put them in their own gem
|
311
|
-
* Make the end-user download them theirselves, possibly providing the ivy.xml's to do so for
|
312
|
-
them.
|
313
|
-
|
314
|
-
* Various performance improvements, this is not optimized yet. Some improvements
|
315
|
-
may challenge architecture, when they involve threading.
|
316
|
-
* Profile and optimize marc loading -- right now just using ruby-marc, always.
|
317
|
-
* Profile/optimize marc serialization back to stored filed, right now it uses
|
318
|
-
known-to-be-slow rexml as part of ruby-marc.
|
319
|
-
* Use threads for the mapping step? With celluloid, or threach, or other? Does
|
320
|
-
this require thinking more about thread safety of existing code?
|
321
|
-
* Use threads for writing to solr?
|
322
|
-
* I am not sure about using the solrj ConcurrentUpdateSolrServer -- among other
|
323
|
-
things, it seems to swallow solr errors, that i'm not sure we want to do.
|
324
|
-
* But we can batch docs ourselves before HttpServer#add'ing them -- every
|
325
|
-
solrj HTTPServer#add is an http transaction, but you can give it an ARRAY
|
326
|
-
to load multiple at once -- and still get the errors, I think. (Have to test)
|
327
|
-
Could be perf nearly as good as concurrentupdate? Or do that, but then make each
|
328
|
-
HttpServer#add in one of our own manual threads (Celluloid? Or raw?), so
|
329
|
-
continued processing doesn't block?
|
330
|
-
|
331
|
-
* Reading Marc8. It can't do it yet. Easiest way would be using Marc4j to read, or using it as a transcoder anyway. Don't really want to write marc8 transcoder in ruby.
|
332
|
-
|
333
|
-
* We need something like `to_field`, but without actually being
|
334
|
-
for mapping to a specific output field. For generic pre or post-processing, or multi-output-field logic. `before_record do &block`, `after_record do &block` , `on_each_record do &block`, one or more of those.
|
335
359
|
|
336
360
|
* Unicode normalization. Has to normalize to NFKC on way out to index. Except for serialized marc field and other exceptions? Except maybe don't have to, rely on solr analyzer to do it?
|
337
361
|
|
@@ -340,8 +364,8 @@ for mapping to a specific output field. For generic pre or post-processing, or m
|
|
340
364
|
* Either way, all optional/configurable of course. based
|
341
365
|
on Settings.
|
342
366
|
|
343
|
-
* More macros. Not all the built-in functionality that comes with SolrMarc is here yet. It can be provided as macros, either built in, or distro'd in other gems. If really needed as macros, and not just something local configs build themselves as needed out of the parts already here.
|
344
|
-
|
345
367
|
* Command line code. It's only 150 lines, but it's kind of messy
|
346
368
|
jammed into one file *and lacks tests*. I couldn't figure out
|
347
369
|
what to do with it or how to test it. Needs a bit of love.
|
370
|
+
|
371
|
+
* Optional built-in jetty stop/start to allow indexing to Solr that wasn't running before. maybe https://github.com/projecthydra/jettywrapper ?
|
data/Rakefile
CHANGED
@@ -14,3 +14,8 @@ Rake::TestTask.new do |t|
|
|
14
14
|
t.pattern = 'test/**/*_test.rb'
|
15
15
|
t.libs.push 'test', 'test_support'
|
16
16
|
end
|
17
|
+
|
18
|
+
# Not documented well, but this seems to be
|
19
|
+
# the way to load rake tasks from other files
|
20
|
+
#import "lib/tasks/load_map.rake"
|
21
|
+
Dir.glob('lib/tasks/*.rake').each { |r| import r}
|
data/bin/traject
CHANGED
@@ -14,13 +14,16 @@ require 'traject'
|
|
14
14
|
require 'traject/indexer'
|
15
15
|
|
16
16
|
|
17
|
+
orig_argv = ARGV.dup
|
18
|
+
|
17
19
|
|
18
20
|
opts = Slop.new(:strict => true) do
|
19
21
|
banner "traject [options] -c configuration.rb [-c config2.rb] file.mrc"
|
20
22
|
|
21
23
|
on 'v', 'version', "print version information to stderr"
|
24
|
+
on 'd', 'debug', "Include debug log, -s log.level=debug"
|
22
25
|
on 'h', 'help', "print usage information to stderr"
|
23
|
-
on 'c', 'conf', 'configuration file path (repeatable)', :argument => true, :as => Array
|
26
|
+
on 'c', 'conf', 'configuration file path (repeatable)', :argument => true, :as => Array
|
24
27
|
on :s, :setting, "settings: `-s key=value` (repeatable)", :argument => true, :as => Array
|
25
28
|
on :r, :reader, "Set reader class, shortcut for `-s reader_class_name=*`", :argument => true
|
26
29
|
on :w, :writer, "Set writer class, shortcut for `-s writer_class_name=*`", :argument => true
|
@@ -48,11 +51,12 @@ options = opts.to_hash
|
|
48
51
|
|
49
52
|
if options[:version]
|
50
53
|
$stderr.puts "traject version #{Traject::VERSION}"
|
54
|
+
exit 1
|
51
55
|
end
|
52
56
|
|
53
57
|
if options[:help]
|
54
58
|
$stderr.puts opts.help
|
55
|
-
exit
|
59
|
+
exit 1
|
56
60
|
end
|
57
61
|
|
58
62
|
# have to use Slop object to tell diff between
|
@@ -87,6 +91,10 @@ settings = {}
|
|
87
91
|
end
|
88
92
|
end
|
89
93
|
|
94
|
+
|
95
|
+
if options[:debug]
|
96
|
+
settings["log.level"] = "debug"
|
97
|
+
end
|
90
98
|
if options[:writer]
|
91
99
|
settings["writer_class_name"] = options[:writer]
|
92
100
|
end
|
@@ -112,6 +120,14 @@ end
|
|
112
120
|
indexer = Traject::Indexer.new
|
113
121
|
indexer.settings( settings )
|
114
122
|
|
123
|
+
unless options[:conf] && options[:conf].length > 0
|
124
|
+
$stderr.puts "Error: Missing required configuration file"
|
125
|
+
$stderr.puts "Exiting..."
|
126
|
+
$stderr.puts
|
127
|
+
$stderr.puts opts.help
|
128
|
+
exit 2
|
129
|
+
end
|
130
|
+
|
115
131
|
options[:conf].each do |conf_path|
|
116
132
|
begin
|
117
133
|
indexer.instance_eval(File.open(conf_path).read, conf_path)
|
@@ -128,6 +144,14 @@ options[:conf].each do |conf_path|
|
|
128
144
|
end
|
129
145
|
end
|
130
146
|
|
147
|
+
## SAFE TO LOG STARTING HERE.
|
148
|
+
#
|
149
|
+
# Shouldn't log before config files are read above, because
|
150
|
+
# config files set up logger
|
151
|
+
##############
|
152
|
+
indexer.logger.info("executing with arguments: `#{orig_argv.join(' ')}`")
|
153
|
+
|
154
|
+
|
131
155
|
# ARGF might be perfect for this, but problems with it include:
|
132
156
|
# * jruby is broken, no way to set its encoding, leads to encoding errors reading non-ascii
|
133
157
|
# https://github.com/jruby/jruby/issues/891
|
@@ -145,9 +169,13 @@ if ARGV.length > 1
|
|
145
169
|
exit 1
|
146
170
|
end
|
147
171
|
if ARGV.length == 0
|
172
|
+
indexer.logger.info "Reading from STDIN..."
|
148
173
|
io = $stdin
|
149
174
|
else
|
175
|
+
indexer.logger.info "Reading from #{ARGV.first}"
|
150
176
|
io = File.open(ARGV.first, 'r')
|
151
177
|
end
|
152
178
|
|
153
|
-
indexer.process(io)
|
179
|
+
result = indexer.process(io)
|
180
|
+
|
181
|
+
exit 1 unless result # non-zero exit status on process telling us there's problems.
|
data/doc/settings.md
CHANGED
@@ -6,29 +6,90 @@ used for grouping and namespacing.
|
|
6
6
|
|
7
7
|
Values are usually strings, but occasionally something else.
|
8
8
|
|
9
|
-
Settings can be set in configuration files,
|
10
|
-
|
9
|
+
Settings can be set in configuration files, usually like:
|
10
|
+
|
11
|
+
~~~ruby
|
12
|
+
settings do
|
13
|
+
provide "key", "value"
|
14
|
+
end
|
15
|
+
~~~~
|
16
|
+
|
17
|
+
or on the command line: `-s key=value`. There are also some command line shortcuts
|
18
|
+
for commonly used settings, see `traject -h`.
|
11
19
|
|
12
20
|
## Known settings
|
13
21
|
|
14
|
-
*
|
22
|
+
* `debug_ascii_progress`: true/'true' to print ascii characters to STDERR indicating progress. Note,
|
23
|
+
yes, this is fixed to STDERR, regardless of your logging setup.
|
24
|
+
* `.` for every batch of records read and parsed
|
25
|
+
* `^` for every batch of records batched and queued for adding to solr
|
26
|
+
(possibly in thread pool)
|
27
|
+
* `%` for completing of a Solr 'add'
|
28
|
+
* `!` when threadpool for solr add has a full queue, so solr add is
|
29
|
+
going to happen in calling queue -- means solr adding can't
|
30
|
+
keep up with production.
|
31
|
+
|
32
|
+
* `json_writer.pretty_print`: used by the JsonWriter, if set to true, will output pretty printed json (with added whitespace) for easier human readability. Default false.
|
33
|
+
|
34
|
+
* `log.file`: filename to send logging, or 'STDOUT' or 'STDERR' for those streams. Default STDERR
|
35
|
+
|
36
|
+
* `log.error_file`: Default nil, if set then all log lines of ERROR and higher will be _additionally_
|
37
|
+
sent to error file named.
|
38
|
+
|
39
|
+
* `log.format`: Formatting string used by Yell logger. https://github.com/rudionrails/yell/wiki/101-formatting-log-messages
|
40
|
+
|
41
|
+
* `log.level`: Log this level and above. Default 'info', set to eg 'debug' to get potentially more logging info,
|
42
|
+
or 'error' to get less. https://github.com/rudionrails/yell/wiki/101-setting-the-log-level
|
15
43
|
|
16
|
-
*
|
44
|
+
* `log.batch_progress`: If set to a number N (or string representation), will output a progress line to INFO
|
45
|
+
log, every N records.
|
17
46
|
|
18
|
-
*
|
47
|
+
* `marc_source.type`: default 'binary'. Can also set to 'xml' or (not yet implemented todo) 'json'. Command line shortcut `-t`
|
19
48
|
|
20
|
-
*
|
49
|
+
* `marc4j_reader.jar_dir`: Path to a directory containing Marc4J jar file to use. All .jar's in dir will
|
50
|
+
be loaded. If unset, uses marc4j.jar bundled with traject.
|
21
51
|
|
22
|
-
*
|
52
|
+
* `marc4j_reader.permissive`: Used by Marc4JReader only when marc_source.type is 'binary', boolean, argument to the underlying MarcPermissiveStreamReader. Default true.
|
23
53
|
|
24
|
-
*
|
54
|
+
* `marc4j_reader.source_encoding`: Used by Marc4JReader only when marc_source.type is 'binary', encoding strings accepted
|
55
|
+
by marc4j MarcPermissiveStreamReader. Default "BESTGUESS", also "UTF-8", "MARC8", "ISO8859_1"
|
56
|
+
|
57
|
+
* `processing_thread_pool`: Default 3. Main thread pool used for processing records with input rules. Choose a
|
58
|
+
pool size based on size of your machine, and complexity of your indexing rules.
|
59
|
+
Probably no reason for it ever to be more than number of cores on indexing machine.
|
60
|
+
But this is the first thread_pool to try increasing for better performance on a multi-core machine.
|
61
|
+
|
62
|
+
A pool here can sometimes result in multi-threaded committing to Solr too with the
|
63
|
+
SolrJWriter, as processing worker threads will do their own commits to solr if the
|
64
|
+
solrj_writer.thread_pool is full. Having a multi-threaded pool here can help even out throughput
|
65
|
+
through Solr's pauses for committing too.
|
66
|
+
|
67
|
+
* `reader_class_name`: a Traject Reader class, used by the indexer as a source of records. Default Traject::Marc4JReader. If you don't need to read marc binary with Marc8 encoding, the pure ruby MarcReader may give you better performance. Command-line shortcut `-r`
|
68
|
+
|
69
|
+
* `solr.url`: URL to connect to a solr instance for indexing, eg http://example.org:8983/solr . Command-line short-cut `-u`.
|
70
|
+
|
71
|
+
* `solrj.jar_dir`: SolrJWriter needs to load Java .jar files with SolrJ. It will load from a packaged SolrJ, but you can load your own SolrJ (different version etc) by specifying a directory. All *.jar in directory will be loaded.
|
72
|
+
|
73
|
+
* `solr.version`: Set to eg "1.4.0", "4.3.0"; currently unused, but in the future will
|
25
74
|
change some default settings, and/or sanity check and warn you if you're doing something
|
26
|
-
that might not work with that version of solr. Set now for help in the future.
|
75
|
+
that might not work with that version of solr. Set now for help in the future.
|
76
|
+
|
77
|
+
* `solrj_writer.batch_size`: size of batches that SolrJWriter will send docs to Solr in. Default 200. Set to nil,
|
78
|
+
0, or 1, and SolrJWriter will do one http transaction per document, no batching.
|
79
|
+
|
80
|
+
* `solrj_writer.commit_on_close`: default false, set to true to have SolrJWriter send an explicit commit message to Solr after indexing.
|
27
81
|
|
28
|
-
* solrj_writer.
|
82
|
+
* `solrj_writer.parser_class_name`: Set to "XMLResponseParser" or "BinaryResponseParser". Will be instantiated and passed to the solrj.SolrServer with setResponseParser. Default nil, use SolrServer default. To talk to a solr 1.x, you will want to set to "XMLResponseParser"
|
29
83
|
|
30
|
-
* solrj_writer.
|
84
|
+
* `solrj_writer.server_class_name`: String name of a solrj.SolrServer subclass to be used by SolrJWriter. Default "HttpSolrServer"
|
31
85
|
|
32
|
-
* solrj_writer.
|
86
|
+
* `solrj_writer.thread_pool`: Defaults to 1 (single bg thread). A thread pool is used for submitting docs
|
87
|
+
to solr. Set to 0 or nil to disable threading. Set to 1,
|
88
|
+
there will still be a single bg thread doing the adds.
|
89
|
+
May make sense to set higher than number of cores on your
|
90
|
+
indexing machine, as these threads will mostly be waiting
|
91
|
+
on Solr. Speed/capacity of your solr might be more relevant.
|
92
|
+
Note that processing_thread_pool threads can end up submitting
|
93
|
+
to solr too, if solrj_writer.thread_pool is full.
|
33
94
|
|
34
|
-
* writer_class_name
|
95
|
+
* `writer_class_name`: a Traject Writer class, used by indexer to send processed dictionaries off. Default Traject::SolrJWriter, also available Traject::JsonWriter. See Traject::Indexer for more info. Command line shortcut `-w`
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
namespace :load_maps do
|
8
|
+
|
9
|
+
desc "Load MARC geo codes by screen-scraping LC"
|
10
|
+
task :marc_geographic do
|
11
|
+
begin
|
12
|
+
require 'nokogiri'
|
13
|
+
rescue LoadError => e
|
14
|
+
$stderr.puts "\n load_maps:marc_geographic task requires nokogiri"
|
15
|
+
$stderr.puts " Try `gem install nokogiri` and try again. Exiting...\n\n"
|
16
|
+
exit 1
|
17
|
+
end
|
18
|
+
|
19
|
+
source_url = "http://www.loc.gov/marc/geoareas/gacs_code.html"
|
20
|
+
|
21
|
+
filename = ENV["OUTPUT_TO"] || File.expand_path("../../translation_maps/marc_geographic.yaml", __FILE__)
|
22
|
+
file = File.open( filename, "w:utf-8" )
|
23
|
+
|
24
|
+
$stderr.puts "Writing to `#{filename}` ..."
|
25
|
+
|
26
|
+
html = Nokogiri::HTML(open(source_url).read)
|
27
|
+
|
28
|
+
file.puts "# Translation map for marc geographic codes constructed by `rake load_maps:marc_geographic` task"
|
29
|
+
file.puts "# Scraped from #{source_url} at #{Time.now}"
|
30
|
+
file.puts "# Intentionally includes discontinued codes."
|
31
|
+
|
32
|
+
file.puts "\n"
|
33
|
+
html.css("tr").each do |line|
|
34
|
+
code = line.css("td.code").inner_text.strip
|
35
|
+
unless code.nil? || code.empty?
|
36
|
+
code.gsub!(/^\-/, '') # treat discontinued code like any other
|
37
|
+
|
38
|
+
label = line.css("td[2]").inner_text.strip
|
39
|
+
|
40
|
+
label.gsub!(/\n */, ' ') # get rid of newlines that file now sometimes contains, bah.
|
41
|
+
label.gsub!("'", "''") # yaml escapes single-quotes by doubling them, weird but true.
|
42
|
+
|
43
|
+
file.puts "'#{code}': '#{label}'"
|
44
|
+
end
|
45
|
+
end
|
46
|
+
$stderr.puts "Done."
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'hashie'
|
2
|
+
|
3
|
+
# A Hash of settings for a Traject::Indexer, which also ends up passed along
|
4
|
+
# to other objects Traject::Indexer interacts with.
|
5
|
+
#
|
6
|
+
# Enhanced with a few features from Hashie, to make it for
|
7
|
+
# instance string/symbol indifferent
|
8
|
+
#
|
9
|
+
# #provide(key, value) is added, to do like settings[key] ||= value,
|
10
|
+
# set only if not already set (but unlike ||=, nil or false can count as already set)
|
11
|
+
#
|
12
|
+
# Also has an interesting 'defaults' system, meant to play along
|
13
|
+
# with configuration file 'provide' statements. There is a built-in hash of
|
14
|
+
# defaults, which will be lazily filled in if accessed and not yet
|
15
|
+
# set. (nil can count as set, though!). If they haven't been lazily
|
16
|
+
# set yet, then #provide will still fill them in. But you can also call
|
17
|
+
# fill_in_defaults! to fill all defaults in, if you know configuration
|
18
|
+
# files have all been loaded, and want to fill them in for inspection.
|
19
|
+
class Traject::Indexer
|
20
|
+
class Settings < Hash
|
21
|
+
include Hashie::Extensions::MergeInitializer # can init with hash
|
22
|
+
include Hashie::Extensions::IndifferentAccess
|
23
|
+
|
24
|
+
# Hashie bug Issue #100 https://github.com/intridea/hashie/pull/100
|
25
|
+
alias_method :store, :indifferent_writer
|
26
|
+
|
27
|
+
def initialize(*args)
|
28
|
+
super
|
29
|
+
self.default_proc = lambda do |hash, key|
|
30
|
+
if self.class.defaults.has_key?(key)
|
31
|
+
return hash[key] = self.class.defaults[key]
|
32
|
+
else
|
33
|
+
return nil
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
# a cautious store, which only saves key=value if
|
39
|
+
# there was not already a value for #key. Can be used
|
40
|
+
# to set settings that can be overridden on command line,
|
41
|
+
# or general first-set-wins settings.
|
42
|
+
def provide(key, value)
|
43
|
+
unless has_key? key
|
44
|
+
store(key, value)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# reverse_merge copied from ActiveSupport, pretty straightforward,
|
49
|
+
# modified to make sure we return a Settings
|
50
|
+
def reverse_merge(other_hash)
|
51
|
+
self.class.new(other_hash).merge(self)
|
52
|
+
end
|
53
|
+
|
54
|
+
def reverse_merge!(other_hash)
|
55
|
+
replace(reverse_merge(other_hash))
|
56
|
+
end
|
57
|
+
|
58
|
+
def fill_in_defaults!
|
59
|
+
self.reverse_merge!(self.class.defaults)
|
60
|
+
end
|
61
|
+
|
62
|
+
def self.defaults
|
63
|
+
@@defaults ||= {
|
64
|
+
"reader_class_name" => "Traject::Marc4JReader",
|
65
|
+
"writer_class_name" => "Traject::SolrJWriter",
|
66
|
+
"marc_source.type" => "binary",
|
67
|
+
"marc4j_reader.permissive" => true,
|
68
|
+
"marc4j_reader.source_encoding" => "BESTGUESS",
|
69
|
+
"solrj_writer.batch_size" => 200,
|
70
|
+
"solrj_writer.thread_pool" => 1,
|
71
|
+
"processing_thread_pool" => 3
|
72
|
+
}
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|