traject 2.0.0-java

Files changed (104)
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
data/doc/indexing_rules.md ADDED
@@ -0,0 +1,265 @@
# Details on Traject Indexing: from custom logic to Macros

Traject macros are a way of providing re-usable index mapping rules. Before we discuss how they work, we need to remind ourselves of the basic/direct Traject `to_field` indexing method.

## How direct indexing logic works

Here's the simplest possible direct Traject mapping logic, duplicating the effects of the `literal` macro:

~~~ruby
to_field("title") do |record, accumulator, context|
  accumulator << "FIXED LITERAL"
end
~~~

That `do` is just ruby `block` syntax, whereby we can pass a block of ruby code as an argument to a ruby method. We pass a block taking three arguments, labeled `record`, `accumulator`, and `context`, to the `to_field` method. The third `context` argument is optional: include it in your block's argument list or not, depending on whether you want to use it.

The block is then stored by the Traject::Indexer, and called for each record indexed, with the three arguments provided.

### record argument

The record that gets passed to your block is a MARC::Record object (or, theoretically, any object returned by a traject Reader). Your logic will usually examine the record to calculate the desired output.
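
For instance, here's a minimal hand-rolled sketch that reads the title proper straight off the record with the ruby-marc API (in practice you'd usually reach for the `extract_marc` macro; the nil checks are there because not every record has a 245):

~~~ruby
to_field("title") do |record, accumulator|
  field = record['245']                      # a MARC::DataField, or nil
  accumulator << field['a'] if field && field['a']
end
~~~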

### accumulator argument

The accumulator argument is an array. At the end of your custom code, the accumulator
array should hold the output you want to send off to the field specified in the `to_field`.

The accumulator is a reference to a ruby array, and you need to **modify** that array,
manipulating it in place with Array methods that mutate the array, like `concat`, `<<`,
`map!` or even `replace`.

You can't simply assign the accumulator variable to a different array; that won't work.
You need to modify the array in place.

    # Won't work, assigning variable
    to_field('foo') do |rec, acc|
      acc = ["some constant"] # WRONG!
    end

    # Won't work, assigning variable
    to_field('foo') do |rec, acc|
      acc << 'bill'
      acc << 'dueber'
      acc = acc.map{|str| str.upcase}
    end # WRONG! WRONG! WRONG! WRONG! WRONG!


    # Instead, modify the array in place
    to_field('foo') {|rec, acc| acc << "some constant" }
    to_field('foo') do |rec, acc|
      acc << 'bill'
      acc << 'dueber'
      acc.map!{|str| str.upcase} # notice "map!", not just "map"
    end

### context argument

The third, optional `context` argument is a
[Traject::Indexer::Context](./lib/traject/indexer/context.rb) ([rdoc](http://rdoc.info/github/traject-project/traject/Traject/Indexer/Context))
object. Most of the time you don't need it, but you can use it for
some sophisticated functionality, for example using these Context methods (a sketch putting several of them together follows the list):

* `context.clipboard` A hash into which you can stuff values that you want to pass from one indexing step to another. For example, if you go through a bunch of work to query a database and get a result you'll need more than once, stick the results somewhere in the clipboard. This clipboard is record-specific, and won't persist between records.
* `context.position` The position of the record in the input file (e.g., was it the first record, second, etc.). Useful for error reporting.
* `context.output_hash` A hash mapping the field names (generally defined in `to_field` calls) to an array of values to be sent to the writer associated with that field. This allows you to modify what goes to the writer without going through a `to_field` call -- you can just set `context.output_hash['myfield'] = ['my', 'values']` and you're set. See below for more examples.
* `context.skip!(msg)` An assertion that this record should be ignored. No more indexing steps will be called, no results will be sent to the writer, and a `debug`-level log message will be written stating that the record was skipped.
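
Putting a few of those together, here's a hedged sketch -- `expensive_lookup` and its `titles` accessor are hypothetical stand-ins for whatever costly per-record work you might want to cache:

~~~ruby
to_field("related_titles") do |record, accumulator, context|
  # do the expensive work at most once per record, then reuse it from the clipboard
  context.clipboard[:lookup] ||= expensive_lookup(record)   # hypothetical helper
  result = context.clipboard[:lookup]

  if result.nil?
    context.skip!("No lookup result for record ##{context.position}")
  else
    accumulator.concat(result.titles)
  end
end
~~~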


## Gotcha: Use closures to make your code more efficient

A _closure_ is a computer-science term that means "a piece of code
that remembers all the variables that were in scope when it was
created." In ruby, lambdas and blocks are closures. Method definitions
are not, as most of us have run across much to our chagrin.

Within the context of `traject`, this means you can define a variable
outside of a `to_field` or `each_record` block and it will be available
inside those blocks. And you only have to define it once.

That's useful for any object that is even a bit expensive
to create -- we can maximize the performance of our traject
indexing by creating those objects once outside the block,
instead of inside the block, where they would be created
once per record (every time the block is executed).

Compare:

```ruby
# Create the transformer for every single record
to_field 'normalized_title' do |rec, acc|
  transformer = My::Custom::Format::Transformer.new # Oh no! I'm doing this for each of my 10M records!
  acc << transformer.transform(rec['245'].value)
end

# Create the transformer exactly once
transformer = My::Custom::Format::Transformer.new # Ahhh. Do it once.
to_field 'normalized_title' do |rec, acc|
  acc << transformer.transform(rec['245'].value)
end
```

Certain built-in traject calls have been optimized to be high performance,
though, so it's safe to do them inside 'inner loop' blocks.
That includes `Traject::TranslationMap.new` and `Traject::MarcExtractor.cached("xxx")`
(note `#cached` rather than `#new` there).
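
For example, a minimal sketch using those two helpers inside a block (the extraction spec and the bundled `marc_languages` map are illustrative; adapt them to your data):

~~~ruby
to_field "language_facet" do |record, accumulator|
  # Both calls are documented above as safe to use per-record.
  extractor = Traject::MarcExtractor.cached("008[35-37]:041a")
  accumulator.concat extractor.extract(record)
  Traject::TranslationMap.new("marc_languages").translate_array!(accumulator)
end
~~~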


## From block to lambda

In the ruby language, in addition to creating a code block as an argument
to a method with `do |args| ... end` or `{|arg| ... }`, we can also create
a code block to hold in a variable, with the `lambda` keyword:

    always_output_foo = lambda do |record, accumulator|
      accumulator << "FOO"
    end

traject's `to_field` is written so that, as a convenience, it can take a lambda
stored in a variable as an alternative to a block:

    to_field("always_has_foo", always_output_foo)

Why is this a convenience? Well, ordinarily it's not something we
need, but in fact it's what allows traject 'macros' as re-usable
code templates.


## Macros

A Traject macro is a way to automatically create indexing rules via re-usable "templates".

Traject macros are simply methods that return ruby lambda/proc objects, possibly creating
them based on parameters passed in.

Here is in fact how the `literal` macro is implemented:

~~~ruby
def literal(value)
  return lambda do |record, accumulator, context|
    # because a lambda is a closure, we can define it in terms
    # of the 'value' from the scope it's defined in!
    accumulator << value
  end
end

to_field("something", literal("something"))
~~~

It's really as simple as that; that's all a Traject macro is: a method that takes parameters and, based on those parameters, returns a lambda. The lambda is then passed to the `to_field` indexing method, or similar methods.
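
Here's another minimal, hypothetical macro, just to show a parameter being baked into the returned lambda (the field and label names are made up):

~~~ruby
def record_position(label = "record")
  lambda do |record, accumulator, context|
    accumulator << "#{label} ##{context.position}"
  end
end

to_field "source_position", record_position("batch")
~~~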

How do you make these methods available to the indexer?

Define it in a module:

~~~ruby
# in a file literal_macro.rb
module LiteralMacro
  def literal(value)
    return lambda do |record, accumulator, context|
      # because a lambda is a closure, we can define it in terms
      # of the 'value' from the scope it's defined in!
      accumulator << value
    end
  end
end
~~~

And then use ordinary ruby `require` and `extend` to add it to the current Indexer, by simply including this
in one of your config files:

~~~
require 'literal_macro.rb'
extend LiteralMacro

to_field ...
~~~

That's it. You can use the traject command line `-I` option to set the ruby load path, so your file will be findable via `require`. Or you can distribute it in a gem, and use straight rubygems and the `gem` command in your configuration file, or Bundler with the traject command-line `-g` option.
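
For example, assuming `literal_macro.rb` lives in a local `./index_lib` directory (directory and file names here are illustrative):

~~~bash
traject -I ./index_lib -c my_config.rb catalog_records.mrc
~~~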

## Using a lambda _and_ a block

Traject macros (such as `extract_marc`) create and return a lambda. If
you include a lambda _and_ a block on a `to_field` call, the latter
gets the accumulator as it was filled in by the former.

```ruby
# Get the titles and lowercase them
to_field 'lc_title', extract_marc('245') do |rec, acc, context|
  acc.map!{|title| title.downcase}
end

# Build my own lambda and use it
mylam = lambda {|rec, acc| acc << 'one'} # just add a constant
to_field('foo', mylam) do |rec, acc, context|
  acc << 'two'
end #=> context.output_hash['foo'] == ['one', 'two']


# You might also want to do something like this
to_field('foo', my_macro_that_does_not_dedup) do |rec, acc|
  acc.uniq!
end
```

## Manipulating `context.output_hash` directly

If you ask for the context argument, a [Traject::Indexer::Context](./lib/traject/indexer/context.rb) ([rdoc](http://rdoc.info/gems/traject/Traject/Indexer/Context)), you have access to `context.output_hash`, which is
the hash of transformed output that will be sent to Solr (or any other Writer).

You can look in there to see any already-transformed output and use it as the source
for new output. You can also *write* to it manually, which can be useful
for routines that affect more than one output field at once.

**Note**: Make sure you always assign an _array_ to, e.g., `context.output_hash['foo']`, not a single value!
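
A short sketch of that idea, deriving a sort title from output an earlier `to_field('title')` step already produced (field names are illustrative):

~~~ruby
each_record do |record, context|
  titles = context.output_hash['title'] || []
  # always assign an array, never a bare string
  context.output_hash['title_sorted'] = titles.map { |t| t.downcase.sub(/\A(a|an|the)\s+/, '') }
end
~~~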


## each_record

All the previous discussion was in terms of `to_field` -- `each_record` is a similar
routine for defining logic that is executed for each record, but it isn't fixed to write
to a single output field.

So `each_record` blocks have no `accumulator` argument; instead they take either a single
`record` argument, or both a `record` and a `context`.

`each_record` can be used for logging or notifying, computing intermediate
results, or writing to more than one field at once.

~~~ruby
each_record do |record, context|
  if is_it_bad?(record)
    context.skip!("Skipping bad record")
  else
    context.clipboard[:expensive_result] = calculate_expensive_thing(record)
  end
end

each_record do |record, context|
  (one, two) = calculate_two_things_from(record)

  context.output_hash["first_field"] ||= []
  context.output_hash["first_field"] << one

  context.output_hash["second_field"] ||= []
  context.output_hash["second_field"] << two
end
~~~

traject doesn't come with any macros written for use with
`each_record`, but they could be created if useful --
just methods that return lambdas taking the right
args for `each_record`.
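
Such a macro might look like this hypothetical sketch, assuming `each_record` -- like `to_field` -- accepts a lambda in place of a block:

~~~ruby
def flag_position_milestones(every = 10_000)
  lambda do |record, context|
    $stderr.puts "Reached record ##{context.position}" if (context.position % every).zero?
  end
end

each_record flag_position_milestones(5_000)
~~~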

## More tips and gotchas about indexing steps

* **All your `to_field` and `each_record` steps are run _in the order in which they were initially evaluated_**. That means that the order in which your config files are loaded can potentially make a difference if you're stuffing things into the context clipboard or the like.

* **`to_field` can be called multiple times on the same field name.** If you call the same field name multiple times, all the values will be sent to the writer.

* **Once you call `context.skip!(msg)`, no more index steps will be run for that record**. So if you have any cleanup code, you'll need to make sure to call it yourself.

* **By default, `traject` indexing runs multi-threaded**. In the current implementation, the indexing steps for one record are *not* split across threads, but different records can be processed simultaneously by more than one thread. That means you need to make sure your code is thread-safe (or always set `processing_thread_pool` to 0). See the sketch below.
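
A minimal illustration of that last point, protecting shared mutable state with a Mutex (the counter is purely illustrative):

~~~ruby
# Defined once in the config file, shared across records -- so guard mutation,
# since records may be processed concurrently on several threads.
record_count = 0
count_lock   = Mutex.new

each_record do |record, context|
  count_lock.synchronize { record_count += 1 }
end
~~~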

data/doc/other_commands.md ADDED
@@ -0,0 +1,47 @@
# Other traject command-line commands

The traject command line supports a few other miscellaneous commands with
the `-x command` switch. The usual traject invocation is actually
the `process` command: `traject -x process ...` is the same as leaving out
the `-x process`.

## Commit

`traject -x commit` will send a 'commit' message to the Solr server
specified in the setting `solr.url`. Other parts of the configuration will
be ignored, but don't hurt.

    traject -x commit -s solr.url=http://some.com/solr

Or with a config file that includes a solr.url setting:

    traject -x commit -c config_file.rb

## marcout

The `marcout` command will skip all processing/mapping, and simply
serialize marc out to a file stream.

This is mainly useful when you're using a custom reader to read
marc from a database or the like, but it could also be used to
convert marc from one format to another.

It will write to stdout, or to the file named in the `output_file` setting (`-o` shortcut).

Set the `marcout.type` setting to 'xml' or 'binary' for the type of output,
or to 'human' for a human-readable display of marc (not meant for
machine readability, but good for manual diagnostics).

If outputting binary, you can set `marcout.allow_oversized` to
true or false (boolean or string) to pass that on to the MARC::Writer.
If set to true, oversized MARC records can still be serialized,
with length bytes zero'd out -- technically illegal, but readable
by MARC::Reader in permissive mode.
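
For example (file names illustrative):

~~~bash
traject -x marcout -s marcout.allow_oversized=true records.mrc -o out.mrc
~~~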

If you have MARC-XML *input*, you need to
set the `marc_source.type` setting to 'xml':

~~~bash
traject -x marcout somefile.marc -o output.xml -s marcout.type=xml
traject -x marcout -s marc_source.type=xml somefile.xml -c configuration.rb
~~~

data/doc/settings.md ADDED
@@ -0,0 +1,101 @@

# Traject settings

Traject settings are a flat list of key/value pairs -- a single
Hash, not nested. Keys are always strings, and dots (".") can be
used for grouping and namespacing.

Values are usually strings, but occasionally something else. String values can easily
be set via the command line.

Settings can be set in configuration files, usually like:

~~~ruby
settings do
  provide "key", "value"
end
~~~

or on the command line: `-s key=value`. There are also some command-line shortcuts
for commonly used settings; see `traject -h`.

`provide` will only set the key if it was previously unset, so the first setting 'wins' -- and command-line
settings are applied first of all. It's recommended you use `provide`.

`store` is also available, and forces setting of the new value, overriding any previous value set.
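
For instance (values illustrative):

~~~ruby
settings do
  provide "solr.url", "http://localhost:8983/solr/catalog"   # only used if -s solr.url=... wasn't given
  store   "processing_thread_pool", 3                        # always takes effect, overriding earlier values
end
~~~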

## Known settings

* `debug_ascii_progress`: true/'true' to print ascii characters to STDERR indicating progress. Note,
  yes, this is fixed to STDERR, regardless of your logging setup.
  * `.` for every batch of records read and parsed
  * `^` for every batch of records batched and queued for adding to solr
    (possibly in thread pool)
  * `%` for completion of a Solr 'add'
  * `!` when the threadpool for solr adds has a full queue, so the solr add is
    going to happen in the calling thread -- means solr adding can't
    keep up with production.

* `json_writer.pretty_print`: used by the JsonWriter; if set to true, will output pretty-printed json (with added whitespace) for easier human readability. Default false.

* `log.file`: filename to send logging to, or 'STDOUT' or 'STDERR' for those streams. Default STDERR.

* `log.error_file`: Default nil; if set, then all log lines of ERROR and higher will be _additionally_
  sent to the error file named.

* `log.format`: Formatting string used by the Yell logger. https://github.com/rudionrails/yell/wiki/101-formatting-log-messages

* `log.level`: Log this level and above. Default 'info'; set to e.g. 'debug' to get potentially more logging info,
  or 'error' to get less. https://github.com/rudionrails/yell/wiki/101-setting-the-log-level

* `log.batch_size`: If set to a number N (or string representation), will output a progress line to the
  log every N records. (By default as INFO, but see log.batch_size.severity.)

* `log.batch_size.severity`: If `log.batch_size` is set, what logger severity level to log at. Default "INFO"; set to "DEBUG" etc. if desired.

* `marc_source.type`: default 'binary'. Can also be set to 'xml' or (not yet implemented) 'json'. Command-line shortcut `-t`.

* `marcout.allow_oversized`: Used with the `-x marcout` command when outputting
  as ISO 2709 binary; set to true or the string "true", and the MARC::Writer will have
  allow_oversized=true set, allowing oversized records to be serialized with length
  bytes zero'd out -- technically illegal, but readable by MARC::Reader in permissive mode.

* `output_file`: Output file to write to, for operations that write to files: for instance the `marcout` command,
  or Writer classes that write to files, like Traject::JsonWriter. Has a shortcut
  `-o` on the command line.

* `processing_thread_pool`: Number of threads in the main thread pool used for processing
  records with indexing rules. On JRuby or Rubinius, defaults to 1 less than the number of processors detected on your machine. On other ruby platforms, defaults to 1. Set to 0 or nil
  to disable the thread pool and do all processing in the main thread.

  Choose a pool size based on the size of your machine and the complexity of your indexing rules; you
  might want to try different sizes and measure which works best for you.
  There is probably no reason for it ever to be more than the number of cores on the indexing machine.

* `reader_class_name`: a Traject Reader class, used by the indexer as a source
  of records. Defaults to Traject::Marc4JReader (using the Java Marc4J
  library) on JRuby; Traject::MarcReader (using the ruby marc gem) otherwise.
  Command-line shortcut `-r`.

* `solr.url`: URL to connect to a solr instance for indexing, e.g. http://example.org:8983/solr . Command-line shortcut `-u`.

* `solr.version`: Set to e.g. "1.4.0", "4.3.0"; currently unused, but in the future will control
  some default settings, and/or sanity-check and warn you if you're doing something
  that might not work with that version of solr. Set it now for help in the future.

* `solr_writer.batch_size`: size of the batches that SolrJsonWriter will send docs to Solr in. Default 100. Set to nil,
  0, or 1, and SolrJsonWriter will do one http transaction per document, with no batching.

* `solr_writer.commit_on_close`: default false; set to true to have the solr writer send an explicit commit message to Solr after indexing.

* `solr_writer.thread_pool`: Defaults to 1 (a single background thread). A thread pool is used for submitting docs
  to solr. Set to 0 or nil to disable threading. Set to 1,
  there will still be a single background thread doing the adds.
  It may make sense to set this higher than the number of cores on your
  indexing machine, as these threads will mostly be waiting
  on Solr; the speed/capacity of your solr might be more relevant.
  Note that processing_thread_pool threads can end up submitting
  to solr too, if the solr_writer.thread_pool queue is full.

* `writer_class_name`: a Traject Writer class, used by the indexer to send processed dictionaries off. Default Traject::SolrJsonWriter; other writers for debugging or writing to files are also available. See Traject::Indexer for more info. Command-line shortcut `-w`.
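
Pulling several of those keys together, a typical config file's settings block might look like this (values illustrative):

~~~ruby
settings do
  provide "reader_class_name", "Traject::MarcReader"
  provide "marc_source.type", "xml"
  provide "solr.url", "http://localhost:8983/solr/catalog"
  provide "solr_writer.commit_on_close", "true"
  provide "log.batch_size", 10_000
end
~~~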

data/lib/tasks/load_maps.rake ADDED
@@ -0,0 +1,48 @@
require 'net/http'
require 'open-uri'


namespace :load_maps do

  desc "Load MARC geo codes by screen-scraping LC"
  task :marc_geographic do
    begin
      require 'nokogiri'
    rescue LoadError => e
      $stderr.puts "\n load_maps:marc_geographic task requires nokogiri"
      $stderr.puts " Try `gem install nokogiri` and try again. Exiting...\n\n"
      exit 1
    end

    source_url = "http://www.loc.gov/marc/geoareas/gacs_code.html"

    filename = ENV["OUTPUT_TO"] || File.expand_path("../../translation_maps/marc_geographic.yaml", __FILE__)
    file = File.open( filename, "w:utf-8" )

    $stderr.puts "Writing to `#{filename}` ..."

    html = Nokogiri::HTML(open(source_url).read)

    file.puts "# Translation map for marc geographic codes constructed by `rake load_maps:marc_geographic` task"
    file.puts "# Scraped from #{source_url} at #{Time.now}"
    file.puts "# Intentionally includes discontinued codes."

    file.puts "\n"
    html.css("tr").each do |line|
      code = line.css("td.code").inner_text.strip
      unless code.nil? || code.empty?
        code.gsub!(/^\-/, '') # treat discontinued code like any other

        label = line.css("td[2]").inner_text.strip

        label.gsub!(/\n */, ' ') # get rid of newlines that file now sometimes contains, bah.
        label.gsub!("'", "''") # yaml escapes single-quotes by doubling them, weird but true.

        file.puts "'#{code}': '#{label}'"
      end
    end
    $stderr.puts "Done."
  end
end