traject 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
@@ -0,0 +1,265 @@
1
+ # Details on Traject Indexing: from custom logic to Macros
2
+
3
+ Traject macros are a way of providing re-usable index mapping rules. Before we discuss how they work, we need to remind ourselves of the basic/direct Traject `to_field` indexing method.
4
+
5
+ ## How direct indexing logic works
6
+
7
+ Here's the simplest possible direct Traject mapping logic, duplicating the effects of the `literal` macro:
8
+
9
+ ~~~ruby
10
+ to_field("title") do |record, accumulator, context|
11
+ accumulator << "FIXED LITERAL"
12
+ end
13
+ ~~~
14
+
15
+ That `do` is just ruby `block` syntax, whereby we can pass a block of ruby code as an argument to a ruby method. We pass a block taking three arguments, labeled `record`, `accumulator`, and `context`, to the `to_field` method. The third 'context' object is optional, you can define it in your block or not, depending on if you want to use it.
16
+
17
+ The block is then stored by the Traject::Indexer, and called for each record indexed, with three arguments provided.
18
+
19
+ #### record argument
20
+
21
+ The record that gets passed to your block is a MARC::Record object (or, theoretically, any object that gets returned by a traject Reader). Your logic will usually examine the record to calculate the desired output.
22
+
23
+ ### accumulator argument
24
+
25
+ The accumulator argument is an array. At the end of your custom code, the accumulator
26
+ array should hold the output you want to send off, to the field specified in the `to_field`.
27
+
28
+ The accumulator is a reference to a ruby array, and you need to **modify** that array,
29
+ manipulating it in place with Array methods that mutate the array, like `concat`, `<<`,
30
+ `map!` or even `replace`.
31
+
32
+ You can't simply assign the accumulator variable to a different array, that won't work,
33
+ you need to modify the array in-place.
34
+
35
+ # Won't work, assigning variable
36
+ to_field('foo') do |rec, acc|
37
+ acc = ["some constant"] # WRONG!
38
+ end
39
+
40
+ # Won't work, assigning variable
41
+ to_field('foo') do |rec, acc|
42
+ acc << 'bill'
43
+ acc << 'dueber'
44
+ acc = acc.map{|str| str.upcase}
45
+ end # WRONG! WRONG! WRONG! WRONG! WRONG!
46
+
47
+
48
+ # Instead, do, modify array in place
49
+ to_field('foo') {|rec, acc| acc << "some constant" }
50
+ to_field('foo') do |rec, acc|
51
+ acc << 'bill'
52
+ acc << 'dueber'
53
+ acc.map!{|str| str.upcase} #notice using "map!" not just "map"
54
+ end
55
+
56
+ ### context argument
57
+
58
+ The third optional context argument
59
+
60
+ The third optional argument is a
61
+ [Traject::Indexer::Context](./lib/traject/indexer/context.rb) ([rdoc](http://rdoc.info/github/traject-project/traject/Traject/Indexer/Context))
62
+ object. Most of the time you don't need it, but you can use it for
63
+ some sophisticated functionality, for example using these Context methods:
64
+
65
+ * `context.clipboard` A hash into which you can stuff values that you want to pass from one indexing step to another. For example, if you go through a bunch of work to query a database and get a result you'll need more than once, stick the results somewhere in the clipboard. This clipboard is record-specific, and won't persist between records.
66
+ * `context.position` The position of the record in the input file (e.g., was it the first record, second, etc.). Useful for error reporting
67
+ * `context.output_hash` A hash mapping the field names (generally defined in `to_field` calls) to an array of values to be sent to the writer associated with that field. This allows you to modify what goes to the writer without going through a `to_field` call -- you can just set `context.output_hash['myfield'] = ['my', 'values']` and you're set. See below for more examples
68
+ * `context.skip!(msg)` An assertion that this record should be ignored. No more indexing steps will be called, no results will be sent to the writer, and a `debug`-level log message will be written stating that the record was skipped.
69
+
70
+
71
+ ## Gotcha: Use closures to make your code more efficient
72
+
73
+ A _closure_ is a computer-science term that means "a piece of code
74
+ that remembers all the variables that were in scope when it was
75
+ created." In ruby, lambdas and blocks are closures. Method definitions
76
+ are not, which most of us have run across much to our chagrin.
77
+
78
+ Within the context of `traject`, this means you can define a variable
79
+ outside of a `to_field` or `each_record` block and it will be available
80
+ inside those blocks. And you only have to define it once.
81
+
82
+ That's useful to do for any object that is even a bit expensive
83
+ to create -- we can maximize the performance of our traject
84
+ indexing by creating those objects once outside the block,
85
+ instead of inside the block where it will be created
86
+ once per-record (every time the block is executed):
87
+
88
+ Compare:
89
+
90
+ ```ruby
91
+ # Create the transformer for every single record
92
+ to_field 'normalized_title' do |rec, acc|
93
+ transformer = My::Custom::Format::Transformer.new # Oh no! I'm doing this for each of my 10M records!
94
+ acc << transformer.transform(rec['245'].value)
95
+ end
96
+
97
+ # Create the transformer exactly once
98
+ transformer = My::Custom::Format::Transformer.new # Ahhh. Do it once.
99
+ to_field 'normalized_title' do |rec, acc|
100
+ acc << transformer.transform(rec['245'].value)
101
+ end
102
+ ```
103
+
104
+ Certain built-in traject calls have been optimized to be high performance
105
+ so it's safe to do them inside 'inner loop' blocks though.
106
+ That includes `Traject::TranslationMap.new` and `Traject::MarcExtractor.cached("xxx")`
107
+ (note #cached rather than #new there)
108
+
109
+
110
+ ## From block to lambda
111
+
112
+ In the ruby language, in addition to creating a code block as an argument
113
+ to a method with `do |args| ... end` or `{|arg| ... }`, we can also create
114
+ a code block to hold in a variable, with the `lambda` keyword:
115
+
116
+ always_output_foo = lambda do |record, accumulator|
117
+ accumulator << "FOO"
118
+ end
119
+
120
+ traject `to_field` is written so, as a convenience, it can take a lambda expression
121
+ stored in a variable as an alternative to a block:
122
+
123
+ to_field("always_has_foo", always_output_foo)
124
+
125
+ Why is this a convenience? Well, ordinarily it's not something we
126
+ need, but in fact it's what allows traject 'macros' as re-useable
127
+ code templates.
128
+
129
+
130
+ ## Macros
131
+
132
+ A Traject macro is a way to automatically create indexing rules via re-usable "templates".
133
+
134
+ Traject macros are simply methods that return ruby lambda/proc objects, possibly creating
135
+ them based on parameters passed in.
136
+
137
+ Here is in fact how the `literal` function is implemented:
138
+
139
+ ~~~ruby
140
+ def literal(value)
141
+ return lambda do |record, accumulator, context|
142
+ # because a lambda is a closure, we can define it in terms
143
+ # of the 'value' from the scope it's defined in!
144
+ accumulator << value
145
+ end
146
+ end
147
+ to_field("something", literal("something"))
148
+ ~~~
149
+
150
+ It's really as simple as that, that's all a Traject macro is. A function that takes parameters, and based on those parameters returns a lambda; the lambda is then passed to the `to_field` indexing method, or similar methods.
151
+
152
+ How do you make these methods available to the indexer?
153
+
154
+ Define it in a module:
155
+
156
+ ~~~ruby
157
+ # in a file literal_macro.rb
158
+ module LiteralMacro
159
+ def literal(value)
160
+ return lambda do |record, accumulator, context|
161
+ # because a lambda is a closure, we can define it in terms
162
+ # of the 'value' from the scope it's defined in!
163
+ accumulator << value
164
+ end
165
+ end
166
+ end
167
+ ~~~
168
+
169
+ And then use ordinary ruby `require` and `extend` to add it to the current Indexer file, by simply including this
170
+ in one of your config files:
171
+
172
+ ~~~
173
+ require 'literal_macro.rb'
174
+ extend LiteralMacro
175
+
176
+ to_field ...
177
+ ~~~
178
+
179
+ That's it. You can use the traject command line `-I` option to set the ruby load path, so your file will be findable via `require`. Or you can distribute it in a gem, and use straight rubygems and the `gem` command in your configuration file, or Bundler with traject command-line `-g` option.
180
+
181
+ ## Using a lambda _and_ a block
182
+
183
+ Traject macros (such as `extract_marc`) create and return a lambda. If
184
+ you include a lambda _and_ a block on a `to_field` call, the latter
185
+ gets the accumulator as it was filled in by the former.
186
+
187
+ ```ruby
188
+ # Get the titles and lowercase them
189
+ to_field 'lc_title', extract_marc('245') do |rec, acc, context|
190
+ acc.map!{|title| title.downcase}
191
+ end
192
+
193
+ # Build my own lambda and use it
194
+ mylam = lambda {|rec, acc| acc << 'one'} # just add a constant
195
+ to_field('foo', mylam) do |rec, acc, context|
196
+ acc << 'two'
197
+ end #=> context.output_hash['foo'] == ['one', 'two']
198
+
199
+
200
+ # You might also want to do something like this
201
+
202
+ to_field('foo', my_macro_that_doesnt_dedup) do |rec, acc|
203
+ acc.uniq!
204
+ end
205
+ ```
206
+
207
+ ## Manipulating `context.output_hash` directly
208
+
209
+ If you ask for the context argument, a [Traject::Indexer::Context](./lib/traject/indexer/context.rb) ([rdoc](http://rdoc.info/gems/traject/Traject/Indexer/Context)), you have access to context.output_hash, which is
210
+ the hash of transformed output that will be sent to Solr (or any other Writer)
211
+
212
+ You can look in there to see any already transformed output and use it as the source
213
+ for new output. You can actually *write* to there manually, which can be useful
214
+ to write routines that affect more than one output field at once.
215
+
216
+ **Note**: Make sure you always assign an _array_ to, e.g., `context.output_hash['foo']`, not a single value!
217
+
218
+
219
+
220
+ ## each_record
221
+
222
+ All the previous discussion was in terms of `to_field` -- `each_record` is a similar
223
+ routine, to define logic that is executed for each record, but isn't fixed to write
224
+ to a single output field.
225
+
226
+ So `each_record` blocks have no `accumulator` argument, instead they either take a single
227
+ `record` argument; or both a `record` and a `context`.
228
+
229
+ `each_record` can be used for logging or notifying; computing intermediate
230
+ results; or writing to more than one field at once.
231
+
232
+ ~~~ruby
233
+ each_record do |record, context|
234
+ if is_it_bad?(record)
235
+ context.skip!("Skipping bad record")
236
+ else
237
+ context.clipboard[:expensive_result] = calculate_expensive_thing(record)
238
+ end
239
+ end
240
+
241
+ each_record do |record, context|
242
+ (one, two) = calculate_two_things_from(record)
243
+
244
+ context.output_hash["first_field"] ||= []
245
+ context.output_hash["first_field"] << one
246
+
247
+ context.output_hash["second_field"] ||= []
248
+ context.output_hash["second_field"] << two
249
+ end
250
+ ~~~
251
+
252
+ traject doesn't come with any macros written for use with
253
+ `each_record`, but they could be created if useful --
254
+ just methods that return lambda's taking the right
255
+ args for `each_record`.
256
+
257
+ ## More tips and gotchas about indexing steps
258
+
259
+ * **All your `to_field` and `each_record` steps are run _in the order in which they were initially evaluated_**. That means that the order you call your config files can potentially make a difference if you're screwing around stuffing stuff into the context clipboard or whatnot.
260
+
261
+ * **`to_field` can be called multiple times on the same field name.** If you call the same field name multiple times, all the values will be sent to the writer.
262
+
263
+ * **Once you call `context.skip!(msg)` no more index steps will be run for that record**. So if you have any cleanup code, you'll need to make sure to call it yourself.
264
+
265
+ * **By default, `traject` indexing runs multi-threaded**. In the current implementation, the indexing steps for one record are *not* split across threads, but different records can be processed simultaneously by more than one thread. That means you need to make sure your code is thread-safe (or always set `processing_thread_pool` to 0).
@@ -0,0 +1,47 @@
1
+ # Other traject command-line commands
2
+
3
+ The traject command line supports a few other miscellaneous commands with
4
+ the "-x command" switch. The usual traject command line is actually
5
+ the `process` command, `traject -x process ...` is the same as leaving out
6
+ the `-x process`.
7
+
8
+ ## Commit
9
+
10
+ `traject -x commit` will send a 'commit' message to the Solr server
11
+ specified in setting `solr.url`. Other parts of configuration will
12
+ be ignored, but don't hurt.
13
+
14
+ traject -x commit -s solr.url=http://some.com/solr
15
+
16
+ Or with a config file that includes a solr.url setting:
17
+
18
+ traject -x commit -c config_file.rb
19
+
20
+ ## marcout
21
+
22
+ The `marcout` command will skip all processing/mapping, and simply
23
+ serialize marc out to a file stream.
24
+
25
+ This is mainly useful when you're using a custom reader to read
26
+ marc from a database or something, but could also be used to
27
+ convert marc from one format to another or something.
28
+
29
+ Will write to stdout, or set the `output_file` setting (`-o` shortcut).
30
+
31
+ Set the `marcout.type` setting to 'xml' or 'binary' for type of output.
32
+ Or to `human` for human readable display of marc (that is not meant for
33
+ machine readability, but can be good for manual diagnostics.)
34
+
35
+ If outputting type binary, setting `marcout.allow_oversized` to
36
+ true or false (boolean or string), to pass that to the MARC::Writer.
37
+ If set to true, then oversized MARC records can still be serialized,
38
+ with length bytes zero'd out -- technically illegal, but can
39
+ be read by MARC::Reader in permissive mode.
40
+
41
+ If you have MARC-XML *input*, you need to
42
+ set the `marc_source.type` setting to XML for xml input.
43
+
44
+ ~~~bash
45
+ traject -x marcout somefile.marc -o output.xml -s marcout.type=xml
46
+ traject -x marcout -s marc_source.type=xml somefile.xml -c configuration.rb
47
+ ~~~
data/doc/settings.md ADDED
@@ -0,0 +1,101 @@
1
+ # Traject settings
2
+
3
+ Traject settings are a flat list of key/value pairs -- a single
4
+ Hash, not nested. Keys are always strings, and dots (".") can be
5
+ used for grouping and namespacing.
6
+
7
+ Values are usually strings, but occasionally something else. String values can be easily
8
+ set via the command line.
9
+
10
+ Settings can be set in configuration files, usually like:
11
+
12
+ ~~~ruby
13
+ settings do
14
+ provide "key", "value"
15
+ end
16
+ ~~~
17
+
18
+ or on the command line: `-s key=value`. There are also some command line shortcuts
19
+ for commonly used settings, see `traject -h`.
20
+
21
+ `provide` will only set the key if it was previously unset, so first time to set 'wins'. And command-line
22
+ settings are applied first of all. It's recommended you use `provide`.
23
+
24
+ `store` is also available, and forces setting of the new value overriding any previous value set.
25
+
26
+ ## Known settings
27
+
28
+ * `debug_ascii_progress`: true/'true' to print ascii characters to STDERR indicating progress. Note,
29
+ yes, this is fixed to STDERR, regardless of your logging setup.
30
+ * `.` for every batch of records read and parsed
31
+ * `^` for every batch of records batched and queued for adding to solr
32
+ (possibly in thread pool)
33
+ * `%` for completing of a Solr 'add'
34
+ * `!` when threadpool for solr add has a full queue, so solr add is
35
+ going to happen in calling queue -- means solr adding can't
36
+ keep up with production.
37
+
38
+ * `json_writer.pretty_print`: used by the JsonWriter, if set to true, will output pretty printed json (with added whitespace) for easier human readability. Default false.
39
+
40
+ * `log.file`: filename to send logging, or 'STDOUT' or 'STDERR' for those streams. Default STDERR
41
+
42
+ * `log.error_file`: Default nil, if set then all log lines of ERROR and higher will be _additionally_
43
+ sent to error file named.
44
+
45
+ * `log.format`: Formatting string used by Yell logger. https://github.com/rudionrails/yell/wiki/101-formatting-log-messages
46
+
47
+ * `log.level`: Log this level and above. Default 'info', set to eg 'debug' to get potentially more logging info,
48
+ or 'error' to get less. https://github.com/rudionrails/yell/wiki/101-setting-the-log-level
49
+
50
+ * `log.batch_size`: If set to a number N (or string representation), will output a progress line to
51
+ log. (by default as INFO, but see log.batch_size.severity)
52
+
53
+ * `log.batch_size.severity`: If `log.batch_size` is set, what logger severity level to log to. Default "INFO", set to "DEBUG" etc if desired.
54
+
55
+ * `marc_source.type`: default 'binary'. Can also set to 'xml' or (not yet implemented todo) 'json'. Command line shortcut `-t`
56
+
57
+ * `marcout.allow_oversized`: Used with `-x marcout` command to output marc when outputting
58
+ as ISO 2709 binary, set to true or string "true", and the MARC::Writer will have
59
+ allow_oversized=true set, allowing oversized records to be serialized with length
60
+ bytes zero'd out -- technically illegal, but can be read by MARC::Reader in permissive mode.
61
+
62
+ * `output_file`: Output file to write to for operations that write to files: For instance the `marcout` command,
63
+ or Writer classes that write to files, like Traject::JsonWriter. Has an shortcut
64
+ `-o` on command line.
65
+
66
+ * `processing_thread_pool` Number of threads in the main thread pool used for processing
67
+ records with input rules. On JRuby or Rubinius, defaults to 1 less than the number of processors detected on your machine. On other ruby platforms, defaults to 1. Set to 0 or nil
68
+ to disable thread pool, and do all processing in main thread.
69
+
70
+ Choose a pool size based on size of your machine, and complexity of your indexing rules, you
71
+ might want to try different sizes and measure which works best for you.
72
+ Probably no reason for it ever to be more than number of cores on indexing machine.
73
+
74
+
75
+ * `reader_class_name`: a Traject Reader class, used by the indexer as a source
76
+ of records. Defaults to Traject::Marc4JReader (using the Java Marc4J
77
+ library) on JRuby; Traject::MarcReader (using the ruby marc gem) otherwise.
78
+ Command-line shortcut `-r`
79
+
80
+ * `solr.url`: URL to connect to a solr instance for indexing, eg http://example.org:8983/solr . Command-line short-cut `-u`.
81
+
82
+ * `solr.version`: Set to eg "1.4.0", "4.3.0"; currently un-used, but in the future will control
83
+ change some default settings, and/or sanity check and warn you if you're doing something
84
+ that might not work with that version of solr. Set now for help in the future.
85
+
86
+ * `solr_writer.batch_size`: size of batches that SolrJsonWriter will send docs to Solr in. Default 100. Set to nil,
87
+ 0, or 1, and SolrJsonWriter will do one http transaction per document, no batching.
88
+
89
+ * `solr_writer.commit_on_close`: default false, set to true to have the solr writer send an explicit commit message to Solr after indexing.
90
+
91
+
92
+ * `solr_writer.thread_pool`: Defaults to 1 (single bg thread). A thread pool is used for submitting docs
93
+ to solr. Set to 0 or nil to disable threading. Set to 1,
94
+ there will still be a single bg thread doing the adds.
95
+ May make sense to set higher than number of cores on your
96
+ indexing machine, as these threads will mostly be waiting
97
+ on Solr. Speed/capacity of your solr might be more relevant.
98
+ Note that processing_thread_pool threads can end up submitting
99
+ to solr too, if solr_json_writer.thread_pool is full.
100
+
101
+ * `writer_class_name`: a Traject Writer class, used by indexer to send processed dictionaries off. Default Traject::SolrJsonWriter, other writers for debugging or writing to files are also available. See Traject::Indexer for more info. Command line shortcut `-w`
@@ -0,0 +1,48 @@
1
+ require 'net/http'
2
+ require 'open-uri'
3
+
4
+
5
+
6
+
7
+ namespace :load_maps do
8
+
9
+ desc "Load MARC geo codes by screen-scraping LC"
10
+ task :marc_geographic do
11
+ begin
12
+ require 'nokogiri'
13
+ rescue LoadError => e
14
+ $stderr.puts "\n load_maps:marc_geographic task requires nokogiri"
15
+ $stderr.puts " Try `gem install nokogiri` and try again. Exiting...\n\n"
16
+ exit 1
17
+ end
18
+
19
+ source_url = "http://www.loc.gov/marc/geoareas/gacs_code.html"
20
+
21
+ filename = ENV["OUTPUT_TO"] || File.expand_path("../../translation_maps/marc_geographic.yaml", __FILE__)
22
+ file = File.open( filename, "w:utf-8" )
23
+
24
+ $stderr.puts "Writing to `#{filename}` ..."
25
+
26
+ html = Nokogiri::HTML(open(source_url).read)
27
+
28
+ file.puts "# Translation map for marc geographic codes constructed by `rake load_maps:marc_geographic` task"
29
+ file.puts "# Scraped from #{source_url} at #{Time.now}"
30
+ file.puts "# Intentionally includes discontinued codes."
31
+
32
+ file.puts "\n"
33
+ html.css("tr").each do |line|
34
+ code = line.css("td.code").inner_text.strip
35
+ unless code.nil? || code.empty?
36
+ code.gsub!(/^\-/, '') # treat discontinued code like any other
37
+
38
+ label = line.css("td[2]").inner_text.strip
39
+
40
+ label.gsub!(/\n */, ' ') # get rid of newlines that file now sometimes contains, bah.
41
+ label.gsub!("'", "''") # yaml escapes single-quotes by doubling them, weird but true.
42
+
43
+ file.puts "'#{code}': '#{label}'"
44
+ end
45
+ end
46
+ $stderr.puts "Done."
47
+ end
48
+ end