traject 2.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
@@ -0,0 +1,265 @@
|
|
1
|
+
# Details on Traject Indexing: from custom logic to Macros
|
2
|
+
|
3
|
+
Traject macros are a way of providing re-usable index mapping rules. Before we discuss how they work, we need to remind ourselves of the basic/direct Traject `to_field` indexing method.
|
4
|
+
|
5
|
+
## How direct indexing logic works
|
6
|
+
|
7
|
+
Here's the simplest possible direct Traject mapping logic, duplicating the effects of the `literal` macro:
|
8
|
+
|
9
|
+
~~~ruby
|
10
|
+
to_field("title") do |record, accumulator, context|
|
11
|
+
accumulator << "FIXED LITERAL"
|
12
|
+
end
|
13
|
+
~~~
|
14
|
+
|
15
|
+
That `do` is just ruby `block` syntax, whereby we can pass a block of ruby code as an argument to a ruby method. We pass a block taking three arguments, labeled `record`, `accumulator`, and `context`, to the `to_field` method. The third 'context' object is optional, you can define it in your block or not, depending on if you want to use it.
|
16
|
+
|
17
|
+
The block is then stored by the Traject::Indexer, and called for each record indexed, with three arguments provided.
|
18
|
+
|
19
|
+
#### record argument
|
20
|
+
|
21
|
+
The record that gets passed to your block is a MARC::Record object (or, theoretically, any object that gets returned by a traject Reader). Your logic will usually examine the record to calculate the desired output.
|
22
|
+
|
23
|
+
### accumulator argument
|
24
|
+
|
25
|
+
The accumulator argument is an array. At the end of your custom code, the accumulator
|
26
|
+
array should hold the output you want to send off, to the field specified in the `to_field`.
|
27
|
+
|
28
|
+
The accumulator is a reference to a ruby array, and you need to **modify** that array,
|
29
|
+
manipulating it in place with Array methods that mutate the array, like `concat`, `<<`,
|
30
|
+
`map!` or even `replace`.
|
31
|
+
|
32
|
+
You can't simply assign the accumulator variable to a different array, that won't work,
|
33
|
+
you need to modify the array in-place.
|
34
|
+
|
35
|
+
# Won't work, assigning variable
|
36
|
+
to_field('foo') do |rec, acc|
|
37
|
+
acc = ["some constant"] # WRONG!
|
38
|
+
end
|
39
|
+
|
40
|
+
# Won't work, reassigning the accumulator variable at the end
|
41
|
+
to_field('foo') do |rec, acc|
|
42
|
+
acc << 'bill'
|
43
|
+
acc << 'dueber'
|
44
|
+
acc = acc.map{|str| str.upcase}
|
45
|
+
end # WRONG! WRONG! WRONG! WRONG! WRONG!
|
46
|
+
|
47
|
+
|
48
|
+
# Instead, do, modify array in place
|
49
|
+
to_field('foo') {|rec, acc| acc << "some constant" }
|
50
|
+
to_field('foo') do |rec, acc|
|
51
|
+
acc << 'bill'
|
52
|
+
acc << 'dueber'
|
53
|
+
acc.map!{|str| str.upcase} # notice using "map!" not just "map"
|
54
|
+
end
|
55
|
+
|
56
|
+
### context argument
|
57
|
+
|
58
|
+
The third optional context argument
|
59
|
+
|
60
|
+
The third optional argument is a
|
61
|
+
[Traject::Indexer::Context](./lib/traject/indexer/context.rb) ([rdoc](http://rdoc.info/github/traject-project/traject/Traject/Indexer/Context))
|
62
|
+
object. Most of the time you don't need it, but you can use it for
|
63
|
+
some sophisticated functionality, for example using these Context methods:
|
64
|
+
|
65
|
+
* `context.clipboard` A hash into which you can stuff values that you want to pass from one indexing step to another. For example, if you go through a bunch of work to query a database and get a result you'll need more than once, stick the results somewhere in the clipboard. This clipboard is record-specific, and won't persist between records.
|
66
|
+
* `context.position` The position of the record in the input file (e.g., was it the first record, second, etc.). Useful for error reporting
|
67
|
+
* `context.output_hash` A hash mapping the field names (generally defined in `to_field` calls) to an array of values to be sent to the writer associated with that field. This allows you to modify what goes to the writer without going through a `to_field` call -- you can just set `context.output_hash['myfield'] = ['my', 'values']` and you're set. See below for more examples
|
68
|
+
* `context.skip!(msg)` An assertion that this record should be ignored. No more indexing steps will be called, no results will be sent to the writer, and a `debug`-level log message will be written stating that the record was skipped.
|
69
|
+
|
70
|
+
|
71
|
+
## Gotcha: Use closures to make your code more efficient
|
72
|
+
|
73
|
+
A _closure_ is a computer-science term that means "a piece of code
|
74
|
+
that remembers all the variables that were in scope when it was
|
75
|
+
created." In ruby, lambdas and blocks are closures. Method definitions
|
76
|
+
are not, which most of us have run across much to our chagrin.
|
77
|
+
|
78
|
+
Within the context of `traject`, this means you can define a variable
|
79
|
+
outside of a `to_field` or `each_record` block and it will be available
|
80
|
+
inside those blocks. And you only have to define it once.
|
81
|
+
|
82
|
+
That's useful to do for any object that is even a bit expensive
|
83
|
+
to create -- we can maximize the performance of our traject
|
84
|
+
indexing by creating those objects once outside the block,
|
85
|
+
instead of inside the block where it will be created
|
86
|
+
once per-record (every time the block is executed):
|
87
|
+
|
88
|
+
Compare:
|
89
|
+
|
90
|
+
```ruby
|
91
|
+
# Create the transformer for every single record
|
92
|
+
to_field 'normalized_title' do |rec, acc|
|
93
|
+
transformer = My::Custom::Format::Transformer.new # Oh no! I'm doing this for each of my 10M records!
|
94
|
+
acc << transformer.transform(rec['245'].value)
|
95
|
+
end
|
96
|
+
|
97
|
+
# Create the transformer exactly once
|
98
|
+
transformer = My::Custom::Format::Transformer.new # Ahhh. Do it once.
|
99
|
+
to_field 'normalized_title' do |rec, acc|
|
100
|
+
acc << transformer.transform(rec['245'].value)
|
101
|
+
end
|
102
|
+
```
|
103
|
+
|
104
|
+
Certain built-in traject calls have been optimized to be high performance
|
105
|
+
so it's safe to do them inside 'inner loop' blocks though.
|
106
|
+
That includes `Traject::TranslationMap.new` and `Traject::MarcExtractor.cached("xxx")`
|
107
|
+
(note #cached rather than #new there)
|
108
|
+
|
109
|
+
|
110
|
+
## From block to lambda
|
111
|
+
|
112
|
+
In the ruby language, in addition to creating a code block as an argument
|
113
|
+
to a method with `do |args| ... end` or `{|arg| ... }`, we can also create
|
114
|
+
a code block to hold in a variable, with the `lambda` keyword:
|
115
|
+
|
116
|
+
always_output_foo = lambda do |record, accumulator|
|
117
|
+
accumulator << "FOO"
|
118
|
+
end
|
119
|
+
|
120
|
+
traject `to_field` is written so, as a convenience, it can take a lambda expression
|
121
|
+
stored in a variable as an alternative to a block:
|
122
|
+
|
123
|
+
to_field("always_has_foo", always_output_foo)
|
124
|
+
|
125
|
+
Why is this a convenience? Well, ordinarily it's not something we
|
126
|
+
need, but in fact it's what allows traject 'macros' as re-useable
|
127
|
+
code templates.
|
128
|
+
|
129
|
+
|
130
|
+
## Macros
|
131
|
+
|
132
|
+
A Traject macro is a way to automatically create indexing rules via re-usable "templates".
|
133
|
+
|
134
|
+
Traject macros are simply methods that return ruby lambda/proc objects, possibly creating
|
135
|
+
them based on parameters passed in.
|
136
|
+
|
137
|
+
Here is in fact how the `literal` function is implemented:
|
138
|
+
|
139
|
+
~~~ruby
|
140
|
+
def literal(value)
|
141
|
+
return lambda do |record, accumulator, context|
|
142
|
+
# because a lambda is a closure, we can define it in terms
|
143
|
+
# of the 'value' from the scope it's defined in!
|
144
|
+
accumulator << value
|
145
|
+
end
|
146
|
+
end
|
147
|
+
to_field("something", literal("something"))
|
148
|
+
~~~
|
149
|
+
|
150
|
+
It's really as simple as that, that's all a Traject macro is. A function that takes parameters, and based on those parameters returns a lambda; the lambda is then passed to the `to_field` indexing method, or similar methods.
|
151
|
+
|
152
|
+
How do you make these methods available to the indexer?
|
153
|
+
|
154
|
+
Define it in a module:
|
155
|
+
|
156
|
+
~~~ruby
|
157
|
+
# in a file literal_macro.rb
|
158
|
+
module LiteralMacro
|
159
|
+
def literal(value)
|
160
|
+
return lambda do |record, accumulator, context|
|
161
|
+
# because a lambda is a closure, we can define it in terms
|
162
|
+
# of the 'value' from the scope it's defined in!
|
163
|
+
accumulator << value
|
164
|
+
end
|
165
|
+
end
|
166
|
+
end
|
167
|
+
~~~
|
168
|
+
|
169
|
+
And then use ordinary ruby `require` and `extend` to add it to the current Indexer file, by simply including this
|
170
|
+
in one of your config files:
|
171
|
+
|
172
|
+
~~~
|
173
|
+
require 'literal_macro'
|
174
|
+
extend LiteralMacro
|
175
|
+
|
176
|
+
to_field ...
|
177
|
+
~~~
|
178
|
+
|
179
|
+
That's it. You can use the traject command line `-I` option to set the ruby load path, so your file will be findable via `require`. Or you can distribute it in a gem, and use straight rubygems and the `gem` command in your configuration file, or Bundler with traject command-line `-g` option.
|
180
|
+
|
181
|
+
## Using a lambda _and_ a block
|
182
|
+
|
183
|
+
Traject macros (such as `extract_marc`) create and return a lambda. If
|
184
|
+
you include a lambda _and_ a block on a `to_field` call, the latter
|
185
|
+
gets the accumulator as it was filled in by the former.
|
186
|
+
|
187
|
+
```ruby
|
188
|
+
# Get the titles and lowercase them
|
189
|
+
to_field 'lc_title', extract_marc('245') do |rec, acc, context|
|
190
|
+
acc.map!{|title| title.downcase}
|
191
|
+
end
|
192
|
+
|
193
|
+
# Build my own lambda and use it
|
194
|
+
mylam = lambda {|rec, acc| acc << 'one'} # just add a constant
|
195
|
+
to_field('foo', mylam) do |rec, acc, context|
|
196
|
+
acc << 'two'
|
197
|
+
end #=> context.output_hash['foo'] == ['one', 'two']
|
198
|
+
|
199
|
+
|
200
|
+
# You might also want to do something like this
|
201
|
+
|
202
|
+
to_field('foo', my_macro_that_doesnt_dedup) do |rec, acc|
|
203
|
+
acc.uniq!
|
204
|
+
end
|
205
|
+
```
|
206
|
+
|
207
|
+
## Manipulating `context.output_hash` directly
|
208
|
+
|
209
|
+
If you ask for the context argument, a [Traject::Indexer::Context](./lib/traject/indexer/context.rb) ([rdoc](http://rdoc.info/gems/traject/Traject/Indexer/Context)), you have access to context.output_hash, which is
|
210
|
+
the hash of transformed output that will be sent to Solr (or any other Writer)
|
211
|
+
|
212
|
+
You can look in there to see any already transformed output and use it as the source
|
213
|
+
for new output. You can actually *write* to there manually, which can be useful
|
214
|
+
to write routines that effect more than one output field at once.
|
215
|
+
|
216
|
+
**Note**: Make sure you always assign an _array_ to, e.g., `context.output_hash['foo']`, not a single value!
|
217
|
+
|
218
|
+
|
219
|
+
|
220
|
+
## each_record
|
221
|
+
|
222
|
+
All the previous discussion was in terms of `to_field` -- `each_record` is a similar
|
223
|
+
routine, to define logic that is executed for each record, but isn't fixed to write
|
224
|
+
to a single output field.
|
225
|
+
|
226
|
+
So `each_record` blocks have no `accumulator` argument, instead they either take a single
|
227
|
+
`record` argument; or both a `record` and a `context`.
|
228
|
+
|
229
|
+
`each_record` can be used for logging or notifying; computing intermediate
|
230
|
+
results; or writing to more than one field at once.
|
231
|
+
|
232
|
+
~~~ruby
|
233
|
+
each_record do |record, context|
|
234
|
+
if is_it_bad?(record)
|
235
|
+
context.skip!("Skipping bad record")
|
236
|
+
else
|
237
|
+
context.clipboard[:expensive_result] = calculate_expensive_thing(record)
|
238
|
+
end
|
239
|
+
end
|
240
|
+
|
241
|
+
each_record do |record, context|
|
242
|
+
(one, two) = calculate_two_things_from(record)
|
243
|
+
|
244
|
+
context.output_hash["first_field"] ||= []
|
245
|
+
context.output_hash["first_field"] << one
|
246
|
+
|
247
|
+
context.output_hash["second_field"] ||= []
|
248
|
+
context.output_hash["second_field"] << two
|
249
|
+
end
|
250
|
+
~~~
|
251
|
+
|
252
|
+
traject doesn't come with any macros written for use with
|
253
|
+
`each_record`, but they could be created if useful --
|
254
|
+
just methods that return lambda's taking the right
|
255
|
+
args for `each_record`.
|
256
|
+
|
257
|
+
## More tips and gotchas about indexing steps
|
258
|
+
|
259
|
+
* **All your `to_field` and `each_record` steps are run _in the order in which they were initially evaluated_**. That means that the order you call your config files can potentially make a difference if you're screwing around stuffing stuff into the context clipboard or whatnot.
|
260
|
+
|
261
|
+
* **`to_field` can be called multiple times on the same field name.** If you call the same field name multiple times, all the values will be sent to the writer.
|
262
|
+
|
263
|
+
* **Once you call `context.skip!(msg)` no more index steps will be run for that record**. So if you have any cleanup code, you'll need to make sure to call it yourself.
|
264
|
+
|
265
|
+
* **By default, `traject` indexing runs multi-threaded**. In the current implementation, the indexing steps for one record are *not* split across threads, but different records can be processed simultaneously by more than one thread. That means you need to make sure your code is thread-safe (or always set `processing_thread_pool` to 0).
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# Other traject command-line commands
|
2
|
+
|
3
|
+
The traject command line supports a few other miscellaneous commands with
|
4
|
+
the "-x command" switch. The usual traject command line is actually
|
5
|
+
the `process` command, `traject -x process ...` is the same as leaving out
|
6
|
+
the `-x process`.
|
7
|
+
|
8
|
+
## Commit
|
9
|
+
|
10
|
+
`traject -x commit` will send a 'commit' message to the Solr server
|
11
|
+
specified in setting `solr.url`. Other parts of configuration will
|
12
|
+
be ignored, but don't hurt.
|
13
|
+
|
14
|
+
traject -x commit -s solr.url=http://some.com/solr
|
15
|
+
|
16
|
+
Or with a config file that includes a solr.url setting:
|
17
|
+
|
18
|
+
traject -x commit -c config_file.rb
|
19
|
+
|
20
|
+
## marcout
|
21
|
+
|
22
|
+
The `marcout` command will skip all processing/mapping, and simply
|
23
|
+
serialize marc out to a file stream.
|
24
|
+
|
25
|
+
This is mainly useful when you're using a custom reader to read
|
26
|
+
marc from a database or something, but could also be used to
|
27
|
+
convert marc from one format to another or something.
|
28
|
+
|
29
|
+
Will write to stdout, or set the `output_file` setting (`-o` shortcut).
|
30
|
+
|
31
|
+
Set the `marcout.type` setting to 'xml' or 'binary' for type of output.
|
32
|
+
Or to `human` for human readable display of marc (that is not meant for
|
33
|
+
machine readability, but can be good for manual diagnostics.)
|
34
|
+
|
35
|
+
If outputting type binary, setting `marcout.allow_oversized` to
|
36
|
+
true or false (boolean or string), to pass that to the MARC::Writer.
|
37
|
+
If set to true, then oversized MARC records can still be serialized,
|
38
|
+
with length bytes zero'd out -- technically illegal, but can
|
39
|
+
be read by MARC::Reader in permissive mode.
|
40
|
+
|
41
|
+
If you have MARC-XML *input*, you need to
|
42
|
+
set the `marc_source.type` setting to XML for xml input.
|
43
|
+
|
44
|
+
~~~bash
|
45
|
+
traject -x marcout somefile.marc -o output.xml -s marcout.type=xml
|
46
|
+
traject -x marcout -s marc_source.type=xml somefile.xml -c configuration.rb
|
47
|
+
~~~
|
data/doc/settings.md
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
# Traject settings
|
2
|
+
|
3
|
+
Traject settings are a flat list of key/value pairs -- a single
|
4
|
+
Hash, not nested. Keys are always strings, and dots (".") can be
|
5
|
+
used for grouping and namespacing.
|
6
|
+
|
7
|
+
Values are usually strings, but occasionally something else. String values can be easily
|
8
|
+
set via the command line.
|
9
|
+
|
10
|
+
Settings can be set in configuration files, usually like:
|
11
|
+
|
12
|
+
~~~ruby
|
13
|
+
settings do
|
14
|
+
provide "key", "value"
|
15
|
+
end
|
16
|
+
~~~~
|
17
|
+
|
18
|
+
or on the command line: `-s key=value`. There are also some command line shortcuts
|
19
|
+
for commonly used settings, see `traject -h`.
|
20
|
+
|
21
|
+
`provide` will only set the key if it was previously unset, so first time to set 'wins'. And command-line
|
22
|
+
settings are applied first of all. It's recommended you use `provide`.
|
23
|
+
|
24
|
+
`store` is also available, and forces setting of the new value overriding any previous value set.
|
25
|
+
|
26
|
+
## Known settings
|
27
|
+
|
28
|
+
* `debug_ascii_progress`: true/'true' to print ascii characters to STDERR indicating progress. Note,
|
29
|
+
yes, this is fixed to STDERR, regardless of your logging setup.
|
30
|
+
* `.` for every batch of records read and parsed
|
31
|
+
* `^` for every batch of records batched and queued for adding to solr
|
32
|
+
(possibly in thread pool)
|
33
|
+
* `%` for completing of a Solr 'add'
|
34
|
+
* `!` when threadpool for solr add has a full queue, so solr add is
|
35
|
+
going to happen in calling queue -- means solr adding can't
|
36
|
+
keep up with production.
|
37
|
+
|
38
|
+
* `json_writer.pretty_print`: used by the JsonWriter, if set to true, will output pretty printed json (with added whitespace) for easier human readability. Default false.
|
39
|
+
|
40
|
+
* `log.file`: filename to send logging, or 'STDOUT' or 'STDERR' for those streams. Default STDERR
|
41
|
+
|
42
|
+
* `log.error_file`: Default nil, if set then all log lines of ERROR and higher will be _additionally_
|
43
|
+
sent to error file named.
|
44
|
+
|
45
|
+
* `log.format`: Formatting string used by Yell logger. https://github.com/rudionrails/yell/wiki/101-formatting-log-messages
|
46
|
+
|
47
|
+
* `log.level`: Log this level and above. Default 'info', set to eg 'debug' to get potentially more logging info,
|
48
|
+
or 'error' to get less. https://github.com/rudionrails/yell/wiki/101-setting-the-log-level
|
49
|
+
|
50
|
+
* `log.batch_size`: If set to a number N (or string representation), will output a progress line to
|
51
|
+
log. (by default as INFO, but see log.batch_size.severity)
|
52
|
+
|
53
|
+
* `log.batch_size.severity`: If `log.batch_size` is set, what logger severity level to log to. Default "INFO", set to "DEBUG" etc if desired.
|
54
|
+
|
55
|
+
* `marc_source.type`: default 'binary'. Can also set to 'xml' or (not yet implemented todo) 'json'. Command line shortcut `-t`
|
56
|
+
|
57
|
+
* `marcout.allow_oversized`: Used with `-x marcout` command to output marc when outputting
|
58
|
+
as ISO 2709 binary, set to true or string "true", and the MARC::Writer will have
|
59
|
+
allow_oversized=true set, allowing oversized records to be serialized with length
|
60
|
+
bytes zero'd out -- technically illegal, but can be read by MARC::Reader in permissive mode.
|
61
|
+
|
62
|
+
* `output_file`: Output file to write to for operations that write to files: For instance the `marcout` command,
|
63
|
+
or Writer classes that write to files, like Traject::JsonWriter. Has an shortcut
|
64
|
+
`-o` on command line.
|
65
|
+
|
66
|
+
* `processing_thread_pool` Number of threads in the main thread pool used for processing
|
67
|
+
records with input rules. On JRuby or Rubinius, defaults to 1 less than the number of processors detected on your machine. On other ruby platforms, defaults to 1. Set to 0 or nil
|
68
|
+
to disable thread pool, and do all processing in main thread.
|
69
|
+
|
70
|
+
Choose a pool size based on size of your machine, and complexity of your indexing rules, you
|
71
|
+
might want to try different sizes and measure which works best for you.
|
72
|
+
Probably no reason for it ever to be more than number of cores on indexing machine.
|
73
|
+
|
74
|
+
|
75
|
+
* `reader_class_name`: a Traject Reader class, used by the indexer as a source
|
76
|
+
of records. Defaults to Traject::Marc4JReader (using the Java Marc4J
|
77
|
+
library) on JRuby; Traject::MarcReader (using the ruby marc gem) otherwise.
|
78
|
+
Command-line shortcut `-r`
|
79
|
+
|
80
|
+
* `solr.url`: URL to connect to a solr instance for indexing, eg http://example.org:8983/solr . Command-line short-cut `-u`.
|
81
|
+
|
82
|
+
* `solr.version`: Set to eg "1.4.0", "4.3.0"; currently un-used, but in the future will control
|
83
|
+
change some default settings, and/or sanity check and warn you if you're doing something
|
84
|
+
that might not work with that version of solr. Set now for help in the future.
|
85
|
+
|
86
|
+
* `solr_writer.batch_size`: size of batches that SolrJsonWriter will send docs to Solr in. Default 100. Set to nil,
|
87
|
+
0, or 1, and SolrJsonWriter will do one http transaction per document, no batching.
|
88
|
+
|
89
|
+
* `solr_writer.commit_on_close`: default false, set to true to have the solr writer send an explicit commit message to Solr after indexing.
|
90
|
+
|
91
|
+
|
92
|
+
* `solr_writer.thread_pool`: Defaults to 1 (single bg thread). A thread pool is used for submitting docs
|
93
|
+
to solr. Set to 0 or nil to disable threading. Set to 1,
|
94
|
+
there will still be a single bg thread doing the adds.
|
95
|
+
May make sense to set higher than number of cores on your
|
96
|
+
indexing machine, as these threads will mostly be waiting
|
97
|
+
on Solr. Speed/capacity of your solr might be more relevant.
|
98
|
+
Note that processing_thread_pool threads can end up submitting
|
99
|
+
to solr too, if solr_json_writer.thread_pool is full.
|
100
|
+
|
101
|
+
* `writer_class_name`: a Traject Writer class, used by indexer to send processed dictionaries off. Default Traject::SolrJsonWriter, other writers for debugging or writing to files are also available. See Traject::Indexer for more info. Command line shortcut `-w`
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
namespace :load_maps do
|
8
|
+
|
9
|
+
desc "Load MARC geo codes by screen-scraping LC"
|
10
|
+
task :marc_geographic do
|
11
|
+
begin
|
12
|
+
require 'nokogiri'
|
13
|
+
rescue LoadError => e
|
14
|
+
$stderr.puts "\n load_maps:marc_geographic task requires nokogiri"
|
15
|
+
$stderr.puts " Try `gem install nokogiri` and try again. Exiting...\n\n"
|
16
|
+
exit 1
|
17
|
+
end
|
18
|
+
|
19
|
+
source_url = "http://www.loc.gov/marc/geoareas/gacs_code.html"
|
20
|
+
|
21
|
+
filename = ENV["OUTPUT_TO"] || File.expand_path("../../translation_maps/marc_geographic.yaml", __FILE__)
|
22
|
+
file = File.open( filename, "w:utf-8" )
|
23
|
+
|
24
|
+
$stderr.puts "Writing to `#{filename}` ..."
|
25
|
+
|
26
|
+
html = Nokogiri::HTML(open(source_url).read)
|
27
|
+
|
28
|
+
file.puts "# Translation map for marc geographic codes constructed by `rake load_maps:marc_geographic` task"
|
29
|
+
file.puts "# Scraped from #{source_url} at #{Time.now}"
|
30
|
+
file.puts "# Intentionally includes discontinued codes."
|
31
|
+
|
32
|
+
file.puts "\n"
|
33
|
+
html.css("tr").each do |line|
|
34
|
+
code = line.css("td.code").inner_text.strip
|
35
|
+
unless code.nil? || code.empty?
|
36
|
+
code.gsub!(/^\-/, '') # treat discontinued code like any other
|
37
|
+
|
38
|
+
label = line.css("td[2]").inner_text.strip
|
39
|
+
|
40
|
+
label.gsub!(/\n */, ' ') # get rid of newlines that file now sometimes contains, bah.
|
41
|
+
label.gsub!("'", "''") # yaml escapes single-quotes by doubling them, weird but true.
|
42
|
+
|
43
|
+
file.puts "'#{code}': '#{label}'"
|
44
|
+
end
|
45
|
+
end
|
46
|
+
$stderr.puts "Done."
|
47
|
+
end
|
48
|
+
end
|