traject 2.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.travis.yml +27 -0
- data/.yardopts +3 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +461 -0
- data/Rakefile +21 -0
- data/bench/bench.rb +30 -0
- data/bin/traject +16 -0
- data/doc/batch_execution.md +243 -0
- data/doc/extending.md +190 -0
- data/doc/indexing_rules.md +265 -0
- data/doc/other_commands.md +47 -0
- data/doc/settings.md +101 -0
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject.rb +11 -0
- data/lib/traject/command_line.rb +301 -0
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/debug_writer.rb +47 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +613 -0
- data/lib/traject/indexer/settings.rb +110 -0
- data/lib/traject/json_writer.rb +51 -0
- data/lib/traject/line_writer.rb +63 -0
- data/lib/traject/macros/basic.rb +9 -0
- data/lib/traject/macros/marc21.rb +223 -0
- data/lib/traject/macros/marc21_semantics.rb +584 -0
- data/lib/traject/macros/marc_format_classifier.rb +197 -0
- data/lib/traject/marc_extractor.rb +410 -0
- data/lib/traject/marc_reader.rb +89 -0
- data/lib/traject/mock_reader.rb +97 -0
- data/lib/traject/ndj_reader.rb +40 -0
- data/lib/traject/null_writer.rb +22 -0
- data/lib/traject/qualified_const_get.rb +40 -0
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +161 -0
- data/lib/traject/translation_map.rb +267 -0
- data/lib/traject/util.rb +52 -0
- data/lib/traject/version.rb +3 -0
- data/lib/traject/yaml_writer.rb +9 -0
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/debug_writer_test.rb +38 -0
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/each_record_test.rb +59 -0
- data/test/indexer/macros_marc21_semantics_test.rb +391 -0
- data/test/indexer/macros_marc21_test.rb +190 -0
- data/test/indexer/macros_test.rb +40 -0
- data/test/indexer/map_record_test.rb +209 -0
- data/test/indexer/read_write_test.rb +101 -0
- data/test/indexer/settings_test.rb +152 -0
- data/test/indexer/to_field_test.rb +77 -0
- data/test/marc_extractor_test.rb +412 -0
- data/test/marc_format_classifier_test.rb +98 -0
- data/test/marc_reader_test.rb +110 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +90 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/bad_utf_byte.utf8.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +155 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/escaped_character_reference.marc8.marc +1 -0
- data/test/test_support/george_eliot.marc +1 -0
- data/test/test_support/hebrew880s.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manufacturing_consent.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/nature.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/test_data.utf8.json +30 -0
- data/test/test_support/test_data.utf8.marc.xml +2609 -0
- data/test/test_support/test_data.utf8.mrc +1 -0
- data/test/test_support/test_data.utf8.mrc.gz +0 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +225 -0
- data/test/translation_maps/bad_ruby.rb +8 -0
- data/test/translation_maps/bad_yaml.yaml +1 -0
- data/test/translation_maps/both_map.rb +1 -0
- data/test/translation_maps/both_map.yaml +1 -0
- data/test/translation_maps/default_literal.rb +10 -0
- data/test/translation_maps/default_passthrough.rb +10 -0
- data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/test/translation_maps/ruby_map.rb +10 -0
- data/test/translation_maps/translate_array_test.yaml +8 -0
- data/test/translation_maps/yaml_map.yaml +7 -0
- data/traject.gemspec +47 -0
- metadata +382 -0
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
# Details on Traject Indexing: from custom logic to Macros
|
|
2
|
+
|
|
3
|
+
Traject macros are a way of providing re-usable index mapping rules. Before we discuss how they work, we need to remind ourselves of the basic/direct Traject `to_field` indexing method.
|
|
4
|
+
|
|
5
|
+
## How direct indexing logic works
|
|
6
|
+
|
|
7
|
+
Here's the simplest possible direct Traject mapping logic, duplicating the effects of the `literal` macro:
|
|
8
|
+
|
|
9
|
+
~~~ruby
|
|
10
|
+
to_field("title") do |record, accumulator, context|
|
|
11
|
+
accumulator << "FIXED LITERAL"
|
|
12
|
+
end
|
|
13
|
+
~~~
|
|
14
|
+
|
|
15
|
+
That `do` is just ruby `block` syntax, whereby we can pass a block of ruby code as an argument to a ruby method. We pass a block taking three arguments, labeled `record`, `accumulator`, and `context`, to the `to_field` method. The third 'context' object is optional, you can define it in your block or not, depending on if you want to use it.
|
|
16
|
+
|
|
17
|
+
The block is then stored by the Traject::Indexer, and called for each record indexed, with three arguments provided.
|
|
18
|
+
|
|
19
|
+
#### record argument
|
|
20
|
+
|
|
21
|
+
The record that gets passed to your block is a MARC::Record object (or, theoretically, any object that gets returned by a traject Reader). Your logic will usually examine the record to calculate the desired output.
|
|
22
|
+
|
|
23
|
+
### accumulator argument
|
|
24
|
+
|
|
25
|
+
The accumulator argument is an array. At the end of your custom code, the accumulator
|
|
26
|
+
array should hold the output you want to send off, to the field specified in the `to_field`.
|
|
27
|
+
|
|
28
|
+
The accumulator is a reference to a ruby array, and you need to **modify** that array,
|
|
29
|
+
manipulating it in place with Array methods that mutate the array, like `concat`, `<<`,
|
|
30
|
+
`map!` or even `replace`.
|
|
31
|
+
|
|
32
|
+
You can't simply assign the accumulator variable to a different array, that won't work,
|
|
33
|
+
you need to modify the array in-place.
|
|
34
|
+
|
|
35
|
+
# Won't work, assigning variable
|
|
36
|
+
to_field('foo') do |rec, acc|
|
|
37
|
+
acc = ["some constant"] # WRONG!
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Won't work, assigning variable
|
|
41
|
+
to_field('foo') do |rec, acc|
|
|
42
|
+
acc << 'bill'
|
|
43
|
+
acc << 'dueber'
|
|
44
|
+
acc = acc.map{|str| str.upcase}
|
|
45
|
+
end # WRONG! WRONG! WRONG! WRONG! WRONG!
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Instead, do, modify array in place
|
|
49
|
+
to_field('foo') {|rec, acc| acc << "some constant" }
|
|
50
|
+
to_field('foo') do |rec, acc|
|
|
51
|
+
acc << 'bill'
|
|
52
|
+
acc << 'dueber'
|
|
53
|
+
acc = acc.map!{|str| str.upcase} #notice using "map!" not just "map"
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
### context argument
|
|
57
|
+
|
|
58
|
+
The third optional context argument
|
|
59
|
+
|
|
60
|
+
The third optional argument is a
|
|
61
|
+
[Traject::Indexer::Context](./lib/traject/indexer/context.rb) ([rdoc](http://rdoc.info/github/traject-project/traject/Traject/Indexer/Context))
|
|
62
|
+
object. Most of the time you don't need it, but you can use it for
|
|
63
|
+
some sophisticated functionality, for example using these Context methods:
|
|
64
|
+
|
|
65
|
+
* `context.clipboard` A hash into which you can stuff values that you want to pass from one indexing step to another. For example, if you go through a bunch of work to query a database and get a result you'll need more than once, stick the results somewhere in the clipboard. This clipboard is record-specific, and won't persist between records.
|
|
66
|
+
* `context.position` The position of the record in the input file (e.g., was it the first record, second, etc.). Useful for error reporting
|
|
67
|
+
* `context.output_hash` A hash mapping the field names (generally defined in `to_field` calls) to an array of values to be sent to the writer associated with that field. This allows you to modify what goes to the writer without going through a `to_field` call -- you can just set `context.output_hash['myfield'] = ['my', 'values']` and you're set. See below for more examples
|
|
68
|
+
* `context.skip!(msg)` An assertion that this record should be ignored. No more indexing steps will be called, no results will be sent to the writer, and a `debug`-level log message will be written stating that the record was skipped.
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
## Gotcha: Use closures to make your code more efficient
|
|
72
|
+
|
|
73
|
+
A _closure_ is a computer-science term that means "a piece of code
|
|
74
|
+
that remembers all the variables that were in scope when it was
|
|
75
|
+
created." In ruby, lambdas and blocks are closures. Method definitions
|
|
76
|
+
are not, which most of us have run across much to our chagrin.
|
|
77
|
+
|
|
78
|
+
Within the context of `traject`, this means you can define a variable
|
|
79
|
+
outside of a `to_field` or `each_record` block and it will be available
|
|
80
|
+
inside those blocks. And you only have to define it once.
|
|
81
|
+
|
|
82
|
+
That's useful to do for any object that is even a bit expensive
|
|
83
|
+
to create -- we can maximize the performance of our traject
|
|
84
|
+
indexing by creating those objects once outside the block,
|
|
85
|
+
instead of inside the block where it will be created
|
|
86
|
+
once per-record (every time the block is executed):
|
|
87
|
+
|
|
88
|
+
Compare:
|
|
89
|
+
|
|
90
|
+
```ruby
|
|
91
|
+
# Create the transformer for every single record
|
|
92
|
+
to_field 'normalized_title' do |rec, acc|
|
|
93
|
+
transformer = My::Custom::Format::Transformer.new # Oh no! I'm doing this for each of my 10M records!
|
|
94
|
+
acc << transformer.transform(rec['245'].value)
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
# Create the transformer exactly once
|
|
98
|
+
transformer = My::Custom::Format::Transformer.new # Ahhh. Do it once.
|
|
99
|
+
to_field 'normalized_title' do |rec, acc|
|
|
100
|
+
acc << transformer.transform(rec['245'].value)
|
|
101
|
+
end
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Certain built-in traject calls have been optimized to be high performance
|
|
105
|
+
so it's safe to do them inside 'inner loop' blocks though.
|
|
106
|
+
That includes `Traject::TranslationMap.new` and `Traject::MarcExtractor.cached("xxx")`
|
|
107
|
+
(note #cached rather than #new there)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
## From block to lambda
|
|
111
|
+
|
|
112
|
+
In the ruby language, in addition to creating a code block as an argument
|
|
113
|
+
to a method with `do |args| ... end` or `{|arg| ... }`, we can also create
|
|
114
|
+
a code block to hold in a variable, with the `lambda` keyword:
|
|
115
|
+
|
|
116
|
+
always_output_foo = lambda do |record, accumulator|
|
|
117
|
+
accumulator << "FOO"
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
traject `to_field` is written so, as a convenience, it can take a lambda expression
|
|
121
|
+
stored in a variable as an alternative to a block:
|
|
122
|
+
|
|
123
|
+
to_field("always_has_foo", always_output_foo)
|
|
124
|
+
|
|
125
|
+
Why is this a convenience? Well, ordinarily it's not something we
|
|
126
|
+
need, but in fact it's what allows traject 'macros' as re-useable
|
|
127
|
+
code templates.
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
## Macros
|
|
131
|
+
|
|
132
|
+
A Traject macro is a way to automatically create indexing rules via re-usable "templates".
|
|
133
|
+
|
|
134
|
+
Traject macros are simply methods that return ruby lambda/proc objects, possibly creating
|
|
135
|
+
them based on parameters passed in.
|
|
136
|
+
|
|
137
|
+
Here is in fact how the `literal` function is implemented:
|
|
138
|
+
|
|
139
|
+
~~~ruby
|
|
140
|
+
def literal(value)
|
|
141
|
+
return lambda do |record, accumulator, context|
|
|
142
|
+
# because a lambda is a closure, we can define it in terms
|
|
143
|
+
# of the 'value' from the scope it's defined in!
|
|
144
|
+
accumulator << value
|
|
145
|
+
end
|
|
146
|
+
end
|
|
147
|
+
to_field("something", literal("something"))
|
|
148
|
+
~~~
|
|
149
|
+
|
|
150
|
+
It's really as simple as that, that's all a Traject macro is. A function that takes parameters, and based on those parameters returns a lambda; the lambda is then passed to the `to_field` indexing method, or similar methods.
|
|
151
|
+
|
|
152
|
+
How do you make these methods available to the indexer?
|
|
153
|
+
|
|
154
|
+
Define it in a module:
|
|
155
|
+
|
|
156
|
+
~~~ruby
|
|
157
|
+
# in a file literal_macro.rb
|
|
158
|
+
module LiteralMacro
|
|
159
|
+
def literal(value)
|
|
160
|
+
return lambda do |record, accumulator, context|
|
|
161
|
+
# because a lambda is a closure, we can define it in terms
|
|
162
|
+
# of the 'value' from the scope it's defined in!
|
|
163
|
+
accumulator << value
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
end
|
|
167
|
+
~~~
|
|
168
|
+
|
|
169
|
+
And then use ordinary ruby `require` and `extend` to add it to the current Indexer file, by simply including this
|
|
170
|
+
in one of your config files:
|
|
171
|
+
|
|
172
|
+
~~~
|
|
173
|
+
require 'literal_macro'
|
|
174
|
+
extend LiteralMacro
|
|
175
|
+
|
|
176
|
+
to_field ...
|
|
177
|
+
~~~
|
|
178
|
+
|
|
179
|
+
That's it. You can use the traject command line `-I` option to set the ruby load path, so your file will be findable via `require`. Or you can distribute it in a gem, and use straight rubygems and the `gem` command in your configuration file, or Bundler with traject command-line `-g` option.
|
|
180
|
+
|
|
181
|
+
## Using a lambda _and_ a block
|
|
182
|
+
|
|
183
|
+
Traject macros (such as `extract_marc`) create and return a lambda. If
|
|
184
|
+
you include a lambda _and_ a block on a `to_field` call, the latter
|
|
185
|
+
gets the accumulator as it was filled in by the former.
|
|
186
|
+
|
|
187
|
+
```ruby
|
|
188
|
+
# Get the titles and lowercase them
|
|
189
|
+
to_field 'lc_title', extract_marc('245') do |rec, acc, context|
|
|
190
|
+
acc.map!{|title| title.downcase}
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
# Build my own lambda and use it
|
|
194
|
+
mylam = lambda {|rec, acc| acc << 'one'} # just add a constant
|
|
195
|
+
to_field('foo', mylam) do |rec, acc, context|
|
|
196
|
+
acc << 'two'
|
|
197
|
+
end #=> context.output_hash['foo'] == ['one', 'two']
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# You might also want to do something like this
|
|
201
|
+
|
|
202
|
+
to_field('foo', my_macro_that_doesnt_dedup) do |rec, acc|
|
|
203
|
+
acc.uniq!
|
|
204
|
+
end
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## Manipulating `context.output_hash` directly
|
|
208
|
+
|
|
209
|
+
If you ask for the context argument, a [Traject::Indexer::Context](./lib/traject/indexer/context.rb) ([rdoc](http://rdoc.info/gems/traject/Traject/Indexer/Context)), you have access to context.output_hash, which is
|
|
210
|
+
the hash of transformed output that will be sent to Solr (or any other Writer)
|
|
211
|
+
|
|
212
|
+
You can look in there to see any already transformed output and use it as the source
|
|
213
|
+
for new output. You can actually *write* to there manually, which can be useful
|
|
214
|
+
to write routines that affect more than one output field at once.
|
|
215
|
+
|
|
216
|
+
**Note**: Make sure you always assign an _array_ to, e.g., `context.output_hash['foo']`, not a single value!
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
## each_record
|
|
221
|
+
|
|
222
|
+
All the previous discussion was in terms of `to_field` -- `each_record` is a similar
|
|
223
|
+
routine, to define logic that is executed for each record, but isn't fixed to write
|
|
224
|
+
to a single output field.
|
|
225
|
+
|
|
226
|
+
So `each_record` blocks have no `accumulator` argument, instead they either take a single
|
|
227
|
+
`record` argument; or both a `record` and a `context`.
|
|
228
|
+
|
|
229
|
+
`each_record` can be used for logging or notifying; computing intermediate
|
|
230
|
+
results; or writing to more than one field at once.
|
|
231
|
+
|
|
232
|
+
~~~ruby
|
|
233
|
+
each_record do |record, context|
|
|
234
|
+
if is_it_bad?(record)
|
|
235
|
+
context.skip!("Skipping bad record")
|
|
236
|
+
else
|
|
237
|
+
context.clipboard[:expensive_result] = calculate_expensive_thing(record)
|
|
238
|
+
end
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
each_record do |record, context|
|
|
242
|
+
(one, two) = calculate_two_things_from(record)
|
|
243
|
+
|
|
244
|
+
context.output_hash["first_field"] ||= []
|
|
245
|
+
context.output_hash["first_field"] << one
|
|
246
|
+
|
|
247
|
+
context.output_hash["second_field"] ||= []
|
|
248
|
+
context.output_hash["second_field"] << two
|
|
249
|
+
end
|
|
250
|
+
~~~
|
|
251
|
+
|
|
252
|
+
traject doesn't come with any macros written for use with
|
|
253
|
+
`each_record`, but they could be created if useful --
|
|
254
|
+
just methods that return lambdas taking the right
|
|
255
|
+
args for `each_record`.
|
|
256
|
+
|
|
257
|
+
## More tips and gotchas about indexing steps
|
|
258
|
+
|
|
259
|
+
* **All your `to_field` and `each_record` steps are run _in the order in which they were initially evaluated_**. That means that the order you call your config files can potentially make a difference if you're screwing around stuffing stuff into the context clipboard or whatnot.
|
|
260
|
+
|
|
261
|
+
* **`to_field` can be called multiple times on the same field name.** If you call the same field name multiple times, all the values will be sent to the writer.
|
|
262
|
+
|
|
263
|
+
* **Once you call `context.skip!(msg)` no more index steps will be run for that record**. So if you have any cleanup code, you'll need to make sure to call it yourself.
|
|
264
|
+
|
|
265
|
+
* **By default, `traject` indexing runs multi-threaded**. In the current implementation, the indexing steps for one record are *not* split across threads, but different records can be processed simultaneously by more than one thread. That means you need to make sure your code is thread-safe (or always set `processing_thread_pool` to 0).
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Other traject command-line commands
|
|
2
|
+
|
|
3
|
+
The traject command line supports a few other miscellaneous commands with
|
|
4
|
+
the "-x command" switch. The usual traject command line is actually
|
|
5
|
+
the `process` command, `traject -x process ...` is the same as leaving out
|
|
6
|
+
the `-x process`.
|
|
7
|
+
|
|
8
|
+
## Commit
|
|
9
|
+
|
|
10
|
+
`traject -x commit` will send a 'commit' message to the Solr server
|
|
11
|
+
specified in setting `solr.url`. Other parts of configuration will
|
|
12
|
+
be ignored, but don't hurt.
|
|
13
|
+
|
|
14
|
+
traject -x commit -s solr.url=http://some.com/solr
|
|
15
|
+
|
|
16
|
+
Or with a config file that includes a solr.url setting:
|
|
17
|
+
|
|
18
|
+
traject -x commit -c config_file.rb
|
|
19
|
+
|
|
20
|
+
## marcout
|
|
21
|
+
|
|
22
|
+
The `marcout` command will skip all processing/mapping, and simply
|
|
23
|
+
serialize marc out to a file stream.
|
|
24
|
+
|
|
25
|
+
This is mainly useful when you're using a custom reader to read
|
|
26
|
+
marc from a database or something, but could also be used to
|
|
27
|
+
convert marc from one format to another or something.
|
|
28
|
+
|
|
29
|
+
Will write to stdout, or set the `output_file` setting (`-o` shortcut).
|
|
30
|
+
|
|
31
|
+
Set the `marcout.type` setting to 'xml' or 'binary' for type of output.
|
|
32
|
+
Or to `human` for human readable display of marc (that is not meant for
|
|
33
|
+
machine readability, but can be good for manual diagnostics.)
|
|
34
|
+
|
|
35
|
+
If outputting type binary, setting `marcout.allow_oversized` to
|
|
36
|
+
true or false (boolean or string), to pass that to the MARC::Writer.
|
|
37
|
+
If set to true, then oversized MARC records can still be serialized,
|
|
38
|
+
with length bytes zero'd out -- technically illegal, but can
|
|
39
|
+
be read by MARC::Reader in permissive mode.
|
|
40
|
+
|
|
41
|
+
If you have MARC-XML *input*, you need to
|
|
42
|
+
set the `marc_source.type` setting to XML for xml input.
|
|
43
|
+
|
|
44
|
+
~~~bash
|
|
45
|
+
traject -x marcout somefile.marc -o output.xml -s marcout.type=xml
|
|
46
|
+
traject -x marcout -s marc_source.type=xml somefile.xml -c configuration.rb
|
|
47
|
+
~~~
|
data/doc/settings.md
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# Traject settings
|
|
2
|
+
|
|
3
|
+
Traject settings are a flat list of key/value pairs -- a single
|
|
4
|
+
Hash, not nested. Keys are always strings, and dots (".") can be
|
|
5
|
+
used for grouping and namespacing.
|
|
6
|
+
|
|
7
|
+
Values are usually strings, but occasionally something else. String values can be easily
|
|
8
|
+
set via the command line.
|
|
9
|
+
|
|
10
|
+
Settings can be set in configuration files, usually like:
|
|
11
|
+
|
|
12
|
+
~~~ruby
|
|
13
|
+
settings do
|
|
14
|
+
provide "key", "value"
|
|
15
|
+
end
|
|
16
|
+
~~~
|
|
17
|
+
|
|
18
|
+
or on the command line: `-s key=value`. There are also some command line shortcuts
|
|
19
|
+
for commonly used settings, see `traject -h`.
|
|
20
|
+
|
|
21
|
+
`provide` will only set the key if it was previously unset, so first time to set 'wins'. And command-line
|
|
22
|
+
settings are applied first of all. It's recommended you use `provide`.
|
|
23
|
+
|
|
24
|
+
`store` is also available, and forces setting of the new value overriding any previous value set.
|
|
25
|
+
|
|
26
|
+
## Known settings
|
|
27
|
+
|
|
28
|
+
* `debug_ascii_progress`: true/'true' to print ascii characters to STDERR indicating progress. Note,
|
|
29
|
+
yes, this is fixed to STDERR, regardless of your logging setup.
|
|
30
|
+
* `.` for every batch of records read and parsed
|
|
31
|
+
* `^` for every batch of records batched and queued for adding to solr
|
|
32
|
+
(possibly in thread pool)
|
|
33
|
+
* `%` for completing of a Solr 'add'
|
|
34
|
+
* `!` when threadpool for solr add has a full queue, so solr add is
|
|
35
|
+
going to happen in calling queue -- means solr adding can't
|
|
36
|
+
keep up with production.
|
|
37
|
+
|
|
38
|
+
* `json_writer.pretty_print`: used by the JsonWriter, if set to true, will output pretty printed json (with added whitespace) for easier human readability. Default false.
|
|
39
|
+
|
|
40
|
+
* `log.file`: filename to send logging, or 'STDOUT' or 'STDERR' for those streams. Default STDERR
|
|
41
|
+
|
|
42
|
+
* `log.error_file`: Default nil, if set then all log lines of ERROR and higher will be _additionally_
|
|
43
|
+
sent to error file named.
|
|
44
|
+
|
|
45
|
+
* `log.format`: Formatting string used by Yell logger. https://github.com/rudionrails/yell/wiki/101-formatting-log-messages
|
|
46
|
+
|
|
47
|
+
* `log.level`: Log this level and above. Default 'info', set to eg 'debug' to get potentially more logging info,
|
|
48
|
+
or 'error' to get less. https://github.com/rudionrails/yell/wiki/101-setting-the-log-level
|
|
49
|
+
|
|
50
|
+
* `log.batch_size`: If set to a number N (or string representation), will output a progress line to
|
|
51
|
+
log. (by default as INFO, but see log.batch_size.severity)
|
|
52
|
+
|
|
53
|
+
* `log.batch_size.severity`: If `log.batch_size` is set, what logger severity level to log to. Default "INFO", set to "DEBUG" etc if desired.
|
|
54
|
+
|
|
55
|
+
* `marc_source.type`: default 'binary'. Can also set to 'xml' or (not yet implemented todo) 'json'. Command line shortcut `-t`
|
|
56
|
+
|
|
57
|
+
* `marcout.allow_oversized`: Used with `-x marcout` command to output marc when outputting
|
|
58
|
+
as ISO 2709 binary, set to true or string "true", and the MARC::Writer will have
|
|
59
|
+
allow_oversized=true set, allowing oversized records to be serialized with length
|
|
60
|
+
bytes zero'd out -- technically illegal, but can be read by MARC::Reader in permissive mode.
|
|
61
|
+
|
|
62
|
+
* `output_file`: Output file to write to for operations that write to files: For instance the `marcout` command,
|
|
63
|
+
or Writer classes that write to files, like Traject::JsonWriter. Has a shortcut
|
|
64
|
+
`-o` on command line.
|
|
65
|
+
|
|
66
|
+
* `processing_thread_pool` Number of threads in the main thread pool used for processing
|
|
67
|
+
records with input rules. On JRuby or Rubinius, defaults to 1 less than the number of processors detected on your machine. On other ruby platforms, defaults to 1. Set to 0 or nil
|
|
68
|
+
to disable thread pool, and do all processing in main thread.
|
|
69
|
+
|
|
70
|
+
Choose a pool size based on size of your machine, and complexity of your indexing rules, you
|
|
71
|
+
might want to try different sizes and measure which works best for you.
|
|
72
|
+
Probably no reason for it ever to be more than number of cores on indexing machine.
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
* `reader_class_name`: a Traject Reader class, used by the indexer as a source
|
|
76
|
+
of records. Defaults to Traject::Marc4JReader (using the Java Marc4J
|
|
77
|
+
library) on JRuby; Traject::MarcReader (using the ruby marc gem) otherwise.
|
|
78
|
+
Command-line shortcut `-r`
|
|
79
|
+
|
|
80
|
+
* `solr.url`: URL to connect to a solr instance for indexing, eg http://example.org:8983/solr . Command-line short-cut `-u`.
|
|
81
|
+
|
|
82
|
+
* `solr.version`: Set to eg "1.4.0", "4.3.0"; currently un-used, but in the future will control
|
|
83
|
+
change some default settings, and/or sanity check and warn you if you're doing something
|
|
84
|
+
that might not work with that version of solr. Set now for help in the future.
|
|
85
|
+
|
|
86
|
+
* `solr_writer.batch_size`: size of batches that SolrJsonWriter will send docs to Solr in. Default 100. Set to nil,
|
|
87
|
+
0, or 1, and SolrJsonWriter will do one http transaction per document, no batching.
|
|
88
|
+
|
|
89
|
+
* `solr_writer.commit_on_close`: default false, set to true to have the solr writer send an explicit commit message to Solr after indexing.
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
* `solr_writer.thread_pool`: Defaults to 1 (single bg thread). A thread pool is used for submitting docs
|
|
93
|
+
to solr. Set to 0 or nil to disable threading. Set to 1,
|
|
94
|
+
there will still be a single bg thread doing the adds.
|
|
95
|
+
May make sense to set higher than number of cores on your
|
|
96
|
+
indexing machine, as these threads will mostly be waiting
|
|
97
|
+
on Solr. Speed/capacity of your solr might be more relevant.
|
|
98
|
+
Note that processing_thread_pool threads can end up submitting
|
|
99
|
+
to solr too, if solr_json_writer.thread_pool is full.
|
|
100
|
+
|
|
101
|
+
* `writer_class_name`: a Traject Writer class, used by indexer to send processed dictionaries off. Default Traject::SolrJsonWriter, other writers for debugging or writing to files are also available. See Traject::Indexer for more info. Command line shortcut `-w`
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
require 'net/http'
|
|
2
|
+
require 'open-uri'
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
namespace :load_maps do
|
|
8
|
+
|
|
9
|
+
desc "Load MARC geo codes by screen-scraping LC"
|
|
10
|
+
task :marc_geographic do
|
|
11
|
+
begin
|
|
12
|
+
require 'nokogiri'
|
|
13
|
+
rescue LoadError => e
|
|
14
|
+
$stderr.puts "\n load_maps:marc_geographic task requires nokogiri"
|
|
15
|
+
$stderr.puts " Try `gem install nokogiri` and try again. Exiting...\n\n"
|
|
16
|
+
exit 1
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
source_url = "http://www.loc.gov/marc/geoareas/gacs_code.html"
|
|
20
|
+
|
|
21
|
+
filename = ENV["OUTPUT_TO"] || File.expand_path("../../translation_maps/marc_geographic.yaml", __FILE__)
|
|
22
|
+
file = File.open( filename, "w:utf-8" )
|
|
23
|
+
|
|
24
|
+
$stderr.puts "Writing to `#{filename}` ..."
|
|
25
|
+
|
|
26
|
+
html = Nokogiri::HTML(open(source_url).read)
|
|
27
|
+
|
|
28
|
+
file.puts "# Translation map for marc geographic codes constructed by `rake load_maps:marc_geographic` task"
|
|
29
|
+
file.puts "# Scraped from #{source_url} at #{Time.now}"
|
|
30
|
+
file.puts "# Intentionally includes discontinued codes."
|
|
31
|
+
|
|
32
|
+
file.puts "\n"
|
|
33
|
+
html.css("tr").each do |line|
|
|
34
|
+
code = line.css("td.code").inner_text.strip
|
|
35
|
+
unless code.nil? || code.empty?
|
|
36
|
+
code.gsub!(/^\-/, '') # treat discontinued code like any other
|
|
37
|
+
|
|
38
|
+
label = line.css("td[2]").inner_text.strip
|
|
39
|
+
|
|
40
|
+
label.gsub!(/\n */, ' ') # get rid of newlines that file now sometimes contains, bah.
|
|
41
|
+
label.gsub!("'", "''") # yaml escapes single-quotes by doubling them, weird but true.
|
|
42
|
+
|
|
43
|
+
file.puts "'#{code}': '#{label}'"
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
$stderr.puts "Done."
|
|
47
|
+
end
|
|
48
|
+
end
|