traject 0.16.0 → 0.17.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/README.md +183 -191
- data/bench/bench.rb +1 -1
- data/doc/batch_execution.md +14 -0
- data/doc/extending.md +14 -12
- data/doc/indexing_rules.md +265 -0
- data/lib/traject/command_line.rb +12 -41
- data/lib/traject/debug_writer.rb +32 -13
- data/lib/traject/indexer.rb +101 -24
- data/lib/traject/indexer/settings.rb +18 -17
- data/lib/traject/json_writer.rb +32 -11
- data/lib/traject/line_writer.rb +6 -6
- data/lib/traject/macros/basic.rb +1 -1
- data/lib/traject/macros/marc21.rb +17 -13
- data/lib/traject/macros/marc21_semantics.rb +27 -25
- data/lib/traject/macros/marc_format_classifier.rb +39 -25
- data/lib/traject/marc4j_reader.rb +36 -22
- data/lib/traject/marc_extractor.rb +79 -75
- data/lib/traject/marc_reader.rb +33 -25
- data/lib/traject/mock_reader.rb +9 -10
- data/lib/traject/ndj_reader.rb +7 -7
- data/lib/traject/null_writer.rb +1 -1
- data/lib/traject/qualified_const_get.rb +12 -2
- data/lib/traject/solrj_writer.rb +61 -52
- data/lib/traject/thread_pool.rb +45 -45
- data/lib/traject/translation_map.rb +59 -27
- data/lib/traject/util.rb +3 -3
- data/lib/traject/version.rb +1 -1
- data/lib/traject/yaml_writer.rb +1 -1
- data/test/debug_writer_test.rb +7 -7
- data/test/indexer/each_record_test.rb +4 -4
- data/test/indexer/macros_marc21_semantics_test.rb +12 -12
- data/test/indexer/macros_marc21_test.rb +10 -10
- data/test/indexer/macros_test.rb +1 -1
- data/test/indexer/map_record_test.rb +6 -6
- data/test/indexer/read_write_test.rb +43 -4
- data/test/indexer/settings_test.rb +2 -2
- data/test/indexer/to_field_test.rb +8 -8
- data/test/marc4j_reader_test.rb +4 -4
- data/test/marc_extractor_test.rb +33 -25
- data/test/marc_format_classifier_test.rb +3 -3
- data/test/marc_reader_test.rb +2 -2
- data/test/test_helper.rb +3 -3
- data/test/test_support/demo_config.rb +52 -48
- data/test/translation_map_test.rb +22 -4
- data/test/translation_maps/bad_ruby.rb +2 -2
- data/test/translation_maps/both_map.rb +1 -1
- data/test/translation_maps/default_literal.rb +1 -1
- data/test/translation_maps/default_passthrough.rb +1 -1
- data/test/translation_maps/ruby_map.rb +1 -1
- metadata +7 -31
- data/doc/macros.md +0 -103
@@ -37,7 +37,15 @@ describe "TranslationMap" do
|
|
37
37
|
map = Traject::TranslationMap.new("default_literal")
|
38
38
|
map = Traject::TranslationMap.new("default_literal")
|
39
39
|
|
40
|
-
assert_equal "DEFAULT LITERAL", map["not in the map"]
|
40
|
+
assert_equal "DEFAULT LITERAL", map["not in the map"]
|
41
|
+
end
|
42
|
+
|
43
|
+
it "does not trigger default on explicit nil result" do
|
44
|
+
map = Traject::TranslationMap.new({"alpha" => "one", "beta" => nil}, :default => "DEFAULT")
|
45
|
+
|
46
|
+
assert_equal "one", map["alpha"]
|
47
|
+
assert_nil map["beta"]
|
48
|
+
assert_equal "DEFAULT", map["not_found_in_map"]
|
41
49
|
end
|
42
50
|
|
43
51
|
it "finds .rb over .yaml" do
|
@@ -80,12 +88,12 @@ describe "TranslationMap" do
|
|
80
88
|
assert_equal "value1", map["key1"]
|
81
89
|
end
|
82
90
|
|
83
|
-
it "finds .properties defn" do
|
91
|
+
it "finds .properties defn" do
|
84
92
|
map =Traject::TranslationMap.new("properties_map")
|
85
93
|
|
86
94
|
assert_equal "Value1", map["key1"]
|
87
95
|
assert_equal "Value2", map["key2"]
|
88
|
-
assert_equal "Multi word value", map["key3"]
|
96
|
+
assert_equal "Multi word value", map["key3"]
|
89
97
|
end
|
90
98
|
|
91
99
|
it "can use a hash instance too" do
|
@@ -117,6 +125,16 @@ describe "TranslationMap" do
|
|
117
125
|
assert_equal ["hola", "first", "second", "last thing", "buenas noches", "hola", "everything else"], arr
|
118
126
|
end
|
119
127
|
|
128
|
+
it "translate_array does not include nil values" do
|
129
|
+
# TranslationMap can explicitly map to nil, meaning,
|
130
|
+
# well, map to nothing. Make sure translate_array respects that.
|
131
|
+
map = Traject::TranslationMap.new("alpha" => "one", "beta" => nil)
|
132
|
+
|
133
|
+
values = map.translate_array(["alpha", "beta"])
|
134
|
+
|
135
|
+
assert_equal ["one"], values
|
136
|
+
end
|
137
|
+
|
120
138
|
it "#to_hash" do
|
121
139
|
map = Traject::TranslationMap.new("yaml_map")
|
122
140
|
|
@@ -129,4 +147,4 @@ describe "TranslationMap" do
|
|
129
147
|
refute_same hash, map.to_hash, "each #to_hash result is a copy"
|
130
148
|
end
|
131
149
|
|
132
|
-
end
|
150
|
+
end
|
@@ -1 +1 @@
|
|
1
|
-
{ "ruby" => "ruby" }
|
1
|
+
{ "ruby" => "ruby" }
|
metadata
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
version: 0.16.0
|
4
|
+
version: 0.17.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Jonathan Rochkind
|
@@ -10,7 +9,7 @@ authors:
|
|
10
9
|
autorequire:
|
11
10
|
bindir: bin
|
12
11
|
cert_chain: []
|
13
|
-
date: 2013-
|
12
|
+
date: 2013-10-10 00:00:00.000000000 Z
|
14
13
|
dependencies:
|
15
14
|
- !ruby/object:Gem::Dependency
|
16
15
|
name: marc
|
@@ -19,13 +18,11 @@ dependencies:
|
|
19
18
|
- - '>='
|
20
19
|
- !ruby/object:Gem::Version
|
21
20
|
version: 0.7.1
|
22
|
-
none: false
|
23
21
|
requirement: !ruby/object:Gem::Requirement
|
24
22
|
requirements:
|
25
23
|
- - '>='
|
26
24
|
- !ruby/object:Gem::Version
|
27
25
|
version: 0.7.1
|
28
|
-
none: false
|
29
26
|
prerelease: false
|
30
27
|
type: :runtime
|
31
28
|
- !ruby/object:Gem::Dependency
|
@@ -35,13 +32,11 @@ dependencies:
|
|
35
32
|
- - '>='
|
36
33
|
- !ruby/object:Gem::Version
|
37
34
|
version: 0.1.1
|
38
|
-
none: false
|
39
35
|
requirement: !ruby/object:Gem::Requirement
|
40
36
|
requirements:
|
41
37
|
- - '>='
|
42
38
|
- !ruby/object:Gem::Version
|
43
39
|
version: 0.1.1
|
44
|
-
none: false
|
45
40
|
prerelease: false
|
46
41
|
type: :runtime
|
47
42
|
- !ruby/object:Gem::Dependency
|
@@ -54,7 +49,6 @@ dependencies:
|
|
54
49
|
- - <
|
55
50
|
- !ruby/object:Gem::Version
|
56
51
|
version: '2.1'
|
57
|
-
none: false
|
58
52
|
requirement: !ruby/object:Gem::Requirement
|
59
53
|
requirements:
|
60
54
|
- - '>='
|
@@ -63,7 +57,6 @@ dependencies:
|
|
63
57
|
- - <
|
64
58
|
- !ruby/object:Gem::Version
|
65
59
|
version: '2.1'
|
66
|
-
none: false
|
67
60
|
prerelease: false
|
68
61
|
type: :runtime
|
69
62
|
- !ruby/object:Gem::Dependency
|
@@ -76,7 +69,6 @@ dependencies:
|
|
76
69
|
- - <
|
77
70
|
- !ruby/object:Gem::Version
|
78
71
|
version: '4.0'
|
79
|
-
none: false
|
80
72
|
requirement: !ruby/object:Gem::Requirement
|
81
73
|
requirements:
|
82
74
|
- - '>='
|
@@ -85,7 +77,6 @@ dependencies:
|
|
85
77
|
- - <
|
86
78
|
- !ruby/object:Gem::Version
|
87
79
|
version: '4.0'
|
88
|
-
none: false
|
89
80
|
prerelease: false
|
90
81
|
type: :runtime
|
91
82
|
- !ruby/object:Gem::Dependency
|
@@ -95,13 +86,11 @@ dependencies:
|
|
95
86
|
- - '>='
|
96
87
|
- !ruby/object:Gem::Version
|
97
88
|
version: '0'
|
98
|
-
none: false
|
99
89
|
requirement: !ruby/object:Gem::Requirement
|
100
90
|
requirements:
|
101
91
|
- - '>='
|
102
92
|
- !ruby/object:Gem::Version
|
103
93
|
version: '0'
|
104
|
-
none: false
|
105
94
|
prerelease: false
|
106
95
|
type: :runtime
|
107
96
|
- !ruby/object:Gem::Dependency
|
@@ -111,13 +100,11 @@ dependencies:
|
|
111
100
|
- - ~>
|
112
101
|
- !ruby/object:Gem::Version
|
113
102
|
version: '1.3'
|
114
|
-
none: false
|
115
103
|
requirement: !ruby/object:Gem::Requirement
|
116
104
|
requirements:
|
117
105
|
- - ~>
|
118
106
|
- !ruby/object:Gem::Version
|
119
107
|
version: '1.3'
|
120
|
-
none: false
|
121
108
|
prerelease: false
|
122
109
|
type: :development
|
123
110
|
- !ruby/object:Gem::Dependency
|
@@ -127,13 +114,11 @@ dependencies:
|
|
127
114
|
- - '>='
|
128
115
|
- !ruby/object:Gem::Version
|
129
116
|
version: '0'
|
130
|
-
none: false
|
131
117
|
requirement: !ruby/object:Gem::Requirement
|
132
118
|
requirements:
|
133
119
|
- - '>='
|
134
120
|
- !ruby/object:Gem::Version
|
135
121
|
version: '0'
|
136
|
-
none: false
|
137
122
|
prerelease: false
|
138
123
|
type: :development
|
139
124
|
- !ruby/object:Gem::Dependency
|
@@ -143,13 +128,11 @@ dependencies:
|
|
143
128
|
- - '>='
|
144
129
|
- !ruby/object:Gem::Version
|
145
130
|
version: '0'
|
146
|
-
none: false
|
147
131
|
requirement: !ruby/object:Gem::Requirement
|
148
132
|
requirements:
|
149
133
|
- - '>='
|
150
134
|
- !ruby/object:Gem::Version
|
151
135
|
version: '0'
|
152
|
-
none: false
|
153
136
|
prerelease: false
|
154
137
|
type: :development
|
155
138
|
description:
|
@@ -161,7 +144,7 @@ extensions: []
|
|
161
144
|
extra_rdoc_files:
|
162
145
|
- doc/batch_execution.md
|
163
146
|
- doc/extending.md
|
164
|
-
- doc/macros.md
|
147
|
+
- doc/indexing_rules.md
|
165
148
|
- doc/other_commands.md
|
166
149
|
- doc/settings.md
|
167
150
|
files:
|
@@ -176,7 +159,7 @@ files:
|
|
176
159
|
- bin/traject
|
177
160
|
- doc/batch_execution.md
|
178
161
|
- doc/extending.md
|
179
|
-
- doc/macros.md
|
162
|
+
- doc/indexing_rules.md
|
180
163
|
- doc/other_commands.md
|
181
164
|
- doc/settings.md
|
182
165
|
- lib/tasks/load_maps.rake
|
@@ -285,6 +268,7 @@ files:
|
|
285
268
|
homepage: http://github.com/jrochkind/traject
|
286
269
|
licenses:
|
287
270
|
- MIT
|
271
|
+
metadata: {}
|
288
272
|
post_install_message:
|
289
273
|
rdoc_options: []
|
290
274
|
require_paths:
|
@@ -293,25 +277,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
293
277
|
requirements:
|
294
278
|
- - '>='
|
295
279
|
- !ruby/object:Gem::Version
|
296
|
-
segments:
|
297
|
-
- 0
|
298
|
-
hash: 2
|
299
280
|
version: '0'
|
300
|
-
none: false
|
301
281
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
302
282
|
requirements:
|
303
283
|
- - '>='
|
304
284
|
- !ruby/object:Gem::Version
|
305
|
-
segments:
|
306
|
-
- 0
|
307
|
-
hash: 2
|
308
285
|
version: '0'
|
309
|
-
none: false
|
310
286
|
requirements: []
|
311
287
|
rubyforge_project:
|
312
|
-
rubygems_version: 1.
|
288
|
+
rubygems_version: 2.1.5
|
313
289
|
signing_key:
|
314
|
-
specification_version:
|
290
|
+
specification_version: 4
|
315
291
|
summary: Index MARC to Solr; or generally process source records to hash-like structures
|
316
292
|
test_files:
|
317
293
|
- test/debug_writer_test.rb
|
data/doc/macros.md
DELETED
@@ -1,103 +0,0 @@
|
|
1
|
-
# Traject Indexing 'Macros'
|
2
|
-
|
3
|
-
Traject macros are a way of providing re-usable index mapping rules. Before we discuss how they work, we need to remind ourselves of the basic/direct Traject `to_field` indexing method.
|
4
|
-
|
5
|
-
## Review and details of direct indexing logic
|
6
|
-
|
7
|
-
Here's the simplest possible direct Traject mapping logic, duplicating the effects of the `literal` function:
|
8
|
-
|
9
|
-
~~~ruby
|
10
|
-
to_field("title") do |record, accumulator, context|
|
11
|
-
accumulator << "FIXED LITERAL"
|
12
|
-
end
|
13
|
-
~~~
|
14
|
-
|
15
|
-
That `do` is just ruby `block` syntax, whereby we can pass a block of ruby code as an argument to a ruby method. We pass a block taking three arguments, labelled `record`, `accumulator`, and `context`, to the `to_field` method.
|
16
|
-
|
17
|
-
The block is then stored by the Traject::Indexer, and called for each record indexed. When it's called, it's passed the particular record at hand for the first argument, an Array used as an 'accumulator' as the second argument, and a Traject::Indexer::Context as the third argument.
|
18
|
-
|
19
|
-
The code in the block can add values to the accumulator array, which the Traject::Indexer then adds to the field specified by `to_field`.
|
20
|
-
|
21
|
-
It's also worth pointing out that ruby blocks are `closures`, so they can "capture" and use values from outside the block. So this would work too:
|
22
|
-
|
23
|
-
~~~ruby
|
24
|
-
my_var = "FIXED LITERAL"
|
25
|
-
to_field("title") do |record, accumulator, context|
|
26
|
-
accumulator << my_var
|
27
|
-
end
|
28
|
-
~~~
|
29
|
-
|
30
|
-
So that's the way to provide direct logic for mapping rules.
|
31
|
-
|
32
|
-
## Macros
|
33
|
-
|
34
|
-
A Traject macro is a way to automatically create indexing rules via re-usable "templates".
|
35
|
-
|
36
|
-
Traject macros are simply methods that return ruby lambda/proc objects. A ruby lambda is just another syntax for creating blocks of ruby logic that can be passed around as data.
|
37
|
-
|
38
|
-
So, for instance, we could capture that fixed literal block in a lambda like this:
|
39
|
-
|
40
|
-
~~~ruby
|
41
|
-
always_add_black = lambda do |record, accumulator, context|
|
42
|
-
accumulator << "BLACK"
|
43
|
-
end
|
44
|
-
~~~
|
45
|
-
|
46
|
-
Then, knowing that the `to_field` ruby method takes a block, we can use the ruby `&` operator
|
47
|
-
to convert our lambda to a block argument. This would in fact work:
|
48
|
-
|
49
|
-
~~~ruby
|
50
|
-
to_field "color", &always_add_black
|
51
|
-
~~~
|
52
|
-
|
53
|
-
However, for convenience, the `to_field` method can take a lambda directly (without having to use '&' to convert it to a block argument) as a second argument too. So this would work too:
|
54
|
-
|
55
|
-
~~~ruby
|
56
|
-
to_field "color", always_add_black
|
57
|
-
~~~
|
58
|
-
|
59
|
-
A macro is just one more step, using a method to create lambdas dynamically: A Traject macro is just a ruby method that **returns** a lambda, a three-arg lambda like `to_field` wants.
|
60
|
-
|
61
|
-
Here is in fact how the `literal` function is implemented:
|
62
|
-
|
63
|
-
~~~ruby
|
64
|
-
def literal(value)
|
65
|
-
return lambda do |record, accumulator, context|
|
66
|
-
# because a lambda is a closure, we can define it in terms
|
67
|
-
# of the 'value' from the scope it's defined in!
|
68
|
-
accumulator << value
|
69
|
-
end
|
70
|
-
end
|
71
|
-
to_field("something", literal("something"))
|
72
|
-
~~~
|
73
|
-
|
74
|
-
It's really as simple as that, that's all a Traject macro is. A function that takes parameters, and based on those parameters returns a lambda; the lambda is then passed to the `to_field` indexing method, or similar methods.
|
75
|
-
|
76
|
-
How do you make these methods available to the indexer?
|
77
|
-
|
78
|
-
Define it in a module:
|
79
|
-
|
80
|
-
~~~ruby
|
81
|
-
# in a file literal_macro.rb
|
82
|
-
module LiteralMacro
|
83
|
-
def literal(value)
|
84
|
-
return lambda do |record, accumulator, context|
|
85
|
-
# because a lambda is a closure, we can define it in terms
|
86
|
-
# of the 'value' from the scope it's defined in!
|
87
|
-
accumulator << value
|
88
|
-
end
|
89
|
-
end
|
90
|
-
end
|
91
|
-
~~~
|
92
|
-
|
93
|
-
And then use ordinary ruby `require` and `extend` to add it to the current Indexer file, by simply including this
|
94
|
-
in one of your config files:
|
95
|
-
|
96
|
-
~~~
|
97
|
-
require 'literal_macro'
|
98
|
-
extend LiteralMacro
|
99
|
-
|
100
|
-
to_field ...
|
101
|
-
~~~
|
102
|
-
|
103
|
-
That's it. You can use the traject command line `-I` option to set the ruby load path, so your file will be findable via `require`. Or you can distribute it in a gem, and use straight rubygems and the `gem` command in your configuration file, or Bundler with traject command-line `-g` option.
|