traject 0.16.0 → 0.17.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +1 -0
  3. data/README.md +183 -191
  4. data/bench/bench.rb +1 -1
  5. data/doc/batch_execution.md +14 -0
  6. data/doc/extending.md +14 -12
  7. data/doc/indexing_rules.md +265 -0
  8. data/lib/traject/command_line.rb +12 -41
  9. data/lib/traject/debug_writer.rb +32 -13
  10. data/lib/traject/indexer.rb +101 -24
  11. data/lib/traject/indexer/settings.rb +18 -17
  12. data/lib/traject/json_writer.rb +32 -11
  13. data/lib/traject/line_writer.rb +6 -6
  14. data/lib/traject/macros/basic.rb +1 -1
  15. data/lib/traject/macros/marc21.rb +17 -13
  16. data/lib/traject/macros/marc21_semantics.rb +27 -25
  17. data/lib/traject/macros/marc_format_classifier.rb +39 -25
  18. data/lib/traject/marc4j_reader.rb +36 -22
  19. data/lib/traject/marc_extractor.rb +79 -75
  20. data/lib/traject/marc_reader.rb +33 -25
  21. data/lib/traject/mock_reader.rb +9 -10
  22. data/lib/traject/ndj_reader.rb +7 -7
  23. data/lib/traject/null_writer.rb +1 -1
  24. data/lib/traject/qualified_const_get.rb +12 -2
  25. data/lib/traject/solrj_writer.rb +61 -52
  26. data/lib/traject/thread_pool.rb +45 -45
  27. data/lib/traject/translation_map.rb +59 -27
  28. data/lib/traject/util.rb +3 -3
  29. data/lib/traject/version.rb +1 -1
  30. data/lib/traject/yaml_writer.rb +1 -1
  31. data/test/debug_writer_test.rb +7 -7
  32. data/test/indexer/each_record_test.rb +4 -4
  33. data/test/indexer/macros_marc21_semantics_test.rb +12 -12
  34. data/test/indexer/macros_marc21_test.rb +10 -10
  35. data/test/indexer/macros_test.rb +1 -1
  36. data/test/indexer/map_record_test.rb +6 -6
  37. data/test/indexer/read_write_test.rb +43 -4
  38. data/test/indexer/settings_test.rb +2 -2
  39. data/test/indexer/to_field_test.rb +8 -8
  40. data/test/marc4j_reader_test.rb +4 -4
  41. data/test/marc_extractor_test.rb +33 -25
  42. data/test/marc_format_classifier_test.rb +3 -3
  43. data/test/marc_reader_test.rb +2 -2
  44. data/test/test_helper.rb +3 -3
  45. data/test/test_support/demo_config.rb +52 -48
  46. data/test/translation_map_test.rb +22 -4
  47. data/test/translation_maps/bad_ruby.rb +2 -2
  48. data/test/translation_maps/both_map.rb +1 -1
  49. data/test/translation_maps/default_literal.rb +1 -1
  50. data/test/translation_maps/default_passthrough.rb +1 -1
  51. data/test/translation_maps/ruby_map.rb +1 -1
  52. metadata +7 -31
  53. data/doc/macros.md +0 -103
@@ -37,7 +37,15 @@ describe "TranslationMap" do
37
37
  map = Traject::TranslationMap.new("default_literal")
38
38
  map = Traject::TranslationMap.new("default_literal")
39
39
 
40
- assert_equal "DEFAULT LITERAL", map["not in the map"]
40
+ assert_equal "DEFAULT LITERAL", map["not in the map"]
41
+ end
42
+
43
+ it "does not trigger default on explicit nil result" do
44
+ map = Traject::TranslationMap.new({"alpha" => "one", "beta" => nil}, :default => "DEFAULT")
45
+
46
+ assert_equal "one", map["alpha"]
47
+ assert_nil map["beta"]
48
+ assert_equal "DEFAULT", map["not_found_in_map"]
41
49
  end
42
50
 
43
51
  it "finds .rb over .yaml" do
@@ -80,12 +88,12 @@ describe "TranslationMap" do
80
88
  assert_equal "value1", map["key1"]
81
89
  end
82
90
 
83
- it "finds .properties defn" do
91
+ it "finds .properties defn" do
84
92
  map =Traject::TranslationMap.new("properties_map")
85
93
 
86
94
  assert_equal "Value1", map["key1"]
87
95
  assert_equal "Value2", map["key2"]
88
- assert_equal "Multi word value", map["key3"]
96
+ assert_equal "Multi word value", map["key3"]
89
97
  end
90
98
 
91
99
  it "can use a hash instance too" do
@@ -117,6 +125,16 @@ describe "TranslationMap" do
117
125
  assert_equal ["hola", "first", "second", "last thing", "buenas noches", "hola", "everything else"], arr
118
126
  end
119
127
 
128
+ it "translate_array does not include nil values" do
129
+ # TranslationMap can explicitly map to nil, meaning,
130
+ # well, map to nothing. Make sure translate_array respects that.
131
+ map = Traject::TranslationMap.new("alpha" => "one", "beta" => nil)
132
+
133
+ values = map.translate_array(["alpha", "beta"])
134
+
135
+ assert_equal ["one"], values
136
+ end
137
+
120
138
  it "#to_hash" do
121
139
  map = Traject::TranslationMap.new("yaml_map")
122
140
 
@@ -129,4 +147,4 @@ describe "TranslationMap" do
129
147
  refute_same hash, map.to_hash, "each #to_hash result is a copy"
130
148
  end
131
149
 
132
- end
150
+ end
@@ -1,8 +1,8 @@
1
- # this is not good ruby code at all.
1
+ # this is not good ruby code at all.
2
2
 
3
3
  x+y
4
4
 
5
5
  This is
6
6
  not valid ruby
7
7
  * if anything
8
- * it's more like markdown
8
+ * it's more like markdown
@@ -1 +1 @@
1
- { "ruby" => "ruby" }
1
+ { "ruby" => "ruby" }
@@ -7,4 +7,4 @@ some_hash = {
7
7
  some_hash["__default__"] = "DEFAULT LITERAL"
8
8
 
9
9
  # can be other ruby here, last line needs to evaluate as a Hash
10
- some_hash
10
+ some_hash
@@ -7,4 +7,4 @@ some_hash = {
7
7
  some_hash["__default__"] = "__passthrough__"
8
8
 
9
9
  # can be other ruby here, last line needs to evaluate as a Hash
10
- some_hash
10
+ some_hash
@@ -7,4 +7,4 @@ some_hash = {
7
7
  some_hash["also"] = "this"
8
8
 
9
9
  # can be other ruby here, last line needs to evaluate as a Hash
10
- some_hash
10
+ some_hash
metadata CHANGED
@@ -1,8 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 0.16.0
4
+ version: 0.17.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Jonathan Rochkind
@@ -10,7 +9,7 @@ authors:
10
9
  autorequire:
11
10
  bindir: bin
12
11
  cert_chain: []
13
- date: 2013-09-30 00:00:00.000000000 Z
12
+ date: 2013-10-10 00:00:00.000000000 Z
14
13
  dependencies:
15
14
  - !ruby/object:Gem::Dependency
16
15
  name: marc
@@ -19,13 +18,11 @@ dependencies:
19
18
  - - '>='
20
19
  - !ruby/object:Gem::Version
21
20
  version: 0.7.1
22
- none: false
23
21
  requirement: !ruby/object:Gem::Requirement
24
22
  requirements:
25
23
  - - '>='
26
24
  - !ruby/object:Gem::Version
27
25
  version: 0.7.1
28
- none: false
29
26
  prerelease: false
30
27
  type: :runtime
31
28
  - !ruby/object:Gem::Dependency
@@ -35,13 +32,11 @@ dependencies:
35
32
  - - '>='
36
33
  - !ruby/object:Gem::Version
37
34
  version: 0.1.1
38
- none: false
39
35
  requirement: !ruby/object:Gem::Requirement
40
36
  requirements:
41
37
  - - '>='
42
38
  - !ruby/object:Gem::Version
43
39
  version: 0.1.1
44
- none: false
45
40
  prerelease: false
46
41
  type: :runtime
47
42
  - !ruby/object:Gem::Dependency
@@ -54,7 +49,6 @@ dependencies:
54
49
  - - <
55
50
  - !ruby/object:Gem::Version
56
51
  version: '2.1'
57
- none: false
58
52
  requirement: !ruby/object:Gem::Requirement
59
53
  requirements:
60
54
  - - '>='
@@ -63,7 +57,6 @@ dependencies:
63
57
  - - <
64
58
  - !ruby/object:Gem::Version
65
59
  version: '2.1'
66
- none: false
67
60
  prerelease: false
68
61
  type: :runtime
69
62
  - !ruby/object:Gem::Dependency
@@ -76,7 +69,6 @@ dependencies:
76
69
  - - <
77
70
  - !ruby/object:Gem::Version
78
71
  version: '4.0'
79
- none: false
80
72
  requirement: !ruby/object:Gem::Requirement
81
73
  requirements:
82
74
  - - '>='
@@ -85,7 +77,6 @@ dependencies:
85
77
  - - <
86
78
  - !ruby/object:Gem::Version
87
79
  version: '4.0'
88
- none: false
89
80
  prerelease: false
90
81
  type: :runtime
91
82
  - !ruby/object:Gem::Dependency
@@ -95,13 +86,11 @@ dependencies:
95
86
  - - '>='
96
87
  - !ruby/object:Gem::Version
97
88
  version: '0'
98
- none: false
99
89
  requirement: !ruby/object:Gem::Requirement
100
90
  requirements:
101
91
  - - '>='
102
92
  - !ruby/object:Gem::Version
103
93
  version: '0'
104
- none: false
105
94
  prerelease: false
106
95
  type: :runtime
107
96
  - !ruby/object:Gem::Dependency
@@ -111,13 +100,11 @@ dependencies:
111
100
  - - ~>
112
101
  - !ruby/object:Gem::Version
113
102
  version: '1.3'
114
- none: false
115
103
  requirement: !ruby/object:Gem::Requirement
116
104
  requirements:
117
105
  - - ~>
118
106
  - !ruby/object:Gem::Version
119
107
  version: '1.3'
120
- none: false
121
108
  prerelease: false
122
109
  type: :development
123
110
  - !ruby/object:Gem::Dependency
@@ -127,13 +114,11 @@ dependencies:
127
114
  - - '>='
128
115
  - !ruby/object:Gem::Version
129
116
  version: '0'
130
- none: false
131
117
  requirement: !ruby/object:Gem::Requirement
132
118
  requirements:
133
119
  - - '>='
134
120
  - !ruby/object:Gem::Version
135
121
  version: '0'
136
- none: false
137
122
  prerelease: false
138
123
  type: :development
139
124
  - !ruby/object:Gem::Dependency
@@ -143,13 +128,11 @@ dependencies:
143
128
  - - '>='
144
129
  - !ruby/object:Gem::Version
145
130
  version: '0'
146
- none: false
147
131
  requirement: !ruby/object:Gem::Requirement
148
132
  requirements:
149
133
  - - '>='
150
134
  - !ruby/object:Gem::Version
151
135
  version: '0'
152
- none: false
153
136
  prerelease: false
154
137
  type: :development
155
138
  description:
@@ -161,7 +144,7 @@ extensions: []
161
144
  extra_rdoc_files:
162
145
  - doc/batch_execution.md
163
146
  - doc/extending.md
164
- - doc/macros.md
147
+ - doc/indexing_rules.md
165
148
  - doc/other_commands.md
166
149
  - doc/settings.md
167
150
  files:
@@ -176,7 +159,7 @@ files:
176
159
  - bin/traject
177
160
  - doc/batch_execution.md
178
161
  - doc/extending.md
179
- - doc/macros.md
162
+ - doc/indexing_rules.md
180
163
  - doc/other_commands.md
181
164
  - doc/settings.md
182
165
  - lib/tasks/load_maps.rake
@@ -285,6 +268,7 @@ files:
285
268
  homepage: http://github.com/jrochkind/traject
286
269
  licenses:
287
270
  - MIT
271
+ metadata: {}
288
272
  post_install_message:
289
273
  rdoc_options: []
290
274
  require_paths:
@@ -293,25 +277,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
293
277
  requirements:
294
278
  - - '>='
295
279
  - !ruby/object:Gem::Version
296
- segments:
297
- - 0
298
- hash: 2
299
280
  version: '0'
300
- none: false
301
281
  required_rubygems_version: !ruby/object:Gem::Requirement
302
282
  requirements:
303
283
  - - '>='
304
284
  - !ruby/object:Gem::Version
305
- segments:
306
- - 0
307
- hash: 2
308
285
  version: '0'
309
- none: false
310
286
  requirements: []
311
287
  rubyforge_project:
312
- rubygems_version: 1.8.24
288
+ rubygems_version: 2.1.5
313
289
  signing_key:
314
- specification_version: 3
290
+ specification_version: 4
315
291
  summary: Index MARC to Solr; or generally process source records to hash-like structures
316
292
  test_files:
317
293
  - test/debug_writer_test.rb
@@ -1,103 +0,0 @@
1
- # Traject Indexing 'Macros'
2
-
3
- Traject macros are a way of providing re-usable index mapping rules. Before we discuss how they work, we need to remind ourselves of the basic/direct Traject `to_field` indexing method.
4
-
5
- ## Review and details of direct indexing logic
6
-
7
- Here's the simplest possible direct Traject mapping logic, duplicating the effects of the `literal` function:
8
-
9
- ~~~ruby
10
- to_field("title") do |record, accumulator, context|
11
- accumulator << "FIXED LITERAL"
12
- end
13
- ~~~
14
-
15
- That `do` is just ruby `block` syntax, whereby we can pass a block of ruby code as an argument to a ruby method. We pass a block taking three arguments, labelled `record`, `accumulator`, and `context`, to the `to_field` method.
16
-
17
- The block is then stored by the Traject::Indexer, and called for each record indexed. When it's called, it's passed the particular record at hand for the first argument, an Array used as an 'accumulator' as the second argument, and a Traject::Indexer::Context as the third argument.
18
-
19
- The code in the block can add values to the accumulator array, which the Traject::Indexer then adds to the field specified by `to_field`.
20
-
21
- It's also worth pointing out that ruby blocks are `closures`, so they can "capture" and use values from outside the block. So this would work too:
22
-
23
- ~~~ruby
24
- my_var = "FIXED LITERAL"
25
- to_field("title") do |record, accumulator, context|
26
- accumulator << my_var
27
- end
28
- ~~~
29
-
30
- So that's the way to provide direct logic for mapping rules.
31
-
32
- ## Macros
33
-
34
- A Traject macro is a way to automatically create indexing rules via re-usable "templates".
35
-
36
- Traject macros are simply methods that return ruby lambda/proc objects. A ruby lambda is just another syntax for creating blocks of ruby logic that can be passed around as data.
37
-
38
- So, for instance, we could capture that fixed literal block in a lambda like this:
39
-
40
- ~~~ruby
41
- always_add_black = lambda do |record, accumulator, context|
42
- accumulator << "BLACK"
43
- end
44
- ~~~
45
-
46
- Then, knowing that the `to_field` ruby method takes a block, we can use the ruby `&` operator
47
- to convert our lambda to a block argument. This would in fact work:
48
-
49
- ~~~ruby
50
- to_field "color", &always_add_black
51
- ~~~
52
-
53
- However, for convenience, the `to_field` method can take a lambda directly (without having to use '&' to convert it to a block argument) as a second argument too. So this would work too:
54
-
55
- ~~~ruby
56
- to_field "color", always_add_black
57
- ~~~
58
-
59
- A macro is just one more step, using a method to create lambdas dynamically: A Traject macro is just a ruby method that **returns** a lambda, a three-arg lambda like `to_field` wants.
60
-
61
- Here is in fact how the `literal` function is implemented:
62
-
63
- ~~~ruby
64
- def literal(value)
65
- return lambda do |record, accumulator, context|
66
- # because a lambda is a closure, we can define it in terms
67
- # of the 'value' from the scope it's defined in!
68
- accumulator << value
69
- end
70
- end
71
- to_field "something", literal("something")
72
- ~~~
73
-
74
- It's really as simple as that, that's all a Traject macro is. A function that takes parameters, and based on those parameters returns a lambda; the lambda is then passed to the `to_field` indexing method, or similar methods.
75
-
76
- How do you make these methods available to the indexer?
77
-
78
- Define it in a module:
79
-
80
- ~~~ruby
81
- # in a file literal_macro.rb
82
- module LiteralMacro
83
- def literal(value)
84
- return lambda do |record, accumulator, context|
85
- # because a lambda is a closure, we can define it in terms
86
- # of the 'value' from the scope it's defined in!
87
- accumulator << value
88
- end
89
- end
90
- end
91
- ~~~
92
-
93
- And then use ordinary ruby `require` and `extend` to add it to the current Indexer file, by simply including this
94
- in one of your config files:
95
-
96
- ~~~
97
- require 'literal_macro.rb'
98
- extend LiteralMacro
99
-
100
- to_field ...
101
- ~~~
102
-
103
- That's it. You can use the traject command line `-I` option to set the ruby load path, so your file will be findable via `require`. Or you can distribute it in a gem, and use straight rubygems and the `gem` command in your configuration file, or Bundler with traject command-line `-g` option.