traject 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +27 -0
  4. data/.yardopts +3 -0
  5. data/Gemfile +12 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +461 -0
  8. data/Rakefile +21 -0
  9. data/bench/bench.rb +30 -0
  10. data/bin/traject +16 -0
  11. data/doc/batch_execution.md +243 -0
  12. data/doc/extending.md +190 -0
  13. data/doc/indexing_rules.md +265 -0
  14. data/doc/other_commands.md +47 -0
  15. data/doc/settings.md +101 -0
  16. data/lib/tasks/load_maps.rake +48 -0
  17. data/lib/traject.rb +11 -0
  18. data/lib/traject/command_line.rb +301 -0
  19. data/lib/traject/csv_writer.rb +34 -0
  20. data/lib/traject/debug_writer.rb +47 -0
  21. data/lib/traject/delimited_writer.rb +110 -0
  22. data/lib/traject/indexer.rb +613 -0
  23. data/lib/traject/indexer/settings.rb +110 -0
  24. data/lib/traject/json_writer.rb +51 -0
  25. data/lib/traject/line_writer.rb +63 -0
  26. data/lib/traject/macros/basic.rb +9 -0
  27. data/lib/traject/macros/marc21.rb +223 -0
  28. data/lib/traject/macros/marc21_semantics.rb +584 -0
  29. data/lib/traject/macros/marc_format_classifier.rb +197 -0
  30. data/lib/traject/marc_extractor.rb +410 -0
  31. data/lib/traject/marc_reader.rb +89 -0
  32. data/lib/traject/mock_reader.rb +97 -0
  33. data/lib/traject/ndj_reader.rb +40 -0
  34. data/lib/traject/null_writer.rb +22 -0
  35. data/lib/traject/qualified_const_get.rb +40 -0
  36. data/lib/traject/solr_json_writer.rb +277 -0
  37. data/lib/traject/thread_pool.rb +161 -0
  38. data/lib/traject/translation_map.rb +267 -0
  39. data/lib/traject/util.rb +52 -0
  40. data/lib/traject/version.rb +3 -0
  41. data/lib/traject/yaml_writer.rb +9 -0
  42. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  43. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  44. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  45. data/lib/translation_maps/marc_geographic.yaml +589 -0
  46. data/lib/translation_maps/marc_instruments.yaml +102 -0
  47. data/lib/translation_maps/marc_languages.yaml +490 -0
  48. data/test/debug_writer_test.rb +38 -0
  49. data/test/delimited_writer_test.rb +104 -0
  50. data/test/indexer/each_record_test.rb +59 -0
  51. data/test/indexer/macros_marc21_semantics_test.rb +391 -0
  52. data/test/indexer/macros_marc21_test.rb +190 -0
  53. data/test/indexer/macros_test.rb +40 -0
  54. data/test/indexer/map_record_test.rb +209 -0
  55. data/test/indexer/read_write_test.rb +101 -0
  56. data/test/indexer/settings_test.rb +152 -0
  57. data/test/indexer/to_field_test.rb +77 -0
  58. data/test/marc_extractor_test.rb +412 -0
  59. data/test/marc_format_classifier_test.rb +98 -0
  60. data/test/marc_reader_test.rb +110 -0
  61. data/test/solr_json_writer_test.rb +248 -0
  62. data/test/test_helper.rb +90 -0
  63. data/test/test_support/245_no_ab.marc +1 -0
  64. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  65. data/test/test_support/bad_subfield_code.marc +1 -0
  66. data/test/test_support/bad_utf_byte.utf8.marc +1 -0
  67. data/test/test_support/date_resort_to_260.marc +1 -0
  68. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  69. data/test/test_support/date_with_u.marc +1 -0
  70. data/test/test_support/demo_config.rb +155 -0
  71. data/test/test_support/emptyish_record.marc +1 -0
  72. data/test/test_support/escaped_character_reference.marc8.marc +1 -0
  73. data/test/test_support/george_eliot.marc +1 -0
  74. data/test/test_support/hebrew880s.marc +1 -0
  75. data/test/test_support/louis_armstrong.marc +1 -0
  76. data/test/test_support/manufacturing_consent.marc +1 -0
  77. data/test/test_support/manuscript_online_thesis.marc +1 -0
  78. data/test/test_support/microform_online_conference.marc +1 -0
  79. data/test/test_support/multi_era.marc +1 -0
  80. data/test/test_support/multi_geo.marc +1 -0
  81. data/test/test_support/musical_cage.marc +1 -0
  82. data/test/test_support/nature.marc +1 -0
  83. data/test/test_support/one-marc8.mrc +1 -0
  84. data/test/test_support/online_only.marc +1 -0
  85. data/test/test_support/packed_041a_lang.marc +1 -0
  86. data/test/test_support/test_data.utf8.json +30 -0
  87. data/test/test_support/test_data.utf8.marc.xml +2609 -0
  88. data/test/test_support/test_data.utf8.mrc +1 -0
  89. data/test/test_support/test_data.utf8.mrc.gz +0 -0
  90. data/test/test_support/the_business_ren.marc +1 -0
  91. data/test/translation_map_test.rb +225 -0
  92. data/test/translation_maps/bad_ruby.rb +8 -0
  93. data/test/translation_maps/bad_yaml.yaml +1 -0
  94. data/test/translation_maps/both_map.rb +1 -0
  95. data/test/translation_maps/both_map.yaml +1 -0
  96. data/test/translation_maps/default_literal.rb +10 -0
  97. data/test/translation_maps/default_passthrough.rb +10 -0
  98. data/test/translation_maps/marc_040a_translate_test.yaml +1 -0
  99. data/test/translation_maps/properties_map.properties +5 -0
  100. data/test/translation_maps/ruby_map.rb +10 -0
  101. data/test/translation_maps/translate_array_test.yaml +8 -0
  102. data/test/translation_maps/yaml_map.yaml +7 -0
  103. data/traject.gemspec +47 -0
  104. metadata +382 -0
@@ -0,0 +1,98 @@
1
+ require 'test_helper'
2
+
3
+ require 'traject/macros/marc_format_classifier'
4
+
5
+ MarcFormatClassifier = Traject::Macros::MarcFormatClassifier
6
+
7
+ def classifier_for(filename)
8
+ record = MARC::Reader.new(support_file_path filename).to_a.first
9
+ return MarcFormatClassifier.new( record )
10
+ end
11
+
12
+ describe "MarcFormatClassifier" do
13
+
14
+ it "returns 'Print' when there's no other data" do # #formats is expected to be exactly ['Print'] for an empty record
15
+ assert_equal ['Print'], MarcFormatClassifier.new( empty_record ).formats # empty_record is not defined in this file; presumably a test_helper helper -- confirm
16
+ end
17
+
18
+ describe "genre" do # each example loads one fixture via classifier_for and compares #genre with a one-element array
19
+ # We don't have the patience to test every case, just a sampling
20
+ it "says book" do
21
+ assert_equal ["Book"], classifier_for("manufacturing_consent.marc").genre
22
+ end
23
+ it "says Book for a weird one" do # same fixture the microform?, online? and conference? groups further down also use
24
+ assert_equal ["Book"], classifier_for("microform_online_conference.marc").genre
25
+ end
26
+ it "says Musical Recording" do
27
+ assert_equal ["Musical Recording"], classifier_for("musical_cage.marc").genre
28
+ end
29
+ it "says Journal" do # the value actually expected is the combined label "Journal/Newspaper"
30
+ assert_equal ["Journal/Newspaper"], classifier_for("the_business_ren.marc").genre
31
+ end
32
+ end
33
+
34
+
35
+ describe "print?" do
36
+ it "says print when it is" do
37
+ assert classifier_for("manufacturing_consent.marc").print?
38
+ end
39
+ it "does not say print for online only" do
40
+ assert ! classifier_for("online_only.marc").print?
41
+ end
42
+ end
43
+
44
+ describe "online?" do
45
+ it "says online when it is" do
46
+ assert classifier_for("online_only.marc").online?
47
+ assert classifier_for("microform_online_conference.marc").online?
48
+ assert classifier_for("manuscript_online_thesis.marc").online?
49
+ end
50
+ it "does not say online for a print only" do
51
+ assert ! classifier_for("manufacturing_consent.marc").online?
52
+ end
53
+ end
54
+
55
+ describe "microform?" do
56
+ it "says microform when it is" do
57
+ assert classifier_for("microform_online_conference.marc").microform?
58
+ end
59
+ it "does not say microform when it ain't" do
60
+ assert ! classifier_for("manufacturing_consent.marc").microform?
61
+ assert ! classifier_for("online_only.marc").microform?
62
+ end
63
+ it "catches microform in an 007" do
64
+ assert classifier_for("nature.marc").microform?
65
+ end
66
+ end
67
+
68
+ describe "conference?" do
69
+ it "says conference when it is" do
70
+ assert classifier_for("microform_online_conference.marc").proceeding?
71
+ end
72
+ it "does not say conference when it ain't" do
73
+ assert ! classifier_for("manufacturing_consent.marc").proceeding?
74
+ assert ! classifier_for("online_only.marc").proceeding?
75
+ end
76
+ end
77
+
78
+ describe "thesis?" do
79
+ it "says thesis when it is" do
80
+ assert classifier_for("manuscript_online_thesis.marc").thesis?
81
+ end
82
+ it "does not say thesis when it ain't" do
83
+ assert ! classifier_for("manufacturing_consent.marc").thesis?
84
+ assert ! classifier_for("online_only.marc").thesis?
85
+ end
86
+ end
87
+
88
+ describe "manuscript_archive?" do
89
+ it "says manuscript when it is" do
90
+ assert classifier_for("manuscript_online_thesis.marc").manuscript_archive?
91
+ end
92
+ it "does not say manuscript when it ain't" do
93
+ assert ! classifier_for("manufacturing_consent.marc").manuscript_archive?
94
+ assert ! classifier_for("online_only.marc").manuscript_archive?
95
+ end
96
+ end
97
+
98
+ end
@@ -0,0 +1,110 @@
1
+ # Encoding: UTF-8
2
+
3
+ require 'test_helper'
4
+ require 'traject/marc_reader'
5
+ require 'marc'
6
+
7
+ describe "Traject::MarcReader" do
8
+
9
+
10
+ it "reads XML" do # the input format is selected through the "marc_source.type" setting
11
+ file = File.new(support_file_path "test_data.utf8.marc.xml") # NOTE(review): this handle is never closed explicitly in the example
12
+ settings = Traject::Indexer::Settings.new("marc_source.type" => "xml")
13
+ reader = Traject::MarcReader.new(file, settings)
14
+
15
+ array = reader.to_a
16
+
17
+ assert_equal 30, array.length # the binary and JSON examples in this file expect the same count from their fixtures
18
+ end
19
+
20
+
21
+ describe "MARC binary" do
22
+ it "reads" do
23
+ file = File.new(support_file_path "test_data.utf8.mrc")
24
+ settings = Traject::Indexer::Settings.new() # binary type is default
25
+ reader = Traject::MarcReader.new(file, settings)
26
+
27
+ array = reader.to_a
28
+
29
+ assert_equal 30, array.length
30
+
31
+ first = array.first
32
+
33
+ assert_kind_of MARC::Record, first
34
+
35
+ assert first['245']['a'].encoding.name, "UTF-8"
36
+ assert_equal "Fikr-i Ayāz /", first['245']['a']
37
+ end
38
+
39
+ it "reads Marc binary in Marc8 encoding, transcoding to UTF-8" do
40
+ file = File.new(support_file_path("one-marc8.mrc"))
41
+ settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC-8")
42
+ reader = Traject::MarcReader.new(file, settings)
43
+
44
+ array = reader.to_a
45
+
46
+ assert_length 1, array
47
+
48
+
49
+ assert_kind_of MARC::Record, array.first
50
+ a245a = array.first['245']['a']
51
+
52
+ assert a245a.encoding.name, "UTF-8"
53
+ assert a245a.valid_encoding?
54
+ assert_equal "Por uma outra globalização :", a245a
55
+ end
56
+
57
+ it "replaces unicode character reference in Marc8 transcode" do # only the first record's 260$a is inspected
58
+ file = File.new(support_file_path("escaped_character_reference.marc8.marc"))
59
+
60
+ settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC-8") # binary type is default
61
+ record = Traject::MarcReader.new(file, settings).to_a.first
62
+
63
+ assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record['260']['a'] # the expected text contains a literal U+FFFD character
64
+ end
65
+
66
+ it "raises on unrecognized encoding for binary type" do
67
+ file = File.new(support_file_path "one-marc8.mrc")
68
+ settings = Traject::Indexer::Settings.new("marc_source.encoding" => "ADFADFADF")
69
+ assert_raises(ArgumentError) do
70
+ record = Traject::MarcReader.new(file, settings).to_a.first
71
+ end
72
+ end
73
+
74
+ it "replaces bad byte in UTF8 marc binary" do
75
+ file = File.new(support_file_path "bad_utf_byte.utf8.marc")
76
+
77
+ settings = Traject::Indexer::Settings.new() # binary type is default
78
+ reader = Traject::MarcReader.new(file, settings)
79
+
80
+ record = reader.to_a.first
81
+
82
+ value = record['300']['a']
83
+
84
+ assert_equal value.encoding.name, "UTF-8"
85
+ assert value.valid_encoding?, "Has valid encoding"
86
+ assert_equal "This is a bad byte: '\uFFFD' and another: '\uFFFD'", value
87
+ end
88
+ end
89
+
90
+ it "reads JSON" do
91
+ file = File.new(support_file_path "test_data.utf8.json")
92
+ settings = Traject::Indexer::Settings.new("marc_source.type" => "json")
93
+ reader = Traject::MarcReader.new(file, settings)
94
+ array = reader.to_a
95
+
96
+ assert_equal 30, array.length
97
+
98
+ first = array.first
99
+
100
+ assert_kind_of MARC::Record, first
101
+
102
+ assert first['245']['a'].encoding.name, "UTF-8"
103
+ assert_equal "Fikr-i Ayāz /", first['245']['a']
104
+ end
105
+
106
+
107
+
108
+
109
+
110
+ end
@@ -0,0 +1,248 @@
1
+ require 'test_helper'
2
+ require 'httpclient'
3
+ require 'traject/solr_json_writer'
4
+ require 'thread'
5
+ require 'json'
6
+ require 'stringio'
7
+ require 'logger'
8
+
9
+
10
+ # Some basic tests, using a mocked HTTPClient so we can see what it did --
11
+ # these tests do not run against a real solr server at present.
12
+ describe "Traject::SolrJsonWriter" do
13
+
14
+
15
+ #######
16
+ # A bunch of utilities to help testing
17
+ #######
18
+
19
+ class FakeHTTPClient # stand-in HTTP client: records every post/get argument list and answers with a canned status
20
+ # Always reply with this status, normally 200, can
21
+ # be reset for testing error conditions.
22
+ attr_accessor :response_status
23
+ attr_accessor :allow_update_json_path # when false, GETs whose first argument ends in /update/json are answered with 404
24
+
25
+ def initialize(*args) # constructor arguments are accepted and ignored
26
+ @post_args = []
27
+ @get_args = []
28
+ @response_status = 200
29
+ @allow_update_json_path = true
30
+ @mutex = Monitor.new # guards both recorded-argument arrays
31
+ end
32
+
33
+ def post(*args) # records args, then returns an empty-bodied response carrying response_status
34
+ @mutex.synchronize do
35
+ @post_args << args
36
+ end
37
+
38
+ resp = HTTP::Message.new_response("")
39
+ resp.status = response_status
40
+
41
+ resp
42
+ end
43
+
44
+ def get(*args) # records args, then returns an empty-bodied response; status may be forced to 404 below
45
+ @mutex.synchronize do
46
+ @get_args << args
47
+ end
48
+
49
+ resp = HTTP::Message.new_response("")
50
+ resp.status = response_status
51
+
52
+ if args.first.end_with?("/update/json") && !allow_update_json_path
53
+ # Need to test auto-detection of /update/json being available
54
+ resp.status = 404
55
+ end
56
+
57
+ resp
58
+ end
59
+
60
+ def post_args # returns a copy, so callers cannot mutate the recorded list
61
+ @mutex.synchronize do
62
+ @post_args.dup
63
+ end
64
+ end
65
+
66
+ def get_args # returns a copy, so callers cannot mutate the recorded list
67
+ @mutex.synchronize do
68
+ @get_args.dup
69
+ end
70
+ end
71
+
72
+ # Everything else, just return nil please
73
+ def method_missing(*args) # NOTE(review): no respond_to_missing? is defined alongside this catch-all
74
+ end
75
+ end
76
+
77
+
78
+ def context_with(hash) # wraps a plain hash as the :output_hash of a new Context, the object these tests hand to @writer.put
79
+ Traject::Indexer::Context.new(:output_hash => hash)
80
+ end
81
+
82
+ def create_writer(settings = {}) # builds a SolrJsonWriter from test defaults overridden by the given settings
83
+ settings = {
84
+ "solr.url" => "http://example.com/solr",
85
+ "solr_json_writer.http_client" => FakeHTTPClient.new
86
+ }.merge!(settings) # caller-supplied keys win over the two defaults above
87
+ @fake_http_client = settings["solr_json_writer.http_client"] # kept so examples can inspect recorded calls or change the canned status
88
+
89
+ writer = Traject::SolrJsonWriter.new(settings)
90
+
91
+ return writer
92
+ end
93
+
94
+ # strio = StringIO.new
95
+ # logger_to_strio(strio)
96
+ #
97
+ # Later check for strio.string for contents
98
+ def logger_to_strio(strio) # returns a stdlib Logger that writes to the given IO-like object
99
+ # Yell makes this hard, let's do it with an ordinary logger, think
100
+ # it's okay.
101
+ Logger.new(strio)
102
+ end
103
+
104
+ #########
105
+ # Actual tests
106
+ #########
107
+
108
+ before do # every example starts with a default writer; create_writer also sets @fake_http_client
109
+ @writer = create_writer
110
+ end
111
+
112
+ it "defaults to 1 bg thread" do # the default writer is built without any thread-pool setting
113
+ assert_equal 1, @writer.thread_pool_size
114
+ end
115
+
116
+ it "adds a document" do # put + close, then inspect the first POST recorded by the fake client
117
+ @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
118
+ @writer.close
119
+
120
+ post_args = @fake_http_client.post_args.first
121
+
122
+ refute_nil post_args
123
+
124
+ assert_equal "http://example.com/solr/update/json", post_args[0] # default solr.url from create_writer plus /update/json
125
+
126
+ refute_nil post_args[1]
127
+ posted_json = JSON.parse(post_args[1]) # second POST argument is parsed as the JSON body
128
+
129
+ assert_equal [{"id" => "one", "key" => ["value1", "value2"]}], posted_json # body is an array holding the one document
130
+ end
131
+
132
+ it "adds more than a batch in batches" do
133
+ (Traject::SolrJsonWriter::DEFAULT_BATCH_SIZE + 1).times do |i| # one document more than the default batch size
134
+ doc = {"id" => "doc_#{i}", "key" => "value"}
135
+ @writer.put context_with(doc)
136
+ end
137
+ @writer.close
138
+
139
+ post_args = @fake_http_client.post_args # each entry is the argument list of one POST; index 1 is the JSON body
140
+
141
+ assert_length 2, post_args, "Makes two posts to Solr for two batches"
142
+
143
+ assert_length Traject::SolrJsonWriter::DEFAULT_BATCH_SIZE, JSON.parse(post_args[0][1]), "first batch posted with batch size docs"
144
+ assert_length 1, JSON.parse(post_args[1][1]), "second batch posted with last remaining doc"
145
+ end
146
+
147
+ it "commits on close when set" do
148
+ @writer = create_writer("solr.url" => "http://example.com", "solr_writer.commit_on_close" => "true") # replaces the default writer and fake client
149
+ @writer.put context_with({"id" => "one", "key" => ["value1", "value2"]})
150
+ @writer.close
151
+
152
+ last_solr_get = @fake_http_client.get_args.last # the commit is expected to be the final GET recorded
153
+
154
+ assert_equal "http://example.com/update/json", last_solr_get[0]
155
+ assert_equal( {"commit" => "true"}, last_solr_get[1] ) # second GET argument carries the commit parameter
156
+ end
157
+
158
+ describe "skipped records" do
159
+ it "skips and reports under max_skipped" do
160
+ strio = StringIO.new
161
+ @writer = create_writer("solr_writer.max_skipped" => 10, "logger" => logger_to_strio(strio))
162
+ @fake_http_client.response_status = 500
163
+
164
+ 10.times do |i|
165
+ @writer.put context_with("id" => "doc_#{i}", "key" => "value")
166
+ end
167
+ @writer.close
168
+
169
+ assert_equal 10, @writer.skipped_record_count
170
+
171
+ logged = strio.string
172
+
173
+ 10.times do |i|
174
+ assert_match /ERROR.*Could not add record doc_#{i} at source file position : Solr error response: 500/, logged
175
+ end
176
+ end
177
+
178
+ it "raises when skipped more than max_skipped" do # six failing adds against a limit of five
179
+ @writer = create_writer("solr_writer.max_skipped" => 5)
180
+ @fake_http_client.response_status = 500
181
+
182
+ e = assert_raises(RuntimeError) do # close is inside the block too, so the raise may come from put or from close
183
+ 6.times do |i|
184
+ @writer.put context_with("id" => "doc_#{i}", "key" => "value")
185
+ end
186
+ @writer.close
187
+ end
188
+
189
+ assert_includes e.message, "Exceeded maximum number of skipped records"
190
+ end
191
+
192
+ it "raises on one skipped record when max_skipped is 0" do
193
+ @writer = create_writer("solr_writer.max_skipped" => 0)
194
+ @fake_http_client.response_status = 500
195
+
196
+ e = assert_raises(RuntimeError) do
197
+ @writer.put context_with("id" => "doc_1", "key" => "value")
198
+ @writer.close
199
+ end
200
+ end
201
+ end
202
+
203
+ describe "auto-discovers proper update path" do
204
+ it "finds /update/json" do # uses the default @writer; FakeHTTPClient allows /update/json unless told otherwise
205
+ assert_equal "http://example.com/solr/update/json", @writer.determine_solr_update_url
206
+ end
207
+
208
+ it "resorts to plain /update" do
209
+ @fake_http_client = FakeHTTPClient.new
210
+ @fake_http_client.allow_update_json_path = false # makes the fake answer 404 for GETs ending in /update/json
211
+
212
+ @writer = create_writer("solr.url" => "http://example.com/solr",
213
+ "solr_json_writer.http_client" => @fake_http_client) # create_writer re-assigns @fake_http_client to this same object
214
+
215
+ assert_equal "http://example.com/solr/update", @writer.determine_solr_update_url
216
+ end
217
+ end
218
+
219
+ describe "Record id from context" do # exercises @writer.record_id_from_context with a source record, an 'id', or both
220
+ before do
221
+ @record = MARC::Reader.new(support_file_path('test_data.utf8.mrc')).first
222
+ @context = Traject::Indexer::Context.new # each example sets source_record and/or output_hash['id'] itself
223
+ @writer = create_writer
224
+ @record_001 = " 00282214 " # from the mrc file
225
+ end
226
+
227
+ it "gets it from 001" do # only the source record is set
228
+ @context.source_record = @record
229
+ assert_equal @record_001, @writer.record_id_from_context(@context)
230
+ end
231
+
232
+ it "gets it from the id" do # only output_hash['id'] is set
233
+ @context.output_hash['id'] = 'the_record_id'
234
+ assert_equal 'the_record_id', @writer.record_id_from_context(@context)
235
+ end
236
+
237
+ it "gets it from both 001 and id" do
238
+ @context.output_hash['id'] = 'the_record_id'
239
+ @context.source_record = @record
240
+ assert_equal [@record_001, 'the_record_id'].join('/'), @writer.record_id_from_context(@context) # expected form is "<001>/<id>"
241
+ end
242
+
243
+
244
+
245
+ end
246
+
247
+
248
+ end