traject 1.1.0 → 2.0.0.rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +20 -0
- data/README.md +85 -73
- data/doc/batch_execution.md +2 -6
- data/doc/other_commands.md +3 -5
- data/doc/settings.md +27 -38
- data/lib/traject/command_line.rb +1 -1
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +29 -11
- data/lib/traject/indexer/settings.rb +39 -13
- data/lib/traject/line_writer.rb +10 -6
- data/lib/traject/marc_reader.rb +2 -1
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +38 -48
- data/lib/traject/translation_map.rb +3 -0
- data/lib/traject/util.rb +13 -51
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/marc_geographic.yaml +2 -2
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/read_write_test.rb +0 -22
- data/test/indexer/settings_test.rb +24 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +5 -3
- data/test/test_support/demo_config.rb +0 -5
- data/test/translation_map_test.rb +9 -0
- data/traject.gemspec +18 -5
- metadata +77 -87
- data/lib/traject/marc4j_reader.rb +0 -153
- data/lib/traject/solrj_writer.rb +0 -351
- data/test/marc4j_reader_test.rb +0 -136
- data/test/solrj_writer_test.rb +0 -209
- data/vendor/solrj/README +0 -8
- data/vendor/solrj/build.xml +0 -39
- data/vendor/solrj/ivy.xml +0 -16
- data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
- data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
- data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
- data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
- data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
- data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
- data/vendor/solrj/lib/noggit-0.5.jar +0 -0
- data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
- data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
- data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
- data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
data/test/test_helper.rb
CHANGED
@@ -47,9 +47,11 @@ def empty_record
|
|
47
47
|
rec
|
48
48
|
end
|
49
49
|
|
50
|
-
# pretends to be a
|
50
|
+
# pretends to be a Solr HTTPServer-like thing, just kind of mocks it up
|
51
51
|
# and records what happens and simulates errors in some cases.
|
52
52
|
class MockSolrServer
|
53
|
+
class Exception < RuntimeError;end
|
54
|
+
|
53
55
|
attr_accessor :things_added, :url, :committed, :parser, :shutted_down
|
54
56
|
|
55
57
|
def initialize(url)
|
@@ -61,12 +63,12 @@ class MockSolrServer
|
|
61
63
|
def add(thing)
|
62
64
|
@add_mutex.synchronize do # easy peasy threadsafety for our mock
|
63
65
|
if @url == "http://no.such.place"
|
64
|
-
raise
|
66
|
+
raise MockSolrServer::Exception.new("mock bad uri")
|
65
67
|
end
|
66
68
|
|
67
69
|
# simulate a multiple id error please
|
68
70
|
if [thing].flatten.find {|doc| doc.getField("id").getValueCount() != 1}
|
69
|
-
raise
|
71
|
+
raise MockSolrServer::Exception.new("mock non-1 size of 'id'")
|
70
72
|
else
|
71
73
|
things_added << thing
|
72
74
|
end
|
@@ -21,11 +21,6 @@ extend Traject::Macros::MarcFormats
|
|
21
21
|
# config files as you like, `traject -c one.rb -c two.rb -c etc.rb`
|
22
22
|
settings do
|
23
23
|
provide "solr.url", "http://solr.somewhere.edu:8983/solr/corename"
|
24
|
-
|
25
|
-
# Only if you need to connect to a Solr 1.x:
|
26
|
-
provide "solrj_writer.parser_class_name", "XMLResponseParser"
|
27
|
-
|
28
|
-
provide "solrj_writer.commit_on_close", true
|
29
24
|
end
|
30
25
|
|
31
26
|
# Extract first 001, then supply code block to add "bib_" prefix to it
|
@@ -104,6 +104,15 @@ describe "TranslationMap" do
|
|
104
104
|
assert_equal "output_value", map["input_value"]
|
105
105
|
end
|
106
106
|
|
107
|
+
it "can be initialized with another map" do
|
108
|
+
map = Traject::TranslationMap.new({"alpha" => "one", "beta" => nil}, :default => "DEFAULT")
|
109
|
+
|
110
|
+
new_map = Traject::TranslationMap.new(map)
|
111
|
+
|
112
|
+
assert_equal map.to_hash, new_map.to_hash
|
113
|
+
assert_equal map.default, new_map.default
|
114
|
+
end
|
115
|
+
|
107
116
|
it "respects __default__ literal" do
|
108
117
|
map = Traject::TranslationMap.new("default_literal")
|
109
118
|
|
data/traject.gemspec
CHANGED
@@ -20,12 +20,25 @@ Gem::Specification.new do |spec|
|
|
20
20
|
spec.extra_rdoc_files = spec.files.grep(%r{^doc/})
|
21
21
|
|
22
22
|
|
23
|
-
spec.add_dependency "
|
24
|
-
spec.add_dependency "marc
|
25
|
-
|
23
|
+
spec.add_dependency "concurrent-ruby", ">= 0.8.0"
|
24
|
+
spec.add_dependency "marc", "~> 1.0"
|
25
|
+
|
26
|
+
spec.add_dependency "hashie", "~> 3.1" # used for Indexer#settings
|
26
27
|
spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
|
27
|
-
spec.add_dependency "yell"
|
28
|
-
spec.add_dependency "dot-properties", ">= 0.1.1"
|
28
|
+
spec.add_dependency "yell" # logging
|
29
|
+
spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
|
30
|
+
spec.add_dependency "httpclient", "~> 2.5"
|
31
|
+
|
32
|
+
# If we're building the package under JRuby, add in the
|
33
|
+
# jruby-only gems and specify the platform.
|
34
|
+
|
35
|
+
if defined? JRUBY_VERSION
|
36
|
+
spec.platform = 'java'
|
37
|
+
spec.add_dependency "traject-marc4j_reader", "~> 1.0"
|
38
|
+
else
|
39
|
+
spec.platform = "ruby"
|
40
|
+
end
|
41
|
+
|
29
42
|
|
30
43
|
spec.add_development_dependency "bundler", "~> 1.3"
|
31
44
|
spec.add_development_dependency "rake"
|
metadata
CHANGED
@@ -1,155 +1,163 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0.rc.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
8
8
|
- Bill Dueber
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2015-02-11 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
+
name: concurrent-ruby
|
15
16
|
requirement: !ruby/object:Gem::Requirement
|
16
17
|
requirements:
|
17
|
-
- -
|
18
|
+
- - ">="
|
18
19
|
- !ruby/object:Gem::Version
|
19
20
|
version: 0.8.0
|
20
|
-
name: marc
|
21
|
-
prerelease: false
|
22
21
|
type: :runtime
|
22
|
+
prerelease: false
|
23
23
|
version_requirements: !ruby/object:Gem::Requirement
|
24
24
|
requirements:
|
25
|
-
- -
|
25
|
+
- - ">="
|
26
26
|
- !ruby/object:Gem::Version
|
27
27
|
version: 0.8.0
|
28
28
|
- !ruby/object:Gem::Dependency
|
29
|
+
name: marc
|
29
30
|
requirement: !ruby/object:Gem::Requirement
|
30
31
|
requirements:
|
31
|
-
- -
|
32
|
+
- - "~>"
|
32
33
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
34
|
-
name: marc-marc4j
|
35
|
-
prerelease: false
|
34
|
+
version: '1.0'
|
36
35
|
type: :runtime
|
36
|
+
prerelease: false
|
37
37
|
version_requirements: !ruby/object:Gem::Requirement
|
38
38
|
requirements:
|
39
|
-
- -
|
39
|
+
- - "~>"
|
40
40
|
- !ruby/object:Gem::Version
|
41
|
-
version:
|
41
|
+
version: '1.0'
|
42
42
|
- !ruby/object:Gem::Dependency
|
43
|
+
name: hashie
|
43
44
|
requirement: !ruby/object:Gem::Requirement
|
44
45
|
requirements:
|
45
|
-
- -
|
46
|
+
- - "~>"
|
46
47
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
48
|
-
- - <
|
49
|
-
- !ruby/object:Gem::Version
|
50
|
-
version: '2.1'
|
51
|
-
name: hashie
|
52
|
-
prerelease: false
|
48
|
+
version: '3.1'
|
53
49
|
type: :runtime
|
50
|
+
prerelease: false
|
54
51
|
version_requirements: !ruby/object:Gem::Requirement
|
55
52
|
requirements:
|
56
|
-
- -
|
57
|
-
- !ruby/object:Gem::Version
|
58
|
-
version: 2.0.5
|
59
|
-
- - <
|
53
|
+
- - "~>"
|
60
54
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
55
|
+
version: '3.1'
|
62
56
|
- !ruby/object:Gem::Dependency
|
57
|
+
name: slop
|
63
58
|
requirement: !ruby/object:Gem::Requirement
|
64
59
|
requirements:
|
65
|
-
- -
|
60
|
+
- - ">="
|
66
61
|
- !ruby/object:Gem::Version
|
67
62
|
version: 3.4.5
|
68
|
-
- - <
|
63
|
+
- - "<"
|
69
64
|
- !ruby/object:Gem::Version
|
70
65
|
version: '4.0'
|
71
|
-
name: slop
|
72
|
-
prerelease: false
|
73
66
|
type: :runtime
|
67
|
+
prerelease: false
|
74
68
|
version_requirements: !ruby/object:Gem::Requirement
|
75
69
|
requirements:
|
76
|
-
- -
|
70
|
+
- - ">="
|
77
71
|
- !ruby/object:Gem::Version
|
78
72
|
version: 3.4.5
|
79
|
-
- - <
|
73
|
+
- - "<"
|
80
74
|
- !ruby/object:Gem::Version
|
81
75
|
version: '4.0'
|
82
76
|
- !ruby/object:Gem::Dependency
|
77
|
+
name: yell
|
83
78
|
requirement: !ruby/object:Gem::Requirement
|
84
79
|
requirements:
|
85
|
-
- -
|
80
|
+
- - ">="
|
86
81
|
- !ruby/object:Gem::Version
|
87
82
|
version: '0'
|
88
|
-
name: yell
|
89
|
-
prerelease: false
|
90
83
|
type: :runtime
|
84
|
+
prerelease: false
|
91
85
|
version_requirements: !ruby/object:Gem::Requirement
|
92
86
|
requirements:
|
93
|
-
- -
|
87
|
+
- - ">="
|
94
88
|
- !ruby/object:Gem::Version
|
95
89
|
version: '0'
|
96
90
|
- !ruby/object:Gem::Dependency
|
91
|
+
name: dot-properties
|
97
92
|
requirement: !ruby/object:Gem::Requirement
|
98
93
|
requirements:
|
99
|
-
- -
|
94
|
+
- - ">="
|
100
95
|
- !ruby/object:Gem::Version
|
101
96
|
version: 0.1.1
|
102
|
-
name: dot-properties
|
103
|
-
prerelease: false
|
104
97
|
type: :runtime
|
98
|
+
prerelease: false
|
105
99
|
version_requirements: !ruby/object:Gem::Requirement
|
106
100
|
requirements:
|
107
|
-
- -
|
101
|
+
- - ">="
|
108
102
|
- !ruby/object:Gem::Version
|
109
103
|
version: 0.1.1
|
110
104
|
- !ruby/object:Gem::Dependency
|
105
|
+
name: httpclient
|
111
106
|
requirement: !ruby/object:Gem::Requirement
|
112
107
|
requirements:
|
113
|
-
- - ~>
|
108
|
+
- - "~>"
|
114
109
|
- !ruby/object:Gem::Version
|
115
|
-
version: '
|
116
|
-
|
110
|
+
version: '2.5'
|
111
|
+
type: :runtime
|
117
112
|
prerelease: false
|
113
|
+
version_requirements: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '2.5'
|
118
|
+
- !ruby/object:Gem::Dependency
|
119
|
+
name: bundler
|
120
|
+
requirement: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '1.3'
|
118
125
|
type: :development
|
126
|
+
prerelease: false
|
119
127
|
version_requirements: !ruby/object:Gem::Requirement
|
120
128
|
requirements:
|
121
|
-
- - ~>
|
129
|
+
- - "~>"
|
122
130
|
- !ruby/object:Gem::Version
|
123
131
|
version: '1.3'
|
124
132
|
- !ruby/object:Gem::Dependency
|
133
|
+
name: rake
|
125
134
|
requirement: !ruby/object:Gem::Requirement
|
126
135
|
requirements:
|
127
|
-
- -
|
136
|
+
- - ">="
|
128
137
|
- !ruby/object:Gem::Version
|
129
138
|
version: '0'
|
130
|
-
name: rake
|
131
|
-
prerelease: false
|
132
139
|
type: :development
|
140
|
+
prerelease: false
|
133
141
|
version_requirements: !ruby/object:Gem::Requirement
|
134
142
|
requirements:
|
135
|
-
- -
|
143
|
+
- - ">="
|
136
144
|
- !ruby/object:Gem::Version
|
137
145
|
version: '0'
|
138
146
|
- !ruby/object:Gem::Dependency
|
147
|
+
name: minitest
|
139
148
|
requirement: !ruby/object:Gem::Requirement
|
140
149
|
requirements:
|
141
|
-
- -
|
150
|
+
- - ">="
|
142
151
|
- !ruby/object:Gem::Version
|
143
152
|
version: '0'
|
144
|
-
name: minitest
|
145
|
-
prerelease: false
|
146
153
|
type: :development
|
154
|
+
prerelease: false
|
147
155
|
version_requirements: !ruby/object:Gem::Requirement
|
148
156
|
requirements:
|
149
|
-
- -
|
157
|
+
- - ">="
|
150
158
|
- !ruby/object:Gem::Version
|
151
159
|
version: '0'
|
152
|
-
description:
|
160
|
+
description:
|
153
161
|
email:
|
154
162
|
- none@nowhere.org
|
155
163
|
executables:
|
@@ -162,9 +170,9 @@ extra_rdoc_files:
|
|
162
170
|
- doc/other_commands.md
|
163
171
|
- doc/settings.md
|
164
172
|
files:
|
165
|
-
- .gitignore
|
166
|
-
- .travis.yml
|
167
|
-
- .yardopts
|
173
|
+
- ".gitignore"
|
174
|
+
- ".travis.yml"
|
175
|
+
- ".yardopts"
|
168
176
|
- Gemfile
|
169
177
|
- LICENSE.txt
|
170
178
|
- README.md
|
@@ -179,7 +187,9 @@ files:
|
|
179
187
|
- lib/tasks/load_maps.rake
|
180
188
|
- lib/traject.rb
|
181
189
|
- lib/traject/command_line.rb
|
190
|
+
- lib/traject/csv_writer.rb
|
182
191
|
- lib/traject/debug_writer.rb
|
192
|
+
- lib/traject/delimited_writer.rb
|
183
193
|
- lib/traject/indexer.rb
|
184
194
|
- lib/traject/indexer/settings.rb
|
185
195
|
- lib/traject/json_writer.rb
|
@@ -188,14 +198,13 @@ files:
|
|
188
198
|
- lib/traject/macros/marc21.rb
|
189
199
|
- lib/traject/macros/marc21_semantics.rb
|
190
200
|
- lib/traject/macros/marc_format_classifier.rb
|
191
|
-
- lib/traject/marc4j_reader.rb
|
192
201
|
- lib/traject/marc_extractor.rb
|
193
202
|
- lib/traject/marc_reader.rb
|
194
203
|
- lib/traject/mock_reader.rb
|
195
204
|
- lib/traject/ndj_reader.rb
|
196
205
|
- lib/traject/null_writer.rb
|
197
206
|
- lib/traject/qualified_const_get.rb
|
198
|
-
- lib/traject/
|
207
|
+
- lib/traject/solr_json_writer.rb
|
199
208
|
- lib/traject/thread_pool.rb
|
200
209
|
- lib/traject/translation_map.rb
|
201
210
|
- lib/traject/util.rb
|
@@ -208,6 +217,7 @@ files:
|
|
208
217
|
- lib/translation_maps/marc_instruments.yaml
|
209
218
|
- lib/translation_maps/marc_languages.yaml
|
210
219
|
- test/debug_writer_test.rb
|
220
|
+
- test/delimited_writer_test.rb
|
211
221
|
- test/indexer/each_record_test.rb
|
212
222
|
- test/indexer/macros_marc21_semantics_test.rb
|
213
223
|
- test/indexer/macros_marc21_test.rb
|
@@ -216,11 +226,10 @@ files:
|
|
216
226
|
- test/indexer/read_write_test.rb
|
217
227
|
- test/indexer/settings_test.rb
|
218
228
|
- test/indexer/to_field_test.rb
|
219
|
-
- test/marc4j_reader_test.rb
|
220
229
|
- test/marc_extractor_test.rb
|
221
230
|
- test/marc_format_classifier_test.rb
|
222
231
|
- test/marc_reader_test.rb
|
223
|
-
- test/
|
232
|
+
- test/solr_json_writer_test.rb
|
224
233
|
- test/test_helper.rb
|
225
234
|
- test/test_support/245_no_ab.marc
|
226
235
|
- test/test_support/880_with_no_6.utf8.marc
|
@@ -263,51 +272,33 @@ files:
|
|
263
272
|
- test/translation_maps/translate_array_test.yaml
|
264
273
|
- test/translation_maps/yaml_map.yaml
|
265
274
|
- traject.gemspec
|
266
|
-
- vendor/solrj/README
|
267
|
-
- vendor/solrj/build.xml
|
268
|
-
- vendor/solrj/ivy.xml
|
269
|
-
- vendor/solrj/lib/commons-codec-1.7.jar
|
270
|
-
- vendor/solrj/lib/commons-io-2.1.jar
|
271
|
-
- vendor/solrj/lib/httpclient-4.2.3.jar
|
272
|
-
- vendor/solrj/lib/httpcore-4.2.2.jar
|
273
|
-
- vendor/solrj/lib/httpmime-4.2.3.jar
|
274
|
-
- vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar
|
275
|
-
- vendor/solrj/lib/jul-to-slf4j-1.6.6.jar
|
276
|
-
- vendor/solrj/lib/log4j-1.2.16.jar
|
277
|
-
- vendor/solrj/lib/noggit-0.5.jar
|
278
|
-
- vendor/solrj/lib/slf4j-api-1.6.6.jar
|
279
|
-
- vendor/solrj/lib/slf4j-log4j12-1.6.6.jar
|
280
|
-
- vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar
|
281
|
-
- vendor/solrj/lib/solr-solrj-4.3.1-sources.jar
|
282
|
-
- vendor/solrj/lib/solr-solrj-4.3.1.jar
|
283
|
-
- vendor/solrj/lib/wstx-asl-3.2.7.jar
|
284
|
-
- vendor/solrj/lib/zookeeper-3.4.5.jar
|
285
275
|
homepage: http://github.com/traject-project/traject
|
286
276
|
licenses:
|
287
277
|
- MIT
|
288
278
|
metadata: {}
|
289
|
-
post_install_message:
|
279
|
+
post_install_message:
|
290
280
|
rdoc_options: []
|
291
281
|
require_paths:
|
292
282
|
- lib
|
293
283
|
required_ruby_version: !ruby/object:Gem::Requirement
|
294
284
|
requirements:
|
295
|
-
- -
|
285
|
+
- - ">="
|
296
286
|
- !ruby/object:Gem::Version
|
297
287
|
version: '0'
|
298
288
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
299
289
|
requirements:
|
300
|
-
- -
|
290
|
+
- - ">"
|
301
291
|
- !ruby/object:Gem::Version
|
302
|
-
version:
|
292
|
+
version: 1.3.1
|
303
293
|
requirements: []
|
304
|
-
rubyforge_project:
|
305
|
-
rubygems_version: 2.
|
306
|
-
signing_key:
|
294
|
+
rubyforge_project:
|
295
|
+
rubygems_version: 2.4.5
|
296
|
+
signing_key:
|
307
297
|
specification_version: 4
|
308
298
|
summary: Index MARC to Solr; or generally process source records to hash-like structures
|
309
299
|
test_files:
|
310
300
|
- test/debug_writer_test.rb
|
301
|
+
- test/delimited_writer_test.rb
|
311
302
|
- test/indexer/each_record_test.rb
|
312
303
|
- test/indexer/macros_marc21_semantics_test.rb
|
313
304
|
- test/indexer/macros_marc21_test.rb
|
@@ -316,11 +307,10 @@ test_files:
|
|
316
307
|
- test/indexer/read_write_test.rb
|
317
308
|
- test/indexer/settings_test.rb
|
318
309
|
- test/indexer/to_field_test.rb
|
319
|
-
- test/marc4j_reader_test.rb
|
320
310
|
- test/marc_extractor_test.rb
|
321
311
|
- test/marc_format_classifier_test.rb
|
322
312
|
- test/marc_reader_test.rb
|
323
|
-
- test/
|
313
|
+
- test/solr_json_writer_test.rb
|
324
314
|
- test/test_helper.rb
|
325
315
|
- test/test_support/245_no_ab.marc
|
326
316
|
- test/test_support/880_with_no_6.utf8.marc
|
@@ -1,153 +0,0 @@
|
|
1
|
-
require 'traject'
|
2
|
-
require 'marc'
|
3
|
-
require 'marc/marc4j'
|
4
|
-
|
5
|
-
# `Traject::Marc4JReader` uses the marc4j java package to parse the MARC records
|
6
|
-
# into standard ruby-marc MARC::Record objects. This reader may be faster than
|
7
|
-
# Traject::MarcReader, especially for XML.
|
8
|
-
#
|
9
|
-
# Marc4JReader can read MARC ISO 2709 ("binary") or MARCXML. We use the Marc4J MarcPermissiveStreamReader
|
10
|
-
# for reading binary, but sometimes in non-permissive mode, according to settings. We use the Marc4j MarcXmlReader
|
11
|
-
# for reading xml. The actual code for dealing with Marc4J is in the separate
|
12
|
-
# [marc-marc4j gem](https://github.com/billdueber/ruby-marc-marc4j).
|
13
|
-
#
|
14
|
-
# See also the pure ruby Traject::MarcReader as an alternative, if you need to read
|
15
|
-
# marc-in-json, or if you don't need binary Marc8 support, it may in some cases
|
16
|
-
# be faster.
|
17
|
-
#
|
18
|
-
# ## Settings
|
19
|
-
#
|
20
|
-
# * marc_source.type: serialization type. default 'binary', also 'xml' (TODO: json/marc-in-json)
|
21
|
-
#
|
22
|
-
# * marc4j_reader.permissive: default true, false to turn off permissive reading. Used as
|
23
|
-
# value to 'permissive' arg of MarcPermissiveStreamReader constructor.
|
24
|
-
# Only used for 'binary'
|
25
|
-
#
|
26
|
-
# * marc_source.encoding: Only used for 'binary', otherwise always UTF-8.
|
27
|
-
# String of the values MarcPermissiveStreamReader accepts:
|
28
|
-
# * BESTGUESS (default: not entirely clear what Marc4J does with this)
|
29
|
-
# * ISO-8859-1 (also accepted: ISO8859_1)
|
30
|
-
# * UTF-8
|
31
|
-
# * MARC-8 (also accepted: MARC8)
|
32
|
-
# Default 'BESTGUESS', but HIGHLY recommend setting
|
33
|
-
# to avoid some Marc4J unpredictability, Marc4J "BESTGUESS" can be unpredictable
|
34
|
-
# in a variety of ways.
|
35
|
-
# (will ALWAYS be transcoded to UTF-8 on the way out. We insist.)
|
36
|
-
#
|
37
|
-
# * marc4j_reader.jar_dir: Path to a directory containing Marc4J jar file to use. All .jar's in dir will
|
38
|
-
# be loaded. If unset, uses marc4j.jar bundled with traject.
|
39
|
-
#
|
40
|
-
# * marc4j_reader.keep_marc4j: Keeps the original marc4j record accessible from
|
41
|
-
# the eventual ruby-marc record via record#original_marc4j. Intended for
|
42
|
-
# those that have legacy java code for which a marc4j object is needed. .
|
43
|
-
#
|
44
|
-
#
|
45
|
-
# ## Example
|
46
|
-
#
|
47
|
-
# In a configuration file:
|
48
|
-
#
|
49
|
-
# require 'traject/marc4j_reader
|
50
|
-
# settings do
|
51
|
-
# provide "reader_class_name", "Traject::Marc4JReader"
|
52
|
-
#
|
53
|
-
# #for MarcXML:
|
54
|
-
# # provide "marc_source.type", "xml"
|
55
|
-
#
|
56
|
-
# # Or instead for binary:
|
57
|
-
# provide "marc4j_reader.permissive", true
|
58
|
-
# provide "marc_source.encoding", "MARC8"
|
59
|
-
# end
|
60
|
-
class Traject::Marc4JReader
|
61
|
-
include Enumerable
|
62
|
-
|
63
|
-
attr_reader :settings, :input_stream
|
64
|
-
|
65
|
-
def initialize(input_stream, settings)
|
66
|
-
@settings = Traject::Indexer::Settings.new settings
|
67
|
-
@input_stream = input_stream
|
68
|
-
|
69
|
-
if @settings['marc4j_reader.keep_marc4j'] &&
|
70
|
-
! (MARC::Record.instance_methods.include?(:original_marc4j) &&
|
71
|
-
MARC::Record.instance_methods.include?(:"original_marc4j="))
|
72
|
-
MARC::Record.class_eval('attr_accessor :original_marc4j')
|
73
|
-
end
|
74
|
-
|
75
|
-
# Creating a converter will do the following:
|
76
|
-
# - nothing, if it detects that the marc4j jar is already loaded
|
77
|
-
# - load all the .jar files in settings['marc4j_reader.jar_dir'] if set
|
78
|
-
# - load the marc4j jar file bundled with MARC::MARC4J otherwise
|
79
|
-
|
80
|
-
@converter = MARC::MARC4J.new(:jardir => settings['marc4j_reader.jar_dir'], :logger => logger)
|
81
|
-
|
82
|
-
# Convenience
|
83
|
-
java_import org.marc4j.MarcPermissiveStreamReader
|
84
|
-
java_import org.marc4j.MarcXmlReader
|
85
|
-
|
86
|
-
end
|
87
|
-
|
88
|
-
|
89
|
-
def internal_reader
|
90
|
-
@internal_reader ||= create_marc_reader!
|
91
|
-
end
|
92
|
-
|
93
|
-
def input_type
|
94
|
-
# maybe later add some guessing somehow
|
95
|
-
settings["marc_source.type"]
|
96
|
-
end
|
97
|
-
|
98
|
-
def specified_source_encoding
|
99
|
-
#settings["marc4j_reader.source_encoding"]
|
100
|
-
enc = settings["marc_source.encoding"]
|
101
|
-
|
102
|
-
# one is standard for ruby and we want to support,
|
103
|
-
# the other is used by Marc4J and we have to pass it to Marc4J
|
104
|
-
enc = "ISO8859_1" if enc == "ISO-8859-1"
|
105
|
-
|
106
|
-
# default
|
107
|
-
enc = "BESTGUESS" if enc.nil? || enc.empty?
|
108
|
-
|
109
|
-
return enc
|
110
|
-
end
|
111
|
-
|
112
|
-
def create_marc_reader!
|
113
|
-
case input_type
|
114
|
-
when "binary"
|
115
|
-
permissive = settings["marc4j_reader.permissive"].to_s == "true"
|
116
|
-
|
117
|
-
# #to_inputstream turns our ruby IO into a Java InputStream
|
118
|
-
# third arg means 'convert to UTF-8, yes'
|
119
|
-
MarcPermissiveStreamReader.new(input_stream.to_inputstream, permissive, true, specified_source_encoding)
|
120
|
-
when "xml"
|
121
|
-
MarcXmlReader.new(input_stream.to_inputstream)
|
122
|
-
else
|
123
|
-
raise IllegalArgument.new("Unrecgonized marc_source.type: #{input_type}")
|
124
|
-
end
|
125
|
-
end
|
126
|
-
|
127
|
-
def each
|
128
|
-
while (internal_reader.hasNext)
|
129
|
-
begin
|
130
|
-
marc4j = internal_reader.next
|
131
|
-
rubymarc = @converter.marc4j_to_rubymarc(marc4j)
|
132
|
-
if @settings['marc4j_reader.keep_marc4j']
|
133
|
-
rubymarc.original_marc4j = marc4j
|
134
|
-
end
|
135
|
-
rescue Exception =>e
|
136
|
-
msg = "MARC4JReader: Error reading MARC, fatal, re-raising"
|
137
|
-
if marc4j
|
138
|
-
msg += "\n 001 id: #{marc4j.getControlNumber}"
|
139
|
-
end
|
140
|
-
msg += "\n #{Traject::Util.exception_to_log_message(e)}"
|
141
|
-
logger.fatal msg
|
142
|
-
raise e
|
143
|
-
end
|
144
|
-
|
145
|
-
yield rubymarc
|
146
|
-
end
|
147
|
-
end
|
148
|
-
|
149
|
-
def logger
|
150
|
-
@logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
|
151
|
-
end
|
152
|
-
|
153
|
-
end
|