traject 1.1.0 → 2.0.0.rc.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +20 -0
- data/README.md +85 -73
- data/doc/batch_execution.md +2 -6
- data/doc/other_commands.md +3 -5
- data/doc/settings.md +27 -38
- data/lib/traject/command_line.rb +1 -1
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +29 -11
- data/lib/traject/indexer/settings.rb +39 -13
- data/lib/traject/line_writer.rb +10 -6
- data/lib/traject/marc_reader.rb +2 -1
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +38 -48
- data/lib/traject/translation_map.rb +3 -0
- data/lib/traject/util.rb +13 -51
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/marc_geographic.yaml +2 -2
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/read_write_test.rb +0 -22
- data/test/indexer/settings_test.rb +24 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +5 -3
- data/test/test_support/demo_config.rb +0 -5
- data/test/translation_map_test.rb +9 -0
- data/traject.gemspec +18 -5
- metadata +77 -87
- data/lib/traject/marc4j_reader.rb +0 -153
- data/lib/traject/solrj_writer.rb +0 -351
- data/test/marc4j_reader_test.rb +0 -136
- data/test/solrj_writer_test.rb +0 -209
- data/vendor/solrj/README +0 -8
- data/vendor/solrj/build.xml +0 -39
- data/vendor/solrj/ivy.xml +0 -16
- data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
- data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
- data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
- data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
- data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
- data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
- data/vendor/solrj/lib/noggit-0.5.jar +0 -0
- data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
- data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
- data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
- data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
data/test/test_helper.rb
CHANGED
@@ -47,9 +47,11 @@ def empty_record
|
|
47
47
|
rec
|
48
48
|
end
|
49
49
|
|
50
|
-
# pretends to be a
|
50
|
+
# pretends to be a Solr HTTPServer-like thing, just kind of mocks it up
|
51
51
|
# and records what happens and simulates errors in some cases.
|
52
52
|
class MockSolrServer
|
53
|
+
class Exception < RuntimeError;end
|
54
|
+
|
53
55
|
attr_accessor :things_added, :url, :committed, :parser, :shutted_down
|
54
56
|
|
55
57
|
def initialize(url)
|
@@ -61,12 +63,12 @@ class MockSolrServer
|
|
61
63
|
def add(thing)
|
62
64
|
@add_mutex.synchronize do # easy peasy threadsafety for our mock
|
63
65
|
if @url == "http://no.such.place"
|
64
|
-
raise
|
66
|
+
raise MockSolrServer::Exception.new("mock bad uri")
|
65
67
|
end
|
66
68
|
|
67
69
|
# simulate a multiple id error please
|
68
70
|
if [thing].flatten.find {|doc| doc.getField("id").getValueCount() != 1}
|
69
|
-
raise
|
71
|
+
raise MockSolrServer::Exception.new("mock non-1 size of 'id'")
|
70
72
|
else
|
71
73
|
things_added << thing
|
72
74
|
end
|
@@ -21,11 +21,6 @@ extend Traject::Macros::MarcFormats
|
|
21
21
|
# config files as you like, `traject -c one.rb -c two.rb -c etc.rb`
|
22
22
|
settings do
|
23
23
|
provide "solr.url", "http://solr.somewhere.edu:8983/solr/corename"
|
24
|
-
|
25
|
-
# Only if you need to connect to a Solr 1.x:
|
26
|
-
provide "solrj_writer.parser_class_name", "XMLResponseParser"
|
27
|
-
|
28
|
-
provide "solrj_writer.commit_on_close", true
|
29
24
|
end
|
30
25
|
|
31
26
|
# Extract first 001, then supply code block to add "bib_" prefix to it
|
@@ -104,6 +104,15 @@ describe "TranslationMap" do
|
|
104
104
|
assert_equal "output_value", map["input_value"]
|
105
105
|
end
|
106
106
|
|
107
|
+
it "can be initialized with another map" do
|
108
|
+
map = Traject::TranslationMap.new({"alpha" => "one", "beta" => nil}, :default => "DEFAULT")
|
109
|
+
|
110
|
+
new_map = Traject::TranslationMap.new(map)
|
111
|
+
|
112
|
+
assert_equal map.to_hash, new_map.to_hash
|
113
|
+
assert_equal map.default, new_map.default
|
114
|
+
end
|
115
|
+
|
107
116
|
it "respects __default__ literal" do
|
108
117
|
map = Traject::TranslationMap.new("default_literal")
|
109
118
|
|
data/traject.gemspec
CHANGED
@@ -20,12 +20,25 @@ Gem::Specification.new do |spec|
|
|
20
20
|
spec.extra_rdoc_files = spec.files.grep(%r{^doc/})
|
21
21
|
|
22
22
|
|
23
|
-
spec.add_dependency "
|
24
|
-
spec.add_dependency "marc
|
25
|
-
|
23
|
+
spec.add_dependency "concurrent-ruby", ">= 0.8.0"
|
24
|
+
spec.add_dependency "marc", "~> 1.0"
|
25
|
+
|
26
|
+
spec.add_dependency "hashie", "~> 3.1" # used for Indexer#settings
|
26
27
|
spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
|
27
|
-
spec.add_dependency "yell"
|
28
|
-
spec.add_dependency "dot-properties", ">= 0.1.1"
|
28
|
+
spec.add_dependency "yell" # logging
|
29
|
+
spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
|
30
|
+
spec.add_dependency "httpclient", "~> 2.5"
|
31
|
+
|
32
|
+
# If we're building the package under JRuby, add in the
|
33
|
+
# jruby-only gems and specify the platform.
|
34
|
+
|
35
|
+
if defined? JRUBY_VERSION
|
36
|
+
spec.platform = 'java'
|
37
|
+
spec.add_dependency "traject-marc4j_reader", "~> 1.0"
|
38
|
+
else
|
39
|
+
spec.platform = "ruby"
|
40
|
+
end
|
41
|
+
|
29
42
|
|
30
43
|
spec.add_development_dependency "bundler", "~> 1.3"
|
31
44
|
spec.add_development_dependency "rake"
|
metadata
CHANGED
@@ -1,155 +1,163 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0.rc.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
8
8
|
- Bill Dueber
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2015-02-11 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
+
name: concurrent-ruby
|
15
16
|
requirement: !ruby/object:Gem::Requirement
|
16
17
|
requirements:
|
17
|
-
- -
|
18
|
+
- - ">="
|
18
19
|
- !ruby/object:Gem::Version
|
19
20
|
version: 0.8.0
|
20
|
-
name: marc
|
21
|
-
prerelease: false
|
22
21
|
type: :runtime
|
22
|
+
prerelease: false
|
23
23
|
version_requirements: !ruby/object:Gem::Requirement
|
24
24
|
requirements:
|
25
|
-
- -
|
25
|
+
- - ">="
|
26
26
|
- !ruby/object:Gem::Version
|
27
27
|
version: 0.8.0
|
28
28
|
- !ruby/object:Gem::Dependency
|
29
|
+
name: marc
|
29
30
|
requirement: !ruby/object:Gem::Requirement
|
30
31
|
requirements:
|
31
|
-
- -
|
32
|
+
- - "~>"
|
32
33
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
34
|
-
name: marc-marc4j
|
35
|
-
prerelease: false
|
34
|
+
version: '1.0'
|
36
35
|
type: :runtime
|
36
|
+
prerelease: false
|
37
37
|
version_requirements: !ruby/object:Gem::Requirement
|
38
38
|
requirements:
|
39
|
-
- -
|
39
|
+
- - "~>"
|
40
40
|
- !ruby/object:Gem::Version
|
41
|
-
version:
|
41
|
+
version: '1.0'
|
42
42
|
- !ruby/object:Gem::Dependency
|
43
|
+
name: hashie
|
43
44
|
requirement: !ruby/object:Gem::Requirement
|
44
45
|
requirements:
|
45
|
-
- -
|
46
|
+
- - "~>"
|
46
47
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
48
|
-
- - <
|
49
|
-
- !ruby/object:Gem::Version
|
50
|
-
version: '2.1'
|
51
|
-
name: hashie
|
52
|
-
prerelease: false
|
48
|
+
version: '3.1'
|
53
49
|
type: :runtime
|
50
|
+
prerelease: false
|
54
51
|
version_requirements: !ruby/object:Gem::Requirement
|
55
52
|
requirements:
|
56
|
-
- -
|
57
|
-
- !ruby/object:Gem::Version
|
58
|
-
version: 2.0.5
|
59
|
-
- - <
|
53
|
+
- - "~>"
|
60
54
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
55
|
+
version: '3.1'
|
62
56
|
- !ruby/object:Gem::Dependency
|
57
|
+
name: slop
|
63
58
|
requirement: !ruby/object:Gem::Requirement
|
64
59
|
requirements:
|
65
|
-
- -
|
60
|
+
- - ">="
|
66
61
|
- !ruby/object:Gem::Version
|
67
62
|
version: 3.4.5
|
68
|
-
- - <
|
63
|
+
- - "<"
|
69
64
|
- !ruby/object:Gem::Version
|
70
65
|
version: '4.0'
|
71
|
-
name: slop
|
72
|
-
prerelease: false
|
73
66
|
type: :runtime
|
67
|
+
prerelease: false
|
74
68
|
version_requirements: !ruby/object:Gem::Requirement
|
75
69
|
requirements:
|
76
|
-
- -
|
70
|
+
- - ">="
|
77
71
|
- !ruby/object:Gem::Version
|
78
72
|
version: 3.4.5
|
79
|
-
- - <
|
73
|
+
- - "<"
|
80
74
|
- !ruby/object:Gem::Version
|
81
75
|
version: '4.0'
|
82
76
|
- !ruby/object:Gem::Dependency
|
77
|
+
name: yell
|
83
78
|
requirement: !ruby/object:Gem::Requirement
|
84
79
|
requirements:
|
85
|
-
- -
|
80
|
+
- - ">="
|
86
81
|
- !ruby/object:Gem::Version
|
87
82
|
version: '0'
|
88
|
-
name: yell
|
89
|
-
prerelease: false
|
90
83
|
type: :runtime
|
84
|
+
prerelease: false
|
91
85
|
version_requirements: !ruby/object:Gem::Requirement
|
92
86
|
requirements:
|
93
|
-
- -
|
87
|
+
- - ">="
|
94
88
|
- !ruby/object:Gem::Version
|
95
89
|
version: '0'
|
96
90
|
- !ruby/object:Gem::Dependency
|
91
|
+
name: dot-properties
|
97
92
|
requirement: !ruby/object:Gem::Requirement
|
98
93
|
requirements:
|
99
|
-
- -
|
94
|
+
- - ">="
|
100
95
|
- !ruby/object:Gem::Version
|
101
96
|
version: 0.1.1
|
102
|
-
name: dot-properties
|
103
|
-
prerelease: false
|
104
97
|
type: :runtime
|
98
|
+
prerelease: false
|
105
99
|
version_requirements: !ruby/object:Gem::Requirement
|
106
100
|
requirements:
|
107
|
-
- -
|
101
|
+
- - ">="
|
108
102
|
- !ruby/object:Gem::Version
|
109
103
|
version: 0.1.1
|
110
104
|
- !ruby/object:Gem::Dependency
|
105
|
+
name: httpclient
|
111
106
|
requirement: !ruby/object:Gem::Requirement
|
112
107
|
requirements:
|
113
|
-
- - ~>
|
108
|
+
- - "~>"
|
114
109
|
- !ruby/object:Gem::Version
|
115
|
-
version: '
|
116
|
-
|
110
|
+
version: '2.5'
|
111
|
+
type: :runtime
|
117
112
|
prerelease: false
|
113
|
+
version_requirements: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '2.5'
|
118
|
+
- !ruby/object:Gem::Dependency
|
119
|
+
name: bundler
|
120
|
+
requirement: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '1.3'
|
118
125
|
type: :development
|
126
|
+
prerelease: false
|
119
127
|
version_requirements: !ruby/object:Gem::Requirement
|
120
128
|
requirements:
|
121
|
-
- - ~>
|
129
|
+
- - "~>"
|
122
130
|
- !ruby/object:Gem::Version
|
123
131
|
version: '1.3'
|
124
132
|
- !ruby/object:Gem::Dependency
|
133
|
+
name: rake
|
125
134
|
requirement: !ruby/object:Gem::Requirement
|
126
135
|
requirements:
|
127
|
-
- -
|
136
|
+
- - ">="
|
128
137
|
- !ruby/object:Gem::Version
|
129
138
|
version: '0'
|
130
|
-
name: rake
|
131
|
-
prerelease: false
|
132
139
|
type: :development
|
140
|
+
prerelease: false
|
133
141
|
version_requirements: !ruby/object:Gem::Requirement
|
134
142
|
requirements:
|
135
|
-
- -
|
143
|
+
- - ">="
|
136
144
|
- !ruby/object:Gem::Version
|
137
145
|
version: '0'
|
138
146
|
- !ruby/object:Gem::Dependency
|
147
|
+
name: minitest
|
139
148
|
requirement: !ruby/object:Gem::Requirement
|
140
149
|
requirements:
|
141
|
-
- -
|
150
|
+
- - ">="
|
142
151
|
- !ruby/object:Gem::Version
|
143
152
|
version: '0'
|
144
|
-
name: minitest
|
145
|
-
prerelease: false
|
146
153
|
type: :development
|
154
|
+
prerelease: false
|
147
155
|
version_requirements: !ruby/object:Gem::Requirement
|
148
156
|
requirements:
|
149
|
-
- -
|
157
|
+
- - ">="
|
150
158
|
- !ruby/object:Gem::Version
|
151
159
|
version: '0'
|
152
|
-
description:
|
160
|
+
description:
|
153
161
|
email:
|
154
162
|
- none@nowhere.org
|
155
163
|
executables:
|
@@ -162,9 +170,9 @@ extra_rdoc_files:
|
|
162
170
|
- doc/other_commands.md
|
163
171
|
- doc/settings.md
|
164
172
|
files:
|
165
|
-
- .gitignore
|
166
|
-
- .travis.yml
|
167
|
-
- .yardopts
|
173
|
+
- ".gitignore"
|
174
|
+
- ".travis.yml"
|
175
|
+
- ".yardopts"
|
168
176
|
- Gemfile
|
169
177
|
- LICENSE.txt
|
170
178
|
- README.md
|
@@ -179,7 +187,9 @@ files:
|
|
179
187
|
- lib/tasks/load_maps.rake
|
180
188
|
- lib/traject.rb
|
181
189
|
- lib/traject/command_line.rb
|
190
|
+
- lib/traject/csv_writer.rb
|
182
191
|
- lib/traject/debug_writer.rb
|
192
|
+
- lib/traject/delimited_writer.rb
|
183
193
|
- lib/traject/indexer.rb
|
184
194
|
- lib/traject/indexer/settings.rb
|
185
195
|
- lib/traject/json_writer.rb
|
@@ -188,14 +198,13 @@ files:
|
|
188
198
|
- lib/traject/macros/marc21.rb
|
189
199
|
- lib/traject/macros/marc21_semantics.rb
|
190
200
|
- lib/traject/macros/marc_format_classifier.rb
|
191
|
-
- lib/traject/marc4j_reader.rb
|
192
201
|
- lib/traject/marc_extractor.rb
|
193
202
|
- lib/traject/marc_reader.rb
|
194
203
|
- lib/traject/mock_reader.rb
|
195
204
|
- lib/traject/ndj_reader.rb
|
196
205
|
- lib/traject/null_writer.rb
|
197
206
|
- lib/traject/qualified_const_get.rb
|
198
|
-
- lib/traject/
|
207
|
+
- lib/traject/solr_json_writer.rb
|
199
208
|
- lib/traject/thread_pool.rb
|
200
209
|
- lib/traject/translation_map.rb
|
201
210
|
- lib/traject/util.rb
|
@@ -208,6 +217,7 @@ files:
|
|
208
217
|
- lib/translation_maps/marc_instruments.yaml
|
209
218
|
- lib/translation_maps/marc_languages.yaml
|
210
219
|
- test/debug_writer_test.rb
|
220
|
+
- test/delimited_writer_test.rb
|
211
221
|
- test/indexer/each_record_test.rb
|
212
222
|
- test/indexer/macros_marc21_semantics_test.rb
|
213
223
|
- test/indexer/macros_marc21_test.rb
|
@@ -216,11 +226,10 @@ files:
|
|
216
226
|
- test/indexer/read_write_test.rb
|
217
227
|
- test/indexer/settings_test.rb
|
218
228
|
- test/indexer/to_field_test.rb
|
219
|
-
- test/marc4j_reader_test.rb
|
220
229
|
- test/marc_extractor_test.rb
|
221
230
|
- test/marc_format_classifier_test.rb
|
222
231
|
- test/marc_reader_test.rb
|
223
|
-
- test/
|
232
|
+
- test/solr_json_writer_test.rb
|
224
233
|
- test/test_helper.rb
|
225
234
|
- test/test_support/245_no_ab.marc
|
226
235
|
- test/test_support/880_with_no_6.utf8.marc
|
@@ -263,51 +272,33 @@ files:
|
|
263
272
|
- test/translation_maps/translate_array_test.yaml
|
264
273
|
- test/translation_maps/yaml_map.yaml
|
265
274
|
- traject.gemspec
|
266
|
-
- vendor/solrj/README
|
267
|
-
- vendor/solrj/build.xml
|
268
|
-
- vendor/solrj/ivy.xml
|
269
|
-
- vendor/solrj/lib/commons-codec-1.7.jar
|
270
|
-
- vendor/solrj/lib/commons-io-2.1.jar
|
271
|
-
- vendor/solrj/lib/httpclient-4.2.3.jar
|
272
|
-
- vendor/solrj/lib/httpcore-4.2.2.jar
|
273
|
-
- vendor/solrj/lib/httpmime-4.2.3.jar
|
274
|
-
- vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar
|
275
|
-
- vendor/solrj/lib/jul-to-slf4j-1.6.6.jar
|
276
|
-
- vendor/solrj/lib/log4j-1.2.16.jar
|
277
|
-
- vendor/solrj/lib/noggit-0.5.jar
|
278
|
-
- vendor/solrj/lib/slf4j-api-1.6.6.jar
|
279
|
-
- vendor/solrj/lib/slf4j-log4j12-1.6.6.jar
|
280
|
-
- vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar
|
281
|
-
- vendor/solrj/lib/solr-solrj-4.3.1-sources.jar
|
282
|
-
- vendor/solrj/lib/solr-solrj-4.3.1.jar
|
283
|
-
- vendor/solrj/lib/wstx-asl-3.2.7.jar
|
284
|
-
- vendor/solrj/lib/zookeeper-3.4.5.jar
|
285
275
|
homepage: http://github.com/traject-project/traject
|
286
276
|
licenses:
|
287
277
|
- MIT
|
288
278
|
metadata: {}
|
289
|
-
post_install_message:
|
279
|
+
post_install_message:
|
290
280
|
rdoc_options: []
|
291
281
|
require_paths:
|
292
282
|
- lib
|
293
283
|
required_ruby_version: !ruby/object:Gem::Requirement
|
294
284
|
requirements:
|
295
|
-
- -
|
285
|
+
- - ">="
|
296
286
|
- !ruby/object:Gem::Version
|
297
287
|
version: '0'
|
298
288
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
299
289
|
requirements:
|
300
|
-
- -
|
290
|
+
- - ">"
|
301
291
|
- !ruby/object:Gem::Version
|
302
|
-
version:
|
292
|
+
version: 1.3.1
|
303
293
|
requirements: []
|
304
|
-
rubyforge_project:
|
305
|
-
rubygems_version: 2.
|
306
|
-
signing_key:
|
294
|
+
rubyforge_project:
|
295
|
+
rubygems_version: 2.4.5
|
296
|
+
signing_key:
|
307
297
|
specification_version: 4
|
308
298
|
summary: Index MARC to Solr; or generally process source records to hash-like structures
|
309
299
|
test_files:
|
310
300
|
- test/debug_writer_test.rb
|
301
|
+
- test/delimited_writer_test.rb
|
311
302
|
- test/indexer/each_record_test.rb
|
312
303
|
- test/indexer/macros_marc21_semantics_test.rb
|
313
304
|
- test/indexer/macros_marc21_test.rb
|
@@ -316,11 +307,10 @@ test_files:
|
|
316
307
|
- test/indexer/read_write_test.rb
|
317
308
|
- test/indexer/settings_test.rb
|
318
309
|
- test/indexer/to_field_test.rb
|
319
|
-
- test/marc4j_reader_test.rb
|
320
310
|
- test/marc_extractor_test.rb
|
321
311
|
- test/marc_format_classifier_test.rb
|
322
312
|
- test/marc_reader_test.rb
|
323
|
-
- test/
|
313
|
+
- test/solr_json_writer_test.rb
|
324
314
|
- test/test_helper.rb
|
325
315
|
- test/test_support/245_no_ab.marc
|
326
316
|
- test/test_support/880_with_no_6.utf8.marc
|
@@ -1,153 +0,0 @@
|
|
1
|
-
require 'traject'
|
2
|
-
require 'marc'
|
3
|
-
require 'marc/marc4j'
|
4
|
-
|
5
|
-
# `Traject::Marc4JReader` uses the marc4j java package to parse the MARC records
|
6
|
-
# into standard ruby-marc MARC::Record objects. This reader may be faster than
|
7
|
-
# Traject::MarcReader, especially for XML.
|
8
|
-
#
|
9
|
-
# Marc4JReader can read MARC ISO 2709 ("binary") or MARCXML. We use the Marc4J MarcPermissiveStreamReader
|
10
|
-
# for reading binary, but sometimes in non-permissive mode, according to settings. We use the Marc4j MarcXmlReader
|
11
|
-
# for reading xml. The actual code for dealing with Marc4J is in the separate
|
12
|
-
# [marc-marc4j gem](https://github.com/billdueber/ruby-marc-marc4j).
|
13
|
-
#
|
14
|
-
# See also the pure ruby Traject::MarcReader as an alternative, if you need to read
|
15
|
-
# marc-in-json, or if you don't need binary Marc8 support, it may in some cases
|
16
|
-
# be faster.
|
17
|
-
#
|
18
|
-
# ## Settings
|
19
|
-
#
|
20
|
-
# * marc_source.type: serialization type. default 'binary', also 'xml' (TODO: json/marc-in-json)
|
21
|
-
#
|
22
|
-
# * marc4j_reader.permissive: default true, false to turn off permissive reading. Used as
|
23
|
-
# value to 'permissive' arg of MarcPermissiveStreamReader constructor.
|
24
|
-
# Only used for 'binary'
|
25
|
-
#
|
26
|
-
# * marc_source.encoding: Only used for 'binary', otherwise always UTF-8.
|
27
|
-
# String of the values MarcPermissiveStreamReader accepts:
|
28
|
-
# * BESTGUESS (default: not entirely clear what Marc4J does with this)
|
29
|
-
# * ISO-8859-1 (also accepted: ISO8859_1)
|
30
|
-
# * UTF-8
|
31
|
-
# * MARC-8 (also accepted: MARC8)
|
32
|
-
# Default 'BESTGUESS', but HIGHLY recommend setting
|
33
|
-
# to avoid some Marc4J unpredictability, Marc4J "BESTGUESS" can be unpredictable
|
34
|
-
# in a variety of ways.
|
35
|
-
# (will ALWAYS be transcoded to UTF-8 on the way out. We insist.)
|
36
|
-
#
|
37
|
-
# * marc4j_reader.jar_dir: Path to a directory containing Marc4J jar file to use. All .jar's in dir will
|
38
|
-
# be loaded. If unset, uses marc4j.jar bundled with traject.
|
39
|
-
#
|
40
|
-
# * marc4j_reader.keep_marc4j: Keeps the original marc4j record accessible from
|
41
|
-
# the eventual ruby-marc record via record#original_marc4j. Intended for
|
42
|
-
# those that have legacy java code for which a marc4j object is needed. .
|
43
|
-
#
|
44
|
-
#
|
45
|
-
# ## Example
|
46
|
-
#
|
47
|
-
# In a configuration file:
|
48
|
-
#
|
49
|
-
# require 'traject/marc4j_reader
|
50
|
-
# settings do
|
51
|
-
# provide "reader_class_name", "Traject::Marc4JReader"
|
52
|
-
#
|
53
|
-
# #for MarcXML:
|
54
|
-
# # provide "marc_source.type", "xml"
|
55
|
-
#
|
56
|
-
# # Or instead for binary:
|
57
|
-
# provide "marc4j_reader.permissive", true
|
58
|
-
# provide "marc_source.encoding", "MARC8"
|
59
|
-
# end
|
60
|
-
class Traject::Marc4JReader
|
61
|
-
include Enumerable
|
62
|
-
|
63
|
-
attr_reader :settings, :input_stream
|
64
|
-
|
65
|
-
def initialize(input_stream, settings)
|
66
|
-
@settings = Traject::Indexer::Settings.new settings
|
67
|
-
@input_stream = input_stream
|
68
|
-
|
69
|
-
if @settings['marc4j_reader.keep_marc4j'] &&
|
70
|
-
! (MARC::Record.instance_methods.include?(:original_marc4j) &&
|
71
|
-
MARC::Record.instance_methods.include?(:"original_marc4j="))
|
72
|
-
MARC::Record.class_eval('attr_accessor :original_marc4j')
|
73
|
-
end
|
74
|
-
|
75
|
-
# Creating a converter will do the following:
|
76
|
-
# - nothing, if it detects that the marc4j jar is already loaded
|
77
|
-
# - load all the .jar files in settings['marc4j_reader.jar_dir'] if set
|
78
|
-
# - load the marc4j jar file bundled with MARC::MARC4J otherwise
|
79
|
-
|
80
|
-
@converter = MARC::MARC4J.new(:jardir => settings['marc4j_reader.jar_dir'], :logger => logger)
|
81
|
-
|
82
|
-
# Convenience
|
83
|
-
java_import org.marc4j.MarcPermissiveStreamReader
|
84
|
-
java_import org.marc4j.MarcXmlReader
|
85
|
-
|
86
|
-
end
|
87
|
-
|
88
|
-
|
89
|
-
def internal_reader
|
90
|
-
@internal_reader ||= create_marc_reader!
|
91
|
-
end
|
92
|
-
|
93
|
-
def input_type
|
94
|
-
# maybe later add some guessing somehow
|
95
|
-
settings["marc_source.type"]
|
96
|
-
end
|
97
|
-
|
98
|
-
def specified_source_encoding
|
99
|
-
#settings["marc4j_reader.source_encoding"]
|
100
|
-
enc = settings["marc_source.encoding"]
|
101
|
-
|
102
|
-
# one is standard for ruby and we want to support,
|
103
|
-
# the other is used by Marc4J and we have to pass it to Marc4J
|
104
|
-
enc = "ISO8859_1" if enc == "ISO-8859-1"
|
105
|
-
|
106
|
-
# default
|
107
|
-
enc = "BESTGUESS" if enc.nil? || enc.empty?
|
108
|
-
|
109
|
-
return enc
|
110
|
-
end
|
111
|
-
|
112
|
-
def create_marc_reader!
|
113
|
-
case input_type
|
114
|
-
when "binary"
|
115
|
-
permissive = settings["marc4j_reader.permissive"].to_s == "true"
|
116
|
-
|
117
|
-
# #to_inputstream turns our ruby IO into a Java InputStream
|
118
|
-
# third arg means 'convert to UTF-8, yes'
|
119
|
-
MarcPermissiveStreamReader.new(input_stream.to_inputstream, permissive, true, specified_source_encoding)
|
120
|
-
when "xml"
|
121
|
-
MarcXmlReader.new(input_stream.to_inputstream)
|
122
|
-
else
|
123
|
-
raise IllegalArgument.new("Unrecgonized marc_source.type: #{input_type}")
|
124
|
-
end
|
125
|
-
end
|
126
|
-
|
127
|
-
def each
|
128
|
-
while (internal_reader.hasNext)
|
129
|
-
begin
|
130
|
-
marc4j = internal_reader.next
|
131
|
-
rubymarc = @converter.marc4j_to_rubymarc(marc4j)
|
132
|
-
if @settings['marc4j_reader.keep_marc4j']
|
133
|
-
rubymarc.original_marc4j = marc4j
|
134
|
-
end
|
135
|
-
rescue Exception =>e
|
136
|
-
msg = "MARC4JReader: Error reading MARC, fatal, re-raising"
|
137
|
-
if marc4j
|
138
|
-
msg += "\n 001 id: #{marc4j.getControlNumber}"
|
139
|
-
end
|
140
|
-
msg += "\n #{Traject::Util.exception_to_log_message(e)}"
|
141
|
-
logger.fatal msg
|
142
|
-
raise e
|
143
|
-
end
|
144
|
-
|
145
|
-
yield rubymarc
|
146
|
-
end
|
147
|
-
end
|
148
|
-
|
149
|
-
def logger
|
150
|
-
@logger ||= (settings[:logger] || Yell.new(STDERR, :level => "gt.fatal")) # null logger)
|
151
|
-
end
|
152
|
-
|
153
|
-
end
|