traject 1.1.0 → 2.0.0.rc.1
- checksums.yaml +4 -4
- data/.travis.yml +20 -0
- data/README.md +85 -73
- data/doc/batch_execution.md +2 -6
- data/doc/other_commands.md +3 -5
- data/doc/settings.md +27 -38
- data/lib/traject/command_line.rb +1 -1
- data/lib/traject/csv_writer.rb +34 -0
- data/lib/traject/delimited_writer.rb +110 -0
- data/lib/traject/indexer.rb +29 -11
- data/lib/traject/indexer/settings.rb +39 -13
- data/lib/traject/line_writer.rb +10 -6
- data/lib/traject/marc_reader.rb +2 -1
- data/lib/traject/solr_json_writer.rb +277 -0
- data/lib/traject/thread_pool.rb +38 -48
- data/lib/traject/translation_map.rb +3 -0
- data/lib/traject/util.rb +13 -51
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/marc_geographic.yaml +2 -2
- data/test/delimited_writer_test.rb +104 -0
- data/test/indexer/read_write_test.rb +0 -22
- data/test/indexer/settings_test.rb +24 -0
- data/test/solr_json_writer_test.rb +248 -0
- data/test/test_helper.rb +5 -3
- data/test/test_support/demo_config.rb +0 -5
- data/test/translation_map_test.rb +9 -0
- data/traject.gemspec +18 -5
- metadata +77 -87
- data/lib/traject/marc4j_reader.rb +0 -153
- data/lib/traject/solrj_writer.rb +0 -351
- data/test/marc4j_reader_test.rb +0 -136
- data/test/solrj_writer_test.rb +0 -209
- data/vendor/solrj/README +0 -8
- data/vendor/solrj/build.xml +0 -39
- data/vendor/solrj/ivy.xml +0 -16
- data/vendor/solrj/lib/commons-codec-1.7.jar +0 -0
- data/vendor/solrj/lib/commons-io-2.1.jar +0 -0
- data/vendor/solrj/lib/httpclient-4.2.3.jar +0 -0
- data/vendor/solrj/lib/httpcore-4.2.2.jar +0 -0
- data/vendor/solrj/lib/httpmime-4.2.3.jar +0 -0
- data/vendor/solrj/lib/jcl-over-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/jul-to-slf4j-1.6.6.jar +0 -0
- data/vendor/solrj/lib/log4j-1.2.16.jar +0 -0
- data/vendor/solrj/lib/noggit-0.5.jar +0 -0
- data/vendor/solrj/lib/slf4j-api-1.6.6.jar +0 -0
- data/vendor/solrj/lib/slf4j-log4j12-1.6.6.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-javadoc.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1-sources.jar +0 -0
- data/vendor/solrj/lib/solr-solrj-4.3.1.jar +0 -0
- data/vendor/solrj/lib/wstx-asl-3.2.7.jar +0 -0
- data/vendor/solrj/lib/zookeeper-3.4.5.jar +0 -0
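
The headline change in this diff: the JRuby-only SolrJ stack is removed. data/lib/traject/solrj_writer.rb, data/lib/traject/marc4j_reader.rb, and every vendored jar under data/vendor/solrj are deleted, with a pure-Ruby data/lib/traject/solr_json_writer.rb (plus new delimited and CSV writers) added in their place. Below is a minimal sketch of what a 2.x configuration might look like, assuming Traject::SolrJsonWriter is selected the same way writers were selected in 1.x; verify the setting names against the new data/lib/traject/solr_json_writer.rb in this release.

# Hypothetical 2.x configuration sketch -- names to be verified against
# data/lib/traject/solr_json_writer.rb added in this release.
settings do
  provide "writer_class_name", "Traject::SolrJsonWriter"
  provide "solr.url", "http://localhost:8983/solr/catalog"
end
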
data/lib/traject/solrj_writer.rb
DELETED
@@ -1,351 +0,0 @@
require 'yell'

require 'traject'
require 'traject/util'
require 'traject/qualified_const_get'
require 'traject/thread_pool'

require 'uri'
require 'thread' # for Mutex

#
# Writes to a Solr using SolrJ, and the SolrJ HttpSolrServer.
#
# After you call #close, you can check #skipped_record_count if you want
# for an integer count of skipped records.
#
# For fatal errors that raise... async processing with thread_pool means that
# you may not get a raise immediately after calling #put, you may get it on
# a FUTURE #put or #close. You should get it eventually though.
#
# ## Settings
#
# * solr.url: Your solr url (required)
#
# * solrj_writer.server_class_name: Defaults to "HttpSolrServer". You can specify
#     another Solr Server sub-class, but it has
#     to take a one-arg url constructor. Maybe
#     subclass this writer class and overwrite
#     instantiate_solr_server! otherwise
#
# * solrj.jar_dir: Custom directory containing all of the SolrJ jars. All
#     jars in this dir will be loaded. Otherwise,
#     we load our own packaged solrj jars. This setting
#     can't really be used differently in the same app instance,
#     since jars are loaded globally.
#
# * solrj_writer.parser_class_name: A String name of a class in package
#     org.apache.solr.client.solrj.impl,
#     we'll instantiate one with a zero-arg
#     constructor, and pass it as an arg to setParser on
#     the SolrServer instance, if present.
#     NOTE: For contacting a Solr 1.x server, with the
#     recent version of SolrJ used by default, set to
#     "XMLResponseParser"
#
# * solrj_writer.commit_on_close: If true (or string 'true'), send a commit to solr
#     at end of #process.
#
# * solrj_writer.batch_size: If non-nil and more than 1, send documents to
#     solr in batches of solrj_writer.batch_size. If nil/1,
#     however, an http transaction with solr will be done
#     per doc. DEFAULT to 100, which seems to be a sweet spot.
#
# * solrj_writer.thread_pool: Defaults to 1. A thread pool is used for submitting docs
#     to solr. Set to 0 or nil to disable threading. Set to 1,
#     there will still be a single bg thread doing the adds. For
#     very fast Solr servers and very fast indexing processes, may
#     make sense to increase this value to throw at Solr as fast as it
#     can catch.
#
# ## Example
#
#   settings do
#     provide "writer_class_name", "Traject::SolrJWriter"
#
#     # This is just regular ruby, so don't be afraid to have conditionals!
#     # Switch on hostname, for test and production server differences
#     if Socket.gethostname =~ /devhost/
#       provide "solr.url", "http://my.dev.machine:9033/catalog"
#     else
#       provide "solr.url", "http://my.production.machine:9033/catalog"
#     end
#
#     provide "solrj_writer.parser_class_name", "BinaryResponseParser" # for Solr 4.x
#     # provide "solrj_writer.parser_class_name", "XMLResponseParser" # For solr 1.x or 3.x
#
#     provide "solrj_writer.commit_on_close", "true"
#   end
class Traject::SolrJWriter
  # just a tuple of a SolrInputDocument
  # and a Traject::Indexer::Context it came from
  class UpdatePackage
    attr_accessor :solr_document, :context

    def initialize(doc, ctx)
      self.solr_document = doc
      self.context = ctx
    end
  end

  include Traject::QualifiedConstGet

  attr_reader :settings

  attr_reader :batched_queue

  def initialize(argSettings)
    @settings = Traject::Indexer::Settings.new(argSettings)
    settings_check!(settings)

    ensure_solrj_loaded!

    solr_server # init

    @batched_queue = java.util.concurrent.LinkedBlockingQueue.new

    # when multi-threaded, exceptions raised in threads are held here
    # we need a HIGH performance queue here to try and avoid slowing things down,
    # since we need to check it frequently.
    @async_exception_queue = java.util.concurrent.ConcurrentLinkedQueue.new

    # Store error count in an AtomicInteger, so multi threads can increment
    # it safely, if we're threaded.
    @skipped_record_incrementer = java.util.concurrent.atomic.AtomicInteger.new(0)

    # if our thread pool settings are 0, it'll just create a null threadpool that
    # executes in calling context.
    @thread_pool = Traject::ThreadPool.new( @settings["solrj_writer.thread_pool"].to_i )

    @debug_ascii_progress = (@settings["debug_ascii_progress"].to_s == "true")

    logger.info(" #{self.class.name} writing to '#{settings['solr.url']}'")
  end

  # Loads solrj if not already loaded. By loading all jars found
  # in settings["solrj.jar_dir"]
  def ensure_solrj_loaded!
    unless defined?(HttpSolrServer) && defined?(SolrInputDocument)
      Traject::Util.require_solrj_jars(settings)
    end

    # And for now, SILENCE SolrJ logging
    org.apache.log4j.Logger.getRootLogger().addAppender(org.apache.log4j.varia.NullAppender.new)
  end

  # Method IS thread-safe, can be called concurrently by multi-threads.
  #
  # Why? If not using batched add, we just use the SolrServer, which is already
  # thread safe itself.
  #
  # If we are using batch add, we surround all access to our shared state batch queue
  # in a mutex -- just a naive implementation. May be able to improve performance
  # with more sophisticated java.util.concurrent data structure (blocking queue etc)
  # I did try a java ArrayBlockingQueue or LinkedBlockingQueue instead of our own
  # mutex -- I did not see consistently different performance. May want to
  # change so doesn't use a mutex at all if multiple mapping threads aren't being
  # used.
  #
  # this class does not at present use any threads itself, all work will be done
  # in the calling thread, including actual http transactions to solr via solrj SolrServer
  # if using batches, then not every #put is a http transaction, but when it is,
  # it's in the calling thread, synchronously.
  def put(context)
    @thread_pool.raise_collected_exception!

    # package the SolrInputDocument along with the context, so we have
    # the context for error reporting when we actually add.

    package = UpdatePackage.new(hash_to_solr_document(context.output_hash), context)

    if settings["solrj_writer.batch_size"].to_i > 1
      ready_batch = []

      batched_queue.add(package)
      if batched_queue.size >= settings["solrj_writer.batch_size"].to_i
        batched_queue.drain_to(ready_batch)
      end

      if ready_batch.length > 0
        if @debug_ascii_progress
          $stderr.write("^")
          if @thread_pool.queue && (@thread_pool.queue.size >= @thread_pool.queue_capacity)
            $stderr.write "!"
          end
        end

        @thread_pool.maybe_in_thread_pool { batch_add_document_packages(ready_batch) }
      end
    else # non-batched add, add one at a time.
      @thread_pool.maybe_in_thread_pool { add_one_document_package(package) }
    end
  end

  def hash_to_solr_document(hash)
    doc = SolrInputDocument.new
    hash.each_pair do |key, value_array|
      value_array.each do |value|
        doc.addField( key, value )
      end
    end
    return doc
  end

  # Takes array and batch adds it to solr -- array of UpdatePackage tuples of
  # SolrInputDocument and context.
  #
  # Catches error in batch add, logs, and re-tries docs individually
  #
  # Is thread-safe, because SolrServer is thread-safe, and we aren't
  # referencing any other shared state. Important that CALLER passes
  # in a doc array that is not shared state, extracting it from
  # shared state batched_queue in a mutex.
  def batch_add_document_packages(current_batch)
    begin
      a = current_batch.collect {|package| package.solr_document }
      solr_server.add( a )

      $stderr.write "%" if @debug_ascii_progress
    rescue Exception => e
      # Error in batch, none of the docs got added, let's try to re-add
      # em all individually, so those that CAN get added get added, and those
      # that can't get individually logged.
      logger.warn "Error encountered in batch solr add, will re-try documents individually, at a performance penalty...\n" + Traject::Util.exception_to_log_message(e)
      current_batch.each do |package|
        add_one_document_package(package)
      end
    end
  end

  # Adds a single SolrInputDocument passed in as an UpdatePackage combo of SolrInputDocument
  # and context.
  #
  # Rescues exceptions thrown by SolrServer.add, logs them, and then raises them
  # again if deemed fatal and should stop indexing. Only intended to be used on a SINGLE
  # document add. If we get an exception on a multi-doc batch add, we need to recover
  # differently.
  def add_one_document_package(package)
    begin
      solr_server.add(package.solr_document)
    # Honestly not sure what the difference is between those types, but SolrJ raises both
    rescue org.apache.solr.common.SolrException, org.apache.solr.client.solrj.SolrServerException => e
      id     = package.context.source_record && package.context.source_record['001'] && package.context.source_record['001'].value
      id_str = id ? "001:#{id}" : ""

      position = package.context.position
      position_str = position ? "at file position #{position} (starting at 1)" : ""

      logger.error("Could not index record #{id_str} #{position_str}\n" + Traject::Util.exception_to_log_message(e) )
      logger.debug(package.context.source_record.to_s)

      @skipped_record_incrementer.getAndIncrement() # AtomicInteger, thread-safe increment.

      if fatal_exception? e
        logger.fatal("SolrJ exception judged fatal, raising...")
        raise e
      end
    end
  end

  def logger
    settings["logger"] ||= Yell.new(STDERR, :level => "gt.fatal") # null logger
  end

  # If an exception is encountered talking to Solr, is it one we should
  # entirely give up on? SolrJ doesn't use a useful exception class hierarchy,
  # we have to look into its details and guess.
  def fatal_exception?(e)
    root_cause = e.respond_to?(:getRootCause) && e.getRootCause

    # Various kinds of inability to actually talk to the
    # server look like this:
    if root_cause.kind_of? java.io.IOException
      return true
    end

    # Consider Solr server returning HTTP 500 Internal Server Error to be fatal.
    # This can mean, for instance, that disk space is exhausted on solr server.
    if e.kind_of?(Java::OrgApacheSolrCommon::SolrException) && e.code == 500
      return true
    end

    return false
  end

  def close
    @thread_pool.raise_collected_exception!

    # Any leftovers in batch buffer? Send em to the threadpool too.
    if batched_queue.length > 0
      packages = []
      batched_queue.drain_to(packages)

      # we do it in the thread pool for consistency, and so
      # it goes to the end of the queue behind any outstanding
      # work in the pool.
      @thread_pool.maybe_in_thread_pool { batch_add_document_packages( packages ) }
    end

    # Wait for shutdown, and time it.
    logger.debug "SolrJWriter: Shutting down thread pool, waiting if needed..."
    elapsed = @thread_pool.shutdown_and_wait
    if elapsed > 60
      logger.warn "Waited #{elapsed} seconds for all SolrJWriter threads, you may want to increase solrj_writer.thread_pool (currently #{@settings["solrj_writer.thread_pool"]})"
    end
    logger.debug "SolrJWriter: Thread pool shutdown complete"
    logger.warn "SolrJWriter: #{skipped_record_count} skipped records" if skipped_record_count > 0

    # check again now that we've waited, there could still be some
    # that didn't show up before.
    @thread_pool.raise_collected_exception!

    if settings["solrj_writer.commit_on_close"].to_s == "true"
      logger.info "SolrJWriter: Sending commit to solr..."
      solr_server.commit
    end

    solr_server.shutdown
    @solr_server = nil
  end

  # Return count of encountered skipped records. Most accurate to call
  # it after #close, in which case it should include full count, even
  # under async thread_pool.
  def skipped_record_count
    @skipped_record_incrementer.get
  end

  def solr_server
    @solr_server ||= instantiate_solr_server!
  end
  attr_writer :solr_server # mainly for testing

  # Instantiates a solr server of class settings["solrj_writer.server_class_name"] or "HttpSolrServer"
  # and initializes it with settings["solr.url"]
  def instantiate_solr_server!
    server_class = qualified_const_get( settings["solrj_writer.server_class_name"] || "HttpSolrServer" )
    server       = server_class.new( settings["solr.url"].to_s )

    if parser_name = settings["solrj_writer.parser_class_name"]
      #parser = org.apache.solr.client.solrj.impl.const_get(parser_name).new
      parser = Java::JavaClass.for_name("org.apache.solr.client.solrj.impl.#{parser_name}").ruby_class.new
      server.setParser( parser )
    end

    server
  end

  def settings_check!(settings)
    unless settings.has_key?("solr.url") && ! settings["solr.url"].nil?
      raise ArgumentError.new("SolrJWriter requires a 'solr.url' solr url in settings")
    end

    unless settings["solr.url"] =~ /^#{URI::regexp}$/
      raise ArgumentError.new("SolrJWriter requires a 'solr.url' setting that looks like a URL, not: `#{settings['solr.url']}`")
    end
  end

end
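
For anyone porting a 1.x configuration: the batching, threading, and commit behavior documented above has counterparts in the writer that replaces this one. A hedged sketch of the mapping, assuming the 2.x SolrJsonWriter reads its settings under a solr_writer.* prefix; these names are assumptions to be checked against data/lib/traject/solr_json_writer.rb added in this release.

# Hypothetical mapping of the deleted solrj_writer.* settings to 2.x
# equivalents; every solr_writer.* name below is an assumption, not
# confirmed by this diff.
settings do
  provide "solr.url", "http://my.production.machine:9033/catalog" # unchanged
  provide "solr_writer.batch_size", 100           # was solrj_writer.batch_size
  provide "solr_writer.thread_pool", 1            # was solrj_writer.thread_pool
  provide "solr_writer.commit_on_close", "true"   # was solrj_writer.commit_on_close
  # solrj_writer.parser_class_name and solrj.jar_dir have no equivalent:
  # the new writer talks plain HTTP/JSON, so no SolrJ jars or response parser.
end
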
data/test/marc4j_reader_test.rb
DELETED
@@ -1,136 +0,0 @@
# Encoding: UTF-8

require 'test_helper'

require 'traject'
require 'traject/indexer'
require 'traject/marc4j_reader'

require 'marc'

describe "Marc4JReader" do
  it "reads Marc binary" do
    file = File.new(support_file_path("test_data.utf8.mrc"))
    settings = Traject::Indexer::Settings.new() # binary type is default
    reader = Traject::Marc4JReader.new(file, settings)

    array = reader.to_a

    assert_equal 30, array.length
    first = array.first

    assert_kind_of MARC::Record, first
    assert_equal first['245']['a'].encoding.name, "UTF-8"
  end

  it "can skip a bad subfield code" do
    file = File.new(support_file_path("bad_subfield_code.marc"))
    settings = Traject::Indexer::Settings.new() # binary type is default
    reader = Traject::Marc4JReader.new(file, settings)

    array = reader.to_a

    assert_equal 1, array.length
    assert_kind_of MARC::Record, array.first
    assert_length 2, array.first['260'].subfields
  end

  it "reads Marc binary in Marc8 encoding" do
    file = File.new(support_file_path("one-marc8.mrc"))
    settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC8")
    reader = Traject::Marc4JReader.new(file, settings)

    array = reader.to_a

    assert_length 1, array

    assert_kind_of MARC::Record, array.first
    a245a = array.first['245']['a']

    assert a245a.encoding.name, "UTF-8"
    assert a245a.valid_encoding?
    # marc4j converts to denormalized unicode, bah. Although
    # it's legal, it probably looks weird as a string literal
    # below, depending on your editor.
    assert_equal "Por uma outra globalização :", a245a

    # Set leader byte to proper for unicode
    assert_equal 'a', array.first.leader[9]
  end

  it "reads XML" do
    file = File.new(support_file_path "test_data.utf8.marc.xml")
    settings = Traject::Indexer::Settings.new("marc_source.type" => "xml")
    reader = Traject::Marc4JReader.new(file, settings)

    array = reader.to_a

    assert_equal 30, array.length

    first = array.first

    assert_kind_of MARC::Record, first
    assert first['245']['a'].encoding.name, "UTF-8"
    assert_equal "Fikr-i Ayāz /", first['245']['a']
  end

  it "keeps marc4j object when asked" do
    file = File.new(support_file_path "test_data.utf8.marc.xml")
    settings = Traject::Indexer::Settings.new("marc_source.type" => "xml", 'marc4j_reader.keep_marc4j' => true)
    record = Traject::Marc4JReader.new(file, settings).to_a.first
    assert_kind_of MARC::Record, record
    assert_kind_of Java::org.marc4j.marc.impl::RecordImpl, record.original_marc4j
  end

  it "replaces unicode character reference in Marc8 transcode" do
    file = File.new(support_file_path "escaped_character_reference.marc8.marc")
    # due to marc4j idiosyncracies, this test will NOT pass with default source_encoding
    # of "BESTGUESS", it only works if you explicitly set to MARC8. Doh.
    settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC8") # binary type is default
    record = Traject::Marc4JReader.new(file, settings).to_a.first

    assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record['260']['a']
  end

  describe "Marc4J Java Permissive Stream Reader" do
    # needed for sanity check when our tests fail to see if Marc4J
    # is not behaving how we think it should.
    it "converts character references" do
      file = File.new(support_file_path "escaped_character_reference.marc8.marc")
      reader = MarcPermissiveStreamReader.new(file.to_inputstream, true, true, "MARC-8")
      record = reader.next

      field = record.getVariableField("260")
      subfield = field.getSubfield('a'.ord)
      value = subfield.getData

      assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", value
    end
  end

  it "replaces bad byte in UTF8 marc" do
    skip "Marc4J needs fixing on it's end" # Marc4J won't do this in 'permissive' mode, gah.

    # Note this only works because the marc file DOES correctly
    # have leader byte 9 set to 'a' for UTF8, otherwise Marc4J can't do it.
    file = File.new(support_file_path "bad_utf_byte.utf8.marc")

    settings = Traject::Indexer::Settings.new() # binary UTF8 type is default
    reader = Traject::Marc4JReader.new(file, settings)

    record = reader.to_a.first

    value = record['300']['a']

    assert_equal value.encoding.name, "UTF-8"
    assert value.valid_encoding?, "Has valid encoding"
    assert_equal "This is a bad byte: '\uFFFD' and another: '\uFFFD'", record['300']['a']
  end

end
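
The Marc4JReader and its tests leave the core gem along with the SolrJ writer. A hedged sketch of how a JRuby user might keep the old reader, assuming it lives on as a separate traject-marc4j_reader gem (the gem name is an assumption based on the extraction pattern, not confirmed by anything in this diff):

# Gemfile -- hypothetical gem name, to be verified
gem 'traject-marc4j_reader', platform: :jruby

# traject configuration: select the reader explicitly, as in 1.x
settings do
  provide "reader_class_name", "Traject::Marc4JReader"
  provide "marc_source.type", "xml"
end
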