traject 0.0.2 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -0
- data/README.md +85 -61
- data/Rakefile +5 -0
- data/bin/traject +31 -3
- data/doc/settings.md +74 -13
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject/indexer/settings.rb +75 -0
- data/lib/traject/indexer.rb +255 -45
- data/lib/traject/json_writer.rb +4 -2
- data/lib/traject/macros/marc21.rb +18 -6
- data/lib/traject/macros/marc21_semantics.rb +405 -0
- data/lib/traject/macros/marc_format_classifier.rb +180 -0
- data/lib/traject/marc4j_reader.rb +160 -0
- data/lib/traject/marc_extractor.rb +33 -17
- data/lib/traject/marc_reader.rb +14 -11
- data/lib/traject/solrj_writer.rb +247 -9
- data/lib/traject/thread_pool.rb +154 -0
- data/lib/traject/translation_map.rb +46 -4
- data/lib/traject/util.rb +30 -0
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/indexer/each_record_test.rb +34 -0
- data/test/indexer/macros_marc21_semantics_test.rb +206 -0
- data/test/indexer/macros_marc21_test.rb +10 -1
- data/test/indexer/map_record_test.rb +78 -8
- data/test/indexer/read_write_test.rb +43 -10
- data/test/indexer/settings_test.rb +60 -4
- data/test/indexer/to_field_test.rb +39 -0
- data/test/marc4j_reader_test.rb +75 -0
- data/test/marc_extractor_test.rb +62 -0
- data/test/marc_format_classifier_test.rb +91 -0
- data/test/marc_reader_test.rb +12 -0
- data/test/solrj_writer_test.rb +146 -43
- data/test/test_helper.rb +50 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +153 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +8 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/traject.gemspec +1 -1
- data/vendor/marc4j/README.md +17 -0
- data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
- metadata +81 -2
data/lib/traject/indexer.rb
CHANGED
@@ -1,8 +1,11 @@
|
|
1
|
-
require '
|
1
|
+
require 'yell'
|
2
2
|
|
3
3
|
require 'traject'
|
4
4
|
require 'traject/qualified_const_get'
|
5
|
+
|
6
|
+
require 'traject/indexer/settings'
|
5
7
|
require 'traject/marc_reader'
|
8
|
+
require 'traject/marc4j_reader'
|
6
9
|
require 'traject/json_writer'
|
7
10
|
require 'traject/solrj_writer'
|
8
11
|
|
@@ -28,11 +31,15 @@ require 'traject/macros/basic'
|
|
28
31
|
#
|
29
32
|
#
|
30
33
|
# A Writer is any class that:
|
31
|
-
# 1) Has a one-argument initializer taking a Settings hash.
|
34
|
+
# 1) Has a one-argument initializer taking a Settings hash. (The logger
|
35
|
+
# is provided to the Writer in settings["logger"])
|
32
36
|
# 2) Responds to a one argument #put method, where the argument is
|
33
|
-
# a
|
37
|
+
# a Traject::Indexer::Context, containing an #output_hash
|
38
|
+
# hash of mapped keys/values. The writer should write them
|
34
39
|
# to the appropriate place.
|
35
40
|
# 3) Responds to a #close method, called when we're done.
|
41
|
+
# 4) Optionally implements a #skipped_record_count method, returning int count of records
|
42
|
+
# that were skipped due to errors (and presumably logged)
|
36
43
|
#
|
37
44
|
# The default writer (will be) the SolrWriter, which is configured
|
38
45
|
# through additional Settings as well. A JsonWriter is also available,
|
@@ -55,8 +62,9 @@ class Traject::Indexer
|
|
55
62
|
include Traject::Macros::Basic
|
56
63
|
|
57
64
|
|
58
|
-
|
59
|
-
|
65
|
+
# optional hash or Traject::Indexer::Settings object of settings.
|
66
|
+
def initialize(arg_settings = {})
|
67
|
+
@settings = Settings.new(arg_settings)
|
60
68
|
@index_steps = []
|
61
69
|
end
|
62
70
|
|
@@ -88,58 +96,270 @@ class Traject::Indexer
|
|
88
96
|
return @settings
|
89
97
|
end
|
90
98
|
|
99
|
+
def logger
|
100
|
+
@logger ||= create_logger
|
101
|
+
end
|
102
|
+
attr_writer :logger
|
103
|
+
|
104
|
+
|
105
|
+
# Just calculates the arg that's gonna be given to Yell.new
|
106
|
+
# or SomeLogger.new
|
107
|
+
def logger_argument
|
108
|
+
specified = settings["log.file"] || "STDERR"
|
109
|
+
|
110
|
+
case specified
|
111
|
+
when "STDOUT" then STDOUT
|
112
|
+
when "STDERR" then STDERR
|
113
|
+
else specified
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
# Second arg to Yell.new, options hash, calculated from
|
118
|
+
# settings
|
119
|
+
def logger_options
|
120
|
+
# formatter, default is fairly basic
|
121
|
+
format = settings["log.format"] || "%d %5L %m"
|
122
|
+
format = case format
|
123
|
+
when "false" then false
|
124
|
+
when "" then nil
|
125
|
+
else format
|
126
|
+
end
|
127
|
+
|
128
|
+
level = settings["log.level"] || "info"
|
129
|
+
|
130
|
+
{:format => format, :level => level}
|
131
|
+
end
|
132
|
+
|
133
|
+
# Create logger according to settings
|
134
|
+
def create_logger
|
135
|
+
# log everything to STDERR or specified logfile
|
136
|
+
logger = Yell.new( logger_argument, logger_options )
|
137
|
+
# ADDITIONALLY log error and higher to....
|
138
|
+
if settings["log.error_file"]
|
139
|
+
logger.adapter :file, settings["log.error_file"], :level => 'gte.error'
|
140
|
+
end
|
141
|
+
|
142
|
+
return logger
|
143
|
+
end
|
144
|
+
|
145
|
+
|
91
146
|
# Used to define an indexing mapping.
|
92
147
|
def to_field(field_name, aLambda = nil, &block)
|
148
|
+
|
149
|
+
if field_name.nil? || field_name.empty?
|
150
|
+
raise ArgumentError.new("to_field requires a non-blank first argument, field name")
|
151
|
+
end
|
152
|
+
[aLambda, block].each do |proc|
|
153
|
+
# allow negative arity, meaning variable/optional, trust em on that.
|
154
|
+
# but for positive arity, we need 2 or 3 args
|
155
|
+
if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
|
156
|
+
raise ArgumentError.new("block/proc given to to_field needs 2 or 3 arguments: #{proc}")
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
160
|
+
|
93
161
|
@index_steps << {
|
94
162
|
:field_name => field_name.to_s,
|
95
163
|
:lambda => aLambda,
|
96
|
-
:block => block
|
164
|
+
:block => block,
|
165
|
+
:type => :to_field,
|
166
|
+
:source_location => Traject::Util.extract_caller_location(caller.first)
|
167
|
+
}
|
168
|
+
end
|
169
|
+
|
170
|
+
def each_record(aLambda = nil, &block)
|
171
|
+
# arity check
|
172
|
+
[aLambda, block].each do |proc|
|
173
|
+
# allow negative arity, meaning variable/optional, trust em on that.
|
174
|
+
# but for positive arity, we need 1 or 2 args
|
175
|
+
if proc && (proc.arity == 0 || proc.arity > 2)
|
176
|
+
raise ArgumentError.new("block/proc given to to_field needs 1 or 2 arguments: #{proc}")
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
@index_steps << {
|
181
|
+
:lambda => aLambda,
|
182
|
+
:block => block,
|
183
|
+
:type => :each_record,
|
184
|
+
:source_location => Traject::Util.extract_caller_location(caller.first)
|
97
185
|
}
|
98
186
|
end
|
99
187
|
|
100
|
-
|
101
|
-
#
|
102
|
-
#
|
188
|
+
|
189
|
+
# Processes a single record according to indexing rules set up in
|
190
|
+
# this indexer. Returns the output hash (a hash whose keys are
|
191
|
+
# string fields, and values are arrays of one or more values in that field)
|
103
192
|
#
|
193
|
+
# This is a convenience shortcut for #map_to_context! -- use that one
|
194
|
+
# if you want to provide additional context
|
195
|
+
# like position, and/or get back the full context.
|
104
196
|
def map_record(record)
|
105
197
|
context = Context.new(:source_record => record, :settings => settings)
|
198
|
+
map_to_context!(context)
|
199
|
+
return context.output_hash
|
200
|
+
end
|
106
201
|
|
202
|
+
# Maps a single record INTO the argument, a Traject::Indexer::Context.
|
203
|
+
#
|
204
|
+
# Context must be passed with a #source_record and #settings, and optionally
|
205
|
+
# a #position.
|
206
|
+
#
|
207
|
+
# Context will be mutated by this method, most significantly by adding
|
208
|
+
# an #output_hash, a hash from fieldname to array of values in that field.
|
209
|
+
#
|
210
|
+
# Pass in a context with a set #position if you want that to be available
|
211
|
+
# to mapping routines.
|
212
|
+
#
|
213
|
+
# Returns the context passed in as the argument, as a convenience for chaining etc.
|
214
|
+
def map_to_context!(context)
|
107
215
|
@index_steps.each do |index_step|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
216
|
+
if index_step[:type] == :to_field
|
217
|
+
|
218
|
+
accumulator = []
|
219
|
+
context.field_name = index_step[:field_name]
|
220
|
+
|
221
|
+
# Might have a lambda arg AND a block, we execute in order,
|
222
|
+
# with same accumulator.
|
223
|
+
|
224
|
+
[index_step[:lambda], index_step[:block]].each do |aProc|
|
225
|
+
if aProc
|
226
|
+
log_mapping_errors(context, index_step, aProc) do
|
227
|
+
if aProc.arity == 2
|
228
|
+
aProc.call(context.source_record, accumulator)
|
229
|
+
else
|
230
|
+
aProc.call(context.source_record, accumulator, context)
|
231
|
+
end
|
232
|
+
end
|
120
233
|
end
|
121
234
|
end
|
235
|
+
(context.output_hash[context.field_name] ||= []).concat accumulator unless accumulator.empty?
|
236
|
+
context.field_name = nil
|
122
237
|
|
123
|
-
|
238
|
+
elsif index_step[:type] == :each_record
|
239
|
+
|
240
|
+
# one or two arg
|
241
|
+
[index_step[:lambda], index_step[:block]].each do |aProc|
|
242
|
+
if aProc
|
243
|
+
log_mapping_errors(context, index_step, aProc) do
|
244
|
+
if aProc.arity == 1
|
245
|
+
aProc.call(context.source_record)
|
246
|
+
else
|
247
|
+
aProc.call(context.source_record, context)
|
248
|
+
end
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
124
252
|
|
125
|
-
|
126
|
-
|
253
|
+
else
|
254
|
+
raise ArgumentError.new("An @index_step we don't know how to deal with: #{@index_step}")
|
255
|
+
end
|
127
256
|
end
|
128
257
|
|
129
|
-
return context
|
258
|
+
return context
|
259
|
+
end
|
260
|
+
|
261
|
+
# just a wrapper that captures and records any unexpected
|
262
|
+
# errors raised in mapping, along with contextual information
|
263
|
+
# on record and location in source file of mapping rule.
|
264
|
+
#
|
265
|
+
# Re-raises error at the moment.
|
266
|
+
#
|
267
|
+
# log_mapping_errors(context, index_step, some_lambda) do
|
268
|
+
# all_sorts_of_stuff # that will have errors logged
|
269
|
+
# end
|
270
|
+
def log_mapping_errors(context, index_step, aProc)
|
271
|
+
begin
|
272
|
+
yield
|
273
|
+
rescue Exception => e
|
274
|
+
msg = "Unexpected error on record id `#{id_string(context.source_record)}` at file position #{context.position}\n"
|
275
|
+
|
276
|
+
conf = context.field_name ? "to_field '#{context.field_name}'" : "each_record"
|
277
|
+
|
278
|
+
msg += " while executing #{conf} defined at #{index_step[:source_location]}\n"
|
279
|
+
msg += Traject::Util.exception_to_log_message(e)
|
280
|
+
|
281
|
+
logger.error msg
|
282
|
+
logger.debug "Record: " + context.source_record.to_s
|
283
|
+
|
284
|
+
raise e
|
285
|
+
end
|
130
286
|
end
|
131
287
|
|
132
288
|
# Processes a stream of records, reading from the configured Reader,
|
133
289
|
# mapping according to configured mapping rules, and then writing
|
134
290
|
# to configured Writer.
|
291
|
+
#
|
292
|
+
# returns 'false' as a signal to command line to return non-zero exit code
|
293
|
+
# for some reason (reason found in logs, presumably). This particular mechanism
|
294
|
+
# is open to complexification, starting simple. We do need SOME way to return
|
295
|
+
# non-zero to command line.
|
296
|
+
#
|
135
297
|
def process(io_stream)
|
298
|
+
settings.fill_in_defaults!
|
299
|
+
|
300
|
+
count = 0
|
301
|
+
start_time = batch_start_time = Time.now
|
302
|
+
logger.info "beginning Indexer#process with settings: #{settings.inspect}"
|
303
|
+
|
136
304
|
reader = self.reader!(io_stream)
|
137
305
|
writer = self.writer!
|
138
306
|
|
139
|
-
|
140
|
-
|
307
|
+
thread_pool = Traject::ThreadPool.new(settings["processing_thread_pool"].to_i)
|
308
|
+
|
309
|
+
logger.info " with reader: #{reader.class.name} and writer: #{writer.class.name}"
|
310
|
+
|
311
|
+
reader.each do |record; position|
|
312
|
+
count += 1
|
313
|
+
|
314
|
+
# have to use a block local var, so the changing `count` one
|
315
|
+
# doesn't get caught in the closure. Weird, yeah.
|
316
|
+
position = count
|
317
|
+
|
318
|
+
thread_pool.raise_collected_exception!
|
319
|
+
|
320
|
+
if settings["debug_ascii_progress"].to_s == "true"
|
321
|
+
$stderr.write "." if count % settings["solrj_writer.batch_size"] == 0
|
322
|
+
end
|
323
|
+
|
324
|
+
if settings["log.batch_progress"] && (count % settings["log.batch_progress"].to_i == 0)
|
325
|
+
batch_rps = settings["log.batch_progress"].to_i / (Time.now - batch_start_time)
|
326
|
+
overall_rps = count / (Time.now - start_time)
|
327
|
+
logger.info "Traject::Indexer#process, read #{count} records at id:#{id_string(record)}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall"
|
328
|
+
batch_start_time = Time.now
|
329
|
+
end
|
330
|
+
|
331
|
+
# we have to use this weird lambda to properly "capture" the count, instead
|
332
|
+
# of having it be bound to the original variable in a non-threadsafe way.
|
333
|
+
# This is confusing, I might not be understanding things properly, but that's where i am.
|
334
|
+
#thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
|
335
|
+
thread_pool.maybe_in_thread_pool do
|
336
|
+
context = Context.new(:source_record => record, :settings => settings, :position => position)
|
337
|
+
map_to_context!(context)
|
338
|
+
writer.put context
|
339
|
+
end
|
340
|
+
|
141
341
|
end
|
342
|
+
$stderr.write "\n" if settings["debug_ascii_progress"].to_s == "true"
|
343
|
+
|
344
|
+
logger.debug "Shutting down #processing mapper threadpool..."
|
345
|
+
thread_pool.shutdown_and_wait
|
346
|
+
logger.debug "#processing mapper threadpool shutdown complete."
|
347
|
+
|
348
|
+
thread_pool.raise_collected_exception!
|
349
|
+
|
350
|
+
|
142
351
|
writer.close if writer.respond_to?(:close)
|
352
|
+
|
353
|
+
elapsed = Time.now - start_time
|
354
|
+
avg_rps = (count / elapsed)
|
355
|
+
logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
|
356
|
+
|
357
|
+
if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
|
358
|
+
logger.error "Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
|
359
|
+
return false
|
360
|
+
end
|
361
|
+
|
362
|
+
return true
|
143
363
|
end
|
144
364
|
|
145
365
|
def reader_class
|
@@ -159,33 +379,21 @@ class Traject::Indexer
|
|
159
379
|
# Instantiate a Traject Reader, using class set
|
160
380
|
# in #reader_class, initialized with io_stream passed in
|
161
381
|
def reader!(io_stream)
|
162
|
-
return reader_class.new(io_stream, settings)
|
382
|
+
return reader_class.new(io_stream, settings.merge("logger" => logger))
|
163
383
|
end
|
164
384
|
|
165
385
|
# Instantiate a Traject Writer, using class set in #writer_class
|
166
386
|
def writer!
|
167
|
-
return writer_class.new(settings)
|
387
|
+
return writer_class.new(settings.merge("logger" => logger))
|
168
388
|
end
|
169
389
|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
}
|
390
|
+
# get a printable id from record for error logging.
|
391
|
+
# Maybe override this for a future XML version.
|
392
|
+
def id_string(record)
|
393
|
+
record && record['001'] && record['001'].value.to_s
|
175
394
|
end
|
176
395
|
|
177
396
|
|
178
|
-
|
179
|
-
# Enhanced with a few features from Hashie, to make it for
|
180
|
-
# instance string/symbol indifferent
|
181
|
-
class Settings < Hash
|
182
|
-
include Hashie::Extensions::MergeInitializer # can init with hash
|
183
|
-
include Hashie::Extensions::IndifferentAccess
|
184
|
-
|
185
|
-
# Hashie bug Issue #100 https://github.com/intridea/hashie/pull/100
|
186
|
-
alias_method :store, :indifferent_writer
|
187
|
-
end
|
188
|
-
|
189
397
|
# Represents the context of a specific record being indexed, passed
|
190
398
|
# to indexing logic blocks
|
191
399
|
#
|
@@ -203,5 +411,7 @@ class Traject::Indexer
|
|
203
411
|
|
204
412
|
attr_accessor :clipboard, :output_hash
|
205
413
|
attr_accessor :field_name, :source_record, :settings
|
414
|
+
# 1-based position in stream of processed records.
|
415
|
+
attr_accessor :position
|
206
416
|
end
|
207
417
|
end
|
data/lib/traject/json_writer.rb
CHANGED
@@ -20,7 +20,9 @@ class Traject::JsonWriter
|
|
20
20
|
@settings = argSettings
|
21
21
|
end
|
22
22
|
|
23
|
-
def put(
|
23
|
+
def put(context)
|
24
|
+
hash = context.output_hash
|
25
|
+
|
24
26
|
serialized =
|
25
27
|
if settings["json_writer.pretty_print"]
|
26
28
|
JSON.pretty_generate(hash)
|
@@ -34,7 +36,7 @@ class Traject::JsonWriter
|
|
34
36
|
unless defined? @output_file
|
35
37
|
@output_file =
|
36
38
|
if settings["output_file"]
|
37
|
-
File.open(settings["output_file"])
|
39
|
+
File.open(settings["output_file"], 'w:UTF-8')
|
38
40
|
elsif settings["output_stream"]
|
39
41
|
settings["output_stream"]
|
40
42
|
else
|
@@ -18,6 +18,13 @@ module Traject::Macros
|
|
18
18
|
# Second arg is optional options, including options valid on MarcExtractor.new,
|
19
19
|
# and others. (TODO)
|
20
20
|
#
|
21
|
+
# * :first => true: take only first value
|
22
|
+
# * :translation_map => String: translate with named translation map looked up in load
|
23
|
+
# path, uses Traject::TranslationMap.new(translation_map_arg)
|
24
|
+
# * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
|
25
|
+
# have shown themselves useful with Marc, using Marc21.trim_punctuation
|
26
|
+
# * :default => String: if otherwise empty, add default value
|
27
|
+
#
|
21
28
|
# Examples:
|
22
29
|
#
|
23
30
|
# to_field("title", extract_marc("245abcd", :trim_punctuation => true))
|
@@ -26,11 +33,12 @@ module Traject::Macros
|
|
26
33
|
def extract_marc(spec, options = {})
|
27
34
|
only_first = options.delete(:first)
|
28
35
|
trim_punctuation = options.delete(:trim_punctuation)
|
36
|
+
default_value = options.delete(:default)
|
29
37
|
|
30
38
|
# We create the TranslationMap here on load, not inside the closure
|
31
39
|
# where it'll be called for every record. Since TranslationMap is supposed
|
32
40
|
# to cache, prob doesn't matter, but doesn't hurt. Also causes any syntax
|
33
|
-
# exceptions to raise on load.
|
41
|
+
# exceptions to raise on load.
|
34
42
|
if translation_map_arg = options.delete(:translation_map)
|
35
43
|
translation_map = Traject::TranslationMap.new(translation_map_arg)
|
36
44
|
end
|
@@ -49,6 +57,10 @@ module Traject::Macros
|
|
49
57
|
if trim_punctuation
|
50
58
|
accumulator.collect! {|s| Marc21.trim_punctuation(s)}
|
51
59
|
end
|
60
|
+
|
61
|
+
if default_value && accumulator.empty?
|
62
|
+
accumulator << default_value
|
63
|
+
end
|
52
64
|
end
|
53
65
|
end
|
54
66
|
|
@@ -97,7 +109,7 @@ module Traject::Macros
|
|
97
109
|
# All fields in from-to must be marc DATA (not control fields), or weirdness
|
98
110
|
#
|
99
111
|
# Can always run this thing multiple times on the same field if you need
|
100
|
-
# non-contiguous ranges of fields.
|
112
|
+
# non-contiguous ranges of fields.
|
101
113
|
def extract_all_marc_values(options = {})
|
102
114
|
options = {:from => "100", :to => "899", :seperator => ' '}.merge(options)
|
103
115
|
|
@@ -123,15 +135,15 @@ module Traject::Macros
|
|
123
135
|
# pretty simple.
|
124
136
|
#
|
125
137
|
# Removes
|
126
|
-
# * trailing: comma, slash, semicolon, colon (possibly followed by whitespace)
|
127
|
-
# * trailing period if it is preceded by at least three letters (possibly followed by whitespace)
|
138
|
+
# * trailing: comma, slash, semicolon, colon (possibly preceded and followed by whitespace)
|
139
|
+
# * trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
|
128
140
|
# * single square bracket characters if they are the start and/or end
|
129
141
|
# chars and there are no internal square brackets.
|
130
142
|
#
|
131
143
|
# Returns altered string, doesn't change original arg.
|
132
144
|
def self.trim_punctuation(str)
|
133
|
-
str = str.sub(/[ ,\/;:] *\Z/, '')
|
134
|
-
str = str.sub(/(\w\w\w)\. *\Z/, '\1')
|
145
|
+
str = str.sub(/ *[ ,\/;:] *\Z/, '')
|
146
|
+
str = str.sub(/ *(\w\w\w)\. *\Z/, '\1')
|
135
147
|
str = str.sub(/\A\[?([^\[\]]+)\]?\Z/, '\1')
|
136
148
|
return str
|
137
149
|
end
|