traject 0.0.2 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -0
- data/README.md +85 -61
- data/Rakefile +5 -0
- data/bin/traject +31 -3
- data/doc/settings.md +74 -13
- data/lib/tasks/load_maps.rake +48 -0
- data/lib/traject/indexer/settings.rb +75 -0
- data/lib/traject/indexer.rb +255 -45
- data/lib/traject/json_writer.rb +4 -2
- data/lib/traject/macros/marc21.rb +18 -6
- data/lib/traject/macros/marc21_semantics.rb +405 -0
- data/lib/traject/macros/marc_format_classifier.rb +180 -0
- data/lib/traject/marc4j_reader.rb +160 -0
- data/lib/traject/marc_extractor.rb +33 -17
- data/lib/traject/marc_reader.rb +14 -11
- data/lib/traject/solrj_writer.rb +247 -9
- data/lib/traject/thread_pool.rb +154 -0
- data/lib/traject/translation_map.rb +46 -4
- data/lib/traject/util.rb +30 -0
- data/lib/traject/version.rb +1 -1
- data/lib/translation_maps/lcc_top_level.yaml +26 -0
- data/lib/translation_maps/marc_genre_007.yaml +9 -0
- data/lib/translation_maps/marc_genre_leader.yaml +22 -0
- data/lib/translation_maps/marc_geographic.yaml +589 -0
- data/lib/translation_maps/marc_instruments.yaml +102 -0
- data/lib/translation_maps/marc_languages.yaml +490 -0
- data/test/indexer/each_record_test.rb +34 -0
- data/test/indexer/macros_marc21_semantics_test.rb +206 -0
- data/test/indexer/macros_marc21_test.rb +10 -1
- data/test/indexer/map_record_test.rb +78 -8
- data/test/indexer/read_write_test.rb +43 -10
- data/test/indexer/settings_test.rb +60 -4
- data/test/indexer/to_field_test.rb +39 -0
- data/test/marc4j_reader_test.rb +75 -0
- data/test/marc_extractor_test.rb +62 -0
- data/test/marc_format_classifier_test.rb +91 -0
- data/test/marc_reader_test.rb +12 -0
- data/test/solrj_writer_test.rb +146 -43
- data/test/test_helper.rb +50 -0
- data/test/test_support/245_no_ab.marc +1 -0
- data/test/test_support/880_with_no_6.utf8.marc +1 -0
- data/test/test_support/bad_subfield_code.marc +1 -0
- data/test/test_support/date_resort_to_260.marc +1 -0
- data/test/test_support/date_type_r_missing_date2.marc +1 -0
- data/test/test_support/date_with_u.marc +1 -0
- data/test/test_support/demo_config.rb +153 -0
- data/test/test_support/emptyish_record.marc +1 -0
- data/test/test_support/louis_armstrong.marc +1 -0
- data/test/test_support/manuscript_online_thesis.marc +1 -0
- data/test/test_support/microform_online_conference.marc +1 -0
- data/test/test_support/multi_era.marc +1 -0
- data/test/test_support/multi_geo.marc +1 -0
- data/test/test_support/musical_cage.marc +1 -0
- data/test/test_support/one-marc8.mrc +1 -0
- data/test/test_support/online_only.marc +1 -0
- data/test/test_support/packed_041a_lang.marc +1 -0
- data/test/test_support/the_business_ren.marc +1 -0
- data/test/translation_map_test.rb +8 -0
- data/test/translation_maps/properties_map.properties +5 -0
- data/traject.gemspec +1 -1
- data/vendor/marc4j/README.md +17 -0
- data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
- metadata +81 -2
data/lib/traject/indexer.rb
CHANGED
@@ -1,8 +1,11 @@
|
|
1
|
-
require '
|
1
|
+
require 'yell'
|
2
2
|
|
3
3
|
require 'traject'
|
4
4
|
require 'traject/qualified_const_get'
|
5
|
+
|
6
|
+
require 'traject/indexer/settings'
|
5
7
|
require 'traject/marc_reader'
|
8
|
+
require 'traject/marc4j_reader'
|
6
9
|
require 'traject/json_writer'
|
7
10
|
require 'traject/solrj_writer'
|
8
11
|
|
@@ -28,11 +31,15 @@ require 'traject/macros/basic'
|
|
28
31
|
#
|
29
32
|
#
|
30
33
|
# A Writer is any class that:
|
31
|
-
# 1) Has a one-argument initializer taking a Settings hash.
|
34
|
+
# 1) Has a one-argument initializer taking a Settings hash. (The logger
|
35
|
+
# is provided to the Writer in settings["logger"])
|
32
36
|
# 2) Responds to a one argument #put method, where the argument is
|
33
|
-
# a
|
37
|
+
# a Traject::Indexer::Context, containing an #output_hash
|
38
|
+
# hash of mapped keys/values. The writer should write them
|
34
39
|
# to the appropriate place.
|
35
40
|
# 3) Responds to a #close method, called when we're done.
|
41
|
+
# 4) Optionally implements a #skipped_record_count method, returning int count of records
|
42
|
+
# that were skipped due to errors (and presumably logged)
|
36
43
|
#
|
37
44
|
# The default writer (will be) the SolrWriter , which is configured
|
38
45
|
# through additional Settings as well. A JsonWriter is also available,
|
@@ -55,8 +62,9 @@ class Traject::Indexer
|
|
55
62
|
include Traject::Macros::Basic
|
56
63
|
|
57
64
|
|
58
|
-
|
59
|
-
|
65
|
+
# optional hash or Traject::Indexer::Settings object of settings.
|
66
|
+
def initialize(arg_settings = {})
|
67
|
+
@settings = Settings.new(arg_settings)
|
60
68
|
@index_steps = []
|
61
69
|
end
|
62
70
|
|
@@ -88,58 +96,270 @@ class Traject::Indexer
|
|
88
96
|
return @settings
|
89
97
|
end
|
90
98
|
|
99
|
+
def logger
|
100
|
+
@logger ||= create_logger
|
101
|
+
end
|
102
|
+
attr_writer :logger
|
103
|
+
|
104
|
+
|
105
|
+
# Just calculates the arg that's gonna be given to Yell.new
|
106
|
+
# or SomeLogger.new
|
107
|
+
def logger_argument
|
108
|
+
specified = settings["log.file"] || "STDERR"
|
109
|
+
|
110
|
+
case specified
|
111
|
+
when "STDOUT" then STDOUT
|
112
|
+
when "STDERR" then STDERR
|
113
|
+
else specified
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
# Second arg to Yell.new, options hash, calculated from
|
118
|
+
# settings
|
119
|
+
def logger_options
|
120
|
+
# formatter, default is fairly basic
|
121
|
+
format = settings["log.format"] || "%d %5L %m"
|
122
|
+
format = case format
|
123
|
+
when "false" then false
|
124
|
+
when "" then nil
|
125
|
+
else format
|
126
|
+
end
|
127
|
+
|
128
|
+
level = settings["log.level"] || "info"
|
129
|
+
|
130
|
+
{:format => format, :level => level}
|
131
|
+
end
|
132
|
+
|
133
|
+
# Create logger according to settings
|
134
|
+
def create_logger
|
135
|
+
# log everything to STDERR or specified logfile
|
136
|
+
logger = Yell.new( logger_argument, logger_options )
|
137
|
+
# ADDITIONALLY log error and higher to....
|
138
|
+
if settings["log.error_file"]
|
139
|
+
logger.adapter :file, settings["log.error_file"], :level => 'gte.error'
|
140
|
+
end
|
141
|
+
|
142
|
+
return logger
|
143
|
+
end
|
144
|
+
|
145
|
+
|
91
146
|
# Used to define an indexing mapping.
|
92
147
|
def to_field(field_name, aLambda = nil, &block)
|
148
|
+
|
149
|
+
if field_name.nil? || field_name.empty?
|
150
|
+
raise ArgumentError.new("to_field requires a non-blank first argument, field name")
|
151
|
+
end
|
152
|
+
[aLambda, block].each do |proc|
|
153
|
+
# allow negative arity, meaning variable/optional, trust em on that.
|
154
|
+
# but for positive arity, we need 2 or 3 args
|
155
|
+
if proc && (proc.arity == 0 || proc.arity == 1 || proc.arity > 3)
|
156
|
+
raise ArgumentError.new("block/proc given to to_field needs 2 or 3 arguments: #{proc}")
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
160
|
+
|
93
161
|
@index_steps << {
|
94
162
|
:field_name => field_name.to_s,
|
95
163
|
:lambda => aLambda,
|
96
|
-
:block => block
|
164
|
+
:block => block,
|
165
|
+
:type => :to_field,
|
166
|
+
:source_location => Traject::Util.extract_caller_location(caller.first)
|
167
|
+
}
|
168
|
+
end
|
169
|
+
|
170
|
+
def each_record(aLambda = nil, &block)
|
171
|
+
# arity check
|
172
|
+
[aLambda, block].each do |proc|
|
173
|
+
# allow negative arity, meaning variable/optional, trust em on that.
|
174
|
+
# but for positive arity, we need 1 or 2 args
|
175
|
+
if proc && (proc.arity == 0 || proc.arity > 2)
|
176
|
+
raise ArgumentError.new("block/proc given to to_field needs 1 or 2 arguments: #{proc}")
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
@index_steps << {
|
181
|
+
:lambda => aLambda,
|
182
|
+
:block => block,
|
183
|
+
:type => :each_record,
|
184
|
+
:source_location => Traject::Util.extract_caller_location(caller.first)
|
97
185
|
}
|
98
186
|
end
|
99
187
|
|
100
|
-
|
101
|
-
#
|
102
|
-
#
|
188
|
+
|
189
|
+
# Processes a single record according to indexing rules set up in
|
190
|
+
# this indexer. Returns the output hash (a hash whose keys are
|
191
|
+
# string fields, and values are arrays of one or more values in that field)
|
103
192
|
#
|
193
|
+
# This is a convenience shortcut for #map_to_context! -- use that one
|
194
|
+
# if you want to provide additional context
|
195
|
+
# like position, and/or get back the full context.
|
104
196
|
def map_record(record)
|
105
197
|
context = Context.new(:source_record => record, :settings => settings)
|
198
|
+
map_to_context!(context)
|
199
|
+
return context.output_hash
|
200
|
+
end
|
106
201
|
|
202
|
+
# Maps a single record INTO the given argument, a Traject::Indexer::Context.
|
203
|
+
#
|
204
|
+
# Context must be passed with a #source_record and #settings, and optionally
|
205
|
+
# a #position.
|
206
|
+
#
|
207
|
+
# Context will be mutated by this method, most significantly by adding
|
208
|
+
# an #output_hash, a hash from fieldname to array of values in that field.
|
209
|
+
#
|
210
|
+
# Pass in a context with a set #position if you want that to be available
|
211
|
+
# to mapping routines.
|
212
|
+
#
|
213
|
+
# Returns the context passed in, as a convenience for chaining etc.
|
214
|
+
def map_to_context!(context)
|
107
215
|
@index_steps.each do |index_step|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
216
|
+
if index_step[:type] == :to_field
|
217
|
+
|
218
|
+
accumulator = []
|
219
|
+
context.field_name = index_step[:field_name]
|
220
|
+
|
221
|
+
# Might have a lambda arg AND a block, we execute in order,
|
222
|
+
# with same accumulator.
|
223
|
+
|
224
|
+
[index_step[:lambda], index_step[:block]].each do |aProc|
|
225
|
+
if aProc
|
226
|
+
log_mapping_errors(context, index_step, aProc) do
|
227
|
+
if aProc.arity == 2
|
228
|
+
aProc.call(context.source_record, accumulator)
|
229
|
+
else
|
230
|
+
aProc.call(context.source_record, accumulator, context)
|
231
|
+
end
|
232
|
+
end
|
120
233
|
end
|
121
234
|
end
|
235
|
+
(context.output_hash[context.field_name] ||= []).concat accumulator unless accumulator.empty?
|
236
|
+
context.field_name = nil
|
122
237
|
|
123
|
-
|
238
|
+
elsif index_step[:type] == :each_record
|
239
|
+
|
240
|
+
# one or two arg
|
241
|
+
[index_step[:lambda], index_step[:block]].each do |aProc|
|
242
|
+
if aProc
|
243
|
+
log_mapping_errors(context, index_step, aProc) do
|
244
|
+
if aProc.arity == 1
|
245
|
+
aProc.call(context.source_record)
|
246
|
+
else
|
247
|
+
aProc.call(context.source_record, context)
|
248
|
+
end
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
124
252
|
|
125
|
-
|
126
|
-
|
253
|
+
else
|
254
|
+
raise ArgumentError.new("An @index_step we don't know how to deal with: #{@index_step}")
|
255
|
+
end
|
127
256
|
end
|
128
257
|
|
129
|
-
return context
|
258
|
+
return context
|
259
|
+
end
|
260
|
+
|
261
|
+
# just a wrapper that captures and records any unexpected
|
262
|
+
# errors raised in mapping, along with contextual information
|
263
|
+
# on record and location in source file of mapping rule.
|
264
|
+
#
|
265
|
+
# Re-raises error at the moment.
|
266
|
+
#
|
267
|
+
# log_mapping_errors(context, index_step, some_lambda) do
|
268
|
+
# all_sorts_of_stuff # that will have errors logged
|
269
|
+
# end
|
270
|
+
def log_mapping_errors(context, index_step, aProc)
|
271
|
+
begin
|
272
|
+
yield
|
273
|
+
rescue Exception => e
|
274
|
+
msg = "Unexpected error on record id `#{id_string(context.source_record)}` at file position #{context.position}\n"
|
275
|
+
|
276
|
+
conf = context.field_name ? "to_field '#{context.field_name}'" : "each_record"
|
277
|
+
|
278
|
+
msg += " while executing #{conf} defined at #{index_step[:source_location]}\n"
|
279
|
+
msg += Traject::Util.exception_to_log_message(e)
|
280
|
+
|
281
|
+
logger.error msg
|
282
|
+
logger.debug "Record: " + context.source_record.to_s
|
283
|
+
|
284
|
+
raise e
|
285
|
+
end
|
130
286
|
end
|
131
287
|
|
132
288
|
# Processes a stream of records, reading from the configured Reader,
|
133
289
|
# mapping according to configured mapping rules, and then writing
|
134
290
|
# to configured Writer.
|
291
|
+
#
|
292
|
+
# returns 'false' as a signal to command line to return non-zero exit code
|
293
|
+
# for some reason (reason found in logs, presumably). This particular mechanism
|
294
|
+
# is open to complexification, starting simple. We do need SOME way to return
|
295
|
+
# non-zero to command line.
|
296
|
+
#
|
135
297
|
def process(io_stream)
|
298
|
+
settings.fill_in_defaults!
|
299
|
+
|
300
|
+
count = 0
|
301
|
+
start_time = batch_start_time = Time.now
|
302
|
+
logger.info "beginning Indexer#process with settings: #{settings.inspect}"
|
303
|
+
|
136
304
|
reader = self.reader!(io_stream)
|
137
305
|
writer = self.writer!
|
138
306
|
|
139
|
-
|
140
|
-
|
307
|
+
thread_pool = Traject::ThreadPool.new(settings["processing_thread_pool"].to_i)
|
308
|
+
|
309
|
+
logger.info " with reader: #{reader.class.name} and writer: #{writer.class.name}"
|
310
|
+
|
311
|
+
reader.each do |record; position|
|
312
|
+
count += 1
|
313
|
+
|
314
|
+
# have to use a block local var, so the changing `count` one
|
315
|
+
# doesn't get caught in the closure. Weird, yeah.
|
316
|
+
position = count
|
317
|
+
|
318
|
+
thread_pool.raise_collected_exception!
|
319
|
+
|
320
|
+
if settings["debug_ascii_progress"].to_s == "true"
|
321
|
+
$stderr.write "." if count % settings["solrj_writer.batch_size"] == 0
|
322
|
+
end
|
323
|
+
|
324
|
+
if settings["log.batch_progress"] && (count % settings["log.batch_progress"].to_i == 0)
|
325
|
+
batch_rps = settings["log.batch_progress"].to_i / (Time.now - batch_start_time)
|
326
|
+
overall_rps = count / (Time.now - start_time)
|
327
|
+
logger.info "Traject::Indexer#process, read #{count} records at id:#{id_string(record)}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall"
|
328
|
+
batch_start_time = Time.now
|
329
|
+
end
|
330
|
+
|
331
|
+
# we have to use this weird lambda to properly "capture" the count, instead
|
332
|
+
# of having it be bound to the original variable in a non-threadsafe way.
|
333
|
+
# This is confusing, I might not be understanding things properly, but that's where i am.
|
334
|
+
#thread_pool.maybe_in_thread_pool &make_lambda(count, record, writer)
|
335
|
+
thread_pool.maybe_in_thread_pool do
|
336
|
+
context = Context.new(:source_record => record, :settings => settings, :position => position)
|
337
|
+
map_to_context!(context)
|
338
|
+
writer.put context
|
339
|
+
end
|
340
|
+
|
141
341
|
end
|
342
|
+
$stderr.write "\n" if settings["debug_ascii_progress"].to_s == "true"
|
343
|
+
|
344
|
+
logger.debug "Shutting down #processing mapper threadpool..."
|
345
|
+
thread_pool.shutdown_and_wait
|
346
|
+
logger.debug "#processing mapper threadpool shutdown complete."
|
347
|
+
|
348
|
+
thread_pool.raise_collected_exception!
|
349
|
+
|
350
|
+
|
142
351
|
writer.close if writer.respond_to?(:close)
|
352
|
+
|
353
|
+
elapsed = Time.now - start_time
|
354
|
+
avg_rps = (count / elapsed)
|
355
|
+
logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."
|
356
|
+
|
357
|
+
if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
|
358
|
+
logger.error "Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
|
359
|
+
return false
|
360
|
+
end
|
361
|
+
|
362
|
+
return true
|
143
363
|
end
|
144
364
|
|
145
365
|
def reader_class
|
@@ -159,33 +379,21 @@ class Traject::Indexer
|
|
159
379
|
# Instantiate a Traject Reader, using class set
|
160
380
|
# in #reader_class, initialized with io_stream passed in
|
161
381
|
def reader!(io_stream)
|
162
|
-
return reader_class.new(io_stream, settings)
|
382
|
+
return reader_class.new(io_stream, settings.merge("logger" => logger))
|
163
383
|
end
|
164
384
|
|
165
385
|
# Instantiate a Traject Writer, using class set in #writer_class
|
166
386
|
def writer!
|
167
|
-
return writer_class.new(settings)
|
387
|
+
return writer_class.new(settings.merge("logger" => logger))
|
168
388
|
end
|
169
389
|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
}
|
390
|
+
# get a printable id from record for error logging.
|
391
|
+
# Maybe override this for a future XML version.
|
392
|
+
def id_string(record)
|
393
|
+
record && record['001'] && record['001'].value.to_s
|
175
394
|
end
|
176
395
|
|
177
396
|
|
178
|
-
|
179
|
-
# Enhanced with a few features from Hashie, to make it for
|
180
|
-
# instance string/symbol indifferent
|
181
|
-
class Settings < Hash
|
182
|
-
include Hashie::Extensions::MergeInitializer # can init with hash
|
183
|
-
include Hashie::Extensions::IndifferentAccess
|
184
|
-
|
185
|
-
# Hashie bug Issue #100 https://github.com/intridea/hashie/pull/100
|
186
|
-
alias_method :store, :indifferent_writer
|
187
|
-
end
|
188
|
-
|
189
397
|
# Represents the context of a specific record being indexed, passed
|
190
398
|
# to indexing logic blocks
|
191
399
|
#
|
@@ -203,5 +411,7 @@ class Traject::Indexer
|
|
203
411
|
|
204
412
|
attr_accessor :clipboard, :output_hash
|
205
413
|
attr_accessor :field_name, :source_record, :settings
|
414
|
+
# 1-based position in stream of processed records.
|
415
|
+
attr_accessor :position
|
206
416
|
end
|
207
417
|
end
|
data/lib/traject/json_writer.rb
CHANGED
@@ -20,7 +20,9 @@ class Traject::JsonWriter
|
|
20
20
|
@settings = argSettings
|
21
21
|
end
|
22
22
|
|
23
|
-
def put(
|
23
|
+
def put(context)
|
24
|
+
hash = context.output_hash
|
25
|
+
|
24
26
|
serialized =
|
25
27
|
if settings["json_writer.pretty_print"]
|
26
28
|
JSON.pretty_generate(hash)
|
@@ -34,7 +36,7 @@ class Traject::JsonWriter
|
|
34
36
|
unless defined? @output_file
|
35
37
|
@output_file =
|
36
38
|
if settings["output_file"]
|
37
|
-
File.open(settings["output_file"])
|
39
|
+
File.open(settings["output_file"], 'w:UTF-8')
|
38
40
|
elsif settings["output_stream"]
|
39
41
|
settings["output_stream"]
|
40
42
|
else
|
@@ -18,6 +18,13 @@ module Traject::Macros
|
|
18
18
|
# Second arg is optional options, including options valid on MarcExtractor.new,
|
19
19
|
# and others. (TODO)
|
20
20
|
#
|
21
|
+
# * :first => true: take only first value
|
22
|
+
# * :translation_map => String: translate with named translation map looked up in load
|
23
|
+
# path, uses Traject::TranslationMap.new(translation_map_arg)
|
24
|
+
# * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
|
25
|
+
# have shown themselves useful with Marc, using Marc21.trim_punctuation
|
26
|
+
# * :default => String: if otherwise empty, add default value
|
27
|
+
#
|
21
28
|
# Examples:
|
22
29
|
#
|
23
30
|
# to_field("title"), extract_marc("245abcd", :trim_punctuation => true)
|
@@ -26,11 +33,12 @@ module Traject::Macros
|
|
26
33
|
def extract_marc(spec, options = {})
|
27
34
|
only_first = options.delete(:first)
|
28
35
|
trim_punctuation = options.delete(:trim_punctuation)
|
36
|
+
default_value = options.delete(:default)
|
29
37
|
|
30
38
|
# We create the TranslationMap here on load, not inside the closure
|
31
39
|
# where it'll be called for every record. Since TranslationMap is supposed
|
32
40
|
# to cache, prob doesn't matter, but doesn't hurt. Also causes any syntax
|
33
|
-
# exceptions to raise on load.
|
41
|
+
# exceptions to raise on load.
|
34
42
|
if translation_map_arg = options.delete(:translation_map)
|
35
43
|
translation_map = Traject::TranslationMap.new(translation_map_arg)
|
36
44
|
end
|
@@ -49,6 +57,10 @@ module Traject::Macros
|
|
49
57
|
if trim_punctuation
|
50
58
|
accumulator.collect! {|s| Marc21.trim_punctuation(s)}
|
51
59
|
end
|
60
|
+
|
61
|
+
if default_value && accumulator.empty?
|
62
|
+
accumulator << default_value
|
63
|
+
end
|
52
64
|
end
|
53
65
|
end
|
54
66
|
|
@@ -97,7 +109,7 @@ module Traject::Macros
|
|
97
109
|
# All fields in from-to must be marc DATA (not control fields), or weirdness
|
98
110
|
#
|
99
111
|
# Can always run this thing multiple times on the same field if you need
|
100
|
-
# non-contiguous ranges of fields.
|
112
|
+
# non-contiguous ranges of fields.
|
101
113
|
def extract_all_marc_values(options = {})
|
102
114
|
options = {:from => "100", :to => "899", :seperator => ' '}.merge(options)
|
103
115
|
|
@@ -123,15 +135,15 @@ module Traject::Macros
|
|
123
135
|
# pretty simple.
|
124
136
|
#
|
125
137
|
# Removes
|
126
|
-
# * trailing: comma, slash, semicolon, colon (possibly followed by whitespace)
|
127
|
-
# * trailing period if it is preceded by at least three letters (possibly followed by whitespace)
|
138
|
+
# * trailing: comma, slash, semicolon, colon (possibly preceded and followed by whitespace)
|
139
|
+
# * trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
|
128
140
|
# * single square bracket characters if they are the start and/or end
|
129
141
|
# chars and there are no internal square brackets.
|
130
142
|
#
|
131
143
|
# Returns altered string, doesn't change original arg.
|
132
144
|
def self.trim_punctuation(str)
|
133
|
-
str = str.sub(/[ ,\/;:] *\Z/, '')
|
134
|
-
str = str.sub(/(\w\w\w)\. *\Z/, '\1')
|
145
|
+
str = str.sub(/ *[ ,\/;:] *\Z/, '')
|
146
|
+
str = str.sub(/ *(\w\w\w)\. *\Z/, '\1')
|
135
147
|
str = str.sub(/\A\[?([^\[\]]+)\]?\Z/, '\1')
|
136
148
|
return str
|
137
149
|
end
|