traject 2.3.2 → 2.3.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1536be14599f2f0777b79a6bc27717ad0350223f
4
- data.tar.gz: 8cc6327ca07889c69526f3a19b4e3b91b5512c65
3
+ metadata.gz: 2507bfdf51675c233b64ebbecb3247aaf53281ec
4
+ data.tar.gz: 0fd4edc089aa6cc09e477d4e7e8538f830d20f1c
5
5
  SHA512:
6
- metadata.gz: 05126d1932a31c7fb97f571619139c287b71afe4f3638ec7e72e73518df8c42f765cdaed7e646b64f4c13ad724b95d8f99e1dc61243b07aac0d2ab7d38bf9241
7
- data.tar.gz: 2035e5bc42067a3c0ac598f894ac59c1309244d47afceaaa7b66a7dd4bfd034e8395c1e71b90d2f6d5b5d68efa78ea98c0bfc0681dff23b39276fdb7bed6b5b2
6
+ metadata.gz: 1091dbd01a7f2adf5ac7a8e4d09f9aec6c67b162379280442f62677e32dc82ec3d575fee3dc131983e5a23e4680cd92cbe4c739c2daf0f25dbf4db83febf7e95
7
+ data.tar.gz: 7220b2acb51c9ccb6cbbf8cb7caef8c478beaf16f115cbee7b8f719b4de91261926d7a6f6d059d84d0d4abced778148199864fd754c6abc3c7a54ebe85950d1a
@@ -6,8 +6,8 @@ rvm:
6
6
  - jruby-9.0.4.0
7
7
  - 1.9
8
8
  - 2.2
9
- - 2.3.0
10
- - rbx-2
9
+ - 2.3.3
10
+ - 2.4.0
11
11
  before_install:
12
12
  - gem update --system
13
13
  - gem uninstall bundler
data/CHANGES.md CHANGED
@@ -1,12 +1,22 @@
1
1
  # Changes
2
2
 
3
+ ## 2.3.3
4
+ * Further squash use of capture variables ('$1', etc.)
5
+ to try to work around the non-thread-safety of
6
+ regexp in ruby
7
+ * Fix a bug in trim_punctuation where trailing
8
+ periods were being eliminated even if there
9
+ was a short string before them (e.g., 'Jr.')
10
+ * Begin to reorganize tests, starting with
11
+ the Marc21 macros
12
+
3
13
  ## 2.3.2
4
14
  * Change to `extract_marc` to work around a threadsafe problem in JRuby/MRI where
5
15
  regexps were unsafely shared between threads. (@codeforkjeff)
6
16
  * Make trim-punctuation safe for non-just-ASCII text (thanks to @dunn and @redlibrarian)
7
17
 
8
18
  ## 2.3.1
9
- * Update README with more info aout new nil-related options
19
+ * Update README with more info about new nil-related options
10
20
 
11
21
  ## 2.3.0
12
22
  * Allow nil values, empty fields, and deduplication
@@ -21,7 +31,7 @@
21
31
  Set to `true` to pass empty fields on to the writer (with the value being an empty array)
22
32
 
23
33
  ## 2.2.1
24
- * Had inadverntantly broken use of arrays as extract_marc specifications. Fixed.
34
+ * Had inadvertently broken use of arrays as extract_marc specifications. Fixed.
25
35
 
26
36
  ## 2.2.0
27
37
  * Change DebugWriter to be more forgiving (and informative) about missing record-id fields
@@ -131,6 +131,6 @@ to_field "edition", extract_marc('250a')
131
131
 
132
132
  to_field 'language', marc_languages("008[35-37]:041a:041d:041e:041j")
133
133
  to_field 'language008', extract_marc('008[35-37]') do |r, acc|
134
- acc.reject! {|x| x !~ /\S/} # ditch only spaces
134
+ acc.reject! {|x| !(/\S/.match(x))} # ditch all-spaces values
135
135
  acc.uniq!
136
136
  end
@@ -202,8 +202,8 @@ module Traject
202
202
 
203
203
  # `-s key=value` command line
204
204
  (options[:setting] || []).each do |setting_pair|
205
- if setting_pair =~ /\A([^=]+)\=(.*)\Z/
206
- key, value = $1, $2
205
+ if m = /\A([^=]+)\=(.*)\Z/.match(setting_pair)
206
+ key, value = m[1], m[2]
207
207
  settings[key] = value
208
208
  else
209
209
  self.console.puts "Unrecognized setting argument '#{setting_pair}':"
@@ -103,7 +103,11 @@ class Traject::Indexer
103
103
  def inspect
104
104
  # Keep any key ending in password out of the inspect
105
105
  self.inject({}) do |hash, (key, value)|
106
- hash[key] = (key =~ /password\Z/) ? "[hidden]" : value
106
+ if /password\Z/.match(key)
107
+ hash[key] = "[hidden]"
108
+ else
109
+ hash[key] = value
110
+ end
107
111
  hash
108
112
  end.inspect
109
113
  end
@@ -233,7 +233,7 @@ module Traject::Macros
233
233
  str = str.sub(/ *[ ,\/;:] *\Z/, '')
234
234
 
235
235
  # trailing period if it is preceded by at least three letters (possibly preceded and followed by whitespace)
236
- str = str.sub(/( *[[:word:][:word:][:word:]])\. *\Z/, '\1')
236
+ str = str.sub(/( *[[:word:]]{3,})\. *\Z/, '\1')
237
237
 
238
238
  # single square bracket characters if they are the start and/or end
239
239
  # chars and there are no internal square brackets.
@@ -40,8 +40,8 @@ module Traject::Macros
40
40
  /x
41
41
 
42
42
  def self.oclcnum_extract(num)
43
- if OCLCPAT.match(num)
44
- return $1
43
+ if m = OCLCPAT.match(num)
44
+ return m[1]
45
45
  else
46
46
  return nil
47
47
  end
@@ -369,8 +369,8 @@ module Traject::Macros
369
369
  v260c = MarcExtractor.cached("260c", :separator => nil).extract(record).first
370
370
  # just try to take the first four digits out of there, we're not going to try
371
371
  # anything crazy.
372
- if v260c =~ /(\d{4})/
373
- found_date = $1.to_i
372
+ if m = /(\d{4})/.match(v260c)
373
+ found_date = m[1].to_i
374
374
  end
375
375
  end
376
376
 
@@ -408,7 +408,7 @@ module Traject::Macros
408
408
  candidates = extractor.extract(record)
409
409
 
410
410
  candidates.reject! do |candidate|
411
- !(candidate =~ lcc_regex)
411
+ !(lcc_regex.match candidate)
412
412
  end
413
413
 
414
414
  accumulator.concat translation_map.translate_array!(candidates.collect {|a| a.lstrip.slice(0, 1)}).uniq
@@ -501,10 +501,11 @@ module Traject::Macros
501
501
  end)
502
502
 
503
503
  # weird ones
504
+ special_fields_regex = /\A\s*.+,\s+(ca.\s+)?\d\d\d\d?(-\d\d\d\d?)?( B\.C\.)?[.,; ]*\Z/
504
505
  extractor_special_fields.each_matching_line(record) do |field, spec, extractor|
505
506
  field.subfields.each do |sf|
506
507
  next unless sf.code == 'y'
507
- if sf.value =~ /\A\s*.+,\s+(ca.\s+)?\d\d\d\d?(-\d\d\d\d?)?( B\.C\.)?[.,; ]*\Z/
508
+ if special_fields_regex.match(sf.value)
508
509
  # it's our pattern, add the $a in please
509
510
  accumulator << "#{field['a']}#{separator}#{sf.value.sub(/\. *\Z/, '')}"
510
511
  else
@@ -562,7 +563,7 @@ module Traject::Macros
562
563
 
563
564
  marc_field.subfields.each_with_index do |sf, i|
564
565
  # ignore non-alphabetic, like numeric control subfields
565
- next unless sf.code =~ /\A[a-z]\Z/
566
+ next unless /\A[a-z]\Z/.match(sf.code)
566
567
 
567
568
  prefix = if subd_prefix_codes.include? sf.code
568
569
  subd_separator
@@ -105,7 +105,8 @@ module Traject
105
105
  def proceeding?
106
106
  @proceeding_q ||= begin
107
107
  ! record.find do |field|
108
- field.tag.slice(0) == '6' && field.subfields.find {|sf| sf.code == "v" && sf.value =~ /^\s*(C|c)ongresses\.?\s*$/}
108
+ field.tag.slice(0) == '6' &&
109
+ field.subfields.find {|sf| sf.code == "v" && /^\s*(C|c)ongresses\.?\s*$/.match(sf.value) }
109
110
  end.nil?
110
111
  end
111
112
  end
@@ -40,15 +40,16 @@ module Traject
40
40
 
41
41
  this_file_iter = file_io.each_line
42
42
 
43
+
43
44
  while true
44
45
  line = this_file_iter.next
45
- break if line =~ /^\_\_END\_\_/
46
+ break if /^\_\_END\_\_/.match line
46
47
  end
47
48
 
48
49
  begin
49
50
  while true
50
51
  json = this_file_iter.next
51
- next unless json =~ /\S/
52
+ next unless /\S/.match json
52
53
  records << MARC::Record.new_from_hash(JSON.parse(json))
53
54
  end
54
55
  rescue StopIteration
@@ -12,7 +12,7 @@ class Traject::NDJReader
12
12
  def initialize(input_stream, settings)
13
13
  @settings = settings
14
14
  @input_stream = input_stream
15
- if @settings['command_line.filename'] =~ /\.gz$/
15
+ if /\.gz\Z/.match(@settings['command_line.filename'])
16
16
  @input_stream = Zlib::GzipReader.new(@input_stream, :external_encoding => "UTF-8")
17
17
  end
18
18
  end
@@ -236,7 +236,7 @@ class Traject::SolrJsonWriter
236
236
 
237
237
  # If we've got a solr.update_url, make sure it's ok
238
238
  def check_solr_update_url(url)
239
- unless url =~ /^#{URI::regexp}$/
239
+ unless /^#{URI::regexp}$/.match(url)
240
240
  raise ArgumentError.new("#{self.class.name} setting `solr.update_url` doesn't look like a URL: `#{url}`")
241
241
  end
242
242
  url
@@ -249,7 +249,7 @@ class Traject::SolrJsonWriter
249
249
  end
250
250
 
251
251
  # Not a URL? Bail
252
- unless url =~ /^#{URI::regexp}$/
252
+ unless /^#{URI::regexp}$/.match(url)
253
253
  raise ArgumentError.new("#{self.class.name} setting `solr.url` doesn't look like a URL: `#{url}`")
254
254
  end
255
255
 
@@ -5,14 +5,14 @@ module Traject
5
5
  def self.exception_to_log_message(e)
6
6
  indent = " "
7
7
 
8
- msg = indent + "Exception: " + e.class.name + ": " + e.message + "\n"
8
+ msg = indent + "Exception: " + e.class.name + ": " + e.message + "\n"
9
9
  msg += indent + e.backtrace.first + "\n"
10
10
 
11
- if (e.respond_to?(:getRootCause) && e.getRootCause && e != e.getRootCause )
11
+ if (e.respond_to?(:getRootCause) && e.getRootCause && e != e.getRootCause)
12
12
  caused_by = e.getRootCause
13
- msg += indent + "Caused by\n"
14
- msg += indent + caused_by.class.name + ": " + caused_by.message + "\n"
15
- msg += indent + caused_by.backtrace.first + "\n"
13
+ msg += indent + "Caused by\n"
14
+ msg += indent + caused_by.class.name + ": " + caused_by.message + "\n"
15
+ msg += indent + caused_by.backtrace.first + "\n"
16
16
  end
17
17
 
18
18
  return msg
@@ -37,8 +37,8 @@ module Traject
37
37
  # For a SyntaxError, we really need to grep it from the
38
38
  # exception message, it really appears to be nowhere else. Ugh.
39
39
  if exception.kind_of? SyntaxError
40
- if exception.message =~ /:(\d+):/
41
- return $1.to_i
40
+ if m = /:(\d+):/.match(exception.message)
41
+ return m[1].to_i
42
42
  end
43
43
  end
44
44
 
@@ -48,9 +48,9 @@ module Traject
48
48
  # exception.backtrace_locations exists in MRI 2.1+, which makes
49
49
  # our task a lot easier. But not yet in JRuby 1.7.x, so we got to
50
50
  # handle the old way of having to parse the strings in backtrace too.
51
- if ( exception.respond_to?(:backtrace_locations) &&
52
- exception.backtrace_locations &&
53
- exception.backtrace_locations.length > 0 )
51
+ if (exception.respond_to?(:backtrace_locations) &&
52
+ exception.backtrace_locations &&
53
+ exception.backtrace_locations.length > 0)
54
54
  location = exception.backtrace_locations.find do |bt|
55
55
  bt.path == file_path
56
56
  end
@@ -58,8 +58,10 @@ module Traject
58
58
  else # have to parse string backtrace
59
59
  exception.backtrace.each do |line|
60
60
  if line.start_with?(file_path)
61
- return $1.to_i if line =~ /\A.*\:(\d+)\:in/
62
- break
61
+ if m = /\A.*\:(\d+)\:in/.match(line)
62
+ return m[1].to_i
63
+ break
64
+ end
63
65
  end
64
66
  end
65
67
  # if we got here, we have nothing
@@ -75,14 +77,14 @@ module Traject
75
77
  # returned array will actually be of Thread::Backtrace::Location elements.
76
78
  def self.backtrace_from_config(file_path, exception)
77
79
  filtered_trace = []
78
- found = false
80
+ found = false
79
81
 
80
82
  # MRI 2.1+ has exception.backtrace_locations which makes
81
83
  # this a lot easier, but JRuby 1.7.x doesn't yet, so we
82
84
  # need to do it both ways.
83
- if ( exception.respond_to?(:backtrace_locations) &&
84
- exception.backtrace_locations &&
85
- exception.backtrace_locations.length > 0 )
85
+ if (exception.respond_to?(:backtrace_locations) &&
86
+ exception.backtrace_locations &&
87
+ exception.backtrace_locations.length > 0)
86
88
 
87
89
  exception.backtrace_locations.each do |location|
88
90
  filtered_trace << location
@@ -100,7 +102,6 @@ module Traject
100
102
  end
101
103
 
102
104
 
103
-
104
105
  # Ruby stdlib queue lacks a 'drain' function, we write one.
105
106
  #
106
107
  # Removes everything currently in the ruby stdlib queue, and returns
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "2.3.2"
2
+ VERSION = "2.3.3"
3
3
  end
@@ -169,7 +169,7 @@ describe "Traject::Macros::Marc21Semantics" do
169
169
  # there are way too many edge cases for us to test em all, but we'll test some of em.
170
170
 
171
171
  it "works when there's no date information" do
172
- assert_equal nil, Marc21Semantics.publication_date(empty_record)
172
+ assert_nil Marc21Semantics.publication_date(empty_record)
173
173
  end
174
174
 
175
175
  it "uses macro correctly with no date info" do
@@ -189,7 +189,7 @@ describe "Traject::Macros::Marc21Semantics" do
189
189
  end
190
190
  it "returns nil when the records really got nothing" do
191
191
  @record = MARC::Reader.new(support_file_path "emptyish_record.marc").to_a.first
192
- assert_equal nil, Marc21Semantics.publication_date(@record)
192
+ assert_nil Marc21Semantics.publication_date(@record)
193
193
  end
194
194
  it "estimates with a single 'u'" do
195
195
  @record = MARC::Reader.new(support_file_path "date_with_u.marc").to_a.first
@@ -1,11 +1,20 @@
1
- # Encoding: UTF-8
2
-
3
1
  require 'test_helper'
2
+
3
+ require 'traject/indexer'
4
4
  require 'traject/macros/marc21'
5
5
 
6
+ require 'json'
7
+ require 'marc'
8
+
6
9
  include Traject::Macros::Marc21
7
10
 
11
+
8
12
  describe "The extract_all_marc_values macro" do
13
+ before do
14
+ @indexer = Traject::Indexer.new
15
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
16
+ end
17
+
9
18
 
10
19
  it "is fine with no arguments" do
11
20
  assert(extract_all_marc_values)
@@ -20,4 +29,22 @@ describe "The extract_all_marc_values macro" do
20
29
  extract_all_marc_values(from: 100, to: '999')
21
30
  end
22
31
  end
32
+
33
+ it "#extract_all_marc_values" do
34
+ @indexer.instance_eval do
35
+ to_field "text", extract_all_marc_values
36
+ end
37
+ output = @indexer.map_record(@record)
38
+
39
+ assert_length 13, output["text"]
40
+ end
41
+
42
+
23
43
  end
44
+
45
+
46
+
47
+
48
+
49
+
50
+
@@ -0,0 +1,125 @@
1
+ require 'test_helper'
2
+
3
+ require 'traject/indexer'
4
+ require 'traject/macros/marc21'
5
+
6
+ require 'json'
7
+ require 'marc'
8
+
9
+
10
+ include Traject::Macros::Marc21
11
+
12
+ describe "extract_marc" do
13
+ before do
14
+ @indexer = Traject::Indexer.new
15
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
16
+ end
17
+
18
+
19
+ it "extracts marc" do
20
+ @indexer.instance_eval do
21
+ to_field "title", extract_marc("245ab")
22
+ end
23
+
24
+ output = @indexer.map_record(@record)
25
+
26
+ assert_equal ["Manufacturing consent : the political economy of the mass media /"], output["title"]
27
+ assert_equal({}, @indexer.map_record(empty_record))
28
+
29
+ end
30
+
31
+ it "respects :first=>true option" do
32
+ @indexer.instance_eval do
33
+ to_field "other_id", extract_marc("035a", :first => true)
34
+ end
35
+
36
+ output = @indexer.map_record(@record)
37
+
38
+ assert_length 1, output["other_id"]
39
+
40
+ end
41
+
42
+ it "trims punctuation with :trim_punctuation => true" do
43
+ @indexer.instance_eval do
44
+ to_field "title", extract_marc("245ab", :trim_punctuation => true)
45
+ end
46
+
47
+ output = @indexer.map_record(@record)
48
+
49
+ assert_equal ["Manufacturing consent : the political economy of the mass media"], output["title"]
50
+ assert_equal({}, @indexer.map_record(empty_record))
51
+
52
+ end
53
+
54
+ it "respects :default option" do
55
+ @indexer.instance_eval do
56
+ to_field "only_default", extract_marc("9999", :default => "DEFAULT VALUE")
57
+ end
58
+ output = @indexer.map_record(@record)
59
+
60
+ assert_equal ["DEFAULT VALUE"], output["only_default"]
61
+ end
62
+
63
+ it "de-duplicates by default, respects :allow_duplicates" do
64
+ # Add a second 008
65
+ f = @record.fields('008').first
66
+ @record.append(f)
67
+
68
+ @indexer.instance_eval do
69
+ to_field "lang1", extract_marc('008[35-37]')
70
+ to_field "lang2", extract_marc('008[35-37]', :allow_duplicates => true)
71
+ end
72
+
73
+ output = @indexer.map_record(@record)
74
+ assert_equal ["eng"], output['lang1']
75
+ assert_equal ["eng", "eng"], output['lang2']
76
+ assert_equal({}, @indexer.map_record(empty_record))
77
+ end
78
+
79
+ it "fails on an extra/misspelled argument to extract_marc" do
80
+ assert_raises(RuntimeError) do
81
+ @indexer.instance_eval do
82
+ to_field "foo", extract_marc("9999", :misspelled => "Who cares")
83
+ end
84
+ end
85
+ end
86
+
87
+
88
+ it "throws away nil values unless settings['allow_nil_values]'" do
89
+ @indexer.instance_eval do
90
+ to_field 'default_nil', extract_marc('9999', :default => nil)
91
+ end
92
+ output = @indexer.map_record(@record)
93
+ assert_nil output['default_nil']
94
+ end
95
+
96
+
97
+ it "allows nil values if settings['allow_nil_values]'" do
98
+ @indexer.settings do |s|
99
+ s['allow_nil_values'] = true
100
+ end
101
+ @indexer.instance_eval do
102
+ to_field 'default_nil', extract_marc('9999', :default => nil)
103
+ end
104
+ output = @indexer.map_record(@record)
105
+ assert_equal [nil], output['default_nil']
106
+ end
107
+
108
+
109
+
110
+
111
+ it "uses :translation_map" do
112
+ @indexer.instance_eval do
113
+ to_field "cataloging_agency", extract_marc("040a", :separator => nil, :translation_map => "marc_040a_translate_test")
114
+ end
115
+ output = @indexer.map_record(@record)
116
+
117
+ assert_equal ["Library of Congress"], output["cataloging_agency"]
118
+ end
119
+
120
+ it "supports #extract_marc_from module method" do
121
+ output_arr = ::Traject::Macros::Marc21.extract_marc_from(@record, "245ab", :trim_punctuation => true)
122
+ assert_equal ["Manufacturing consent : the political economy of the mass media"], output_arr
123
+ end
124
+
125
+ end
@@ -0,0 +1,73 @@
1
+ require 'test_helper'
2
+
3
+ require 'traject/indexer'
4
+ require 'traject/macros/marc21'
5
+ require 'json'
6
+ require 'marc'
7
+
8
+ include Traject::Macros::Marc21
9
+
10
+
11
+ describe "serialized_marc" do
12
+ before do
13
+ @indexer = Traject::Indexer.new
14
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
15
+ end
16
+
17
+ it "serializes xml" do
18
+ @indexer.instance_eval do
19
+ to_field "marc_record", serialized_marc(:format => "xml")
20
+ end
21
+ output = @indexer.map_record(@record)
22
+
23
+ assert_length 1, output["marc_record"]
24
+ assert_kind_of String, output["marc_record"].first
25
+ roundtrip_record = MARC::XMLReader.new(StringIO.new(output["marc_record"].first)).first
26
+ assert_equal @record, roundtrip_record
27
+ end
28
+
29
+ it "serializes binary UUEncoded" do
30
+ @indexer.instance_eval do
31
+ to_field "marc_record", serialized_marc(:format => "binary")
32
+ end
33
+ output = @indexer.map_record(@record)
34
+
35
+ assert_length 1, output["marc_record"]
36
+ assert_kind_of String, output["marc_record"].first
37
+
38
+ decoded = Base64.decode64(output["marc_record"].first)
39
+
40
+ # just check the marc header for now
41
+ assert_start_with "02067cam a2200469", decoded
42
+ end
43
+
44
+ it "serializes binary raw" do
45
+ @indexer.instance_eval do
46
+ to_field "marc_record", serialized_marc(:format => "binary", :binary_escape => false)
47
+ end
48
+ output = @indexer.map_record(@record)
49
+
50
+ assert_length 1, output["marc_record"]
51
+ assert_kind_of String, output["marc_record"].first
52
+
53
+ # just check the marc header for now
54
+ assert_start_with "02067cam a2200469", output["marc_record"].first
55
+ end
56
+
57
+ it "serializes json" do
58
+ @indexer.instance_eval do
59
+ to_field "marc_record", serialized_marc(:format => "json")
60
+ end
61
+ output = @indexer.map_record(@record)
62
+
63
+ assert_length 1, output["marc_record"]
64
+
65
+ # okay, let's actually deserialize it, why not
66
+
67
+ hash = JSON.parse(output["marc_record"].first)
68
+
69
+ deserialized = MARC::Record.new_from_hash(hash)
70
+
71
+ assert_equal @record, deserialized
72
+ end
73
+ end
@@ -0,0 +1,39 @@
1
+ # encoding: UTF-8
2
+ require 'test_helper'
3
+
4
+ require 'traject/indexer'
5
+ require 'traject/macros/marc21'
6
+
7
+
8
+ include Traject::Macros::Marc21
9
+
10
+ describe "trim_punctuation" do
11
+
12
+ # TODO: test coverage for trim_punctuation
13
+ # trim_punctuation isn't super-complicated code, and yet we've found a few bugs
14
+ # in it already. Needs more test coverage.
15
+ it "Works as expected" do
16
+ assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three")
17
+
18
+ assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three,")
19
+ assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three/")
20
+ assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three;")
21
+ assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three:")
22
+ assert_equal "one two three .", Traject::Macros::Marc21.trim_punctuation("one two three .")
23
+ assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three.")
24
+ assert_equal "one two three...", Traject::Macros::Marc21.trim_punctuation("one two three...")
25
+ assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation(" one two three.")
26
+
27
+ assert_equal "one two [three]", Traject::Macros::Marc21.trim_punctuation("one two [three]")
28
+ assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("one two three]")
29
+ assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("[one two three")
30
+ assert_equal "one two three", Traject::Macros::Marc21.trim_punctuation("[one two three]")
31
+
32
+ # This one was a bug before
33
+ assert_equal "Feminism and art", Traject::Macros::Marc21.trim_punctuation("Feminism and art.")
34
+ assert_equal "Le réve", Traject::Macros::Marc21.trim_punctuation("Le réve.")
35
+
36
+ # This one was a bug on the bug
37
+ assert_equal "Bill Dueber, Jr.", Traject::Macros::Marc21.trim_punctuation("Bill Dueber, Jr.")
38
+ end
39
+ end
@@ -1,6 +1,6 @@
1
1
  require 'test_helper'
2
2
 
3
- describe "Indexer Macros:" do
3
+ describe "Indexer Macros#to_field" do
4
4
  before do
5
5
  @indexer = Traject::Indexer.new
6
6
  @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
@@ -192,7 +192,7 @@ describe "Traject::Indexer#map_record" do
192
192
  end
193
193
 
194
194
  @indexer.to_field('radical') do |rec, acc, context|
195
- context.skip!("Chomsky!") if rec['245'].to_s =~ /Chomsky/
195
+ context.skip!("Chomsky!") if rec['245'].to_s =~ /Chomsky/
196
196
  end
197
197
 
198
198
  @indexer.to_field('afterSkip') do |rec, acc|
@@ -58,7 +58,7 @@ describe "Traject::Indexer.to_field" do
58
58
  acc = ['hello']
59
59
  end
60
60
  output = @indexer.map_record('never looked at')
61
- assert_equal nil, output['foo']
61
+ assert_nil output['foo']
62
62
  end
63
63
 
64
64
  it "allows use of accumulator.replace" do
@@ -2,21 +2,28 @@ require 'test_helper'
2
2
  require 'traject/yaml_writer'
3
3
 
4
4
  describe "The writer on Traject::Indexer" do
5
- let(:indexer) { Traject::Indexer.new("solr.url" => "http://example.com") }
6
-
7
- it "has a default" do
8
- assert_instance_of Traject::SolrJsonWriter, indexer.writer
9
- assert_equal Traject::SolrJsonWriter, indexer.writer_class
5
+ let(:indexer) { Traject::Indexer.new("solr.url" => "http://localhost.com") }
6
+
7
+ # TODO: fix default writer test
8
+ # Fails in the absence of a configured
9
+ # network interface.
10
+ describe "default writer from index" do
11
+ it "has a default" do
12
+ # assert_instance_of Traject::SolrJsonWriter, indexer.writer
13
+ # assert_equal Traject::SolrJsonWriter, indexer.writer_class
14
+ skip "Fails in the absence of a configured network interface."
15
+ end
10
16
  end
11
17
 
12
- describe "when the writer is set in config" do
18
+
19
+ describe "when the writer is set in config" do
13
20
  let(:writer) { Traject::YamlWriter.new({}) }
14
21
 
15
22
  let(:indexer) { Traject::Indexer.new(
16
- "solr.url" => "http://example.com",
17
- "writer_class" => 'Traject::SolrJsonWriter',
18
- "writer" => writer
19
- )}
23
+ "solr.url" => "http://example.com",
24
+ "writer_class" => 'Traject::SolrJsonWriter',
25
+ "writer" => writer
26
+ ) }
20
27
 
21
28
  it "uses writer from config" do
22
29
  assert_equal writer, indexer.writer
@@ -120,7 +120,9 @@ to_field "discipline_facet", marc_lcc_to_broad_category(:default => nil) do |re
120
120
  if call_type == "sudoc"
121
121
  # we choose to call it:
122
122
  accumulator << "Government Publication"
123
- elsif call_type.nil? || call_type == "lc" || field['a'] =~ Traject::Macros::Marc21Semantics::LCC_REGEX
123
+ elsif call_type.nil? ||
124
+ call_type == "lc" ||
125
+ Traject::Macros::Marc21Semantics::LCC_REGEX.match(field['a'])
124
126
  # run it through the map
125
127
  s = field['a']
126
128
  s = s.slice(0, 1) if s
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.2
4
+ version: 2.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Rochkind
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-11-03 00:00:00.000000000 Z
12
+ date: 2017-01-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: concurrent-ruby
@@ -254,15 +254,17 @@ files:
254
254
  - test/indexer/context_test.rb
255
255
  - test/indexer/each_record_test.rb
256
256
  - test/indexer/load_config_file_test.rb
257
- - test/indexer/macros_marc21_semantics_test.rb
258
- - test/indexer/macros_marc21_test.rb
259
- - test/indexer/macros_test.rb
257
+ - test/indexer/macros/macros_marc21_semantics_test.rb
258
+ - test/indexer/macros/marc21/extract_all_marc_values_test.rb
259
+ - test/indexer/macros/marc21/extract_marc_test.rb
260
+ - test/indexer/macros/marc21/serialize_marc_test.rb
261
+ - test/indexer/macros/marc21/trim_punctuation_test.rb
262
+ - test/indexer/macros/to_field_test.rb
260
263
  - test/indexer/map_record_test.rb
261
264
  - test/indexer/read_write_test.rb
262
265
  - test/indexer/settings_test.rb
263
266
  - test/indexer/to_field_test.rb
264
267
  - test/indexer/writer_test.rb
265
- - test/marc21_macros_test.rb
266
268
  - test/marc_extractor_test.rb
267
269
  - test/marc_format_classifier_test.rb
268
270
  - test/marc_reader_test.rb
@@ -329,7 +331,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
329
331
  version: '0'
330
332
  requirements: []
331
333
  rubyforge_project:
332
- rubygems_version: 2.5.1
334
+ rubygems_version: 2.6.8
333
335
  signing_key:
334
336
  specification_version: 4
335
337
  summary: Index MARC to Solr; or generally process source records to hash-like structures
@@ -339,15 +341,17 @@ test_files:
339
341
  - test/indexer/context_test.rb
340
342
  - test/indexer/each_record_test.rb
341
343
  - test/indexer/load_config_file_test.rb
342
- - test/indexer/macros_marc21_semantics_test.rb
343
- - test/indexer/macros_marc21_test.rb
344
- - test/indexer/macros_test.rb
344
+ - test/indexer/macros/macros_marc21_semantics_test.rb
345
+ - test/indexer/macros/marc21/extract_all_marc_values_test.rb
346
+ - test/indexer/macros/marc21/extract_marc_test.rb
347
+ - test/indexer/macros/marc21/serialize_marc_test.rb
348
+ - test/indexer/macros/marc21/trim_punctuation_test.rb
349
+ - test/indexer/macros/to_field_test.rb
345
350
  - test/indexer/map_record_test.rb
346
351
  - test/indexer/read_write_test.rb
347
352
  - test/indexer/settings_test.rb
348
353
  - test/indexer/to_field_test.rb
349
354
  - test/indexer/writer_test.rb
350
- - test/marc21_macros_test.rb
351
355
  - test/marc_extractor_test.rb
352
356
  - test/marc_format_classifier_test.rb
353
357
  - test/marc_reader_test.rb
@@ -1,219 +0,0 @@
1
- require 'test_helper'
2
-
3
- require 'traject/indexer'
4
- require 'traject/macros/marc21'
5
-
6
- require 'json'
7
- require 'marc'
8
-
9
- # See also marc_extractor_test.rb for more detailed tests on marc extraction,
10
- # this is just a basic test to make sure our macro works passing through to there
11
- # and other options.
12
- describe "Traject::Macros::Marc21" do
13
- Marc21 = Traject::Macros::Marc21 # shortcut
14
-
15
- before do
16
- @indexer = Traject::Indexer.new
17
- @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
18
- end
19
-
20
- describe "extract_marc" do
21
- it "extracts marc" do
22
- @indexer.instance_eval do
23
- to_field "title", extract_marc("245ab")
24
- end
25
-
26
- output = @indexer.map_record(@record)
27
-
28
- assert_equal ["Manufacturing consent : the political economy of the mass media /"], output["title"]
29
- assert_equal({}, @indexer.map_record(empty_record))
30
-
31
- end
32
-
33
- it "respects :first=>true option" do
34
- @indexer.instance_eval do
35
- to_field "other_id", extract_marc("035a", :first => true)
36
- end
37
-
38
- output = @indexer.map_record(@record)
39
-
40
- assert_length 1, output["other_id"]
41
-
42
- end
43
-
44
- it "trims punctuation with :trim_punctuation => true" do
45
- @indexer.instance_eval do
46
- to_field "title", extract_marc("245ab", :trim_punctuation => true)
47
- end
48
-
49
- output = @indexer.map_record(@record)
50
-
51
- assert_equal ["Manufacturing consent : the political economy of the mass media"], output["title"]
52
- assert_equal({}, @indexer.map_record(empty_record))
53
-
54
- end
55
-
56
- it "respects :default option" do
57
- @indexer.instance_eval do
58
- to_field "only_default", extract_marc("9999", :default => "DEFAULT VALUE")
59
- end
60
- output = @indexer.map_record(@record)
61
-
62
- assert_equal ["DEFAULT VALUE"], output["only_default"]
63
- end
64
-
65
- it "de-duplicates by default, respects :allow_duplicates" do
66
- # Add a second 008
67
- f = @record.fields('008').first
68
- @record.append(f)
69
-
70
- @indexer.instance_eval do
71
- to_field "lang1", extract_marc('008[35-37]')
72
- to_field "lang2", extract_marc('008[35-37]', :allow_duplicates => true)
73
- end
74
-
75
- output = @indexer.map_record(@record)
76
- assert_equal ["eng"], output['lang1']
77
- assert_equal ["eng", "eng"], output['lang2']
78
- assert_equal({}, @indexer.map_record(empty_record))
79
- end
80
-
81
- it "fails on an extra/misspelled argument to extract_marc" do
82
- assert_raises(RuntimeError) do
83
- @indexer.instance_eval do
84
- to_field "foo", extract_marc("9999", :misspelled => "Who cares")
85
- end
86
- end
87
- end
88
-
89
-
90
- it "throws away nil values unless settings['allow_nil_values]'" do
91
- @indexer.instance_eval do
92
- to_field 'default_nil', extract_marc('9999', :default => nil)
93
- end
94
- output = @indexer.map_record(@record)
95
- assert_nil output['default_nil']
96
- end
97
-
98
-
99
-
100
- it "allows nil values if settings['allow_nil_values]'" do
101
- @indexer.settings do |s|
102
- s['allow_nil_values'] = true
103
- end
104
- @indexer.instance_eval do
105
- to_field 'default_nil', extract_marc('9999', :default => nil)
106
- end
107
- output = @indexer.map_record(@record)
108
- assert_equal [nil], output['default_nil']
109
- end
110
-
111
-
112
- it "Marc21::trim_punctuation class method" do
113
- assert_equal "one two three", Marc21.trim_punctuation("one two three")
114
-
115
- assert_equal "one two three", Marc21.trim_punctuation("one two three,")
116
- assert_equal "one two three", Marc21.trim_punctuation("one two three/")
117
- assert_equal "one two three", Marc21.trim_punctuation("one two three;")
118
- assert_equal "one two three", Marc21.trim_punctuation("one two three:")
119
- assert_equal "one two three .", Marc21.trim_punctuation("one two three .")
120
- assert_equal "one two three", Marc21.trim_punctuation("one two three.")
121
- assert_equal "one two three...", Marc21.trim_punctuation("one two three...")
122
- assert_equal "one two three", Marc21.trim_punctuation(" one two three.")
123
-
124
- assert_equal "one two [three]", Marc21.trim_punctuation("one two [three]")
125
- assert_equal "one two three", Marc21.trim_punctuation("one two three]")
126
- assert_equal "one two three", Marc21.trim_punctuation("[one two three")
127
- assert_equal "one two three", Marc21.trim_punctuation("[one two three]")
128
-
129
- # This one was a bug before
130
- assert_equal "Feminism and art", Marc21.trim_punctuation("Feminism and art.")
131
-
132
- assert_equal "Le réve", Marc21.trim_punctuation("Le réve.") # this assertion currently fails
133
- end
134
-
135
- it "uses :translation_map" do
136
- @indexer.instance_eval do
137
- to_field "cataloging_agency", extract_marc("040a", :separator => nil, :translation_map => "marc_040a_translate_test")
138
- end
139
- output = @indexer.map_record(@record)
140
-
141
- assert_equal ["Library of Congress"], output["cataloging_agency"]
142
- end
143
- end
144
-
145
- it "supports #extract_marc_from module method" do
146
- output_arr = ::Traject::Macros::Marc21.extract_marc_from(@record, "245ab", :trim_punctuation => true)
147
- assert_equal ["Manufacturing consent : the political economy of the mass media"], output_arr
148
- end
149
-
150
- describe "serialized_marc" do
151
- it "serializes xml" do
152
- @indexer.instance_eval do
153
- to_field "marc_record", serialized_marc(:format => "xml")
154
- end
155
- output = @indexer.map_record(@record)
156
-
157
- assert_length 1, output["marc_record"]
158
- assert_kind_of String, output["marc_record"].first
159
- roundtrip_record = MARC::XMLReader.new(StringIO.new(output["marc_record"].first)).first
160
- assert_equal @record, roundtrip_record
161
- end
162
-
163
- it "serializes binary UUEncoded" do
164
- @indexer.instance_eval do
165
- to_field "marc_record", serialized_marc(:format => "binary")
166
- end
167
- output = @indexer.map_record(@record)
168
-
169
- assert_length 1, output["marc_record"]
170
- assert_kind_of String, output["marc_record"].first
171
-
172
- decoded = Base64.decode64(output["marc_record"].first)
173
-
174
- # just check the marc header for now
175
- assert_start_with "02067cam a2200469", decoded
176
- end
177
-
178
- it "serializes binary raw" do
179
- @indexer.instance_eval do
180
- to_field "marc_record", serialized_marc(:format => "binary", :binary_escape => false)
181
- end
182
- output = @indexer.map_record(@record)
183
-
184
- assert_length 1, output["marc_record"]
185
- assert_kind_of String, output["marc_record"].first
186
-
187
- # just check the marc header for now
188
- assert_start_with "02067cam a2200469", output["marc_record"].first
189
- end
190
-
191
- it "serializes json" do
192
- @indexer.instance_eval do
193
- to_field "marc_record", serialized_marc(:format => "json")
194
- end
195
- output = @indexer.map_record(@record)
196
-
197
- assert_length 1, output["marc_record"]
198
-
199
- # okay, let's actually deserialize it, why not
200
-
201
- hash = JSON.parse(output["marc_record"].first)
202
-
203
- deserialized = MARC::Record.new_from_hash(hash)
204
-
205
- assert_equal @record, deserialized
206
- end
207
- end
208
-
209
- it "#extract_all_marc_values" do
210
- @indexer.instance_eval do
211
- to_field "text", extract_all_marc_values
212
- end
213
- output = @indexer.map_record(@record)
214
-
215
- assert_length 13, output["text"]
216
- end
217
-
218
-
219
- end