traject 3.6.0 → 3.8.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2e47e6648ed9fc963d18e10c9be48a30273147c4920cb4b7e448d078fd2398ac
4
- data.tar.gz: efa549ebcbd87e599b56b955b4bd26422dfe7de67697aed6b39cb421c3b80677
3
+ metadata.gz: 9e9868f3b83385402413a2fc6c8865dc6ab3dd3776c25c0f1f1b88bf20024005
4
+ data.tar.gz: f86c298c93905948ca9425983e65811b2091ba39d6105895744df183a7c43695
5
5
  SHA512:
6
- metadata.gz: 6acdd2b8cfc888b221a1f19cd5197127006be81d0525169d531fc9bf43fe02cc9ec87401e6b2442c57ff0cd483d9884504ac75be92e3718cbbc49208dc97024f
7
- data.tar.gz: 30abefa7af9e1c170ae8570aa59b6c571a9acc1eb7b0abf6efd64d97550b678c21c72d76a8156ef3844ab01154111fd3747f96d6346ee8a8d76e747b2cf92e1f
6
+ metadata.gz: 7d1e1122020632ac10d4da030915e0f710c8dd9bb6e9780089129c1ec7febb76f7b0e23ac4828d1e4860429164fe6052c8db768d478ae6102f4819b8cd512f4d
7
+ data.tar.gz: '08eead90c2ddfebe141aa4bea2f280878c27af35509a9f5b220d2899fad820f9954dbc30392aa3cfd00b79e77b8f5266c69ac7101fe381d5e777e47b2e12aca7'
@@ -12,7 +12,7 @@ jobs:
12
12
  strategy:
13
13
  fail-fast: false
14
14
  matrix:
15
- ruby: [ '2.4', '2.5', '2.6', '2.7', '3.0', 'jruby-9.1', 'jruby-9.2' ]
15
+ ruby: [ '2.4', '2.5', '2.6', '2.7', '3.0', '3.1', 'jruby-9.1', 'jruby-9.2' ]
16
16
  name: Ruby ${{ matrix.ruby }}
17
17
  steps:
18
18
  - uses: actions/checkout@v2
data/CHANGES.md CHANGED
@@ -1,11 +1,25 @@
1
1
  # Changes
2
2
 
3
- ## Next
3
+ ## NEXT
4
4
 
5
5
  *
6
6
 
7
7
  *
8
8
 
9
+ ## 3.8.0
10
+
11
+ SolrJsonWriter: HTTPClient should use OS certs instead of packaged ones
12
+
13
+ HTTPClient, for whatever reason, prefers its own packaged certs, which are now years out of date
14
+ and don't work with Let's Encrypt.
15
+
16
+ This changes the code to prefer the OS certs, which can be overridden by setting
17
+ `solr_json_writer.use_packaged_certs` to `true` or `"true"`.
18
+
19
+ ## 3.7.0
20
+
21
+ * Add two new transformation macros, `Traject::Macros::Transformation.delete_if` and `Traject::Macros::Transformation.select`.
22
+
9
23
  ## 3.6.0
10
24
 
11
25
  * Tiny backward compat changes for ruby 3.0 compat. https://github.com/traject/traject/pull/263
data/Gemfile CHANGED
@@ -11,3 +11,11 @@ group :debug do
11
11
  gem "ruby-debug", :platform => "jruby"
12
12
  gem "byebug", :platform => "mri"
13
13
  end
14
+
15
+ # ruby-marc stopped supporting ruby 2.3 and 2.4 in newer 1.x versions,
16
+ # while we would still like to support those old versions. When running
17
+ # CI, run with older ruby-marc that still supports them.
18
+ ruby_version_parts = RUBY_VERSION.split(".")
19
+ if ruby_version_parts[0] == "2" && ruby_version_parts[1].to_i < 5
20
+ gem "marc", "< 1.2.0"
21
+ end
data/README.md CHANGED
@@ -177,6 +177,11 @@ TranslationMap use above is just one example of a transformation macro, that tra
177
177
  * `split(" ")`: take values and split them, possibly result in multiple values.
178
178
  * `transform(proc)`: transform each existing macro using a proc, kind of like `map`.
179
179
  eg `to_field "something", extract_xml("//author"), transform( ->(author) { "#{author.last}, #{author.first}" })
180
+ * `delete_if(["a", "b"])`: remove a value from accumulated values if it is included in the passed in argument.
181
+ * Can also take a string, proc or regex as an argument. See [tests](test/indexer/macros/transformation_test.rb) for full functionality.
182
+ * `select(proc)`: selects (keeps) values from accumulated values if proc evaluates to true for a specific value.
183
+ * Can also take arrays, sets, and regexes as an argument. See [tests](test/indexer/macros/transformation_test.rb) for full functionality.
184
+
180
185
 
181
186
  You can add on as many transformation macros as you want, they will be applied to output in order.
182
187
 
@@ -327,10 +327,14 @@ module Traject::Macros
327
327
  if field008 && field008.length >= 11
328
328
  date_type = field008.slice(6)
329
329
  date1_str = field008.slice(7,4)
330
- date2_str = field008.slice(11, 4) if field008.length > 15
330
+ if field008.length > 15
331
+ date2_str = field008.slice(11, 4)
332
+ else
333
+ date2_str = date1_str
334
+ end
331
335
 
332
- # for date_type q=questionable, we have a range.
333
- if (date_type == 'q')
336
+ # for date_type q=questionable, we expect to have a range.
337
+ if date_type == 'q' and date1_str != date2_str
334
338
  # make unknown digits at the beginning or end of range,
335
339
  date1 = date1_str.sub("u", "0").to_i
336
340
  date2 = date2_str.sub("u", "9").to_i
@@ -157,6 +157,36 @@ module Traject
157
157
  acc.collect! { |v| v.gsub(pattern, replace) }
158
158
  end
159
159
  end
160
+
161
+ # Run ruby `delete_if` on the accumulator for values that include or are equal to arg.
162
+ # It will also accept an array, set, regex pattern, proc or lambda as an argument.
163
+ #
164
+ # @example
165
+ # to_field "creator_facet", extract_marc("100abcdq"), delete_if(/foo/)
166
+ def delete_if(arg)
167
+ p = if arg.respond_to? :include?
168
+ proc { |v| arg.include?(v) }
169
+ else
170
+ proc { |v| arg === v }
171
+ end
172
+
173
+ ->(_, acc) { acc.delete_if(&p) }
174
+ end
175
+
176
+ # Run ruby `select!` on the accumulator for values that include or are equal to arg.
177
+ # It accepts an array, set, regex pattern, proc or lambda as an argument.
178
+ #
179
+ # @example
180
+ # to_field "creator_facet", extract_marc("100abcdq"), select(->(v) { v != "foo" })
181
+ def select(arg)
182
+ p = if arg.respond_to? :include?
183
+ proc { |v| arg.include?(v) }
184
+ else
185
+ proc { |v| arg === v }
186
+ end
187
+
188
+ ->(_, acc) { acc.select!(&p) }
189
+ end
160
190
  end
161
191
  end
162
192
  end
@@ -86,6 +86,9 @@ require 'concurrent' # for atomic_fixnum
86
86
  # * solr_json_writer.http_client Mainly intended for testing, set your own HTTPClient
87
87
  # or mock object to be used for HTTP.
88
88
  #
89
+ # * solr_json_writer.use_packaged_certs: unlikely to be needed, set to true for legacy
90
+ # behavior, to use packaged HTTPClient gem ssl certs. https://github.com/nahi/httpclient/issues/445
91
+ #
89
92
  class Traject::SolrJsonWriter
90
93
  include Traject::QualifiedConstGet
91
94
 
@@ -118,6 +121,15 @@ class Traject::SolrJsonWriter
118
121
  @settings["solr_json_writer.http_client"]
119
122
  else
120
123
  client = HTTPClient.new
124
+
125
+ # By default we'll use the host OS SSL certs, but you can use
126
+ # setting solr_json_writer.use_packaged_certs to true or "true"
127
+ # to go back to previous behavior if you have a perverse reason to.
128
+ # https://github.com/nahi/httpclient/issues/445
129
+ unless @settings["solr_json_writer.use_packaged_certs"].to_s == "true"
130
+ client.ssl_config.set_default_paths
131
+ end
132
+
121
133
  if @settings["solr_writer.http_timeout"]
122
134
  client.connect_timeout = client.receive_timeout = client.send_timeout = @settings["solr_writer.http_timeout"]
123
135
  end
@@ -431,9 +443,27 @@ class Traject::SolrJsonWriter
431
443
  attr_reader :response
432
444
 
433
445
  def initialize(msg, response = nil) # :nodoc:
446
+ solr_error = find_solr_error(response)
447
+ msg += ": #{solr_error}" if solr_error
448
+
434
449
  super(msg)
450
+
435
451
  @response = response
436
452
  end
453
+
454
+ private
455
+
456
+ # If we can get the error out of a JSON response, please do,
457
+ # to include in error message.
458
+ def find_solr_error(response)
459
+ return nil unless response && response.body && response.content_type&.start_with?("application/json")
460
+
461
+ parsed = JSON.parse(response.body)
462
+
463
+ parsed && parsed.dig("error", "msg")
464
+ rescue JSON::ParserError
465
+ return nil
466
+ end
437
467
  end
438
468
 
439
469
  private
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "3.6.0"
2
+ VERSION = "3.8.0"
3
3
  end
@@ -209,6 +209,10 @@ describe "Traject::Macros::Marc21Semantics" do
209
209
  @record = MARC::Reader.new(support_file_path "date_type_r_missing_date2.marc").to_a.first
210
210
  assert_equal 1957, Marc21Semantics.publication_date(@record)
211
211
  end
212
+ it "provides a fallback for a missing second date" do
213
+ @record = MARC::Reader.new(support_file_path "missing-second-date.marc").to_a.first
214
+ assert_equal 1678, Marc21Semantics.publication_date(@record)
215
+ end
212
216
 
213
217
  it "works correctly with date type 'q'" do
214
218
  val = @record['008'].value
@@ -174,4 +174,114 @@ describe "Traject::Macros::Transformation" do
174
174
  end
175
175
  end
176
176
 
177
+ describe "delete_if" do
178
+
179
+ describe "argument is an Array" do
180
+ it "filters out selected values from accumulatd values" do
181
+ arg = [ "one", "three"]
182
+
183
+ @indexer.configure do
184
+ to_field "test", literal("one"), literal("two"), literal("three"), delete_if(arg)
185
+ end
186
+
187
+ output = @indexer.map_record(@record)
188
+ assert_equal ["two"], output["test"]
189
+ end
190
+ end
191
+
192
+ describe "argument is a Set" do
193
+ it "filters out selected values from accumulatd values" do
194
+ arg = [ "one", "three"].to_set
195
+
196
+ @indexer.configure do
197
+ to_field "test", literal("one"), literal("two"), literal("three"), delete_if(arg)
198
+ end
199
+
200
+ output = @indexer.map_record(@record)
201
+ assert_equal ["two"], output["test"]
202
+ end
203
+ end
204
+
205
+ describe "argument is a Regex" do
206
+ it "filters out selected values from accumulatd values" do
207
+ arg = /^t/
208
+
209
+ @indexer.configure do
210
+ to_field "test", literal("one"), literal("two"), literal("three"), delete_if(arg)
211
+ end
212
+
213
+ output = @indexer.map_record(@record)
214
+ assert_equal ["one"], output["test"]
215
+ end
216
+ end
217
+
218
+ describe "argument is a Procedure or Lambda" do
219
+ it "filters out selected values from accumulatd values" do
220
+ arg = ->(v) { v == "one" }
221
+
222
+ @indexer.configure do
223
+ to_field "test", literal("one"), literal("two"), literal("three"), delete_if(arg)
224
+ end
225
+
226
+ output = @indexer.map_record(@record)
227
+ assert_equal ["two", "three"], output["test"]
228
+ end
229
+ end
230
+ end
231
+
232
+ describe "select" do
233
+
234
+ describe "argument is an Array" do
235
+ it "selects a subset of values from accumulatd values" do
236
+ arg = [ "one", "three", "four"]
237
+
238
+ @indexer.configure do
239
+ to_field "test", literal("one"), literal("two"), literal("three"), select(arg)
240
+ end
241
+
242
+ output = @indexer.map_record(@record)
243
+ assert_equal ["one", "three"], output["test"]
244
+ end
245
+ end
246
+
247
+ describe "argument is a Set" do
248
+ it "selects a subset of values from accumulatd values" do
249
+ arg = [ "one", "three", "four"].to_set
250
+
251
+ @indexer.configure do
252
+ to_field "test", literal("one"), literal("two"), literal("three"), select(arg)
253
+ end
254
+
255
+ output = @indexer.map_record(@record)
256
+ assert_equal ["one", "three"], output["test"]
257
+ end
258
+ end
259
+
260
+ describe "argument is a Regex" do
261
+ it "selects a subset of values from accumulatd values" do
262
+ arg = /^t/
263
+
264
+ @indexer.configure do
265
+ to_field "test", literal("one"), literal("two"), literal("three"), select(arg)
266
+ end
267
+
268
+ output = @indexer.map_record(@record)
269
+ assert_equal ["two", "three"], output["test"]
270
+ end
271
+ end
272
+
273
+ describe "argument is a Procedure or Lambda" do
274
+ it "selects a subset of values from accumulatd values" do
275
+ arg = ->(v) { v != "one" }
276
+
277
+ @indexer.configure do
278
+ to_field "test", literal("one"), literal("two"), literal("three"), select(arg)
279
+ end
280
+
281
+ output = @indexer.map_record(@record)
282
+ assert_equal ["two", "three"], output["test"]
283
+ end
284
+ end
285
+ end
286
+
177
287
  end
@@ -19,7 +19,7 @@ describe "Traject::SolrJsonWriter" do
19
19
  class FakeHTTPClient
20
20
  # Always reply with this status, normally 200, can
21
21
  # be reset for testing error conditions.
22
- attr_accessor :response_status
22
+ attr_accessor :response_status, :body, :content_type
23
23
 
24
24
  def initialize(*args)
25
25
  @post_args = []
@@ -33,10 +33,7 @@ describe "Traject::SolrJsonWriter" do
33
33
  @post_args << args
34
34
  end
35
35
 
36
- resp = HTTP::Message.new_response("")
37
- resp.status = self.response_status
38
-
39
- return resp
36
+ return faked_response
40
37
  end
41
38
 
42
39
  def get(*args)
@@ -44,10 +41,7 @@ describe "Traject::SolrJsonWriter" do
44
41
  @get_args << args
45
42
  end
46
43
 
47
- resp = HTTP::Message.new_response("")
48
- resp.status = self.response_status
49
-
50
- return resp
44
+ return faked_response
51
45
  end
52
46
 
53
47
  def post_args
@@ -65,6 +59,16 @@ describe "Traject::SolrJsonWriter" do
65
59
  # Everything else, just return nil please
66
60
  def method_missing(*args)
67
61
  end
62
+
63
+ private
64
+
65
+ def faked_response
66
+ resp = HTTP::Message.new_response(self.body || "")
67
+ resp.status = self.response_status
68
+ resp.content_type = self.content_type if self.content_type
69
+
70
+ resp
71
+ end
68
72
  end
69
73
 
70
74
 
@@ -157,6 +161,26 @@ describe "Traject::SolrJsonWriter" do
157
161
  assert_length 1, JSON.parse(individual_update2[1])
158
162
  end
159
163
 
164
+ it "includes Solr reported error in base error message" do
165
+ @writer = create_writer("solr_writer.batch_size" => 1, "solr_writer.max_skipped" => 0)
166
+ @fake_http_client.response_status = 400
167
+ @fake_http_client.content_type = "application/json;charset=utf-8"
168
+ @fake_http_client.body =
169
+ { "responseHeader"=>{"status"=>400, "QTime"=>0},
170
+ "error"=>{
171
+ "metadata"=>["error-class", "org.apache.solr.common.SolrException", "root-error-class", "org.apache.solr.common.SolrException"],
172
+ "msg"=>"ERROR: this is a solr error",
173
+ "code"=>400
174
+ }
175
+ }.to_json
176
+
177
+ error = assert_raises(Traject::SolrJsonWriter::MaxSkippedRecordsExceeded) {
178
+ @writer.put context_with({"id" => "doc_1", "key" => "value"})
179
+ @writer.close
180
+ }
181
+ assert_match(/ERROR: this is a solr error/, error.message)
182
+ end
183
+
160
184
  it "can #flush" do
161
185
  2.times do |i|
162
186
  doc = {"id" => "doc_#{i}", "key" => "value"}
@@ -0,0 +1 @@
1
+ 01351nem a2200313 a 4500001001100000001001100011008004100022034001300063035002200076043003000098080001200128100006100140245011600201255002900317260005900346300003500405500004500440500013600485500003800621500005000659530004400709651006000753700003800813710010200851730000900953856005300962907001401015940000801029.b20028118.b6928510x170714q1678 fr |||| | |||| ||fre|c1 aab10000 a(OCoLC)1120596466 ae-sp---be-spcce2catmarc a(084.3)1 aBeaulieu, Sébastien de Pontault,csieur de,d1613-1674.10aPlan de la ville de Puiçerdah[Document cartogràfic] :bpris en 1678 /c[Beaulieu] ; DR f. [Des Roches fecit] aEscala [1:10 000 aprox.] a[A Paris :bpar le Chevalier de Beaulieu,cpost. 1678] a1 mapa :bgravat;c28 x 32 cm. aEscala gràfica: Eschelle de 150 toises. aPertany a l'obra "Les plans et profils des principales villes et lieux considerables de la Principauté de Catalogne", de Beaulieu. aPeu d'impremta de l'obra general. aMapa emmarcat en una orla amb motius florals. aTambé disponible la versió en línia. 4aPuigcerdà (Catalunya)xMapesxObres anteriors al 1800.1 aDes Roches, Jean Baptiste Hamont.2 aCol·lecció de mapes antics de Martí Gelabertó (Universitat Autònoma de Barcelona)5ES-BaUAB.0 aDDD.41zAccés lliureuhttps://ddd.uab.cat/record/180313 ab20028118 aUAB
data/traject.gemspec CHANGED
@@ -24,7 +24,7 @@ Gem::Specification.new do |spec|
24
24
  spec.add_dependency "concurrent-ruby", ">= 0.8.0"
25
25
  spec.add_dependency "marc", "~> 1.0"
26
26
 
27
- spec.add_dependency "hashie", ">= 3.1", "< 5" # used for Indexer#settings
27
+ spec.add_dependency "hashie", ">= 3.1", "< 6" # used for Indexer#settings
28
28
  spec.add_dependency "slop", "~> 4.0" # command line parsing
29
29
  spec.add_dependency "yell" # logging
30
30
  spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
@@ -33,7 +33,7 @@ Gem::Specification.new do |spec|
33
33
  spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
34
34
  spec.add_dependency "nokogiri", "~> 1.9" # NokogiriIndexer
35
35
 
36
- spec.add_development_dependency 'bundler', '>= 1.7', '< 3'
36
+ spec.add_development_dependency 'bundler', '~>2.0'
37
37
 
38
38
  spec.add_development_dependency "rake"
39
39
  spec.add_development_dependency "minitest"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.6.0
4
+ version: 3.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Rochkind
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-06-21 00:00:00.000000000 Z
12
+ date: 2022-12-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: concurrent-ruby
@@ -48,7 +48,7 @@ dependencies:
48
48
  version: '3.1'
49
49
  - - "<"
50
50
  - !ruby/object:Gem::Version
51
- version: '5'
51
+ version: '6'
52
52
  type: :runtime
53
53
  prerelease: false
54
54
  version_requirements: !ruby/object:Gem::Requirement
@@ -58,7 +58,7 @@ dependencies:
58
58
  version: '3.1'
59
59
  - - "<"
60
60
  - !ruby/object:Gem::Version
61
- version: '5'
61
+ version: '6'
62
62
  - !ruby/object:Gem::Dependency
63
63
  name: slop
64
64
  requirement: !ruby/object:Gem::Requirement
@@ -167,22 +167,16 @@ dependencies:
167
167
  name: bundler
168
168
  requirement: !ruby/object:Gem::Requirement
169
169
  requirements:
170
- - - ">="
171
- - !ruby/object:Gem::Version
172
- version: '1.7'
173
- - - "<"
170
+ - - "~>"
174
171
  - !ruby/object:Gem::Version
175
- version: '3'
172
+ version: '2.0'
176
173
  type: :development
177
174
  prerelease: false
178
175
  version_requirements: !ruby/object:Gem::Requirement
179
176
  requirements:
180
- - - ">="
181
- - !ruby/object:Gem::Version
182
- version: '1.7'
183
- - - "<"
177
+ - - "~>"
184
178
  - !ruby/object:Gem::Version
185
- version: '3'
179
+ version: '2.0'
186
180
  - !ruby/object:Gem::Dependency
187
181
  name: rake
188
182
  requirement: !ruby/object:Gem::Requirement
@@ -350,6 +344,7 @@ files:
350
344
  - test/test_support/manufacturing_consent.marc
351
345
  - test/test_support/manuscript_online_thesis.marc
352
346
  - test/test_support/microform_online_conference.marc
347
+ - test/test_support/missing-second-date.marc
353
348
  - test/test_support/multi_era.marc
354
349
  - test/test_support/multi_geo.marc
355
350
  - test/test_support/musical_cage.marc
@@ -401,7 +396,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
401
396
  - !ruby/object:Gem::Version
402
397
  version: '0'
403
398
  requirements: []
404
- rubygems_version: 3.0.3
399
+ rubygems_version: 3.3.3
405
400
  signing_key:
406
401
  specification_version: 4
407
402
  summary: An easy to use, high-performance, flexible and extensible metadata transformation
@@ -455,6 +450,7 @@ test_files:
455
450
  - test/test_support/manufacturing_consent.marc
456
451
  - test/test_support/manuscript_online_thesis.marc
457
452
  - test/test_support/microform_online_conference.marc
453
+ - test/test_support/missing-second-date.marc
458
454
  - test/test_support/multi_era.marc
459
455
  - test/test_support/multi_geo.marc
460
456
  - test/test_support/musical_cage.marc