traject 3.1.0.rc1 → 3.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,51 @@
1
+ # we mostly unit test with a Traject::Indexer itself and lower-level, but
2
+ # we need at least some basic top-level integration tests of the actual command line,
3
+ # this is a start, we can add more.
4
+ #
5
+ # Should we be testing Traject::CommandLine as an object instead of/in addition to
6
+ # actually testing shell-out to command line call? Maybe.
7
+
8
+ require 'test_helper'
9
+
10
+ describe "Shell out to command line" do
11
+ # just encapsulate using the minitest capture helper, but also
12
+ # getting and returning exit code
13
+ #
14
+ # out, err, result = execute_with_args("-c configuration")
15
+ def execute_with_args(args)
16
+ out, err = capture_subprocess_io do
17
+ system("./bin/traject #{args}")
18
+ end
19
+
20
+ return out, err, $?
21
+ end
22
+
23
+ it "can display version" do
24
+ out, err, result = execute_with_args("-v")
25
+ assert_equal err, "traject version #{Traject::VERSION}\n"
26
+ assert result.success?
27
+ end
28
+
29
+ it "can display help text" do
30
+ out, err, result = execute_with_args("-h")
31
+
32
+ assert err.start_with?("traject [options] -c configuration.rb [-c config2.rb] file.mrc")
33
+ assert result.success?
34
+ end
35
+
36
+ it "handles bad argument" do
37
+ out, err, result = execute_with_args("--no-such-arg")
38
+ refute result.success?
39
+
40
+ assert err.start_with?("Error: unknown option `--no-such-arg'\nExiting...\n")
41
+ end
42
+
43
+ it "does basic dry run" do
44
+ out, err, result = execute_with_args("--debug-mode -s one=two -s three=four -c test/test_support/demo_config.rb test/test_support/emptyish_record.marc")
45
+
46
+ assert result.success?
47
+ assert_includes err, "executing with: `--debug-mode -s one=two -s three=four"
48
+ assert_match /bib_1000165 +author_sort +Collection la/, out
49
+ end
50
+ end
51
+
@@ -24,40 +24,40 @@ describe "Delimited/CSV Writers" do
24
24
 
25
25
  it "creates a dw with defaults" do
26
26
  dw = Traject::DelimitedWriter.new(@settings)
27
- dw.delimiter.must_equal "\t"
28
- dw.internal_delimiter.must_equal '|'
29
- dw.edelim.must_equal ' '
30
- dw.eidelim.must_equal '\\|'
27
+ assert_equal dw.delimiter, "\t"
28
+ assert_equal dw.internal_delimiter, '|'
29
+ assert_equal dw.edelim, ' '
30
+ assert_equal dw.eidelim, '\\|'
31
31
  end
32
32
 
33
33
  it "respects different delimiter" do
34
34
  @settings['delimited_writer.delimiter'] = '^'
35
35
  dw = Traject::DelimitedWriter.new(@settings)
36
- dw.delimiter.must_equal '^'
37
- dw.edelim.must_equal '\\^'
38
- dw.internal_delimiter.must_equal '|'
36
+ assert_equal dw.delimiter, '^'
37
+ assert_equal dw.edelim, '\\^'
38
+ assert_equal dw.internal_delimiter, '|'
39
39
  end
40
40
 
41
41
  it "outputs a header if asked to" do
42
42
  Traject::DelimitedWriter.new(@settings)
43
- @out.string.chomp.must_equal %w[four one two].join("\t")
43
+ assert_equal @out.string.chomp, %w[four one two].join("\t")
44
44
  end
45
45
 
46
46
  it "doesn't output a header if asked not to" do
47
47
  @settings['delimited_writer.header'] = 'false'
48
48
  Traject::DelimitedWriter.new(@settings)
49
- @out.string.must_be_empty
49
+ assert_empty @out.string
50
50
  end
51
51
 
52
52
  it "deals with multiple values" do
53
53
  dw = Traject::DelimitedWriter.new(@settings)
54
54
  dw.put @context
55
- @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(dw.delimiter)
55
+ assert_equal @out.string.split("\n").last, ['four', 'one', 'two1|two2'].join(dw.delimiter)
56
56
  end
57
57
 
58
58
  it "bails if delimited_writer.fields isn't set" do
59
59
  @settings.delete 'delimited_writer.fields'
60
- proc { Traject::DelimitedWriter.new(@settings) }.must_raise(ArgumentError)
60
+ assert_raises(ArgumentError) { Traject::DelimitedWriter.new(@settings) }
61
61
  end
62
62
 
63
63
  end
@@ -65,18 +65,18 @@ describe "Delimited/CSV Writers" do
65
65
  describe "Traject::CSVWriter" do
66
66
  it "unsets the delimiter" do
67
67
  cw = Traject::CSVWriter.new(@settings)
68
- cw.delimiter.must_be_nil
68
+ assert_nil cw.delimiter
69
69
  end
70
70
 
71
71
  it "writes the header" do
72
72
  Traject::CSVWriter.new(@settings)
73
- @out.string.chomp.must_equal 'four,one,two'
73
+ assert_equal @out.string.chomp, 'four,one,two'
74
74
  end
75
75
 
76
76
  it "uses the internal delimiter" do
77
77
  cw = Traject::CSVWriter.new(@settings)
78
78
  cw.put @context
79
- @out.string.split("\n").last.must_equal ['four', 'one', 'two1|two2'].join(',')
79
+ assert_equal @out.string.split("\n").last, ['four', 'one', 'two1|two2'].join(',')
80
80
  end
81
81
 
82
82
  it "produces complex output" do
@@ -97,8 +97,6 @@ describe "Delimited/CSV Writers" do
97
97
  traject_csvwriter_output = @out.string.split("\n").last.chomp
98
98
 
99
99
  assert_equal(csv_output, traject_csvwriter_output)
100
-
101
100
  end
102
-
103
101
  end
104
102
  end
@@ -25,6 +25,7 @@ describe "Class-level configuration of Indexer sub-class" do
25
25
  end
26
26
  end
27
27
 
28
+
28
29
  before do
29
30
  @indexer = TestIndexerSubclass.new
30
31
  end
@@ -47,6 +48,28 @@ describe "Class-level configuration of Indexer sub-class" do
47
48
  assert_equal ['from-instance-config'], result["instance_field"]
48
49
  end
49
50
 
51
+ describe "multiple class-level configure" do
52
+ class MultipleConfigureIndexer < Traject::Indexer
53
+ configure do
54
+ to_field "field", literal("value")
55
+ end
56
+ configure do
57
+ to_field "field", literal("value from second configure")
58
+ to_field "second_call", literal("value from second configure")
59
+ end
60
+ end
61
+
62
+ before do
63
+ @indexer = MultipleConfigureIndexer.new
64
+ end
65
+
66
+ it "lets you call class-level configure multiple times and aggregates" do
67
+ result = @indexer.map_record(Object.new)
68
+ assert_equal ['value', 'value from second configure'], result['field']
69
+ assert_equal ['value from second configure'], result['second_call']
70
+ end
71
+ end
72
+
50
73
  describe "with multi-level subclass" do
51
74
  class TestIndexerSubclassSubclass < TestIndexerSubclass
52
75
  configure do
@@ -197,6 +197,10 @@ describe "Traject::Macros::Marc21Semantics" do
197
197
  # we take the first date. And need to deal with the u.
198
198
  assert_equal 1845, Marc21Semantics.publication_date(@record)
199
199
  end
200
+ it "resorts to 264c" do
201
+ @record = MARC::Reader.new(support_file_path "date_resort_to_264.marc").to_a.first
202
+ assert_equal 2015, Marc21Semantics.publication_date(@record)
203
+ end
200
204
  it "resorts to 260c" do
201
205
  @record = MARC::Reader.new(support_file_path "date_resort_to_260.marc").to_a.first
202
206
  assert_equal 1980, Marc21Semantics.publication_date(@record)
@@ -109,6 +109,41 @@ describe "Traject::NokogiriIndexer" do
109
109
  result["name"].name == "name"
110
110
  })
111
111
  end
112
+ end
112
113
 
114
+ describe "xpath to attribute" do
115
+ let(:indexer) do
116
+ namespaces = @namespaces
117
+ Traject::Indexer::NokogiriIndexer.new("nokogiri.namespaces" => namespaces,
118
+ "nokogiri.each_record_xpath" => "//oai:record") do
119
+ to_field "status", extract_xpath("//oai:record/oai:header/@status")
120
+ end
121
+ end
122
+
123
+ let(:records) { Traject::NokogiriReader.new(StringIO.new(
124
+ <<-XML
125
+ <?xml version="1.0" encoding="UTF-8"?>
126
+ <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
127
+ <responseDate>2020-03-03T04:16:09Z</responseDate>
128
+ <request verb="ListRecords" metadataPrefix="marc21" set="blacklight" from="2020-03-02T20:47:11Z">https://na02.alma.exlibrisgroup.com/view/oai/01TULI_INST/request</request>
129
+ <ListRecords>
130
+ <record>
131
+ <header status="deleted">
132
+ <identifier>oai:alma.01TULI_INST:991025803889703811</identifier>
133
+ <datestamp>2020-03-03T03:54:35Z</datestamp>
134
+ <setSpec>blacklight</setSpec>
135
+ <setSpec>rapid_print_journals</setSpec>
136
+ <setSpec>blacklight_qa</setSpec>
137
+ </header>
138
+ </record>
139
+ </ListRecords>
140
+ </OAI-PMH>
141
+ XML
142
+ ), []).to_a }
143
+
144
+ it "extracts the correct attribute" do
145
+ statuses = indexer.map_record(records.first)["status"]
146
+ assert_equal ["deleted"], statuses
147
+ end
113
148
  end
114
149
  end
@@ -134,6 +134,16 @@ describe "Traject::NokogiriReader" do
134
134
  end
135
135
  end
136
136
 
137
+ describe "strict_mode" do
138
+ it "raises on non-well-formed" do
139
+ # invalid because two sibling root nodes, XML requires one root node
140
+ reader = Traject::NokogiriReader.new(StringIO.new("<doc></doc><doc></doc>"), {"nokogiri.strict_mode" => "true" })
141
+ assert_raises(Nokogiri::XML::SyntaxError) {
142
+ reader.each { |r| }
143
+ }
144
+ end
145
+ end
146
+
137
147
 
138
148
  def shared_tests
139
149
  @reader = Traject::NokogiriReader.new(File.open(@xml_sample_path), {
@@ -170,6 +170,62 @@ describe "Traject::SolrJsonWriter" do
170
170
  assert_length 1, @fake_http_client.post_args, "Has flushed to solr"
171
171
  end
172
172
 
173
+ it "defaults to not setting basic authentication" do
174
+ settings = { "solr.url" => "http://example.com/solr/foo" }
175
+ writer = Traject::SolrJsonWriter.new(settings)
176
+ auth = writer.instance_variable_get("@http_client")
177
+ .www_auth.basic_auth.instance_variable_get("@auth")
178
+ assert(auth.empty?)
179
+ end
180
+
181
+ describe "HTTP basic auth" do
182
+
183
+ it "supports basic authentication settings" do
184
+ settings = {
185
+ "solr.url" => "http://example.com/solr/foo",
186
+ "solr_writer.basic_auth_user" => "foo",
187
+ "solr_writer.basic_auth_password" => "bar",
188
+ }
189
+
190
+ # testing with some internal implementation of HTTPClient sorry
191
+
192
+ writer = Traject::SolrJsonWriter.new(settings)
193
+
194
+ auth = writer.instance_variable_get("@http_client")
195
+ .www_auth.basic_auth.instance_variable_get("@auth")
196
+ assert(!auth.empty?)
197
+ assert_equal(auth.values.first, Base64.encode64("foo:bar").chomp)
198
+ end
199
+
200
+ it "supports basic auth from solr.url" do
201
+ settings = {
202
+ "solr.url" => "http://foo:bar@example.com/solr/foo",
203
+ }
204
+
205
+ # testing with some internal implementation of HTTPClient sorry
206
+
207
+ writer = Traject::SolrJsonWriter.new(settings)
208
+ auth = writer.instance_variable_get("@http_client")
209
+ .www_auth.basic_auth.instance_variable_get("@auth")
210
+ assert(!auth.empty?)
211
+ assert_equal(auth.values.first, Base64.encode64("foo:bar").chomp)
212
+ end
213
+
214
+ it "does not log basic auth from solr.url" do
215
+ string_io = StringIO.new
216
+ settings = {
217
+ "solr.url" => "http://secret_username:secret_password@example.com/solr/foo",
218
+ "logger" => Logger.new(string_io)
219
+ }
220
+
221
+
222
+ writer = Traject::SolrJsonWriter.new(settings)
223
+
224
+ refute_includes string_io.string, "secret_username:secret_password"
225
+ assert_includes string_io.string, "(with HTTP basic auth)"
226
+ end
227
+ end
228
+
173
229
  describe "commit" do
174
230
  it "commits on close when set" do
175
231
  @writer = create_writer("solr.url" => "http://example.com", "solr_writer.commit_on_close" => "true")
@@ -365,4 +421,13 @@ describe "Traject::SolrJsonWriter" do
365
421
  end
366
422
  end
367
423
  end
424
+
425
+ describe "#delete_all!" do
426
+ it "deletes all" do
427
+ @writer.delete_all!
428
+ post_args = @fake_http_client.post_args.first
429
+ assert_equal "http://example.com/solr/update/json", post_args[0]
430
+ assert_equal JSON.generate({"delete" => { "query" => "*:*"}}), post_args[1]
431
+ end
432
+ end
368
433
  end
@@ -0,0 +1 @@
1
+ 01180aam a2200337 a 4500001001000000008004100010015001900051016001800070020002500088020002200113040005900135043001200194050002400206082001400230100003500244245006400279260003800343264003800381264001100419300003600430336002100466336002800487337002500515338002300540504006700563651004000630651005000670651004300720651005600763035002300819a11417842130723t20uu20uuenkb b 001 0 eng d aGBB3854302bnb7 a0164999372Uk a9781849043427 (pbk.) a1849043426 (pbk.) aUKMGBcUKMGBdOCLCOdYDXCPdOCLCOdZWZdOCLCOdCaONFJC aa-ii--- 4aDS485.K25bS64 201504a954.62231 aSnedden, Christopher,eauthor.10aUnderstanding Kashmir and Kashmiris /cChristopher Snedden. 1aLondon :bHurst & Company,c2014. 1aLondon :bHurst & Company,c2015. 4c©2015 axix, 372 pages :bmaps ;c22 cm atext2rdacontent astill image2rdacontent aunmediated2rdamedia avolume2rdacarrier aIncludes bibliographical references (pages 331-355) and index. 0aJammu and Kashmir (India)xHistory. 0aJammu and Kashmir (India)xForeign relations. 0aJammu and Kashmir (India)vBoundaries. 0aJammu and Kashmir (India)xPolitics and government. a(OCoLC-M)858826393
@@ -24,12 +24,12 @@ Gem::Specification.new do |spec|
24
24
  spec.add_dependency "concurrent-ruby", ">= 0.8.0"
25
25
  spec.add_dependency "marc", "~> 1.0"
26
26
 
27
- spec.add_dependency "hashie", "~> 3.1" # used for Indexer#settings
28
- spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
27
+ spec.add_dependency "hashie", ">= 3.1", "< 5" # used for Indexer#settings
28
+ spec.add_dependency "slop", "~> 4.0" # command line parsing
29
29
  spec.add_dependency "yell" # logging
30
30
  spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
31
31
  spec.add_dependency "httpclient", "~> 2.5"
32
- spec.add_dependency "http", "~> 3.0" # used in oai_pmh_reader, may use more extensively in future instead of httpclient
32
+ spec.add_dependency "http", ">= 3.0", "< 5" # used in oai_pmh_reader, may use more extensively in future instead of httpclient
33
33
  spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
34
34
  spec.add_dependency "nokogiri", "~> 1.9" # NokogiriIndexer
35
35
 
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.1.0.rc1
4
+ version: 3.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Rochkind
8
8
  - Bill Dueber
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-04-10 00:00:00.000000000 Z
12
+ date: 2020-12-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: concurrent-ruby
@@ -43,34 +43,34 @@ dependencies:
43
43
  name: hashie
44
44
  requirement: !ruby/object:Gem::Requirement
45
45
  requirements:
46
- - - "~>"
46
+ - - ">="
47
47
  - !ruby/object:Gem::Version
48
48
  version: '3.1'
49
+ - - "<"
50
+ - !ruby/object:Gem::Version
51
+ version: '5'
49
52
  type: :runtime
50
53
  prerelease: false
51
54
  version_requirements: !ruby/object:Gem::Requirement
52
55
  requirements:
53
- - - "~>"
56
+ - - ">="
54
57
  - !ruby/object:Gem::Version
55
58
  version: '3.1'
59
+ - - "<"
60
+ - !ruby/object:Gem::Version
61
+ version: '5'
56
62
  - !ruby/object:Gem::Dependency
57
63
  name: slop
58
64
  requirement: !ruby/object:Gem::Requirement
59
65
  requirements:
60
- - - ">="
61
- - !ruby/object:Gem::Version
62
- version: 3.4.5
63
- - - "<"
66
+ - - "~>"
64
67
  - !ruby/object:Gem::Version
65
68
  version: '4.0'
66
69
  type: :runtime
67
70
  prerelease: false
68
71
  version_requirements: !ruby/object:Gem::Requirement
69
72
  requirements:
70
- - - ">="
71
- - !ruby/object:Gem::Version
72
- version: 3.4.5
73
- - - "<"
73
+ - - "~>"
74
74
  - !ruby/object:Gem::Version
75
75
  version: '4.0'
76
76
  - !ruby/object:Gem::Dependency
@@ -119,16 +119,22 @@ dependencies:
119
119
  name: http
120
120
  requirement: !ruby/object:Gem::Requirement
121
121
  requirements:
122
- - - "~>"
122
+ - - ">="
123
123
  - !ruby/object:Gem::Version
124
124
  version: '3.0'
125
+ - - "<"
126
+ - !ruby/object:Gem::Version
127
+ version: '5'
125
128
  type: :runtime
126
129
  prerelease: false
127
130
  version_requirements: !ruby/object:Gem::Requirement
128
131
  requirements:
129
- - - "~>"
132
+ - - ">="
130
133
  - !ruby/object:Gem::Version
131
134
  version: '3.0'
135
+ - - "<"
136
+ - !ruby/object:Gem::Version
137
+ version: '5'
132
138
  - !ruby/object:Gem::Dependency
133
139
  name: marc-fastxmlwriter
134
140
  requirement: !ruby/object:Gem::Requirement
@@ -219,7 +225,7 @@ dependencies:
219
225
  - - "~>"
220
226
  - !ruby/object:Gem::Version
221
227
  version: '3.4'
222
- description:
228
+ description:
223
229
  email:
224
230
  - none@nowhere.org
225
231
  executables:
@@ -234,8 +240,8 @@ extra_rdoc_files:
234
240
  - doc/settings.md
235
241
  - doc/xml.md
236
242
  files:
243
+ - ".github/workflows/ruby.yml"
237
244
  - ".gitignore"
238
- - ".travis.yml"
239
245
  - ".yardopts"
240
246
  - CHANGES.md
241
247
  - Gemfile
@@ -295,6 +301,7 @@ files:
295
301
  - lib/translation_maps/marc_geographic.yaml
296
302
  - lib/translation_maps/marc_instruments.yaml
297
303
  - lib/translation_maps/marc_languages.yaml
304
+ - test/command_line_test.rb
298
305
  - test/debug_writer_test.rb
299
306
  - test/delimited_writer_test.rb
300
307
  - test/experimental_nokogiri_streaming_reader_test.rb
@@ -330,6 +337,7 @@ files:
330
337
  - test/test_support/bad_subfield_code.marc
331
338
  - test/test_support/bad_utf_byte.utf8.marc
332
339
  - test/test_support/date_resort_to_260.marc
340
+ - test/test_support/date_resort_to_264.marc
333
341
  - test/test_support/date_type_r_missing_date2.marc
334
342
  - test/test_support/date_with_u.marc
335
343
  - test/test_support/demo_config.rb
@@ -377,7 +385,7 @@ homepage: http://github.com/traject/traject
377
385
  licenses:
378
386
  - MIT
379
387
  metadata: {}
380
- post_install_message:
388
+ post_install_message:
381
389
  rdoc_options: []
382
390
  require_paths:
383
391
  - lib
@@ -388,17 +396,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
388
396
  version: '0'
389
397
  required_rubygems_version: !ruby/object:Gem::Requirement
390
398
  requirements:
391
- - - ">"
399
+ - - ">="
392
400
  - !ruby/object:Gem::Version
393
- version: 1.3.1
401
+ version: '0'
394
402
  requirements: []
395
- rubyforge_project:
396
- rubygems_version: 2.7.6
397
- signing_key:
403
+ rubygems_version: 3.0.3
404
+ signing_key:
398
405
  specification_version: 4
399
406
  summary: An easy to use, high-performance, flexible and extensible metadata transformation
400
407
  system, focused on library-archives-museums input, and indexing to Solr as output.
401
408
  test_files:
409
+ - test/command_line_test.rb
402
410
  - test/debug_writer_test.rb
403
411
  - test/delimited_writer_test.rb
404
412
  - test/experimental_nokogiri_streaming_reader_test.rb
@@ -434,6 +442,7 @@ test_files:
434
442
  - test/test_support/bad_subfield_code.marc
435
443
  - test/test_support/bad_utf_byte.utf8.marc
436
444
  - test/test_support/date_resort_to_260.marc
445
+ - test/test_support/date_resort_to_264.marc
437
446
  - test/test_support/date_type_r_missing_date2.marc
438
447
  - test/test_support/date_with_u.marc
439
448
  - test/test_support/demo_config.rb