traject 3.5.0 → 3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +10 -1
- data/CHANGES.md +5 -1
- data/README.md +17 -1
- data/doc/xml.md +2 -0
- data/examples/marc/tiny.xml +35 -0
- data/lib/traject/debug_writer.rb +1 -1
- data/lib/traject/nokogiri_reader.rb +2 -0
- data/lib/traject/version.rb +1 -1
- data/test/command_line_test.rb +4 -3
- data/test/debug_writer_test.rb +13 -0
- data/test/indexer/read_write_test.rb +14 -3
- data/traject.gemspec +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2e47e6648ed9fc963d18e10c9be48a30273147c4920cb4b7e448d078fd2398ac
|
4
|
+
data.tar.gz: efa549ebcbd87e599b56b955b4bd26422dfe7de67697aed6b39cb421c3b80677
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6acdd2b8cfc888b221a1f19cd5197127006be81d0525169d531fc9bf43fe02cc9ec87401e6b2442c57ff0cd483d9884504ac75be92e3718cbbc49208dc97024f
|
7
|
+
data.tar.gz: 30abefa7af9e1c170ae8570aa59b6c571a9acc1eb7b0abf6efd64d97550b678c21c72d76a8156ef3844ab01154111fd3747f96d6346ee8a8d76e747b2cf92e1f
|
data/.github/workflows/ruby.yml
CHANGED
@@ -12,14 +12,23 @@ jobs:
|
|
12
12
|
strategy:
|
13
13
|
fail-fast: false
|
14
14
|
matrix:
|
15
|
-
ruby: [ '2.4', '2.5', '2.6', '2.7', 'jruby-9.1', 'jruby-9.2' ]
|
15
|
+
ruby: [ '2.4', '2.5', '2.6', '2.7', '3.0', 'jruby-9.1', 'jruby-9.2' ]
|
16
16
|
name: Ruby ${{ matrix.ruby }}
|
17
17
|
steps:
|
18
18
|
- uses: actions/checkout@v2
|
19
|
+
|
19
20
|
- name: Set up Ruby
|
20
21
|
uses: ruby/setup-ruby@v1
|
21
22
|
with:
|
22
23
|
ruby-version: ${{ matrix.ruby }}
|
24
|
+
|
25
|
+
- name: set JAVA_OPTS for jruby-9.1
|
26
|
+
run: echo 'JAVA_OPTS="--add-opens java.base/java.security.cert=ALL-UNNAMED --add-opens java.base/java.security=ALL-UNNAMED --add-opens java.base/java.util.zip=ALL-UNNAMED"' >> $GITHUB_ENV
|
27
|
+
if: ${{ matrix.ruby == 'jruby-9.1' }}
|
28
|
+
# https://github.com/jruby/jruby/issues/4834
|
29
|
+
# Still seems to be an issue in jruby-9.1, but not 9.2
|
30
|
+
# https://github.community/t/conditional-setting-of-env-variables-in-gh-actions/179650
|
31
|
+
|
23
32
|
- name: Install dependencies
|
24
33
|
run: bundle install --jobs 4 --retry 3
|
25
34
|
- name: Run tests
|
data/CHANGES.md
CHANGED
data/README.md
CHANGED
@@ -8,7 +8,7 @@ Traject can also be generalized to a set of tools for getting structured data fr
|
|
8
8
|
|
9
9
|
**Traject is stable, mature software, that is already being used in production by its authors and several other institutions.**
|
10
10
|
|
11
|
-
[](http://badge.fury.io/rb/traject)
|
12
12
|
[](https://github.com/traject/traject/actions?query=workflow%3ACI+branch%3Amaster)
|
13
13
|
|
14
14
|
|
@@ -468,6 +468,22 @@ Also see `-I load_path` option and suggestions for Bundler use under Extending W
|
|
468
468
|
See also [Hints for batch and cronjob use](./doc/batch_execution.md) of traject.
|
469
469
|
|
470
470
|
|
471
|
+
## A small but complete example
|
472
|
+
|
473
|
+
To process a MARC XML file with the data shown in [./examples/marc/tiny.xml](./examples/marc/tiny.xml) you can save the following configuration as `config.rb`:
|
474
|
+
|
475
|
+
```
|
476
|
+
to_field 'title', extract_marc('245a', first: true)
|
477
|
+
```
|
478
|
+
|
479
|
+
and run Traject as follows:
|
480
|
+
|
481
|
+
```
|
482
|
+
traject -t xml -c config.rb -w Traject::DebugWriter tiny.xml
|
483
|
+
```
|
484
|
+
|
485
|
+
`-t xml` indicates that the file is a MARC XML file. `-w Traject::DebugWriter` outputs the results to the console (i.e. without saving to Solr).
|
486
|
+
|
471
487
|
## Extending With Your Own Code
|
472
488
|
|
473
489
|
Traject config files are full live ruby files, where you can do anything,
|
data/doc/xml.md
CHANGED
@@ -4,6 +4,8 @@ The [NokogiriIndexer](../lib/traject/nokogiri_indexer.md) is a Traject::Indexer
|
|
4
4
|
|
5
5
|
It by default uses the NokogiriReader to read XML and read Nokogiri::XML::Documents, and includes the NokogiriMacros mix-in, with some macros for operating on Nokogiri::XML::Documents.
|
6
6
|
|
7
|
+
Please note that the recommended mechanism to parse MARC XML files with Traject is via the `-t` parameter (or via the `provide "marc_source.type", "xml"` setting). The documentation on this page is for those parsing other (non-MARC) XML files.
|
8
|
+
|
7
9
|
## On the command-line
|
8
10
|
|
9
11
|
You can tell the traject command-line to use the NokogiriIndexer with the `-i xml` flag:
|
@@ -0,0 +1,35 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<collection xmlns="http://www.loc.gov/MARC21/slim" xmlns:marc="http://www.loc.gov/MARC21/slim">
|
3
|
+
<record>
|
4
|
+
<leader>01352cam a2200349 a 4500</leader>
|
5
|
+
<datafield tag="245" ind1="0" ind2="0">
|
6
|
+
<subfield code="6">880-01</subfield>
|
7
|
+
<subfield code="a">Kazoku kankei no shakai shinrigaku /</subfield>
|
8
|
+
<subfield code="c">Osada Masayoshi hen.</subfield>
|
9
|
+
</datafield>
|
10
|
+
</record>
|
11
|
+
<record>
|
12
|
+
<leader>01121ccm a2200289z 4500</leader>
|
13
|
+
<datafield tag="245" ind1="1" ind2="0">
|
14
|
+
<subfield code="a">Powhatan's daughter :</subfield>
|
15
|
+
<subfield code="b">march</subfield>
|
16
|
+
</datafield>
|
17
|
+
<datafield tag="100" ind1="1" ind2=" ">
|
18
|
+
<subfield code="a">Sousa, John Philip,</subfield>
|
19
|
+
<subfield code="d">1854-1932,</subfield>
|
20
|
+
<subfield code="e">composer.</subfield>
|
21
|
+
</datafield>
|
22
|
+
</record>
|
23
|
+
<record>
|
24
|
+
<leader>01137cam a2200301 a 4500</leader>
|
25
|
+
<datafield tag="245" ind1="1" ind2="0">
|
26
|
+
<subfield code="a">Two pieces /</subfield>
|
27
|
+
<subfield code="c">by Frank O'Hara.</subfield>
|
28
|
+
</datafield>
|
29
|
+
<datafield tag="100" ind1="1" ind2=" ">
|
30
|
+
<subfield code="a">O'Hara, Frank,</subfield>
|
31
|
+
<subfield code="d">1926-1966.</subfield>
|
32
|
+
<subfield code="0">http://id.loc.gov/authorities/names/n79042130</subfield>
|
33
|
+
</datafield>
|
34
|
+
</record>
|
35
|
+
</collection>
|
data/lib/traject/debug_writer.rb
CHANGED
@@ -62,7 +62,7 @@ All records are assumed to have a unique id. You can set which field to look in
|
|
62
62
|
def serialize(context)
|
63
63
|
h = context.output_hash
|
64
64
|
rec_key = record_number(context)
|
65
|
-
lines = h.keys.sort.map { |k| @format % [rec_key, k, h[k].join(' | ')] }
|
65
|
+
lines = h.keys.sort.map { |k| @format % [rec_key, k, (h[k] || []).join(' | ')] }
|
66
66
|
lines.push "\n"
|
67
67
|
lines.join("\n")
|
68
68
|
end
|
data/lib/traject/version.rb
CHANGED
data/test/command_line_test.rb
CHANGED
@@ -22,15 +22,16 @@ describe "Shell out to command line" do
|
|
22
22
|
|
23
23
|
it "can display version" do
|
24
24
|
out, err, result = execute_with_args("-v")
|
25
|
+
|
26
|
+
assert result.success?, "Expected subprocess exit code to be success.\nSTDERR:\n#{err}\n\nSTDOUT:\n#{out}"
|
25
27
|
assert_equal err, "traject version #{Traject::VERSION}\n"
|
26
|
-
assert result.success?
|
27
28
|
end
|
28
29
|
|
29
30
|
it "can display help text" do
|
30
31
|
out, err, result = execute_with_args("-h")
|
31
32
|
|
33
|
+
assert result.success?, "Expected subprocess exit code to be success.\nSTDERR:\n#{err}\n\nSTDOUT:\n#{out}"
|
32
34
|
assert err.start_with?("traject [options] -c configuration.rb [-c config2.rb] file.mrc")
|
33
|
-
assert result.success?
|
34
35
|
end
|
35
36
|
|
36
37
|
it "handles bad argument" do
|
@@ -43,7 +44,7 @@ describe "Shell out to command line" do
|
|
43
44
|
it "does basic dry run" do
|
44
45
|
out, err, result = execute_with_args("--debug-mode -s one=two -s three=four -c test/test_support/demo_config.rb test/test_support/emptyish_record.marc")
|
45
46
|
|
46
|
-
assert result.success
|
47
|
+
assert result.success?, "Expected subprocess exit code to be success.\nSTDERR:\n#{err}\n\nSTDOUT:\n#{out}"
|
47
48
|
assert_includes err, "executing with: `--debug-mode -s one=two -s three=four"
|
48
49
|
assert_match /bib_1000165 +author_sort +Collection la/, out
|
49
50
|
end
|
data/test/debug_writer_test.rb
CHANGED
@@ -73,6 +73,19 @@ describe 'Simple output' do
|
|
73
73
|
|
74
74
|
end
|
75
75
|
|
76
|
+
it "deals ok with nil values" do
|
77
|
+
record_with_nil_value = {"id"=>["2710183"], "title"=>["Manufacturing consent : the political economy of the mass media /"], "xyz"=>nil}
|
78
|
+
@writer.put Traject::Indexer::Context.new(:output_hash => record_with_nil_value)
|
79
|
+
expected = [
|
80
|
+
"#{@id} id #{@id}",
|
81
|
+
"#{@id} title #{@title}",
|
82
|
+
"#{@id} xyz",
|
83
|
+
"\n"
|
84
|
+
]
|
85
|
+
assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
|
86
|
+
@writer.close
|
87
|
+
|
88
|
+
end
|
76
89
|
end
|
77
90
|
|
78
91
|
|
@@ -7,7 +7,8 @@ memory_writer_class = Class.new do
|
|
7
7
|
# store them in a class variable so we can test em later
|
8
8
|
# Supress the warning message
|
9
9
|
original_verbose, $VERBOSE = $VERBOSE, nil
|
10
|
-
|
10
|
+
@settings = settings
|
11
|
+
self.class.store_last_writer_settings(@settings)
|
11
12
|
# Activate warning messages again.
|
12
13
|
$VERBOSE = original_verbose
|
13
14
|
@settings["memory_writer.added"] = []
|
@@ -20,6 +21,16 @@ memory_writer_class = Class.new do
|
|
20
21
|
def close
|
21
22
|
@settings["memory_writer.closed"] = true
|
22
23
|
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
def self.store_last_writer_settings(settings)
|
28
|
+
@last_writer_settings = settings
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.last_writer_settings
|
32
|
+
@last_writer_settings
|
33
|
+
end
|
23
34
|
end
|
24
35
|
|
25
36
|
describe "Traject::Indexer#process" do
|
@@ -53,7 +64,7 @@ describe "Traject::Indexer#process" do
|
|
53
64
|
|
54
65
|
# Grab the settings out of a class variable where we left em,
|
55
66
|
# as a convenient place to store outcomes so we can test em.
|
56
|
-
writer_settings = memory_writer_class.
|
67
|
+
writer_settings = memory_writer_class.last_writer_settings
|
57
68
|
|
58
69
|
assert writer_settings["memory_writer.added"]
|
59
70
|
assert_equal 30, writer_settings["memory_writer.added"].length
|
@@ -146,7 +157,7 @@ describe "Traject::Indexer#process" do
|
|
146
157
|
it "parses and loads" do
|
147
158
|
@indexer.process([@file1, @file2])
|
148
159
|
# kinda ridic, yeah.
|
149
|
-
output_hashes = memory_writer_class.
|
160
|
+
output_hashes = memory_writer_class.last_writer_settings["memory_writer.added"].collect(&:output_hash)
|
150
161
|
|
151
162
|
assert_length 2, output_hashes
|
152
163
|
assert output_hashes.all? { |hash| hash["title"].length > 0 }
|
data/traject.gemspec
CHANGED
@@ -29,7 +29,7 @@ Gem::Specification.new do |spec|
|
|
29
29
|
spec.add_dependency "yell" # logging
|
30
30
|
spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
|
31
31
|
spec.add_dependency "httpclient", "~> 2.5"
|
32
|
-
spec.add_dependency "http", ">= 3.0", "<
|
32
|
+
spec.add_dependency "http", ">= 3.0", "< 6" # used in oai_pmh_reader, may use more extensively in future instead of httpclient
|
33
33
|
spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
|
34
34
|
spec.add_dependency "nokogiri", "~> 1.9" # NokogiriIndexer
|
35
35
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2021-06-21 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: concurrent-ruby
|
@@ -124,7 +124,7 @@ dependencies:
|
|
124
124
|
version: '3.0'
|
125
125
|
- - "<"
|
126
126
|
- !ruby/object:Gem::Version
|
127
|
-
version: '
|
127
|
+
version: '6'
|
128
128
|
type: :runtime
|
129
129
|
prerelease: false
|
130
130
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -134,7 +134,7 @@ dependencies:
|
|
134
134
|
version: '3.0'
|
135
135
|
- - "<"
|
136
136
|
- !ruby/object:Gem::Version
|
137
|
-
version: '
|
137
|
+
version: '6'
|
138
138
|
- !ruby/object:Gem::Dependency
|
139
139
|
name: marc-fastxmlwriter
|
140
140
|
requirement: !ruby/object:Gem::Requirement
|
@@ -257,6 +257,7 @@ files:
|
|
257
257
|
- doc/programmatic_use.md
|
258
258
|
- doc/settings.md
|
259
259
|
- doc/xml.md
|
260
|
+
- examples/marc/tiny.xml
|
260
261
|
- lib/tasks/load_maps.rake
|
261
262
|
- lib/traject.rb
|
262
263
|
- lib/traject/array_writer.rb
|