traject 3.5.0 → 3.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +10 -1
- data/CHANGES.md +5 -1
- data/README.md +17 -1
- data/doc/xml.md +2 -0
- data/examples/marc/tiny.xml +35 -0
- data/lib/traject/debug_writer.rb +1 -1
- data/lib/traject/nokogiri_reader.rb +2 -0
- data/lib/traject/version.rb +1 -1
- data/test/command_line_test.rb +4 -3
- data/test/debug_writer_test.rb +13 -0
- data/test/indexer/read_write_test.rb +14 -3
- data/traject.gemspec +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2e47e6648ed9fc963d18e10c9be48a30273147c4920cb4b7e448d078fd2398ac
|
4
|
+
data.tar.gz: efa549ebcbd87e599b56b955b4bd26422dfe7de67697aed6b39cb421c3b80677
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6acdd2b8cfc888b221a1f19cd5197127006be81d0525169d531fc9bf43fe02cc9ec87401e6b2442c57ff0cd483d9884504ac75be92e3718cbbc49208dc97024f
|
7
|
+
data.tar.gz: 30abefa7af9e1c170ae8570aa59b6c571a9acc1eb7b0abf6efd64d97550b678c21c72d76a8156ef3844ab01154111fd3747f96d6346ee8a8d76e747b2cf92e1f
|
data/.github/workflows/ruby.yml
CHANGED
@@ -12,14 +12,23 @@ jobs:
|
|
12
12
|
strategy:
|
13
13
|
fail-fast: false
|
14
14
|
matrix:
|
15
|
-
ruby: [ '2.4', '2.5', '2.6', '2.7', 'jruby-9.1', 'jruby-9.2' ]
|
15
|
+
ruby: [ '2.4', '2.5', '2.6', '2.7', '3.0', 'jruby-9.1', 'jruby-9.2' ]
|
16
16
|
name: Ruby ${{ matrix.ruby }}
|
17
17
|
steps:
|
18
18
|
- uses: actions/checkout@v2
|
19
|
+
|
19
20
|
- name: Set up Ruby
|
20
21
|
uses: ruby/setup-ruby@v1
|
21
22
|
with:
|
22
23
|
ruby-version: ${{ matrix.ruby }}
|
24
|
+
|
25
|
+
- name: set JAVA_OPTS for jruby-9.1
|
26
|
+
run: echo 'JAVA_OPTS="--add-opens java.base/java.security.cert=ALL-UNNAMED --add-opens java.base/java.security=ALL-UNNAMED --add-opens java.base/java.util.zip=ALL-UNNAMED"' >> $GITHUB_ENV
|
27
|
+
if: ${{ matrix.ruby == 'jruby-9.1' }}
|
28
|
+
# https://github.com/jruby/jruby/issues/4834
|
29
|
+
# Still seems to be an issue in jruby-9.1, but not 9.2
|
30
|
+
# https://github.community/t/conditional-setting-of-env-variables-in-gh-actions/179650
|
31
|
+
|
23
32
|
- name: Install dependencies
|
24
33
|
run: bundle install --jobs 4 --retry 3
|
25
34
|
- name: Run tests
|
data/CHANGES.md
CHANGED
data/README.md
CHANGED
@@ -8,7 +8,7 @@ Traject can also be generalized to a set of tools for getting structured data fr
|
|
8
8
|
|
9
9
|
**Traject is stable, mature software, that is already being used in production by its authors and several other institutions.**
|
10
10
|
|
11
|
-
[![Gem Version](https://badge.fury.io/rb/traject.
|
11
|
+
[![Gem Version](https://badge.fury.io/rb/traject.svg)](http://badge.fury.io/rb/traject)
|
12
12
|
[![CI Status](https://github.com/traject/traject/workflows/CI/badge.svg?branch=master)](https://github.com/traject/traject/actions?query=workflow%3ACI+branch%3Amaster)
|
13
13
|
|
14
14
|
|
@@ -468,6 +468,22 @@ Also see `-I load_path` option and suggestions for Bundler use under Extending W
|
|
468
468
|
See also [Hints for batch and cronjob use](./doc/batch_execution.md) of traject.
|
469
469
|
|
470
470
|
|
471
|
+
## A small but complete example
|
472
|
+
|
473
|
+
To process a MARC XML file with the data shown in [./examples/marc/tiny.xml](./examples/marc/tiny.xml) you can save the following configuration as `config.rb`:
|
474
|
+
|
475
|
+
```
|
476
|
+
to_field 'title', extract_marc('245a', first: true)
|
477
|
+
```
|
478
|
+
|
479
|
+
and run Traject as follows:
|
480
|
+
|
481
|
+
```
|
482
|
+
traject -t xml -c config.rb -w Traject::DebugWriter tiny.xml
|
483
|
+
```
|
484
|
+
|
485
|
+
`-t xml` indicates that the file is a MARC XML file. `-w Traject::DebugWriter` outputs the results to the console (i.e. without saving to Solr).
|
486
|
+
|
471
487
|
## Extending With Your Own Code
|
472
488
|
|
473
489
|
Traject config files are full live ruby files, where you can do anything,
|
data/doc/xml.md
CHANGED
@@ -4,6 +4,8 @@ The [NokogiriIndexer](../lib/traject/nokogiri_indexer.md) is a Traject::Indexer
|
|
4
4
|
|
5
5
|
It by default uses the NokogiriReader to read XML and read Nokogiri::XML::Documents, and includes the NokogiriMacros mix-in, with some macros for operating on Nokogiri::XML::Documents.
|
6
6
|
|
7
|
+
Please note that the recommended mechanism to parse MARC XML files with Traject is via the `-t` parameter (or via the `provide "marc_source.type", "xml"` setting). The documentation on this page is for those parsing other (non-MARC) XML files.
|
8
|
+
|
7
9
|
## On the command-line
|
8
10
|
|
9
11
|
You can tell the traject command-line to use the NokogiriIndexer with the `-i xml` flag:
|
@@ -0,0 +1,35 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<collection xmlns="http://www.loc.gov/MARC21/slim" xmlns:marc="http://www.loc.gov/MARC21/slim">
|
3
|
+
<record>
|
4
|
+
<leader>01352cam a2200349 a 4500</leader>
|
5
|
+
<datafield tag="245" ind1="0" ind2="0">
|
6
|
+
<subfield code="6">880-01</subfield>
|
7
|
+
<subfield code="a">Kazoku kankei no shakai shinrigaku /</subfield>
|
8
|
+
<subfield code="c">Osada Masayoshi hen.</subfield>
|
9
|
+
</datafield>
|
10
|
+
</record>
|
11
|
+
<record>
|
12
|
+
<leader>01121ccm a2200289z 4500</leader>
|
13
|
+
<datafield tag="245" ind1="1" ind2="0">
|
14
|
+
<subfield code="a">Powhatan's daughter :</subfield>
|
15
|
+
<subfield code="b">march</subfield>
|
16
|
+
</datafield>
|
17
|
+
<datafield tag="100" ind1="1" ind2=" ">
|
18
|
+
<subfield code="a">Sousa, John Philip,</subfield>
|
19
|
+
<subfield code="d">1854-1932,</subfield>
|
20
|
+
<subfield code="e">composer.</subfield>
|
21
|
+
</datafield>
|
22
|
+
</record>
|
23
|
+
<record>
|
24
|
+
<leader>01137cam a2200301 a 4500</leader>
|
25
|
+
<datafield tag="245" ind1="1" ind2="0">
|
26
|
+
<subfield code="a">Two pieces /</subfield>
|
27
|
+
<subfield code="c">by Frank O'Hara.</subfield>
|
28
|
+
</datafield>
|
29
|
+
<datafield tag="100" ind1="1" ind2=" ">
|
30
|
+
<subfield code="a">O'Hara, Frank,</subfield>
|
31
|
+
<subfield code="d">1926-1966.</subfield>
|
32
|
+
<subfield code="0">http://id.loc.gov/authorities/names/n79042130</subfield>
|
33
|
+
</datafield>
|
34
|
+
</record>
|
35
|
+
</collection>
|
data/lib/traject/debug_writer.rb
CHANGED
@@ -62,7 +62,7 @@ All records are assumed to have a unique id. You can set which field to look in
|
|
62
62
|
def serialize(context)
|
63
63
|
h = context.output_hash
|
64
64
|
rec_key = record_number(context)
|
65
|
-
lines = h.keys.sort.map { |k| @format % [rec_key, k, h[k].join(' | ')] }
|
65
|
+
lines = h.keys.sort.map { |k| @format % [rec_key, k, (h[k] || []).join(' | ')] }
|
66
66
|
lines.push "\n"
|
67
67
|
lines.join("\n")
|
68
68
|
end
|
data/lib/traject/version.rb
CHANGED
data/test/command_line_test.rb
CHANGED
@@ -22,15 +22,16 @@ describe "Shell out to command line" do
|
|
22
22
|
|
23
23
|
it "can display version" do
|
24
24
|
out, err, result = execute_with_args("-v")
|
25
|
+
|
26
|
+
assert result.success?, "Expected subprocess exit code to be success.\nSTDERR:\n#{err}\n\nSTDOUT:\n#{out}"
|
25
27
|
assert_equal err, "traject version #{Traject::VERSION}\n"
|
26
|
-
assert result.success?
|
27
28
|
end
|
28
29
|
|
29
30
|
it "can display help text" do
|
30
31
|
out, err, result = execute_with_args("-h")
|
31
32
|
|
33
|
+
assert result.success?, "Expected subprocess exit code to be success.\nSTDERR:\n#{err}\n\nSTDOUT:\n#{out}"
|
32
34
|
assert err.start_with?("traject [options] -c configuration.rb [-c config2.rb] file.mrc")
|
33
|
-
assert result.success?
|
34
35
|
end
|
35
36
|
|
36
37
|
it "handles bad argument" do
|
@@ -43,7 +44,7 @@ describe "Shell out to command line" do
|
|
43
44
|
it "does basic dry run" do
|
44
45
|
out, err, result = execute_with_args("--debug-mode -s one=two -s three=four -c test/test_support/demo_config.rb test/test_support/emptyish_record.marc")
|
45
46
|
|
46
|
-
assert result.success
|
47
|
+
assert result.success?, "Expected subprocess exit code to be success.\nSTDERR:\n#{err}\n\nSTDOUT:\n#{out}"
|
47
48
|
assert_includes err, "executing with: `--debug-mode -s one=two -s three=four"
|
48
49
|
assert_match /bib_1000165 +author_sort +Collection la/, out
|
49
50
|
end
|
data/test/debug_writer_test.rb
CHANGED
@@ -73,6 +73,19 @@ describe 'Simple output' do
|
|
73
73
|
|
74
74
|
end
|
75
75
|
|
76
|
+
it "deals ok with nil values" do
|
77
|
+
record_with_nil_value = {"id"=>["2710183"], "title"=>["Manufacturing consent : the political economy of the mass media /"], "xyz"=>nil}
|
78
|
+
@writer.put Traject::Indexer::Context.new(:output_hash => record_with_nil_value)
|
79
|
+
expected = [
|
80
|
+
"#{@id} id #{@id}",
|
81
|
+
"#{@id} title #{@title}",
|
82
|
+
"#{@id} xyz",
|
83
|
+
"\n"
|
84
|
+
]
|
85
|
+
assert_equal expected.join("\n").gsub(/\s/, ''), @io.string.gsub(/\s/, '')
|
86
|
+
@writer.close
|
87
|
+
|
88
|
+
end
|
76
89
|
end
|
77
90
|
|
78
91
|
|
@@ -7,7 +7,8 @@ memory_writer_class = Class.new do
|
|
7
7
|
# store them in a class variable so we can test em later
|
8
8
|
# Suppress the warning message
|
9
9
|
original_verbose, $VERBOSE = $VERBOSE, nil
|
10
|
-
|
10
|
+
@settings = settings
|
11
|
+
self.class.store_last_writer_settings(@settings)
|
11
12
|
# Activate warning messages again.
|
12
13
|
$VERBOSE = original_verbose
|
13
14
|
@settings["memory_writer.added"] = []
|
@@ -20,6 +21,16 @@ memory_writer_class = Class.new do
|
|
20
21
|
def close
|
21
22
|
@settings["memory_writer.closed"] = true
|
22
23
|
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
def self.store_last_writer_settings(settings)
|
28
|
+
@last_writer_settings = settings
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.last_writer_settings
|
32
|
+
@last_writer_settings
|
33
|
+
end
|
23
34
|
end
|
24
35
|
|
25
36
|
describe "Traject::Indexer#process" do
|
@@ -53,7 +64,7 @@ describe "Traject::Indexer#process" do
|
|
53
64
|
|
54
65
|
# Grab the settings out of a class variable where we left em,
|
55
66
|
# as a convenient place to store outcomes so we can test em.
|
56
|
-
writer_settings = memory_writer_class.
|
67
|
+
writer_settings = memory_writer_class.last_writer_settings
|
57
68
|
|
58
69
|
assert writer_settings["memory_writer.added"]
|
59
70
|
assert_equal 30, writer_settings["memory_writer.added"].length
|
@@ -146,7 +157,7 @@ describe "Traject::Indexer#process" do
|
|
146
157
|
it "parses and loads" do
|
147
158
|
@indexer.process([@file1, @file2])
|
148
159
|
# kinda ridic, yeah.
|
149
|
-
output_hashes = memory_writer_class.
|
160
|
+
output_hashes = memory_writer_class.last_writer_settings["memory_writer.added"].collect(&:output_hash)
|
150
161
|
|
151
162
|
assert_length 2, output_hashes
|
152
163
|
assert output_hashes.all? { |hash| hash["title"].length > 0 }
|
data/traject.gemspec
CHANGED
@@ -29,7 +29,7 @@ Gem::Specification.new do |spec|
|
|
29
29
|
spec.add_dependency "yell" # logging
|
30
30
|
spec.add_dependency "dot-properties", ">= 0.1.1" # reading java style .properties
|
31
31
|
spec.add_dependency "httpclient", "~> 2.5"
|
32
|
-
spec.add_dependency "http", ">= 3.0", "<
|
32
|
+
spec.add_dependency "http", ">= 3.0", "< 6" # used in oai_pmh_reader, may use more extensively in future instead of httpclient
|
33
33
|
spec.add_dependency 'marc-fastxmlwriter', '~>1.0' # fast marc->xml
|
34
34
|
spec.add_dependency "nokogiri", "~> 1.9" # NokogiriIndexer
|
35
35
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: traject
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.
|
4
|
+
version: 3.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Rochkind
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2021-06-21 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: concurrent-ruby
|
@@ -124,7 +124,7 @@ dependencies:
|
|
124
124
|
version: '3.0'
|
125
125
|
- - "<"
|
126
126
|
- !ruby/object:Gem::Version
|
127
|
-
version: '
|
127
|
+
version: '6'
|
128
128
|
type: :runtime
|
129
129
|
prerelease: false
|
130
130
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -134,7 +134,7 @@ dependencies:
|
|
134
134
|
version: '3.0'
|
135
135
|
- - "<"
|
136
136
|
- !ruby/object:Gem::Version
|
137
|
-
version: '
|
137
|
+
version: '6'
|
138
138
|
- !ruby/object:Gem::Dependency
|
139
139
|
name: marc-fastxmlwriter
|
140
140
|
requirement: !ruby/object:Gem::Requirement
|
@@ -257,6 +257,7 @@ files:
|
|
257
257
|
- doc/programmatic_use.md
|
258
258
|
- doc/settings.md
|
259
259
|
- doc/xml.md
|
260
|
+
- examples/marc/tiny.xml
|
260
261
|
- lib/tasks/load_maps.rake
|
261
262
|
- lib/traject.rb
|
262
263
|
- lib/traject/array_writer.rb
|