metacrunch-marcxml 2.0.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.circleci/config.yml +35 -0
- data/Gemfile +3 -2
- data/Readme.md +59 -14
- data/lib/metacrunch/marcxml.rb +10 -0
- data/lib/metacrunch/marcxml/document.rb +52 -10
- data/lib/metacrunch/marcxml/document/subfield_set.rb +0 -8
- data/lib/metacrunch/marcxml/parser.rb +4 -3
- data/lib/metacrunch/marcxml/version.rb +1 -1
- data/metacrunch-marcxml.gemspec +1 -1
- metadata +10 -11
- data/.travis.yml +0 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 3634c811258d11102cde40466592986814cb3ecf463985fd4ff139934539fe8b
|
4
|
+
data.tar.gz: 9b86c52e489af8bd05f698af04526d3c1561d67180fd37db88d797d351481ca9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 170cb959fb34cd493867c07e0d6ff3553385ccb7c7cca41beeee1f70208725921ea80f2a3765511fd87d330a1d61b90ab7167cf3a25df65797f6194960f8ec7b
|
7
|
+
data.tar.gz: ccfd0da9bfc9cb8d66dbf6fb632f0b7949fc0fb539586a9eff1ef2d148446f8d611ddea48346dbbb82052ff794ed6b8704fa5d3ed967a38c4c61f3b7eeabc33e
|
@@ -0,0 +1,35 @@
|
|
1
|
+
version: 2.1
|
2
|
+
orbs:
|
3
|
+
ruby: circleci/ruby@1.1.1
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
docker:
|
8
|
+
- image: circleci/ruby:2.6-node-browsers
|
9
|
+
|
10
|
+
working_directory: ~/repo
|
11
|
+
|
12
|
+
steps:
|
13
|
+
- checkout
|
14
|
+
|
15
|
+
- run:
|
16
|
+
name: Install dependencies
|
17
|
+
command: bundle install --jobs=4 --retry=3 --path vendor/bundle
|
18
|
+
|
19
|
+
- run:
|
20
|
+
name: Install CodeClimate test coverage reporter
|
21
|
+
command: |
|
22
|
+
curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
23
|
+
chmod +x ./cc-test-reporter
|
24
|
+
./cc-test-reporter before-build
|
25
|
+
|
26
|
+
- run:
|
27
|
+
name: Run tests
|
28
|
+
command: |
|
29
|
+
mkdir /tmp/test-results
|
30
|
+
bundle exec rspec --format progress --format RspecJunitFormatter --out /tmp/test-results/rspec.xml
|
31
|
+
|
32
|
+
- run:
|
33
|
+
name: Upload test coverage report to CodeClimate
|
34
|
+
command: ./cc-test-reporter after-build --exit-code $?
|
35
|
+
|
data/Gemfile
CHANGED
@@ -5,7 +5,6 @@ gemspec
|
|
5
5
|
group :development do
|
6
6
|
gem "bundler", ">= 1.15"
|
7
7
|
gem "rake", ">= 12.1"
|
8
|
-
gem "rspec", ">= 3.5.0", "< 4.0.0"
|
9
8
|
|
10
9
|
if !ENV["CI"]
|
11
10
|
gem "pry-byebug", ">= 3.5.0"
|
@@ -13,5 +12,7 @@ group :development do
|
|
13
12
|
end
|
14
13
|
|
15
14
|
group :test do
|
16
|
-
gem "
|
15
|
+
gem "rspec", ">= 3.5.0", "< 4.0.0"
|
16
|
+
gem "rspec_junit_formatter", ">= 0.3.0"
|
17
|
+
gem "simplecov", "= 0.17.1"
|
17
18
|
end
|
data/Readme.md
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
metacrunch-marcxml
|
2
2
|
==================
|
3
3
|
|
4
|
-
[](http://badge.fury.io/rb/metacrunch-marcxml)
|
5
5
|
[](https://codeclimate.com/github/ubpb/metacrunch-marcxml)
|
6
|
+
[](https://codeclimate.com/github/ubpb/metacrunch-marcxml/coverage)
|
7
|
+
[](https://circleci.com/gh/ubpb/metacrunch-marcxml)
|
6
8
|
|
7
9
|
This is the official [MARCXML](http://www.loc.gov/standards/marcxml/) package for the [metacrunch ETL toolkit](https://github.com/ubpb/metacrunch). It allows you to access MARCXML data by a simple and powerful Ruby API.
|
8
10
|
|
@@ -17,7 +19,7 @@ Installation
|
|
17
19
|
Include the gem in your `Gemfile`
|
18
20
|
|
19
21
|
```ruby
|
20
|
-
gem "metacrunch-marcxml", "~>
|
22
|
+
gem "metacrunch-marcxml", "~> 3.0.0"
|
21
23
|
```
|
22
24
|
|
23
25
|
and run `$ bundle install` to install it.
|
@@ -32,28 +34,33 @@ $ gem install metacrunch-marcxml
|
|
32
34
|
Usage example
|
33
35
|
-------------
|
34
36
|
|
35
|
-
|
37
|
+
*Note: For working examples on how to use this package in a metacrunch job check out our [demo repository](https://github.com/ubpb/metacrunch-demo).*
|
38
|
+
|
39
|
+
**Load the library**
|
40
|
+
|
36
41
|
```ruby
|
37
42
|
require "metacrunch/marcxml"
|
38
43
|
```
|
39
44
|
|
40
45
|
**Parsing a [MARCXML file](http://d-nb.info/982392028/about/marcxml)**
|
46
|
+
|
41
47
|
```ruby
|
42
48
|
# Load a MARCXML file (from a remote location in this example).
|
43
49
|
require "open-uri"
|
44
|
-
marcxml = open("http://d-nb.info/982392028/about/marcxml"){|io| io.read}
|
50
|
+
marcxml = URI.open("http://d-nb.info/982392028/about/marcxml"){|io| io.read}
|
45
51
|
|
46
52
|
# Now parse the file
|
47
53
|
document = Metacrunch::Marcxml.parse(marcxml)
|
48
|
-
# .. or
|
54
|
+
# .. or
|
55
|
+
document = Metacrunch::Marcxml[marcxml]
|
56
|
+
# .. or
|
49
57
|
document = Metacrunch::Marcxml(marcxml)
|
50
58
|
```
|
51
59
|
|
52
|
-
**
|
60
|
+
**Access control fields**
|
61
|
+
|
53
62
|
```ruby
|
54
63
|
controlfield = document.controlfield("005")
|
55
|
-
# same as ...
|
56
|
-
controlfield = document.controlfield(5)
|
57
64
|
# => #<Metacrunch::Marcxml::Document::Controlfield:0x007fd4c5120ec0 ...>
|
58
65
|
|
59
66
|
tag = controlfield.tag
|
@@ -62,16 +69,17 @@ value = controlfield.value
|
|
62
69
|
# => "20130926112144.0"
|
63
70
|
```
|
64
71
|
|
65
|
-
**
|
72
|
+
**Access data fields / sub fields**
|
73
|
+
|
66
74
|
```ruby
|
67
|
-
# Find fields matching tag=100 and
|
68
|
-
datafield_set = document.datafields(100, ind1: "1")
|
75
|
+
# Find fields matching tag=100 and ind1=1 (author)
|
76
|
+
datafield_set = document.datafields("100", ind1: "1")
|
69
77
|
# => #<Metacrunch::Marcxml::Document::DatafieldSet:0x007fd4c4ce4b40 ...>
|
70
78
|
|
71
79
|
first_author = datafield_set.first # set is an Enumerable
|
72
80
|
# => #<Metacrunch::Marcxml::Document::Datafield:0x007fd4c5129480 ...>
|
73
81
|
|
74
|
-
# Get the
|
82
|
+
# Get the sub fields matching code=a (author name)
|
75
83
|
subfield_set = first_author.subfields("a")
|
76
84
|
# => #<Metacrunch::Marcxml::Document::SubfieldSet:0x007fd4c4c779f0 ...>
|
77
85
|
|
@@ -82,8 +90,45 @@ first_author_subfield = subfield_set.first # subfield_set is an Enumerable
|
|
82
90
|
first_author_name = first_author_subfield.value
|
83
91
|
# => "Orwell, George"
|
84
92
|
|
85
|
-
# ...
|
86
|
-
first_author_name = document.datafields(100, ind1: "1").subfields("a").
|
93
|
+
# ... this can be a one liner
|
94
|
+
first_author_name = document.datafields(100, ind1: "1").subfields("a").values.first
|
95
|
+
```
|
96
|
+
|
97
|
+
**Direct value access using a query string**
|
98
|
+
|
99
|
+
Accessing fields as described above is flexible but very verbose. Most of the time you know your data and you are interested in a simple and direct way to access the field values.
|
100
|
+
|
101
|
+
For this case we provide a way to query field values using a simple query string.
|
102
|
+
|
103
|
+
```ruby
|
104
|
+
# Get the value of control field "005"
|
105
|
+
document["005"]
|
106
|
+
# => "20130926112144.0"
|
107
|
+
|
108
|
+
# Get the first value of data field tag=100, ind1=1, sub field code=a
|
109
|
+
document["1001*a"].first
|
110
|
+
# => "Orwell, George"
|
111
|
+
```
|
112
|
+
|
113
|
+
The query string syntax is simple. Each query string starts with the three characters of the tag. If the tag starts with `00`, it is treated as a query for a control field value. Otherwise it is treated as a data field / sub field query. In that case the next two characters are matched against ind1 and ind2. The default value is `*`, which matches every indicator value. `-`, `_` and ` ` are interpreted as `blank`. The remaining characters are matched against the codes of the sub fields. To query for more than one sub field code, separate the codes with commas.
|
114
|
+
|
115
|
+
**Examples**
|
116
|
+
|
117
|
+
```ruby
|
118
|
+
document["1001*a"]
|
119
|
+
# => ["Orwell, George"]
|
120
|
+
|
121
|
+
document["020**a,c"]
|
122
|
+
# => ["9783548267456",
|
123
|
+
# "kart. : EUR 6.00 (DE), EUR 6.20 (AT), sfr 11.00",
|
124
|
+
# "3548267459",
|
125
|
+
# "kart. : EUR 6.00 (DE), EUR 6.20 (AT), sfr 11.00"]
|
126
|
+
|
127
|
+
document["2463_a"]
|
128
|
+
# => ["Neunzehnhundertvierundachtzig"]
|
129
|
+
|
130
|
+
document["264_1a,b,c"]
|
131
|
+
# => ["Berlin", "Ullstein", "2007"]
|
87
132
|
```
|
88
133
|
|
89
134
|
License
|
data/lib/metacrunch/marcxml.rb
CHANGED
@@ -32,6 +32,16 @@ module Metacrunch
|
|
32
32
|
def parse(xml)
|
33
33
|
Parser.new.parse(xml)
|
34
34
|
end
|
35
|
+
|
36
|
+
#
# Shorthand for {Metacrunch::Marcxml#parse}, so that a document can be
# created with `Metacrunch::Marcxml[xml]`.
#
# @param xml [String] the MARCXML data, handed to `parse` unchanged.
# @return [Metacrunch::Marcxml::Document] the parsed {Metacrunch::Marcxml::Document}
# @see Metacrunch::Marcxml#parse
#
def [](xml)
  # Pure delegation: whatever `parse` returns is returned unchanged.
  parse(xml)
end
|
35
45
|
end
|
36
46
|
end
|
37
47
|
|
@@ -23,7 +23,7 @@ module Metacrunch
|
|
23
23
|
#
|
24
24
|
# @param tag [String, Integer] the tag of the control field. The tag can be
|
25
25
|
# a string or an integer.
|
26
|
-
# @return [Controlfield, nil] the control field with the matching tag or nil
|
26
|
+
# @return [Metacrunch::Marcxml::Document::Controlfield, nil] the control field with the matching tag or nil
|
27
27
|
# if a control field with a matching tag does not exist.
|
28
28
|
#
|
29
29
|
def controlfield(tag)
|
@@ -48,20 +48,18 @@ module Metacrunch
|
|
48
48
|
# to match all data fields. The tag can be a string or an integer. To filter for
|
49
49
|
# more than a single tag, `tag` also accepts any object that responds to `#each` like
|
50
50
|
# `Array` and `Range`.
|
51
|
-
#
|
52
|
-
# @param ind1 [String, nil, Array<String>] filter by indicator 1. Can be nil to match
|
51
|
+
# @param ind1 [nil, String, Array<String>] filter by indicator 1. Can be nil to match
|
53
52
|
# any indicator.
|
54
|
-
# @param ind2 [
|
53
|
+
# @param ind2 [nil, String, Array<String>] filter by indicator 2. Can be nil to match
|
55
54
|
# any indicator.
|
56
55
|
#
|
57
56
|
# @return [Metacrunch::Marcxml::Document::DatafieldSet] Set of data fields matching the
|
58
57
|
# given tag(s) and ind1/ind2. The set is empty if a matching field doesn't exist.
|
59
58
|
#
|
60
59
|
def datafields(tag = nil, ind1: nil, ind2: nil)
|
61
|
-
matched_datafields =
|
62
|
-
when nil
|
60
|
+
matched_datafields = if tag.nil?
|
63
61
|
@datafields_map.values.flatten(1)
|
64
|
-
|
62
|
+
elsif tag.is_a?(Enumerable)
|
65
63
|
tag.map{ |_tag| @datafields_map[normalize_tag(_tag)] }.compact.flatten(1)
|
66
64
|
else
|
67
65
|
@datafields_map[normalize_tag(tag)]
|
@@ -82,13 +80,57 @@ module Metacrunch
|
|
82
80
|
datafield
|
83
81
|
end
|
84
82
|
|
83
|
+
# ------------------------------------------------------------------------------
|
84
|
+
# Query API
|
85
|
+
# ------------------------------------------------------------------------------
|
86
|
+
|
87
|
+
#
# Returns a control field value or data field/sub field values matching the
# given query string.
#
# The first three characters of the query string are the tag. A query that
# starts with "00" is a control field query. Every other query is a data
# field / sub field query, where the tag may be followed by ind1, ind2 and a
# comma separated list of sub field codes (all three are optional).
#
# @example
#   document["005"]        # => "20130926112144.0"
#   document["1001*a"]     # => ["Orwell, George"]
#   document["264_1a,b,c"] # => ["Berlin", "Ullstein", "2007"]
#
# @param query_string [String] a query string.
#
# @return [String, nil] for a control field query: the value of the control
#   field or nil if a control field with that tag does not exist.
# @return [Array<String>] for a data field query: the sub field values
#   matching the query. Is empty if no match is found.
#
def [](query_string)
  # [0..2] => tag, e.g. "005" or "100".
  tag = query_string[0..2]

  # Control field query. Example: "005"
  return controlfield(tag)&.value if query_string.start_with?("00")

  # Data field / sub field query. Example: "100**a,e"
  #   [0..2]  => Data field tag (required).
  #   [3]     => Ind1, defaults to `*`, which matches any indicator 1 (optional).
  #   [4]     => Ind2, defaults to `*`, which matches any indicator 2 (optional).
  #   [5..-1] => Sub field code(s), separated by commas (optional).
  #
  # A blank tag is passed on as nil, exactly as before.
  tag = nil if tag.match?(/\A[[:space:]]*\z/)

  # The raw character is inspected on purpose: running it through a
  # blank-to-nil conversion first would turn " " into nil, so a space
  # indicator would act as a wildcard instead of `blank`.
  ind1, ind2 = [query_string[3], query_string[4]].map do |indicator|
    case indicator
    when nil, "*"      then nil     # missing or wildcard => match any indicator
    when " ", "-", "_" then :blank  # all three spellings mean `blank`
    else indicator
    end
  end

  # `query_string[5..-1]` is nil for queries shorter than five characters
  # (e.g. "100"). `to_s` keeps the chain nil-safe, because `&.` guards only
  # the single call that directly follows it.
  subfield_codes = query_string[5..-1].to_s.split(",").map(&:strip)
  subfield_codes = nil if subfield_codes.empty?

  datafields(tag, ind1: ind1, ind2: ind2).subfields(subfield_codes).values
end
|
126
|
+
|
85
127
|
private
|
86
128
|
|
87
129
|
def match_indicator(requested_ind, datafield_ind)
|
88
130
|
[*[requested_ind]].flatten.map do |_requested_ind|
|
89
131
|
if !_requested_ind
|
90
132
|
true
|
91
|
-
elsif _requested_ind == :blank && (datafield_ind == " " || datafield_ind == "-" || datafield_ind.nil?)
|
133
|
+
elsif _requested_ind == :blank && (datafield_ind == " " || datafield_ind == "-" || datafield_ind == "_" || datafield_ind.nil?)
|
92
134
|
true
|
93
135
|
elsif _requested_ind == datafield_ind
|
94
136
|
true
|
@@ -101,8 +143,8 @@ module Metacrunch
|
|
101
143
|
def normalize_tag(tag)
|
102
144
|
case tag
|
103
145
|
when Integer then tag.to_s.rjust(3, "0")
|
104
|
-
when String then tag
|
105
|
-
else tag.to_s
|
146
|
+
when String then tag[0..2]
|
147
|
+
else tag.to_s[0..2]
|
106
148
|
end
|
107
149
|
end
|
108
150
|
|
@@ -2,7 +2,7 @@ module Metacrunch
|
|
2
2
|
module Marcxml
|
3
3
|
class Parser < Ox::Sax
|
4
4
|
|
5
|
-
def parse(
|
5
|
+
def parse(marc_xml)
|
6
6
|
# initialize state machine
|
7
7
|
@in_controlfield = @in_datafield = @in_subfield = false
|
8
8
|
|
@@ -11,8 +11,9 @@ module Metacrunch
|
|
11
11
|
@html_entities_coder = HTMLEntities.new
|
12
12
|
|
13
13
|
# convert_special tells ox to convert some html entities already during
|
14
|
-
# parsing, which minifies the amount of entities we have to decode
|
15
|
-
|
14
|
+
# parsing, which minimizes the number of entities we have to decode using
|
15
|
+
# html_entities_coder in #text.
|
16
|
+
Ox.sax_parse(self, marc_xml, convert_special: true)
|
16
17
|
|
17
18
|
return @document
|
18
19
|
end
|
data/metacrunch-marcxml.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metacrunch-marcxml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- René Sprotte
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-10-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -44,23 +44,23 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '2.
|
47
|
+
version: '2.11'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '2.
|
55
|
-
description:
|
54
|
+
version: '2.11'
|
55
|
+
description:
|
56
56
|
email: r.sprotte@ub.uni-paderborn.de
|
57
57
|
executables: []
|
58
58
|
extensions: []
|
59
59
|
extra_rdoc_files: []
|
60
60
|
files:
|
61
|
+
- ".circleci/config.yml"
|
61
62
|
- ".gitignore"
|
62
63
|
- ".rspec"
|
63
|
-
- ".travis.yml"
|
64
64
|
- ".yardopts"
|
65
65
|
- Gemfile
|
66
66
|
- License.txt
|
@@ -81,7 +81,7 @@ homepage: http://github.com/ubpb/metacrunch-marcxml
|
|
81
81
|
licenses:
|
82
82
|
- MIT
|
83
83
|
metadata: {}
|
84
|
-
post_install_message:
|
84
|
+
post_install_message:
|
85
85
|
rdoc_options: []
|
86
86
|
require_paths:
|
87
87
|
- lib
|
@@ -96,9 +96,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
96
96
|
- !ruby/object:Gem::Version
|
97
97
|
version: '0'
|
98
98
|
requirements: []
|
99
|
-
|
100
|
-
|
101
|
-
signing_key:
|
99
|
+
rubygems_version: 3.1.4
|
100
|
+
signing_key:
|
102
101
|
specification_version: 4
|
103
102
|
summary: MARCXML package for the metacrunch ETL toolkit.
|
104
103
|
test_files: []
|