metacrunch-marcxml 2.0.0 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.circleci/config.yml +35 -0
- data/Gemfile +3 -2
- data/Readme.md +59 -14
- data/lib/metacrunch/marcxml.rb +10 -0
- data/lib/metacrunch/marcxml/document.rb +52 -10
- data/lib/metacrunch/marcxml/document/subfield_set.rb +0 -8
- data/lib/metacrunch/marcxml/parser.rb +4 -3
- data/lib/metacrunch/marcxml/version.rb +1 -1
- data/metacrunch-marcxml.gemspec +1 -1
- metadata +10 -11
- data/.travis.yml +0 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 3634c811258d11102cde40466592986814cb3ecf463985fd4ff139934539fe8b
|
4
|
+
data.tar.gz: 9b86c52e489af8bd05f698af04526d3c1561d67180fd37db88d797d351481ca9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 170cb959fb34cd493867c07e0d6ff3553385ccb7c7cca41beeee1f70208725921ea80f2a3765511fd87d330a1d61b90ab7167cf3a25df65797f6194960f8ec7b
|
7
|
+
data.tar.gz: ccfd0da9bfc9cb8d66dbf6fb632f0b7949fc0fb539586a9eff1ef2d148446f8d611ddea48346dbbb82052ff794ed6b8704fa5d3ed967a38c4c61f3b7eeabc33e
|
@@ -0,0 +1,35 @@
|
|
1
|
+
version: 2.1
|
2
|
+
orbs:
|
3
|
+
ruby: circleci/ruby@1.1.1
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
build:
|
7
|
+
docker:
|
8
|
+
- image: circleci/ruby:2.6-node-browsers
|
9
|
+
|
10
|
+
working_directory: ~/repo
|
11
|
+
|
12
|
+
steps:
|
13
|
+
- checkout
|
14
|
+
|
15
|
+
- run:
|
16
|
+
name: Install dependencies
|
17
|
+
command: bundle install --jobs=4 --retry=3 --path vendor/bundle
|
18
|
+
|
19
|
+
- run:
|
20
|
+
name: Install CodeClimate test coverage reporter
|
21
|
+
command: |
|
22
|
+
curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
|
23
|
+
chmod +x ./cc-test-reporter
|
24
|
+
./cc-test-reporter before-build
|
25
|
+
|
26
|
+
- run:
|
27
|
+
name: Run tests
|
28
|
+
command: |
|
29
|
+
mkdir /tmp/test-results
|
30
|
+
bundle exec rspec --format progress --format RspecJunitFormatter --out /tmp/test-results/rspec.xml
|
31
|
+
|
32
|
+
- run:
|
33
|
+
name: Upload test coverage report to CodeClimate
|
34
|
+
command: ./cc-test-reporter after-build --exit-code $?
|
35
|
+
|
data/Gemfile
CHANGED
@@ -5,7 +5,6 @@ gemspec
|
|
5
5
|
group :development do
|
6
6
|
gem "bundler", ">= 1.15"
|
7
7
|
gem "rake", ">= 12.1"
|
8
|
-
gem "rspec", ">= 3.5.0", "< 4.0.0"
|
9
8
|
|
10
9
|
if !ENV["CI"]
|
11
10
|
gem "pry-byebug", ">= 3.5.0"
|
@@ -13,5 +12,7 @@ group :development do
|
|
13
12
|
end
|
14
13
|
|
15
14
|
group :test do
|
16
|
-
gem "
|
15
|
+
gem "rspec", ">= 3.5.0", "< 4.0.0"
|
16
|
+
gem "rspec_junit_formatter", ">= 0.3.0"
|
17
|
+
gem "simplecov", "= 0.17.1"
|
17
18
|
end
|
data/Readme.md
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
metacrunch-marcxml
|
2
2
|
==================
|
3
3
|
|
4
|
-
[![
|
4
|
+
[![Gem Version](https://badge.fury.io/rb/metacrunch-marcxml.svg)](http://badge.fury.io/rb/metacrunch-marcxml)
|
5
5
|
[![Code Climate](https://codeclimate.com/github/ubpb/metacrunch-marcxml/badges/gpa.svg)](https://codeclimate.com/github/ubpb/metacrunch-marcxml)
|
6
|
+
[![Test Coverage](https://codeclimate.com/github/ubpb/metacrunch-marcxml/badges/coverage.svg)](https://codeclimate.com/github/ubpb/metacrunch-marcxml/coverage)
|
7
|
+
[![CircleCI](https://circleci.com/gh/ubpb/metacrunch-marcxml.svg?style=svg)](https://circleci.com/gh/ubpb/metacrunch-marcxml)
|
6
8
|
|
7
9
|
This is the official [MARCXML](http://www.loc.gov/standards/marcxml/) package for the [metacrunch ETL toolkit](https://github.com/ubpb/metacrunch). It allows you to access MARCXML data by a simple and powerful Ruby API.
|
8
10
|
|
@@ -17,7 +19,7 @@ Installation
|
|
17
19
|
Include the gem in your `Gemfile`
|
18
20
|
|
19
21
|
```ruby
|
20
|
-
gem "metacrunch-marcxml", "~>
|
22
|
+
gem "metacrunch-marcxml", "~> 3.0.0"
|
21
23
|
```
|
22
24
|
|
23
25
|
and run `$ bundle install` to install it.
|
@@ -32,28 +34,33 @@ $ gem install metacrunch-marcxml
|
|
32
34
|
Usage example
|
33
35
|
-------------
|
34
36
|
|
35
|
-
|
37
|
+
*Note: For working examples of how to use this package in a metacrunch job, check out our [demo repository](https://github.com/ubpb/metacrunch-demo).*
|
38
|
+
|
39
|
+
**Load the library**
|
40
|
+
|
36
41
|
```ruby
|
37
42
|
require "metacrunch/marcxml"
|
38
43
|
```
|
39
44
|
|
40
45
|
**Parsing a [MARCXML file](http://d-nb.info/982392028/about/marcxml)**
|
46
|
+
|
41
47
|
```ruby
|
42
48
|
# Load a MARCXML file (from a remote location in this example).
|
43
49
|
require "open-uri"
|
44
|
-
marcxml = open("http://d-nb.info/982392028/about/marcxml"){|io| io.read}
|
50
|
+
marcxml = URI.open("http://d-nb.info/982392028/about/marcxml"){|io| io.read}
|
45
51
|
|
46
52
|
# Now parse the file
|
47
53
|
document = Metacrunch::Marcxml.parse(marcxml)
|
48
|
-
# .. or
|
54
|
+
# .. or
|
55
|
+
document = Metacrunch::Marcxml[marcxml]
|
56
|
+
# .. or
|
49
57
|
document = Metacrunch::Marcxml(marcxml)
|
50
58
|
```
|
51
59
|
|
52
|
-
**
|
60
|
+
**Access control fields**
|
61
|
+
|
53
62
|
```ruby
|
54
63
|
controlfield = document.controlfield("005")
|
55
|
-
# same as ...
|
56
|
-
controlfield = document.controlfield(5)
|
57
64
|
# => #<Metacrunch::Marcxml::Document::Controlfield:0x007fd4c5120ec0 ...>
|
58
65
|
|
59
66
|
tag = controlfield.tag
|
@@ -62,16 +69,17 @@ value = controlfield.value
|
|
62
69
|
# => "20130926112144.0"
|
63
70
|
```
|
64
71
|
|
65
|
-
**
|
72
|
+
**Access data fields / sub fields**
|
73
|
+
|
66
74
|
```ruby
|
67
|
-
# Find fields matching tag=100 and
|
68
|
-
datafield_set = document.datafields(100, ind1: "1")
|
75
|
+
# Find fields matching tag=100 and ind1=1 (author)
|
76
|
+
datafield_set = document.datafields("100", ind1: "1")
|
69
77
|
# => #<Metacrunch::Marcxml::Document::DatafieldSet:0x007fd4c4ce4b40 ...>
|
70
78
|
|
71
79
|
first_author = datafield_set.first # set is an Enumerable
|
72
80
|
# => #<Metacrunch::Marcxml::Document::Datafield:0x007fd4c5129480 ...>
|
73
81
|
|
74
|
-
# Get the
|
82
|
+
# Get the sub fields matching code=a (author name)
|
75
83
|
subfield_set = first_author.subfields("a")
|
76
84
|
# => #<Metacrunch::Marcxml::Document::SubfieldSet:0x007fd4c4c779f0 ...>
|
77
85
|
|
@@ -82,8 +90,45 @@ first_author_subfield = subfield_set.first # subfield_set is an Enumerable
|
|
82
90
|
first_author_name = first_author_subfield.value
|
83
91
|
# => "Orwell, George"
|
84
92
|
|
85
|
-
# ...
|
86
|
-
first_author_name = document.datafields(100, ind1: "1").subfields("a").
|
93
|
+
# ... this can be a one liner
|
94
|
+
first_author_name = document.datafields(100, ind1: "1").subfields("a").values.first
|
95
|
+
```
|
96
|
+
|
97
|
+
**Direct value access using a query string**
|
98
|
+
|
99
|
+
Accessing fields as described above is flexible but very verbose. Most of the time you know your data and you are interested in a simple and direct way to access the field values.
|
100
|
+
|
101
|
+
For this case we provide a way to query field values using a simple query string.
|
102
|
+
|
103
|
+
```ruby
|
104
|
+
# Get the value of control field "005"
|
105
|
+
document["005"]
|
106
|
+
# => "20130926112144.0"
|
107
|
+
|
108
|
+
# Get the first value of data field tag=100, ind1=1, sub field code=a
|
109
|
+
document["1001*a"].first
|
110
|
+
# => "Orwell, George"
|
111
|
+
```
|
112
|
+
|
113
|
+
The query string syntax is simple. Each query string starts with three characters for the tag. If the tag starts with `00` it is considered a query for a control field value. Otherwise it is considered a data field / sub field query. In that case the next two characters are used to match ind1 and ind2. The default value is `*` which matches every indicator value. `-`, `_` and ` ` are interpreted as `blank`. The last characters are used to match the code of the sub fields. To query for more than one sub field code you may separate them using commas.
|
114
|
+
|
115
|
+
**Examples**
|
116
|
+
|
117
|
+
```ruby
|
118
|
+
document["1001*a"]
|
119
|
+
# => ["Orwell, George"]
|
120
|
+
|
121
|
+
document["020**a,c"]
|
122
|
+
# => ["9783548267456",
|
123
|
+
# "kart. : EUR 6.00 (DE), EUR 6.20 (AT), sfr 11.00",
|
124
|
+
# "3548267459",
|
125
|
+
# "kart. : EUR 6.00 (DE), EUR 6.20 (AT), sfr 11.00"]
|
126
|
+
|
127
|
+
document["2463_a"]
|
128
|
+
# => ["Neunzehnhundertvierundachtzig"]
|
129
|
+
|
130
|
+
document["264_1a,b,c"]
|
131
|
+
# => ["Berlin", "Ullstein", "2007"]
|
87
132
|
```
|
88
133
|
|
89
134
|
License
|
data/lib/metacrunch/marcxml.rb
CHANGED
@@ -32,6 +32,16 @@ module Metacrunch
|
|
32
32
|
def parse(xml)
|
33
33
|
Parser.new.parse(xml)
|
34
34
|
end
|
35
|
+
|
36
|
+
#
|
37
|
+
# Convenience method for Metacrunch::Marcxml.parse
|
38
|
+
#
|
39
|
+
# @return [Metacrunch::Marcxml::Document] the parsed {Metacrunch::Marcxml::Document}
|
40
|
+
# @see Metacrunch::Marcxml#parse
|
41
|
+
#
|
42
|
+
def [](xml)
|
43
|
+
self.parse(xml)
|
44
|
+
end
|
35
45
|
end
|
36
46
|
end
|
37
47
|
|
@@ -23,7 +23,7 @@ module Metacrunch
|
|
23
23
|
#
|
24
24
|
# @param tag [String, Integer] the tag of the control field. The tag can be
|
25
25
|
# a string or an integer.
|
26
|
-
# @return [Controlfield, nil] the control field with the matching tag or nil
|
26
|
+
# @return [Metacrunch::Marcxml::Document::Controlfield, nil] the control field with the matching tag or nil
|
27
27
|
# if a control field with a matching tag does not exist.
|
28
28
|
#
|
29
29
|
def controlfield(tag)
|
@@ -48,20 +48,18 @@ module Metacrunch
|
|
48
48
|
# to match all data fields. The tag can be a string or an integer. To filter for
|
49
49
|
# more than a single tag, `tag` also accepts any object that responds to `#each` like
|
50
50
|
# `Array` and `Range`.
|
51
|
-
#
|
52
|
-
# @param ind1 [String, nil, Array<String>] filter by indicator 1. Can be nil to match
|
51
|
+
# @param ind1 [nil, String, Array<String>] filter by indicator 1. Can be nil to match
|
53
52
|
# any indicator.
|
54
|
-
# @param ind2 [
|
53
|
+
# @param ind2 [nil, String, Array<String>] filter by indicator 2. Can be nil to match
|
55
54
|
# any indicator.
|
56
55
|
#
|
57
56
|
# @return [Metacrunch::Marcxml::Document::DatafieldSet] Set of data fields matching the
|
58
57
|
# given tag(s) and ind1/ind2. The set is empty if a matching field doesn't exist.
|
59
58
|
#
|
60
59
|
def datafields(tag = nil, ind1: nil, ind2: nil)
|
61
|
-
matched_datafields =
|
62
|
-
when nil
|
60
|
+
matched_datafields = if tag.nil?
|
63
61
|
@datafields_map.values.flatten(1)
|
64
|
-
|
62
|
+
elsif tag.is_a?(Enumerable)
|
65
63
|
tag.map{ |_tag| @datafields_map[normalize_tag(_tag)] }.compact.flatten(1)
|
66
64
|
else
|
67
65
|
@datafields_map[normalize_tag(tag)]
|
@@ -82,13 +80,57 @@ module Metacrunch
|
|
82
80
|
datafield
|
83
81
|
end
|
84
82
|
|
83
|
+
# ------------------------------------------------------------------------------
|
84
|
+
# Query API
|
85
|
+
# ------------------------------------------------------------------------------
|
86
|
+
|
87
|
+
#
|
88
|
+
# Returns a control field value or data field/sub field values matching the
|
89
|
+
# given query string.
|
90
|
+
#
|
91
|
+
# @param query_string [String] a query string.
|
92
|
+
#
|
93
|
+
# @return [Array<String>] The sub field values matching the query. Is empty if no match
|
94
|
+
# is found.
|
95
|
+
#
|
96
|
+
def [](query_string)
|
97
|
+
# Control field query
|
98
|
+
if query_string.starts_with?("00")
|
99
|
+
# Example: "005"
|
100
|
+
# [0..2] => Control field tag
|
101
|
+
tag = query_string[0..2].presence
|
102
|
+
controlfield(tag)&.value
|
103
|
+
|
104
|
+
# Data field / sub field query
|
105
|
+
else
|
106
|
+
# Example: "100**a,e"
|
107
|
+
# [0..2] => Data field tag (required).
|
108
|
+
# [3] => Ind1, defaults to `*`, which matches any indicator 1 (optional). ` `, `-` or `_` will be interpreted as `blank`.
|
109
|
+
# [4] => Ind2, defaults to `*`, which matches any indicator 2 (optional). ` `, `-` or `_` will be interpreted as `blank`.
|
110
|
+
# [5] => Sub field code(s) (optional).
|
111
|
+
tag = query_string[0..2].presence
|
112
|
+
|
113
|
+
ind1 = query_string[3].presence
|
114
|
+
ind1 = nil if ind1 == "*"
|
115
|
+
ind1 = :blank if ind1 == "-" || ind1 == "_" || ind1 == " "
|
116
|
+
|
117
|
+
ind2 = query_string[4].presence
|
118
|
+
ind2 = nil if ind2 == "*"
|
119
|
+
ind2 = :blank if ind2 == "-" || ind2 == "_" || ind2 == " "
|
120
|
+
|
121
|
+
subfield_codes = query_string[5..-1]&.split(",")&.map(&:strip).compact.presence
|
122
|
+
|
123
|
+
datafields(tag, ind1: ind1, ind2: ind2).subfields(subfield_codes).values
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
85
127
|
private
|
86
128
|
|
87
129
|
def match_indicator(requested_ind, datafield_ind)
|
88
130
|
[*[requested_ind]].flatten.map do |_requested_ind|
|
89
131
|
if !_requested_ind
|
90
132
|
true
|
91
|
-
elsif _requested_ind == :blank && (datafield_ind == " " || datafield_ind == "-" || datafield_ind.nil?)
|
133
|
+
elsif _requested_ind == :blank && (datafield_ind == " " || datafield_ind == "-" || datafield_ind == "_" || datafield_ind.nil?)
|
92
134
|
true
|
93
135
|
elsif _requested_ind == datafield_ind
|
94
136
|
true
|
@@ -101,8 +143,8 @@ module Metacrunch
|
|
101
143
|
def normalize_tag(tag)
|
102
144
|
case tag
|
103
145
|
when Integer then tag.to_s.rjust(3, "0")
|
104
|
-
when String then tag
|
105
|
-
else tag.to_s
|
146
|
+
when String then tag[0..2]
|
147
|
+
else tag.to_s[0..2]
|
106
148
|
end
|
107
149
|
end
|
108
150
|
|
@@ -2,7 +2,7 @@ module Metacrunch
|
|
2
2
|
module Marcxml
|
3
3
|
class Parser < Ox::Sax
|
4
4
|
|
5
|
-
def parse(
|
5
|
+
def parse(marc_xml)
|
6
6
|
# initialize state machine
|
7
7
|
@in_controlfield = @in_datafield = @in_subfield = false
|
8
8
|
|
@@ -11,8 +11,9 @@ module Metacrunch
|
|
11
11
|
@html_entities_coder = HTMLEntities.new
|
12
12
|
|
13
13
|
# convert_special tells ox to convert some html entities already during
|
14
|
-
# parsing, which minifies the amount of entities we have to decode
|
15
|
-
|
14
|
+
# parsing, which minimizes the number of entities we have to decode using
|
15
|
+
# html_entities_coder in #text.
|
16
|
+
Ox.sax_parse(self, marc_xml, convert_special: true)
|
16
17
|
|
17
18
|
return @document
|
18
19
|
end
|
data/metacrunch-marcxml.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metacrunch-marcxml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- René Sprotte
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-10-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -44,23 +44,23 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '2.
|
47
|
+
version: '2.11'
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '2.
|
55
|
-
description:
|
54
|
+
version: '2.11'
|
55
|
+
description:
|
56
56
|
email: r.sprotte@ub.uni-paderborn.de
|
57
57
|
executables: []
|
58
58
|
extensions: []
|
59
59
|
extra_rdoc_files: []
|
60
60
|
files:
|
61
|
+
- ".circleci/config.yml"
|
61
62
|
- ".gitignore"
|
62
63
|
- ".rspec"
|
63
|
-
- ".travis.yml"
|
64
64
|
- ".yardopts"
|
65
65
|
- Gemfile
|
66
66
|
- License.txt
|
@@ -81,7 +81,7 @@ homepage: http://github.com/ubpb/metacrunch-marcxml
|
|
81
81
|
licenses:
|
82
82
|
- MIT
|
83
83
|
metadata: {}
|
84
|
-
post_install_message:
|
84
|
+
post_install_message:
|
85
85
|
rdoc_options: []
|
86
86
|
require_paths:
|
87
87
|
- lib
|
@@ -96,9 +96,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
96
96
|
- !ruby/object:Gem::Version
|
97
97
|
version: '0'
|
98
98
|
requirements: []
|
99
|
-
|
100
|
-
|
101
|
-
signing_key:
|
99
|
+
rubygems_version: 3.1.4
|
100
|
+
signing_key:
|
102
101
|
specification_version: 4
|
103
102
|
summary: MARCXML package for the metacrunch ETL toolkit.
|
104
103
|
test_files: []
|