pennmarc 1.0.2 → 1.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitleaks.toml +2 -0
- data/README.md +2 -2
- data/lib/pennmarc/helpers/date.rb +4 -8
- data/lib/pennmarc/helpers/format.rb +2 -2
- data/lib/pennmarc/helpers/identifier.rb +44 -11
- data/lib/pennmarc/helpers/language.rb +23 -8
- data/lib/pennmarc/mappers.rb +6 -2
- data/lib/pennmarc/mappings/iso639-3-languages.yml +7916 -0
- data/lib/pennmarc/version.rb +1 -1
- data/spec/lib/pennmarc/helpers/date_spec.rb +16 -0
- data/spec/lib/pennmarc/helpers/identifer_spec.rb +21 -5
- data/spec/lib/pennmarc/helpers/language_spec.rb +56 -8
- data/spec/lib/pennmarc/parser_spec.rb +3 -3
- metadata +8 -6
- /data/lib/pennmarc/mappings/{language.yml → iso639-2-languages.yml} +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 50fd0383f0e78807f62f8abe784f75d8dace6e5b0ecf64877f79ad90b6d40354
|
4
|
+
data.tar.gz: 3df19d10534fc787c55814e30bd1b066a8c5ea10e7da15f9b320c4500891e0be
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 277bb7c15e224c8134b8cea8b28de474d27022a1537904844af8c5542ff3f0080c37f13db38a5a9463a07453bcc403ccf346c2b5538c2fe26012df6a67dec24f
|
7
|
+
data.tar.gz: 1ca8e643758f86aeabdfbeb1576f513c866ba5ef1a40b6edaa279f687af7644baf6b953db6f078548fc72ca99b75b6377db7d80d98a19059c797b6ad41b5f648
|
data/.gitleaks.toml
ADDED
data/README.md
CHANGED
@@ -73,7 +73,7 @@ rspec
|
|
73
73
|
|
74
74
|
1. Update the version in `pennmarc.gemspec`
|
75
75
|
2. Run `gem build pennmarc.gemspec` with the latest code
|
76
|
-
3. Run `gem push pennmarc-{version number here}`(e.g. `gem push pennmarc-1.0.0`) to push to RubyGems. You will need access and MFA setup with RubyGems.
|
76
|
+
3. Run `gem push pennmarc-{version number here}.gem` (e.g. `gem push pennmarc-1.0.0.gem`) to push to RubyGems. You will need access and MFA set up with RubyGems.
|
77
77
|
|
78
78
|
## QA
|
79
79
|
|
@@ -89,4 +89,4 @@ MARC_FILE=path/to/marc.xml bundle exec rake pennmarc:parse
|
|
89
89
|
- rake task or some similar command to return a full set of values extracted from a specified marcxml file
|
90
90
|
- Pipeline to run tests and publish to Rubygems
|
91
91
|
- rubocop check
|
92
|
-
- rdoc/yard coverage checks?
|
92
|
+
- rdoc/yard coverage checks?
|
@@ -25,21 +25,17 @@ module PennMARC
|
|
25
25
|
# @return [DateTime, nil] The date added, or nil if date found in record is invalid
|
26
26
|
def added(record)
|
27
27
|
record.fields(EnrichedMarc::TAG_ITEM).flat_map { |field|
|
28
|
-
field.filter_map do |
|
29
|
-
# skip unless field has date created subfield
|
30
|
-
next unless subfield_defined?(field, EnrichedMarc::SUB_ITEM_DATE_CREATED)
|
31
|
-
|
28
|
+
subfield_values(field, EnrichedMarc::SUB_ITEM_DATE_CREATED).filter_map do |date_added|
|
32
29
|
# On 2022-05-02, this field value (as exported in enriched publishing
|
33
30
|
# job from Alma) began truncating time to day-level granularity. We have
|
34
31
|
# no guarantee that this won't switch back in the future, so for the
|
35
32
|
# foreseeable future we should support both formats.
|
36
33
|
|
37
|
-
format =
|
38
|
-
|
39
|
-
DateTime.strptime(subfield.value, format)
|
34
|
+
format = date_added.size == 10 ? '%Y-%m-%d' : '%Y-%m-%d %H:%M:%S'
|
40
35
|
|
36
|
+
DateTime.strptime(date_added, format)
|
41
37
|
rescue StandardError => e
|
42
|
-
puts "Error parsing date in date added subfield: #{
|
38
|
+
puts "Error parsing date in date added subfield: #{date_added} - #{e}"
|
43
39
|
nil
|
44
40
|
end
|
45
41
|
}.max
|
@@ -41,10 +41,10 @@ module PennMARC
|
|
41
41
|
results += record.fields('880').map do |f|
|
42
42
|
subfield_to_ignore = if subfield_value?(f, 6, /^300/)
|
43
43
|
%w[3 6 8]
|
44
|
-
elsif subfield_value?(f, 6, /^(254|255|310|342|352|362)/)
|
45
|
-
%w[6 8]
|
46
44
|
elsif subfield_value?(f, 6, /^340/)
|
47
45
|
%w[0 2 6 8]
|
46
|
+
else
|
47
|
+
%w[6 8]
|
48
48
|
end
|
49
49
|
join_subfields(f, &subfield_not_in?(subfield_to_ignore))
|
50
50
|
end
|
@@ -23,7 +23,7 @@ module PennMARC
|
|
23
23
|
if field.tag == '020'
|
24
24
|
field.filter_map { |subfield| normalize_isbn(subfield.value) if subfield_in?(%w[a z]).call(subfield) }
|
25
25
|
else
|
26
|
-
field.filter_map { |subfield| subfield.value if subfield_in?(%w[a l z]).call(subfield) }
|
26
|
+
field.filter_map { |subfield| subfield.value if subfield_in?(%w[a l m y z]).call(subfield) }
|
27
27
|
end
|
28
28
|
}.flatten.uniq
|
29
29
|
end
|
@@ -56,21 +56,42 @@ module PennMARC
|
|
56
56
|
|
57
57
|
# Get numeric OCLC ID of first {https://www.oclc.org/bibformats/en/0xx/035.html 035 field}
|
58
58
|
# with an OCLC ID defined in subfield 'a'.
|
59
|
-
#
|
60
|
-
# @todo We should evaluate this to return a single value in the future since subfield a is non-repeatable
|
61
59
|
# @param [MARC::Record] record
|
62
|
-
# @return [
|
63
|
-
def
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
oclc_id.flat_map do |field|
|
60
|
+
# @return [String, nil]
|
61
|
+
def oclc_id_show(record)
|
62
|
+
ids = Array.wrap(record.fields('035')
|
63
|
+
.find { |field| field.any? { |subfield| subfield_a_is_oclc?(subfield) } })
|
64
|
+
ids.flat_map { |field|
|
68
65
|
field.filter_map do |subfield|
|
69
66
|
# skip unless subfield 'a' is an oclc id value
|
70
67
|
next unless subfield_a_is_oclc?(subfield)
|
71
68
|
|
72
69
|
# search for numeric part of oclc id (e.g. '610094484' in '(OCoLC)ocn610094484')
|
73
|
-
match =
|
70
|
+
match = match_oclc_number(subfield)
|
71
|
+
|
72
|
+
# skip unless search to find numeric part of oclc id has a match
|
73
|
+
next unless match
|
74
|
+
|
75
|
+
match[1]
|
76
|
+
end
|
77
|
+
}.first
|
78
|
+
end
|
79
|
+
|
80
|
+
# Retrieve valid and invalid numeric OCLC IDs from {https://www.oclc.org/bibformats/en/0xx/035.html 035 field}
|
81
|
+
# for search.
|
82
|
+
# @param [MARC::Record] record
|
83
|
+
# @return [Array<String>]
|
84
|
+
def oclc_id_search(record)
|
85
|
+
record.fields('035').flat_map do |field|
|
86
|
+
field.filter_map do |subfield|
|
87
|
+
# skip unless subfield 'a' or 'z'
|
88
|
+
next unless subfield.code.in?(%w[a z])
|
89
|
+
|
90
|
+
# skip unless subfield value matches OCLC ID
|
91
|
+
next unless subfield_is_oclc?(subfield)
|
92
|
+
|
93
|
+
# search for numeric part of oclc id
|
94
|
+
match = match_oclc_number(subfield)
|
74
95
|
|
75
96
|
# skip unless search to find numeric part of oclc id has a match
|
76
97
|
next unless match
|
@@ -143,7 +164,19 @@ module PennMARC
|
|
143
164
|
# @param [MARC::Subfield]
|
144
165
|
# @return [TrueClass, FalseClass]
|
145
166
|
def subfield_a_is_oclc?(subfield)
|
146
|
-
subfield.code == 'a' && (subfield
|
167
|
+
subfield.code == 'a' && subfield_is_oclc?(subfield)
|
168
|
+
end
|
169
|
+
|
170
|
+
# @param [MARC::Subfield]
|
171
|
+
# @return [TrueClass, FalseClass]
|
172
|
+
def subfield_is_oclc?(subfield)
|
173
|
+
(subfield.value =~ /^\(OCoLC\).*/).present?
|
174
|
+
end
|
175
|
+
|
176
|
+
# @param [MARC::Subfield]
|
177
|
+
# @return [MatchData, nil]
|
178
|
+
def match_oclc_number(subfield)
|
179
|
+
/^\s*\(OCoLC\)[^1-9]*([1-9][0-9]*).*$/.match(subfield.value)
|
147
180
|
end
|
148
181
|
|
149
182
|
# Normalize isbn value using {https://github.com/billdueber/library_stdnums library_stdnums gem}.
|
@@ -4,10 +4,10 @@ module PennMARC
|
|
4
4
|
# Logic for extracting and translating Language values for a record. Penn practice is to verify the value present in
|
5
5
|
# the {https://www.oclc.org/bibformats/en/fixedfield/lang.html 008 control field} as a three letter code. This code
|
6
6
|
# is then mapped to a display-friendly value using a provided mapping hash.
|
7
|
-
# @todo should we consider values in the {https://www.oclc.org/bibformats/en/0xx/041.html 041 field}?
|
8
7
|
class Language < Helper
|
9
8
|
# Used when no value is present in the control field - still mapped
|
10
9
|
UNDETERMINED_CODE = :und
|
10
|
+
LANGUAGE_SUBFIELDS = %w[a b d e g h i j k m n p q r t].freeze
|
11
11
|
|
12
12
|
class << self
|
13
13
|
# Get language values for display from the {https://www.oclc.org/bibformats/en/5xx/546.html 546 field} and
|
@@ -21,16 +21,31 @@ module PennMARC
|
|
21
21
|
values + linked_alternate(record, '546', &subfield_not_in?(%w[6 8]))
|
22
22
|
end
|
23
23
|
|
24
|
-
# Get language values for searching and faceting of a record. The
|
25
|
-
# the
|
24
|
+
# Get language values for searching and faceting of a record. The values are extracted from subfields
|
25
|
+
# in the 041 field. Language facet and search values will typically be the same, with the exception of `zxx`,
|
26
|
+
# when no linguistic content is found.
|
27
|
+
#
|
28
|
+
# @note In franklin, we extracted the language code from the 008 control field. After engaging cataloging unit
|
29
|
+
# representatives, we decided to extract these values from the 041 field: Includes records for multilingual
|
30
|
+
# items, items that involve translation, and items where the medium of communication is a sign language.
|
31
|
+
# https://www.loc.gov/marc/bibliographic/bd041.html
|
26
32
|
#
|
27
33
|
# @param [MARC::Record] record
|
28
|
-
# @param [Hash]
|
29
|
-
# @
|
30
|
-
|
34
|
+
# @param [Hash] iso_639_2_mapping iso-639-2 spec hash for language code translation
|
35
|
+
# @param [Hash] iso_639_3_mapping iso-639-3 spec hash for language code translation
|
36
|
+
# @return [Array] array of language values
|
37
|
+
def values(record, iso_639_2_mapping: Mappers.iso_639_2_language, iso_639_3_mapping: Mappers.iso_639_3_language)
|
38
|
+
values = record.fields('041').filter_map { |field|
|
39
|
+
mapper = subfield_value?(field, '2', /iso639-3/) ? iso_639_3_mapping : iso_639_2_mapping
|
40
|
+
field.filter_map do |sf|
|
41
|
+
next unless LANGUAGE_SUBFIELDS.include? sf.code
|
42
|
+
|
43
|
+
mapper[sf.value&.to_sym]
|
44
|
+
end
|
45
|
+
}.flatten
|
31
46
|
control_field = record['008']&.value
|
32
|
-
|
33
|
-
|
47
|
+
values << iso_639_2_mapping[control_field[35..37]&.to_sym] if control_field.present?
|
48
|
+
values.empty? ? values << iso_639_2_mapping[UNDETERMINED_CODE] : values.uniq
|
34
49
|
end
|
35
50
|
end
|
36
51
|
end
|
data/lib/pennmarc/mappers.rb
CHANGED
@@ -5,8 +5,12 @@ module PennMARC
|
|
5
5
|
class Mappers
|
6
6
|
class << self
|
7
7
|
# @return [Hash]
|
8
|
-
def
|
9
|
-
@
|
8
|
+
def iso_639_2_language
|
9
|
+
@iso_639_2_language ||= load_map('iso639-2-languages.yml')
|
10
|
+
end
|
11
|
+
|
12
|
+
def iso_639_3_language
|
13
|
+
@iso_639_3_language ||= load_map('iso639-3-languages.yml')
|
10
14
|
end
|
11
15
|
|
12
16
|
# @return [Hash]
|