identifiers 0.14.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +4 -2
- data/lib/identifiers/isbn.rb +53 -16
- data/spec/identifiers/isbn_spec.rb +60 -5
- metadata +7 -11
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: f0cc5842531ab0baf207b1aba207fa1fc89e338b3bb6a36c689346e7b473fdd6
|
|
4
|
+
data.tar.gz: bba49e3c29eeb837cf62f0dcc20256d024ee2690e65ce1df36d7f619a4afc25f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b629b7898e27360f8b0b45c537b735f9e3a2cfaa37cb0c8a4b00b226af0d16c861fefd00c3d630658c38447d05aa2e25dcd6b4f8a52e257762b3ed3469c53b73
|
|
7
|
+
data.tar.gz: ea4edebb501806152ac4affc90d766bd31b93526606cbd90c64c1c8fa62722da45ca40514e6ade5229878390ab857390d8f4502cb87b27ab6390561e7ec98a1c
|
data/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,14 @@
|
|
|
2
2
|
All notable changes to this project will be documented in this file. This
|
|
3
3
|
project adheres to [Semantic Versioning](http://semver.org/).
|
|
4
4
|
|
|
5
|
+
## [0.15.0] - 2026-06-12
|
|
6
|
+
### Added
|
|
7
|
+
- Extract ISBNs that mix different dash characters, including the U+2212 minus
|
|
8
|
+
sign, within a single ISBN (#31)
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
- Stop extracting spurious ISBNs from a long run of digits separated by hyphens (#34)
|
|
12
|
+
|
|
5
13
|
## [0.14.0] - 2024-07-30
|
|
6
14
|
### Added
|
|
7
15
|
- Added optional prefixes argument to ISBNs extraction.
|
data/README.md
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Identifiers [![Gem Version](https://badge.fury.io/rb/identifiers.svg)](https://badge.fury.io/rb/identifiers)
|
|
2
2
|
|
|
3
3
|
Collection of utilities related to the extraction, validation and normalization of various scholarly identifiers. The supported list is:
|
|
4
4
|
|
|
@@ -13,6 +13,8 @@ Collection of utilities related to the extraction, validation and normalization
|
|
|
13
13
|
- [URNs](https://en.wikipedia.org/wiki/Uniform_Resource_Name)
|
|
14
14
|
- [ORCID identifiers](http://orcid.org/)
|
|
15
15
|
|
|
16
|
+
**Supported Ruby versions**: >= 2.7
|
|
17
|
+
|
|
16
18
|
## Installation
|
|
17
19
|
|
|
18
20
|
Add this line to your application's `Gemfile`:
|
|
@@ -106,6 +108,6 @@ We also maintain [a version of this library for PHP](https://github.com/altmetri
|
|
|
106
108
|
|
|
107
109
|
## License
|
|
108
110
|
|
|
109
|
-
Copyright © 2016-
|
|
111
|
+
Copyright © 2016-2024 Altmetric LLP
|
|
110
112
|
|
|
111
113
|
Distributed under the [MIT License](http://opensource.org/licenses/MIT).
|
data/lib/identifiers/isbn.rb
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module Identifiers
|
|
2
4
|
class ISBN
|
|
3
5
|
ISBN_13_REGEXP = /
|
|
6
|
+
(?<!\p{Pd}) # Not part of a longer hyphen-separated run of digits
|
|
4
7
|
\b
|
|
5
8
|
(
|
|
6
9
|
97[89] # ISBN (GS1) Bookland prefix
|
|
@@ -12,12 +15,14 @@ module Identifiers
|
|
|
12
15
|
\d # Check digit
|
|
13
16
|
)
|
|
14
17
|
\b
|
|
15
|
-
|
|
18
|
+
(?!\p{Pd}) # Not part of a longer hyphen-separated run of digits
|
|
19
|
+
/x.freeze
|
|
16
20
|
ISBN_10_REGEXP = /
|
|
17
21
|
(?<! # Don't match a hyphenated or spaced ISBN-13
|
|
18
22
|
97[89]
|
|
19
23
|
[\p{Pd}\p{Zs}]
|
|
20
24
|
)
|
|
25
|
+
(?<!\p{Pd}) # Not part of a longer hyphen-separated run of digits
|
|
21
26
|
\b
|
|
22
27
|
(
|
|
23
28
|
\d{1,5} # Registration group identifier
|
|
@@ -29,7 +34,8 @@ module Identifiers
|
|
|
29
34
|
[\dX] # Check digit
|
|
30
35
|
)
|
|
31
36
|
\b
|
|
32
|
-
|
|
37
|
+
(?!\p{Pd}) # Not part of a longer hyphen-separated run of digits
|
|
38
|
+
/x.freeze
|
|
33
39
|
ISBN_A_REGEXP = %r{
|
|
34
40
|
\b
|
|
35
41
|
(?<=10\.) # Directory indicator (always 10)
|
|
@@ -38,24 +44,26 @@ module Identifiers
|
|
|
38
44
|
/ # Prefix/suffix divider
|
|
39
45
|
\d{1,7} # ISBN title enumerator and check digit
|
|
40
46
|
\b
|
|
41
|
-
}x
|
|
42
|
-
|
|
47
|
+
}x.freeze
|
|
48
|
+
# Dashes other than the ASCII hyphen we normalise to, including U+2212
|
|
49
|
+
# MINUS SIGN, which is not part of \p{Pd}
|
|
50
|
+
NON_CANONICAL_DASHES_REGEXP = /[[\p{Pd}−]&&[^-]]/.freeze
|
|
43
51
|
|
|
44
|
-
def self.extract(str
|
|
45
|
-
str =
|
|
52
|
+
def self.extract(str, prefixes = [])
|
|
53
|
+
str = str.to_s
|
|
54
|
+
# Normalise dashes to a single ASCII hyphen so one ISBN can mix them
|
|
55
|
+
str = str.gsub(NON_CANONICAL_DASHES_REGEXP, '-') if str.match?(NON_CANONICAL_DASHES_REGEXP)
|
|
56
|
+
return extract_with_prefix(str, prefixes) if prefixes.any?
|
|
46
57
|
|
|
47
58
|
extract_isbn_as(str) + extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
|
|
48
59
|
end
|
|
49
60
|
|
|
50
|
-
def self.
|
|
51
|
-
prefix_regexp = prefixes
|
|
61
|
+
def self.extract_with_prefix(str, prefixes)
|
|
62
|
+
prefix_regexp = generate_prefix_regexp(prefixes)
|
|
52
63
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
.inject('') do |acum, (_prefix, match)|
|
|
57
|
-
acum + "#{match} \n "
|
|
58
|
-
end
|
|
64
|
+
[isbn_a_candidate_matcher, ISBN_13_REGEXP, ISBN_10_REGEXP].inject([]) do |matches, isbn_regexp|
|
|
65
|
+
matches | isbn_with_prefix_candidates(str, prefix_regexp, isbn_regexp)
|
|
66
|
+
end
|
|
59
67
|
end
|
|
60
68
|
|
|
61
69
|
def self.extract_isbn_as(str)
|
|
@@ -78,13 +86,13 @@ module Identifiers
|
|
|
78
86
|
.select { |isbn, hyphen| !hyphen || isbn.count(hyphen) == 3 }
|
|
79
87
|
.map { |isbn, hyphen| isbn.delete(hyphen.to_s) }
|
|
80
88
|
.select { |isbn| valid_isbn_10?(isbn) }
|
|
81
|
-
.map
|
|
89
|
+
.map do |isbn|
|
|
82
90
|
isbn.chop!
|
|
83
91
|
isbn.prepend('978')
|
|
84
92
|
isbn << isbn_13_check_digit(isbn).to_s
|
|
85
93
|
|
|
86
94
|
isbn
|
|
87
|
-
|
|
95
|
+
end
|
|
88
96
|
end
|
|
89
97
|
|
|
90
98
|
def self.isbn_13_check_digit(isbn)
|
|
@@ -117,5 +125,34 @@ module Identifiers
|
|
|
117
125
|
def self.digits_of(isbn)
|
|
118
126
|
isbn.to_s.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
|
|
119
127
|
end
|
|
128
|
+
|
|
129
|
+
def self.isbn_with_prefix_candidates(str, prefix_regexp, isbn_regexp)
|
|
130
|
+
regexp = Regexp.new("#{prefix_regexp}#{isbn_regexp}", Regexp::IGNORECASE | Regexp::EXTENDED)
|
|
131
|
+
|
|
132
|
+
str
|
|
133
|
+
.to_s
|
|
134
|
+
.scan(regexp)
|
|
135
|
+
.filter_map do |match|
|
|
136
|
+
extract(Array(match).first)&.first
|
|
137
|
+
end
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
def self.generate_prefix_regexp(prefixes)
|
|
141
|
+
joined_prefixes = Regexp.union(prefixes).source
|
|
142
|
+
|
|
143
|
+
Regexp.new(
|
|
144
|
+
"(?<= # Lookbehind for a prefix
|
|
145
|
+
#{joined_prefixes} # ie:p1|p2|p3
|
|
146
|
+
)
|
|
147
|
+
:? # Optional colon. If you want to use a different separator, you can add it as a prefix
|
|
148
|
+
\\s* # Optional whitespaces
|
|
149
|
+
", Regexp::IGNORECASE | Regexp::EXTENDED
|
|
150
|
+
)
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
def self.isbn_a_candidate_matcher
|
|
154
|
+
# We capture the ISBN-A prefix for the ISBN-A regexp to work correctly when extracting ISBN-As
|
|
155
|
+
Regexp.new(ISBN_A_REGEXP.source.gsub('(?<=10\\.)', '10\.'), Regexp::IGNORECASE | Regexp::EXTENDED)
|
|
156
|
+
end
|
|
120
157
|
end
|
|
121
158
|
end
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require 'identifiers/isbn'
|
|
2
4
|
|
|
3
5
|
RSpec.describe Identifiers::ISBN do
|
|
@@ -31,6 +33,10 @@ RSpec.describe Identifiers::ISBN do
|
|
|
31
33
|
expect(described_class.extract('ISBN: 978–0–80–506909–9')).to contain_exactly('9780805069099')
|
|
32
34
|
end
|
|
33
35
|
|
|
36
|
+
it 'extracts ISBNs that mix different dash characters' do
|
|
37
|
+
expect(described_class.extract('978–3−200–01908–9')).to contain_exactly('9783200019089')
|
|
38
|
+
end
|
|
39
|
+
|
|
34
40
|
it 'extracts ISBNs with spaces' do
|
|
35
41
|
expect(described_class.extract('ISBN: 978 0 80 506909 9')).to contain_exactly('9780805069099')
|
|
36
42
|
end
|
|
@@ -119,6 +125,14 @@ RSpec.describe Identifiers::ISBN do
|
|
|
119
125
|
expect(described_class.extract('0-80506909-7')).to be_empty
|
|
120
126
|
end
|
|
121
127
|
|
|
128
|
+
it 'does not extract ISBNs from a long run of digits separated by hyphens' do
|
|
129
|
+
expect(described_class.extract('0-1884-0-3140-0-4396-0-5652-0-4396-0-2826')).to be_empty
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
it 'does not extract ISBNs joined to other digits by a hyphen' do
|
|
133
|
+
expect(described_class.extract('9780805069099-9780671879198')).to be_empty
|
|
134
|
+
end
|
|
135
|
+
|
|
122
136
|
it 'extracts ISBN-10s with variable width registration group identifiers' do
|
|
123
137
|
expect(described_class.extract('99921-58-10-7 9971-5-0210-0 960-425-059-0 80-902734-1-6'))
|
|
124
138
|
.to contain_exactly('9789992158104', '9789971502102', '9789604250592', '9788090273412')
|
|
@@ -126,16 +140,57 @@ RSpec.describe Identifiers::ISBN do
|
|
|
126
140
|
|
|
127
141
|
context 'when passing prefixes' do
|
|
128
142
|
it 'extracts only prefixed ISBNs' do
|
|
129
|
-
text = "ISBN:9789992158104
|
|
130
|
-
prefixes = [
|
|
143
|
+
text = "ISBN:9789992158104 ISBN-10 9789971502102 \n IsbN-13: 9789604250592 \n 9788090273412"
|
|
144
|
+
prefixes = %w[IsBn ISBN-10 ISBN-13]
|
|
131
145
|
|
|
132
146
|
expect(described_class.extract(text, prefixes))
|
|
133
147
|
.to contain_exactly('9789992158104', '9789971502102', '9789604250592')
|
|
134
148
|
end
|
|
135
149
|
|
|
150
|
+
it 'extracts ISBNs with special characters in the prefixes' do
|
|
151
|
+
text = 'ISB*N:99921-58-10-7 IS?BN-10 9971-5-0210-0 Is$bN-13: 978-0-80-506909-9 80-902734-1-6'
|
|
152
|
+
prefixes = ['IsB*n', 'IS?BN-10', 'IS$BN-13']
|
|
153
|
+
|
|
154
|
+
expect(described_class.extract(text, prefixes))
|
|
155
|
+
.to contain_exactly('9789992158104', '9789971502102', '9780805069099')
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
it 'extracts ISBNs with Unicode dashes' do
|
|
159
|
+
text = 'ISB*N:99921-58-10-7 IS?BN-10 9971-5-0210-0 Is$bN-13: 978–0–80–506909–9 80-902734-1-6'
|
|
160
|
+
prefixes = ['IsB*n', 'IS?BN-10', 'IS$BN-13']
|
|
161
|
+
|
|
162
|
+
expect(described_class.extract(text, prefixes))
|
|
163
|
+
.to contain_exactly('9789992158104', '9789971502102', '9780805069099')
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
it 'extracts ISBNs with Unicode spaces' do
|
|
167
|
+
text = 'ISBN-13: 978 0 80 506909 9'
|
|
168
|
+
prefixes = ['ISBN-13']
|
|
169
|
+
|
|
170
|
+
expect(described_class.extract(text, prefixes)).to contain_exactly('9780805069099')
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
it 'normalizes 10-digit ISBNs with hyphens and a check digit of X' do
|
|
174
|
+
expect(described_class.extract('ISBN:2-7594-0269-X', ['ISBN'])).to contain_exactly('9782759402694')
|
|
175
|
+
end
|
|
176
|
+
|
|
177
|
+
it 'normalizes 10-digit ISBNs with spaces and a check digit of X' do
|
|
178
|
+
text = 'ISBN-10 2 7594 0269 X'
|
|
179
|
+
prefixes = ['ISBN-10']
|
|
180
|
+
|
|
181
|
+
expect(described_class.extract(text, prefixes)).to contain_exactly('9782759402694')
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
it 'extracts ISBN-13s from ISBN-As' do
|
|
185
|
+
text = 'ISBN 10.978.8898392/315'
|
|
186
|
+
prefixes = %w[ISBN ISBN-10]
|
|
187
|
+
|
|
188
|
+
expect(described_class.extract(text, prefixes)).to contain_exactly('9788898392315')
|
|
189
|
+
end
|
|
190
|
+
|
|
136
191
|
it 'does not extract ISBNs with different prefixes' do
|
|
137
|
-
text = "ISBN:9789992158104 \n ISBN-10 9789971502102
|
|
138
|
-
prefixes = [
|
|
192
|
+
text = "ISBN:9789992158104 \n ISBN-10 9789971502102 IsbN-13: 9789604250592 9788090273412"
|
|
193
|
+
prefixes = %w[IsBn ISBN-10]
|
|
139
194
|
|
|
140
195
|
expect(described_class.extract(text, prefixes))
|
|
141
196
|
.to contain_exactly('9789992158104', '9789971502102')
|
|
@@ -143,7 +198,7 @@ RSpec.describe Identifiers::ISBN do
|
|
|
143
198
|
|
|
144
199
|
it 'does not extract ISBNs without prefixes' do
|
|
145
200
|
text = "9789992158104 9789971502102 9789604250592 \n 9788090273412"
|
|
146
|
-
prefixes = [
|
|
201
|
+
prefixes = %w[IsBn ISBN-10 ISBN-13]
|
|
147
202
|
|
|
148
203
|
expect(described_class.extract(text, prefixes)).to be_empty
|
|
149
204
|
end
|
metadata
CHANGED
|
@@ -1,16 +1,15 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: identifiers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.15.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Jonathan Hernandez
|
|
8
8
|
- Paul Mucur
|
|
9
9
|
- PatoSoft
|
|
10
|
-
autorequire:
|
|
11
10
|
bindir: exe
|
|
12
11
|
cert_chain: []
|
|
13
|
-
date:
|
|
12
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
14
13
|
dependencies:
|
|
15
14
|
- !ruby/object:Gem::Dependency
|
|
16
15
|
name: urn
|
|
@@ -32,29 +31,28 @@ dependencies:
|
|
|
32
31
|
requirements:
|
|
33
32
|
- - "~>"
|
|
34
33
|
- !ruby/object:Gem::Version
|
|
35
|
-
version: '
|
|
34
|
+
version: '13.2'
|
|
36
35
|
type: :development
|
|
37
36
|
prerelease: false
|
|
38
37
|
version_requirements: !ruby/object:Gem::Requirement
|
|
39
38
|
requirements:
|
|
40
39
|
- - "~>"
|
|
41
40
|
- !ruby/object:Gem::Version
|
|
42
|
-
version: '
|
|
41
|
+
version: '13.2'
|
|
43
42
|
- !ruby/object:Gem::Dependency
|
|
44
43
|
name: rspec
|
|
45
44
|
requirement: !ruby/object:Gem::Requirement
|
|
46
45
|
requirements:
|
|
47
46
|
- - "~>"
|
|
48
47
|
- !ruby/object:Gem::Version
|
|
49
|
-
version: '3.
|
|
48
|
+
version: '3.13'
|
|
50
49
|
type: :development
|
|
51
50
|
prerelease: false
|
|
52
51
|
version_requirements: !ruby/object:Gem::Requirement
|
|
53
52
|
requirements:
|
|
54
53
|
- - "~>"
|
|
55
54
|
- !ruby/object:Gem::Version
|
|
56
|
-
version: '3.
|
|
57
|
-
description:
|
|
55
|
+
version: '3.13'
|
|
58
56
|
email:
|
|
59
57
|
- support@altmetric.com
|
|
60
58
|
executables: []
|
|
@@ -90,7 +88,6 @@ homepage: https://github.com/altmetric/identifiers
|
|
|
90
88
|
licenses:
|
|
91
89
|
- MIT
|
|
92
90
|
metadata: {}
|
|
93
|
-
post_install_message:
|
|
94
91
|
rdoc_options: []
|
|
95
92
|
require_paths:
|
|
96
93
|
- lib
|
|
@@ -105,8 +102,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
105
102
|
- !ruby/object:Gem::Version
|
|
106
103
|
version: '0'
|
|
107
104
|
requirements: []
|
|
108
|
-
rubygems_version: 3.
|
|
109
|
-
signing_key:
|
|
105
|
+
rubygems_version: 3.6.9
|
|
110
106
|
specification_version: 4
|
|
111
107
|
summary: Utilities library for various scholarly identifiers used by Altmetric
|
|
112
108
|
test_files:
|