identifiers 0.14.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 911c52af8788814413165b25ec5dfdace4cd213906dd193216c720c31cb6d3de
4
- data.tar.gz: fb75b4b356a1e87711f7b0d9a820d938d4dbb2d28730c5c8738d105f947cae00
3
+ metadata.gz: f0cc5842531ab0baf207b1aba207fa1fc89e338b3bb6a36c689346e7b473fdd6
4
+ data.tar.gz: bba49e3c29eeb837cf62f0dcc20256d024ee2690e65ce1df36d7f619a4afc25f
5
5
  SHA512:
6
- metadata.gz: 72881f5981cec05c2273e8c76912a8853edaa5b6a0d0cc69a298daedde4037ee98080903fcf9c80f8290a9fe79d7e0ced24ec680dd9f92c7756c9e76c70e3bbe
7
- data.tar.gz: 1255cb693b80e63ccd19e2ed0c2c5962319467c6635e8713cf7185e5e6c91dc7f952e6b614f5c1131cef45586c06fff12c4eb38e7803b8bd0cb1dfec8b0134e0
6
+ metadata.gz: b629b7898e27360f8b0b45c537b735f9e3a2cfaa37cb0c8a4b00b226af0d16c861fefd00c3d630658c38447d05aa2e25dcd6b4f8a52e257762b3ed3469c53b73
7
+ data.tar.gz: ea4edebb501806152ac4affc90d766bd31b93526606cbd90c64c1c8fa62722da45ca40514e6ade5229878390ab857390d8f4502cb87b27ab6390561e7ec98a1c
data/CHANGELOG.md CHANGED
@@ -2,6 +2,14 @@
2
2
  All notable changes to this project will be documented in this file. This
3
3
  project adheres to [Semantic Versioning](http://semver.org/).
4
4
 
5
+ ## [0.15.0] - 2026-06-12
6
+ ### Added
7
+ - Extract ISBNs that mix different dash characters, including the U+2212 minus
8
+ sign, within a single ISBN (#31)
9
+
10
+ ### Fixed
11
+ - Stop extracting spurious ISBNs from a long run of digits separated by hyphens (#34)
12
+
5
13
  ## [0.14.0] - 2024-07-30
6
14
  ### Added
7
15
  - Added optional prefixes argument to ISBNs extraction.
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Identifiers [![Build Status](https://travis-ci.org/altmetric/identifiers.svg?branch=master)](https://travis-ci.org/altmetric/identifiers) [![Gem Version](https://badge.fury.io/rb/identifiers.svg)](https://badge.fury.io/rb/identifiers)
1
+ # Identifiers [![Gem Version](https://badge.fury.io/rb/identifiers.svg)](https://badge.fury.io/rb/identifiers)
2
2
 
3
3
  Collection of utilities related to the extraction, validation and normalization of various scholarly identifiers. The supported list is:
4
4
 
@@ -13,6 +13,8 @@ Collection of utilities related to the extraction, validation and normalization
13
13
  - [URNs](https://en.wikipedia.org/wiki/Uniform_Resource_Name)
14
14
  - [ORCID identifiers](http://orcid.org/)
15
15
 
16
+ **Supported Ruby versions**: >= 2.7
17
+
16
18
  ## Installation
17
19
 
18
20
  Add this line to your application's `Gemfile`:
@@ -106,6 +108,6 @@ We also maintain [a version of this library for PHP](https://github.com/altmetri
106
108
 
107
109
  ## License
108
110
 
109
- Copyright © 2016-2018 Altmetric LLP
111
+ Copyright © 2016-2024 Altmetric LLP
110
112
 
111
113
  Distributed under the [MIT License](http://opensource.org/licenses/MIT).
@@ -1,6 +1,9 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Identifiers
2
4
  class ISBN
3
5
  ISBN_13_REGEXP = /
6
+ (?<!\p{Pd}) # Not part of a longer hyphen-separated run of digits
4
7
  \b
5
8
  (
6
9
  97[89] # ISBN (GS1) Bookland prefix
@@ -12,12 +15,14 @@ module Identifiers
12
15
  \d # Check digit
13
16
  )
14
17
  \b
15
- /x
18
+ (?!\p{Pd}) # Not part of a longer hyphen-separated run of digits
19
+ /x.freeze
16
20
  ISBN_10_REGEXP = /
17
21
  (?<! # Don't match a hyphenated or spaced ISBN-13
18
22
  97[89]
19
23
  [\p{Pd}\p{Zs}]
20
24
  )
25
+ (?<!\p{Pd}) # Not part of a longer hyphen-separated run of digits
21
26
  \b
22
27
  (
23
28
  \d{1,5} # Registration group identifier
@@ -29,7 +34,8 @@ module Identifiers
29
34
  [\dX] # Check digit
30
35
  )
31
36
  \b
32
- /x
37
+ (?!\p{Pd}) # Not part of a longer hyphen-separated run of digits
38
+ /x.freeze
33
39
  ISBN_A_REGEXP = %r{
34
40
  \b
35
41
  (?<=10\.) # Directory indicator (always 10)
@@ -38,24 +44,26 @@ module Identifiers
38
44
  / # Prefix/suffix divider
39
45
  \d{1,7} # ISBN title enumerator and check digit
40
46
  \b
41
- }x
42
- TEXT_AFTER_PREFIX_REGEXP = ':?\s*(\d.*)$'.freeze
47
+ }x.freeze
48
+ # Dashes other than the ASCII hyphen we normalise to, including U+2212
49
+ # MINUS SIGN, which is not part of \p{Pd}
50
+ NON_CANONICAL_DASHES_REGEXP = /[[\p{Pd}−]&&[^-]]/.freeze
43
51
 
44
- def self.extract(str , prefixes = [])
45
- str = match_strings_with_prefix(str , prefixes) if prefixes.any?
52
+ def self.extract(str, prefixes = [])
53
+ str = str.to_s
54
+ # Normalise dashes to a single ASCII hyphen so one ISBN can mix them
55
+ str = str.gsub(NON_CANONICAL_DASHES_REGEXP, '-') if str.match?(NON_CANONICAL_DASHES_REGEXP)
56
+ return extract_with_prefix(str, prefixes) if prefixes.any?
46
57
 
47
58
  extract_isbn_as(str) + extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
48
59
  end
49
60
 
50
- def self.match_strings_with_prefix(str, prefixes)
51
- prefix_regexp = prefixes.join('|')
61
+ def self.extract_with_prefix(str, prefixes)
62
+ prefix_regexp = generate_prefix_regexp(prefixes)
52
63
 
53
- str
54
- .to_s
55
- .scan(/(#{prefix_regexp})#{TEXT_AFTER_PREFIX_REGEXP}/i)
56
- .inject('') do |acum, (_prefix, match)|
57
- acum + "#{match} \n "
58
- end
64
+ [isbn_a_candidate_matcher, ISBN_13_REGEXP, ISBN_10_REGEXP].inject([]) do |matches, isbn_regexp|
65
+ matches | isbn_with_prefix_candidates(str, prefix_regexp, isbn_regexp)
66
+ end
59
67
  end
60
68
 
61
69
  def self.extract_isbn_as(str)
@@ -78,13 +86,13 @@ module Identifiers
78
86
  .select { |isbn, hyphen| !hyphen || isbn.count(hyphen) == 3 }
79
87
  .map { |isbn, hyphen| isbn.delete(hyphen.to_s) }
80
88
  .select { |isbn| valid_isbn_10?(isbn) }
81
- .map { |isbn|
89
+ .map do |isbn|
82
90
  isbn.chop!
83
91
  isbn.prepend('978')
84
92
  isbn << isbn_13_check_digit(isbn).to_s
85
93
 
86
94
  isbn
87
- }
95
+ end
88
96
  end
89
97
 
90
98
  def self.isbn_13_check_digit(isbn)
@@ -117,5 +125,34 @@ module Identifiers
117
125
  def self.digits_of(isbn)
118
126
  isbn.to_s.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
119
127
  end
128
+
129
+ def self.isbn_with_prefix_candidates(str, prefix_regexp, isbn_regexp)
130
+ regexp = Regexp.new("#{prefix_regexp}#{isbn_regexp}", Regexp::IGNORECASE | Regexp::EXTENDED)
131
+
132
+ str
133
+ .to_s
134
+ .scan(regexp)
135
+ .filter_map do |match|
136
+ extract(Array(match).first)&.first
137
+ end
138
+ end
139
+
140
+ def self.generate_prefix_regexp(prefixes)
141
+ joined_prefixes = Regexp.union(prefixes).source
142
+
143
+ Regexp.new(
144
+ "(?<= # Lookbehind for a prefix
145
+ #{joined_prefixes} # ie:p1|p2|p3
146
+ )
147
+ :? # Optional colon. If you want to use a different separator, you can add it as a prefix
148
+ \\s* # Optional whitespaces
149
+ ", Regexp::IGNORECASE | Regexp::EXTENDED
150
+ )
151
+ end
152
+
153
+ def self.isbn_a_candidate_matcher
154
+ # We capture the ISBN-A prefix for the ISBN-A regexp to work correctly when extracting ISBN-As
155
+ Regexp.new(ISBN_A_REGEXP.source.gsub('(?<=10\\.)', '10\.'), Regexp::IGNORECASE | Regexp::EXTENDED)
156
+ end
120
157
  end
121
158
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'identifiers/isbn'
2
4
 
3
5
  RSpec.describe Identifiers::ISBN do
@@ -31,6 +33,10 @@ RSpec.describe Identifiers::ISBN do
31
33
  expect(described_class.extract('ISBN: 978–0–80–506909–9')).to contain_exactly('9780805069099')
32
34
  end
33
35
 
36
+ it 'extracts ISBNs that mix different dash characters' do
37
+ expect(described_class.extract('978–3−200–01908–9')).to contain_exactly('9783200019089')
38
+ end
39
+
34
40
  it 'extracts ISBNs with spaces' do
35
41
  expect(described_class.extract('ISBN: 978 0 80 506909 9')).to contain_exactly('9780805069099')
36
42
  end
@@ -119,6 +125,14 @@ RSpec.describe Identifiers::ISBN do
119
125
  expect(described_class.extract('0-80506909-7')).to be_empty
120
126
  end
121
127
 
128
+ it 'does not extract ISBNs from a long run of digits separated by hyphens' do
129
+ expect(described_class.extract('0-1884-0-3140-0-4396-0-5652-0-4396-0-2826')).to be_empty
130
+ end
131
+
132
+ it 'does not extract ISBNs joined to other digits by a hyphen' do
133
+ expect(described_class.extract('9780805069099-9780671879198')).to be_empty
134
+ end
135
+
122
136
  it 'extracts ISBN-10s with variable width registration group identifiers' do
123
137
  expect(described_class.extract('99921-58-10-7 9971-5-0210-0 960-425-059-0 80-902734-1-6'))
124
138
  .to contain_exactly('9789992158104', '9789971502102', '9789604250592', '9788090273412')
@@ -126,16 +140,57 @@ RSpec.describe Identifiers::ISBN do
126
140
 
127
141
  context 'when passing prefixes' do
128
142
  it 'extracts only prefixed ISBNs' do
129
- text = "ISBN:9789992158104 \n ISBN-10 9789971502102 \n IsbN-13: 9789604250592 \n 9788090273412"
130
- prefixes = ['IsBn', 'ISBN-10', 'ISBN-13']
143
+ text = "ISBN:9789992158104 ISBN-10 9789971502102 \n IsbN-13: 9789604250592 \n 9788090273412"
144
+ prefixes = %w[IsBn ISBN-10 ISBN-13]
131
145
 
132
146
  expect(described_class.extract(text, prefixes))
133
147
  .to contain_exactly('9789992158104', '9789971502102', '9789604250592')
134
148
  end
135
149
 
150
+ it 'extracts ISBNs with special characters in the prefixes' do
151
+ text = 'ISB*N:99921-58-10-7 IS?BN-10 9971-5-0210-0 Is$bN-13: 978-0-80-506909-9 80-902734-1-6'
152
+ prefixes = ['IsB*n', 'IS?BN-10', 'IS$BN-13']
153
+
154
+ expect(described_class.extract(text, prefixes))
155
+ .to contain_exactly('9789992158104', '9789971502102', '9780805069099')
156
+ end
157
+
158
+ it 'extracts ISBNs with Unicode dashes' do
159
+ text = 'ISB*N:99921-58-10-7 IS?BN-10 9971-5-0210-0 Is$bN-13: 978–0–80–506909–9 80-902734-1-6'
160
+ prefixes = ['IsB*n', 'IS?BN-10', 'IS$BN-13']
161
+
162
+ expect(described_class.extract(text, prefixes))
163
+ .to contain_exactly('9789992158104', '9789971502102', '9780805069099')
164
+ end
165
+
166
+ it 'extracts ISBNs with Unicode spaces' do
167
+ text = 'ISBN-13: 978 0 80 506909 9'
168
+ prefixes = ['ISBN-13']
169
+
170
+ expect(described_class.extract(text, prefixes)).to contain_exactly('9780805069099')
171
+ end
172
+
173
+ it 'normalizes 10-digit ISBNs with hyphens and a check digit of X' do
174
+ expect(described_class.extract('ISBN:2-7594-0269-X', ['ISBN'])).to contain_exactly('9782759402694')
175
+ end
176
+
177
+ it 'normalizes 10-digit ISBNs with spaces and a check digit of X' do
178
+ text = 'ISBN-10 2 7594 0269 X'
179
+ prefixes = ['ISBN-10']
180
+
181
+ expect(described_class.extract(text, prefixes)).to contain_exactly('9782759402694')
182
+ end
183
+
184
+ it 'extracts ISBN-13s from ISBN-As' do
185
+ text = 'ISBN 10.978.8898392/315'
186
+ prefixes = %w[ISBN ISBN-10]
187
+
188
+ expect(described_class.extract(text, prefixes)).to contain_exactly('9788898392315')
189
+ end
190
+
136
191
  it 'does not extract ISBNs with different prefixes' do
137
- text = "ISBN:9789992158104 \n ISBN-10 9789971502102 \n IsbN-13: 9789604250592 \n 9788090273412"
138
- prefixes = ['IsBn', 'ISBN-10']
192
+ text = "ISBN:9789992158104 \n ISBN-10 9789971502102 IsbN-13: 9789604250592 9788090273412"
193
+ prefixes = %w[IsBn ISBN-10]
139
194
 
140
195
  expect(described_class.extract(text, prefixes))
141
196
  .to contain_exactly('9789992158104', '9789971502102')
@@ -143,7 +198,7 @@ RSpec.describe Identifiers::ISBN do
143
198
 
144
199
  it 'does not extract ISBNs without prefixes' do
145
200
  text = "9789992158104 9789971502102 9789604250592 \n 9788090273412"
146
- prefixes = ['IsBn', 'ISBN-10', 'ISBN-13']
201
+ prefixes = %w[IsBn ISBN-10 ISBN-13]
147
202
 
148
203
  expect(described_class.extract(text, prefixes)).to be_empty
149
204
  end
metadata CHANGED
@@ -1,16 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: identifiers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.0
4
+ version: 0.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Hernandez
8
8
  - Paul Mucur
9
9
  - PatoSoft
10
- autorequire:
11
10
  bindir: exe
12
11
  cert_chain: []
13
- date: 2024-08-01 00:00:00.000000000 Z
12
+ date: 1980-01-02 00:00:00.000000000 Z
14
13
  dependencies:
15
14
  - !ruby/object:Gem::Dependency
16
15
  name: urn
@@ -32,29 +31,28 @@ dependencies:
32
31
  requirements:
33
32
  - - "~>"
34
33
  - !ruby/object:Gem::Version
35
- version: '10.0'
34
+ version: '13.2'
36
35
  type: :development
37
36
  prerelease: false
38
37
  version_requirements: !ruby/object:Gem::Requirement
39
38
  requirements:
40
39
  - - "~>"
41
40
  - !ruby/object:Gem::Version
42
- version: '10.0'
41
+ version: '13.2'
43
42
  - !ruby/object:Gem::Dependency
44
43
  name: rspec
45
44
  requirement: !ruby/object:Gem::Requirement
46
45
  requirements:
47
46
  - - "~>"
48
47
  - !ruby/object:Gem::Version
49
- version: '3.4'
48
+ version: '3.13'
50
49
  type: :development
51
50
  prerelease: false
52
51
  version_requirements: !ruby/object:Gem::Requirement
53
52
  requirements:
54
53
  - - "~>"
55
54
  - !ruby/object:Gem::Version
56
- version: '3.4'
57
- description:
55
+ version: '3.13'
58
56
  email:
59
57
  - support@altmetric.com
60
58
  executables: []
@@ -90,7 +88,6 @@ homepage: https://github.com/altmetric/identifiers
90
88
  licenses:
91
89
  - MIT
92
90
  metadata: {}
93
- post_install_message:
94
91
  rdoc_options: []
95
92
  require_paths:
96
93
  - lib
@@ -105,8 +102,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
105
102
  - !ruby/object:Gem::Version
106
103
  version: '0'
107
104
  requirements: []
108
- rubygems_version: 3.3.27
109
- signing_key:
105
+ rubygems_version: 3.6.9
110
106
  specification_version: 4
111
107
  summary: Utilities library for various scholarly identifiers used by Altmetric
112
108
  test_files: