identifiers 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 612736c08372d3108c5b62dbab508417c3b67a73
4
- data.tar.gz: a693d991efe3c913cce2fe6fc4496999c0883103
3
+ metadata.gz: ab7d56e3e6048da713104ab263f36bdd96a9da98
4
+ data.tar.gz: 4d2636ffdeeb7b3a1e563b36c761e39dc81f488e
5
5
  SHA512:
6
- metadata.gz: 19ca1c46ff464e700b3158794edbcdb677d25fa2e3b7f557595d897e430a84e725d0d2a0ce0e8904af713c9adead71d26ff0dd7c72b788a033c451fec31ba73b
7
- data.tar.gz: cfcfc156b3ca04b2becf36f810a0a6b3ad4e402a3eb6d1b2ad4642091b91c20cd64e5e739d30f2e4fb35b849c583e08cab5d4a8e06996ced17db3a593b6c61a8
6
+ metadata.gz: 3a62e98dcf6f35c180e1dd2c2e574f8993c6123a182e6fee782a96bc85e342de5b3894455f8fa99325100edcb8f3a9f0fa33d3135040b775bbf6e0b442bc1992
7
+ data.tar.gz: fcc0d3d178e49ee0eefed149200f4f8489d58c4cb953b21a0cab63067ae237432eee607e8e81460f70cbdba9ebb8d1d44c6f3d01fe82f16c9038a12f31aba609
data/CHANGELOG.md CHANGED
@@ -2,6 +2,11 @@
2
2
  All notable changes to this project will be documented in this file. This
3
3
  project adheres to [Semantic Versioning](http://semver.org/).
4
4
 
5
+ ## [0.11.0] - 2018-03-12
6
+ ### Fixed
7
+ - Stricter ISBN extraction: consistent hyphenation (#27) and correct number of groups (#28)
8
+ - Prevent stack overflow when extracting DOIs (#25)
9
+
5
10
  ## [0.10.0] - 2017-12-20
6
11
  ### Added
7
12
  - Extract PubMed IDs from URLs (e.g. https://www.ncbi.nlm.nih.gov/pubmed/123456) and URIs with schemes `pmid:` and `info:pmid`
@@ -68,3 +73,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
68
73
  [0.9.0]: https://github.com/altmetric/identifiers/releases/tag/v0.9.0
69
74
  [0.9.1]: https://github.com/altmetric/identifiers/releases/tag/v0.9.1
70
75
  [0.10.0]: https://github.com/altmetric/identifiers/releases/tag/v0.10.0
76
+ [0.11.0]: https://github.com/altmetric/identifiers/releases/tag/v0.11.0
data/README.md CHANGED
@@ -18,7 +18,7 @@ Collection of utilities related to the extraction, validation and normalization
18
18
  Add this line to your application's `Gemfile`:
19
19
 
20
20
  ```ruby
21
- gem 'identifiers', '~> 0.9'
21
+ gem 'identifiers', '~> 0.11'
22
22
  ```
23
23
 
24
24
  And then execute:
@@ -79,6 +79,6 @@ We also maintain [a version of this library for PHP](https://github.com/altmetri
79
79
 
80
80
  ## License
81
81
 
82
- Copyright © 2016-2017 Altmetric LLP
82
+ Copyright © 2016-2018 Altmetric LLP
83
83
 
84
84
  Distributed under the [MIT License](http://opensource.org/licenses/MIT).
@@ -29,20 +29,15 @@ module Identifiers
29
29
  /x
30
30
 
31
31
  def self.extract(str)
32
- str.to_s.downcase.scan(REGEXP).map { |doi| strip_punctuation(doi) }.compact
32
+ str.to_s.downcase.scan(REGEXP).map { |doi| extract_one(doi) }.compact
33
33
  end
34
34
 
35
35
  def self.extract_one(str)
36
- match = str.to_s.downcase[REGEXP]
37
- return unless match
36
+ while (match = str.to_s.downcase[REGEXP])
37
+ break match if match =~ VALID_ENDING
38
38
 
39
- strip_punctuation(match)
40
- end
41
-
42
- def self.strip_punctuation(doi)
43
- return doi if doi =~ VALID_ENDING
44
-
45
- extract_one(doi.sub(/\p{Punct}\z/, ''))
39
+ str = match.sub(/\p{Punct}\z/, '')
40
+ end
46
41
  end
47
42
  end
48
43
  end
@@ -1,99 +1,107 @@
1
- module Identifiers
2
- class ISBN
3
- ISBN_13_REGEXP = /
4
- \b
5
- 97[89] # ISBN (GS1) Bookland prefix
6
- [\p{Pd}\p{Zs}]? # Optional hyphenation
7
- (?:
8
- \d # Digit
9
- [\p{Pd}\p{Zs}]? # Optional hyphenation
10
- ){9}
11
- \d # Check digit
12
- \b
13
- /x
14
- ISBN_10_REGEXP = /
15
- (?<! # Don't match a hyphenated or spaced ISBN-13
16
- 97[89]
17
- [\p{Pd}\p{Zs}]
18
- )
19
- \b
20
- (?:
21
- \d # Digit
22
- [\p{Pd}\p{Zs}]? # Optional hyphenation
23
- ){9}
24
- [\dX] # Check digit
25
- \b
26
- /x
27
- ISBN_A_REGEXP = %r{
28
- \b
29
- (?<=10\.) # Directory indicator (always 10)
30
- 97[89]\. # ISBN (GS1) Bookland prefix
31
- \d{2,8} # ISBN registration group element and publisher prefix
32
- / # Prefix/suffix divider
33
- \d{1,7} # ISBN title enumerator and check digit
34
- \b
35
- }x
36
-
37
- def self.extract(str)
38
- extract_isbn_as(str) + extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
39
- end
40
-
41
- def self.extract_isbn_as(str)
42
- extract_thirteen_digit_isbns(str.to_s.scan(ISBN_A_REGEXP).join("\n").tr('/.', ''))
43
- end
44
-
45
- def self.extract_thirteen_digit_isbns(str)
46
- str
47
- .to_s
48
- .scan(ISBN_13_REGEXP)
49
- .map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
50
- .select { |isbn| valid_isbn_13?(isbn) }
51
- end
52
-
53
- def self.extract_ten_digit_isbns(str)
54
- str
55
- .to_s
56
- .scan(ISBN_10_REGEXP)
57
- .map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
58
- .select { |isbn| valid_isbn_10?(isbn) }
59
- .map { |isbn|
60
- isbn.chop!
61
- isbn.prepend('978')
62
- isbn << isbn_13_check_digit(isbn).to_s
63
-
64
- isbn
65
- }
66
- end
67
-
68
- def self.isbn_13_check_digit(isbn)
69
- sum = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
70
- check_digit = 10 - (sum % 10)
71
-
72
- if check_digit == 10
73
- 0
74
- else
75
- check_digit
76
- end
77
- end
78
-
79
- def self.valid_isbn_13?(isbn)
80
- return false unless isbn =~ ISBN_13_REGEXP
81
-
82
- result = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
83
-
84
- (result % 10).zero?
85
- end
86
-
87
- def self.valid_isbn_10?(isbn)
88
- return false unless isbn =~ ISBN_10_REGEXP
89
-
90
- result = digits_of(isbn).with_index.map { |digit, weight| digit * weight.succ }.reduce(:+)
91
-
92
- (result % 11).zero?
93
- end
94
-
95
- def self.digits_of(isbn)
96
- isbn.to_s.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
97
- end
98
- end
99
- end
1
+ module Identifiers
2
+ class ISBN
3
+ ISBN_13_REGEXP = /
4
+ \b
5
+ (
6
+ 97[89] # ISBN (GS1) Bookland prefix
7
+ ([\p{Pd}\p{Zs}])? # Optional hyphenation
8
+ (?:
9
+ \d # Digit
10
+ \2? # Optional hyphenation
11
+ ){9}
12
+ \d # Check digit
13
+ )
14
+ \b
15
+ /x
16
+ ISBN_10_REGEXP = /
17
+ (?<! # Don't match a hyphenated or spaced ISBN-13
18
+ 97[89]
19
+ [\p{Pd}\p{Zs}]
20
+ )
21
+ \b
22
+ (
23
+ \d # Digit
24
+ ([\p{Pd}\p{Zs}])? # Optional hyphenation
25
+ (?:
26
+ \d # Digit
27
+ \2? # Optional hyphenation
28
+ ){8}
29
+ [\dX] # Check digit
30
+ )
31
+ \b
32
+ /x
33
+ ISBN_A_REGEXP = %r{
34
+ \b
35
+ (?<=10\.) # Directory indicator (always 10)
36
+ 97[89]\. # ISBN (GS1) Bookland prefix
37
+ \d{2,8} # ISBN registration group element and publisher prefix
38
+ / # Prefix/suffix divider
39
+ \d{1,7} # ISBN title enumerator and check digit
40
+ \b
41
+ }x
42
+
43
+ def self.extract(str)
44
+ extract_isbn_as(str) + extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
45
+ end
46
+
47
+ def self.extract_isbn_as(str)
48
+ extract_thirteen_digit_isbns(str.to_s.scan(ISBN_A_REGEXP).join("\n").tr('/.', ''))
49
+ end
50
+
51
+ def self.extract_thirteen_digit_isbns(str)
52
+ str
53
+ .to_s
54
+ .scan(ISBN_13_REGEXP)
55
+ .select { |isbn, hyphen| !hyphen || isbn.count(hyphen) == 4 }
56
+ .map { |isbn, hyphen| isbn.delete(hyphen.to_s) }
57
+ .select { |isbn| valid_isbn_13?(isbn) }
58
+ end
59
+
60
+ def self.extract_ten_digit_isbns(str)
61
+ str
62
+ .to_s
63
+ .scan(ISBN_10_REGEXP)
64
+ .select { |isbn, hyphen| !hyphen || isbn.count(hyphen) == 3 }
65
+ .map { |isbn, hyphen| isbn.delete(hyphen.to_s) }
66
+ .select { |isbn| valid_isbn_10?(isbn) }
67
+ .map { |isbn|
68
+ isbn.chop!
69
+ isbn.prepend('978')
70
+ isbn << isbn_13_check_digit(isbn).to_s
71
+
72
+ isbn
73
+ }
74
+ end
75
+
76
+ def self.isbn_13_check_digit(isbn)
77
+ sum = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
78
+ check_digit = 10 - (sum % 10)
79
+
80
+ if check_digit == 10
81
+ 0
82
+ else
83
+ check_digit
84
+ end
85
+ end
86
+
87
+ def self.valid_isbn_13?(isbn)
88
+ return false unless isbn =~ ISBN_13_REGEXP
89
+
90
+ result = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
91
+
92
+ (result % 10).zero?
93
+ end
94
+
95
+ def self.valid_isbn_10?(isbn)
96
+ return false unless isbn =~ ISBN_10_REGEXP
97
+
98
+ result = digits_of(isbn).with_index.map { |digit, weight| digit * weight.succ }.reduce(:+)
99
+
100
+ (result % 11).zero?
101
+ end
102
+
103
+ def self.digits_of(isbn)
104
+ isbn.to_s.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
105
+ end
106
+ end
107
+ end
@@ -107,6 +107,12 @@ RSpec.describe Identifiers::DOI do
107
107
  expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
108
108
  end
109
109
 
110
+ it 'does not overflow when given lots of trailing punctuation' do
111
+ str = '10.1130/2013.2502' + ('.' * 10000)
112
+
113
+ expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
114
+ end
115
+
110
116
  it 'does not extract DOIs with purely punctuation suffixes' do
111
117
  expect(described_class.extract('10.1130/!).",')).to be_empty
112
118
  end
@@ -1,97 +1,121 @@
1
- require 'identifiers/isbn'
2
-
3
- RSpec.describe Identifiers::ISBN do
4
- it 'extracts a ISBN' do
5
- expect(described_class.extract('ISBN: 9780805069099')).to contain_exactly('9780805069099')
6
- end
7
-
8
- it 'extracts ISBNs when given as a number' do
9
- isbn = 9780805069099
10
-
11
- expect(described_class.extract(isbn)).to contain_exactly('9780805069099')
12
- end
13
-
14
- it 'normalizes 13-digit ISBNs' do
15
- str = "978-0-80-506909-9\n978-0-67-187919-8"
16
-
17
- expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
18
- end
19
-
20
- it 'extracts multiple ISBN-13s separated by a space' do
21
- str = '978-0-80-506909-9 978-0-67-187919-8'
22
-
23
- expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
24
- end
25
-
26
- it 'extracts ISBNs with hyphens' do
27
- expect(described_class.extract('ISBN: 978-0-80-506909-9')).to contain_exactly('9780805069099')
28
- end
29
-
30
- it 'extracts ISBNs with Unicode dashes' do
31
- expect(described_class.extract('ISBN: 978–0–80–506909–9')).to contain_exactly('9780805069099')
32
- end
33
-
34
- it 'extracts ISBNs with spaces' do
35
- expect(described_class.extract('ISBN: 978 0 80 506909 9')).to contain_exactly('9780805069099')
36
- end
37
-
38
- it 'extracts ISBNs with Unicode spaces' do
39
- expect(described_class.extract('ISBN: 978 0 80 506909 9')).to contain_exactly('9780805069099')
40
- end
41
-
42
- it 'extracts ISBN-13s from ISBN-As' do
43
- expect(described_class.extract('10.978.8898392/315')).to contain_exactly('9788898392315')
44
- end
45
-
46
- it 'does not extract invalid ISBNs from ISBN-As' do
47
- expect(described_class.extract('10.978.8898392/316')).to be_empty
48
- end
49
-
50
- it 'normalizes 10-digit ISBNs' do
51
- str = "0-8050-6909-7 \n 2-7594-0269-X"
52
-
53
- expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
54
- end
55
-
56
- it 'extracts multiple 10-digit ISBNs separated by a space' do
57
- str = '0-8050-6909-7 2-7594-0269-X'
58
-
59
- expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
60
- end
61
-
62
- it 'normalizes 10-digit ISBNs with Unicode dashes' do
63
- expect(described_class.extract('0–8050–6909–7')).to contain_exactly('9780805069099')
64
- end
65
-
66
- it 'normalizes 10-digit ISBNs with a check digit of 10' do
67
- expect(described_class.extract('4423272350')).to contain_exactly('9784423272350')
68
- end
69
-
70
- it 'normalizes 10-digit ISBNs with spaces' do
71
- expect(described_class.extract('0 8050 6909 7')).to contain_exactly('9780805069099')
72
- end
73
-
74
- it 'normalizes 10-digit ISBNs with Unicode spaces' do
75
- expect(described_class.extract('0 8050 6909 7')).to contain_exactly('9780805069099')
76
- end
77
-
78
- it 'normalizes 10-digit ISBNs with spaces and a check digit of X' do
79
- expect(described_class.extract('2 7594 0269 X')).to contain_exactly('9782759402694')
80
- end
81
-
82
- it 'does not extract invalid 13-digit ISBNs' do
83
- expect(described_class.extract('9783319217280')).to be_empty
84
- end
85
-
86
- it 'does not extract invalid 10-digit ISBNs' do
87
- expect(described_class.extract('3319217280')).to be_empty
88
- end
89
-
90
- it 'does not extract ISBN-10s from hyphenated ISBN-13s' do
91
- expect(described_class.extract('978-0-309-57079-4')).to contain_exactly('9780309570794')
92
- end
93
-
94
- it 'does not extract ISBN-10s from space-separated ISBN-13s' do
95
- expect(described_class.extract('978 0 309 57079 4')).to contain_exactly('9780309570794')
96
- end
97
- end
1
+ require 'identifiers/isbn'
2
+
3
+ RSpec.describe Identifiers::ISBN do
4
+ it 'extracts a ISBN' do
5
+ expect(described_class.extract('ISBN: 9780805069099')).to contain_exactly('9780805069099')
6
+ end
7
+
8
+ it 'extracts ISBNs when given as a number' do
9
+ isbn = 9780805069099
10
+
11
+ expect(described_class.extract(isbn)).to contain_exactly('9780805069099')
12
+ end
13
+
14
+ it 'normalizes 13-digit ISBNs' do
15
+ str = "978-0-80-506909-9\n978-0-67-187919-8"
16
+
17
+ expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
18
+ end
19
+
20
+ it 'extracts multiple ISBN-13s separated by a space' do
21
+ str = '978-0-80-506909-9 978-0-67-187919-8'
22
+
23
+ expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
24
+ end
25
+
26
+ it 'extracts ISBNs with hyphens' do
27
+ expect(described_class.extract('ISBN: 978-0-80-506909-9')).to contain_exactly('9780805069099')
28
+ end
29
+
30
+ it 'extracts ISBNs with Unicode dashes' do
31
+ expect(described_class.extract('ISBN: 978–0–80–506909–9')).to contain_exactly('9780805069099')
32
+ end
33
+
34
+ it 'extracts ISBNs with spaces' do
35
+ expect(described_class.extract('ISBN: 978 0 80 506909 9')).to contain_exactly('9780805069099')
36
+ end
37
+
38
+ it 'extracts ISBNs with Unicode spaces' do
39
+ expect(described_class.extract('ISBN: 978 0 80 506909 9')).to contain_exactly('9780805069099')
40
+ end
41
+
42
+ it 'extracts ISBN-13s from ISBN-As' do
43
+ expect(described_class.extract('10.978.8898392/315')).to contain_exactly('9788898392315')
44
+ end
45
+
46
+ it 'does not extract invalid ISBNs from ISBN-As' do
47
+ expect(described_class.extract('10.978.8898392/316')).to be_empty
48
+ end
49
+
50
+ it 'normalizes 10-digit ISBNs' do
51
+ str = "0-8050-6909-7 \n 2-7594-0269-X"
52
+
53
+ expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
54
+ end
55
+
56
+ it 'extracts multiple 10-digit ISBNs separated by a space' do
57
+ str = '0-8050-6909-7 2-7594-0269-X'
58
+
59
+ expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
60
+ end
61
+
62
+ it 'normalizes 10-digit ISBNs with Unicode dashes' do
63
+ expect(described_class.extract('0–8050–6909–7')).to contain_exactly('9780805069099')
64
+ end
65
+
66
+ it 'normalizes 10-digit ISBNs with a check digit of 10' do
67
+ expect(described_class.extract('4423272350')).to contain_exactly('9784423272350')
68
+ end
69
+
70
+ it 'normalizes 10-digit ISBNs with spaces' do
71
+ expect(described_class.extract('0 8050 6909 7')).to contain_exactly('9780805069099')
72
+ end
73
+
74
+ it 'normalizes 10-digit ISBNs with Unicode spaces' do
75
+ expect(described_class.extract('0 8050 6909 7')).to contain_exactly('9780805069099')
76
+ end
77
+
78
+ it 'normalizes 10-digit ISBNs with spaces and a check digit of X' do
79
+ expect(described_class.extract('2 7594 0269 X')).to contain_exactly('9782759402694')
80
+ end
81
+
82
+ it 'does not extract invalid 13-digit ISBNs' do
83
+ expect(described_class.extract('9783319217280')).to be_empty
84
+ end
85
+
86
+ it 'does not extract invalid 10-digit ISBNs' do
87
+ expect(described_class.extract('3319217280')).to be_empty
88
+ end
89
+
90
+ it 'does not extract ISBN-10s from hyphenated ISBN-13s' do
91
+ expect(described_class.extract('978-0-309-57079-4')).to contain_exactly('9780309570794')
92
+ end
93
+
94
+ it 'does not extract ISBN-10s from space-separated ISBN-13s' do
95
+ expect(described_class.extract('978 0 309 57079 4')).to contain_exactly('9780309570794')
96
+ end
97
+
98
+ it 'does not extract ISBN-13s from strings with inconsistent hyphenation' do
99
+ expect(described_class.extract('978-0 80-506909 9')).to be_empty
100
+ end
101
+
102
+ it 'does not extract ISBN-10s from strings with inconsistent hyphenation' do
103
+ expect(described_class.extract('0-8050 6909-7')).to be_empty
104
+ end
105
+
106
+ it 'does not extract ISBN-13s if they have more than five groups' do
107
+ expect(described_class.extract('978-0-80-506-909-9')).to be_empty
108
+ end
109
+
110
+ it 'does not extract ISBN-13s if they have less than five groups' do
111
+ expect(described_class.extract('978-0-80506909-9')).to be_empty
112
+ end
113
+
114
+ it 'does not extract ISBN-10s if they have more than four groups' do
115
+ expect(described_class.extract('0-8050-69-09-7')).to be_empty
116
+ end
117
+
118
+ it 'does not extract ISBN-10s if they have less than four groups' do
119
+ expect(described_class.extract('0-80506909-7')).to be_empty
120
+ end
121
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: identifiers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Hernandez
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2017-12-20 00:00:00.000000000 Z
12
+ date: 2018-03-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: urn