identifiers 0.10.0 → 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 612736c08372d3108c5b62dbab508417c3b67a73
4
- data.tar.gz: a693d991efe3c913cce2fe6fc4496999c0883103
3
+ metadata.gz: ab7d56e3e6048da713104ab263f36bdd96a9da98
4
+ data.tar.gz: 4d2636ffdeeb7b3a1e563b36c761e39dc81f488e
5
5
  SHA512:
6
- metadata.gz: 19ca1c46ff464e700b3158794edbcdb677d25fa2e3b7f557595d897e430a84e725d0d2a0ce0e8904af713c9adead71d26ff0dd7c72b788a033c451fec31ba73b
7
- data.tar.gz: cfcfc156b3ca04b2becf36f810a0a6b3ad4e402a3eb6d1b2ad4642091b91c20cd64e5e739d30f2e4fb35b849c583e08cab5d4a8e06996ced17db3a593b6c61a8
6
+ metadata.gz: 3a62e98dcf6f35c180e1dd2c2e574f8993c6123a182e6fee782a96bc85e342de5b3894455f8fa99325100edcb8f3a9f0fa33d3135040b775bbf6e0b442bc1992
7
+ data.tar.gz: fcc0d3d178e49ee0eefed149200f4f8489d58c4cb953b21a0cab63067ae237432eee607e8e81460f70cbdba9ebb8d1d44c6f3d01fe82f16c9038a12f31aba609
data/CHANGELOG.md CHANGED
@@ -2,6 +2,11 @@
2
2
  All notable changes to this project will be documented in this file. This
3
3
  project adheres to [Semantic Versioning](http://semver.org/).
4
4
 
5
+ ## [0.11.0] - 2018-03-12
6
+ ### Fixed
7
+ - Stricter ISBN extraction: consistent hyphenation (#27) and correct number of groups (#28)
8
+ - Prevent stack overflow when extracting DOIs (#25)
9
+
5
10
  ## [0.10.0] - 2017-12-20
6
11
  ### Added
7
12
  - Extract PubMed IDs from URLs (e.g. https://www.ncbi.nlm.nih.gov/pubmed/123456) and URIs with schemes `pmid:` and `info:pmid`
@@ -68,3 +73,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
68
73
  [0.9.0]: https://github.com/altmetric/identifiers/releases/tag/v0.9.0
69
74
  [0.9.1]: https://github.com/altmetric/identifiers/releases/tag/v0.9.1
70
75
  [0.10.0]: https://github.com/altmetric/identifiers/releases/tag/v0.10.0
76
+ [0.11.0]: https://github.com/altmetric/identifiers/releases/tag/v0.11.0
data/README.md CHANGED
@@ -18,7 +18,7 @@ Collection of utilities related to the extraction, validation and normalization
18
18
  Add this line to your application's `Gemfile`:
19
19
 
20
20
  ```ruby
21
- gem 'identifiers', '~> 0.9'
21
+ gem 'identifiers', '~> 0.11'
22
22
  ```
23
23
 
24
24
  And then execute:
@@ -79,6 +79,6 @@ We also maintain [a version of this library for PHP](https://github.com/altmetri
79
79
 
80
80
  ## License
81
81
 
82
- Copyright © 2016-2017 Altmetric LLP
82
+ Copyright © 2016-2018 Altmetric LLP
83
83
 
84
84
  Distributed under the [MIT License](http://opensource.org/licenses/MIT).
@@ -29,20 +29,15 @@ module Identifiers
29
29
  /x
30
30
 
31
31
  def self.extract(str)
32
- str.to_s.downcase.scan(REGEXP).map { |doi| strip_punctuation(doi) }.compact
32
+ str.to_s.downcase.scan(REGEXP).map { |doi| extract_one(doi) }.compact
33
33
  end
34
34
 
35
35
  def self.extract_one(str)
36
- match = str.to_s.downcase[REGEXP]
37
- return unless match
36
+ while (match = str.to_s.downcase[REGEXP])
37
+ break match if match =~ VALID_ENDING
38
38
 
39
- strip_punctuation(match)
40
- end
41
-
42
- def self.strip_punctuation(doi)
43
- return doi if doi =~ VALID_ENDING
44
-
45
- extract_one(doi.sub(/\p{Punct}\z/, ''))
39
+ str = match.sub(/\p{Punct}\z/, '')
40
+ end
46
41
  end
47
42
  end
48
43
  end
@@ -1,99 +1,107 @@
1
- module Identifiers
2
- class ISBN
3
- ISBN_13_REGEXP = /
4
- \b
5
- 97[89] # ISBN (GS1) Bookland prefix
6
- [\p{Pd}\p{Zs}]? # Optional hyphenation
7
- (?:
8
- \d # Digit
9
- [\p{Pd}\p{Zs}]? # Optional hyphenation
10
- ){9}
11
- \d # Check digit
12
- \b
13
- /x
14
- ISBN_10_REGEXP = /
15
- (?<! # Don't match a hyphenated or spaced ISBN-13
16
- 97[89]
17
- [\p{Pd}\p{Zs}]
18
- )
19
- \b
20
- (?:
21
- \d # Digit
22
- [\p{Pd}\p{Zs}]? # Optional hyphenation
23
- ){9}
24
- [\dX] # Check digit
25
- \b
26
- /x
27
- ISBN_A_REGEXP = %r{
28
- \b
29
- (?<=10\.) # Directory indicator (always 10)
30
- 97[89]\. # ISBN (GS1) Bookland prefix
31
- \d{2,8} # ISBN registration group element and publisher prefix
32
- / # Prefix/suffix divider
33
- \d{1,7} # ISBN title enumerator and check digit
34
- \b
35
- }x
36
-
37
- def self.extract(str)
38
- extract_isbn_as(str) + extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
39
- end
40
-
41
- def self.extract_isbn_as(str)
42
- extract_thirteen_digit_isbns(str.to_s.scan(ISBN_A_REGEXP).join("\n").tr('/.', ''))
43
- end
44
-
45
- def self.extract_thirteen_digit_isbns(str)
46
- str
47
- .to_s
48
- .scan(ISBN_13_REGEXP)
49
- .map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
50
- .select { |isbn| valid_isbn_13?(isbn) }
51
- end
52
-
53
- def self.extract_ten_digit_isbns(str)
54
- str
55
- .to_s
56
- .scan(ISBN_10_REGEXP)
57
- .map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
58
- .select { |isbn| valid_isbn_10?(isbn) }
59
- .map { |isbn|
60
- isbn.chop!
61
- isbn.prepend('978')
62
- isbn << isbn_13_check_digit(isbn).to_s
63
-
64
- isbn
65
- }
66
- end
67
-
68
- def self.isbn_13_check_digit(isbn)
69
- sum = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
70
- check_digit = 10 - (sum % 10)
71
-
72
- if check_digit == 10
73
- 0
74
- else
75
- check_digit
76
- end
77
- end
78
-
79
- def self.valid_isbn_13?(isbn)
80
- return false unless isbn =~ ISBN_13_REGEXP
81
-
82
- result = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
83
-
84
- (result % 10).zero?
85
- end
86
-
87
- def self.valid_isbn_10?(isbn)
88
- return false unless isbn =~ ISBN_10_REGEXP
89
-
90
- result = digits_of(isbn).with_index.map { |digit, weight| digit * weight.succ }.reduce(:+)
91
-
92
- (result % 11).zero?
93
- end
94
-
95
- def self.digits_of(isbn)
96
- isbn.to_s.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
97
- end
98
- end
99
- end
1
+ module Identifiers
2
+ class ISBN
3
+ ISBN_13_REGEXP = /
4
+ \b
5
+ (
6
+ 97[89] # ISBN (GS1) Bookland prefix
7
+ ([\p{Pd}\p{Zs}])? # Optional hyphenation
8
+ (?:
9
+ \d # Digit
10
+ \2? # Optional hyphenation
11
+ ){9}
12
+ \d # Check digit
13
+ )
14
+ \b
15
+ /x
16
+ ISBN_10_REGEXP = /
17
+ (?<! # Don't match a hyphenated or spaced ISBN-13
18
+ 97[89]
19
+ [\p{Pd}\p{Zs}]
20
+ )
21
+ \b
22
+ (
23
+ \d # Digit
24
+ ([\p{Pd}\p{Zs}])? # Optional hyphenation
25
+ (?:
26
+ \d # Digit
27
+ \2? # Optional hyphenation
28
+ ){8}
29
+ [\dX] # Check digit
30
+ )
31
+ \b
32
+ /x
33
+ ISBN_A_REGEXP = %r{
34
+ \b
35
+ (?<=10\.) # Directory indicator (always 10)
36
+ 97[89]\. # ISBN (GS1) Bookland prefix
37
+ \d{2,8} # ISBN registration group element and publisher prefix
38
+ / # Prefix/suffix divider
39
+ \d{1,7} # ISBN title enumerator and check digit
40
+ \b
41
+ }x
42
+
43
+ def self.extract(str)
44
+ extract_isbn_as(str) + extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
45
+ end
46
+
47
+ def self.extract_isbn_as(str)
48
+ extract_thirteen_digit_isbns(str.to_s.scan(ISBN_A_REGEXP).join("\n").tr('/.', ''))
49
+ end
50
+
51
+ def self.extract_thirteen_digit_isbns(str)
52
+ str
53
+ .to_s
54
+ .scan(ISBN_13_REGEXP)
55
+ .select { |isbn, hyphen| !hyphen || isbn.count(hyphen) == 4 }
56
+ .map { |isbn, hyphen| isbn.delete(hyphen.to_s) }
57
+ .select { |isbn| valid_isbn_13?(isbn) }
58
+ end
59
+
60
+ def self.extract_ten_digit_isbns(str)
61
+ str
62
+ .to_s
63
+ .scan(ISBN_10_REGEXP)
64
+ .select { |isbn, hyphen| !hyphen || isbn.count(hyphen) == 3 }
65
+ .map { |isbn, hyphen| isbn.delete(hyphen.to_s) }
66
+ .select { |isbn| valid_isbn_10?(isbn) }
67
+ .map { |isbn|
68
+ isbn.chop!
69
+ isbn.prepend('978')
70
+ isbn << isbn_13_check_digit(isbn).to_s
71
+
72
+ isbn
73
+ }
74
+ end
75
+
76
+ def self.isbn_13_check_digit(isbn)
77
+ sum = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
78
+ check_digit = 10 - (sum % 10)
79
+
80
+ if check_digit == 10
81
+ 0
82
+ else
83
+ check_digit
84
+ end
85
+ end
86
+
87
+ def self.valid_isbn_13?(isbn)
88
+ return false unless isbn =~ ISBN_13_REGEXP
89
+
90
+ result = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
91
+
92
+ (result % 10).zero?
93
+ end
94
+
95
+ def self.valid_isbn_10?(isbn)
96
+ return false unless isbn =~ ISBN_10_REGEXP
97
+
98
+ result = digits_of(isbn).with_index.map { |digit, weight| digit * weight.succ }.reduce(:+)
99
+
100
+ (result % 11).zero?
101
+ end
102
+
103
+ def self.digits_of(isbn)
104
+ isbn.to_s.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
105
+ end
106
+ end
107
+ end
@@ -107,6 +107,12 @@ RSpec.describe Identifiers::DOI do
107
107
  expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
108
108
  end
109
109
 
110
+ it 'does not overflow when given lots of trailing punctuation' do
111
+ str = '10.1130/2013.2502' + ('.' * 10000)
112
+
113
+ expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
114
+ end
115
+
110
116
  it 'does not extract DOIs with purely punctuation suffixes' do
111
117
  expect(described_class.extract('10.1130/!).",')).to be_empty
112
118
  end
@@ -1,97 +1,121 @@
1
- require 'identifiers/isbn'
2
-
3
- RSpec.describe Identifiers::ISBN do
4
- it 'extracts a ISBN' do
5
- expect(described_class.extract('ISBN: 9780805069099')).to contain_exactly('9780805069099')
6
- end
7
-
8
- it 'extracts ISBNs when given as a number' do
9
- isbn = 9780805069099
10
-
11
- expect(described_class.extract(isbn)).to contain_exactly('9780805069099')
12
- end
13
-
14
- it 'normalizes 13-digit ISBNs' do
15
- str = "978-0-80-506909-9\n978-0-67-187919-8"
16
-
17
- expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
18
- end
19
-
20
- it 'extracts multiple ISBN-13s separated by a space' do
21
- str = '978-0-80-506909-9 978-0-67-187919-8'
22
-
23
- expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
24
- end
25
-
26
- it 'extracts ISBNs with hyphens' do
27
- expect(described_class.extract('ISBN: 978-0-80-506909-9')).to contain_exactly('9780805069099')
28
- end
29
-
30
- it 'extracts ISBNs with Unicode dashes' do
31
- expect(described_class.extract('ISBN: 978–0–80–506909–9')).to contain_exactly('9780805069099')
32
- end
33
-
34
- it 'extracts ISBNs with spaces' do
35
- expect(described_class.extract('ISBN: 978 0 80 506909 9')).to contain_exactly('9780805069099')
36
- end
37
-
38
- it 'extracts ISBNs with Unicode spaces' do
39
- expect(described_class.extract('ISBN: 978 0 80 506909 9')).to contain_exactly('9780805069099')
40
- end
41
-
42
- it 'extracts ISBN-13s from ISBN-As' do
43
- expect(described_class.extract('10.978.8898392/315')).to contain_exactly('9788898392315')
44
- end
45
-
46
- it 'does not extract invalid ISBNs from ISBN-As' do
47
- expect(described_class.extract('10.978.8898392/316')).to be_empty
48
- end
49
-
50
- it 'normalizes 10-digit ISBNs' do
51
- str = "0-8050-6909-7 \n 2-7594-0269-X"
52
-
53
- expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
54
- end
55
-
56
- it 'extracts multiple 10-digit ISBNs separated by a space' do
57
- str = '0-8050-6909-7 2-7594-0269-X'
58
-
59
- expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
60
- end
61
-
62
- it 'normalizes 10-digit ISBNs with Unicode dashes' do
63
- expect(described_class.extract('0–8050–6909–7')).to contain_exactly('9780805069099')
64
- end
65
-
66
- it 'normalizes 10-digit ISBNs with a check digit of 10' do
67
- expect(described_class.extract('4423272350')).to contain_exactly('9784423272350')
68
- end
69
-
70
- it 'normalizes 10-digit ISBNs with spaces' do
71
- expect(described_class.extract('0 8050 6909 7')).to contain_exactly('9780805069099')
72
- end
73
-
74
- it 'normalizes 10-digit ISBNs with Unicode spaces' do
75
- expect(described_class.extract('0 8050 6909 7')).to contain_exactly('9780805069099')
76
- end
77
-
78
- it 'normalizes 10-digit ISBNs with spaces and a check digit of X' do
79
- expect(described_class.extract('2 7594 0269 X')).to contain_exactly('9782759402694')
80
- end
81
-
82
- it 'does not extract invalid 13-digit ISBNs' do
83
- expect(described_class.extract('9783319217280')).to be_empty
84
- end
85
-
86
- it 'does not extract invalid 10-digit ISBNs' do
87
- expect(described_class.extract('3319217280')).to be_empty
88
- end
89
-
90
- it 'does not extract ISBN-10s from hyphenated ISBN-13s' do
91
- expect(described_class.extract('978-0-309-57079-4')).to contain_exactly('9780309570794')
92
- end
93
-
94
- it 'does not extract ISBN-10s from space-separated ISBN-13s' do
95
- expect(described_class.extract('978 0 309 57079 4')).to contain_exactly('9780309570794')
96
- end
97
- end
1
+ require 'identifiers/isbn'
2
+
3
+ RSpec.describe Identifiers::ISBN do
4
+ it 'extracts a ISBN' do
5
+ expect(described_class.extract('ISBN: 9780805069099')).to contain_exactly('9780805069099')
6
+ end
7
+
8
+ it 'extracts ISBNs when given as a number' do
9
+ isbn = 9780805069099
10
+
11
+ expect(described_class.extract(isbn)).to contain_exactly('9780805069099')
12
+ end
13
+
14
+ it 'normalizes 13-digit ISBNs' do
15
+ str = "978-0-80-506909-9\n978-0-67-187919-8"
16
+
17
+ expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
18
+ end
19
+
20
+ it 'extracts multiple ISBN-13s separated by a space' do
21
+ str = '978-0-80-506909-9 978-0-67-187919-8'
22
+
23
+ expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
24
+ end
25
+
26
+ it 'extracts ISBNs with hyphens' do
27
+ expect(described_class.extract('ISBN: 978-0-80-506909-9')).to contain_exactly('9780805069099')
28
+ end
29
+
30
+ it 'extracts ISBNs with Unicode dashes' do
31
+ expect(described_class.extract('ISBN: 978–0–80–506909–9')).to contain_exactly('9780805069099')
32
+ end
33
+
34
+ it 'extracts ISBNs with spaces' do
35
+ expect(described_class.extract('ISBN: 978 0 80 506909 9')).to contain_exactly('9780805069099')
36
+ end
37
+
38
+ it 'extracts ISBNs with Unicode spaces' do
39
+ expect(described_class.extract('ISBN: 978 0 80 506909 9')).to contain_exactly('9780805069099')
40
+ end
41
+
42
+ it 'extracts ISBN-13s from ISBN-As' do
43
+ expect(described_class.extract('10.978.8898392/315')).to contain_exactly('9788898392315')
44
+ end
45
+
46
+ it 'does not extract invalid ISBNs from ISBN-As' do
47
+ expect(described_class.extract('10.978.8898392/316')).to be_empty
48
+ end
49
+
50
+ it 'normalizes 10-digit ISBNs' do
51
+ str = "0-8050-6909-7 \n 2-7594-0269-X"
52
+
53
+ expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
54
+ end
55
+
56
+ it 'extracts multiple 10-digit ISBNs separated by a space' do
57
+ str = '0-8050-6909-7 2-7594-0269-X'
58
+
59
+ expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
60
+ end
61
+
62
+ it 'normalizes 10-digit ISBNs with Unicode dashes' do
63
+ expect(described_class.extract('0–8050–6909–7')).to contain_exactly('9780805069099')
64
+ end
65
+
66
+ it 'normalizes 10-digit ISBNs with a check digit of 10' do
67
+ expect(described_class.extract('4423272350')).to contain_exactly('9784423272350')
68
+ end
69
+
70
+ it 'normalizes 10-digit ISBNs with spaces' do
71
+ expect(described_class.extract('0 8050 6909 7')).to contain_exactly('9780805069099')
72
+ end
73
+
74
+ it 'normalizes 10-digit ISBNs with Unicode spaces' do
75
+ expect(described_class.extract('0 8050 6909 7')).to contain_exactly('9780805069099')
76
+ end
77
+
78
+ it 'normalizes 10-digit ISBNs with spaces and a check digit of X' do
79
+ expect(described_class.extract('2 7594 0269 X')).to contain_exactly('9782759402694')
80
+ end
81
+
82
+ it 'does not extract invalid 13-digit ISBNs' do
83
+ expect(described_class.extract('9783319217280')).to be_empty
84
+ end
85
+
86
+ it 'does not extract invalid 10-digit ISBNs' do
87
+ expect(described_class.extract('3319217280')).to be_empty
88
+ end
89
+
90
+ it 'does not extract ISBN-10s from hyphenated ISBN-13s' do
91
+ expect(described_class.extract('978-0-309-57079-4')).to contain_exactly('9780309570794')
92
+ end
93
+
94
+ it 'does not extract ISBN-10s from space-separated ISBN-13s' do
95
+ expect(described_class.extract('978 0 309 57079 4')).to contain_exactly('9780309570794')
96
+ end
97
+
98
+ it 'does not extract ISBN-13s from strings with inconsistent hyphenation' do
99
+ expect(described_class.extract('978-0 80-506909 9')).to be_empty
100
+ end
101
+
102
+ it 'does not extract ISBN-10s from strings with inconsistent hyphenation' do
103
+ expect(described_class.extract('0-8050 6909-7')).to be_empty
104
+ end
105
+
106
+ it 'does not extract ISBN-13s if they have more than five groups' do
107
+ expect(described_class.extract('978-0-80-506-909-9')).to be_empty
108
+ end
109
+
110
+ it 'does not extract ISBN-13s if they have less than five groups' do
111
+ expect(described_class.extract('978-0-80506909-9')).to be_empty
112
+ end
113
+
114
+ it 'does not extract ISBN-10s if they have more than four groups' do
115
+ expect(described_class.extract('0-8050-69-09-7')).to be_empty
116
+ end
117
+
118
+ it 'does not extract ISBN-10s if they have less than four groups' do
119
+ expect(described_class.extract('0-80506909-7')).to be_empty
120
+ end
121
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: identifiers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Hernandez
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2017-12-20 00:00:00.000000000 Z
12
+ date: 2018-03-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: urn