RubyGems - identifiers - Versions diffs - 0.10.0 → 0.11.0 - Mend

identifiers 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml +4 -4
data/CHANGELOG.md +6 -0
data/README.md +2 -2
data/lib/identifiers/doi.rb +5 -10
data/lib/identifiers/isbn.rb +107 -99
data/spec/identifiers/doi_spec.rb +6 -0
data/spec/identifiers/isbn_spec.rb +121 -97
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 612736c08372d3108c5b62dbab508417c3b67a73
-  data.tar.gz: a693d991efe3c913cce2fe6fc4496999c0883103
+  metadata.gz: ab7d56e3e6048da713104ab263f36bdd96a9da98
+  data.tar.gz: 4d2636ffdeeb7b3a1e563b36c761e39dc81f488e
 SHA512:
-  metadata.gz: 19ca1c46ff464e700b3158794edbcdb677d25fa2e3b7f557595d897e430a84e725d0d2a0ce0e8904af713c9adead71d26ff0dd7c72b788a033c451fec31ba73b
-  data.tar.gz: cfcfc156b3ca04b2becf36f810a0a6b3ad4e402a3eb6d1b2ad4642091b91c20cd64e5e739d30f2e4fb35b849c583e08cab5d4a8e06996ced17db3a593b6c61a8
+  metadata.gz: 3a62e98dcf6f35c180e1dd2c2e574f8993c6123a182e6fee782a96bc85e342de5b3894455f8fa99325100edcb8f3a9f0fa33d3135040b775bbf6e0b442bc1992
+  data.tar.gz: fcc0d3d178e49ee0eefed149200f4f8489d58c4cb953b21a0cab63067ae237432eee607e8e81460f70cbdba9ebb8d1d44c6f3d01fe82f16c9038a12f31aba609

data/CHANGELOG.md CHANGED Viewed

@@ -2,6 +2,11 @@
 All notable changes to this project will be documented in this file. This
 project adheres to [Semantic Versioning](http://semver.org/).
+## [0.11.0] - 2018-03-12
+## Fixed
+- Stricter ISBN extraction: consistent hyphenation (#27) and correct number of groups (#28)
+- Prevent stack overflow when extracting DOIS (#25)
 ## [0.10.0] - 2017-12-20
 ### Added
 - Extract PubMed IDs from URLs (e.g https://www.ncbi.nlm.nih.gov/pubmed/123456) and URIs with schemes `pmid:` and `info:pmid`
@@ -68,3 +73,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
 [0.9.0]: https://github.com/altmetric/identifiers/releases/tag/v0.9.0
 [0.9.1]: https://github.com/altmetric/identifiers/releases/tag/v0.9.1
 [0.10.0]: https://github.com/altmetric/identifiers/releases/tag/v0.10.0
+[0.11.0]: https://github.com/altmetric/identifiers/releases/tag/v0.11.0

data/README.md CHANGED Viewed

@@ -18,7 +18,7 @@ Collection of utilities related to the extraction, validation and normalization
 Add this line to your application's `Gemfile`:
 ```ruby
-gem 'identifiers', '~> 0.9'
+gem 'identifiers', '~> 0.11'
 ```
 And then execute:
@@ -79,6 +79,6 @@ We also maintain [a version of this library for PHP](https://github.com/altmetri
 ## License
-Copyright © 2016-2017 Altmetric LLP
+Copyright © 2016-2018 Altmetric LLP
 Distributed under the [MIT License](http://opensource.org/licenses/MIT).

data/lib/identifiers/doi.rb CHANGED Viewed

@@ -29,20 +29,15 @@ module Identifiers
     /x
     def self.extract(str)
-      str.to_s.downcase.scan(REGEXP).map { |doi| strip_punctuation(doi) }.compact
+      str.to_s.downcase.scan(REGEXP).map { |doi| extract_one(doi) }.compact
     end
     def self.extract_one(str)
-      match = str.to_s.downcase[REGEXP]
-      return unless match
+      while (match = str.to_s.downcase[REGEXP])
+        break match if match =~ VALID_ENDING
-      strip_punctuation(match)
-    end
-    def self.strip_punctuation(doi)
-      return doi if doi =~ VALID_ENDING
-      extract_one(doi.sub(/\p{Punct}\z/, ''))
+        str = match.sub(/\p{Punct}\z/, '')
+      end
     end
   end
 end

data/lib/identifiers/isbn.rb CHANGED Viewed

@@ -1,99 +1,107 @@
-module Identifiers
-  class ISBN
-    ISBN_13_REGEXP = /
-      \b
-      97[89]            # ISBN (GS1) Bookland prefix
-      [\p{Pd}\p{Zs}]?   # Optional hyphenation
-      (?:
-        \d              # Digit
-        [\p{Pd}\p{Zs}]? # Optional hyphenation
-      ){9}
-      \d                # Check digit
-      \b
-    /x
-    ISBN_10_REGEXP = /
-      (?<!              # Don't match a hyphenated or spaced ISBN-13
-        97[89]
-        [\p{Pd}\p{Zs}]
-      )
-      \b
-      (?:
-        \d              # Digit
-        [\p{Pd}\p{Zs}]? # Optional hyphenation
-      ){9}
-      [\dX]             # Check digit
-      \b
-    /x
-    ISBN_A_REGEXP = %r{
-      \b
-      (?<=10\.) # Directory indicator (always 10)
-      97[89]\.  # ISBN (GS1) Bookland prefix
-      \d{2,8}   # ISBN registration group element and publisher prefix
-      /         # Prefix/suffix divider
-      \d{1,7}   # ISBN title enumerator and check digit
-      \b
-    }x
-    def self.extract(str)
-      extract_isbn_as(str) + extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
-    end
-    def self.extract_isbn_as(str)
-      extract_thirteen_digit_isbns(str.to_s.scan(ISBN_A_REGEXP).join("\n").tr('/.', ''))
-    end
-    def self.extract_thirteen_digit_isbns(str)
-      str
-        .to_s
-        .scan(ISBN_13_REGEXP)
-        .map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
-        .select { |isbn| valid_isbn_13?(isbn) }
-    end
-    def self.extract_ten_digit_isbns(str)
-      str
-        .to_s
-        .scan(ISBN_10_REGEXP)
-        .map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
-        .select { |isbn| valid_isbn_10?(isbn) }
-        .map { |isbn|
-          isbn.chop!
-          isbn.prepend('978')
-          isbn << isbn_13_check_digit(isbn).to_s
-          isbn
-        }
-    end
-    def self.isbn_13_check_digit(isbn)
-      sum = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
-      check_digit = 10 - (sum % 10)
-      if check_digit == 10
-        0
-      else
-        check_digit
-      end
-    end
-    def self.valid_isbn_13?(isbn)
-      return false unless isbn =~ ISBN_13_REGEXP
-      result = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
-      (result % 10).zero?
-    end
-    def self.valid_isbn_10?(isbn)
-      return false unless isbn =~ ISBN_10_REGEXP
-      result = digits_of(isbn).with_index.map { |digit, weight| digit * weight.succ }.reduce(:+)
-      (result % 11).zero?
-    end
-    def self.digits_of(isbn)
-      isbn.to_s.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
-    end
-  end
-end
+module Identifiers
+  class ISBN
+    ISBN_13_REGEXP = /
+      \b
+      (
+        97[89]            # ISBN (GS1) Bookland prefix
+        ([\p{Pd}\p{Zs}])? # Optional hyphenation
+        (?:
+          \d              # Digit
+          \2?             # Optional hyphenation
+        ){9}
+        \d                # Check digit
+      )
+      \b
+    /x
+    ISBN_10_REGEXP = /
+      (?<!              # Don't match a hyphenated or spaced ISBN-13
+        97[89]
+        [\p{Pd}\p{Zs}]
+      )
+      \b
+      (
+        \d                # Digit
+        ([\p{Pd}\p{Zs}])? # Optional hyphenation
+        (?:
+          \d              # Digit
+          \2?             # Optional hyphenation
+        ){8}
+        [\dX]             # Check digit
+      )
+      \b
+    /x
+    ISBN_A_REGEXP = %r{
+      \b
+      (?<=10\.) # Directory indicator (always 10)
+      97[89]\.  # ISBN (GS1) Bookland prefix
+      \d{2,8}   # ISBN registration group element and publisher prefix
+      /         # Prefix/suffix divider
+      \d{1,7}   # ISBN title enumerator and check digit
+      \b
+    }x
+    def self.extract(str)
+      extract_isbn_as(str) + extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
+    end
+    def self.extract_isbn_as(str)
+      extract_thirteen_digit_isbns(str.to_s.scan(ISBN_A_REGEXP).join("\n").tr('/.', ''))
+    end
+    def self.extract_thirteen_digit_isbns(str)
+      str
+        .to_s
+        .scan(ISBN_13_REGEXP)
+        .select { |isbn, hyphen| !hyphen || isbn.count(hyphen) == 4 }
+        .map { |isbn, hyphen| isbn.delete(hyphen.to_s) }
+        .select { |isbn| valid_isbn_13?(isbn) }
+    end
+    def self.extract_ten_digit_isbns(str)
+      str
+        .to_s
+        .scan(ISBN_10_REGEXP)
+        .select { |isbn, hyphen| !hyphen || isbn.count(hyphen) == 3 }
+        .map { |isbn, hyphen| isbn.delete(hyphen.to_s) }
+        .select { |isbn| valid_isbn_10?(isbn) }
+        .map { |isbn|
+          isbn.chop!
+          isbn.prepend('978')
+          isbn << isbn_13_check_digit(isbn).to_s
+          isbn
+        }
+    end
+    def self.isbn_13_check_digit(isbn)
+      sum = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
+      check_digit = 10 - (sum % 10)
+      if check_digit == 10
+        0
+      else
+        check_digit
+      end
+    end
+    def self.valid_isbn_13?(isbn)
+      return false unless isbn =~ ISBN_13_REGEXP
+      result = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
+      (result % 10).zero?
+    end
+    def self.valid_isbn_10?(isbn)
+      return false unless isbn =~ ISBN_10_REGEXP
+      result = digits_of(isbn).with_index.map { |digit, weight| digit * weight.succ }.reduce(:+)
+      (result % 11).zero?
+    end
+    def self.digits_of(isbn)
+      isbn.to_s.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
+    end
+  end
+end

data/spec/identifiers/doi_spec.rb CHANGED Viewed

@@ -107,6 +107,12 @@ RSpec.describe Identifiers::DOI do
     expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
   end
+  it 'does not overflow when given lots of trailing punctuation' do
+    str = '10.1130/2013.2502' + ('.' * 10000)
+    expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
+  end
   it 'does not extract DOIs with purely punctuation suffixes' do
     expect(described_class.extract('10.1130/!).",')).to be_empty
   end

data/spec/identifiers/isbn_spec.rb CHANGED Viewed

@@ -1,97 +1,121 @@
-require 'identifiers/isbn'
-RSpec.describe Identifiers::ISBN do
-  it 'extracts a ISBN' do
-    expect(described_class.extract('ISBN: 9780805069099')).to contain_exactly('9780805069099')
-  end
-  it 'extracts ISBNs when given as a number' do
-    isbn = 9780805069099
-    expect(described_class.extract(isbn)).to contain_exactly('9780805069099')
-  end
-  it 'normalizes 13-digit ISBNs' do
-    str = "978-0-80-506909-9\n978-0-67-187919-8"
-    expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
-  end
-  it 'extracts multiple ISBN-13s separated by a space' do
-    str = '978-0-80-506909-9 978-0-67-187919-8'
-    expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
-  end
-  it 'extracts ISBNs with hyphens' do
-    expect(described_class.extract('ISBN: 978-0-80-506909-9')).to contain_exactly('9780805069099')
-  end
-  it 'extracts ISBNs with Unicode dashes' do
-    expect(described_class.extract('ISBN: 978–0–80–506909–9')).to contain_exactly('9780805069099')
-  end
-  it 'extracts ISBNs with spaces' do
-    expect(described_class.extract('ISBN: 978 0 80 506909 9')).to contain_exactly('9780805069099')
-  end
-  it 'extracts ISBNs with Unicode spaces' do
-    expect(described_class.extract('ISBN: 978 0 80 506909 9')).to contain_exactly('9780805069099')
-  end
-  it 'extracts ISBN-13s from ISBN-As' do
-    expect(described_class.extract('10.978.8898392/315')).to contain_exactly('9788898392315')
-  end
-  it 'does not extract invalid ISBNs from ISBN-As' do
-    expect(described_class.extract('10.978.8898392/316')).to be_empty
-  end
-  it 'normalizes 10-digit ISBNs' do
-    str = "0-8050-6909-7 \n 2-7594-0269-X"
-    expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
-  end
-  it 'extracts multiple 10-digit ISBNs separated by a space' do
-    str = '0-8050-6909-7 2-7594-0269-X'
-    expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
-  end
-  it 'normalizes 10-digit ISBNs with Unicode dashes' do
-    expect(described_class.extract('0–8050–6909–7')).to contain_exactly('9780805069099')
-  end
-  it 'normalizes 10-digit ISBNs with a check digit of 10' do
-    expect(described_class.extract('4423272350')).to contain_exactly('9784423272350')
-  end
-  it 'normalizes 10-digit ISBNs with spaces' do
-    expect(described_class.extract('0 8050 6909 7')).to contain_exactly('9780805069099')
-  end
-  it 'normalizes 10-digit ISBNs with Unicode spaces' do
-    expect(described_class.extract('0 8050 6909 7')).to contain_exactly('9780805069099')
-  end
-  it 'normalizes 10-digit ISBNs with spaces and a check digit of X' do
-    expect(described_class.extract('2 7594 0269 X')).to contain_exactly('9782759402694')
-  end
-  it 'does not extract invalid 13-digit ISBNs' do
-    expect(described_class.extract('9783319217280')).to be_empty
-  end
-  it 'does not extract invalid 10-digit ISBNs' do
-    expect(described_class.extract('3319217280')).to be_empty
-  end
-  it 'does not extract ISBN-10s from hyphenated ISBN-13s' do
-    expect(described_class.extract('978-0-309-57079-4')).to contain_exactly('9780309570794')
-  end
-  it 'does not extract ISBN-10s from space-separated ISBN-13s' do
-    expect(described_class.extract('978 0 309 57079 4')).to contain_exactly('9780309570794')
-  end
-end
+require 'identifiers/isbn'
+RSpec.describe Identifiers::ISBN do
+  it 'extracts a ISBN' do
+    expect(described_class.extract('ISBN: 9780805069099')).to contain_exactly('9780805069099')
+  end
+  it 'extracts ISBNs when given as a number' do
+    isbn = 9780805069099
+    expect(described_class.extract(isbn)).to contain_exactly('9780805069099')
+  end
+  it 'normalizes 13-digit ISBNs' do
+    str = "978-0-80-506909-9\n978-0-67-187919-8"
+    expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
+  end
+  it 'extracts multiple ISBN-13s separated by a space' do
+    str = '978-0-80-506909-9 978-0-67-187919-8'
+    expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
+  end
+  it 'extracts ISBNs with hyphens' do
+    expect(described_class.extract('ISBN: 978-0-80-506909-9')).to contain_exactly('9780805069099')
+  end
+  it 'extracts ISBNs with Unicode dashes' do
+    expect(described_class.extract('ISBN: 978–0–80–506909–9')).to contain_exactly('9780805069099')
+  end
+  it 'extracts ISBNs with spaces' do
+    expect(described_class.extract('ISBN: 978 0 80 506909 9')).to contain_exactly('9780805069099')
+  end
+  it 'extracts ISBNs with Unicode spaces' do
+    expect(described_class.extract('ISBN: 978 0 80 506909 9')).to contain_exactly('9780805069099')
+  end
+  it 'extracts ISBN-13s from ISBN-As' do
+    expect(described_class.extract('10.978.8898392/315')).to contain_exactly('9788898392315')
+  end
+  it 'does not extract invalid ISBNs from ISBN-As' do
+    expect(described_class.extract('10.978.8898392/316')).to be_empty
+  end
+  it 'normalizes 10-digit ISBNs' do
+    str = "0-8050-6909-7 \n 2-7594-0269-X"
+    expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
+  end
+  it 'extracts multiple 10-digit ISBNs separated by a space' do
+    str = '0-8050-6909-7 2-7594-0269-X'
+    expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
+  end
+  it 'normalizes 10-digit ISBNs with Unicode dashes' do
+    expect(described_class.extract('0–8050–6909–7')).to contain_exactly('9780805069099')
+  end
+  it 'normalizes 10-digit ISBNs with a check digit of 10' do
+    expect(described_class.extract('4423272350')).to contain_exactly('9784423272350')
+  end
+  it 'normalizes 10-digit ISBNs with spaces' do
+    expect(described_class.extract('0 8050 6909 7')).to contain_exactly('9780805069099')
+  end
+  it 'normalizes 10-digit ISBNs with Unicode spaces' do
+    expect(described_class.extract('0 8050 6909 7')).to contain_exactly('9780805069099')
+  end
+  it 'normalizes 10-digit ISBNs with spaces and a check digit of X' do
+    expect(described_class.extract('2 7594 0269 X')).to contain_exactly('9782759402694')
+  end
+  it 'does not extract invalid 13-digit ISBNs' do
+    expect(described_class.extract('9783319217280')).to be_empty
+  end
+  it 'does not extract invalid 10-digit ISBNs' do
+    expect(described_class.extract('3319217280')).to be_empty
+  end
+  it 'does not extract ISBN-10s from hyphenated ISBN-13s' do
+    expect(described_class.extract('978-0-309-57079-4')).to contain_exactly('9780309570794')
+  end
+  it 'does not extract ISBN-10s from space-separated ISBN-13s' do
+    expect(described_class.extract('978 0 309 57079 4')).to contain_exactly('9780309570794')
+  end
+  it 'does not extract ISBN-13s from strings with inconsistent hyphenation' do
+    expect(described_class.extract('978-0 80-506909 9')).to be_empty
+  end
+  it 'does not extract ISBN-10s from strings with inconsistent hyphenation' do
+    expect(described_class.extract('0-8050 6909-7')).to be_empty
+  end
+  it 'does not extract ISBN-13s if they have more than five groups' do
+    expect(described_class.extract('978-0-80-506-909-9')).to be_empty
+  end
+  it 'does not extract ISBN-13s if they have less than five groups' do
+    expect(described_class.extract('978-0-80506909-9')).to be_empty
+  end
+  it 'does not extract ISBN-10s if they have more than four groups' do
+    expect(described_class.extract('0-8050-69-09-7')).to be_empty
+  end
+  it 'does not extract ISBN-10s if they have less than four groups' do
+    expect(described_class.extract('0-80506909-7')).to be_empty
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: identifiers
 version: !ruby/object:Gem::Version
-  version: 0.10.0
+  version: 0.11.0
 platform: ruby
 authors:
 - Jonathan Hernandez
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-12-20 00:00:00.000000000 Z
+date: 2018-03-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: urn