RubyGems - identifiers - Versions diffs - 0.8.1 → 0.9.0 - Mend

identifiers 0.8.1 → 0.9.0

Files changed (21)

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/README.md +1 -1
data/lib/identifiers.rb +1 -4
data/lib/identifiers/ads_bibcode.rb +1 -1
data/lib/identifiers/arxiv_id.rb +24 -6
data/lib/identifiers/doi.rb +3 -8
data/lib/identifiers/handle.rb +1 -1
data/lib/identifiers/isbn.rb +49 -14
data/lib/identifiers/national_clinical_trial_id.rb +1 -1
data/lib/identifiers/orcid.rb +3 -3
data/lib/identifiers/pubmed_id.rb +1 -3
data/lib/identifiers/repec_id.rb +4 -1
data/spec/identifiers/ads_bibcode_spec.rb +4 -0
data/spec/identifiers/arxiv_id_spec.rb +4 -0
data/spec/identifiers/handle_spec.rb +4 -0
data/spec/identifiers/isbn_spec.rb +18 -0
data/spec/identifiers/national_clinical_trial_id_spec.rb +4 -0
data/spec/identifiers/pubmed_id_spec.rb +4 -0
data/spec/identifiers/repec_id_spec.rb +4 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ade5848785ab153a6cb5e1b2cffdd05958879943
-  data.tar.gz: 2b0b6fa55d97c5ae2990d606b210168d33dc5dae
+  metadata.gz: 17d7278289ac40fc4fa68b488e85c98ada612f6f
+  data.tar.gz: e846cd17ac15a7cb87705c288eb8c41336721183
 SHA512:
-  metadata.gz: bbdd699cd75aef87f0318a54acb55784ead6e3ea2f603a5c3e5437ef7d358222842d2892f6ec24e0fc59119be3d621d41227497d1bd3cbafdb159612331a4ccd
-  data.tar.gz: c26923e8c6c7153fae0dc793eacd6b34e355f78360deee5a47f0c2922076b9e91c5b79c76884f0b3cf0747c93b6c3e2da97090e51fb1114f100ae9d7ae2514d4
+  metadata.gz: 5a978d601dee61a25f5f66330e6e9bcd995fe1f217bdf43284350cc678574a1890a25ff289ad50ccebb25545da98acd459cd421e98e32d4e3c5266a41a84eb69
+  data.tar.gz: 30ef27cea3bc785f8f0ed841ac1f8cbfaa1e261b8a00afa9f0d942db0b8bc913f4e39f6ea91068f8eb7054a4cae1d04b17bd47956d3c7759d5ff9d5c2bed01b3

data/CHANGELOG.md CHANGED Viewed

@@ -2,6 +2,10 @@
 All notable changes to this project will be documented in this file. This
 project adheres to [Semantic Versioning](http://semver.org/).
+## [0.9.0] - 2017-07-31
+### Added
+- Support extraction of multiple ISBNs separated by a single space
 ## [0.8.1] - 2017-04-10
 ### Fixed
 - Fixed extraction of multiple DOIs separated by Unicode whitespace
@@ -52,3 +56,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
 [0.7.0]: https://github.com/altmetric/identifiers/releases/tag/v0.7.0
 [0.8.0]: https://github.com/altmetric/identifiers/releases/tag/v0.8.0
 [0.8.1]: https://github.com/altmetric/identifiers/releases/tag/v0.8.1
+[0.9.0]: https://github.com/altmetric/identifiers/releases/tag/v0.9.0

data/README.md CHANGED Viewed

@@ -18,7 +18,7 @@ Collection of utilities related to the extraction, validation and normalization
 Add this line to your application's `Gemfile`:
 ```ruby
-gem 'identifiers', '~> 0.8'
+gem 'identifiers', '~> 0.9'
 ```
 And then execute:

data/lib/identifiers.rb CHANGED Viewed

@@ -4,10 +4,7 @@ require 'identifiers/doi'
 require 'identifiers/handle'
 require 'identifiers/isbn'
 require 'identifiers/national_clinical_trial_id'
+require 'identifiers/orcid'
 require 'identifiers/pubmed_id'
 require 'identifiers/repec_id'
 require 'identifiers/urn'
-require 'identifiers/orcid'
-module Identifiers
-end

data/lib/identifiers/ads_bibcode.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module Identifiers
   class AdsBibcode
     def self.extract(str)
-      str.scan(/\b\d{4}[a-z][0-9a-z&.]{14}\b/i)
+      str.to_s.scan(/\b\d{4}[a-z][0-9a-z&.]{14}\b/i)
     end
   end
 end

data/lib/identifiers/arxiv_id.rb CHANGED Viewed

@@ -1,19 +1,37 @@
 module Identifiers
   class ArxivId
+    POST_2007_REGEXP = %r{
+      (?<=^|[[:space:]/])  # Look-behind for the start of the string, whitespace or a forward slash
+      (?:arXiv:)?       # Optional arXiv scheme
+      \d{4}             # YYMM (two-digit year and two-digit month number)
+      \.
+      \d{4,5}           # Zero-padded sequence number of 4- or 5-digits
+      (?:v\d+)?         # Literal v followed by version number of 1 or more digits
+      (?=$|[[:space:]])    # Look-ahead for end of string or whitespace
+    }xi
+    PRE_2007_REGEXP = %r{
+      (?<=^|[[:space:]/])  # Look-behind for the start of the string, whitespace or a forward slash
+      (?:arXiv:)?       # Optional arXiv scheme
+      [a-z-]+           # Archive (e.g. "math")
+      (?:\.[A-Z]{2})?   # Subject class (where applicable)
+      /
+      \d{2}             # Year
+      (?:0[1-9]|1[012]) # Month
+      \d{3}             # Number
+      (?:v\d+)?         # Literal v followed by version number of 1 or more digits
+      (?=$|[[:space:]])    # Look-ahead for end of string or whitespace
+    }xi
     def self.extract(str)
       extract_pre_2007_arxiv_ids(str) + extract_post_2007_arxiv_ids(str)
     end
     def self.extract_post_2007_arxiv_ids(str)
-      str
-        .scan(%r{(?<=^|[[:space:]/])(?:arXiv:)?\d{4}\.\d{4,5}(?:v\d+)?(?=$|[[:space:]])}i)
-        .map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
+      str.to_s.scan(POST_2007_REGEXP).map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
     end
     def self.extract_pre_2007_arxiv_ids(str)
-      str
-        .scan(%r{(?<=^|[[:space:]/])(?:arXiv:)?[a-z-]+(?:\.[A-Z]{2})?/\d{2}(?:0[1-9]|1[012])\d{3}(?:v\d+)?(?=$|[[:space:]])}i)
-        .map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
+      str.to_s.scan(PRE_2007_REGEXP).map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
     end
   end
 end

data/lib/identifiers/doi.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Identifiers
   class DOI
-    PATTERN = %r{
+    REGEXP = %r{
       \b
       10 # Directory indicator (always 10)
       \.
@@ -29,16 +29,11 @@ module Identifiers
     /x
     def self.extract(str)
-      str
-        .to_s
-        .downcase
-        .scan(PATTERN)
-        .map { |doi| strip_punctuation(doi) }
-        .compact
+      str.to_s.downcase.scan(REGEXP).map { |doi| strip_punctuation(doi) }.compact
     end
     def self.extract_one(str)
-      match = str.to_s.downcase[PATTERN]
+      match = str.to_s.downcase[REGEXP]
       return unless match
       strip_punctuation(match)

data/lib/identifiers/handle.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module Identifiers
   class Handle
     def self.extract(str)
-      str.scan(%r{\b[0-9.]+/[^[:space:]]+\b}i)
+      str.to_s.scan(%r{\b[0-9.]+/[^[:space:]]+\b}i)
     end
   end
 end

data/lib/identifiers/isbn.rb CHANGED Viewed

@@ -1,29 +1,64 @@
 module Identifiers
   class ISBN
-    REGEX_13 = /\b97[89]\d{10}\b/
-    REGEX_10 = /\b\d{9}(?:\d|X)\b/
-    REGEX_A = %r{\b(?<=10\.)97[89]\.\d{2,8}/\d{1,7}\b}
+    ISBN_13_REGEXP = /
+      \b
+      97[89]            # ISBN (GS1) Bookland prefix
+      [\p{Pd}\p{Zs}]?   # Optional hyphenation
+      (?:
+        \d              # Digit
+        [\p{Pd}\p{Zs}]? # Optional hyphenation
+      ){9}
+      \d                # Check digit
+      \b
+    /x
+    ISBN_10_REGEXP = /
+      \b
+      (?:
+        \d              # Digit
+        [\p{Pd}\p{Zs}]? # Optional hyphenation
+      ){9}
+      [\dX]             # Check digit
+      \b
+    /x
+    ISBN_A_REGEXP = %r{
+      \b
+      (?<=10\.) # Directory indicator (always 10)
+      97[89]\.  # ISBN (GS1) Bookland prefix
+      \d{2,8}   # ISBN registration group element and publisher prefix
+      /         # Prefix/suffix divider
+      \d{1,7}   # ISBN title enumerator and check digit
+      \b
+    }x
     def self.extract(str)
       extract_isbn_as(str) + extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
     end
     def self.extract_isbn_as(str)
-      extract_thirteen_digit_isbns(str.scan(REGEX_A).join("\n").tr('/.', ''))
+      extract_thirteen_digit_isbns(str.to_s.scan(ISBN_A_REGEXP).join("\n").tr('/.', ''))
     end
     def self.extract_thirteen_digit_isbns(str)
-      str.gsub(/(?<=\d)[\p{Pd}\p{Zs}](?=\d)/, '').scan(REGEX_13).select { |isbn| valid_isbn_13?(isbn) }
+      str
+        .to_s
+        .scan(ISBN_13_REGEXP)
+        .map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
+        .select { |isbn| valid_isbn_13?(isbn) }
     end
     def self.extract_ten_digit_isbns(str)
-      str.gsub(/(?<=\d)[\p{Pd}\p{Zs}](?=[\dX])/i, '').scan(REGEX_10).select { |isbn| valid_isbn_10?(isbn) }.map { |isbn|
-        isbn.chop!
-        isbn.prepend('978')
-        isbn << isbn_13_check_digit(isbn).to_s
+      str
+        .to_s
+        .scan(ISBN_10_REGEXP)
+        .map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
+        .select { |isbn| valid_isbn_10?(isbn) }
+        .map { |isbn|
+          isbn.chop!
+          isbn.prepend('978')
+          isbn << isbn_13_check_digit(isbn).to_s
-        isbn
-      }
+          isbn
+        }
     end
     def self.isbn_13_check_digit(isbn)
@@ -38,7 +73,7 @@ module Identifiers
     end
     def self.valid_isbn_13?(isbn)
-      return false unless isbn =~ REGEX_13
+      return false unless isbn =~ ISBN_13_REGEXP
       result = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
@@ -46,7 +81,7 @@ module Identifiers
     end
     def self.valid_isbn_10?(isbn)
-      return false unless isbn =~ REGEX_10
+      return false unless isbn =~ ISBN_10_REGEXP
       result = digits_of(isbn).with_index.map { |digit, weight| digit * weight.succ }.reduce(:+)
@@ -54,7 +89,7 @@ module Identifiers
     end
     def self.digits_of(isbn)
-      isbn.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
+      isbn.to_s.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
     end
   end
 end

data/lib/identifiers/national_clinical_trial_id.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module Identifiers
   class NationalClinicalTrialId
     def self.extract(str)
-      str.scan(/\bNCT\d+\b/i).map(&:upcase)
+      str.to_s.scan(/\bNCT\d+\b/i).map(&:upcase)
     end
   end
 end

data/lib/identifiers/orcid.rb CHANGED Viewed

@@ -1,9 +1,9 @@
 module Identifiers
   class ORCID
-    REGEX = /\b(?:\d{4}-){3}\d{3}[\dx]\b/i
+    REGEXP = /\b(?:\d{4}-){3}\d{3}[\dx]\b/i
     def self.extract(str)
-      str.scan(REGEX).select { |orcid| valid?(orcid) }.map(&:upcase)
+      str.to_s.scan(REGEXP).select { |orcid| valid?(orcid) }.map(&:upcase)
     end
     def self.valid?(str)
@@ -14,7 +14,7 @@ module Identifiers
     end
     def self.calculate_digit(str)
-      return unless str =~ REGEX
+      return unless str =~ REGEXP
       base_digits = str.chop.tr('-', '')
       total = 0

data/lib/identifiers/pubmed_id.rb CHANGED Viewed

@@ -1,9 +1,7 @@
 module Identifiers
   class PubmedId
     def self.extract(str)
-      str
-        .scan(/(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])/)
-        .flatten
+      str.to_s.scan(/(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])/).flatten
     end
   end
 end

data/lib/identifiers/repec_id.rb CHANGED Viewed

@@ -1,7 +1,10 @@
 module Identifiers
   class RepecId
     def self.extract(str)
-      str.scan(/\brepec:[^[:space:]]+\b/i).map { |repec| "RePEc:#{repec.split(':', 2).last}" }
+      str
+        .to_s
+        .scan(/\brepec:[^[:space:]]+\b/i)
+        .map { |repec| "RePEc:#{repec.split(':', 2).last}" }
     end
   end
 end

data/spec/identifiers/ads_bibcode_spec.rb CHANGED Viewed

@@ -15,5 +15,9 @@ RSpec.describe Identifiers::AdsBibcode do
     it 'does not extract Bibcodes from DOIs' do
       expect(described_class.extract('10.1097/01.ASW.0000443266.17665.19')).to be_empty
     end
+    it 'returns no Bibcode if nothing is given' do
+      expect(described_class.extract(nil)).to be_empty
+    end
   end
 end

data/spec/identifiers/arxiv_id_spec.rb CHANGED Viewed

@@ -22,6 +22,10 @@ RSpec.describe Identifiers::ArxivId do
       expect(described_class.extract('10.2310/7290.2014.00033')).to be_empty
     end
+    it 'extracts nothing from empty arguments' do
+      expect(described_class.extract(nil)).to be_empty
+    end
     it 'extracts a post 2007 arXiv ID surrounded by Unicode whitespace' do
       expect(described_class.extract('Example: arXiv:0706.0001 ')).to contain_exactly('0706.0001')
     end

data/spec/identifiers/handle_spec.rb CHANGED Viewed

@@ -18,4 +18,8 @@ RSpec.describe Identifiers::Handle do
     expect(described_class.extract(str)).to contain_exactly('10149/596901', '10251/79612')
   end
+  it 'extracts nothing from empty arguments' do
+    expect(described_class.extract(nil)).to be_empty
+  end
 end

data/spec/identifiers/isbn_spec.rb CHANGED Viewed

@@ -5,12 +5,24 @@ RSpec.describe Identifiers::ISBN do
     expect(described_class.extract('ISBN: 9780805069099')).to contain_exactly('9780805069099')
   end
+  it 'extracts ISBNs when given as a number' do
+    isbn = 9780805069099
+    expect(described_class.extract(isbn)).to contain_exactly('9780805069099')
+  end
   it 'normalizes 13-digit ISBNs' do
     str = "978-0-80-506909-9\n978-0-67-187919-8"
     expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
   end
+  it 'extracts multiple ISBN-13s separated by a space' do
+    str = '978-0-80-506909-9 978-0-67-187919-8'
+    expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
+  end
   it 'extracts ISBNs with hyphens' do
     expect(described_class.extract('ISBN: 978-0-80-506909-9')).to contain_exactly('9780805069099')
   end
@@ -41,6 +53,12 @@ RSpec.describe Identifiers::ISBN do
     expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
   end
+  it 'extracts multiple 10-digit ISBNs separated by a space' do
+    str = '0-8050-6909-7 2-7594-0269-X'
+    expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
+  end
   it 'normalizes 10-digit ISBNs with Unicode dashes' do
     expect(described_class.extract('0–8050–6909–7')).to contain_exactly('9780805069099')
   end

data/spec/identifiers/national_clinical_trial_id_spec.rb CHANGED Viewed

@@ -8,4 +8,8 @@ RSpec.describe Identifiers::NationalClinicalTrialId do
   it 'normalizes NCT IDs' do
     expect(described_class.extract("nct00000106\nnCt00000107")).to contain_exactly('NCT00000106', 'NCT00000107')
   end
+  it 'does not match anything with empty arguments' do
+    expect(described_class.extract(nil)).to be_empty
+  end
 end

data/spec/identifiers/pubmed_id_spec.rb CHANGED Viewed

@@ -22,4 +22,8 @@ RSpec.describe Identifiers::PubmedId do
   it 'extracts PubMed IDs separated by Unicode whitespace' do
     expect(described_class.extract('123 456')).to contain_exactly('123', '456')
   end
+  it 'considers Fixnum as potential PubmedIds too' do
+    expect(described_class.extract(123)).to contain_exactly('123')
+  end
 end

data/spec/identifiers/repec_id_spec.rb CHANGED Viewed

@@ -18,4 +18,8 @@ RSpec.describe Identifiers::RepecId do
     expect(described_class.extract(str)).to contain_exactly('RePEc:wbk:wbpubs:2266', 'RePEc:inn:wpaper:2016-03')
   end
+  it 'extracts nothing when given empty arguments' do
+    expect(described_class.extract(nil)).to be_empty
+  end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: identifiers
 version: !ruby/object:Gem::Version
-  version: 0.8.1
+  version: 0.9.0
 platform: ruby
 authors:
 - Jonathan Hernandez
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-04-10 00:00:00.000000000 Z
+date: 2017-07-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: urn