RubyGems - identifiers - Versions diffs - 0.8.1 → 0.9.0 - Mend

identifiers 0.8.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/README.md +1 -1
data/lib/identifiers.rb +1 -4
data/lib/identifiers/ads_bibcode.rb +1 -1
data/lib/identifiers/arxiv_id.rb +24 -6
data/lib/identifiers/doi.rb +3 -8
data/lib/identifiers/handle.rb +1 -1
data/lib/identifiers/isbn.rb +49 -14
data/lib/identifiers/national_clinical_trial_id.rb +1 -1
data/lib/identifiers/orcid.rb +3 -3
data/lib/identifiers/pubmed_id.rb +1 -3
data/lib/identifiers/repec_id.rb +4 -1
data/spec/identifiers/ads_bibcode_spec.rb +4 -0
data/spec/identifiers/arxiv_id_spec.rb +4 -0
data/spec/identifiers/handle_spec.rb +4 -0
data/spec/identifiers/isbn_spec.rb +18 -0
data/spec/identifiers/national_clinical_trial_id_spec.rb +4 -0
data/spec/identifiers/pubmed_id_spec.rb +4 -0
data/spec/identifiers/repec_id_spec.rb +4 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ade5848785ab153a6cb5e1b2cffdd05958879943
-  data.tar.gz: 2b0b6fa55d97c5ae2990d606b210168d33dc5dae
+  metadata.gz: 17d7278289ac40fc4fa68b488e85c98ada612f6f
+  data.tar.gz: e846cd17ac15a7cb87705c288eb8c41336721183
 SHA512:
-  metadata.gz: bbdd699cd75aef87f0318a54acb55784ead6e3ea2f603a5c3e5437ef7d358222842d2892f6ec24e0fc59119be3d621d41227497d1bd3cbafdb159612331a4ccd
-  data.tar.gz: c26923e8c6c7153fae0dc793eacd6b34e355f78360deee5a47f0c2922076b9e91c5b79c76884f0b3cf0747c93b6c3e2da97090e51fb1114f100ae9d7ae2514d4
+  metadata.gz: 5a978d601dee61a25f5f66330e6e9bcd995fe1f217bdf43284350cc678574a1890a25ff289ad50ccebb25545da98acd459cd421e98e32d4e3c5266a41a84eb69
+  data.tar.gz: 30ef27cea3bc785f8f0ed841ac1f8cbfaa1e261b8a00afa9f0d942db0b8bc913f4e39f6ea91068f8eb7054a4cae1d04b17bd47956d3c7759d5ff9d5c2bed01b3

data/CHANGELOG.md CHANGED Viewed

@@ -2,6 +2,10 @@
 All notable changes to this project will be documented in this file. This
 project adheres to [Semantic Versioning](http://semver.org/).
+## [0.9.0] - 2017-07-31
+### Added
+- Support extraction of multiple ISBNs separated by a single space
 ## [0.8.1] - 2017-04-10
 ### Fixed
 - Fixed extraction of multiple DOIs separated by Unicode whitespace
@@ -52,3 +56,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
 [0.7.0]: https://github.com/altmetric/identifiers/releases/tag/v0.7.0
 [0.8.0]: https://github.com/altmetric/identifiers/releases/tag/v0.8.0
 [0.8.1]: https://github.com/altmetric/identifiers/releases/tag/v0.8.1
+[0.9.0]: https://github.com/altmetric/identifiers/releases/tag/v0.9.0

data/README.md CHANGED Viewed

@@ -18,7 +18,7 @@ Collection of utilities related to the extraction, validation and normalization
 Add this line to your application's `Gemfile`:
 ```ruby
-gem 'identifiers', '~> 0.8'
+gem 'identifiers', '~> 0.9'
 ```
 And then execute:

data/lib/identifiers.rb CHANGED Viewed

@@ -4,10 +4,7 @@ require 'identifiers/doi'
 require 'identifiers/handle'
 require 'identifiers/isbn'
 require 'identifiers/national_clinical_trial_id'
+require 'identifiers/orcid'
 require 'identifiers/pubmed_id'
 require 'identifiers/repec_id'
 require 'identifiers/urn'
-require 'identifiers/orcid'
-module Identifiers
-end

data/lib/identifiers/ads_bibcode.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module Identifiers
   class AdsBibcode
     def self.extract(str)
-      str.scan(/\b\d{4}[a-z][0-9a-z&.]{14}\b/i)
+      str.to_s.scan(/\b\d{4}[a-z][0-9a-z&.]{14}\b/i)
     end
   end
 end

data/lib/identifiers/arxiv_id.rb CHANGED Viewed

@@ -1,19 +1,37 @@
 module Identifiers
   class ArxivId
+    POST_2007_REGEXP = %r{
+      (?<=^|[[:space:]/])  # Look-behind for the start of the string, whitespace or a forward slash
+      (?:arXiv:)?       # Optional arXiv scheme
+      \d{4}             # YYMM (two-digit year and two-digit month number)
+      \.
+      \d{4,5}           # Zero-padded sequence number of 4- or 5-digits
+      (?:v\d+)?         # Literal v followed by version number of 1 or more digits
+      (?=$|[[:space:]])    # Look-ahead for end of string or whitespace
+    }xi
+    PRE_2007_REGEXP = %r{
+      (?<=^|[[:space:]/])  # Look-behind for the start of the string, whitespace or a forward slash
+      (?:arXiv:)?       # Optional arXiv scheme
+      [a-z-]+           # Archive (e.g. "math")
+      (?:\.[A-Z]{2})?   # Subject class (where applicable)
+      /
+      \d{2}             # Year
+      (?:0[1-9]|1[012]) # Month
+      \d{3}             # Number
+      (?:v\d+)?         # Literal v followed by version number of 1 or more digits
+      (?=$|[[:space:]])    # Look-ahead for end of string or whitespace
+    }xi
     def self.extract(str)
       extract_pre_2007_arxiv_ids(str) + extract_post_2007_arxiv_ids(str)
     end
     def self.extract_post_2007_arxiv_ids(str)
-      str
-        .scan(%r{(?<=^|[[:space:]/])(?:arXiv:)?\d{4}\.\d{4,5}(?:v\d+)?(?=$|[[:space:]])}i)
-        .map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
+      str.to_s.scan(POST_2007_REGEXP).map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
     end
     def self.extract_pre_2007_arxiv_ids(str)
-      str
-        .scan(%r{(?<=^|[[:space:]/])(?:arXiv:)?[a-z-]+(?:\.[A-Z]{2})?/\d{2}(?:0[1-9]|1[012])\d{3}(?:v\d+)?(?=$|[[:space:]])}i)
-        .map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
+      str.to_s.scan(PRE_2007_REGEXP).map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
     end
   end
 end

data/lib/identifiers/doi.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Identifiers
   class DOI
-    PATTERN = %r{
+    REGEXP = %r{
       \b
       10 # Directory indicator (always 10)
       \.
@@ -29,16 +29,11 @@ module Identifiers
     /x
     def self.extract(str)
-      str
-        .to_s
-        .downcase
-        .scan(PATTERN)
-        .map { |doi| strip_punctuation(doi) }
-        .compact
+      str.to_s.downcase.scan(REGEXP).map { |doi| strip_punctuation(doi) }.compact
     end
     def self.extract_one(str)
-      match = str.to_s.downcase[PATTERN]
+      match = str.to_s.downcase[REGEXP]
       return unless match
       strip_punctuation(match)

data/lib/identifiers/handle.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module Identifiers
   class Handle
     def self.extract(str)
-      str.scan(%r{\b[0-9.]+/[^[:space:]]+\b}i)
+      str.to_s.scan(%r{\b[0-9.]+/[^[:space:]]+\b}i)
     end
   end
 end

data/lib/identifiers/isbn.rb CHANGED Viewed

@@ -1,29 +1,64 @@
 module Identifiers
   class ISBN
-    REGEX_13 = /\b97[89]\d{10}\b/
-    REGEX_10 = /\b\d{9}(?:\d|X)\b/
-    REGEX_A = %r{\b(?<=10\.)97[89]\.\d{2,8}/\d{1,7}\b}
+    ISBN_13_REGEXP = /
+      \b
+      97[89]            # ISBN (GS1) Bookland prefix
+      [\p{Pd}\p{Zs}]?   # Optional hyphenation
+      (?:
+        \d              # Digit
+        [\p{Pd}\p{Zs}]? # Optional hyphenation
+      ){9}
+      \d                # Check digit
+      \b
+    /x
+    ISBN_10_REGEXP = /
+      \b
+      (?:
+        \d              # Digit
+        [\p{Pd}\p{Zs}]? # Optional hyphenation
+      ){9}
+      [\dX]             # Check digit
+      \b
+    /x
+    ISBN_A_REGEXP = %r{
+      \b
+      (?<=10\.) # Directory indicator (always 10)
+      97[89]\.  # ISBN (GS1) Bookland prefix
+      \d{2,8}   # ISBN registration group element and publisher prefix
+      /         # Prefix/suffix divider
+      \d{1,7}   # ISBN title enumerator and check digit
+      \b
+    }x
     def self.extract(str)
       extract_isbn_as(str) + extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
     end
     def self.extract_isbn_as(str)
-      extract_thirteen_digit_isbns(str.scan(REGEX_A).join("\n").tr('/.', ''))
+      extract_thirteen_digit_isbns(str.to_s.scan(ISBN_A_REGEXP).join("\n").tr('/.', ''))
     end
     def self.extract_thirteen_digit_isbns(str)
-      str.gsub(/(?<=\d)[\p{Pd}\p{Zs}](?=\d)/, '').scan(REGEX_13).select { |isbn| valid_isbn_13?(isbn) }
+      str
+        .to_s
+        .scan(ISBN_13_REGEXP)
+        .map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
+        .select { |isbn| valid_isbn_13?(isbn) }
     end
     def self.extract_ten_digit_isbns(str)
-      str.gsub(/(?<=\d)[\p{Pd}\p{Zs}](?=[\dX])/i, '').scan(REGEX_10).select { |isbn| valid_isbn_10?(isbn) }.map { |isbn|
-        isbn.chop!
-        isbn.prepend('978')
-        isbn << isbn_13_check_digit(isbn).to_s
+      str
+        .to_s
+        .scan(ISBN_10_REGEXP)
+        .map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
+        .select { |isbn| valid_isbn_10?(isbn) }
+        .map { |isbn|
+          isbn.chop!
+          isbn.prepend('978')
+          isbn << isbn_13_check_digit(isbn).to_s
-        isbn
-      }
+          isbn
+        }
     end
     def self.isbn_13_check_digit(isbn)
@@ -38,7 +73,7 @@ module Identifiers
     end
     def self.valid_isbn_13?(isbn)
-      return false unless isbn =~ REGEX_13
+      return false unless isbn =~ ISBN_13_REGEXP
       result = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
@@ -46,7 +81,7 @@ module Identifiers
     end
     def self.valid_isbn_10?(isbn)
-      return false unless isbn =~ REGEX_10
+      return false unless isbn =~ ISBN_10_REGEXP
       result = digits_of(isbn).with_index.map { |digit, weight| digit * weight.succ }.reduce(:+)
@@ -54,7 +89,7 @@ module Identifiers
     end
     def self.digits_of(isbn)
-      isbn.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
+      isbn.to_s.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
     end
   end
 end

data/lib/identifiers/national_clinical_trial_id.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module Identifiers
   class NationalClinicalTrialId
     def self.extract(str)
-      str.scan(/\bNCT\d+\b/i).map(&:upcase)
+      str.to_s.scan(/\bNCT\d+\b/i).map(&:upcase)
     end
   end
 end

data/lib/identifiers/orcid.rb CHANGED Viewed

@@ -1,9 +1,9 @@
 module Identifiers
   class ORCID
-    REGEX = /\b(?:\d{4}-){3}\d{3}[\dx]\b/i
+    REGEXP = /\b(?:\d{4}-){3}\d{3}[\dx]\b/i
     def self.extract(str)
-      str.scan(REGEX).select { |orcid| valid?(orcid) }.map(&:upcase)
+      str.to_s.scan(REGEXP).select { |orcid| valid?(orcid) }.map(&:upcase)
     end
     def self.valid?(str)
@@ -14,7 +14,7 @@ module Identifiers
     end
     def self.calculate_digit(str)
-      return unless str =~ REGEX
+      return unless str =~ REGEXP
       base_digits = str.chop.tr('-', '')
       total = 0

data/lib/identifiers/pubmed_id.rb CHANGED Viewed

@@ -1,9 +1,7 @@
 module Identifiers
   class PubmedId
     def self.extract(str)
-      str
-        .scan(/(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])/)
-        .flatten
+      str.to_s.scan(/(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])/).flatten
     end
   end
 end

data/lib/identifiers/repec_id.rb CHANGED Viewed

@@ -1,7 +1,10 @@
 module Identifiers
   class RepecId
     def self.extract(str)
-      str.scan(/\brepec:[^[:space:]]+\b/i).map { |repec| "RePEc:#{repec.split(':', 2).last}" }
+      str
+        .to_s
+        .scan(/\brepec:[^[:space:]]+\b/i)
+        .map { |repec| "RePEc:#{repec.split(':', 2).last}" }
     end
   end
 end

data/spec/identifiers/ads_bibcode_spec.rb CHANGED Viewed

@@ -15,5 +15,9 @@ RSpec.describe Identifiers::AdsBibcode do
     it 'does not extract Bibcodes from DOIs' do
       expect(described_class.extract('10.1097/01.ASW.0000443266.17665.19')).to be_empty
     end
+    it 'returns no Bibcode if nothing is given' do
+      expect(described_class.extract(nil)).to be_empty
+    end
   end
 end

data/spec/identifiers/arxiv_id_spec.rb CHANGED Viewed

@@ -22,6 +22,10 @@ RSpec.describe Identifiers::ArxivId do
       expect(described_class.extract('10.2310/7290.2014.00033')).to be_empty
     end
+    it 'extracts nothing from empty arguments' do
+      expect(described_class.extract(nil)).to be_empty
+    end
     it 'extracts a post 2007 arXiv ID surrounded by Unicode whitespace' do
       expect(described_class.extract('Example: arXiv:0706.0001 ')).to contain_exactly('0706.0001')
     end

data/spec/identifiers/handle_spec.rb CHANGED Viewed

@@ -18,4 +18,8 @@ RSpec.describe Identifiers::Handle do
     expect(described_class.extract(str)).to contain_exactly('10149/596901', '10251/79612')
   end
+  it 'extracts nothing from empty arguments' do
+    expect(described_class.extract(nil)).to be_empty
+  end
 end

data/spec/identifiers/isbn_spec.rb CHANGED Viewed

@@ -5,12 +5,24 @@ RSpec.describe Identifiers::ISBN do
     expect(described_class.extract('ISBN: 9780805069099')).to contain_exactly('9780805069099')
   end
+  it 'extracts ISBNs when given as a number' do
+    isbn = 9780805069099
+    expect(described_class.extract(isbn)).to contain_exactly('9780805069099')
+  end
   it 'normalizes 13-digit ISBNs' do
     str = "978-0-80-506909-9\n978-0-67-187919-8"
     expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
   end
+  it 'extracts multiple ISBN-13s separated by a space' do
+    str = '978-0-80-506909-9 978-0-67-187919-8'
+    expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
+  end
   it 'extracts ISBNs with hyphens' do
     expect(described_class.extract('ISBN: 978-0-80-506909-9')).to contain_exactly('9780805069099')
   end
@@ -41,6 +53,12 @@ RSpec.describe Identifiers::ISBN do
     expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
   end
+  it 'extracts multiple 10-digit ISBNs separated by a space' do
+    str = '0-8050-6909-7 2-7594-0269-X'
+    expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
+  end
   it 'normalizes 10-digit ISBNs with Unicode dashes' do
     expect(described_class.extract('0–8050–6909–7')).to contain_exactly('9780805069099')
   end

data/spec/identifiers/national_clinical_trial_id_spec.rb CHANGED Viewed

@@ -8,4 +8,8 @@ RSpec.describe Identifiers::NationalClinicalTrialId do
   it 'normalizes NCT IDs' do
     expect(described_class.extract("nct00000106\nnCt00000107")).to contain_exactly('NCT00000106', 'NCT00000107')
   end
+  it 'does not match anything with empty arguments' do
+    expect(described_class.extract(nil)).to be_empty
+  end
 end

data/spec/identifiers/pubmed_id_spec.rb CHANGED Viewed

@@ -22,4 +22,8 @@ RSpec.describe Identifiers::PubmedId do
   it 'extracts PubMed IDs separated by Unicode whitespace' do
     expect(described_class.extract('123 456')).to contain_exactly('123', '456')
   end
+  it 'considers Fixnum as potential PubmedIds too' do
+    expect(described_class.extract(123)).to contain_exactly('123')
+  end
 end

data/spec/identifiers/repec_id_spec.rb CHANGED Viewed

@@ -18,4 +18,8 @@ RSpec.describe Identifiers::RepecId do
     expect(described_class.extract(str)).to contain_exactly('RePEc:wbk:wbpubs:2266', 'RePEc:inn:wpaper:2016-03')
   end
+  it 'extracts nothing when given empty arguments' do
+    expect(described_class.extract(nil)).to be_empty
+  end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: identifiers
 version: !ruby/object:Gem::Version
-  version: 0.8.1
+  version: 0.9.0
 platform: ruby
 authors:
 - Jonathan Hernandez
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-04-10 00:00:00.000000000 Z
+date: 2017-07-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: urn