RubyGems - identifiers - Versions diffs - 0.11.0 → 0.12.0 - Mend

identifiers 0.11.0 → 0.12.0

Files changed (9) hide show

checksums.yaml +5 -5
data/CHANGELOG.md +9 -1
data/README.md +1 -1
data/lib/identifiers/doi.rb +18 -27
data/lib/identifiers/pubmed_id.rb +11 -11
data/spec/identifiers/doi_spec.rb +26 -2
data/spec/identifiers/pubmed_id_spec.rb +99 -99
data/spec/spec_helper.rb +18 -18
metadata +9 -9

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: ab7d56e3e6048da713104ab263f36bdd96a9da98
-  data.tar.gz: 4d2636ffdeeb7b3a1e563b36c761e39dc81f488e
+SHA256:
+  metadata.gz: 6ec6f86cbb54595ef76ab6615ba9e3a5f671478daebf31412f6b6820f1cb020d
+  data.tar.gz: 63881a8f3863926d66005133ba9fa0cdf471a7446c2dffa12534ac540911207e
 SHA512:
-  metadata.gz: 3a62e98dcf6f35c180e1dd2c2e574f8993c6123a182e6fee782a96bc85e342de5b3894455f8fa99325100edcb8f3a9f0fa33d3135040b775bbf6e0b442bc1992
-  data.tar.gz: fcc0d3d178e49ee0eefed149200f4f8489d58c4cb953b21a0cab63067ae237432eee607e8e81460f70cbdba9ebb8d1d44c6f3d01fe82f16c9038a12f31aba609
+  metadata.gz: 97b2352aa4a99ec8cf5158e1239f273e10f28321df11941445890549b827365e7a8ba1fb54d4a268bd146889f94aaa4fec0135ec1b67f779094557c6987bc466
+  data.tar.gz: f0e6a040a480bf9cf42900997415f8341fb7f68274434c936e725bafab484ce703fc6f0e21c5e317a85456c9fc0c2305e80ba9b1bae99cc7e6ba33fa40a890cc

data/CHANGELOG.md CHANGED Viewed

@@ -2,8 +2,15 @@
 All notable changes to this project will be documented in this file. This
 project adheres to [Semantic Versioning](http://semver.org/).
+## [0.12.0] - 2018-04-06
+### Added
+- Added support for extracting more old Wiley DOIs.
+### Changed
+- Performance improvements when extracting DOIs with trailing punctuation.
 ## [0.11.0] - 2018-03-12
-## Fixed
+### Fixed
 - Stricter ISBN extraction: consistent hyphenation (#27) and correct number of groups (#28)
 - Prevent stack overflow when extracting DOIs (#25)
@@ -74,3 +81,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
 [0.9.1]: https://github.com/altmetric/identifiers/releases/tag/v0.9.1
 [0.10.0]: https://github.com/altmetric/identifiers/releases/tag/v0.10.0
 [0.11.0]: https://github.com/altmetric/identifiers/releases/tag/v0.11.0
+[0.12.0]: https://github.com/altmetric/identifiers/releases/tag/v0.12.0

data/README.md CHANGED Viewed

@@ -18,7 +18,7 @@ Collection of utilities related to the extraction, validation and normalization
 Add this line to your application's `Gemfile`:
 ```ruby
-gem 'identifiers', '~> 0.11'
+gem 'identifiers', '~> 0.12'
 ```
 And then execute:

data/lib/identifiers/doi.rb CHANGED Viewed

@@ -2,42 +2,33 @@ module Identifiers
   class DOI
     REGEXP = %r{
       \b
-      10 # Directory indicator (always 10)
+      10                                        # Directory indicator (always 10)
       \.
       (?:
         # ISBN-A
-        97[89]\. # ISBN (GS1) Bookland prefix
-        \d{2,8}  # ISBN registration group element and publisher prefix
-        /        # Prefix/suffix divider
-        \d{1,7}  # ISBN title enumerator and check digit
+        97[89]\.                                # ISBN (GS1) Bookland prefix
+        \d{2,8}                                 # ISBN registration group element and publisher prefix
+        /                                       # Prefix/suffix divider
+        \d{1,7}                                 # ISBN title enumerator and check digit
         |
         # DOI
-        \d{4,9}       # Registrant code
-        /             # Prefix/suffix divider
-        [^[:space:]]+ # DOI suffix
+        \d{4,9}                                 # Registrant code
+        /                                       # Prefix/suffix divider
+        (?:
+          # DOI suffix
+          [^[:space:]]+;2-[\#0-9a-z]            # Early Wiley suffix
+          |
+          [^[:space:]]+                         # Suffix...
+          \([^[:space:])]+\)                    # Ending in balanced parentheses...
+          (?![^[:space:]\p{P}])                 # Not followed by more suffix or punctuation
+          |
+          [^[:space:]]+(?![[:space:]])\p{^P}    # Suffix ending in non-punctuation
+        )
       )
     }x
-    VALID_ENDING = /
-      (?:
-        \p{^Punct} # Non-punctuation character
-        |
-        \(.+\)     # Balanced parentheses
-        |
-        2-\#       # Early Wiley DOI suffix
-      )
-      \z
-    /x
     def self.extract(str)
-      str.to_s.downcase.scan(REGEXP).map { |doi| extract_one(doi) }.compact
-    end
-    def self.extract_one(str)
-      while (match = str.to_s.downcase[REGEXP])
-        break match if match =~ VALID_ENDING
-        str = match.sub(/\p{Punct}\z/, '')
-      end
+      str.to_s.downcase.scan(REGEXP)
     end
   end
 end

data/lib/identifiers/pubmed_id.rb CHANGED Viewed

@@ -1,11 +1,11 @@
-module Identifiers
-  class PubmedId
-    ZERO_PADDED_NUMBER = %r{(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])}
-    PUBMED_URL = %r{(?:https?://(?:www\.)?ncbi\.nlm\.nih\.gov/(?:m/)?pubmed/|pmid:|info:pmid/)0*(\d+)}i
-    def self.extract(str)
-      str = str.to_s
-      str.scan(ZERO_PADDED_NUMBER).flatten | str.scan(PUBMED_URL).flatten
-    end
-  end
-end
+module Identifiers
+  class PubmedId
+    ZERO_PADDED_NUMBER = %r{(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])}
+    PUBMED_URL = %r{(?:https?://(?:www\.)?ncbi\.nlm\.nih\.gov/(?:m/)?pubmed/|pmid:|info:pmid/)0*(\d+)}i
+    def self.extract(str)
+      str = str.to_s
+      str.scan(ZERO_PADDED_NUMBER).flatten | str.scan(PUBMED_URL).flatten
+    end
+  end
+end

data/spec/identifiers/doi_spec.rb CHANGED Viewed

@@ -66,9 +66,9 @@ RSpec.describe Identifiers::DOI do
   end
   it 'extracts old Wiley DOIs' do
-    str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-#'
+    str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-# 10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5'
-    expect(described_class.extract(str)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#')
+    expect(described_class.extract(str)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#', '10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5')
   end
   it 'does not extract a closing parenthesis if not part of the DOI' do
@@ -124,4 +124,28 @@ RSpec.describe Identifiers::DOI do
   it 'extracts DOIs separated by Unicode whitespace' do
     expect(described_class.extract('10.1234/foo  10.1234/bar')).to contain_exactly('10.1234/foo', '10.1234/bar')
   end
+  it 'does not extract DOIs with extra digits prefixed' do
+    expect(described_class.extract('110.1234/foo')).to be_empty
+  end
+  it 'extracts DOIs from a string with trailing closing parentheses' do
+    expect(described_class.extract('10.1130/2013.2502(04))')).to contain_exactly('10.1130/2013.2502(04)')
+  end
+  it 'extracts DOIs from a string with multiple trailing closing parentheses' do
+    expect(described_class.extract('10.1130/2013.2502(04))))')).to contain_exactly('10.1130/2013.2502(04)')
+  end
+  it 'extracts DOIs with parentheses within the suffix' do
+    expect(described_class.extract('10.1016/0005-2744(70)90072-0')).to contain_exactly('10.1016/0005-2744(70)90072-0')
+  end
+  it 'extracts all DOIs from a Crossref sample' do
+    Pathname.new(__FILE__).join('..', '..', 'fixtures', 'dois.txt').each_line do |doi|
+      doi.chomp!
+      expect(described_class.extract(doi)).to contain_exactly(doi)
+    end
+  end
 end

data/spec/identifiers/pubmed_id_spec.rb CHANGED Viewed

@@ -1,99 +1,99 @@
-require 'identifiers/pubmed_id'
-RSpec.describe Identifiers::PubmedId do
-  it 'extracts PubMed IDs' do
-    expect(described_class.extract("123\n456")).to contain_exactly('123', '456')
-  end
-  it 'extracts PubMed IDs from a PubMed URL with www' do
-    url = 'http://www.ncbi.nlm.nih.gov/pubmed/123456'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts PubMed IDs from a PubMed URL with www and https' do
-    url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts PubMed IDs from a PubMed URL without www' do
-    url = 'http://ncbi.nlm.nih.gov/pubmed/123456'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts PubMed IDs from a PubMed URL without www but with https' do
-    url = 'https://ncbi.nlm.nih.gov/pubmed/123456'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts PubMed IDs from a PubMed mobile URL' do
-    url = 'https://www.ncbi.nlm.nih.gov/m/pubmed/123456'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts PubMed IDs from a PubMed URL with hash parameters' do
-    url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456#cm6191871_69589'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts PubMed IDs from a PubMed URL with query parameters' do
-    url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456?hi=hello&goodbye=bye'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts zero leading PubMed IDs from a PubMed URL with query parameters' do
-    url = 'https://www.ncbi.nlm.nih.gov/pubmed/00123456?hi=hello&goodbye=bye'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts both number and URLs PubMed IDs' do
-    url = 'PubMed ID: 112233 another: https://www.ncbi.nlm.nih.gov/pubmed/123456'
-    expect(described_class.extract(url)).to contain_exactly('112233', '123456')
-  end
-  it 'does not return outputs with PubMed IDs in DOIs' do
-    str = "10.1038/nplants.2015.3\n10.1126/science.286.5445.1679e"
-    expect(described_class.extract(str)).to be_empty
-  end
-  it 'strips leading 0s' do
-    expect(described_class.extract("0000010203\n000456000")).to contain_exactly('10203', '456000')
-  end
-  it 'does not consider 0 as a valid PubMed ID' do
-    expect(described_class.extract('00000000')).to be_empty
-  end
-  it 'extracts PubMed IDs separated by Unicode whitespace' do
-    expect(described_class.extract('123 456')).to contain_exactly('123', '456')
-  end
-  it 'considers Fixnum as potential PubMed IDs too' do
-    expect(described_class.extract(123)).to contain_exactly('123')
-  end
-  it 'extracts PubMed IDs with pmid scheme' do
-    expect(described_class.extract('pmid:123')).to contain_exactly('123')
-  end
-  it 'strips leading zeroes from pmid scheme' do
-    expect(described_class.extract('pmid:000123')).to contain_exactly('123')
-  end
-  it 'extracts PubMed IDs with info pmid scheme' do
-    expect(described_class.extract('info:pmid/123')).to contain_exactly('123')
-  end
-  it 'strips leading zeroes from info pmid scheme' do
-    expect(described_class.extract('info:pmid/000123')).to contain_exactly('123')
-  end
-end
+require 'identifiers/pubmed_id'
+RSpec.describe Identifiers::PubmedId do
+  it 'extracts PubMed IDs' do
+    expect(described_class.extract("123\n456")).to contain_exactly('123', '456')
+  end
+  it 'extracts PubMed IDs from a PubMed URL with www' do
+    url = 'http://www.ncbi.nlm.nih.gov/pubmed/123456'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts PubMed IDs from a PubMed URL with www and https' do
+    url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts PubMed IDs from a PubMed URL without www' do
+    url = 'http://ncbi.nlm.nih.gov/pubmed/123456'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts PubMed IDs from a PubMed URL without www but with https' do
+    url = 'https://ncbi.nlm.nih.gov/pubmed/123456'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts PubMed IDs from a PubMed mobile URL' do
+    url = 'https://www.ncbi.nlm.nih.gov/m/pubmed/123456'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts PubMed IDs from a PubMed URL with hash parameters' do
+    url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456#cm6191871_69589'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts PubMed IDs from a PubMed URL with query parameters' do
+    url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456?hi=hello&goodbye=bye'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts zero leading PubMed IDs from a PubMed URL with query parameters' do
+    url = 'https://www.ncbi.nlm.nih.gov/pubmed/00123456?hi=hello&goodbye=bye'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts both number and URLs PubMed IDs' do
+    url = 'PubMed ID: 112233 another: https://www.ncbi.nlm.nih.gov/pubmed/123456'
+    expect(described_class.extract(url)).to contain_exactly('112233', '123456')
+  end
+  it 'does not return outputs with PubMed IDs in DOIs' do
+    str = "10.1038/nplants.2015.3\n10.1126/science.286.5445.1679e"
+    expect(described_class.extract(str)).to be_empty
+  end
+  it 'strips leading 0s' do
+    expect(described_class.extract("0000010203\n000456000")).to contain_exactly('10203', '456000')
+  end
+  it 'does not consider 0 as a valid PubMed ID' do
+    expect(described_class.extract('00000000')).to be_empty
+  end
+  it 'extracts PubMed IDs separated by Unicode whitespace' do
+    expect(described_class.extract('123 456')).to contain_exactly('123', '456')
+  end
+  it 'considers Fixnum as potential PubMed IDs too' do
+    expect(described_class.extract(123)).to contain_exactly('123')
+  end
+  it 'extracts PubMed IDs with pmid scheme' do
+    expect(described_class.extract('pmid:123')).to contain_exactly('123')
+  end
+  it 'strips leading zeroes from pmid scheme' do
+    expect(described_class.extract('pmid:000123')).to contain_exactly('123')
+  end
+  it 'extracts PubMed IDs with info pmid scheme' do
+    expect(described_class.extract('info:pmid/123')).to contain_exactly('123')
+  end
+  it 'strips leading zeroes from info pmid scheme' do
+    expect(described_class.extract('info:pmid/000123')).to contain_exactly('123')
+  end
+end

data/spec/spec_helper.rb CHANGED Viewed

@@ -1,18 +1,18 @@
-RSpec.configure do |config|
-  config.filter_run :focus
-  config.run_all_when_everything_filtered = true
-  config.example_status_persistence_file_path = "spec/examples.txt"
-  config.disable_monkey_patching!
-  config.warnings = true
-  config.order = :random
-  config.default_formatter = 'doc' if config.files_to_run.one?
-  Kernel.srand config.seed
-  config.expect_with :rspec do |expectations|
-    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
-  end
-  config.mock_with :rspec do |mocks|
-    mocks.verify_partial_doubles = true
-  end
-end
+RSpec.configure do |config|
+  config.filter_run :focus
+  config.run_all_when_everything_filtered = true
+  config.example_status_persistence_file_path = "spec/examples.txt"
+  config.disable_monkey_patching!
+  config.warnings = true
+  config.order = :random
+  config.default_formatter = 'doc' if config.files_to_run.one?
+  Kernel.srand config.seed
+  config.expect_with :rspec do |expectations|
+    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
+  end
+  config.mock_with :rspec do |mocks|
+    mocks.verify_partial_doubles = true
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: identifiers
 version: !ruby/object:Gem::Version
-  version: 0.11.0
+  version: 0.12.0
 platform: ruby
 authors:
 - Jonathan Hernandez
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-03-12 00:00:00.000000000 Z
+date: 2018-04-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: urn
@@ -119,19 +119,19 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.6.13
+rubygems_version: 2.7.3
 signing_key:
 specification_version: 4
 summary: Utilities library for various scholarly identifiers used by Altmetric
 test_files:
-- spec/identifiers/ads_bibcode_spec.rb
+- spec/spec_helper.rb
+- spec/identifiers/repec_id_spec.rb
+- spec/identifiers/pubmed_id_spec.rb
 - spec/identifiers/arxiv_id_spec.rb
-- spec/identifiers/doi_spec.rb
+- spec/identifiers/urn_spec.rb
 - spec/identifiers/handle_spec.rb
+- spec/identifiers/ads_bibcode_spec.rb
 - spec/identifiers/isbn_spec.rb
+- spec/identifiers/doi_spec.rb
 - spec/identifiers/national_clinical_trial_id_spec.rb
 - spec/identifiers/orcid_spec.rb
-- spec/identifiers/pubmed_id_spec.rb
-- spec/identifiers/repec_id_spec.rb
-- spec/identifiers/urn_spec.rb
-- spec/spec_helper.rb