RubyGems - identifiers - Versions diffs - 0.11.0 → 0.12.0 - Mend

identifiers 0.11.0 → 0.12.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.

Files changed (9)

checksums.yaml +5 -5
data/CHANGELOG.md +9 -1
data/README.md +1 -1
data/lib/identifiers/doi.rb +18 -27
data/lib/identifiers/pubmed_id.rb +11 -11
data/spec/identifiers/doi_spec.rb +26 -2
data/spec/identifiers/pubmed_id_spec.rb +99 -99
data/spec/spec_helper.rb +18 -18
metadata +9 -9

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: ab7d56e3e6048da713104ab263f36bdd96a9da98
-  data.tar.gz: 4d2636ffdeeb7b3a1e563b36c761e39dc81f488e
+SHA256:
+  metadata.gz: 6ec6f86cbb54595ef76ab6615ba9e3a5f671478daebf31412f6b6820f1cb020d
+  data.tar.gz: 63881a8f3863926d66005133ba9fa0cdf471a7446c2dffa12534ac540911207e
 SHA512:
-  metadata.gz: 3a62e98dcf6f35c180e1dd2c2e574f8993c6123a182e6fee782a96bc85e342de5b3894455f8fa99325100edcb8f3a9f0fa33d3135040b775bbf6e0b442bc1992
-  data.tar.gz: fcc0d3d178e49ee0eefed149200f4f8489d58c4cb953b21a0cab63067ae237432eee607e8e81460f70cbdba9ebb8d1d44c6f3d01fe82f16c9038a12f31aba609
+  metadata.gz: 97b2352aa4a99ec8cf5158e1239f273e10f28321df11941445890549b827365e7a8ba1fb54d4a268bd146889f94aaa4fec0135ec1b67f779094557c6987bc466
+  data.tar.gz: f0e6a040a480bf9cf42900997415f8341fb7f68274434c936e725bafab484ce703fc6f0e21c5e317a85456c9fc0c2305e80ba9b1bae99cc7e6ba33fa40a890cc

data/CHANGELOG.md CHANGED Viewed

@@ -2,8 +2,15 @@
 All notable changes to this project will be documented in this file. This
 project adheres to [Semantic Versioning](http://semver.org/).
+## [0.12.0] - 2018-04-06
+### Added
+- Added support for extracting more old Wiley DOIs
+### Changed
+- Performance improvements when extracting DOIs with trailing punctuation.
 ## [0.11.0] - 2018-03-12
-## Fixed
+### Fixed
 - Stricter ISBN extraction: consistent hyphenation (#27) and correct number of groups (#28)
 - Prevent stack overflow when extracting DOIS (#25)
@@ -74,3 +81,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
 [0.9.1]: https://github.com/altmetric/identifiers/releases/tag/v0.9.1
 [0.10.0]: https://github.com/altmetric/identifiers/releases/tag/v0.10.0
 [0.11.0]: https://github.com/altmetric/identifiers/releases/tag/v0.11.0
+[0.12.0]: https://github.com/altmetric/identifiers/releases/tag/v0.12.0

data/README.md CHANGED Viewed

@@ -18,7 +18,7 @@ Collection of utilities related to the extraction, validation and normalization
 Add this line to your application's `Gemfile`:
 ```ruby
-gem 'identifiers', '~> 0.11'
+gem 'identifiers', '~> 0.12'
 ```
 And then execute:

data/lib/identifiers/doi.rb CHANGED Viewed

@@ -2,42 +2,33 @@ module Identifiers
   class DOI
     REGEXP = %r{
       \b
-      10 # Directory indicator (always 10)
+      10                                        # Directory indicator (always 10)
       \.
       (?:
         # ISBN-A
-        97[89]\. # ISBN (GS1) Bookland prefix
-        \d{2,8}  # ISBN registration group element and publisher prefix
-        /        # Prefix/suffix divider
-        \d{1,7}  # ISBN title enumerator and check digit
+        97[89]\.                                # ISBN (GS1) Bookland prefix
+        \d{2,8}                                 # ISBN registration group element and publisher prefix
+        /                                       # Prefix/suffix divider
+        \d{1,7}                                 # ISBN title enumerator and check digit
         |
         # DOI
-        \d{4,9}       # Registrant code
-        /             # Prefix/suffix divider
-        [^[:space:]]+ # DOI suffix
+        \d{4,9}                                 # Registrant code
+        /                                       # Prefix/suffix divider
+        (?:
+          # DOI suffix
+          [^[:space:]]+;2-[\#0-9a-z]            # Early Wiley suffix
+          |
+          [^[:space:]]+                         # Suffix...
+          \([^[:space:])]+\)                    # Ending in balanced parentheses...
+          (?![^[:space:]\p{P}])                 # Not followed by more suffix or punctuation
+          |
+          [^[:space:]]+(?![[:space:]])\p{^P}    # Suffix ending in non-punctuation
+        )
       )
     }x
-    VALID_ENDING = /
-      (?:
-        \p{^Punct} # Non-punctuation character
-        |
-        \(.+\)     # Balanced parentheses
-        |
-        2-\#       # Early Wiley DOI suffix
-      )
-      \z
-    /x
     def self.extract(str)
-      str.to_s.downcase.scan(REGEXP).map { |doi| extract_one(doi) }.compact
-    end
-    def self.extract_one(str)
-      while (match = str.to_s.downcase[REGEXP])
-        break match if match =~ VALID_ENDING
-        str = match.sub(/\p{Punct}\z/, '')
-      end
+      str.to_s.downcase.scan(REGEXP)
     end
   end
 end

data/lib/identifiers/pubmed_id.rb CHANGED Viewed

@@ -1,11 +1,11 @@
-module Identifiers
-  class PubmedId
-    ZERO_PADDED_NUMBER = %r{(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])}
-    PUBMED_URL = %r{(?:https?://(?:www\.)?ncbi\.nlm\.nih\.gov/(?:m/)?pubmed/|pmid:|info:pmid/)0*(\d+)}i
-    def self.extract(str)
-      str = str.to_s
-      str.scan(ZERO_PADDED_NUMBER).flatten | str.scan(PUBMED_URL).flatten
-    end
-  end
-end
+module Identifiers
+  class PubmedId
+    ZERO_PADDED_NUMBER = %r{(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])}
+    PUBMED_URL = %r{(?:https?://(?:www\.)?ncbi\.nlm\.nih\.gov/(?:m/)?pubmed/|pmid:|info:pmid/)0*(\d+)}i
+    def self.extract(str)
+      str = str.to_s
+      str.scan(ZERO_PADDED_NUMBER).flatten | str.scan(PUBMED_URL).flatten
+    end
+  end
+end

data/spec/identifiers/doi_spec.rb CHANGED Viewed

@@ -66,9 +66,9 @@ RSpec.describe Identifiers::DOI do
   end
   it 'extracts old Wiley DOIs' do
-    str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-#'
+    str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-# 10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5'
-    expect(described_class.extract(str)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#')
+    expect(described_class.extract(str)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#', '10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5')
   end
   it 'does not extract a closing parenthesis if not part of the DOI' do
@@ -124,4 +124,28 @@ RSpec.describe Identifiers::DOI do
   it 'extracts DOIs separated by Unicode whitespace' do
     expect(described_class.extract('10.1234/foo  10.1234/bar')).to contain_exactly('10.1234/foo', '10.1234/bar')
   end
+  it 'does not extract DOIs with extra digits prefixed' do
+    expect(described_class.extract('110.1234/foo')).to be_empty
+  end
+  it 'extracts DOIs from a string with trailing closing parentheses' do
+    expect(described_class.extract('10.1130/2013.2502(04))')).to contain_exactly('10.1130/2013.2502(04)')
+  end
+  it 'extracts DOIs from a string with multiple trailing closing parentheses' do
+    expect(described_class.extract('10.1130/2013.2502(04))))')).to contain_exactly('10.1130/2013.2502(04)')
+  end
+  it 'extracts DOIs with parentheses within the suffix' do
+    expect(described_class.extract('10.1016/0005-2744(70)90072-0')).to contain_exactly('10.1016/0005-2744(70)90072-0')
+  end
+  it 'extracts all DOIs from a Crossref sample' do
+    Pathname.new(__FILE__).join('..', '..', 'fixtures', 'dois.txt').each_line do |doi|
+      doi.chomp!
+      expect(described_class.extract(doi)).to contain_exactly(doi)
+    end
+  end
 end

data/spec/identifiers/pubmed_id_spec.rb CHANGED Viewed

@@ -1,99 +1,99 @@
-require 'identifiers/pubmed_id'
-RSpec.describe Identifiers::PubmedId do
-  it 'extracts PubMed IDs' do
-    expect(described_class.extract("123\n456")).to contain_exactly('123', '456')
-  end
-  it 'extracts PubMed IDs from a PubMed URL with www' do
-    url = 'http://www.ncbi.nlm.nih.gov/pubmed/123456'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts PubMed IDs from a PubMed URL with www and https' do
-    url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts PubMed IDs from a PubMed URL without www' do
-    url = 'http://ncbi.nlm.nih.gov/pubmed/123456'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts PubMed IDs from a PubMed URL without www but with https' do
-    url = 'https://ncbi.nlm.nih.gov/pubmed/123456'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts PubMed IDs from a PubMed mobile URL' do
-    url = 'https://www.ncbi.nlm.nih.gov/m/pubmed/123456'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts PubMed IDs from a PubMed URL with hash parameters' do
-    url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456#cm6191871_69589'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts PubMed IDs from a PubMed URL with query parameters' do
-    url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456?hi=hello&goodbye=bye'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts zero leading PubMed IDs from a PubMed URL with query parameters' do
-    url = 'https://www.ncbi.nlm.nih.gov/pubmed/00123456?hi=hello&goodbye=bye'
-    expect(described_class.extract(url)).to contain_exactly('123456')
-  end
-  it 'extracts both number and URLs PubMed IDs' do
-    url = 'PubMed ID: 112233 another: https://www.ncbi.nlm.nih.gov/pubmed/123456'
-    expect(described_class.extract(url)).to contain_exactly('112233', '123456')
-  end
-  it 'does not return outputs with PubMed IDs in DOIs' do
-    str = "10.1038/nplants.2015.3\n10.1126/science.286.5445.1679e"
-    expect(described_class.extract(str)).to be_empty
-  end
-  it 'strips leading 0s' do
-    expect(described_class.extract("0000010203\n000456000")).to contain_exactly('10203', '456000')
-  end
-  it 'does not consider 0 as a valid PubMed ID' do
-    expect(described_class.extract('00000000')).to be_empty
-  end
-  it 'extracts PubMed IDs separated by Unicode whitespace' do
-    expect(described_class.extract('123 456')).to contain_exactly('123', '456')
-  end
-  it 'considers Fixnum as potential PubMed IDs too' do
-    expect(described_class.extract(123)).to contain_exactly('123')
-  end
-  it 'extracts PubMed IDs with pmid scheme' do
-    expect(described_class.extract('pmid:123')).to contain_exactly('123')
-  end
-  it 'strips leading zeroes from pmid scheme' do
-    expect(described_class.extract('pmid:000123')).to contain_exactly('123')
-  end
-  it 'extracts PubMed IDs with info pmid scheme' do
-    expect(described_class.extract('info:pmid/123')).to contain_exactly('123')
-  end
-  it 'strips leading zeroes from info pmid scheme' do
-    expect(described_class.extract('info:pmid/000123')).to contain_exactly('123')
-  end
-end
+require 'identifiers/pubmed_id'
+RSpec.describe Identifiers::PubmedId do
+  it 'extracts PubMed IDs' do
+    expect(described_class.extract("123\n456")).to contain_exactly('123', '456')
+  end
+  it 'extracts PubMed IDs from a PubMed URL with www' do
+    url = 'http://www.ncbi.nlm.nih.gov/pubmed/123456'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts PubMed IDs from a PubMed URL with www and https' do
+    url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts PubMed IDs from a PubMed URL without www' do
+    url = 'http://ncbi.nlm.nih.gov/pubmed/123456'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts PubMed IDs from a PubMed URL without www but with https' do
+    url = 'https://ncbi.nlm.nih.gov/pubmed/123456'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts PubMed IDs from a PubMed mobile URL' do
+    url = 'https://www.ncbi.nlm.nih.gov/m/pubmed/123456'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts PubMed IDs from a PubMed URL with hash parameters' do
+    url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456#cm6191871_69589'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts PubMed IDs from a PubMed URL with query parameters' do
+    url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456?hi=hello&goodbye=bye'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts zero leading PubMed IDs from a PubMed URL with query parameters' do
+    url = 'https://www.ncbi.nlm.nih.gov/pubmed/00123456?hi=hello&goodbye=bye'
+    expect(described_class.extract(url)).to contain_exactly('123456')
+  end
+  it 'extracts both number and URLs PubMed IDs' do
+    url = 'PubMed ID: 112233 another: https://www.ncbi.nlm.nih.gov/pubmed/123456'
+    expect(described_class.extract(url)).to contain_exactly('112233', '123456')
+  end
+  it 'does not return outputs with PubMed IDs in DOIs' do
+    str = "10.1038/nplants.2015.3\n10.1126/science.286.5445.1679e"
+    expect(described_class.extract(str)).to be_empty
+  end
+  it 'strips leading 0s' do
+    expect(described_class.extract("0000010203\n000456000")).to contain_exactly('10203', '456000')
+  end
+  it 'does not consider 0 as a valid PubMed ID' do
+    expect(described_class.extract('00000000')).to be_empty
+  end
+  it 'extracts PubMed IDs separated by Unicode whitespace' do
+    expect(described_class.extract('123 456')).to contain_exactly('123', '456')
+  end
+  it 'considers Fixnum as potential PubMed IDs too' do
+    expect(described_class.extract(123)).to contain_exactly('123')
+  end
+  it 'extracts PubMed IDs with pmid scheme' do
+    expect(described_class.extract('pmid:123')).to contain_exactly('123')
+  end
+  it 'strips leading zeroes from pmid scheme' do
+    expect(described_class.extract('pmid:000123')).to contain_exactly('123')
+  end
+  it 'extracts PubMed IDs with info pmid scheme' do
+    expect(described_class.extract('info:pmid/123')).to contain_exactly('123')
+  end
+  it 'strips leading zeroes from info pmid scheme' do
+    expect(described_class.extract('info:pmid/000123')).to contain_exactly('123')
+  end
+end

data/spec/spec_helper.rb CHANGED Viewed

@@ -1,18 +1,18 @@
-RSpec.configure do |config|
-  config.filter_run :focus
-  config.run_all_when_everything_filtered = true
-  config.example_status_persistence_file_path = "spec/examples.txt"
-  config.disable_monkey_patching!
-  config.warnings = true
-  config.order = :random
-  config.default_formatter = 'doc' if config.files_to_run.one?
-  Kernel.srand config.seed
-  config.expect_with :rspec do |expectations|
-    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
-  end
-  config.mock_with :rspec do |mocks|
-    mocks.verify_partial_doubles = true
-  end
-end
+RSpec.configure do |config|
+  config.filter_run :focus
+  config.run_all_when_everything_filtered = true
+  config.example_status_persistence_file_path = "spec/examples.txt"
+  config.disable_monkey_patching!
+  config.warnings = true
+  config.order = :random
+  config.default_formatter = 'doc' if config.files_to_run.one?
+  Kernel.srand config.seed
+  config.expect_with :rspec do |expectations|
+    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
+  end
+  config.mock_with :rspec do |mocks|
+    mocks.verify_partial_doubles = true
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: identifiers
 version: !ruby/object:Gem::Version
-  version: 0.11.0
+  version: 0.12.0
 platform: ruby
 authors:
 - Jonathan Hernandez
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-03-12 00:00:00.000000000 Z
+date: 2018-04-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: urn
@@ -119,19 +119,19 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.6.13
+rubygems_version: 2.7.3
 signing_key:
 specification_version: 4
 summary: Utilities library for various scholarly identifiers used by Altmetric
 test_files:
-- spec/identifiers/ads_bibcode_spec.rb
+- spec/spec_helper.rb
+- spec/identifiers/repec_id_spec.rb
+- spec/identifiers/pubmed_id_spec.rb
 - spec/identifiers/arxiv_id_spec.rb
-- spec/identifiers/doi_spec.rb
+- spec/identifiers/urn_spec.rb
 - spec/identifiers/handle_spec.rb
+- spec/identifiers/ads_bibcode_spec.rb
 - spec/identifiers/isbn_spec.rb
+- spec/identifiers/doi_spec.rb
 - spec/identifiers/national_clinical_trial_id_spec.rb
 - spec/identifiers/orcid_spec.rb
-- spec/identifiers/pubmed_id_spec.rb
-- spec/identifiers/repec_id_spec.rb
-- spec/identifiers/urn_spec.rb
-- spec/spec_helper.rb