RubyGems - identifiers - Versions diffs - 0.12.1 → 0.13.0 - Mend

identifiers 0.12.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml +4 -4
data/CHANGELOG.md +6 -0
data/README.md +18 -2
data/lib/identifiers/doi.rb +8 -2
data/spec/identifiers/doi_spec.rb +141 -97
metadata +3 -18

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 423438daad2350706eced26425401a5d97ae715095b54853fe55ba4f8dbbaf2d
-  data.tar.gz: 2776accbdccfc17965dd69507a0b326b2a57e275811828b05e76b47c0f54bb28
+  metadata.gz: a4546231ed2288fa0d807ac0fc71f268f92de75dbc38fb057410441b3b77ca4c
+  data.tar.gz: b55eaf0b183185203c0957faf86335ac3b1996ed7312692d92daf013162798ff
 SHA512:
-  metadata.gz: e319511db960df762b3a646239edf45ff683bef6227736c673072de6ef5d1649dd300232f93c0af0be1d7744308de4f07ebfd53582add6069c6b3f425249722b
-  data.tar.gz: 1ebd31499facbc5a51ec0dcf30fadb920e86ad6e53c20f4bb5dd66fa11c9f9cc7842a8b703348f13aac2675c0415c2c0f9464c42baafd055669c881342c33217
+  metadata.gz: 81cfdbbd15e12d7394deeaf45463837ecb01a995fe9bc713736578153f1725d0a6cb71913453d918ad2a1d2cd5970caa6d18656df70f3b175628c4f753f07c37
+  data.tar.gz: 5472c1c3f7b4c04d6a51f4d6fddfd476d3c8823d6c98a74de295cc730fe72bfb637b88e9c9f606d0144418a3d2b7828faf5ca145c1b53b19d8691c937fcc600f

data/CHANGELOG.md CHANGED

@@ -2,6 +2,11 @@
 All notable changes to this project will be documented in this file. This
 project adheres to [Semantic Versioning](http://semver.org/).
+## [0.13.0] - 2019-09-04
+### Added
+- Added new mode to the DOI extraction, so that it doesn't strip trailing
+  periods when in `strict` mode
 ## [0.12.1] - 2018-04-09
 ### Fixed
 - Restored support for extracting hyphenated ISBN-10s with registration group
@@ -88,3 +93,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
 [0.11.0]: https://github.com/altmetric/identifiers/releases/tag/v0.11.0
 [0.12.0]: https://github.com/altmetric/identifiers/releases/tag/v0.12.0
 [0.12.1]: https://github.com/altmetric/identifiers/releases/tag/v0.12.1
+[0.13.0]: https://github.com/altmetric/identifiers/releases/tag/v0.13.0

data/README.md CHANGED

@@ -32,18 +32,34 @@ Or install it yourself as:
 ## Usage
 ```ruby
-Identifiers::DOI.extract('example: 10.123/abcd.efghi')
-# => ["10.123/abcd.efghi"]
+Identifiers::DOI.extract('example: 10.1234/5678.ABC')
+# => ["10.1234/5678.abc"]
 Identifiers::DOI.extract('no DOIs here')
 # => []
 Identifiers::URN.new('urn:abc:123')
 # => #<URN:0x007ff11c13d930 @urn="urn:abc:123", @nid="abc", @nss="123">
 Identifiers::URN('urn:abc:123')
 # => #<URN:0x007ff11c0ff568 @urn="urn:abc:123", @nid="abc", @nss="123">
 ```
+A small percentage of DOIs end with a trailing `.`. However, returning trailing
+periods from the default extraction method would likely produce quite a few
+false positives.
+`DOI.extract` accepts a `strict` option, which can be set to `true` if we prefer
+to keep DOIs ending in `.`. By default, this option is set to `false`, which
+strips any trailing `.`:
+```ruby
+Identifiers::DOI.extract('example: 10.1234/5678.abc.', strict: true)
+# => ["10.1234/5678.abc."]
+Identifiers::DOI.extract('example: 10.1234/5678.abc.')
+# => ["10.1234/5678.abc"]
+```
 ## By identifier
 `.extract` is a common method that works across all the supported identifiers.

data/lib/identifiers/doi.rb CHANGED

@@ -24,11 +24,17 @@ module Identifiers
           |
           [^[:space:]]+(?![[:space:]])\p{^P}    # Suffix ending in non-punctuation
         )
+        \.{0,3}                                 # Allow a DOI to end with up to 3 .
       )
     }x
-    def self.extract(str)
-      str.to_s.downcase.scan(REGEXP)
+    def self.extract(str, options = {})
+      strict = options.fetch(:strict, false)
+      dois = str.to_s.downcase.scan(REGEXP)
+      dois = dois.map { |doi| doi.gsub(/\.+$/, '') } unless strict
+      dois
     end
   end
 end

data/spec/identifiers/doi_spec.rb CHANGED

@@ -1,151 +1,195 @@
 require 'identifiers/doi'
 RSpec.describe Identifiers::DOI do
-  it 'extracts DOIs from a string' do
-    str = 'This is an example of a DOI: 10.1049/el.2013.3006'
+  OPTIONS = [{ strict: false }, { strict: true }].freeze
-    expect(described_class.extract(str)).to contain_exactly('10.1049/el.2013.3006')
+  def each_doi(file)
+    Pathname.new(__FILE__).join('..', '..', 'fixtures', file).each_line do |doi|
+      yield(doi.chomp!)
+    end
   end
-  it 'extracts DOIs from anywhere in a string' do
-    str = 'This is an example of a DOI - 10.1049/el.2013.3006 - which is entirely valid'
+  OPTIONS.each do |options|
+    context "when extracting with options set to #{options.inspect}" do
+      it 'extracts DOIs from a string' do
+        str = 'This is an example of a DOI: 10.1049/el.2013.3006'
-    expect(described_class.extract(str)).to contain_exactly('10.1049/el.2013.3006')
-  end
+        expect(described_class.extract(str, options)).to contain_exactly('10.1049/el.2013.3006')
+      end
-  it 'downcases the DOIs extracted' do
-    str = 'This is an example of a DOI: 10.1097/01.ASW.0000443266.17665.19'
+      it 'extracts DOIs from anywhere in a string' do
+        str = 'This is an example of a DOI - 10.1049/el.2013.3006 - which is entirely valid'
-    expect(described_class.extract(str)).to contain_exactly('10.1097/01.asw.0000443266.17665.19')
-  end
+        expect(described_class.extract(str, options)).to contain_exactly('10.1049/el.2013.3006')
+      end
-  it 'does not extract a PubMed ID' do
-    str = 'This is NOT a DOI: 123456'
+      it 'downcases the DOIs extracted' do
+        str = 'This is an example of a DOI: 10.1097/01.ASW.0000443266.17665.19'
-    expect(described_class.extract(str)).to be_empty
-  end
+        expect(described_class.extract(str, options)).to contain_exactly('10.1097/01.asw.0000443266.17665.19')
+      end
-  it 'returns no DOIs if given nothing' do
-    expect(described_class.extract(nil)).to be_empty
-  end
+      it 'does not extract a PubMed ID' do
+        str = 'This is NOT a DOI: 123456'
-  it 'extracts ISBN-As' do
-    str = 'This is an ISBN-A: 10.978.8898392/315'
+        expect(described_class.extract(str, options)).to be_empty
+      end
-    expect(described_class.extract(str)).to contain_exactly('10.978.8898392/315')
-  end
+      it 'returns no DOIs if given nothing' do
+        expect(described_class.extract(nil)).to be_empty
+      end
-  it 'does not extract invalid ISBN-As' do
-    str = 'This is not an ISBN-A: 10.978.8898392/NotARealIsbnA'
+      it 'extracts ISBN-As' do
+        str = 'This is an ISBN-A: 10.978.8898392/315'
-    expect(described_class.extract(str)).to be_empty
-  end
+        expect(described_class.extract(str, options)).to contain_exactly('10.978.8898392/315')
+      end
-  it 'retains closing parentheses that are part of the DOI' do
-    str = 'This is an example of a DOI: 10.1130/2013.2502(04)'
+      it 'does not extract invalid ISBN-As' do
+        str = 'This is not an ISBN-A: 10.978.8898392/NotARealIsbnA'
-    expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
-  end
+        expect(described_class.extract(str, options)).to be_empty
+      end
-  it 'discards trailing punctuation' do
-    str = 'This is an example of a DOI: 10.1130/2013.2502.'
+      it 'retains closing parentheses that are part of the DOI' do
+        str = 'This is an example of a DOI: 10.1130/2013.2502(04)'
-    expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
-  end
+        expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502(04)')
+      end
-  it 'discards multiple contiguous trailing punctuation' do
-    str = 'This is an example of a DOI: 10.1130/2013.2502...",'
+      it 'discards ellipses' do
+        str = 'This is an example of a DOI: 10.1130/2013.2502…'
-    expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
-  end
+        expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502')
+      end
-  it 'discards trailing Unicode punctuation' do
-    str = 'This is an example of a DOI: 10.1130/2013.2502…'
+      it 'extracts old Wiley DOIs' do
+        str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-# 10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5'
-    expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
-  end
+        expect(described_class.extract(str, options)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#', '10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5')
+      end
-  it 'extracts old Wiley DOIs' do
-    str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-# 10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5'
+      it 'does not extract a closing parenthesis if not part of the DOI' do
+        str = '(This is an example of a DOI: 10.1130/2013.2502)'
-    expect(described_class.extract(str)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#', '10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5')
-  end
+        expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502')
+      end
-  it 'does not extract a closing parenthesis if not part of the DOI' do
-    str = '(This is an example of a DOI: 10.1130/2013.2502)'
+      it 'discards trailing punctuation from old Wiley DOIs' do
+        str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-#",'
-    expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
-  end
+        expect(described_class.extract(str, options)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#')
+      end
-  it 'discards trailing punctuation from old Wiley DOIs' do
-    str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-#",'
+      it 'discards trailing Unicode punctuation after balanced parentheses' do
+        str = 'This is an example of a DOI: 10.1130/2013.2502(04)…",'
-    expect(described_class.extract(str)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#')
-  end
+        expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502(04)')
+      end
-  it 'discards trailing punctuation after balanced parentheses' do
-    str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).'
+      it 'discards contiguous trailing punctuation after unbalanced parentheses' do
+        str = '(This is an example of a DOI: 10.1130/2013.2502).",'
-    expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
-  end
+        expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502')
+      end
-  it 'discards contiguous trailing punctuation after balanced parentheses' do
-    str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).",'
+      it 'does not extract DOIs with purely punctuation suffixes' do
+        expect(described_class.extract('10.1130/!).",', options)).to be_empty
+      end
-    expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
-  end
+      it 'extracts DOIs with emoji in them' do
+        expect(described_class.extract('10.1234/🐔💩123🐔🐔🐔123', options)).to contain_exactly('10.1234/🐔💩123🐔🐔🐔123')
+      end
-  it 'discards trailing Unicode punctuation after balanced parentheses' do
-    str = 'This is an example of a DOI: 10.1130/2013.2502(04)…",'
+      it 'extracts DOIs separated by Unicode whitespace' do
+        expect(described_class.extract('10.1234/foo  10.1234/bar', options)).to contain_exactly('10.1234/foo', '10.1234/bar')
+      end
-    expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
-  end
+      it 'does not extract DOIs with extra digits prefixed' do
+        expect(described_class.extract('110.1234/foo', options)).to be_empty
+      end
-  it 'discards contiguous trailing punctuation after unbalanced parentheses' do
-    str = '(This is an example of a DOI: 10.1130/2013.2502).",'
+      it 'extracts DOIs from a string with trailing closing parentheses' do
+        expect(described_class.extract('(10.1130/2013.2502(04))', options)).to contain_exactly('10.1130/2013.2502(04)')
+      end
-    expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
-  end
+      it 'extracts DOIs from a string with multiple trailing closing parentheses' do
+        expect(described_class.extract('10.1130/2013.2502(04))))', options)).to contain_exactly('10.1130/2013.2502(04)')
+      end
-  it 'does not overflow when given lots of trailing punctuation' do
-    str = '10.1130/2013.2502' + ('.' * 10000)
+      it 'extracts DOIs with parentheses within the suffix' do
+        expect(described_class.extract('10.1016/0005-2744(70)90072-0', options)).to contain_exactly('10.1016/0005-2744(70)90072-0')
+      end
-    expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
+      it 'extracts all DOIs from a Crossref sample' do
+        each_doi('dois.txt') { |doi|
+          expect(described_class.extract(doi, options)).to contain_exactly(doi)
+        }
+      end
+    end
   end
-  it 'does not extract DOIs with purely punctuation suffixes' do
-    expect(described_class.extract('10.1130/!).",')).to be_empty
-  end
+  context 'when no options are provided' do
+    it 'discards trailing punctuation' do
+      str = 'This is an example of a DOI: 10.1130/2013.2502.'
-  it 'extracts DOIs with emoji in them' do
-    expect(described_class.extract('10.1234/🐔💩123🐔🐔🐔123')).to contain_exactly('10.1234/🐔💩123🐔🐔🐔123')
-  end
+      expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
+    end
-  it 'extracts DOIs separated by Unicode whitespace' do
-    expect(described_class.extract('10.1234/foo  10.1234/bar')).to contain_exactly('10.1234/foo', '10.1234/bar')
-  end
+    it 'discards multiple contiguous trailing punctuation' do
+      str = 'This is an example of a DOI: 10.1130/2013.2502...",'
-  it 'does not extract DOIs with extra digits prefixed' do
-    expect(described_class.extract('110.1234/foo')).to be_empty
-  end
+      expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
+    end
-  it 'extracts DOIs from a string with trailing closing parentheses' do
-    expect(described_class.extract('10.1130/2013.2502(04))')).to contain_exactly('10.1130/2013.2502(04)')
-  end
+    it 'discards trailing punctuation after balanced parentheses' do
+      str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).'
-  it 'extracts DOIs from a string with multiple trailing closing parentheses' do
-    expect(described_class.extract('10.1130/2013.2502(04))))')).to contain_exactly('10.1130/2013.2502(04)')
-  end
+      expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
+    end
-  it 'extracts DOIs with parentheses within the suffix' do
-    expect(described_class.extract('10.1016/0005-2744(70)90072-0')).to contain_exactly('10.1016/0005-2744(70)90072-0')
+    it 'discards contiguous trailing punctuation after balanced parentheses' do
+      str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).",'
+      expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
+    end
+    it 'does not overflow when given lots of trailing punctuation' do
+      str = '10.1130/2013.2502' + ('.' * 10000)
+      expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
+    end
   end
-  it 'extracts all DOIs from a Crossref sample' do
-    Pathname.new(__FILE__).join('..', '..', 'fixtures', 'dois.txt').each_line do |doi|
-      doi.chomp!
+  context 'with strict mode on' do
+    it 'extracts DOIs ending with trailing periods' do
+      str = 'This is an example of a DOI: 10.1130/2013.2502...",'
+      expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502...')
+    end
+    it 'keeps trailing punctuation after balanced parentheses' do
+      str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).'
+      expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502(04).')
+    end
+    it 'discards contiguous trailing punctuation after balanced parentheses' do
+      str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).",'
+      expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502(04).')
+    end
+    it 'limits the trailing periods to 3' do
+      str = 'This is an example of a DOI: 10.1130/2013.2502.......'
+      expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502...')
+    end
-      expect(described_class.extract(doi)).to contain_exactly(doi)
+    it 'extracts all DOIs from a Crossref sample, keeping the trailing periods' do
+      each_doi('strict_mode_dois.txt') { |doi|
+        expect(described_class.extract(doi, strict: true)).to contain_exactly(doi)
+      }
     end
   end
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: identifiers
 version: !ruby/object:Gem::Version
-  version: 0.12.1
+  version: 0.13.0
 platform: ruby
 authors:
 - Jonathan Hernandez
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-04-09 00:00:00.000000000 Z
+date: 2019-09-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: urn
@@ -25,20 +25,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '2.0'
-- !ruby/object:Gem::Dependency
-  name: bundler
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '1.10'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '1.10'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
@@ -118,8 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.7.3
+rubygems_version: 3.0.3
 signing_key:
 specification_version: 4
 summary: Utilities library for various scholarly identifiers used by Altmetric