identifiers 0.11.0 → 0.12.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: ab7d56e3e6048da713104ab263f36bdd96a9da98
4
- data.tar.gz: 4d2636ffdeeb7b3a1e563b36c761e39dc81f488e
2
+ SHA256:
3
+ metadata.gz: 6ec6f86cbb54595ef76ab6615ba9e3a5f671478daebf31412f6b6820f1cb020d
4
+ data.tar.gz: 63881a8f3863926d66005133ba9fa0cdf471a7446c2dffa12534ac540911207e
5
5
  SHA512:
6
- metadata.gz: 3a62e98dcf6f35c180e1dd2c2e574f8993c6123a182e6fee782a96bc85e342de5b3894455f8fa99325100edcb8f3a9f0fa33d3135040b775bbf6e0b442bc1992
7
- data.tar.gz: fcc0d3d178e49ee0eefed149200f4f8489d58c4cb953b21a0cab63067ae237432eee607e8e81460f70cbdba9ebb8d1d44c6f3d01fe82f16c9038a12f31aba609
6
+ metadata.gz: 97b2352aa4a99ec8cf5158e1239f273e10f28321df11941445890549b827365e7a8ba1fb54d4a268bd146889f94aaa4fec0135ec1b67f779094557c6987bc466
7
+ data.tar.gz: f0e6a040a480bf9cf42900997415f8341fb7f68274434c936e725bafab484ce703fc6f0e21c5e317a85456c9fc0c2305e80ba9b1bae99cc7e6ba33fa40a890cc
data/CHANGELOG.md CHANGED
@@ -2,8 +2,15 @@
2
2
  All notable changes to this project will be documented in this file. This
3
3
  project adheres to [Semantic Versioning](http://semver.org/).
4
4
 
5
+ ## [0.12.0] - 2018-04-06
6
+ ### Added
7
+ - Added support for extracting more old Wiley DOIs
8
+
9
+ ### Changed
10
+ - Performance improvements when extracting DOIs with trailing punctuation.
11
+
5
12
  ## [0.11.0] - 2018-03-12
6
- ## Fixed
13
+ ### Fixed
7
14
  - Stricter ISBN extraction: consistent hyphenation (#27) and correct number of groups (#28)
8
15
  - Prevent stack overflow when extracting DOIs (#25)
9
16
 
@@ -74,3 +81,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
74
81
  [0.9.1]: https://github.com/altmetric/identifiers/releases/tag/v0.9.1
75
82
  [0.10.0]: https://github.com/altmetric/identifiers/releases/tag/v0.10.0
76
83
  [0.11.0]: https://github.com/altmetric/identifiers/releases/tag/v0.11.0
84
+ [0.12.0]: https://github.com/altmetric/identifiers/releases/tag/v0.12.0
data/README.md CHANGED
@@ -18,7 +18,7 @@ Collection of utilities related to the extraction, validation and normalization
18
18
  Add this line to your application's `Gemfile`:
19
19
 
20
20
  ```ruby
21
- gem 'identifiers', '~> 0.11'
21
+ gem 'identifiers', '~> 0.12'
22
22
  ```
23
23
 
24
24
  And then execute:
@@ -2,42 +2,33 @@ module Identifiers
2
2
  class DOI
3
3
  REGEXP = %r{
4
4
  \b
5
- 10 # Directory indicator (always 10)
5
+ 10 # Directory indicator (always 10)
6
6
  \.
7
7
  (?:
8
8
  # ISBN-A
9
- 97[89]\. # ISBN (GS1) Bookland prefix
10
- \d{2,8} # ISBN registration group element and publisher prefix
11
- / # Prefix/suffix divider
12
- \d{1,7} # ISBN title enumerator and check digit
9
+ 97[89]\. # ISBN (GS1) Bookland prefix
10
+ \d{2,8} # ISBN registration group element and publisher prefix
11
+ / # Prefix/suffix divider
12
+ \d{1,7} # ISBN title enumerator and check digit
13
13
  |
14
14
  # DOI
15
- \d{4,9} # Registrant code
16
- / # Prefix/suffix divider
17
- [^[:space:]]+ # DOI suffix
15
+ \d{4,9} # Registrant code
16
+ / # Prefix/suffix divider
17
+ (?:
18
+ # DOI suffix
19
+ [^[:space:]]+;2-[\#0-9a-z] # Early Wiley suffix
20
+ |
21
+ [^[:space:]]+ # Suffix...
22
+ \([^[:space:])]+\) # Ending in balanced parentheses...
23
+ (?![^[:space:]\p{P}]) # Not followed by more suffix or punctuation
24
+ |
25
+ [^[:space:]]+(?![[:space:]])\p{^P} # Suffix ending in non-punctuation
26
+ )
18
27
  )
19
28
  }x
20
- VALID_ENDING = /
21
- (?:
22
- \p{^Punct} # Non-punctuation character
23
- |
24
- \(.+\) # Balanced parentheses
25
- |
26
- 2-\# # Early Wiley DOI suffix
27
- )
28
- \z
29
- /x
30
29
 
31
30
  def self.extract(str)
32
- str.to_s.downcase.scan(REGEXP).map { |doi| extract_one(doi) }.compact
33
- end
34
-
35
- def self.extract_one(str)
36
- while (match = str.to_s.downcase[REGEXP])
37
- break match if match =~ VALID_ENDING
38
-
39
- str = match.sub(/\p{Punct}\z/, '')
40
- end
31
+ str.to_s.downcase.scan(REGEXP)
41
32
  end
42
33
  end
43
34
  end
@@ -1,11 +1,11 @@
1
- module Identifiers
2
- class PubmedId
3
- ZERO_PADDED_NUMBER = %r{(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])}
4
- PUBMED_URL = %r{(?:https?://(?:www\.)?ncbi\.nlm\.nih\.gov/(?:m/)?pubmed/|pmid:|info:pmid/)0*(\d+)}i
5
-
6
- def self.extract(str)
7
- str = str.to_s
8
- str.scan(ZERO_PADDED_NUMBER).flatten | str.scan(PUBMED_URL).flatten
9
- end
10
- end
11
- end
1
+ module Identifiers
2
+ class PubmedId
3
+ ZERO_PADDED_NUMBER = %r{(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])}
4
+ PUBMED_URL = %r{(?:https?://(?:www\.)?ncbi\.nlm\.nih\.gov/(?:m/)?pubmed/|pmid:|info:pmid/)0*(\d+)}i
5
+
6
+ def self.extract(str)
7
+ str = str.to_s
8
+ str.scan(ZERO_PADDED_NUMBER).flatten | str.scan(PUBMED_URL).flatten
9
+ end
10
+ end
11
+ end
@@ -66,9 +66,9 @@ RSpec.describe Identifiers::DOI do
66
66
  end
67
67
 
68
68
  it 'extracts old Wiley DOIs' do
69
- str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-#'
69
+ str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-# 10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5'
70
70
 
71
- expect(described_class.extract(str)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#')
71
+ expect(described_class.extract(str)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#', '10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5')
72
72
  end
73
73
 
74
74
  it 'does not extract a closing parenthesis if not part of the DOI' do
@@ -124,4 +124,28 @@ RSpec.describe Identifiers::DOI do
124
124
  it 'extracts DOIs separated by Unicode whitespace' do
125
125
  expect(described_class.extract('10.1234/foo  10.1234/bar')).to contain_exactly('10.1234/foo', '10.1234/bar')
126
126
  end
127
+
128
+ it 'does not extract DOIs with extra digits prefixed' do
129
+ expect(described_class.extract('110.1234/foo')).to be_empty
130
+ end
131
+
132
+ it 'extracts DOIs from a string with trailing closing parentheses' do
133
+ expect(described_class.extract('10.1130/2013.2502(04))')).to contain_exactly('10.1130/2013.2502(04)')
134
+ end
135
+
136
+ it 'extracts DOIs from a string with multiple trailing closing parentheses' do
137
+ expect(described_class.extract('10.1130/2013.2502(04))))')).to contain_exactly('10.1130/2013.2502(04)')
138
+ end
139
+
140
+ it 'extracts DOIs with parentheses within the suffix' do
141
+ expect(described_class.extract('10.1016/0005-2744(70)90072-0')).to contain_exactly('10.1016/0005-2744(70)90072-0')
142
+ end
143
+
144
+ it 'extracts all DOIs from a Crossref sample' do
145
+ Pathname.new(__FILE__).join('..', '..', 'fixtures', 'dois.txt').each_line do |doi|
146
+ doi.chomp!
147
+
148
+ expect(described_class.extract(doi)).to contain_exactly(doi)
149
+ end
150
+ end
127
151
  end
@@ -1,99 +1,99 @@
1
- require 'identifiers/pubmed_id'
2
-
3
- RSpec.describe Identifiers::PubmedId do
4
- it 'extracts PubMed IDs' do
5
- expect(described_class.extract("123\n456")).to contain_exactly('123', '456')
6
- end
7
-
8
- it 'extracts PubMed IDs from a PubMed URL with www' do
9
- url = 'http://www.ncbi.nlm.nih.gov/pubmed/123456'
10
-
11
- expect(described_class.extract(url)).to contain_exactly('123456')
12
- end
13
-
14
- it 'extracts PubMed IDs from a PubMed URL with www and https' do
15
- url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456'
16
-
17
- expect(described_class.extract(url)).to contain_exactly('123456')
18
- end
19
-
20
- it 'extracts PubMed IDs from a PubMed URL without www' do
21
- url = 'http://ncbi.nlm.nih.gov/pubmed/123456'
22
-
23
- expect(described_class.extract(url)).to contain_exactly('123456')
24
- end
25
-
26
- it 'extracts PubMed IDs from a PubMed URL without www but with https' do
27
- url = 'https://ncbi.nlm.nih.gov/pubmed/123456'
28
-
29
- expect(described_class.extract(url)).to contain_exactly('123456')
30
- end
31
-
32
- it 'extracts PubMed IDs from a PubMed mobile URL' do
33
- url = 'https://www.ncbi.nlm.nih.gov/m/pubmed/123456'
34
-
35
- expect(described_class.extract(url)).to contain_exactly('123456')
36
- end
37
-
38
- it 'extracts PubMed IDs from a PubMed URL with hash parameters' do
39
- url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456#cm6191871_69589'
40
-
41
- expect(described_class.extract(url)).to contain_exactly('123456')
42
- end
43
-
44
- it 'extracts PubMed IDs from a PubMed URL with query parameters' do
45
- url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456?hi=hello&goodbye=bye'
46
-
47
- expect(described_class.extract(url)).to contain_exactly('123456')
48
- end
49
-
50
- it 'extracts zero leading PubMed IDs from a PubMed URL with query parameters' do
51
- url = 'https://www.ncbi.nlm.nih.gov/pubmed/00123456?hi=hello&goodbye=bye'
52
-
53
- expect(described_class.extract(url)).to contain_exactly('123456')
54
- end
55
-
56
- it 'extracts both number and URLs PubMed IDs' do
57
- url = 'PubMed ID: 112233 another: https://www.ncbi.nlm.nih.gov/pubmed/123456'
58
-
59
- expect(described_class.extract(url)).to contain_exactly('112233', '123456')
60
- end
61
-
62
- it 'does not return outputs with PubMed IDs in DOIs' do
63
- str = "10.1038/nplants.2015.3\n10.1126/science.286.5445.1679e"
64
-
65
- expect(described_class.extract(str)).to be_empty
66
- end
67
-
68
- it 'strips leading 0s' do
69
- expect(described_class.extract("0000010203\n000456000")).to contain_exactly('10203', '456000')
70
- end
71
-
72
- it 'does not consider 0 as a valid PubMed ID' do
73
- expect(described_class.extract('00000000')).to be_empty
74
- end
75
-
76
- it 'extracts PubMed IDs separated by Unicode whitespace' do
77
- expect(described_class.extract('123 456')).to contain_exactly('123', '456')
78
- end
79
-
80
- it 'considers Fixnum as potential PubMed IDs too' do
81
- expect(described_class.extract(123)).to contain_exactly('123')
82
- end
83
-
84
- it 'extracts PubMed IDs with pmid scheme' do
85
- expect(described_class.extract('pmid:123')).to contain_exactly('123')
86
- end
87
-
88
- it 'strips leading zeroes from pmid scheme' do
89
- expect(described_class.extract('pmid:000123')).to contain_exactly('123')
90
- end
91
-
92
- it 'extracts PubMed IDs with info pmid scheme' do
93
- expect(described_class.extract('info:pmid/123')).to contain_exactly('123')
94
- end
95
-
96
- it 'strips leading zeroes from info pmid scheme' do
97
- expect(described_class.extract('info:pmid/000123')).to contain_exactly('123')
98
- end
99
- end
1
+ require 'identifiers/pubmed_id'
2
+
3
+ RSpec.describe Identifiers::PubmedId do
4
+ it 'extracts PubMed IDs' do
5
+ expect(described_class.extract("123\n456")).to contain_exactly('123', '456')
6
+ end
7
+
8
+ it 'extracts PubMed IDs from a PubMed URL with www' do
9
+ url = 'http://www.ncbi.nlm.nih.gov/pubmed/123456'
10
+
11
+ expect(described_class.extract(url)).to contain_exactly('123456')
12
+ end
13
+
14
+ it 'extracts PubMed IDs from a PubMed URL with www and https' do
15
+ url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456'
16
+
17
+ expect(described_class.extract(url)).to contain_exactly('123456')
18
+ end
19
+
20
+ it 'extracts PubMed IDs from a PubMed URL without www' do
21
+ url = 'http://ncbi.nlm.nih.gov/pubmed/123456'
22
+
23
+ expect(described_class.extract(url)).to contain_exactly('123456')
24
+ end
25
+
26
+ it 'extracts PubMed IDs from a PubMed URL without www but with https' do
27
+ url = 'https://ncbi.nlm.nih.gov/pubmed/123456'
28
+
29
+ expect(described_class.extract(url)).to contain_exactly('123456')
30
+ end
31
+
32
+ it 'extracts PubMed IDs from a PubMed mobile URL' do
33
+ url = 'https://www.ncbi.nlm.nih.gov/m/pubmed/123456'
34
+
35
+ expect(described_class.extract(url)).to contain_exactly('123456')
36
+ end
37
+
38
+ it 'extracts PubMed IDs from a PubMed URL with hash parameters' do
39
+ url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456#cm6191871_69589'
40
+
41
+ expect(described_class.extract(url)).to contain_exactly('123456')
42
+ end
43
+
44
+ it 'extracts PubMed IDs from a PubMed URL with query parameters' do
45
+ url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456?hi=hello&goodbye=bye'
46
+
47
+ expect(described_class.extract(url)).to contain_exactly('123456')
48
+ end
49
+
50
+ it 'extracts zero leading PubMed IDs from a PubMed URL with query parameters' do
51
+ url = 'https://www.ncbi.nlm.nih.gov/pubmed/00123456?hi=hello&goodbye=bye'
52
+
53
+ expect(described_class.extract(url)).to contain_exactly('123456')
54
+ end
55
+
56
+ it 'extracts both number and URLs PubMed IDs' do
57
+ url = 'PubMed ID: 112233 another: https://www.ncbi.nlm.nih.gov/pubmed/123456'
58
+
59
+ expect(described_class.extract(url)).to contain_exactly('112233', '123456')
60
+ end
61
+
62
+ it 'does not return outputs with PubMed IDs in DOIs' do
63
+ str = "10.1038/nplants.2015.3\n10.1126/science.286.5445.1679e"
64
+
65
+ expect(described_class.extract(str)).to be_empty
66
+ end
67
+
68
+ it 'strips leading 0s' do
69
+ expect(described_class.extract("0000010203\n000456000")).to contain_exactly('10203', '456000')
70
+ end
71
+
72
+ it 'does not consider 0 as a valid PubMed ID' do
73
+ expect(described_class.extract('00000000')).to be_empty
74
+ end
75
+
76
+ it 'extracts PubMed IDs separated by Unicode whitespace' do
77
+ expect(described_class.extract('123 456')).to contain_exactly('123', '456')
78
+ end
79
+
80
+ it 'considers Fixnum as potential PubMed IDs too' do
81
+ expect(described_class.extract(123)).to contain_exactly('123')
82
+ end
83
+
84
+ it 'extracts PubMed IDs with pmid scheme' do
85
+ expect(described_class.extract('pmid:123')).to contain_exactly('123')
86
+ end
87
+
88
+ it 'strips leading zeroes from pmid scheme' do
89
+ expect(described_class.extract('pmid:000123')).to contain_exactly('123')
90
+ end
91
+
92
+ it 'extracts PubMed IDs with info pmid scheme' do
93
+ expect(described_class.extract('info:pmid/123')).to contain_exactly('123')
94
+ end
95
+
96
+ it 'strips leading zeroes from info pmid scheme' do
97
+ expect(described_class.extract('info:pmid/000123')).to contain_exactly('123')
98
+ end
99
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,18 +1,18 @@
1
- RSpec.configure do |config|
2
- config.filter_run :focus
3
- config.run_all_when_everything_filtered = true
4
- config.example_status_persistence_file_path = "spec/examples.txt"
5
- config.disable_monkey_patching!
6
- config.warnings = true
7
- config.order = :random
8
- config.default_formatter = 'doc' if config.files_to_run.one?
9
- Kernel.srand config.seed
10
-
11
- config.expect_with :rspec do |expectations|
12
- expectations.include_chain_clauses_in_custom_matcher_descriptions = true
13
- end
14
-
15
- config.mock_with :rspec do |mocks|
16
- mocks.verify_partial_doubles = true
17
- end
18
- end
1
+ RSpec.configure do |config|
2
+ config.filter_run :focus
3
+ config.run_all_when_everything_filtered = true
4
+ config.example_status_persistence_file_path = "spec/examples.txt"
5
+ config.disable_monkey_patching!
6
+ config.warnings = true
7
+ config.order = :random
8
+ config.default_formatter = 'doc' if config.files_to_run.one?
9
+ Kernel.srand config.seed
10
+
11
+ config.expect_with :rspec do |expectations|
12
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
13
+ end
14
+
15
+ config.mock_with :rspec do |mocks|
16
+ mocks.verify_partial_doubles = true
17
+ end
18
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: identifiers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Hernandez
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-03-12 00:00:00.000000000 Z
12
+ date: 2018-04-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: urn
@@ -119,19 +119,19 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
119
  version: '0'
120
120
  requirements: []
121
121
  rubyforge_project:
122
- rubygems_version: 2.6.13
122
+ rubygems_version: 2.7.3
123
123
  signing_key:
124
124
  specification_version: 4
125
125
  summary: Utilities library for various scholarly identifiers used by Altmetric
126
126
  test_files:
127
- - spec/identifiers/ads_bibcode_spec.rb
127
+ - spec/spec_helper.rb
128
+ - spec/identifiers/repec_id_spec.rb
129
+ - spec/identifiers/pubmed_id_spec.rb
128
130
  - spec/identifiers/arxiv_id_spec.rb
129
- - spec/identifiers/doi_spec.rb
131
+ - spec/identifiers/urn_spec.rb
130
132
  - spec/identifiers/handle_spec.rb
133
+ - spec/identifiers/ads_bibcode_spec.rb
131
134
  - spec/identifiers/isbn_spec.rb
135
+ - spec/identifiers/doi_spec.rb
132
136
  - spec/identifiers/national_clinical_trial_id_spec.rb
133
137
  - spec/identifiers/orcid_spec.rb
134
- - spec/identifiers/pubmed_id_spec.rb
135
- - spec/identifiers/repec_id_spec.rb
136
- - spec/identifiers/urn_spec.rb
137
- - spec/spec_helper.rb