identifiers 0.11.0 → 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: ab7d56e3e6048da713104ab263f36bdd96a9da98
4
- data.tar.gz: 4d2636ffdeeb7b3a1e563b36c761e39dc81f488e
2
+ SHA256:
3
+ metadata.gz: 6ec6f86cbb54595ef76ab6615ba9e3a5f671478daebf31412f6b6820f1cb020d
4
+ data.tar.gz: 63881a8f3863926d66005133ba9fa0cdf471a7446c2dffa12534ac540911207e
5
5
  SHA512:
6
- metadata.gz: 3a62e98dcf6f35c180e1dd2c2e574f8993c6123a182e6fee782a96bc85e342de5b3894455f8fa99325100edcb8f3a9f0fa33d3135040b775bbf6e0b442bc1992
7
- data.tar.gz: fcc0d3d178e49ee0eefed149200f4f8489d58c4cb953b21a0cab63067ae237432eee607e8e81460f70cbdba9ebb8d1d44c6f3d01fe82f16c9038a12f31aba609
6
+ metadata.gz: 97b2352aa4a99ec8cf5158e1239f273e10f28321df11941445890549b827365e7a8ba1fb54d4a268bd146889f94aaa4fec0135ec1b67f779094557c6987bc466
7
+ data.tar.gz: f0e6a040a480bf9cf42900997415f8341fb7f68274434c936e725bafab484ce703fc6f0e21c5e317a85456c9fc0c2305e80ba9b1bae99cc7e6ba33fa40a890cc
data/CHANGELOG.md CHANGED
@@ -2,8 +2,15 @@
2
2
  All notable changes to this project will be documented in this file. This
3
3
  project adheres to [Semantic Versioning](http://semver.org/).
4
4
 
5
+ ## [0.12.0] - 2018-04-06
6
+ ### Added
7
+ - Added support for extracting more old Wiley DOIs
8
+
9
+ ### Changed
10
+ - Performance improvements when extracting DOIs with trailing punctuation.
11
+
5
12
  ## [0.11.0] - 2018-03-12
6
- ## Fixed
13
+ ### Fixed
7
14
  - Stricter ISBN extraction: consistent hyphenation (#27) and correct number of groups (#28)
8
15
  - Prevent stack overflow when extracting DOIs (#25)
9
16
 
@@ -74,3 +81,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
74
81
  [0.9.1]: https://github.com/altmetric/identifiers/releases/tag/v0.9.1
75
82
  [0.10.0]: https://github.com/altmetric/identifiers/releases/tag/v0.10.0
76
83
  [0.11.0]: https://github.com/altmetric/identifiers/releases/tag/v0.11.0
84
+ [0.12.0]: https://github.com/altmetric/identifiers/releases/tag/v0.12.0
data/README.md CHANGED
@@ -18,7 +18,7 @@ Collection of utilities related to the extraction, validation and normalization
18
18
  Add this line to your application's `Gemfile`:
19
19
 
20
20
  ```ruby
21
- gem 'identifiers', '~> 0.11'
21
+ gem 'identifiers', '~> 0.12'
22
22
  ```
23
23
 
24
24
  And then execute:
@@ -2,42 +2,33 @@ module Identifiers
2
2
  class DOI
3
3
  REGEXP = %r{
4
4
  \b
5
- 10 # Directory indicator (always 10)
5
+ 10 # Directory indicator (always 10)
6
6
  \.
7
7
  (?:
8
8
  # ISBN-A
9
- 97[89]\. # ISBN (GS1) Bookland prefix
10
- \d{2,8} # ISBN registration group element and publisher prefix
11
- / # Prefix/suffix divider
12
- \d{1,7} # ISBN title enumerator and check digit
9
+ 97[89]\. # ISBN (GS1) Bookland prefix
10
+ \d{2,8} # ISBN registration group element and publisher prefix
11
+ / # Prefix/suffix divider
12
+ \d{1,7} # ISBN title enumerator and check digit
13
13
  |
14
14
  # DOI
15
- \d{4,9} # Registrant code
16
- / # Prefix/suffix divider
17
- [^[:space:]]+ # DOI suffix
15
+ \d{4,9} # Registrant code
16
+ / # Prefix/suffix divider
17
+ (?:
18
+ # DOI suffix
19
+ [^[:space:]]+;2-[\#0-9a-z] # Early Wiley suffix
20
+ |
21
+ [^[:space:]]+ # Suffix...
22
+ \([^[:space:])]+\) # Ending in balanced parentheses...
23
+ (?![^[:space:]\p{P}]) # Not followed by more non-punctuation suffix characters
24
+ |
25
+ [^[:space:]]+(?![[:space:]])\p{^P} # Suffix ending in non-punctuation
26
+ )
18
27
  )
19
28
  }x
20
- VALID_ENDING = /
21
- (?:
22
- \p{^Punct} # Non-punctuation character
23
- |
24
- \(.+\) # Balanced parentheses
25
- |
26
- 2-\# # Early Wiley DOI suffix
27
- )
28
- \z
29
- /x
30
29
 
31
30
  def self.extract(str)
32
- str.to_s.downcase.scan(REGEXP).map { |doi| extract_one(doi) }.compact
33
- end
34
-
35
- def self.extract_one(str)
36
- while (match = str.to_s.downcase[REGEXP])
37
- break match if match =~ VALID_ENDING
38
-
39
- str = match.sub(/\p{Punct}\z/, '')
40
- end
31
+ str.to_s.downcase.scan(REGEXP)
41
32
  end
42
33
  end
43
34
  end
@@ -1,11 +1,11 @@
1
- module Identifiers
2
- class PubmedId
3
- ZERO_PADDED_NUMBER = %r{(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])}
4
- PUBMED_URL = %r{(?:https?://(?:www\.)?ncbi\.nlm\.nih\.gov/(?:m/)?pubmed/|pmid:|info:pmid/)0*(\d+)}i
5
-
6
- def self.extract(str)
7
- str = str.to_s
8
- str.scan(ZERO_PADDED_NUMBER).flatten | str.scan(PUBMED_URL).flatten
9
- end
10
- end
11
- end
1
+ module Identifiers
2
+ class PubmedId
3
+ ZERO_PADDED_NUMBER = %r{(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])}
4
+ PUBMED_URL = %r{(?:https?://(?:www\.)?ncbi\.nlm\.nih\.gov/(?:m/)?pubmed/|pmid:|info:pmid/)0*(\d+)}i
5
+
6
+ def self.extract(str)
7
+ str = str.to_s
8
+ str.scan(ZERO_PADDED_NUMBER).flatten | str.scan(PUBMED_URL).flatten
9
+ end
10
+ end
11
+ end
@@ -66,9 +66,9 @@ RSpec.describe Identifiers::DOI do
66
66
  end
67
67
 
68
68
  it 'extracts old Wiley DOIs' do
69
- str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-#'
69
+ str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-# 10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5'
70
70
 
71
- expect(described_class.extract(str)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#')
71
+ expect(described_class.extract(str)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#', '10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5')
72
72
  end
73
73
 
74
74
  it 'does not extract a closing parenthesis if not part of the DOI' do
@@ -124,4 +124,28 @@ RSpec.describe Identifiers::DOI do
124
124
  it 'extracts DOIs separated by Unicode whitespace' do
125
125
  expect(described_class.extract('10.1234/foo  10.1234/bar')).to contain_exactly('10.1234/foo', '10.1234/bar')
126
126
  end
127
+
128
+ it 'does not extract DOIs with extra digits prefixed' do
129
+ expect(described_class.extract('110.1234/foo')).to be_empty
130
+ end
131
+
132
+ it 'extracts DOIs from a string with trailing closing parentheses' do
133
+ expect(described_class.extract('10.1130/2013.2502(04))')).to contain_exactly('10.1130/2013.2502(04)')
134
+ end
135
+
136
+ it 'extracts DOIs from a string with multiple trailing closing parentheses' do
137
+ expect(described_class.extract('10.1130/2013.2502(04))))')).to contain_exactly('10.1130/2013.2502(04)')
138
+ end
139
+
140
+ it 'extracts DOIs with parentheses within the suffix' do
141
+ expect(described_class.extract('10.1016/0005-2744(70)90072-0')).to contain_exactly('10.1016/0005-2744(70)90072-0')
142
+ end
143
+
144
+ it 'extracts all DOIs from a Crossref sample' do
145
+ Pathname.new(__FILE__).join('..', '..', 'fixtures', 'dois.txt').each_line do |doi|
146
+ doi.chomp!
147
+
148
+ expect(described_class.extract(doi)).to contain_exactly(doi)
149
+ end
150
+ end
127
151
  end
@@ -1,99 +1,99 @@
1
- require 'identifiers/pubmed_id'
2
-
3
- RSpec.describe Identifiers::PubmedId do
4
- it 'extracts PubMed IDs' do
5
- expect(described_class.extract("123\n456")).to contain_exactly('123', '456')
6
- end
7
-
8
- it 'extracts PubMed IDs from a PubMed URL with www' do
9
- url = 'http://www.ncbi.nlm.nih.gov/pubmed/123456'
10
-
11
- expect(described_class.extract(url)).to contain_exactly('123456')
12
- end
13
-
14
- it 'extracts PubMed IDs from a PubMed URL with www and https' do
15
- url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456'
16
-
17
- expect(described_class.extract(url)).to contain_exactly('123456')
18
- end
19
-
20
- it 'extracts PubMed IDs from a PubMed URL without www' do
21
- url = 'http://ncbi.nlm.nih.gov/pubmed/123456'
22
-
23
- expect(described_class.extract(url)).to contain_exactly('123456')
24
- end
25
-
26
- it 'extracts PubMed IDs from a PubMed URL without www but with https' do
27
- url = 'https://ncbi.nlm.nih.gov/pubmed/123456'
28
-
29
- expect(described_class.extract(url)).to contain_exactly('123456')
30
- end
31
-
32
- it 'extracts PubMed IDs from a PubMed mobile URL' do
33
- url = 'https://www.ncbi.nlm.nih.gov/m/pubmed/123456'
34
-
35
- expect(described_class.extract(url)).to contain_exactly('123456')
36
- end
37
-
38
- it 'extracts PubMed IDs from a PubMed URL with hash parameters' do
39
- url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456#cm6191871_69589'
40
-
41
- expect(described_class.extract(url)).to contain_exactly('123456')
42
- end
43
-
44
- it 'extracts PubMed IDs from a PubMed URL with query parameters' do
45
- url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456?hi=hello&goodbye=bye'
46
-
47
- expect(described_class.extract(url)).to contain_exactly('123456')
48
- end
49
-
50
- it 'extracts zero leading PubMed IDs from a PubMed URL with query parameters' do
51
- url = 'https://www.ncbi.nlm.nih.gov/pubmed/00123456?hi=hello&goodbye=bye'
52
-
53
- expect(described_class.extract(url)).to contain_exactly('123456')
54
- end
55
-
56
- it 'extracts both number and URLs PubMed IDs' do
57
- url = 'PubMed ID: 112233 another: https://www.ncbi.nlm.nih.gov/pubmed/123456'
58
-
59
- expect(described_class.extract(url)).to contain_exactly('112233', '123456')
60
- end
61
-
62
- it 'does not return outputs with PubMed IDs in DOIs' do
63
- str = "10.1038/nplants.2015.3\n10.1126/science.286.5445.1679e"
64
-
65
- expect(described_class.extract(str)).to be_empty
66
- end
67
-
68
- it 'strips leading 0s' do
69
- expect(described_class.extract("0000010203\n000456000")).to contain_exactly('10203', '456000')
70
- end
71
-
72
- it 'does not consider 0 as a valid PubMed ID' do
73
- expect(described_class.extract('00000000')).to be_empty
74
- end
75
-
76
- it 'extracts PubMed IDs separated by Unicode whitespace' do
77
- expect(described_class.extract('123 456')).to contain_exactly('123', '456')
78
- end
79
-
80
- it 'considers Fixnum as potential PubMed IDs too' do
81
- expect(described_class.extract(123)).to contain_exactly('123')
82
- end
83
-
84
- it 'extracts PubMed IDs with pmid scheme' do
85
- expect(described_class.extract('pmid:123')).to contain_exactly('123')
86
- end
87
-
88
- it 'strips leading zeroes from pmid scheme' do
89
- expect(described_class.extract('pmid:000123')).to contain_exactly('123')
90
- end
91
-
92
- it 'extracts PubMed IDs with info pmid scheme' do
93
- expect(described_class.extract('info:pmid/123')).to contain_exactly('123')
94
- end
95
-
96
- it 'strips leading zeroes from info pmid scheme' do
97
- expect(described_class.extract('info:pmid/000123')).to contain_exactly('123')
98
- end
99
- end
1
+ require 'identifiers/pubmed_id'
2
+
3
+ RSpec.describe Identifiers::PubmedId do
4
+ it 'extracts PubMed IDs' do
5
+ expect(described_class.extract("123\n456")).to contain_exactly('123', '456')
6
+ end
7
+
8
+ it 'extracts PubMed IDs from a PubMed URL with www' do
9
+ url = 'http://www.ncbi.nlm.nih.gov/pubmed/123456'
10
+
11
+ expect(described_class.extract(url)).to contain_exactly('123456')
12
+ end
13
+
14
+ it 'extracts PubMed IDs from a PubMed URL with www and https' do
15
+ url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456'
16
+
17
+ expect(described_class.extract(url)).to contain_exactly('123456')
18
+ end
19
+
20
+ it 'extracts PubMed IDs from a PubMed URL without www' do
21
+ url = 'http://ncbi.nlm.nih.gov/pubmed/123456'
22
+
23
+ expect(described_class.extract(url)).to contain_exactly('123456')
24
+ end
25
+
26
+ it 'extracts PubMed IDs from a PubMed URL without www but with https' do
27
+ url = 'https://ncbi.nlm.nih.gov/pubmed/123456'
28
+
29
+ expect(described_class.extract(url)).to contain_exactly('123456')
30
+ end
31
+
32
+ it 'extracts PubMed IDs from a PubMed mobile URL' do
33
+ url = 'https://www.ncbi.nlm.nih.gov/m/pubmed/123456'
34
+
35
+ expect(described_class.extract(url)).to contain_exactly('123456')
36
+ end
37
+
38
+ it 'extracts PubMed IDs from a PubMed URL with hash parameters' do
39
+ url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456#cm6191871_69589'
40
+
41
+ expect(described_class.extract(url)).to contain_exactly('123456')
42
+ end
43
+
44
+ it 'extracts PubMed IDs from a PubMed URL with query parameters' do
45
+ url = 'https://www.ncbi.nlm.nih.gov/pubmed/123456?hi=hello&goodbye=bye'
46
+
47
+ expect(described_class.extract(url)).to contain_exactly('123456')
48
+ end
49
+
50
+ it 'extracts zero leading PubMed IDs from a PubMed URL with query parameters' do
51
+ url = 'https://www.ncbi.nlm.nih.gov/pubmed/00123456?hi=hello&goodbye=bye'
52
+
53
+ expect(described_class.extract(url)).to contain_exactly('123456')
54
+ end
55
+
56
+ it 'extracts both number and URLs PubMed IDs' do
57
+ url = 'PubMed ID: 112233 another: https://www.ncbi.nlm.nih.gov/pubmed/123456'
58
+
59
+ expect(described_class.extract(url)).to contain_exactly('112233', '123456')
60
+ end
61
+
62
+ it 'does not return outputs with PubMed IDs in DOIs' do
63
+ str = "10.1038/nplants.2015.3\n10.1126/science.286.5445.1679e"
64
+
65
+ expect(described_class.extract(str)).to be_empty
66
+ end
67
+
68
+ it 'strips leading 0s' do
69
+ expect(described_class.extract("0000010203\n000456000")).to contain_exactly('10203', '456000')
70
+ end
71
+
72
+ it 'does not consider 0 as a valid PubMed ID' do
73
+ expect(described_class.extract('00000000')).to be_empty
74
+ end
75
+
76
+ it 'extracts PubMed IDs separated by Unicode whitespace' do
77
+ expect(described_class.extract('123 456')).to contain_exactly('123', '456')
78
+ end
79
+
80
+ it 'considers Fixnum as potential PubMed IDs too' do
81
+ expect(described_class.extract(123)).to contain_exactly('123')
82
+ end
83
+
84
+ it 'extracts PubMed IDs with pmid scheme' do
85
+ expect(described_class.extract('pmid:123')).to contain_exactly('123')
86
+ end
87
+
88
+ it 'strips leading zeroes from pmid scheme' do
89
+ expect(described_class.extract('pmid:000123')).to contain_exactly('123')
90
+ end
91
+
92
+ it 'extracts PubMed IDs with info pmid scheme' do
93
+ expect(described_class.extract('info:pmid/123')).to contain_exactly('123')
94
+ end
95
+
96
+ it 'strips leading zeroes from info pmid scheme' do
97
+ expect(described_class.extract('info:pmid/000123')).to contain_exactly('123')
98
+ end
99
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,18 +1,18 @@
1
- RSpec.configure do |config|
2
- config.filter_run :focus
3
- config.run_all_when_everything_filtered = true
4
- config.example_status_persistence_file_path = "spec/examples.txt"
5
- config.disable_monkey_patching!
6
- config.warnings = true
7
- config.order = :random
8
- config.default_formatter = 'doc' if config.files_to_run.one?
9
- Kernel.srand config.seed
10
-
11
- config.expect_with :rspec do |expectations|
12
- expectations.include_chain_clauses_in_custom_matcher_descriptions = true
13
- end
14
-
15
- config.mock_with :rspec do |mocks|
16
- mocks.verify_partial_doubles = true
17
- end
18
- end
1
+ RSpec.configure do |config|
2
+ config.filter_run :focus
3
+ config.run_all_when_everything_filtered = true
4
+ config.example_status_persistence_file_path = "spec/examples.txt"
5
+ config.disable_monkey_patching!
6
+ config.warnings = true
7
+ config.order = :random
8
+ config.default_formatter = 'doc' if config.files_to_run.one?
9
+ Kernel.srand config.seed
10
+
11
+ config.expect_with :rspec do |expectations|
12
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
13
+ end
14
+
15
+ config.mock_with :rspec do |mocks|
16
+ mocks.verify_partial_doubles = true
17
+ end
18
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: identifiers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Hernandez
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-03-12 00:00:00.000000000 Z
12
+ date: 2018-04-06 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: urn
@@ -119,19 +119,19 @@ required_rubygems_version: !ruby/object:Gem::Requirement
119
119
  version: '0'
120
120
  requirements: []
121
121
  rubyforge_project:
122
- rubygems_version: 2.6.13
122
+ rubygems_version: 2.7.3
123
123
  signing_key:
124
124
  specification_version: 4
125
125
  summary: Utilities library for various scholarly identifiers used by Altmetric
126
126
  test_files:
127
- - spec/identifiers/ads_bibcode_spec.rb
127
+ - spec/spec_helper.rb
128
+ - spec/identifiers/repec_id_spec.rb
129
+ - spec/identifiers/pubmed_id_spec.rb
128
130
  - spec/identifiers/arxiv_id_spec.rb
129
- - spec/identifiers/doi_spec.rb
131
+ - spec/identifiers/urn_spec.rb
130
132
  - spec/identifiers/handle_spec.rb
133
+ - spec/identifiers/ads_bibcode_spec.rb
131
134
  - spec/identifiers/isbn_spec.rb
135
+ - spec/identifiers/doi_spec.rb
132
136
  - spec/identifiers/national_clinical_trial_id_spec.rb
133
137
  - spec/identifiers/orcid_spec.rb
134
- - spec/identifiers/pubmed_id_spec.rb
135
- - spec/identifiers/repec_id_spec.rb
136
- - spec/identifiers/urn_spec.rb
137
- - spec/spec_helper.rb