identifiers 0.12.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 423438daad2350706eced26425401a5d97ae715095b54853fe55ba4f8dbbaf2d
4
- data.tar.gz: 2776accbdccfc17965dd69507a0b326b2a57e275811828b05e76b47c0f54bb28
3
+ metadata.gz: a4546231ed2288fa0d807ac0fc71f268f92de75dbc38fb057410441b3b77ca4c
4
+ data.tar.gz: b55eaf0b183185203c0957faf86335ac3b1996ed7312692d92daf013162798ff
5
5
  SHA512:
6
- metadata.gz: e319511db960df762b3a646239edf45ff683bef6227736c673072de6ef5d1649dd300232f93c0af0be1d7744308de4f07ebfd53582add6069c6b3f425249722b
7
- data.tar.gz: 1ebd31499facbc5a51ec0dcf30fadb920e86ad6e53c20f4bb5dd66fa11c9f9cc7842a8b703348f13aac2675c0415c2c0f9464c42baafd055669c881342c33217
6
+ metadata.gz: 81cfdbbd15e12d7394deeaf45463837ecb01a995fe9bc713736578153f1725d0a6cb71913453d918ad2a1d2cd5970caa6d18656df70f3b175628c4f753f07c37
7
+ data.tar.gz: 5472c1c3f7b4c04d6a51f4d6fddfd476d3c8823d6c98a74de295cc730fe72bfb637b88e9c9f606d0144418a3d2b7828faf5ca145c1b53b19d8691c937fcc600f
@@ -2,6 +2,11 @@
2
2
  All notable changes to this project will be documented in this file. This
3
3
  project adheres to [Semantic Versioning](http://semver.org/).
4
4
 
5
+ ## [0.13.0] - 2019-09-04
6
+ ### Added
7
+ - Added a `strict` option to DOI extraction; when enabled, trailing periods
8
+ are kept instead of being stripped
9
+
5
10
  ## [0.12.1] - 2018-04-09
6
11
  ### Fixed
7
12
  - Restored support for extracting hyphenated ISBN-10s with registration group
@@ -88,3 +93,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
88
93
  [0.11.0]: https://github.com/altmetric/identifiers/releases/tag/v0.11.0
89
94
  [0.12.0]: https://github.com/altmetric/identifiers/releases/tag/v0.12.0
90
95
  [0.12.1]: https://github.com/altmetric/identifiers/releases/tag/v0.12.1
96
+ [0.13.0]: https://github.com/altmetric/identifiers/releases/tag/v0.13.0
data/README.md CHANGED
@@ -32,18 +32,34 @@ Or install it yourself as:
32
32
  ## Usage
33
33
 
34
34
  ```ruby
35
- Identifiers::DOI.extract('example: 10.123/abcd.efghi')
36
- # => ["10.123/abcd.efghi"]
35
+ Identifiers::DOI.extract('example: 10.1234/5678.ABC')
36
+ # => ["10.1234/5678.abc"]
37
37
 
38
38
  Identifiers::DOI.extract('no DOIs here')
39
39
  # => []
40
40
 
41
41
  Identifiers::URN.new('urn:abc:123')
42
42
  # => #<URN:0x007ff11c13d930 @urn="urn:abc:123", @nid="abc", @nss="123">
43
+
43
44
  Identifiers::URN('urn:abc:123')
44
45
  # => #<URN:0x007ff11c0ff568 @urn="urn:abc:123", @nid="abc", @nss="123">
45
46
  ```
46
47
 
48
+ A small percentage of DOIs end in a trailing `.`. However, returning trailing
49
+ periods from the default extraction method would likely produce quite a few
50
+ false positives.
51
+ `DOI.extract` accepts a `strict` option, which can be set to `true` to keep up to
52
+ three trailing `.` on extracted DOIs. By default, this option is `false`, which strips
53
+ any trailing `.`:
54
+
55
+ ```ruby
56
+ Identifiers::DOI.extract('example: 10.1234/5678.abc.', strict: true)
57
+ # => ["10.1234/5678.abc."]
58
+
59
+ Identifiers::DOI.extract('example: 10.1234/5678.abc.')
60
+ # => ["10.1234/5678.abc"]
61
+ ```
62
+
47
63
  ## By identifier
48
64
 
49
65
  `.extract` is a common method that works across all the supported identifiers.
@@ -24,11 +24,17 @@ module Identifiers
24
24
  |
25
25
  [^[:space:]]+(?![[:space:]])\p{^P} # Suffix ending in non-punctuation
26
26
  )
27
+ \.{0,3} # Allow a DOI to end with up to 3 .
27
28
  )
28
29
  }x
29
30
 
30
- def self.extract(str)
31
- str.to_s.downcase.scan(REGEXP)
31
+ def self.extract(str, options = {})
32
+ strict = options.fetch(:strict, false)
33
+
34
+ dois = str.to_s.downcase.scan(REGEXP)
35
+ dois = dois.map { |doi| doi.gsub(/\.+$/, '') } unless strict
36
+
37
+ dois
32
38
  end
33
39
  end
34
40
  end
@@ -1,151 +1,195 @@
1
1
  require 'identifiers/doi'
2
2
 
3
3
  RSpec.describe Identifiers::DOI do
4
- it 'extracts DOIs from a string' do
5
- str = 'This is an example of a DOI: 10.1049/el.2013.3006'
4
+ OPTIONS = [{ strict: false }, { strict: true }].freeze
6
5
 
7
- expect(described_class.extract(str)).to contain_exactly('10.1049/el.2013.3006')
6
+ def each_doi(file)
7
+ Pathname.new(__FILE__).join('..', '..', 'fixtures', file).each_line do |doi|
8
+ yield(doi.chomp!)
9
+ end
8
10
  end
9
11
 
10
- it 'extracts DOIs from anywhere in a string' do
11
- str = 'This is an example of a DOI - 10.1049/el.2013.3006 - which is entirely valid'
12
+ OPTIONS.each do |options|
13
+ context "when extracting with options set to #{options.inspect}" do
14
+ it 'extracts DOIs from a string' do
15
+ str = 'This is an example of a DOI: 10.1049/el.2013.3006'
12
16
 
13
- expect(described_class.extract(str)).to contain_exactly('10.1049/el.2013.3006')
14
- end
17
+ expect(described_class.extract(str, options)).to contain_exactly('10.1049/el.2013.3006')
18
+ end
15
19
 
16
- it 'downcases the DOIs extracted' do
17
- str = 'This is an example of a DOI: 10.1097/01.ASW.0000443266.17665.19'
20
+ it 'extracts DOIs from anywhere in a string' do
21
+ str = 'This is an example of a DOI - 10.1049/el.2013.3006 - which is entirely valid'
18
22
 
19
- expect(described_class.extract(str)).to contain_exactly('10.1097/01.asw.0000443266.17665.19')
20
- end
23
+ expect(described_class.extract(str, options)).to contain_exactly('10.1049/el.2013.3006')
24
+ end
21
25
 
22
- it 'does not extract a PubMed ID' do
23
- str = 'This is NOT a DOI: 123456'
26
+ it 'downcases the DOIs extracted' do
27
+ str = 'This is an example of a DOI: 10.1097/01.ASW.0000443266.17665.19'
24
28
 
25
- expect(described_class.extract(str)).to be_empty
26
- end
29
+ expect(described_class.extract(str, options)).to contain_exactly('10.1097/01.asw.0000443266.17665.19')
30
+ end
27
31
 
28
- it 'returns no DOIs if given nothing' do
29
- expect(described_class.extract(nil)).to be_empty
30
- end
32
+ it 'does not extract a PubMed ID' do
33
+ str = 'This is NOT a DOI: 123456'
31
34
 
32
- it 'extracts ISBN-As' do
33
- str = 'This is an ISBN-A: 10.978.8898392/315'
35
+ expect(described_class.extract(str, options)).to be_empty
36
+ end
34
37
 
35
- expect(described_class.extract(str)).to contain_exactly('10.978.8898392/315')
36
- end
38
+ it 'returns no DOIs if given nothing' do
39
+ expect(described_class.extract(nil)).to be_empty
40
+ end
37
41
 
38
- it 'does not extract invalid ISBN-As' do
39
- str = 'This is not an ISBN-A: 10.978.8898392/NotARealIsbnA'
42
+ it 'extracts ISBN-As' do
43
+ str = 'This is an ISBN-A: 10.978.8898392/315'
40
44
 
41
- expect(described_class.extract(str)).to be_empty
42
- end
45
+ expect(described_class.extract(str, options)).to contain_exactly('10.978.8898392/315')
46
+ end
43
47
 
44
- it 'retains closing parentheses that are part of the DOI' do
45
- str = 'This is an example of a DOI: 10.1130/2013.2502(04)'
48
+ it 'does not extract invalid ISBN-As' do
49
+ str = 'This is not an ISBN-A: 10.978.8898392/NotARealIsbnA'
46
50
 
47
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
48
- end
51
+ expect(described_class.extract(str, options)).to be_empty
52
+ end
49
53
 
50
- it 'discards trailing punctuation' do
51
- str = 'This is an example of a DOI: 10.1130/2013.2502.'
54
+ it 'retains closing parentheses that are part of the DOI' do
55
+ str = 'This is an example of a DOI: 10.1130/2013.2502(04)'
52
56
 
53
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
54
- end
57
+ expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502(04)')
58
+ end
55
59
 
56
- it 'discards multiple contiguous trailing punctuation' do
57
- str = 'This is an example of a DOI: 10.1130/2013.2502...",'
60
+ it 'discards ellipses' do
61
+ str = 'This is an example of a DOI: 10.1130/2013.2502'
58
62
 
59
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
60
- end
63
+ expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502')
64
+ end
61
65
 
62
- it 'discards trailing Unicode punctuation' do
63
- str = 'This is an example of a DOI: 10.1130/2013.2502…'
66
+ it 'extracts old Wiley DOIs' do
67
+ str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-# 10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5'
64
68
 
65
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
66
- end
69
+ expect(described_class.extract(str, options)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#', '10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5')
70
+ end
67
71
 
68
- it 'extracts old Wiley DOIs' do
69
- str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-# 10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5'
72
+ it 'does not extract a closing parenthesis if not part of the DOI' do
73
+ str = '(This is an example of a DOI: 10.1130/2013.2502)'
70
74
 
71
- expect(described_class.extract(str)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#', '10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5')
72
- end
75
+ expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502')
76
+ end
73
77
 
74
- it 'does not extract a closing parenthesis if not part of the DOI' do
75
- str = '(This is an example of a DOI: 10.1130/2013.2502)'
78
+ it 'discards trailing punctuation from old Wiley DOIs' do
79
+ str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-#",'
76
80
 
77
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
78
- end
81
+ expect(described_class.extract(str, options)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#')
82
+ end
79
83
 
80
- it 'discards trailing punctuation from old Wiley DOIs' do
81
- str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-#",'
84
+ it 'discards trailing Unicode punctuation after balanced parentheses' do
85
+ str = 'This is an example of a DOI: 10.1130/2013.2502(04)",'
82
86
 
83
- expect(described_class.extract(str)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#')
84
- end
87
+ expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502(04)')
88
+ end
85
89
 
86
- it 'discards trailing punctuation after balanced parentheses' do
87
- str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).'
90
+ it 'discards contiguous trailing punctuation after unbalanced parentheses' do
91
+ str = '(This is an example of a DOI: 10.1130/2013.2502).",'
88
92
 
89
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
90
- end
93
+ expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502')
94
+ end
91
95
 
92
- it 'discards contiguous trailing punctuation after balanced parentheses' do
93
- str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).",'
96
+ it 'does not extract DOIs with purely punctuation suffixes' do
97
+ expect(described_class.extract('10.1130/!).",', options)).to be_empty
98
+ end
94
99
 
95
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
96
- end
100
+ it 'extracts DOIs with emoji in them' do
101
+ expect(described_class.extract('10.1234/🐔💩123🐔🐔🐔123', options)).to contain_exactly('10.1234/🐔💩123🐔🐔🐔123')
102
+ end
97
103
 
98
- it 'discards trailing Unicode punctuation after balanced parentheses' do
99
- str = 'This is an example of a DOI: 10.1130/2013.2502(04)…",'
104
+ it 'extracts DOIs separated by Unicode whitespace' do
105
+ expect(described_class.extract('10.1234/foo  10.1234/bar', options)).to contain_exactly('10.1234/foo', '10.1234/bar')
106
+ end
100
107
 
101
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
102
- end
108
+ it 'does not extract DOIs with extra digits prefixed' do
109
+ expect(described_class.extract('110.1234/foo', options)).to be_empty
110
+ end
103
111
 
104
- it 'discards contiguous trailing punctuation after unbalanced parentheses' do
105
- str = '(This is an example of a DOI: 10.1130/2013.2502).",'
112
+ it 'extracts DOIs from a string with trailing closing parentheses' do
113
+ expect(described_class.extract('(10.1130/2013.2502(04))', options)).to contain_exactly('10.1130/2013.2502(04)')
114
+ end
106
115
 
107
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
108
- end
116
+ it 'extracts DOIs from a string with multiple trailing closing parentheses' do
117
+ expect(described_class.extract('10.1130/2013.2502(04))))', options)).to contain_exactly('10.1130/2013.2502(04)')
118
+ end
109
119
 
110
- it 'does not overflow when given lots of trailing punctuation' do
111
- str = '10.1130/2013.2502' + ('.' * 10000)
120
+ it 'extracts DOIs with parentheses within the suffix' do
121
+ expect(described_class.extract('10.1016/0005-2744(70)90072-0', options)).to contain_exactly('10.1016/0005-2744(70)90072-0')
122
+ end
112
123
 
113
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
124
+ it 'extracts all DOIs from a Crossref sample' do
125
+ each_doi('dois.txt') { |doi|
126
+ expect(described_class.extract(doi, options)).to contain_exactly(doi)
127
+ }
128
+ end
129
+ end
114
130
  end
115
131
 
116
- it 'does not extract DOIs with purely punctuation suffixes' do
117
- expect(described_class.extract('10.1130/!).",')).to be_empty
118
- end
132
+ context 'when no options are provided' do
133
+ it 'discards trailing punctuation' do
134
+ str = 'This is an example of a DOI: 10.1130/2013.2502.'
119
135
 
120
- it 'extracts DOIs with emoji in them' do
121
- expect(described_class.extract('10.1234/🐔💩123🐔🐔🐔123')).to contain_exactly('10.1234/🐔💩123🐔🐔🐔123')
122
- end
136
+ expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
137
+ end
123
138
 
124
- it 'extracts DOIs separated by Unicode whitespace' do
125
- expect(described_class.extract('10.1234/foo  10.1234/bar')).to contain_exactly('10.1234/foo', '10.1234/bar')
126
- end
139
+ it 'discards multiple contiguous trailing punctuation' do
140
+ str = 'This is an example of a DOI: 10.1130/2013.2502...",'
127
141
 
128
- it 'does not extract DOIs with extra digits prefixed' do
129
- expect(described_class.extract('110.1234/foo')).to be_empty
130
- end
142
+ expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
143
+ end
131
144
 
132
- it 'extracts DOIs from a string with trailing closing parentheses' do
133
- expect(described_class.extract('10.1130/2013.2502(04))')).to contain_exactly('10.1130/2013.2502(04)')
134
- end
145
+ it 'discards trailing punctuation after balanced parentheses' do
146
+ str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).'
135
147
 
136
- it 'extracts DOIs from a string with multiple trailing closing parentheses' do
137
- expect(described_class.extract('10.1130/2013.2502(04))))')).to contain_exactly('10.1130/2013.2502(04)')
138
- end
148
+ expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
149
+ end
139
150
 
140
- it 'extracts DOIs with parentheses within the suffix' do
141
- expect(described_class.extract('10.1016/0005-2744(70)90072-0')).to contain_exactly('10.1016/0005-2744(70)90072-0')
151
+ it 'discards contiguous trailing punctuation after balanced parentheses' do
152
+ str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).",'
153
+
154
+ expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
155
+ end
156
+
157
+ it 'does not overflow when given lots of trailing punctuation' do
158
+ str = '10.1130/2013.2502' + ('.' * 10000)
159
+
160
+ expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
161
+ end
142
162
  end
143
163
 
144
- it 'extracts all DOIs from a Crossref sample' do
145
- Pathname.new(__FILE__).join('..', '..', 'fixtures', 'dois.txt').each_line do |doi|
146
- doi.chomp!
164
+ context 'with strict mode on' do
165
+ it 'extracts DOIs ending with trailing periods' do
166
+ str = 'This is an example of a DOI: 10.1130/2013.2502...",'
167
+
168
+ expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502...')
169
+ end
170
+
171
+ it 'keeps trailing punctuation after balanced parentheses' do
172
+ str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).'
173
+
174
+ expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502(04).')
175
+ end
176
+
177
+ it 'discards contiguous trailing punctuation after balanced parentheses' do
178
+ str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).",'
179
+
180
+ expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502(04).')
181
+ end
182
+
183
+ it 'limits the trailing periods to 3' do
184
+ str = 'This is an example of a DOI: 10.1130/2013.2502.......'
185
+
186
+ expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502...')
187
+ end
147
188
 
148
- expect(described_class.extract(doi)).to contain_exactly(doi)
189
+ it 'extracts all DOIs from a Crossref sample, keeping the trailing periods' do
190
+ each_doi('strict_mode_dois.txt') { |doi|
191
+ expect(described_class.extract(doi, strict: true)).to contain_exactly(doi)
192
+ }
149
193
  end
150
194
  end
151
195
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: identifiers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.1
4
+ version: 0.13.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Hernandez
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-04-09 00:00:00.000000000 Z
12
+ date: 2019-09-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: urn
@@ -25,20 +25,6 @@ dependencies:
25
25
  - - "~>"
26
26
  - !ruby/object:Gem::Version
27
27
  version: '2.0'
28
- - !ruby/object:Gem::Dependency
29
- name: bundler
30
- requirement: !ruby/object:Gem::Requirement
31
- requirements:
32
- - - "~>"
33
- - !ruby/object:Gem::Version
34
- version: '1.10'
35
- type: :development
36
- prerelease: false
37
- version_requirements: !ruby/object:Gem::Requirement
38
- requirements:
39
- - - "~>"
40
- - !ruby/object:Gem::Version
41
- version: '1.10'
42
28
  - !ruby/object:Gem::Dependency
43
29
  name: rake
44
30
  requirement: !ruby/object:Gem::Requirement
@@ -118,8 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
118
104
  - !ruby/object:Gem::Version
119
105
  version: '0'
120
106
  requirements: []
121
- rubyforge_project:
122
- rubygems_version: 2.7.3
107
+ rubygems_version: 3.0.3
123
108
  signing_key:
124
109
  specification_version: 4
125
110
  summary: Utilities library for various scholarly identifiers used by Altmetric