identifiers 0.12.1 → 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 423438daad2350706eced26425401a5d97ae715095b54853fe55ba4f8dbbaf2d
4
- data.tar.gz: 2776accbdccfc17965dd69507a0b326b2a57e275811828b05e76b47c0f54bb28
3
+ metadata.gz: a4546231ed2288fa0d807ac0fc71f268f92de75dbc38fb057410441b3b77ca4c
4
+ data.tar.gz: b55eaf0b183185203c0957faf86335ac3b1996ed7312692d92daf013162798ff
5
5
  SHA512:
6
- metadata.gz: e319511db960df762b3a646239edf45ff683bef6227736c673072de6ef5d1649dd300232f93c0af0be1d7744308de4f07ebfd53582add6069c6b3f425249722b
7
- data.tar.gz: 1ebd31499facbc5a51ec0dcf30fadb920e86ad6e53c20f4bb5dd66fa11c9f9cc7842a8b703348f13aac2675c0415c2c0f9464c42baafd055669c881342c33217
6
+ metadata.gz: 81cfdbbd15e12d7394deeaf45463837ecb01a995fe9bc713736578153f1725d0a6cb71913453d918ad2a1d2cd5970caa6d18656df70f3b175628c4f753f07c37
7
+ data.tar.gz: 5472c1c3f7b4c04d6a51f4d6fddfd476d3c8823d6c98a74de295cc730fe72bfb637b88e9c9f606d0144418a3d2b7828faf5ca145c1b53b19d8691c937fcc600f
@@ -2,6 +2,11 @@
2
2
  All notable changes to this project will be documented in this file. This
3
3
  project adheres to [Semantic Versioning](http://semver.org/).
4
4
 
5
+ ## [0.13.0] - 2019-09-04
6
+ ### Added
7
+ - Added a `strict` mode to the DOI extraction: when enabled, up to three
8
+ trailing periods are kept instead of being stripped
9
+
5
10
  ## [0.12.1] - 2018-04-09
6
11
  ### Fixed
7
12
  - Restored support for extracting hyphenated ISBN-10s with registration group
@@ -88,3 +93,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
88
93
  [0.11.0]: https://github.com/altmetric/identifiers/releases/tag/v0.11.0
89
94
  [0.12.0]: https://github.com/altmetric/identifiers/releases/tag/v0.12.0
90
95
  [0.12.1]: https://github.com/altmetric/identifiers/releases/tag/v0.12.1
96
+ [0.13.0]: https://github.com/altmetric/identifiers/releases/tag/v0.13.0
data/README.md CHANGED
@@ -32,18 +32,34 @@ Or install it yourself as:
32
32
  ## Usage
33
33
 
34
34
  ```ruby
35
- Identifiers::DOI.extract('example: 10.123/abcd.efghi')
36
- # => ["10.123/abcd.efghi"]
35
+ Identifiers::DOI.extract('example: 10.1234/5678.ABC')
36
+ # => ["10.1234/5678.abc"]
37
37
 
38
38
  Identifiers::DOI.extract('no DOIs here')
39
39
  # => []
40
40
 
41
41
  Identifiers::URN.new('urn:abc:123')
42
42
  # => #<URN:0x007ff11c13d930 @urn="urn:abc:123", @nid="abc", @nss="123">
43
+
43
44
  Identifiers::URN('urn:abc:123')
44
45
  # => #<URN:0x007ff11c0ff568 @urn="urn:abc:123", @nid="abc", @nss="123">
45
46
  ```
46
47
 
48
+ A small percentage of DOIs end in a trailing `.`. However, returning trailing
49
+ periods from the default extraction method would likely produce quite a few
50
+ false positives.
51
+ `DOI.extract` therefore accepts a `strict` option, which can be set to `true` to
52
+ keep up to three trailing `.`. By default, this option is set to `false`, which
53
+ strips any trailing `.`:
54
+
55
+ ```ruby
56
+ Identifiers::DOI.extract('example: 10.1234/5678.abc.', strict: true)
57
+ # => ["10.1234/5678.abc."]
58
+
59
+ Identifiers::DOI.extract('example: 10.1234/5678.abc.')
60
+ # => ["10.1234/5678.abc"]
61
+ ```
62
+
47
63
  ## By identifier
48
64
 
49
65
  `.extract` is a common method that works across all the supported identifiers.
@@ -24,11 +24,17 @@ module Identifiers
24
24
  |
25
25
  [^[:space:]]+(?![[:space:]])\p{^P} # Suffix ending in non-punctuation
26
26
  )
27
+ \.{0,3} # Allow a DOI to end with up to 3 trailing periods
27
28
  )
28
29
  }x
29
30
 
30
- def self.extract(str)
31
- str.to_s.downcase.scan(REGEXP)
31
+ def self.extract(str, options = {})
32
+ strict = options.fetch(:strict, false)
33
+
34
+ dois = str.to_s.downcase.scan(REGEXP)
35
+ dois = dois.map { |doi| doi.gsub(/\.+$/, '') } unless strict
36
+
37
+ dois
32
38
  end
33
39
  end
34
40
  end
@@ -1,151 +1,195 @@
1
1
  require 'identifiers/doi'
2
2
 
3
3
  RSpec.describe Identifiers::DOI do
4
- it 'extracts DOIs from a string' do
5
- str = 'This is an example of a DOI: 10.1049/el.2013.3006'
4
+ OPTIONS = [{ strict: false }, { strict: true }].freeze
6
5
 
7
- expect(described_class.extract(str)).to contain_exactly('10.1049/el.2013.3006')
6
+ def each_doi(file)
7
+ Pathname.new(__FILE__).join('..', '..', 'fixtures', file).each_line do |doi|
8
+ yield(doi.chomp!)
9
+ end
8
10
  end
9
11
 
10
- it 'extracts DOIs from anywhere in a string' do
11
- str = 'This is an example of a DOI - 10.1049/el.2013.3006 - which is entirely valid'
12
+ OPTIONS.each do |options|
13
+ context "when extracting with options set to #{options.inspect}" do
14
+ it 'extracts DOIs from a string' do
15
+ str = 'This is an example of a DOI: 10.1049/el.2013.3006'
12
16
 
13
- expect(described_class.extract(str)).to contain_exactly('10.1049/el.2013.3006')
14
- end
17
+ expect(described_class.extract(str, options)).to contain_exactly('10.1049/el.2013.3006')
18
+ end
15
19
 
16
- it 'downcases the DOIs extracted' do
17
- str = 'This is an example of a DOI: 10.1097/01.ASW.0000443266.17665.19'
20
+ it 'extracts DOIs from anywhere in a string' do
21
+ str = 'This is an example of a DOI - 10.1049/el.2013.3006 - which is entirely valid'
18
22
 
19
- expect(described_class.extract(str)).to contain_exactly('10.1097/01.asw.0000443266.17665.19')
20
- end
23
+ expect(described_class.extract(str, options)).to contain_exactly('10.1049/el.2013.3006')
24
+ end
21
25
 
22
- it 'does not extract a PubMed ID' do
23
- str = 'This is NOT a DOI: 123456'
26
+ it 'downcases the DOIs extracted' do
27
+ str = 'This is an example of a DOI: 10.1097/01.ASW.0000443266.17665.19'
24
28
 
25
- expect(described_class.extract(str)).to be_empty
26
- end
29
+ expect(described_class.extract(str, options)).to contain_exactly('10.1097/01.asw.0000443266.17665.19')
30
+ end
27
31
 
28
- it 'returns no DOIs if given nothing' do
29
- expect(described_class.extract(nil)).to be_empty
30
- end
32
+ it 'does not extract a PubMed ID' do
33
+ str = 'This is NOT a DOI: 123456'
31
34
 
32
- it 'extracts ISBN-As' do
33
- str = 'This is an ISBN-A: 10.978.8898392/315'
35
+ expect(described_class.extract(str, options)).to be_empty
36
+ end
34
37
 
35
- expect(described_class.extract(str)).to contain_exactly('10.978.8898392/315')
36
- end
38
+ it 'returns no DOIs if given nothing' do
39
+ expect(described_class.extract(nil)).to be_empty
40
+ end
37
41
 
38
- it 'does not extract invalid ISBN-As' do
39
- str = 'This is not an ISBN-A: 10.978.8898392/NotARealIsbnA'
42
+ it 'extracts ISBN-As' do
43
+ str = 'This is an ISBN-A: 10.978.8898392/315'
40
44
 
41
- expect(described_class.extract(str)).to be_empty
42
- end
45
+ expect(described_class.extract(str, options)).to contain_exactly('10.978.8898392/315')
46
+ end
43
47
 
44
- it 'retains closing parentheses that are part of the DOI' do
45
- str = 'This is an example of a DOI: 10.1130/2013.2502(04)'
48
+ it 'does not extract invalid ISBN-As' do
49
+ str = 'This is not an ISBN-A: 10.978.8898392/NotARealIsbnA'
46
50
 
47
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
48
- end
51
+ expect(described_class.extract(str, options)).to be_empty
52
+ end
49
53
 
50
- it 'discards trailing punctuation' do
51
- str = 'This is an example of a DOI: 10.1130/2013.2502.'
54
+ it 'retains closing parentheses that are part of the DOI' do
55
+ str = 'This is an example of a DOI: 10.1130/2013.2502(04)'
52
56
 
53
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
54
- end
57
+ expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502(04)')
58
+ end
55
59
 
56
- it 'discards multiple contiguous trailing punctuation' do
57
- str = 'This is an example of a DOI: 10.1130/2013.2502...",'
60
+ it 'discards ellipses' do
61
+ str = 'This is an example of a DOI: 10.1130/2013.2502'
58
62
 
59
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
60
- end
63
+ expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502')
64
+ end
61
65
 
62
- it 'discards trailing Unicode punctuation' do
63
- str = 'This is an example of a DOI: 10.1130/2013.2502…'
66
+ it 'extracts old Wiley DOIs' do
67
+ str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-# 10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5'
64
68
 
65
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
66
- end
69
+ expect(described_class.extract(str, options)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#', '10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5')
70
+ end
67
71
 
68
- it 'extracts old Wiley DOIs' do
69
- str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-# 10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5'
72
+ it 'does not extract a closing parenthesis if not part of the DOI' do
73
+ str = '(This is an example of a DOI: 10.1130/2013.2502)'
70
74
 
71
- expect(described_class.extract(str)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#', '10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5')
72
- end
75
+ expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502')
76
+ end
73
77
 
74
- it 'does not extract a closing parenthesis if not part of the DOI' do
75
- str = '(This is an example of a DOI: 10.1130/2013.2502)'
78
+ it 'discards trailing punctuation from old Wiley DOIs' do
79
+ str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-#",'
76
80
 
77
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
78
- end
81
+ expect(described_class.extract(str, options)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#')
82
+ end
79
83
 
80
- it 'discards trailing punctuation from old Wiley DOIs' do
81
- str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-#",'
84
+ it 'discards trailing Unicode punctuation after balanced parentheses' do
85
+ str = 'This is an example of a DOI: 10.1130/2013.2502(04)",'
82
86
 
83
- expect(described_class.extract(str)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#')
84
- end
87
+ expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502(04)')
88
+ end
85
89
 
86
- it 'discards trailing punctuation after balanced parentheses' do
87
- str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).'
90
+ it 'discards contiguous trailing punctuation after unbalanced parentheses' do
91
+ str = '(This is an example of a DOI: 10.1130/2013.2502).",'
88
92
 
89
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
90
- end
93
+ expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502')
94
+ end
91
95
 
92
- it 'discards contiguous trailing punctuation after balanced parentheses' do
93
- str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).",'
96
+ it 'does not extract DOIs with purely punctuation suffixes' do
97
+ expect(described_class.extract('10.1130/!).",', options)).to be_empty
98
+ end
94
99
 
95
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
96
- end
100
+ it 'extracts DOIs with emoji in them' do
101
+ expect(described_class.extract('10.1234/🐔💩123🐔🐔🐔123', options)).to contain_exactly('10.1234/🐔💩123🐔🐔🐔123')
102
+ end
97
103
 
98
- it 'discards trailing Unicode punctuation after balanced parentheses' do
99
- str = 'This is an example of a DOI: 10.1130/2013.2502(04)…",'
104
+ it 'extracts DOIs separated by Unicode whitespace' do
105
+ expect(described_class.extract('10.1234/foo  10.1234/bar', options)).to contain_exactly('10.1234/foo', '10.1234/bar')
106
+ end
100
107
 
101
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
102
- end
108
+ it 'does not extract DOIs with extra digits prefixed' do
109
+ expect(described_class.extract('110.1234/foo', options)).to be_empty
110
+ end
103
111
 
104
- it 'discards contiguous trailing punctuation after unbalanced parentheses' do
105
- str = '(This is an example of a DOI: 10.1130/2013.2502).",'
112
+ it 'extracts DOIs from a string with trailing closing parentheses' do
113
+ expect(described_class.extract('(10.1130/2013.2502(04))', options)).to contain_exactly('10.1130/2013.2502(04)')
114
+ end
106
115
 
107
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
108
- end
116
+ it 'extracts DOIs from a string with multiple trailing closing parentheses' do
117
+ expect(described_class.extract('10.1130/2013.2502(04))))', options)).to contain_exactly('10.1130/2013.2502(04)')
118
+ end
109
119
 
110
- it 'does not overflow when given lots of trailing punctuation' do
111
- str = '10.1130/2013.2502' + ('.' * 10000)
120
+ it 'extracts DOIs with parentheses within the suffix' do
121
+ expect(described_class.extract('10.1016/0005-2744(70)90072-0', options)).to contain_exactly('10.1016/0005-2744(70)90072-0')
122
+ end
112
123
 
113
- expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
124
+ it 'extracts all DOIs from a Crossref sample' do
125
+ each_doi('dois.txt') { |doi|
126
+ expect(described_class.extract(doi, options)).to contain_exactly(doi)
127
+ }
128
+ end
129
+ end
114
130
  end
115
131
 
116
- it 'does not extract DOIs with purely punctuation suffixes' do
117
- expect(described_class.extract('10.1130/!).",')).to be_empty
118
- end
132
+ context 'when no options are provided' do
133
+ it 'discards trailing punctuation' do
134
+ str = 'This is an example of a DOI: 10.1130/2013.2502.'
119
135
 
120
- it 'extracts DOIs with emoji in them' do
121
- expect(described_class.extract('10.1234/🐔💩123🐔🐔🐔123')).to contain_exactly('10.1234/🐔💩123🐔🐔🐔123')
122
- end
136
+ expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
137
+ end
123
138
 
124
- it 'extracts DOIs separated by Unicode whitespace' do
125
- expect(described_class.extract('10.1234/foo  10.1234/bar')).to contain_exactly('10.1234/foo', '10.1234/bar')
126
- end
139
+ it 'discards multiple contiguous trailing punctuation' do
140
+ str = 'This is an example of a DOI: 10.1130/2013.2502...",'
127
141
 
128
- it 'does not extract DOIs with extra digits prefixed' do
129
- expect(described_class.extract('110.1234/foo')).to be_empty
130
- end
142
+ expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
143
+ end
131
144
 
132
- it 'extracts DOIs from a string with trailing closing parentheses' do
133
- expect(described_class.extract('10.1130/2013.2502(04))')).to contain_exactly('10.1130/2013.2502(04)')
134
- end
145
+ it 'discards trailing punctuation after balanced parentheses' do
146
+ str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).'
135
147
 
136
- it 'extracts DOIs from a string with multiple trailing closing parentheses' do
137
- expect(described_class.extract('10.1130/2013.2502(04))))')).to contain_exactly('10.1130/2013.2502(04)')
138
- end
148
+ expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
149
+ end
139
150
 
140
- it 'extracts DOIs with parentheses within the suffix' do
141
- expect(described_class.extract('10.1016/0005-2744(70)90072-0')).to contain_exactly('10.1016/0005-2744(70)90072-0')
151
+ it 'discards contiguous trailing punctuation after balanced parentheses' do
152
+ str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).",'
153
+
154
+ expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
155
+ end
156
+
157
+ it 'does not overflow when given lots of trailing punctuation' do
158
+ str = '10.1130/2013.2502' + ('.' * 10000)
159
+
160
+ expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
161
+ end
142
162
  end
143
163
 
144
- it 'extracts all DOIs from a Crossref sample' do
145
- Pathname.new(__FILE__).join('..', '..', 'fixtures', 'dois.txt').each_line do |doi|
146
- doi.chomp!
164
+ context 'with strict mode on' do
165
+ it 'extracts DOIs ending with trailing periods' do
166
+ str = 'This is an example of a DOI: 10.1130/2013.2502...",'
167
+
168
+ expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502...')
169
+ end
170
+
171
+ it 'keeps trailing punctuation after balanced parentheses' do
172
+ str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).'
173
+
174
+ expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502(04).')
175
+ end
176
+
177
+ it 'discards contiguous trailing punctuation after balanced parentheses' do
178
+ str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).",'
179
+
180
+ expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502(04).')
181
+ end
182
+
183
+ it 'limits the trailing periods to 3' do
184
+ str = 'This is an example of a DOI: 10.1130/2013.2502.......'
185
+
186
+ expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502...')
187
+ end
147
188
 
148
- expect(described_class.extract(doi)).to contain_exactly(doi)
189
+ it 'extracts all DOIs from a Crossref sample, keeping the trailing periods' do
190
+ each_doi('strict_mode_dois.txt') { |doi|
191
+ expect(described_class.extract(doi, strict: true)).to contain_exactly(doi)
192
+ }
149
193
  end
150
194
  end
151
195
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: identifiers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.1
4
+ version: 0.13.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Hernandez
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2018-04-09 00:00:00.000000000 Z
12
+ date: 2019-09-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: urn
@@ -25,20 +25,6 @@ dependencies:
25
25
  - - "~>"
26
26
  - !ruby/object:Gem::Version
27
27
  version: '2.0'
28
- - !ruby/object:Gem::Dependency
29
- name: bundler
30
- requirement: !ruby/object:Gem::Requirement
31
- requirements:
32
- - - "~>"
33
- - !ruby/object:Gem::Version
34
- version: '1.10'
35
- type: :development
36
- prerelease: false
37
- version_requirements: !ruby/object:Gem::Requirement
38
- requirements:
39
- - - "~>"
40
- - !ruby/object:Gem::Version
41
- version: '1.10'
42
28
  - !ruby/object:Gem::Dependency
43
29
  name: rake
44
30
  requirement: !ruby/object:Gem::Requirement
@@ -118,8 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
118
104
  - !ruby/object:Gem::Version
119
105
  version: '0'
120
106
  requirements: []
121
- rubyforge_project:
122
- rubygems_version: 2.7.3
107
+ rubygems_version: 3.0.3
123
108
  signing_key:
124
109
  specification_version: 4
125
110
  summary: Utilities library for various scholarly identifiers used by Altmetric