identifiers 0.12.1 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +18 -2
- data/lib/identifiers/doi.rb +8 -2
- data/spec/identifiers/doi_spec.rb +141 -97
- metadata +3 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a4546231ed2288fa0d807ac0fc71f268f92de75dbc38fb057410441b3b77ca4c
|
4
|
+
data.tar.gz: b55eaf0b183185203c0957faf86335ac3b1996ed7312692d92daf013162798ff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 81cfdbbd15e12d7394deeaf45463837ecb01a995fe9bc713736578153f1725d0a6cb71913453d918ad2a1d2cd5970caa6d18656df70f3b175628c4f753f07c37
|
7
|
+
data.tar.gz: 5472c1c3f7b4c04d6a51f4d6fddfd476d3c8823d6c98a74de295cc730fe72bfb637b88e9c9f606d0144418a3d2b7828faf5ca145c1b53b19d8691c937fcc600f
|
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
All notable changes to this project will be documented in this file. This
|
3
3
|
project adheres to [Semantic Versioning](http://semver.org/).
|
4
4
|
|
5
|
+
## [0.13.0] - 2019-09-04
|
6
|
+
### Added
|
7
|
+
- Added a new mode to DOI extraction, so that it doesn't strip trailing
|
8
|
+
periods when in `strict` mode
|
9
|
+
|
5
10
|
## [0.12.1] - 2018-04-09
|
6
11
|
### Fixed
|
7
12
|
- Restored support for extracting hyphenated ISBN-10s with registration group
|
@@ -88,3 +93,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
|
|
88
93
|
[0.11.0]: https://github.com/altmetric/identifiers/releases/tag/v0.11.0
|
89
94
|
[0.12.0]: https://github.com/altmetric/identifiers/releases/tag/v0.12.0
|
90
95
|
[0.12.1]: https://github.com/altmetric/identifiers/releases/tag/v0.12.1
|
96
|
+
[0.13.0]: https://github.com/altmetric/identifiers/releases/tag/v0.13.0
|
data/README.md
CHANGED
@@ -32,18 +32,34 @@ Or install it yourself as:
|
|
32
32
|
## Usage
|
33
33
|
|
34
34
|
```ruby
|
35
|
-
Identifiers::DOI.extract('example: 10.
|
36
|
-
# => ["10.
|
35
|
+
Identifiers::DOI.extract('example: 10.1234/5678.ABC')
|
36
|
+
# => ["10.1234/5678.abc"]
|
37
37
|
|
38
38
|
Identifiers::DOI.extract('no DOIs here')
|
39
39
|
# => []
|
40
40
|
|
41
41
|
Identifiers::URN.new('urn:abc:123')
|
42
42
|
# => #<URN:0x007ff11c13d930 @urn="urn:abc:123", @nid="abc", @nss="123">
|
43
|
+
|
43
44
|
Identifiers::URN('urn:abc:123')
|
44
45
|
# => #<URN:0x007ff11c0ff568 @urn="urn:abc:123", @nid="abc", @nss="123">
|
45
46
|
```
|
46
47
|
|
48
|
+
A small percentage of DOIs end in a trailing `.`. However, keeping trailing periods
|
49
|
+
in the results of the default extraction method would likely produce quite a few
|
50
|
+
false positives.
|
51
|
+
`DOI.extract` accepts a `strict` option, which can be set to `true` if we prefer to
|
52
|
+
return DOIs ending in `.`. By default, this option is set to `false`, which strips
|
53
|
+
any trailing `.`:
|
54
|
+
|
55
|
+
```ruby
|
56
|
+
Identifiers::DOI.extract('example: 10.1234/5678.abc.', strict: true)
|
57
|
+
# => ["10.1234/5678.abc."]
|
58
|
+
|
59
|
+
Identifiers::DOI.extract('example: 10.1234/5678.abc.')
|
60
|
+
# => ["10.1234/5678.abc"]
|
61
|
+
```
|
62
|
+
|
47
63
|
## By identifier
|
48
64
|
|
49
65
|
`.extract` is a common method that works across all the supported identifiers.
|
data/lib/identifiers/doi.rb
CHANGED
@@ -24,11 +24,17 @@ module Identifiers
|
|
24
24
|
|
|
25
25
|
[^[:space:]]+(?![[:space:]])\p{^P} # Suffix ending in non-punctuation
|
26
26
|
)
|
27
|
+
\.{0,3} # Allow a DOI to end with up to 3 .
|
27
28
|
)
|
28
29
|
}x
|
29
30
|
|
30
|
-
def self.extract(str)
|
31
|
-
|
31
|
+
def self.extract(str, options = {})
|
32
|
+
strict = options.fetch(:strict, false)
|
33
|
+
|
34
|
+
dois = str.to_s.downcase.scan(REGEXP)
|
35
|
+
dois = dois.map { |doi| doi.gsub(/\.+$/, '') } unless strict
|
36
|
+
|
37
|
+
dois
|
32
38
|
end
|
33
39
|
end
|
34
40
|
end
|
@@ -1,151 +1,195 @@
|
|
1
1
|
require 'identifiers/doi'
|
2
2
|
|
3
3
|
RSpec.describe Identifiers::DOI do
|
4
|
-
|
5
|
-
str = 'This is an example of a DOI: 10.1049/el.2013.3006'
|
4
|
+
OPTIONS = [{ strict: false }, { strict: true }].freeze
|
6
5
|
|
7
|
-
|
6
|
+
def each_doi(file)
|
7
|
+
Pathname.new(__FILE__).join('..', '..', 'fixtures', file).each_line do |doi|
|
8
|
+
yield(doi.chomp!)
|
9
|
+
end
|
8
10
|
end
|
9
11
|
|
10
|
-
|
11
|
-
|
12
|
+
OPTIONS.each do |options|
|
13
|
+
context "when extracting with options set to #{options.inspect}" do
|
14
|
+
it 'extracts DOIs from a string' do
|
15
|
+
str = 'This is an example of a DOI: 10.1049/el.2013.3006'
|
12
16
|
|
13
|
-
|
14
|
-
|
17
|
+
expect(described_class.extract(str, options)).to contain_exactly('10.1049/el.2013.3006')
|
18
|
+
end
|
15
19
|
|
16
|
-
|
17
|
-
|
20
|
+
it 'extracts DOIs from anywhere in a string' do
|
21
|
+
str = 'This is an example of a DOI - 10.1049/el.2013.3006 - which is entirely valid'
|
18
22
|
|
19
|
-
|
20
|
-
|
23
|
+
expect(described_class.extract(str, options)).to contain_exactly('10.1049/el.2013.3006')
|
24
|
+
end
|
21
25
|
|
22
|
-
|
23
|
-
|
26
|
+
it 'downcases the DOIs extracted' do
|
27
|
+
str = 'This is an example of a DOI: 10.1097/01.ASW.0000443266.17665.19'
|
24
28
|
|
25
|
-
|
26
|
-
|
29
|
+
expect(described_class.extract(str, options)).to contain_exactly('10.1097/01.asw.0000443266.17665.19')
|
30
|
+
end
|
27
31
|
|
28
|
-
|
29
|
-
|
30
|
-
end
|
32
|
+
it 'does not extract a PubMed ID' do
|
33
|
+
str = 'This is NOT a DOI: 123456'
|
31
34
|
|
32
|
-
|
33
|
-
|
35
|
+
expect(described_class.extract(str, options)).to be_empty
|
36
|
+
end
|
34
37
|
|
35
|
-
|
36
|
-
|
38
|
+
it 'returns no DOIs if given nothing' do
|
39
|
+
expect(described_class.extract(nil)).to be_empty
|
40
|
+
end
|
37
41
|
|
38
|
-
|
39
|
-
|
42
|
+
it 'extracts ISBN-As' do
|
43
|
+
str = 'This is an ISBN-A: 10.978.8898392/315'
|
40
44
|
|
41
|
-
|
42
|
-
|
45
|
+
expect(described_class.extract(str, options)).to contain_exactly('10.978.8898392/315')
|
46
|
+
end
|
43
47
|
|
44
|
-
|
45
|
-
|
48
|
+
it 'does not extract invalid ISBN-As' do
|
49
|
+
str = 'This is not an ISBN-A: 10.978.8898392/NotARealIsbnA'
|
46
50
|
|
47
|
-
|
48
|
-
|
51
|
+
expect(described_class.extract(str, options)).to be_empty
|
52
|
+
end
|
49
53
|
|
50
|
-
|
51
|
-
|
54
|
+
it 'retains closing parentheses that are part of the DOI' do
|
55
|
+
str = 'This is an example of a DOI: 10.1130/2013.2502(04)'
|
52
56
|
|
53
|
-
|
54
|
-
|
57
|
+
expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502(04)')
|
58
|
+
end
|
55
59
|
|
56
|
-
|
57
|
-
|
60
|
+
it 'discards ellipses' do
|
61
|
+
str = 'This is an example of a DOI: 10.1130/2013.2502…'
|
58
62
|
|
59
|
-
|
60
|
-
|
63
|
+
expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502')
|
64
|
+
end
|
61
65
|
|
62
|
-
|
63
|
-
|
66
|
+
it 'extracts old Wiley DOIs' do
|
67
|
+
str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-# 10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5'
|
64
68
|
|
65
|
-
|
66
|
-
|
69
|
+
expect(described_class.extract(str, options)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#', '10.1002/(sici)1099-0690(199806)1998:6<1071::aid-ejoc1071>3.0.co;2-5')
|
70
|
+
end
|
67
71
|
|
68
|
-
|
69
|
-
|
72
|
+
it 'does not extract a closing parenthesis if not part of the DOI' do
|
73
|
+
str = '(This is an example of a DOI: 10.1130/2013.2502)'
|
70
74
|
|
71
|
-
|
72
|
-
|
75
|
+
expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502')
|
76
|
+
end
|
73
77
|
|
74
|
-
|
75
|
-
|
78
|
+
it 'discards trailing punctuation from old Wiley DOIs' do
|
79
|
+
str = 'This is an example of an old Wiley DOI: 10.1002/(SICI)1096-8644(199601)99:1<135::AID-AJPA8>3.0.CO;2-#",'
|
76
80
|
|
77
|
-
|
78
|
-
|
81
|
+
expect(described_class.extract(str, options)).to contain_exactly('10.1002/(sici)1096-8644(199601)99:1<135::aid-ajpa8>3.0.co;2-#')
|
82
|
+
end
|
79
83
|
|
80
|
-
|
81
|
-
|
84
|
+
it 'discards trailing Unicode punctuation after balanced parentheses' do
|
85
|
+
str = 'This is an example of a DOI: 10.1130/2013.2502(04)…",'
|
82
86
|
|
83
|
-
|
84
|
-
|
87
|
+
expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502(04)')
|
88
|
+
end
|
85
89
|
|
86
|
-
|
87
|
-
|
90
|
+
it 'discards contiguous trailing punctuation after unbalanced parentheses' do
|
91
|
+
str = '(This is an example of a DOI: 10.1130/2013.2502).",'
|
88
92
|
|
89
|
-
|
90
|
-
|
93
|
+
expect(described_class.extract(str, options)).to contain_exactly('10.1130/2013.2502')
|
94
|
+
end
|
91
95
|
|
92
|
-
|
93
|
-
|
96
|
+
it 'does not extract DOIs with purely punctuation suffixes' do
|
97
|
+
expect(described_class.extract('10.1130/!).",', options)).to be_empty
|
98
|
+
end
|
94
99
|
|
95
|
-
|
96
|
-
|
100
|
+
it 'extracts DOIs with emoji in them' do
|
101
|
+
expect(described_class.extract('10.1234/🐔💩123🐔🐔🐔123', options)).to contain_exactly('10.1234/🐔💩123🐔🐔🐔123')
|
102
|
+
end
|
97
103
|
|
98
|
-
|
99
|
-
|
104
|
+
it 'extracts DOIs separated by Unicode whitespace' do
|
105
|
+
expect(described_class.extract('10.1234/foo 10.1234/bar', options)).to contain_exactly('10.1234/foo', '10.1234/bar')
|
106
|
+
end
|
100
107
|
|
101
|
-
|
102
|
-
|
108
|
+
it 'does not extract DOIs with extra digits prefixed' do
|
109
|
+
expect(described_class.extract('110.1234/foo', options)).to be_empty
|
110
|
+
end
|
103
111
|
|
104
|
-
|
105
|
-
|
112
|
+
it 'extracts DOIs from a string with trailing closing parentheses' do
|
113
|
+
expect(described_class.extract('(10.1130/2013.2502(04))', options)).to contain_exactly('10.1130/2013.2502(04)')
|
114
|
+
end
|
106
115
|
|
107
|
-
|
108
|
-
|
116
|
+
it 'extracts DOIs from a string with multiple trailing closing parentheses' do
|
117
|
+
expect(described_class.extract('10.1130/2013.2502(04))))', options)).to contain_exactly('10.1130/2013.2502(04)')
|
118
|
+
end
|
109
119
|
|
110
|
-
|
111
|
-
|
120
|
+
it 'extracts DOIs with parentheses within the suffix' do
|
121
|
+
expect(described_class.extract('10.1016/0005-2744(70)90072-0', options)).to contain_exactly('10.1016/0005-2744(70)90072-0')
|
122
|
+
end
|
112
123
|
|
113
|
-
|
124
|
+
it 'extracts all DOIs from a Crossref sample' do
|
125
|
+
each_doi('dois.txt') { |doi|
|
126
|
+
expect(described_class.extract(doi, options)).to contain_exactly(doi)
|
127
|
+
}
|
128
|
+
end
|
129
|
+
end
|
114
130
|
end
|
115
131
|
|
116
|
-
|
117
|
-
|
118
|
-
|
132
|
+
context 'when no options are provided' do
|
133
|
+
it 'discards trailing punctuation' do
|
134
|
+
str = 'This is an example of a DOI: 10.1130/2013.2502.'
|
119
135
|
|
120
|
-
|
121
|
-
|
122
|
-
end
|
136
|
+
expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
|
137
|
+
end
|
123
138
|
|
124
|
-
|
125
|
-
|
126
|
-
end
|
139
|
+
it 'discards multiple contiguous trailing punctuation' do
|
140
|
+
str = 'This is an example of a DOI: 10.1130/2013.2502...",'
|
127
141
|
|
128
|
-
|
129
|
-
|
130
|
-
end
|
142
|
+
expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
|
143
|
+
end
|
131
144
|
|
132
|
-
|
133
|
-
|
134
|
-
end
|
145
|
+
it 'discards trailing punctuation after balanced parentheses' do
|
146
|
+
str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).'
|
135
147
|
|
136
|
-
|
137
|
-
|
138
|
-
end
|
148
|
+
expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
|
149
|
+
end
|
139
150
|
|
140
|
-
|
141
|
-
|
151
|
+
it 'discards contiguous trailing punctuation after balanced parentheses' do
|
152
|
+
str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).",'
|
153
|
+
|
154
|
+
expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502(04)')
|
155
|
+
end
|
156
|
+
|
157
|
+
it 'does not overflow when given lots of trailing punctuation' do
|
158
|
+
str = '10.1130/2013.2502' + ('.' * 10000)
|
159
|
+
|
160
|
+
expect(described_class.extract(str)).to contain_exactly('10.1130/2013.2502')
|
161
|
+
end
|
142
162
|
end
|
143
163
|
|
144
|
-
|
145
|
-
|
146
|
-
|
164
|
+
context 'with strict mode on' do
|
165
|
+
it 'extracts DOIs ending with trailing periods' do
|
166
|
+
str = 'This is an example of a DOI: 10.1130/2013.2502...",'
|
167
|
+
|
168
|
+
expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502...')
|
169
|
+
end
|
170
|
+
|
171
|
+
it 'keeps trailing punctuation after balanced parentheses' do
|
172
|
+
str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).'
|
173
|
+
|
174
|
+
expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502(04).')
|
175
|
+
end
|
176
|
+
|
177
|
+
it 'discards contiguous trailing punctuation after balanced parentheses' do
|
178
|
+
str = 'This is an example of a DOI: This is an example of a DOI: 10.1130/2013.2502(04).",'
|
179
|
+
|
180
|
+
expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502(04).')
|
181
|
+
end
|
182
|
+
|
183
|
+
it 'limits the trailing periods to 3' do
|
184
|
+
str = 'This is an example of a DOI: 10.1130/2013.2502.......'
|
185
|
+
|
186
|
+
expect(described_class.extract(str, strict: true)).to contain_exactly('10.1130/2013.2502...')
|
187
|
+
end
|
147
188
|
|
148
|
-
|
189
|
+
it 'extracts all DOIs from a Crossref sample, keeping the trailing periods' do
|
190
|
+
each_doi('strict_mode_dois.txt') { |doi|
|
191
|
+
expect(described_class.extract(doi, strict: true)).to contain_exactly(doi)
|
192
|
+
}
|
149
193
|
end
|
150
194
|
end
|
151
195
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: identifiers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.13.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Hernandez
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2019-09-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: urn
|
@@ -25,20 +25,6 @@ dependencies:
|
|
25
25
|
- - "~>"
|
26
26
|
- !ruby/object:Gem::Version
|
27
27
|
version: '2.0'
|
28
|
-
- !ruby/object:Gem::Dependency
|
29
|
-
name: bundler
|
30
|
-
requirement: !ruby/object:Gem::Requirement
|
31
|
-
requirements:
|
32
|
-
- - "~>"
|
33
|
-
- !ruby/object:Gem::Version
|
34
|
-
version: '1.10'
|
35
|
-
type: :development
|
36
|
-
prerelease: false
|
37
|
-
version_requirements: !ruby/object:Gem::Requirement
|
38
|
-
requirements:
|
39
|
-
- - "~>"
|
40
|
-
- !ruby/object:Gem::Version
|
41
|
-
version: '1.10'
|
42
28
|
- !ruby/object:Gem::Dependency
|
43
29
|
name: rake
|
44
30
|
requirement: !ruby/object:Gem::Requirement
|
@@ -118,8 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
104
|
- !ruby/object:Gem::Version
|
119
105
|
version: '0'
|
120
106
|
requirements: []
|
121
|
-
|
122
|
-
rubygems_version: 2.7.3
|
107
|
+
rubygems_version: 3.0.3
|
123
108
|
signing_key:
|
124
109
|
specification_version: 4
|
125
110
|
summary: Utilities library for various scholarly identifiers used by Altmetric
|