identifiers 0.8.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +1 -1
- data/lib/identifiers.rb +1 -4
- data/lib/identifiers/ads_bibcode.rb +1 -1
- data/lib/identifiers/arxiv_id.rb +24 -6
- data/lib/identifiers/doi.rb +3 -8
- data/lib/identifiers/handle.rb +1 -1
- data/lib/identifiers/isbn.rb +49 -14
- data/lib/identifiers/national_clinical_trial_id.rb +1 -1
- data/lib/identifiers/orcid.rb +3 -3
- data/lib/identifiers/pubmed_id.rb +1 -3
- data/lib/identifiers/repec_id.rb +4 -1
- data/spec/identifiers/ads_bibcode_spec.rb +4 -0
- data/spec/identifiers/arxiv_id_spec.rb +4 -0
- data/spec/identifiers/handle_spec.rb +4 -0
- data/spec/identifiers/isbn_spec.rb +18 -0
- data/spec/identifiers/national_clinical_trial_id_spec.rb +4 -0
- data/spec/identifiers/pubmed_id_spec.rb +4 -0
- data/spec/identifiers/repec_id_spec.rb +4 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 17d7278289ac40fc4fa68b488e85c98ada612f6f
|
4
|
+
data.tar.gz: e846cd17ac15a7cb87705c288eb8c41336721183
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5a978d601dee61a25f5f66330e6e9bcd995fe1f217bdf43284350cc678574a1890a25ff289ad50ccebb25545da98acd459cd421e98e32d4e3c5266a41a84eb69
|
7
|
+
data.tar.gz: 30ef27cea3bc785f8f0ed841ac1f8cbfaa1e261b8a00afa9f0d942db0b8bc913f4e39f6ea91068f8eb7054a4cae1d04b17bd47956d3c7759d5ff9d5c2bed01b3
|
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,10 @@
|
|
2
2
|
All notable changes to this project will be documented in this file. This
|
3
3
|
project adheres to [Semantic Versioning](http://semver.org/).
|
4
4
|
|
5
|
+
## [0.9.0] - 2017-07-31
|
6
|
+
### Added
|
7
|
+
- Support extraction of multiple ISBNs separated by a single space
|
8
|
+
|
5
9
|
## [0.8.1] - 2017-04-10
|
6
10
|
### Fixed
|
7
11
|
- Fixed extraction of multiple DOIs separated by Unicode whitespace
|
@@ -52,3 +56,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
|
|
52
56
|
[0.7.0]: https://github.com/altmetric/identifiers/releases/tag/v0.7.0
|
53
57
|
[0.8.0]: https://github.com/altmetric/identifiers/releases/tag/v0.8.0
|
54
58
|
[0.8.1]: https://github.com/altmetric/identifiers/releases/tag/v0.8.1
|
59
|
+
[0.9.0]: https://github.com/altmetric/identifiers/releases/tag/v0.9.0
|
data/README.md
CHANGED
data/lib/identifiers.rb
CHANGED
@@ -4,10 +4,7 @@ require 'identifiers/doi'
|
|
4
4
|
require 'identifiers/handle'
|
5
5
|
require 'identifiers/isbn'
|
6
6
|
require 'identifiers/national_clinical_trial_id'
|
7
|
+
require 'identifiers/orcid'
|
7
8
|
require 'identifiers/pubmed_id'
|
8
9
|
require 'identifiers/repec_id'
|
9
10
|
require 'identifiers/urn'
|
10
|
-
require 'identifiers/orcid'
|
11
|
-
|
12
|
-
module Identifiers
|
13
|
-
end
|
data/lib/identifiers/arxiv_id.rb
CHANGED
@@ -1,19 +1,37 @@
|
|
1
1
|
module Identifiers
|
2
2
|
class ArxivId
|
3
|
+
POST_2007_REGEXP = %r{
|
4
|
+
(?<=^|[[:space:]/]) # Look-behind for the start of the string, whitespace or a forward slash
|
5
|
+
(?:arXiv:)? # Optional arXiv scheme
|
6
|
+
\d{4} # YYMM (two-digit year and two-digit month number)
|
7
|
+
\.
|
8
|
+
\d{4,5} # Zero-padded sequence number of 4- or 5-digits
|
9
|
+
(?:v\d+)? # Literal v followed by version number of 1 or more digits
|
10
|
+
(?=$|[[:space:]]) # Look-ahead for end of string or whitespace
|
11
|
+
}xi
|
12
|
+
PRE_2007_REGEXP = %r{
|
13
|
+
(?<=^|[[:space:]/]) # Look-behind for the start of the string, whitespace or a forward slash
|
14
|
+
(?:arXiv:)? # Optional arXiv scheme
|
15
|
+
[a-z-]+ # Archive (e.g. "math")
|
16
|
+
(?:\.[A-Z]{2})? # Subject class (where applicable)
|
17
|
+
/
|
18
|
+
\d{2} # Year
|
19
|
+
(?:0[1-9]|1[012]) # Month
|
20
|
+
\d{3} # Number
|
21
|
+
(?:v\d+)? # Literal v followed by version number of 1 or more digits
|
22
|
+
(?=$|[[:space:]]) # Look-ahead for end of string or whitespace
|
23
|
+
}xi
|
24
|
+
|
3
25
|
def self.extract(str)
|
4
26
|
extract_pre_2007_arxiv_ids(str) + extract_post_2007_arxiv_ids(str)
|
5
27
|
end
|
6
28
|
|
7
29
|
def self.extract_post_2007_arxiv_ids(str)
|
8
|
-
str
|
9
|
-
.scan(%r{(?<=^|[[:space:]/])(?:arXiv:)?\d{4}\.\d{4,5}(?:v\d+)?(?=$|[[:space:]])}i)
|
10
|
-
.map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
|
30
|
+
str.to_s.scan(POST_2007_REGEXP).map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
|
11
31
|
end
|
12
32
|
|
13
33
|
def self.extract_pre_2007_arxiv_ids(str)
|
14
|
-
str
|
15
|
-
.scan(%r{(?<=^|[[:space:]/])(?:arXiv:)?[a-z-]+(?:\.[A-Z]{2})?/\d{2}(?:0[1-9]|1[012])\d{3}(?:v\d+)?(?=$|[[:space:]])}i)
|
16
|
-
.map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
|
34
|
+
str.to_s.scan(PRE_2007_REGEXP).map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
|
17
35
|
end
|
18
36
|
end
|
19
37
|
end
|
data/lib/identifiers/doi.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module Identifiers
|
2
2
|
class DOI
|
3
|
-
|
3
|
+
REGEXP = %r{
|
4
4
|
\b
|
5
5
|
10 # Directory indicator (always 10)
|
6
6
|
\.
|
@@ -29,16 +29,11 @@ module Identifiers
|
|
29
29
|
/x
|
30
30
|
|
31
31
|
def self.extract(str)
|
32
|
-
str
|
33
|
-
.to_s
|
34
|
-
.downcase
|
35
|
-
.scan(PATTERN)
|
36
|
-
.map { |doi| strip_punctuation(doi) }
|
37
|
-
.compact
|
32
|
+
str.to_s.downcase.scan(REGEXP).map { |doi| strip_punctuation(doi) }.compact
|
38
33
|
end
|
39
34
|
|
40
35
|
def self.extract_one(str)
|
41
|
-
match = str.to_s.downcase[PATTERN]
|
36
|
+
match = str.to_s.downcase[REGEXP]
|
42
37
|
return unless match
|
43
38
|
|
44
39
|
strip_punctuation(match)
|
data/lib/identifiers/handle.rb
CHANGED
data/lib/identifiers/isbn.rb
CHANGED
@@ -1,29 +1,64 @@
|
|
1
1
|
module Identifiers
|
2
2
|
class ISBN
|
3
|
-
|
4
|
-
|
5
|
-
|
3
|
+
ISBN_13_REGEXP = /
|
4
|
+
\b
|
5
|
+
97[89] # ISBN (GS1) Bookland prefix
|
6
|
+
[\p{Pd}\p{Zs}]? # Optional hyphenation
|
7
|
+
(?:
|
8
|
+
\d # Digit
|
9
|
+
[\p{Pd}\p{Zs}]? # Optional hyphenation
|
10
|
+
){9}
|
11
|
+
\d # Check digit
|
12
|
+
\b
|
13
|
+
/x
|
14
|
+
ISBN_10_REGEXP = /
|
15
|
+
\b
|
16
|
+
(?:
|
17
|
+
\d # Digit
|
18
|
+
[\p{Pd}\p{Zs}]? # Optional hyphenation
|
19
|
+
){9}
|
20
|
+
[\dX] # Check digit
|
21
|
+
\b
|
22
|
+
/x
|
23
|
+
ISBN_A_REGEXP = %r{
|
24
|
+
\b
|
25
|
+
(?<=10\.) # Directory indicator (always 10)
|
26
|
+
97[89]\. # ISBN (GS1) Bookland prefix
|
27
|
+
\d{2,8} # ISBN registration group element and publisher prefix
|
28
|
+
/ # Prefix/suffix divider
|
29
|
+
\d{1,7} # ISBN title enumerator and check digit
|
30
|
+
\b
|
31
|
+
}x
|
6
32
|
|
7
33
|
def self.extract(str)
|
8
34
|
extract_isbn_as(str) + extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
|
9
35
|
end
|
10
36
|
|
11
37
|
def self.extract_isbn_as(str)
|
12
|
-
extract_thirteen_digit_isbns(str.scan(
|
38
|
+
extract_thirteen_digit_isbns(str.to_s.scan(ISBN_A_REGEXP).join("\n").tr('/.', ''))
|
13
39
|
end
|
14
40
|
|
15
41
|
def self.extract_thirteen_digit_isbns(str)
|
16
|
-
str
|
42
|
+
str
|
43
|
+
.to_s
|
44
|
+
.scan(ISBN_13_REGEXP)
|
45
|
+
.map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
|
46
|
+
.select { |isbn| valid_isbn_13?(isbn) }
|
17
47
|
end
|
18
48
|
|
19
49
|
def self.extract_ten_digit_isbns(str)
|
20
|
-
str
|
21
|
-
|
22
|
-
|
23
|
-
isbn
|
50
|
+
str
|
51
|
+
.to_s
|
52
|
+
.scan(ISBN_10_REGEXP)
|
53
|
+
.map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
|
54
|
+
.select { |isbn| valid_isbn_10?(isbn) }
|
55
|
+
.map { |isbn|
|
56
|
+
isbn.chop!
|
57
|
+
isbn.prepend('978')
|
58
|
+
isbn << isbn_13_check_digit(isbn).to_s
|
24
59
|
|
25
|
-
|
26
|
-
|
60
|
+
isbn
|
61
|
+
}
|
27
62
|
end
|
28
63
|
|
29
64
|
def self.isbn_13_check_digit(isbn)
|
@@ -38,7 +73,7 @@ module Identifiers
|
|
38
73
|
end
|
39
74
|
|
40
75
|
def self.valid_isbn_13?(isbn)
|
41
|
-
return false unless isbn =~
|
76
|
+
return false unless isbn =~ ISBN_13_REGEXP
|
42
77
|
|
43
78
|
result = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
|
44
79
|
|
@@ -46,7 +81,7 @@ module Identifiers
|
|
46
81
|
end
|
47
82
|
|
48
83
|
def self.valid_isbn_10?(isbn)
|
49
|
-
return false unless isbn =~
|
84
|
+
return false unless isbn =~ ISBN_10_REGEXP
|
50
85
|
|
51
86
|
result = digits_of(isbn).with_index.map { |digit, weight| digit * weight.succ }.reduce(:+)
|
52
87
|
|
@@ -54,7 +89,7 @@ module Identifiers
|
|
54
89
|
end
|
55
90
|
|
56
91
|
def self.digits_of(isbn)
|
57
|
-
isbn.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
|
92
|
+
isbn.to_s.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
|
58
93
|
end
|
59
94
|
end
|
60
95
|
end
|
data/lib/identifiers/orcid.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
module Identifiers
|
2
2
|
class ORCID
|
3
|
-
|
3
|
+
REGEXP = /\b(?:\d{4}-){3}\d{3}[\dx]\b/i
|
4
4
|
|
5
5
|
def self.extract(str)
|
6
|
-
str.scan(
|
6
|
+
str.to_s.scan(REGEXP).select { |orcid| valid?(orcid) }.map(&:upcase)
|
7
7
|
end
|
8
8
|
|
9
9
|
def self.valid?(str)
|
@@ -14,7 +14,7 @@ module Identifiers
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def self.calculate_digit(str)
|
17
|
-
return unless str =~
|
17
|
+
return unless str =~ REGEXP
|
18
18
|
|
19
19
|
base_digits = str.chop.tr('-', '')
|
20
20
|
total = 0
|
data/lib/identifiers/repec_id.rb
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
module Identifiers
|
2
2
|
class RepecId
|
3
3
|
def self.extract(str)
|
4
|
-
str
|
4
|
+
str
|
5
|
+
.to_s
|
6
|
+
.scan(/\brepec:[^[:space:]]+\b/i)
|
7
|
+
.map { |repec| "RePEc:#{repec.split(':', 2).last}" }
|
5
8
|
end
|
6
9
|
end
|
7
10
|
end
|
@@ -15,5 +15,9 @@ RSpec.describe Identifiers::AdsBibcode do
|
|
15
15
|
it 'does not extract Bibcodes from DOIs' do
|
16
16
|
expect(described_class.extract('10.1097/01.ASW.0000443266.17665.19')).to be_empty
|
17
17
|
end
|
18
|
+
|
19
|
+
it 'returns no Bibcode if nothing is given' do
|
20
|
+
expect(described_class.extract(nil)).to be_empty
|
21
|
+
end
|
18
22
|
end
|
19
23
|
end
|
@@ -22,6 +22,10 @@ RSpec.describe Identifiers::ArxivId do
|
|
22
22
|
expect(described_class.extract('10.2310/7290.2014.00033')).to be_empty
|
23
23
|
end
|
24
24
|
|
25
|
+
it 'extracts nothing from empty arguments' do
|
26
|
+
expect(described_class.extract(nil)).to be_empty
|
27
|
+
end
|
28
|
+
|
25
29
|
it 'extracts a post 2007 arXiv ID surrounded by Unicode whitespace' do
|
26
30
|
expect(described_class.extract('Example: arXiv:0706.0001 ')).to contain_exactly('0706.0001')
|
27
31
|
end
|
@@ -5,12 +5,24 @@ RSpec.describe Identifiers::ISBN do
|
|
5
5
|
expect(described_class.extract('ISBN: 9780805069099')).to contain_exactly('9780805069099')
|
6
6
|
end
|
7
7
|
|
8
|
+
it 'extracts ISBNs when given as a number' do
|
9
|
+
isbn = 9780805069099
|
10
|
+
|
11
|
+
expect(described_class.extract(isbn)).to contain_exactly('9780805069099')
|
12
|
+
end
|
13
|
+
|
8
14
|
it 'normalizes 13-digit ISBNs' do
|
9
15
|
str = "978-0-80-506909-9\n978-0-67-187919-8"
|
10
16
|
|
11
17
|
expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
|
12
18
|
end
|
13
19
|
|
20
|
+
it 'extracts multiple ISBN-13s separated by a space' do
|
21
|
+
str = '978-0-80-506909-9 978-0-67-187919-8'
|
22
|
+
|
23
|
+
expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
|
24
|
+
end
|
25
|
+
|
14
26
|
it 'extracts ISBNs with hyphens' do
|
15
27
|
expect(described_class.extract('ISBN: 978-0-80-506909-9')).to contain_exactly('9780805069099')
|
16
28
|
end
|
@@ -41,6 +53,12 @@ RSpec.describe Identifiers::ISBN do
|
|
41
53
|
expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
|
42
54
|
end
|
43
55
|
|
56
|
+
it 'extracts multiple 10-digit ISBNs separated by a space' do
|
57
|
+
str = '0-8050-6909-7 2-7594-0269-X'
|
58
|
+
|
59
|
+
expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
|
60
|
+
end
|
61
|
+
|
44
62
|
it 'normalizes 10-digit ISBNs with Unicode dashes' do
|
45
63
|
expect(described_class.extract('0–8050–6909–7')).to contain_exactly('9780805069099')
|
46
64
|
end
|
@@ -8,4 +8,8 @@ RSpec.describe Identifiers::NationalClinicalTrialId do
|
|
8
8
|
it 'normalizes NCT IDs' do
|
9
9
|
expect(described_class.extract("nct00000106\nnCt00000107")).to contain_exactly('NCT00000106', 'NCT00000107')
|
10
10
|
end
|
11
|
+
|
12
|
+
it 'does not match anything with empty arguments' do
|
13
|
+
expect(described_class.extract(nil)).to be_empty
|
14
|
+
end
|
11
15
|
end
|
@@ -22,4 +22,8 @@ RSpec.describe Identifiers::PubmedId do
|
|
22
22
|
it 'extracts PubMed IDs separated by Unicode whitespace' do
|
23
23
|
expect(described_class.extract('123 456')).to contain_exactly('123', '456')
|
24
24
|
end
|
25
|
+
|
26
|
+
it 'considers Fixnum as potential PubmedIds too' do
|
27
|
+
expect(described_class.extract(123)).to contain_exactly('123')
|
28
|
+
end
|
25
29
|
end
|
@@ -18,4 +18,8 @@ RSpec.describe Identifiers::RepecId do
|
|
18
18
|
|
19
19
|
expect(described_class.extract(str)).to contain_exactly('RePEc:wbk:wbpubs:2266', 'RePEc:inn:wpaper:2016-03')
|
20
20
|
end
|
21
|
+
|
22
|
+
it 'extracts nothing when given empty arguments' do
|
23
|
+
expect(described_class.extract(nil)).to be_empty
|
24
|
+
end
|
21
25
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: identifiers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.1
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Hernandez
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-
|
12
|
+
date: 2017-07-31 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: urn
|