identifiers 0.8.1 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +1 -1
- data/lib/identifiers.rb +1 -4
- data/lib/identifiers/ads_bibcode.rb +1 -1
- data/lib/identifiers/arxiv_id.rb +24 -6
- data/lib/identifiers/doi.rb +3 -8
- data/lib/identifiers/handle.rb +1 -1
- data/lib/identifiers/isbn.rb +49 -14
- data/lib/identifiers/national_clinical_trial_id.rb +1 -1
- data/lib/identifiers/orcid.rb +3 -3
- data/lib/identifiers/pubmed_id.rb +1 -3
- data/lib/identifiers/repec_id.rb +4 -1
- data/spec/identifiers/ads_bibcode_spec.rb +4 -0
- data/spec/identifiers/arxiv_id_spec.rb +4 -0
- data/spec/identifiers/handle_spec.rb +4 -0
- data/spec/identifiers/isbn_spec.rb +18 -0
- data/spec/identifiers/national_clinical_trial_id_spec.rb +4 -0
- data/spec/identifiers/pubmed_id_spec.rb +4 -0
- data/spec/identifiers/repec_id_spec.rb +4 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 17d7278289ac40fc4fa68b488e85c98ada612f6f
|
4
|
+
data.tar.gz: e846cd17ac15a7cb87705c288eb8c41336721183
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5a978d601dee61a25f5f66330e6e9bcd995fe1f217bdf43284350cc678574a1890a25ff289ad50ccebb25545da98acd459cd421e98e32d4e3c5266a41a84eb69
|
7
|
+
data.tar.gz: 30ef27cea3bc785f8f0ed841ac1f8cbfaa1e261b8a00afa9f0d942db0b8bc913f4e39f6ea91068f8eb7054a4cae1d04b17bd47956d3c7759d5ff9d5c2bed01b3
|
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,10 @@
|
|
2
2
|
All notable changes to this project will be documented in this file. This
|
3
3
|
project adheres to [Semantic Versioning](http://semver.org/).
|
4
4
|
|
5
|
+
## [0.9.0] - 2017-07-31
|
6
|
+
### Added
|
7
|
+
- Support extraction of multiple ISBNs separated by a single space
|
8
|
+
|
5
9
|
## [0.8.1] - 2017-04-10
|
6
10
|
### Fixed
|
7
11
|
- Fixed extraction of multiple DOIs separated by Unicode whitespace
|
@@ -52,3 +56,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
|
|
52
56
|
[0.7.0]: https://github.com/altmetric/identifiers/releases/tag/v0.7.0
|
53
57
|
[0.8.0]: https://github.com/altmetric/identifiers/releases/tag/v0.8.0
|
54
58
|
[0.8.1]: https://github.com/altmetric/identifiers/releases/tag/v0.8.1
|
59
|
+
[0.9.0]: https://github.com/altmetric/identifiers/releases/tag/v0.9.0
|
data/README.md
CHANGED
data/lib/identifiers.rb
CHANGED
@@ -4,10 +4,7 @@ require 'identifiers/doi'
|
|
4
4
|
require 'identifiers/handle'
|
5
5
|
require 'identifiers/isbn'
|
6
6
|
require 'identifiers/national_clinical_trial_id'
|
7
|
+
require 'identifiers/orcid'
|
7
8
|
require 'identifiers/pubmed_id'
|
8
9
|
require 'identifiers/repec_id'
|
9
10
|
require 'identifiers/urn'
|
10
|
-
require 'identifiers/orcid'
|
11
|
-
|
12
|
-
module Identifiers
|
13
|
-
end
|
data/lib/identifiers/arxiv_id.rb
CHANGED
@@ -1,19 +1,37 @@
|
|
1
1
|
module Identifiers
|
2
2
|
class ArxivId
|
3
|
+
POST_2007_REGEXP = %r{
|
4
|
+
(?<=^|[[:space:]/]) # Look-behind for the start of the string, whitespace or a forward slash
|
5
|
+
(?:arXiv:)? # Optional arXiv scheme
|
6
|
+
\d{4} # YYMM (two-digit year and two-digit month number)
|
7
|
+
\.
|
8
|
+
\d{4,5} # Zero-padded sequence number of 4- or 5-digits
|
9
|
+
(?:v\d+)? # Literal v followed by version number of 1 or more digits
|
10
|
+
(?=$|[[:space:]]) # Look-ahead for end of string or whitespace
|
11
|
+
}xi
|
12
|
+
PRE_2007_REGEXP = %r{
|
13
|
+
(?<=^|[[:space:]/]) # Look-behind for the start of the string, whitespace or a forward slash
|
14
|
+
(?:arXiv:)? # Optional arXiv scheme
|
15
|
+
[a-z-]+ # Archive (e.g. "math")
|
16
|
+
(?:\.[A-Z]{2})? # Subject class (where applicable)
|
17
|
+
/
|
18
|
+
\d{2} # Year
|
19
|
+
(?:0[1-9]|1[012]) # Month
|
20
|
+
\d{3} # Number
|
21
|
+
(?:v\d+)? # Literal v followed by version number of 1 or more digits
|
22
|
+
(?=$|[[:space:]]) # Look-ahead for end of string or whitespace
|
23
|
+
}xi
|
24
|
+
|
3
25
|
def self.extract(str)
|
4
26
|
extract_pre_2007_arxiv_ids(str) + extract_post_2007_arxiv_ids(str)
|
5
27
|
end
|
6
28
|
|
7
29
|
def self.extract_post_2007_arxiv_ids(str)
|
8
|
-
str
|
9
|
-
.scan(%r{(?<=^|[[:space:]/])(?:arXiv:)?\d{4}\.\d{4,5}(?:v\d+)?(?=$|[[:space:]])}i)
|
10
|
-
.map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
|
30
|
+
str.to_s.scan(POST_2007_REGEXP).map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
|
11
31
|
end
|
12
32
|
|
13
33
|
def self.extract_pre_2007_arxiv_ids(str)
|
14
|
-
str
|
15
|
-
.scan(%r{(?<=^|[[:space:]/])(?:arXiv:)?[a-z-]+(?:\.[A-Z]{2})?/\d{2}(?:0[1-9]|1[012])\d{3}(?:v\d+)?(?=$|[[:space:]])}i)
|
16
|
-
.map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
|
34
|
+
str.to_s.scan(PRE_2007_REGEXP).map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
|
17
35
|
end
|
18
36
|
end
|
19
37
|
end
|
data/lib/identifiers/doi.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module Identifiers
|
2
2
|
class DOI
|
3
|
-
|
3
|
+
REGEXP = %r{
|
4
4
|
\b
|
5
5
|
10 # Directory indicator (always 10)
|
6
6
|
\.
|
@@ -29,16 +29,11 @@ module Identifiers
|
|
29
29
|
/x
|
30
30
|
|
31
31
|
def self.extract(str)
|
32
|
-
str
|
33
|
-
.to_s
|
34
|
-
.downcase
|
35
|
-
.scan(PATTERN)
|
36
|
-
.map { |doi| strip_punctuation(doi) }
|
37
|
-
.compact
|
32
|
+
str.to_s.downcase.scan(REGEXP).map { |doi| strip_punctuation(doi) }.compact
|
38
33
|
end
|
39
34
|
|
40
35
|
def self.extract_one(str)
|
41
|
-
match = str.to_s.downcase[
|
36
|
+
match = str.to_s.downcase[REGEXP]
|
42
37
|
return unless match
|
43
38
|
|
44
39
|
strip_punctuation(match)
|
data/lib/identifiers/handle.rb
CHANGED
data/lib/identifiers/isbn.rb
CHANGED
@@ -1,29 +1,64 @@
|
|
1
1
|
module Identifiers
|
2
2
|
class ISBN
|
3
|
-
|
4
|
-
|
5
|
-
|
3
|
+
ISBN_13_REGEXP = /
|
4
|
+
\b
|
5
|
+
97[89] # ISBN (GS1) Bookland prefix
|
6
|
+
[\p{Pd}\p{Zs}]? # Optional hyphenation
|
7
|
+
(?:
|
8
|
+
\d # Digit
|
9
|
+
[\p{Pd}\p{Zs}]? # Optional hyphenation
|
10
|
+
){9}
|
11
|
+
\d # Check digit
|
12
|
+
\b
|
13
|
+
/x
|
14
|
+
ISBN_10_REGEXP = /
|
15
|
+
\b
|
16
|
+
(?:
|
17
|
+
\d # Digit
|
18
|
+
[\p{Pd}\p{Zs}]? # Optional hyphenation
|
19
|
+
){9}
|
20
|
+
[\dX] # Check digit
|
21
|
+
\b
|
22
|
+
/x
|
23
|
+
ISBN_A_REGEXP = %r{
|
24
|
+
\b
|
25
|
+
(?<=10\.) # Directory indicator (always 10)
|
26
|
+
97[89]\. # ISBN (GS1) Bookland prefix
|
27
|
+
\d{2,8} # ISBN registration group element and publisher prefix
|
28
|
+
/ # Prefix/suffix divider
|
29
|
+
\d{1,7} # ISBN title enumerator and check digit
|
30
|
+
\b
|
31
|
+
}x
|
6
32
|
|
7
33
|
def self.extract(str)
|
8
34
|
extract_isbn_as(str) + extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
|
9
35
|
end
|
10
36
|
|
11
37
|
def self.extract_isbn_as(str)
|
12
|
-
extract_thirteen_digit_isbns(str.scan(
|
38
|
+
extract_thirteen_digit_isbns(str.to_s.scan(ISBN_A_REGEXP).join("\n").tr('/.', ''))
|
13
39
|
end
|
14
40
|
|
15
41
|
def self.extract_thirteen_digit_isbns(str)
|
16
|
-
str
|
42
|
+
str
|
43
|
+
.to_s
|
44
|
+
.scan(ISBN_13_REGEXP)
|
45
|
+
.map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
|
46
|
+
.select { |isbn| valid_isbn_13?(isbn) }
|
17
47
|
end
|
18
48
|
|
19
49
|
def self.extract_ten_digit_isbns(str)
|
20
|
-
str
|
21
|
-
|
22
|
-
|
23
|
-
isbn
|
50
|
+
str
|
51
|
+
.to_s
|
52
|
+
.scan(ISBN_10_REGEXP)
|
53
|
+
.map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
|
54
|
+
.select { |isbn| valid_isbn_10?(isbn) }
|
55
|
+
.map { |isbn|
|
56
|
+
isbn.chop!
|
57
|
+
isbn.prepend('978')
|
58
|
+
isbn << isbn_13_check_digit(isbn).to_s
|
24
59
|
|
25
|
-
|
26
|
-
|
60
|
+
isbn
|
61
|
+
}
|
27
62
|
end
|
28
63
|
|
29
64
|
def self.isbn_13_check_digit(isbn)
|
@@ -38,7 +73,7 @@ module Identifiers
|
|
38
73
|
end
|
39
74
|
|
40
75
|
def self.valid_isbn_13?(isbn)
|
41
|
-
return false unless isbn =~
|
76
|
+
return false unless isbn =~ ISBN_13_REGEXP
|
42
77
|
|
43
78
|
result = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
|
44
79
|
|
@@ -46,7 +81,7 @@ module Identifiers
|
|
46
81
|
end
|
47
82
|
|
48
83
|
def self.valid_isbn_10?(isbn)
|
49
|
-
return false unless isbn =~
|
84
|
+
return false unless isbn =~ ISBN_10_REGEXP
|
50
85
|
|
51
86
|
result = digits_of(isbn).with_index.map { |digit, weight| digit * weight.succ }.reduce(:+)
|
52
87
|
|
@@ -54,7 +89,7 @@ module Identifiers
|
|
54
89
|
end
|
55
90
|
|
56
91
|
def self.digits_of(isbn)
|
57
|
-
isbn.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
|
92
|
+
isbn.to_s.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
|
58
93
|
end
|
59
94
|
end
|
60
95
|
end
|
data/lib/identifiers/orcid.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
module Identifiers
|
2
2
|
class ORCID
|
3
|
-
|
3
|
+
REGEXP = /\b(?:\d{4}-){3}\d{3}[\dx]\b/i
|
4
4
|
|
5
5
|
def self.extract(str)
|
6
|
-
str.scan(
|
6
|
+
str.to_s.scan(REGEXP).select { |orcid| valid?(orcid) }.map(&:upcase)
|
7
7
|
end
|
8
8
|
|
9
9
|
def self.valid?(str)
|
@@ -14,7 +14,7 @@ module Identifiers
|
|
14
14
|
end
|
15
15
|
|
16
16
|
def self.calculate_digit(str)
|
17
|
-
return unless str =~
|
17
|
+
return unless str =~ REGEXP
|
18
18
|
|
19
19
|
base_digits = str.chop.tr('-', '')
|
20
20
|
total = 0
|
data/lib/identifiers/repec_id.rb
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
module Identifiers
|
2
2
|
class RepecId
|
3
3
|
def self.extract(str)
|
4
|
-
str
|
4
|
+
str
|
5
|
+
.to_s
|
6
|
+
.scan(/\brepec:[^[:space:]]+\b/i)
|
7
|
+
.map { |repec| "RePEc:#{repec.split(':', 2).last}" }
|
5
8
|
end
|
6
9
|
end
|
7
10
|
end
|
@@ -15,5 +15,9 @@ RSpec.describe Identifiers::AdsBibcode do
|
|
15
15
|
it 'does not extract Bibcodes from DOIs' do
|
16
16
|
expect(described_class.extract('10.1097/01.ASW.0000443266.17665.19')).to be_empty
|
17
17
|
end
|
18
|
+
|
19
|
+
it 'returns no Bibcode if nothing is given' do
|
20
|
+
expect(described_class.extract(nil)).to be_empty
|
21
|
+
end
|
18
22
|
end
|
19
23
|
end
|
@@ -22,6 +22,10 @@ RSpec.describe Identifiers::ArxivId do
|
|
22
22
|
expect(described_class.extract('10.2310/7290.2014.00033')).to be_empty
|
23
23
|
end
|
24
24
|
|
25
|
+
it 'extracts nothing from empty arguments' do
|
26
|
+
expect(described_class.extract(nil)).to be_empty
|
27
|
+
end
|
28
|
+
|
25
29
|
it 'extracts a post 2007 arXiv ID surrounded by Unicode whitespace' do
|
26
30
|
expect(described_class.extract('Example: arXiv:0706.0001 ')).to contain_exactly('0706.0001')
|
27
31
|
end
|
@@ -5,12 +5,24 @@ RSpec.describe Identifiers::ISBN do
|
|
5
5
|
expect(described_class.extract('ISBN: 9780805069099')).to contain_exactly('9780805069099')
|
6
6
|
end
|
7
7
|
|
8
|
+
it 'extracts ISBNs when given as a number' do
|
9
|
+
isbn = 9780805069099
|
10
|
+
|
11
|
+
expect(described_class.extract(isbn)).to contain_exactly('9780805069099')
|
12
|
+
end
|
13
|
+
|
8
14
|
it 'normalizes 13-digit ISBNs' do
|
9
15
|
str = "978-0-80-506909-9\n978-0-67-187919-8"
|
10
16
|
|
11
17
|
expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
|
12
18
|
end
|
13
19
|
|
20
|
+
it 'extracts multiple ISBN-13s separated by a space' do
|
21
|
+
str = '978-0-80-506909-9 978-0-67-187919-8'
|
22
|
+
|
23
|
+
expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
|
24
|
+
end
|
25
|
+
|
14
26
|
it 'extracts ISBNs with hyphens' do
|
15
27
|
expect(described_class.extract('ISBN: 978-0-80-506909-9')).to contain_exactly('9780805069099')
|
16
28
|
end
|
@@ -41,6 +53,12 @@ RSpec.describe Identifiers::ISBN do
|
|
41
53
|
expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
|
42
54
|
end
|
43
55
|
|
56
|
+
it 'extracts multiple 10-digit ISBNs separated by a space' do
|
57
|
+
str = '0-8050-6909-7 2-7594-0269-X'
|
58
|
+
|
59
|
+
expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
|
60
|
+
end
|
61
|
+
|
44
62
|
it 'normalizes 10-digit ISBNs with Unicode dashes' do
|
45
63
|
expect(described_class.extract('0–8050–6909–7')).to contain_exactly('9780805069099')
|
46
64
|
end
|
@@ -8,4 +8,8 @@ RSpec.describe Identifiers::NationalClinicalTrialId do
|
|
8
8
|
it 'normalizes NCT IDs' do
|
9
9
|
expect(described_class.extract("nct00000106\nnCt00000107")).to contain_exactly('NCT00000106', 'NCT00000107')
|
10
10
|
end
|
11
|
+
|
12
|
+
it 'does not match anything with empty arguments' do
|
13
|
+
expect(described_class.extract(nil)).to be_empty
|
14
|
+
end
|
11
15
|
end
|
@@ -22,4 +22,8 @@ RSpec.describe Identifiers::PubmedId do
|
|
22
22
|
it 'extracts PubMed IDs separated by Unicode whitespace' do
|
23
23
|
expect(described_class.extract('123 456')).to contain_exactly('123', '456')
|
24
24
|
end
|
25
|
+
|
26
|
+
it 'considers Fixnum as potential PubmedIds too' do
|
27
|
+
expect(described_class.extract(123)).to contain_exactly('123')
|
28
|
+
end
|
25
29
|
end
|
@@ -18,4 +18,8 @@ RSpec.describe Identifiers::RepecId do
|
|
18
18
|
|
19
19
|
expect(described_class.extract(str)).to contain_exactly('RePEc:wbk:wbpubs:2266', 'RePEc:inn:wpaper:2016-03')
|
20
20
|
end
|
21
|
+
|
22
|
+
it 'extracts nothing when given empty arguments' do
|
23
|
+
expect(described_class.extract(nil)).to be_empty
|
24
|
+
end
|
21
25
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: identifiers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jonathan Hernandez
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-
|
12
|
+
date: 2017-07-31 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: urn
|