identifiers 0.8.1 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ade5848785ab153a6cb5e1b2cffdd05958879943
4
- data.tar.gz: 2b0b6fa55d97c5ae2990d606b210168d33dc5dae
3
+ metadata.gz: 17d7278289ac40fc4fa68b488e85c98ada612f6f
4
+ data.tar.gz: e846cd17ac15a7cb87705c288eb8c41336721183
5
5
  SHA512:
6
- metadata.gz: bbdd699cd75aef87f0318a54acb55784ead6e3ea2f603a5c3e5437ef7d358222842d2892f6ec24e0fc59119be3d621d41227497d1bd3cbafdb159612331a4ccd
7
- data.tar.gz: c26923e8c6c7153fae0dc793eacd6b34e355f78360deee5a47f0c2922076b9e91c5b79c76884f0b3cf0747c93b6c3e2da97090e51fb1114f100ae9d7ae2514d4
6
+ metadata.gz: 5a978d601dee61a25f5f66330e6e9bcd995fe1f217bdf43284350cc678574a1890a25ff289ad50ccebb25545da98acd459cd421e98e32d4e3c5266a41a84eb69
7
+ data.tar.gz: 30ef27cea3bc785f8f0ed841ac1f8cbfaa1e261b8a00afa9f0d942db0b8bc913f4e39f6ea91068f8eb7054a4cae1d04b17bd47956d3c7759d5ff9d5c2bed01b3
data/CHANGELOG.md CHANGED
@@ -2,6 +2,10 @@
2
2
  All notable changes to this project will be documented in this file. This
3
3
  project adheres to [Semantic Versioning](http://semver.org/).
4
4
 
5
+ ## [0.9.0] - 2017-07-31
6
+ ### Added
7
+ - Support extraction of multiple ISBNs separated by a single space
8
+
5
9
  ## [0.8.1] - 2017-04-10
6
10
  ### Fixed
7
11
  - Fixed extraction of multiple DOIs separated by Unicode whitespace
@@ -52,3 +56,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
52
56
  [0.7.0]: https://github.com/altmetric/identifiers/releases/tag/v0.7.0
53
57
  [0.8.0]: https://github.com/altmetric/identifiers/releases/tag/v0.8.0
54
58
  [0.8.1]: https://github.com/altmetric/identifiers/releases/tag/v0.8.1
59
+ [0.9.0]: https://github.com/altmetric/identifiers/releases/tag/v0.9.0
data/README.md CHANGED
@@ -18,7 +18,7 @@ Collection of utilities related to the extraction, validation and normalization
18
18
  Add this line to your application's `Gemfile`:
19
19
 
20
20
  ```ruby
21
- gem 'identifiers', '~> 0.8'
21
+ gem 'identifiers', '~> 0.9'
22
22
  ```
23
23
 
24
24
  And then execute:
data/lib/identifiers.rb CHANGED
@@ -4,10 +4,7 @@ require 'identifiers/doi'
4
4
  require 'identifiers/handle'
5
5
  require 'identifiers/isbn'
6
6
  require 'identifiers/national_clinical_trial_id'
7
+ require 'identifiers/orcid'
7
8
  require 'identifiers/pubmed_id'
8
9
  require 'identifiers/repec_id'
9
10
  require 'identifiers/urn'
10
- require 'identifiers/orcid'
11
-
12
- module Identifiers
13
- end
@@ -1,7 +1,7 @@
1
1
  module Identifiers
2
2
  class AdsBibcode
3
3
  def self.extract(str)
4
- str.scan(/\b\d{4}[a-z][0-9a-z&.]{14}\b/i)
4
+ str.to_s.scan(/\b\d{4}[a-z][0-9a-z&.]{14}\b/i)
5
5
  end
6
6
  end
7
7
  end
@@ -1,19 +1,37 @@
1
1
  module Identifiers
2
2
  class ArxivId
3
+ POST_2007_REGEXP = %r{
4
+ (?<=^|[[:space:]/]) # Look-behind for the start of the string, whitespace or a forward slash
5
+ (?:arXiv:)? # Optional arXiv scheme
6
+ \d{4} # YYMM (two-digit year and two-digit month number)
7
+ \.
8
+ \d{4,5} # Zero-padded sequence number of 4 or 5 digits
9
+ (?:v\d+)? # Literal v followed by version number of 1 or more digits
10
+ (?=$|[[:space:]]) # Look-ahead for end of string or whitespace
11
+ }xi
12
+ PRE_2007_REGEXP = %r{
13
+ (?<=^|[[:space:]/]) # Look-behind for the start of the string, whitespace or a forward slash
14
+ (?:arXiv:)? # Optional arXiv scheme
15
+ [a-z-]+ # Archive (e.g. "math")
16
+ (?:\.[A-Z]{2})? # Subject class (where applicable)
17
+ /
18
+ \d{2} # Year
19
+ (?:0[1-9]|1[012]) # Month
20
+ \d{3} # Number
21
+ (?:v\d+)? # Literal v followed by version number of 1 or more digits
22
+ (?=$|[[:space:]]) # Look-ahead for end of string or whitespace
23
+ }xi
24
+
3
25
  def self.extract(str)
4
26
  extract_pre_2007_arxiv_ids(str) + extract_post_2007_arxiv_ids(str)
5
27
  end
6
28
 
7
29
  def self.extract_post_2007_arxiv_ids(str)
8
- str
9
- .scan(%r{(?<=^|[[:space:]/])(?:arXiv:)?\d{4}\.\d{4,5}(?:v\d+)?(?=$|[[:space:]])}i)
10
- .map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
30
+ str.to_s.scan(POST_2007_REGEXP).map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
11
31
  end
12
32
 
13
33
  def self.extract_pre_2007_arxiv_ids(str)
14
- str
15
- .scan(%r{(?<=^|[[:space:]/])(?:arXiv:)?[a-z-]+(?:\.[A-Z]{2})?/\d{2}(?:0[1-9]|1[012])\d{3}(?:v\d+)?(?=$|[[:space:]])}i)
16
- .map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
34
+ str.to_s.scan(PRE_2007_REGEXP).map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
17
35
  end
18
36
  end
19
37
  end
@@ -1,6 +1,6 @@
1
1
  module Identifiers
2
2
  class DOI
3
- PATTERN = %r{
3
+ REGEXP = %r{
4
4
  \b
5
5
  10 # Directory indicator (always 10)
6
6
  \.
@@ -29,16 +29,11 @@ module Identifiers
29
29
  /x
30
30
 
31
31
  def self.extract(str)
32
- str
33
- .to_s
34
- .downcase
35
- .scan(PATTERN)
36
- .map { |doi| strip_punctuation(doi) }
37
- .compact
32
+ str.to_s.downcase.scan(REGEXP).map { |doi| strip_punctuation(doi) }.compact
38
33
  end
39
34
 
40
35
  def self.extract_one(str)
41
- match = str.to_s.downcase[PATTERN]
36
+ match = str.to_s.downcase[REGEXP]
42
37
  return unless match
43
38
 
44
39
  strip_punctuation(match)
@@ -1,7 +1,7 @@
1
1
  module Identifiers
2
2
  class Handle
3
3
  def self.extract(str)
4
- str.scan(%r{\b[0-9.]+/[^[:space:]]+\b}i)
4
+ str.to_s.scan(%r{\b[0-9.]+/[^[:space:]]+\b}i)
5
5
  end
6
6
  end
7
7
  end
@@ -1,29 +1,64 @@
1
1
  module Identifiers
2
2
  class ISBN
3
- REGEX_13 = /\b97[89]\d{10}\b/
4
- REGEX_10 = /\b\d{9}(?:\d|X)\b/
5
- REGEX_A = %r{\b(?<=10\.)97[89]\.\d{2,8}/\d{1,7}\b}
3
+ ISBN_13_REGEXP = /
4
+ \b
5
+ 97[89] # ISBN (GS1) Bookland prefix
6
+ [\p{Pd}\p{Zs}]? # Optional hyphenation
7
+ (?:
8
+ \d # Digit
9
+ [\p{Pd}\p{Zs}]? # Optional hyphenation
10
+ ){9}
11
+ \d # Check digit
12
+ \b
13
+ /x
14
+ ISBN_10_REGEXP = /
15
+ \b
16
+ (?:
17
+ \d # Digit
18
+ [\p{Pd}\p{Zs}]? # Optional hyphenation
19
+ ){9}
20
+ [\dX] # Check digit
21
+ \b
22
+ /x
23
+ ISBN_A_REGEXP = %r{
24
+ \b
25
+ (?<=10\.) # Directory indicator (always 10)
26
+ 97[89]\. # ISBN (GS1) Bookland prefix
27
+ \d{2,8} # ISBN registration group element and publisher prefix
28
+ / # Prefix/suffix divider
29
+ \d{1,7} # ISBN title enumerator and check digit
30
+ \b
31
+ }x
6
32
 
7
33
  def self.extract(str)
8
34
  extract_isbn_as(str) + extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
9
35
  end
10
36
 
11
37
  def self.extract_isbn_as(str)
12
- extract_thirteen_digit_isbns(str.scan(REGEX_A).join("\n").tr('/.', ''))
38
+ extract_thirteen_digit_isbns(str.to_s.scan(ISBN_A_REGEXP).join("\n").tr('/.', ''))
13
39
  end
14
40
 
15
41
  def self.extract_thirteen_digit_isbns(str)
16
- str.gsub(/(?<=\d)[\p{Pd}\p{Zs}](?=\d)/, '').scan(REGEX_13).select { |isbn| valid_isbn_13?(isbn) }
42
+ str
43
+ .to_s
44
+ .scan(ISBN_13_REGEXP)
45
+ .map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
46
+ .select { |isbn| valid_isbn_13?(isbn) }
17
47
  end
18
48
 
19
49
  def self.extract_ten_digit_isbns(str)
20
- str.gsub(/(?<=\d)[\p{Pd}\p{Zs}](?=[\dX])/i, '').scan(REGEX_10).select { |isbn| valid_isbn_10?(isbn) }.map { |isbn|
21
- isbn.chop!
22
- isbn.prepend('978')
23
- isbn << isbn_13_check_digit(isbn).to_s
50
+ str
51
+ .to_s
52
+ .scan(ISBN_10_REGEXP)
53
+ .map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
54
+ .select { |isbn| valid_isbn_10?(isbn) }
55
+ .map { |isbn|
56
+ isbn.chop!
57
+ isbn.prepend('978')
58
+ isbn << isbn_13_check_digit(isbn).to_s
24
59
 
25
- isbn
26
- }
60
+ isbn
61
+ }
27
62
  end
28
63
 
29
64
  def self.isbn_13_check_digit(isbn)
@@ -38,7 +73,7 @@ module Identifiers
38
73
  end
39
74
 
40
75
  def self.valid_isbn_13?(isbn)
41
- return false unless isbn =~ REGEX_13
76
+ return false unless isbn =~ ISBN_13_REGEXP
42
77
 
43
78
  result = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
44
79
 
@@ -46,7 +81,7 @@ module Identifiers
46
81
  end
47
82
 
48
83
  def self.valid_isbn_10?(isbn)
49
- return false unless isbn =~ REGEX_10
84
+ return false unless isbn =~ ISBN_10_REGEXP
50
85
 
51
86
  result = digits_of(isbn).with_index.map { |digit, weight| digit * weight.succ }.reduce(:+)
52
87
 
@@ -54,7 +89,7 @@ module Identifiers
54
89
  end
55
90
 
56
91
  def self.digits_of(isbn)
57
- isbn.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
92
+ isbn.to_s.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
58
93
  end
59
94
  end
60
95
  end
@@ -1,7 +1,7 @@
1
1
  module Identifiers
2
2
  class NationalClinicalTrialId
3
3
  def self.extract(str)
4
- str.scan(/\bNCT\d+\b/i).map(&:upcase)
4
+ str.to_s.scan(/\bNCT\d+\b/i).map(&:upcase)
5
5
  end
6
6
  end
7
7
  end
@@ -1,9 +1,9 @@
1
1
  module Identifiers
2
2
  class ORCID
3
- REGEX = /\b(?:\d{4}-){3}\d{3}[\dx]\b/i
3
+ REGEXP = /\b(?:\d{4}-){3}\d{3}[\dx]\b/i
4
4
 
5
5
  def self.extract(str)
6
- str.scan(REGEX).select { |orcid| valid?(orcid) }.map(&:upcase)
6
+ str.to_s.scan(REGEXP).select { |orcid| valid?(orcid) }.map(&:upcase)
7
7
  end
8
8
 
9
9
  def self.valid?(str)
@@ -14,7 +14,7 @@ module Identifiers
14
14
  end
15
15
 
16
16
  def self.calculate_digit(str)
17
- return unless str =~ REGEX
17
+ return unless str =~ REGEXP
18
18
 
19
19
  base_digits = str.chop.tr('-', '')
20
20
  total = 0
@@ -1,9 +1,7 @@
1
1
  module Identifiers
2
2
  class PubmedId
3
3
  def self.extract(str)
4
- str
5
- .scan(/(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])/)
6
- .flatten
4
+ str.to_s.scan(/(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])/).flatten
7
5
  end
8
6
  end
9
7
  end
@@ -1,7 +1,10 @@
1
1
  module Identifiers
2
2
  class RepecId
3
3
  def self.extract(str)
4
- str.scan(/\brepec:[^[:space:]]+\b/i).map { |repec| "RePEc:#{repec.split(':', 2).last}" }
4
+ str
5
+ .to_s
6
+ .scan(/\brepec:[^[:space:]]+\b/i)
7
+ .map { |repec| "RePEc:#{repec.split(':', 2).last}" }
5
8
  end
6
9
  end
7
10
  end
@@ -15,5 +15,9 @@ RSpec.describe Identifiers::AdsBibcode do
15
15
  it 'does not extract Bibcodes from DOIs' do
16
16
  expect(described_class.extract('10.1097/01.ASW.0000443266.17665.19')).to be_empty
17
17
  end
18
+
19
+ it 'returns no Bibcode if nothing is given' do
20
+ expect(described_class.extract(nil)).to be_empty
21
+ end
18
22
  end
19
23
  end
@@ -22,6 +22,10 @@ RSpec.describe Identifiers::ArxivId do
22
22
  expect(described_class.extract('10.2310/7290.2014.00033')).to be_empty
23
23
  end
24
24
 
25
+ it 'extracts nothing from empty arguments' do
26
+ expect(described_class.extract(nil)).to be_empty
27
+ end
28
+
25
29
  it 'extracts a post 2007 arXiv ID surrounded by Unicode whitespace' do
26
30
  expect(described_class.extract('Example: arXiv:0706.0001 ')).to contain_exactly('0706.0001')
27
31
  end
@@ -18,4 +18,8 @@ RSpec.describe Identifiers::Handle do
18
18
 
19
19
  expect(described_class.extract(str)).to contain_exactly('10149/596901', '10251/79612')
20
20
  end
21
+
22
+ it 'extracts nothing from empty arguments' do
23
+ expect(described_class.extract(nil)).to be_empty
24
+ end
21
25
  end
@@ -5,12 +5,24 @@ RSpec.describe Identifiers::ISBN do
5
5
  expect(described_class.extract('ISBN: 9780805069099')).to contain_exactly('9780805069099')
6
6
  end
7
7
 
8
+ it 'extracts ISBNs when given as a number' do
9
+ isbn = 9780805069099
10
+
11
+ expect(described_class.extract(isbn)).to contain_exactly('9780805069099')
12
+ end
13
+
8
14
  it 'normalizes 13-digit ISBNs' do
9
15
  str = "978-0-80-506909-9\n978-0-67-187919-8"
10
16
 
11
17
  expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
12
18
  end
13
19
 
20
+ it 'extracts multiple ISBN-13s separated by a space' do
21
+ str = '978-0-80-506909-9 978-0-67-187919-8'
22
+
23
+ expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
24
+ end
25
+
14
26
  it 'extracts ISBNs with hyphens' do
15
27
  expect(described_class.extract('ISBN: 978-0-80-506909-9')).to contain_exactly('9780805069099')
16
28
  end
@@ -41,6 +53,12 @@ RSpec.describe Identifiers::ISBN do
41
53
  expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
42
54
  end
43
55
 
56
+ it 'extracts multiple 10-digit ISBNs separated by a space' do
57
+ str = '0-8050-6909-7 2-7594-0269-X'
58
+
59
+ expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
60
+ end
61
+
44
62
  it 'normalizes 10-digit ISBNs with Unicode dashes' do
45
63
  expect(described_class.extract('0–8050–6909–7')).to contain_exactly('9780805069099')
46
64
  end
@@ -8,4 +8,8 @@ RSpec.describe Identifiers::NationalClinicalTrialId do
8
8
  it 'normalizes NCT IDs' do
9
9
  expect(described_class.extract("nct00000106\nnCt00000107")).to contain_exactly('NCT00000106', 'NCT00000107')
10
10
  end
11
+
12
+ it 'does not match anything with empty arguments' do
13
+ expect(described_class.extract(nil)).to be_empty
14
+ end
11
15
  end
@@ -22,4 +22,8 @@ RSpec.describe Identifiers::PubmedId do
22
22
  it 'extracts PubMed IDs separated by Unicode whitespace' do
23
23
  expect(described_class.extract('123 456')).to contain_exactly('123', '456')
24
24
  end
25
+
26
+ it 'considers Fixnum as potential PubmedIds too' do
27
+ expect(described_class.extract(123)).to contain_exactly('123')
28
+ end
25
29
  end
@@ -18,4 +18,8 @@ RSpec.describe Identifiers::RepecId do
18
18
 
19
19
  expect(described_class.extract(str)).to contain_exactly('RePEc:wbk:wbpubs:2266', 'RePEc:inn:wpaper:2016-03')
20
20
  end
21
+
22
+ it 'extracts nothing when given empty arguments' do
23
+ expect(described_class.extract(nil)).to be_empty
24
+ end
21
25
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: identifiers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.1
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Hernandez
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2017-04-10 00:00:00.000000000 Z
12
+ date: 2017-07-31 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: urn