identifiers 0.8.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ade5848785ab153a6cb5e1b2cffdd05958879943
4
- data.tar.gz: 2b0b6fa55d97c5ae2990d606b210168d33dc5dae
3
+ metadata.gz: 17d7278289ac40fc4fa68b488e85c98ada612f6f
4
+ data.tar.gz: e846cd17ac15a7cb87705c288eb8c41336721183
5
5
  SHA512:
6
- metadata.gz: bbdd699cd75aef87f0318a54acb55784ead6e3ea2f603a5c3e5437ef7d358222842d2892f6ec24e0fc59119be3d621d41227497d1bd3cbafdb159612331a4ccd
7
- data.tar.gz: c26923e8c6c7153fae0dc793eacd6b34e355f78360deee5a47f0c2922076b9e91c5b79c76884f0b3cf0747c93b6c3e2da97090e51fb1114f100ae9d7ae2514d4
6
+ metadata.gz: 5a978d601dee61a25f5f66330e6e9bcd995fe1f217bdf43284350cc678574a1890a25ff289ad50ccebb25545da98acd459cd421e98e32d4e3c5266a41a84eb69
7
+ data.tar.gz: 30ef27cea3bc785f8f0ed841ac1f8cbfaa1e261b8a00afa9f0d942db0b8bc913f4e39f6ea91068f8eb7054a4cae1d04b17bd47956d3c7759d5ff9d5c2bed01b3
data/CHANGELOG.md CHANGED
@@ -2,6 +2,10 @@
2
2
  All notable changes to this project will be documented in this file. This
3
3
  project adheres to [Semantic Versioning](http://semver.org/).
4
4
 
5
+ ## [0.9.0] - 2017-07-31
6
+ ### Added
7
+ - Support extraction of multiple ISBNs separated by a single space
8
+
5
9
  ## [0.8.1] - 2017-04-10
6
10
  ### Fixed
7
11
  - Fixed extraction of multiple DOIs separated by Unicode whitespace
@@ -52,3 +56,4 @@ project adheres to [Semantic Versioning](http://semver.org/).
52
56
  [0.7.0]: https://github.com/altmetric/identifiers/releases/tag/v0.7.0
53
57
  [0.8.0]: https://github.com/altmetric/identifiers/releases/tag/v0.8.0
54
58
  [0.8.1]: https://github.com/altmetric/identifiers/releases/tag/v0.8.1
59
+ [0.9.0]: https://github.com/altmetric/identifiers/releases/tag/v0.9.0
data/README.md CHANGED
@@ -18,7 +18,7 @@ Collection of utilities related to the extraction, validation and normalization
18
18
  Add this line to your application's `Gemfile`:
19
19
 
20
20
  ```ruby
21
- gem 'identifiers', '~> 0.8'
21
+ gem 'identifiers', '~> 0.9'
22
22
  ```
23
23
 
24
24
  And then execute:
data/lib/identifiers.rb CHANGED
@@ -4,10 +4,7 @@ require 'identifiers/doi'
4
4
  require 'identifiers/handle'
5
5
  require 'identifiers/isbn'
6
6
  require 'identifiers/national_clinical_trial_id'
7
+ require 'identifiers/orcid'
7
8
  require 'identifiers/pubmed_id'
8
9
  require 'identifiers/repec_id'
9
10
  require 'identifiers/urn'
10
- require 'identifiers/orcid'
11
-
12
- module Identifiers
13
- end
@@ -1,7 +1,7 @@
1
1
  module Identifiers
2
2
  class AdsBibcode
3
3
  def self.extract(str)
4
- str.scan(/\b\d{4}[a-z][0-9a-z&.]{14}\b/i)
4
+ str.to_s.scan(/\b\d{4}[a-z][0-9a-z&.]{14}\b/i)
5
5
  end
6
6
  end
7
7
  end
@@ -1,19 +1,37 @@
1
1
  module Identifiers
2
2
  class ArxivId
3
+ POST_2007_REGEXP = %r{
4
+ (?<=^|[[:space:]/]) # Look-behind for the start of the string, whitespace or a forward slash
5
+ (?:arXiv:)? # Optional arXiv scheme
6
+ \d{4} # YYMM (two-digit year and two-digit month number)
7
+ \.
8
+ \d{4,5} # Zero-padded sequence number of 4- or 5-digits
9
+ (?:v\d+)? # Literal v followed by version number of 1 or more digits
10
+ (?=$|[[:space:]]) # Look-ahead for end of string or whitespace
11
+ }xi
12
+ PRE_2007_REGEXP = %r{
13
+ (?<=^|[[:space:]/]) # Look-behind for the start of the string, whitespace or a forward slash
14
+ (?:arXiv:)? # Optional arXiv scheme
15
+ [a-z-]+ # Archive (e.g. "math")
16
+ (?:\.[A-Z]{2})? # Subject class (where applicable)
17
+ /
18
+ \d{2} # Year
19
+ (?:0[1-9]|1[012]) # Month
20
+ \d{3} # Number
21
+ (?:v\d+)? # Literal v followed by version number of 1 or more digits
22
+ (?=$|[[:space:]]) # Look-ahead for end of string or whitespace
23
+ }xi
24
+
3
25
  def self.extract(str)
4
26
  extract_pre_2007_arxiv_ids(str) + extract_post_2007_arxiv_ids(str)
5
27
  end
6
28
 
7
29
  def self.extract_post_2007_arxiv_ids(str)
8
- str
9
- .scan(%r{(?<=^|[[:space:]/])(?:arXiv:)?\d{4}\.\d{4,5}(?:v\d+)?(?=$|[[:space:]])}i)
10
- .map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
30
+ str.to_s.scan(POST_2007_REGEXP).map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
11
31
  end
12
32
 
13
33
  def self.extract_pre_2007_arxiv_ids(str)
14
- str
15
- .scan(%r{(?<=^|[[:space:]/])(?:arXiv:)?[a-z-]+(?:\.[A-Z]{2})?/\d{2}(?:0[1-9]|1[012])\d{3}(?:v\d+)?(?=$|[[:space:]])}i)
16
- .map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
34
+ str.to_s.scan(PRE_2007_REGEXP).map { |arxiv_id| arxiv_id.sub(/\AarXiv:/i, '') }
17
35
  end
18
36
  end
19
37
  end
@@ -1,6 +1,6 @@
1
1
  module Identifiers
2
2
  class DOI
3
- PATTERN = %r{
3
+ REGEXP = %r{
4
4
  \b
5
5
  10 # Directory indicator (always 10)
6
6
  \.
@@ -29,16 +29,11 @@ module Identifiers
29
29
  /x
30
30
 
31
31
  def self.extract(str)
32
- str
33
- .to_s
34
- .downcase
35
- .scan(PATTERN)
36
- .map { |doi| strip_punctuation(doi) }
37
- .compact
32
+ str.to_s.downcase.scan(REGEXP).map { |doi| strip_punctuation(doi) }.compact
38
33
  end
39
34
 
40
35
  def self.extract_one(str)
41
- match = str.to_s.downcase[PATTERN]
36
+ match = str.to_s.downcase[REGEXP]
42
37
  return unless match
43
38
 
44
39
  strip_punctuation(match)
@@ -1,7 +1,7 @@
1
1
  module Identifiers
2
2
  class Handle
3
3
  def self.extract(str)
4
- str.scan(%r{\b[0-9.]+/[^[:space:]]+\b}i)
4
+ str.to_s.scan(%r{\b[0-9.]+/[^[:space:]]+\b}i)
5
5
  end
6
6
  end
7
7
  end
@@ -1,29 +1,64 @@
1
1
  module Identifiers
2
2
  class ISBN
3
- REGEX_13 = /\b97[89]\d{10}\b/
4
- REGEX_10 = /\b\d{9}(?:\d|X)\b/
5
- REGEX_A = %r{\b(?<=10\.)97[89]\.\d{2,8}/\d{1,7}\b}
3
+ ISBN_13_REGEXP = /
4
+ \b
5
+ 97[89] # ISBN (GS1) Bookland prefix
6
+ [\p{Pd}\p{Zs}]? # Optional hyphenation
7
+ (?:
8
+ \d # Digit
9
+ [\p{Pd}\p{Zs}]? # Optional hyphenation
10
+ ){9}
11
+ \d # Check digit
12
+ \b
13
+ /x
14
+ ISBN_10_REGEXP = /
15
+ \b
16
+ (?:
17
+ \d # Digit
18
+ [\p{Pd}\p{Zs}]? # Optional hyphenation
19
+ ){9}
20
+ [\dX] # Check digit
21
+ \b
22
+ /x
23
+ ISBN_A_REGEXP = %r{
24
+ \b
25
+ (?<=10\.) # Directory indicator (always 10)
26
+ 97[89]\. # ISBN (GS1) Bookland prefix
27
+ \d{2,8} # ISBN registration group element and publisher prefix
28
+ / # Prefix/suffix divider
29
+ \d{1,7} # ISBN title enumerator and check digit
30
+ \b
31
+ }x
6
32
 
7
33
  def self.extract(str)
8
34
  extract_isbn_as(str) + extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
9
35
  end
10
36
 
11
37
  def self.extract_isbn_as(str)
12
- extract_thirteen_digit_isbns(str.scan(REGEX_A).join("\n").tr('/.', ''))
38
+ extract_thirteen_digit_isbns(str.to_s.scan(ISBN_A_REGEXP).join("\n").tr('/.', ''))
13
39
  end
14
40
 
15
41
  def self.extract_thirteen_digit_isbns(str)
16
- str.gsub(/(?<=\d)[\p{Pd}\p{Zs}](?=\d)/, '').scan(REGEX_13).select { |isbn| valid_isbn_13?(isbn) }
42
+ str
43
+ .to_s
44
+ .scan(ISBN_13_REGEXP)
45
+ .map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
46
+ .select { |isbn| valid_isbn_13?(isbn) }
17
47
  end
18
48
 
19
49
  def self.extract_ten_digit_isbns(str)
20
- str.gsub(/(?<=\d)[\p{Pd}\p{Zs}](?=[\dX])/i, '').scan(REGEX_10).select { |isbn| valid_isbn_10?(isbn) }.map { |isbn|
21
- isbn.chop!
22
- isbn.prepend('978')
23
- isbn << isbn_13_check_digit(isbn).to_s
50
+ str
51
+ .to_s
52
+ .scan(ISBN_10_REGEXP)
53
+ .map { |isbn| isbn.gsub(/[\p{Pd}\p{Zs}]/, '') }
54
+ .select { |isbn| valid_isbn_10?(isbn) }
55
+ .map { |isbn|
56
+ isbn.chop!
57
+ isbn.prepend('978')
58
+ isbn << isbn_13_check_digit(isbn).to_s
24
59
 
25
- isbn
26
- }
60
+ isbn
61
+ }
27
62
  end
28
63
 
29
64
  def self.isbn_13_check_digit(isbn)
@@ -38,7 +73,7 @@ module Identifiers
38
73
  end
39
74
 
40
75
  def self.valid_isbn_13?(isbn)
41
- return false unless isbn =~ REGEX_13
76
+ return false unless isbn =~ ISBN_13_REGEXP
42
77
 
43
78
  result = digits_of(isbn).zip([1, 3].cycle).map { |digit, weight| digit * weight }.reduce(:+)
44
79
 
@@ -46,7 +81,7 @@ module Identifiers
46
81
  end
47
82
 
48
83
  def self.valid_isbn_10?(isbn)
49
- return false unless isbn =~ REGEX_10
84
+ return false unless isbn =~ ISBN_10_REGEXP
50
85
 
51
86
  result = digits_of(isbn).with_index.map { |digit, weight| digit * weight.succ }.reduce(:+)
52
87
 
@@ -54,7 +89,7 @@ module Identifiers
54
89
  end
55
90
 
56
91
  def self.digits_of(isbn)
57
- isbn.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
92
+ isbn.to_s.each_char.map { |char| char == 'X' ? 10 : Integer(char) }.to_enum
58
93
  end
59
94
  end
60
95
  end
@@ -1,7 +1,7 @@
1
1
  module Identifiers
2
2
  class NationalClinicalTrialId
3
3
  def self.extract(str)
4
- str.scan(/\bNCT\d+\b/i).map(&:upcase)
4
+ str.to_s.scan(/\bNCT\d+\b/i).map(&:upcase)
5
5
  end
6
6
  end
7
7
  end
@@ -1,9 +1,9 @@
1
1
  module Identifiers
2
2
  class ORCID
3
- REGEX = /\b(?:\d{4}-){3}\d{3}[\dx]\b/i
3
+ REGEXP = /\b(?:\d{4}-){3}\d{3}[\dx]\b/i
4
4
 
5
5
  def self.extract(str)
6
- str.scan(REGEX).select { |orcid| valid?(orcid) }.map(&:upcase)
6
+ str.to_s.scan(REGEXP).select { |orcid| valid?(orcid) }.map(&:upcase)
7
7
  end
8
8
 
9
9
  def self.valid?(str)
@@ -14,7 +14,7 @@ module Identifiers
14
14
  end
15
15
 
16
16
  def self.calculate_digit(str)
17
- return unless str =~ REGEX
17
+ return unless str =~ REGEXP
18
18
 
19
19
  base_digits = str.chop.tr('-', '')
20
20
  total = 0
@@ -1,9 +1,7 @@
1
1
  module Identifiers
2
2
  class PubmedId
3
3
  def self.extract(str)
4
- str
5
- .scan(/(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])/)
6
- .flatten
4
+ str.to_s.scan(/(?<=^|[[:space:]])0*(?!0)(\d+)(?=$|[[:space:]])/).flatten
7
5
  end
8
6
  end
9
7
  end
@@ -1,7 +1,10 @@
1
1
  module Identifiers
2
2
  class RepecId
3
3
  def self.extract(str)
4
- str.scan(/\brepec:[^[:space:]]+\b/i).map { |repec| "RePEc:#{repec.split(':', 2).last}" }
4
+ str
5
+ .to_s
6
+ .scan(/\brepec:[^[:space:]]+\b/i)
7
+ .map { |repec| "RePEc:#{repec.split(':', 2).last}" }
5
8
  end
6
9
  end
7
10
  end
@@ -15,5 +15,9 @@ RSpec.describe Identifiers::AdsBibcode do
15
15
  it 'does not extract Bibcodes from DOIs' do
16
16
  expect(described_class.extract('10.1097/01.ASW.0000443266.17665.19')).to be_empty
17
17
  end
18
+
19
+ it 'returns no Bibcode if nothing is given' do
20
+ expect(described_class.extract(nil)).to be_empty
21
+ end
18
22
  end
19
23
  end
@@ -22,6 +22,10 @@ RSpec.describe Identifiers::ArxivId do
22
22
  expect(described_class.extract('10.2310/7290.2014.00033')).to be_empty
23
23
  end
24
24
 
25
+ it 'extracts nothing from empty arguments' do
26
+ expect(described_class.extract(nil)).to be_empty
27
+ end
28
+
25
29
  it 'extracts a post 2007 arXiv ID surrounded by Unicode whitespace' do
26
30
  expect(described_class.extract('Example: arXiv:0706.0001 ')).to contain_exactly('0706.0001')
27
31
  end
@@ -18,4 +18,8 @@ RSpec.describe Identifiers::Handle do
18
18
 
19
19
  expect(described_class.extract(str)).to contain_exactly('10149/596901', '10251/79612')
20
20
  end
21
+
22
+ it 'extracts nothing from empty arguments' do
23
+ expect(described_class.extract(nil)).to be_empty
24
+ end
21
25
  end
@@ -5,12 +5,24 @@ RSpec.describe Identifiers::ISBN do
5
5
  expect(described_class.extract('ISBN: 9780805069099')).to contain_exactly('9780805069099')
6
6
  end
7
7
 
8
+ it 'extracts ISBNs when given as a number' do
9
+ isbn = 9780805069099
10
+
11
+ expect(described_class.extract(isbn)).to contain_exactly('9780805069099')
12
+ end
13
+
8
14
  it 'normalizes 13-digit ISBNs' do
9
15
  str = "978-0-80-506909-9\n978-0-67-187919-8"
10
16
 
11
17
  expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
12
18
  end
13
19
 
20
+ it 'extracts multiple ISBN-13s separated by a space' do
21
+ str = '978-0-80-506909-9 978-0-67-187919-8'
22
+
23
+ expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
24
+ end
25
+
14
26
  it 'extracts ISBNs with hyphens' do
15
27
  expect(described_class.extract('ISBN: 978-0-80-506909-9')).to contain_exactly('9780805069099')
16
28
  end
@@ -41,6 +53,12 @@ RSpec.describe Identifiers::ISBN do
41
53
  expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
42
54
  end
43
55
 
56
+ it 'extracts multiple 10-digit ISBNs separated by a space' do
57
+ str = '0-8050-6909-7 2-7594-0269-X'
58
+
59
+ expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
60
+ end
61
+
44
62
  it 'normalizes 10-digit ISBNs with Unicode dashes' do
45
63
  expect(described_class.extract('0–8050–6909–7')).to contain_exactly('9780805069099')
46
64
  end
@@ -8,4 +8,8 @@ RSpec.describe Identifiers::NationalClinicalTrialId do
8
8
  it 'normalizes NCT IDs' do
9
9
  expect(described_class.extract("nct00000106\nnCt00000107")).to contain_exactly('NCT00000106', 'NCT00000107')
10
10
  end
11
+
12
+ it 'does not match anything with empty arguments' do
13
+ expect(described_class.extract(nil)).to be_empty
14
+ end
11
15
  end
@@ -22,4 +22,8 @@ RSpec.describe Identifiers::PubmedId do
22
22
  it 'extracts PubMed IDs separated by Unicode whitespace' do
23
23
  expect(described_class.extract('123 456')).to contain_exactly('123', '456')
24
24
  end
25
+
26
+ it 'considers Fixnum as potential PubmedIds too' do
27
+ expect(described_class.extract(123)).to contain_exactly('123')
28
+ end
25
29
  end
@@ -18,4 +18,8 @@ RSpec.describe Identifiers::RepecId do
18
18
 
19
19
  expect(described_class.extract(str)).to contain_exactly('RePEc:wbk:wbpubs:2266', 'RePEc:inn:wpaper:2016-03')
20
20
  end
21
+
22
+ it 'extracts nothing when given empty arguments' do
23
+ expect(described_class.extract(nil)).to be_empty
24
+ end
21
25
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: identifiers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.1
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Hernandez
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2017-04-10 00:00:00.000000000 Z
12
+ date: 2017-07-31 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: urn