identifiers 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 64dea49b2835751962c267670fdef1c57696eceb
4
+ data.tar.gz: 5d4bfc54e1a55194e2286c80bd24cb7fa255aeb3
5
+ SHA512:
6
+ metadata.gz: b0dd1c0a6b57d8e3a4066487e79b8fda6552c2e443ae58869d1271abe8ed526ba8aaf3a4a9a1ec833df547934d4f4fcbce5e61858956b035af69bd868b95b23d
7
+ data.tar.gz: 93b41b272316d5cfe8ce134f9f2e09714c7723ed33ab1186e6b13d17409ca3fd00574d5bf26a0f68ed536e2a4db7db68536f983a191f9a3d193038650262d31b
data/CHANGELOG.md ADDED
@@ -0,0 +1,7 @@
1
+ # Change Log
2
+ All notable changes to this project will be documented in this file. This
3
+ project adheres to [Semantic Versioning](http://semver.org/).
4
+
5
+ ## [0.1.0] - 2016-10-21
6
+ ### Added
7
+ - Initial release
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Altmetric LLP
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,47 @@
1
+ # Identifiers
2
+
3
+ A collection of utilities for the extraction, validation and normalization of various scholarly identifiers. The supported identifiers are:
4
+ - [ADS Bibcodes](http://adsdoc.harvard.edu/abs_doc/help_pages/bibcodes.html)
5
+ - [arXiv](https://arxiv.org/help/arxiv_identifier)
6
+ - [DOI](https://www.doi.org/)
7
+ - [Handle](https://en.wikipedia.org/wiki/Handle_System)
8
+ - [ISBN](https://en.wikipedia.org/wiki/International_Standard_Book_Number)
9
+ - [National Clinical Trial IDs](https://clinicaltrials.gov/)
10
+ - [PubMed](http://www.ncbi.nlm.nih.gov/pubmed)
11
+ - [RePEc](https://en.wikipedia.org/wiki/Research_Papers_in_Economics)
12
+ - [URN](https://en.wikipedia.org/wiki/Uniform_Resource_Name)
13
+
14
+ ## Installation
15
+
16
+ Add this line to your application's Gemfile:
17
+
18
+ ```ruby
19
+ gem 'identifiers', '~> 0.1'
20
+ ```
21
+
22
+ And then execute:
23
+
24
+ $ bundle
25
+
26
+ Or install it yourself as:
27
+
28
+ $ gem install identifiers
29
+
30
+ ## Usage
31
+
32
+ ```ruby
33
+ Identifiers::DOI.extract('example: 10.123/abcd.efghi')
34
+
35
+ Identifiers::URN.new('urn:abc:123')
36
+ Identifiers::URN('urn:abc:123')
37
+ ```
38
+
39
+ ## Contributing
40
+
41
+ Bug reports and pull requests are welcome on GitHub at https://github.com/altmetric/identifiers.
42
+
43
+ ## License
44
+
45
+ Copyright © 2016 Altmetric LLP
46
+
47
+ Distributed under the [MIT License](http://opensource.org/licenses/MIT).
@@ -0,0 +1,12 @@
1
+ require 'identifiers/ads_bibcode'
2
+ require 'identifiers/arxiv_id'
3
+ require 'identifiers/doi'
4
+ require 'identifiers/handle'
5
+ require 'identifiers/isbn'
6
+ require 'identifiers/national_clinical_trial_id'
7
+ require 'identifiers/pubmed_id'
8
+ require 'identifiers/repec_id'
9
+ require 'identifiers/urn'
10
+
11
+ module Identifiers
12
+ end
@@ -0,0 +1,7 @@
1
module Identifiers
  # Finds ADS bibcodes embedded in free text.
  class AdsBibcode
    # Scans +str+ for 19-character tokens shaped like a bibcode: four digits,
    # one letter, then fourteen characters drawn from letters, digits, "&"
    # and ".", delimited by word boundaries. Matching is case-insensitive and
    # matches are returned exactly as they appear in +str+.
    #
    # Returns an Array of Strings (empty when nothing matches).
    def self.extract(str)
      bibcode_pattern = /\b\d{4}[a-z][0-9a-z&.]{14}\b/i
      str.scan(bibcode_pattern)
    end
  end
end
@@ -0,0 +1,19 @@
1
module Identifiers
  # Finds arXiv identifiers in free text, covering both the identifier scheme
  # used before 2007 (e.g. "math.GT/0309136") and the one used since
  # (e.g. "0706.0001", "1501.00001v2").
  class ArxivId
    # Returns every arXiv ID found in +str+: old-scheme IDs first, followed by
    # new-scheme IDs. A leading "arXiv:" prefix is removed from each result.
    def self.extract(str)
      old_scheme_ids = extract_pre_2007_arxiv_ids(str)
      new_scheme_ids = extract_post_2007_arxiv_ids(str)

      old_scheme_ids + new_scheme_ids
    end

    # Matches "YYMM.NNNN" / "YYMM.NNNNN" with an optional "vN" version suffix.
    # A candidate must start at a line start, after whitespace or after "/",
    # and must be followed by whitespace or a line end.
    def self.extract_post_2007_arxiv_ids(str)
      pattern = %r{(?<=^|\s|/)(?:arXiv:)?\d{4}\.\d{4,5}(?:v\d+)?(?=$|\s)}i
      matches = str.scan(pattern)

      matches.map { |candidate| candidate.sub(/\AarXiv:/i, '') }
    end

    # Matches "archive(.SC)/YYMMNNN" with an optional "vN" version suffix,
    # where MM is restricted to 01-12. The same leading/trailing delimiters
    # apply as for the post-2007 scheme.
    def self.extract_pre_2007_arxiv_ids(str)
      pattern = %r{(?<=^|\s|/)(?:arXiv:)?[a-z-]+(?:\.[A-Z]{2})?/\d{2}(?:0[1-9]|1[012])\d{3}(?:v\d+)?(?=$|\s)}i
      matches = str.scan(pattern)

      matches.map { |candidate| candidate.sub(/\AarXiv:/i, '') }
    end
  end
end
@@ -0,0 +1,7 @@
1
module Identifiers
  # Finds DOIs embedded in free text.
  class DOI
    # Scans +str+ for tokens of the form "10.<three or more digits>/<suffix>",
    # where the suffix is a run of non-whitespace characters ending on a word
    # boundary. Every match is returned lower-cased.
    #
    # Returns an Array of Strings (empty when nothing matches).
    def self.extract(str)
      doi_pattern = %r{\b10\.\d{3,}/\S+\b}
      found = str.scan(doi_pattern)

      found.map { |doi| doi.downcase }
    end
  end
end
@@ -0,0 +1,7 @@
1
module Identifiers
  # Finds Handle System identifiers embedded in free text.
  class Handle
    # Scans +str+ for tokens made of a prefix of digits and dots, a "/", and
    # a suffix of non-whitespace characters ending on a word boundary.
    # Matches are returned exactly as they appear in +str+.
    #
    # Returns an Array of Strings (empty when nothing matches).
    def self.extract(str)
      handle_pattern = %r{\b[0-9.]+/\S+\b}i
      str.scan(handle_pattern)
    end
  end
end
@@ -0,0 +1,46 @@
1
module Identifiers
  # Extracts ISBNs from free text, validates their check digits and returns
  # them normalized to un-hyphenated 13-digit form.
  class ISBN
    REGEX_13 = /\b97[89]\d{10}\b/
    REGEX_10 = /\b\d{9}(?:\d|X)\b/

    # Returns every valid ISBN found in +str+ as a 13-digit String.
    # ISBN-13 matches come first, followed by ISBN-10 matches converted to
    # ISBN-13. Hyphens in +str+ are ignored.
    def self.extract(str)
      extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
    end

    # Returns the 13-digit ISBNs in +str+ whose check digit is correct.
    def self.extract_thirteen_digit_isbns(str)
      str.delete('-').scan(REGEX_13).select { |isbn| valid_isbn_13?(isbn) }
    end

    # Returns the valid 10-digit ISBNs in +str+, each converted to ISBN-13 by
    # dropping the ISBN-10 check character, prefixing "978" and appending a
    # freshly computed ISBN-13 check digit.
    def self.extract_ten_digit_isbns(str)
      valid_isbn_10s = str.delete('-').scan(REGEX_10).select { |isbn| valid_isbn_10?(isbn) }

      valid_isbn_10s.map do |isbn|
        stem = "978#{isbn[0, 9]}"
        "#{stem}#{isbn_13_check_digit(stem)}"
      end
    end

    # Computes the ISBN-13 check digit (an Integer in 0..9) for +isbn+, a
    # String holding the first 12 digits.
    #
    # The outer "% 10" matters: when the weighted sum is already a multiple
    # of ten the check digit is 0, not 10.
    #
    # Raises ArgumentError (from Integer()) if +isbn+ contains a non-digit.
    def self.isbn_13_check_digit(isbn)
      (10 - weighted_isbn_13_sum(isbn) % 10) % 10
    end

    # True when +isbn+ is exactly a 13-digit "978"/"979" ISBN whose weighted
    # digit sum is a multiple of 10. Any other String yields false.
    def self.valid_isbn_13?(isbn)
      # Anchored so that a longer string merely containing an ISBN is
      # rejected here instead of reaching Integer() with non-digit characters.
      return false unless isbn =~ /\A97[89]\d{10}\z/

      (weighted_isbn_13_sum(isbn) % 10).zero?
    end

    # True when +isbn+ is exactly nine digits followed by a digit or "X" and
    # the position-weighted sum (weights 1..10, "X" counting as 10) is a
    # multiple of 11. Any other String yields false.
    def self.valid_isbn_10?(isbn)
      return false unless isbn =~ /\A\d{9}[\dX]\z/

      total = isbn.each_char.with_index.reduce(0) do |sum, (char, index)|
        value = char == 'X' ? 10 : Integer(char)
        sum + value * (index + 1)
      end

      (total % 11).zero?
    end

    # Sums the digits of +digits+ using alternating weights 1, 3, 1, 3, ...
    # starting with weight 1 for the first character.
    def self.weighted_isbn_13_sum(digits)
      digits.each_char.with_index.reduce(0) do |sum, (char, index)|
        sum + Integer(char) * (index.even? ? 1 : 3)
      end
    end
    private_class_method :weighted_isbn_13_sum
  end
end
@@ -0,0 +1,7 @@
1
module Identifiers
  # Finds ClinicalTrials.gov style "NCT" identifiers embedded in free text.
  class NationalClinicalTrialId
    # Scans +str+ for whole words made of "NCT" (any letter case) followed by
    # one or more digits, and returns each match upper-cased.
    #
    # Returns an Array of Strings (empty when nothing matches).
    def self.extract(str)
      nct_pattern = /\bNCT\d+\b/i
      found = str.scan(nct_pattern)

      found.map { |trial_id| trial_id.upcase }
    end
  end
end
@@ -0,0 +1,7 @@
1
module Identifiers
  # Finds PubMed IDs embedded in free text.
  class PubmedId
    # Scans +str+ for runs of digits that stand alone: each run must begin at
    # a line start or after whitespace and end at a line end or before
    # whitespace, so digits embedded in other tokens (such as DOIs) are
    # skipped.
    #
    # Returns an Array of Strings (empty when nothing matches).
    def self.extract(str)
      standalone_number = /(?<=^|\s)\d+(?=$|\s)/
      str.scan(standalone_number)
    end
  end
end
@@ -0,0 +1,7 @@
1
module Identifiers
  # Finds RePEc identifiers embedded in free text.
  class RepecId
    # Scans +str+ for tokens starting with "repec:" (any letter case)
    # followed by non-whitespace characters ending on a word boundary. Each
    # match is returned with its prefix rewritten to the canonical "RePEc:"
    # spelling; the remainder is left untouched.
    #
    # Returns an Array of Strings (empty when nothing matches).
    def self.extract(str)
      repec_pattern = /\brepec:\S+\b/i

      str.scan(repec_pattern).map do |raw_id|
        # Everything after the first colon is the identifier body.
        _prefix, _colon, body = raw_id.partition(':')
        "RePEc:#{body}"
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ require 'urn'
2
+
3
+ module Identifiers
4
+ URN = ::URN
5
+ end
@@ -0,0 +1,19 @@
1
+ require 'identifiers/ads_bibcode'
2
+
3
RSpec.describe Identifiers::AdsBibcode do
  # ".extract" rather than "#extract": the method under test is a class method.
  describe '.extract' do
    it 'does extract a bibcode' do
      str = 'This is a Bibcode: 1974AJ.....79..819H'

      expect(described_class.extract(str)).to contain_exactly('1974AJ.....79..819H')
    end

    # The sample is a journal bibcode (not a thesis one); what it exercises is
    # a bibcode that mixes upper- and lower-case letters.
    it 'does extract a Bibcode containing lowercase letters' do
      expect(described_class.extract('2004PhRvL..93o0801M')).to contain_exactly('2004PhRvL..93o0801M')
    end

    it 'does not extract Bibcodes from DOIs' do
      expect(described_class.extract('10.1097/01.ASW.0000443266.17665.19')).to be_empty
    end
  end
end
@@ -0,0 +1,25 @@
1
+ require 'identifiers/arxiv_id'
2
+
3
RSpec.describe Identifiers::ArxivId do
  # ".extract" rather than "#extract": the method under test is a class method.
  describe '.extract' do
    it 'does extract a pre 2007 arXiv ID' do
      expect(described_class.extract('Example: math.GT/0309136')).to contain_exactly('math.GT/0309136')
    end

    it 'does extract a post 2007 unversioned arXiv ID' do
      expect(described_class.extract('Example: arXiv:0706.0001')).to contain_exactly('0706.0001')
    end

    it 'does extract a post 2007 versioned arXiv ID' do
      expect(described_class.extract('Example: arXiv:1501.00001v2')).to contain_exactly('1501.00001v2')
    end

    it 'does not extract IDs from DOIs that end in a valid arXiv ID' do
      expect(described_class.extract('10.1049/el.2013.3006')).to be_empty
    end

    it 'does not extract IDs from DOIs that contain a valid arXiv ID' do
      expect(described_class.extract('10.2310/7290.2014.00033')).to be_empty
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'identifiers/doi'
2
+
3
RSpec.describe Identifiers::DOI do
  it 'extracts DOIs from a string' do
    extracted = described_class.extract('This is an example of DOI: 10.1049/el.2013.3006')

    expect(extracted).to contain_exactly('10.1049/el.2013.3006')
  end

  it 'downcase the DOIs extracted' do
    extracted = described_class.extract('This is an example of DOI: 10.1097/01.ASW.0000443266.17665.19')

    expect(extracted).to contain_exactly('10.1097/01.asw.0000443266.17665.19')
  end

  it 'does not extract a PUBMED ID' do
    extracted = described_class.extract('This is NOT a DOI: 123456')

    expect(extracted).to be_empty
  end
end
@@ -0,0 +1,15 @@
1
+ require 'identifiers/handle'
2
+
3
RSpec.describe Identifiers::Handle do
  it 'extracts a Handle' do
    handles = described_class.extract('http://hdl.handle.net/10149/596901')

    expect(handles).to contain_exactly('10149/596901')
  end

  it 'extracts another Handle' do
    handles = described_class.extract('http://hdl.handle.net/2117/83545it.ly/1UtXnTW')

    expect(handles).to contain_exactly('2117/83545it.ly/1UtXnTW')
  end
end
@@ -0,0 +1,26 @@
1
+ require 'identifiers/isbn'
2
+
3
RSpec.describe Identifiers::ISBN do
  it 'extracts a ISBN' do
    isbns = described_class.extract('ISBN: 9780805069099')

    expect(isbns).to contain_exactly('9780805069099')
  end

  it 'normalizes 13-digit ISBNs' do
    isbns = described_class.extract("978-0-80-506909-9\n978-0-67-187919-8")

    expect(isbns).to contain_exactly('9780805069099', '9780671879198')
  end

  it 'normalizes 10-digit ISBNs' do
    isbns = described_class.extract("0-8050-6909-7 \n 2-7594-0269-X")

    expect(isbns).to contain_exactly('9780805069099', '9782759402694')
  end

  it 'does not extract invalid 13-digit ISBNs' do
    isbns = described_class.extract('9783319217280')

    expect(isbns).to be_empty
  end

  it 'does not extract invalid 10-digit ISBNs' do
    isbns = described_class.extract('3319217280')

    expect(isbns).to be_empty
  end
end
@@ -0,0 +1,11 @@
1
+ require 'identifiers/national_clinical_trial_id'
2
+
3
RSpec.describe Identifiers::NationalClinicalTrialId do
  it 'extract NCT IDs' do
    trial_ids = described_class.extract("NCT00000106\nNCT00000107")

    expect(trial_ids).to contain_exactly('NCT00000106', 'NCT00000107')
  end

  it 'normalizes NCT IDs' do
    trial_ids = described_class.extract("nct00000106\nnCt00000107")

    expect(trial_ids).to contain_exactly('NCT00000106', 'NCT00000107')
  end
end
@@ -0,0 +1,13 @@
1
+ require 'identifiers/pubmed_id'
2
+
3
RSpec.describe Identifiers::PubmedId do
  it 'extracts PubMed IDs' do
    pubmed_ids = described_class.extract("123\n456")

    expect(pubmed_ids).to contain_exactly('123', '456')
  end

  it 'does not return outputs with PubMed IDs in DOIs' do
    pubmed_ids = described_class.extract("10.1038/nplants.2015.3\n10.1126/science.286.5445.1679e")

    expect(pubmed_ids).to be_empty
  end
end
@@ -0,0 +1,15 @@
1
+ require 'identifiers/repec_id'
2
+
3
RSpec.describe Identifiers::RepecId do
  it 'extracts RePEc IDs' do
    repec_ids = described_class.extract("RePEc:wbk:wbpubs:2266\nRePEc:inn:wpaper:2016-03")

    expect(repec_ids).to contain_exactly('RePEc:wbk:wbpubs:2266', 'RePEc:inn:wpaper:2016-03')
  end

  it 'normalizes RePec IDs' do
    repec_ids = described_class.extract("REPEC:wbk:wbpubs:2266\nrepec:inn:wpaper:2016-03")

    expect(repec_ids).to contain_exactly('RePEc:wbk:wbpubs:2266', 'RePEc:inn:wpaper:2016-03')
  end
end
@@ -0,0 +1,9 @@
1
+ require 'identifiers/urn'
2
+
3
RSpec.describe Identifiers::URN do
  it 'extracts URNs' do
    # The sample ends in ".\" so the second URN is followed by a literal
    # backslash.
    text = 'En un pueblo italiano urn:1234:abc al pie de la montaña URN:foo:bar%23.\\'
    urns = described_class.extract(text)

    expect(urns).to contain_exactly('urn:1234:abc', 'URN:foo:bar%23.')
  end
end
@@ -0,0 +1,18 @@
1
# Shared RSpec configuration for the spec suite.
RSpec.configure do |config|
  # Example selection: filter on the :focus tag, falling back to the whole
  # suite when the filter leaves nothing to run.
  config.filter_run :focus
  config.run_all_when_everything_filtered = true

  # File in which RSpec records per-example status between runs.
  config.example_status_persistence_file_path = "spec/examples.txt"

  config.disable_monkey_patching!
  config.warnings = true
  config.order = :random

  # Use the documentation formatter when exactly one spec file is being run.
  if config.files_to_run.one?
    config.default_formatter = 'doc'
  end

  # Seed Ruby's global RNG from RSpec's seed.
  Kernel.srand config.seed

  config.expect_with(:rspec) do |expectations|
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  config.mock_with(:rspec) do |mocks|
    mocks.verify_partial_doubles = true
  end
end
metadata ADDED
@@ -0,0 +1,133 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: identifiers
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jonathan Hernandez
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-10-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: urn
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.10'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.10'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.4'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.4'
69
+ description:
70
+ email:
71
+ - support@altmetric.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - CHANGELOG.md
77
+ - LICENSE.txt
78
+ - README.md
79
+ - lib/identifiers.rb
80
+ - lib/identifiers/ads_bibcode.rb
81
+ - lib/identifiers/arxiv_id.rb
82
+ - lib/identifiers/doi.rb
83
+ - lib/identifiers/handle.rb
84
+ - lib/identifiers/isbn.rb
85
+ - lib/identifiers/national_clinical_trial_id.rb
86
+ - lib/identifiers/pubmed_id.rb
87
+ - lib/identifiers/repec_id.rb
88
+ - lib/identifiers/urn.rb
89
+ - spec/identifiers/ads_bibcode_spec.rb
90
+ - spec/identifiers/arxiv_id_spec.rb
91
+ - spec/identifiers/doi_spec.rb
92
+ - spec/identifiers/handle_spec.rb
93
+ - spec/identifiers/isbn_spec.rb
94
+ - spec/identifiers/national_clinical_trial_id_spec.rb
95
+ - spec/identifiers/pubmed_id_spec.rb
96
+ - spec/identifiers/repec_id_spec.rb
97
+ - spec/identifiers/urn_spec.rb
98
+ - spec/spec_helper.rb
99
+ homepage: https://github.com/altmetric/identifiers
100
+ licenses:
101
+ - MIT
102
+ metadata: {}
103
+ post_install_message:
104
+ rdoc_options: []
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 2.5.1
120
+ signing_key:
121
+ specification_version: 4
122
+ summary: Utilities library for various scholarly identifiers used by Altmetric
123
+ test_files:
124
+ - spec/identifiers/ads_bibcode_spec.rb
125
+ - spec/identifiers/arxiv_id_spec.rb
126
+ - spec/identifiers/doi_spec.rb
127
+ - spec/identifiers/handle_spec.rb
128
+ - spec/identifiers/isbn_spec.rb
129
+ - spec/identifiers/national_clinical_trial_id_spec.rb
130
+ - spec/identifiers/pubmed_id_spec.rb
131
+ - spec/identifiers/repec_id_spec.rb
132
+ - spec/identifiers/urn_spec.rb
133
+ - spec/spec_helper.rb