identifiers 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 64dea49b2835751962c267670fdef1c57696eceb
4
+ data.tar.gz: 5d4bfc54e1a55194e2286c80bd24cb7fa255aeb3
5
+ SHA512:
6
+ metadata.gz: b0dd1c0a6b57d8e3a4066487e79b8fda6552c2e443ae58869d1271abe8ed526ba8aaf3a4a9a1ec833df547934d4f4fcbce5e61858956b035af69bd868b95b23d
7
+ data.tar.gz: 93b41b272316d5cfe8ce134f9f2e09714c7723ed33ab1186e6b13d17409ca3fd00574d5bf26a0f68ed536e2a4db7db68536f983a191f9a3d193038650262d31b
data/CHANGELOG.md ADDED
@@ -0,0 +1,7 @@
1
+ # Change Log
2
+ All notable changes to this project will be documented in this file. This
3
+ project adheres to [Semantic Versioning](http://semver.org/).
4
+
5
+ ## [0.1.0] - 2016-10-21
6
+ ### Added
7
+ - Initial release
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Altmetric LLP
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,47 @@
1
+ # Identifiers
2
+
3
+ Collection of utilities related to the extraction, validation and normalization of various scholarly identifiers. The supported list is:
4
+ - [ADS Bibcodes](http://adsdoc.harvard.edu/abs_doc/help_pages/bibcodes.html)
5
+ - [arXiv](https://arxiv.org/help/arxiv_identifier)
6
+ - [DOI](https://www.doi.org/)
7
+ - [Handle](https://en.wikipedia.org/wiki/Handle_System)
8
+ - [ISBN](https://en.wikipedia.org/wiki/International_Standard_Book_Number)
9
+ - [National Clinical Trials](https://clinicaltrials.gov/)
10
+ - [PubMed](http://www.ncbi.nlm.nih.gov/pubmed)
11
+ - [RePEc](https://en.wikipedia.org/wiki/Research_Papers_in_Economics)
12
+ - [URN](https://en.wikipedia.org/wiki/Uniform_Resource_Name)
13
+
14
+ ## Installation
15
+
16
+ Add this line to your application's Gemfile:
17
+
18
+ ```ruby
19
+ gem 'identifiers', '~> 0.1'
20
+ ```
21
+
22
+ And then execute:
23
+
24
+ $ bundle
25
+
26
+ Or install it yourself as:
27
+
28
+ $ gem install identifiers
29
+
30
+ ## Usage
31
+
32
+ ```ruby
33
+ Identifiers::DOI.extract('example: 10.123/abcd.efghi')
34
+
35
+ Identifiers::URN.new('urn:abc:123')
36
+ Identifiers::URN('urn:abc:123')
37
+ ```
38
+
39
+ ## Contributing
40
+
41
+ Bug reports and pull requests are welcome on GitHub at https://github.com/altmetric/identifiers.
42
+
43
+ ## License
44
+
45
+ Copyright © 2016 Altmetric LLP
46
+
47
+ Distributed under the [MIT License](http://opensource.org/licenses/MIT).
@@ -0,0 +1,12 @@
1
+ require 'identifiers/ads_bibcode'
2
+ require 'identifiers/arxiv_id'
3
+ require 'identifiers/doi'
4
+ require 'identifiers/handle'
5
+ require 'identifiers/isbn'
6
+ require 'identifiers/national_clinical_trial_id'
7
+ require 'identifiers/pubmed_id'
8
+ require 'identifiers/repec_id'
9
+ require 'identifiers/urn'
10
+
11
+ module Identifiers
12
+ end
@@ -0,0 +1,7 @@
1
module Identifiers
  # Locates ADS Bibcode-shaped tokens inside free text.
  class AdsBibcode
    class << self
      # Scans +str+ for every token made of four digits, one letter and then
      # fourteen characters drawn from digits, letters, "&" and ".", delimited
      # by word boundaries. Letter matching is case-insensitive.
      #
      # @param str [String] text to search
      # @return [Array<String>] matches in order of appearance, unmodified
      def extract(str)
        bibcode_pattern = /\b\d{4}[a-z][0-9a-z&.]{14}\b/i

        str.scan(bibcode_pattern)
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module Identifiers
  # Locates arXiv identifiers inside free text. Two identifier shapes are
  # recognised: the older "archive(.XX)/YYMMNNN" form and the newer
  # "YYMM.NNNN(N)" form, each optionally followed by a "vN" version suffix.
  class ArxivId
    class << self
      # Returns every old-style match followed by every new-style match.
      #
      # @param str [String] text to search
      # @return [Array<String>] identifiers without any leading "arXiv:" prefix
      def extract(str)
        old_style = extract_pre_2007_arxiv_ids(str)
        new_style = extract_post_2007_arxiv_ids(str)

        old_style + new_style
      end

      # Matches "YYMM.NNNN" / "YYMM.NNNNN" identifiers. A match must be
      # preceded by a line start, whitespace or "/" and followed by a line end
      # or whitespace, which keeps digits embedded in longer tokens out.
      #
      # @param str [String] text to search
      # @return [Array<String>] identifiers with the "arXiv:" prefix stripped
      def extract_post_2007_arxiv_ids(str)
        candidates = str.scan(%r{(?<=^|\s|/)(?:arXiv:)?\d{4}\.\d{4,5}(?:v\d+)?(?=$|\s)}i)

        candidates.map { |candidate| candidate.sub(/\AarXiv:/i, '') }
      end

      # Matches "archive/YYMMNNN" identifiers (optionally with a two-letter
      # subject class such as ".GT"), where MM is restricted to 01-12. The
      # same surrounding-delimiter rules as the newer form apply.
      #
      # @param str [String] text to search
      # @return [Array<String>] identifiers with the "arXiv:" prefix stripped
      def extract_pre_2007_arxiv_ids(str)
        candidates = str.scan(%r{(?<=^|\s|/)(?:arXiv:)?[a-z-]+(?:\.[A-Z]{2})?/\d{2}(?:0[1-9]|1[012])\d{3}(?:v\d+)?(?=$|\s)}i)

        candidates.map { |candidate| candidate.sub(/\AarXiv:/i, '') }
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module Identifiers
  # Locates DOI-shaped tokens inside free text.
  class DOI
    class << self
      # Finds every token of the form "10.", at least three digits, "/" and a
      # run of non-whitespace characters that ends on a word boundary (so
      # trailing punctuation is left out), then lowercases each match.
      #
      # @param str [String] text to search
      # @return [Array<String>] lowercased DOIs in order of appearance
      def extract(str)
        found = str.scan(%r{\b10\.\d{3,}/\S+\b})

        found.map { |doi| doi.downcase }
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module Identifiers
  # Locates Handle-shaped tokens ("prefix/suffix") inside free text.
  class Handle
    class << self
      # Finds every token made of a run of digits and dots, a "/" and a run of
      # non-whitespace characters that ends on a word boundary.
      #
      # @param str [String] text to search
      # @return [Array<String>] matches in order of appearance, unmodified
      def extract(str)
        handle_pattern = %r{\b[0-9.]+/\S+\b}i

        str.scan(handle_pattern)
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
module Identifiers
  # Extracts ISBNs from free text. Hyphens are stripped before matching,
  # checksums are verified, and ISBN-10s are converted to their ISBN-13
  # ("978"-prefixed) equivalent so that every result is a 13-digit string.
  class ISBN
    REGEX_13 = /\b97[89]\d{10}\b/
    REGEX_10 = /\b\d{9}(?:\d|X)\b/

    # @param str [String] text to search
    # @return [Array<String>] valid ISBN-13s found directly, followed by the
    #   ISBN-13 form of every valid ISBN-10
    def self.extract(str)
      extract_thirteen_digit_isbns(str) + extract_ten_digit_isbns(str)
    end

    # @param str [String] text to search
    # @return [Array<String>] 13-digit candidates whose checksum is valid
    def self.extract_thirteen_digit_isbns(str)
      str.tr('-', '').scan(REGEX_13).select { |isbn| valid_isbn_13?(isbn) }
    end

    # @param str [String] text to search
    # @return [Array<String>] ISBN-13 equivalents of every valid ISBN-10
    def self.extract_ten_digit_isbns(str)
      str.tr('-', '').scan(REGEX_10).select { |isbn| valid_isbn_10?(isbn) }.map do |isbn|
        # Drop the ISBN-10 check character, add the 978 prefix, then append a
        # freshly computed ISBN-13 check digit.
        prefixed = "978#{isbn[0, 9]}"

        "#{prefixed}#{isbn_13_check_digit(prefixed)}"
      end
    end

    # Computes the ISBN-13 check digit for the first twelve digits.
    #
    # The trailing "% 10" matters: when the weighted sum is already a
    # multiple of ten the check digit is 0, not 10.
    #
    # @param isbn [String] the twelve leading digits
    # @return [Integer] a single digit, 0-9
    # @raise [ArgumentError] if +isbn+ contains a non-digit character
    def self.isbn_13_check_digit(isbn)
      (10 - isbn_13_weighted_sum(isbn) % 10) % 10
    end

    # @param isbn [String] candidate 13-digit ISBN
    # @return [Boolean] whether it matches REGEX_13 and its weighted digit sum
    #   is a multiple of ten
    def self.valid_isbn_13?(isbn)
      return false unless isbn =~ REGEX_13

      (isbn_13_weighted_sum(isbn) % 10).zero?
    end

    # @param isbn [String] candidate 10-character ISBN (last character may be "X")
    # @return [Boolean] whether it matches REGEX_10 and the sum of each value
    #   multiplied by its 1-based position is a multiple of eleven
    def self.valid_isbn_10?(isbn)
      return false unless isbn =~ REGEX_10

      total = isbn.each_char.with_index.reduce(0) do |sum, (char, index)|
        # "X" stands for the value ten.
        value = char == 'X' ? 10 : Integer(char)

        sum + value * (index + 1)
      end

      (total % 11).zero?
    end

    # Sums the digits of +isbn+ using alternating weights 1, 3, 1, 3, ...
    def self.isbn_13_weighted_sum(isbn)
      isbn.each_char.zip([1, 3].cycle).reduce(0) do |sum, (digit, weight)|
        sum + Integer(digit) * weight
      end
    end
    private_class_method :isbn_13_weighted_sum
  end
end
@@ -0,0 +1,7 @@
1
module Identifiers
  # Locates NCT-style trial identifiers inside free text.
  class NationalClinicalTrialId
    class << self
      # Finds every whole word consisting of "NCT" (any letter case) followed
      # by one or more digits, and returns each match uppercased.
      #
      # @param str [String] text to search
      # @return [Array<String>] uppercased identifiers in order of appearance
      def extract(str)
        found = str.scan(/\bNCT\d+\b/i)

        found.map { |trial_id| trial_id.upcase }
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module Identifiers
  # Locates bare numeric tokens, treated as PubMed IDs, inside free text.
  class PubmedId
    class << self
      # Finds every run of digits that stands alone: it must be preceded by a
      # line start or whitespace and followed by a line end or whitespace, so
      # digits embedded in longer tokens are ignored.
      #
      # @param str [String] text to search
      # @return [Array<String>] digit strings in order of appearance
      def extract(str)
        standalone_number = /(?<=^|\s)\d+(?=$|\s)/

        str.scan(standalone_number)
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module Identifiers
  # Locates RePEc handles inside free text.
  class RepecId
    class << self
      # Finds every token starting with "repec:" (any letter case) followed by
      # non-whitespace characters, and rewrites its prefix to the canonical
      # "RePEc:" spelling. The remainder of the handle is left untouched.
      #
      # @param str [String] text to search
      # @return [Array<String>] normalized handles in order of appearance
      def extract(str)
        str.scan(/\brepec:\S+\b/i).map do |handle|
          # Split once only: the part after the first colon may contain colons.
          _prefix, remainder = handle.split(':', 2)

          "RePEc:#{remainder}"
        end
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ require 'urn'
2
+
3
+ module Identifiers
+ # Alias the top-level URN constant into this namespace so that it is also
+ # reachable as Identifiers::URN.
4
+ URN = ::URN
5
+ end
@@ -0,0 +1,19 @@
1
+ require 'identifiers/ads_bibcode'
2
+
3
RSpec.describe Identifiers::AdsBibcode do
  # `extract` is called on the class itself, so the group uses the "." prefix.
  describe '.extract' do
    it 'does extract a bibcode' do
      str = 'This is a Bibcode: 1974AJ.....79..819H'

      expect(described_class.extract(str)).to contain_exactly('1974AJ.....79..819H')
    end

    # The sample is a journal bibcode with a letter ("o") directly before the
    # page digits, not a thesis bibcode.
    it 'does extract a Bibcode with a letter before the page number' do
      expect(described_class.extract('2004PhRvL..93o0801M')).to contain_exactly('2004PhRvL..93o0801M')
    end

    it 'does not extract Bibcodes from DOIs' do
      expect(described_class.extract('10.1097/01.ASW.0000443266.17665.19')).to be_empty
    end
  end
end
@@ -0,0 +1,25 @@
1
+ require 'identifiers/arxiv_id'
2
+
3
RSpec.describe Identifiers::ArxivId do
  # `extract` is called on the class itself, so the group uses the "." prefix.
  describe '.extract' do
    it 'does extract a pre 2007 arXiv ID' do
      expect(described_class.extract('Example: math.GT/0309136')).to contain_exactly('math.GT/0309136')
    end

    it 'does extract a post 2007 unversioned arXiv ID' do
      expect(described_class.extract('Example: arXiv:0706.0001')).to contain_exactly('0706.0001')
    end

    it 'does extract a post 2007 versioned arXiv ID' do
      expect(described_class.extract('Example: arXiv:1501.00001v2')).to contain_exactly('1501.00001v2')
    end

    it 'does not extract IDs from DOIs that end in a valid arXiv ID' do
      expect(described_class.extract('10.1049/el.2013.3006')).to be_empty
    end

    it 'does not extract IDs from DOIs that contain a valid arXiv ID' do
      expect(described_class.extract('10.2310/7290.2014.00033')).to be_empty
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'identifiers/doi'
2
+
3
RSpec.describe Identifiers::DOI do
  it 'extracts DOIs from a string' do
    str = 'This is an example of DOI: 10.1049/el.2013.3006'

    expect(described_class.extract(str)).to contain_exactly('10.1049/el.2013.3006')
  end

  it 'downcases the extracted DOIs' do
    str = 'This is an example of DOI: 10.1097/01.ASW.0000443266.17665.19'

    expect(described_class.extract(str)).to contain_exactly('10.1097/01.asw.0000443266.17665.19')
  end

  it 'does not extract a PUBMED ID' do
    str = 'This is NOT a DOI: 123456'

    expect(described_class.extract(str)).to be_empty
  end
end
@@ -0,0 +1,15 @@
1
+ require 'identifiers/handle'
2
+
3
RSpec.describe Identifiers::Handle do
  it 'extracts a Handle' do
    extracted = described_class.extract('http://hdl.handle.net/10149/596901')

    expect(extracted).to contain_exactly('10149/596901')
  end

  # The suffix may itself contain dots and further slashes.
  it 'extracts another Handle' do
    extracted = described_class.extract('http://hdl.handle.net/2117/83545it.ly/1UtXnTW')

    expect(extracted).to contain_exactly('2117/83545it.ly/1UtXnTW')
  end
end
@@ -0,0 +1,26 @@
1
+ require 'identifiers/isbn'
2
+
3
RSpec.describe Identifiers::ISBN do
  it 'extracts an ISBN' do
    expect(described_class.extract('ISBN: 9780805069099')).to contain_exactly('9780805069099')
  end

  it 'normalizes 13-digit ISBNs' do
    str = "978-0-80-506909-9\n978-0-67-187919-8"

    expect(described_class.extract(str)).to contain_exactly('9780805069099', '9780671879198')
  end

  # ISBN-10s are expected back in their 13-digit form.
  it 'normalizes 10-digit ISBNs' do
    str = "0-8050-6909-7 \n 2-7594-0269-X"

    expect(described_class.extract(str)).to contain_exactly('9780805069099', '9782759402694')
  end

  it 'does not extract invalid 13-digit ISBNs' do
    expect(described_class.extract('9783319217280')).to be_empty
  end

  it 'does not extract invalid 10-digit ISBNs' do
    expect(described_class.extract('3319217280')).to be_empty
  end
end
@@ -0,0 +1,11 @@
1
+ require 'identifiers/national_clinical_trial_id'
2
+
3
RSpec.describe Identifiers::NationalClinicalTrialId do
  it 'extracts NCT IDs' do
    expect(described_class.extract("NCT00000106\nNCT00000107")).to contain_exactly('NCT00000106', 'NCT00000107')
  end

  # Mixed-case input is expected back fully uppercased.
  it 'normalizes NCT IDs' do
    expect(described_class.extract("nct00000106\nnCt00000107")).to contain_exactly('NCT00000106', 'NCT00000107')
  end
end
@@ -0,0 +1,13 @@
1
+ require 'identifiers/pubmed_id'
2
+
3
RSpec.describe Identifiers::PubmedId do
  it 'extracts PubMed IDs' do
    extracted = described_class.extract("123\n456")

    expect(extracted).to contain_exactly('123', '456')
  end

  # Digit runs that are part of a longer token (here, DOIs) must be ignored.
  it 'does not return outputs with PubMed IDs in DOIs' do
    extracted = described_class.extract("10.1038/nplants.2015.3\n10.1126/science.286.5445.1679e")

    expect(extracted).to be_empty
  end
end
@@ -0,0 +1,15 @@
1
+ require 'identifiers/repec_id'
2
+
3
RSpec.describe Identifiers::RepecId do
  it 'extracts RePEc IDs' do
    str = "RePEc:wbk:wbpubs:2266\nRePEc:inn:wpaper:2016-03"

    expect(described_class.extract(str)).to contain_exactly('RePEc:wbk:wbpubs:2266', 'RePEc:inn:wpaper:2016-03')
  end

  # Any letter case of the prefix is expected back as "RePEc:".
  it 'normalizes RePEc IDs' do
    str = "REPEC:wbk:wbpubs:2266\nrepec:inn:wpaper:2016-03"

    expect(described_class.extract(str)).to contain_exactly('RePEc:wbk:wbpubs:2266', 'RePEc:inn:wpaper:2016-03')
  end
end
@@ -0,0 +1,9 @@
1
+ require 'identifiers/urn'
2
+
3
RSpec.describe Identifiers::URN do
  # The sample text ends with ".\" after the second URN; the expected result
  # keeps the dot and drops the backslash.
  it 'extracts URNs' do
    extracted = described_class.extract(
      'En un pueblo italiano urn:1234:abc al pie de la montaña URN:foo:bar%23.\\'
    )

    expect(extracted).to contain_exactly('urn:1234:abc', 'URN:foo:bar%23.')
  end
end
@@ -0,0 +1,18 @@
1
# Suite-wide RSpec settings.
RSpec.configure do |rspec|
  # Restrict the run to examples tagged :focus; with the setting below the
  # whole suite runs when that filter leaves nothing.
  rspec.filter_run(:focus)
  rspec.run_all_when_everything_filtered = true

  rspec.example_status_persistence_file_path = "spec/examples.txt"
  rspec.disable_monkey_patching!
  rspec.warnings = true
  rspec.order = :random

  # Use the documentation formatter by default when exactly one file is run.
  if rspec.files_to_run.one?
    rspec.default_formatter = 'doc'
  end

  # Seed Ruby's global random number generator from the RSpec seed.
  Kernel.srand(rspec.seed)

  rspec.expect_with(:rspec) do |expectations|
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  rspec.mock_with(:rspec) do |mocks|
    mocks.verify_partial_doubles = true
  end
end
metadata ADDED
@@ -0,0 +1,133 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: identifiers
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jonathan Hernandez
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-10-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: urn
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.10'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.10'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.4'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.4'
69
+ description:
70
+ email:
71
+ - support@altmetric.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - CHANGELOG.md
77
+ - LICENSE.txt
78
+ - README.md
79
+ - lib/identifiers.rb
80
+ - lib/identifiers/ads_bibcode.rb
81
+ - lib/identifiers/arxiv_id.rb
82
+ - lib/identifiers/doi.rb
83
+ - lib/identifiers/handle.rb
84
+ - lib/identifiers/isbn.rb
85
+ - lib/identifiers/national_clinical_trial_id.rb
86
+ - lib/identifiers/pubmed_id.rb
87
+ - lib/identifiers/repec_id.rb
88
+ - lib/identifiers/urn.rb
89
+ - spec/identifiers/ads_bibcode_spec.rb
90
+ - spec/identifiers/arxiv_id_spec.rb
91
+ - spec/identifiers/doi_spec.rb
92
+ - spec/identifiers/handle_spec.rb
93
+ - spec/identifiers/isbn_spec.rb
94
+ - spec/identifiers/national_clinical_trial_id_spec.rb
95
+ - spec/identifiers/pubmed_id_spec.rb
96
+ - spec/identifiers/repec_id_spec.rb
97
+ - spec/identifiers/urn_spec.rb
98
+ - spec/spec_helper.rb
99
+ homepage: https://github.com/altmetric/identifiers
100
+ licenses:
101
+ - MIT
102
+ metadata: {}
103
+ post_install_message:
104
+ rdoc_options: []
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 2.5.1
120
+ signing_key:
121
+ specification_version: 4
122
+ summary: Utilities library for various scholarly identifiers used by Altmetric
123
+ test_files:
124
+ - spec/identifiers/ads_bibcode_spec.rb
125
+ - spec/identifiers/arxiv_id_spec.rb
126
+ - spec/identifiers/doi_spec.rb
127
+ - spec/identifiers/handle_spec.rb
128
+ - spec/identifiers/isbn_spec.rb
129
+ - spec/identifiers/national_clinical_trial_id_spec.rb
130
+ - spec/identifiers/pubmed_id_spec.rb
131
+ - spec/identifiers/repec_id_spec.rb
132
+ - spec/identifiers/urn_spec.rb
133
+ - spec/spec_helper.rb