bolognese 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +56 -0
  3. data/.travis.yml +23 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +118 -0
  6. data/LICENSE +21 -0
  7. data/README.md +3 -0
  8. data/Rakefile +12 -0
  9. data/bin/bolognese +5 -0
  10. data/bolognese.gemspec +40 -0
  11. data/lib/bolognese.rb +12 -0
  12. data/lib/bolognese/author_utils.rb +61 -0
  13. data/lib/bolognese/cli.rb +38 -0
  14. data/lib/bolognese/crossref.rb +202 -0
  15. data/lib/bolognese/datacite.rb +157 -0
  16. data/lib/bolognese/date_utils.rb +48 -0
  17. data/lib/bolognese/doi_utils.rb +48 -0
  18. data/lib/bolognese/github.rb +106 -0
  19. data/lib/bolognese/metadata.rb +30 -0
  20. data/lib/bolognese/orcid.rb +24 -0
  21. data/lib/bolognese/pid_utils.rb +23 -0
  22. data/lib/bolognese/pubmed.rb +34 -0
  23. data/lib/bolognese/string.rb +5 -0
  24. data/lib/bolognese/utils.rb +27 -0
  25. data/lib/bolognese/version.rb +3 -0
  26. data/spec/cli_spec.rb +37 -0
  27. data/spec/crossref_spec.rb +113 -0
  28. data/spec/datacite_spec.rb +49 -0
  29. data/spec/doi_spec.rb +89 -0
  30. data/spec/fixtures/crossref.xml +742 -0
  31. data/spec/fixtures/datacite.xml +40 -0
  32. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/read/crossref/as_crossref.yml +760 -0
  33. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/read/crossref/as_schema_org.yml +1476 -0
  34. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/read/datacite/as_datacite.yml +214 -0
  35. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/read/datacite/as_schema_org.yml +384 -0
  36. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/doi_registration_agency/crossref.yml +44 -0
  37. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/doi_registration_agency/datacite.yml +44 -0
  38. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/doi_registration_agency/medra.yml +44 -0
  39. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/doi_registration_agency/not_found.yml +44 -0
  40. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/DOI_test.yml +843 -0
  41. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/DOI_with_SICI_DOI.yml +277 -0
  42. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/DOI_with_data_citation.yml +15755 -0
  43. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/date_in_future.yml +2691 -0
  44. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/journal_article.yml +1857 -0
  45. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/not_found_error.yml +93 -0
  46. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/posted_content.yml +5715 -0
  47. data/spec/fixtures/vcr_cassettes/Bolognese_Datacite/get_metadata/BlogPosting.yml +307 -0
  48. data/spec/fixtures/vcr_cassettes/Bolognese_Datacite/get_metadata/Dataset.yml +343 -0
  49. data/spec/fixtures/vcr_cassettes/Bolognese_Metadata/find_PID_provider/crossref.yml +44 -0
  50. data/spec/fixtures/vcr_cassettes/Bolognese_Metadata/find_PID_provider/crossref_doi_not_url.yml +44 -0
  51. data/spec/fixtures/vcr_cassettes/Bolognese_Metadata/find_PID_provider/datacite.yml +44 -0
  52. data/spec/fixtures/vcr_cassettes/Bolognese_Metadata/find_PID_provider/datacite_doi_http.yml +44 -0
  53. data/spec/fixtures/vcr_cassettes/Bolognese_Metadata/find_PID_provider/orcid.yml +44 -0
  54. data/spec/metadata_spec.rb +35 -0
  55. data/spec/orcid_spec.rb +23 -0
  56. data/spec/spec_helper.rb +88 -0
  57. metadata +419 -0
@@ -0,0 +1,157 @@
1
+ require_relative 'doi_utils'
2
+ require_relative 'utils'
3
+
4
module Bolognese
  # Metadata reader for DOIs registered with DataCite.
  #
  # Requests the DOI URL with the DataCite XML media type (through Maremma),
  # parses the XML and exposes individual fields under schema.org names.
  # #as_schema_org assembles those fields into a single Hash.
  class Datacite < Metadata
    include Bolognese::DoiUtils
    include Bolognese::Utils

    # DataCite resourceTypeGeneral => schema.org @type.
    # Entries mapped to nil have no dedicated schema.org type; #type falls
    # back to "CreativeWork" for them.
    DATACITE_TYPE_TRANSLATIONS = {
      "Audiovisual" => "VideoObject",
      "Collection" => "Collection",
      "Dataset" => "Dataset",
      "Event" => "Event",
      "Image" => "ImageObject",
      "InteractiveResource" => nil,
      "Model" => nil,
      "PhysicalObject" => nil,
      "Service" => "Service",
      "Software" => "SoftwareSourceCode",
      "Sound" => "AudioObject",
      "Text" => "ScholarlyArticle",
      "Workflow" => nil,
      "Other" => "CreativeWork"
    }.freeze

    # The original line read `attr_reader = :id, :metadata, :schema_org`,
    # which only assigned a local variable. `metadata` has an explicit
    # method below and no @schema_org is ever set, so only :id is declared.
    attr_reader :id

    # @param doi [String] DOI in any form accepted by normalize_doi
    def initialize(doi)
      @id = normalize_doi(doi)
    end

    # Raw DataCite XML for the DOI, fetched once and then reused.
    #
    # @return [String, nil] value of the "data" key of the response body,
    #   or nil when that key is absent
    def raw
      @raw ||= begin
        response = Maremma.get(id, accept: "application/vnd.datacite.datacite+xml", raw: true)
        response.body.fetch("data", nil)
      end
    end

    # Parsed content of the XML <resource> element.
    #
    # @return [Hash] empty when no XML could be fetched
    def metadata
      @metadata ||= raw.present? ? Maremma.from_xml(raw).fetch("resource", {}) : {}
    end

    # @return [Boolean] true when any metadata was found for the DOI
    def exists?
      metadata.present?
    end

    # schema.org type derived from resourceTypeGeneral.
    #
    # @return [String] "CreativeWork" when the type is missing, unknown or
    #   mapped to nil in DATACITE_TYPE_TRANSLATIONS
    def type
      k = metadata.dig("resourceType", "resourceTypeGeneral")
      # NOTE(review): dasherize is presumably the ActiveSupport String
      # extension; none of the table keys contain underscores.
      DATACITE_TYPE_TRANSLATIONS[k.to_s.dasherize] || "CreativeWork"
    end

    # Free-text resource type, falling back to resourceTypeGeneral.
    #
    # @return [String, nil]
    def additional_type
      resource_type = metadata.fetch("resourceType", {})
      resource_type.fetch("text", nil) || resource_type.fetch("resourceTypeGeneral", nil)
    end

    # @return [Object, nil] content of titles/title
    def name
      metadata.dig("titles", "title")
    end

    # NOTE(review): this and the two methods below assume a single child
    # element parsed into a Hash; #author and #dates suggest repeated
    # elements arrive as an Array, on which this dig would raise — confirm.
    #
    # @return [String, nil]
    def alternate_name
      metadata.dig("alternateIdentifiers", "alternateIdentifier", "text")
    end

    # @return [String, nil]
    def description
      metadata.dig("descriptions", "description", "text")
    end

    # @return [String, nil] rightsURI of the rights statement
    def license
      metadata.dig("rightsList", "rights", "rightsURI")
    end

    # @return [String] subjects joined with ", " ("" when there are none)
    def keywords
      Array(metadata.dig("subjects", "subject")).join(", ")
    end

    # Creators converted by get_authors (defined outside this class).
    def author
      authors = metadata.dig("creators", "creator")
      # a single creator is a Hash; wrap it so get_authors gets a list
      authors = [authors] if authors.is_a?(Hash)
      get_authors(authors)
    end

    # @return [String, nil]
    def version
      metadata.fetch("version", nil)
    end

    # @return [Array] all dates/date entries (empty when there are none)
    def dates
      Array(metadata.dig("dates", "date"))
    end

    # @return [String, nil] text of the date with dateType "Created"
    def date_created
      created = dates.find { |d| d["dateType"] == "Created" } || {}
      created.fetch("text", nil)
    end

    # Text of the date with dateType "Issued", else publicationYear.
    #
    # @return [String, nil] nil when neither is present (the original
    #   raised KeyError for a missing publicationYear, which made
    #   #as_schema_org fail on empty metadata)
    def date_published
      published = dates.find { |d| d["dateType"] == "Issued" } || {}
      published.fetch("text", nil) || metadata.fetch("publicationYear", nil)
    end

    # @return [String, nil] text of the date with dateType "Updated"
    def date_modified
      modified = dates.find { |d| d["dateType"] == "Updated" } || {}
      modified.fetch("text", nil)
    end

    # Related DOI/URL identifiers with one of the given relation types.
    #
    # @param relation_type [String] one or more relationType values
    #   separated by spaces
    # @return [Array<Hash>] schema.org CreativeWork stubs; DOIs are
    #   normalized to URLs, URLs are passed through
    def related_identifiers(relation_type)
      wanted = relation_type.split(" ")

      Array(metadata.dig("relatedIdentifiers", "relatedIdentifier"))
        .select { |r| wanted.include?(r["relationType"]) && %w(DOI URL).include?(r["relatedIdentifierType"]) }
        .map do |work|
          work_id = work["relatedIdentifierType"] == "DOI" ? normalize_doi(work["text"]) : work["text"]
          { "@type" => "CreativeWork",
            "@id" => work_id }
        end
    end

    # @return [Hash, nil] first "IsPartOf" relation
    def is_part_of
      related_identifiers("IsPartOf").first
    end

    # @return [Array<Hash>, nil] "HasPart" relations, nil when there are none
    def has_part
      related_identifiers("HasPart").presence
    end

    # @return [Array<Hash>, nil] citation-like relations, nil when there are none
    def citation
      related_identifiers("Cites IsCitedBy Supplements IsSupplementTo References IsReferencedBy").presence
    end

    # @return [String, nil] nil when absent (the original raised KeyError)
    def publisher
      metadata.fetch("publisher", nil)
    end

    # @return [Hash] constant description of DataCite as the provider
    def provider
      { "@type" => "Organization",
        "name" => "DataCite" }
    end

    # Assembles all fields into a schema.org Hash, dropping nil values.
    #
    # @return [Hash]
    def as_schema_org
      { "@context" => "http://schema.org",
        "@type" => type,
        "@id" => id,
        "name" => name,
        "alternateName" => alternate_name,
        "author" => author,
        "description" => description,
        "license" => license,
        "version" => version,
        "keywords" => keywords,
        "dateCreated" => date_created,
        "datePublished" => date_published,
        "dateModified" => date_modified,
        "isPartOf" => is_part_of,
        "hasPart" => has_part,
        "citation" => citation,
        "publisher" => publisher,
        "provider" => provider
      }.compact
    end
  end
end
@@ -0,0 +1,48 @@
1
module Bolognese
  # Helpers for splitting and joining (possibly partial) ISO 8601 dates.
  #
  # The string-based helpers read fixed character positions
  # (0..3 year, 5..6 month, 8..9 day), so "2015", "2015-04" and
  # "2015-04-01T12:00:00Z" are all accepted. Missing or zero parts are
  # dropped from the result.
  module DateUtils
    # Date as a 'date-parts' Hash.
    #
    # @param iso8601_time [String, nil]
    # @return [Hash] { 'date-parts' => [[year, month, day]] } without the
    #   missing parts; { 'date-parts' => [[]] } for nil (the original used
    #   the key "date_parts" in that case, unlike every other path)
    def get_date_parts(iso8601_time)
      return { 'date-parts' => [[]] } if iso8601_time.nil?

      { 'date-parts' => [get_year_month_day(iso8601_time)] }
    end

    # @param iso8601_time [String, nil]
    # @return [Array<Integer>] [year, month] without missing parts; [] for nil
    def get_year_month(iso8601_time)
      return [] if iso8601_time.nil?

      year = iso8601_time[0..3]
      month = iso8601_time[5..6]

      [year.to_i, month.to_i].reject { |part| part == 0 }
    end

    # @param iso8601_time [String, nil]
    # @return [Array<Integer>] [year, month, day] without missing parts; [] for nil
    def get_year_month_day(iso8601_time)
      return [] if iso8601_time.nil?

      year = iso8601_time[0..3]
      month = iso8601_time[5..6]
      day = iso8601_time[8..9]

      [year.to_i, month.to_i, day.to_i].reject { |part| part == 0 }
    end

    # Same shape as get_date_parts, built from separate components.
    #
    # @return [Hash] { 'date-parts' => [[...]] }, zero/nil components dropped
    def get_date_parts_from_parts(year, month = nil, day = nil)
      { 'date-parts' => [[year.to_i, month.to_i, day.to_i].reject { |part| part == 0 }] }
    end

    # Joins components into "YYYY", "YYYY-MM" or "YYYY-MM-DD".
    #
    # Each component is zero-padded; a month or day that pads to "00"
    # (nil or 0) is left out.
    #
    # @return [String]
    def get_date_from_parts(year, month = nil, day = nil)
      [year.to_s.rjust(4, '0'), month.to_s.rjust(2, '0'), day.to_s.rjust(2, '0')].reject { |part| part == "00" }.join("-")
    end

    # Parses an ISO 8601 timestamp into a UTC time.
    #
    # Uses ISO8601::DateTime rather than the standard library because the
    # latter mishandles incomplete timestamps such as "2015-04".
    #
    # @return [Time, nil] nil when the timestamp cannot be parsed
    def get_datetime_from_iso8601(iso8601_time)
      ISO8601::DateTime.new(iso8601_time).to_time.utc
    rescue StandardError
      nil
    end
  end
end
@@ -0,0 +1,48 @@
1
module Bolognese
  # Helpers for validating, normalizing and looking up DOIs.
  module DoiUtils
    # Extracts the bare DOI from a DOI, a "doi:" string or a doi.org URL.
    #
    # The dot in "doi.org" is escaped; the original pattern accepted any
    # character in that position.
    #
    # @param doi [String, nil]
    # @return [String, nil] e.g. "10.5061/dryad.8515", or nil if not a DOI
    def validate_doi(doi)
      Array(%r{\A(?:(http|https)://(dx\.)?doi\.org/)?(doi:)?(10\.\d{4,5}/.+)\z}.match(doi)).last
    end

    # Turns any accepted DOI form into a lower-case https://doi.org/ URL.
    #
    # @param doi [String, nil]
    # @return [String, nil] nil when the input is not a valid DOI
    def normalize_doi(doi)
      doi = validate_doi(doi)
      return nil unless doi.present?

      # remove zero-width spaces (U+200B) and downcase
      doi = doi.gsub(/\u200B/, '').downcase

      # turn DOI into URL, escape unsafe characters
      "https://doi.org/" + Addressable::URI.encode(doi)
    end

    # Upper-cased DOI taken from a doi.org URL or a "doi:" string.
    #
    # @param url [String, nil]
    # @return [String, nil] nil when the input has neither form
    def doi_from_url(url)
      if /(http|https):\/\/(dx\.)?doi\.org\/(\w+)/.match(url)
        uri = Addressable::URI.parse(url)
        uri.path[1..-1].upcase
      elsif url.is_a?(String) && url.starts_with?("doi:")
        url[4..-1].upcase
      end
    end

    # NOTE(review): clean_doi is not defined in this module; confirm that
    # it is provided elsewhere before relying on this method.
    #
    # @return [String, nil] nil for a blank DOI
    def doi_as_url(doi)
      "https://doi.org/#{clean_doi(doi)}" if doi.present?
    end

    # Looks up the DOI registration agency; expects a normalized DOI.
    #
    # @param doi [String, nil]
    # @return [Hash] {} for a blank DOI; { "id" => ..., "name" => ... } when
    #   an agency was found; otherwise { "errors" => ... } carrying the
    #   response errors or, failing that, the response data
    def get_doi_ra(doi)
      return {} if doi.blank?

      url = "https://doi.crossref.org/doiRA/#{doi_from_url(doi)}"
      response = Maremma.get(url, host: true, timeout: 120)

      # The original called `.first.fetch` on a `{}` default, which raised
      # NoMethodError whenever "data" was missing and made the error branch
      # below unreachable. Only read "RA" from a list of Hash records.
      data = response.body.fetch("data", nil)
      record = data.is_a?(Array) ? data.first : nil
      ra = record.is_a?(Hash) ? record.fetch("RA", nil) : nil

      if ra.present?
        { "id" => ra.downcase,
          "name" => ra }
      else
        { "errors" => response.body.fetch("errors", nil) || data }
      end
    end
  end
end
@@ -0,0 +1,106 @@
1
+ module Bolognese
2
+ class Github < Metadata
3
+ # def get_github_metadata(url, options = {})
4
+ # return {} if url.blank?
5
+
6
+ # github_hash = github_from_url(url)
7
+ # repo_url = "https://api.github.com/repos/#{github_hash[:owner]}/#{github_hash[:repo]}"
8
+ # response = Maremma.get(repo_url, options.merge(bearer: ENV['GITHUB_PERSONAL_ACCESS_TOKEN']))
9
+
10
+ # return { error: 'Resource not found.', status: 404 } if response.body.fetch("errors", nil).present?
11
+
12
+ # author = get_github_owner(github_hash[:owner])
13
+
14
+ # language = response.body.fetch("data", {}).fetch('language', nil)
15
+ # type = language.present? && language != "HTML" ? 'computer_program' : 'webpage'
16
+
17
+ # { "author" => [get_one_author(author)],
18
+ # "title" => response.body.fetch("data", {}).fetch('description', nil).presence || github_hash[:repo],
19
+ # "container-title" => "Github",
20
+ # "issued" => response.body.fetch("data", {}).fetch('created_at', nil).presence || "0000",
21
+ # "URL" => url,
22
+ # "type" => type }
23
+ # end
24
+
25
+ # def get_github_owner_metadata(url, options = {})
26
+ # return {} if url.blank?
27
+
28
+ # github_hash = github_from_url(url)
29
+ # owner_url = "https://api.github.com/users/#{github_hash[:owner]}"
30
+ # response = Maremma.get(owner_url, options.merge(bearer: ENV['GITHUB_PERSONAL_ACCESS_TOKEN']))
31
+
32
+ # return { error: 'Resource not found.', status: 404 } if response.body.fetch("data", {}).fetch("message", nil) == "Not Found"
33
+
34
+ # author = response.body.fetch("data", {}).fetch('name', nil).presence || github_hash[:owner]
35
+ # title = "Github profile for #{author}"
36
+
37
+ # { "author" => [get_one_author(author)],
38
+ # "title" => title,
39
+ # "container-title" => "Github",
40
+ # "issued" => response.body.fetch("data", {}).fetch('created_at', nil).presence || "0000",
41
+ # "URL" => url,
42
+ # "type" => 'entry' }
43
+ # end
44
+
45
+ # def get_github_release_metadata(url, options = {})
46
+ # return {} if url.blank?
47
+
48
+ # github_hash = github_from_url(url)
49
+ # release_url = "https://api.github.com/repos/#{github_hash[:owner]}/#{github_hash[:repo]}/releases/tags/#{github_hash[:release]}"
50
+ # response = Maremma.get(release_url, options.merge(bearer: ENV['GITHUB_PERSONAL_ACCESS_TOKEN']))
51
+
52
+ # return { error: 'Resource not found.', status: 404 } if response.body.fetch("data", {})["message"] == "Not Found"
53
+
54
+ # author = get_github_owner(github_hash[:owner])
55
+
56
+ # { "author" => [get_one_author(author)],
57
+ # "title" => response.body.fetch("data", {}).fetch('name', nil),
58
+ # "container-title" => "Github",
59
+ # "issued" => response.body.fetch("data", {}).fetch('created_at', nil).presence || "0000",
60
+ # "URL" => url,
61
+ # "type" => 'computer_program' }
62
+ # end
63
+
64
+ # def get_github_owner(owner)
65
+ # url = "https://api.github.com/users/#{owner}"
66
+ # response = Maremma.get(url, bearer: ENV['GITHUB_PERSONAL_ACCESS_TOKEN'])
67
+
68
+ # return nil if response.body.fetch("data", {}).fetch("message", nil) == "Not Found"
69
+
70
+ # response.body.fetch("data", {}).fetch('name', nil).presence || owner
71
+ # end
72
+
73
+ # def github_from_url(url)
74
+ # return {} unless /\Ahttps:\/\/github\.com\/(.+)(?:\/)?(.+)?(?:\/tree\/)?(.*)\z/.match(url)
75
+ # words = URI.parse(url).path[1..-1].split('/')
76
+
77
+ # { owner: words[0],
78
+ # repo: words[1],
79
+ # release: words[3] }.compact
80
+ # end
81
+
82
+ # def github_repo_from_url(url)
83
+ # github_from_url(url).fetch(:repo, nil)
84
+ # end
85
+
86
+ # def github_release_from_url(url)
87
+ # github_from_url(url).fetch(:release, nil)
88
+ # end
89
+
90
+ # def github_owner_from_url(url)
91
+ # github_from_url(url).fetch(:owner, nil)
92
+ # end
93
+
94
+ # def github_as_owner_url(github_hash)
95
+ # "https://github.com/#{github_hash[:owner]}" if github_hash[:owner].present?
96
+ # end
97
+
98
+ # def github_as_repo_url(github_hash)
99
+ # "https://github.com/#{github_hash[:owner]}/#{github_hash[:repo]}" if github_hash[:repo].present?
100
+ # end
101
+
102
+ # def github_as_release_url(github_hash)
103
+ # "https://github.com/#{github_hash[:owner]}/#{github_hash[:repo]}/tree/#{github_hash[:release]}" if github_hash[:release].present?
104
+ # end
105
+ end
106
+ end
@@ -0,0 +1,30 @@
1
+ require_relative 'doi_utils'
2
+ require_relative 'author_utils'
3
+ require_relative 'date_utils'
4
+ require_relative 'pid_utils'
5
+ require_relative 'utils'
6
+
7
module Bolognese
  # Base class for persistent-identifier metadata.
  #
  # Normalizes the given identifier (DOI first, then ORCID) and records
  # which provider is responsible for it.
  class Metadata
    include Bolognese::DoiUtils
    include Bolognese::AuthorUtils
    include Bolognese::DateUtils
    include Bolognese::PidUtils
    include Bolognese::Utils

    attr_reader :id, :provider

    # @param id [String] a DOI or ORCID in any form the normalizers accept
    def initialize(id)
      @id = normalize_id(id)
      @provider = find_provider(@id)
    end

    # Normalized DOI URL if the id is a DOI, otherwise the result of
    # normalize_orcid.
    def normalize_id(id)
      doi_url = normalize_doi(id)
      return doi_url if doi_url

      normalize_orcid(id)
    end

    # Id of the DOI registration agency reported by get_doi_ra;
    # "orcid" whenever that lookup yields no "id".
    def find_provider(id)
      agency = get_doi_ra(id)
      agency.fetch("id", nil) || "orcid"
    end
  end
end
@@ -0,0 +1,24 @@
1
+ module Bolognese
2
+ class Orcid < Metadata
3
+ include Bolognese::PidUtils
4
+ # def get_orcid_metadata(orcid, options = {})
5
+ # return {} if orcid.blank?
6
+
7
+ # url = "https://pub.orcid.org/v2.0/#{orcid}/person"
8
+ # response = Maremma.get(url, options.merge(accept: "json"))
9
+
10
+ # name = response.body.fetch("data", {}).fetch("name", nil)
11
+ # return { "errors" => 'Resource not found.' } unless name.present?
12
+
13
+ # author = { "family" => name.fetch("family-name", {}).fetch("value", nil),
14
+ # "given" => name.fetch("given-names", {}).fetch("value", nil) }
15
+
16
+ # { "author" => [author],
17
+ # "title" => "ORCID record for #{[author.fetch('given', nil), author.fetch('family', nil)].compact.join(' ')}",
18
+ # "container-title" => "ORCID Registry",
19
+ # "issued" => Time.now.year.to_s,
20
+ # "URL" => orcid_as_url(orcid),
21
+ # "type" => 'entry' }
22
+ # end
23
+ end
24
+ end
@@ -0,0 +1,23 @@
1
module Bolognese
  # Helpers for validating and formatting ORCID identifiers.
  module PidUtils
    # Turns an ORCID (bare or as an orcid.org URL) into an
    # http://orcid.org/ URL.
    #
    # @param orcid [String, nil]
    # @return [String, nil] nil when the input is not a valid ORCID
    def normalize_orcid(orcid)
      orcid = validate_orcid(orcid)
      return nil unless orcid.present?

      # turn ORCID ID into URL
      "http://orcid.org/" + Addressable::URI.encode(orcid)
    end

    # Everything after the host of an orcid.org URL (http or https).
    #
    # @param url [String, nil]
    # @return [String, nil] nil when the input is not an orcid.org URL
    def orcid_from_url(url)
      Array(%r{\Ahttps?://orcid\.org/(.+)}.match(url)).last
    end

    # @return [String, nil] nil for a blank ORCID
    def orcid_as_url(orcid)
      "http://orcid.org/#{orcid}" if orcid.present?
    end

    # Extracts the bare ORCID from a bare id or an orcid.org URL.
    #
    # The pattern requires exactly four groups of four characters, the last
    # character being a digit or "X". The original ended in `[0-9X]+` and
    # therefore accepted arbitrarily long final groups. https URLs are
    # accepted in addition to http.
    #
    # @param orcid [String, nil]
    # @return [String, nil] e.g. "0000-0002-1825-0097", or nil if invalid
    def validate_orcid(orcid)
      Array(%r{\A(?:https?://orcid\.org/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])\z}.match(orcid)).last
    end
  end
end
@@ -0,0 +1,34 @@
1
+ module Bolognese
2
+ module Pubmed
3
+ # def get_pubmed_metadata(pmid, options = {})
4
+ # return {} if pmid.blank?
5
+
6
+ # url = "http://www.ebi.ac.uk/europepmc/webservices/rest/search/query=ext_id:#{pmid}&format=json"
7
+ # response = Maremma.get(url, options)
8
+
9
+ # metadata = response.body.fetch("data", {}).fetch("resultList", {}).fetch("result", []).first
10
+ # return { error: 'Resource not found.', status: 404 } if metadata.blank?
11
+
12
+ # metadata["issued"] = metadata.fetch("pubYear", nil)
13
+
14
+ # author_string = metadata.fetch("authorString", "").chomp(".")
15
+ # metadata["author"] = get_authors(author_string.split(", "))
16
+
17
+ # metadata["title"] = metadata.fetch("title", "").chomp(".")
18
+ # metadata["container-title"] = metadata.fetch("journalTitle", nil)
19
+ # metadata["volume"] = metadata.fetch("journalVolume", nil)
20
+ # metadata["page"] = metadata.fetch("pageInfo", nil)
21
+ # metadata["type"] = "article-journal"
22
+
23
+ # metadata
24
+ # end
25
+
26
# PubMed URL for a PubMed id.
#
# @return [String, nil] nil when pmid is blank
def pmid_as_url(pmid)
  return unless pmid.present?

  "http://www.ncbi.nlm.nih.gov/pubmed/#{pmid}"
end
29
+
30
# PubMed Central article URL for a PMC id given without its "PMC" prefix.
#
# @return [String, nil] nil when pmcid is blank
def pmcid_as_url(pmcid)
  return unless pmcid.present?

  "http://www.ncbi.nlm.nih.gov/pmc/articles/PMC#{pmcid}"
end
33
+ end
34
+ end