bolognese 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57):
  1. checksums.yaml +7 -0
  2. data/.gitignore +56 -0
  3. data/.travis.yml +23 -0
  4. data/Gemfile +3 -0
  5. data/Gemfile.lock +118 -0
  6. data/LICENSE +21 -0
  7. data/README.md +3 -0
  8. data/Rakefile +12 -0
  9. data/bin/bolognese +5 -0
  10. data/bolognese.gemspec +40 -0
  11. data/lib/bolognese.rb +12 -0
  12. data/lib/bolognese/author_utils.rb +61 -0
  13. data/lib/bolognese/cli.rb +38 -0
  14. data/lib/bolognese/crossref.rb +202 -0
  15. data/lib/bolognese/datacite.rb +157 -0
  16. data/lib/bolognese/date_utils.rb +48 -0
  17. data/lib/bolognese/doi_utils.rb +48 -0
  18. data/lib/bolognese/github.rb +106 -0
  19. data/lib/bolognese/metadata.rb +30 -0
  20. data/lib/bolognese/orcid.rb +24 -0
  21. data/lib/bolognese/pid_utils.rb +23 -0
  22. data/lib/bolognese/pubmed.rb +34 -0
  23. data/lib/bolognese/string.rb +5 -0
  24. data/lib/bolognese/utils.rb +27 -0
  25. data/lib/bolognese/version.rb +3 -0
  26. data/spec/cli_spec.rb +37 -0
  27. data/spec/crossref_spec.rb +113 -0
  28. data/spec/datacite_spec.rb +49 -0
  29. data/spec/doi_spec.rb +89 -0
  30. data/spec/fixtures/crossref.xml +742 -0
  31. data/spec/fixtures/datacite.xml +40 -0
  32. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/read/crossref/as_crossref.yml +760 -0
  33. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/read/crossref/as_schema_org.yml +1476 -0
  34. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/read/datacite/as_datacite.yml +214 -0
  35. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/read/datacite/as_schema_org.yml +384 -0
  36. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/doi_registration_agency/crossref.yml +44 -0
  37. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/doi_registration_agency/datacite.yml +44 -0
  38. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/doi_registration_agency/medra.yml +44 -0
  39. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/doi_registration_agency/not_found.yml +44 -0
  40. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/DOI_test.yml +843 -0
  41. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/DOI_with_SICI_DOI.yml +277 -0
  42. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/DOI_with_data_citation.yml +15755 -0
  43. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/date_in_future.yml +2691 -0
  44. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/journal_article.yml +1857 -0
  45. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/not_found_error.yml +93 -0
  46. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/posted_content.yml +5715 -0
  47. data/spec/fixtures/vcr_cassettes/Bolognese_Datacite/get_metadata/BlogPosting.yml +307 -0
  48. data/spec/fixtures/vcr_cassettes/Bolognese_Datacite/get_metadata/Dataset.yml +343 -0
  49. data/spec/fixtures/vcr_cassettes/Bolognese_Metadata/find_PID_provider/crossref.yml +44 -0
  50. data/spec/fixtures/vcr_cassettes/Bolognese_Metadata/find_PID_provider/crossref_doi_not_url.yml +44 -0
  51. data/spec/fixtures/vcr_cassettes/Bolognese_Metadata/find_PID_provider/datacite.yml +44 -0
  52. data/spec/fixtures/vcr_cassettes/Bolognese_Metadata/find_PID_provider/datacite_doi_http.yml +44 -0
  53. data/spec/fixtures/vcr_cassettes/Bolognese_Metadata/find_PID_provider/orcid.yml +44 -0
  54. data/spec/metadata_spec.rb +35 -0
  55. data/spec/orcid_spec.rb +23 -0
  56. data/spec/spec_helper.rb +88 -0
  57. metadata +419 -0
@@ -0,0 +1,157 @@
1
+ require_relative 'doi_utils'
2
+ require_relative 'utils'
3
+
4
module Bolognese
  # Fetches the DataCite XML record for a DOI and exposes its fields as
  # schema.org-style attributes (see #as_schema_org).
  #
  # The record is requested from the normalized DOI URL with the
  # "application/vnd.datacite.datacite+xml" Accept header and parsed with
  # Maremma.from_xml. Accessors return nil when the corresponding element is
  # missing, so that #as_schema_org can drop them with Hash#compact.
  class Datacite < Metadata
    include Bolognese::DoiUtils
    include Bolognese::Utils

    # DataCite resourceTypeGeneral => schema.org type. A nil value means there
    # is no dedicated mapping; #type then falls back to "CreativeWork".
    DATACITE_TYPE_TRANSLATIONS = {
      "Audiovisual" => "VideoObject",
      "Collection" => "Collection",
      "Dataset" => "Dataset",
      "Event" => "Event",
      "Image" => "ImageObject",
      "InteractiveResource" => nil,
      "Model" => nil,
      "PhysicalObject" => nil,
      "Service" => "Service",
      "Software" => "SoftwareSourceCode",
      "Sound" => "AudioObject",
      "Text" => "ScholarlyArticle",
      "Workflow" => nil,
      "Other" => "CreativeWork"
    }.freeze

    # Only `id` is a plain reader; `metadata` is computed lazily by the method
    # of the same name below.
    attr_reader :id

    # @param doi [String] a DOI in any form accepted by normalize_doi
    def initialize(doi)
      @id = normalize_doi(doi)
    end

    # Raw DataCite XML for the DOI, fetched once and then cached.
    #
    # @return [String, nil] the "data" entry of the response body, if any
    def raw
      @raw ||= begin
        response = Maremma.get(id, accept: "application/vnd.datacite.datacite+xml", raw: true)
        response.body.fetch("data", nil)
      end
    end

    # Parsed content of the <resource> element, or {} when nothing was fetched.
    #
    # @return [Hash]
    def metadata
      @metadata ||= raw.present? ? Maremma.from_xml(raw).fetch("resource", {}) : {}
    end

    # @return [Boolean] true when a non-empty record was retrieved
    def exists?
      metadata.present?
    end

    # schema.org type derived from resourceTypeGeneral.
    #
    # @return [String] defaults to "CreativeWork" for unknown or unmapped types
    def type
      general = metadata.dig("resourceType", "resourceTypeGeneral")
      DATACITE_TYPE_TRANSLATIONS[general.to_s.dasherize] || "CreativeWork"
    end

    # Free-text resource type, falling back to resourceTypeGeneral.
    #
    # @return [String, nil]
    def additional_type
      resource_type = metadata.fetch("resourceType", {})
      resource_type.fetch("text", nil) || resource_type.fetch("resourceTypeGeneral", nil)
    end

    def name
      metadata.dig("titles", "title")
    end

    def alternate_name
      metadata.dig("alternateIdentifiers", "alternateIdentifier", "text")
    end

    def description
      metadata.dig("descriptions", "description", "text")
    end

    def license
      metadata.dig("rightsList", "rights", "rightsURI")
    end

    # @return [String] subjects joined with ", " (empty string when none)
    def keywords
      Array(metadata.dig("subjects", "subject")).join(", ")
    end

    # Creators converted by get_authors. A single creator is parsed as a Hash
    # rather than an Array, so it is wrapped before conversion.
    def author
      creators = metadata.dig("creators", "creator")
      creators = [creators] if creators.is_a?(Hash)
      get_authors(creators)
    end

    def version
      metadata.fetch("version", nil)
    end

    # @return [Array] all <date> entries (empty when none are present)
    def dates
      Array(metadata.dig("dates", "date"))
    end

    def date_created
      date_of_type("Created")
    end

    # Date of type "Issued", falling back to publicationYear.
    #
    # @return [String, nil] nil when neither is present, instead of raising
    #   KeyError, so that an empty record still serializes
    def date_published
      date_of_type("Issued") || metadata.fetch("publicationYear", nil)
    end

    def date_modified
      date_of_type("Updated")
    end

    # Related DOIs/URLs whose relationType is one of the given types.
    #
    # @param relation_type [String] one or more relation types separated by spaces
    # @return [Array<Hash>] schema.org CreativeWork stubs ("@type", "@id")
    def related_identifiers(relation_type)
      wanted = relation_type.split(" ")

      Array(metadata.dig("relatedIdentifiers", "relatedIdentifier"))
        .select { |r| wanted.include?(r["relationType"]) && %w(DOI URL).include?(r["relatedIdentifierType"]) }
        .map do |work|
          work_id = work["relatedIdentifierType"] == "DOI" ? normalize_doi(work["text"]) : work["text"]
          { "@type" => "CreativeWork",
            "@id" => work_id }
        end
    end

    def is_part_of
      related_identifiers("IsPartOf").first
    end

    def has_part
      related_identifiers("HasPart").presence
    end

    def citation
      related_identifiers("Cites IsCitedBy Supplements IsSupplementTo References IsReferencedBy").presence
    end

    # @return [String, nil] nil when the record has no publisher, instead of
    #   raising KeyError
    def publisher
      metadata.fetch("publisher", nil)
    end

    def provider
      { "@type" => "Organization",
        "name" => "DataCite" }
    end

    # schema.org representation of the record; nil-valued entries are dropped.
    #
    # @return [Hash]
    def as_schema_org
      { "@context" => "http://schema.org",
        "@type" => type,
        "@id" => id,
        "name" => name,
        "alternateName" => alternate_name,
        "author" => author,
        "description" => description,
        "license" => license,
        "version" => version,
        "keywords" => keywords,
        "dateCreated" => date_created,
        "datePublished" => date_published,
        "dateModified" => date_modified,
        "isPartOf" => is_part_of,
        "hasPart" => has_part,
        "citation" => citation,
        "publisher" => publisher,
        "provider" => provider
      }.compact
    end

    private

    # Text of the first <date> entry with the given dateType, or nil.
    def date_of_type(date_type)
      entry = dates.find { |d| d["dateType"] == date_type } || {}
      entry.fetch("text", nil)
    end
  end
end
@@ -0,0 +1,48 @@
1
module Bolognese
  # Helpers for converting between ISO 8601 date strings ("YYYY-MM-DD",
  # possibly truncated to "YYYY-MM" or "YYYY") and numeric date parts.
  # Components that are missing or zero are omitted from the results.
  module DateUtils
    # Wraps the numeric parts of a date string in a 'date-parts' hash.
    #
    # @param iso8601_time [String, nil]
    # @return [Hash] { 'date-parts' => [[year, month, day]] }; the inner array
    #   is empty for nil input. The same key is used for every input.
    def get_date_parts(iso8601_time)
      return { 'date-parts' => [[]] } if iso8601_time.nil?

      { 'date-parts' => [get_year_month_day(iso8601_time)] }
    end

    # @param iso8601_time [String, nil]
    # @return [Array<Integer>] [year, month] without zero components
    def get_year_month(iso8601_time)
      return [] if iso8601_time.nil?

      # Slices past the end of the string are nil, and nil.to_i is 0.
      [iso8601_time[0..3], iso8601_time[5..6]].map(&:to_i).reject(&:zero?)
    end

    # @param iso8601_time [String, nil]
    # @return [Array<Integer>] [year, month, day] without zero components
    def get_year_month_day(iso8601_time)
      return [] if iso8601_time.nil?

      [iso8601_time[0..3], iso8601_time[5..6], iso8601_time[8..9]].map(&:to_i).reject(&:zero?)
    end

    # Same shape as #get_date_parts, built from separate components.
    #
    # @return [Hash] { 'date-parts' => [[year, month, day]] } without zero parts
    def get_date_parts_from_parts(year, month = nil, day = nil)
      { 'date-parts' => [[year, month, day].map(&:to_i).reject(&:zero?)] }
    end

    # Builds a zero-padded "YYYY-MM-DD" string, dropping any component that
    # pads to "00" (i.e. a missing or zero month/day).
    #
    # @return [String]
    def get_date_from_parts(year, month = nil, day = nil)
      padded = [year.to_s.rjust(4, '0'), month.to_s.rjust(2, '0'), day.to_s.rjust(2, '0')]
      padded.reject { |part| part == "00" }.join("-")
    end

    # parsing of incomplete iso8601 timestamps such as 2015-04 is broken
    # in standard library
    # return nil if invalid iso8601 timestamp
    def get_datetime_from_iso8601(iso8601_time)
      ISO8601::DateTime.new(iso8601_time).to_time.utc
    rescue StandardError
      nil
    end
  end
end
@@ -0,0 +1,48 @@
1
module Bolognese
  # Helpers for validating DOIs, converting them to and from URLs, and looking
  # up their registration agency.
  module DoiUtils
    # Extracts the bare DOI from a DOI, a "doi:"-prefixed DOI, or a
    # doi.org / dx.doi.org URL (http or https).
    #
    # @param doi [String, nil]
    # @return [String, nil] the DOI ("10.xxxx/suffix"), or nil if not recognized
    def validate_doi(doi)
      # The dot in "doi.org" is escaped so only that exact host is accepted.
      Array(/\A(?:(http|https):\/\/(dx\.)?doi\.org\/)?(doi:)?(10\.\d{4,5}\/.+)\z/.match(doi)).last
    end

    # @param doi [String, nil]
    # @return [String, nil] lowercased, escaped https://doi.org/ URL, or nil
    #   when the input is not a valid DOI
    def normalize_doi(doi)
      doi = validate_doi(doi)
      return nil unless doi.present?

      # remove non-printing whitespace and downcase
      doi = doi.gsub(/\u200B/, '').downcase

      # turn DOI into URL, escape unsafe characters
      "https://doi.org/" + Addressable::URI.encode(doi)
    end

    # @param url [String, nil] a doi.org / dx.doi.org URL or a "doi:" string
    # @return [String, nil] the upcased DOI, or nil for any other input
    def doi_from_url(url)
      if /(http|https):\/\/(dx\.)?doi\.org\/(\w+)/.match(url)
        uri = Addressable::URI.parse(url)
        uri.path[1..-1].upcase
      elsif url.is_a?(String) && url.start_with?("doi:")
        url[4..-1].upcase
      end
    end

    # NOTE(review): clean_doi is not defined in this module; presumably it is
    # provided by another mixin of the including class - confirm.
    def doi_as_url(doi)
      "https://doi.org/#{clean_doi(doi)}" if doi.present?
    end

    # get DOI registration agency, assume a normalized DOI
    #
    # @param doi [String, nil]
    # @return [Hash] {} for blank input; { "id", "name" } when an agency is
    #   reported; otherwise { "errors" => ... } with the response's errors or,
    #   failing that, its data
    def get_doi_ra(doi)
      return {} if doi.blank?

      url = "https://doi.crossref.org/doiRA/#{doi_from_url(doi)}"
      response = Maremma.get(url, host: true, timeout: 120)

      # "data" is absent when the request failed, so guard before reading the
      # first record; otherwise the "errors" branch below is never reached.
      record = Array(response.body.fetch("data", nil)).first
      ra = record.is_a?(Hash) ? record.fetch("RA", nil) : nil

      if ra.present?
        { "id" => ra.downcase,
          "name" => ra }
      else
        { "errors" => response.body.fetch("errors", nil) || response.body.fetch("data", nil) }
      end
    end
  end
end
@@ -0,0 +1,106 @@
1
module Bolognese
  # Placeholder for GitHub metadata support. The class currently adds nothing
  # to Metadata: every method below is commented out and therefore inactive.
  # The disabled code is kept as-is for reference.
  class Github < Metadata
    # def get_github_metadata(url, options = {})
    #   return {} if url.blank?

    #   github_hash = github_from_url(url)
    #   repo_url = "https://api.github.com/repos/#{github_hash[:owner]}/#{github_hash[:repo]}"
    #   response = Maremma.get(repo_url, options.merge(bearer: ENV['GITHUB_PERSONAL_ACCESS_TOKEN']))

    #   return { error: 'Resource not found.', status: 404 } if response.body.fetch("errors", nil).present?

    #   author = get_github_owner(github_hash[:owner])

    #   language = response.body.fetch("data", {}).fetch('language', nil)
    #   type = language.present? && language != "HTML" ? 'computer_program' : 'webpage'

    #   { "author" => [get_one_author(author)],
    #     "title" => response.body.fetch("data", {}).fetch('description', nil).presence || github_hash[:repo],
    #     "container-title" => "Github",
    #     "issued" => response.body.fetch("data", {}).fetch('created_at', nil).presence || "0000",
    #     "URL" => url,
    #     "type" => type }
    # end

    # def get_github_owner_metadata(url, options = {})
    #   return {} if url.blank?

    #   github_hash = github_from_url(url)
    #   owner_url = "https://api.github.com/users/#{github_hash[:owner]}"
    #   response = Maremma.get(owner_url, options.merge(bearer: ENV['GITHUB_PERSONAL_ACCESS_TOKEN']))

    #   return { error: 'Resource not found.', status: 404 } if response.body.fetch("data", {}).fetch("message", nil) == "Not Found"

    #   author = response.body.fetch("data", {}).fetch('name', nil).presence || github_hash[:owner]
    #   title = "Github profile for #{author}"

    #   { "author" => [get_one_author(author)],
    #     "title" => title,
    #     "container-title" => "Github",
    #     "issued" => response.body.fetch("data", {}).fetch('created_at', nil).presence || "0000",
    #     "URL" => url,
    #     "type" => 'entry' }
    # end

    # def get_github_release_metadata(url, options = {})
    #   return {} if url.blank?

    #   github_hash = github_from_url(url)
    #   release_url = "https://api.github.com/repos/#{github_hash[:owner]}/#{github_hash[:repo]}/releases/tags/#{github_hash[:release]}"
    #   response = Maremma.get(release_url, options.merge(bearer: ENV['GITHUB_PERSONAL_ACCESS_TOKEN']))

    #   return { error: 'Resource not found.', status: 404 } if response.body.fetch("data", {})["message"] == "Not Found"

    #   author = get_github_owner(github_hash[:owner])

    #   { "author" => [get_one_author(author)],
    #     "title" => response.body.fetch("data", {}).fetch('name', nil),
    #     "container-title" => "Github",
    #     "issued" => response.body.fetch("data", {}).fetch('created_at', nil).presence || "0000",
    #     "URL" => url,
    #     "type" => 'computer_program' }
    # end

    # def get_github_owner(owner)
    #   url = "https://api.github.com/users/#{owner}"
    #   response = Maremma.get(url, bearer: ENV['GITHUB_PERSONAL_ACCESS_TOKEN'])

    #   return nil if response.body.fetch("data", {}).fetch("message", nil) == "Not Found"

    #   response.body.fetch("data", {}).fetch('name', nil).presence || owner
    # end

    # def github_from_url(url)
    #   return {} unless /\Ahttps:\/\/github\.com\/(.+)(?:\/)?(.+)?(?:\/tree\/)?(.*)\z/.match(url)
    #   words = URI.parse(url).path[1..-1].split('/')

    #   { owner: words[0],
    #     repo: words[1],
    #     release: words[3] }.compact
    # end

    # def github_repo_from_url(url)
    #   github_from_url(url).fetch(:repo, nil)
    # end

    # def github_release_from_url(url)
    #   github_from_url(url).fetch(:release, nil)
    # end

    # def github_owner_from_url(url)
    #   github_from_url(url).fetch(:owner, nil)
    # end

    # def github_as_owner_url(github_hash)
    #   "https://github.com/#{github_hash[:owner]}" if github_hash[:owner].present?
    # end

    # def github_as_repo_url(github_hash)
    #   "https://github.com/#{github_hash[:owner]}/#{github_hash[:repo]}" if github_hash[:repo].present?
    # end

    # def github_as_release_url(github_hash)
    #   "https://github.com/#{github_hash[:owner]}/#{github_hash[:repo]}/tree/#{github_hash[:release]}" if github_hash[:release].present?
    # end
  end
end
@@ -0,0 +1,30 @@
1
+ require_relative 'doi_utils'
2
+ require_relative 'author_utils'
3
+ require_relative 'date_utils'
4
+ require_relative 'pid_utils'
5
+ require_relative 'utils'
6
+
7
module Bolognese
  # Base class for persistent-identifier metadata. On construction it
  # normalizes the given identifier to a URL and determines its provider.
  class Metadata
    include Bolognese::DoiUtils
    include Bolognese::AuthorUtils
    include Bolognese::DateUtils
    include Bolognese::PidUtils
    include Bolognese::Utils

    attr_reader :id, :provider

    # @param id [String] a DOI or ORCID iD, bare or as a URL
    def initialize(id)
      @id = normalize_id(id)
      @provider = find_provider(@id)
    end

    # Normalizes the identifier as a DOI first, then as an ORCID iD.
    #
    # @return [String, nil] the identifier URL, or nil if it is neither
    def normalize_id(id)
      doi_url = normalize_doi(id)
      return doi_url if doi_url

      normalize_orcid(id)
    end

    # Looks up the DOI registration agency via get_doi_ra.
    #
    # @return [String] the agency "id" reported by get_doi_ra, or "orcid" when
    #   none is reported
    def find_provider(id)
      registration_agency = get_doi_ra(id)
      registration_agency.fetch("id", nil) || "orcid"
    end
  end
end
@@ -0,0 +1,24 @@
1
module Bolognese
  # Placeholder for ORCID metadata support. Apart from mixing in PidUtils the
  # class adds nothing to Metadata: the only method is commented out and
  # therefore inactive. The disabled code is kept as-is for reference.
  class Orcid < Metadata
    include Bolognese::PidUtils

    # def get_orcid_metadata(orcid, options = {})
    #   return {} if orcid.blank?

    #   url = "https://pub.orcid.org/v2.0/#{orcid}/person"
    #   response = Maremma.get(url, options.merge(accept: "json"))

    #   name = response.body.fetch("data", {}).fetch("name", nil)
    #   return { "errors" => 'Resource not found.' } unless name.present?

    #   author = { "family" => name.fetch("family-name", {}).fetch("value", nil),
    #              "given" => name.fetch("given-names", {}).fetch("value", nil) }

    #   { "author" => [author],
    #     "title" => "ORCID record for #{[author.fetch('given', nil), author.fetch('family', nil)].compact.join(' ')}",
    #     "container-title" => "ORCID Registry",
    #     "issued" => Time.now.year.to_s,
    #     "URL" => orcid_as_url(orcid),
    #     "type" => 'entry' }
    # end
  end
end
@@ -0,0 +1,23 @@
1
module Bolognese
  # Helpers for validating ORCID iDs and converting them to and from
  # http://orcid.org/ URLs.
  module PidUtils
    # @param orcid [String, nil] bare ORCID iD or http://orcid.org/ URL
    # @return [String, nil] the escaped http://orcid.org/ URL, or nil when the
    #   input is not a valid ORCID iD
    def normalize_orcid(orcid)
      orcid = validate_orcid(orcid)
      return nil unless orcid.present?

      # turn ORCID ID into URL
      "http://orcid.org/" + Addressable::URI.encode(orcid)
    end

    # @param url [String, nil]
    # @return [String, nil] everything after "http://orcid.org/", or nil when
    #   the input does not start with that prefix
    def orcid_from_url(url)
      Array(/\Ahttp:\/\/orcid\.org\/(.+)/.match(url)).last
    end

    def orcid_as_url(orcid)
      "http://orcid.org/#{orcid}" if orcid.present?
    end

    # Extracts the ORCID iD from a bare iD or an http://orcid.org/ URL.
    # An iD is four dash-separated groups of four characters; only the final
    # character may be "X", and nothing may follow it.
    #
    # @param orcid [String, nil]
    # @return [String, nil] the iD, or nil if the input is not well-formed
    def validate_orcid(orcid)
      Array(/\A(?:http:\/\/orcid\.org\/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])\z/.match(orcid)).last
    end
  end
end
@@ -0,0 +1,34 @@
1
module Bolognese
  # URL builders for PubMed and PubMed Central identifiers. The metadata
  # lookup below is commented out and therefore inactive; it is kept as-is
  # for reference.
  module Pubmed
    # def get_pubmed_metadata(pmid, options = {})
    #   return {} if pmid.blank?

    #   url = "http://www.ebi.ac.uk/europepmc/webservices/rest/search/query=ext_id:#{pmid}&format=json"
    #   response = Maremma.get(url, options)

    #   metadata = response.body.fetch("data", {}).fetch("resultList", {}).fetch("result", []).first
    #   return { error: 'Resource not found.', status: 404 } if metadata.blank?

    #   metadata["issued"] = metadata.fetch("pubYear", nil)

    #   author_string = metadata.fetch("authorString", "").chomp(".")
    #   metadata["author"] = get_authors(author_string.split(", "))

    #   metadata["title"] = metadata.fetch("title", "").chomp(".")
    #   metadata["container-title"] = metadata.fetch("journalTitle", nil)
    #   metadata["volume"] = metadata.fetch("journalVolume", nil)
    #   metadata["page"] = metadata.fetch("pageInfo", nil)
    #   metadata["type"] = "article-journal"

    #   metadata
    # end

    # @param pmid [String, Integer, nil] PubMed identifier
    # @return [String, nil] the PubMed URL, or nil when pmid is not present
    def pmid_as_url(pmid)
      return unless pmid.present?

      "http://www.ncbi.nlm.nih.gov/pubmed/#{pmid}"
    end

    # @param pmcid [String, Integer, nil] PubMed Central identifier without
    #   the "PMC" prefix (the prefix is added here)
    # @return [String, nil] the PMC article URL, or nil when pmcid is not present
    def pmcid_as_url(pmcid)
      return unless pmcid.present?

      "http://www.ncbi.nlm.nih.gov/pmc/articles/PMC#{pmcid}"
    end
  end
end