bolognese 0.4.3 → 0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +7 -1
  3. data/Gemfile.lock +1 -1
  4. data/README.md +326 -11
  5. data/lib/bolognese/author_utils.rb +7 -5
  6. data/lib/bolognese/cli.rb +20 -19
  7. data/lib/bolognese/crossref.rb +11 -49
  8. data/lib/bolognese/datacite.rb +16 -33
  9. data/lib/bolognese/datacite_utils.rb +28 -25
  10. data/lib/bolognese/doi_utils.rb +1 -1
  11. data/lib/bolognese/metadata.rb +55 -13
  12. data/lib/bolognese/schema_org.rb +12 -60
  13. data/lib/bolognese/utils.rb +24 -12
  14. data/lib/bolognese/version.rb +1 -1
  15. data/spec/cli_spec.rb +13 -0
  16. data/spec/crossref_spec.rb +6 -1
  17. data/spec/datacite_spec.rb +6 -1
  18. data/spec/fixtures/schema_org.json +44 -0
  19. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/read/crossref/default.yml +760 -0
  20. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/read/datacite/default.yml +214 -0
  21. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/read/schema_org/default.yml +653 -0
  22. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/Schema_org_JSON.yml +719 -0
  23. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/normalize_id/doi.yml +930 -0
  24. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/normalize_id/url.yml +930 -0
  25. data/spec/fixtures/vcr_cassettes/Bolognese_Datacite/get_metadata/Schema_org_JSON.yml +173 -0
  26. data/spec/fixtures/vcr_cassettes/Bolognese_Metadata/find_PID_provider/crossref_doi_not_url.yml +2 -2
  27. data/spec/fixtures/vcr_cassettes/Bolognese_SchemaOrg/get_metadata/BlogPosting.yml +42 -21
  28. data/spec/fixtures/vcr_cassettes/Bolognese_SchemaOrg/get_metadata/BlogPosting_schema_org_JSON.yml +653 -0
  29. data/spec/fixtures/vcr_cassettes/Bolognese_SchemaOrg/get_metadata_as_datacite_xml/with_data_citation.yml +653 -0
  30. data/spec/metadata_spec.rb +9 -12
  31. data/spec/schema_org_spec.rb +41 -3
  32. data/spec/utils_spec.rb +3 -3
  33. metadata +12 -2
@@ -2,8 +2,14 @@
2
2
 
3
3
  require "thor"
4
4
 
5
+ require_relative 'doi_utils'
6
+ require_relative 'utils'
7
+
5
8
  module Bolognese
6
9
  class CLI < Thor
10
+ include Bolognese::DoiUtils
11
+ include Bolognese::Utils
12
+
7
13
  def self.exit_on_failure?
8
14
  true
9
15
  end
@@ -16,26 +22,21 @@ module Bolognese
16
22
  puts Bolognese::VERSION
17
23
  end
18
24
 
19
- desc "read pid", "read metadata for PID"
25
+ desc "read id", "read metadata for ID"
20
26
  method_option :as, default: "schema_org"
21
- def read(pid)
22
- provider = Metadata.new(id: pid).provider
23
-
24
- case
25
- when provider == "crossref" && options[:as] == "crossref"
26
- puts Crossref.new(id: pid).raw
27
- when provider == "crossref" && options[:as] == "datacite"
28
- puts Crossref.new(id: pid).as_datacite
29
- when provider == "crossref"
30
- puts Crossref.new(id: pid).as_schema_org
31
- when provider == "datacite" && options[:as] == "datacite"
32
- puts Datacite.new(id: pid).raw
33
- when provider == "datacite"
34
- puts Datacite.new(id: pid).as_schema_org
35
- when provider == "schema_org" && options[:as] == "datacite"
36
- puts SchemaOrg.new(id: pid).as_datacite
37
- when provider == "schema_org"
38
- puts SchemaOrg.new(id: pid).as_schema_org
27
+ def read(id)
28
+ id = normalize_id(id)
29
+ provider = find_provider(id)
30
+ output = options[:as] || "schema_org"
31
+
32
+ if provider.present?
33
+ p = case provider
34
+ when "crossref" then Crossref.new(id: id)
35
+ when "datacite" then Datacite.new(id: id)
36
+ else SchemaOrg.new(id: id)
37
+ end
38
+
39
+ puts p.send(output)
39
40
  else
40
41
  puts "not implemented"
41
42
  end
@@ -31,8 +31,6 @@ module Bolognese
31
31
  "PostedContent" => nil
32
32
  }
33
33
 
34
- attr_reader = :id, :raw, :metadata, :schema_org
35
-
36
34
  def initialize(id: nil, string: nil)
37
35
  id = normalize_doi(id) if id.present?
38
36
 
@@ -44,6 +42,10 @@ module Bolognese
44
42
  end
45
43
  end
46
44
 
45
+ alias_method :crossref, :raw
46
+ alias_method :as_crossref, :raw
47
+ alias_method :schema_org, :as_schema_org
48
+
47
49
  def metadata
48
50
  @metadata ||= raw.present? ? Maremma.from_xml(raw).fetch("doi_records", {}).fetch("doi_record", {}) : {}
49
51
  end
@@ -120,23 +122,17 @@ module Bolognese
120
122
  end
121
123
  end
122
124
 
123
- def keywords
124
-
125
- end
126
-
127
125
  def author
128
- person = bibliographic_metadata.dig("contributors", "person_name")
129
- Array(person).select { |a| a["contributor_role"] == "author" }.map do |a|
130
- { "@type" => "Person",
131
- "@id" => parse_attribute(a["ORCID"]),
132
- "givenName" => a["given_name"],
133
- "familyName" => a["surname"] }.compact
134
- end
126
+ people("author")
135
127
  end
136
128
 
137
129
  def editor
130
+ people("editor")
131
+ end
132
+
133
+ def people(contributor_role)
138
134
  person = bibliographic_metadata.dig("contributors", "person_name")
139
- Array(person).select { |a| a["contributor_role"] == "editor" }.map do |a|
135
+ Array(person).select { |a| a["contributor_role"] == contributor_role }.map do |a|
140
136
  { "@type" => "Person",
141
137
  "@id" => parse_attribute(a["ORCID"]),
142
138
  "givenName" => a["given_name"],
@@ -144,14 +140,6 @@ module Bolognese
144
140
  end.presence
145
141
  end
146
142
 
147
- def version
148
-
149
- end
150
-
151
- def date_created
152
-
153
- end
154
-
155
143
  def date_published
156
144
  pub_date = bibliographic_metadata.fetch("publication_date", nil) ||
157
145
  bibliographic_metadata.fetch("acceptance_date", nil)
@@ -188,15 +176,11 @@ module Bolognese
188
176
  is_part_of.fetch("name", nil)
189
177
  end
190
178
 
191
- def has_part
192
-
193
- end
194
-
195
179
  def citation
196
180
  citations = bibliographic_metadata.dig("citation_list", "citation")
197
181
  Array.wrap(citations).map do |c|
198
182
  { "@type" => "CreativeWork",
199
- "@id" => normalize_url(c["doi"]),
183
+ "@id" => normalize_id(c["doi"]),
200
184
  "position" => c["key"],
201
185
  "name" => c["article_title"],
202
186
  "datePublished" => c["cYear"] }.compact
@@ -207,27 +191,5 @@ module Bolognese
207
191
  { "@type" => "Organization",
208
192
  "name" => "Crossref" }
209
193
  end
210
-
211
- def as_schema_org
212
- { "@context" => "http://schema.org",
213
- "@type" => type,
214
- "@id" => id,
215
- "additionalType" => additional_type,
216
- "name" => name,
217
- "alternateName" => alternate_name,
218
- "author" => author,
219
- "editor" => editor,
220
- "description" => description,
221
- "license" => license,
222
- "datePublished" => date_published,
223
- "dateModified" => date_modified,
224
- "pageStart" => page_start,
225
- "pageEnd" => page_end,
226
- "isPartOf" => is_part_of,
227
- "hasPart" => has_part,
228
- "citation" => citation,
229
- "provider" => provider
230
- }.compact
231
- end
232
194
  end
233
195
  end
@@ -18,8 +18,6 @@ module Bolognese
18
18
  "Other" => "CreativeWork"
19
19
  }
20
20
 
21
- attr_reader = :id, :raw, :metadata, :schema_org
22
-
23
21
  def initialize(id: nil, string: nil)
24
22
  id = normalize_doi(id) if id.present?
25
23
 
@@ -31,6 +29,9 @@ module Bolognese
31
29
  end
32
30
  end
33
31
 
32
+ alias_method :datacite, :raw
33
+ alias_method :schema_org, :as_schema_org
34
+
34
35
  def metadata
35
36
  @metadata ||= raw.present? ? Maremma.from_xml(raw).fetch("resource", {}) : {}
36
37
  end
@@ -79,17 +80,19 @@ module Bolognese
79
80
  end
80
81
 
81
82
  def keywords
82
- Array(metadata.dig("subjects", "subject")).join(", ")
83
+ Array.wrap(metadata.dig("subjects", "subject")).join(", ").presence
83
84
  end
84
85
 
85
86
  def author
86
87
  authors = metadata.dig("creators", "creator")
87
88
  authors = [authors] if authors.is_a?(Hash)
88
- get_authors(authors)
89
+ get_authors(authors).presence
89
90
  end
90
91
 
91
92
  def editor
92
-
93
+ editors = metadata.dig("contributors", "contributor")
94
+ editors = [editors] if editors.is_a?(Hash)
95
+ get_authors(editors).presence
93
96
  end
94
97
 
95
98
  def version
@@ -100,19 +103,21 @@ module Bolognese
100
103
  Array.wrap(metadata.dig("dates", "date"))
101
104
  end
102
105
 
106
+ def date(date_type)
107
+ dd = dates.find { |d| d["dateType"] == date_type } || {}
108
+ dd.fetch("text", nil)
109
+ end
110
+
103
111
  def date_created
104
- created = dates.find { |d| d["dateType"] == "Created" } || {}
105
- created.fetch("text", nil)
112
+ date("Created")
106
113
  end
107
114
 
108
115
  def date_published
109
- published = dates.find { |d| d["dateType"] == "Issued" } || {}
110
- published.fetch("text", nil) || metadata.fetch("publicationYear")
116
+ date("Issued") || metadata.fetch("publicationYear")
111
117
  end
112
118
 
113
119
  def date_modified
114
- modified = dates.find { |d| d["dateType"] == "Updated" } || {}
115
- modified.fetch("text", nil)
120
+ date("Updated")
116
121
  end
117
122
 
118
123
  def related_identifiers(relation_type)
@@ -150,27 +155,5 @@ module Bolognese
150
155
  { "@type" => "Organization",
151
156
  "name" => "DataCite" }
152
157
  end
153
-
154
- def as_schema_org
155
- { "@context" => "http://schema.org",
156
- "@type" => type,
157
- "@id" => id,
158
- "name" => name,
159
- "alternateName" => alternate_name,
160
- "author" => author,
161
- "description" => description,
162
- "license" => license,
163
- "version" => version,
164
- "keywords" => keywords,
165
- "dateCreated" => date_created,
166
- "datePublished" => date_published,
167
- "dateModified" => date_modified,
168
- "isPartOf" => is_part_of,
169
- "hasPart" => has_part,
170
- "citation" => citation,
171
- "publisher" => publisher,
172
- "provider" => provider
173
- }.compact
174
- end
175
158
  end
176
159
  end
@@ -2,16 +2,24 @@ module Bolognese
2
2
  module DataciteUtils
3
3
 
4
4
  SO_TO_DC_TRANSLATIONS = {
5
- "VideoObject" => "Audiovisual",
5
+ "Article" => "Text",
6
+ "AudioObject" => "Sound",
7
+ "Blog" => "Text",
8
+ "BlogPosting" => "Text",
6
9
  "Collection" => "Collection",
10
+ "CreativeWork" => "Other",
11
+ "DataCatalog" => "Dataset",
7
12
  "Dataset" => "Dataset",
8
13
  "Event" => "Event",
9
14
  "ImageObject" => "Image",
15
+ "Movie" => "Audiovisual",
16
+ "PublicationIssue" => "Text",
17
+ "ScholarlyArticle" => "Text",
10
18
  "Service" => "Service",
11
19
  "SoftwareSourceCode" => "Software",
12
- "AudioObject" => "Sound",
13
- "ScholarlyArticle" => "Text",
14
- "CreativeWork" => "Other"
20
+ "VideoObject" => "Audiovisual",
21
+ "WebPage" => "Text",
22
+ "WebSite" => "Text"
15
23
  }
16
24
 
17
25
  LICENSE_NAMES = {
@@ -114,13 +122,19 @@ module Bolognese
114
122
  end
115
123
 
116
124
  def insert_publication_year(xml)
117
- xml.publicationYear(date_published[0..3])
125
+ xml.publicationYear(date_published && date_published[0..3])
126
+ end
127
+
128
+ def resource_type
129
+ { "resource_type_general" => SO_TO_DC_TRANSLATIONS[type] || "Other",
130
+ "text" => additional_type || type }
118
131
  end
119
132
 
120
133
  def insert_resource_type(xml)
121
134
  return xml unless type.present?
122
135
 
123
- xml.resourceType(additional_type, 'resourceTypeGeneral' => SO_TO_DC_TRANSLATIONS[type])
136
+ xml.resourceType(resource_type["text"],
137
+ 'resourceTypeGeneral' => resource_type["resource_type_general"])
124
138
  end
125
139
 
126
140
  def insert_alternate_identifiers(xml)
@@ -160,28 +174,17 @@ module Bolognese
160
174
  end
161
175
 
162
176
  def rel_identifiers
163
- ipo = Array.wrap(is_part_of).map do |i|
164
- {
165
- "text" => i["@id"],
166
- "related_identifier_type" => validate_url(i["@id"]),
167
- "relation_type" => "IsPartOf" }
168
- end.select { |i| i["related_identifier_type"].present? }
169
-
170
- hp = Array.wrap(has_part).map do |i|
171
- {
172
- "text" => i["@id"],
173
- "related_identifier_type" => validate_url(i["@id"]),
174
- "relation_type" => "HasPart" }
175
- end.select { |i| i["related_identifier_type"].present? }
177
+ rel_identifier(rel_ids: is_part_of, relation_type: "IsPartOf") +
178
+ rel_identifier(rel_ids: has_part, relation_type: "HasPart") +
179
+ rel_identifier(rel_ids: citation, relation_type: "References")
180
+ end
176
181
 
177
- c = Array.wrap(citation).map do |i|
178
- {
179
- "text" => i["@id"],
182
+ def rel_identifier(rel_ids: nil, relation_type: nil)
183
+ Array.wrap(rel_ids).map do |i|
184
+ { "text" => i["@id"],
180
185
  "related_identifier_type" => validate_url(i["@id"]),
181
- "relation_type" => "References" }
186
+ "relation_type" => relation_type }
182
187
  end.select { |i| i["related_identifier_type"].present? }
183
-
184
- ipo + hp + c
185
188
  end
186
189
 
187
190
  def insert_related_identifiers(xml)
@@ -9,7 +9,7 @@ module Bolognese
9
9
  return nil unless doi.present?
10
10
 
11
11
  # remove non-printing whitespace and downcase
12
- doi = doi.gsub(/\u200B/, '').downcase
12
+ doi = doi.delete("\u200B").downcase
13
13
 
14
14
  # turn DOI into URL, escape unsafe characters
15
15
  "https://doi.org/" + Addressable::URI.encode(doi)
@@ -14,23 +14,65 @@ module Bolognese
14
14
 
15
15
  attr_reader :id, :raw, :provider
16
16
 
17
- def initialize(id: nil)
18
- @id = normalize_id(id)
19
- @provider = find_provider(@id)
17
+ alias_method :datacite, :as_datacite
18
+
19
+ def url
20
+
21
+ end
22
+
23
+ def version
24
+
25
+ end
26
+
27
+ def keywords
28
+
29
+ end
30
+
31
+ def date_created
32
+
20
33
  end
21
34
 
22
- def normalize_id(id)
23
- normalize_doi(id) || normalize_orcid(id)
35
+ def page_start
36
+
37
+ end
38
+
39
+ def page_end
40
+
41
+ end
42
+
43
+ def has_part
44
+
45
+ end
46
+
47
+ def publisher
48
+
24
49
  end
25
50
 
26
- def find_provider(id)
27
- if /\A(?:(http|https):\/\/(dx\.)?doi.org\/)?(doi:)?(10\.\d{4,5}\/.+)\z/.match(id)
28
- get_doi_ra(id).fetch("id", nil)
29
- elsif /\A(?:(http|https):\/\/orcid\.org\/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X]+)\z/.match(id)
30
- "orcid"
31
- else
32
- "schema_org"
33
- end
51
+ def as_schema_org
52
+ { "@context" => "http://schema.org",
53
+ "@type" => type,
54
+ "@id" => id,
55
+ "url" => url,
56
+ "additionalType" => additional_type,
57
+ "name" => name,
58
+ "alternateName" => alternate_name,
59
+ "author" => author,
60
+ "editor" => editor,
61
+ "description" => description,
62
+ "license" => license,
63
+ "version" => version,
64
+ "keywords" => keywords,
65
+ "dateCreated" => date_created,
66
+ "datePublished" => date_published,
67
+ "dateModified" => date_modified,
68
+ "pageStart" => page_start,
69
+ "pageEnd" => page_end,
70
+ "isPartOf" => is_part_of,
71
+ "hasPart" => has_part,
72
+ "citation" => citation,
73
+ "publisher" => publisher,
74
+ "provider" => provider
75
+ }.compact.to_json
34
76
  end
35
77
  end
36
78
  end
@@ -1,46 +1,22 @@
1
1
  module Bolognese
2
2
  class SchemaOrg < Metadata
3
3
 
4
- DC_TO_SO_TRANSLATIONS = {
5
- "Audiovisual" => "VideoObject",
6
- "Collection" => "Collection",
7
- "Dataset" => "Dataset",
8
- "Event" => "Event",
9
- "Image" => "ImageObject",
10
- "InteractiveResource" => nil,
11
- "Model" => nil,
12
- "PhysicalObject" => nil,
13
- "Service" => "Service",
14
- "Software" => "SoftwareSourceCode",
15
- "Sound" => "AudioObject",
16
- "Text" => "ScholarlyArticle",
17
- "Workflow" => nil,
18
- "Other" => "CreativeWork"
19
- }
20
-
21
- attr_reader = :id, :raw, :metadata, :schema_org
22
-
23
4
  def initialize(id: nil, string: nil)
24
- id = normalize_url(id) if id.present?
5
+ id = normalize_id(id) if id.present?
25
6
 
26
7
  if string.present?
27
8
  @raw = string
28
9
  elsif id.present?
29
10
  response = Maremma.get(id)
30
- @raw = response.body.fetch("data", nil)
11
+ doc = Nokogiri::XML(response.body.fetch("data", nil))
12
+ @raw = doc.at_xpath('//script[@type="application/ld+json"]')
31
13
  end
32
14
  end
33
15
 
16
+ alias_method :schema_org, :as_schema_org
17
+
34
18
  def metadata
35
- @metadata ||= begin
36
- if raw.present?
37
- doc = Nokogiri::XML(raw)
38
- tag = doc.at_xpath('//script[@type="application/ld+json"]')
39
- Maremma.from_json(tag)
40
- else
41
- {}
42
- end
43
- end
19
+ @metadata ||= raw.present? ? Maremma.from_json(raw) : {}
44
20
  end
45
21
 
46
22
  def exists?
@@ -52,11 +28,11 @@ module Bolognese
52
28
  end
53
29
 
54
30
  def id
55
- normalize_url(metadata.fetch("@id", nil))
31
+ normalize_id(metadata.fetch("@id", nil))
56
32
  end
57
33
 
58
34
  def url
59
- normalize_url(metadata.fetch("url", nil))
35
+ normalize_id(metadata.fetch("url", nil))
60
36
  end
61
37
 
62
38
  def type
@@ -76,11 +52,11 @@ module Bolognese
76
52
  end
77
53
 
78
54
  def author
79
- Array(metadata.fetch("author", nil)).map { |a| a.except("name") }
55
+ Array(metadata.fetch("author", nil)).map { |a| a.except("name") }.presence
80
56
  end
81
57
 
82
58
  def editor
83
- Array(metadata.fetch("editor", nil)).map { |a| a.except("name") }
59
+ Array(metadata.fetch("editor", nil)).map { |a| a.except("name") }.presence
84
60
  end
85
61
 
86
62
  def description
@@ -120,11 +96,11 @@ module Bolognese
120
96
  end
121
97
 
122
98
  def has_part
123
- related_identifiers("hasPart")
99
+ related_identifiers("hasPart").presence
124
100
  end
125
101
 
126
102
  def citation
127
- related_identifiers("citation")
103
+ related_identifiers("citation").presence
128
104
  end
129
105
 
130
106
  def publisher
@@ -142,29 +118,5 @@ module Bolognese
142
118
  def provider
143
119
  metadata.fetch("provider", nil)
144
120
  end
145
-
146
- def as_schema_org
147
- { "@context" => "http://schema.org",
148
- "@type" => type,
149
- "@id" => id,
150
- "url" => url,
151
- "name" => name,
152
- "alternateName" => alternate_name,
153
- "author" => author,
154
- "editor" => editor,
155
- "description" => description,
156
- "license" => license,
157
- "version" => version,
158
- "keywords" => keywords,
159
- "dateCreated" => date_created,
160
- "datePublished" => date_published,
161
- "dateModified" => date_modified,
162
- "isPartOf" => is_part_of,
163
- "hasPart" => has_part,
164
- "citation" => citation,
165
- "publisher" => publisher,
166
- "provider" => provider
167
- }.compact
168
- end
169
121
  end
170
122
  end