bolognese 0.4.3 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +7 -1
  3. data/Gemfile.lock +1 -1
  4. data/README.md +326 -11
  5. data/lib/bolognese/author_utils.rb +7 -5
  6. data/lib/bolognese/cli.rb +20 -19
  7. data/lib/bolognese/crossref.rb +11 -49
  8. data/lib/bolognese/datacite.rb +16 -33
  9. data/lib/bolognese/datacite_utils.rb +28 -25
  10. data/lib/bolognese/doi_utils.rb +1 -1
  11. data/lib/bolognese/metadata.rb +55 -13
  12. data/lib/bolognese/schema_org.rb +12 -60
  13. data/lib/bolognese/utils.rb +24 -12
  14. data/lib/bolognese/version.rb +1 -1
  15. data/spec/cli_spec.rb +13 -0
  16. data/spec/crossref_spec.rb +6 -1
  17. data/spec/datacite_spec.rb +6 -1
  18. data/spec/fixtures/schema_org.json +44 -0
  19. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/read/crossref/default.yml +760 -0
  20. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/read/datacite/default.yml +214 -0
  21. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/read/schema_org/default.yml +653 -0
  22. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_metadata/Schema_org_JSON.yml +719 -0
  23. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/normalize_id/doi.yml +930 -0
  24. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/normalize_id/url.yml +930 -0
  25. data/spec/fixtures/vcr_cassettes/Bolognese_Datacite/get_metadata/Schema_org_JSON.yml +173 -0
  26. data/spec/fixtures/vcr_cassettes/Bolognese_Metadata/find_PID_provider/crossref_doi_not_url.yml +2 -2
  27. data/spec/fixtures/vcr_cassettes/Bolognese_SchemaOrg/get_metadata/BlogPosting.yml +42 -21
  28. data/spec/fixtures/vcr_cassettes/Bolognese_SchemaOrg/get_metadata/BlogPosting_schema_org_JSON.yml +653 -0
  29. data/spec/fixtures/vcr_cassettes/Bolognese_SchemaOrg/get_metadata_as_datacite_xml/with_data_citation.yml +653 -0
  30. data/spec/metadata_spec.rb +9 -12
  31. data/spec/schema_org_spec.rb +41 -3
  32. data/spec/utils_spec.rb +3 -3
  33. metadata +12 -2
@@ -2,8 +2,14 @@
2
2
 
3
3
  require "thor"
4
4
 
5
+ require_relative 'doi_utils'
6
+ require_relative 'utils'
7
+
5
8
  module Bolognese
6
9
  class CLI < Thor
10
+ include Bolognese::DoiUtils
11
+ include Bolognese::Utils
12
+
7
13
  def self.exit_on_failure?
8
14
  true
9
15
  end
@@ -16,26 +22,21 @@ module Bolognese
16
22
  puts Bolognese::VERSION
17
23
  end
18
24
 
19
- desc "read pid", "read metadata for PID"
25
+ desc "read id", "read metadata for ID"
20
26
  method_option :as, default: "schema_org"
21
- def read(pid)
22
- provider = Metadata.new(id: pid).provider
23
-
24
- case
25
- when provider == "crossref" && options[:as] == "crossref"
26
- puts Crossref.new(id: pid).raw
27
- when provider == "crossref" && options[:as] == "datacite"
28
- puts Crossref.new(id: pid).as_datacite
29
- when provider == "crossref"
30
- puts Crossref.new(id: pid).as_schema_org
31
- when provider == "datacite" && options[:as] == "datacite"
32
- puts Datacite.new(id: pid).raw
33
- when provider == "datacite"
34
- puts Datacite.new(id: pid).as_schema_org
35
- when provider == "schema_org" && options[:as] == "datacite"
36
- puts SchemaOrg.new(id: pid).as_datacite
37
- when provider == "schema_org"
38
- puts SchemaOrg.new(id: pid).as_schema_org
27
+ def read(id)
28
+ id = normalize_id(id)
29
+ provider = find_provider(id)
30
+ output = options[:as] || "schema_org"
31
+
32
+ if provider.present?
33
+ p = case provider
34
+ when "crossref" then Crossref.new(id: id)
35
+ when "datacite" then Datacite.new(id: id)
36
+ else SchemaOrg.new(id: id)
37
+ end
38
+
39
+ puts p.send(output)
39
40
  else
40
41
  puts "not implemented"
41
42
  end
@@ -31,8 +31,6 @@ module Bolognese
31
31
  "PostedContent" => nil
32
32
  }
33
33
 
34
- attr_reader = :id, :raw, :metadata, :schema_org
35
-
36
34
  def initialize(id: nil, string: nil)
37
35
  id = normalize_doi(id) if id.present?
38
36
 
@@ -44,6 +42,10 @@ module Bolognese
44
42
  end
45
43
  end
46
44
 
45
+ alias_method :crossref, :raw
46
+ alias_method :as_crossref, :raw
47
+ alias_method :schema_org, :as_schema_org
48
+
47
49
  def metadata
48
50
  @metadata ||= raw.present? ? Maremma.from_xml(raw).fetch("doi_records", {}).fetch("doi_record", {}) : {}
49
51
  end
@@ -120,23 +122,17 @@ module Bolognese
120
122
  end
121
123
  end
122
124
 
123
- def keywords
124
-
125
- end
126
-
127
125
  def author
128
- person = bibliographic_metadata.dig("contributors", "person_name")
129
- Array(person).select { |a| a["contributor_role"] == "author" }.map do |a|
130
- { "@type" => "Person",
131
- "@id" => parse_attribute(a["ORCID"]),
132
- "givenName" => a["given_name"],
133
- "familyName" => a["surname"] }.compact
134
- end
126
+ people("author")
135
127
  end
136
128
 
137
129
  def editor
130
+ people("editor")
131
+ end
132
+
133
+ def people(contributor_role)
138
134
  person = bibliographic_metadata.dig("contributors", "person_name")
139
- Array(person).select { |a| a["contributor_role"] == "editor" }.map do |a|
135
+ Array(person).select { |a| a["contributor_role"] == contributor_role }.map do |a|
140
136
  { "@type" => "Person",
141
137
  "@id" => parse_attribute(a["ORCID"]),
142
138
  "givenName" => a["given_name"],
@@ -144,14 +140,6 @@ module Bolognese
144
140
  end.presence
145
141
  end
146
142
 
147
- def version
148
-
149
- end
150
-
151
- def date_created
152
-
153
- end
154
-
155
143
  def date_published
156
144
  pub_date = bibliographic_metadata.fetch("publication_date", nil) ||
157
145
  bibliographic_metadata.fetch("acceptance_date", nil)
@@ -188,15 +176,11 @@ module Bolognese
188
176
  is_part_of.fetch("name", nil)
189
177
  end
190
178
 
191
- def has_part
192
-
193
- end
194
-
195
179
  def citation
196
180
  citations = bibliographic_metadata.dig("citation_list", "citation")
197
181
  Array.wrap(citations).map do |c|
198
182
  { "@type" => "CreativeWork",
199
- "@id" => normalize_url(c["doi"]),
183
+ "@id" => normalize_id(c["doi"]),
200
184
  "position" => c["key"],
201
185
  "name" => c["article_title"],
202
186
  "datePublished" => c["cYear"] }.compact
@@ -207,27 +191,5 @@ module Bolognese
207
191
  { "@type" => "Organization",
208
192
  "name" => "Crossref" }
209
193
  end
210
-
211
- def as_schema_org
212
- { "@context" => "http://schema.org",
213
- "@type" => type,
214
- "@id" => id,
215
- "additionalType" => additional_type,
216
- "name" => name,
217
- "alternateName" => alternate_name,
218
- "author" => author,
219
- "editor" => editor,
220
- "description" => description,
221
- "license" => license,
222
- "datePublished" => date_published,
223
- "dateModified" => date_modified,
224
- "pageStart" => page_start,
225
- "pageEnd" => page_end,
226
- "isPartOf" => is_part_of,
227
- "hasPart" => has_part,
228
- "citation" => citation,
229
- "provider" => provider
230
- }.compact
231
- end
232
194
  end
233
195
  end
@@ -18,8 +18,6 @@ module Bolognese
18
18
  "Other" => "CreativeWork"
19
19
  }
20
20
 
21
- attr_reader = :id, :raw, :metadata, :schema_org
22
-
23
21
  def initialize(id: nil, string: nil)
24
22
  id = normalize_doi(id) if id.present?
25
23
 
@@ -31,6 +29,9 @@ module Bolognese
31
29
  end
32
30
  end
33
31
 
32
+ alias_method :datacite, :raw
33
+ alias_method :schema_org, :as_schema_org
34
+
34
35
  def metadata
35
36
  @metadata ||= raw.present? ? Maremma.from_xml(raw).fetch("resource", {}) : {}
36
37
  end
@@ -79,17 +80,19 @@ module Bolognese
79
80
  end
80
81
 
81
82
  def keywords
82
- Array(metadata.dig("subjects", "subject")).join(", ")
83
+ Array.wrap(metadata.dig("subjects", "subject")).join(", ").presence
83
84
  end
84
85
 
85
86
  def author
86
87
  authors = metadata.dig("creators", "creator")
87
88
  authors = [authors] if authors.is_a?(Hash)
88
- get_authors(authors)
89
+ get_authors(authors).presence
89
90
  end
90
91
 
91
92
  def editor
92
-
93
+ editors = metadata.dig("contributors", "contributor")
94
+ editors = [editors] if editors.is_a?(Hash)
95
+ get_authors(editors).presence
93
96
  end
94
97
 
95
98
  def version
@@ -100,19 +103,21 @@ module Bolognese
100
103
  Array.wrap(metadata.dig("dates", "date"))
101
104
  end
102
105
 
106
+ def date(date_type)
107
+ dd = dates.find { |d| d["dateType"] == date_type } || {}
108
+ dd.fetch("text", nil)
109
+ end
110
+
103
111
  def date_created
104
- created = dates.find { |d| d["dateType"] == "Created" } || {}
105
- created.fetch("text", nil)
112
+ date("Created")
106
113
  end
107
114
 
108
115
  def date_published
109
- published = dates.find { |d| d["dateType"] == "Issued" } || {}
110
- published.fetch("text", nil) || metadata.fetch("publicationYear")
116
+ date("Issued") || metadata.fetch("publicationYear")
111
117
  end
112
118
 
113
119
  def date_modified
114
- modified = dates.find { |d| d["dateType"] == "Updated" } || {}
115
- modified.fetch("text", nil)
120
+ date("Updated")
116
121
  end
117
122
 
118
123
  def related_identifiers(relation_type)
@@ -150,27 +155,5 @@ module Bolognese
150
155
  { "@type" => "Organization",
151
156
  "name" => "DataCite" }
152
157
  end
153
-
154
- def as_schema_org
155
- { "@context" => "http://schema.org",
156
- "@type" => type,
157
- "@id" => id,
158
- "name" => name,
159
- "alternateName" => alternate_name,
160
- "author" => author,
161
- "description" => description,
162
- "license" => license,
163
- "version" => version,
164
- "keywords" => keywords,
165
- "dateCreated" => date_created,
166
- "datePublished" => date_published,
167
- "dateModified" => date_modified,
168
- "isPartOf" => is_part_of,
169
- "hasPart" => has_part,
170
- "citation" => citation,
171
- "publisher" => publisher,
172
- "provider" => provider
173
- }.compact
174
- end
175
158
  end
176
159
  end
@@ -2,16 +2,24 @@ module Bolognese
2
2
  module DataciteUtils
3
3
 
4
4
  SO_TO_DC_TRANSLATIONS = {
5
- "VideoObject" => "Audiovisual",
5
+ "Article" => "Text",
6
+ "AudioObject" => "Sound",
7
+ "Blog" => "Text",
8
+ "BlogPosting" => "Text",
6
9
  "Collection" => "Collection",
10
+ "CreativeWork" => "Other",
11
+ "DataCatalog" => "Dataset",
7
12
  "Dataset" => "Dataset",
8
13
  "Event" => "Event",
9
14
  "ImageObject" => "Image",
15
+ "Movie" => "Audiovisual",
16
+ "PublicationIssue" => "Text",
17
+ "ScholarlyArticle" => "Text",
10
18
  "Service" => "Service",
11
19
  "SoftwareSourceCode" => "Software",
12
- "AudioObject" => "Sound",
13
- "ScholarlyArticle" => "Text",
14
- "CreativeWork" => "Other"
20
+ "VideoObject" => "Audiovisual",
21
+ "WebPage" => "Text",
22
+ "WebSite" => "Text"
15
23
  }
16
24
 
17
25
  LICENSE_NAMES = {
@@ -114,13 +122,19 @@ module Bolognese
114
122
  end
115
123
 
116
124
  def insert_publication_year(xml)
117
- xml.publicationYear(date_published[0..3])
125
+ xml.publicationYear(date_published && date_published[0..3])
126
+ end
127
+
128
+ def resource_type
129
+ { "resource_type_general" => SO_TO_DC_TRANSLATIONS[type] || "Other",
130
+ "text" => additional_type || type }
118
131
  end
119
132
 
120
133
  def insert_resource_type(xml)
121
134
  return xml unless type.present?
122
135
 
123
- xml.resourceType(additional_type, 'resourceTypeGeneral' => SO_TO_DC_TRANSLATIONS[type])
136
+ xml.resourceType(resource_type["text"],
137
+ 'resourceTypeGeneral' => resource_type["resource_type_general"])
124
138
  end
125
139
 
126
140
  def insert_alternate_identifiers(xml)
@@ -160,28 +174,17 @@ module Bolognese
160
174
  end
161
175
 
162
176
  def rel_identifiers
163
- ipo = Array.wrap(is_part_of).map do |i|
164
- {
165
- "text" => i["@id"],
166
- "related_identifier_type" => validate_url(i["@id"]),
167
- "relation_type" => "IsPartOf" }
168
- end.select { |i| i["related_identifier_type"].present? }
169
-
170
- hp = Array.wrap(has_part).map do |i|
171
- {
172
- "text" => i["@id"],
173
- "related_identifier_type" => validate_url(i["@id"]),
174
- "relation_type" => "HasPart" }
175
- end.select { |i| i["related_identifier_type"].present? }
177
+ rel_identifier(rel_ids: is_part_of, relation_type: "IsPartOf") +
178
+ rel_identifier(rel_ids: has_part, relation_type: "HasPart") +
179
+ rel_identifier(rel_ids: citation, relation_type: "References")
180
+ end
176
181
 
177
- c = Array.wrap(citation).map do |i|
178
- {
179
- "text" => i["@id"],
182
+ def rel_identifier(rel_ids: nil, relation_type: nil)
183
+ Array.wrap(rel_ids).map do |i|
184
+ { "text" => i["@id"],
180
185
  "related_identifier_type" => validate_url(i["@id"]),
181
- "relation_type" => "References" }
186
+ "relation_type" => relation_type }
182
187
  end.select { |i| i["related_identifier_type"].present? }
183
-
184
- ipo + hp + c
185
188
  end
186
189
 
187
190
  def insert_related_identifiers(xml)
@@ -9,7 +9,7 @@ module Bolognese
9
9
  return nil unless doi.present?
10
10
 
11
11
  # remove non-printing whitespace and downcase
12
- doi = doi.gsub(/\u200B/, '').downcase
12
+ doi = doi.delete("\u200B").downcase
13
13
 
14
14
  # turn DOI into URL, escape unsafe characters
15
15
  "https://doi.org/" + Addressable::URI.encode(doi)
@@ -14,23 +14,65 @@ module Bolognese
14
14
 
15
15
  attr_reader :id, :raw, :provider
16
16
 
17
- def initialize(id: nil)
18
- @id = normalize_id(id)
19
- @provider = find_provider(@id)
17
+ alias_method :datacite, :as_datacite
18
+
19
+ def url
20
+
21
+ end
22
+
23
+ def version
24
+
25
+ end
26
+
27
+ def keywords
28
+
29
+ end
30
+
31
+ def date_created
32
+
20
33
  end
21
34
 
22
- def normalize_id(id)
23
- normalize_doi(id) || normalize_orcid(id)
35
+ def page_start
36
+
37
+ end
38
+
39
+ def page_end
40
+
41
+ end
42
+
43
+ def has_part
44
+
45
+ end
46
+
47
+ def publisher
48
+
24
49
  end
25
50
 
26
- def find_provider(id)
27
- if /\A(?:(http|https):\/\/(dx\.)?doi.org\/)?(doi:)?(10\.\d{4,5}\/.+)\z/.match(id)
28
- get_doi_ra(id).fetch("id", nil)
29
- elsif /\A(?:(http|https):\/\/orcid\.org\/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X]+)\z/.match(id)
30
- "orcid"
31
- else
32
- "schema_org"
33
- end
51
+ def as_schema_org
52
+ { "@context" => "http://schema.org",
53
+ "@type" => type,
54
+ "@id" => id,
55
+ "url" => url,
56
+ "additionalType" => additional_type,
57
+ "name" => name,
58
+ "alternateName" => alternate_name,
59
+ "author" => author,
60
+ "editor" => editor,
61
+ "description" => description,
62
+ "license" => license,
63
+ "version" => version,
64
+ "keywords" => keywords,
65
+ "dateCreated" => date_created,
66
+ "datePublished" => date_published,
67
+ "dateModified" => date_modified,
68
+ "pageStart" => page_start,
69
+ "pageEnd" => page_end,
70
+ "isPartOf" => is_part_of,
71
+ "hasPart" => has_part,
72
+ "citation" => citation,
73
+ "publisher" => publisher,
74
+ "provider" => provider
75
+ }.compact.to_json
34
76
  end
35
77
  end
36
78
  end
@@ -1,46 +1,22 @@
1
1
  module Bolognese
2
2
  class SchemaOrg < Metadata
3
3
 
4
- DC_TO_SO_TRANSLATIONS = {
5
- "Audiovisual" => "VideoObject",
6
- "Collection" => "Collection",
7
- "Dataset" => "Dataset",
8
- "Event" => "Event",
9
- "Image" => "ImageObject",
10
- "InteractiveResource" => nil,
11
- "Model" => nil,
12
- "PhysicalObject" => nil,
13
- "Service" => "Service",
14
- "Software" => "SoftwareSourceCode",
15
- "Sound" => "AudioObject",
16
- "Text" => "ScholarlyArticle",
17
- "Workflow" => nil,
18
- "Other" => "CreativeWork"
19
- }
20
-
21
- attr_reader = :id, :raw, :metadata, :schema_org
22
-
23
4
  def initialize(id: nil, string: nil)
24
- id = normalize_url(id) if id.present?
5
+ id = normalize_id(id) if id.present?
25
6
 
26
7
  if string.present?
27
8
  @raw = string
28
9
  elsif id.present?
29
10
  response = Maremma.get(id)
30
- @raw = response.body.fetch("data", nil)
11
+ doc = Nokogiri::XML(response.body.fetch("data", nil))
12
+ @raw = doc.at_xpath('//script[@type="application/ld+json"]')
31
13
  end
32
14
  end
33
15
 
16
+ alias_method :schema_org, :as_schema_org
17
+
34
18
  def metadata
35
- @metadata ||= begin
36
- if raw.present?
37
- doc = Nokogiri::XML(raw)
38
- tag = doc.at_xpath('//script[@type="application/ld+json"]')
39
- Maremma.from_json(tag)
40
- else
41
- {}
42
- end
43
- end
19
+ @metadata ||= raw.present? ? Maremma.from_json(raw) : {}
44
20
  end
45
21
 
46
22
  def exists?
@@ -52,11 +28,11 @@ module Bolognese
52
28
  end
53
29
 
54
30
  def id
55
- normalize_url(metadata.fetch("@id", nil))
31
+ normalize_id(metadata.fetch("@id", nil))
56
32
  end
57
33
 
58
34
  def url
59
- normalize_url(metadata.fetch("url", nil))
35
+ normalize_id(metadata.fetch("url", nil))
60
36
  end
61
37
 
62
38
  def type
@@ -76,11 +52,11 @@ module Bolognese
76
52
  end
77
53
 
78
54
  def author
79
- Array(metadata.fetch("author", nil)).map { |a| a.except("name") }
55
+ Array(metadata.fetch("author", nil)).map { |a| a.except("name") }.presence
80
56
  end
81
57
 
82
58
  def editor
83
- Array(metadata.fetch("editor", nil)).map { |a| a.except("name") }
59
+ Array(metadata.fetch("editor", nil)).map { |a| a.except("name") }.presence
84
60
  end
85
61
 
86
62
  def description
@@ -120,11 +96,11 @@ module Bolognese
120
96
  end
121
97
 
122
98
  def has_part
123
- related_identifiers("hasPart")
99
+ related_identifiers("hasPart").presence
124
100
  end
125
101
 
126
102
  def citation
127
- related_identifiers("citation")
103
+ related_identifiers("citation").presence
128
104
  end
129
105
 
130
106
  def publisher
@@ -142,29 +118,5 @@ module Bolognese
142
118
  def provider
143
119
  metadata.fetch("provider", nil)
144
120
  end
145
-
146
- def as_schema_org
147
- { "@context" => "http://schema.org",
148
- "@type" => type,
149
- "@id" => id,
150
- "url" => url,
151
- "name" => name,
152
- "alternateName" => alternate_name,
153
- "author" => author,
154
- "editor" => editor,
155
- "description" => description,
156
- "license" => license,
157
- "version" => version,
158
- "keywords" => keywords,
159
- "dateCreated" => date_created,
160
- "datePublished" => date_published,
161
- "dateModified" => date_modified,
162
- "isPartOf" => is_part_of,
163
- "hasPart" => has_part,
164
- "citation" => citation,
165
- "publisher" => publisher,
166
- "provider" => provider
167
- }.compact
168
- end
169
121
  end
170
122
  end