bolognese 0.7.2 → 0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -1
  3. data/README.md +25 -16
  4. data/bolognese.gemspec +2 -1
  5. data/codemeta.json +39 -0
  6. data/lib/bolognese.rb +4 -0
  7. data/lib/bolognese/array.rb +11 -0
  8. data/lib/bolognese/author_utils.rb +35 -21
  9. data/lib/bolognese/bibtex.rb +4 -4
  10. data/lib/bolognese/codemeta.rb +8 -13
  11. data/lib/bolognese/crossref.rb +22 -20
  12. data/lib/bolognese/datacite.rb +61 -61
  13. data/lib/bolognese/datacite_json.rb +208 -0
  14. data/lib/bolognese/datacite_utils.rb +17 -48
  15. data/lib/bolognese/metadata.rb +83 -22
  16. data/lib/bolognese/schema_org.rb +42 -16
  17. data/lib/bolognese/utils.rb +79 -13
  18. data/lib/bolognese/version.rb +1 -1
  19. data/lib/bolognese/whitelist_scrubber.rb +45 -0
  20. data/spec/array_spec.rb +20 -0
  21. data/spec/author_utils_spec.rb +93 -9
  22. data/spec/bibtex_spec.rb +4 -4
  23. data/spec/cli_spec.rb +5 -0
  24. data/spec/codemeta_spec.rb +41 -31
  25. data/spec/crossref_spec.rb +47 -72
  26. data/spec/datacite_json_spec.rb +65 -0
  27. data/spec/datacite_spec.rb +67 -83
  28. data/spec/datacite_utils_spec.rb +9 -14
  29. data/spec/fixtures/datacite.json +49 -0
  30. data/spec/fixtures/datacite_software.json +18 -0
  31. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/convert_from_id/datacite/to_datacite_json.yml +214 -0
  32. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/author_from_schema_org/with_id.yml +930 -0
  33. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/author_to_schema_org/with_id.yml +930 -0
  34. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/author.yml +137 -860
  35. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/no_author.yml +137 -860
  36. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/single_author.yml +137 -860
  37. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/with_organization.yml +137 -860
  38. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/from_schema_org/with_id.yml +930 -0
  39. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_name_identifier/has_ORCID.yml +155 -0
  40. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_name_identifier/has_no_ORCID.yml +134 -0
  41. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_familyName.yml +155 -0
  42. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_name_in_display-order.yml +186 -0
  43. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_name_in_display-order_with_ORCID.yml +177 -0
  44. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_name_in_sort-order.yml +173 -0
  45. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/is_organization.yml +207 -0
  46. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_comma.yml +207 -0
  47. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_family_name.yml +207 -0
  48. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_id.yml +207 -0
  49. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_no_info.yml +207 -0
  50. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_type_organization.yml +207 -0
  51. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_type_person.yml +207 -0
  52. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/sanitize/should_only_keep_specific_tags.yml +930 -0
  53. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/sanitize/should_remove_a_tags.yml +930 -0
  54. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/to_schema_org/with_id.yml +930 -0
  55. data/spec/fixtures/vcr_cassettes/Bolognese_Datacite/insert_related_identifiers/related_identifier.yml +173 -0
  56. data/spec/fixtures/vcr_cassettes/Bolognese_DataciteJson/get_metadata_as_bibtex/BlogPosting.yml +155 -0
  57. data/spec/schema_org_spec.rb +17 -14
  58. data/spec/utils_spec.rb +32 -2
  59. metadata +54 -4
@@ -12,12 +12,36 @@ module Bolognese
12
12
  include Bolognese::DataciteUtils
13
13
  include Bolognese::Utils
14
14
 
15
+ DC_TO_SO_TRANSLATIONS = {
16
+ "Audiovisual" => "VideoObject",
17
+ "Collection" => "Collection",
18
+ "Dataset" => "Dataset",
19
+ "Event" => "Event",
20
+ "Image" => "ImageObject",
21
+ "InteractiveResource" => nil,
22
+ "Model" => nil,
23
+ "PhysicalObject" => nil,
24
+ "Service" => "Service",
25
+ "Software" => "SoftwareSourceCode",
26
+ "Sound" => "AudioObject",
27
+ "Text" => "ScholarlyArticle",
28
+ "Workflow" => nil,
29
+ "Other" => "CreativeWork"
30
+ }
31
+
15
32
  attr_reader :id, :raw, :provider, :schema_version, :license, :citation,
16
33
  :additional_type, :alternate_name, :url, :version, :keywords, :editor,
17
34
  :page_start, :page_end, :date_modified, :language, :spatial_coverage,
18
35
  :content_size, :funder, :journal, :bibtex_type, :date_created, :has_part,
19
- :publisher, :contributor, :same_as, :predecessor_of,
20
- :successor_of, :should_passthru, :datacite_errors
36
+ :publisher, :contributor, :same_as, :is_previous_version_of, :is_new_version_of,
37
+ :should_passthru, :datacite_errors, :date_accepted, :date_available,
38
+ :date_copyrighted, :date_collected, :date_submitted, :date_valid,
39
+ :is_cited_by, :cites, :is_supplement_to, :is_supplemented_by,
40
+ :is_continued_by, :continues, :has_metadata, :is_metadata_for,
41
+ :is_referenced_by, :references, :is_documented_by, :documents,
42
+ :is_compiled_by, :compiles, :is_variant_form_of, :is_original_form_of,
43
+ :is_reviewed_by, :reviews, :is_derived_from, :is_source_of, :format,
44
+ :related_identifier
21
45
 
22
46
  def publication_year
23
47
  date_published && date_published[0..3]
@@ -31,22 +55,19 @@ module Bolognese
31
55
  [page_start, page_end].compact.join("-").presence
32
56
  end
33
57
 
34
- def publisher_string
35
- publisher.to_h.fetch("name", nil)
36
- end
37
-
38
58
  def schema_org
39
- { "@context" => id.present? ? "http://schema.org" : nil,
59
+ hsh = {
60
+ "@context" => id.present? ? "http://schema.org" : nil,
40
61
  "@type" => type,
41
62
  "@id" => id,
42
63
  "url" => url,
43
64
  "additionalType" => additional_type,
44
- "name" => name,
65
+ "name" => title,
45
66
  "alternateName" => alternate_name,
46
- "author" => author,
67
+ "author" => to_schema_org(author),
47
68
  "editor" => editor,
48
- "description" => description,
49
- "license" => license,
69
+ "description" => description.present? ? description["text"] : nil,
70
+ "license" => license.present? ? license["id"] : nil,
50
71
  "version" => version,
51
72
  "keywords" => keywords,
52
73
  "language" => language,
@@ -60,32 +81,72 @@ module Bolognese
60
81
  "sameAs" => same_as,
61
82
  "isPartOf" => is_part_of,
62
83
  "hasPart" => has_part,
63
- "predecessor_of" => predecessor_of,
64
- "successor_of" => successor_of,
65
- "citation" => citation,
84
+ "predecessor_of" => is_previous_version_of,
85
+ "successor_of" => is_new_version_of,
86
+ "citation" => references,
66
87
  "schemaVersion" => schema_version,
67
- "publisher" => publisher,
88
+ "publisher" => { "@type" => "Organization", "name" => publisher },
68
89
  "funder" => funder,
90
+ "provider" => { "@type" => "Organization", "name" => provider }
91
+ }.compact
92
+ JSON.pretty_generate hsh
93
+ end
94
+
95
+ def datacite_json
96
+ hsh = {
97
+ "id" => id,
98
+ "doi" => doi,
99
+ "creator" => author,
100
+ "title" => title,
101
+ "publisher" => publisher,
102
+ "publication-year" => publication_year,
103
+ "resource-type-general" => resource_type_general,
104
+ "resource-type" => additional_type,
105
+ "subject" => keywords.present? ? keywords.split(", ") : nil,
106
+ "contributor" => contributor,
107
+ "date-accepted" => date_accepted,
108
+ "date-available" => date_available,
109
+ "date-copyrighted" => date_copyrighted,
110
+ "date-collected" => date_collected,
111
+ "date-created" => date_created,
112
+ "date-published" => date_published,
113
+ "date-modified" => date_modified,
114
+ "date-submitted" => date_submitted,
115
+ "date-valid" => date_valid,
116
+ "language" => language,
117
+ "alternate-identifier" => alternate_name,
118
+ "related_identifier" => related_identifier,
119
+ "size" => content_size,
120
+ "format" => format,
121
+ "version" => version,
122
+ "rights" => license,
123
+ "description" => description,
124
+ "geo-location" => spatial_coverage,
125
+ "funding-reference" => funder,
126
+ "schemaVersion" => schema_version,
69
127
  "provider" => provider
70
- }.compact.to_json
128
+ }.compact
129
+ JSON.pretty_generate hsh
71
130
  end
72
131
 
73
132
  def codemeta
74
- { "@context" => id.present? ? "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld" : nil,
133
+ hsh = {
134
+ "@context" => id.present? ? "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld" : nil,
75
135
  "@type" => type,
76
136
  "@id" => id,
77
137
  "identifier" => id,
78
138
  "codeRepository" => url,
79
- "title" => name,
139
+ "title" => title,
80
140
  "agents" => author,
81
- "description" => description,
141
+ "description" => description.present? ? description["text"] : nil,
82
142
  "version" => version,
83
143
  "tags" => keywords.to_s.split(", ").presence,
84
144
  "dateCreated" => date_created,
85
145
  "datePublished" => date_published,
86
146
  "dateModified" => date_modified,
87
147
  "publisher" => publisher
88
- }.compact.to_json
148
+ }.compact
149
+ JSON.pretty_generate hsh
89
150
  end
90
151
 
91
152
  def bibtex
@@ -97,10 +158,10 @@ module Bolognese
97
158
  author: authors_as_string(author),
98
159
  keywords: keywords,
99
160
  language: language,
100
- title: name,
161
+ title: title,
101
162
  journal: journal,
102
163
  pages: pagination,
103
- publisher: publisher_string,
164
+ publisher: publisher,
104
165
  year: publication_year
105
166
  }.compact
106
167
  BibTeX::Entry.new(bib).to_s
@@ -1,6 +1,15 @@
1
1
  module Bolognese
2
2
  class SchemaOrg < Metadata
3
3
 
4
+ SO_TO_DC_RELATION_TYPES = {
5
+ "citation" => "References",
6
+ "sameAs" => "IsIdenticalTo",
7
+ "isPartOf" => "IsPartOf",
8
+ "hasPart" => "HasPart",
9
+ "isPredecessor" => "IsPreviousVersionOf",
10
+ "isSuccessor" => "IsNewVersionOf"
11
+ }
12
+
4
13
  def initialize(id: nil, string: nil)
5
14
  id = normalize_id(id) if id.present?
6
15
 
@@ -53,7 +62,7 @@ module Bolognese
53
62
  Bolognese::Bibtex::SO_TO_BIB_TRANSLATIONS[type] || "misc"
54
63
  end
55
64
 
56
- def name
65
+ def title
57
66
  metadata.fetch("name", nil)
58
67
  end
59
68
 
@@ -62,21 +71,21 @@ module Bolognese
62
71
  end
63
72
 
64
73
  def author
65
- arr = Array.wrap(metadata.fetch("author", nil)).map { |a| a.except("name") }
66
- array_unwrap(arr)
74
+ authors = from_schema_org(Array.wrap(metadata.fetch("author", nil)))
75
+ get_authors(authors)
67
76
  end
68
77
 
69
78
  def editor
70
- arr = Array.wrap(metadata.fetch("editor", nil)).map { |a| a.except("name") }
71
- array_unwrap(arr)
79
+ editors = from_schema_org(Array.wrap(metadata.fetch("editor", nil)))
80
+ get_authors(editors)
72
81
  end
73
82
 
74
83
  def description
75
- metadata.fetch("description", nil)
84
+ { "text" => metadata.fetch("description", nil) }
76
85
  end
77
86
 
78
87
  def license
79
- metadata.fetch("license", nil)
88
+ { "id" => metadata.fetch("license", nil) }
80
89
  end
81
90
 
82
91
  def version
@@ -99,28 +108,45 @@ module Bolognese
99
108
  metadata.fetch("dateModified", nil)
100
109
  end
101
110
 
102
- def related_identifiers(relation_type)
103
- normalize_ids(metadata.fetch(relation_type, nil))
111
+ def related_identifier
112
+ Array.wrap(is_identical_to) +
113
+ Array.wrap(is_part_of) +
114
+ Array.wrap(has_part) +
115
+ Array.wrap(is_previous_version_of) +
116
+ Array.wrap(is_new_version_of) +
117
+ Array.wrap(references)
118
+ end
119
+
120
+ def get_related_identifier(relation_type: nil)
121
+ normalize_ids(metadata.fetch(relation_type, nil), SO_TO_DC_RELATION_TYPES[relation_type])
104
122
  end
105
123
 
106
- def same_as
107
- related_identifiers("isIdenticalTo")
124
+ def is_identical_to
125
+ get_related_identifier(relation_type: "sameAs")
108
126
  end
109
127
 
110
128
  def is_part_of
111
- related_identifiers("isPartOf")
129
+ get_related_identifier(relation_type: "isPartOf")
112
130
  end
113
131
 
114
132
  def has_part
115
- related_identifiers("hasPart")
133
+ get_related_identifier(relation_type: "hasPart")
134
+ end
135
+
136
+ def is_previous_version_of
137
+ get_related_identifier(relation_type: "isPredecessor")
138
+ end
139
+
140
+ def is_new_version_of
141
+ get_related_identifier(relation_type: "isSuccessor")
116
142
  end
117
143
 
118
- def citation
119
- related_identifiers("citation")
144
+ def references
145
+ get_related_identifier(relation_type: "citation")
120
146
  end
121
147
 
122
148
  def publisher
123
- metadata.fetch("publisher", nil)
149
+ metadata.dig("publisher", "name")
124
150
  end
125
151
 
126
152
  def container_title
@@ -1,5 +1,13 @@
1
1
  module Bolognese
2
2
  module Utils
3
+ LICENSE_NAMES = {
4
+ "http://creativecommons.org/publicdomain/zero/1.0/" => "Public Domain (CC0 1.0)",
5
+ "http://creativecommons.org/licenses/by/3.0/" => "Creative Commons Attribution 3.0 (CC-BY 3.0)",
6
+ "http://creativecommons.org/licenses/by/4.0/" => "Creative Commons Attribution 4.0 (CC-BY 4.0)",
7
+ "http://creativecommons.org/licenses/by-nc/4.0/" => "Creative Commons Attribution Noncommercial 4.0 (CC-BY-NC 4.0)",
8
+ "http://creativecommons.org/licenses/by-sa/4.0/" => "Creative Commons Attribution Share Alike 4.0 (CC-BY-SA 4.0)",
9
+ "http://creativecommons.org/licenses/by-nc-nd/4.0/" => "Creative Commons Attribution Noncommercial No Derivatives 4.0 (CC-BY-NC-ND 4.0)"
10
+ }
3
11
 
4
12
  def find_from_format(id: nil, string: nil, ext: nil, filename: nil)
5
13
  if id.present?
@@ -30,6 +38,8 @@ module Bolognese
30
38
  "crossref"
31
39
  elsif options[:ext] == ".xml" && Maremma.from_xml(string).dig("resource", "xmlns").start_with?("http://datacite.org/schema/kernel")
32
40
  "datacite"
41
+ elsif options[:ext] == ".json" && Maremma.from_json(string).dig("resource", "xmlns").to_s.start_with?("http://datacite.org/schema/kernel")
42
+ "datacite_json"
33
43
  elsif options[:filename] == "codemeta.json"
34
44
  "codemeta"
35
45
  end
@@ -41,6 +51,7 @@ module Bolognese
41
51
  when "crossref" then Crossref.new(id: id, string: string)
42
52
  when "datacite" then Datacite.new(id: id, string: string, regenerate: options[:regenerate])
43
53
  when "codemeta" then Codemeta.new(id: id, string: string)
54
+ when "datacite_json" then DataciteJson.new(string: string)
44
55
  when "bibtex" then Bibtex.new(string: string)
45
56
  else SchemaOrg.new(id: id)
46
57
  end
@@ -83,21 +94,12 @@ module Bolognese
83
94
  elsif element.is_a?(Hash)
84
95
  element.fetch(content, nil)
85
96
  elsif element.is_a?(Array)
86
- a = element.map { |e| e.fetch(content, nil) }.uniq
87
- array_unwrap(a)
97
+ a = element.map { |e| e.fetch(content, nil) }.uniq.unwrap
88
98
  else
89
99
  nil
90
100
  end
91
101
  end
92
102
 
93
- def array_unwrap(element)
94
- case element.length
95
- when 0 then nil
96
- when 1 then element.first
97
- else element
98
- end
99
- end
100
-
101
103
  def normalize_id(id)
102
104
  return nil unless id.present?
103
105
 
@@ -112,9 +114,73 @@ module Bolognese
112
114
  "http://orcid.org/" + Addressable::URI.encode(orcid)
113
115
  end
114
116
 
115
- def normalize_ids(list)
116
- arr = Array.wrap(list).map { |url| url.merge("@id" => normalize_id(url["@id"])) }
117
- array_unwrap(arr)
117
+ def normalize_ids(list, relation_type = "References")
118
+ Array.wrap(list).map do |url|
119
+ { "id" => normalize_id(url["@id"]),
120
+ "type" => url["@type"],
121
+ "name" => url["name"],
122
+ "relationType" => relation_type }.compact
123
+ end.unwrap
124
+ end
125
+
126
+ # find Creative Commons or OSI license in licenses array, normalize url and name
127
+ def normalize_licenses(licenses)
128
+ standard_licenses = Array.wrap(licenses).map { |l| URI.parse(l["url"]) }.select { |li| li.host && li.host[/(creativecommons.org|opensource.org)$/] }
129
+ return licenses unless standard_licenses.present?
130
+
131
+ # use HTTPS
132
+ uri.scheme = "https"
133
+
134
+ # use host name without subdomain
135
+ uri.host = Array(/(creativecommons.org|opensource.org)/.match uri.host).last
136
+
137
+ # normalize URLs
138
+ if uri.host == "creativecommons.org"
139
+ uri.path = uri.path.split('/')[0..-2].join("/") if uri.path.split('/').last == "legalcode"
140
+ uri.path << '/' unless uri.path.end_with?('/')
141
+ else
142
+ uri.path = uri.path.gsub(/(-license|\.php|\.html)/, '')
143
+ uri.path = uri.path.sub(/(mit|afl|apl|osl|gpl|ecl)/) { |match| match.upcase }
144
+ uri.path = uri.path.sub(/(artistic|apache)/) { |match| match.titleize }
145
+ uri.path = uri.path.sub(/([^0-9\-]+)(-)?([1-9])?(\.)?([0-9])?$/) do
146
+ m = Regexp.last_match
147
+ text = m[1]
148
+
149
+ if m[3].present?
150
+ version = [m[3], m[5].presence || "0"].join(".")
151
+ [text, version].join("-")
152
+ else
153
+ text
154
+ end
155
+ end
156
+ end
157
+
158
+ uri.to_s
159
+ rescue URI::InvalidURIError
160
+ nil
161
+ end
162
+
163
+ def to_schema_org(element)
164
+ Array.wrap(element).map do |a|
165
+ a["@type"] = a["type"]
166
+ a["@id"] = a["id"]
167
+ a.except("type", "id").compact
168
+ end.unwrap
169
+ end
170
+
171
+ def from_schema_org(element)
172
+ Array.wrap(element).map do |a|
173
+ a["type"] = a["@type"]
174
+ a["id"] = a["@id"]
175
+ a.except("@type", "@id").compact
176
+ end.unwrap
177
+ end
178
+
179
+ def sanitize(text, options={})
180
+ options[:tags] ||= Set.new(%w(strong em b i code pre sub sup br))
181
+ custom_scrubber = Bolognese::WhitelistScrubber.new(options)
182
+
183
+ Loofah.scrub_fragment(text, custom_scrubber).to_s.gsub(/\u00a0/, ' ').strip
118
184
  end
119
185
 
120
186
  def github_from_url(url)
@@ -1,3 +1,3 @@
1
1
  module Bolognese
2
- VERSION = "0.7.2"
2
+ VERSION = "0.8"
3
3
  end
@@ -0,0 +1,45 @@
1
+ # modified from https://gist.github.com/ivan-kolmychek/ee2fdc53f3e2c637271d
2
+
3
+ module Bolognese
4
+ class WhitelistScrubber < Loofah::Scrubber
5
+ def initialize(options={})
6
+ @direction = :bottom_up
7
+ @tags = options[:tags]
8
+ @attributes = options[:attributes]
9
+ end
10
+
11
+ def scrub(node)
12
+ scrub_node_attributes(node) and return CONTINUE if node_allowed?(node)
13
+ node.before node.children
14
+ node.remove
15
+ end
16
+
17
+ private
18
+
19
+ def scrub_node_attributes(node)
20
+ fallback_scrub_node_attributes(node) and return true unless @attributes.present? && @attributes.respond_to?(:include?)
21
+ node.attribute_nodes.each do |attr_node|
22
+ attr_node.remove unless @attributes.include?(attr_node.name)
23
+ end
24
+ end
25
+
26
+ def allowed_not_element_node_types
27
+ [ Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE ]
28
+ end
29
+
30
+ def fallback_scrub_node_attributes(node)
31
+ Loofah::HTML5::Scrub.scrub_attributes(node)
32
+ end
33
+
34
+ def fallback_allowed_element_detection(node)
35
+ Loofah::HTML5::Scrub.allowed_element?(node.name)
36
+ end
37
+
38
+ def node_allowed?(node)
39
+ return fallback_allowed_element_detection(node) unless @tags.present? && @tags.respond_to?(:include?)
40
+ return true if allowed_not_element_node_types.include?(node.type)
41
+ return false unless node.type == Nokogiri::XML::Node::ELEMENT_NODE
42
+ @tags.include? node.name
43
+ end
44
+ end
45
+ end