bolognese 0.7.2 → 0.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -1
  3. data/README.md +25 -16
  4. data/bolognese.gemspec +2 -1
  5. data/codemeta.json +39 -0
  6. data/lib/bolognese.rb +4 -0
  7. data/lib/bolognese/array.rb +11 -0
  8. data/lib/bolognese/author_utils.rb +35 -21
  9. data/lib/bolognese/bibtex.rb +4 -4
  10. data/lib/bolognese/codemeta.rb +8 -13
  11. data/lib/bolognese/crossref.rb +22 -20
  12. data/lib/bolognese/datacite.rb +61 -61
  13. data/lib/bolognese/datacite_json.rb +208 -0
  14. data/lib/bolognese/datacite_utils.rb +17 -48
  15. data/lib/bolognese/metadata.rb +83 -22
  16. data/lib/bolognese/schema_org.rb +42 -16
  17. data/lib/bolognese/utils.rb +79 -13
  18. data/lib/bolognese/version.rb +1 -1
  19. data/lib/bolognese/whitelist_scrubber.rb +45 -0
  20. data/spec/array_spec.rb +20 -0
  21. data/spec/author_utils_spec.rb +93 -9
  22. data/spec/bibtex_spec.rb +4 -4
  23. data/spec/cli_spec.rb +5 -0
  24. data/spec/codemeta_spec.rb +41 -31
  25. data/spec/crossref_spec.rb +47 -72
  26. data/spec/datacite_json_spec.rb +65 -0
  27. data/spec/datacite_spec.rb +67 -83
  28. data/spec/datacite_utils_spec.rb +9 -14
  29. data/spec/fixtures/datacite.json +49 -0
  30. data/spec/fixtures/datacite_software.json +18 -0
  31. data/spec/fixtures/vcr_cassettes/Bolognese_CLI/convert_from_id/datacite/to_datacite_json.yml +214 -0
  32. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/author_from_schema_org/with_id.yml +930 -0
  33. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/author_to_schema_org/with_id.yml +930 -0
  34. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/author.yml +137 -860
  35. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/no_author.yml +137 -860
  36. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/single_author.yml +137 -860
  37. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/with_organization.yml +137 -860
  38. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/from_schema_org/with_id.yml +930 -0
  39. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_name_identifier/has_ORCID.yml +155 -0
  40. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_name_identifier/has_no_ORCID.yml +134 -0
  41. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_familyName.yml +155 -0
  42. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_name_in_display-order.yml +186 -0
  43. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_name_in_display-order_with_ORCID.yml +177 -0
  44. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_name_in_sort-order.yml +173 -0
  45. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/is_organization.yml +207 -0
  46. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_comma.yml +207 -0
  47. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_family_name.yml +207 -0
  48. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_id.yml +207 -0
  49. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_no_info.yml +207 -0
  50. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_type_organization.yml +207 -0
  51. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_type_person.yml +207 -0
  52. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/sanitize/should_only_keep_specific_tags.yml +930 -0
  53. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/sanitize/should_remove_a_tags.yml +930 -0
  54. data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/to_schema_org/with_id.yml +930 -0
  55. data/spec/fixtures/vcr_cassettes/Bolognese_Datacite/insert_related_identifiers/related_identifier.yml +173 -0
  56. data/spec/fixtures/vcr_cassettes/Bolognese_DataciteJson/get_metadata_as_bibtex/BlogPosting.yml +155 -0
  57. data/spec/schema_org_spec.rb +17 -14
  58. data/spec/utils_spec.rb +32 -2
  59. metadata +54 -4
@@ -12,12 +12,36 @@ module Bolognese
12
12
  include Bolognese::DataciteUtils
13
13
  include Bolognese::Utils
14
14
 
15
+ DC_TO_SO_TRANSLATIONS = {
16
+ "Audiovisual" => "VideoObject",
17
+ "Collection" => "Collection",
18
+ "Dataset" => "Dataset",
19
+ "Event" => "Event",
20
+ "Image" => "ImageObject",
21
+ "InteractiveResource" => nil,
22
+ "Model" => nil,
23
+ "PhysicalObject" => nil,
24
+ "Service" => "Service",
25
+ "Software" => "SoftwareSourceCode",
26
+ "Sound" => "AudioObject",
27
+ "Text" => "ScholarlyArticle",
28
+ "Workflow" => nil,
29
+ "Other" => "CreativeWork"
30
+ }
31
+
15
32
  attr_reader :id, :raw, :provider, :schema_version, :license, :citation,
16
33
  :additional_type, :alternate_name, :url, :version, :keywords, :editor,
17
34
  :page_start, :page_end, :date_modified, :language, :spatial_coverage,
18
35
  :content_size, :funder, :journal, :bibtex_type, :date_created, :has_part,
19
- :publisher, :contributor, :same_as, :predecessor_of,
20
- :successor_of, :should_passthru, :datacite_errors
36
+ :publisher, :contributor, :same_as, :is_previous_version_of, :is_new_version_of,
37
+ :should_passthru, :datacite_errors, :date_accepted, :date_available,
38
+ :date_copyrighted, :date_collected, :date_submitted, :date_valid,
39
+ :is_cited_by, :cites, :is_supplement_to, :is_supplemented_by,
40
+ :is_continued_by, :continues, :has_metadata, :is_metadata_for,
41
+ :is_referenced_by, :references, :is_documented_by, :documents,
42
+ :is_compiled_by, :compiles, :is_variant_form_of, :is_original_form_of,
43
+ :is_reviewed_by, :reviews, :is_derived_from, :is_source_of, :format,
44
+ :related_identifier
21
45
 
22
46
  def publication_year
23
47
  date_published && date_published[0..3]
@@ -31,22 +55,19 @@ module Bolognese
31
55
  [page_start, page_end].compact.join("-").presence
32
56
  end
33
57
 
34
- def publisher_string
35
- publisher.to_h.fetch("name", nil)
36
- end
37
-
38
58
  def schema_org
39
- { "@context" => id.present? ? "http://schema.org" : nil,
59
+ hsh = {
60
+ "@context" => id.present? ? "http://schema.org" : nil,
40
61
  "@type" => type,
41
62
  "@id" => id,
42
63
  "url" => url,
43
64
  "additionalType" => additional_type,
44
- "name" => name,
65
+ "name" => title,
45
66
  "alternateName" => alternate_name,
46
- "author" => author,
67
+ "author" => to_schema_org(author),
47
68
  "editor" => editor,
48
- "description" => description,
49
- "license" => license,
69
+ "description" => description.present? ? description["text"] : nil,
70
+ "license" => license.present? ? license["id"] : nil,
50
71
  "version" => version,
51
72
  "keywords" => keywords,
52
73
  "language" => language,
@@ -60,32 +81,72 @@ module Bolognese
60
81
  "sameAs" => same_as,
61
82
  "isPartOf" => is_part_of,
62
83
  "hasPart" => has_part,
63
- "predecessor_of" => predecessor_of,
64
- "successor_of" => successor_of,
65
- "citation" => citation,
84
+ "predecessor_of" => is_previous_version_of,
85
+ "successor_of" => is_new_version_of,
86
+ "citation" => references,
66
87
  "schemaVersion" => schema_version,
67
- "publisher" => publisher,
88
+ "publisher" => { "@type" => "Organization", "name" => publisher },
68
89
  "funder" => funder,
90
+ "provider" => { "@type" => "Organization", "name" => provider }
91
+ }.compact
92
+ JSON.pretty_generate hsh
93
+ end
94
+
95
+ def datacite_json
96
+ hsh = {
97
+ "id" => id,
98
+ "doi" => doi,
99
+ "creator" => author,
100
+ "title" => title,
101
+ "publisher" => publisher,
102
+ "publication-year" => publication_year,
103
+ "resource-type-general" => resource_type_general,
104
+ "resource-type" => additional_type,
105
+ "subject" => keywords.present? ? keywords.split(", ") : nil,
106
+ "contributor" => contributor,
107
+ "date-accepted" => date_accepted,
108
+ "date-available" => date_available,
109
+ "date-copyrighted" => date_copyrighted,
110
+ "date-collected" => date_collected,
111
+ "date-created" => date_created,
112
+ "date-published" => date_published,
113
+ "date-modified" => date_modified,
114
+ "date-submitted" => date_submitted,
115
+ "date-valid" => date_valid,
116
+ "language" => language,
117
+ "alternate-identifier" => alternate_name,
118
+ "related_identifier" => related_identifier,
119
+ "size" => content_size,
120
+ "format" => format,
121
+ "version" => version,
122
+ "rights" => license,
123
+ "description" => description,
124
+ "geo-location" => spatial_coverage,
125
+ "funding-reference" => funder,
126
+ "schemaVersion" => schema_version,
69
127
  "provider" => provider
70
- }.compact.to_json
128
+ }.compact
129
+ JSON.pretty_generate hsh
71
130
  end
72
131
 
73
132
  def codemeta
74
- { "@context" => id.present? ? "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld" : nil,
133
+ hsh = {
134
+ "@context" => id.present? ? "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld" : nil,
75
135
  "@type" => type,
76
136
  "@id" => id,
77
137
  "identifier" => id,
78
138
  "codeRepository" => url,
79
- "title" => name,
139
+ "title" => title,
80
140
  "agents" => author,
81
- "description" => description,
141
+ "description" => description.present? ? description["text"] : nil,
82
142
  "version" => version,
83
143
  "tags" => keywords.to_s.split(", ").presence,
84
144
  "dateCreated" => date_created,
85
145
  "datePublished" => date_published,
86
146
  "dateModified" => date_modified,
87
147
  "publisher" => publisher
88
- }.compact.to_json
148
+ }.compact
149
+ JSON.pretty_generate hsh
89
150
  end
90
151
 
91
152
  def bibtex
@@ -97,10 +158,10 @@ module Bolognese
97
158
  author: authors_as_string(author),
98
159
  keywords: keywords,
99
160
  language: language,
100
- title: name,
161
+ title: title,
101
162
  journal: journal,
102
163
  pages: pagination,
103
- publisher: publisher_string,
164
+ publisher: publisher,
104
165
  year: publication_year
105
166
  }.compact
106
167
  BibTeX::Entry.new(bib).to_s
@@ -1,6 +1,15 @@
1
1
  module Bolognese
2
2
  class SchemaOrg < Metadata
3
3
 
4
+ SO_TO_DC_RELATION_TYPES = {
5
+ "citation" => "References",
6
+ "sameAs" => "IsIdenticalTo",
7
+ "isPartOf" => "IsPartOf",
8
+ "hasPart" => "HasPart",
9
+ "isPredecessor" => "IsPreviousVersionOf",
10
+ "isSuccessor" => "IsNewVersionOf"
11
+ }
12
+
4
13
  def initialize(id: nil, string: nil)
5
14
  id = normalize_id(id) if id.present?
6
15
 
@@ -53,7 +62,7 @@ module Bolognese
53
62
  Bolognese::Bibtex::SO_TO_BIB_TRANSLATIONS[type] || "misc"
54
63
  end
55
64
 
56
- def name
65
+ def title
57
66
  metadata.fetch("name", nil)
58
67
  end
59
68
 
@@ -62,21 +71,21 @@ module Bolognese
62
71
  end
63
72
 
64
73
  def author
65
- arr = Array.wrap(metadata.fetch("author", nil)).map { |a| a.except("name") }
66
- array_unwrap(arr)
74
+ authors = from_schema_org(Array.wrap(metadata.fetch("author", nil)))
75
+ get_authors(authors)
67
76
  end
68
77
 
69
78
  def editor
70
- arr = Array.wrap(metadata.fetch("editor", nil)).map { |a| a.except("name") }
71
- array_unwrap(arr)
79
+ editors = from_schema_org(Array.wrap(metadata.fetch("editor", nil)))
80
+ get_authors(editors)
72
81
  end
73
82
 
74
83
  def description
75
- metadata.fetch("description", nil)
84
+ { "text" => metadata.fetch("description", nil) }
76
85
  end
77
86
 
78
87
  def license
79
- metadata.fetch("license", nil)
88
+ { "id" => metadata.fetch("license", nil) }
80
89
  end
81
90
 
82
91
  def version
@@ -99,28 +108,45 @@ module Bolognese
99
108
  metadata.fetch("dateModified", nil)
100
109
  end
101
110
 
102
- def related_identifiers(relation_type)
103
- normalize_ids(metadata.fetch(relation_type, nil))
111
+ def related_identifier
112
+ Array.wrap(is_identical_to) +
113
+ Array.wrap(is_part_of) +
114
+ Array.wrap(has_part) +
115
+ Array.wrap(is_previous_version_of) +
116
+ Array.wrap(is_new_version_of) +
117
+ Array.wrap(references)
118
+ end
119
+
120
+ def get_related_identifier(relation_type: nil)
121
+ normalize_ids(metadata.fetch(relation_type, nil), SO_TO_DC_RELATION_TYPES[relation_type])
104
122
  end
105
123
 
106
- def same_as
107
- related_identifiers("isIdenticalTo")
124
+ def is_identical_to
125
+ get_related_identifier(relation_type: "sameAs")
108
126
  end
109
127
 
110
128
  def is_part_of
111
- related_identifiers("isPartOf")
129
+ get_related_identifier(relation_type: "isPartOf")
112
130
  end
113
131
 
114
132
  def has_part
115
- related_identifiers("hasPart")
133
+ get_related_identifier(relation_type: "hasPart")
134
+ end
135
+
136
+ def is_previous_version_of
137
+ get_related_identifier(relation_type: "isPredecessor")
138
+ end
139
+
140
+ def is_new_version_of
141
+ get_related_identifier(relation_type: "isSuccessor")
116
142
  end
117
143
 
118
- def citation
119
- related_identifiers("citation")
144
+ def references
145
+ get_related_identifier(relation_type: "citation")
120
146
  end
121
147
 
122
148
  def publisher
123
- metadata.fetch("publisher", nil)
149
+ metadata.dig("publisher", "name")
124
150
  end
125
151
 
126
152
  def container_title
@@ -1,5 +1,13 @@
1
1
  module Bolognese
2
2
  module Utils
3
+ LICENSE_NAMES = {
4
+ "http://creativecommons.org/publicdomain/zero/1.0/" => "Public Domain (CC0 1.0)",
5
+ "http://creativecommons.org/licenses/by/3.0/" => "Creative Commons Attribution 3.0 (CC-BY 3.0)",
6
+ "http://creativecommons.org/licenses/by/4.0/" => "Creative Commons Attribution 4.0 (CC-BY 4.0)",
7
+ "http://creativecommons.org/licenses/by-nc/4.0/" => "Creative Commons Attribution Noncommercial 4.0 (CC-BY-NC 4.0)",
8
+ "http://creativecommons.org/licenses/by-sa/4.0/" => "Creative Commons Attribution Share Alike 4.0 (CC-BY-SA 4.0)",
9
+ "http://creativecommons.org/licenses/by-nc-nd/4.0/" => "Creative Commons Attribution Noncommercial No Derivatives 4.0 (CC-BY-NC-ND 4.0)"
10
+ }
3
11
 
4
12
  def find_from_format(id: nil, string: nil, ext: nil, filename: nil)
5
13
  if id.present?
@@ -30,6 +38,8 @@ module Bolognese
30
38
  "crossref"
31
39
  elsif options[:ext] == ".xml" && Maremma.from_xml(string).dig("resource", "xmlns").start_with?("http://datacite.org/schema/kernel")
32
40
  "datacite"
41
+ elsif options[:ext] == ".json" && Maremma.from_json(string).dig("resource", "xmlns").to_s.start_with?("http://datacite.org/schema/kernel")
42
+ "datacite_json"
33
43
  elsif options[:filename] == "codemeta.json"
34
44
  "codemeta"
35
45
  end
@@ -41,6 +51,7 @@ module Bolognese
41
51
  when "crossref" then Crossref.new(id: id, string: string)
42
52
  when "datacite" then Datacite.new(id: id, string: string, regenerate: options[:regenerate])
43
53
  when "codemeta" then Codemeta.new(id: id, string: string)
54
+ when "datacite_json" then DataciteJson.new(string: string)
44
55
  when "bibtex" then Bibtex.new(string: string)
45
56
  else SchemaOrg.new(id: id)
46
57
  end
@@ -83,21 +94,12 @@ module Bolognese
83
94
  elsif element.is_a?(Hash)
84
95
  element.fetch(content, nil)
85
96
  elsif element.is_a?(Array)
86
- a = element.map { |e| e.fetch(content, nil) }.uniq
87
- array_unwrap(a)
97
+ a = element.map { |e| e.fetch(content, nil) }.uniq.unwrap
88
98
  else
89
99
  nil
90
100
  end
91
101
  end
92
102
 
93
- def array_unwrap(element)
94
- case element.length
95
- when 0 then nil
96
- when 1 then element.first
97
- else element
98
- end
99
- end
100
-
101
103
  def normalize_id(id)
102
104
  return nil unless id.present?
103
105
 
@@ -112,9 +114,73 @@ module Bolognese
112
114
  "http://orcid.org/" + Addressable::URI.encode(orcid)
113
115
  end
114
116
 
115
- def normalize_ids(list)
116
- arr = Array.wrap(list).map { |url| url.merge("@id" => normalize_id(url["@id"])) }
117
- array_unwrap(arr)
117
+ def normalize_ids(list, relation_type = "References")
118
+ Array.wrap(list).map do |url|
119
+ { "id" => normalize_id(url["@id"]),
120
+ "type" => url["@type"],
121
+ "name" => url["name"],
122
+ "relationType" => relation_type }.compact
123
+ end.unwrap
124
+ end
125
+
126
+ # find Creative Commons or OSI license in licenses array, normalize url and name
127
+ def normalize_licenses(licenses)
128
+ standard_licenses = Array.wrap(licenses).map { |l| URI.parse(l["url"]) }.select { |li| li.host && li.host[/(creativecommons.org|opensource.org)$/] }
129
+ return licenses unless standard_licenses.present?
130
+
131
+ # use HTTPS
132
+ uri.scheme = "https"
133
+
134
+ # use host name without subdomain
135
+ uri.host = Array(/(creativecommons.org|opensource.org)/.match uri.host).last
136
+
137
+ # normalize URLs
138
+ if uri.host == "creativecommons.org"
139
+ uri.path = uri.path.split('/')[0..-2].join("/") if uri.path.split('/').last == "legalcode"
140
+ uri.path << '/' unless uri.path.end_with?('/')
141
+ else
142
+ uri.path = uri.path.gsub(/(-license|\.php|\.html)/, '')
143
+ uri.path = uri.path.sub(/(mit|afl|apl|osl|gpl|ecl)/) { |match| match.upcase }
144
+ uri.path = uri.path.sub(/(artistic|apache)/) { |match| match.titleize }
145
+ uri.path = uri.path.sub(/([^0-9\-]+)(-)?([1-9])?(\.)?([0-9])?$/) do
146
+ m = Regexp.last_match
147
+ text = m[1]
148
+
149
+ if m[3].present?
150
+ version = [m[3], m[5].presence || "0"].join(".")
151
+ [text, version].join("-")
152
+ else
153
+ text
154
+ end
155
+ end
156
+ end
157
+
158
+ uri.to_s
159
+ rescue URI::InvalidURIError
160
+ nil
161
+ end
162
+
163
+ def to_schema_org(element)
164
+ Array.wrap(element).map do |a|
165
+ a["@type"] = a["type"]
166
+ a["@id"] = a["id"]
167
+ a.except("type", "id").compact
168
+ end.unwrap
169
+ end
170
+
171
+ def from_schema_org(element)
172
+ Array.wrap(element).map do |a|
173
+ a["type"] = a["@type"]
174
+ a["id"] = a["@id"]
175
+ a.except("@type", "@id").compact
176
+ end.unwrap
177
+ end
178
+
179
+ def sanitize(text, options={})
180
+ options[:tags] ||= Set.new(%w(strong em b i code pre sub sup br))
181
+ custom_scrubber = Bolognese::WhitelistScrubber.new(options)
182
+
183
+ Loofah.scrub_fragment(text, custom_scrubber).to_s.gsub(/\u00a0/, ' ').strip
118
184
  end
119
185
 
120
186
  def github_from_url(url)
@@ -1,3 +1,3 @@
1
1
  module Bolognese
2
- VERSION = "0.7.2"
2
+ VERSION = "0.8"
3
3
  end
@@ -0,0 +1,45 @@
1
+ # modified from https://gist.github.com/ivan-kolmychek/ee2fdc53f3e2c637271d
2
+
3
+ module Bolognese
4
+ class WhitelistScrubber < Loofah::Scrubber
5
+ def initialize(options={})
6
+ @direction = :bottom_up
7
+ @tags = options[:tags]
8
+ @attributes = options[:attributes]
9
+ end
10
+
11
+ def scrub(node)
12
+ scrub_node_attributes(node) and return CONTINUE if node_allowed?(node)
13
+ node.before node.children
14
+ node.remove
15
+ end
16
+
17
+ private
18
+
19
+ def scrub_node_attributes(node)
20
+ fallback_scrub_node_attributes(node) and return true unless @attributes.present? && @attributes.respond_to?(:include?)
21
+ node.attribute_nodes.each do |attr_node|
22
+ attr_node.remove unless @attributes.include?(attr_node.name)
23
+ end
24
+ end
25
+
26
+ def allowed_not_element_node_types
27
+ [ Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE ]
28
+ end
29
+
30
+ def fallback_scrub_node_attributes(node)
31
+ Loofah::HTML5::Scrub.scrub_attributes(node)
32
+ end
33
+
34
+ def fallback_allowed_element_detection(node)
35
+ Loofah::HTML5::Scrub.allowed_element?(node.name)
36
+ end
37
+
38
+ def node_allowed?(node)
39
+ return fallback_allowed_element_detection(node) unless @tags.present? && @tags.respond_to?(:include?)
40
+ return true if allowed_not_element_node_types.include?(node.type)
41
+ return false unless node.type == Nokogiri::XML::Node::ELEMENT_NODE
42
+ @tags.include? node.name
43
+ end
44
+ end
45
+ end