briard 2.4.1 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/codeql-analysis.yml +72 -0
  3. data/.github/workflows/rubocop.yml +50 -0
  4. data/.rubocop.yml +144 -620
  5. data/.rubocop_todo.yml +76 -0
  6. data/CHANGELOG.md +22 -0
  7. data/Gemfile +2 -0
  8. data/Gemfile.lock +43 -6
  9. data/Rakefile +1 -1
  10. data/{bolognese.gemspec → briard.gemspec} +46 -38
  11. data/lib/briard/array.rb +2 -2
  12. data/lib/briard/author_utils.rb +79 -71
  13. data/lib/briard/cli.rb +12 -13
  14. data/lib/briard/crossref_utils.rb +73 -61
  15. data/lib/briard/datacite_utils.rb +132 -106
  16. data/lib/briard/doi_utils.rb +10 -10
  17. data/lib/briard/metadata.rb +96 -106
  18. data/lib/briard/metadata_utils.rb +87 -78
  19. data/lib/briard/readers/bibtex_reader.rb +65 -65
  20. data/lib/briard/readers/cff_reader.rb +88 -70
  21. data/lib/briard/readers/citeproc_reader.rb +90 -84
  22. data/lib/briard/readers/codemeta_reader.rb +68 -50
  23. data/lib/briard/readers/crosscite_reader.rb +2 -2
  24. data/lib/briard/readers/crossref_reader.rb +249 -210
  25. data/lib/briard/readers/datacite_json_reader.rb +3 -3
  26. data/lib/briard/readers/datacite_reader.rb +225 -189
  27. data/lib/briard/readers/npm_reader.rb +49 -42
  28. data/lib/briard/readers/ris_reader.rb +82 -80
  29. data/lib/briard/readers/schema_org_reader.rb +182 -159
  30. data/lib/briard/string.rb +1 -1
  31. data/lib/briard/utils.rb +4 -4
  32. data/lib/briard/version.rb +3 -1
  33. data/lib/briard/whitelist_scrubber.rb +11 -4
  34. data/lib/briard/writers/bibtex_writer.rb +14 -8
  35. data/lib/briard/writers/cff_writer.rb +33 -26
  36. data/lib/briard/writers/codemeta_writer.rb +19 -15
  37. data/lib/briard/writers/csv_writer.rb +6 -4
  38. data/lib/briard/writers/datacite_json_writer.rb +8 -2
  39. data/lib/briard/writers/jats_writer.rb +33 -28
  40. data/lib/briard/writers/rdf_xml_writer.rb +1 -1
  41. data/lib/briard/writers/ris_writer.rb +30 -18
  42. data/lib/briard/writers/turtle_writer.rb +1 -1
  43. data/lib/briard.rb +6 -6
  44. data/rubocop.sarif +0 -0
  45. data/spec/array_spec.rb +5 -5
  46. data/spec/author_utils_spec.rb +151 -132
  47. data/spec/datacite_utils_spec.rb +135 -83
  48. data/spec/doi_utils_spec.rb +168 -164
  49. data/spec/find_from_format_spec.rb +69 -69
  50. data/spec/fixtures/vcr_cassettes/Briard_Metadata/sanitize/onlies_keep_specific_tags.yml +65 -0
  51. data/spec/fixtures/vcr_cassettes/Briard_Metadata/sanitize/removes_a_tags.yml +65 -0
  52. data/spec/metadata_spec.rb +91 -90
  53. data/spec/readers/bibtex_reader_spec.rb +43 -38
  54. data/spec/readers/cff_reader_spec.rb +165 -153
  55. data/spec/readers/citeproc_reader_spec.rb +45 -40
  56. data/spec/readers/codemeta_reader_spec.rb +128 -115
  57. data/spec/readers/crosscite_reader_spec.rb +34 -24
  58. data/spec/readers/crossref_reader_spec.rb +1098 -939
  59. data/spec/readers/datacite_json_reader_spec.rb +53 -40
  60. data/spec/readers/datacite_reader_spec.rb +1541 -1337
  61. data/spec/readers/npm_reader_spec.rb +48 -43
  62. data/spec/readers/ris_reader_spec.rb +53 -47
  63. data/spec/readers/schema_org_reader_spec.rb +329 -267
  64. data/spec/spec_helper.rb +6 -5
  65. data/spec/utils_spec.rb +371 -347
  66. data/spec/writers/bibtex_writer_spec.rb +143 -143
  67. data/spec/writers/cff_writer_spec.rb +96 -90
  68. data/spec/writers/citation_writer_spec.rb +34 -33
  69. data/spec/writers/citeproc_writer_spec.rb +226 -224
  70. data/spec/writers/codemeta_writer_spec.rb +18 -16
  71. data/spec/writers/crosscite_writer_spec.rb +91 -73
  72. data/spec/writers/crossref_writer_spec.rb +99 -91
  73. data/spec/writers/csv_writer_spec.rb +70 -70
  74. data/spec/writers/datacite_json_writer_spec.rb +78 -68
  75. data/spec/writers/datacite_writer_spec.rb +417 -322
  76. data/spec/writers/jats_writer_spec.rb +177 -161
  77. data/spec/writers/rdf_xml_writer_spec.rb +68 -63
  78. data/spec/writers/ris_writer_spec.rb +162 -162
  79. data/spec/writers/turtle_writer_spec.rb +47 -47
  80. metadata +250 -160
  81. data/.github/workflows/release.yml +0 -47
data/lib/briard/readers/schema_org_reader.rb CHANGED
@@ -4,281 +4,304 @@ module Briard
  module Readers
  module SchemaOrgReader
  SO_TO_DC_RELATION_TYPES = {
- "citation" => "References",
- "isBasedOn" => "IsSupplementedBy",
- "sameAs" => "IsIdenticalTo",
- "isPartOf" => "IsPartOf",
- "hasPart" => "HasPart",
- "isPredecessor" => "IsPreviousVersionOf",
- "isSuccessor" => "IsNewVersionOf"
- }
+ 'citation' => 'References',
+ 'isBasedOn' => 'IsSupplementedBy',
+ 'sameAs' => 'IsIdenticalTo',
+ 'isPartOf' => 'IsPartOf',
+ 'hasPart' => 'HasPart',
+ 'isPredecessor' => 'IsPreviousVersionOf',
+ 'isSuccessor' => 'IsNewVersionOf'
+ }.freeze

  SO_TO_DC_REVERSE_RELATION_TYPES = {
- "citation" => "IsReferencedBy",
- "isBasedOn" => "IsSupplementTo",
- "sameAs" => "IsIdenticalTo",
- "isPartOf" => "HasPart",
- "hasPart" => "IsPartOf",
- "isPredecessor" => "IsNewVersionOf",
- "isSuccessor" => "IsPreviousVersionOf"
- }
-
- def get_schema_org(id: nil, **options)
- return { "string" => nil, "state" => "not_found" } unless id.present?
+ 'citation' => 'IsReferencedBy',
+ 'isBasedOn' => 'IsSupplementTo',
+ 'sameAs' => 'IsIdenticalTo',
+ 'isPartOf' => 'HasPart',
+ 'hasPart' => 'IsPartOf',
+ 'isPredecessor' => 'IsNewVersionOf',
+ 'isSuccessor' => 'IsPreviousVersionOf'
+ }.freeze
+
+ def get_schema_org(id: nil, **_options)
+ return { 'string' => nil, 'state' => 'not_found' } unless id.present?

  url = normalize_id(id)
  response = Maremma.get(url, raw: true)

  # some responses are returned as a hash
- if response.body["data"].is_a?(Hash)
- string = response.body.dig("data", "html", "head", "script", 1, "__content__")
+ if response.body['data'].is_a?(Hash)
+ string = response.body.dig('data', 'html', 'head', 'script', 1, '__content__')
  else
- doc = Nokogiri::XML(response.body.fetch("data", nil), nil, 'UTF-8')
-
+ doc = Nokogiri::XML(response.body.fetch('data', nil), nil, 'UTF-8')
+
  # workaround for xhtml documents
  nodeset = doc.at("script[type='application/ld+json']")
- hsh = JSON.parse(nodeset || "{}")
-
+ hsh = JSON.parse(nodeset || '{}')
+
  # workaround for doi as canonical_url but not included with schema.org
  link = doc.css("link[rel='canonical']")
- hsh.merge!({ "@id" => link[0]["href"] }) if link.present?
+ hsh['@id'] = link[0]['href'] if link.present?

  # workaround if license included but not with schema.org
  license = doc.at("meta[name='DCTERMS.license']")
- hsh.merge!({ "license" => license["content"] }) if license.present?
-
+ hsh['license'] = license['content'] if license.present?
+
  # workaround for html language attribute if no language is set via schema.org
  lang = doc.at('html')['lang']
- hsh.merge!({ "inLanguage" => lang }) if hsh["inLanguage"].blank?
+ hsh['inLanguage'] = lang if hsh['inLanguage'].blank?

  # workaround if issn not included with schema.org
  name = doc.at("meta[property='og:site_name']")
  issn = doc.at("meta[name='citation_issn']")
- hsh.merge!({ "isPartOf" => { "name" => name ? name["content"] : nil, "issn" => issn ? issn["content"] : nil }.compact })
+ hsh['isPartOf'] = { 'name' => name ? name['content'] : nil,
+ 'issn' => issn ? issn['content'] : nil }.compact

  string = hsh.to_json if hsh.present?
  end

- { "string" => string }
+ { 'string' => string }
  end

  def read_schema_org(string: nil, **options)
  if string.present?
  errors = jsonlint(string)
- return { "errors" => errors } if errors.present?
+ return { 'errors' => errors } if errors.present?
  end

- read_options = ActiveSupport::HashWithIndifferentAccess.new(options.except(:doi, :id, :url, :sandbox, :validate, :ra))
+ read_options = ActiveSupport::HashWithIndifferentAccess.new(options.except(:doi, :id, :url,
+ :sandbox, :validate, :ra))

  meta = string.present? ? Maremma.from_json(string) : {}

- identifiers = Array.wrap(meta.fetch("identifier", nil)).map do |r|
+ identifiers = Array.wrap(meta.fetch('identifier', nil)).map do |r|
  r = normalize_id(r) if r.is_a?(String)
- if r.is_a?(String) && !r.start_with?("https://doi.org")
- { "identifierType" => "URL", "identifier" => r }
+ if r.is_a?(String) && URI(r).host != 'doi.org'
+ { 'identifierType' => 'URL', 'identifier' => r }
  elsif r.is_a?(Hash)
- { "identifierType" => get_identifier_type(r["propertyID"]), "identifier" => r["value"] }
+ { 'identifierType' => get_identifier_type(r['propertyID']), 'identifier' => r['value'] }
  end
  end.compact.uniq

  id = options[:doi]
- id = meta.fetch("@id", nil) if id.blank? && meta.fetch("@id", nil).to_s.start_with?("https://doi.org")
- id = meta.fetch("identifier", nil) if id.blank? # && meta.fetch("identifier", nil).to_s.start_with?("https://doi.org")#&& meta.fetch("@", nil).start_with?("https://doi.org")
+ id = meta.fetch('@id', nil) if id.blank? && URI(meta.fetch('@id', '')).host == 'doi.org'
+ id = meta.fetch('identifier', nil) if id.blank?
  id = normalize_id(id)

- schema_org = meta.fetch("@type", nil) && meta.fetch("@type").camelcase
+ schema_org = meta.fetch('@type', nil) && meta.fetch('@type').camelcase
  resource_type_general = Briard::Utils::SO_TO_DC_TRANSLATIONS[schema_org]
  types = {
- "resourceTypeGeneral" => resource_type_general,
- "resourceType" => meta.fetch("additionalType", nil),
- "schemaOrg" => schema_org,
- "citeproc" => Briard::Utils::SO_TO_CP_TRANSLATIONS[schema_org] || "article-journal",
- "bibtex" => Briard::Utils::SO_TO_BIB_TRANSLATIONS[schema_org] || "misc",
- "ris" => Briard::Utils::SO_TO_RIS_TRANSLATIONS[resource_type_general.to_s.dasherize] || "GEN"
+ 'resourceTypeGeneral' => resource_type_general,
+ 'resourceType' => meta.fetch('additionalType', nil),
+ 'schemaOrg' => schema_org,
+ 'citeproc' => Briard::Utils::SO_TO_CP_TRANSLATIONS[schema_org] || 'article-journal',
+ 'bibtex' => Briard::Utils::SO_TO_BIB_TRANSLATIONS[schema_org] || 'misc',
+ 'ris' => Briard::Utils::SO_TO_RIS_TRANSLATIONS[resource_type_general.to_s.dasherize] || 'GEN'
  }.compact
- authors = meta.fetch("author", nil) || meta.fetch("creator", nil)
+ authors = meta.fetch('author', nil) || meta.fetch('creator', nil)
  # Authors should be an object, if it's just a plain string don't try and parse it.
- if not authors.is_a?(String)
+ unless authors.is_a?(String)
  creators = get_authors(from_schema_org_creators(Array.wrap(authors)))
  end
- contributors = get_authors(from_schema_org_contributors(Array.wrap(meta.fetch("editor", nil))))
- publisher = parse_attributes(meta.fetch("publisher", nil), content: "name", first: true)
+ contributors = get_authors(from_schema_org_contributors(Array.wrap(meta.fetch('editor',
+ nil))))
+ publisher = parse_attributes(meta.fetch('publisher', nil), content: 'name', first: true)

- ct = (schema_org == "Dataset") ? "includedInDataCatalog" : "Periodical"
+ ct = schema_org == 'Dataset' ? 'includedInDataCatalog' : 'Periodical'
  container = if meta.fetch(ct, nil).present?
- url = parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: "url", first: true)
-
- {
- "type" => (schema_org == "Dataset") ? "DataRepository" : "Periodical",
- "title" => parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: "name", first: true),
- "identifier" => url,
- "identifierType" => url.present? ? "URL" : nil,
- "volume" => meta.fetch("volumeNumber", nil),
- "issue" => meta.fetch("issueNumber", nil),
- "firstPage" => meta.fetch("pageStart", nil),
- "lastPage" => meta.fetch("pageEnd", nil)
- }.compact
- elsif ["BlogPosting", "Article"].include?(schema_org)
- issn = meta.dig("isPartOf", "issn")
-
- {
- "type" => "Blog",
- "title" => meta.dig("isPartOf", "name"),
- "identifier" => issn,
- "identifierType" => issn.present? ? "ISSN" : nil
- }.compact
- else
- {}
- end
+ url = parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: 'url',
+ first: true)
+
+ {
+ 'type' => schema_org == 'Dataset' ? 'DataRepository' : 'Periodical',
+ 'title' => parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: 'name',
+ first: true),
+ 'identifier' => url,
+ 'identifierType' => url.present? ? 'URL' : nil,
+ 'volume' => meta.fetch('volumeNumber', nil),
+ 'issue' => meta.fetch('issueNumber', nil),
+ 'firstPage' => meta.fetch('pageStart', nil),
+ 'lastPage' => meta.fetch('pageEnd', nil)
+ }.compact
+ elsif %w[BlogPosting Article].include?(schema_org)
+ issn = meta.dig('isPartOf', 'issn')
+
+ {
+ 'type' => 'Blog',
+ 'title' => meta.dig('isPartOf', 'name'),
+ 'identifier' => issn,
+ 'identifierType' => issn.present? ? 'ISSN' : nil
+ }.compact
+ else
+ {}
+ end

  related_identifiers = Array.wrap(schema_org_is_identical_to(meta)) +
- Array.wrap(schema_org_is_part_of(meta)) +
- Array.wrap(schema_org_has_part(meta)) +
- Array.wrap(schema_org_is_previous_version_of(meta)) +
- Array.wrap(schema_org_is_new_version_of(meta)) +
- Array.wrap(schema_org_references(meta)) +
- Array.wrap(schema_org_is_referenced_by(meta)) +
- Array.wrap(schema_org_is_supplement_to(meta)) +
- Array.wrap(schema_org_is_supplemented_by(meta))
-
- rights_list = Array.wrap(meta.fetch("license", nil)).compact.map do |rl|
+ Array.wrap(schema_org_is_part_of(meta)) +
+ Array.wrap(schema_org_has_part(meta)) +
+ Array.wrap(schema_org_is_previous_version_of(meta)) +
+ Array.wrap(schema_org_is_new_version_of(meta)) +
+ Array.wrap(schema_org_references(meta)) +
+ Array.wrap(schema_org_is_referenced_by(meta)) +
+ Array.wrap(schema_org_is_supplement_to(meta)) +
+ Array.wrap(schema_org_is_supplemented_by(meta))
+
+ rights_list = Array.wrap(meta.fetch('license', nil)).compact.map do |rl|
  if rl.is_a?(String)
- hsh_to_spdx("rightsURI" => rl)
+ hsh_to_spdx('rightsURI' => rl)
  else
- hsh_to_spdx("__content__" => rl["name"], "rightsURI" => rl["id"])
+ hsh_to_spdx('__content__' => rl['name'], 'rightsURI' => rl['id'])
  end
  end

- funding_references = Array.wrap(meta.fetch("funder", nil)).compact.map do |fr|
- if fr["@id"].present?
+ funding_references = Array.wrap(meta.fetch('funder', nil)).compact.map do |fr|
+ if fr['@id'].present?
  {
- "funderName" => fr["name"],
- "funderIdentifier" => fr["@id"],
- "funderIdentifierType" => fr["@id"].to_s.start_with?("https://doi.org/10.13039") ? "Crossref Funder ID" : "Other" }.compact
+ 'funderName' => fr['name'],
+ 'funderIdentifier' => fr['@id'],
+ 'funderIdentifierType' => fr['@id'].to_s.start_with?('https://doi.org/10.13039') ? 'Crossref Funder ID' : 'Other'
+ }.compact
  else
- {
- "funderName" => fr["name"] }.compact
+ { 'funderName' => fr['name'] }.compact
  end
  end

  # strip milliseconds from iso8601, as edtf library doesn't handle them
  dates = []
- dates << { "date" => strip_milliseconds(meta.fetch("datePublished")), "dateType" => "Issued" } if Date.edtf(strip_milliseconds(meta.fetch("datePublished", nil))).present?
- dates << { "date" => strip_milliseconds(meta.fetch("dateCreated")), "dateType" => "Created" } if Date.edtf(strip_milliseconds(meta.fetch("dateCreated", nil))).present?
- dates << { "date" => strip_milliseconds(meta.fetch("dateModified")), "dateType" => "Updated" } if Date.edtf(strip_milliseconds(meta.fetch("dateModified", nil))).present?
- publication_year = meta.fetch("datePublished")[0..3] if meta.fetch("datePublished", nil).present?
-
- if meta.fetch("inLanguage", nil).is_a?(String)
- language = meta.fetch("inLanguage")
- elsif meta.fetch("inLanguage", nil).is_a?(Object)
- language = meta.dig("inLanguage", 'alternateName') || meta.dig("inLanguage", 'name')
- else
- language = nil
+ if Date.edtf(strip_milliseconds(meta.fetch('datePublished', nil))).present?
+ dates << { 'date' => strip_milliseconds(meta.fetch('datePublished')),
+ 'dateType' => 'Issued' }
  end
-
- state = meta.present? || read_options.present? ? "findable" : "not_found"
- geo_locations = Array.wrap(meta.fetch("spatialCoverage", nil)).map do |gl|
- if gl.dig("geo", "box")
- s, w, n, e = gl.dig("geo", "box").split(" ", 4)
+ if Date.edtf(strip_milliseconds(meta.fetch('dateCreated', nil))).present?
+ dates << { 'date' => strip_milliseconds(meta.fetch('dateCreated')),
+ 'dateType' => 'Created' }
+ end
+ if Date.edtf(strip_milliseconds(meta.fetch('dateModified', nil))).present?
+ dates << { 'date' => strip_milliseconds(meta.fetch('dateModified')),
+ 'dateType' => 'Updated' }
+ end
+ publication_year = meta.fetch('datePublished')[0..3] if meta.fetch('datePublished',
+ nil).present?
+
+ language = case meta.fetch('inLanguage', nil)
+ when String
+ meta.fetch('inLanguage')
+ when Object
+ meta.dig('inLanguage', 'alternateName') || meta.dig('inLanguage', 'name')
+ end
+
+ state = meta.present? || read_options.present? ? 'findable' : 'not_found'
+ geo_locations = Array.wrap(meta.fetch('spatialCoverage', nil)).map do |gl|
+ if gl.dig('geo', 'box')
+ s, w, n, e = gl.dig('geo', 'box').split(' ', 4)
  geo_location_box = {
- "westBoundLongitude" => w,
- "eastBoundLongitude" => e,
- "southBoundLatitude" => s,
- "northBoundLatitude" => n,
+ 'westBoundLongitude' => w,
+ 'eastBoundLongitude' => e,
+ 'southBoundLatitude' => s,
+ 'northBoundLatitude' => n
  }.compact.presence
  else
  geo_location_box = nil
  end
- geo_location_point = { "pointLongitude" => gl.dig("geo", "longitude"), "pointLatitude" => gl.dig("geo", "latitude") }.compact.presence
+ geo_location_point = { 'pointLongitude' => gl.dig('geo', 'longitude'),
+ 'pointLatitude' => gl.dig('geo', 'latitude') }.compact.presence

  {
- "geoLocationPlace" => gl.dig("geo", "address"),
- "geoLocationPoint" => geo_location_point,
- "geoLocationBox" => geo_location_box
+ 'geoLocationPlace' => gl.dig('geo', 'address'),
+ 'geoLocationPoint' => geo_location_point,
+ 'geoLocationBox' => geo_location_box
  }.compact
  end

  # handle keywords as array and as comma-separated string
- subjects = meta.fetch("keywords", nil)
- subjects = subjects.to_s.downcase.split(", ") if subjects.is_a?(String)
+ subjects = meta.fetch('keywords', nil)
+ subjects = subjects.to_s.downcase.split(', ') if subjects.is_a?(String)
  subjects = Array.wrap(subjects).reduce([]) do |sum, subject|
  sum += name_to_fos(subject)
  sum
  end

- { "id" => id,
- "types" => types,
- "doi" => validate_doi(id),
- "identifiers" => identifiers,
- "url" => normalize_id(meta.fetch("url", nil)),
- "content_url" => Array.wrap(meta.fetch("contentUrl", nil)),
- "sizes" => Array.wrap(meta.fetch("contenSize", nil)).presence,
- "formats" => Array.wrap(meta.fetch("encodingFormat", nil) || meta.fetch("fileFormat", nil)),
- "titles" => meta.fetch("name", nil).present? ? [{ "title" => meta.fetch("name", nil) }] : [{ "title" => meta.fetch("headline", nil) }],
- "creators" => creators,
- "contributors" => contributors,
- "publisher" => publisher,
- "agency" => parse_attributes(meta.fetch("provider", nil), content: "name", first: true),
- "container" => container,
- "related_identifiers" => related_identifiers,
- "publication_year" => publication_year,
- "dates" => dates,
- "descriptions" => meta.fetch("description", nil).present? ? [{ "description" => sanitize(meta.fetch("description")), "descriptionType" => "Abstract" }] : nil,
- "rights_list" => rights_list,
- "version_info" => meta.fetch("version", nil).to_s.presence,
- "subjects" => subjects,
- "language" => language,
- "state" => state,
- "schema_version" => meta.fetch("schemaVersion", nil).to_s.presence,
- "funding_references" => funding_references,
- "geo_locations" => geo_locations
- }.merge(read_options)
+ { 'id' => id,
+ 'types' => types,
+ 'doi' => validate_doi(id),
+ 'identifiers' => identifiers,
+ 'url' => normalize_id(meta.fetch('url', nil)),
+ 'content_url' => Array.wrap(meta.fetch('contentUrl', nil)),
+ 'sizes' => Array.wrap(meta.fetch('contenSize', nil)).presence,
+ 'formats' => Array.wrap(meta.fetch('encodingFormat',
+ nil) || meta.fetch('fileFormat', nil)),
+ 'titles' => if meta.fetch('name', nil).present?
+ [{ 'title' => meta.fetch('name', nil) }]
+ else
+ [{ 'title' => meta.fetch('headline', nil) }]
+ end,
+ 'creators' => creators,
+ 'contributors' => contributors,
+ 'publisher' => publisher,
+ 'agency' => parse_attributes(meta.fetch('provider', nil), content: 'name', first: true),
+ 'container' => container,
+ 'related_identifiers' => related_identifiers,
+ 'publication_year' => publication_year,
+ 'dates' => dates,
+ 'descriptions' => if meta.fetch('description', nil).present?
+ [{ 'description' => sanitize(meta.fetch('description')),
+ 'descriptionType' => 'Abstract' }]
+ end,
+ 'rights_list' => rights_list,
+ 'version_info' => meta.fetch('version', nil).to_s.presence,
+ 'subjects' => subjects,
+ 'language' => language,
+ 'state' => state,
+ 'schema_version' => meta.fetch('schemaVersion', nil).to_s.presence,
+ 'funding_references' => funding_references,
+ 'geo_locations' => geo_locations }.merge(read_options)
  end

  def schema_org_related_identifier(meta, relation_type: nil)
- normalize_ids(ids: meta.fetch(relation_type, nil), relation_type: SO_TO_DC_RELATION_TYPES[relation_type])
+ normalize_ids(ids: meta.fetch(relation_type, nil),
+ relation_type: SO_TO_DC_RELATION_TYPES[relation_type])
  end

  def schema_org_reverse_related_identifier(meta, relation_type: nil)
- normalize_ids(ids: meta.dig("@reverse", relation_type), relation_type: SO_TO_DC_REVERSE_RELATION_TYPES[relation_type])
+ normalize_ids(ids: meta.dig('@reverse', relation_type),
+ relation_type: SO_TO_DC_REVERSE_RELATION_TYPES[relation_type])
  end

  def schema_org_is_identical_to(meta)
- schema_org_related_identifier(meta, relation_type: "sameAs")
+ schema_org_related_identifier(meta, relation_type: 'sameAs')
  end

  def schema_org_is_part_of(meta)
- schema_org_related_identifier(meta, relation_type: "isPartOf")
+ schema_org_related_identifier(meta, relation_type: 'isPartOf')
  end

  def schema_org_has_part(meta)
- schema_org_related_identifier(meta, relation_type: "hasPart")
+ schema_org_related_identifier(meta, relation_type: 'hasPart')
  end

  def schema_org_is_previous_version_of(meta)
- schema_org_related_identifier(meta, relation_type: "PredecessorOf")
+ schema_org_related_identifier(meta, relation_type: 'PredecessorOf')
  end

  def schema_org_is_new_version_of(meta)
- schema_org_related_identifier(meta, relation_type: "SuccessorOf")
+ schema_org_related_identifier(meta, relation_type: 'SuccessorOf')
  end

  def schema_org_references(meta)
- schema_org_related_identifier(meta, relation_type: "citation")
+ schema_org_related_identifier(meta, relation_type: 'citation')
  end

  def schema_org_is_referenced_by(meta)
- schema_org_reverse_related_identifier(meta, relation_type: "citation")
+ schema_org_reverse_related_identifier(meta, relation_type: 'citation')
  end

  def schema_org_is_supplement_to(meta)
- schema_org_reverse_related_identifier(meta, relation_type: "isBasedOn")
+ schema_org_reverse_related_identifier(meta, relation_type: 'isBasedOn')
  end

  def schema_org_is_supplemented_by(meta)
- schema_org_related_identifier(meta, relation_type: "isBasedOn")
+ schema_org_related_identifier(meta, relation_type: 'isBasedOn')
  end
-
  end
  end
  end
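
Note on the identifier handling above: read_schema_org now classifies identifiers with URI(r).host != 'doi.org' instead of the old prefix test !r.start_with?("https://doi.org"). A minimal Ruby sketch of the difference, with contrived hostnames; normalize_id runs first in the reader, which should keep malformed URIs (on which URI() raises) out of this check:

require 'uri'

ids = [
  'https://doi.org/10.5438/4k3m-nyvg', # both versions agree: a doi.org URL
  'https://doi.orgx.example/record/1', # hypothetical host; the old prefix test matched it
  'https://example.org/record/1'       # both versions agree: a plain URL identifier
]

ids.each do |r|
  old_is_url = !r.start_with?('https://doi.org') # 2.4.1 check
  new_is_url = URI(r).host != 'doi.org'          # 2.6.0 check
  puts format('%-36s old: %-5s new: %s', r, old_is_url, new_is_url)
end
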
data/lib/briard/string.rb CHANGED
@@ -2,6 +2,6 @@

  class String
  def my_titleize
- self.gsub(/\b(['’]?[a-z])/) { "#{$1.capitalize}" }
+ gsub(/\b(['’]?[a-z])/) { ::Regexp.last_match(1).capitalize.to_s }
  end
  end
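
The ['’]? group makes an apostrophe-plus-letter pair a single match, and String#capitalize upcases only the first character of the match, so "'s".capitalize comes back as "'s" and letters after an apostrophe stay lowercase. A quick sketch, assuming the gem loads this monkey-patch:

require 'briard' # assumed to load the String#my_titleize patch shown above

puts 'speed of light'.my_titleize    # => "Speed Of Light"
puts "it's a dog's life".my_titleize # => "It's A Dog's Life", not "It'S A Dog'S Life"
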
data/lib/briard/utils.rb CHANGED
@@ -500,7 +500,7 @@ module Briard
  def find_from_format_by_id(id)
  id = normalize_id(id)

- if /\A(?:(http|https):\/(\/)?(dx\.)?(doi.org|handle.stage.datacite.org)\/)?(doi:)?(10\.\d{4,5}\/.+)\z/.match(id)
+ if /\A(?:(http|https):\/(\/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org)\/)?(doi:)?(10\.\d{4,5}\/.+)\z/.match(id)
  ra = get_doi_ra(id)
  %w(DataCite Crossref mEDRA KISTI JaLC OP).include?(ra) ? ra.downcase : nil
  elsif /\A(?:(http|https):\/(\/)?orcid\.org\/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X]+)\z/.match(id)
@@ -537,7 +537,7 @@ module Briard
  "datacite"
  elsif options[:ext] == ".cff"
  "cff"
- elsif options[:ext] == ".json" && Maremma.from_json(string).to_h.dig("@context").to_s.start_with?("http://schema.org", "https://schema.org")
+ elsif options[:ext] == ".json" && URI(Maremma.from_json(string).to_h.fetch("@context", "")).host == "schema.org"
  "schema_org"
  elsif options[:ext] == ".json" && Maremma.from_json(string).to_h.dig("@context") == ("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld")
  "codemeta"
@@ -555,7 +555,7 @@ module Briard
  "crossref"
  elsif Nokogiri::XML(string, nil, 'UTF-8', &:noblanks).collect_namespaces.find { |k, v| v.start_with?("http://datacite.org/schema/kernel") }
  "datacite"
- elsif Maremma.from_json(string).to_h.dig("@context").to_s.start_with?("http://schema.org", "https://schema.org")
+ elsif URI(Maremma.from_json(string).to_h.fetch("@context", "")).host == "schema.org"
  "schema_org"
  elsif Maremma.from_json(string).to_h.dig("@context") == ("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld")
  "codemeta"
@@ -940,7 +940,7 @@ module Briard
  end

  # alternatively find the nameIdentifier in the sameAs attribute
- c["@id"] = c["sameAs"].first if Array(c["sameAs"]).find { |item| item.start_with?("https://orcid.org") }
+ c["@id"] = c["sameAs"].first if Array(c["sameAs"]).find { |item| URI(item).host == "orcid.org" }

  c["nameIdentifier"] = [{ "__content__" => c["@id"], "nameIdentifierScheme" => "ORCID", "schemeUri" => "https://orcid.org" }] if normalize_orcid(c["@id"])
  c["@type"] = c["@type"].find { |t| %w(Person Organization).include?(t) } if c["@type"].is_a?(Array)
data/lib/briard/version.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module Briard
- VERSION = "2.4.1"
+ VERSION = '2.6.0'
  end
data/lib/briard/whitelist_scrubber.rb CHANGED
@@ -4,7 +4,7 @@

  module Briard
  class WhitelistScrubber < Loofah::Scrubber
- def initialize(options={})
+ def initialize(options = {})
  @direction = :bottom_up
  @tags = options[:tags]
  @attributes = options[:attributes]
@@ -12,6 +12,7 @@ module Briard

  def scrub(node)
  scrub_node_attributes(node) and return CONTINUE if node_allowed?(node)
+
  node.before node.children
  node.remove
  end
@@ -19,14 +20,17 @@ module Briard
  private

  def scrub_node_attributes(node)
- fallback_scrub_node_attributes(node) and return true unless @attributes.present? && @attributes.respond_to?(:include?)
+ unless @attributes.present? && @attributes.respond_to?(:include?)
+ fallback_scrub_node_attributes(node) and return true
+ end
+
  node.attribute_nodes.each do |attr_node|
  attr_node.remove unless @attributes.include?(attr_node.name)
  end
  end

  def allowed_not_element_node_types
- [ Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE ]
+ [Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE]
  end

  def fallback_scrub_node_attributes(node)
@@ -38,9 +42,12 @@ module Briard
  end

  def node_allowed?(node)
- return fallback_allowed_element_detection(node) unless @tags.present? && @tags.respond_to?(:include?)
+ unless @tags.present? && @tags.respond_to?(:include?)
+ return fallback_allowed_element_detection(node)
+ end
  return true if allowed_not_element_node_types.include?(node.type)
  return false unless node.type == Nokogiri::XML::Node::ELEMENT_NODE
+
  @tags.include? node.name
  end
  end
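
The scrubber's behavior is unchanged by the style rewrite above: whitelisted nodes keep only whitelisted attributes, and any other element is unwrapped in place (node.before node.children, then node.remove). A hedged usage sketch with Loofah; the tag and attribute lists here are hypothetical, not necessarily the defaults the gem's sanitize helper passes in:

require 'loofah'
require 'briard'

scrubber = Briard::WhitelistScrubber.new(tags: %w[b i sub sup], attributes: %w[class])
html = '<p class="x">Kept <b class="y">bold</b>, <a href="/y">link text</a></p>'

# p and a are not in the tag list, so they are unwrapped and their children kept:
puts Loofah.fragment(html).scrub!(scrubber).to_s
# => Kept <b class="y">bold</b>, link text
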
data/lib/briard/writers/bibtex_writer.rb CHANGED
@@ -6,24 +6,30 @@ module Briard
  def bibtex
  return nil unless valid?

- pages = container.to_h["firstPage"].present? ? [container["firstPage"], container["lastPage"]].compact.join("-") : nil
+ pages = if container.to_h['firstPage'].present?
+ [container['firstPage'], container['lastPage']].compact.join('-')
+ end

  bib = {
- bibtex_type: types["bibtex"].presence || "misc",
+ bibtex_type: types['bibtex'].presence || 'misc',
  bibtex_key: normalize_doi(doi),
  doi: doi,
  url: url,
  author: authors_as_string(creators),
- keywords: subjects.present? ? Array.wrap(subjects).map { |k| parse_attributes(k, content: "subject", first: true) }.join(", ") : nil,
+ keywords: if subjects.present?
+ Array.wrap(subjects).map do |k|
+ parse_attributes(k, content: 'subject', first: true)
+ end.join(', ')
+ end,
  language: language,
- title: parse_attributes(titles, content: "title", first: true),
- journal: container && container["title"],
- volume: container.to_h["volume"],
- issue: container.to_h["issue"],
+ title: parse_attributes(titles, content: 'title', first: true),
+ journal: container && container['title'],
+ volume: container.to_h['volume'],
+ issue: container.to_h['issue'],
  pages: pages,
  publisher: publisher,
  year: publication_year,
- copyright: Array.wrap(rights_list).map { |l| l["rights"] }.first,
+ copyright: Array.wrap(rights_list).map { |l| l['rights'] }.first
  }.compact
  BibTeX::Entry.new(bib).to_s
  end
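
For context, this writer is reached through the metadata class, mirroring the bolognese API the gem was forked from; a sketch of assumed typical usage (the DOI is only an example input, and resolving it requires network access):

require 'briard'

metadata = Briard::Metadata.new(input: '10.7554/eLife.01567')
puts metadata.bibtex
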