briard 2.4.2 → 2.6.1

Files changed (83)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/codeql-analysis.yml +72 -0
  3. data/.github/workflows/rubocop.yml +50 -0
  4. data/.gitignore +1 -0
  5. data/.rubocop.yml +144 -620
  6. data/.rubocop_todo.yml +76 -0
  7. data/CHANGELOG.md +18 -0
  8. data/Gemfile +2 -0
  9. data/Gemfile.lock +43 -9
  10. data/Rakefile +1 -1
  11. data/{bolognese.gemspec → briard.gemspec} +46 -39
  12. data/lib/briard/array.rb +2 -2
  13. data/lib/briard/author_utils.rb +79 -71
  14. data/lib/briard/cli.rb +12 -13
  15. data/lib/briard/crossref_utils.rb +73 -61
  16. data/lib/briard/datacite_utils.rb +132 -106
  17. data/lib/briard/doi_utils.rb +10 -10
  18. data/lib/briard/metadata.rb +96 -106
  19. data/lib/briard/metadata_utils.rb +87 -78
  20. data/lib/briard/readers/bibtex_reader.rb +65 -65
  21. data/lib/briard/readers/cff_reader.rb +88 -70
  22. data/lib/briard/readers/citeproc_reader.rb +90 -84
  23. data/lib/briard/readers/codemeta_reader.rb +68 -50
  24. data/lib/briard/readers/crosscite_reader.rb +2 -2
  25. data/lib/briard/readers/crossref_reader.rb +249 -210
  26. data/lib/briard/readers/datacite_json_reader.rb +3 -3
  27. data/lib/briard/readers/datacite_reader.rb +225 -189
  28. data/lib/briard/readers/npm_reader.rb +49 -42
  29. data/lib/briard/readers/ris_reader.rb +82 -80
  30. data/lib/briard/readers/schema_org_reader.rb +182 -159
  31. data/lib/briard/string.rb +1 -1
  32. data/lib/briard/utils.rb +4 -4
  33. data/lib/briard/version.rb +3 -1
  34. data/lib/briard/whitelist_scrubber.rb +11 -4
  35. data/lib/briard/writers/bibtex_writer.rb +14 -8
  36. data/lib/briard/writers/cff_writer.rb +33 -26
  37. data/lib/briard/writers/codemeta_writer.rb +19 -15
  38. data/lib/briard/writers/csv_writer.rb +6 -4
  39. data/lib/briard/writers/datacite_json_writer.rb +8 -2
  40. data/lib/briard/writers/jats_writer.rb +33 -28
  41. data/lib/briard/writers/rdf_xml_writer.rb +1 -1
  42. data/lib/briard/writers/ris_writer.rb +30 -18
  43. data/lib/briard/writers/turtle_writer.rb +1 -1
  44. data/lib/briard.rb +6 -6
  45. data/rubocop.sarif +0 -0
  46. data/spec/array_spec.rb +5 -5
  47. data/spec/author_utils_spec.rb +151 -132
  48. data/spec/datacite_utils_spec.rb +135 -83
  49. data/spec/doi_utils_spec.rb +168 -164
  50. data/spec/find_from_format_spec.rb +69 -69
  51. data/spec/fixtures/vcr_cassettes/Briard_Metadata/sanitize/onlies_keep_specific_tags.yml +65 -0
  52. data/spec/fixtures/vcr_cassettes/Briard_Metadata/sanitize/removes_a_tags.yml +65 -0
  53. data/spec/metadata_spec.rb +91 -90
  54. data/spec/readers/bibtex_reader_spec.rb +43 -38
  55. data/spec/readers/cff_reader_spec.rb +165 -153
  56. data/spec/readers/citeproc_reader_spec.rb +45 -40
  57. data/spec/readers/codemeta_reader_spec.rb +128 -115
  58. data/spec/readers/crosscite_reader_spec.rb +34 -24
  59. data/spec/readers/crossref_reader_spec.rb +1098 -939
  60. data/spec/readers/datacite_json_reader_spec.rb +53 -40
  61. data/spec/readers/datacite_reader_spec.rb +1541 -1337
  62. data/spec/readers/npm_reader_spec.rb +48 -43
  63. data/spec/readers/ris_reader_spec.rb +53 -47
  64. data/spec/readers/schema_org_reader_spec.rb +329 -267
  65. data/spec/spec_helper.rb +6 -5
  66. data/spec/utils_spec.rb +371 -347
  67. data/spec/writers/bibtex_writer_spec.rb +143 -143
  68. data/spec/writers/cff_writer_spec.rb +96 -90
  69. data/spec/writers/citation_writer_spec.rb +34 -33
  70. data/spec/writers/citeproc_writer_spec.rb +226 -224
  71. data/spec/writers/codemeta_writer_spec.rb +18 -16
  72. data/spec/writers/crosscite_writer_spec.rb +91 -73
  73. data/spec/writers/crossref_writer_spec.rb +99 -91
  74. data/spec/writers/csv_writer_spec.rb +70 -70
  75. data/spec/writers/datacite_json_writer_spec.rb +78 -68
  76. data/spec/writers/datacite_writer_spec.rb +417 -322
  77. data/spec/writers/jats_writer_spec.rb +177 -161
  78. data/spec/writers/rdf_xml_writer_spec.rb +68 -63
  79. data/spec/writers/ris_writer_spec.rb +162 -162
  80. data/spec/writers/schema_org_writer_spec.rb +329 -294
  81. data/spec/writers/turtle_writer_spec.rb +47 -47
  82. metadata +242 -166
  83. data/.github/workflows/release.yml +0 -47
data/lib/briard/readers/schema_org_reader.rb CHANGED
@@ -4,281 +4,304 @@ module Briard
  module Readers
  module SchemaOrgReader
  SO_TO_DC_RELATION_TYPES = {
- "citation" => "References",
- "isBasedOn" => "IsSupplementedBy",
- "sameAs" => "IsIdenticalTo",
- "isPartOf" => "IsPartOf",
- "hasPart" => "HasPart",
- "isPredecessor" => "IsPreviousVersionOf",
- "isSuccessor" => "IsNewVersionOf"
- }
+ 'citation' => 'References',
+ 'isBasedOn' => 'IsSupplementedBy',
+ 'sameAs' => 'IsIdenticalTo',
+ 'isPartOf' => 'IsPartOf',
+ 'hasPart' => 'HasPart',
+ 'isPredecessor' => 'IsPreviousVersionOf',
+ 'isSuccessor' => 'IsNewVersionOf'
+ }.freeze

  SO_TO_DC_REVERSE_RELATION_TYPES = {
- "citation" => "IsReferencedBy",
- "isBasedOn" => "IsSupplementTo",
- "sameAs" => "IsIdenticalTo",
- "isPartOf" => "HasPart",
- "hasPart" => "IsPartOf",
- "isPredecessor" => "IsNewVersionOf",
- "isSuccessor" => "IsPreviousVersionOf"
- }
-
- def get_schema_org(id: nil, **options)
- return { "string" => nil, "state" => "not_found" } unless id.present?
+ 'citation' => 'IsReferencedBy',
+ 'isBasedOn' => 'IsSupplementTo',
+ 'sameAs' => 'IsIdenticalTo',
+ 'isPartOf' => 'HasPart',
+ 'hasPart' => 'IsPartOf',
+ 'isPredecessor' => 'IsNewVersionOf',
+ 'isSuccessor' => 'IsPreviousVersionOf'
+ }.freeze
+
+ def get_schema_org(id: nil, **_options)
+ return { 'string' => nil, 'state' => 'not_found' } unless id.present?

  url = normalize_id(id)
  response = Maremma.get(url, raw: true)

  # some responses are returned as a hash
- if response.body["data"].is_a?(Hash)
- string = response.body.dig("data", "html", "head", "script", 1, "__content__")
+ if response.body['data'].is_a?(Hash)
+ string = response.body.dig('data', 'html', 'head', 'script', 1, '__content__')
  else
- doc = Nokogiri::XML(response.body.fetch("data", nil), nil, 'UTF-8')
-
+ doc = Nokogiri::XML(response.body.fetch('data', nil), nil, 'UTF-8')
+
  # workaround for xhtml documents
  nodeset = doc.at("script[type='application/ld+json']")
- hsh = JSON.parse(nodeset || "{}")
-
+ hsh = JSON.parse(nodeset || '{}')
+
  # workaround for doi as canonical_url but not included with schema.org
  link = doc.css("link[rel='canonical']")
- hsh.merge!({ "@id" => link[0]["href"] }) if link.present?
+ hsh['@id'] = link[0]['href'] if link.present?

  # workaround if license included but not with schema.org
  license = doc.at("meta[name='DCTERMS.license']")
- hsh.merge!({ "license" => license["content"] }) if license.present?
-
+ hsh['license'] = license['content'] if license.present?
+
  # workaround for html language attribute if no language is set via schema.org
  lang = doc.at('html')['lang']
- hsh.merge!({ "inLanguage" => lang }) if hsh["inLanguage"].blank?
+ hsh['inLanguage'] = lang if hsh['inLanguage'].blank?

  # workaround if issn not included with schema.org
  name = doc.at("meta[property='og:site_name']")
  issn = doc.at("meta[name='citation_issn']")
- hsh.merge!({ "isPartOf" => { "name" => name ? name["content"] : nil, "issn" => issn ? issn["content"] : nil }.compact })
+ hsh['isPartOf'] = { 'name' => name ? name['content'] : nil,
+ 'issn' => issn ? issn['content'] : nil }.compact

  string = hsh.to_json if hsh.present?
  end

- { "string" => string }
+ { 'string' => string }
  end

  def read_schema_org(string: nil, **options)
  if string.present?
  errors = jsonlint(string)
- return { "errors" => errors } if errors.present?
+ return { 'errors' => errors } if errors.present?
  end

- read_options = ActiveSupport::HashWithIndifferentAccess.new(options.except(:doi, :id, :url, :sandbox, :validate, :ra))
+ read_options = ActiveSupport::HashWithIndifferentAccess.new(options.except(:doi, :id, :url,
+ :sandbox, :validate, :ra))

  meta = string.present? ? Maremma.from_json(string) : {}

- identifiers = Array.wrap(meta.fetch("identifier", nil)).map do |r|
+ identifiers = Array.wrap(meta.fetch('identifier', nil)).map do |r|
  r = normalize_id(r) if r.is_a?(String)
- if r.is_a?(String) && !r.start_with?("https://doi.org")
- { "identifierType" => "URL", "identifier" => r }
+ if r.is_a?(String) && URI(r).host != 'doi.org'
+ { 'identifierType' => 'URL', 'identifier' => r }
  elsif r.is_a?(Hash)
- { "identifierType" => get_identifier_type(r["propertyID"]), "identifier" => r["value"] }
+ { 'identifierType' => get_identifier_type(r['propertyID']), 'identifier' => r['value'] }
  end
  end.compact.uniq

  id = options[:doi]
- id = meta.fetch("@id", nil) if id.blank? && meta.fetch("@id", nil).to_s.start_with?("https://doi.org")
- id = meta.fetch("identifier", nil) if id.blank? # && meta.fetch("identifier", nil).to_s.start_with?("https://doi.org")#&& meta.fetch("@", nil).start_with?("https://doi.org")
+ id = meta.fetch('@id', nil) if id.blank? && URI(meta.fetch('@id', '')).host == 'doi.org'
+ id = meta.fetch('identifier', nil) if id.blank?
  id = normalize_id(id)

- schema_org = meta.fetch("@type", nil) && meta.fetch("@type").camelcase
+ schema_org = meta.fetch('@type', nil) && meta.fetch('@type').camelcase
  resource_type_general = Briard::Utils::SO_TO_DC_TRANSLATIONS[schema_org]
  types = {
- "resourceTypeGeneral" => resource_type_general,
- "resourceType" => meta.fetch("additionalType", nil),
- "schemaOrg" => schema_org,
- "citeproc" => Briard::Utils::SO_TO_CP_TRANSLATIONS[schema_org] || "article-journal",
- "bibtex" => Briard::Utils::SO_TO_BIB_TRANSLATIONS[schema_org] || "misc",
- "ris" => Briard::Utils::SO_TO_RIS_TRANSLATIONS[resource_type_general.to_s.dasherize] || "GEN"
+ 'resourceTypeGeneral' => resource_type_general,
+ 'resourceType' => meta.fetch('additionalType', nil),
+ 'schemaOrg' => schema_org,
+ 'citeproc' => Briard::Utils::SO_TO_CP_TRANSLATIONS[schema_org] || 'article-journal',
+ 'bibtex' => Briard::Utils::SO_TO_BIB_TRANSLATIONS[schema_org] || 'misc',
+ 'ris' => Briard::Utils::SO_TO_RIS_TRANSLATIONS[resource_type_general.to_s.dasherize] || 'GEN'
  }.compact
- authors = meta.fetch("author", nil) || meta.fetch("creator", nil)
+ authors = meta.fetch('author', nil) || meta.fetch('creator', nil)
  # Authors should be an object, if it's just a plain string don't try and parse it.
- if not authors.is_a?(String)
+ unless authors.is_a?(String)
  creators = get_authors(from_schema_org_creators(Array.wrap(authors)))
  end
- contributors = get_authors(from_schema_org_contributors(Array.wrap(meta.fetch("editor", nil))))
- publisher = parse_attributes(meta.fetch("publisher", nil), content: "name", first: true)
+ contributors = get_authors(from_schema_org_contributors(Array.wrap(meta.fetch('editor',
+ nil))))
+ publisher = parse_attributes(meta.fetch('publisher', nil), content: 'name', first: true)

- ct = (schema_org == "Dataset") ? "includedInDataCatalog" : "Periodical"
+ ct = schema_org == 'Dataset' ? 'includedInDataCatalog' : 'Periodical'
  container = if meta.fetch(ct, nil).present?
- url = parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: "url", first: true)
-
- {
- "type" => (schema_org == "Dataset") ? "DataRepository" : "Periodical",
- "title" => parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: "name", first: true),
- "identifier" => url,
- "identifierType" => url.present? ? "URL" : nil,
- "volume" => meta.fetch("volumeNumber", nil),
- "issue" => meta.fetch("issueNumber", nil),
- "firstPage" => meta.fetch("pageStart", nil),
- "lastPage" => meta.fetch("pageEnd", nil)
- }.compact
- elsif ["BlogPosting", "Article"].include?(schema_org)
- issn = meta.dig("isPartOf", "issn")
-
- {
- "type" => "Blog",
- "title" => meta.dig("isPartOf", "name"),
- "identifier" => issn,
- "identifierType" => issn.present? ? "ISSN" : nil
- }.compact
- else
- {}
- end
+ url = parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: 'url',
+ first: true)
+
+ {
+ 'type' => schema_org == 'Dataset' ? 'DataRepository' : 'Periodical',
+ 'title' => parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: 'name',
+ first: true),
+ 'identifier' => url,
+ 'identifierType' => url.present? ? 'URL' : nil,
+ 'volume' => meta.fetch('volumeNumber', nil),
+ 'issue' => meta.fetch('issueNumber', nil),
+ 'firstPage' => meta.fetch('pageStart', nil),
+ 'lastPage' => meta.fetch('pageEnd', nil)
+ }.compact
+ elsif %w[BlogPosting Article].include?(schema_org)
+ issn = meta.dig('isPartOf', 'issn')
+
+ {
+ 'type' => 'Blog',
+ 'title' => meta.dig('isPartOf', 'name'),
+ 'identifier' => issn,
+ 'identifierType' => issn.present? ? 'ISSN' : nil
+ }.compact
+ else
+ {}
+ end

  related_identifiers = Array.wrap(schema_org_is_identical_to(meta)) +
- Array.wrap(schema_org_is_part_of(meta)) +
- Array.wrap(schema_org_has_part(meta)) +
- Array.wrap(schema_org_is_previous_version_of(meta)) +
- Array.wrap(schema_org_is_new_version_of(meta)) +
- Array.wrap(schema_org_references(meta)) +
- Array.wrap(schema_org_is_referenced_by(meta)) +
- Array.wrap(schema_org_is_supplement_to(meta)) +
- Array.wrap(schema_org_is_supplemented_by(meta))
-
- rights_list = Array.wrap(meta.fetch("license", nil)).compact.map do |rl|
+ Array.wrap(schema_org_is_part_of(meta)) +
+ Array.wrap(schema_org_has_part(meta)) +
+ Array.wrap(schema_org_is_previous_version_of(meta)) +
+ Array.wrap(schema_org_is_new_version_of(meta)) +
+ Array.wrap(schema_org_references(meta)) +
+ Array.wrap(schema_org_is_referenced_by(meta)) +
+ Array.wrap(schema_org_is_supplement_to(meta)) +
+ Array.wrap(schema_org_is_supplemented_by(meta))
+
+ rights_list = Array.wrap(meta.fetch('license', nil)).compact.map do |rl|
  if rl.is_a?(String)
- hsh_to_spdx("rightsURI" => rl)
+ hsh_to_spdx('rightsURI' => rl)
  else
- hsh_to_spdx("__content__" => rl["name"], "rightsURI" => rl["id"])
+ hsh_to_spdx('__content__' => rl['name'], 'rightsURI' => rl['id'])
  end
  end

- funding_references = Array.wrap(meta.fetch("funder", nil)).compact.map do |fr|
- if fr["@id"].present?
+ funding_references = Array.wrap(meta.fetch('funder', nil)).compact.map do |fr|
+ if fr['@id'].present?
  {
- "funderName" => fr["name"],
- "funderIdentifier" => fr["@id"],
- "funderIdentifierType" => fr["@id"].to_s.start_with?("https://doi.org/10.13039") ? "Crossref Funder ID" : "Other" }.compact
+ 'funderName' => fr['name'],
+ 'funderIdentifier' => fr['@id'],
+ 'funderIdentifierType' => fr['@id'].to_s.start_with?('https://doi.org/10.13039') ? 'Crossref Funder ID' : 'Other'
+ }.compact
  else
- {
- "funderName" => fr["name"] }.compact
+ { 'funderName' => fr['name'] }.compact
  end
  end

  # strip milliseconds from iso8601, as edtf library doesn't handle them
  dates = []
- dates << { "date" => strip_milliseconds(meta.fetch("datePublished")), "dateType" => "Issued" } if Date.edtf(strip_milliseconds(meta.fetch("datePublished", nil))).present?
- dates << { "date" => strip_milliseconds(meta.fetch("dateCreated")), "dateType" => "Created" } if Date.edtf(strip_milliseconds(meta.fetch("dateCreated", nil))).present?
- dates << { "date" => strip_milliseconds(meta.fetch("dateModified")), "dateType" => "Updated" } if Date.edtf(strip_milliseconds(meta.fetch("dateModified", nil))).present?
- publication_year = meta.fetch("datePublished")[0..3] if meta.fetch("datePublished", nil).present?
-
- if meta.fetch("inLanguage", nil).is_a?(String)
- language = meta.fetch("inLanguage")
- elsif meta.fetch("inLanguage", nil).is_a?(Object)
- language = meta.dig("inLanguage", 'alternateName') || meta.dig("inLanguage", 'name')
- else
- language = nil
+ if Date.edtf(strip_milliseconds(meta.fetch('datePublished', nil))).present?
+ dates << { 'date' => strip_milliseconds(meta.fetch('datePublished')),
+ 'dateType' => 'Issued' }
  end
-
- state = meta.present? || read_options.present? ? "findable" : "not_found"
- geo_locations = Array.wrap(meta.fetch("spatialCoverage", nil)).map do |gl|
- if gl.dig("geo", "box")
- s, w, n, e = gl.dig("geo", "box").split(" ", 4)
+ if Date.edtf(strip_milliseconds(meta.fetch('dateCreated', nil))).present?
+ dates << { 'date' => strip_milliseconds(meta.fetch('dateCreated')),
+ 'dateType' => 'Created' }
+ end
+ if Date.edtf(strip_milliseconds(meta.fetch('dateModified', nil))).present?
+ dates << { 'date' => strip_milliseconds(meta.fetch('dateModified')),
+ 'dateType' => 'Updated' }
+ end
+ publication_year = meta.fetch('datePublished')[0..3] if meta.fetch('datePublished',
+ nil).present?
+
+ language = case meta.fetch('inLanguage', nil)
+ when String
+ meta.fetch('inLanguage')
+ when Object
+ meta.dig('inLanguage', 'alternateName') || meta.dig('inLanguage', 'name')
+ end
+
+ state = meta.present? || read_options.present? ? 'findable' : 'not_found'
+ geo_locations = Array.wrap(meta.fetch('spatialCoverage', nil)).map do |gl|
+ if gl.dig('geo', 'box')
+ s, w, n, e = gl.dig('geo', 'box').split(' ', 4)
  geo_location_box = {
- "westBoundLongitude" => w,
- "eastBoundLongitude" => e,
- "southBoundLatitude" => s,
- "northBoundLatitude" => n,
+ 'westBoundLongitude' => w,
+ 'eastBoundLongitude' => e,
+ 'southBoundLatitude' => s,
+ 'northBoundLatitude' => n
  }.compact.presence
  else
  geo_location_box = nil
  end
- geo_location_point = { "pointLongitude" => gl.dig("geo", "longitude"), "pointLatitude" => gl.dig("geo", "latitude") }.compact.presence
+ geo_location_point = { 'pointLongitude' => gl.dig('geo', 'longitude'),
+ 'pointLatitude' => gl.dig('geo', 'latitude') }.compact.presence

  {
- "geoLocationPlace" => gl.dig("geo", "address"),
- "geoLocationPoint" => geo_location_point,
- "geoLocationBox" => geo_location_box
+ 'geoLocationPlace' => gl.dig('geo', 'address'),
+ 'geoLocationPoint' => geo_location_point,
+ 'geoLocationBox' => geo_location_box
  }.compact
  end

  # handle keywords as array and as comma-separated string
- subjects = meta.fetch("keywords", nil)
- subjects = subjects.to_s.downcase.split(", ") if subjects.is_a?(String)
+ subjects = meta.fetch('keywords', nil)
+ subjects = subjects.to_s.downcase.split(', ') if subjects.is_a?(String)
  subjects = Array.wrap(subjects).reduce([]) do |sum, subject|
  sum += name_to_fos(subject)
  sum
  end

- { "id" => id,
- "types" => types,
- "doi" => validate_doi(id),
- "identifiers" => identifiers,
- "url" => normalize_id(meta.fetch("url", nil)),
- "content_url" => Array.wrap(meta.fetch("contentUrl", nil)),
- "sizes" => Array.wrap(meta.fetch("contenSize", nil)).presence,
- "formats" => Array.wrap(meta.fetch("encodingFormat", nil) || meta.fetch("fileFormat", nil)),
- "titles" => meta.fetch("name", nil).present? ? [{ "title" => meta.fetch("name", nil) }] : [{ "title" => meta.fetch("headline", nil) }],
- "creators" => creators,
- "contributors" => contributors,
- "publisher" => publisher,
- "agency" => parse_attributes(meta.fetch("provider", nil), content: "name", first: true),
- "container" => container,
- "related_identifiers" => related_identifiers,
- "publication_year" => publication_year,
- "dates" => dates,
- "descriptions" => meta.fetch("description", nil).present? ? [{ "description" => sanitize(meta.fetch("description")), "descriptionType" => "Abstract" }] : nil,
- "rights_list" => rights_list,
- "version_info" => meta.fetch("version", nil).to_s.presence,
- "subjects" => subjects,
- "language" => language,
- "state" => state,
- "schema_version" => meta.fetch("schemaVersion", nil).to_s.presence,
- "funding_references" => funding_references,
- "geo_locations" => geo_locations
- }.merge(read_options)
+ { 'id' => id,
+ 'types' => types,
+ 'doi' => validate_doi(id),
+ 'identifiers' => identifiers,
+ 'url' => normalize_id(meta.fetch('url', nil)),
+ 'content_url' => Array.wrap(meta.fetch('contentUrl', nil)),
+ 'sizes' => Array.wrap(meta.fetch('contenSize', nil)).presence,
+ 'formats' => Array.wrap(meta.fetch('encodingFormat',
+ nil) || meta.fetch('fileFormat', nil)),
+ 'titles' => if meta.fetch('name', nil).present?
+ [{ 'title' => meta.fetch('name', nil) }]
+ else
+ [{ 'title' => meta.fetch('headline', nil) }]
+ end,
+ 'creators' => creators,
+ 'contributors' => contributors,
+ 'publisher' => publisher,
+ 'agency' => parse_attributes(meta.fetch('provider', nil), content: 'name', first: true),
+ 'container' => container,
+ 'related_identifiers' => related_identifiers,
+ 'publication_year' => publication_year,
+ 'dates' => dates,
+ 'descriptions' => if meta.fetch('description', nil).present?
+ [{ 'description' => sanitize(meta.fetch('description')),
+ 'descriptionType' => 'Abstract' }]
+ end,
+ 'rights_list' => rights_list,
+ 'version_info' => meta.fetch('version', nil).to_s.presence,
+ 'subjects' => subjects,
+ 'language' => language,
+ 'state' => state,
+ 'schema_version' => meta.fetch('schemaVersion', nil).to_s.presence,
+ 'funding_references' => funding_references,
+ 'geo_locations' => geo_locations }.merge(read_options)
  end

  def schema_org_related_identifier(meta, relation_type: nil)
- normalize_ids(ids: meta.fetch(relation_type, nil), relation_type: SO_TO_DC_RELATION_TYPES[relation_type])
+ normalize_ids(ids: meta.fetch(relation_type, nil),
+ relation_type: SO_TO_DC_RELATION_TYPES[relation_type])
  end

  def schema_org_reverse_related_identifier(meta, relation_type: nil)
- normalize_ids(ids: meta.dig("@reverse", relation_type), relation_type: SO_TO_DC_REVERSE_RELATION_TYPES[relation_type])
+ normalize_ids(ids: meta.dig('@reverse', relation_type),
+ relation_type: SO_TO_DC_REVERSE_RELATION_TYPES[relation_type])
  end

  def schema_org_is_identical_to(meta)
- schema_org_related_identifier(meta, relation_type: "sameAs")
+ schema_org_related_identifier(meta, relation_type: 'sameAs')
  end

  def schema_org_is_part_of(meta)
- schema_org_related_identifier(meta, relation_type: "isPartOf")
+ schema_org_related_identifier(meta, relation_type: 'isPartOf')
  end

  def schema_org_has_part(meta)
- schema_org_related_identifier(meta, relation_type: "hasPart")
+ schema_org_related_identifier(meta, relation_type: 'hasPart')
  end

  def schema_org_is_previous_version_of(meta)
- schema_org_related_identifier(meta, relation_type: "PredecessorOf")
+ schema_org_related_identifier(meta, relation_type: 'PredecessorOf')
  end

  def schema_org_is_new_version_of(meta)
- schema_org_related_identifier(meta, relation_type: "SuccessorOf")
+ schema_org_related_identifier(meta, relation_type: 'SuccessorOf')
  end

  def schema_org_references(meta)
- schema_org_related_identifier(meta, relation_type: "citation")
+ schema_org_related_identifier(meta, relation_type: 'citation')
  end

  def schema_org_is_referenced_by(meta)
- schema_org_reverse_related_identifier(meta, relation_type: "citation")
+ schema_org_reverse_related_identifier(meta, relation_type: 'citation')
  end

  def schema_org_is_supplement_to(meta)
- schema_org_reverse_related_identifier(meta, relation_type: "isBasedOn")
+ schema_org_reverse_related_identifier(meta, relation_type: 'isBasedOn')
  end

  def schema_org_is_supplemented_by(meta)
- schema_org_related_identifier(meta, relation_type: "isBasedOn")
+ schema_org_related_identifier(meta, relation_type: 'isBasedOn')
  end
-
  end
  end
  end
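
The one behavioural change in this reader, beyond the Rubocop-style quoting and layout cleanups, is that DOI detection for identifiers now compares the parsed host against doi.org instead of testing a string prefix. A standalone sketch of the difference, using made-up example URLs rather than code from the gem:

require 'uri'

# Old check: treat r as a plain URL identifier unless it starts with "https://doi.org".
# New check: treat r as a plain URL identifier unless its host is exactly "doi.org".
candidates = [
  'https://doi.org/10.5438/4k3m-nyvg',   # DOI URL under both checks
  'http://doi.org/10.5438/4k3m-nyvg',    # http scheme: the prefix test missed this
  'https://doi.org.example.com/fake'     # lookalike host: the prefix test wrongly matched this
]

candidates.each do |r|
  old_is_url = !r.start_with?('https://doi.org')
  new_is_url = URI(r).host != 'doi.org'
  puts format('%-40s old:%-6s new:%s', r, old_is_url, new_is_url)
end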
data/lib/briard/string.rb CHANGED
@@ -2,6 +2,6 @@

  class String
  def my_titleize
- self.gsub(/\b(['’]?[a-z])/) { "#{$1.capitalize}" }
+ gsub(/\b(['’]?[a-z])/) { ::Regexp.last_match(1).capitalize.to_s }
  end
  end
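
This rewrite is behaviour-preserving: ::Regexp.last_match(1) is simply the explicit form of the $1 global. A quick standalone check (the input string is only an example):

old = 'metadata for the digital age'.gsub(/\b(['’]?[a-z])/) { $1.capitalize.to_s }
new = 'metadata for the digital age'.gsub(/\b(['’]?[a-z])/) { ::Regexp.last_match(1).capitalize.to_s }
puts old          # => "Metadata For The Digital Age"
puts old == new   # => true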
data/lib/briard/utils.rb CHANGED
@@ -500,7 +500,7 @@ module Briard
  def find_from_format_by_id(id)
  id = normalize_id(id)

- if /\A(?:(http|https):\/(\/)?(dx\.)?(doi.org|handle.stage.datacite.org)\/)?(doi:)?(10\.\d{4,5}\/.+)\z/.match(id)
+ if /\A(?:(http|https):\/(\/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org)\/)?(doi:)?(10\.\d{4,5}\/.+)\z/.match(id)
  ra = get_doi_ra(id)
  %w(DataCite Crossref mEDRA KISTI JaLC OP).include?(ra) ? ra.downcase : nil
  elsif /\A(?:(http|https):\/(\/)?orcid\.org\/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X]+)\z/.match(id)
@@ -537,7 +537,7 @@ module Briard
  "datacite"
  elsif options[:ext] == ".cff"
  "cff"
- elsif options[:ext] == ".json" && Maremma.from_json(string).to_h.dig("@context").to_s.start_with?("http://schema.org", "https://schema.org")
+ elsif options[:ext] == ".json" && URI(Maremma.from_json(string).to_h.fetch("@context", "")).host == "schema.org"
  "schema_org"
  elsif options[:ext] == ".json" && Maremma.from_json(string).to_h.dig("@context") == ("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld")
  "codemeta"
@@ -555,7 +555,7 @@ module Briard
  "crossref"
  elsif Nokogiri::XML(string, nil, 'UTF-8', &:noblanks).collect_namespaces.find { |k, v| v.start_with?("http://datacite.org/schema/kernel") }
  "datacite"
- elsif Maremma.from_json(string).to_h.dig("@context").to_s.start_with?("http://schema.org", "https://schema.org")
+ elsif URI(Maremma.from_json(string).to_h.fetch("@context", "")).host == "schema.org"
  "schema_org"
  elsif Maremma.from_json(string).to_h.dig("@context") == ("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld")
  "codemeta"
@@ -940,7 +940,7 @@ module Briard
  end

  # alternatively find the nameIdentifier in the sameAs attribute
- c["@id"] = c["sameAs"].first if Array(c["sameAs"]).find { |item| item.start_with?("https://orcid.org") }
+ c["@id"] = c["sameAs"].first if Array(c["sameAs"]).find { |item| URI(item).host == "orcid.org" }

  c["nameIdentifier"] = [{ "__content__" => c["@id"], "nameIdentifierScheme" => "ORCID", "schemeUri" => "https://orcid.org" }] if normalize_orcid(c["@id"])
  c["@type"] = c["@type"].find { |t| %w(Person Organization).include?(t) } if c["@type"].is_a?(Array)
data/lib/briard/version.rb CHANGED
@@ -1,3 +1,5 @@
+ # frozen_string_literal: true
+
  module Briard
- VERSION = "2.4.2"
+ VERSION = '2.6.1'
  end
data/lib/briard/whitelist_scrubber.rb CHANGED
@@ -4,7 +4,7 @@

  module Briard
  class WhitelistScrubber < Loofah::Scrubber
- def initialize(options={})
+ def initialize(options = {})
  @direction = :bottom_up
  @tags = options[:tags]
  @attributes = options[:attributes]
@@ -12,6 +12,7 @@ module Briard

  def scrub(node)
  scrub_node_attributes(node) and return CONTINUE if node_allowed?(node)
+
  node.before node.children
  node.remove
  end
@@ -19,14 +20,17 @@ module Briard
  private

  def scrub_node_attributes(node)
- fallback_scrub_node_attributes(node) and return true unless @attributes.present? && @attributes.respond_to?(:include?)
+ unless @attributes.present? && @attributes.respond_to?(:include?)
+ fallback_scrub_node_attributes(node) and return true
+ end
+
  node.attribute_nodes.each do |attr_node|
  attr_node.remove unless @attributes.include?(attr_node.name)
  end
  end

  def allowed_not_element_node_types
- [ Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE ]
+ [Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE]
  end

  def fallback_scrub_node_attributes(node)
@@ -38,9 +42,12 @@ module Briard
  end

  def node_allowed?(node)
- return fallback_allowed_element_detection(node) unless @tags.present? && @tags.respond_to?(:include?)
+ unless @tags.present? && @tags.respond_to?(:include?)
+ return fallback_allowed_element_detection(node)
+ end
  return true if allowed_not_element_node_types.include?(node.type)
  return false unless node.type == Nokogiri::XML::Node::ELEMENT_NODE
+
  @tags.include? node.name
  end
  end
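
For context, a hypothetical usage sketch of how this scrubber plugs into Loofah (the tag and attribute lists are invented, not taken from the gem's docs): allowed elements keep only whitelisted attributes, and everything else is unwrapped rather than dropped, per the scrub(node) body above.

require 'briard'
require 'loofah'

# Assumed tag/attribute whitelist, chosen only for illustration.
scrubber = Briard::WhitelistScrubber.new(tags: %w[b i em strong], attributes: %w[href])
html = '<p>Kept <b onclick="x()">bold</b> text</p>'
puts Loofah.fragment(html).scrub!(scrubber).to_s
# The <p> wrapper is unwrapped and onclick is stripped; <b> and its text survive.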
data/lib/briard/writers/bibtex_writer.rb CHANGED
@@ -6,24 +6,30 @@ module Briard
  def bibtex
  return nil unless valid?

- pages = container.to_h["firstPage"].present? ? [container["firstPage"], container["lastPage"]].compact.join("-") : nil
+ pages = if container.to_h['firstPage'].present?
+ [container['firstPage'], container['lastPage']].compact.join('-')
+ end

  bib = {
- bibtex_type: types["bibtex"].presence || "misc",
+ bibtex_type: types['bibtex'].presence || 'misc',
  bibtex_key: normalize_doi(doi),
  doi: doi,
  url: url,
  author: authors_as_string(creators),
- keywords: subjects.present? ? Array.wrap(subjects).map { |k| parse_attributes(k, content: "subject", first: true) }.join(", ") : nil,
+ keywords: if subjects.present?
+ Array.wrap(subjects).map do |k|
+ parse_attributes(k, content: 'subject', first: true)
+ end.join(', ')
+ end,
  language: language,
- title: parse_attributes(titles, content: "title", first: true),
- journal: container && container["title"],
- volume: container.to_h["volume"],
- issue: container.to_h["issue"],
+ title: parse_attributes(titles, content: 'title', first: true),
+ journal: container && container['title'],
+ volume: container.to_h['volume'],
+ issue: container.to_h['issue'],
  pages: pages,
  publisher: publisher,
  year: publication_year,
- copyright: Array.wrap(rights_list).map { |l| l["rights"] }.first,
+ copyright: Array.wrap(rights_list).map { |l| l['rights'] }.first
  }.compact
  BibTeX::Entry.new(bib).to_s
  end
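
End to end, this writer is reached through the Metadata class. A hypothetical sketch, assuming briard keeps the bolognese-style API (Metadata.new(input:, from:) plus a #bibtex method), which this diff does not itself show; the DOI is only an example and resolving it requires network access.

require 'briard'

# Assumed constructor signature and example DOI; adjust to the gem's actual API.
metadata = Briard::Metadata.new(input: '10.7554/elife.01567', from: 'crossref')
puts metadata.bibtex if metadata.valid?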