briard 2.4.1 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/codeql-analysis.yml +72 -0
- data/.github/workflows/rubocop.yml +50 -0
- data/.rubocop.yml +144 -620
- data/.rubocop_todo.yml +76 -0
- data/CHANGELOG.md +22 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +43 -6
- data/Rakefile +1 -1
- data/{bolognese.gemspec → briard.gemspec} +46 -38
- data/lib/briard/array.rb +2 -2
- data/lib/briard/author_utils.rb +79 -71
- data/lib/briard/cli.rb +12 -13
- data/lib/briard/crossref_utils.rb +73 -61
- data/lib/briard/datacite_utils.rb +132 -106
- data/lib/briard/doi_utils.rb +10 -10
- data/lib/briard/metadata.rb +96 -106
- data/lib/briard/metadata_utils.rb +87 -78
- data/lib/briard/readers/bibtex_reader.rb +65 -65
- data/lib/briard/readers/cff_reader.rb +88 -70
- data/lib/briard/readers/citeproc_reader.rb +90 -84
- data/lib/briard/readers/codemeta_reader.rb +68 -50
- data/lib/briard/readers/crosscite_reader.rb +2 -2
- data/lib/briard/readers/crossref_reader.rb +249 -210
- data/lib/briard/readers/datacite_json_reader.rb +3 -3
- data/lib/briard/readers/datacite_reader.rb +225 -189
- data/lib/briard/readers/npm_reader.rb +49 -42
- data/lib/briard/readers/ris_reader.rb +82 -80
- data/lib/briard/readers/schema_org_reader.rb +182 -159
- data/lib/briard/string.rb +1 -1
- data/lib/briard/utils.rb +4 -4
- data/lib/briard/version.rb +3 -1
- data/lib/briard/whitelist_scrubber.rb +11 -4
- data/lib/briard/writers/bibtex_writer.rb +14 -8
- data/lib/briard/writers/cff_writer.rb +33 -26
- data/lib/briard/writers/codemeta_writer.rb +19 -15
- data/lib/briard/writers/csv_writer.rb +6 -4
- data/lib/briard/writers/datacite_json_writer.rb +8 -2
- data/lib/briard/writers/jats_writer.rb +33 -28
- data/lib/briard/writers/rdf_xml_writer.rb +1 -1
- data/lib/briard/writers/ris_writer.rb +30 -18
- data/lib/briard/writers/turtle_writer.rb +1 -1
- data/lib/briard.rb +6 -6
- data/rubocop.sarif +0 -0
- data/spec/array_spec.rb +5 -5
- data/spec/author_utils_spec.rb +151 -132
- data/spec/datacite_utils_spec.rb +135 -83
- data/spec/doi_utils_spec.rb +168 -164
- data/spec/find_from_format_spec.rb +69 -69
- data/spec/fixtures/vcr_cassettes/Briard_Metadata/sanitize/onlies_keep_specific_tags.yml +65 -0
- data/spec/fixtures/vcr_cassettes/Briard_Metadata/sanitize/removes_a_tags.yml +65 -0
- data/spec/metadata_spec.rb +91 -90
- data/spec/readers/bibtex_reader_spec.rb +43 -38
- data/spec/readers/cff_reader_spec.rb +165 -153
- data/spec/readers/citeproc_reader_spec.rb +45 -40
- data/spec/readers/codemeta_reader_spec.rb +128 -115
- data/spec/readers/crosscite_reader_spec.rb +34 -24
- data/spec/readers/crossref_reader_spec.rb +1098 -939
- data/spec/readers/datacite_json_reader_spec.rb +53 -40
- data/spec/readers/datacite_reader_spec.rb +1541 -1337
- data/spec/readers/npm_reader_spec.rb +48 -43
- data/spec/readers/ris_reader_spec.rb +53 -47
- data/spec/readers/schema_org_reader_spec.rb +329 -267
- data/spec/spec_helper.rb +6 -5
- data/spec/utils_spec.rb +371 -347
- data/spec/writers/bibtex_writer_spec.rb +143 -143
- data/spec/writers/cff_writer_spec.rb +96 -90
- data/spec/writers/citation_writer_spec.rb +34 -33
- data/spec/writers/citeproc_writer_spec.rb +226 -224
- data/spec/writers/codemeta_writer_spec.rb +18 -16
- data/spec/writers/crosscite_writer_spec.rb +91 -73
- data/spec/writers/crossref_writer_spec.rb +99 -91
- data/spec/writers/csv_writer_spec.rb +70 -70
- data/spec/writers/datacite_json_writer_spec.rb +78 -68
- data/spec/writers/datacite_writer_spec.rb +417 -322
- data/spec/writers/jats_writer_spec.rb +177 -161
- data/spec/writers/rdf_xml_writer_spec.rb +68 -63
- data/spec/writers/ris_writer_spec.rb +162 -162
- data/spec/writers/turtle_writer_spec.rb +47 -47
- metadata +250 -160
- data/.github/workflows/release.yml +0 -47
@@ -4,281 +4,304 @@ module Briard
|
|
4
4
|
module Readers
|
5
5
|
module SchemaOrgReader
|
6
6
|
SO_TO_DC_RELATION_TYPES = {
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
}
|
7
|
+
'citation' => 'References',
|
8
|
+
'isBasedOn' => 'IsSupplementedBy',
|
9
|
+
'sameAs' => 'IsIdenticalTo',
|
10
|
+
'isPartOf' => 'IsPartOf',
|
11
|
+
'hasPart' => 'HasPart',
|
12
|
+
'isPredecessor' => 'IsPreviousVersionOf',
|
13
|
+
'isSuccessor' => 'IsNewVersionOf'
|
14
|
+
}.freeze
|
15
15
|
|
16
16
|
SO_TO_DC_REVERSE_RELATION_TYPES = {
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
}
|
25
|
-
|
26
|
-
def get_schema_org(id: nil, **
|
27
|
-
return {
|
17
|
+
'citation' => 'IsReferencedBy',
|
18
|
+
'isBasedOn' => 'IsSupplementTo',
|
19
|
+
'sameAs' => 'IsIdenticalTo',
|
20
|
+
'isPartOf' => 'HasPart',
|
21
|
+
'hasPart' => 'IsPartOf',
|
22
|
+
'isPredecessor' => 'IsNewVersionOf',
|
23
|
+
'isSuccessor' => 'IsPreviousVersionOf'
|
24
|
+
}.freeze
|
25
|
+
|
26
|
+
def get_schema_org(id: nil, **_options)
|
27
|
+
return { 'string' => nil, 'state' => 'not_found' } unless id.present?
|
28
28
|
|
29
29
|
url = normalize_id(id)
|
30
30
|
response = Maremma.get(url, raw: true)
|
31
31
|
|
32
32
|
# some responses are returned as a hash
|
33
|
-
if response.body[
|
34
|
-
string = response.body.dig(
|
33
|
+
if response.body['data'].is_a?(Hash)
|
34
|
+
string = response.body.dig('data', 'html', 'head', 'script', 1, '__content__')
|
35
35
|
else
|
36
|
-
doc = Nokogiri::XML(response.body.fetch(
|
37
|
-
|
36
|
+
doc = Nokogiri::XML(response.body.fetch('data', nil), nil, 'UTF-8')
|
37
|
+
|
38
38
|
# workaround for xhtml documents
|
39
39
|
nodeset = doc.at("script[type='application/ld+json']")
|
40
|
-
hsh = JSON.parse(nodeset ||
|
41
|
-
|
40
|
+
hsh = JSON.parse(nodeset || '{}')
|
41
|
+
|
42
42
|
# workaround for doi as canonical_url but not included with schema.org
|
43
43
|
link = doc.css("link[rel='canonical']")
|
44
|
-
hsh
|
44
|
+
hsh['@id'] = link[0]['href'] if link.present?
|
45
45
|
|
46
46
|
# workaround if license included but not with schema.org
|
47
47
|
license = doc.at("meta[name='DCTERMS.license']")
|
48
|
-
hsh
|
49
|
-
|
48
|
+
hsh['license'] = license['content'] if license.present?
|
49
|
+
|
50
50
|
# workaround for html language attribute if no language is set via schema.org
|
51
51
|
lang = doc.at('html')['lang']
|
52
|
-
hsh
|
52
|
+
hsh['inLanguage'] = lang if hsh['inLanguage'].blank?
|
53
53
|
|
54
54
|
# workaround if issn not included with schema.org
|
55
55
|
name = doc.at("meta[property='og:site_name']")
|
56
56
|
issn = doc.at("meta[name='citation_issn']")
|
57
|
-
hsh
|
57
|
+
hsh['isPartOf'] = { 'name' => name ? name['content'] : nil,
|
58
|
+
'issn' => issn ? issn['content'] : nil }.compact
|
58
59
|
|
59
60
|
string = hsh.to_json if hsh.present?
|
60
61
|
end
|
61
62
|
|
62
|
-
{
|
63
|
+
{ 'string' => string }
|
63
64
|
end
|
64
65
|
|
65
66
|
def read_schema_org(string: nil, **options)
|
66
67
|
if string.present?
|
67
68
|
errors = jsonlint(string)
|
68
|
-
return {
|
69
|
+
return { 'errors' => errors } if errors.present?
|
69
70
|
end
|
70
71
|
|
71
|
-
read_options = ActiveSupport::HashWithIndifferentAccess.new(options.except(:doi, :id, :url,
|
72
|
+
read_options = ActiveSupport::HashWithIndifferentAccess.new(options.except(:doi, :id, :url,
|
73
|
+
:sandbox, :validate, :ra))
|
72
74
|
|
73
75
|
meta = string.present? ? Maremma.from_json(string) : {}
|
74
76
|
|
75
|
-
identifiers = Array.wrap(meta.fetch(
|
77
|
+
identifiers = Array.wrap(meta.fetch('identifier', nil)).map do |r|
|
76
78
|
r = normalize_id(r) if r.is_a?(String)
|
77
|
-
if r.is_a?(String) &&
|
78
|
-
|
79
|
+
if r.is_a?(String) && URI(r).host != 'doi.org'
|
80
|
+
{ 'identifierType' => 'URL', 'identifier' => r }
|
79
81
|
elsif r.is_a?(Hash)
|
80
|
-
{
|
82
|
+
{ 'identifierType' => get_identifier_type(r['propertyID']), 'identifier' => r['value'] }
|
81
83
|
end
|
82
84
|
end.compact.uniq
|
83
85
|
|
84
86
|
id = options[:doi]
|
85
|
-
id = meta.fetch(
|
86
|
-
id = meta.fetch(
|
87
|
+
id = meta.fetch('@id', nil) if id.blank? && URI(meta.fetch('@id', '')).host == 'doi.org'
|
88
|
+
id = meta.fetch('identifier', nil) if id.blank?
|
87
89
|
id = normalize_id(id)
|
88
90
|
|
89
|
-
schema_org = meta.fetch(
|
91
|
+
schema_org = meta.fetch('@type', nil) && meta.fetch('@type').camelcase
|
90
92
|
resource_type_general = Briard::Utils::SO_TO_DC_TRANSLATIONS[schema_org]
|
91
93
|
types = {
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
94
|
+
'resourceTypeGeneral' => resource_type_general,
|
95
|
+
'resourceType' => meta.fetch('additionalType', nil),
|
96
|
+
'schemaOrg' => schema_org,
|
97
|
+
'citeproc' => Briard::Utils::SO_TO_CP_TRANSLATIONS[schema_org] || 'article-journal',
|
98
|
+
'bibtex' => Briard::Utils::SO_TO_BIB_TRANSLATIONS[schema_org] || 'misc',
|
99
|
+
'ris' => Briard::Utils::SO_TO_RIS_TRANSLATIONS[resource_type_general.to_s.dasherize] || 'GEN'
|
98
100
|
}.compact
|
99
|
-
authors = meta.fetch(
|
101
|
+
authors = meta.fetch('author', nil) || meta.fetch('creator', nil)
|
100
102
|
# Authors should be an object, if it's just a plain string don't try and parse it.
|
101
|
-
|
103
|
+
unless authors.is_a?(String)
|
102
104
|
creators = get_authors(from_schema_org_creators(Array.wrap(authors)))
|
103
105
|
end
|
104
|
-
contributors = get_authors(from_schema_org_contributors(Array.wrap(meta.fetch(
|
105
|
-
|
106
|
+
contributors = get_authors(from_schema_org_contributors(Array.wrap(meta.fetch('editor',
|
107
|
+
nil))))
|
108
|
+
publisher = parse_attributes(meta.fetch('publisher', nil), content: 'name', first: true)
|
106
109
|
|
107
|
-
ct =
|
110
|
+
ct = schema_org == 'Dataset' ? 'includedInDataCatalog' : 'Periodical'
|
108
111
|
container = if meta.fetch(ct, nil).present?
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
112
|
+
url = parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: 'url',
|
113
|
+
first: true)
|
114
|
+
|
115
|
+
{
|
116
|
+
'type' => schema_org == 'Dataset' ? 'DataRepository' : 'Periodical',
|
117
|
+
'title' => parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: 'name',
|
118
|
+
first: true),
|
119
|
+
'identifier' => url,
|
120
|
+
'identifierType' => url.present? ? 'URL' : nil,
|
121
|
+
'volume' => meta.fetch('volumeNumber', nil),
|
122
|
+
'issue' => meta.fetch('issueNumber', nil),
|
123
|
+
'firstPage' => meta.fetch('pageStart', nil),
|
124
|
+
'lastPage' => meta.fetch('pageEnd', nil)
|
125
|
+
}.compact
|
126
|
+
elsif %w[BlogPosting Article].include?(schema_org)
|
127
|
+
issn = meta.dig('isPartOf', 'issn')
|
128
|
+
|
129
|
+
{
|
130
|
+
'type' => 'Blog',
|
131
|
+
'title' => meta.dig('isPartOf', 'name'),
|
132
|
+
'identifier' => issn,
|
133
|
+
'identifierType' => issn.present? ? 'ISSN' : nil
|
134
|
+
}.compact
|
135
|
+
else
|
136
|
+
{}
|
137
|
+
end
|
133
138
|
|
134
139
|
related_identifiers = Array.wrap(schema_org_is_identical_to(meta)) +
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
rights_list = Array.wrap(meta.fetch(
|
140
|
+
Array.wrap(schema_org_is_part_of(meta)) +
|
141
|
+
Array.wrap(schema_org_has_part(meta)) +
|
142
|
+
Array.wrap(schema_org_is_previous_version_of(meta)) +
|
143
|
+
Array.wrap(schema_org_is_new_version_of(meta)) +
|
144
|
+
Array.wrap(schema_org_references(meta)) +
|
145
|
+
Array.wrap(schema_org_is_referenced_by(meta)) +
|
146
|
+
Array.wrap(schema_org_is_supplement_to(meta)) +
|
147
|
+
Array.wrap(schema_org_is_supplemented_by(meta))
|
148
|
+
|
149
|
+
rights_list = Array.wrap(meta.fetch('license', nil)).compact.map do |rl|
|
145
150
|
if rl.is_a?(String)
|
146
|
-
hsh_to_spdx(
|
151
|
+
hsh_to_spdx('rightsURI' => rl)
|
147
152
|
else
|
148
|
-
hsh_to_spdx(
|
153
|
+
hsh_to_spdx('__content__' => rl['name'], 'rightsURI' => rl['id'])
|
149
154
|
end
|
150
155
|
end
|
151
156
|
|
152
|
-
funding_references = Array.wrap(meta.fetch(
|
153
|
-
if fr[
|
157
|
+
funding_references = Array.wrap(meta.fetch('funder', nil)).compact.map do |fr|
|
158
|
+
if fr['@id'].present?
|
154
159
|
{
|
155
|
-
|
156
|
-
|
157
|
-
|
160
|
+
'funderName' => fr['name'],
|
161
|
+
'funderIdentifier' => fr['@id'],
|
162
|
+
'funderIdentifierType' => fr['@id'].to_s.start_with?('https://doi.org/10.13039') ? 'Crossref Funder ID' : 'Other'
|
163
|
+
}.compact
|
158
164
|
else
|
159
|
-
{
|
160
|
-
"funderName" => fr["name"] }.compact
|
165
|
+
{ 'funderName' => fr['name'] }.compact
|
161
166
|
end
|
162
167
|
end
|
163
168
|
|
164
169
|
# strip milliseconds from iso8601, as edtf library doesn't handle them
|
165
170
|
dates = []
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
publication_year = meta.fetch("datePublished")[0..3] if meta.fetch("datePublished", nil).present?
|
170
|
-
|
171
|
-
if meta.fetch("inLanguage", nil).is_a?(String)
|
172
|
-
language = meta.fetch("inLanguage")
|
173
|
-
elsif meta.fetch("inLanguage", nil).is_a?(Object)
|
174
|
-
language = meta.dig("inLanguage", 'alternateName') || meta.dig("inLanguage", 'name')
|
175
|
-
else
|
176
|
-
language = nil
|
171
|
+
if Date.edtf(strip_milliseconds(meta.fetch('datePublished', nil))).present?
|
172
|
+
dates << { 'date' => strip_milliseconds(meta.fetch('datePublished')),
|
173
|
+
'dateType' => 'Issued' }
|
177
174
|
end
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
175
|
+
if Date.edtf(strip_milliseconds(meta.fetch('dateCreated', nil))).present?
|
176
|
+
dates << { 'date' => strip_milliseconds(meta.fetch('dateCreated')),
|
177
|
+
'dateType' => 'Created' }
|
178
|
+
end
|
179
|
+
if Date.edtf(strip_milliseconds(meta.fetch('dateModified', nil))).present?
|
180
|
+
dates << { 'date' => strip_milliseconds(meta.fetch('dateModified')),
|
181
|
+
'dateType' => 'Updated' }
|
182
|
+
end
|
183
|
+
publication_year = meta.fetch('datePublished')[0..3] if meta.fetch('datePublished',
|
184
|
+
nil).present?
|
185
|
+
|
186
|
+
language = case meta.fetch('inLanguage', nil)
|
187
|
+
when String
|
188
|
+
meta.fetch('inLanguage')
|
189
|
+
when Object
|
190
|
+
meta.dig('inLanguage', 'alternateName') || meta.dig('inLanguage', 'name')
|
191
|
+
end
|
192
|
+
|
193
|
+
state = meta.present? || read_options.present? ? 'findable' : 'not_found'
|
194
|
+
geo_locations = Array.wrap(meta.fetch('spatialCoverage', nil)).map do |gl|
|
195
|
+
if gl.dig('geo', 'box')
|
196
|
+
s, w, n, e = gl.dig('geo', 'box').split(' ', 4)
|
183
197
|
geo_location_box = {
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
198
|
+
'westBoundLongitude' => w,
|
199
|
+
'eastBoundLongitude' => e,
|
200
|
+
'southBoundLatitude' => s,
|
201
|
+
'northBoundLatitude' => n
|
188
202
|
}.compact.presence
|
189
203
|
else
|
190
204
|
geo_location_box = nil
|
191
205
|
end
|
192
|
-
geo_location_point = {
|
206
|
+
geo_location_point = { 'pointLongitude' => gl.dig('geo', 'longitude'),
|
207
|
+
'pointLatitude' => gl.dig('geo', 'latitude') }.compact.presence
|
193
208
|
|
194
209
|
{
|
195
|
-
|
196
|
-
|
197
|
-
|
210
|
+
'geoLocationPlace' => gl.dig('geo', 'address'),
|
211
|
+
'geoLocationPoint' => geo_location_point,
|
212
|
+
'geoLocationBox' => geo_location_box
|
198
213
|
}.compact
|
199
214
|
end
|
200
215
|
|
201
216
|
# handle keywords as array and as comma-separated string
|
202
|
-
subjects = meta.fetch(
|
203
|
-
subjects = subjects.to_s.downcase.split(
|
217
|
+
subjects = meta.fetch('keywords', nil)
|
218
|
+
subjects = subjects.to_s.downcase.split(', ') if subjects.is_a?(String)
|
204
219
|
subjects = Array.wrap(subjects).reduce([]) do |sum, subject|
|
205
220
|
sum += name_to_fos(subject)
|
206
221
|
sum
|
207
222
|
end
|
208
223
|
|
209
|
-
{
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
224
|
+
{ 'id' => id,
|
225
|
+
'types' => types,
|
226
|
+
'doi' => validate_doi(id),
|
227
|
+
'identifiers' => identifiers,
|
228
|
+
'url' => normalize_id(meta.fetch('url', nil)),
|
229
|
+
'content_url' => Array.wrap(meta.fetch('contentUrl', nil)),
|
230
|
+
'sizes' => Array.wrap(meta.fetch('contenSize', nil)).presence,
|
231
|
+
'formats' => Array.wrap(meta.fetch('encodingFormat',
|
232
|
+
nil) || meta.fetch('fileFormat', nil)),
|
233
|
+
'titles' => if meta.fetch('name', nil).present?
|
234
|
+
[{ 'title' => meta.fetch('name', nil) }]
|
235
|
+
else
|
236
|
+
[{ 'title' => meta.fetch('headline', nil) }]
|
237
|
+
end,
|
238
|
+
'creators' => creators,
|
239
|
+
'contributors' => contributors,
|
240
|
+
'publisher' => publisher,
|
241
|
+
'agency' => parse_attributes(meta.fetch('provider', nil), content: 'name', first: true),
|
242
|
+
'container' => container,
|
243
|
+
'related_identifiers' => related_identifiers,
|
244
|
+
'publication_year' => publication_year,
|
245
|
+
'dates' => dates,
|
246
|
+
'descriptions' => if meta.fetch('description', nil).present?
|
247
|
+
[{ 'description' => sanitize(meta.fetch('description')),
|
248
|
+
'descriptionType' => 'Abstract' }]
|
249
|
+
end,
|
250
|
+
'rights_list' => rights_list,
|
251
|
+
'version_info' => meta.fetch('version', nil).to_s.presence,
|
252
|
+
'subjects' => subjects,
|
253
|
+
'language' => language,
|
254
|
+
'state' => state,
|
255
|
+
'schema_version' => meta.fetch('schemaVersion', nil).to_s.presence,
|
256
|
+
'funding_references' => funding_references,
|
257
|
+
'geo_locations' => geo_locations }.merge(read_options)
|
236
258
|
end
|
237
259
|
|
238
260
|
def schema_org_related_identifier(meta, relation_type: nil)
|
239
|
-
normalize_ids(ids: meta.fetch(relation_type, nil),
|
261
|
+
normalize_ids(ids: meta.fetch(relation_type, nil),
|
262
|
+
relation_type: SO_TO_DC_RELATION_TYPES[relation_type])
|
240
263
|
end
|
241
264
|
|
242
265
|
def schema_org_reverse_related_identifier(meta, relation_type: nil)
|
243
|
-
normalize_ids(ids: meta.dig(
|
266
|
+
normalize_ids(ids: meta.dig('@reverse', relation_type),
|
267
|
+
relation_type: SO_TO_DC_REVERSE_RELATION_TYPES[relation_type])
|
244
268
|
end
|
245
269
|
|
246
270
|
def schema_org_is_identical_to(meta)
|
247
|
-
schema_org_related_identifier(meta, relation_type:
|
271
|
+
schema_org_related_identifier(meta, relation_type: 'sameAs')
|
248
272
|
end
|
249
273
|
|
250
274
|
def schema_org_is_part_of(meta)
|
251
|
-
schema_org_related_identifier(meta, relation_type:
|
275
|
+
schema_org_related_identifier(meta, relation_type: 'isPartOf')
|
252
276
|
end
|
253
277
|
|
254
278
|
def schema_org_has_part(meta)
|
255
|
-
schema_org_related_identifier(meta, relation_type:
|
279
|
+
schema_org_related_identifier(meta, relation_type: 'hasPart')
|
256
280
|
end
|
257
281
|
|
258
282
|
def schema_org_is_previous_version_of(meta)
|
259
|
-
schema_org_related_identifier(meta, relation_type:
|
283
|
+
schema_org_related_identifier(meta, relation_type: 'PredecessorOf')
|
260
284
|
end
|
261
285
|
|
262
286
|
def schema_org_is_new_version_of(meta)
|
263
|
-
schema_org_related_identifier(meta, relation_type:
|
287
|
+
schema_org_related_identifier(meta, relation_type: 'SuccessorOf')
|
264
288
|
end
|
265
289
|
|
266
290
|
def schema_org_references(meta)
|
267
|
-
schema_org_related_identifier(meta, relation_type:
|
291
|
+
schema_org_related_identifier(meta, relation_type: 'citation')
|
268
292
|
end
|
269
293
|
|
270
294
|
def schema_org_is_referenced_by(meta)
|
271
|
-
schema_org_reverse_related_identifier(meta, relation_type:
|
295
|
+
schema_org_reverse_related_identifier(meta, relation_type: 'citation')
|
272
296
|
end
|
273
297
|
|
274
298
|
def schema_org_is_supplement_to(meta)
|
275
|
-
schema_org_reverse_related_identifier(meta, relation_type:
|
299
|
+
schema_org_reverse_related_identifier(meta, relation_type: 'isBasedOn')
|
276
300
|
end
|
277
301
|
|
278
302
|
def schema_org_is_supplemented_by(meta)
|
279
|
-
schema_org_related_identifier(meta, relation_type:
|
303
|
+
schema_org_related_identifier(meta, relation_type: 'isBasedOn')
|
280
304
|
end
|
281
|
-
|
282
305
|
end
|
283
306
|
end
|
284
307
|
end
|
data/lib/briard/string.rb
CHANGED
data/lib/briard/utils.rb
CHANGED
@@ -500,7 +500,7 @@ module Briard
|
|
500
500
|
def find_from_format_by_id(id)
|
501
501
|
id = normalize_id(id)
|
502
502
|
|
503
|
-
if /\A(?:(http|https):\/(\/)?(dx\.)?(doi
|
503
|
+
if /\A(?:(http|https):\/(\/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org)\/)?(doi:)?(10\.\d{4,5}\/.+)\z/.match(id)
|
504
504
|
ra = get_doi_ra(id)
|
505
505
|
%w(DataCite Crossref mEDRA KISTI JaLC OP).include?(ra) ? ra.downcase : nil
|
506
506
|
elsif /\A(?:(http|https):\/(\/)?orcid\.org\/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X]+)\z/.match(id)
|
@@ -537,7 +537,7 @@ module Briard
|
|
537
537
|
"datacite"
|
538
538
|
elsif options[:ext] == ".cff"
|
539
539
|
"cff"
|
540
|
-
elsif options[:ext] == ".json" && Maremma.from_json(string).to_h.
|
540
|
+
elsif options[:ext] == ".json" && URI(Maremma.from_json(string).to_h.fetch("@context", "")).host == "schema.org"
|
541
541
|
"schema_org"
|
542
542
|
elsif options[:ext] == ".json" && Maremma.from_json(string).to_h.dig("@context") == ("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld")
|
543
543
|
"codemeta"
|
@@ -555,7 +555,7 @@ module Briard
|
|
555
555
|
"crossref"
|
556
556
|
elsif Nokogiri::XML(string, nil, 'UTF-8', &:noblanks).collect_namespaces.find { |k, v| v.start_with?("http://datacite.org/schema/kernel") }
|
557
557
|
"datacite"
|
558
|
-
elsif Maremma.from_json(string).to_h.
|
558
|
+
elsif URI(Maremma.from_json(string).to_h.fetch("@context", "")).host == "schema.org"
|
559
559
|
"schema_org"
|
560
560
|
elsif Maremma.from_json(string).to_h.dig("@context") == ("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld")
|
561
561
|
"codemeta"
|
@@ -940,7 +940,7 @@ module Briard
|
|
940
940
|
end
|
941
941
|
|
942
942
|
# alternatively find the nameIdentifier in the sameAs attribute
|
943
|
-
c["@id"] = c["sameAs"].first if Array(c["sameAs"]).find { |item| item.
|
943
|
+
c["@id"] = c["sameAs"].first if Array(c["sameAs"]).find { |item| URI(item).host == "orcid.org" }
|
944
944
|
|
945
945
|
c["nameIdentifier"] = [{ "__content__" => c["@id"], "nameIdentifierScheme" => "ORCID", "schemeUri" => "https://orcid.org" }] if normalize_orcid(c["@id"])
|
946
946
|
c["@type"] = c["@type"].find { |t| %w(Person Organization).include?(t) } if c["@type"].is_a?(Array)
|
data/lib/briard/version.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
module Briard
|
6
6
|
class WhitelistScrubber < Loofah::Scrubber
|
7
|
-
def initialize(options={})
|
7
|
+
def initialize(options = {})
|
8
8
|
@direction = :bottom_up
|
9
9
|
@tags = options[:tags]
|
10
10
|
@attributes = options[:attributes]
|
@@ -12,6 +12,7 @@ module Briard
|
|
12
12
|
|
13
13
|
def scrub(node)
|
14
14
|
scrub_node_attributes(node) and return CONTINUE if node_allowed?(node)
|
15
|
+
|
15
16
|
node.before node.children
|
16
17
|
node.remove
|
17
18
|
end
|
@@ -19,14 +20,17 @@ module Briard
|
|
19
20
|
private
|
20
21
|
|
21
22
|
def scrub_node_attributes(node)
|
22
|
-
|
23
|
+
unless @attributes.present? && @attributes.respond_to?(:include?)
|
24
|
+
fallback_scrub_node_attributes(node) and return true
|
25
|
+
end
|
26
|
+
|
23
27
|
node.attribute_nodes.each do |attr_node|
|
24
28
|
attr_node.remove unless @attributes.include?(attr_node.name)
|
25
29
|
end
|
26
30
|
end
|
27
31
|
|
28
32
|
def allowed_not_element_node_types
|
29
|
-
[
|
33
|
+
[Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE]
|
30
34
|
end
|
31
35
|
|
32
36
|
def fallback_scrub_node_attributes(node)
|
@@ -38,9 +42,12 @@ module Briard
|
|
38
42
|
end
|
39
43
|
|
40
44
|
def node_allowed?(node)
|
41
|
-
|
45
|
+
unless @tags.present? && @tags.respond_to?(:include?)
|
46
|
+
return fallback_allowed_element_detection(node)
|
47
|
+
end
|
42
48
|
return true if allowed_not_element_node_types.include?(node.type)
|
43
49
|
return false unless node.type == Nokogiri::XML::Node::ELEMENT_NODE
|
50
|
+
|
44
51
|
@tags.include? node.name
|
45
52
|
end
|
46
53
|
end
|
@@ -6,24 +6,30 @@ module Briard
|
|
6
6
|
def bibtex
|
7
7
|
return nil unless valid?
|
8
8
|
|
9
|
-
pages = container.to_h[
|
9
|
+
pages = if container.to_h['firstPage'].present?
|
10
|
+
[container['firstPage'], container['lastPage']].compact.join('-')
|
11
|
+
end
|
10
12
|
|
11
13
|
bib = {
|
12
|
-
bibtex_type: types[
|
14
|
+
bibtex_type: types['bibtex'].presence || 'misc',
|
13
15
|
bibtex_key: normalize_doi(doi),
|
14
16
|
doi: doi,
|
15
17
|
url: url,
|
16
18
|
author: authors_as_string(creators),
|
17
|
-
keywords: subjects.present?
|
19
|
+
keywords: if subjects.present?
|
20
|
+
Array.wrap(subjects).map do |k|
|
21
|
+
parse_attributes(k, content: 'subject', first: true)
|
22
|
+
end.join(', ')
|
23
|
+
end,
|
18
24
|
language: language,
|
19
|
-
title: parse_attributes(titles, content:
|
20
|
-
journal: container && container[
|
21
|
-
volume: container.to_h[
|
22
|
-
issue: container.to_h[
|
25
|
+
title: parse_attributes(titles, content: 'title', first: true),
|
26
|
+
journal: container && container['title'],
|
27
|
+
volume: container.to_h['volume'],
|
28
|
+
issue: container.to_h['issue'],
|
23
29
|
pages: pages,
|
24
30
|
publisher: publisher,
|
25
31
|
year: publication_year,
|
26
|
-
copyright: Array.wrap(rights_list).map { |l| l[
|
32
|
+
copyright: Array.wrap(rights_list).map { |l| l['rights'] }.first
|
27
33
|
}.compact
|
28
34
|
BibTeX::Entry.new(bib).to_s
|
29
35
|
end
|