briard 2.4.2 → 2.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/codeql-analysis.yml +72 -0
- data/.github/workflows/rubocop.yml +50 -0
- data/.rubocop.yml +144 -620
- data/.rubocop_todo.yml +76 -0
- data/CHANGELOG.md +18 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +40 -6
- data/Rakefile +1 -1
- data/{bolognese.gemspec → briard.gemspec} +46 -39
- data/lib/briard/array.rb +2 -2
- data/lib/briard/author_utils.rb +79 -71
- data/lib/briard/cli.rb +12 -13
- data/lib/briard/crossref_utils.rb +73 -61
- data/lib/briard/datacite_utils.rb +132 -106
- data/lib/briard/doi_utils.rb +10 -10
- data/lib/briard/metadata.rb +96 -106
- data/lib/briard/metadata_utils.rb +87 -78
- data/lib/briard/readers/bibtex_reader.rb +65 -65
- data/lib/briard/readers/cff_reader.rb +88 -70
- data/lib/briard/readers/citeproc_reader.rb +90 -84
- data/lib/briard/readers/codemeta_reader.rb +68 -50
- data/lib/briard/readers/crosscite_reader.rb +2 -2
- data/lib/briard/readers/crossref_reader.rb +249 -210
- data/lib/briard/readers/datacite_json_reader.rb +3 -3
- data/lib/briard/readers/datacite_reader.rb +225 -189
- data/lib/briard/readers/npm_reader.rb +49 -42
- data/lib/briard/readers/ris_reader.rb +82 -80
- data/lib/briard/readers/schema_org_reader.rb +182 -159
- data/lib/briard/string.rb +1 -1
- data/lib/briard/utils.rb +4 -4
- data/lib/briard/version.rb +3 -1
- data/lib/briard/whitelist_scrubber.rb +11 -4
- data/lib/briard/writers/bibtex_writer.rb +14 -8
- data/lib/briard/writers/cff_writer.rb +33 -26
- data/lib/briard/writers/codemeta_writer.rb +19 -15
- data/lib/briard/writers/csv_writer.rb +6 -4
- data/lib/briard/writers/datacite_json_writer.rb +8 -2
- data/lib/briard/writers/jats_writer.rb +33 -28
- data/lib/briard/writers/rdf_xml_writer.rb +1 -1
- data/lib/briard/writers/ris_writer.rb +30 -18
- data/lib/briard/writers/turtle_writer.rb +1 -1
- data/lib/briard.rb +6 -6
- data/rubocop.sarif +0 -0
- data/spec/array_spec.rb +5 -5
- data/spec/author_utils_spec.rb +151 -132
- data/spec/datacite_utils_spec.rb +135 -83
- data/spec/doi_utils_spec.rb +168 -164
- data/spec/find_from_format_spec.rb +69 -69
- data/spec/fixtures/vcr_cassettes/Briard_Metadata/sanitize/onlies_keep_specific_tags.yml +65 -0
- data/spec/fixtures/vcr_cassettes/Briard_Metadata/sanitize/removes_a_tags.yml +65 -0
- data/spec/metadata_spec.rb +91 -90
- data/spec/readers/bibtex_reader_spec.rb +43 -38
- data/spec/readers/cff_reader_spec.rb +165 -153
- data/spec/readers/citeproc_reader_spec.rb +45 -40
- data/spec/readers/codemeta_reader_spec.rb +128 -115
- data/spec/readers/crosscite_reader_spec.rb +34 -24
- data/spec/readers/crossref_reader_spec.rb +1098 -939
- data/spec/readers/datacite_json_reader_spec.rb +53 -40
- data/spec/readers/datacite_reader_spec.rb +1541 -1337
- data/spec/readers/npm_reader_spec.rb +48 -43
- data/spec/readers/ris_reader_spec.rb +53 -47
- data/spec/readers/schema_org_reader_spec.rb +329 -267
- data/spec/spec_helper.rb +6 -5
- data/spec/utils_spec.rb +371 -347
- data/spec/writers/bibtex_writer_spec.rb +143 -143
- data/spec/writers/cff_writer_spec.rb +96 -90
- data/spec/writers/citation_writer_spec.rb +34 -33
- data/spec/writers/citeproc_writer_spec.rb +226 -224
- data/spec/writers/codemeta_writer_spec.rb +18 -16
- data/spec/writers/crosscite_writer_spec.rb +91 -73
- data/spec/writers/crossref_writer_spec.rb +99 -91
- data/spec/writers/csv_writer_spec.rb +70 -70
- data/spec/writers/datacite_json_writer_spec.rb +78 -68
- data/spec/writers/datacite_writer_spec.rb +417 -322
- data/spec/writers/jats_writer_spec.rb +177 -161
- data/spec/writers/rdf_xml_writer_spec.rb +68 -63
- data/spec/writers/ris_writer_spec.rb +162 -162
- data/spec/writers/turtle_writer_spec.rb +47 -47
- metadata +242 -166
- data/.github/workflows/release.yml +0 -47
@@ -4,281 +4,304 @@ module Briard
|
|
4
4
|
module Readers
|
5
5
|
module SchemaOrgReader
|
6
6
|
SO_TO_DC_RELATION_TYPES = {
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
}
|
7
|
+
'citation' => 'References',
|
8
|
+
'isBasedOn' => 'IsSupplementedBy',
|
9
|
+
'sameAs' => 'IsIdenticalTo',
|
10
|
+
'isPartOf' => 'IsPartOf',
|
11
|
+
'hasPart' => 'HasPart',
|
12
|
+
'isPredecessor' => 'IsPreviousVersionOf',
|
13
|
+
'isSuccessor' => 'IsNewVersionOf'
|
14
|
+
}.freeze
|
15
15
|
|
16
16
|
SO_TO_DC_REVERSE_RELATION_TYPES = {
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
}
|
25
|
-
|
26
|
-
def get_schema_org(id: nil, **
|
27
|
-
return {
|
17
|
+
'citation' => 'IsReferencedBy',
|
18
|
+
'isBasedOn' => 'IsSupplementTo',
|
19
|
+
'sameAs' => 'IsIdenticalTo',
|
20
|
+
'isPartOf' => 'HasPart',
|
21
|
+
'hasPart' => 'IsPartOf',
|
22
|
+
'isPredecessor' => 'IsNewVersionOf',
|
23
|
+
'isSuccessor' => 'IsPreviousVersionOf'
|
24
|
+
}.freeze
|
25
|
+
|
26
|
+
def get_schema_org(id: nil, **_options)
|
27
|
+
return { 'string' => nil, 'state' => 'not_found' } unless id.present?
|
28
28
|
|
29
29
|
url = normalize_id(id)
|
30
30
|
response = Maremma.get(url, raw: true)
|
31
31
|
|
32
32
|
# some responses are returned as a hash
|
33
|
-
if response.body[
|
34
|
-
string = response.body.dig(
|
33
|
+
if response.body['data'].is_a?(Hash)
|
34
|
+
string = response.body.dig('data', 'html', 'head', 'script', 1, '__content__')
|
35
35
|
else
|
36
|
-
doc = Nokogiri::XML(response.body.fetch(
|
37
|
-
|
36
|
+
doc = Nokogiri::XML(response.body.fetch('data', nil), nil, 'UTF-8')
|
37
|
+
|
38
38
|
# workaround for xhtml documents
|
39
39
|
nodeset = doc.at("script[type='application/ld+json']")
|
40
|
-
hsh = JSON.parse(nodeset ||
|
41
|
-
|
40
|
+
hsh = JSON.parse(nodeset || '{}')
|
41
|
+
|
42
42
|
# workaround for doi as canonical_url but not included with schema.org
|
43
43
|
link = doc.css("link[rel='canonical']")
|
44
|
-
hsh
|
44
|
+
hsh['@id'] = link[0]['href'] if link.present?
|
45
45
|
|
46
46
|
# workaround if license included but not with schema.org
|
47
47
|
license = doc.at("meta[name='DCTERMS.license']")
|
48
|
-
hsh
|
49
|
-
|
48
|
+
hsh['license'] = license['content'] if license.present?
|
49
|
+
|
50
50
|
# workaround for html language attribute if no language is set via schema.org
|
51
51
|
lang = doc.at('html')['lang']
|
52
|
-
hsh
|
52
|
+
hsh['inLanguage'] = lang if hsh['inLanguage'].blank?
|
53
53
|
|
54
54
|
# workaround if issn not included with schema.org
|
55
55
|
name = doc.at("meta[property='og:site_name']")
|
56
56
|
issn = doc.at("meta[name='citation_issn']")
|
57
|
-
hsh
|
57
|
+
hsh['isPartOf'] = { 'name' => name ? name['content'] : nil,
|
58
|
+
'issn' => issn ? issn['content'] : nil }.compact
|
58
59
|
|
59
60
|
string = hsh.to_json if hsh.present?
|
60
61
|
end
|
61
62
|
|
62
|
-
{
|
63
|
+
{ 'string' => string }
|
63
64
|
end
|
64
65
|
|
65
66
|
def read_schema_org(string: nil, **options)
|
66
67
|
if string.present?
|
67
68
|
errors = jsonlint(string)
|
68
|
-
return {
|
69
|
+
return { 'errors' => errors } if errors.present?
|
69
70
|
end
|
70
71
|
|
71
|
-
read_options = ActiveSupport::HashWithIndifferentAccess.new(options.except(:doi, :id, :url,
|
72
|
+
read_options = ActiveSupport::HashWithIndifferentAccess.new(options.except(:doi, :id, :url,
|
73
|
+
:sandbox, :validate, :ra))
|
72
74
|
|
73
75
|
meta = string.present? ? Maremma.from_json(string) : {}
|
74
76
|
|
75
|
-
identifiers = Array.wrap(meta.fetch(
|
77
|
+
identifiers = Array.wrap(meta.fetch('identifier', nil)).map do |r|
|
76
78
|
r = normalize_id(r) if r.is_a?(String)
|
77
|
-
if r.is_a?(String) &&
|
78
|
-
|
79
|
+
if r.is_a?(String) && URI(r).host != 'doi.org'
|
80
|
+
{ 'identifierType' => 'URL', 'identifier' => r }
|
79
81
|
elsif r.is_a?(Hash)
|
80
|
-
{
|
82
|
+
{ 'identifierType' => get_identifier_type(r['propertyID']), 'identifier' => r['value'] }
|
81
83
|
end
|
82
84
|
end.compact.uniq
|
83
85
|
|
84
86
|
id = options[:doi]
|
85
|
-
id = meta.fetch(
|
86
|
-
id = meta.fetch(
|
87
|
+
id = meta.fetch('@id', nil) if id.blank? && URI(meta.fetch('@id', '')).host == 'doi.org'
|
88
|
+
id = meta.fetch('identifier', nil) if id.blank?
|
87
89
|
id = normalize_id(id)
|
88
90
|
|
89
|
-
schema_org = meta.fetch(
|
91
|
+
schema_org = meta.fetch('@type', nil) && meta.fetch('@type').camelcase
|
90
92
|
resource_type_general = Briard::Utils::SO_TO_DC_TRANSLATIONS[schema_org]
|
91
93
|
types = {
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
94
|
+
'resourceTypeGeneral' => resource_type_general,
|
95
|
+
'resourceType' => meta.fetch('additionalType', nil),
|
96
|
+
'schemaOrg' => schema_org,
|
97
|
+
'citeproc' => Briard::Utils::SO_TO_CP_TRANSLATIONS[schema_org] || 'article-journal',
|
98
|
+
'bibtex' => Briard::Utils::SO_TO_BIB_TRANSLATIONS[schema_org] || 'misc',
|
99
|
+
'ris' => Briard::Utils::SO_TO_RIS_TRANSLATIONS[resource_type_general.to_s.dasherize] || 'GEN'
|
98
100
|
}.compact
|
99
|
-
authors = meta.fetch(
|
101
|
+
authors = meta.fetch('author', nil) || meta.fetch('creator', nil)
|
100
102
|
# Authors should be an object, if it's just a plain string don't try and parse it.
|
101
|
-
|
103
|
+
unless authors.is_a?(String)
|
102
104
|
creators = get_authors(from_schema_org_creators(Array.wrap(authors)))
|
103
105
|
end
|
104
|
-
contributors = get_authors(from_schema_org_contributors(Array.wrap(meta.fetch(
|
105
|
-
|
106
|
+
contributors = get_authors(from_schema_org_contributors(Array.wrap(meta.fetch('editor',
|
107
|
+
nil))))
|
108
|
+
publisher = parse_attributes(meta.fetch('publisher', nil), content: 'name', first: true)
|
106
109
|
|
107
|
-
ct =
|
110
|
+
ct = schema_org == 'Dataset' ? 'includedInDataCatalog' : 'Periodical'
|
108
111
|
container = if meta.fetch(ct, nil).present?
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
112
|
+
url = parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: 'url',
|
113
|
+
first: true)
|
114
|
+
|
115
|
+
{
|
116
|
+
'type' => schema_org == 'Dataset' ? 'DataRepository' : 'Periodical',
|
117
|
+
'title' => parse_attributes(from_schema_org(meta.fetch(ct, nil)), content: 'name',
|
118
|
+
first: true),
|
119
|
+
'identifier' => url,
|
120
|
+
'identifierType' => url.present? ? 'URL' : nil,
|
121
|
+
'volume' => meta.fetch('volumeNumber', nil),
|
122
|
+
'issue' => meta.fetch('issueNumber', nil),
|
123
|
+
'firstPage' => meta.fetch('pageStart', nil),
|
124
|
+
'lastPage' => meta.fetch('pageEnd', nil)
|
125
|
+
}.compact
|
126
|
+
elsif %w[BlogPosting Article].include?(schema_org)
|
127
|
+
issn = meta.dig('isPartOf', 'issn')
|
128
|
+
|
129
|
+
{
|
130
|
+
'type' => 'Blog',
|
131
|
+
'title' => meta.dig('isPartOf', 'name'),
|
132
|
+
'identifier' => issn,
|
133
|
+
'identifierType' => issn.present? ? 'ISSN' : nil
|
134
|
+
}.compact
|
135
|
+
else
|
136
|
+
{}
|
137
|
+
end
|
133
138
|
|
134
139
|
related_identifiers = Array.wrap(schema_org_is_identical_to(meta)) +
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
rights_list = Array.wrap(meta.fetch(
|
140
|
+
Array.wrap(schema_org_is_part_of(meta)) +
|
141
|
+
Array.wrap(schema_org_has_part(meta)) +
|
142
|
+
Array.wrap(schema_org_is_previous_version_of(meta)) +
|
143
|
+
Array.wrap(schema_org_is_new_version_of(meta)) +
|
144
|
+
Array.wrap(schema_org_references(meta)) +
|
145
|
+
Array.wrap(schema_org_is_referenced_by(meta)) +
|
146
|
+
Array.wrap(schema_org_is_supplement_to(meta)) +
|
147
|
+
Array.wrap(schema_org_is_supplemented_by(meta))
|
148
|
+
|
149
|
+
rights_list = Array.wrap(meta.fetch('license', nil)).compact.map do |rl|
|
145
150
|
if rl.is_a?(String)
|
146
|
-
hsh_to_spdx(
|
151
|
+
hsh_to_spdx('rightsURI' => rl)
|
147
152
|
else
|
148
|
-
hsh_to_spdx(
|
153
|
+
hsh_to_spdx('__content__' => rl['name'], 'rightsURI' => rl['id'])
|
149
154
|
end
|
150
155
|
end
|
151
156
|
|
152
|
-
funding_references = Array.wrap(meta.fetch(
|
153
|
-
if fr[
|
157
|
+
funding_references = Array.wrap(meta.fetch('funder', nil)).compact.map do |fr|
|
158
|
+
if fr['@id'].present?
|
154
159
|
{
|
155
|
-
|
156
|
-
|
157
|
-
|
160
|
+
'funderName' => fr['name'],
|
161
|
+
'funderIdentifier' => fr['@id'],
|
162
|
+
'funderIdentifierType' => fr['@id'].to_s.start_with?('https://doi.org/10.13039') ? 'Crossref Funder ID' : 'Other'
|
163
|
+
}.compact
|
158
164
|
else
|
159
|
-
{
|
160
|
-
"funderName" => fr["name"] }.compact
|
165
|
+
{ 'funderName' => fr['name'] }.compact
|
161
166
|
end
|
162
167
|
end
|
163
168
|
|
164
169
|
# strip milliseconds from iso8601, as edtf library doesn't handle them
|
165
170
|
dates = []
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
publication_year = meta.fetch("datePublished")[0..3] if meta.fetch("datePublished", nil).present?
|
170
|
-
|
171
|
-
if meta.fetch("inLanguage", nil).is_a?(String)
|
172
|
-
language = meta.fetch("inLanguage")
|
173
|
-
elsif meta.fetch("inLanguage", nil).is_a?(Object)
|
174
|
-
language = meta.dig("inLanguage", 'alternateName') || meta.dig("inLanguage", 'name')
|
175
|
-
else
|
176
|
-
language = nil
|
171
|
+
if Date.edtf(strip_milliseconds(meta.fetch('datePublished', nil))).present?
|
172
|
+
dates << { 'date' => strip_milliseconds(meta.fetch('datePublished')),
|
173
|
+
'dateType' => 'Issued' }
|
177
174
|
end
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
175
|
+
if Date.edtf(strip_milliseconds(meta.fetch('dateCreated', nil))).present?
|
176
|
+
dates << { 'date' => strip_milliseconds(meta.fetch('dateCreated')),
|
177
|
+
'dateType' => 'Created' }
|
178
|
+
end
|
179
|
+
if Date.edtf(strip_milliseconds(meta.fetch('dateModified', nil))).present?
|
180
|
+
dates << { 'date' => strip_milliseconds(meta.fetch('dateModified')),
|
181
|
+
'dateType' => 'Updated' }
|
182
|
+
end
|
183
|
+
publication_year = meta.fetch('datePublished')[0..3] if meta.fetch('datePublished',
|
184
|
+
nil).present?
|
185
|
+
|
186
|
+
language = case meta.fetch('inLanguage', nil)
|
187
|
+
when String
|
188
|
+
meta.fetch('inLanguage')
|
189
|
+
when Object
|
190
|
+
meta.dig('inLanguage', 'alternateName') || meta.dig('inLanguage', 'name')
|
191
|
+
end
|
192
|
+
|
193
|
+
state = meta.present? || read_options.present? ? 'findable' : 'not_found'
|
194
|
+
geo_locations = Array.wrap(meta.fetch('spatialCoverage', nil)).map do |gl|
|
195
|
+
if gl.dig('geo', 'box')
|
196
|
+
s, w, n, e = gl.dig('geo', 'box').split(' ', 4)
|
183
197
|
geo_location_box = {
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
198
|
+
'westBoundLongitude' => w,
|
199
|
+
'eastBoundLongitude' => e,
|
200
|
+
'southBoundLatitude' => s,
|
201
|
+
'northBoundLatitude' => n
|
188
202
|
}.compact.presence
|
189
203
|
else
|
190
204
|
geo_location_box = nil
|
191
205
|
end
|
192
|
-
geo_location_point = {
|
206
|
+
geo_location_point = { 'pointLongitude' => gl.dig('geo', 'longitude'),
|
207
|
+
'pointLatitude' => gl.dig('geo', 'latitude') }.compact.presence
|
193
208
|
|
194
209
|
{
|
195
|
-
|
196
|
-
|
197
|
-
|
210
|
+
'geoLocationPlace' => gl.dig('geo', 'address'),
|
211
|
+
'geoLocationPoint' => geo_location_point,
|
212
|
+
'geoLocationBox' => geo_location_box
|
198
213
|
}.compact
|
199
214
|
end
|
200
215
|
|
201
216
|
# handle keywords as array and as comma-separated string
|
202
|
-
subjects = meta.fetch(
|
203
|
-
subjects = subjects.to_s.downcase.split(
|
217
|
+
subjects = meta.fetch('keywords', nil)
|
218
|
+
subjects = subjects.to_s.downcase.split(', ') if subjects.is_a?(String)
|
204
219
|
subjects = Array.wrap(subjects).reduce([]) do |sum, subject|
|
205
220
|
sum += name_to_fos(subject)
|
206
221
|
sum
|
207
222
|
end
|
208
223
|
|
209
|
-
{
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
224
|
+
{ 'id' => id,
|
225
|
+
'types' => types,
|
226
|
+
'doi' => validate_doi(id),
|
227
|
+
'identifiers' => identifiers,
|
228
|
+
'url' => normalize_id(meta.fetch('url', nil)),
|
229
|
+
'content_url' => Array.wrap(meta.fetch('contentUrl', nil)),
|
230
|
+
'sizes' => Array.wrap(meta.fetch('contenSize', nil)).presence,
|
231
|
+
'formats' => Array.wrap(meta.fetch('encodingFormat',
|
232
|
+
nil) || meta.fetch('fileFormat', nil)),
|
233
|
+
'titles' => if meta.fetch('name', nil).present?
|
234
|
+
[{ 'title' => meta.fetch('name', nil) }]
|
235
|
+
else
|
236
|
+
[{ 'title' => meta.fetch('headline', nil) }]
|
237
|
+
end,
|
238
|
+
'creators' => creators,
|
239
|
+
'contributors' => contributors,
|
240
|
+
'publisher' => publisher,
|
241
|
+
'agency' => parse_attributes(meta.fetch('provider', nil), content: 'name', first: true),
|
242
|
+
'container' => container,
|
243
|
+
'related_identifiers' => related_identifiers,
|
244
|
+
'publication_year' => publication_year,
|
245
|
+
'dates' => dates,
|
246
|
+
'descriptions' => if meta.fetch('description', nil).present?
|
247
|
+
[{ 'description' => sanitize(meta.fetch('description')),
|
248
|
+
'descriptionType' => 'Abstract' }]
|
249
|
+
end,
|
250
|
+
'rights_list' => rights_list,
|
251
|
+
'version_info' => meta.fetch('version', nil).to_s.presence,
|
252
|
+
'subjects' => subjects,
|
253
|
+
'language' => language,
|
254
|
+
'state' => state,
|
255
|
+
'schema_version' => meta.fetch('schemaVersion', nil).to_s.presence,
|
256
|
+
'funding_references' => funding_references,
|
257
|
+
'geo_locations' => geo_locations }.merge(read_options)
|
236
258
|
end
|
237
259
|
|
238
260
|
def schema_org_related_identifier(meta, relation_type: nil)
|
239
|
-
normalize_ids(ids: meta.fetch(relation_type, nil),
|
261
|
+
normalize_ids(ids: meta.fetch(relation_type, nil),
|
262
|
+
relation_type: SO_TO_DC_RELATION_TYPES[relation_type])
|
240
263
|
end
|
241
264
|
|
242
265
|
def schema_org_reverse_related_identifier(meta, relation_type: nil)
|
243
|
-
normalize_ids(ids: meta.dig(
|
266
|
+
normalize_ids(ids: meta.dig('@reverse', relation_type),
|
267
|
+
relation_type: SO_TO_DC_REVERSE_RELATION_TYPES[relation_type])
|
244
268
|
end
|
245
269
|
|
246
270
|
def schema_org_is_identical_to(meta)
|
247
|
-
schema_org_related_identifier(meta, relation_type:
|
271
|
+
schema_org_related_identifier(meta, relation_type: 'sameAs')
|
248
272
|
end
|
249
273
|
|
250
274
|
def schema_org_is_part_of(meta)
|
251
|
-
schema_org_related_identifier(meta, relation_type:
|
275
|
+
schema_org_related_identifier(meta, relation_type: 'isPartOf')
|
252
276
|
end
|
253
277
|
|
254
278
|
def schema_org_has_part(meta)
|
255
|
-
schema_org_related_identifier(meta, relation_type:
|
279
|
+
schema_org_related_identifier(meta, relation_type: 'hasPart')
|
256
280
|
end
|
257
281
|
|
258
282
|
def schema_org_is_previous_version_of(meta)
|
259
|
-
schema_org_related_identifier(meta, relation_type:
|
283
|
+
schema_org_related_identifier(meta, relation_type: 'PredecessorOf')
|
260
284
|
end
|
261
285
|
|
262
286
|
def schema_org_is_new_version_of(meta)
|
263
|
-
schema_org_related_identifier(meta, relation_type:
|
287
|
+
schema_org_related_identifier(meta, relation_type: 'SuccessorOf')
|
264
288
|
end
|
265
289
|
|
266
290
|
def schema_org_references(meta)
|
267
|
-
schema_org_related_identifier(meta, relation_type:
|
291
|
+
schema_org_related_identifier(meta, relation_type: 'citation')
|
268
292
|
end
|
269
293
|
|
270
294
|
def schema_org_is_referenced_by(meta)
|
271
|
-
schema_org_reverse_related_identifier(meta, relation_type:
|
295
|
+
schema_org_reverse_related_identifier(meta, relation_type: 'citation')
|
272
296
|
end
|
273
297
|
|
274
298
|
def schema_org_is_supplement_to(meta)
|
275
|
-
schema_org_reverse_related_identifier(meta, relation_type:
|
299
|
+
schema_org_reverse_related_identifier(meta, relation_type: 'isBasedOn')
|
276
300
|
end
|
277
301
|
|
278
302
|
def schema_org_is_supplemented_by(meta)
|
279
|
-
schema_org_related_identifier(meta, relation_type:
|
303
|
+
schema_org_related_identifier(meta, relation_type: 'isBasedOn')
|
280
304
|
end
|
281
|
-
|
282
305
|
end
|
283
306
|
end
|
284
307
|
end
|
data/lib/briard/string.rb
CHANGED
data/lib/briard/utils.rb
CHANGED
@@ -500,7 +500,7 @@ module Briard
|
|
500
500
|
def find_from_format_by_id(id)
|
501
501
|
id = normalize_id(id)
|
502
502
|
|
503
|
-
if /\A(?:(http|https):\/(\/)?(dx\.)?(doi
|
503
|
+
if /\A(?:(http|https):\/(\/)?(dx\.)?(doi\.org|handle\.stage\.datacite\.org)\/)?(doi:)?(10\.\d{4,5}\/.+)\z/.match(id)
|
504
504
|
ra = get_doi_ra(id)
|
505
505
|
%w(DataCite Crossref mEDRA KISTI JaLC OP).include?(ra) ? ra.downcase : nil
|
506
506
|
elsif /\A(?:(http|https):\/(\/)?orcid\.org\/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X]+)\z/.match(id)
|
@@ -537,7 +537,7 @@ module Briard
|
|
537
537
|
"datacite"
|
538
538
|
elsif options[:ext] == ".cff"
|
539
539
|
"cff"
|
540
|
-
elsif options[:ext] == ".json" && Maremma.from_json(string).to_h.
|
540
|
+
elsif options[:ext] == ".json" && URI(Maremma.from_json(string).to_h.fetch("@context", "")).host == "schema.org"
|
541
541
|
"schema_org"
|
542
542
|
elsif options[:ext] == ".json" && Maremma.from_json(string).to_h.dig("@context") == ("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld")
|
543
543
|
"codemeta"
|
@@ -555,7 +555,7 @@ module Briard
|
|
555
555
|
"crossref"
|
556
556
|
elsif Nokogiri::XML(string, nil, 'UTF-8', &:noblanks).collect_namespaces.find { |k, v| v.start_with?("http://datacite.org/schema/kernel") }
|
557
557
|
"datacite"
|
558
|
-
elsif Maremma.from_json(string).to_h.
|
558
|
+
elsif URI(Maremma.from_json(string).to_h.fetch("@context", "")).host == "schema.org"
|
559
559
|
"schema_org"
|
560
560
|
elsif Maremma.from_json(string).to_h.dig("@context") == ("https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld")
|
561
561
|
"codemeta"
|
@@ -940,7 +940,7 @@ module Briard
|
|
940
940
|
end
|
941
941
|
|
942
942
|
# alternatively find the nameIdentifier in the sameAs attribute
|
943
|
-
c["@id"] = c["sameAs"].first if Array(c["sameAs"]).find { |item| item.
|
943
|
+
c["@id"] = c["sameAs"].first if Array(c["sameAs"]).find { |item| URI(item).host == "orcid.org" }
|
944
944
|
|
945
945
|
c["nameIdentifier"] = [{ "__content__" => c["@id"], "nameIdentifierScheme" => "ORCID", "schemeUri" => "https://orcid.org" }] if normalize_orcid(c["@id"])
|
946
946
|
c["@type"] = c["@type"].find { |t| %w(Person Organization).include?(t) } if c["@type"].is_a?(Array)
|
data/lib/briard/version.rb
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
|
5
5
|
module Briard
|
6
6
|
class WhitelistScrubber < Loofah::Scrubber
|
7
|
-
def initialize(options={})
|
7
|
+
def initialize(options = {})
|
8
8
|
@direction = :bottom_up
|
9
9
|
@tags = options[:tags]
|
10
10
|
@attributes = options[:attributes]
|
@@ -12,6 +12,7 @@ module Briard
|
|
12
12
|
|
13
13
|
def scrub(node)
|
14
14
|
scrub_node_attributes(node) and return CONTINUE if node_allowed?(node)
|
15
|
+
|
15
16
|
node.before node.children
|
16
17
|
node.remove
|
17
18
|
end
|
@@ -19,14 +20,17 @@ module Briard
|
|
19
20
|
private
|
20
21
|
|
21
22
|
def scrub_node_attributes(node)
|
22
|
-
|
23
|
+
unless @attributes.present? && @attributes.respond_to?(:include?)
|
24
|
+
fallback_scrub_node_attributes(node) and return true
|
25
|
+
end
|
26
|
+
|
23
27
|
node.attribute_nodes.each do |attr_node|
|
24
28
|
attr_node.remove unless @attributes.include?(attr_node.name)
|
25
29
|
end
|
26
30
|
end
|
27
31
|
|
28
32
|
def allowed_not_element_node_types
|
29
|
-
[
|
33
|
+
[Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE]
|
30
34
|
end
|
31
35
|
|
32
36
|
def fallback_scrub_node_attributes(node)
|
@@ -38,9 +42,12 @@ module Briard
|
|
38
42
|
end
|
39
43
|
|
40
44
|
def node_allowed?(node)
|
41
|
-
|
45
|
+
unless @tags.present? && @tags.respond_to?(:include?)
|
46
|
+
return fallback_allowed_element_detection(node)
|
47
|
+
end
|
42
48
|
return true if allowed_not_element_node_types.include?(node.type)
|
43
49
|
return false unless node.type == Nokogiri::XML::Node::ELEMENT_NODE
|
50
|
+
|
44
51
|
@tags.include? node.name
|
45
52
|
end
|
46
53
|
end
|
@@ -6,24 +6,30 @@ module Briard
|
|
6
6
|
def bibtex
|
7
7
|
return nil unless valid?
|
8
8
|
|
9
|
-
pages = container.to_h[
|
9
|
+
pages = if container.to_h['firstPage'].present?
|
10
|
+
[container['firstPage'], container['lastPage']].compact.join('-')
|
11
|
+
end
|
10
12
|
|
11
13
|
bib = {
|
12
|
-
bibtex_type: types[
|
14
|
+
bibtex_type: types['bibtex'].presence || 'misc',
|
13
15
|
bibtex_key: normalize_doi(doi),
|
14
16
|
doi: doi,
|
15
17
|
url: url,
|
16
18
|
author: authors_as_string(creators),
|
17
|
-
keywords: subjects.present?
|
19
|
+
keywords: if subjects.present?
|
20
|
+
Array.wrap(subjects).map do |k|
|
21
|
+
parse_attributes(k, content: 'subject', first: true)
|
22
|
+
end.join(', ')
|
23
|
+
end,
|
18
24
|
language: language,
|
19
|
-
title: parse_attributes(titles, content:
|
20
|
-
journal: container && container[
|
21
|
-
volume: container.to_h[
|
22
|
-
issue: container.to_h[
|
25
|
+
title: parse_attributes(titles, content: 'title', first: true),
|
26
|
+
journal: container && container['title'],
|
27
|
+
volume: container.to_h['volume'],
|
28
|
+
issue: container.to_h['issue'],
|
23
29
|
pages: pages,
|
24
30
|
publisher: publisher,
|
25
31
|
year: publication_year,
|
26
|
-
copyright: Array.wrap(rights_list).map { |l| l[
|
32
|
+
copyright: Array.wrap(rights_list).map { |l| l['rights'] }.first
|
27
33
|
}.compact
|
28
34
|
BibTeX::Entry.new(bib).to_s
|
29
35
|
end
|