bolognese 0.7.2 → 0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -1
- data/README.md +25 -16
- data/bolognese.gemspec +2 -1
- data/codemeta.json +39 -0
- data/lib/bolognese.rb +4 -0
- data/lib/bolognese/array.rb +11 -0
- data/lib/bolognese/author_utils.rb +35 -21
- data/lib/bolognese/bibtex.rb +4 -4
- data/lib/bolognese/codemeta.rb +8 -13
- data/lib/bolognese/crossref.rb +22 -20
- data/lib/bolognese/datacite.rb +61 -61
- data/lib/bolognese/datacite_json.rb +208 -0
- data/lib/bolognese/datacite_utils.rb +17 -48
- data/lib/bolognese/metadata.rb +83 -22
- data/lib/bolognese/schema_org.rb +42 -16
- data/lib/bolognese/utils.rb +79 -13
- data/lib/bolognese/version.rb +1 -1
- data/lib/bolognese/whitelist_scrubber.rb +45 -0
- data/spec/array_spec.rb +20 -0
- data/spec/author_utils_spec.rb +93 -9
- data/spec/bibtex_spec.rb +4 -4
- data/spec/cli_spec.rb +5 -0
- data/spec/codemeta_spec.rb +41 -31
- data/spec/crossref_spec.rb +47 -72
- data/spec/datacite_json_spec.rb +65 -0
- data/spec/datacite_spec.rb +67 -83
- data/spec/datacite_utils_spec.rb +9 -14
- data/spec/fixtures/datacite.json +49 -0
- data/spec/fixtures/datacite_software.json +18 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_CLI/convert_from_id/datacite/to_datacite_json.yml +214 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/author_from_schema_org/with_id.yml +930 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/author_to_schema_org/with_id.yml +930 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/author.yml +137 -860
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/no_author.yml +137 -860
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/single_author.yml +137 -860
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/with_organization.yml +137 -860
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/from_schema_org/with_id.yml +930 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_name_identifier/has_ORCID.yml +155 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_name_identifier/has_no_ORCID.yml +134 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_familyName.yml +155 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_name_in_display-order.yml +186 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_name_in_display-order_with_ORCID.yml +177 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_name_in_sort-order.yml +173 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/is_organization.yml +207 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_comma.yml +207 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_family_name.yml +207 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_id.yml +207 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_no_info.yml +207 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_type_organization.yml +207 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_type_person.yml +207 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/sanitize/should_only_keep_specific_tags.yml +930 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/sanitize/should_remove_a_tags.yml +930 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/to_schema_org/with_id.yml +930 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Datacite/insert_related_identifiers/related_identifier.yml +173 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_DataciteJson/get_metadata_as_bibtex/BlogPosting.yml +155 -0
- data/spec/schema_org_spec.rb +17 -14
- data/spec/utils_spec.rb +32 -2
- metadata +54 -4
data/lib/bolognese/metadata.rb
CHANGED
@@ -12,12 +12,36 @@ module Bolognese
|
|
12
12
|
include Bolognese::DataciteUtils
|
13
13
|
include Bolognese::Utils
|
14
14
|
|
15
|
+
DC_TO_SO_TRANSLATIONS = {
|
16
|
+
"Audiovisual" => "VideoObject",
|
17
|
+
"Collection" => "Collection",
|
18
|
+
"Dataset" => "Dataset",
|
19
|
+
"Event" => "Event",
|
20
|
+
"Image" => "ImageObject",
|
21
|
+
"InteractiveResource" => nil,
|
22
|
+
"Model" => nil,
|
23
|
+
"PhysicalObject" => nil,
|
24
|
+
"Service" => "Service",
|
25
|
+
"Software" => "SoftwareSourceCode",
|
26
|
+
"Sound" => "AudioObject",
|
27
|
+
"Text" => "ScholarlyArticle",
|
28
|
+
"Workflow" => nil,
|
29
|
+
"Other" => "CreativeWork"
|
30
|
+
}
|
31
|
+
|
15
32
|
attr_reader :id, :raw, :provider, :schema_version, :license, :citation,
|
16
33
|
:additional_type, :alternate_name, :url, :version, :keywords, :editor,
|
17
34
|
:page_start, :page_end, :date_modified, :language, :spatial_coverage,
|
18
35
|
:content_size, :funder, :journal, :bibtex_type, :date_created, :has_part,
|
19
|
-
:publisher, :contributor, :same_as, :
|
20
|
-
:
|
36
|
+
:publisher, :contributor, :same_as, :is_previous_version_of, :is_new_version_of,
|
37
|
+
:should_passthru, :datacite_errors, :date_accepted, :date_available,
|
38
|
+
:date_copyrighted, :date_collected, :date_submitted, :date_valid,
|
39
|
+
:is_cited_by, :cites, :is_supplement_to, :is_supplemented_by,
|
40
|
+
:is_continued_by, :continues, :has_metadata, :is_metadata_for,
|
41
|
+
:is_referenced_by, :references, :is_documented_by, :documents,
|
42
|
+
:is_compiled_by, :compiles, :is_variant_form_of, :is_original_form_of,
|
43
|
+
:is_reviewed_by, :reviews, :is_derived_from, :is_source_of, :format,
|
44
|
+
:related_identifier
|
21
45
|
|
22
46
|
def publication_year
|
23
47
|
date_published && date_published[0..3]
|
@@ -31,22 +55,19 @@ module Bolognese
|
|
31
55
|
[page_start, page_end].compact.join("-").presence
|
32
56
|
end
|
33
57
|
|
34
|
-
def publisher_string
|
35
|
-
publisher.to_h.fetch("name", nil)
|
36
|
-
end
|
37
|
-
|
38
58
|
def schema_org
|
39
|
-
|
59
|
+
hsh = {
|
60
|
+
"@context" => id.present? ? "http://schema.org" : nil,
|
40
61
|
"@type" => type,
|
41
62
|
"@id" => id,
|
42
63
|
"url" => url,
|
43
64
|
"additionalType" => additional_type,
|
44
|
-
"name" =>
|
65
|
+
"name" => title,
|
45
66
|
"alternateName" => alternate_name,
|
46
|
-
"author" => author,
|
67
|
+
"author" => to_schema_org(author),
|
47
68
|
"editor" => editor,
|
48
|
-
"description" => description,
|
49
|
-
"license" => license,
|
69
|
+
"description" => description.present? ? description["text"] : nil,
|
70
|
+
"license" => license.present? ? license["id"] : nil,
|
50
71
|
"version" => version,
|
51
72
|
"keywords" => keywords,
|
52
73
|
"language" => language,
|
@@ -60,32 +81,72 @@ module Bolognese
|
|
60
81
|
"sameAs" => same_as,
|
61
82
|
"isPartOf" => is_part_of,
|
62
83
|
"hasPart" => has_part,
|
63
|
-
"predecessor_of" =>
|
64
|
-
"successor_of" =>
|
65
|
-
"citation" =>
|
84
|
+
"predecessor_of" => is_previous_version_of,
|
85
|
+
"successor_of" => is_new_version_of,
|
86
|
+
"citation" => references,
|
66
87
|
"schemaVersion" => schema_version,
|
67
|
-
"publisher" => publisher,
|
88
|
+
"publisher" => { "@type" => "Organization", "name" => publisher },
|
68
89
|
"funder" => funder,
|
90
|
+
"provider" => { "@type" => "Organization", "name" => provider }
|
91
|
+
}.compact
|
92
|
+
JSON.pretty_generate hsh
|
93
|
+
end
|
94
|
+
|
95
|
+
def datacite_json
|
96
|
+
hsh = {
|
97
|
+
"id" => id,
|
98
|
+
"doi" => doi,
|
99
|
+
"creator" => author,
|
100
|
+
"title" => title,
|
101
|
+
"publisher" => publisher,
|
102
|
+
"publication-year" => publication_year,
|
103
|
+
"resource-type-general" => resource_type_general,
|
104
|
+
"resource-type" => additional_type,
|
105
|
+
"subject" => keywords.present? ? keywords.split(", ") : nil,
|
106
|
+
"contributor" => contributor,
|
107
|
+
"date-accepted" => date_accepted,
|
108
|
+
"date-available" => date_available,
|
109
|
+
"date-copyrighted" => date_copyrighted,
|
110
|
+
"date-collected" => date_collected,
|
111
|
+
"date-created" => date_created,
|
112
|
+
"date-published" => date_published,
|
113
|
+
"date-modified" => date_modified,
|
114
|
+
"date-submitted" => date_submitted,
|
115
|
+
"date-valid" => date_valid,
|
116
|
+
"language" => language,
|
117
|
+
"alternate-identifier" => alternate_name,
|
118
|
+
"related_identifier" => related_identifier,
|
119
|
+
"size" => content_size,
|
120
|
+
"format" => format,
|
121
|
+
"version" => version,
|
122
|
+
"rights" => license,
|
123
|
+
"description" => description,
|
124
|
+
"geo-location" => spatial_coverage,
|
125
|
+
"funding-reference" => funder,
|
126
|
+
"schemaVersion" => schema_version,
|
69
127
|
"provider" => provider
|
70
|
-
}.compact
|
128
|
+
}.compact
|
129
|
+
JSON.pretty_generate hsh
|
71
130
|
end
|
72
131
|
|
73
132
|
def codemeta
|
74
|
-
|
133
|
+
hsh = {
|
134
|
+
"@context" => id.present? ? "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld" : nil,
|
75
135
|
"@type" => type,
|
76
136
|
"@id" => id,
|
77
137
|
"identifier" => id,
|
78
138
|
"codeRepository" => url,
|
79
|
-
"title" =>
|
139
|
+
"title" => title,
|
80
140
|
"agents" => author,
|
81
|
-
"description" => description,
|
141
|
+
"description" => description.present? ? description["text"] : nil,
|
82
142
|
"version" => version,
|
83
143
|
"tags" => keywords.to_s.split(", ").presence,
|
84
144
|
"dateCreated" => date_created,
|
85
145
|
"datePublished" => date_published,
|
86
146
|
"dateModified" => date_modified,
|
87
147
|
"publisher" => publisher
|
88
|
-
}.compact
|
148
|
+
}.compact
|
149
|
+
JSON.pretty_generate hsh
|
89
150
|
end
|
90
151
|
|
91
152
|
def bibtex
|
@@ -97,10 +158,10 @@ module Bolognese
|
|
97
158
|
author: authors_as_string(author),
|
98
159
|
keywords: keywords,
|
99
160
|
language: language,
|
100
|
-
title:
|
161
|
+
title: title,
|
101
162
|
journal: journal,
|
102
163
|
pages: pagination,
|
103
|
-
publisher:
|
164
|
+
publisher: publisher,
|
104
165
|
year: publication_year
|
105
166
|
}.compact
|
106
167
|
BibTeX::Entry.new(bib).to_s
|
data/lib/bolognese/schema_org.rb
CHANGED
@@ -1,6 +1,15 @@
|
|
1
1
|
module Bolognese
|
2
2
|
class SchemaOrg < Metadata
|
3
3
|
|
4
|
+
SO_TO_DC_RELATION_TYPES = {
|
5
|
+
"citation" => "References",
|
6
|
+
"sameAs" => "IsIdenticalTo",
|
7
|
+
"isPartOf" => "IsPartOf",
|
8
|
+
"hasPart" => "HasPart",
|
9
|
+
"isPredecessor" => "IsPreviousVersionOf",
|
10
|
+
"isSuccessor" => "IsNewVersionOf"
|
11
|
+
}
|
12
|
+
|
4
13
|
def initialize(id: nil, string: nil)
|
5
14
|
id = normalize_id(id) if id.present?
|
6
15
|
|
@@ -53,7 +62,7 @@ module Bolognese
|
|
53
62
|
Bolognese::Bibtex::SO_TO_BIB_TRANSLATIONS[type] || "misc"
|
54
63
|
end
|
55
64
|
|
56
|
-
def
|
65
|
+
def title
|
57
66
|
metadata.fetch("name", nil)
|
58
67
|
end
|
59
68
|
|
@@ -62,21 +71,21 @@ module Bolognese
|
|
62
71
|
end
|
63
72
|
|
64
73
|
def author
|
65
|
-
|
66
|
-
|
74
|
+
authors = from_schema_org(Array.wrap(metadata.fetch("author", nil)))
|
75
|
+
get_authors(authors)
|
67
76
|
end
|
68
77
|
|
69
78
|
def editor
|
70
|
-
|
71
|
-
|
79
|
+
editors = from_schema_org(Array.wrap(metadata.fetch("editor", nil)))
|
80
|
+
get_authors(editors)
|
72
81
|
end
|
73
82
|
|
74
83
|
def description
|
75
|
-
metadata.fetch("description", nil)
|
84
|
+
{ "text" => metadata.fetch("description", nil) }
|
76
85
|
end
|
77
86
|
|
78
87
|
def license
|
79
|
-
metadata.fetch("license", nil)
|
88
|
+
{ "id" => metadata.fetch("license", nil) }
|
80
89
|
end
|
81
90
|
|
82
91
|
def version
|
@@ -99,28 +108,45 @@ module Bolognese
|
|
99
108
|
metadata.fetch("dateModified", nil)
|
100
109
|
end
|
101
110
|
|
102
|
-
def
|
103
|
-
|
111
|
+
def related_identifier
|
112
|
+
Array.wrap(is_identical_to) +
|
113
|
+
Array.wrap(is_part_of) +
|
114
|
+
Array.wrap(has_part) +
|
115
|
+
Array.wrap(is_previous_version_of) +
|
116
|
+
Array.wrap(is_new_version_of) +
|
117
|
+
Array.wrap(references)
|
118
|
+
end
|
119
|
+
|
120
|
+
def get_related_identifier(relation_type: nil)
|
121
|
+
normalize_ids(metadata.fetch(relation_type, nil), SO_TO_DC_RELATION_TYPES[relation_type])
|
104
122
|
end
|
105
123
|
|
106
|
-
def
|
107
|
-
|
124
|
+
def is_identical_to
|
125
|
+
get_related_identifier(relation_type: "sameAs")
|
108
126
|
end
|
109
127
|
|
110
128
|
def is_part_of
|
111
|
-
|
129
|
+
get_related_identifier(relation_type: "isPartOf")
|
112
130
|
end
|
113
131
|
|
114
132
|
def has_part
|
115
|
-
|
133
|
+
get_related_identifier(relation_type: "hasPart")
|
134
|
+
end
|
135
|
+
|
136
|
+
def is_previous_version_of
|
137
|
+
get_related_identifier(relation_type: "isPredecessor")
|
138
|
+
end
|
139
|
+
|
140
|
+
def is_new_version_of
|
141
|
+
get_related_identifier(relation_type: "isSuccessor")
|
116
142
|
end
|
117
143
|
|
118
|
-
def
|
119
|
-
|
144
|
+
def references
|
145
|
+
get_related_identifier(relation_type: "citation")
|
120
146
|
end
|
121
147
|
|
122
148
|
def publisher
|
123
|
-
metadata.
|
149
|
+
metadata.dig("publisher", "name")
|
124
150
|
end
|
125
151
|
|
126
152
|
def container_title
|
data/lib/bolognese/utils.rb
CHANGED
@@ -1,5 +1,13 @@
|
|
1
1
|
module Bolognese
|
2
2
|
module Utils
|
3
|
+
LICENSE_NAMES = {
|
4
|
+
"http://creativecommons.org/publicdomain/zero/1.0/" => "Public Domain (CC0 1.0)",
|
5
|
+
"http://creativecommons.org/licenses/by/3.0/" => "Creative Commons Attribution 3.0 (CC-BY 3.0)",
|
6
|
+
"http://creativecommons.org/licenses/by/4.0/" => "Creative Commons Attribution 4.0 (CC-BY 4.0)",
|
7
|
+
"http://creativecommons.org/licenses/by-nc/4.0/" => "Creative Commons Attribution Noncommercial 4.0 (CC-BY-NC 4.0)",
|
8
|
+
"http://creativecommons.org/licenses/by-sa/4.0/" => "Creative Commons Attribution Share Alike 4.0 (CC-BY-SA 4.0)",
|
9
|
+
"http://creativecommons.org/licenses/by-nc-nd/4.0/" => "Creative Commons Attribution Noncommercial No Derivatives 4.0 (CC-BY-NC-ND 4.0)"
|
10
|
+
}
|
3
11
|
|
4
12
|
def find_from_format(id: nil, string: nil, ext: nil, filename: nil)
|
5
13
|
if id.present?
|
@@ -30,6 +38,8 @@ module Bolognese
|
|
30
38
|
"crossref"
|
31
39
|
elsif options[:ext] == ".xml" && Maremma.from_xml(string).dig("resource", "xmlns").start_with?("http://datacite.org/schema/kernel")
|
32
40
|
"datacite"
|
41
|
+
elsif options[:ext] == ".json" && Maremma.from_json(string).dig("resource", "xmlns").to_s.start_with?("http://datacite.org/schema/kernel")
|
42
|
+
"datacite_json"
|
33
43
|
elsif options[:filename] == "codemeta.json"
|
34
44
|
"codemeta"
|
35
45
|
end
|
@@ -41,6 +51,7 @@ module Bolognese
|
|
41
51
|
when "crossref" then Crossref.new(id: id, string: string)
|
42
52
|
when "datacite" then Datacite.new(id: id, string: string, regenerate: options[:regenerate])
|
43
53
|
when "codemeta" then Codemeta.new(id: id, string: string)
|
54
|
+
when "datacite_json" then DataciteJson.new(string: string)
|
44
55
|
when "bibtex" then Bibtex.new(string: string)
|
45
56
|
else SchemaOrg.new(id: id)
|
46
57
|
end
|
@@ -83,21 +94,12 @@ module Bolognese
|
|
83
94
|
elsif element.is_a?(Hash)
|
84
95
|
element.fetch(content, nil)
|
85
96
|
elsif element.is_a?(Array)
|
86
|
-
a = element.map { |e| e.fetch(content, nil) }.uniq
|
87
|
-
array_unwrap(a)
|
97
|
+
a = element.map { |e| e.fetch(content, nil) }.uniq.unwrap
|
88
98
|
else
|
89
99
|
nil
|
90
100
|
end
|
91
101
|
end
|
92
102
|
|
93
|
-
def array_unwrap(element)
|
94
|
-
case element.length
|
95
|
-
when 0 then nil
|
96
|
-
when 1 then element.first
|
97
|
-
else element
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
103
|
def normalize_id(id)
|
102
104
|
return nil unless id.present?
|
103
105
|
|
@@ -112,9 +114,73 @@ module Bolognese
|
|
112
114
|
"http://orcid.org/" + Addressable::URI.encode(orcid)
|
113
115
|
end
|
114
116
|
|
115
|
-
def normalize_ids(list)
|
116
|
-
|
117
|
-
|
117
|
+
def normalize_ids(list, relation_type = "References")
|
118
|
+
Array.wrap(list).map do |url|
|
119
|
+
{ "id" => normalize_id(url["@id"]),
|
120
|
+
"type" => url["@type"],
|
121
|
+
"name" => url["name"],
|
122
|
+
"relationType" => relation_type }.compact
|
123
|
+
end.unwrap
|
124
|
+
end
|
125
|
+
|
126
|
+
# find Creative Commons or OSI license in licenses array, normalize url and name
|
127
|
+
def normalize_licenses(licenses)
|
128
|
+
standard_licenses = Array.wrap(licenses).map { |l| URI.parse(l["url"]) }.select { |li| li.host && li.host[/(creativecommons.org|opensource.org)$/] }
|
129
|
+
return licenses unless standard_licenses.present?
|
130
|
+
|
131
|
+
# use HTTPS
|
132
|
+
uri.scheme = "https"
|
133
|
+
|
134
|
+
# use host name without subdomain
|
135
|
+
uri.host = Array(/(creativecommons.org|opensource.org)/.match uri.host).last
|
136
|
+
|
137
|
+
# normalize URLs
|
138
|
+
if uri.host == "creativecommons.org"
|
139
|
+
uri.path = uri.path.split('/')[0..-2].join("/") if uri.path.split('/').last == "legalcode"
|
140
|
+
uri.path << '/' unless uri.path.end_with?('/')
|
141
|
+
else
|
142
|
+
uri.path = uri.path.gsub(/(-license|\.php|\.html)/, '')
|
143
|
+
uri.path = uri.path.sub(/(mit|afl|apl|osl|gpl|ecl)/) { |match| match.upcase }
|
144
|
+
uri.path = uri.path.sub(/(artistic|apache)/) { |match| match.titleize }
|
145
|
+
uri.path = uri.path.sub(/([^0-9\-]+)(-)?([1-9])?(\.)?([0-9])?$/) do
|
146
|
+
m = Regexp.last_match
|
147
|
+
text = m[1]
|
148
|
+
|
149
|
+
if m[3].present?
|
150
|
+
version = [m[3], m[5].presence || "0"].join(".")
|
151
|
+
[text, version].join("-")
|
152
|
+
else
|
153
|
+
text
|
154
|
+
end
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
uri.to_s
|
159
|
+
rescue URI::InvalidURIError
|
160
|
+
nil
|
161
|
+
end
|
162
|
+
|
163
|
+
def to_schema_org(element)
|
164
|
+
Array.wrap(element).map do |a|
|
165
|
+
a["@type"] = a["type"]
|
166
|
+
a["@id"] = a["id"]
|
167
|
+
a.except("type", "id").compact
|
168
|
+
end.unwrap
|
169
|
+
end
|
170
|
+
|
171
|
+
def from_schema_org(element)
|
172
|
+
Array.wrap(element).map do |a|
|
173
|
+
a["type"] = a["@type"]
|
174
|
+
a["id"] = a["@id"]
|
175
|
+
a.except("@type", "@id").compact
|
176
|
+
end.unwrap
|
177
|
+
end
|
178
|
+
|
179
|
+
def sanitize(text, options={})
|
180
|
+
options[:tags] ||= Set.new(%w(strong em b i code pre sub sup br))
|
181
|
+
custom_scrubber = Bolognese::WhitelistScrubber.new(options)
|
182
|
+
|
183
|
+
Loofah.scrub_fragment(text, custom_scrubber).to_s.gsub(/\u00a0/, ' ').strip
|
118
184
|
end
|
119
185
|
|
120
186
|
def github_from_url(url)
|
data/lib/bolognese/version.rb
CHANGED
data/lib/bolognese/whitelist_scrubber.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# modified from https://gist.github.com/ivan-kolmychek/ee2fdc53f3e2c637271d
|
2
|
+
|
3
|
+
module Bolognese
|
4
|
+
class WhitelistScrubber < Loofah::Scrubber
|
5
|
+
def initialize(options={})
|
6
|
+
@direction = :bottom_up
|
7
|
+
@tags = options[:tags]
|
8
|
+
@attributes = options[:attributes]
|
9
|
+
end
|
10
|
+
|
11
|
+
def scrub(node)
|
12
|
+
scrub_node_attributes(node) and return CONTINUE if node_allowed?(node)
|
13
|
+
node.before node.children
|
14
|
+
node.remove
|
15
|
+
end
|
16
|
+
|
17
|
+
private
|
18
|
+
|
19
|
+
def scrub_node_attributes(node)
|
20
|
+
fallback_scrub_node_attributes(node) and return true unless @attributes.present? && @attributes.respond_to?(:include?)
|
21
|
+
node.attribute_nodes.each do |attr_node|
|
22
|
+
attr_node.remove unless @attributes.include?(attr_node.name)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def allowed_not_element_node_types
|
27
|
+
[ Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE ]
|
28
|
+
end
|
29
|
+
|
30
|
+
def fallback_scrub_node_attributes(node)
|
31
|
+
Loofah::HTML5::Scrub.scrub_attributes(node)
|
32
|
+
end
|
33
|
+
|
34
|
+
def fallback_allowed_element_detection(node)
|
35
|
+
Loofah::HTML5::Scrub.allowed_element?(node.name)
|
36
|
+
end
|
37
|
+
|
38
|
+
def node_allowed?(node)
|
39
|
+
return fallback_allowed_element_detection(node) unless @tags.present? && @tags.respond_to?(:include?)
|
40
|
+
return true if allowed_not_element_node_types.include?(node.type)
|
41
|
+
return false unless node.type == Nokogiri::XML::Node::ELEMENT_NODE
|
42
|
+
@tags.include? node.name
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|