bolognese 0.7.2 → 0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -1
- data/README.md +25 -16
- data/bolognese.gemspec +2 -1
- data/codemeta.json +39 -0
- data/lib/bolognese.rb +4 -0
- data/lib/bolognese/array.rb +11 -0
- data/lib/bolognese/author_utils.rb +35 -21
- data/lib/bolognese/bibtex.rb +4 -4
- data/lib/bolognese/codemeta.rb +8 -13
- data/lib/bolognese/crossref.rb +22 -20
- data/lib/bolognese/datacite.rb +61 -61
- data/lib/bolognese/datacite_json.rb +208 -0
- data/lib/bolognese/datacite_utils.rb +17 -48
- data/lib/bolognese/metadata.rb +83 -22
- data/lib/bolognese/schema_org.rb +42 -16
- data/lib/bolognese/utils.rb +79 -13
- data/lib/bolognese/version.rb +1 -1
- data/lib/bolognese/whitelist_scrubber.rb +45 -0
- data/spec/array_spec.rb +20 -0
- data/spec/author_utils_spec.rb +93 -9
- data/spec/bibtex_spec.rb +4 -4
- data/spec/cli_spec.rb +5 -0
- data/spec/codemeta_spec.rb +41 -31
- data/spec/crossref_spec.rb +47 -72
- data/spec/datacite_json_spec.rb +65 -0
- data/spec/datacite_spec.rb +67 -83
- data/spec/datacite_utils_spec.rb +9 -14
- data/spec/fixtures/datacite.json +49 -0
- data/spec/fixtures/datacite_software.json +18 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_CLI/convert_from_id/datacite/to_datacite_json.yml +214 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/author_from_schema_org/with_id.yml +930 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/author_to_schema_org/with_id.yml +930 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/author.yml +137 -860
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/no_author.yml +137 -860
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/single_author.yml +137 -860
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/authors_as_string/with_organization.yml +137 -860
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/from_schema_org/with_id.yml +930 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_name_identifier/has_ORCID.yml +155 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_name_identifier/has_no_ORCID.yml +134 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_familyName.yml +155 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_name_in_display-order.yml +186 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_name_in_display-order_with_ORCID.yml +177 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/has_name_in_sort-order.yml +173 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/get_one_author/is_organization.yml +207 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_comma.yml +207 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_family_name.yml +207 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_id.yml +207 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_no_info.yml +207 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_type_organization.yml +207 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/is_personal_name_/has_type_person.yml +207 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/sanitize/should_only_keep_specific_tags.yml +930 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/sanitize/should_remove_a_tags.yml +930 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Crossref/to_schema_org/with_id.yml +930 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_Datacite/insert_related_identifiers/related_identifier.yml +173 -0
- data/spec/fixtures/vcr_cassettes/Bolognese_DataciteJson/get_metadata_as_bibtex/BlogPosting.yml +155 -0
- data/spec/schema_org_spec.rb +17 -14
- data/spec/utils_spec.rb +32 -2
- metadata +54 -4
data/lib/bolognese/metadata.rb
CHANGED
|
@@ -12,12 +12,36 @@ module Bolognese
|
|
|
12
12
|
include Bolognese::DataciteUtils
|
|
13
13
|
include Bolognese::Utils
|
|
14
14
|
|
|
15
|
+
DC_TO_SO_TRANSLATIONS = {
|
|
16
|
+
"Audiovisual" => "VideoObject",
|
|
17
|
+
"Collection" => "Collection",
|
|
18
|
+
"Dataset" => "Dataset",
|
|
19
|
+
"Event" => "Event",
|
|
20
|
+
"Image" => "ImageObject",
|
|
21
|
+
"InteractiveResource" => nil,
|
|
22
|
+
"Model" => nil,
|
|
23
|
+
"PhysicalObject" => nil,
|
|
24
|
+
"Service" => "Service",
|
|
25
|
+
"Software" => "SoftwareSourceCode",
|
|
26
|
+
"Sound" => "AudioObject",
|
|
27
|
+
"Text" => "ScholarlyArticle",
|
|
28
|
+
"Workflow" => nil,
|
|
29
|
+
"Other" => "CreativeWork"
|
|
30
|
+
}
|
|
31
|
+
|
|
15
32
|
attr_reader :id, :raw, :provider, :schema_version, :license, :citation,
|
|
16
33
|
:additional_type, :alternate_name, :url, :version, :keywords, :editor,
|
|
17
34
|
:page_start, :page_end, :date_modified, :language, :spatial_coverage,
|
|
18
35
|
:content_size, :funder, :journal, :bibtex_type, :date_created, :has_part,
|
|
19
|
-
:publisher, :contributor, :same_as, :
|
|
20
|
-
:
|
|
36
|
+
:publisher, :contributor, :same_as, :is_previous_version_of, :is_new_version_of,
|
|
37
|
+
:should_passthru, :datacite_errors, :date_accepted, :date_available,
|
|
38
|
+
:date_copyrighted, :date_collected, :date_submitted, :date_valid,
|
|
39
|
+
:is_cited_by, :cites, :is_supplement_to, :is_supplemented_by,
|
|
40
|
+
:is_continued_by, :continues, :has_metadata, :is_metadata_for,
|
|
41
|
+
:is_referenced_by, :references, :is_documented_by, :documents,
|
|
42
|
+
:is_compiled_by, :compiles, :is_variant_form_of, :is_original_form_of,
|
|
43
|
+
:is_reviewed_by, :reviews, :is_derived_from, :is_source_of, :format,
|
|
44
|
+
:related_identifier
|
|
21
45
|
|
|
22
46
|
def publication_year
|
|
23
47
|
date_published && date_published[0..3]
|
|
@@ -31,22 +55,19 @@ module Bolognese
|
|
|
31
55
|
[page_start, page_end].compact.join("-").presence
|
|
32
56
|
end
|
|
33
57
|
|
|
34
|
-
def publisher_string
|
|
35
|
-
publisher.to_h.fetch("name", nil)
|
|
36
|
-
end
|
|
37
|
-
|
|
38
58
|
def schema_org
|
|
39
|
-
|
|
59
|
+
hsh = {
|
|
60
|
+
"@context" => id.present? ? "http://schema.org" : nil,
|
|
40
61
|
"@type" => type,
|
|
41
62
|
"@id" => id,
|
|
42
63
|
"url" => url,
|
|
43
64
|
"additionalType" => additional_type,
|
|
44
|
-
"name" =>
|
|
65
|
+
"name" => title,
|
|
45
66
|
"alternateName" => alternate_name,
|
|
46
|
-
"author" => author,
|
|
67
|
+
"author" => to_schema_org(author),
|
|
47
68
|
"editor" => editor,
|
|
48
|
-
"description" => description,
|
|
49
|
-
"license" => license,
|
|
69
|
+
"description" => description.present? ? description["text"] : nil,
|
|
70
|
+
"license" => license.present? ? license["id"] : nil,
|
|
50
71
|
"version" => version,
|
|
51
72
|
"keywords" => keywords,
|
|
52
73
|
"language" => language,
|
|
@@ -60,32 +81,72 @@ module Bolognese
|
|
|
60
81
|
"sameAs" => same_as,
|
|
61
82
|
"isPartOf" => is_part_of,
|
|
62
83
|
"hasPart" => has_part,
|
|
63
|
-
"predecessor_of" =>
|
|
64
|
-
"successor_of" =>
|
|
65
|
-
"citation" =>
|
|
84
|
+
"predecessor_of" => is_previous_version_of,
|
|
85
|
+
"successor_of" => is_new_version_of,
|
|
86
|
+
"citation" => references,
|
|
66
87
|
"schemaVersion" => schema_version,
|
|
67
|
-
"publisher" => publisher,
|
|
88
|
+
"publisher" => { "@type" => "Organization", "name" => publisher },
|
|
68
89
|
"funder" => funder,
|
|
90
|
+
"provider" => { "@type" => "Organization", "name" => provider }
|
|
91
|
+
}.compact
|
|
92
|
+
JSON.pretty_generate hsh
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
def datacite_json
|
|
96
|
+
hsh = {
|
|
97
|
+
"id" => id,
|
|
98
|
+
"doi" => doi,
|
|
99
|
+
"creator" => author,
|
|
100
|
+
"title" => title,
|
|
101
|
+
"publisher" => publisher,
|
|
102
|
+
"publication-year" => publication_year,
|
|
103
|
+
"resource-type-general" => resource_type_general,
|
|
104
|
+
"resource-type" => additional_type,
|
|
105
|
+
"subject" => keywords.present? ? keywords.split(", ") : nil,
|
|
106
|
+
"contributor" => contributor,
|
|
107
|
+
"date-accepted" => date_accepted,
|
|
108
|
+
"date-available" => date_available,
|
|
109
|
+
"date-copyrighted" => date_copyrighted,
|
|
110
|
+
"date-collected" => date_collected,
|
|
111
|
+
"date-created" => date_created,
|
|
112
|
+
"date-published" => date_published,
|
|
113
|
+
"date-modified" => date_modified,
|
|
114
|
+
"date-submitted" => date_submitted,
|
|
115
|
+
"date-valid" => date_valid,
|
|
116
|
+
"language" => language,
|
|
117
|
+
"alternate-identifier" => alternate_name,
|
|
118
|
+
"related_identifier" => related_identifier,
|
|
119
|
+
"size" => content_size,
|
|
120
|
+
"format" => format,
|
|
121
|
+
"version" => version,
|
|
122
|
+
"rights" => license,
|
|
123
|
+
"description" => description,
|
|
124
|
+
"geo-location" => spatial_coverage,
|
|
125
|
+
"funding-reference" => funder,
|
|
126
|
+
"schemaVersion" => schema_version,
|
|
69
127
|
"provider" => provider
|
|
70
|
-
}.compact
|
|
128
|
+
}.compact
|
|
129
|
+
JSON.pretty_generate hsh
|
|
71
130
|
end
|
|
72
131
|
|
|
73
132
|
def codemeta
|
|
74
|
-
|
|
133
|
+
hsh = {
|
|
134
|
+
"@context" => id.present? ? "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld" : nil,
|
|
75
135
|
"@type" => type,
|
|
76
136
|
"@id" => id,
|
|
77
137
|
"identifier" => id,
|
|
78
138
|
"codeRepository" => url,
|
|
79
|
-
"title" =>
|
|
139
|
+
"title" => title,
|
|
80
140
|
"agents" => author,
|
|
81
|
-
"description" => description,
|
|
141
|
+
"description" => description.present? ? description["text"] : nil,
|
|
82
142
|
"version" => version,
|
|
83
143
|
"tags" => keywords.to_s.split(", ").presence,
|
|
84
144
|
"dateCreated" => date_created,
|
|
85
145
|
"datePublished" => date_published,
|
|
86
146
|
"dateModified" => date_modified,
|
|
87
147
|
"publisher" => publisher
|
|
88
|
-
}.compact
|
|
148
|
+
}.compact
|
|
149
|
+
JSON.pretty_generate hsh
|
|
89
150
|
end
|
|
90
151
|
|
|
91
152
|
def bibtex
|
|
@@ -97,10 +158,10 @@ module Bolognese
|
|
|
97
158
|
author: authors_as_string(author),
|
|
98
159
|
keywords: keywords,
|
|
99
160
|
language: language,
|
|
100
|
-
title:
|
|
161
|
+
title: title,
|
|
101
162
|
journal: journal,
|
|
102
163
|
pages: pagination,
|
|
103
|
-
publisher:
|
|
164
|
+
publisher: publisher,
|
|
104
165
|
year: publication_year
|
|
105
166
|
}.compact
|
|
106
167
|
BibTeX::Entry.new(bib).to_s
|
data/lib/bolognese/schema_org.rb
CHANGED
|
@@ -1,6 +1,15 @@
|
|
|
1
1
|
module Bolognese
|
|
2
2
|
class SchemaOrg < Metadata
|
|
3
3
|
|
|
4
|
+
SO_TO_DC_RELATION_TYPES = {
|
|
5
|
+
"citation" => "References",
|
|
6
|
+
"sameAs" => "IsIdenticalTo",
|
|
7
|
+
"isPartOf" => "IsPartOf",
|
|
8
|
+
"hasPart" => "HasPart",
|
|
9
|
+
"isPredecessor" => "IsPreviousVersionOf",
|
|
10
|
+
"isSuccessor" => "IsNewVersionOf"
|
|
11
|
+
}
|
|
12
|
+
|
|
4
13
|
def initialize(id: nil, string: nil)
|
|
5
14
|
id = normalize_id(id) if id.present?
|
|
6
15
|
|
|
@@ -53,7 +62,7 @@ module Bolognese
|
|
|
53
62
|
Bolognese::Bibtex::SO_TO_BIB_TRANSLATIONS[type] || "misc"
|
|
54
63
|
end
|
|
55
64
|
|
|
56
|
-
def
|
|
65
|
+
def title
|
|
57
66
|
metadata.fetch("name", nil)
|
|
58
67
|
end
|
|
59
68
|
|
|
@@ -62,21 +71,21 @@ module Bolognese
|
|
|
62
71
|
end
|
|
63
72
|
|
|
64
73
|
def author
|
|
65
|
-
|
|
66
|
-
|
|
74
|
+
authors = from_schema_org(Array.wrap(metadata.fetch("author", nil)))
|
|
75
|
+
get_authors(authors)
|
|
67
76
|
end
|
|
68
77
|
|
|
69
78
|
def editor
|
|
70
|
-
|
|
71
|
-
|
|
79
|
+
editors = from_schema_org(Array.wrap(metadata.fetch("editor", nil)))
|
|
80
|
+
get_authors(editors)
|
|
72
81
|
end
|
|
73
82
|
|
|
74
83
|
def description
|
|
75
|
-
metadata.fetch("description", nil)
|
|
84
|
+
{ "text" => metadata.fetch("description", nil) }
|
|
76
85
|
end
|
|
77
86
|
|
|
78
87
|
def license
|
|
79
|
-
metadata.fetch("license", nil)
|
|
88
|
+
{ "id" => metadata.fetch("license", nil) }
|
|
80
89
|
end
|
|
81
90
|
|
|
82
91
|
def version
|
|
@@ -99,28 +108,45 @@ module Bolognese
|
|
|
99
108
|
metadata.fetch("dateModified", nil)
|
|
100
109
|
end
|
|
101
110
|
|
|
102
|
-
def
|
|
103
|
-
|
|
111
|
+
def related_identifier
|
|
112
|
+
Array.wrap(is_identical_to) +
|
|
113
|
+
Array.wrap(is_part_of) +
|
|
114
|
+
Array.wrap(has_part) +
|
|
115
|
+
Array.wrap(is_previous_version_of) +
|
|
116
|
+
Array.wrap(is_new_version_of) +
|
|
117
|
+
Array.wrap(references)
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
def get_related_identifier(relation_type: nil)
|
|
121
|
+
normalize_ids(metadata.fetch(relation_type, nil), SO_TO_DC_RELATION_TYPES[relation_type])
|
|
104
122
|
end
|
|
105
123
|
|
|
106
|
-
def
|
|
107
|
-
|
|
124
|
+
def is_identical_to
|
|
125
|
+
get_related_identifier(relation_type: "sameAs")
|
|
108
126
|
end
|
|
109
127
|
|
|
110
128
|
def is_part_of
|
|
111
|
-
|
|
129
|
+
get_related_identifier(relation_type: "isPartOf")
|
|
112
130
|
end
|
|
113
131
|
|
|
114
132
|
def has_part
|
|
115
|
-
|
|
133
|
+
get_related_identifier(relation_type: "hasPart")
|
|
134
|
+
end
|
|
135
|
+
|
|
136
|
+
def is_previous_version_of
|
|
137
|
+
get_related_identifier(relation_type: "isPredecessor")
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
def is_new_version_of
|
|
141
|
+
get_related_identifier(relation_type: "isSuccessor")
|
|
116
142
|
end
|
|
117
143
|
|
|
118
|
-
def
|
|
119
|
-
|
|
144
|
+
def references
|
|
145
|
+
get_related_identifier(relation_type: "citation")
|
|
120
146
|
end
|
|
121
147
|
|
|
122
148
|
def publisher
|
|
123
|
-
metadata.
|
|
149
|
+
metadata.dig("publisher", "name")
|
|
124
150
|
end
|
|
125
151
|
|
|
126
152
|
def container_title
|
data/lib/bolognese/utils.rb
CHANGED
|
@@ -1,5 +1,13 @@
|
|
|
1
1
|
module Bolognese
|
|
2
2
|
module Utils
|
|
3
|
+
LICENSE_NAMES = {
|
|
4
|
+
"http://creativecommons.org/publicdomain/zero/1.0/" => "Public Domain (CC0 1.0)",
|
|
5
|
+
"http://creativecommons.org/licenses/by/3.0/" => "Creative Commons Attribution 3.0 (CC-BY 3.0)",
|
|
6
|
+
"http://creativecommons.org/licenses/by/4.0/" => "Creative Commons Attribution 4.0 (CC-BY 4.0)",
|
|
7
|
+
"http://creativecommons.org/licenses/by-nc/4.0/" => "Creative Commons Attribution Noncommercial 4.0 (CC-BY-NC 4.0)",
|
|
8
|
+
"http://creativecommons.org/licenses/by-sa/4.0/" => "Creative Commons Attribution Share Alike 4.0 (CC-BY-SA 4.0)",
|
|
9
|
+
"http://creativecommons.org/licenses/by-nc-nd/4.0/" => "Creative Commons Attribution Noncommercial No Derivatives 4.0 (CC-BY-NC-ND 4.0)"
|
|
10
|
+
}
|
|
3
11
|
|
|
4
12
|
def find_from_format(id: nil, string: nil, ext: nil, filename: nil)
|
|
5
13
|
if id.present?
|
|
@@ -30,6 +38,8 @@ module Bolognese
|
|
|
30
38
|
"crossref"
|
|
31
39
|
elsif options[:ext] == ".xml" && Maremma.from_xml(string).dig("resource", "xmlns").start_with?("http://datacite.org/schema/kernel")
|
|
32
40
|
"datacite"
|
|
41
|
+
elsif options[:ext] == ".json" && Maremma.from_json(string).dig("resource", "xmlns").to_s.start_with?("http://datacite.org/schema/kernel")
|
|
42
|
+
"datacite_json"
|
|
33
43
|
elsif options[:filename] == "codemeta.json"
|
|
34
44
|
"codemeta"
|
|
35
45
|
end
|
|
@@ -41,6 +51,7 @@ module Bolognese
|
|
|
41
51
|
when "crossref" then Crossref.new(id: id, string: string)
|
|
42
52
|
when "datacite" then Datacite.new(id: id, string: string, regenerate: options[:regenerate])
|
|
43
53
|
when "codemeta" then Codemeta.new(id: id, string: string)
|
|
54
|
+
when "datacite_json" then DataciteJson.new(string: string)
|
|
44
55
|
when "bibtex" then Bibtex.new(string: string)
|
|
45
56
|
else SchemaOrg.new(id: id)
|
|
46
57
|
end
|
|
@@ -83,21 +94,12 @@ module Bolognese
|
|
|
83
94
|
elsif element.is_a?(Hash)
|
|
84
95
|
element.fetch(content, nil)
|
|
85
96
|
elsif element.is_a?(Array)
|
|
86
|
-
a = element.map { |e| e.fetch(content, nil) }.uniq
|
|
87
|
-
array_unwrap(a)
|
|
97
|
+
a = element.map { |e| e.fetch(content, nil) }.uniq.unwrap
|
|
88
98
|
else
|
|
89
99
|
nil
|
|
90
100
|
end
|
|
91
101
|
end
|
|
92
102
|
|
|
93
|
-
def array_unwrap(element)
|
|
94
|
-
case element.length
|
|
95
|
-
when 0 then nil
|
|
96
|
-
when 1 then element.first
|
|
97
|
-
else element
|
|
98
|
-
end
|
|
99
|
-
end
|
|
100
|
-
|
|
101
103
|
def normalize_id(id)
|
|
102
104
|
return nil unless id.present?
|
|
103
105
|
|
|
@@ -112,9 +114,73 @@ module Bolognese
|
|
|
112
114
|
"http://orcid.org/" + Addressable::URI.encode(orcid)
|
|
113
115
|
end
|
|
114
116
|
|
|
115
|
-
def normalize_ids(list)
|
|
116
|
-
|
|
117
|
-
|
|
117
|
+
def normalize_ids(list, relation_type = "References")
|
|
118
|
+
Array.wrap(list).map do |url|
|
|
119
|
+
{ "id" => normalize_id(url["@id"]),
|
|
120
|
+
"type" => url["@type"],
|
|
121
|
+
"name" => url["name"],
|
|
122
|
+
"relationType" => relation_type }.compact
|
|
123
|
+
end.unwrap
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
# find Creative Commons or OSI license in licenses array, normalize url and name
|
|
127
|
+
def normalize_licenses(licenses)
|
|
128
|
+
standard_licenses = Array.wrap(licenses).map { |l| URI.parse(l["url"]) }.select { |li| li.host && li.host[/(creativecommons.org|opensource.org)$/] }
|
|
129
|
+
return licenses unless standard_licenses.present?
|
|
130
|
+
|
|
131
|
+
# use HTTPS
|
|
132
|
+
uri.scheme = "https"
|
|
133
|
+
|
|
134
|
+
# use host name without subdomain
|
|
135
|
+
uri.host = Array(/(creativecommons.org|opensource.org)/.match uri.host).last
|
|
136
|
+
|
|
137
|
+
# normalize URLs
|
|
138
|
+
if uri.host == "creativecommons.org"
|
|
139
|
+
uri.path = uri.path.split('/')[0..-2].join("/") if uri.path.split('/').last == "legalcode"
|
|
140
|
+
uri.path << '/' unless uri.path.end_with?('/')
|
|
141
|
+
else
|
|
142
|
+
uri.path = uri.path.gsub(/(-license|\.php|\.html)/, '')
|
|
143
|
+
uri.path = uri.path.sub(/(mit|afl|apl|osl|gpl|ecl)/) { |match| match.upcase }
|
|
144
|
+
uri.path = uri.path.sub(/(artistic|apache)/) { |match| match.titleize }
|
|
145
|
+
uri.path = uri.path.sub(/([^0-9\-]+)(-)?([1-9])?(\.)?([0-9])?$/) do
|
|
146
|
+
m = Regexp.last_match
|
|
147
|
+
text = m[1]
|
|
148
|
+
|
|
149
|
+
if m[3].present?
|
|
150
|
+
version = [m[3], m[5].presence || "0"].join(".")
|
|
151
|
+
[text, version].join("-")
|
|
152
|
+
else
|
|
153
|
+
text
|
|
154
|
+
end
|
|
155
|
+
end
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
uri.to_s
|
|
159
|
+
rescue URI::InvalidURIError
|
|
160
|
+
nil
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
def to_schema_org(element)
|
|
164
|
+
Array.wrap(element).map do |a|
|
|
165
|
+
a["@type"] = a["type"]
|
|
166
|
+
a["@id"] = a["id"]
|
|
167
|
+
a.except("type", "id").compact
|
|
168
|
+
end.unwrap
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
def from_schema_org(element)
|
|
172
|
+
Array.wrap(element).map do |a|
|
|
173
|
+
a["type"] = a["@type"]
|
|
174
|
+
a["id"] = a["@id"]
|
|
175
|
+
a.except("@type", "@id").compact
|
|
176
|
+
end.unwrap
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
def sanitize(text, options={})
|
|
180
|
+
options[:tags] ||= Set.new(%w(strong em b i code pre sub sup br))
|
|
181
|
+
custom_scrubber = Bolognese::WhitelistScrubber.new(options)
|
|
182
|
+
|
|
183
|
+
Loofah.scrub_fragment(text, custom_scrubber).to_s.gsub(/\u00a0/, ' ').strip
|
|
118
184
|
end
|
|
119
185
|
|
|
120
186
|
def github_from_url(url)
|
data/lib/bolognese/version.rb
CHANGED
data/lib/bolognese/whitelist_scrubber.rb
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# modified from https://gist.github.com/ivan-kolmychek/ee2fdc53f3e2c637271d
|
|
2
|
+
|
|
3
|
+
module Bolognese
|
|
4
|
+
class WhitelistScrubber < Loofah::Scrubber
|
|
5
|
+
def initialize(options={})
|
|
6
|
+
@direction = :bottom_up
|
|
7
|
+
@tags = options[:tags]
|
|
8
|
+
@attributes = options[:attributes]
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def scrub(node)
|
|
12
|
+
scrub_node_attributes(node) and return CONTINUE if node_allowed?(node)
|
|
13
|
+
node.before node.children
|
|
14
|
+
node.remove
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
private
|
|
18
|
+
|
|
19
|
+
def scrub_node_attributes(node)
|
|
20
|
+
fallback_scrub_node_attributes(node) and return true unless @attributes.present? && @attributes.respond_to?(:include?)
|
|
21
|
+
node.attribute_nodes.each do |attr_node|
|
|
22
|
+
attr_node.remove unless @attributes.include?(attr_node.name)
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def allowed_not_element_node_types
|
|
27
|
+
[ Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE ]
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def fallback_scrub_node_attributes(node)
|
|
31
|
+
Loofah::HTML5::Scrub.scrub_attributes(node)
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def fallback_allowed_element_detection(node)
|
|
35
|
+
Loofah::HTML5::Scrub.allowed_element?(node.name)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def node_allowed?(node)
|
|
39
|
+
return fallback_allowed_element_detection(node) unless @tags.present? && @tags.respond_to?(:include?)
|
|
40
|
+
return true if allowed_not_element_node_types.include?(node.type)
|
|
41
|
+
return false unless node.type == Nokogiri::XML::Node::ELEMENT_NODE
|
|
42
|
+
@tags.include? node.name
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
end
|