selma 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,92 @@
1
+ use magnus::{error::Error, exception, gc, value::Value, RTypedData, TryConvert, TypedData};
2
+ use std::{marker::PhantomData, ops::Deref};
3
+
4
+ // NOTE: My Rust isn't good enough to know what any of this does,
5
+ // but it was taken from https://cs.github.com/bytecodealliance/wasmtime-rb/blob/a843e4b4582a945f2c881b8bd3e2b87688ab5509/ext/src/helpers/wrapped_struct.rs#L4
6
+
7
+ /// A small wrapper for `RTypedData` that keeps track of the concrete struct
8
+ /// type, and the underlying [`Value`] for GC purposes.
9
+ #[derive(Debug)]
10
+ #[repr(transparent)]
11
+ pub struct WrappedStruct<T: TypedData> {
12
+ inner: RTypedData,
13
+ phantom: PhantomData<T>,
14
+ }
15
+
16
+ impl<T: TypedData> Clone for WrappedStruct<T> {
17
+ fn clone(&self) -> Self {
18
+ Self {
19
+ inner: self.inner,
20
+ phantom: PhantomData,
21
+ }
22
+ }
23
+ }
24
+ impl<T: TypedData> Copy for WrappedStruct<T> {}
25
+
26
+ impl<T: TypedData> WrappedStruct<T> {
27
+ /// Gets the underlying struct.
28
+ pub fn get(&self) -> Result<&T, Error> {
29
+ self.inner.try_convert()
30
+ }
31
+
32
+ /// Gets the underlying struct with a `'static` lifetime.
33
+ pub fn get_static(&self) -> Result<&'static T, Error> {
34
+ self.inner.try_convert()
35
+ }
36
+
37
+ /// Get the Ruby [`Value`] for this struct.
38
+ pub fn to_value(self) -> Value {
39
+ self.inner.into()
40
+ }
41
+
42
+ /// Marks the Ruby [`Value`] for GC.
43
+ pub fn mark(&self) {
44
+ gc::mark(&self.inner.into());
45
+ }
46
+ }
47
+
48
+ impl<T: TypedData> From<WrappedStruct<T>> for Value {
49
+ fn from(wrapped_struct: WrappedStruct<T>) -> Self {
50
+ wrapped_struct.to_value()
51
+ }
52
+ }
53
+
54
+ impl<T: TypedData> Deref for WrappedStruct<T> {
55
+ type Target = RTypedData;
56
+
57
+ fn deref(&self) -> &Self::Target {
58
+ &self.inner
59
+ }
60
+ }
61
+
62
+ impl<T: TypedData> From<T> for WrappedStruct<T> {
63
+ fn from(t: T) -> Self {
64
+ Self {
65
+ inner: RTypedData::wrap(t),
66
+ phantom: PhantomData,
67
+ }
68
+ }
69
+ }
70
+
71
+ impl<T> TryConvert for WrappedStruct<T>
72
+ where
73
+ T: TypedData,
74
+ {
75
+ fn try_convert(val: Value) -> Result<Self, Error> {
76
+ let inner = RTypedData::from_value(val).ok_or_else(|| {
77
+ Error::new(
78
+ exception::type_error(),
79
+ format!(
80
+ "no implicit conversion of {} into {}",
81
+ unsafe { val.classname() },
82
+ T::class()
83
+ ),
84
+ )
85
+ })?;
86
+
87
+ Ok(Self {
88
+ inner,
89
+ phantom: PhantomData,
90
+ })
91
+ }
92
+ }
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
begin
  # Precompiled native gems ship their shared libraries under
  # <gem_dir>/lib/selma/<ruby_version>; try that copy first.
  minor_version = RUBY_VERSION[/\d+\.\d+/]
  require_relative "#{minor_version}/selma"
rescue LoadError
  # Otherwise load the extension that was compiled at install time. Plain
  # "require" is used instead of "require_relative" because non-native gems put
  # compiled extension files in Gem::BasicSpecification#extension_dir, which is
  # in $LOAD_PATH but not necessarily relative to this file (see nokogiri#2300).
  require "selma/selma"
end
data/lib/selma/html.rb ADDED
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  # Ruby-side declaration of Selma::HTML; this file adds no behavior to it.
  # NOTE(review): presumably implemented by the native extension
  # (ext/selma/src/html.rs) — confirm.
  class HTML; end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  # Ruby-side declaration of Selma::Rewriter; this file adds no behavior to it.
  # NOTE(review): presumably implemented by the native extension
  # (ext/selma/src/rewriter.rs) — confirm.
  class Rewriter; end
end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  class Sanitizer
    module Config
      # Small allowlist of inline/text-level elements, the few attributes they
      # may carry, and the URL protocols accepted in link-like attributes.
      # Deeply frozen via freeze_config.
      BASIC = freeze_config(
        elements: %w[
          a abbr blockquote b br cite code dd dfn dl dt em i kbd
          li mark ol p pre q s samp small strike strong sub sup time u ul var
        ],

        attributes: {
          "a" => %w[href],
          "abbr" => %w[title],
          "blockquote" => %w[cite],
          "dfn" => %w[title],
          "q" => %w[cite],
          "time" => %w[datetime pubdate],
        },

        # :relative stands for relative URLs without a protocol.
        protocols: {
          "a" => { "href" => [*%w[ftp http https mailto], :relative] },
          "blockquote" => { "cite" => [*%w[http https], :relative] },
          "q" => { "cite" => [*%w[http https], :relative] },
        },
      )
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  class Sanitizer
    module Config
      # Baseline configuration: nothing is allowed unless explicitly listed.
      # Deeply frozen via freeze_config.
      DEFAULT = freeze_config(
        # Allow HTML comments? Strongly discouraged, since IE allows script
        # execution within conditional comments.
        allow_comments: false,

        # Allow well-formed HTML doctype declarations such as
        # "<!DOCTYPE html>" when sanitizing a document?
        allow_doctype: false,

        # Attributes allowed per element. None by default. The symbol :data
        # indicates that arbitrary HTML5 data-* attributes should be allowed.
        attributes: {},

        # Elements to allow. None by default, which means all HTML is stripped.
        elements: [],

        # URL protocols allowed per attribute. None by default. Use :relative
        # in place of a protocol to allow relative URLs sans protocol.
        protocols: {},

        # Elements whose contents are removed along with the element itself.
        # The contents of all other filtered elements are left behind.
        remove_contents: %w[
          iframe math noembed noframes noscript plaintext script style svg
          xmp
        ],

        # Elements which, when removed, should have their contents surrounded
        # by whitespace.
        whitespace_elements: %w[
          address article aside blockquote br dd div dl dt footer
          h1 h2 h3 h4 h5 h6 header hgroup hr li nav ol p pre section ul
        ],
      )
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  class Sanitizer
    module Config
      # Superset of BASIC: adds document-structure, table, image and styling
      # elements, more attributes, and doctype support. Built on top of BASIC
      # with Config.merge and deeply frozen via freeze_config.
      RELAXED = freeze_config(
        elements: BASIC[:elements] + %w[
          address article aside bdi bdo body caption col
          colgroup data del div figcaption figure footer h1 h2 h3 h4 h5 h6
          head header hgroup hr html img ins main nav rp rt ruby section span
          style summary sup table tbody td tfoot th thead title tr wbr
        ],

        allow_doctype: true,

        attributes: merge(
          BASIC[:attributes],
          {
            # Attributes listed under :all are not tied to one element name.
            all: %w[class dir hidden id lang style tabindex title translate],
            "a" => %w[href hreflang name rel],
            "col" => %w[span width],
            "colgroup" => %w[span width],
            "data" => %w[value],
            "del" => %w[cite datetime],
            "img" => %w[align alt border height src srcset width],
            "ins" => %w[cite datetime],
            "li" => %w[value],
            "ol" => %w[reversed start type],
            "style" => %w[media scoped type],
            "table" => %w[
              align bgcolor border cellpadding cellspacing frame rules sortable
              summary width
            ],
            "td" => %w[abbr align axis colspan headers rowspan valign width],
            "th" => %w[abbr align axis colspan headers rowspan scope sorted valign width],
            "ul" => %w[type],
          },
        ),

        protocols: merge(
          BASIC[:protocols],
          {
            "del" => { "cite" => [*%w[http https], :relative] },
            "img" => { "src" => [*%w[http https], :relative] },
            "ins" => { "cite" => [*%w[http https], :relative] },
          },
        ),
      )
    end
  end
end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  class Sanitizer
    module Config
      # Allows only a handful of basic text-formatting elements and no
      # attributes. Reuses DEFAULT's list of whitespace-wrapped elements.
      # Deeply frozen via freeze_config.
      RESTRICTED = freeze_config(
        elements: %w[b em i strong u],

        whitespace_elements: DEFAULT[:whitespace_elements],
      )
    end
  end
end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "set"
4
+
5
module Selma
  class Sanitizer
    # Helpers for building sanitizer configuration hashes.
    module Config
      class << self
        # Recursively freezes +config+ and returns it. Hash values and
        # Array/Set members are frozen before the container itself.
        def freeze_config(config)
          if config.is_a?(Hash)
            config.each_value { |value| freeze_config(value) }
          elsif config.is_a?(Array) || config.is_a?(Set)
            config.each { |item| freeze_config(item) }
          end

          config.freeze
        end

        # Builds a new Hash by deeply merging *other_config* into *config*.
        # Neither argument is modified.
        #
        # For each key present in *other_config*:
        # * two Hashes are merged recursively (an empty old Hash is simply
        #   replaced by a copy of the new one);
        # * an Array becomes a Set, except under the :transformers key;
        # * anything else is copied with +dup+ when that is safe.
        # Keys found only in *config* are carried over, dup'ed when safe.
        #
        # This is the safest way to use a built-in config as the basis for
        # your own custom config.
        #
        # Raises ArgumentError unless both arguments are Hashes.
        def merge(config, other_config = {})
          raise ArgumentError, "config must be a Hash" unless config.is_a?(Hash)
          raise ArgumentError, "other_config must be a Hash" unless other_config.is_a?(Hash)

          (config.keys | other_config.keys).each_with_object({}) do |key, merged|
            base = config[key]

            unless other_config.key?(key)
              merged[key] = can_dupe?(base) ? base.dup : base
              next
            end

            override = other_config[key]
            merged[key] =
              if base.is_a?(Hash) && override.is_a?(Hash)
                base.empty? ? override.dup : merge(base, override)
              elsif override.is_a?(Array) && key != :transformers
                Set.new(override)
              elsif can_dupe?(override)
                override.dup
              else
                override
              end
          end
        end

        # Returns `true` when `dup` may be safely called on _value_, i.e. it is
        # not true/false/nil, a Method, a Numeric or a Symbol.
        def can_dupe?(value)
          return false if value == true || value == false || value.nil?

          [Method, Numeric, Symbol].none? { |klass| value.is_a?(klass) }
        end
      end
    end
  end
end
63
+
64
+ require "selma/sanitizer/config/basic"
65
+ require "selma/sanitizer/config/default"
66
+ require "selma/sanitizer/config/relaxed"
67
+ require "selma/sanitizer/config/restricted"
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "selma/sanitizer/config"
4
+
5
module Selma
  # Ruby-side helpers that translate a configuration Hash into calls on the
  # sanitizer's low-level setters (set_flag, set_allowed_attribute, ...).
  # NOTE(review): +config+ and the set_* methods are not defined in this file;
  # presumably the native extension provides them — confirm.
  class Sanitizer
    # Bit flags passed to set_flag / set_all_flags.
    ALLOW = 1 << 0
    ESCAPE_TAGFILTER = (1 << 1)
    REMOVE_CONTENTS = (1 << 2)
    WRAP_WHITESPACE = (1 << 3)

    # Applies every entry of +config+ to this sanitizer.
    #
    # initialize is in Rust, this just helps manage config setup in Ruby
    # TODO: could this just become initialize?
    def setup
      allow_element(config[:elements] || [])

      (config[:attributes] || {}).each do |element, attrs|
        allow_attribute(element, attrs)
      end

      (config[:protocols] || {}).each do |element, protocols|
        protocols.each do |attribute, pr|
          allow_protocol(element, attribute, pr)
        end
      end

      remove_contents(config[:remove_contents]) if config.include?(:remove_contents)

      wrap_with_whitespace(config[:whitespace_elements]) if config.include?(:whitespace_elements)

      set_escape_tagfilter(config.fetch(:escape_tagfilter, true))
      set_allow_comments(config.fetch(:allow_comments, false))
      # NOTE(review): the fallback here is true, while Config::DEFAULT sets
      # allow_doctype: false — confirm which default is intended.
      set_allow_doctype(config.fetch(:allow_doctype, true))
    end

    # Returns the configured list of allowed elements (nil when unset).
    def elements
      config[:elements]
    end

    # Sets the ALLOW flag for each element name in +elements+ (nested
    # collections are flattened).
    def allow_element(elements)
      elements.flatten.each { |e| set_flag(e, ALLOW, true) }
    end

    # Clears the ALLOW flag for each element name in +elements+.
    def disallow_element(elements)
      elements.flatten.each { |e| set_flag(e, ALLOW, false) }
    end

    # Allows each attribute in +attrs+ on +element+.
    def allow_attribute(element, attrs)
      attrs.flatten.each { |attr| set_allowed_attribute(element, attr, true) }
    end

    # Marks each attribute in +attrs+ as required on +element+. An empty
    # +attrs+ registers the wildcard "*" instead.
    def require_any_attributes(element, attrs)
      # The parameter is +attrs+; the previous check referenced an undefined
      # +attr+ and raised NameError on every call.
      if attrs.empty?
        set_required_attribute(element, "*", true)
      else
        attrs.flatten.each { |attr| set_required_attribute(element, attr, true) }
      end
    end

    # Disallows each attribute in +attrs+ on +element+.
    def disallow_attribute(element, attrs)
      attrs.flatten.each { |attr| set_allowed_attribute(element, attr, false) }
    end

    # Allows each of the given class names on +element+. Accepts class names
    # as separate arguments and/or arrays.
    def allow_class(element, *klass)
      klass.flatten.each { |k| set_allowed_class(element, k, true) }
    end

    # Allows the given protocol(s) for +attr+ on +element+. A single protocol
    # is wrapped in an Array before being passed on.
    def allow_protocol(element, attr, protos)
      protos = [protos] unless protos.is_a?(Array)
      set_allowed_protocols(element, attr, protos)
    end

    # Configures content removal. +true+/+false+ toggles REMOVE_CONTENTS via
    # set_all_flags; otherwise the flag is set for each listed element.
    def remove_contents(elements)
      if elements.is_a?(TrueClass) || elements.is_a?(FalseClass)
        set_all_flags(REMOVE_CONTENTS, elements)
      else
        elements.flatten.each { |e| set_flag(e, REMOVE_CONTENTS, true) }
      end
    end

    # Sets the WRAP_WHITESPACE flag for each element name in +elements+.
    def wrap_with_whitespace(elements)
      elements.flatten.each { |e| set_flag(e, WRAP_WHITESPACE, true) }
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  # Ruby-side declaration of Selma::Selector; this file adds no behavior to it.
  # NOTE(review): presumably implemented by the native extension
  # (ext/selma/src/selector.rs) — confirm.
  class Selector; end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  # Release version of the gem; selma.gemspec reads it for spec.version.
  VERSION = "0.0.1"
end
data/lib/selma.rb ADDED
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
# Load the debugging helpers whenever the DEBUG environment variable is set;
# any value counts, including an empty string.
if ENV.key?("DEBUG")
  require "amazing_print"
  require "debug"
end
7
+
8
+ require_relative "selma/extension"
9
+
10
+ require_relative "selma/sanitizer"
11
+ require_relative "selma/html"
12
+ require_relative "selma/rewriter"
13
+ require_relative "selma/selector"
data/selma.gemspec ADDED
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ lib = File.expand_path("lib", __dir__)
4
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
+ require "selma/version"
6
+
7
# Gem specification for selma: a source gem whose native extension is built
# from ext/selma/Cargo.toml.
Gem::Specification.new do |spec|
  spec.name = "selma"
  spec.version = Selma::VERSION
  spec.authors = ["Garen J. Torikian"]
  spec.email = ["gjtorikian@gmail.com"]

  spec.summary = "Selma selects and matches HTML nodes using CSS rules. Backed by Rust's lol_html parser."
  spec.license = "MIT"

  spec.required_ruby_version = "~> 3.1"
  # https://github.com/rubygems/rubygems/pull/5852#issuecomment-1231118509
  spec.required_rubygems_version = ">= 3.3.22"

  # Packaged files: top-level docs, the Ruby library, and the extension sources.
  manifest = ["LICENSE.txt", "README.md", "selma.gemspec"]
  manifest += Dir.glob("lib/**/*.rb")
  manifest += Dir.glob("ext/**/*.{rs,toml,lock,rb}")
  spec.files = manifest

  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |path| File.basename(path) }

  spec.require_paths = ["lib"]
  spec.extensions = ["ext/selma/Cargo.toml"]

  spec.metadata = {
    "allowed_push_host" => "https://rubygems.org",
    "funding_uri" => "https://github.com/sponsors/gjtorikian/",
    "source_code_uri" => "https://github.com/gjtorikian/selma",
    "rubygems_mfa_required" => "true",
  }

  spec.add_dependency("rb_sys", "~> 0.9")

  [
    ["rake", "~> 13.0"],
    ["rake-compiler", "~> 1.2"],
    ["rake-compiler-dock", "~> 1.2"],
  ].each do |gem_name, requirement|
    spec.add_development_dependency(gem_name, requirement)
  end
end
metadata ADDED
@@ -0,0 +1,133 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: selma
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Garen J. Torikian
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2022-12-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rb_sys
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.9'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '13.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '13.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.2'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.2'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake-compiler-dock
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.2'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.2'
69
+ description:
70
+ email:
71
+ - gjtorikian@gmail.com
72
+ executables: []
73
+ extensions:
74
+ - ext/selma/Cargo.toml
75
+ extra_rdoc_files: []
76
+ files:
77
+ - LICENSE.txt
78
+ - README.md
79
+ - ext/selma/Cargo.toml
80
+ - ext/selma/_util.rb
81
+ - ext/selma/extconf.rb
82
+ - ext/selma/src/html.rs
83
+ - ext/selma/src/html/element.rs
84
+ - ext/selma/src/html/end_tag.rs
85
+ - ext/selma/src/lib.rs
86
+ - ext/selma/src/native_ref_wrap.rs
87
+ - ext/selma/src/rewriter.rs
88
+ - ext/selma/src/sanitizer.rs
89
+ - ext/selma/src/selector.rs
90
+ - ext/selma/src/tags.rs
91
+ - ext/selma/src/wrapped_struct.rs
92
+ - lib/selma.rb
93
+ - lib/selma/extension.rb
94
+ - lib/selma/html.rb
95
+ - lib/selma/rewriter.rb
96
+ - lib/selma/sanitizer.rb
97
+ - lib/selma/sanitizer/config.rb
98
+ - lib/selma/sanitizer/config/basic.rb
99
+ - lib/selma/sanitizer/config/default.rb
100
+ - lib/selma/sanitizer/config/relaxed.rb
101
+ - lib/selma/sanitizer/config/restricted.rb
102
+ - lib/selma/selector.rb
103
+ - lib/selma/version.rb
104
+ - selma.gemspec
105
+ homepage:
106
+ licenses:
107
+ - MIT
108
+ metadata:
109
+ allowed_push_host: https://rubygems.org
110
+ funding_uri: https://github.com/sponsors/gjtorikian/
111
+ source_code_uri: https://github.com/gjtorikian/selma
112
+ rubygems_mfa_required: 'true'
113
+ post_install_message:
114
+ rdoc_options: []
115
+ require_paths:
116
+ - lib
117
+ required_ruby_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - "~>"
120
+ - !ruby/object:Gem::Version
121
+ version: '3.1'
122
+ required_rubygems_version: !ruby/object:Gem::Requirement
123
+ requirements:
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ version: 3.3.22
127
+ requirements: []
128
+ rubygems_version: 3.3.26
129
+ signing_key:
130
+ specification_version: 4
131
+ summary: Selma selects and matches HTML nodes using CSS rules. Backed by Rust's lol_html
132
+ parser.
133
+ test_files: []