selma 0.0.2-x64-mingw-ucrt

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,92 @@
1
+ use magnus::{error::Error, exception, gc, value::Value, RTypedData, TryConvert, TypedData};
2
+ use std::{marker::PhantomData, ops::Deref};
3
+
4
+ // NOTE: My Rust isn't good enough to know what any of this does,
5
+ // but it was taken from https://cs.github.com/bytecodealliance/wasmtime-rb/blob/a843e4b4582a945f2c881b8bd3e2b87688ab5509/ext/src/helpers/wrapped_struct.rs#L4
6
+
7
+ /// A small wrapper for `RTypedData` that keeps track of the concrete struct
8
+ /// type, and the underlying [`Value`] for GC purposes.
9
+ #[derive(Debug)]
10
+ #[repr(transparent)]
11
+ pub struct WrappedStruct<T: TypedData> {
12
+ inner: RTypedData,
13
+ phantom: PhantomData<T>,
14
+ }
15
+
16
+ impl<T: TypedData> Clone for WrappedStruct<T> {
17
+ fn clone(&self) -> Self {
18
+ Self {
19
+ inner: self.inner,
20
+ phantom: PhantomData,
21
+ }
22
+ }
23
+ }
24
+ impl<T: TypedData> Copy for WrappedStruct<T> {}
25
+
26
+ impl<T: TypedData> WrappedStruct<T> {
27
+ /// Gets the underlying struct.
28
+ pub fn get(&self) -> Result<&T, Error> {
29
+ self.inner.try_convert()
30
+ }
31
+
32
+ /// Gets the underlying struct with a `'static` lifetime.
33
+ pub fn get_static(&self) -> Result<&'static T, Error> {
34
+ self.inner.try_convert()
35
+ }
36
+
37
+ /// Get the Ruby [`Value`] for this struct.
38
+ pub fn to_value(self) -> Value {
39
+ self.inner.into()
40
+ }
41
+
42
+ /// Marks the Ruby [`Value`] for GC.
43
+ pub fn mark(&self) {
44
+ gc::mark(&self.inner.into());
45
+ }
46
+ }
47
+
48
+ impl<T: TypedData> From<WrappedStruct<T>> for Value {
49
+ fn from(wrapped_struct: WrappedStruct<T>) -> Self {
50
+ wrapped_struct.to_value()
51
+ }
52
+ }
53
+
54
+ impl<T: TypedData> Deref for WrappedStruct<T> {
55
+ type Target = RTypedData;
56
+
57
+ fn deref(&self) -> &Self::Target {
58
+ &self.inner
59
+ }
60
+ }
61
+
62
+ impl<T: TypedData> From<T> for WrappedStruct<T> {
63
+ fn from(t: T) -> Self {
64
+ Self {
65
+ inner: RTypedData::wrap(t),
66
+ phantom: PhantomData,
67
+ }
68
+ }
69
+ }
70
+
71
+ impl<T> TryConvert for WrappedStruct<T>
72
+ where
73
+ T: TypedData,
74
+ {
75
+ fn try_convert(val: Value) -> Result<Self, Error> {
76
+ let inner = RTypedData::from_value(val).ok_or_else(|| {
77
+ Error::new(
78
+ exception::type_error(),
79
+ format!(
80
+ "no implicit conversion of {} into {}",
81
+ unsafe { val.classname() },
82
+ T::class()
83
+ ),
84
+ )
85
+ })?;
86
+
87
+ Ok(Self {
88
+ inner,
89
+ phantom: PhantomData,
90
+ })
91
+ }
92
+ }
Binary file
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
begin
  # Precompiled (native) gems ship the shared library under
  # <gem_dir>/lib/selma/<ruby_version>, so try that location first.
  minor_version = RUBY_VERSION[/\d+\.\d+/]
  require_relative "#{minor_version}/selma"
rescue LoadError
  # No precompiled library: fall back to the extension built at install time.
  # Plain "require" is used on purpose — non-native gems put the compiled
  # extension in Gem::BasicSpecification#extension_dir, which is on $LOAD_PATH
  # but not necessarily relative to this file (see nokogiri#2300).
  require "selma/selma"
end
data/lib/selma/html.rb ADDED
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  # Ruby-side declaration of Selma::HTML; no methods are defined here.
  # NOTE(review): behavior presumably comes from the native extension
  # (ext/selma/src/html.rs) — confirm.
  class HTML
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  # Ruby-side declaration of Selma::Rewriter; no methods are defined here.
  # NOTE(review): behavior presumably comes from the native extension
  # (ext/selma/src/rewriter.rs) — confirm.
  class Rewriter
  end
end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  class Sanitizer
    module Config
      # Allow-list covering basic inline/text formatting elements, a handful
      # of attributes on them, and the URL protocols accepted in those
      # attributes. The whole structure is deeply frozen.
      BASIC = freeze_config(
        elements: %w[
          a abbr blockquote b br cite code dd dfn dl dt em i kbd
          li mark ol p pre q s samp small strike strong sub sup time u ul var
        ],

        attributes: {
          "a" => %w[href],
          "abbr" => %w[title],
          "blockquote" => %w[cite],
          "dfn" => %w[title],
          "q" => %w[cite],
          "time" => %w[datetime pubdate],
        },

        # :relative stands for URLs without a protocol.
        protocols: {
          "a" => { "href" => ["ftp", "http", "https", "mailto", :relative] },
          "blockquote" => { "cite" => ["http", "https", :relative] },
          "q" => { "cite" => ["http", "https", :relative] },
        },
      )
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  class Sanitizer
    module Config
      # Baseline configuration: nothing is allowed unless explicitly listed.
      # The whole structure is deeply frozen.
      DEFAULT = freeze_config(
        # Keep HTML comments? Strongly discouraged: IE executes scripts placed
        # inside conditional comments.
        allow_comments: false,

        # Keep well-formed doctype declarations such as "<!DOCTYPE html>" when
        # sanitizing a document?
        allow_doctype: false,

        # Per-element attribute allow-list; empty means no attributes survive.
        # The symbol :data indicates that arbitrary HTML5 data-* attributes
        # should be allowed.
        attributes: {},

        # Element allow-list; empty means all HTML is stripped.
        elements: [],

        # Per-attribute URL protocol allow-list; empty means no protocols are
        # allowed. Use :relative in place of a protocol to permit relative
        # URLs sans protocol.
        protocols: {},

        # Elements removed together with their contents. Every other filtered
        # element leaves its contents behind.
        remove_contents: %w[
          iframe math noembed noframes noscript plaintext script style svg
          xmp
        ],

        # Elements whose contents are surrounded by whitespace once the
        # element itself is removed.
        whitespace_elements: %w[
          address article aside blockquote br dd div dl dt footer
          h1 h2 h3 h4 h5 h6 header hgroup hr li nav ol p pre section ul
        ],
      )
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  class Sanitizer
    module Config
      # Superset of BASIC: adds structural, table and media elements, more
      # attributes, doctype support and a few extra protocol rules.
      # The whole structure is deeply frozen.
      RELAXED = freeze_config(
        elements: BASIC[:elements] + %w[
          address article aside bdi bdo body caption col
          colgroup data del div figcaption figure footer h1 h2 h3 h4 h5 h6
          head header hgroup hr html img ins main nav rp rt ruby section span
          style summary sup table tbody td tfoot th thead title tr wbr
        ],

        allow_doctype: true,

        # The :all key lists attributes that apply to every element.
        attributes: merge(
          BASIC[:attributes],
          {
            :all => %w[class dir hidden id lang style tabindex title translate],
            "a" => %w[href hreflang name rel],
            "col" => %w[span width],
            "colgroup" => %w[span width],
            "data" => %w[value],
            "del" => %w[cite datetime],
            "img" => %w[align alt border height src srcset width],
            "ins" => %w[cite datetime],
            "li" => %w[value],
            "ol" => %w[reversed start type],
            "style" => %w[media scoped type],
            "table" => %w[
              align bgcolor border cellpadding cellspacing frame rules sortable
              summary width
            ],
            "td" => %w[abbr align axis colspan headers rowspan valign width],
            "th" => %w[abbr align axis colspan headers rowspan scope sorted valign width],
            "ul" => %w[type],
          },
        ),

        protocols: merge(
          BASIC[:protocols],
          {
            "del" => { "cite" => ["http", "https", :relative] },
            "img" => { "src" => ["http", "https", :relative] },
            "ins" => { "cite" => ["http", "https", :relative] },
          },
        ),
      )
    end
  end
end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  class Sanitizer
    module Config
      # Minimal allow-list: only a few inline emphasis elements, with the
      # whitespace-wrapping list reused from DEFAULT. Deeply frozen.
      RESTRICTED = freeze_config(
        elements: %w[b em i strong u],

        whitespace_elements: DEFAULT[:whitespace_elements],
      )
    end
  end
end
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "set"
4
+
5
module Selma
  class Sanitizer
    # Helpers for building immutable sanitizer configurations.
    module Config
      class << self
        # Recursively freezes _config_ and returns it.
        #
        # Hash values and Array/Set members are frozen depth-first; Hash keys
        # are not visited.
        def freeze_config(config)
          if config.is_a?(Hash)
            config.each_value { |entry| freeze_config(entry) }
          elsif config.is_a?(Array) || config.is_a?(Set)
            config.each { |entry| freeze_config(entry) }
          end

          config.freeze
        end

        # Deep-merges *other_config* into *config* and returns the result as
        # a new Hash; neither argument is modified.
        #
        # Arrays supplied by *other_config* become Sets, except under the
        # :transformers key, where they are copied as Arrays.
        #
        # This is the safest way to use a built-in config as the basis for
        # your own custom config.
        #
        # Raises ArgumentError when either argument is not a Hash.
        def merge(config, other_config = {})
          raise ArgumentError, "config must be a Hash" unless config.is_a?(Hash)
          raise ArgumentError, "other_config must be a Hash" unless other_config.is_a?(Hash)

          all_keys = Set.new(config.keys + other_config.keys)

          all_keys.each_with_object({}) do |key, result|
            base = config[key]

            # Key only present in the base config: copy it over as-is.
            unless other_config.key?(key)
              result[key] = can_dupe?(base) ? base.dup : base
              next
            end

            override = other_config[key]

            result[key] =
              if base.is_a?(Hash) && override.is_a?(Hash)
                base.empty? ? override.dup : merge(base, override)
              elsif override.is_a?(Array) && key != :transformers
                Set.new(override)
              elsif can_dupe?(override)
                override.dup
              else
                override
              end
          end
        end

        # Returns `true` if `dup` may be safely called on _value_, `false`
        # otherwise (booleans, nil, Method, Numeric and Symbol values).
        def can_dupe?(value)
          return false if value == true || value == false || value.nil?
          return false if value.is_a?(Method) || value.is_a?(Numeric) || value.is_a?(Symbol)

          true
        end
      end
    end
  end
end
63
+
64
+ require "selma/sanitizer/config/basic"
65
+ require "selma/sanitizer/config/default"
66
+ require "selma/sanitizer/config/relaxed"
67
+ require "selma/sanitizer/config/restricted"
@@ -0,0 +1,85 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "selma/sanitizer/config"
4
+
5
module Selma
  # Ruby-side configuration helpers for the sanitizer.
  #
  # The methods below translate a config Hash into calls to lower-level
  # setters (`set_flag`, `set_allowed_attribute`, ...) that are not defined in
  # this file. `config` is likewise provided elsewhere.
  class Sanitizer
    # Bit flags passed to `set_flag` / `set_all_flags`.
    ALLOW = 1 << 0
    ESCAPE_TAGFILTER = (1 << 1)
    REMOVE_CONTENTS = (1 << 2)
    WRAP_WHITESPACE = (1 << 3)

    # initialize is in Rust, this just helps manage config setup in Ruby
    # TODO: could this just become initialize?
    #
    # Applies every section of `config` through the helper methods below.
    def setup
      allow_element(config[:elements] || [])

      (config[:attributes] || {}).each do |element, attrs|
        allow_attribute(element, attrs)
      end

      (config[:protocols] || {}).each do |element, protocols|
        protocols.each do |attribute, pr|
          allow_protocol(element, attribute, pr)
        end
      end

      remove_contents(config[:remove_contents]) if config.include?(:remove_contents)

      wrap_with_whitespace(config[:whitespace_elements]) if config.include?(:whitespace_elements)

      set_escape_tagfilter(config.fetch(:escape_tagfilter, true))
      set_allow_comments(config.fetch(:allow_comments, false))
      set_allow_doctype(config.fetch(:allow_doctype, true))
    end

    # Returns the `:elements` entry of the config.
    def elements
      config[:elements]
    end

    # Sets the ALLOW flag for each element name in _elements_ (nested
    # collections are flattened).
    def allow_element(elements)
      elements.flatten.each { |e| set_flag(e, ALLOW, true) }
    end

    # Clears the ALLOW flag for each element name in _elements_.
    def disallow_element(elements)
      elements.flatten.each { |e| set_flag(e, ALLOW, false) }
    end

    # Allows every attribute in _attrs_ on _element_.
    def allow_attribute(element, attrs)
      attrs.flatten.each { |attr| set_allowed_attribute(element, attr, true) }
    end

    # Marks the attributes in _attrs_ as required on _element_. An empty
    # _attrs_ registers the wildcard "*" instead.
    def require_any_attributes(element, attrs)
      # Fixed: this used to test the undefined name `attr`, which raised
      # NameError on every call.
      if attrs.empty?
        set_required_attribute(element, "*", true)
      else
        attrs.flatten.each { |attr| set_required_attribute(element, attr, true) }
      end
    end

    # Disallows every attribute in _attrs_ on _element_.
    def disallow_attribute(element, attrs)
      attrs.flatten.each { |attr| set_allowed_attribute(element, attr, false) }
    end

    # Allows each class name in _klass_ on _element_.
    def allow_class(element, *klass)
      klass.flatten.each { |k| set_allowed_class(element, k, true) }
    end

    # Registers the protocols allowed for attribute _attr_ of _element_.
    #
    # _protos_ may be a single protocol, an Array, or a Set. Config.merge
    # turns overridden Arrays into Sets, so a Set is converted to an Array of
    # its members rather than being wrapped as a one-element Array.
    def allow_protocol(element, attr, protos)
      protos =
        case protos
        when Array then protos
        when Set then protos.to_a
        else [protos]
        end
      set_allowed_protocols(element, attr, protos)
    end

    # Configures content removal. A boolean applies REMOVE_CONTENTS to all
    # elements via `set_all_flags`; otherwise the flag is set for each listed
    # element name.
    def remove_contents(elements)
      if elements.is_a?(TrueClass) || elements.is_a?(FalseClass)
        set_all_flags(REMOVE_CONTENTS, elements)
      else
        elements.flatten.each { |e| set_flag(e, REMOVE_CONTENTS, true) }
      end
    end

    # Sets the WRAP_WHITESPACE flag for each element name in _elements_.
    def wrap_with_whitespace(elements)
      elements.flatten.each { |e| set_flag(e, WRAP_WHITESPACE, true) }
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  # Ruby-side declaration of Selma::Selector; no methods are defined here.
  # NOTE(review): behavior presumably comes from the native extension
  # (ext/selma/src/selector.rs) — confirm.
  class Selector
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Selma
  # Gem version string; read by selma.gemspec.
  VERSION = "0.0.2"
end
data/lib/selma.rb ADDED
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
# Debugging helpers are loaded whenever the DEBUG environment variable is set
# (to any value).
if ENV.key?("DEBUG")
  require "amazing_print"
  require "debug"
end

# The extension is loaded first, followed by the Ruby-side files.
["extension", "sanitizer", "html", "rewriter", "selector"].each do |part|
  require_relative "selma/#{part}"
end
data/selma.gemspec ADDED
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
# Make lib/ loadable so the version constant can be read below.
lib_dir = File.expand_path("lib", __dir__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "selma/version"

Gem::Specification.new do |gem|
  gem.name = "selma"
  gem.version = Selma::VERSION
  gem.authors = ["Garen J. Torikian"]
  gem.email = ["gjtorikian@gmail.com"]

  gem.summary = "Selma selects and matches HTML nodes using CSS rules. Backed by Rust's lol_html parser."
  gem.license = "MIT"

  gem.required_ruby_version = "~> 3.1"
  # https://github.com/rubygems/rubygems/pull/5852#issuecomment-1231118509
  gem.required_rubygems_version = ">= 3.3.22"

  # Packaged files: top-level docs, the Ruby sources and the extension sources.
  gem.files =
    ["LICENSE.txt", "README.md", "selma.gemspec"] +
    Dir.glob("lib/**/*.rb") +
    Dir.glob("ext/**/*.{rs,toml,lock,rb}")
  gem.bindir = "exe"
  gem.executables = gem.files.grep(%r{\Aexe/}) { |path| File.basename(path) }

  gem.require_paths = ["lib"]
  gem.extensions = ["ext/selma/Cargo.toml"]

  gem.metadata = {
    "allowed_push_host" => "https://rubygems.org",
    "funding_uri" => "https://github.com/sponsors/gjtorikian/",
    "source_code_uri" => "https://github.com/gjtorikian/selma",
    "rubygems_mfa_required" => "true",
  }

  gem.add_dependency("rb_sys", "~> 0.9")

  {
    "rake" => "~> 13.0",
    "rake-compiler" => "~> 1.2",
    "rake-compiler-dock" => "~> 1.2",
  }.each do |dep_name, requirement|
    gem.add_development_dependency(dep_name, requirement)
  end
end
metadata ADDED
@@ -0,0 +1,136 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: selma
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: x64-mingw-ucrt
6
+ authors:
7
+ - Garen J. Torikian
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2022-12-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rb_sys
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.9'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '13.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '13.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.2'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.2'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake-compiler-dock
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.2'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.2'
69
+ description:
70
+ email:
71
+ - gjtorikian@gmail.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - LICENSE.txt
77
+ - README.md
78
+ - ext/selma/Cargo.toml
79
+ - ext/selma/_util.rb
80
+ - ext/selma/extconf.rb
81
+ - ext/selma/src/html.rs
82
+ - ext/selma/src/html/element.rs
83
+ - ext/selma/src/html/end_tag.rs
84
+ - ext/selma/src/lib.rs
85
+ - ext/selma/src/native_ref_wrap.rs
86
+ - ext/selma/src/rewriter.rs
87
+ - ext/selma/src/sanitizer.rs
88
+ - ext/selma/src/selector.rs
89
+ - ext/selma/src/tags.rs
90
+ - ext/selma/src/wrapped_struct.rs
91
+ - lib/selma.rb
92
+ - lib/selma/3.1/selma.so
93
+ - lib/selma/extension.rb
94
+ - lib/selma/html.rb
95
+ - lib/selma/rewriter.rb
96
+ - lib/selma/sanitizer.rb
97
+ - lib/selma/sanitizer/config.rb
98
+ - lib/selma/sanitizer/config/basic.rb
99
+ - lib/selma/sanitizer/config/default.rb
100
+ - lib/selma/sanitizer/config/relaxed.rb
101
+ - lib/selma/sanitizer/config/restricted.rb
102
+ - lib/selma/selector.rb
103
+ - lib/selma/version.rb
104
+ - selma.gemspec
105
+ homepage:
106
+ licenses:
107
+ - MIT
108
+ metadata:
109
+ allowed_push_host: https://rubygems.org
110
+ funding_uri: https://github.com/sponsors/gjtorikian/
111
+ source_code_uri: https://github.com/gjtorikian/selma
112
+ rubygems_mfa_required: 'true'
113
+ post_install_message:
114
+ rdoc_options: []
115
+ require_paths:
116
+ - lib
117
+ required_ruby_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - ">="
120
+ - !ruby/object:Gem::Version
121
+ version: '3.1'
122
+ - - "<"
123
+ - !ruby/object:Gem::Version
124
+ version: 3.2.dev
125
+ required_rubygems_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ version: 3.3.22
130
+ requirements: []
131
+ rubygems_version: 3.3.22
132
+ signing_key:
133
+ specification_version: 4
134
+ summary: Selma selects and matches HTML nodes using CSS rules. Backed by Rust's lol_html
135
+ parser.
136
+ test_files: []