xrb-sanitize 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: dfcb33d4fedbdd7120af3a40c693f1a2964e9d838d0578cf34e1f695aefbecc3
4
+ data.tar.gz: fc0887981944741f22ae2d5cafb7702723affd96238d83277a12fb5cd868d991
5
+ SHA512:
6
+ metadata.gz: 507a7104250bdb9c9519df83b01040e0e3e63daf51644b2b272daa332a9086822dd297bd05b02c2691da2ba43117d92ff2a0a6b299e8f73305ef1f4ee414c925
7
+ data.tar.gz: d8eff16de230db5f695e7708246ddd7c7cb94a92deddde76df0dd64dec6350861b1bc1d4a46d7d53eb5410627f51834feff5ee7b81076fa957f61b823468bbdb
checksums.yaml.gz.sig ADDED
@@ -0,0 +1,2 @@
1
+ r}M�&,f���������lPkf�u�)�����,�CVIQ'�S ԍ���:�
2
+ X&F���#6C� �Q����e����sy��?,�
@@ -0,0 +1,163 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Released under the MIT License.
4
+ # Copyright, 2018-2024, by Samuel Williams.
5
+
6
+ require 'xrb/parsers'
7
+ require 'xrb/builder'
8
+ require 'xrb/entities'
9
+
10
module XRB
  module Sanitize
    # Provides a high level interface for parsing markup.
    #
    # An instance is passed to `XRB::Parsers.parse_markup` as the delegate and
    # receives one callback per parsed construct (`open_tag_begin`, `attribute`,
    # `open_tag_end`, `close_tag`, `doctype`, `comment`, `instruction`, `cdata`,
    # `text`). Subclasses override `filter` to decide, per node, what is kept.
    class Filter
      # Skip-mask bit for the tag markup itself (opening and closing tag).
      TAG = 1

      # Skip-mask bits for each kind of content found inside a node.
      DOCTYPE = 2
      COMMENT = 4
      INSTRUCTION = 8
      CDATA = 16
      TEXT = 32

      # Every kind of content, but not the tag markup.
      CONTENT = DOCTYPE | COMMENT | INSTRUCTION | CDATA | TEXT
      # The tag markup and all of its content.
      ALL = TAG | CONTENT

      # Parse `input`, writing the filtered markup to `output`.
      #
      # `input` is wrapped with `XRB::Buffer(...)` so a plain string may be
      # given. When `output` is nil a new `MarkupString` with the input's
      # encoding is used. Returns the filter instance; the result is available
      # from its `output` reader.
      def self.parse(input, output = nil, entities = XRB::Entities::HTML5)
        # This allows us to handle passing in a string:
        input = XRB::Buffer(input)

        output ||= MarkupString.new.force_encoding(input.encoding)

        new(output, entities).parse!(input)
      end

      # A node on the parse stack: the tag name, the tag object being built
      # (nil for the root node) and the bit mask of things to skip.
      Node = Struct.new(:name, :tag, :skip) do
        # Add the bits of `mode` to the skip mask.
        def skip!(mode = ALL)
          self.skip |= mode
        end

        # True only when every bit of `mode` is set in the skip mask.
        def skip?(mode = ALL)
          (self.skip & mode) == mode
        end

        # Clear the bits of `mode` from the skip mask.
        def accept!(mode = ALL)
          self.skip &= ~mode
        end

        # Look up an attribute of the tag by key. This replaces Struct's own
        # `[]` member access.
        #
        # Returns nil when the node has no tag (the root node) or the tag has
        # no attributes. `&.` does not short-circuit the rest of a call chain
        # in Ruby, so every step has to be guarded.
        def [](key)
          tag&.attributes&.[](key)
        end

        # Remove, in place, every attribute whose key is not included in
        # `keys`. Does nothing when the node has no tag.
        def limit_attributes(keys)
          tag&.attributes&.select! { |key, _value| keys.include?(key) }
        end
      end

      # `output` must respond to `<<`; `entities` is handed to the parser.
      def initialize(output, entities)
        @output = output

        @entities = entities

        @stack = []

        # The root node is never pushed onto the stack; it is what `top`
        # falls back to when no tag is open. Nothing is skipped at the root.
        @current = @top = Node.new(nil, nil, 0)

        @skip = nil
      end

      # The buffer the filtered markup is written to.
      attr_reader :output

      # The current node being parsed.
      attr_reader :current

      # The nodes of the currently open (not yet closed) tags, innermost last.
      attr_reader :stack

      # The innermost open node, or the root node when no tag is open.
      def top
        @stack.last || @top
      end

      # Run the parser over `input` with this instance as the delegate.
      # Returns self.
      def parse!(input)
        parse_begin

        XRB::Parsers.parse_markup(input, self, @entities)

        parse_end

        self
      end

      # Hook invoked before parsing starts. Does nothing by default.
      def parse_begin
      end

      # Hook invoked after parsing finishes: closes every tag that is still
      # open.
      #
      # The root node is not on the stack, so the stack must be drained
      # completely; stopping at one remaining entry would leave the outermost
      # unclosed tag open.
      def parse_end
        # Each call pops at least the named (innermost) node, so this ends.
        close_tag(@stack.last.name) until @stack.empty?
      end

      # Parser callback: an opening tag with the given name has started.
      # The new node starts with the skip mask of the current node.
      def open_tag_begin(name, offset)
        tag = Tag.new(name, false, {})

        @current = Node.new(name, tag, current.skip)
      end

      # Parser callback: an attribute of the tag currently being opened.
      def attribute(key, value)
        @current.tag.attributes[key] = value
      end

      # Parser callback: the opening tag is complete. The node is filtered
      # and, unless its tag is skipped, the opening tag is written out.
      def open_tag_end(self_closing)
        if self_closing
          @current.tag.closed = true
        else
          @stack << @current
        end

        filter(@current)

        @current.tag.write_opening_tag(@output) unless @current.skip? TAG

        # If the tag was self-closing, it's no longer current at this point, we are back in the context of the parent tag.
        @current = top if self_closing
      end

      # Parser callback: a closing tag with the given name.
      #
      # Pops nodes until one with a matching name has been popped, writing the
      # closing tag of each popped node unless its tag is skipped. If no open
      # node has that name, the whole stack is unwound.
      def close_tag(name, offset = nil)
        while (node = @stack.pop)
          node.tag.write_closing_tag(@output) unless node.skip? TAG

          break if node.name == name
        end

        @current = top
      end

      # Decide what to keep for a node by adjusting its skip mask and
      # attributes. The argument is the Node whose opening tag was just
      # parsed. The default implementation changes nothing and returns it.
      def filter(tag)
        tag
      end

      # Parser callback: a doctype declaration.
      def doctype(string)
        @output << string unless current.skip? DOCTYPE
      end

      # Parser callback: a comment.
      def comment(string)
        @output << string unless current.skip? COMMENT
      end

      # Parser callback: a processing instruction.
      def instruction(string)
        @output << string unless current.skip? INSTRUCTION
      end

      # Parser callback: a CDATA section.
      def cdata(string)
        @output << string unless current.skip? CDATA
      end

      # Parser callback: text content, appended through `Markup.append`.
      def text(string)
        Markup.append(@output, string) unless current.skip? TEXT
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Released under the MIT License.
4
+ # Copyright, 2018-2024, by Samuel Williams.
5
+
6
+ require_relative 'filter'
7
+
8
+ require 'set'
9
+
10
module XRB
  module Sanitize
    # A filter that keeps a fixed allow-list of tags and, for each of them, a
    # fixed allow-list of attribute names. Doctypes and processing
    # instructions are always dropped.
    #
    # Only attribute *names* are checked here; attribute values (for example
    # the contents of `href` or `style`) are passed through unchanged.
    class Fragment < Filter
      # Attribute names permitted on every allowed tag.
      STANDARD_ATTRIBUTES = Set['class', 'style'].freeze

      # Tags that accept only the standard attributes, in lookup-table order.
      plain_tags = %w[
        div span br b i em strong
        ul ol li dl dt dd strike
        h1 h2 h3 h4 h5 h6 p
      ]

      # Maps each allowed tag name to the set of attribute names it may keep.
      ALLOWED_TAGS = plain_tags.to_h { |tag| [tag, STANDARD_ATTRIBUTES] }.merge(
        'img' => STANDARD_ATTRIBUTES | ['src', 'alt', 'width', 'height'],
        'image' => STANDARD_ATTRIBUTES,
        'a' => STANDARD_ATTRIBUTES | ['href', 'target']
      ).freeze

      # Accept the node (tag and content) when its name is allowed, after
      # trimming its attributes to the permitted names; otherwise skip the
      # node entirely.
      def filter(node)
        permitted = ALLOWED_TAGS[node.name]

        return node.skip! unless permitted

        node.limit_attributes(permitted)
        node.accept!
      end

      # Doctypes are never written to the output.
      def doctype(string); end

      # Processing instructions are never written to the output.
      def instruction(string); end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Released under the MIT License.
4
+ # Copyright, 2018-2024, by Samuel Williams.
5
+
6
+ require_relative 'filter'
7
+
8
module XRB
  module Sanitize
    # A filter that reduces markup to its text: no tag markup is written,
    # and doctypes, comments, instructions and CDATA sections are dropped.
    class Text < Filter
      # Text emitted after the closing tag of the named elements.
      CLOSING = {
        "p" => "\n\n",
        "div" => "\n\n",
      }

      # A `br` contributes a blank line (subject to the skip mask the node
      # inherited). `script` nodes are skipped completely, including their
      # content; every other node only has its tag markup skipped.
      def filter(node)
        text("\n\n") if node.name == "br"

        node.skip!(node.name == 'script' ? ALL : TAG)
      end

      # Close the tag as usual, then append the separator configured for it
      # in CLOSING, if there is one.
      def close_tag(name, offset = nil)
        super

        separator = CLOSING[name]
        text(separator) if separator
      end

      # Doctypes are never written to the output.
      def doctype(string); end

      # Comments are never written to the output.
      def comment(string); end

      # Processing instructions are never written to the output.
      def instruction(string); end

      # CDATA sections are never written to the output.
      def cdata(string); end
    end
  end
end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Released under the MIT License.
4
+ # Copyright, 2018-2024, by Samuel Williams.
5
+
6
+ module XRB
7
+ module Sanitize
8
+ VERSION = "0.7.0" # The version string of this release of the gem.
9
+ end
10
+ end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Released under the MIT License.
4
+ # Copyright, 2024, by Samuel Williams.
5
+
6
+ require_relative 'sanitize/text'
7
+ require_relative 'sanitize/fragment'
data/license.md ADDED
@@ -0,0 +1,21 @@
1
+ # MIT License
2
+
3
+ Copyright, 2018-2024, by Samuel Williams.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/readme.md ADDED
@@ -0,0 +1,49 @@
1
+ # XRB::Sanitize
2
+
3
+ Sanitize markup by adding, changing or removing tags, using the [xrb](https://github.com/ioquatix/xrb) stream processor (which has a native C implementation).
4
+
5
+ [![Development Status](https://github.com/socketry/xrb-sanitize/workflows/Test/badge.svg)](https://github.com/socketry/xrb-sanitize/actions?workflow=Test)
6
+
7
+ ## Motivation
8
+
9
+ I use the [sanitize](https://github.com/rgrove/sanitize/) gem and generally it's great. However, its performance can be an issue and, additionally, it doesn't preserve tag namespaces when parsing fragments due to how Nokogiri works internally. This is a problem when processing content destined for [utopia](https://github.com/ioquatix/utopia) since it heavily depends on tag namespaces.
10
+
11
+ ## Is it fast?
12
+
13
+ In my informal testing, this gem is about 18x faster than the [sanitize](https://github.com/rgrove/sanitize/) gem when generating plain text (see the benchmark output below).
14
+
15
+ ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [x86_64-linux]
16
+ Warming up --------------------------------------
17
+ Sanitize 438.000 i/100ms
18
+ XRB::Sanitize 7.935k i/100ms
19
+ Calculating -------------------------------------
20
+ Sanitize 4.365k (± 0.1%) i/s - 21.900k in 5.017157s
21
+ XRB::Sanitize 78.670k (± 0.1%) i/s - 396.750k in 5.043233s
22
+
23
+ Comparison:
24
+ XRB::Sanitize: 78669.9 i/s
25
+ Sanitize: 4365.0 i/s - 18.02x slower
26
+
27
+ ## Usage
28
+
29
+ Please see the [project documentation](https://socketry.github.io/xrb-sanitize/) for more details.
30
+
31
+ - [Getting Started](https://socketry.github.io/xrb-sanitize/guides/getting-started/index) - This guide explains how to get started with the `XRB::Sanitize` gem.
32
+
33
+ ## Contributing
34
+
35
+ We welcome contributions to this project.
36
+
37
+ 1. Fork it.
38
+ 2. Create your feature branch (`git checkout -b my-new-feature`).
39
+ 3. Commit your changes (`git commit -am 'Add some feature'`).
40
+ 4. Push to the branch (`git push origin my-new-feature`).
41
+ 5. Create new Pull Request.
42
+
43
+ ### Developer Certificate of Origin
44
+
45
+ This project uses the [Developer Certificate of Origin](https://developercertificate.org/). All contributors to this project must agree to this document to have their contributions accepted.
46
+
47
+ ### Contributor Covenant
48
+
49
+ This project is governed by the [Contributor Covenant](https://www.contributor-covenant.org/). All contributors and participants agree to abide by its terms.
data.tar.gz.sig ADDED
Binary file
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: xrb-sanitize
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.7.0
5
+ platform: ruby
6
+ authors:
7
+ - Samuel Williams
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain:
11
+ - |
12
+ -----BEGIN CERTIFICATE-----
13
+ MIIE2DCCA0CgAwIBAgIBATANBgkqhkiG9w0BAQsFADBhMRgwFgYDVQQDDA9zYW11
14
+ ZWwud2lsbGlhbXMxHTAbBgoJkiaJk/IsZAEZFg1vcmlvbnRyYW5zZmVyMRIwEAYK
15
+ CZImiZPyLGQBGRYCY28xEjAQBgoJkiaJk/IsZAEZFgJuejAeFw0yMjA4MDYwNDUz
16
+ MjRaFw0zMjA4MDMwNDUzMjRaMGExGDAWBgNVBAMMD3NhbXVlbC53aWxsaWFtczEd
17
+ MBsGCgmSJomT8ixkARkWDW9yaW9udHJhbnNmZXIxEjAQBgoJkiaJk/IsZAEZFgJj
18
+ bzESMBAGCgmSJomT8ixkARkWAm56MIIBojANBgkqhkiG9w0BAQEFAAOCAY8AMIIB
19
+ igKCAYEAomvSopQXQ24+9DBB6I6jxRI2auu3VVb4nOjmmHq7XWM4u3HL+pni63X2
20
+ 9qZdoq9xt7H+RPbwL28LDpDNflYQXoOhoVhQ37Pjn9YDjl8/4/9xa9+NUpl9XDIW
21
+ sGkaOY0eqsQm1pEWkHJr3zn/fxoKPZPfaJOglovdxf7dgsHz67Xgd/ka+Wo1YqoE
22
+ e5AUKRwUuvaUaumAKgPH+4E4oiLXI4T1Ff5Q7xxv6yXvHuYtlMHhYfgNn8iiW8WN
23
+ XibYXPNP7NtieSQqwR/xM6IRSoyXKuS+ZNGDPUUGk8RoiV/xvVN4LrVm9upSc0ss
24
+ RZ6qwOQmXCo/lLcDUxJAgG95cPw//sI00tZan75VgsGzSWAOdjQpFM0l4dxvKwHn
25
+ tUeT3ZsAgt0JnGqNm2Bkz81kG4A2hSyFZTFA8vZGhp+hz+8Q573tAR89y9YJBdYM
26
+ zp0FM4zwMNEUwgfRzv1tEVVUEXmoFCyhzonUUw4nE4CFu/sE3ffhjKcXcY//qiSW
27
+ xm4erY3XAgMBAAGjgZowgZcwCQYDVR0TBAIwADALBgNVHQ8EBAMCBLAwHQYDVR0O
28
+ BBYEFO9t7XWuFf2SKLmuijgqR4sGDlRsMC4GA1UdEQQnMCWBI3NhbXVlbC53aWxs
29
+ aWFtc0BvcmlvbnRyYW5zZmVyLmNvLm56MC4GA1UdEgQnMCWBI3NhbXVlbC53aWxs
30
+ aWFtc0BvcmlvbnRyYW5zZmVyLmNvLm56MA0GCSqGSIb3DQEBCwUAA4IBgQB5sxkE
31
+ cBsSYwK6fYpM+hA5B5yZY2+L0Z+27jF1pWGgbhPH8/FjjBLVn+VFok3CDpRqwXCl
32
+ xCO40JEkKdznNy2avOMra6PFiQyOE74kCtv7P+Fdc+FhgqI5lMon6tt9rNeXmnW/
33
+ c1NaMRdxy999hmRGzUSFjozcCwxpy/LwabxtdXwXgSay4mQ32EDjqR1TixS1+smp
34
+ 8C/NCWgpIfzpHGJsjvmH2wAfKtTTqB9CVKLCWEnCHyCaRVuKkrKjqhYCdmMBqCws
35
+ JkxfQWC+jBVeG9ZtPhQgZpfhvh+6hMhraUYRQ6XGyvBqEUe+yo6DKIT3MtGE2+CP
36
+ eX9i9ZWBydWb8/rvmwmX2kkcBbX0hZS1rcR593hGc61JR6lvkGYQ2MYskBveyaxt
37
+ Q2K9NVun/S785AP05vKkXZEFYxqG6EW012U4oLcFl5MySFajYXRYbuUpH6AY+HP8
38
+ voD0MPg1DssDLKwXyt1eKD/+Fq0bFWhwVM/1XiAXL7lyYUyOq24KHgQ2Csg=
39
+ -----END CERTIFICATE-----
40
+ date: 2024-04-28 00:00:00.000000000 Z
41
+ dependencies:
42
+ - !ruby/object:Gem::Dependency
43
+ name: xrb
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '0.3'
49
+ type: :runtime
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '0.3'
56
+ description:
57
+ email:
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - lib/xrb/sanitize.rb
63
+ - lib/xrb/sanitize/filter.rb
64
+ - lib/xrb/sanitize/fragment.rb
65
+ - lib/xrb/sanitize/text.rb
66
+ - lib/xrb/sanitize/version.rb
67
+ - license.md
68
+ - readme.md
69
+ homepage: https://github.com/ioquatix/xrb-sanitize
70
+ licenses:
71
+ - MIT
72
+ metadata:
73
+ documentation_uri: https://socketry.github.io/xrb-sanitize/
74
+ funding_uri: https://github.com/sponsors/ioquatix/
75
+ source_code_uri: https://github.com/ioquatix/xrb-sanitize.git
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: '3.1'
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ requirements: []
91
+ rubygems_version: 3.5.3
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: Sanitize markdown according to a set of rules.
95
+ test_files: []
metadata.gz.sig ADDED
Binary file