xrb-sanitize 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: dfcb33d4fedbdd7120af3a40c693f1a2964e9d838d0578cf34e1f695aefbecc3
4
+ data.tar.gz: fc0887981944741f22ae2d5cafb7702723affd96238d83277a12fb5cd868d991
5
+ SHA512:
6
+ metadata.gz: 507a7104250bdb9c9519df83b01040e0e3e63daf51644b2b272daa332a9086822dd297bd05b02c2691da2ba43117d92ff2a0a6b299e8f73305ef1f4ee414c925
7
+ data.tar.gz: d8eff16de230db5f695e7708246ddd7c7cb94a92deddde76df0dd64dec6350861b1bc1d4a46d7d53eb5410627f51834feff5ee7b81076fa957f61b823468bbdb
checksums.yaml.gz.sig ADDED
@@ -0,0 +1,2 @@
1
+ r}M�&,f���������lPkf�u�)�����,�CVIQ'�S ԍ���:�
2
+ X&F���#6C� �Q����e����sy��?,�
@@ -0,0 +1,163 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Released under the MIT License.
4
+ # Copyright, 2018-2024, by Samuel Williams.
5
+
6
+ require 'xrb/parsers'
7
+ require 'xrb/builder'
8
+ require 'xrb/entities'
9
+
10
module XRB
  module Sanitize
    # Provides a high level interface for parsing markup.
    #
    # An instance acts as the delegate passed to `XRB::Parsers.parse_markup`: it
    # receives the parser events (`open_tag_begin`, `attribute`, `open_tag_end`,
    # `close_tag`, `doctype`, `comment`, `instruction`, `cdata`, `text`) and
    # re-emits each of them to `output` unless the current node is marked to
    # skip that kind of output. Subclasses override {#filter} to set those marks.
    class Filter
      # Skip flag for the opening/closing tag of a node.
      TAG = 1

      # Skip flags for each kind of non-tag content.
      DOCTYPE = 2
      COMMENT = 4
      INSTRUCTION = 8
      CDATA = 16
      TEXT = 32

      # Every kind of non-tag content.
      CONTENT = DOCTYPE | COMMENT | INSTRUCTION | CDATA | TEXT
      # The tag and every kind of content.
      ALL = TAG | CONTENT

      # Build a filter, run it over `input` and return it.
      #
      # @param input the markup to process; passed through `XRB::Buffer(...)`.
      # @param output the object that receives the filtered markup; when nil, a
      #   new `MarkupString` with the same encoding as the input is used.
      # @param entities the entity table handed to the parser.
      # @return [Filter] the filter instance; the result is available via `#output`.
      def self.parse(input, output = nil, entities = XRB::Entities::HTML5)
        # This allows us to handle passing in a string:
        input = XRB::Buffer(input)

        output ||= MarkupString.new.force_encoding(input.encoding)

        delegate = new(output, entities)
        delegate.parse!(input)

        delegate
      end

      # One element on the open-tag stack.
      #
      # `name` is the tag name, `tag` the tag object being built (nil for the
      # root node) and `skip` a bit mask of the flags defined on {Filter}.
      Node = Struct.new(:name, :tag, :skip) do
        # Mark the given kinds of output as skipped.
        def skip!(mode = ALL)
          self.skip |= mode
        end

        # Whether every kind of output in `mode` is skipped.
        def skip?(mode = ALL)
          (self.skip & mode) == mode
        end

        # Clear the skip marks for the given kinds of output.
        def accept!(mode = ALL)
          self.skip &= ~mode
        end

        # Look up an attribute of the tag.
        #
        # `&.` only guards a single call, so both links are guarded here;
        # otherwise a node without a tag (such as the root node) would raise
        # NoMethodError on `nil[key]`.
        #
        # @return the attribute value, or nil when there is no tag/attribute.
        def [](key)
          self.tag&.attributes&.[](key)
        end

        # Remove, in place, every attribute whose key is not included in `keys`.
        def limit_attributes(keys)
          self.tag&.attributes&.select! { |key, _value| keys.include?(key) }
        end
      end

      # @param output the object that receives the filtered markup (via `<<`).
      # @param entities the entity table handed to the parser in {#parse!}.
      def initialize(output, entities)
        @output = output
        @entities = entities

        @stack = []

        # The root node has no name and no tag and skips nothing.
        @current = @top = Node.new(nil, nil, 0)
      end

      attr_reader :output

      # The current node being parsed.
      attr_reader :current

      attr_reader :stack

      # The innermost open node, or the root node when no tag is open.
      def top
        @stack.last || @top
      end

      # Parse `input`, feeding the parser events to this filter.
      #
      # @return [Filter] self.
      def parse!(input)
        parse_begin

        XRB::Parsers.parse_markup(input, self, @entities)

        parse_end

        self
      end

      # Hook invoked before parsing starts; does nothing by default.
      def parse_begin
      end

      # Hook invoked after parsing finishes: closes every tag that is still open.
      #
      # The stack starts out empty, so it has to be drained completely; stopping
      # at one remaining entry would leave the outermost unclosed tag open.
      def parse_end
        close_tag(@stack.last.name) until @stack.empty?
      end

      # Parser event: an opening tag has started.
      #
      # The new node inherits the skip marks of the node that was current.
      def open_tag_begin(name, offset)
        tag = Tag.new(name, false, {})

        @current = Node.new(name, tag, current.skip)
      end

      # Parser event: an attribute of the tag currently being opened.
      def attribute(key, value)
        @current.tag.attributes[key] = value
      end

      # Parser event: the opening tag is complete.
      #
      # Runs {#filter} on the node and writes the opening tag unless the node
      # skips `TAG`. Tags that are not self-closing are pushed on the stack.
      def open_tag_end(self_closing)
        if self_closing
          @current.tag.closed = true
        else
          @stack << @current
        end

        filter(@current)

        @current.tag.write_opening_tag(@output) unless @current.skip?(TAG)

        # If the tag was self-closing, it's no longer current at this point, we are back in the context of the parent tag.
        @current = top if self_closing
      end

      # Parser event: a closing tag.
      #
      # Pops open nodes, writing the closing tag of each one that does not skip
      # `TAG`, until the node named `name` has been popped or the stack is empty.
      def close_tag(name, offset = nil)
        while (node = @stack.pop)
          node.tag.write_closing_tag(@output) unless node.skip?(TAG)

          break if node.name == name
        end

        @current = top
      end

      # Decide what to skip for a freshly opened node. The default accepts the
      # node unchanged; subclasses override this.
      def filter(tag)
        tag
      end

      # Parser event: a doctype declaration.
      def doctype(string)
        @output << string unless current.skip?(DOCTYPE)
      end

      # Parser event: a comment.
      def comment(string)
        @output << string unless current.skip?(COMMENT)
      end

      # Parser event: a processing instruction.
      def instruction(string)
        @output << string unless current.skip?(INSTRUCTION)
      end

      # Parser event: a CDATA section.
      def cdata(string)
        @output << string unless current.skip?(CDATA)
      end

      # Parser event: text content, appended via `Markup.append`.
      def text(string)
        Markup.append(@output, string) unless current.skip?(TEXT)
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Released under the MIT License.
4
+ # Copyright, 2018-2024, by Samuel Williams.
5
+
6
+ require_relative 'filter'
7
+
8
+ require 'set'
9
+
10
module XRB
  module Sanitize
    # A filter that keeps only the tags listed in ALLOWED_TAGS and, on each kept
    # tag, only the attributes listed for it. Doctype declarations and
    # processing instructions are always dropped.
    class Fragment < Filter
      # Attributes permitted on every allowed tag.
      STANDARD_ATTRIBUTES = Set['class', 'style'].freeze

      # Maps each allowed tag name to the set of attribute names it may keep.
      ALLOWED_TAGS = [
        *%w[
          div span br b i em strong
          ul ol li dl dt dd strike
          h1 h2 h3 h4 h5 h6 p
        ].map { |name| [name, STANDARD_ATTRIBUTES] },
        ['img', STANDARD_ATTRIBUTES + ['src', 'alt', 'width', 'height']],
        ['image', STANDARD_ATTRIBUTES],
        ['a', STANDARD_ATTRIBUTES + ['href', 'target']],
      ].to_h.freeze

      # Accept allowed tags (after trimming their attributes); mark every other
      # tag so that the tag and all of its content kinds are skipped.
      def filter(node)
        permitted = ALLOWED_TAGS[node.name]

        return node.skip! unless permitted

        node.limit_attributes(permitted)
        node.accept!
      end

      # Doctype declarations are never emitted.
      def doctype(string); end

      # Processing instructions are never emitted.
      def instruction(string); end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Released under the MIT License.
4
+ # Copyright, 2018-2024, by Samuel Williams.
5
+
6
+ require_relative 'filter'
7
+
8
module XRB
  module Sanitize
    # A filter that never writes tags: only text content is emitted, and the
    # content of `script` elements is dropped entirely.
    class Text < Filter
      # Text emitted after the closing tag of the named element.
      CLOSING = {
        "p" => "\n\n",
        "div" => "\n\n",
      }

      # Emit a blank line for `br`, then mark the node: `script` skips
      # everything including content, any other tag skips only its own tag
      # output so that its content is still emitted.
      def filter(node)
        text("\n\n") if node.name == "br"

        node.skip!(node.name == 'script' ? ALL : TAG)
      end

      # Close the tag as usual, then append the separator configured for it.
      def close_tag(name, offset = nil)
        super

        separator = CLOSING[name]
        text(separator) if separator
      end

      # Doctype declarations are never emitted.
      def doctype(string); end

      # Comments are never emitted.
      def comment(string); end

      # Processing instructions are never emitted.
      def instruction(string); end

      # CDATA sections are never emitted.
      def cdata(string); end
    end
  end
end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Released under the MIT License.
4
+ # Copyright, 2018-2024, by Samuel Williams.
5
+
6
module XRB
  module Sanitize
    # The version string of this gem.
    VERSION = '0.7.0'
  end
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Released under the MIT License.
4
+ # Copyright, 2024, by Samuel Williams.
5
+
6
+ require_relative 'sanitize/text'
7
+ require_relative 'sanitize/fragment'
data/license.md ADDED
@@ -0,0 +1,21 @@
1
+ # MIT License
2
+
3
+ Copyright, 2018-2024, by Samuel Williams.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/readme.md ADDED
@@ -0,0 +1,49 @@
1
+ # XRB::Sanitize
2
+
3
+ Sanitize markup by adding, changing or removing tags, using the [xrb](https://github.com/ioquatix/xrb) stream processor (which has a naive C implementation).
4
+
5
+ [![Development Status](https://github.com/socketry/xrb-sanitize/workflows/Test/badge.svg)](https://github.com/socketry/xrb-sanitize/actions?workflow=Test)
6
+
7
+ ## Motivation
8
+
9
+ I use the [sanitize](https://github.com/rgrove/sanitize/) gem and generally it's great. However, its performance can be an issue and, additionally, it doesn't preserve tag namespaces when parsing fragments due to how Nokogiri works internally. This is a problem when processing content destined for [utopia](https://github.com/ioquatix/utopia) since it heavily depends on tag namespaces.
10
+
11
+ ## Is it fast?
12
+
13
+ In my informal testing, this gem is about \~18x faster than the [sanitize](https://github.com/rgrove/sanitize/) gem when generating plain text (see the benchmark output below).
14
+
15
+ ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [x86_64-linux]
16
+ Warming up --------------------------------------
17
+ Sanitize 438.000 i/100ms
18
+ XRB::Sanitize 7.935k i/100ms
19
+ Calculating -------------------------------------
20
+ Sanitize 4.365k (± 0.1%) i/s - 21.900k in 5.017157s
21
+ XRB::Sanitize 78.670k (± 0.1%) i/s - 396.750k in 5.043233s
22
+
23
+ Comparison:
24
+ XRB::Sanitize: 78669.9 i/s
25
+ Sanitize: 4365.0 i/s - 18.02x slower
26
+
27
+ ## Usage
28
+
29
+ Please see the [project documentation](https://socketry.github.io/xrb-sanitize/) for more details.
30
+
31
+ - [Getting Started](https://socketry.github.io/xrb-sanitize/guides/getting-started/index) - This guide explains how to get started with the `XRB::Sanitize` gem.
32
+
33
+ ## Contributing
34
+
35
+ We welcome contributions to this project.
36
+
37
+ 1. Fork it.
38
+ 2. Create your feature branch (`git checkout -b my-new-feature`).
39
+ 3. Commit your changes (`git commit -am 'Add some feature'`).
40
+ 4. Push to the branch (`git push origin my-new-feature`).
41
+ 5. Create new Pull Request.
42
+
43
+ ### Developer Certificate of Origin
44
+
45
+ This project uses the [Developer Certificate of Origin](https://developercertificate.org/). All contributors to this project must agree to this document to have their contributions accepted.
46
+
47
+ ### Contributor Covenant
48
+
49
+ This project is governed by the [Contributor Covenant](https://www.contributor-covenant.org/). All contributors and participants agree to abide by its terms.
data.tar.gz.sig ADDED
Binary file
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: xrb-sanitize
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.7.0
5
+ platform: ruby
6
+ authors:
7
+ - Samuel Williams
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain:
11
+ - |
12
+ -----BEGIN CERTIFICATE-----
13
+ MIIE2DCCA0CgAwIBAgIBATANBgkqhkiG9w0BAQsFADBhMRgwFgYDVQQDDA9zYW11
14
+ ZWwud2lsbGlhbXMxHTAbBgoJkiaJk/IsZAEZFg1vcmlvbnRyYW5zZmVyMRIwEAYK
15
+ CZImiZPyLGQBGRYCY28xEjAQBgoJkiaJk/IsZAEZFgJuejAeFw0yMjA4MDYwNDUz
16
+ MjRaFw0zMjA4MDMwNDUzMjRaMGExGDAWBgNVBAMMD3NhbXVlbC53aWxsaWFtczEd
17
+ MBsGCgmSJomT8ixkARkWDW9yaW9udHJhbnNmZXIxEjAQBgoJkiaJk/IsZAEZFgJj
18
+ bzESMBAGCgmSJomT8ixkARkWAm56MIIBojANBgkqhkiG9w0BAQEFAAOCAY8AMIIB
19
+ igKCAYEAomvSopQXQ24+9DBB6I6jxRI2auu3VVb4nOjmmHq7XWM4u3HL+pni63X2
20
+ 9qZdoq9xt7H+RPbwL28LDpDNflYQXoOhoVhQ37Pjn9YDjl8/4/9xa9+NUpl9XDIW
21
+ sGkaOY0eqsQm1pEWkHJr3zn/fxoKPZPfaJOglovdxf7dgsHz67Xgd/ka+Wo1YqoE
22
+ e5AUKRwUuvaUaumAKgPH+4E4oiLXI4T1Ff5Q7xxv6yXvHuYtlMHhYfgNn8iiW8WN
23
+ XibYXPNP7NtieSQqwR/xM6IRSoyXKuS+ZNGDPUUGk8RoiV/xvVN4LrVm9upSc0ss
24
+ RZ6qwOQmXCo/lLcDUxJAgG95cPw//sI00tZan75VgsGzSWAOdjQpFM0l4dxvKwHn
25
+ tUeT3ZsAgt0JnGqNm2Bkz81kG4A2hSyFZTFA8vZGhp+hz+8Q573tAR89y9YJBdYM
26
+ zp0FM4zwMNEUwgfRzv1tEVVUEXmoFCyhzonUUw4nE4CFu/sE3ffhjKcXcY//qiSW
27
+ xm4erY3XAgMBAAGjgZowgZcwCQYDVR0TBAIwADALBgNVHQ8EBAMCBLAwHQYDVR0O
28
+ BBYEFO9t7XWuFf2SKLmuijgqR4sGDlRsMC4GA1UdEQQnMCWBI3NhbXVlbC53aWxs
29
+ aWFtc0BvcmlvbnRyYW5zZmVyLmNvLm56MC4GA1UdEgQnMCWBI3NhbXVlbC53aWxs
30
+ aWFtc0BvcmlvbnRyYW5zZmVyLmNvLm56MA0GCSqGSIb3DQEBCwUAA4IBgQB5sxkE
31
+ cBsSYwK6fYpM+hA5B5yZY2+L0Z+27jF1pWGgbhPH8/FjjBLVn+VFok3CDpRqwXCl
32
+ xCO40JEkKdznNy2avOMra6PFiQyOE74kCtv7P+Fdc+FhgqI5lMon6tt9rNeXmnW/
33
+ c1NaMRdxy999hmRGzUSFjozcCwxpy/LwabxtdXwXgSay4mQ32EDjqR1TixS1+smp
34
+ 8C/NCWgpIfzpHGJsjvmH2wAfKtTTqB9CVKLCWEnCHyCaRVuKkrKjqhYCdmMBqCws
35
+ JkxfQWC+jBVeG9ZtPhQgZpfhvh+6hMhraUYRQ6XGyvBqEUe+yo6DKIT3MtGE2+CP
36
+ eX9i9ZWBydWb8/rvmwmX2kkcBbX0hZS1rcR593hGc61JR6lvkGYQ2MYskBveyaxt
37
+ Q2K9NVun/S785AP05vKkXZEFYxqG6EW012U4oLcFl5MySFajYXRYbuUpH6AY+HP8
38
+ voD0MPg1DssDLKwXyt1eKD/+Fq0bFWhwVM/1XiAXL7lyYUyOq24KHgQ2Csg=
39
+ -----END CERTIFICATE-----
40
+ date: 2024-04-28 00:00:00.000000000 Z
41
+ dependencies:
42
+ - !ruby/object:Gem::Dependency
43
+ name: xrb
44
+ requirement: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - "~>"
47
+ - !ruby/object:Gem::Version
48
+ version: '0.3'
49
+ type: :runtime
50
+ prerelease: false
51
+ version_requirements: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - "~>"
54
+ - !ruby/object:Gem::Version
55
+ version: '0.3'
56
+ description:
57
+ email:
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - lib/xrb/sanitize.rb
63
+ - lib/xrb/sanitize/filter.rb
64
+ - lib/xrb/sanitize/fragment.rb
65
+ - lib/xrb/sanitize/text.rb
66
+ - lib/xrb/sanitize/version.rb
67
+ - license.md
68
+ - readme.md
69
+ homepage: https://github.com/ioquatix/xrb-sanitize
70
+ licenses:
71
+ - MIT
72
+ metadata:
73
+ documentation_uri: https://socketry.github.io/xrb-sanitize/
74
+ funding_uri: https://github.com/sponsors/ioquatix/
75
+ source_code_uri: https://github.com/ioquatix/xrb-sanitize.git
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: '3.1'
85
+ required_rubygems_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ requirements: []
91
+ rubygems_version: 3.5.3
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: Sanitize markdown according to a set of rules.
95
+ test_files: []
metadata.gz.sig ADDED
Binary file