escape_escape_escape 0.3.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -11
- data/VERSION +1 -1
- data/escape_escape_escape.gemspec +5 -2
- data/lib/escape_escape_escape.rb +219 -47
- data/specs/as_ruby/0001-html.rb +60 -0
- data/specs/as_ruby/0002-decode_html.rb +13 -0
- data/specs/as_ruby/0003-css_attr.rb +10 -0
- data/specs/as_ruby/0003-css_selector.rb +12 -0
- data/specs/as_ruby/0003-css_value.rb +53 -0
- data/specs/as_ruby/0004-==.rb +5 -0
- data/specs/as_ruby/0020-href.rb +118 -0
- data/specs/as_ruby/0030-clean_utf8.rb +34 -0
- data/specs/as_ruby/0040-escape.rb +41 -0
- data/specs/escape_escape_escape.rb +133 -21
- data/specs/lib/helpers.rb +1 -0
- metadata +61 -23
- data/LICENSE.txt +0 -23
- data/lib/beta.rb +0 -270
- data/lib/e_e_e.js +0 -258
- data/package.json +0 -31
- data/specs/as_json/0001-html.json +0 -23
- data/specs/as_json/0002-inner_html.json +0 -16
- data/specs/as_json/0010-text.json +0 -29
- data/specs/helpers.rb +0 -4
- data/test/sanitize_attrs.js +0 -132
- data/test/sanitize_html.js +0 -57
- data/test/sanitize_un_escape.js +0 -41
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 86100684d36a9aff31d78415463e2a3c357fe646
|
4
|
+
data.tar.gz: 77b8c43cc053204953f747630c7fdd5938034ec5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7705788caaf5f6c4996b5381c1b3d2e09d390a7ab5a795a0aa8d32ccbcdec772942caf3c433d06aa3145d36244b617f545193657ee8493582011d94b48ffeec5
|
7
|
+
data.tar.gz: 5a4785d7e96190194040cf9d9c7e766b1095ae742327d0e9b3486a1fcdd89e85c19cfb576486b3afad59e6cb7b3350e61ae7ebeb77d35a4568296a9bb0ef02e9
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
Escape_Escape_Escape
|
2
2
|
====================
|
3
3
|
|
4
4
|
My way of escaping and sanitizing HTML.
|
@@ -6,20 +6,15 @@ This is very personal to me, so you won't
|
|
6
6
|
find it useful or flexible to meet your needs.
|
7
7
|
|
8
8
|
|
9
|
-
NPM Use:
|
10
|
-
=====================
|
11
|
-
|
12
|
-
|
13
|
-
// npm install escape_escape_escape
|
14
|
-
|
15
|
-
var E = require("escape_escape_escape").Sanitize.html;
|
16
|
-
E("The <strong>brave</strong> and the <b>bold</b>.");
|
17
|
-
|
18
9
|
Rubygems Use:
|
19
10
|
=====================
|
20
11
|
|
21
|
-
|
22
12
|
# gem install escape_escape_escape
|
23
13
|
|
24
14
|
Escape_Escape_Escape.html my_html_string
|
25
15
|
Escape_Escape_Escape.text my_text_string
|
16
|
+
|
17
|
+
NOTE: Node and NPM Use:
|
18
|
+
=====================
|
19
|
+
|
20
|
+
This is no longer an npm module.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
1.1.0
|
@@ -21,8 +21,10 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
22
22
|
spec.require_paths = ["lib"]
|
23
23
|
|
24
|
-
spec.
|
25
|
-
spec.
|
24
|
+
spec.add_runtime_dependency "addressable" , "> 2.3.5"
|
25
|
+
spec.add_runtime_dependency "escape_utils" , "> 1.0.0"
|
26
|
+
spec.add_runtime_dependency "unf" , "> 0.1.3"
|
27
|
+
spec.add_runtime_dependency "htmlentities" , ">= 4.3.2"
|
26
28
|
|
27
29
|
spec.add_development_dependency "pry" , ">= 0.9"
|
28
30
|
spec.add_development_dependency "rake" , ">= 10.3"
|
@@ -30,4 +32,5 @@ Gem::Specification.new do |spec|
|
|
30
32
|
spec.add_development_dependency "bacon" , ">= 1.0"
|
31
33
|
spec.add_development_dependency "Bacon_Colored" , ">= 0.1"
|
32
34
|
spec.add_development_dependency "multi_json" , ">= 1.10"
|
35
|
+
spec.add_development_dependency "sanitize" , ">= 3.0.1"
|
33
36
|
end
|
data/lib/escape_escape_escape.rb
CHANGED
@@ -1,31 +1,95 @@
|
|
1
1
|
|
2
|
+
require 'unf'
|
2
3
|
|
3
|
-
require "
|
4
|
+
require "escape_utils"
|
5
|
+
|
6
|
+
require 'escape_utils/html/rack' # to patch Rack::Utils
|
7
|
+
require 'escape_utils/html/erb' # to patch ERB::Util
|
8
|
+
require 'escape_utils/html/cgi' # to patch CGI
|
9
|
+
require 'escape_utils/html/haml' # to patch Haml::Helpers
|
10
|
+
|
11
|
+
require 'escape_utils/url/cgi' # to patch CGI
|
12
|
+
require 'escape_utils/url/erb' # to patch ERB::Util
|
13
|
+
require 'escape_utils/url/rack' # to patch Rack::Utils
|
14
|
+
require 'escape_utils/url/uri' # to patch URI
|
15
|
+
|
16
|
+
# ======================
|
4
17
|
require "htmlentities"
|
18
|
+
# ======================
|
19
|
+
#
|
20
|
+
require "uri"
|
21
|
+
require 'cgi' # Don't use URI.escape because it does not escape all invalid characters.
|
22
|
+
require "addressable/uri"
|
23
|
+
# ======================
|
24
|
+
|
25
|
+
def Escape_Escape_Escape s
|
26
|
+
Escape_Escape_Escape.escape(s)
|
27
|
+
end
|
5
28
|
|
6
29
|
class Escape_Escape_Escape
|
7
30
|
|
8
|
-
|
31
|
+
# === From sanitize gem:
|
32
|
+
# https://raw.githubusercontent.com/rgrove/sanitize/master/lib/sanitize.rb
|
33
|
+
REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
|
34
|
+
# ==================================================================================
|
35
|
+
|
36
|
+
CODER = HTMLEntities.new(:xhtml1)
|
37
|
+
|
38
|
+
Invalid = Class.new(RuntimeError)
|
39
|
+
Invalid_HREF = Class.new(RuntimeError)
|
40
|
+
|
41
|
+
Invalid_Type = Class.new(RuntimeError)
|
42
|
+
|
43
|
+
TAG_PATTERN = /\A[a-z]([a-z0-9\_]{0,}[a-z]{1,})?\z/i
|
44
|
+
|
45
|
+
VALID_CSS_VALUE = /\A[a-z0-9\;\-\_\#\ ]+\z/i
|
46
|
+
VALID_CSS_SELECTOR = /\A[a-z0-9\#\:\_\-\.\ ]+\z/i
|
47
|
+
VALID_CSS_ATTR = /\A[a-z0-9-]+\z/i
|
9
48
|
|
10
|
-
REPEATING_DOTS = /\.{1,}\//
|
11
49
|
INVALID_FILE_NAME_CHARS = /[^a-z0-9\_\.]{1,}/i
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
50
|
+
|
51
|
+
TABS = /\t*/
|
52
|
+
TAB = "\t"
|
53
|
+
HTML_TAB = "	"
|
54
|
+
TWO_SPACES = ' '
|
55
|
+
BLANK = ''
|
56
|
+
SPACE = ' '
|
57
|
+
|
58
|
+
NL = "\n";
|
59
|
+
SPACES = /\ +/;
|
60
|
+
|
61
|
+
VALID_HTML_ID = /\A[0-9a-z_]+\z/i;
|
62
|
+
VALID_HTML_TAG = /\A[0-9a-z_]+\z/i;
|
63
|
+
|
64
|
+
REPEATING_DOTS = /\.{1,}/
|
65
|
+
|
66
|
+
# === MULTI_CONTROL_AND_UNPRINTABLE: ========================
|
67
|
+
#
|
68
|
+
# Unicode whitespaces, like 160 codepoint, tabs, etc.
|
69
|
+
# Excludes newline.
|
70
|
+
#
|
71
|
+
# Examples:
|
72
|
+
# \r\n \r\n -> \n \n
|
73
|
+
#
|
74
|
+
# NOTE: Don't use "\x20" because that is the space character.
|
75
|
+
#
|
76
|
+
# Whitespace regex ([:space:]) from:
|
77
|
+
# http://www.rubyinside.com/the-split-is-not-enough-whitespace-shenigans-for-rubyists-5980.html
|
78
|
+
#
|
79
|
+
# =====================================================
|
80
|
+
MULTI_CONTROL_AND_UNPRINTABLE = /[[:space:][:cntrl:]\x00-\x1f&&[^\n\ [:print:]]]+/i
|
81
|
+
# =====================================================
|
82
|
+
|
83
|
+
ENCODING_OPTIONS_CLEAN_UTF8 = {
|
84
|
+
:invalid => :replace, # Replace invalid byte sequences
|
85
|
+
:undef => :replace, # Replace anything not defined in ASCII
|
86
|
+
:replace => '' # Use a blank for those replacements
|
87
|
+
# :universal_newline => true # Always break lines with \n, not \r\n
|
88
|
+
# -- this is not working with :replace, so it has to be done manually
|
89
|
+
# with .gsub
|
90
|
+
}
|
91
|
+
|
92
|
+
CONFIG = {
|
29
93
|
:protocols => {
|
30
94
|
"a"=>{
|
31
95
|
"href"=>["ftp", "http", "https", "mailto", :relative]
|
@@ -36,51 +100,159 @@ class Escape_Escape_Escape
|
|
36
100
|
}
|
37
101
|
}
|
38
102
|
|
39
|
-
|
40
|
-
:invalid => :replace, # Replace invalid byte sequences
|
41
|
-
:undef => :replace, # Replace anything not defined in ASCII
|
42
|
-
:replace => '' # Use a blank for those replacements
|
43
|
-
# :newline => :universal
|
44
|
-
# :universal_newline => true # Always break lines with \n, not \r\n
|
45
|
-
}
|
46
|
-
|
103
|
+
class << self # ======================================================
|
47
104
|
|
105
|
+
def regexp str
|
106
|
+
@regexp_opts ||= Regexp::FIXEDENCODING | Regexp::IGNORECASE
|
107
|
+
Regexp.new(clean_utf8(str), @regexp_opts)
|
108
|
+
end
|
48
109
|
|
49
|
-
|
110
|
+
# ===============================================
|
111
|
+
# Raises: TZInfo::InvalidTimezoneIdentifier.
|
112
|
+
# ===============================================
|
113
|
+
def validate_timezone(timezone)
|
114
|
+
TZInfo::Timezone.get( timezone.to_s.strip ).identifier
|
115
|
+
end
|
50
116
|
|
51
|
-
#
|
117
|
+
# ==================================================================
|
118
|
+
# * normalized to :KC
|
119
|
+
# * "\r\n" changed to "\n"
|
120
|
+
# * all control characters stripped except for "\n"
|
121
|
+
# and end.
|
122
|
+
# Normalization, then strip:
|
123
|
+
# http://msdn.microsoft.com/en-us/library/dd374126(v=vs.85).aspx
|
124
|
+
# http://www.unicode.org/faq/normalization.html
|
125
|
+
#
|
126
|
+
# Getting rid of non-ascii characters in ruby:
|
52
127
|
# http://stackoverflow.com/questions/1268289/how-to-get-rid-of-non-ascii-characters-in-ruby
|
53
128
|
#
|
54
129
|
# Test:
|
55
130
|
# [160, 160,64, 116, 119, 101, 108, 108, 121, 109, 101, 160, 102, 105, 108, 109].
|
56
131
|
# inject('', :<<)
|
57
132
|
#
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
133
|
+
# Options:
|
134
|
+
#
|
135
|
+
# :tabs   -- keep tab characters in the result instead of replacing them
|
136
|
+
# :spaces -- skip the final strip of leading/trailing whitespace
|
137
|
+
#
|
138
|
+
def clean_utf8 raw_s, *opts
|
139
|
+
|
140
|
+
fail("Not a string: #{raw_s.inspect}") unless raw_s.is_a?(String)
|
141
|
+
|
142
|
+
# === Check options. ==================================================================
|
143
|
+
@plaintext_allowed_options ||= [ :spaces, :tabs ]
|
144
|
+
invalid_opts = opts - @plaintext_allowed_options
|
145
|
+
fail(ArgumentError, "INVALID OPTION: #{invalid_opts.inspect}" ) if !invalid_opts.empty?
|
146
|
+
# =====================================================================================
|
147
|
+
|
148
|
+
raw_s = raw_s.dup
|
149
|
+
|
150
|
+
# === Save tabs if requested.
|
151
|
+
raw_s.gsub!(TAB, HTML_TAB) if opts.include?(:tabs)
|
152
|
+
|
153
|
+
raw_s.encode!(Encoding.find('utf-8') , ENCODING_OPTIONS_CLEAN_UTF8)
|
154
|
+
raw_s.scrub!
|
155
|
+
raw_s.gsub!(TAB , TWO_SPACES)
|
156
|
+
raw_s.gsub!(MULTI_CONTROL_AND_UNPRINTABLE , BLANK)
|
157
|
+
raw_s.gsub!(REGEX_UNSUITABLE_CHARS , ' ')
|
158
|
+
|
159
|
+
clean = raw_s.to_nfkc
|
160
|
+
|
161
|
+
# Save whitespace or strip.
|
162
|
+
if !opts.include?(:spaces)
|
163
|
+
clean.strip!
|
164
|
+
end
|
165
|
+
|
166
|
+
# Put back tabs by request.
|
167
|
+
if opts.include?(:tabs)
|
168
|
+
clean.gsub!(HTML_TAB, TAB)
|
169
|
+
end
|
67
170
|
|
68
|
-
|
69
|
-
clean_utf8 s
|
171
|
+
clean
|
70
172
|
end
|
71
173
|
|
72
|
-
|
73
|
-
|
174
|
+
# ===============================================
|
175
|
+
#
|
176
|
+
# Handles urls and relative paths.
|
177
|
+
#
|
178
|
+
# Inspired from:
|
179
|
+
# http://stackoverflow.com/a/13041565
|
180
|
+
#
|
181
|
+
# ===============================================
|
182
|
+
alias_method :path, def href raw_str
|
183
|
+
fail("Not a string: #{raw_str.inspect}") unless raw_str.is_a?(String)
|
184
|
+
|
185
|
+
begin
|
186
|
+
uri = URI.parse(decode_html(raw_str))
|
187
|
+
if uri.scheme
|
188
|
+
uri.scheme = uri.scheme.to_s.strip.downcase
|
189
|
+
end
|
190
|
+
|
191
|
+
fail( Invalid_HREF, "javascript:// is not allowed" ) if (uri.scheme || ''.freeze)['javascript'.freeze]
|
192
|
+
fail( Invalid_HREF, "address is invalid") if !uri.host && !uri.relative?
|
193
|
+
|
194
|
+
html(EscapeUtils.escape_uri uri.to_s)
|
195
|
+
rescue URI::InvalidURIError => e
|
196
|
+
raise Invalid_HREF, e.message
|
197
|
+
end
|
74
198
|
end
|
75
199
|
|
76
|
-
|
77
|
-
|
200
|
+
# ===============================================
|
201
|
+
# HTML
|
202
|
+
# ===============================================
|
203
|
+
|
204
|
+
def tag( raw_tag )
|
205
|
+
return nil unless raw_tag[TAG_PATTERN]
|
206
|
+
raw_tag
|
78
207
|
end
|
79
208
|
|
80
|
-
def
|
81
|
-
|
209
|
+
def decode_html raw
|
210
|
+
fail("Not a string: #{raw.inspect}") unless raw.is_a?(String)
|
211
|
+
CODER.decode clean_utf8(raw)
|
82
212
|
end
|
83
213
|
|
214
|
+
%w{attr selector value}.each { |name|
|
215
|
+
eval <<-EOF, nil, __FILE__, __LINE__ + 1
|
216
|
+
def css_#{name} raw
|
217
|
+
fail(Invalid_Type, "Not a string: \#{raw.inspect}") unless raw.is_a?(String)
|
218
|
+
clean = html(raw)
|
219
|
+
return clean if clean[VALID_CSS_#{name.upcase}]
|
220
|
+
fail Invalid, "contains invalid chars: \#{raw.inspect}"
|
221
|
+
end
|
222
|
+
EOF
|
223
|
+
}
|
224
|
+
|
225
|
+
# ===============================================
|
226
|
+
# A better alternative than "Rack::Utils.escape_html". Escapes
|
227
|
+
# various characters (including '&', '<', '>', and both quotation mark types)
|
228
|
+
# to HTML decimal entities. Also escapes the characters from
|
229
|
+
# <HTML_ESCAPE_TABLE>.
|
230
|
+
#
|
231
|
+
# Text has to be UTF-8 before encoding, according to HTMLEntities gem.
|
232
|
+
# Therefore, all text is run through <decode_html> (which calls <clean_utf8>) before encoding.
|
233
|
+
# ===============================================
|
234
|
+
def html( raw_text )
|
235
|
+
EscapeUtils.escape_html(decode_html(raw_text))
|
236
|
+
end # === def html
|
237
|
+
|
238
|
+
def escape o, method_name = :html
|
239
|
+
if o.kind_of? Hash
|
240
|
+
return(
|
241
|
+
o.inject({}) { |memo, (k, v)|
|
242
|
+
memo[escape(k,method_name)] = escape(v, method_name)
|
243
|
+
memo
|
244
|
+
}
|
245
|
+
)
|
246
|
+
end
|
247
|
+
|
248
|
+
return(send(method_name, o.to_s).to_sym) if o.is_a?(Symbol)
|
249
|
+
return(o.map { |v| escape(v, method_name) }) if o.kind_of? Array
|
250
|
+
return send(method_name, o) if o.is_a?(String)
|
251
|
+
return send(method_name, o.to_s) if o == true || o == false || o.kind_of?(Numeric)
|
252
|
+
|
253
|
+
fail Invalid, "Not a String, Number, Array, or Hash"
|
254
|
+
end # === def
|
255
|
+
|
84
256
|
end # === class self ===
|
85
257
|
|
86
258
|
end # === class Escape_Escape_Escape ===
|
@@ -0,0 +1,60 @@
|
|
1
|
+
it "does not re-escape already escaped html"
|
2
|
+
input "<p>Hello & GoodBye</p>"
|
3
|
+
output "<p>Hello & GoodBye</p>"
|
4
|
+
|
5
|
+
it "normalizes UNICODE: Ⅷ => VIII"
|
6
|
+
input "<p> Ⅷ </p>"
|
7
|
+
output "<p> VIII </p>"
|
8
|
+
|
9
|
+
it "normalizes UNICODE: \u2167 => VIII"
|
10
|
+
input "<p> \u2167 </p>"
|
11
|
+
output "<p> VIII </p>"
|
12
|
+
|
13
|
+
it "encodes apostrophe: ' -> '"
|
14
|
+
input "Chars: ' '"
|
15
|
+
output "Chars: ' '"
|
16
|
+
|
17
|
+
it 'does not re-escape already escaped text mixed with HTML'
|
18
|
+
input "<p>Hi</p><p>Hi</p>"
|
19
|
+
output "<p>Hi</p><p>Hi</p>"
|
20
|
+
|
21
|
+
it 'does not escape special chars: "Hello ©®∆"'
|
22
|
+
input "Hello & World ©®∆"
|
23
|
+
output "Hello & World ©®∆"
|
24
|
+
|
25
|
+
it 'escapes all 70 different combos of "<"'
|
26
|
+
input BRACKETS
|
27
|
+
stack [:split, :uniq, :join, [' '], "< %3C &lt &LT &LT; &#60 &#060 &#0060 &#00060 &#000060 &#0000060 &#x3c &#x03c &#x003c &#x0003c &#x00003c &#x000003c &#x000003c; &#X3c &#X03c &#X003c &#X0003c &#X00003c &#X000003c &#X000003c; &#x3C &#x03C &#x003C &#x0003C &#x00003C &#x000003C &#x000003C; &#X3C &#X03C &#X003C &#X0003C &#X00003C &#X000003C &#X000003C;"]
|
28
|
+
|
29
|
+
|
30
|
+
it "fails with RuntimeError if: true"
|
31
|
+
input true
|
32
|
+
raises RuntimeError, /Not a string: true/
|
33
|
+
|
34
|
+
|
35
|
+
it "fails with RuntimeError if: false"
|
36
|
+
input false
|
37
|
+
raises RuntimeError, /Not a string: false/
|
38
|
+
|
39
|
+
|
40
|
+
it "fails with RuntimeError if numeric"
|
41
|
+
input 1
|
42
|
+
raises RuntimeError, /Not a string: 1/
|
43
|
+
|
44
|
+
it 'removes Unicode characters that do not belong in html'
|
45
|
+
input "b \u0340 \u0341 \u17a3 \u17d3 \u2028 \u2029 \u202a"
|
46
|
+
output "b"
|
47
|
+
|
48
|
+
it "removes unprintable characters"
|
49
|
+
input "end-\u2028-\u2029-"
|
50
|
+
output "end---"
|
51
|
+
|
52
|
+
it "escapes /:"
|
53
|
+
input "/"
|
54
|
+
output "&sol;"
|
55
|
+
|
56
|
+
it "escapes / regardless of case:"
|
57
|
+
input "&soL; &SoL; &SOL;"
|
58
|
+
output "&soL; &SoL; &SOL;"
|
59
|
+
|
60
|
+
|
@@ -0,0 +1,13 @@
|
|
1
|
+
it "un-escapes special chars: \"Hello ©®∆\""
|
2
|
+
input "Hello & World ©®∆"
|
3
|
+
output "Hello & World ©®∆"
|
4
|
+
|
5
|
+
it 'un-escapes escaped text mixed with HTML'
|
6
|
+
input "<p>Hi&</p>"
|
7
|
+
output "<p>Hi&</p>"
|
8
|
+
|
9
|
+
|
10
|
+
it 'un-escapes all 70 different combos of "<"'
|
11
|
+
input BRACKETS
|
12
|
+
stack [:split, :uniq, :join, [' '], '< %3C < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < <']
|
13
|
+
|