escape_escape_escape 0.3.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7232497e102ab2bd27d1892c77ec7eb9f2957e27
4
- data.tar.gz: 263451a1649efb22f1fc21457c240b29ad3de001
3
+ metadata.gz: 86100684d36a9aff31d78415463e2a3c357fe646
4
+ data.tar.gz: 77b8c43cc053204953f747630c7fdd5938034ec5
5
5
  SHA512:
6
- metadata.gz: 065639dcd17ec6ed58702292f1a1f5d637aa5b4220ba3644a7b8823f63432f6d9c99502d22fc5cf0d5592323959edeb6ac4f31ee212f38f0f0ef212fad670d0c
7
- data.tar.gz: 2efc664d236dc1c7eb9aa4f0347daab1e453b34ee77fc788b28187dd179a64efe803855ea1a9ef3e48ace9268637d2d318de60c8052af1393b96f15799ea59b0
6
+ metadata.gz: 7705788caaf5f6c4996b5381c1b3d2e09d390a7ab5a795a0aa8d32ccbcdec772942caf3c433d06aa3145d36244b617f545193657ee8493582011d94b48ffeec5
7
+ data.tar.gz: 5a4785d7e96190194040cf9d9c7e766b1095ae742327d0e9b3486a1fcdd89e85c19cfb576486b3afad59e6cb7b3350e61ae7ebeb77d35a4568296a9bb0ef02e9
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- What is it?
1
+ Escape_Escape_Escape
2
2
  ====================
3
3
 
4
4
  My way of escaping and sanitizing HTML.
@@ -6,20 +6,15 @@ This is very personal to me, so you won't
6
6
  find it useful or flexible to meet your needs.
7
7
 
8
8
 
9
- NPM Use:
10
- =====================
11
-
12
-
13
- // npm install escape_escape_escape
14
-
15
- var E = require("escape_escape_escape").Sanitize.html;
16
- E("The <strong>brave</strong> and the <b>bold</b>.");
17
-
18
9
  Rubygems Use:
19
10
  =====================
20
11
 
21
-
22
12
  # gem install escape_escape_escape
23
13
 
24
14
  Escape_Escape_Escape.html my_html_string
25
15
  Escape_Escape_Escape.text my_text_string
16
+
17
+ NOTE: Node and NPM Use:
18
+ =====================
19
+
20
+ This is no longer an npm module.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 1.1.0
@@ -21,8 +21,10 @@ Gem::Specification.new do |spec|
21
21
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
22
22
  spec.require_paths = ["lib"]
23
23
 
24
- spec.add_dependency "sanitize" , ">= 3.0"
25
- spec.add_dependency "htmlentities" , ">= 4.3.2"
24
+ spec.add_runtime_dependency "addressable" , "> 2.3.5"
25
+ spec.add_runtime_dependency "escape_utils" , "> 1.0.0"
26
+ spec.add_runtime_dependency "unf" , "> 0.1.3"
27
+ spec.add_runtime_dependency "htmlentities" , ">= 4.3.2"
26
28
 
27
29
  spec.add_development_dependency "pry" , ">= 0.9"
28
30
  spec.add_development_dependency "rake" , ">= 10.3"
@@ -30,4 +32,5 @@ Gem::Specification.new do |spec|
30
32
  spec.add_development_dependency "bacon" , ">= 1.0"
31
33
  spec.add_development_dependency "Bacon_Colored" , ">= 0.1"
32
34
  spec.add_development_dependency "multi_json" , ">= 1.10"
35
+ spec.add_development_dependency "sanitize" , ">= 3.0.1"
33
36
  end
@@ -1,31 +1,95 @@
1
1
 
2
+ require 'unf'
2
3
 
3
- require "sanitize"
4
+ require "escape_utils"
5
+
6
+ require 'escape_utils/html/rack' # to patch Rack::Utils
7
+ require 'escape_utils/html/erb' # to patch ERB::Util
8
+ require 'escape_utils/html/cgi' # to patch CGI
9
+ require 'escape_utils/html/haml' # to patch Haml::Helpers
10
+
11
+ require 'escape_utils/url/cgi' # to patch CGI
12
+ require 'escape_utils/url/erb' # to patch ERB::Util
13
+ require 'escape_utils/url/rack' # to patch Rack::Utils
14
+ require 'escape_utils/url/uri' # to patch URI
15
+
16
+ # ======================
4
17
  require "htmlentities"
18
+ # ======================
19
+ #
20
+ require "uri"
21
+ require 'cgi' # Don't use URI.escape because it does not escape all invalid characters.
22
+ require "addressable/uri"
23
+ # ======================
24
+
25
# Top-level shortcut for Escape_Escape_Escape.escape.
#
# s           - the value to escape; passed straight through to .escape.
# method_name - name of the Escape_Escape_Escape class method used to escape
#               each value. Defaults to :html, which is also the default of
#               .escape, so existing one-argument calls behave as before.
#
# Returns whatever Escape_Escape_Escape.escape returns.
def Escape_Escape_Escape(s, method_name = :html)
  Escape_Escape_Escape.escape(s, method_name)
end
5
28
 
6
29
  class Escape_Escape_Escape
7
30
 
8
- CODER = HTMLEntities.new(:xhtml1)
31
+ # === From sanitize gem:
32
+ # https://raw.githubusercontent.com/rgrove/sanitize/master/lib/sanitize.rb
33
+ REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
34
+ # ==================================================================================
35
+
36
+ CODER = HTMLEntities.new(:xhtml1)
37
+
38
+ Invalid = Class.new(RuntimeError)
39
+ Invalid_HREF = Class.new(RuntimeError)
40
+
41
+ Invalid_Type = Class.new(RuntimeError)
42
+
43
+ TAG_PATTERN = /\A[a-z]([a-z0-9\_]{0,}[a-z]{1,})?\z/i
44
+
45
+ VALID_CSS_VALUE = /\A[a-z0-9\;\-\_\#\ ]+\z/i
46
+ VALID_CSS_SELECTOR = /\A[a-z0-9\#\:\_\-\.\ ]+\z/i
47
+ VALID_CSS_ATTR = /\A[a-z0-9-]+\z/i
9
48
 
10
- REPEATING_DOTS = /\.{1,}\//
11
49
  INVALID_FILE_NAME_CHARS = /[^a-z0-9\_\.]{1,}/i
12
- UN_PRINT_ABLE = /[^[:print:]\n]/
13
- CR = "\r"
14
- TABS = "\t"
15
- CONTROL_CHARS = /[[:cntrl:]\x00-\x1f]/ # Don't use "\x20" because that is the space character.
16
- WHITE_SPACE = /[[:space:]]&&[^\n]/ # http://www.rubyinside.com/the-split-is-not-enough-whitespace-shenigans-for-rubyists-5980.html
17
- CONFIG = {
18
- :attributes => Sanitize::Config::RELAXED[:attributes].dup,
19
- :css => Sanitize::Config::RELAXED[:css].dup,
20
- :allow_doctype => true,
21
- :elements => %{
22
- a blockquote body br caption cite code div
23
- img pre p span
24
- h1 h2 h3 h4
25
- i em strong sub sup
26
- ol li ul
27
- html title style
28
- },
50
+
51
+ TABS = /\t*/
52
+ TAB = "\t"
53
+ HTML_TAB = "&#09;"
54
+ TWO_SPACES = ' '
55
+ BLANK = ''
56
+ SPACE = ' '
57
+
58
+ NL = "\n";
59
+ SPACES = /\ +/;
60
+
61
+ VALID_HTML_ID = /\A[0-9a-z_]+\z/i;
62
+ VALID_HTML_TAG = /\A[0-9a-z_]+\z/i;
63
+
64
+ REPEATING_DOTS = /\.{1,}/
65
+
66
+ # === MULTI_CONTROL_AND_UNPRINTABLE: ========================
67
+ #
68
+ # Unicode whitespace (e.g. codepoint 160), tabs, etc.
69
+ # Excludes newline.
70
+ #
71
+ # Examples:
72
+ # \r\n \r\n -> \n \n
73
+ #
74
+ # NOTE: Don't use "\x20" because that is the space character.
75
+ #
76
+ # Whitespace regex ([:space:]) from:
77
+ # http://www.rubyinside.com/the-split-is-not-enough-whitespace-shenigans-for-rubyists-5980.html
78
+ #
79
+ # =====================================================
80
+ MULTI_CONTROL_AND_UNPRINTABLE = /[[:space:][:cntrl:]\x00-\x1f&&[^\n\ [:print:]]]+/i
81
+ # =====================================================
82
+
83
+ ENCODING_OPTIONS_CLEAN_UTF8 = {
84
+ :invalid => :replace, # Replace invalid byte sequences
85
+ :undef => :replace, # Replace anything not defined in ASCII
86
+ :replace => '' # Use a blank for those replacements
87
+ # :universal_newline => true # Always break lines with \n, not \r\n
88
+ # -- this is not working with :replace, so it has to be done manually
89
+ # with .gsub
90
+ }
91
+
92
+ CONFIG = {
29
93
  :protocols => {
30
94
  "a"=>{
31
95
  "href"=>["ftp", "http", "https", "mailto", :relative]
@@ -36,51 +100,159 @@ class Escape_Escape_Escape
36
100
  }
37
101
  }
38
102
 
39
- ENCODING_OPTIONS_CLEAN_UTF8 = {
40
- :invalid => :replace, # Replace invalid byte sequences
41
- :undef => :replace, # Replace anything not defined in ASCII
42
- :replace => '' # Use a blank for those replacements
43
- # :newline => :universal
44
- # :universal_newline => true # Always break lines with \n, not \r\n
45
- }
46
-
103
+ class << self # ======================================================
47
104
 
105
# Compiles +str+ into a Regexp.
#
# The pattern text is first passed through clean_utf8; the resulting Regexp
# is built with the FIXEDENCODING and IGNORECASE options.
def regexp(str)
  @regexp_opts ||= (Regexp::FIXEDENCODING | Regexp::IGNORECASE)
  pattern = clean_utf8(str)
  Regexp.new(pattern, @regexp_opts)
end
48
109
 
49
- class << self # ======================================================
110
+ # ===============================================
111
+ # Raises: TZInfo::InvalidTimezoneIdentifier.
112
+ # ===============================================
113
# Looks up a timezone by name and returns its identifier.
#
# timezone - any object; it is converted with to_s and stripped of
#            surrounding whitespace before the lookup.
#
# Returns the identifier reported by TZInfo::Timezone.get.
# Raises whatever TZInfo::Timezone.get raises for an unknown name
# (TZInfo::InvalidTimezoneIdentifier, per the comment above this method).
def validate_timezone(timezone)
  # No visible require in this file loads tzinfo, so it is loaded on first
  # use here instead of failing with a NameError on the TZInfo constant.
  # NOTE(review): tzinfo does not appear among the gem's runtime
  # dependencies — confirm whether it should be declared.
  require 'tzinfo' unless defined?(TZInfo::Timezone)

  TZInfo::Timezone.get(timezone.to_s.strip).identifier
end
50
116
 
51
- # From:
117
+ # ==================================================================
118
+ # * normalized to :KC
119
+ # * "\r\n" changed to "\n"
120
+ # * all control characters stripped except for "\n"
121
+ # * leading and trailing whitespace stripped (unless the :spaces option is given)
122
+ # Normalization, then strip:
123
+ # http://msdn.microsoft.com/en-us/library/dd374126(v=vs.85).aspx
124
+ # http://www.unicode.org/faq/normalization.html
125
+ #
126
+ # Getting rid of non-ascii characters in ruby:
52
127
  # http://stackoverflow.com/questions/1268289/how-to-get-rid-of-non-ascii-characters-in-ruby
53
128
  #
54
129
  # Test:
55
130
  # [160, 160,64, 116, 119, 101, 108, 108, 121, 109, 101, 160, 102, 105, 108, 109].
56
131
  # inject('', :<<)
57
132
  #
58
- def clean_utf8 s
59
- s.
60
- encode(Encoding.find('utf-8') , ENCODING_OPTIONS_CLEAN_UTF8).
61
- gsub(TABS , " ").
62
- gsub(CR , "").
63
- gsub(UN_PRINT_ABLE , '').
64
- gsub(CONTROL_CHARS , "\n" ).
65
- gsub(WHITE_SPACE , " ")
66
- end
133
+ # Options:
134
+ #
135
+ # :tabs
136
+ # :spaces
137
+ #
138
# Returns a cleaned, NFKC-normalized copy of +raw_s+; the argument itself is
# never modified.
#
# raw_s - the String to clean.
# opts  - zero or more of:
#           :tabs   - keep tab characters (they are parked as HTML_TAB while
#                     cleaning and restored at the end).
#           :spaces - do not strip leading/trailing whitespace.
#
# Raises RuntimeError if +raw_s+ is not a String.
# Raises ArgumentError if an unknown option is given.
def clean_utf8(raw_s, *opts)
  fail("Not a string: #{raw_s.inspect}") unless raw_s.is_a?(String)

  # === Reject anything that is not a known option.
  @plaintext_allowed_options ||= [ :spaces, :tabs ]
  unknown = opts - @plaintext_allowed_options
  fail(ArgumentError, "INVALID OPTION: #{unknown.inspect}") unless unknown.empty?

  keep_tabs   = opts.include?(:tabs)
  keep_spaces = opts.include?(:spaces)

  work = raw_s.dup

  # === Park tabs behind a placeholder so the clean-up below cannot touch them.
  work.gsub!(TAB, HTML_TAB) if keep_tabs

  work.encode!(Encoding::UTF_8, ENCODING_OPTIONS_CLEAN_UTF8)
  work.scrub!
  work.gsub!(TAB, TWO_SPACES)
  work.gsub!(MULTI_CONTROL_AND_UNPRINTABLE, BLANK)
  work.gsub!(REGEX_UNSUITABLE_CHARS, ' ')

  clean = work.to_nfkc

  # === Strip surrounding whitespace unless the caller asked to keep it.
  clean.strip! unless keep_spaces

  # === Restore the parked tabs.
  clean.gsub!(HTML_TAB, TAB) if keep_tabs

  clean
end
71
173
 
72
- def html s
73
- Sanitize.fragment( clean_utf8(s), CONFIG )
174
+ # ===============================================
175
+ #
176
+ # Handles urls and relative paths.
177
+ #
178
+ # Inspired from:
179
+ # http://stackoverflow.com/a/13041565
180
+ #
181
+ # ===============================================
182
# Sanitizes a URL or relative path.
#
# The input is HTML-decoded with decode_html and parsed with URI.parse; a
# scheme, when present, is stripped and lower-cased. The resulting URI string
# is passed through EscapeUtils.escape_uri and then html.
#
# Raises RuntimeError if +raw_str+ is not a String.
# Raises Invalid_HREF if the scheme contains "javascript", if the address
# has no host and is not relative, or if URI.parse rejects the input.
def href(raw_str)
  fail("Not a string: #{raw_str.inspect}") unless raw_str.is_a?(String)

  uri = URI.parse(decode_html(raw_str))
  uri.scheme = uri.scheme.to_s.strip.downcase if uri.scheme

  scheme = uri.scheme || ''.freeze
  fail(Invalid_HREF, "javascript:// is not allowed") if scheme['javascript'.freeze]

  reachable = uri.host || uri.relative?
  fail(Invalid_HREF, "address is invalid") unless reachable

  html(EscapeUtils.escape_uri(uri.to_s))
rescue URI::InvalidURIError => e
  raise Invalid_HREF, e.message
end

# +path+ is the same sanitizer under a second name.
alias_method :path, :href
75
199
 
76
- def unescape_inner_html s
77
- CODER.decode(clean_utf8(s))
200
+ # ===============================================
201
+ # HTML
202
+ # ===============================================
203
+
204
# Returns +raw_tag+ unchanged when it matches TAG_PATTERN, otherwise nil.
def tag(raw_tag)
  raw_tag[TAG_PATTERN] ? raw_tag : nil
end
79
208
 
80
- def inner_html s
81
- CODER.encode(unescape_inner_html(s), :named, :hexadecimal)
209
# Cleans +raw+ with clean_utf8, then decodes the HTML entities in the result
# using CODER.
#
# Raises RuntimeError if +raw+ is not a String.
def decode_html(raw)
  fail("Not a string: #{raw.inspect}") unless raw.is_a?(String)

  cleaned = clean_utf8(raw)
  CODER.decode(cleaned)
end
83
213
 
214
# Defines css_attr, css_selector and css_value.
#
# Each method escapes +raw+ with html and returns the escaped string when it
# matches the corresponding VALID_CSS_* pattern.
#
# Raises Invalid_Type if +raw+ is not a String.
# Raises Invalid if the escaped string does not match the pattern.
#
# The methods are built with define_method closures instead of a string
# eval, so each pattern constant is resolved once, when this file loads.
{
  'attr'     => VALID_CSS_ATTR,
  'selector' => VALID_CSS_SELECTOR,
  'value'    => VALID_CSS_VALUE
}.each { |name, pattern|
  define_method("css_#{name}") { |raw|
    fail(Invalid_Type, "Not a string: #{raw.inspect}") unless raw.is_a?(String)

    clean = html(raw)
    fail(Invalid, "contains invalid chars: #{raw.inspect}") unless clean[pattern]

    clean
  }
}
224
+
225
+ # ===============================================
226
+ # A better alternative than "Rack::Utils.escape_html". Escapes
227
+ # various characters (including '&', '<', '>', and both quotation mark types)
228
+ # to HTML decimal entities. Also escapes the characters from
229
+ # <HTML_ESCAPE_TABLE>.
230
+ #
231
+ # Text has to be UTF-8 before encoding, according to HTMLEntities gem.
232
+ # Therefore, all text is run through <clean_utf8> (via <decode_html>) before encoding.
233
+ # ===============================================
234
# Escapes +raw_text+ for HTML output.
#
# The text is first run through decode_html, so entities already present in
# the input are decoded before EscapeUtils.escape_html encodes the result.
def html(raw_text)
  decoded = decode_html(raw_text)
  EscapeUtils.escape_html(decoded)
end # === def html
237
+
238
# Recursively escapes +o+ with the escaper named by +method_name+.
#
# o           - a String, Symbol, Numeric, true/false, Array or Hash.
#               Arrays are escaped element by element; Hashes have both keys
#               and values escaped; Symbols are escaped as strings and turned
#               back into Symbols; numbers and booleans are escaped via to_s.
# method_name - name of the public method on this object used to escape each
#               string (default: :html).
#
# Returns the escaped counterpart of +o+ (String, Symbol, Array or Hash).
# Raises Invalid for any other kind of object (including nil).
# Raises NoMethodError if +method_name+ does not name a public method.
def escape(o, method_name = :html)
  if o.kind_of?(Hash)
    return o.each_with_object({}) { |(k, v), memo|
      memo[escape(k, method_name)] = escape(v, method_name)
    }
  end

  # public_send (rather than send) keeps private methods such as
  # Kernel#system or Kernel#eval unreachable through +method_name+.
  return public_send(method_name, o.to_s).to_sym if o.is_a?(Symbol)
  return o.map { |v| escape(v, method_name) } if o.kind_of?(Array)
  return public_send(method_name, o) if o.is_a?(String)
  return public_send(method_name, o.to_s) if o == true || o == false || o.kind_of?(Numeric)

  fail Invalid, "Not a String, Number, Array, or Hash"
end # === def
255
+
84
256
  end # === class self ===
85
257
 
86
258
  end # === class Escape_Escape_Escape ===
@@ -0,0 +1,60 @@
1
+ it "does not re-escape already escaped html"
2
+ input "<p>Hello &amp; GoodBye</p>"
3
+ output "&lt;p&gt;Hello &amp; GoodBye&lt;&#47;p&gt;"
4
+
5
+ it "normalizes UNICODE: Ⅷ => VIII"
6
+ input "<p> Ⅷ </p>"
7
+ output "&lt;p&gt; VIII &lt;&#47;p&gt;"
8
+
9
+ it "normalizes UNICODE: \u2167 => VIII"
10
+ input "<p> \u2167 </p>"
11
+ output "&lt;p&gt; VIII &lt;&#47;p&gt;"
12
+
13
+ it "encodes apostrophe: ' -> &#39;"
14
+ input "Chars: ' '"
15
+ output "Chars: &#39; &#39;"
16
+
17
+ it 'does not re-escape already escaped text mixed with HTML'
18
+ input "&lt;p&gt;Hi&lt;&#47;p&gt;<p>Hi</p>"
19
+ output "&lt;p&gt;Hi&lt;&#47;p&gt;&lt;p&gt;Hi&lt;&#47;p&gt;"
20
+
21
+ it 'does not escape special chars: "Hello ©®∆"'
22
+ input "Hello & World ©®∆"
23
+ output "Hello &amp; World ©®∆"
24
+
25
+ it 'escapes all 70 different combos of "<"'
26
+ input BRACKETS
27
+ stack [:split, :uniq, :join, [' '], "&lt; %3C &amp;lt &amp;LT &amp;LT; &amp;#60 &amp;#060 &amp;#0060 &amp;#00060 &amp;#000060 &amp;#0000060 &amp;#x3c &amp;#x03c &amp;#x003c &amp;#x0003c &amp;#x00003c &amp;#x000003c &amp;#x000003c; &amp;#X3c &amp;#X03c &amp;#X003c &amp;#X0003c &amp;#X00003c &amp;#X000003c &amp;#X000003c; &amp;#x3C &amp;#x03C &amp;#x003C &amp;#x0003C &amp;#x00003C &amp;#x000003C &amp;#x000003C; &amp;#X3C &amp;#X03C &amp;#X003C &amp;#X0003C &amp;#X00003C &amp;#X000003C &amp;#X000003C;"]
28
+
29
+
30
+ it "fails with RuntimeError if: true"
31
+ input true
32
+ raises RuntimeError, /Not a string: true/
33
+
34
+
35
+ it "fails with RuntimeError if: false"
36
+ input false
37
+ raises RuntimeError, /Not a string: false/
38
+
39
+
40
+ it "fails with RuntimeError if numeric"
41
+ input 1
42
+ raises RuntimeError, /Not a string: 1/
43
+
44
+ it 'removes Unicode characters that do not belong in html'
45
+ input "b \u0340 \u0341 \u17a3 \u17d3 \u2028 \u2029 \u202a"
46
+ output "b"
47
+
48
+ it "removes unprintable characters"
49
+ input "end-\u2028-\u2029-"
50
+ output "end---"
51
+
52
+ it "escapes &sol;:"
53
+ input "&sol;"
54
+ output "&amp;sol;"
55
+
56
+ it "escapes &sol; regardless of case:"
57
+ input "&soL; &SoL; &SOL;"
58
+ output "&amp;soL; &amp;SoL; &amp;SOL;"
59
+
60
+
@@ -0,0 +1,13 @@
1
+ it "un-escapes special chars: \"Hello ©®∆\""
2
+ input "Hello &amp; World &#169;&#174;&#8710;"
3
+ output "Hello & World ©®∆"
4
+
5
+ it 'un-escapes escaped text mixed with HTML'
6
+ input "<p>Hi&amp;</p>"
7
+ output "<p>Hi&</p>"
8
+
9
+
10
+ it 'un-escapes all 70 different combos of "<"'
11
+ input BRACKETS
12
+ stack [:split, :uniq, :join, [' '], '< %3C &lt &LT &LT; &#60 &#060 &#0060 &#00060 &#000060 &#0000060 &#x3c &#x03c &#x003c &#x0003c &#x00003c &#x000003c &#x000003c; &#X3c &#X03c &#X003c &#X0003c &#X00003c &#X000003c &#X000003c; &#x3C &#x03C &#x003C &#x0003C &#x00003C &#x000003C &#x000003C; &#X3C &#X03C &#X003C &#X0003C &#X00003C &#X000003C &#X000003C;']
13
+
@@ -0,0 +1,10 @@
1
+
2
+
3
+ it 'returns string if valid'
4
+ input '-moz-def'
5
+ output '-moz-def'
6
+
7
+
8
+ it 'raises Invalid if it contains unallowed chars:'
9
+ input 'moz def'
10
+ raises Escape_Escape_Escape::Invalid, /contains invalid chars/