escape_escape_escape 0.3.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7232497e102ab2bd27d1892c77ec7eb9f2957e27
4
- data.tar.gz: 263451a1649efb22f1fc21457c240b29ad3de001
3
+ metadata.gz: 86100684d36a9aff31d78415463e2a3c357fe646
4
+ data.tar.gz: 77b8c43cc053204953f747630c7fdd5938034ec5
5
5
  SHA512:
6
- metadata.gz: 065639dcd17ec6ed58702292f1a1f5d637aa5b4220ba3644a7b8823f63432f6d9c99502d22fc5cf0d5592323959edeb6ac4f31ee212f38f0f0ef212fad670d0c
7
- data.tar.gz: 2efc664d236dc1c7eb9aa4f0347daab1e453b34ee77fc788b28187dd179a64efe803855ea1a9ef3e48ace9268637d2d318de60c8052af1393b96f15799ea59b0
6
+ metadata.gz: 7705788caaf5f6c4996b5381c1b3d2e09d390a7ab5a795a0aa8d32ccbcdec772942caf3c433d06aa3145d36244b617f545193657ee8493582011d94b48ffeec5
7
+ data.tar.gz: 5a4785d7e96190194040cf9d9c7e766b1095ae742327d0e9b3486a1fcdd89e85c19cfb576486b3afad59e6cb7b3350e61ae7ebeb77d35a4568296a9bb0ef02e9
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- What is it?
1
+ Escape_Escape_Escape
2
2
  ====================
3
3
 
4
4
  My way of escaping and sanitizing HTML.
@@ -6,20 +6,15 @@ This is very personal to me, so you won't
6
6
  find it useful or flexible to meet your needs.
7
7
 
8
8
 
9
- NPM Use:
10
- =====================
11
-
12
-
13
- // npm install escape_escape_escape
14
-
15
- var E = require("escape_escape_escape").Sanitize.html;
16
- E("The <strong>brave</strong> and the <b>bold</b>.");
17
-
18
9
  Rubygems Use:
19
10
  =====================
20
11
 
21
-
22
12
  # gem install escape_escape_escape
23
13
 
24
14
  Escape_Escape_Escape.html my_html_string
25
15
  Escape_Escape_Escape.text my_text_string
16
+
17
+ NOTE: Node and NPM Use:
18
+ =====================
19
+
20
+ This is no longer an npm module.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.0
1
+ 1.1.0
@@ -21,8 +21,10 @@ Gem::Specification.new do |spec|
21
21
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
22
22
  spec.require_paths = ["lib"]
23
23
 
24
- spec.add_dependency "sanitize" , ">= 3.0"
25
- spec.add_dependency "htmlentities" , ">= 4.3.2"
24
+ spec.add_runtime_dependency "addressable" , "> 2.3.5"
25
+ spec.add_runtime_dependency "escape_utils" , "> 1.0.0"
26
+ spec.add_runtime_dependency "unf" , "> 0.1.3"
27
+ spec.add_runtime_dependency "htmlentities" , ">= 4.3.2"
26
28
 
27
29
  spec.add_development_dependency "pry" , ">= 0.9"
28
30
  spec.add_development_dependency "rake" , ">= 10.3"
@@ -30,4 +32,5 @@ Gem::Specification.new do |spec|
30
32
  spec.add_development_dependency "bacon" , ">= 1.0"
31
33
  spec.add_development_dependency "Bacon_Colored" , ">= 0.1"
32
34
  spec.add_development_dependency "multi_json" , ">= 1.10"
35
+ spec.add_development_dependency "sanitize" , ">= 3.0.1"
33
36
  end
@@ -1,31 +1,95 @@
1
1
 
2
+ require 'unf'
2
3
 
3
- require "sanitize"
4
+ require "escape_utils"
5
+
6
+ require 'escape_utils/html/rack' # to patch Rack::Utils
7
+ require 'escape_utils/html/erb' # to patch ERB::Util
8
+ require 'escape_utils/html/cgi' # to patch CGI
9
+ require 'escape_utils/html/haml' # to patch Haml::Helpers
10
+
11
+ require 'escape_utils/url/cgi' # to patch CGI
12
+ require 'escape_utils/url/erb' # to patch ERB::Util
13
+ require 'escape_utils/url/rack' # to patch Rack::Utils
14
+ require 'escape_utils/url/uri' # to patch URI
15
+
16
+ # ======================
4
17
  require "htmlentities"
18
+ # ======================
19
+ #
20
+ require "uri"
21
+ require 'cgi' # Don't use URI.escape because it does not escape all invalid characters.
22
+ require "addressable/uri"
23
+ # ======================
24
+
25
+ def Escape_Escape_Escape s
26
+ Escape_Escape_Escape.escape(s)
27
+ end
5
28
 
6
29
  class Escape_Escape_Escape
7
30
 
8
- CODER = HTMLEntities.new(:xhtml1)
31
+ # === From sanitize gem:
32
+ # https://raw.githubusercontent.com/rgrove/sanitize/master/lib/sanitize.rb
33
+ REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
34
+ # ==================================================================================
35
+
36
+ CODER = HTMLEntities.new(:xhtml1)
37
+
38
+ Invalid = Class.new(RuntimeError)
39
+ Invalid_HREF = Class.new(RuntimeError)
40
+
41
+ Invalid_Type = Class.new(RuntimeError)
42
+
43
+ TAG_PATTERN = /\A[a-z]([a-z0-9\_]{0,}[a-z]{1,})?\z/i
44
+
45
+ VALID_CSS_VALUE = /\A[a-z0-9\;\-\_\#\ ]+\z/i
46
+ VALID_CSS_SELECTOR = /\A[a-z0-9\#\:\_\-\.\ ]+\z/i
47
+ VALID_CSS_ATTR = /\A[a-z0-9-]+\z/i
9
48
 
10
- REPEATING_DOTS = /\.{1,}\//
11
49
  INVALID_FILE_NAME_CHARS = /[^a-z0-9\_\.]{1,}/i
12
- UN_PRINT_ABLE = /[^[:print:]\n]/
13
- CR = "\r"
14
- TABS = "\t"
15
- CONTROL_CHARS = /[[:cntrl:]\x00-\x1f]/ # Don't use "\x20" because that is the space character.
16
- WHITE_SPACE = /[[:space:]]&&[^\n]/ # http://www.rubyinside.com/the-split-is-not-enough-whitespace-shenigans-for-rubyists-5980.html
17
- CONFIG = {
18
- :attributes => Sanitize::Config::RELAXED[:attributes].dup,
19
- :css => Sanitize::Config::RELAXED[:css].dup,
20
- :allow_doctype => true,
21
- :elements => %{
22
- a blockquote body br caption cite code div
23
- img pre p span
24
- h1 h2 h3 h4
25
- i em strong sub sup
26
- ol li ul
27
- html title style
28
- },
50
+
51
+ TABS = /\t*/
52
+ TAB = "\t"
53
+ HTML_TAB = "&#09;"
54
+ TWO_SPACES = ' '
55
+ BLANK = ''
56
+ SPACE = ' '
57
+
58
+ NL = "\n";
59
+ SPACES = /\ +/;
60
+
61
+ VALID_HTML_ID = /\A[0-9a-z_]+\z/i;
62
+ VALID_HTML_TAG = /\A[0-9a-z_]+\z/i;
63
+
64
+ REPEATING_DOTS = /\.{1,}/
65
+
66
+ # === MULTI_CONTROL_AND_UNPRINTABLE: ========================
67
+ #
68
+ # Unicode whitespace (e.g. codepoint 160, the non-breaking space), tabs, etc.
69
+ # Excludes newline.
70
+ #
71
+ # Examples:
72
+ # \r\n \r\n -> \n \n
73
+ #
74
+ # NOTE: Don't use "\x20" because that is the space character.
75
+ #
76
+ # Whitespace regex ([:space:]) from:
77
+ # http://www.rubyinside.com/the-split-is-not-enough-whitespace-shenigans-for-rubyists-5980.html
78
+ #
79
+ # =====================================================
80
+ MULTI_CONTROL_AND_UNPRINTABLE = /[[:space:][:cntrl:]\x00-\x1f&&[^\n\ [:print:]]]+/i
81
+ # =====================================================
82
+
83
+ ENCODING_OPTIONS_CLEAN_UTF8 = {
84
+ :invalid => :replace, # Replace invalid byte sequences
85
+ :undef => :replace, # Replace anything not defined in ASCII
86
+ :replace => '' # Use a blank for those replacements
87
+ # :universal_newline => true # Always break lines with \n, not \r\n
88
+ # -- this is not working with :replace, so it has to be done manually
89
+ # with .gsub
90
+ }
91
+
92
+ CONFIG = {
29
93
  :protocols => {
30
94
  "a"=>{
31
95
  "href"=>["ftp", "http", "https", "mailto", :relative]
@@ -36,51 +100,159 @@ class Escape_Escape_Escape
36
100
  }
37
101
  }
38
102
 
39
- ENCODING_OPTIONS_CLEAN_UTF8 = {
40
- :invalid => :replace, # Replace invalid byte sequences
41
- :undef => :replace, # Replace anything not defined in ASCII
42
- :replace => '' # Use a blank for those replacements
43
- # :newline => :universal
44
- # :universal_newline => true # Always break lines with \n, not \r\n
45
- }
46
-
103
+ class << self # ======================================================
47
104
 
105
+ def regexp str
106
+ @regexp_opts ||= Regexp::FIXEDENCODING | Regexp::IGNORECASE
107
+ Regexp.new(clean_utf8(str), @regexp_opts)
108
+ end
48
109
 
49
- class << self # ======================================================
110
+ # ===============================================
111
+ # Raises: TZInfo::InvalidTimezoneIdentifier.
112
+ # ===============================================
113
+ def validate_timezone(timezone)
114
+ TZInfo::Timezone.get( timezone.to_s.strip ).identifier
115
+ end
50
116
 
51
- # From:
117
+ # ==================================================================
118
+ # * normalized to :KC
119
+ # * "\r\n" changed to "\n"
120
+ # * all control characters stripped except for "\n"
121
+ # and leading/trailing whitespace stripped (unless :spaces is passed).
122
+ # Normalization, then strip:
123
+ # http://msdn.microsoft.com/en-us/library/dd374126(v=vs.85).aspx
124
+ # http://www.unicode.org/faq/normalization.html
125
+ #
126
+ # Getting rid of non-ascii characters in ruby:
52
127
  # http://stackoverflow.com/questions/1268289/how-to-get-rid-of-non-ascii-characters-in-ruby
53
128
  #
54
129
  # Test:
55
130
  # [160, 160,64, 116, 119, 101, 108, 108, 121, 109, 101, 160, 102, 105, 108, 109].
56
131
  # inject('', :<<)
57
132
  #
58
- def clean_utf8 s
59
- s.
60
- encode(Encoding.find('utf-8') , ENCODING_OPTIONS_CLEAN_UTF8).
61
- gsub(TABS , " ").
62
- gsub(CR , "").
63
- gsub(UN_PRINT_ABLE , '').
64
- gsub(CONTROL_CHARS , "\n" ).
65
- gsub(WHITE_SPACE , " ")
66
- end
133
+ # Options:
134
+ #
135
+ # :tabs
136
+ # :spaces
137
+ #
138
+ def clean_utf8 raw_s, *opts
139
+
140
+ fail("Not a string: #{raw_s.inspect}") unless raw_s.is_a?(String)
141
+
142
+ # === Check options. ==================================================================
143
+ @plaintext_allowed_options ||= [ :spaces, :tabs ]
144
+ invalid_opts = opts - @plaintext_allowed_options
145
+ fail(ArgumentError, "INVALID OPTION: #{invalid_opts.inspect}" ) if !invalid_opts.empty?
146
+ # =====================================================================================
147
+
148
+ raw_s = raw_s.dup
149
+
150
+ # === Save tabs if requested.
151
+ raw_s.gsub!(TAB, HTML_TAB) if opts.include?(:tabs)
152
+
153
+ raw_s.encode!(Encoding.find('utf-8') , ENCODING_OPTIONS_CLEAN_UTF8)
154
+ raw_s.scrub!
155
+ raw_s.gsub!(TAB , TWO_SPACES)
156
+ raw_s.gsub!(MULTI_CONTROL_AND_UNPRINTABLE , BLANK)
157
+ raw_s.gsub!(REGEX_UNSUITABLE_CHARS , ' ')
158
+
159
+ clean = raw_s.to_nfkc
160
+
161
+ # Save whitespace or strip.
162
+ if !opts.include?(:spaces)
163
+ clean.strip!
164
+ end
165
+
166
+ # Put back tabs by request.
167
+ if opts.include?(:tabs)
168
+ clean.gsub!(HTML_TAB, TAB)
169
+ end
67
170
 
68
- def text s
69
- clean_utf8 s
171
+ clean
70
172
  end
71
173
 
72
- def html s
73
- Sanitize.fragment( clean_utf8(s), CONFIG )
174
+ # ===============================================
175
+ #
176
+ # Handles urls and relative paths.
177
+ #
178
+ # Inspired from:
179
+ # http://stackoverflow.com/a/13041565
180
+ #
181
+ # ===============================================
182
+ alias_method :path, def href raw_str
183
+ fail("Not a string: #{raw_str.inspect}") unless raw_str.is_a?(String)
184
+
185
+ begin
186
+ uri = URI.parse(decode_html(raw_str))
187
+ if uri.scheme
188
+ uri.scheme = uri.scheme.to_s.strip.downcase
189
+ end
190
+
191
+ fail( Invalid_HREF, "javascript:// is not allowed" ) if (uri.scheme || ''.freeze)['javascript'.freeze]
192
+ fail( Invalid_HREF, "address is invalid") if !uri.host && !uri.relative?
193
+
194
+ html(EscapeUtils.escape_uri uri.to_s)
195
+ rescue URI::InvalidURIError => e
196
+ raise Invalid_HREF, e.message
197
+ end
74
198
  end
75
199
 
76
- def unescape_inner_html s
77
- CODER.decode(clean_utf8(s))
200
+ # ===============================================
201
+ # HTML
202
+ # ===============================================
203
+
204
+ def tag( raw_tag )
205
+ return nil unless raw_tag[TAG_PATTERN]
206
+ raw_tag
78
207
  end
79
208
 
80
- def inner_html s
81
- CODER.encode(unescape_inner_html(s), :named, :hexadecimal)
209
+ def decode_html raw
210
+ fail("Not a string: #{raw.inspect}") unless raw.is_a?(String)
211
+ CODER.decode clean_utf8(raw)
82
212
  end
83
213
 
214
+ %w{attr selector value}.each { |name|
215
+ eval <<-EOF, nil, __FILE__, __LINE__ + 1
216
+ def css_#{name} raw
217
+ fail(Invalid_Type, "Not a string: \#{raw.inspect}") unless raw.is_a?(String)
218
+ clean = html(raw)
219
+ return clean if clean[VALID_CSS_#{name.upcase}]
220
+ fail Invalid, "contains invalid chars: \#{raw.inspect}"
221
+ end
222
+ EOF
223
+ }
224
+
225
+ # ===============================================
226
+ # A better alternative than "Rack::Utils.escape_html". Escapes
227
+ # various characters (including '&', '<', '>', and both quotation mark types)
228
+ # to HTML decimal entities. Also escapes the characters from
229
+ # <HTML_ESCAPE_TABLE>.
230
+ #
231
+ # Text has to be UTF-8 before encoding, according to HTMLEntities gem.
232
+ # Therefore, all text is run through <decode_html> (which calls <clean_utf8>) before encoding.
233
+ # ===============================================
234
+ def html( raw_text )
235
+ EscapeUtils.escape_html(decode_html(raw_text))
236
+ end # === def html
237
+
238
+ def escape o, method_name = :html
239
+ if o.kind_of? Hash
240
+ return(
241
+ o.inject({}) { |memo, (k, v)|
242
+ memo[escape(k,method_name)] = escape(v, method_name)
243
+ memo
244
+ }
245
+ )
246
+ end
247
+
248
+ return(send(method_name, o.to_s).to_sym) if o.is_a?(Symbol)
249
+ return(o.map { |v| escape(v, method_name) }) if o.kind_of? Array
250
+ return send(method_name, o) if o.is_a?(String)
251
+ return send(method_name, o.to_s) if o == true || o == false || o.kind_of?(Numeric)
252
+
253
+ fail Invalid, "Not a String, Number, Array, or Hash"
254
+ end # === def
255
+
84
256
  end # === class self ===
85
257
 
86
258
  end # === class Escape_Escape_Escape ===
@@ -0,0 +1,60 @@
1
+ it "does not re-escape already escaped html"
2
+ input "<p>Hello &amp; GoodBye</p>"
3
+ output "&lt;p&gt;Hello &amp; GoodBye&lt;&#47;p&gt;"
4
+
5
+ it "normalizes UNICODE: Ⅷ => VIII"
6
+ input "<p> Ⅷ </p>"
7
+ output "&lt;p&gt; VIII &lt;&#47;p&gt;"
8
+
9
+ it "normalizes UNICODE: \u2167 => VIII"
10
+ input "<p> \u2167 </p>"
11
+ output "&lt;p&gt; VIII &lt;&#47;p&gt;"
12
+
13
+ it "encodes apostrophe: ' -> &#39;"
14
+ input "Chars: ' '"
15
+ output "Chars: &#39; &#39;"
16
+
17
+ it 'does not re-escape already escaped text mixed with HTML'
18
+ input "&lt;p&gt;Hi&lt;&#47;p&gt;<p>Hi</p>"
19
+ output "&lt;p&gt;Hi&lt;&#47;p&gt;&lt;p&gt;Hi&lt;&#47;p&gt;"
20
+
21
+ it 'does not escape special chars: "Hello ©®∆"'
22
+ input "Hello & World ©®∆"
23
+ output "Hello &amp; World ©®∆"
24
+
25
+ it 'escapes all 70 different combos of "<"'
26
+ input BRACKETS
27
+ stack [:split, :uniq, :join, [' '], "&lt; %3C &amp;lt &amp;LT &amp;LT; &amp;#60 &amp;#060 &amp;#0060 &amp;#00060 &amp;#000060 &amp;#0000060 &amp;#x3c &amp;#x03c &amp;#x003c &amp;#x0003c &amp;#x00003c &amp;#x000003c &amp;#x000003c; &amp;#X3c &amp;#X03c &amp;#X003c &amp;#X0003c &amp;#X00003c &amp;#X000003c &amp;#X000003c; &amp;#x3C &amp;#x03C &amp;#x003C &amp;#x0003C &amp;#x00003C &amp;#x000003C &amp;#x000003C; &amp;#X3C &amp;#X03C &amp;#X003C &amp;#X0003C &amp;#X00003C &amp;#X000003C &amp;#X000003C;"]
28
+
29
+
30
+ it "fails with RuntimeError if: true"
31
+ input true
32
+ raises RuntimeError, /Not a string: true/
33
+
34
+
35
+ it "fails with RuntimeError if: false"
36
+ input false
37
+ raises RuntimeError, /Not a string: false/
38
+
39
+
40
+ it "fails with RuntimeError if numeric"
41
+ input 1
42
+ raises RuntimeError, /Not a string: 1/
43
+
44
+ it 'removes Unicode characters that do not belong in html'
45
+ input "b \u0340 \u0341 \u17a3 \u17d3 \u2028 \u2029 \u202a"
46
+ output "b"
47
+
48
+ it "removes unprintable characters"
49
+ input "end-\u2028-\u2029-"
50
+ output "end---"
51
+
52
+ it "escapes &sol;:"
53
+ input "&sol;"
54
+ output "&amp;sol;"
55
+
56
+ it "escapes &sol; regardless of case:"
57
+ input "&soL; &SoL; &SOL;"
58
+ output "&amp;soL; &amp;SoL; &amp;SOL;"
59
+
60
+
@@ -0,0 +1,13 @@
1
+ it "un-escapes special chars: \"Hello ©®∆\""
2
+ input "Hello &amp; World &#169;&#174;&#8710;"
3
+ output "Hello & World ©®∆"
4
+
5
+ it 'un-escapes escaped text mixed with HTML'
6
+ input "<p>Hi&amp;</p>"
7
+ output "<p>Hi&</p>"
8
+
9
+
10
+ it 'un-escapes all 70 different combos of "<"'
11
+ input BRACKETS
12
+ stack [:split, :uniq, :join, [' '], '< %3C &lt &LT &LT; &#60 &#060 &#0060 &#00060 &#000060 &#0000060 &#x3c &#x03c &#x003c &#x0003c &#x00003c &#x000003c &#x000003c; &#X3c &#X03c &#X003c &#X0003c &#X00003c &#X000003c &#X000003c; &#x3C &#x03C &#x003C &#x0003C &#x00003C &#x000003C &#x000003C; &#X3C &#X03C &#X003C &#X0003C &#X00003C &#X000003C &#X000003C;']
13
+
@@ -0,0 +1,10 @@
1
+
2
+
3
+ it 'returns string if valid'
4
+ input '-moz-def'
5
+ output '-moz-def'
6
+
7
+
8
+ it 'raises Invalid if it contains unallowed chars:'
9
+ input 'moz def'
10
+ raises Escape_Escape_Escape::Invalid, /contains invalid chars/