RubyGems - escape_escape_escape - Versions diffs - 0.3.0 → 1.1.0 - Mend

escape_escape_escape 0.3.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

checksums.yaml +4 -4
data/README.md +6 -11
data/VERSION +1 -1
data/escape_escape_escape.gemspec +5 -2
data/lib/escape_escape_escape.rb +219 -47
data/specs/as_ruby/0001-html.rb +60 -0
data/specs/as_ruby/0002-decode_html.rb +13 -0
data/specs/as_ruby/0003-css_attr.rb +10 -0
data/specs/as_ruby/0003-css_selector.rb +12 -0
data/specs/as_ruby/0003-css_value.rb +53 -0
data/specs/as_ruby/0004-==.rb +5 -0
data/specs/as_ruby/0020-href.rb +118 -0
data/specs/as_ruby/0030-clean_utf8.rb +34 -0
data/specs/as_ruby/0040-escape.rb +41 -0
data/specs/escape_escape_escape.rb +133 -21
data/specs/lib/helpers.rb +1 -0
metadata +61 -23
data/LICENSE.txt +0 -23
data/lib/beta.rb +0 -270
data/lib/e_e_e.js +0 -258
data/package.json +0 -31
data/specs/as_json/0001-html.json +0 -23
data/specs/as_json/0002-inner_html.json +0 -16
data/specs/as_json/0010-text.json +0 -29
data/specs/helpers.rb +0 -4
data/test/sanitize_attrs.js +0 -132
data/test/sanitize_html.js +0 -57
data/test/sanitize_un_escape.js +0 -41

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 7232497e102ab2bd27d1892c77ec7eb9f2957e27
-  data.tar.gz: 263451a1649efb22f1fc21457c240b29ad3de001
+  metadata.gz: 86100684d36a9aff31d78415463e2a3c357fe646
+  data.tar.gz: 77b8c43cc053204953f747630c7fdd5938034ec5
 SHA512:
-  metadata.gz: 065639dcd17ec6ed58702292f1a1f5d637aa5b4220ba3644a7b8823f63432f6d9c99502d22fc5cf0d5592323959edeb6ac4f31ee212f38f0f0ef212fad670d0c
-  data.tar.gz: 2efc664d236dc1c7eb9aa4f0347daab1e453b34ee77fc788b28187dd179a64efe803855ea1a9ef3e48ace9268637d2d318de60c8052af1393b96f15799ea59b0
+  metadata.gz: 7705788caaf5f6c4996b5381c1b3d2e09d390a7ab5a795a0aa8d32ccbcdec772942caf3c433d06aa3145d36244b617f545193657ee8493582011d94b48ffeec5
+  data.tar.gz: 5a4785d7e96190194040cf9d9c7e766b1095ae742327d0e9b3486a1fcdd89e85c19cfb576486b3afad59e6cb7b3350e61ae7ebeb77d35a4568296a9bb0ef02e9

data/README.md CHANGED

@@ -1,4 +1,4 @@
-What is it?
+Escape_Escape_Escape
 ====================
 My way of escaping and sanitizing HTML.
@@ -6,20 +6,15 @@ This is very personal to me, so you won't
 find it useful or flexible to meet your needs.
-NPM Use:
-=====================
-    // npm install escape_escape_escape
-    var E = require("escape_escape_escape").Sanitize.html;
-    E("The <strong>brave</strong> and the <b>bold</b>.");
 Rubygems Use:
 =====================
     # gem install escape_escape_escape
     Escape_Escape_Escape.html my_html_string
     Escape_Escape_Escape.text my_text_string
+NOTE: Node and NPM Use:
+=====================
+This is no longer an npm module.

data/VERSION CHANGED

	@@ -1 +1 @@
1	- 0.3.0
1	+ 1.1.0

data/escape_escape_escape.gemspec CHANGED

@@ -21,8 +21,10 @@ Gem::Specification.new do |spec|
   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
   spec.require_paths = ["lib"]
-  spec.add_dependency "sanitize"     , ">= 3.0"
-  spec.add_dependency "htmlentities" , ">= 4.3.2"
+  spec.add_runtime_dependency "addressable" , "> 2.3.5"
+  spec.add_runtime_dependency "escape_utils" , "> 1.0.0"
+  spec.add_runtime_dependency "unf" , "> 0.1.3"
+  spec.add_runtime_dependency "htmlentities" , ">= 4.3.2"
   spec.add_development_dependency "pry"           , ">= 0.9"
   spec.add_development_dependency "rake"          , ">= 10.3"
@@ -30,4 +32,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "bacon"         , ">= 1.0"
   spec.add_development_dependency "Bacon_Colored" , ">= 0.1"
   spec.add_development_dependency "multi_json"    , ">= 1.10"
+  spec.add_development_dependency "sanitize"    , ">= 3.0.1"
 end

data/lib/escape_escape_escape.rb CHANGED

@@ -1,31 +1,95 @@
+require 'unf'
-require "sanitize"
+require "escape_utils"
+require 'escape_utils/html/rack' # to patch Rack::Utils
+require 'escape_utils/html/erb' # to patch ERB::Util
+require 'escape_utils/html/cgi' # to patch CGI
+require 'escape_utils/html/haml' # to patch Haml::Helpers
+require 'escape_utils/url/cgi' # to patch CGI
+require 'escape_utils/url/erb' # to patch ERB::Util
+require 'escape_utils/url/rack' # to patch Rack::Utils
+require 'escape_utils/url/uri' # to patch URI
+# ======================
 require "htmlentities"
+# ======================
+#
+require "uri"
+require 'cgi' # Don't use URI.escape because it does not escape all invalid characters.
+require "addressable/uri"
+# ======================
+def Escape_Escape_Escape s
+  Escape_Escape_Escape.escape(s)
+end
 class Escape_Escape_Escape
-  CODER = HTMLEntities.new(:xhtml1)
+  # === From sanitize gem:
+  #   https://raw.githubusercontent.com/rgrove/sanitize/master/lib/sanitize.rb
+  REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u
+  # ==================================================================================
+  CODER              = HTMLEntities.new(:xhtml1)
+  Invalid            = Class.new(RuntimeError)
+  Invalid_HREF       = Class.new(RuntimeError)
+  Invalid_Type       = Class.new(RuntimeError)
+  TAG_PATTERN        = /\A[a-z]([a-z0-9\_]{0,}[a-z]{1,})?\z/i
+  VALID_CSS_VALUE    = /\A[a-z0-9\;\-\_\#\ ]+\z/i
+  VALID_CSS_SELECTOR = /\A[a-z0-9\#\:\_\-\.\ ]+\z/i
+  VALID_CSS_ATTR     = /\A[a-z0-9-]+\z/i
-  REPEATING_DOTS          = /\.{1,}\//
   INVALID_FILE_NAME_CHARS = /[^a-z0-9\_\.]{1,}/i
-  UN_PRINT_ABLE           = /[^[:print:]\n]/
-  CR                      = "\r"
-  TABS                    = "\t"
-  CONTROL_CHARS           = /[[:cntrl:]\x00-\x1f]/  # Don't use "\x20" because that is the space character.
-  WHITE_SPACE             = /[[:space:]]&&[^\n]/            # http://www.rubyinside.com/the-split-is-not-enough-whitespace-shenigans-for-rubyists-5980.html
-  CONFIG                  = {
-    :attributes    => Sanitize::Config::RELAXED[:attributes].dup,
-    :css           => Sanitize::Config::RELAXED[:css].dup,
-    :allow_doctype => true,
-    :elements => %{
-      a blockquote body br caption cite code div
-      img pre p span
-      h1 h2 h3 h4
-      i em strong sub sup
-      ol li ul
-      html title style
-    },
+  TABS           = /\t*/
+  TAB            = "\t"
+  HTML_TAB       = "&#09;"
+  TWO_SPACES     = '  '
+  BLANK          = ''
+  SPACE          = ' '
+  NL             = "\n";
+  SPACES         = /\ +/;
+  VALID_HTML_ID  = /\A[0-9a-z_]+\z/i;
+  VALID_HTML_TAG = /\A[0-9a-z_]+\z/i;
+  REPEATING_DOTS = /\.{1,}/
+  # === MULTI_CONTROL_AND_UNPRINTABLE: ========================
+  #
+  # Unicode whitespace (such as codepoint 160), tabs, etc.
+  # Excludes newline.
+  #
+  # Examples:
+  #   \r\n \r\n -> \n \n
+  #
+  # NOTE: Don't use "\x20" because that is the space character.
+  #
+  # Whitespace regex ([:space:]) from:
+  #   http://www.rubyinside.com/the-split-is-not-enough-whitespace-shenigans-for-rubyists-5980.html
+  #
+  # =====================================================
+  MULTI_CONTROL_AND_UNPRINTABLE = /[[:space:][:cntrl:]\x00-\x1f&&[^\n\ [:print:]]]+/i
+  # =====================================================
+  ENCODING_OPTIONS_CLEAN_UTF8 = {
+    :invalid           => :replace, # Replace invalid byte sequences
+    :undef             => :replace, # Replace anything not defined in ASCII
+    :replace           => '' # Use a blank for those replacements
+    # :universal_newline => true # Always break lines with \n, not \r\n
+    #   -- this is not working with :replace, so it has to be done manually
+    #      with .gsub
+  }
+  CONFIG = {
     :protocols => {
       "a"=>{
         "href"=>["ftp", "http", "https", "mailto", :relative]
@@ -36,51 +100,159 @@ class Escape_Escape_Escape
     }
   }
-  ENCODING_OPTIONS_CLEAN_UTF8 = {
-    :invalid           => :replace, # Replace invalid byte sequences
-    :undef             => :replace, # Replace anything not defined in ASCII
-    :replace           => '' # Use a blank for those replacements
-    # :newline         => :universal
-    # :universal_newline => true # Always break lines with \n, not \r\n
-  }
+  class << self # ======================================================
+    def regexp str
+      @regexp_opts ||= Regexp::FIXEDENCODING | Regexp::IGNORECASE
+      Regexp.new(clean_utf8(str), @regexp_opts)
+    end
-  class << self # ======================================================
+    # ===============================================
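+    # Raises: TZInfo::InvalidTimezoneIdentifier. NOTE(review): tzinfo is not required in the visible part of this file; confirm it is loaded elsewhere.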
+    # ===============================================
+    def validate_timezone(timezone)
+      TZInfo::Timezone.get( timezone.to_s.strip ).identifier
+    end
-    # From:
+    # ==================================================================
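+    # * Unicode-normalized to NFKC (compatibility composition)
+    # * "\r\n" changed to "\n"
+    # * all control characters stripped except for "\n"
+    # * whitespace stripped from the beginning and end (unless :spaces is given)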
+    # Normalization, then strip:
+    #   http://msdn.microsoft.com/en-us/library/dd374126(v=vs.85).aspx
+    #   http://www.unicode.org/faq/normalization.html
+    #
+    # Getting rid of non-ascii characters in ruby:
     # http://stackoverflow.com/questions/1268289/how-to-get-rid-of-non-ascii-characters-in-ruby
     #
     # Test:
     # [160, 160,64, 116, 119, 101, 108, 108, 121, 109, 101, 160, 102, 105, 108, 109].
     # inject('', :<<)
     #
-    def clean_utf8 s
-      s.
-        encode(Encoding.find('utf-8') , ENCODING_OPTIONS_CLEAN_UTF8).
-        gsub(TABS                     , "  ").
-        gsub(CR                       , "").
-        gsub(UN_PRINT_ABLE            , '').
-        gsub(CONTROL_CHARS            , "\n" ).
-        gsub(WHITE_SPACE              , " ")
-    end
+    # Options:
+    #
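+    #   :tabs   - keep tab characters (otherwise each tab becomes two spaces)
+    #   :spaces - keep leading/trailing whitespace (otherwise it is stripped)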
+    #
+    def clean_utf8 raw_s, *opts
+      fail("Not a string: #{raw_s.inspect}") unless raw_s.is_a?(String)
+      # === Check options. ==================================================================
+      @plaintext_allowed_options ||= [ :spaces, :tabs ]
+      invalid_opts = opts - @plaintext_allowed_options
+      fail(ArgumentError, "INVALID OPTION: #{invalid_opts.inspect}" ) if !invalid_opts.empty?
+      # =====================================================================================
+      raw_s = raw_s.dup
+      # === Save tabs if requested.
+      raw_s.gsub!(TAB, HTML_TAB) if opts.include?(:tabs)
+      raw_s.encode!(Encoding.find('utf-8') , ENCODING_OPTIONS_CLEAN_UTF8)
+      raw_s.scrub!
+      raw_s.gsub!(TAB                           , TWO_SPACES)
+      raw_s.gsub!(MULTI_CONTROL_AND_UNPRINTABLE , BLANK)
+      raw_s.gsub!(REGEX_UNSUITABLE_CHARS        , ' ')
+      clean = raw_s.to_nfkc
+      # Save whitespace or strip.
+      if !opts.include?(:spaces)
+        clean.strip!
+      end
+      # Put back tabs by request.
+      if opts.include?(:tabs)
+        clean.gsub!(HTML_TAB, TAB)
+      end
-    def text s
-      clean_utf8 s
+      clean
     end
-    def html s
-      Sanitize.fragment( clean_utf8(s), CONFIG )
+    # ===============================================
+    #
+    # Handles urls and relative paths.
+    #
+    # Inspired by:
+    #   http://stackoverflow.com/a/13041565
+    #
+    # ===============================================
+    alias_method :path, def href raw_str
+      fail("Not a string: #{raw_str.inspect}") unless raw_str.is_a?(String)
+      begin
+        uri = URI.parse(decode_html(raw_str))
+        if uri.scheme
+          uri.scheme = uri.scheme.to_s.strip.downcase
+        end
+        fail( Invalid_HREF, "javascript:// is not allowed" ) if (uri.scheme || ''.freeze)['javascript'.freeze]
+        fail( Invalid_HREF, "address is invalid") if !uri.host && !uri.relative?
+        html(EscapeUtils.escape_uri uri.to_s)
+      rescue URI::InvalidURIError => e
+        raise Invalid_HREF, e.message
+      end
     end
-    def unescape_inner_html s
-      CODER.decode(clean_utf8(s))
+    # ===============================================
+    # HTML
+    # ===============================================
+    def tag( raw_tag )
+      return nil unless raw_tag[TAG_PATTERN]
+      raw_tag
     end
-    def inner_html s
-      CODER.encode(unescape_inner_html(s), :named, :hexadecimal)
+    def decode_html raw
+      fail("Not a string: #{raw.inspect}") unless raw.is_a?(String)
+      CODER.decode clean_utf8(raw)
     end
+    %w{attr selector value}.each { |name|
+      eval <<-EOF, nil, __FILE__, __LINE__ + 1
+        def css_#{name} raw
+          fail(Invalid_Type, "Not a string: \#{raw.inspect}") unless raw.is_a?(String)
+          clean = html(raw)
+          return clean if clean[VALID_CSS_#{name.upcase}]
+          fail Invalid, "contains invalid chars: \#{raw.inspect}"
+        end
+      EOF
+    }
+    # ===============================================
+    # A better alternative to "Rack::Utils.escape_html". Escapes
+    # various characters (including '&', '<', '>', and both quotation mark types)
+    # to HTML decimal entities. Also escapes the characters from
+    # <HTML_ESCAPE_TABLE>.
+    #
+    # Text has to be UTF-8 before encoding, according to HTMLEntities gem.
+    # Therefore, all text is run through <decode_html> (which calls <clean_utf8>) before encoding.
+    # ===============================================
+    def html( raw_text )
+      EscapeUtils.escape_html(decode_html(raw_text))
+    end # === def html
+    def escape o, method_name = :html
+      if o.kind_of? Hash
+        return(
+          o.inject({}) { |memo, (k, v)|
+            memo[escape(k,method_name)] = escape(v, method_name)
+            memo
+          }
+        )
+      end
+      return(send(method_name, o.to_s).to_sym) if o.is_a?(Symbol)
+      return(o.map { |v| escape(v, method_name) }) if o.kind_of? Array
+      return send(method_name, o) if o.is_a?(String)
+      return send(method_name, o.to_s) if o == true || o == false || o.kind_of?(Numeric)
+      fail Invalid, "Not a String, Number, Array, or Hash"
+    end # === def
   end # === class self ===
 end # === class Escape_Escape_Escape ===

data/specs/as_ruby/0001-html.rb ADDED

@@ -0,0 +1,60 @@
+it     "does not re-escape already escaped html"
+input  "<p>Hello &amp; GoodBye</p>"
+output "&lt;p&gt;Hello &amp; GoodBye&lt;&#47;p&gt;"
+it     "normalizes UNICODE: Ⅷ => VIII"
+input  "<p> Ⅷ </p>"
+output "&lt;p&gt; VIII &lt;&#47;p&gt;"
+it     "normalizes UNICODE: \u2167 => VIII"
+input  "<p> \u2167 </p>"
+output "&lt;p&gt; VIII &lt;&#47;p&gt;"
+it     "encodes apostrophe: ' -> &#39;"
+input  "Chars: ' '"
+output "Chars: &#39; &#39;"
+it     'does not re-escape already escaped text mixed with HTML'
+input  "&lt;p&gt;Hi&lt;&#47;p&gt;<p>Hi</p>"
+output "&lt;p&gt;Hi&lt;&#47;p&gt;&lt;p&gt;Hi&lt;&#47;p&gt;"
+it     'does not escape special chars: "Hello ©®∆"'
+input  "Hello & World ©®∆"
+output "Hello &amp; World ©®∆"
+it      'escapes all 70 different combos of "<"'
+input   BRACKETS
+stack   [:split, :uniq, :join, [' '], "&lt; %3C &amp;lt &amp;LT &amp;LT; &amp;#60 &amp;#060 &amp;#0060 &amp;#00060 &amp;#000060 &amp;#0000060 &amp;#x3c &amp;#x03c &amp;#x003c &amp;#x0003c &amp;#x00003c &amp;#x000003c &amp;#x000003c; &amp;#X3c &amp;#X03c &amp;#X003c &amp;#X0003c &amp;#X00003c &amp;#X000003c &amp;#X000003c; &amp;#x3C &amp;#x03C &amp;#x003C &amp;#x0003C &amp;#x00003C &amp;#x000003C &amp;#x000003C; &amp;#X3C &amp;#X03C &amp;#X003C &amp;#X0003C &amp;#X00003C &amp;#X000003C &amp;#X000003C;"]
+it     "fails with RuntimeError if: true"
+input  true
+raises RuntimeError, /Not a string: true/
+it     "fails with RuntimeError if: false"
+input  false
+raises RuntimeError, /Not a string: false/
+it     "fails with RuntimeError if numeric"
+input  1
+raises RuntimeError, /Not a string: 1/
+it     'removes Unicode characters that do not belong in html'
+input  "b \u0340 \u0341 \u17a3 \u17d3 \u2028 \u2029 \u202a"
+output "b"
+it     "removes unprintable characters"
+input  "end-\u2028-\u2029-"
+output "end---"
+it     "escapes &sol;:"
+input  "&sol;"
+output "&amp;sol;"
+it     "escapes &sol; regardless of case:"
+input  "&soL; &SoL; &SOL;"
+output "&amp;soL; &amp;SoL; &amp;SOL;"

data/specs/as_ruby/0002-decode_html.rb ADDED

@@ -0,0 +1,13 @@
+it     "un-escapes special chars: \"Hello ©®∆\""
+input  "Hello &amp; World &#169;&#174;&#8710;"
+output "Hello & World ©®∆"
+it    'un-escapes escaped text mixed with HTML'
+input  "<p>Hi&amp;</p>"
+output "<p>Hi&</p>"
+it 'un-escapes all 70 different combos of "<"'
+input BRACKETS
+stack   [:split, :uniq, :join, [' '], '< %3C &lt &LT &LT; &#60 &#060 &#0060 &#00060 &#000060 &#0000060 &#x3c &#x03c &#x003c &#x0003c &#x00003c &#x000003c &#x000003c; &#X3c &#X03c &#X003c &#X0003c &#X00003c &#X000003c &#X000003c; &#x3C &#x03C &#x003C &#x0003C &#x00003C &#x000003C &#x000003C; &#X3C &#X03C &#X003C &#X0003C &#X00003C &#X000003C &#X000003C;']

data/specs/as_ruby/0003-css_attr.rb ADDED

@@ -0,0 +1,10 @@
+it     'returns string if valid'
+input  '-moz-def'
+output '-moz-def'
+it     'raises Invalid if it contains unallowed chars:'
+input  'moz def'
+raises Escape_Escape_Escape::Invalid, /contains invalid chars/