RubyGems - hpricot - Versions diffs - 0.4-mswin32 → 0.5-mswin32 - Mend

hpricot 0.4-mswin32 → 0.5-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

data/CHANGELOG +16 -0
data/README +279 -4
data/Rakefile +12 -3
data/ext/hpricot_scan/hpricot_scan.c +3106 -3348
data/ext/hpricot_scan/hpricot_scan.rl +78 -38
data/lib/hpricot.rb +19 -0
data/lib/hpricot/elements.rb +194 -87
data/lib/hpricot/inspect.rb +13 -0
data/lib/hpricot/parse.rb +83 -99
data/lib/hpricot/tag.rb +114 -40
data/lib/hpricot/traverse.rb +311 -61
data/lib/hpricot_scan.so +0 -0
data/test/files/cy0.html +3653 -0
data/test/files/utf8.html +1054 -0
data/test/files/week9.html +1723 -0
data/test/test_parser.rb +160 -10
data/test/test_paths.rb +16 -0
data/test/test_preserved.rb +46 -0
data/test/test_xml.rb +15 -0
metadata +41 -35

data/ext/hpricot_scan/hpricot_scan.rl CHANGED

@@ -8,14 +8,21 @@
  */
 #include <ruby.h>
+#define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net.  So sorry!"
 static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
       sym_cdata, sym_text;
+static VALUE rb_eHpricotParseError;
 static ID s_read, s_to_str;
 #define ELE(N) \
-  if (tokend > tokstart) { \
-    ele_open = 0; \
-    rb_yield_tokens(sym_##N, tag, attr, tokstart == 0 ? Qnil : rb_str_new(tokstart, tokend-tokstart), taint); \
+  if (tokend > tokstart || text == 1) { \
+    VALUE raw_string = Qnil; \
+    ele_open = 0; text = 0; \
+    if (tokstart != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
+      raw_string = rb_str_new(tokstart, tokend-tokstart); \
+    } \
+    rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
   }
 #define SET(N, E) \
@@ -34,6 +41,24 @@ static ID s_read, s_to_str;
       rb_hash_aset(attr, K, V); \
     }
+#define TEXT_PASS() \
+    if (text == 0) \
+    { \
+      if (ele_open == 1) { \
+        ele_open = 0; \
+        if (tokstart > 0) { \
+          mark_tag = tokstart; \
+        } \
+      } else { \
+        mark_tag = p; \
+      } \
+      attr = Qnil; \
+      tag = Qnil; \
+      text = 1; \
+    }
+#define EBLK(N, T) CAT(tag, p - T + 1); ELE(N);
 %%{
   machine hpricot_scan;
@@ -55,6 +80,10 @@ static ID s_read, s_to_str;
   action tag { SET(tag, p); }
   action tagc { SET(tag, p-1); }
   action aval { SET(aval, p); }
+  action aunq {
+    if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
+    else { SET(aval, p); }
+  }
   action akey { SET(akey, p); }
   action xmlver { SET(aval, p); ATTR(rb_str_new2("version"), aval); }
   action xmlenc { SET(aval, p); ATTR(rb_str_new2("encoding"), aval); }
@@ -79,7 +108,7 @@ static ID s_read, s_to_str;
   #
   newline = '\n' @{curline += 1;} ;
 # qtext = '"' ( '\"' | [^\n"] )* '"' | "'" ( "\\'" | [^\n'] )* "'" ;
-  NameChar = [\-A-Za-z0-9._:] ;
+  NameChar = [\-A-Za-z0-9._:?] ;
   Name = [A-Za-z_:] NameChar* ;
   StartComment = "<!--" ;
   EndComment = "-->" ;
@@ -87,14 +116,14 @@ static ID s_read, s_to_str;
   EndCdata = "]]>" ;
   NameCap = Name >_tag %tag;
-  NameAttr = Name >_akey %akey ;
+  NameAttr = NameChar+ >_akey %akey ;
   Q1Attr = [^']* >_aval %aval ;
   Q2Attr = [^"]* >_aval %aval ;
-  UnqAttr = [^ \t\n<>"'] >_aval [^ \t\n<>]* %aval ;
+  UnqAttr = ( space >_aval | [^ \t\n<>"'] >_aval [^ \t\n<>]* %aunq ) ;
   Nmtoken = NameChar+ >_akey %akey ;
   Attr =  NameAttr space* "=" space* ('"' Q2Attr '"' | "'" Q1Attr "'" | UnqAttr space+ ) space* ;
-  AttrEnd = ( NameAttr space* "=" space* UnqAttr | Nmtoken >new_attr %save_attr ) ;
+  AttrEnd = ( NameAttr space* "=" space* UnqAttr? | Nmtoken >new_attr %save_attr ) ;
   AttrSet = ( Attr >new_attr %save_attr | Nmtoken >new_attr space+ %save_attr ) ;
   StartTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? ">" | "<" NameCap ">";
   EmptyTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? "/>" | "<" NameCap "/>" ;
@@ -113,14 +142,23 @@ static ID s_read, s_to_str;
     "'" [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]* >_aval %pubid "'" ;
   ExternalID = ( "SYSTEM" | "PUBLIC" space+ PubidLiteral ) (space+ SystemLiteral)? ;
   DocType = "<!DOCTYPE" space+ NameCap (space+ ExternalID)? space* ("[" [^\]]* "]" space*)? ">" ;
-  StartXmlProcIns = "<?" Name space+ ;
+  StartXmlProcIns = "<?" Name >{ TEXT_PASS(); } space+ ;
   EndXmlProcIns = "?>" ;
-  html_comment := (any | newline )* >_tag :>> EndComment >tagc @{ ELE(comment); fgoto main; };
+  html_comment := |*
+    EndComment @{ EBLK(comment, 3); fgoto main; };
+    any | newline { TEXT_PASS(); };
+  *|;
-  html_cdata := (any | newline )* >_tag :>> EndCdata >tagc @{ ELE(cdata); fgoto main; };
+  html_cdata := |*
+    EndCdata @{ EBLK(cdata, 3); fgoto main; };
+    any | newline { TEXT_PASS(); };
+  *|;
-  html_procins := (any | newline )* >_tag :>> EndXmlProcIns >tagc @{ ELE(procins); fgoto main; };
+  html_procins := |*
+    EndXmlProcIns @{ EBLK(procins, 2); fgoto main; };
+    any | newline { TEXT_PASS(); };
+  *|;
   main := |*
     XmlDecl >newEle { ELE(xmldecl); };
@@ -131,23 +169,7 @@ static ID s_read, s_to_str;
     EmptyTag >newEle { ELE(emptytag); };
     StartComment >newEle { fgoto html_comment; };
     StartCdata >newEle { fgoto html_cdata; };
-    any | newline {
-      if (text == 0)
-      {
-        if (ele_open == 1) {
-          ele_open = 0;
-          if (tokstart > 0) {
-            mark_tag = tokstart;
-          }
-        } else {
-          mark_tag = p;
-        }
-        attr = Qnil;
-        tag = Qnil;
-        text = 1;
-      }
-    };
+    any | newline { TEXT_PASS(); };
   *|;
 }%%
@@ -173,13 +195,12 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
 VALUE hpricot_scan(VALUE self, VALUE port)
 {
-  static char buf[BUFSIZE];
   int cs, act, have = 0, nread = 0, curline = 1, text = 0;
-  char *tokstart = 0, *tokend = 0;
+  char *tokstart = 0, *tokend = 0, *buf = NULL;
-  VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil;
+  VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
   char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
-  int done = 0, ele_open = 0;
+  int done = 0, ele_open = 0, buffer_size = 0;
   int taint = OBJ_TAINTED( port );
   if ( !rb_respond_to( port, s_read ) )
@@ -195,18 +216,27 @@ VALUE hpricot_scan(VALUE self, VALUE port)
     }
   }
+  buffer_size = BUFSIZE;
+  if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
+    bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
+    if (!NIL_P(bufsize)) {
+      buffer_size = NUM2INT(bufsize);
+    }
+  }
+  buf = ALLOC_N(char, buffer_size);
   %% write init;
   while ( !done ) {
     VALUE str;
     char *p = buf + have, *pe;
-    int len, space = BUFSIZE - have;
+    int len, space = buffer_size - have;
     if ( space == 0 ) {
       /* We've used up the entire buffer storing an already-parsed token
-       * prefix that must be preserved. */
-      fprintf(stderr, "OUT OF BUFFER SPACE\n" );
-      exit(1);
+       * prefix that must be preserved.  Likely caused by super-long attributes.
+       * See ticket #13. */
+      rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING(tag)->ptr, curline);
     }
     if ( rb_respond_to( port, s_read ) )
@@ -233,8 +263,15 @@ VALUE hpricot_scan(VALUE self, VALUE port)
     %% write exec;
     if ( cs == hpricot_scan_error ) {
-      fprintf(stderr, "PARSE ERROR\n" );
-      break;
+      free(buf);
+      if ( !NIL_P(tag) )
+      {
+        rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING(tag)->ptr, curline);
+      }
+      else
+      {
+        rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
+      }
     }
     if ( done && ele_open )
@@ -279,12 +316,15 @@ VALUE hpricot_scan(VALUE self, VALUE port)
       tokstart = buf;
     }
   }
+  free(buf);
 }
 void Init_hpricot_scan()
 {
   VALUE mHpricot = rb_define_module("Hpricot");
+  rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
   rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1);
+  rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eException);
   s_read = rb_intern("read");
   s_to_str = rb_intern("to_str");

data/lib/hpricot.rb CHANGED

@@ -1,3 +1,22 @@
+# == About hpricot.rb
+#
+# All of Hpricot's various parts are loaded when you use <tt>require 'hpricot'</tt>.
+#
+# * hpricot_scan: the scanner (a C extension for Ruby) which turns an HTML stream into tokens.
+# * hpricot/parse.rb: uses the scanner to sort through tokens and give you back a complete document object.
+# * hpricot/tag.rb: sets up objects for the various types of elements in an HTML document.
+# * hpricot/modules.rb: categorizes the various elements using mixins.
+# * hpricot/traverse.rb: methods for searching documents.
+# * hpricot/elements.rb: methods for dealing with a group of elements as an Hpricot::Elements list.
+# * hpricot/inspect.rb: methods for displaying documents in a readable form.
+# If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
+# See http://git.bitwi.se/ruby-character-encodings.git/.
+begin
+  require 'encoding/character/utf-8'
+rescue LoadError
+end
 require 'hpricot_scan'
 require 'hpricot/tag'
 require 'hpricot/modules'

data/lib/hpricot/elements.rb CHANGED

@@ -1,66 +1,163 @@
 module Hpricot
+# Once you've matched a list of elements, you will often need to handle them as
+# a group.  Or you may want to perform the same action on each of them.
+# Hpricot::Elements is an extension of Ruby's array class, with some methods
+# added for altering elements contained in the array.
+#
+# If you need to create an element array from regular elements:
+#
+#   Hpricot::Elements[ele1, ele2, ele3]
+#
+# Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem,
+# Hpricot::Doc, etc.)
+#
+# == Continuing Searches
+#
+# Usually the Hpricot::Elements you're working on comes from a search you've
+# done.  Well, you can continue searching the list by using the same <tt>at</tt>
+# and <tt>search</tt> methods you can use on plain elements.
+#
+#   elements = doc.search("/div/p")
+#   elements = elements.search("/a[@href='http://hoodwink.d/']")
+#   elements = elements.at("img")
+#
+# == Altering Elements
+#
+# When you're altering elements in the list, your changes will be reflected in
+# the document you started searching from.
+#
+#   doc = Hpricot("That's my <b>spoon</b>, Tyler.")
+#   doc.at("b").swap("<i>fork</i>")
+#   doc.to_html
+#     #=> "That's my <i>fork</i>, Tyler."
+#
+# == Getting More Detailed
+#
+# If you can't find a method here that does what you need, you may need to
+# loop through the elements and find a method in Hpricot::Container::Trav
+# which can do what you need.
+#
+# For example, you may want to search for all the H3 header tags in a document
+# and grab all the tags underneath the header, but not inside the header.
+# A good method for this is <tt>next_sibling</tt>:
+#
+#   doc.search("h3").each do |h3|
+#     while ele = h3.next_sibling
+#       ary << ele   # stuff away all the elements under the h3
+#     end
+#   end
+#
+# Most of the useful element methods are in the mixins Hpricot::Traverse
+# and Hpricot::Container::Trav.
   class Elements < Array
+    # Searches this list for any elements (or children of these elements) matching
+    # the CSS or XPath expression +expr+.  Root is assumed to be the element scanned.
+    #
+    # See Hpricot::Container::Trav.search for more.
     def search(*expr,&blk)
-      map { |x| x.search(*expr,&blk) }.flatten.uniq
+      Elements[*map { |x| x.search(*expr,&blk) }.flatten.uniq]
     end
     alias_method :/, :search
+    # Searches this list for the first element (or child of these elements) matching
+    # the CSS or XPath expression +expr+.  Root is assumed to be the element scanned.
+    #
+    # See Hpricot::Container::Trav.at for more.
     def at(expr, &blk)
       search(expr, &blk).first
     end
     alias_method :%, :at
+    # Convert this group of elements into a complete HTML fragment, returned as a
+    # string.
     def to_html
       map { |x| x.output("") }.join
     end
     alias_method :to_s, :to_html
-    def inner_html(*str)
-      if str.empty?
+    # Returns an HTML fragment built of the contents of each element in this list.
+    #
+    # If an HTML +string+ is supplied, this method acts like inner_html=.
+    def inner_html(*string)
+      if string.empty?
         map { |x| x.inner_html }.join
       else
-        x = self.inner_html = str.pop || x
+        x = self.inner_html = string.pop || x
       end
     end
-    alias_method :text, :inner_html
     alias_method :html, :inner_html
     alias_method :innerHTML, :inner_html
-    def inner_html=(str)
-      each { |x| x.inner_html = str }
+    # Replaces the contents of each element in this list.  Supply an HTML +string+,
+    # which is loaded into Hpricot objects and inserted into every element in this
+    # list.
+    def inner_html=(string)
+      each { |x| x.inner_html = string }
     end
     alias_method :html=, :inner_html=
     alias_method :innerHTML=, :inner_html=
-    def filter(expr)
-        nodes, = Elements.filter(self, expr)
-        nodes
+    # Returns a string containing the text contents of each element in this list.
+    # All HTML tags are removed.
+    def inner_text
+      map { |x| x.inner_text }.join
     end
+    alias_method :text, :inner_text
+    # Remove all elements in this list from the document which contains them.
+    #
+    #   doc = Hpricot("<html>Remove this: <b>here</b></html>")
+    #   doc.search("b").remove
+    #   doc.to_html
+    #     => "<html>Remove this: </html>"
+    #
     def remove
       each { |x| x.parent.children.delete(x) }
     end
+    # Empty the elements in this list, by removing their insides.
+    #
+    #   doc = Hpricot("<p> We have <i>so much</i> to say.</p>")
+    #   doc.search("i").empty
+    #   doc.to_html
+    #     => "<p> We have <i></i> to say.</p>"
+    #
     def empty
       each { |x| x.inner_html = nil }
     end
+    # Add to the end of the contents inside each element in this list.
+    # Pass in an HTML +str+, which is turned into Hpricot elements.
     def append(str)
       each { |x| x.inner_html += str }
     end
+    # Add to the start of the contents inside each element in this list.
+    # Pass in an HTML +str+, which is turned into Hpricot elements.
     def prepend(str)
       each { |x| x.inner_html = str + x.inner_html }
     end
+    # Add some HTML just previous to each element in this list.
+    # Pass in an HTML +str+, which is turned into Hpricot elements.
     def before(str)
       each { |x| x.parent.insert_before Hpricot.make(str), x }
     end
+    # Just after each element in this list, add some HTML.
+    # Pass in an HTML +str+, which is turned into Hpricot elements.
     def after(str)
       each { |x| x.parent.insert_after Hpricot.make(str), x }
     end
+    # Wraps each element in the list inside the element created by HTML +str+.
+    # If more than one element is found in the string, Hpricot locates the
+    # deepest spot inside the first element.
+    #
+    #  doc.search("a[@href]").
+    #      wrap(%{<div class="link"><div class="link_inner"></div></div>})
+    #
+    # This code wraps every link on the page inside a +div.link+ and a +div.link_inner+ nest.
     def wrap(str)
       each do |x|
         wrap = Hpricot.make(str)
@@ -74,15 +171,15 @@ module Hpricot
       end
     end
-    def not(expr)
-        if expr.is_a? Container::Trav
-            nodes = self - [expr]
-        else
-            nodes, = Elements.filter(self, expr, false)
-        end
-        nodes
-    end
+    # Sets an attribute for all elements in this list.  You may use
+    # a simple pair (<em>attribute name</em>, <em>attribute value</em>):
+    #
+    #   doc.search('p').set(:class, 'outline')
+    #
+    # Or, use a hash of pairs:
+    #
+    #   doc.search('div#sidebar').set(:class => 'outline', :id => 'topbar')
+    #
     def set(k, v = nil)
       case k
       when Hash
@@ -96,9 +193,9 @@ module Hpricot
       end
     end
-    ATTR_RE = %r!\[ *(@)([a-zA-Z0-9\(\)_-]+) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i
+    ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i
     BRACK_RE = %r!(\[) *([^\]]*) *\]!i
-    FUNC_RE = %r!(:)([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)'\"]*)['\"]? *\)!
+    FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
     CATCH_RE = %r!([:\.#]*)([a-zA-Z0-9\*_-]+)!
     def self.filter(nodes, expr, truth = true)
@@ -112,16 +209,20 @@ module Hpricot
                 m[0] = "@#{m.slice!(2,1)}"
             end
+            if m[0] == '[' && m[1] =~ /^\d+$/
+                m = [":", "nth", m[1].to_i-1]
+            end
             if m[0] == ":" && m[1] == "not"
                 nodes, = Elements.filter(nodes, m[2], false)
             else
-                meth = "filter[#{m[0]}]"
-                if Container::Trav.method_defined? meth
-                    args = m[1..-1]
+                meth = "filter[#{m[0]}#{m[1]}]"
+                if Traverse.method_defined? meth
+                    args = m[2..-1]
                 else
-                    meth = "filter[#{m[0]}#{m[1]}]"
-                    if Container::Trav.method_defined? meth
-                        args = m[2..-1]
+                    meth = "filter[#{m[0]}]"
+                    if Traverse.method_defined? meth
+                        args = m[1..-1]
                     end
                 end
                 i = -1
@@ -134,7 +235,19 @@ module Hpricot
         [nodes, expr]
     end
-    def inspect; "#<#{self.class}#{super}>" end
+    def filter(expr)
+        nodes, = Elements.filter(self, expr)
+        nodes
+    end
+    def not(expr)
+        if expr.is_a? Traverse
+            nodes = self - [expr]
+        else
+            nodes, = Elements.filter(self, expr, false)
+        end
+        nodes
+    end
     private
     def copy_node(node, l)
@@ -145,50 +258,51 @@ module Hpricot
   end
-  module Container::Trav
+  module Traverse
     def self.filter(tok, &blk)
       define_method("filter[#{tok.is_a?(String) ? tok : tok.inspect}]", &blk)
     end
     filter '' do |name,i|
-      name == '*' || self.name.downcase == name.downcase
+      name == '*' || (self.respond_to?(:name) && self.name.downcase == name.downcase)
     end
     filter '#' do |id,i|
-      get_attribute('id').to_s == id
+      self.elem? and get_attribute('id').to_s == id
     end
     filter '.' do |name,i|
-      classes.include? name
+      self.elem? and classes.include? name
     end
     filter :lt do |num,i|
-      parent.containers.index(self) < num.to_i
+      self.position < num.to_i
     end
     filter :gt do |num,i|
-      parent.containers.index(self) > num.to_i
+      self.position > num.to_i
     end
-    nth = proc { |num,i| parent.containers.index(self) == num.to_i }
+    nth = proc { |num,i| self.position == num.to_i }
+    nth_first = proc { |*a| self.position == 0 }
+    nth_last = proc { |*a| self == parent.children_of_type(self.name).last }
     filter :nth, &nth
     filter :eq, &nth
+    filter ":nth-of-type", &nth
-    filter :first do |num,i|
-      parent.containers.index(self) == 0
-    end
+    filter :first, &nth_first
+    filter ":first-of-type", &nth_first
-    filter :last do |i|
-      self == parent.containers.last
-    end
+    filter :last, &nth_last
+    filter ":last-of-type", &nth_last
     filter :even do |num,i|
-      parent.containers.index(self) % 2 == 0
+      self.position % 2 == 0
     end
     filter :odd do |num,i|
-      parent.containers.index(self) % 2 == 1
+      self.position % 2 == 1
     end
     filter ':first-child' do |i|
@@ -204,32 +318,19 @@ module Hpricot
     end
     filter ":last-child" do |i|
-      self == parent.containers.first
+      self == parent.containers.last
     end
     filter ":nth-last-child" do |arg,i|
       self == parent.containers[-1-arg.to_i]
     end
-    filter ":first-of-type" do |i|
-      self == parent.containers.detect { |x| x.name == arg }
-    end
-    filter ":nth-of-type" do |arg,i|
-      self == parent.containers.find_all { |x| x.name == arg }[arg.to_i]
-    end
-    filter ":last-of-type" do |i|
-      self == parent.containers.find_all { |x| x.name == self.name }.last
-    end
-    filter :"nth-last-of-type" do |arg,i|
-      self == parent.containers.find_all { |x| x.name == arg }[-1-arg.to_i]
+    filter ":nth-last-of-type" do |arg,i|
+      self == parent.children_of_type(self.name)[-1-arg.to_i]
     end
     filter ":only-of-type" do |arg,i|
-      of_type = parent.containers.find_all { |x| x.name == arg }
-      of_type.length == 1
+      parent.children_of_type(self.name).length == 1
     end
     filter ":only-child" do |arg,i|
@@ -237,55 +338,61 @@ module Hpricot
     end
     filter :parent do
-      childNodes.length > 0
+      containers.length > 0
     end
     filter :empty do
-      childNodes.length == 0
+      containers.length == 0
     end
     filter :root do
       self.is_a? Hpricot::Doc
     end
-    filter :contains do |arg,|
-      html.include? arg
-    end
-    filter '@=' do |attr,val,i|
-      get_attribute(attr).to_s == val
-    end
-    filter '@!=' do |attr,val,i|
-      get_attribute(attr).to_s != val
-    end
-    filter '@~=' do |attr,val,i|
-      get_attribute(attr).to_s.split(/\s+/).include? val
+    filter 'text' do
+      self.text?
     end
-    filter '@|=' do |attr,val,i|
-      get_attribute(attr).to_s =~ /^#{Regexp::quote val}(-|$)/
+    filter 'comment' do
+      self.comment?
     end
-    filter '@^=' do |attr,val,i|
-      get_attribute(attr).to_s.index(val) == 0
+    filter :contains do |arg,|
+      html.include? arg
     end
-    filter '@$=' do |attr,val,i|
-      get_attribute(attr).to_s =~ /#{Regexp::quote val}$/
+    pred_procs =
+      {'text()' => proc { |ele, *_| ele.inner_text.strip },
+       '@'      => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? }}
+    oper_procs =
+      {'='      => proc { |a,b| a == b },
+       '!='     => proc { |a,b| a != b },
+       '~='     => proc { |a,b| a.split(/\s+/).include?(b) },
+       '|='     => proc { |a,b| a =~ /^#{Regexp::quote b}(-|$)/ },
+       '^='     => proc { |a,b| a.index(b) == 0 },
+       '$='     => proc { |a,b| a =~ /#{Regexp::quote b}$/ },
+       '*='     => proc { |a,b| idx = a.index(b) }}
+    pred_procs.each do |pred_n, pred_f|
+      oper_procs.each do |oper_n, oper_f|
+        filter "#{pred_n}#{oper_n}" do |*a|
+          qual = pred_f[self, *a]
+          oper_f[qual, a[-2]] if qual
+        end
+      end
     end
-    filter '@*=' do |attr,val,i|
-      get_attribute(attr).to_s.index(val) >= 0
+    filter 'text()' do |val,i|
+      !self.inner_text.strip.empty?
     end
     filter '@' do |attr,val,i|
-      has_attribute? attr
+      self.elem? and has_attribute? attr
     end
     filter '[' do |val,i|
-      search(val).length > 0
+      self.elem? and search(val).length > 0
     end
   end