hpricot 0.4-mswin32 → 0.5-mswin32

Sign up to get free protection for your applications and to get access to all the features.
@@ -8,14 +8,21 @@
8
8
  */
9
9
  #include <ruby.h>
10
10
 
11
+ #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
12
+
11
13
  static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
12
14
  sym_cdata, sym_text;
15
+ static VALUE rb_eHpricotParseError;
13
16
  static ID s_read, s_to_str;
14
17
 
15
18
  #define ELE(N) \
16
- if (tokend > tokstart) { \
17
- ele_open = 0; \
18
- rb_yield_tokens(sym_##N, tag, attr, tokstart == 0 ? Qnil : rb_str_new(tokstart, tokend-tokstart), taint); \
19
+ if (tokend > tokstart || text == 1) { \
20
+ VALUE raw_string = Qnil; \
21
+ ele_open = 0; text = 0; \
22
+ if (tokstart != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
23
+ raw_string = rb_str_new(tokstart, tokend-tokstart); \
24
+ } \
25
+ rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
19
26
  }
20
27
 
21
28
  #define SET(N, E) \
@@ -34,6 +41,24 @@ static ID s_read, s_to_str;
34
41
  rb_hash_aset(attr, K, V); \
35
42
  }
36
43
 
44
+ #define TEXT_PASS() \
45
+ if (text == 0) \
46
+ { \
47
+ if (ele_open == 1) { \
48
+ ele_open = 0; \
49
+ if (tokstart > 0) { \
50
+ mark_tag = tokstart; \
51
+ } \
52
+ } else { \
53
+ mark_tag = p; \
54
+ } \
55
+ attr = Qnil; \
56
+ tag = Qnil; \
57
+ text = 1; \
58
+ }
59
+
60
+ #define EBLK(N, T) CAT(tag, p - T + 1); ELE(N);
61
+
37
62
  %%{
38
63
  machine hpricot_scan;
39
64
 
@@ -55,6 +80,10 @@ static ID s_read, s_to_str;
55
80
  action tag { SET(tag, p); }
56
81
  action tagc { SET(tag, p-1); }
57
82
  action aval { SET(aval, p); }
83
+ action aunq {
84
+ if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
85
+ else { SET(aval, p); }
86
+ }
58
87
  action akey { SET(akey, p); }
59
88
  action xmlver { SET(aval, p); ATTR(rb_str_new2("version"), aval); }
60
89
  action xmlenc { SET(aval, p); ATTR(rb_str_new2("encoding"), aval); }
@@ -79,7 +108,7 @@ static ID s_read, s_to_str;
79
108
  #
80
109
  newline = '\n' @{curline += 1;} ;
81
110
  # qtext = '"' ( '\"' | [^\n"] )* '"' | "'" ( "\\'" | [^\n'] )* "'" ;
82
- NameChar = [\-A-Za-z0-9._:] ;
111
+ NameChar = [\-A-Za-z0-9._:?] ;
83
112
  Name = [A-Za-z_:] NameChar* ;
84
113
  StartComment = "<!--" ;
85
114
  EndComment = "-->" ;
@@ -87,14 +116,14 @@ static ID s_read, s_to_str;
87
116
  EndCdata = "]]>" ;
88
117
 
89
118
  NameCap = Name >_tag %tag;
90
- NameAttr = Name >_akey %akey ;
119
+ NameAttr = NameChar+ >_akey %akey ;
91
120
  Q1Attr = [^']* >_aval %aval ;
92
121
  Q2Attr = [^"]* >_aval %aval ;
93
- UnqAttr = [^ \t\n<>"'] >_aval [^ \t\n<>]* %aval ;
122
+ UnqAttr = ( space >_aval | [^ \t\n<>"'] >_aval [^ \t\n<>]* %aunq ) ;
94
123
  Nmtoken = NameChar+ >_akey %akey ;
95
124
 
96
125
  Attr = NameAttr space* "=" space* ('"' Q2Attr '"' | "'" Q1Attr "'" | UnqAttr space+ ) space* ;
97
- AttrEnd = ( NameAttr space* "=" space* UnqAttr | Nmtoken >new_attr %save_attr ) ;
126
+ AttrEnd = ( NameAttr space* "=" space* UnqAttr? | Nmtoken >new_attr %save_attr ) ;
98
127
  AttrSet = ( Attr >new_attr %save_attr | Nmtoken >new_attr space+ %save_attr ) ;
99
128
  StartTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? ">" | "<" NameCap ">";
100
129
  EmptyTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? "/>" | "<" NameCap "/>" ;
@@ -113,14 +142,23 @@ static ID s_read, s_to_str;
113
142
  "'" [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]* >_aval %pubid "'" ;
114
143
  ExternalID = ( "SYSTEM" | "PUBLIC" space+ PubidLiteral ) (space+ SystemLiteral)? ;
115
144
  DocType = "<!DOCTYPE" space+ NameCap (space+ ExternalID)? space* ("[" [^\]]* "]" space*)? ">" ;
116
- StartXmlProcIns = "<?" Name space+ ;
145
+ StartXmlProcIns = "<?" Name >{ TEXT_PASS(); } space+ ;
117
146
  EndXmlProcIns = "?>" ;
118
147
 
119
- html_comment := (any | newline )* >_tag :>> EndComment >tagc @{ ELE(comment); fgoto main; };
148
+ html_comment := |*
149
+ EndComment @{ EBLK(comment, 3); fgoto main; };
150
+ any | newline { TEXT_PASS(); };
151
+ *|;
120
152
 
121
- html_cdata := (any | newline )* >_tag :>> EndCdata >tagc @{ ELE(cdata); fgoto main; };
153
+ html_cdata := |*
154
+ EndCdata @{ EBLK(cdata, 3); fgoto main; };
155
+ any | newline { TEXT_PASS(); };
156
+ *|;
122
157
 
123
- html_procins := (any | newline )* >_tag :>> EndXmlProcIns >tagc @{ ELE(procins); fgoto main; };
158
+ html_procins := |*
159
+ EndXmlProcIns @{ EBLK(procins, 2); fgoto main; };
160
+ any | newline { TEXT_PASS(); };
161
+ *|;
124
162
 
125
163
  main := |*
126
164
  XmlDecl >newEle { ELE(xmldecl); };
@@ -131,23 +169,7 @@ static ID s_read, s_to_str;
131
169
  EmptyTag >newEle { ELE(emptytag); };
132
170
  StartComment >newEle { fgoto html_comment; };
133
171
  StartCdata >newEle { fgoto html_cdata; };
134
-
135
- any | newline {
136
- if (text == 0)
137
- {
138
- if (ele_open == 1) {
139
- ele_open = 0;
140
- if (tokstart > 0) {
141
- mark_tag = tokstart;
142
- }
143
- } else {
144
- mark_tag = p;
145
- }
146
- attr = Qnil;
147
- tag = Qnil;
148
- text = 1;
149
- }
150
- };
172
+ any | newline { TEXT_PASS(); };
151
173
  *|;
152
174
  }%%
153
175
 
@@ -173,13 +195,12 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
173
195
 
174
196
  VALUE hpricot_scan(VALUE self, VALUE port)
175
197
  {
176
- static char buf[BUFSIZE];
177
198
  int cs, act, have = 0, nread = 0, curline = 1, text = 0;
178
- char *tokstart = 0, *tokend = 0;
199
+ char *tokstart = 0, *tokend = 0, *buf = NULL;
179
200
 
180
- VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil;
201
+ VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
181
202
  char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
182
- int done = 0, ele_open = 0;
203
+ int done = 0, ele_open = 0, buffer_size = 0;
183
204
 
184
205
  int taint = OBJ_TAINTED( port );
185
206
  if ( !rb_respond_to( port, s_read ) )
@@ -195,18 +216,27 @@ VALUE hpricot_scan(VALUE self, VALUE port)
195
216
  }
196
217
  }
197
218
 
219
+ buffer_size = BUFSIZE;
220
+ if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
221
+ bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
222
+ if (!NIL_P(bufsize)) {
223
+ buffer_size = NUM2INT(bufsize);
224
+ }
225
+ }
226
+ buf = ALLOC_N(char, buffer_size);
227
+
198
228
  %% write init;
199
229
 
200
230
  while ( !done ) {
201
231
  VALUE str;
202
232
  char *p = buf + have, *pe;
203
- int len, space = BUFSIZE - have;
233
+ int len, space = buffer_size - have;
204
234
 
205
235
  if ( space == 0 ) {
206
236
  /* We've used up the entire buffer storing an already-parsed token
207
- * prefix that must be preserved. */
208
- fprintf(stderr, "OUT OF BUFFER SPACE\n" );
209
- exit(1);
237
+ * prefix that must be preserved. Likely caused by super-long attributes.
238
+ * See ticket #13. */
239
+ rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING(tag)->ptr, curline);
210
240
  }
211
241
 
212
242
  if ( rb_respond_to( port, s_read ) )
@@ -233,8 +263,15 @@ VALUE hpricot_scan(VALUE self, VALUE port)
233
263
  %% write exec;
234
264
 
235
265
  if ( cs == hpricot_scan_error ) {
236
- fprintf(stderr, "PARSE ERROR\n" );
237
- break;
266
+ free(buf);
267
+ if ( !NIL_P(tag) )
268
+ {
269
+ rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING(tag)->ptr, curline);
270
+ }
271
+ else
272
+ {
273
+ rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
274
+ }
238
275
  }
239
276
 
240
277
  if ( done && ele_open )
@@ -279,12 +316,15 @@ VALUE hpricot_scan(VALUE self, VALUE port)
279
316
  tokstart = buf;
280
317
  }
281
318
  }
319
+ free(buf);
282
320
  }
283
321
 
284
322
  void Init_hpricot_scan()
285
323
  {
286
324
  VALUE mHpricot = rb_define_module("Hpricot");
325
+ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
287
326
  rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1);
327
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eException);
288
328
 
289
329
  s_read = rb_intern("read");
290
330
  s_to_str = rb_intern("to_str");
@@ -1,3 +1,22 @@
1
+ # == About hpricot.rb
2
+ #
3
+ # All of Hpricot's various parts are loaded when you use <tt>require 'hpricot'</tt>.
4
+ #
5
+ # * hpricot_scan: the scanner (a C extension for Ruby) which turns an HTML stream into tokens.
6
+ # * hpricot/parse.rb: uses the scanner to sort through tokens and give you back a complete document object.
7
+ # * hpricot/tag.rb: sets up objects for the various types of elements in an HTML document.
8
+ # * hpricot/modules.rb: categorizes the various elements using mixins.
9
+ # * hpricot/traverse.rb: methods for searching documents.
10
+ # * hpricot/elements.rb: methods for dealing with a group of elements as an Hpricot::Elements list.
11
+ # * hpricot/inspect.rb: methods for displaying documents in a readable form.
12
+
13
+ # If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
14
+ # See http://git.bitwi.se/ruby-character-encodings.git/.
15
+ begin
16
+ require 'encoding/character/utf-8'
17
+ rescue LoadError
18
+ end
19
+
1
20
  require 'hpricot_scan'
2
21
  require 'hpricot/tag'
3
22
  require 'hpricot/modules'
@@ -1,66 +1,163 @@
1
1
  module Hpricot
2
+ # Once you've matched a list of elements, you will often need to handle them as
3
+ # a group. Or you may want to perform the same action on each of them.
4
+ # Hpricot::Elements is an extension of Ruby's array class, with some methods
5
+ # added for altering elements contained in the array.
6
+ #
7
+ # If you need to create an element array from regular elements:
8
+ #
9
+ # Hpricot::Elements[ele1, ele2, ele3]
10
+ #
11
+ # Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem,
12
+ # Hpricot::Doc, etc.)
13
+ #
14
+ # == Continuing Searches
15
+ #
16
+ # Usually the Hpricot::Elements you're working on comes from a search you've
17
+ # done. Well, you can continue searching the list by using the same <tt>at</tt>
18
+ # and <tt>search</tt> methods you can use on plain elements.
19
+ #
20
+ # elements = doc.search("/div/p")
21
+ # elements = elements.search("/a[@href='http://hoodwink.d/']")
22
+ # elements = elements.at("img")
23
+ #
24
+ # == Altering Elements
25
+ #
26
+ # When you're altering elements in the list, your changes will be reflected in
27
+ # the document you started searching from.
28
+ #
29
+ # doc = Hpricot("That's my <b>spoon</b>, Tyler.")
30
+ # doc.at("b").swap("<i>fork</i>")
31
+ # doc.to_html
32
+ # #=> "That's my <i>fork</i>, Tyler."
33
+ #
34
+ # == Getting More Detailed
35
+ #
36
+ # If you can't find a method here that does what you need, you may need to
37
+ # loop through the elements and find a method in Hpricot::Container::Trav
38
+ # which can do what you need.
39
+ #
40
+ # For example, you may want to search for all the H3 header tags in a document
41
+ # and grab all the tags underneath the header, but not inside the header.
42
+ # A good method for this is <tt>next_sibling</tt>:
43
+ #
44
+ # doc.search("h3").each do |h3|
45
+ # while ele = h3.next_sibling
46
+ # ary << ele # stuff away all the elements under the h3
47
+ # end
48
+ # end
49
+ #
50
+ # Most of the useful element methods are in the mixins Hpricot::Traverse
51
+ # and Hpricot::Container::Trav.
2
52
  class Elements < Array
53
+ # Searches this list for any elements (or children of these elements) matching
54
+ # the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
55
+ #
56
+ # See Hpricot::Container::Trav.search for more.
3
57
  def search(*expr,&blk)
4
- map { |x| x.search(*expr,&blk) }.flatten.uniq
58
+ Elements[*map { |x| x.search(*expr,&blk) }.flatten.uniq]
5
59
  end
6
60
  alias_method :/, :search
7
61
 
62
+ # Searches this list for the first element (or child of these elements) matching
63
+ # the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
64
+ #
65
+ # See Hpricot::Container::Trav.at for more.
8
66
  def at(expr, &blk)
9
67
  search(expr, &blk).first
10
68
  end
11
69
  alias_method :%, :at
12
70
 
71
+ # Convert this group of elements into a complete HTML fragment, returned as a
72
+ # string.
13
73
  def to_html
14
74
  map { |x| x.output("") }.join
15
75
  end
16
76
  alias_method :to_s, :to_html
17
77
 
18
- def inner_html(*str)
19
- if str.empty?
78
+ # Returns an HTML fragment built of the contents of each element in this list.
79
+ #
80
+ # If an HTML +string+ is supplied, this method acts like inner_html=.
81
+ def inner_html(*string)
82
+ if string.empty?
20
83
  map { |x| x.inner_html }.join
21
84
  else
22
- x = self.inner_html = str.pop || x
85
+ x = self.inner_html = string.pop || x
23
86
  end
24
87
  end
25
- alias_method :text, :inner_html
26
88
  alias_method :html, :inner_html
27
89
  alias_method :innerHTML, :inner_html
28
90
 
29
- def inner_html=(str)
30
- each { |x| x.inner_html = str }
91
+ # Replaces the contents of each element in this list. Supply an HTML +string+,
92
+ # which is loaded into Hpricot objects and inserted into every element in this
93
+ # list.
94
+ def inner_html=(string)
95
+ each { |x| x.inner_html = string }
31
96
  end
32
97
  alias_method :html=, :inner_html=
33
98
  alias_method :innerHTML=, :inner_html=
34
99
 
35
- def filter(expr)
36
- nodes, = Elements.filter(self, expr)
37
- nodes
100
+ # Returns a string containing the text contents of each element in this list.
101
+ # All HTML tags are removed.
102
+ def inner_text
103
+ map { |x| x.inner_text }.join
38
104
  end
105
+ alias_method :text, :inner_text
39
106
 
107
+ # Remove all elements in this list from the document which contains them.
108
+ #
109
+ # doc = Hpricot("<html>Remove this: <b>here</b></html>")
110
+ # doc.search("b").remove
111
+ # doc.to_html
112
+ # => "<html>Remove this: </html>"
113
+ #
40
114
  def remove
41
115
  each { |x| x.parent.children.delete(x) }
42
116
  end
43
117
 
118
+ # Empty the elements in this list, by removing their insides.
119
+ #
120
+ # doc = Hpricot("<p> We have <i>so much</i> to say.</p>")
121
+ # doc.search("i").empty
122
+ # doc.to_html
123
+ # => "<p> We have <i></i> to say.</p>"
124
+ #
44
125
  def empty
45
126
  each { |x| x.inner_html = nil }
46
127
  end
47
128
 
129
+ # Add to the end of the contents inside each element in this list.
130
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
48
131
  def append(str)
49
132
  each { |x| x.inner_html += str }
50
133
  end
51
134
 
135
+ # Add to the start of the contents inside each element in this list.
136
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
52
137
  def prepend(str)
53
138
  each { |x| x.inner_html = str + x.inner_html }
54
139
  end
55
-
140
+
141
+ # Add some HTML just before each element in this list.
142
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
56
143
  def before(str)
57
144
  each { |x| x.parent.insert_before Hpricot.make(str), x }
58
145
  end
59
146
 
147
+ # Just after each element in this list, add some HTML.
148
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
60
149
  def after(str)
61
150
  each { |x| x.parent.insert_after Hpricot.make(str), x }
62
151
  end
63
152
 
153
+ # Wraps each element in the list inside the element created by HTML +str+.
154
+ # If more than one element is found in the string, Hpricot locates the
155
+ # deepest spot inside the first element.
156
+ #
157
+ # doc.search("a[@href]").
158
+ # wrap(%{<div class="link"><div class="link_inner"></div></div>})
159
+ #
160
+ # This code wraps every link on the page inside a +div.link+ and a +div.link_inner+ nest.
64
161
  def wrap(str)
65
162
  each do |x|
66
163
  wrap = Hpricot.make(str)
@@ -74,15 +171,15 @@ module Hpricot
74
171
  end
75
172
  end
76
173
 
77
- def not(expr)
78
- if expr.is_a? Container::Trav
79
- nodes = self - [expr]
80
- else
81
- nodes, = Elements.filter(self, expr, false)
82
- end
83
- nodes
84
- end
85
-
174
+ # Sets an attribute for all elements in this list. You may use
175
+ # a simple pair (<em>attribute name</em>, <em>attribute value</em>):
176
+ #
177
+ # doc.search('p').set(:class, 'outline')
178
+ #
179
+ # Or, use a hash of pairs:
180
+ #
181
+ # doc.search('div#sidebar').set(:class => 'outline', :id => 'topbar')
182
+ #
86
183
  def set(k, v = nil)
87
184
  case k
88
185
  when Hash
@@ -96,9 +193,9 @@ module Hpricot
96
193
  end
97
194
  end
98
195
 
99
- ATTR_RE = %r!\[ *(@)([a-zA-Z0-9\(\)_-]+) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i
196
+ ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i
100
197
  BRACK_RE = %r!(\[) *([^\]]*) *\]!i
101
- FUNC_RE = %r!(:)([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)'\"]*)['\"]? *\)!
198
+ FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
102
199
  CATCH_RE = %r!([:\.#]*)([a-zA-Z0-9\*_-]+)!
103
200
 
104
201
  def self.filter(nodes, expr, truth = true)
@@ -112,16 +209,20 @@ module Hpricot
112
209
  m[0] = "@#{m.slice!(2,1)}"
113
210
  end
114
211
 
212
+ if m[0] == '[' && m[1] =~ /^\d+$/
213
+ m = [":", "nth", m[1].to_i-1]
214
+ end
215
+
115
216
  if m[0] == ":" && m[1] == "not"
116
217
  nodes, = Elements.filter(nodes, m[2], false)
117
218
  else
118
- meth = "filter[#{m[0]}]"
119
- if Container::Trav.method_defined? meth
120
- args = m[1..-1]
219
+ meth = "filter[#{m[0]}#{m[1]}]"
220
+ if Traverse.method_defined? meth
221
+ args = m[2..-1]
121
222
  else
122
- meth = "filter[#{m[0]}#{m[1]}]"
123
- if Container::Trav.method_defined? meth
124
- args = m[2..-1]
223
+ meth = "filter[#{m[0]}]"
224
+ if Traverse.method_defined? meth
225
+ args = m[1..-1]
125
226
  end
126
227
  end
127
228
  i = -1
@@ -134,7 +235,19 @@ module Hpricot
134
235
  [nodes, expr]
135
236
  end
136
237
 
137
- def inspect; "#<#{self.class}#{super}>" end
238
+ def filter(expr)
239
+ nodes, = Elements.filter(self, expr)
240
+ nodes
241
+ end
242
+
243
+ def not(expr)
244
+ if expr.is_a? Traverse
245
+ nodes = self - [expr]
246
+ else
247
+ nodes, = Elements.filter(self, expr, false)
248
+ end
249
+ nodes
250
+ end
138
251
 
139
252
  private
140
253
  def copy_node(node, l)
@@ -145,50 +258,51 @@ module Hpricot
145
258
 
146
259
  end
147
260
 
148
- module Container::Trav
261
+ module Traverse
149
262
  def self.filter(tok, &blk)
150
263
  define_method("filter[#{tok.is_a?(String) ? tok : tok.inspect}]", &blk)
151
264
  end
152
265
 
153
266
  filter '' do |name,i|
154
- name == '*' || self.name.downcase == name.downcase
267
+ name == '*' || (self.respond_to?(:name) && self.name.downcase == name.downcase)
155
268
  end
156
269
 
157
270
  filter '#' do |id,i|
158
- get_attribute('id').to_s == id
271
+ self.elem? and get_attribute('id').to_s == id
159
272
  end
160
273
 
161
274
  filter '.' do |name,i|
162
- classes.include? name
275
+ self.elem? and classes.include? name
163
276
  end
164
277
 
165
278
  filter :lt do |num,i|
166
- parent.containers.index(self) < num.to_i
279
+ self.position < num.to_i
167
280
  end
168
281
 
169
282
  filter :gt do |num,i|
170
- parent.containers.index(self) > num.to_i
283
+ self.position > num.to_i
171
284
  end
172
285
 
173
- nth = proc { |num,i| parent.containers.index(self) == num.to_i }
286
+ nth = proc { |num,i| self.position == num.to_i }
287
+ nth_first = proc { |*a| self.position == 0 }
288
+ nth_last = proc { |*a| self == parent.children_of_type(self.name).last }
174
289
 
175
290
  filter :nth, &nth
176
291
  filter :eq, &nth
292
+ filter ":nth-of-type", &nth
177
293
 
178
- filter :first do |num,i|
179
- parent.containers.index(self) == 0
180
- end
294
+ filter :first, &nth_first
295
+ filter ":first-of-type", &nth_first
181
296
 
182
- filter :last do |i|
183
- self == parent.containers.last
184
- end
297
+ filter :last, &nth_last
298
+ filter ":last-of-type", &nth_last
185
299
 
186
300
  filter :even do |num,i|
187
- parent.containers.index(self) % 2 == 0
301
+ self.position % 2 == 0
188
302
  end
189
303
 
190
304
  filter :odd do |num,i|
191
- parent.containers.index(self) % 2 == 1
305
+ self.position % 2 == 1
192
306
  end
193
307
 
194
308
  filter ':first-child' do |i|
@@ -204,32 +318,19 @@ module Hpricot
204
318
  end
205
319
 
206
320
  filter ":last-child" do |i|
207
- self == parent.containers.first
321
+ self == parent.containers.last
208
322
  end
209
323
 
210
324
  filter ":nth-last-child" do |arg,i|
211
325
  self == parent.containers[-1-arg.to_i]
212
326
  end
213
327
 
214
- filter ":first-of-type" do |i|
215
- self == parent.containers.detect { |x| x.name == arg }
216
- end
217
-
218
- filter ":nth-of-type" do |arg,i|
219
- self == parent.containers.find_all { |x| x.name == arg }[arg.to_i]
220
- end
221
-
222
- filter ":last-of-type" do |i|
223
- self == parent.containers.find_all { |x| x.name == self.name }.last
224
- end
225
-
226
- filter :"nth-last-of-type" do |arg,i|
227
- self == parent.containers.find_all { |x| x.name == arg }[-1-arg.to_i]
328
+ filter ":nth-last-of-type" do |arg,i|
329
+ self == parent.children_of_type(self.name)[-1-arg.to_i]
228
330
  end
229
331
 
230
332
  filter ":only-of-type" do |arg,i|
231
- of_type = parent.containers.find_all { |x| x.name == arg }
232
- of_type.length == 1
333
+ parent.children_of_type(self.name).length == 1
233
334
  end
234
335
 
235
336
  filter ":only-child" do |arg,i|
@@ -237,55 +338,61 @@ module Hpricot
237
338
  end
238
339
 
239
340
  filter :parent do
240
- childNodes.length > 0
341
+ containers.length > 0
241
342
  end
242
343
 
243
344
  filter :empty do
244
- childNodes.length == 0
345
+ containers.length == 0
245
346
  end
246
347
 
247
348
  filter :root do
248
349
  self.is_a? Hpricot::Doc
249
350
  end
250
351
 
251
- filter :contains do |arg,|
252
- html.include? arg
253
- end
254
-
255
- filter '@=' do |attr,val,i|
256
- get_attribute(attr).to_s == val
257
- end
258
-
259
- filter '@!=' do |attr,val,i|
260
- get_attribute(attr).to_s != val
261
- end
262
-
263
- filter '@~=' do |attr,val,i|
264
- get_attribute(attr).to_s.split(/\s+/).include? val
352
+ filter 'text' do
353
+ self.text?
265
354
  end
266
355
 
267
- filter '@|=' do |attr,val,i|
268
- get_attribute(attr).to_s =~ /^#{Regexp::quote val}(-|$)/
356
+ filter 'comment' do
357
+ self.comment?
269
358
  end
270
359
 
271
- filter '@^=' do |attr,val,i|
272
- get_attribute(attr).to_s.index(val) == 0
360
+ filter :contains do |arg,|
361
+ html.include? arg
273
362
  end
274
363
 
275
- filter '@$=' do |attr,val,i|
276
- get_attribute(attr).to_s =~ /#{Regexp::quote val}$/
364
+ pred_procs =
365
+ {'text()' => proc { |ele, *_| ele.inner_text.strip },
366
+ '@' => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? }}
367
+
368
+ oper_procs =
369
+ {'=' => proc { |a,b| a == b },
370
+ '!=' => proc { |a,b| a != b },
371
+ '~=' => proc { |a,b| a.split(/\s+/).include?(b) },
372
+ '|=' => proc { |a,b| a =~ /^#{Regexp::quote b}(-|$)/ },
373
+ '^=' => proc { |a,b| a.index(b) == 0 },
374
+ '$=' => proc { |a,b| a =~ /#{Regexp::quote b}$/ },
375
+ '*=' => proc { |a,b| idx = a.index(b) }}
376
+
377
+ pred_procs.each do |pred_n, pred_f|
378
+ oper_procs.each do |oper_n, oper_f|
379
+ filter "#{pred_n}#{oper_n}" do |*a|
380
+ qual = pred_f[self, *a]
381
+ oper_f[qual, a[-2]] if qual
382
+ end
383
+ end
277
384
  end
278
385
 
279
- filter '@*=' do |attr,val,i|
280
- get_attribute(attr).to_s.index(val) >= 0
386
+ filter 'text()' do |val,i|
387
+ !self.inner_text.strip.empty?
281
388
  end
282
389
 
283
390
  filter '@' do |attr,val,i|
284
- has_attribute? attr
391
+ self.elem? and has_attribute? attr
285
392
  end
286
393
 
287
394
  filter '[' do |val,i|
288
- search(val).length > 0
395
+ self.elem? and search(val).length > 0
289
396
  end
290
397
 
291
398
  end