hpricot 0.4-mswin32 → 0.5-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,14 +8,21 @@
8
8
  */
9
9
  #include <ruby.h>
10
10
 
11
+ #define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
12
+
11
13
  static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
12
14
  sym_cdata, sym_text;
15
+ static VALUE rb_eHpricotParseError;
13
16
  static ID s_read, s_to_str;
14
17
 
15
18
  #define ELE(N) \
16
- if (tokend > tokstart) { \
17
- ele_open = 0; \
18
- rb_yield_tokens(sym_##N, tag, attr, tokstart == 0 ? Qnil : rb_str_new(tokstart, tokend-tokstart), taint); \
19
+ if (tokend > tokstart || text == 1) { \
20
+ VALUE raw_string = Qnil; \
21
+ ele_open = 0; text = 0; \
22
+ if (tokstart != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
23
+ raw_string = rb_str_new(tokstart, tokend-tokstart); \
24
+ } \
25
+ rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
19
26
  }
20
27
 
21
28
  #define SET(N, E) \
@@ -34,6 +41,24 @@ static ID s_read, s_to_str;
34
41
  rb_hash_aset(attr, K, V); \
35
42
  }
36
43
 
44
+ #define TEXT_PASS() \
45
+ if (text == 0) \
46
+ { \
47
+ if (ele_open == 1) { \
48
+ ele_open = 0; \
49
+ if (tokstart > 0) { \
50
+ mark_tag = tokstart; \
51
+ } \
52
+ } else { \
53
+ mark_tag = p; \
54
+ } \
55
+ attr = Qnil; \
56
+ tag = Qnil; \
57
+ text = 1; \
58
+ }
59
+
60
+ #define EBLK(N, T) CAT(tag, p - T + 1); ELE(N);
61
+
37
62
  %%{
38
63
  machine hpricot_scan;
39
64
 
@@ -55,6 +80,10 @@ static ID s_read, s_to_str;
55
80
  action tag { SET(tag, p); }
56
81
  action tagc { SET(tag, p-1); }
57
82
  action aval { SET(aval, p); }
83
+ action aunq {
84
+ if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
85
+ else { SET(aval, p); }
86
+ }
58
87
  action akey { SET(akey, p); }
59
88
  action xmlver { SET(aval, p); ATTR(rb_str_new2("version"), aval); }
60
89
  action xmlenc { SET(aval, p); ATTR(rb_str_new2("encoding"), aval); }
@@ -79,7 +108,7 @@ static ID s_read, s_to_str;
79
108
  #
80
109
  newline = '\n' @{curline += 1;} ;
81
110
  # qtext = '"' ( '\"' | [^\n"] )* '"' | "'" ( "\\'" | [^\n'] )* "'" ;
82
- NameChar = [\-A-Za-z0-9._:] ;
111
+ NameChar = [\-A-Za-z0-9._:?] ;
83
112
  Name = [A-Za-z_:] NameChar* ;
84
113
  StartComment = "<!--" ;
85
114
  EndComment = "-->" ;
@@ -87,14 +116,14 @@ static ID s_read, s_to_str;
87
116
  EndCdata = "]]>" ;
88
117
 
89
118
  NameCap = Name >_tag %tag;
90
- NameAttr = Name >_akey %akey ;
119
+ NameAttr = NameChar+ >_akey %akey ;
91
120
  Q1Attr = [^']* >_aval %aval ;
92
121
  Q2Attr = [^"]* >_aval %aval ;
93
- UnqAttr = [^ \t\n<>"'] >_aval [^ \t\n<>]* %aval ;
122
+ UnqAttr = ( space >_aval | [^ \t\n<>"'] >_aval [^ \t\n<>]* %aunq ) ;
94
123
  Nmtoken = NameChar+ >_akey %akey ;
95
124
 
96
125
  Attr = NameAttr space* "=" space* ('"' Q2Attr '"' | "'" Q1Attr "'" | UnqAttr space+ ) space* ;
97
- AttrEnd = ( NameAttr space* "=" space* UnqAttr | Nmtoken >new_attr %save_attr ) ;
126
+ AttrEnd = ( NameAttr space* "=" space* UnqAttr? | Nmtoken >new_attr %save_attr ) ;
98
127
  AttrSet = ( Attr >new_attr %save_attr | Nmtoken >new_attr space+ %save_attr ) ;
99
128
  StartTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? ">" | "<" NameCap ">";
100
129
  EmptyTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? "/>" | "<" NameCap "/>" ;
@@ -113,14 +142,23 @@ static ID s_read, s_to_str;
113
142
  "'" [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]* >_aval %pubid "'" ;
114
143
  ExternalID = ( "SYSTEM" | "PUBLIC" space+ PubidLiteral ) (space+ SystemLiteral)? ;
115
144
  DocType = "<!DOCTYPE" space+ NameCap (space+ ExternalID)? space* ("[" [^\]]* "]" space*)? ">" ;
116
- StartXmlProcIns = "<?" Name space+ ;
145
+ StartXmlProcIns = "<?" Name >{ TEXT_PASS(); } space+ ;
117
146
  EndXmlProcIns = "?>" ;
118
147
 
119
- html_comment := (any | newline )* >_tag :>> EndComment >tagc @{ ELE(comment); fgoto main; };
148
+ html_comment := |*
149
+ EndComment @{ EBLK(comment, 3); fgoto main; };
150
+ any | newline { TEXT_PASS(); };
151
+ *|;
120
152
 
121
- html_cdata := (any | newline )* >_tag :>> EndCdata >tagc @{ ELE(cdata); fgoto main; };
153
+ html_cdata := |*
154
+ EndCdata @{ EBLK(cdata, 3); fgoto main; };
155
+ any | newline { TEXT_PASS(); };
156
+ *|;
122
157
 
123
- html_procins := (any | newline )* >_tag :>> EndXmlProcIns >tagc @{ ELE(procins); fgoto main; };
158
+ html_procins := |*
159
+ EndXmlProcIns @{ EBLK(procins, 2); fgoto main; };
160
+ any | newline { TEXT_PASS(); };
161
+ *|;
124
162
 
125
163
  main := |*
126
164
  XmlDecl >newEle { ELE(xmldecl); };
@@ -131,23 +169,7 @@ static ID s_read, s_to_str;
131
169
  EmptyTag >newEle { ELE(emptytag); };
132
170
  StartComment >newEle { fgoto html_comment; };
133
171
  StartCdata >newEle { fgoto html_cdata; };
134
-
135
- any | newline {
136
- if (text == 0)
137
- {
138
- if (ele_open == 1) {
139
- ele_open = 0;
140
- if (tokstart > 0) {
141
- mark_tag = tokstart;
142
- }
143
- } else {
144
- mark_tag = p;
145
- }
146
- attr = Qnil;
147
- tag = Qnil;
148
- text = 1;
149
- }
150
- };
172
+ any | newline { TEXT_PASS(); };
151
173
  *|;
152
174
  }%%
153
175
 
@@ -173,13 +195,12 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
173
195
 
174
196
  VALUE hpricot_scan(VALUE self, VALUE port)
175
197
  {
176
- static char buf[BUFSIZE];
177
198
  int cs, act, have = 0, nread = 0, curline = 1, text = 0;
178
- char *tokstart = 0, *tokend = 0;
199
+ char *tokstart = 0, *tokend = 0, *buf = NULL;
179
200
 
180
- VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil;
201
+ VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
181
202
  char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
182
- int done = 0, ele_open = 0;
203
+ int done = 0, ele_open = 0, buffer_size = 0;
183
204
 
184
205
  int taint = OBJ_TAINTED( port );
185
206
  if ( !rb_respond_to( port, s_read ) )
@@ -195,18 +216,27 @@ VALUE hpricot_scan(VALUE self, VALUE port)
195
216
  }
196
217
  }
197
218
 
219
+ buffer_size = BUFSIZE;
220
+ if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
221
+ bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
222
+ if (!NIL_P(bufsize)) {
223
+ buffer_size = NUM2INT(bufsize);
224
+ }
225
+ }
226
+ buf = ALLOC_N(char, buffer_size);
227
+
198
228
  %% write init;
199
229
 
200
230
  while ( !done ) {
201
231
  VALUE str;
202
232
  char *p = buf + have, *pe;
203
- int len, space = BUFSIZE - have;
233
+ int len, space = buffer_size - have;
204
234
 
205
235
  if ( space == 0 ) {
206
236
  /* We've used up the entire buffer storing an already-parsed token
207
- * prefix that must be preserved. */
208
- fprintf(stderr, "OUT OF BUFFER SPACE\n" );
209
- exit(1);
237
+ * prefix that must be preserved. Likely caused by super-long attributes.
238
+ * See ticket #13. */
239
+ rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING(tag)->ptr, curline);
210
240
  }
211
241
 
212
242
  if ( rb_respond_to( port, s_read ) )
@@ -233,8 +263,15 @@ VALUE hpricot_scan(VALUE self, VALUE port)
233
263
  %% write exec;
234
264
 
235
265
  if ( cs == hpricot_scan_error ) {
236
- fprintf(stderr, "PARSE ERROR\n" );
237
- break;
266
+ free(buf);
267
+ if ( !NIL_P(tag) )
268
+ {
269
+ rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING(tag)->ptr, curline);
270
+ }
271
+ else
272
+ {
273
+ rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
274
+ }
238
275
  }
239
276
 
240
277
  if ( done && ele_open )
@@ -279,12 +316,15 @@ VALUE hpricot_scan(VALUE self, VALUE port)
279
316
  tokstart = buf;
280
317
  }
281
318
  }
319
+ free(buf);
282
320
  }
283
321
 
284
322
  void Init_hpricot_scan()
285
323
  {
286
324
  VALUE mHpricot = rb_define_module("Hpricot");
325
+ rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
287
326
  rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1);
327
+ rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eException);
288
328
 
289
329
  s_read = rb_intern("read");
290
330
  s_to_str = rb_intern("to_str");
@@ -1,3 +1,22 @@
1
+ # == About hpricot.rb
2
+ #
3
+ # All of Hpricot's various parts are loaded when you use <tt>require 'hpricot'</tt>.
4
+ #
5
+ # * hpricot_scan: the scanner (a C extension for Ruby) which turns an HTML stream into tokens.
6
+ # * hpricot/parse.rb: uses the scanner to sort through tokens and give you back a complete document object.
7
+ # * hpricot/tag.rb: sets up objects for the various types of elements in an HTML document.
8
+ # * hpricot/modules.rb: categorizes the various elements using mixins.
9
+ # * hpricot/traverse.rb: methods for searching documents.
10
+ # * hpricot/elements.rb: methods for dealing with a group of elements as an Hpricot::Elements list.
11
+ # * hpricot/inspect.rb: methods for displaying documents in a readable form.
12
+
13
+ # If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
14
+ # See http://git.bitwi.se/ruby-character-encodings.git/.
15
+ begin
16
+ require 'encoding/character/utf-8'
17
+ rescue LoadError
18
+ end
19
+
1
20
  require 'hpricot_scan'
2
21
  require 'hpricot/tag'
3
22
  require 'hpricot/modules'
@@ -1,66 +1,163 @@
1
1
  module Hpricot
2
+ # Once you've matched a list of elements, you will often need to handle them as
3
+ # a group. Or you may want to perform the same action on each of them.
4
+ # Hpricot::Elements is an extension of Ruby's array class, with some methods
5
+ # added for altering elements contained in the array.
6
+ #
7
+ # If you need to create an element array from regular elements:
8
+ #
9
+ # Hpricot::Elements[ele1, ele2, ele3]
10
+ #
11
+ # Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem,
12
+ # Hpricot::Doc, etc.)
13
+ #
14
+ # == Continuing Searches
15
+ #
16
+ # Usually the Hpricot::Elements you're working on comes from a search you've
17
+ # done. Well, you can continue searching the list by using the same <tt>at</tt>
18
+ # and <tt>search</tt> methods you can use on plain elements.
19
+ #
20
+ # elements = doc.search("/div/p")
21
+ # elements = elements.search("/a[@href='http://hoodwink.d/']")
22
+ # elements = elements.at("img")
23
+ #
24
+ # == Altering Elements
25
+ #
26
+ # When you're altering elements in the list, your changes will be reflected in
27
+ # the document you started searching from.
28
+ #
29
+ # doc = Hpricot("That's my <b>spoon</b>, Tyler.")
30
+ # doc.at("b").swap("<i>fork</i>")
31
+ # doc.to_html
32
+ # #=> "That's my <i>fork</i>, Tyler."
33
+ #
34
+ # == Getting More Detailed
35
+ #
36
+ # If you can't find a method here that does what you need, you may need to
37
+ # loop through the elements and find a method in Hpricot::Container::Trav
38
+ # which can do what you need.
39
+ #
40
+ # For example, you may want to search for all the H3 header tags in a document
41
+ # and grab all the tags underneath the header, but not inside the header.
42
+ # A good method for this is <tt>next_sibling</tt>:
43
+ #
44
+ # doc.search("h3").each do |h3|
45
+ # while ele = h3.next_sibling
46
+ # ary << ele # stuff away all the elements under the h3
47
+ # end
48
+ # end
49
+ #
50
+ # Most of the useful element methods are in the mixins Hpricot::Traverse
51
+ # and Hpricot::Container::Trav.
2
52
  class Elements < Array
53
+ # Searches this list for any elements (or children of these elements) matching
54
+ # the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
55
+ #
56
+ # See Hpricot::Container::Trav.search for more.
3
57
  def search(*expr,&blk)
4
- map { |x| x.search(*expr,&blk) }.flatten.uniq
58
+ Elements[*map { |x| x.search(*expr,&blk) }.flatten.uniq]
5
59
  end
6
60
  alias_method :/, :search
7
61
 
62
+ # Searches this list for the first element (or child of these elements) matching
63
+ # the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
64
+ #
65
+ # See Hpricot::Container::Trav.at for more.
8
66
  def at(expr, &blk)
9
67
  search(expr, &blk).first
10
68
  end
11
69
  alias_method :%, :at
12
70
 
71
+ # Convert this group of elements into a complete HTML fragment, returned as a
72
+ # string.
13
73
  def to_html
14
74
  map { |x| x.output("") }.join
15
75
  end
16
76
  alias_method :to_s, :to_html
17
77
 
18
- def inner_html(*str)
19
- if str.empty?
78
+ # Returns an HTML fragment built of the contents of each element in this list.
79
+ #
80
+ # If an HTML +string+ is supplied, this method acts like inner_html=.
81
+ def inner_html(*string)
82
+ if string.empty?
20
83
  map { |x| x.inner_html }.join
21
84
  else
22
- x = self.inner_html = str.pop || x
85
+ x = self.inner_html = string.pop || x
23
86
  end
24
87
  end
25
- alias_method :text, :inner_html
26
88
  alias_method :html, :inner_html
27
89
  alias_method :innerHTML, :inner_html
28
90
 
29
- def inner_html=(str)
30
- each { |x| x.inner_html = str }
91
+ # Replaces the contents of each element in this list. Supply an HTML +string+,
92
+ # which is loaded into Hpricot objects and inserted into every element in this
93
+ # list.
94
+ def inner_html=(string)
95
+ each { |x| x.inner_html = string }
31
96
  end
32
97
  alias_method :html=, :inner_html=
33
98
  alias_method :innerHTML=, :inner_html=
34
99
 
35
- def filter(expr)
36
- nodes, = Elements.filter(self, expr)
37
- nodes
100
+ # Returns a string containing the text contents of each element in this list.
101
+ # All HTML tags are removed.
102
+ def inner_text
103
+ map { |x| x.inner_text }.join
38
104
  end
105
+ alias_method :text, :inner_text
39
106
 
107
+ # Remove all elements in this list from the document which contains them.
108
+ #
109
+ # doc = Hpricot("<html>Remove this: <b>here</b></html>")
110
+ # doc.search("b").remove
111
+ # doc.to_html
112
+ # => "<html>Remove this: </html>"
113
+ #
40
114
  def remove
41
115
  each { |x| x.parent.children.delete(x) }
42
116
  end
43
117
 
118
+ # Empty the elements in this list, by removing their insides.
119
+ #
120
+ # doc = Hpricot("<p> We have <i>so much</i> to say.</p>")
121
+ # doc.search("i").empty
122
+ # doc.to_html
123
+ # => "<p> We have <i></i> to say.</p>"
124
+ #
44
125
  def empty
45
126
  each { |x| x.inner_html = nil }
46
127
  end
47
128
 
129
+ # Add to the end of the contents inside each element in this list.
130
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
48
131
  def append(str)
49
132
  each { |x| x.inner_html += str }
50
133
  end
51
134
 
135
+ # Add to the start of the contents inside each element in this list.
136
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
52
137
  def prepend(str)
53
138
  each { |x| x.inner_html = str + x.inner_html }
54
139
  end
55
-
140
+
141
+ # Add some HTML just previous to each element in this list.
142
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
56
143
  def before(str)
57
144
  each { |x| x.parent.insert_before Hpricot.make(str), x }
58
145
  end
59
146
 
147
+ # Just after each element in this list, add some HTML.
148
+ # Pass in an HTML +str+, which is turned into Hpricot elements.
60
149
  def after(str)
61
150
  each { |x| x.parent.insert_after Hpricot.make(str), x }
62
151
  end
63
152
 
153
+ # Wraps each element in the list inside the element created by HTML +str+.
154
+ # If more than one element is found in the string, Hpricot locates the
155
+ # deepest spot inside the first element.
156
+ #
157
+ # doc.search("a[@href]").
158
+ # wrap(%{<div class="link"><div class="link_inner"></div></div>})
159
+ #
160
+ # This code wraps every link on the page inside a +div.link+ and a +div.link_inner+ nest.
64
161
  def wrap(str)
65
162
  each do |x|
66
163
  wrap = Hpricot.make(str)
@@ -74,15 +171,15 @@ module Hpricot
74
171
  end
75
172
  end
76
173
 
77
- def not(expr)
78
- if expr.is_a? Container::Trav
79
- nodes = self - [expr]
80
- else
81
- nodes, = Elements.filter(self, expr, false)
82
- end
83
- nodes
84
- end
85
-
174
+ # Sets an attribute for all elements in this list. You may use
175
+ # a simple pair (<em>attribute name</em>, <em>attribute value</em>):
176
+ #
177
+ # doc.search('p').set(:class, 'outline')
178
+ #
179
+ # Or, use a hash of pairs:
180
+ #
181
+ # doc.search('div#sidebar').set(:class => 'outline', :id => 'topbar')
182
+ #
86
183
  def set(k, v = nil)
87
184
  case k
88
185
  when Hash
@@ -96,9 +193,9 @@ module Hpricot
96
193
  end
97
194
  end
98
195
 
99
- ATTR_RE = %r!\[ *(@)([a-zA-Z0-9\(\)_-]+) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i
196
+ ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i
100
197
  BRACK_RE = %r!(\[) *([^\]]*) *\]!i
101
- FUNC_RE = %r!(:)([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)'\"]*)['\"]? *\)!
198
+ FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
102
199
  CATCH_RE = %r!([:\.#]*)([a-zA-Z0-9\*_-]+)!
103
200
 
104
201
  def self.filter(nodes, expr, truth = true)
@@ -112,16 +209,20 @@ module Hpricot
112
209
  m[0] = "@#{m.slice!(2,1)}"
113
210
  end
114
211
 
212
+ if m[0] == '[' && m[1] =~ /^\d+$/
213
+ m = [":", "nth", m[1].to_i-1]
214
+ end
215
+
115
216
  if m[0] == ":" && m[1] == "not"
116
217
  nodes, = Elements.filter(nodes, m[2], false)
117
218
  else
118
- meth = "filter[#{m[0]}]"
119
- if Container::Trav.method_defined? meth
120
- args = m[1..-1]
219
+ meth = "filter[#{m[0]}#{m[1]}]"
220
+ if Traverse.method_defined? meth
221
+ args = m[2..-1]
121
222
  else
122
- meth = "filter[#{m[0]}#{m[1]}]"
123
- if Container::Trav.method_defined? meth
124
- args = m[2..-1]
223
+ meth = "filter[#{m[0]}]"
224
+ if Traverse.method_defined? meth
225
+ args = m[1..-1]
125
226
  end
126
227
  end
127
228
  i = -1
@@ -134,7 +235,19 @@ module Hpricot
134
235
  [nodes, expr]
135
236
  end
136
237
 
137
- def inspect; "#<#{self.class}#{super}>" end
238
+ def filter(expr)
239
+ nodes, = Elements.filter(self, expr)
240
+ nodes
241
+ end
242
+
243
+ def not(expr)
244
+ if expr.is_a? Traverse
245
+ nodes = self - [expr]
246
+ else
247
+ nodes, = Elements.filter(self, expr, false)
248
+ end
249
+ nodes
250
+ end
138
251
 
139
252
  private
140
253
  def copy_node(node, l)
@@ -145,50 +258,51 @@ module Hpricot
145
258
 
146
259
  end
147
260
 
148
- module Container::Trav
261
+ module Traverse
149
262
  def self.filter(tok, &blk)
150
263
  define_method("filter[#{tok.is_a?(String) ? tok : tok.inspect}]", &blk)
151
264
  end
152
265
 
153
266
  filter '' do |name,i|
154
- name == '*' || self.name.downcase == name.downcase
267
+ name == '*' || (self.respond_to?(:name) && self.name.downcase == name.downcase)
155
268
  end
156
269
 
157
270
  filter '#' do |id,i|
158
- get_attribute('id').to_s == id
271
+ self.elem? and get_attribute('id').to_s == id
159
272
  end
160
273
 
161
274
  filter '.' do |name,i|
162
- classes.include? name
275
+ self.elem? and classes.include? name
163
276
  end
164
277
 
165
278
  filter :lt do |num,i|
166
- parent.containers.index(self) < num.to_i
279
+ self.position < num.to_i
167
280
  end
168
281
 
169
282
  filter :gt do |num,i|
170
- parent.containers.index(self) > num.to_i
283
+ self.position > num.to_i
171
284
  end
172
285
 
173
- nth = proc { |num,i| parent.containers.index(self) == num.to_i }
286
+ nth = proc { |num,i| self.position == num.to_i }
287
+ nth_first = proc { |*a| self.position == 0 }
288
+ nth_last = proc { |*a| self == parent.children_of_type(self.name).last }
174
289
 
175
290
  filter :nth, &nth
176
291
  filter :eq, &nth
292
+ filter ":nth-of-type", &nth
177
293
 
178
- filter :first do |num,i|
179
- parent.containers.index(self) == 0
180
- end
294
+ filter :first, &nth_first
295
+ filter ":first-of-type", &nth_first
181
296
 
182
- filter :last do |i|
183
- self == parent.containers.last
184
- end
297
+ filter :last, &nth_last
298
+ filter ":last-of-type", &nth_last
185
299
 
186
300
  filter :even do |num,i|
187
- parent.containers.index(self) % 2 == 0
301
+ self.position % 2 == 0
188
302
  end
189
303
 
190
304
  filter :odd do |num,i|
191
- parent.containers.index(self) % 2 == 1
305
+ self.position % 2 == 1
192
306
  end
193
307
 
194
308
  filter ':first-child' do |i|
@@ -204,32 +318,19 @@ module Hpricot
204
318
  end
205
319
 
206
320
  filter ":last-child" do |i|
207
- self == parent.containers.first
321
+ self == parent.containers.last
208
322
  end
209
323
 
210
324
  filter ":nth-last-child" do |arg,i|
211
325
  self == parent.containers[-1-arg.to_i]
212
326
  end
213
327
 
214
- filter ":first-of-type" do |i|
215
- self == parent.containers.detect { |x| x.name == arg }
216
- end
217
-
218
- filter ":nth-of-type" do |arg,i|
219
- self == parent.containers.find_all { |x| x.name == arg }[arg.to_i]
220
- end
221
-
222
- filter ":last-of-type" do |i|
223
- self == parent.containers.find_all { |x| x.name == self.name }.last
224
- end
225
-
226
- filter :"nth-last-of-type" do |arg,i|
227
- self == parent.containers.find_all { |x| x.name == arg }[-1-arg.to_i]
328
+ filter ":nth-last-of-type" do |arg,i|
329
+ self == parent.children_of_type(self.name)[-1-arg.to_i]
228
330
  end
229
331
 
230
332
  filter ":only-of-type" do |arg,i|
231
- of_type = parent.containers.find_all { |x| x.name == arg }
232
- of_type.length == 1
333
+ parent.children_of_type(self.name).length == 1
233
334
  end
234
335
 
235
336
  filter ":only-child" do |arg,i|
@@ -237,55 +338,61 @@ module Hpricot
237
338
  end
238
339
 
239
340
  filter :parent do
240
- childNodes.length > 0
341
+ containers.length > 0
241
342
  end
242
343
 
243
344
  filter :empty do
244
- childNodes.length == 0
345
+ containers.length == 0
245
346
  end
246
347
 
247
348
  filter :root do
248
349
  self.is_a? Hpricot::Doc
249
350
  end
250
351
 
251
- filter :contains do |arg,|
252
- html.include? arg
253
- end
254
-
255
- filter '@=' do |attr,val,i|
256
- get_attribute(attr).to_s == val
257
- end
258
-
259
- filter '@!=' do |attr,val,i|
260
- get_attribute(attr).to_s != val
261
- end
262
-
263
- filter '@~=' do |attr,val,i|
264
- get_attribute(attr).to_s.split(/\s+/).include? val
352
+ filter 'text' do
353
+ self.text?
265
354
  end
266
355
 
267
- filter '@|=' do |attr,val,i|
268
- get_attribute(attr).to_s =~ /^#{Regexp::quote val}(-|$)/
356
+ filter 'comment' do
357
+ self.comment?
269
358
  end
270
359
 
271
- filter '@^=' do |attr,val,i|
272
- get_attribute(attr).to_s.index(val) == 0
360
+ filter :contains do |arg,|
361
+ html.include? arg
273
362
  end
274
363
 
275
- filter '@$=' do |attr,val,i|
276
- get_attribute(attr).to_s =~ /#{Regexp::quote val}$/
364
+ pred_procs =
365
+ {'text()' => proc { |ele, *_| ele.inner_text.strip },
366
+ '@' => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? }}
367
+
368
+ oper_procs =
369
+ {'=' => proc { |a,b| a == b },
370
+ '!=' => proc { |a,b| a != b },
371
+ '~=' => proc { |a,b| a.split(/\s+/).include?(b) },
372
+ '|=' => proc { |a,b| a =~ /^#{Regexp::quote b}(-|$)/ },
373
+ '^=' => proc { |a,b| a.index(b) == 0 },
374
+ '$=' => proc { |a,b| a =~ /#{Regexp::quote b}$/ },
375
+ '*=' => proc { |a,b| idx = a.index(b) }}
376
+
377
+ pred_procs.each do |pred_n, pred_f|
378
+ oper_procs.each do |oper_n, oper_f|
379
+ filter "#{pred_n}#{oper_n}" do |*a|
380
+ qual = pred_f[self, *a]
381
+ oper_f[qual, a[-2]] if qual
382
+ end
383
+ end
277
384
  end
278
385
 
279
- filter '@*=' do |attr,val,i|
280
- get_attribute(attr).to_s.index(val) >= 0
386
+ filter 'text()' do |val,i|
387
+ !self.inner_text.strip.empty?
281
388
  end
282
389
 
283
390
  filter '@' do |attr,val,i|
284
- has_attribute? attr
391
+ self.elem? and has_attribute? attr
285
392
  end
286
393
 
287
394
  filter '[' do |val,i|
288
- search(val).length > 0
395
+ self.elem? and search(val).length > 0
289
396
  end
290
397
 
291
398
  end