hpricot 0.4-mswin32 → 0.5-mswin32
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +16 -0
- data/README +279 -4
- data/Rakefile +12 -3
- data/ext/hpricot_scan/hpricot_scan.c +3106 -3348
- data/ext/hpricot_scan/hpricot_scan.rl +78 -38
- data/lib/hpricot.rb +19 -0
- data/lib/hpricot/elements.rb +194 -87
- data/lib/hpricot/inspect.rb +13 -0
- data/lib/hpricot/parse.rb +83 -99
- data/lib/hpricot/tag.rb +114 -40
- data/lib/hpricot/traverse.rb +311 -61
- data/lib/hpricot_scan.so +0 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/test_parser.rb +160 -10
- data/test/test_paths.rb +16 -0
- data/test/test_preserved.rb +46 -0
- data/test/test_xml.rb +15 -0
- metadata +41 -35
@@ -8,14 +8,21 @@
|
|
8
8
|
*/
|
9
9
|
#include <ruby.h>
|
10
10
|
|
11
|
+
#define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
|
12
|
+
|
11
13
|
static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
|
12
14
|
sym_cdata, sym_text;
|
15
|
+
static VALUE rb_eHpricotParseError;
|
13
16
|
static ID s_read, s_to_str;
|
14
17
|
|
15
18
|
#define ELE(N) \
|
16
|
-
if (tokend > tokstart) { \
|
17
|
-
|
18
|
-
|
19
|
+
if (tokend > tokstart || text == 1) { \
|
20
|
+
VALUE raw_string = Qnil; \
|
21
|
+
ele_open = 0; text = 0; \
|
22
|
+
if (tokstart != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
|
23
|
+
raw_string = rb_str_new(tokstart, tokend-tokstart); \
|
24
|
+
} \
|
25
|
+
rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
|
19
26
|
}
|
20
27
|
|
21
28
|
#define SET(N, E) \
|
@@ -34,6 +41,24 @@ static ID s_read, s_to_str;
|
|
34
41
|
rb_hash_aset(attr, K, V); \
|
35
42
|
}
|
36
43
|
|
44
|
+
#define TEXT_PASS() \
|
45
|
+
if (text == 0) \
|
46
|
+
{ \
|
47
|
+
if (ele_open == 1) { \
|
48
|
+
ele_open = 0; \
|
49
|
+
if (tokstart > 0) { \
|
50
|
+
mark_tag = tokstart; \
|
51
|
+
} \
|
52
|
+
} else { \
|
53
|
+
mark_tag = p; \
|
54
|
+
} \
|
55
|
+
attr = Qnil; \
|
56
|
+
tag = Qnil; \
|
57
|
+
text = 1; \
|
58
|
+
}
|
59
|
+
|
60
|
+
#define EBLK(N, T) CAT(tag, p - T + 1); ELE(N);
|
61
|
+
|
37
62
|
%%{
|
38
63
|
machine hpricot_scan;
|
39
64
|
|
@@ -55,6 +80,10 @@ static ID s_read, s_to_str;
|
|
55
80
|
action tag { SET(tag, p); }
|
56
81
|
action tagc { SET(tag, p-1); }
|
57
82
|
action aval { SET(aval, p); }
|
83
|
+
action aunq {
|
84
|
+
if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
|
85
|
+
else { SET(aval, p); }
|
86
|
+
}
|
58
87
|
action akey { SET(akey, p); }
|
59
88
|
action xmlver { SET(aval, p); ATTR(rb_str_new2("version"), aval); }
|
60
89
|
action xmlenc { SET(aval, p); ATTR(rb_str_new2("encoding"), aval); }
|
@@ -79,7 +108,7 @@ static ID s_read, s_to_str;
|
|
79
108
|
#
|
80
109
|
newline = '\n' @{curline += 1;} ;
|
81
110
|
# qtext = '"' ( '\"' | [^\n"] )* '"' | "'" ( "\\'" | [^\n'] )* "'" ;
|
82
|
-
NameChar = [\-A-Za-z0-9._
|
111
|
+
NameChar = [\-A-Za-z0-9._:?] ;
|
83
112
|
Name = [A-Za-z_:] NameChar* ;
|
84
113
|
StartComment = "<!--" ;
|
85
114
|
EndComment = "-->" ;
|
@@ -87,14 +116,14 @@ static ID s_read, s_to_str;
|
|
87
116
|
EndCdata = "]]>" ;
|
88
117
|
|
89
118
|
NameCap = Name >_tag %tag;
|
90
|
-
NameAttr =
|
119
|
+
NameAttr = NameChar+ >_akey %akey ;
|
91
120
|
Q1Attr = [^']* >_aval %aval ;
|
92
121
|
Q2Attr = [^"]* >_aval %aval ;
|
93
|
-
UnqAttr = [^ \t\n<>"'] >_aval [^ \t\n<>]* %
|
122
|
+
UnqAttr = ( space >_aval | [^ \t\n<>"'] >_aval [^ \t\n<>]* %aunq ) ;
|
94
123
|
Nmtoken = NameChar+ >_akey %akey ;
|
95
124
|
|
96
125
|
Attr = NameAttr space* "=" space* ('"' Q2Attr '"' | "'" Q1Attr "'" | UnqAttr space+ ) space* ;
|
97
|
-
AttrEnd = ( NameAttr space* "=" space* UnqAttr | Nmtoken >new_attr %save_attr ) ;
|
126
|
+
AttrEnd = ( NameAttr space* "=" space* UnqAttr? | Nmtoken >new_attr %save_attr ) ;
|
98
127
|
AttrSet = ( Attr >new_attr %save_attr | Nmtoken >new_attr space+ %save_attr ) ;
|
99
128
|
StartTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? ">" | "<" NameCap ">";
|
100
129
|
EmptyTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? "/>" | "<" NameCap "/>" ;
|
@@ -113,14 +142,23 @@ static ID s_read, s_to_str;
|
|
113
142
|
"'" [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]* >_aval %pubid "'" ;
|
114
143
|
ExternalID = ( "SYSTEM" | "PUBLIC" space+ PubidLiteral ) (space+ SystemLiteral)? ;
|
115
144
|
DocType = "<!DOCTYPE" space+ NameCap (space+ ExternalID)? space* ("[" [^\]]* "]" space*)? ">" ;
|
116
|
-
StartXmlProcIns = "<?" Name space+ ;
|
145
|
+
StartXmlProcIns = "<?" Name >{ TEXT_PASS(); } space+ ;
|
117
146
|
EndXmlProcIns = "?>" ;
|
118
147
|
|
119
|
-
html_comment :=
|
148
|
+
html_comment := |*
|
149
|
+
EndComment @{ EBLK(comment, 3); fgoto main; };
|
150
|
+
any | newline { TEXT_PASS(); };
|
151
|
+
*|;
|
120
152
|
|
121
|
-
html_cdata :=
|
153
|
+
html_cdata := |*
|
154
|
+
EndCdata @{ EBLK(cdata, 3); fgoto main; };
|
155
|
+
any | newline { TEXT_PASS(); };
|
156
|
+
*|;
|
122
157
|
|
123
|
-
html_procins :=
|
158
|
+
html_procins := |*
|
159
|
+
EndXmlProcIns @{ EBLK(procins, 2); fgoto main; };
|
160
|
+
any | newline { TEXT_PASS(); };
|
161
|
+
*|;
|
124
162
|
|
125
163
|
main := |*
|
126
164
|
XmlDecl >newEle { ELE(xmldecl); };
|
@@ -131,23 +169,7 @@ static ID s_read, s_to_str;
|
|
131
169
|
EmptyTag >newEle { ELE(emptytag); };
|
132
170
|
StartComment >newEle { fgoto html_comment; };
|
133
171
|
StartCdata >newEle { fgoto html_cdata; };
|
134
|
-
|
135
|
-
any | newline {
|
136
|
-
if (text == 0)
|
137
|
-
{
|
138
|
-
if (ele_open == 1) {
|
139
|
-
ele_open = 0;
|
140
|
-
if (tokstart > 0) {
|
141
|
-
mark_tag = tokstart;
|
142
|
-
}
|
143
|
-
} else {
|
144
|
-
mark_tag = p;
|
145
|
-
}
|
146
|
-
attr = Qnil;
|
147
|
-
tag = Qnil;
|
148
|
-
text = 1;
|
149
|
-
}
|
150
|
-
};
|
172
|
+
any | newline { TEXT_PASS(); };
|
151
173
|
*|;
|
152
174
|
}%%
|
153
175
|
|
@@ -173,13 +195,12 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
|
|
173
195
|
|
174
196
|
VALUE hpricot_scan(VALUE self, VALUE port)
|
175
197
|
{
|
176
|
-
static char buf[BUFSIZE];
|
177
198
|
int cs, act, have = 0, nread = 0, curline = 1, text = 0;
|
178
|
-
char *tokstart = 0, *tokend = 0;
|
199
|
+
char *tokstart = 0, *tokend = 0, *buf = NULL;
|
179
200
|
|
180
|
-
VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil;
|
201
|
+
VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
|
181
202
|
char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
|
182
|
-
int done = 0, ele_open = 0;
|
203
|
+
int done = 0, ele_open = 0, buffer_size = 0;
|
183
204
|
|
184
205
|
int taint = OBJ_TAINTED( port );
|
185
206
|
if ( !rb_respond_to( port, s_read ) )
|
@@ -195,18 +216,27 @@ VALUE hpricot_scan(VALUE self, VALUE port)
|
|
195
216
|
}
|
196
217
|
}
|
197
218
|
|
219
|
+
buffer_size = BUFSIZE;
|
220
|
+
if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
|
221
|
+
bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
|
222
|
+
if (!NIL_P(bufsize)) {
|
223
|
+
buffer_size = NUM2INT(bufsize);
|
224
|
+
}
|
225
|
+
}
|
226
|
+
buf = ALLOC_N(char, buffer_size);
|
227
|
+
|
198
228
|
%% write init;
|
199
229
|
|
200
230
|
while ( !done ) {
|
201
231
|
VALUE str;
|
202
232
|
char *p = buf + have, *pe;
|
203
|
-
int len, space =
|
233
|
+
int len, space = buffer_size - have;
|
204
234
|
|
205
235
|
if ( space == 0 ) {
|
206
236
|
/* We've used up the entire buffer storing an already-parsed token
|
207
|
-
* prefix that must be preserved.
|
208
|
-
|
209
|
-
|
237
|
+
* prefix that must be preserved. Likely caused by super-long attributes.
|
238
|
+
* See ticket #13. */
|
239
|
+
rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING(tag)->ptr, curline);
|
210
240
|
}
|
211
241
|
|
212
242
|
if ( rb_respond_to( port, s_read ) )
|
@@ -233,8 +263,15 @@ VALUE hpricot_scan(VALUE self, VALUE port)
|
|
233
263
|
%% write exec;
|
234
264
|
|
235
265
|
if ( cs == hpricot_scan_error ) {
|
236
|
-
|
237
|
-
|
266
|
+
free(buf);
|
267
|
+
if ( !NIL_P(tag) )
|
268
|
+
{
|
269
|
+
rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING(tag)->ptr, curline);
|
270
|
+
}
|
271
|
+
else
|
272
|
+
{
|
273
|
+
rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
|
274
|
+
}
|
238
275
|
}
|
239
276
|
|
240
277
|
if ( done && ele_open )
|
@@ -279,12 +316,15 @@ VALUE hpricot_scan(VALUE self, VALUE port)
|
|
279
316
|
tokstart = buf;
|
280
317
|
}
|
281
318
|
}
|
319
|
+
free(buf);
|
282
320
|
}
|
283
321
|
|
284
322
|
void Init_hpricot_scan()
|
285
323
|
{
|
286
324
|
VALUE mHpricot = rb_define_module("Hpricot");
|
325
|
+
rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
|
287
326
|
rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1);
|
327
|
+
rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eException);
|
288
328
|
|
289
329
|
s_read = rb_intern("read");
|
290
330
|
s_to_str = rb_intern("to_str");
|
data/lib/hpricot.rb
CHANGED
@@ -1,3 +1,22 @@
|
|
1
|
+
# == About hpricot.rb
|
2
|
+
#
|
3
|
+
# All of Hpricot's various parts are loaded when you use <tt>require 'hpricot'</tt>.
|
4
|
+
#
|
5
|
+
# * hpricot_scan: the scanner (a C extension for Ruby) which turns an HTML stream into tokens.
|
6
|
+
# * hpricot/parse.rb: uses the scanner to sort through tokens and give you back a complete document object.
|
7
|
+
# * hpricot/tag.rb: sets up objects for the various types of elements in an HTML document.
|
8
|
+
# * hpricot/modules.rb: categorizes the various elements using mixins.
|
9
|
+
# * hpricot/traverse.rb: methods for searching documents.
|
10
|
+
# * hpricot/elements.rb: methods for dealing with a group of elements as an Hpricot::Elements list.
|
11
|
+
# * hpricot/inspect.rb: methods for displaying documents in a readable form.
|
12
|
+
|
13
|
+
# If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
|
14
|
+
# See http://git.bitwi.se/ruby-character-encodings.git/.
|
15
|
+
begin
|
16
|
+
require 'encoding/character/utf-8'
|
17
|
+
rescue LoadError
|
18
|
+
end
|
19
|
+
|
1
20
|
require 'hpricot_scan'
|
2
21
|
require 'hpricot/tag'
|
3
22
|
require 'hpricot/modules'
|
data/lib/hpricot/elements.rb
CHANGED
@@ -1,66 +1,163 @@
|
|
1
1
|
module Hpricot
|
2
|
+
# Once you've matched a list of elements, you will often need to handle them as
|
3
|
+
# a group. Or you may want to perform the same action on each of them.
|
4
|
+
# Hpricot::Elements is an extension of Ruby's array class, with some methods
|
5
|
+
# added for altering elements contained in the array.
|
6
|
+
#
|
7
|
+
# If you need to create an element array from regular elements:
|
8
|
+
#
|
9
|
+
# Hpricot::Elements[ele1, ele2, ele3]
|
10
|
+
#
|
11
|
+
# Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem,
|
12
|
+
# Hpricot::Doc, etc.)
|
13
|
+
#
|
14
|
+
# == Continuing Searches
|
15
|
+
#
|
16
|
+
# Usually the Hpricot::Elements you're working on comes from a search you've
|
17
|
+
# done. Well, you can continue searching the list by using the same <tt>at</tt>
|
18
|
+
# and <tt>search</tt> methods you can use on plain elements.
|
19
|
+
#
|
20
|
+
# elements = doc.search("/div/p")
|
21
|
+
# elements = elements.search("/a[@href='http://hoodwink.d/']")
|
22
|
+
# elements = elements.at("img")
|
23
|
+
#
|
24
|
+
# == Altering Elements
|
25
|
+
#
|
26
|
+
# When you're altering elements in the list, your changes will be reflected in
|
27
|
+
# the document you started searching from.
|
28
|
+
#
|
29
|
+
# doc = Hpricot("That's my <b>spoon</b>, Tyler.")
|
30
|
+
# doc.at("b").swap("<i>fork</i>")
|
31
|
+
# doc.to_html
|
32
|
+
# #=> "That's my <i>fork</i>, Tyler."
|
33
|
+
#
|
34
|
+
# == Getting More Detailed
|
35
|
+
#
|
36
|
+
# If you can't find a method here that does what you need, you may need to
|
37
|
+
# loop through the elements and find a method in Hpricot::Container::Trav
|
38
|
+
# which can do what you need.
|
39
|
+
#
|
40
|
+
# For example, you may want to search for all the H3 header tags in a document
|
41
|
+
# and grab all the tags underneath the header, but not inside the header.
|
42
|
+
# A good method for this is <tt>next_sibling</tt>:
|
43
|
+
#
|
44
|
+
# doc.search("h3").each do |h3|
|
45
|
+
# while ele = h3.next_sibling
|
46
|
+
# ary << ele # stuff away all the elements under the h3
|
47
|
+
# end
|
48
|
+
# end
|
49
|
+
#
|
50
|
+
# Most of the useful element methods are in the mixins Hpricot::Traverse
|
51
|
+
# and Hpricot::Container::Trav.
|
2
52
|
class Elements < Array
|
53
|
+
# Searches this list for any elements (or children of these elements) matching
|
54
|
+
# the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
|
55
|
+
#
|
56
|
+
# See Hpricot::Container::Trav.search for more.
|
3
57
|
def search(*expr,&blk)
|
4
|
-
map { |x| x.search(*expr,&blk) }.flatten.uniq
|
58
|
+
Elements[*map { |x| x.search(*expr,&blk) }.flatten.uniq]
|
5
59
|
end
|
6
60
|
alias_method :/, :search
|
7
61
|
|
62
|
+
# Searches this list for the first element (or child of these elements) matching
|
63
|
+
# the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
|
64
|
+
#
|
65
|
+
# See Hpricot::Container::Trav.at for more.
|
8
66
|
def at(expr, &blk)
|
9
67
|
search(expr, &blk).first
|
10
68
|
end
|
11
69
|
alias_method :%, :at
|
12
70
|
|
71
|
+
# Convert this group of elements into a complete HTML fragment, returned as a
|
72
|
+
# string.
|
13
73
|
def to_html
|
14
74
|
map { |x| x.output("") }.join
|
15
75
|
end
|
16
76
|
alias_method :to_s, :to_html
|
17
77
|
|
18
|
-
|
19
|
-
|
78
|
+
# Returns an HTML fragment built of the contents of each element in this list.
|
79
|
+
#
|
80
|
+
# If an HTML +string+ is supplied, this method acts like inner_html=.
|
81
|
+
def inner_html(*string)
|
82
|
+
if string.empty?
|
20
83
|
map { |x| x.inner_html }.join
|
21
84
|
else
|
22
|
-
x = self.inner_html =
|
85
|
+
x = self.inner_html = string.pop || x
|
23
86
|
end
|
24
87
|
end
|
25
|
-
alias_method :text, :inner_html
|
26
88
|
alias_method :html, :inner_html
|
27
89
|
alias_method :innerHTML, :inner_html
|
28
90
|
|
29
|
-
|
30
|
-
|
91
|
+
# Replaces the contents of each element in this list. Supply an HTML +string+,
|
92
|
+
# which is loaded into Hpricot objects and inserted into every element in this
|
93
|
+
# list.
|
94
|
+
def inner_html=(string)
|
95
|
+
each { |x| x.inner_html = string }
|
31
96
|
end
|
32
97
|
alias_method :html=, :inner_html=
|
33
98
|
alias_method :innerHTML=, :inner_html=
|
34
99
|
|
35
|
-
|
36
|
-
|
37
|
-
|
100
|
+
# Returns a string containing the text contents of each element in this list.
|
101
|
+
# All HTML tags are removed.
|
102
|
+
def inner_text
|
103
|
+
map { |x| x.inner_text }.join
|
38
104
|
end
|
105
|
+
alias_method :text, :inner_text
|
39
106
|
|
107
|
+
# Remove all elements in this list from the document which contains them.
|
108
|
+
#
|
109
|
+
# doc = Hpricot("<html>Remove this: <b>here</b></html>")
|
110
|
+
# doc.search("b").remove
|
111
|
+
# doc.to_html
|
112
|
+
# => "<html>Remove this: </html>"
|
113
|
+
#
|
40
114
|
def remove
|
41
115
|
each { |x| x.parent.children.delete(x) }
|
42
116
|
end
|
43
117
|
|
118
|
+
# Empty the elements in this list, by removing their insides.
|
119
|
+
#
|
120
|
+
# doc = Hpricot("<p> We have <i>so much</i> to say.</p>")
|
121
|
+
# doc.search("i").empty
|
122
|
+
# doc.to_html
|
123
|
+
# => "<p> We have <i></i> to say.</p>"
|
124
|
+
#
|
44
125
|
def empty
|
45
126
|
each { |x| x.inner_html = nil }
|
46
127
|
end
|
47
128
|
|
129
|
+
# Add to the end of the contents inside each element in this list.
|
130
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
48
131
|
def append(str)
|
49
132
|
each { |x| x.inner_html += str }
|
50
133
|
end
|
51
134
|
|
135
|
+
# Add to the start of the contents inside each element in this list.
|
136
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
52
137
|
def prepend(str)
|
53
138
|
each { |x| x.inner_html = str + x.inner_html }
|
54
139
|
end
|
55
|
-
|
140
|
+
|
141
|
+
# Add some HTML just before each element in this list.
|
142
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
56
143
|
def before(str)
|
57
144
|
each { |x| x.parent.insert_before Hpricot.make(str), x }
|
58
145
|
end
|
59
146
|
|
147
|
+
# Just after each element in this list, add some HTML.
|
148
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
60
149
|
def after(str)
|
61
150
|
each { |x| x.parent.insert_after Hpricot.make(str), x }
|
62
151
|
end
|
63
152
|
|
153
|
+
# Wraps each element in the list inside the element created by HTML +str+.
|
154
|
+
# If more than one element is found in the string, Hpricot locates the
|
155
|
+
# deepest spot inside the first element.
|
156
|
+
#
|
157
|
+
# doc.search("a[@href]").
|
158
|
+
# wrap(%{<div class="link"><div class="link_inner"></div></div>})
|
159
|
+
#
|
160
|
+
# This code wraps every link on the page inside a +div.link+ and a +div.link_inner+ nest.
|
64
161
|
def wrap(str)
|
65
162
|
each do |x|
|
66
163
|
wrap = Hpricot.make(str)
|
@@ -74,15 +171,15 @@ module Hpricot
|
|
74
171
|
end
|
75
172
|
end
|
76
173
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
174
|
+
# Sets an attribute for all elements in this list. You may use
|
175
|
+
# a simple pair (<em>attribute name</em>, <em>attribute value</em>):
|
176
|
+
#
|
177
|
+
# doc.search('p').set(:class, 'outline')
|
178
|
+
#
|
179
|
+
# Or, use a hash of pairs:
|
180
|
+
#
|
181
|
+
# doc.search('div#sidebar').set(:class => 'outline', :id => 'topbar')
|
182
|
+
#
|
86
183
|
def set(k, v = nil)
|
87
184
|
case k
|
88
185
|
when Hash
|
@@ -96,9 +193,9 @@ module Hpricot
|
|
96
193
|
end
|
97
194
|
end
|
98
195
|
|
99
|
-
ATTR_RE = %r!\[ *(@)([
|
196
|
+
ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i
|
100
197
|
BRACK_RE = %r!(\[) *([^\]]*) *\]!i
|
101
|
-
FUNC_RE = %r!(:)([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)
|
198
|
+
FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
|
102
199
|
CATCH_RE = %r!([:\.#]*)([a-zA-Z0-9\*_-]+)!
|
103
200
|
|
104
201
|
def self.filter(nodes, expr, truth = true)
|
@@ -112,16 +209,20 @@ module Hpricot
|
|
112
209
|
m[0] = "@#{m.slice!(2,1)}"
|
113
210
|
end
|
114
211
|
|
212
|
+
if m[0] == '[' && m[1] =~ /^\d+$/
|
213
|
+
m = [":", "nth", m[1].to_i-1]
|
214
|
+
end
|
215
|
+
|
115
216
|
if m[0] == ":" && m[1] == "not"
|
116
217
|
nodes, = Elements.filter(nodes, m[2], false)
|
117
218
|
else
|
118
|
-
meth = "filter[#{m[0]}]"
|
119
|
-
if
|
120
|
-
args = m[
|
219
|
+
meth = "filter[#{m[0]}#{m[1]}]"
|
220
|
+
if Traverse.method_defined? meth
|
221
|
+
args = m[2..-1]
|
121
222
|
else
|
122
|
-
meth = "filter[#{m[0]}
|
123
|
-
if
|
124
|
-
args = m[
|
223
|
+
meth = "filter[#{m[0]}]"
|
224
|
+
if Traverse.method_defined? meth
|
225
|
+
args = m[1..-1]
|
125
226
|
end
|
126
227
|
end
|
127
228
|
i = -1
|
@@ -134,7 +235,19 @@ module Hpricot
|
|
134
235
|
[nodes, expr]
|
135
236
|
end
|
136
237
|
|
137
|
-
def
|
238
|
+
def filter(expr)
|
239
|
+
nodes, = Elements.filter(self, expr)
|
240
|
+
nodes
|
241
|
+
end
|
242
|
+
|
243
|
+
def not(expr)
|
244
|
+
if expr.is_a? Traverse
|
245
|
+
nodes = self - [expr]
|
246
|
+
else
|
247
|
+
nodes, = Elements.filter(self, expr, false)
|
248
|
+
end
|
249
|
+
nodes
|
250
|
+
end
|
138
251
|
|
139
252
|
private
|
140
253
|
def copy_node(node, l)
|
@@ -145,50 +258,51 @@ module Hpricot
|
|
145
258
|
|
146
259
|
end
|
147
260
|
|
148
|
-
module
|
261
|
+
module Traverse
|
149
262
|
def self.filter(tok, &blk)
|
150
263
|
define_method("filter[#{tok.is_a?(String) ? tok : tok.inspect}]", &blk)
|
151
264
|
end
|
152
265
|
|
153
266
|
filter '' do |name,i|
|
154
|
-
name == '*' || self.name.downcase == name.downcase
|
267
|
+
name == '*' || (self.respond_to?(:name) && self.name.downcase == name.downcase)
|
155
268
|
end
|
156
269
|
|
157
270
|
filter '#' do |id,i|
|
158
|
-
get_attribute('id').to_s == id
|
271
|
+
self.elem? and get_attribute('id').to_s == id
|
159
272
|
end
|
160
273
|
|
161
274
|
filter '.' do |name,i|
|
162
|
-
classes.include? name
|
275
|
+
self.elem? and classes.include? name
|
163
276
|
end
|
164
277
|
|
165
278
|
filter :lt do |num,i|
|
166
|
-
|
279
|
+
self.position < num.to_i
|
167
280
|
end
|
168
281
|
|
169
282
|
filter :gt do |num,i|
|
170
|
-
|
283
|
+
self.position > num.to_i
|
171
284
|
end
|
172
285
|
|
173
|
-
nth = proc { |num,i|
|
286
|
+
nth = proc { |num,i| self.position == num.to_i }
|
287
|
+
nth_first = proc { |*a| self.position == 0 }
|
288
|
+
nth_last = proc { |*a| self == parent.children_of_type(self.name).last }
|
174
289
|
|
175
290
|
filter :nth, &nth
|
176
291
|
filter :eq, &nth
|
292
|
+
filter ":nth-of-type", &nth
|
177
293
|
|
178
|
-
filter :first
|
179
|
-
|
180
|
-
end
|
294
|
+
filter :first, &nth_first
|
295
|
+
filter ":first-of-type", &nth_first
|
181
296
|
|
182
|
-
filter :last
|
183
|
-
|
184
|
-
end
|
297
|
+
filter :last, &nth_last
|
298
|
+
filter ":last-of-type", &nth_last
|
185
299
|
|
186
300
|
filter :even do |num,i|
|
187
|
-
|
301
|
+
self.position % 2 == 0
|
188
302
|
end
|
189
303
|
|
190
304
|
filter :odd do |num,i|
|
191
|
-
|
305
|
+
self.position % 2 == 1
|
192
306
|
end
|
193
307
|
|
194
308
|
filter ':first-child' do |i|
|
@@ -204,32 +318,19 @@ module Hpricot
|
|
204
318
|
end
|
205
319
|
|
206
320
|
filter ":last-child" do |i|
|
207
|
-
self == parent.containers.
|
321
|
+
self == parent.containers.last
|
208
322
|
end
|
209
323
|
|
210
324
|
filter ":nth-last-child" do |arg,i|
|
211
325
|
self == parent.containers[-1-arg.to_i]
|
212
326
|
end
|
213
327
|
|
214
|
-
filter ":
|
215
|
-
self == parent.
|
216
|
-
end
|
217
|
-
|
218
|
-
filter ":nth-of-type" do |arg,i|
|
219
|
-
self == parent.containers.find_all { |x| x.name == arg }[arg.to_i]
|
220
|
-
end
|
221
|
-
|
222
|
-
filter ":last-of-type" do |i|
|
223
|
-
self == parent.containers.find_all { |x| x.name == self.name }.last
|
224
|
-
end
|
225
|
-
|
226
|
-
filter :"nth-last-of-type" do |arg,i|
|
227
|
-
self == parent.containers.find_all { |x| x.name == arg }[-1-arg.to_i]
|
328
|
+
filter ":nth-last-of-type" do |arg,i|
|
329
|
+
self == parent.children_of_type(self.name)[-1-arg.to_i]
|
228
330
|
end
|
229
331
|
|
230
332
|
filter ":only-of-type" do |arg,i|
|
231
|
-
|
232
|
-
of_type.length == 1
|
333
|
+
parent.children_of_type(self.name).length == 1
|
233
334
|
end
|
234
335
|
|
235
336
|
filter ":only-child" do |arg,i|
|
@@ -237,55 +338,61 @@ module Hpricot
|
|
237
338
|
end
|
238
339
|
|
239
340
|
filter :parent do
|
240
|
-
|
341
|
+
containers.length > 0
|
241
342
|
end
|
242
343
|
|
243
344
|
filter :empty do
|
244
|
-
|
345
|
+
containers.length == 0
|
245
346
|
end
|
246
347
|
|
247
348
|
filter :root do
|
248
349
|
self.is_a? Hpricot::Doc
|
249
350
|
end
|
250
351
|
|
251
|
-
filter
|
252
|
-
|
253
|
-
end
|
254
|
-
|
255
|
-
filter '@=' do |attr,val,i|
|
256
|
-
get_attribute(attr).to_s == val
|
257
|
-
end
|
258
|
-
|
259
|
-
filter '@!=' do |attr,val,i|
|
260
|
-
get_attribute(attr).to_s != val
|
261
|
-
end
|
262
|
-
|
263
|
-
filter '@~=' do |attr,val,i|
|
264
|
-
get_attribute(attr).to_s.split(/\s+/).include? val
|
352
|
+
filter 'text' do
|
353
|
+
self.text?
|
265
354
|
end
|
266
355
|
|
267
|
-
filter '
|
268
|
-
|
356
|
+
filter 'comment' do
|
357
|
+
self.comment?
|
269
358
|
end
|
270
359
|
|
271
|
-
filter
|
272
|
-
|
360
|
+
filter :contains do |arg,|
|
361
|
+
html.include? arg
|
273
362
|
end
|
274
363
|
|
275
|
-
|
276
|
-
|
364
|
+
pred_procs =
|
365
|
+
{'text()' => proc { |ele, *_| ele.inner_text.strip },
|
366
|
+
'@' => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? }}
|
367
|
+
|
368
|
+
oper_procs =
|
369
|
+
{'=' => proc { |a,b| a == b },
|
370
|
+
'!=' => proc { |a,b| a != b },
|
371
|
+
'~=' => proc { |a,b| a.split(/\s+/).include?(b) },
|
372
|
+
'|=' => proc { |a,b| a =~ /^#{Regexp::quote b}(-|$)/ },
|
373
|
+
'^=' => proc { |a,b| a.index(b) == 0 },
|
374
|
+
'$=' => proc { |a,b| a =~ /#{Regexp::quote b}$/ },
|
375
|
+
'*=' => proc { |a,b| idx = a.index(b) }}
|
376
|
+
|
377
|
+
pred_procs.each do |pred_n, pred_f|
|
378
|
+
oper_procs.each do |oper_n, oper_f|
|
379
|
+
filter "#{pred_n}#{oper_n}" do |*a|
|
380
|
+
qual = pred_f[self, *a]
|
381
|
+
oper_f[qual, a[-2]] if qual
|
382
|
+
end
|
383
|
+
end
|
277
384
|
end
|
278
385
|
|
279
|
-
filter '
|
280
|
-
|
386
|
+
filter 'text()' do |val,i|
|
387
|
+
!self.inner_text.strip.empty?
|
281
388
|
end
|
282
389
|
|
283
390
|
filter '@' do |attr,val,i|
|
284
|
-
has_attribute? attr
|
391
|
+
self.elem? and has_attribute? attr
|
285
392
|
end
|
286
393
|
|
287
394
|
filter '[' do |val,i|
|
288
|
-
search(val).length > 0
|
395
|
+
self.elem? and search(val).length > 0
|
289
396
|
end
|
290
397
|
|
291
398
|
end
|