hpricot 0.4-mswin32 → 0.5-mswin32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +16 -0
- data/README +279 -4
- data/Rakefile +12 -3
- data/ext/hpricot_scan/hpricot_scan.c +3106 -3348
- data/ext/hpricot_scan/hpricot_scan.rl +78 -38
- data/lib/hpricot.rb +19 -0
- data/lib/hpricot/elements.rb +194 -87
- data/lib/hpricot/inspect.rb +13 -0
- data/lib/hpricot/parse.rb +83 -99
- data/lib/hpricot/tag.rb +114 -40
- data/lib/hpricot/traverse.rb +311 -61
- data/lib/hpricot_scan.so +0 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/test_parser.rb +160 -10
- data/test/test_paths.rb +16 -0
- data/test/test_preserved.rb +46 -0
- data/test/test_xml.rb +15 -0
- metadata +41 -35
@@ -8,14 +8,21 @@
|
|
8
8
|
*/
|
9
9
|
#include <ruby.h>
|
10
10
|
|
11
|
+
#define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
|
12
|
+
|
11
13
|
static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
|
12
14
|
sym_cdata, sym_text;
|
15
|
+
static VALUE rb_eHpricotParseError;
|
13
16
|
static ID s_read, s_to_str;
|
14
17
|
|
15
18
|
#define ELE(N) \
|
16
|
-
if (tokend > tokstart) { \
|
17
|
-
|
18
|
-
|
19
|
+
if (tokend > tokstart || text == 1) { \
|
20
|
+
VALUE raw_string = Qnil; \
|
21
|
+
ele_open = 0; text = 0; \
|
22
|
+
if (tokstart != 0 && sym_##N != sym_cdata && sym_##N != sym_text && sym_##N != sym_procins && sym_##N != sym_comment) { \
|
23
|
+
raw_string = rb_str_new(tokstart, tokend-tokstart); \
|
24
|
+
} \
|
25
|
+
rb_yield_tokens(sym_##N, tag, attr, raw_string, taint); \
|
19
26
|
}
|
20
27
|
|
21
28
|
#define SET(N, E) \
|
@@ -34,6 +41,24 @@ static ID s_read, s_to_str;
|
|
34
41
|
rb_hash_aset(attr, K, V); \
|
35
42
|
}
|
36
43
|
|
44
|
+
#define TEXT_PASS() \
|
45
|
+
if (text == 0) \
|
46
|
+
{ \
|
47
|
+
if (ele_open == 1) { \
|
48
|
+
ele_open = 0; \
|
49
|
+
if (tokstart > 0) { \
|
50
|
+
mark_tag = tokstart; \
|
51
|
+
} \
|
52
|
+
} else { \
|
53
|
+
mark_tag = p; \
|
54
|
+
} \
|
55
|
+
attr = Qnil; \
|
56
|
+
tag = Qnil; \
|
57
|
+
text = 1; \
|
58
|
+
}
|
59
|
+
|
60
|
+
#define EBLK(N, T) CAT(tag, p - T + 1); ELE(N);
|
61
|
+
|
37
62
|
%%{
|
38
63
|
machine hpricot_scan;
|
39
64
|
|
@@ -55,6 +80,10 @@ static ID s_read, s_to_str;
|
|
55
80
|
action tag { SET(tag, p); }
|
56
81
|
action tagc { SET(tag, p-1); }
|
57
82
|
action aval { SET(aval, p); }
|
83
|
+
action aunq {
|
84
|
+
if (*(p-1) == '"' || *(p-1) == '\'') { SET(aval, p-1); }
|
85
|
+
else { SET(aval, p); }
|
86
|
+
}
|
58
87
|
action akey { SET(akey, p); }
|
59
88
|
action xmlver { SET(aval, p); ATTR(rb_str_new2("version"), aval); }
|
60
89
|
action xmlenc { SET(aval, p); ATTR(rb_str_new2("encoding"), aval); }
|
@@ -79,7 +108,7 @@ static ID s_read, s_to_str;
|
|
79
108
|
#
|
80
109
|
newline = '\n' @{curline += 1;} ;
|
81
110
|
# qtext = '"' ( '\"' | [^\n"] )* '"' | "'" ( "\\'" | [^\n'] )* "'" ;
|
82
|
-
NameChar = [\-A-Za-z0-9._
|
111
|
+
NameChar = [\-A-Za-z0-9._:?] ;
|
83
112
|
Name = [A-Za-z_:] NameChar* ;
|
84
113
|
StartComment = "<!--" ;
|
85
114
|
EndComment = "-->" ;
|
@@ -87,14 +116,14 @@ static ID s_read, s_to_str;
|
|
87
116
|
EndCdata = "]]>" ;
|
88
117
|
|
89
118
|
NameCap = Name >_tag %tag;
|
90
|
-
NameAttr =
|
119
|
+
NameAttr = NameChar+ >_akey %akey ;
|
91
120
|
Q1Attr = [^']* >_aval %aval ;
|
92
121
|
Q2Attr = [^"]* >_aval %aval ;
|
93
|
-
UnqAttr = [^ \t\n<>"'] >_aval [^ \t\n<>]* %
|
122
|
+
UnqAttr = ( space >_aval | [^ \t\n<>"'] >_aval [^ \t\n<>]* %aunq ) ;
|
94
123
|
Nmtoken = NameChar+ >_akey %akey ;
|
95
124
|
|
96
125
|
Attr = NameAttr space* "=" space* ('"' Q2Attr '"' | "'" Q1Attr "'" | UnqAttr space+ ) space* ;
|
97
|
-
AttrEnd = ( NameAttr space* "=" space* UnqAttr | Nmtoken >new_attr %save_attr ) ;
|
126
|
+
AttrEnd = ( NameAttr space* "=" space* UnqAttr? | Nmtoken >new_attr %save_attr ) ;
|
98
127
|
AttrSet = ( Attr >new_attr %save_attr | Nmtoken >new_attr space+ %save_attr ) ;
|
99
128
|
StartTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? ">" | "<" NameCap ">";
|
100
129
|
EmptyTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? "/>" | "<" NameCap "/>" ;
|
@@ -113,14 +142,23 @@ static ID s_read, s_to_str;
|
|
113
142
|
"'" [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]* >_aval %pubid "'" ;
|
114
143
|
ExternalID = ( "SYSTEM" | "PUBLIC" space+ PubidLiteral ) (space+ SystemLiteral)? ;
|
115
144
|
DocType = "<!DOCTYPE" space+ NameCap (space+ ExternalID)? space* ("[" [^\]]* "]" space*)? ">" ;
|
116
|
-
StartXmlProcIns = "<?" Name space+ ;
|
145
|
+
StartXmlProcIns = "<?" Name >{ TEXT_PASS(); } space+ ;
|
117
146
|
EndXmlProcIns = "?>" ;
|
118
147
|
|
119
|
-
html_comment :=
|
148
|
+
html_comment := |*
|
149
|
+
EndComment @{ EBLK(comment, 3); fgoto main; };
|
150
|
+
any | newline { TEXT_PASS(); };
|
151
|
+
*|;
|
120
152
|
|
121
|
-
html_cdata :=
|
153
|
+
html_cdata := |*
|
154
|
+
EndCdata @{ EBLK(cdata, 3); fgoto main; };
|
155
|
+
any | newline { TEXT_PASS(); };
|
156
|
+
*|;
|
122
157
|
|
123
|
-
html_procins :=
|
158
|
+
html_procins := |*
|
159
|
+
EndXmlProcIns @{ EBLK(procins, 2); fgoto main; };
|
160
|
+
any | newline { TEXT_PASS(); };
|
161
|
+
*|;
|
124
162
|
|
125
163
|
main := |*
|
126
164
|
XmlDecl >newEle { ELE(xmldecl); };
|
@@ -131,23 +169,7 @@ static ID s_read, s_to_str;
|
|
131
169
|
EmptyTag >newEle { ELE(emptytag); };
|
132
170
|
StartComment >newEle { fgoto html_comment; };
|
133
171
|
StartCdata >newEle { fgoto html_cdata; };
|
134
|
-
|
135
|
-
any | newline {
|
136
|
-
if (text == 0)
|
137
|
-
{
|
138
|
-
if (ele_open == 1) {
|
139
|
-
ele_open = 0;
|
140
|
-
if (tokstart > 0) {
|
141
|
-
mark_tag = tokstart;
|
142
|
-
}
|
143
|
-
} else {
|
144
|
-
mark_tag = p;
|
145
|
-
}
|
146
|
-
attr = Qnil;
|
147
|
-
tag = Qnil;
|
148
|
-
text = 1;
|
149
|
-
}
|
150
|
-
};
|
172
|
+
any | newline { TEXT_PASS(); };
|
151
173
|
*|;
|
152
174
|
}%%
|
153
175
|
|
@@ -173,13 +195,12 @@ void rb_yield_tokens(VALUE sym, VALUE tag, VALUE attr, VALUE raw, int taint)
|
|
173
195
|
|
174
196
|
VALUE hpricot_scan(VALUE self, VALUE port)
|
175
197
|
{
|
176
|
-
static char buf[BUFSIZE];
|
177
198
|
int cs, act, have = 0, nread = 0, curline = 1, text = 0;
|
178
|
-
char *tokstart = 0, *tokend = 0;
|
199
|
+
char *tokstart = 0, *tokend = 0, *buf = NULL;
|
179
200
|
|
180
|
-
VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil;
|
201
|
+
VALUE attr = Qnil, tag = Qnil, akey = Qnil, aval = Qnil, bufsize = Qnil;
|
181
202
|
char *mark_tag = 0, *mark_akey = 0, *mark_aval = 0;
|
182
|
-
int done = 0, ele_open = 0;
|
203
|
+
int done = 0, ele_open = 0, buffer_size = 0;
|
183
204
|
|
184
205
|
int taint = OBJ_TAINTED( port );
|
185
206
|
if ( !rb_respond_to( port, s_read ) )
|
@@ -195,18 +216,27 @@ VALUE hpricot_scan(VALUE self, VALUE port)
|
|
195
216
|
}
|
196
217
|
}
|
197
218
|
|
219
|
+
buffer_size = BUFSIZE;
|
220
|
+
if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
|
221
|
+
bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
|
222
|
+
if (!NIL_P(bufsize)) {
|
223
|
+
buffer_size = NUM2INT(bufsize);
|
224
|
+
}
|
225
|
+
}
|
226
|
+
buf = ALLOC_N(char, buffer_size);
|
227
|
+
|
198
228
|
%% write init;
|
199
229
|
|
200
230
|
while ( !done ) {
|
201
231
|
VALUE str;
|
202
232
|
char *p = buf + have, *pe;
|
203
|
-
int len, space =
|
233
|
+
int len, space = buffer_size - have;
|
204
234
|
|
205
235
|
if ( space == 0 ) {
|
206
236
|
/* We've used up the entire buffer storing an already-parsed token
|
207
|
-
* prefix that must be preserved.
|
208
|
-
|
209
|
-
|
237
|
+
* prefix that must be preserved. Likely caused by super-long attributes.
|
238
|
+
* See ticket #13. */
|
239
|
+
rb_raise(rb_eHpricotParseError, "ran out of buffer space on element <%s>, starting on line %d.", RSTRING(tag)->ptr, curline);
|
210
240
|
}
|
211
241
|
|
212
242
|
if ( rb_respond_to( port, s_read ) )
|
@@ -233,8 +263,15 @@ VALUE hpricot_scan(VALUE self, VALUE port)
|
|
233
263
|
%% write exec;
|
234
264
|
|
235
265
|
if ( cs == hpricot_scan_error ) {
|
236
|
-
|
237
|
-
|
266
|
+
free(buf);
|
267
|
+
if ( !NIL_P(tag) )
|
268
|
+
{
|
269
|
+
rb_raise(rb_eHpricotParseError, "parse error on element <%s>, starting on line %d.\n" NO_WAY_SERIOUSLY, RSTRING(tag)->ptr, curline);
|
270
|
+
}
|
271
|
+
else
|
272
|
+
{
|
273
|
+
rb_raise(rb_eHpricotParseError, "parse error on line %d.\n" NO_WAY_SERIOUSLY, curline);
|
274
|
+
}
|
238
275
|
}
|
239
276
|
|
240
277
|
if ( done && ele_open )
|
@@ -279,12 +316,15 @@ VALUE hpricot_scan(VALUE self, VALUE port)
|
|
279
316
|
tokstart = buf;
|
280
317
|
}
|
281
318
|
}
|
319
|
+
free(buf);
|
282
320
|
}
|
283
321
|
|
284
322
|
void Init_hpricot_scan()
|
285
323
|
{
|
286
324
|
VALUE mHpricot = rb_define_module("Hpricot");
|
325
|
+
rb_define_attr(rb_singleton_class(mHpricot), "buffer_size", 1, 1);
|
287
326
|
rb_define_singleton_method(mHpricot, "scan", hpricot_scan, 1);
|
327
|
+
rb_eHpricotParseError = rb_define_class_under(mHpricot, "ParseError", rb_eException);
|
288
328
|
|
289
329
|
s_read = rb_intern("read");
|
290
330
|
s_to_str = rb_intern("to_str");
|
data/lib/hpricot.rb
CHANGED
@@ -1,3 +1,22 @@
|
|
1
|
+
# == About hpricot.rb
|
2
|
+
#
|
3
|
+
# All of Hpricot's various parts are loaded when you use <tt>require 'hpricot'</tt>.
|
4
|
+
#
|
5
|
+
# * hpricot_scan: the scanner (a C extension for Ruby) which turns an HTML stream into tokens.
|
6
|
+
# * hpricot/parse.rb: uses the scanner to sort through tokens and give you back a complete document object.
|
7
|
+
# * hpricot/tag.rb: sets up objects for the various types of elements in an HTML document.
|
8
|
+
# * hpricot/modules.rb: categorizes the various elements using mixins.
|
9
|
+
# * hpricot/traverse.rb: methods for searching documents.
|
10
|
+
# * hpricot/elements.rb: methods for dealing with a group of elements as an Hpricot::Elements list.
|
11
|
+
# * hpricot/inspect.rb: methods for displaying documents in a readable form.
|
12
|
+
|
13
|
+
# If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
|
14
|
+
# See http://git.bitwi.se/ruby-character-encodings.git/.
|
15
|
+
begin
|
16
|
+
require 'encoding/character/utf-8'
|
17
|
+
rescue LoadError
|
18
|
+
end
|
19
|
+
|
1
20
|
require 'hpricot_scan'
|
2
21
|
require 'hpricot/tag'
|
3
22
|
require 'hpricot/modules'
|
data/lib/hpricot/elements.rb
CHANGED
@@ -1,66 +1,163 @@
|
|
1
1
|
module Hpricot
|
2
|
+
# Once you've matched a list of elements, you will often need to handle them as
|
3
|
+
# a group. Or you may want to perform the same action on each of them.
|
4
|
+
# Hpricot::Elements is an extension of Ruby's array class, with some methods
|
5
|
+
# added for altering elements contained in the array.
|
6
|
+
#
|
7
|
+
# If you need to create an element array from regular elements:
|
8
|
+
#
|
9
|
+
# Hpricot::Elements[ele1, ele2, ele3]
|
10
|
+
#
|
11
|
+
# Assuming that ele1, ele2 and ele3 contain element objects (Hpricot::Elem,
|
12
|
+
# Hpricot::Doc, etc.)
|
13
|
+
#
|
14
|
+
# == Continuing Searches
|
15
|
+
#
|
16
|
+
# Usually the Hpricot::Elements you're working on comes from a search you've
|
17
|
+
# done. Well, you can continue searching the list by using the same <tt>at</tt>
|
18
|
+
# and <tt>search</tt> methods you can use on plain elements.
|
19
|
+
#
|
20
|
+
# elements = doc.search("/div/p")
|
21
|
+
# elements = elements.search("/a[@href='http://hoodwink.d/']")
|
22
|
+
# elements = elements.at("img")
|
23
|
+
#
|
24
|
+
# == Altering Elements
|
25
|
+
#
|
26
|
+
# When you're altering elements in the list, your changes will be reflected in
|
27
|
+
# the document you started searching from.
|
28
|
+
#
|
29
|
+
# doc = Hpricot("That's my <b>spoon</b>, Tyler.")
|
30
|
+
# doc.at("b").swap("<i>fork</i>")
|
31
|
+
# doc.to_html
|
32
|
+
# #=> "That's my <i>fork</i>, Tyler."
|
33
|
+
#
|
34
|
+
# == Getting More Detailed
|
35
|
+
#
|
36
|
+
# If you can't find a method here that does what you need, you may need to
|
37
|
+
# loop through the elements and find a method in Hpricot::Container::Trav
|
38
|
+
# which can do what you need.
|
39
|
+
#
|
40
|
+
# For example, you may want to search for all the H3 header tags in a document
|
41
|
+
# and grab all the tags underneath the header, but not inside the header.
|
42
|
+
# A good method for this is <tt>next_sibling</tt>:
|
43
|
+
#
|
44
|
+
# doc.search("h3").each do |h3|
|
45
|
+
# while ele = h3.next_sibling
|
46
|
+
# ary << ele # stuff away all the elements under the h3
|
47
|
+
# end
|
48
|
+
# end
|
49
|
+
#
|
50
|
+
# Most of the useful element methods are in the mixins Hpricot::Traverse
|
51
|
+
# and Hpricot::Container::Trav.
|
2
52
|
class Elements < Array
|
53
|
+
# Searches this list for any elements (or children of these elements) matching
|
54
|
+
# the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
|
55
|
+
#
|
56
|
+
# See Hpricot::Container::Trav.search for more.
|
3
57
|
def search(*expr,&blk)
|
4
|
-
map { |x| x.search(*expr,&blk) }.flatten.uniq
|
58
|
+
Elements[*map { |x| x.search(*expr,&blk) }.flatten.uniq]
|
5
59
|
end
|
6
60
|
alias_method :/, :search
|
7
61
|
|
62
|
+
# Searches this list for the first element (or child of these elements) matching
|
63
|
+
# the CSS or XPath expression +expr+. Root is assumed to be the element scanned.
|
64
|
+
#
|
65
|
+
# See Hpricot::Container::Trav.at for more.
|
8
66
|
def at(expr, &blk)
|
9
67
|
search(expr, &blk).first
|
10
68
|
end
|
11
69
|
alias_method :%, :at
|
12
70
|
|
71
|
+
# Convert this group of elements into a complete HTML fragment, returned as a
|
72
|
+
# string.
|
13
73
|
def to_html
|
14
74
|
map { |x| x.output("") }.join
|
15
75
|
end
|
16
76
|
alias_method :to_s, :to_html
|
17
77
|
|
18
|
-
|
19
|
-
|
78
|
+
# Returns an HTML fragment built of the contents of each element in this list.
|
79
|
+
#
|
80
|
+
# If an HTML +string+ is supplied, this method acts like inner_html=.
|
81
|
+
def inner_html(*string)
|
82
|
+
if string.empty?
|
20
83
|
map { |x| x.inner_html }.join
|
21
84
|
else
|
22
|
-
x = self.inner_html =
|
85
|
+
x = self.inner_html = string.pop || x
|
23
86
|
end
|
24
87
|
end
|
25
|
-
alias_method :text, :inner_html
|
26
88
|
alias_method :html, :inner_html
|
27
89
|
alias_method :innerHTML, :inner_html
|
28
90
|
|
29
|
-
|
30
|
-
|
91
|
+
# Replaces the contents of each element in this list. Supply an HTML +string+,
|
92
|
+
# which is loaded into Hpricot objects and inserted into every element in this
|
93
|
+
# list.
|
94
|
+
def inner_html=(string)
|
95
|
+
each { |x| x.inner_html = string }
|
31
96
|
end
|
32
97
|
alias_method :html=, :inner_html=
|
33
98
|
alias_method :innerHTML=, :inner_html=
|
34
99
|
|
35
|
-
|
36
|
-
|
37
|
-
|
100
|
+
# Returns a string containing the text contents of each element in this list.
|
101
|
+
# All HTML tags are removed.
|
102
|
+
def inner_text
|
103
|
+
map { |x| x.inner_text }.join
|
38
104
|
end
|
105
|
+
alias_method :text, :inner_text
|
39
106
|
|
107
|
+
# Remove all elements in this list from the document which contains them.
|
108
|
+
#
|
109
|
+
# doc = Hpricot("<html>Remove this: <b>here</b></html>")
|
110
|
+
# doc.search("b").remove
|
111
|
+
# doc.to_html
|
112
|
+
# => "<html>Remove this: </html>"
|
113
|
+
#
|
40
114
|
def remove
|
41
115
|
each { |x| x.parent.children.delete(x) }
|
42
116
|
end
|
43
117
|
|
118
|
+
# Empty the elements in this list, by removing their insides.
|
119
|
+
#
|
120
|
+
# doc = Hpricot("<p> We have <i>so much</i> to say.</p>")
|
121
|
+
# doc.search("i").empty
|
122
|
+
# doc.to_html
|
123
|
+
# => "<p> We have <i></i> to say.</p>"
|
124
|
+
#
|
44
125
|
def empty
|
45
126
|
each { |x| x.inner_html = nil }
|
46
127
|
end
|
47
128
|
|
129
|
+
# Add to the end of the contents inside each element in this list.
|
130
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
48
131
|
def append(str)
|
49
132
|
each { |x| x.inner_html += str }
|
50
133
|
end
|
51
134
|
|
135
|
+
# Add to the start of the contents inside each element in this list.
|
136
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
52
137
|
def prepend(str)
|
53
138
|
each { |x| x.inner_html = str + x.inner_html }
|
54
139
|
end
|
55
|
-
|
140
|
+
|
141
|
+
# Add some HTML just previous to each element in this list.
|
142
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
56
143
|
def before(str)
|
57
144
|
each { |x| x.parent.insert_before Hpricot.make(str), x }
|
58
145
|
end
|
59
146
|
|
147
|
+
# Just after each element in this list, add some HTML.
|
148
|
+
# Pass in an HTML +str+, which is turned into Hpricot elements.
|
60
149
|
def after(str)
|
61
150
|
each { |x| x.parent.insert_after Hpricot.make(str), x }
|
62
151
|
end
|
63
152
|
|
153
|
+
# Wraps each element in the list inside the element created by HTML +str+.
|
154
|
+
# If more than one element is found in the string, Hpricot locates the
|
155
|
+
# deepest spot inside the first element.
|
156
|
+
#
|
157
|
+
# doc.search("a[@href]").
|
158
|
+
# wrap(%{<div class="link"><div class="link_inner"></div></div>})
|
159
|
+
#
|
160
|
+
# This code wraps every link on the page inside a +div.link+ and a +div.link_inner+ nest.
|
64
161
|
def wrap(str)
|
65
162
|
each do |x|
|
66
163
|
wrap = Hpricot.make(str)
|
@@ -74,15 +171,15 @@ module Hpricot
|
|
74
171
|
end
|
75
172
|
end
|
76
173
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
174
|
+
# Sets an attribute for all elements in this list. You may use
|
175
|
+
# a simple pair (<em>attribute name</em>, <em>attribute value</em>):
|
176
|
+
#
|
177
|
+
# doc.search('p').set(:class, 'outline')
|
178
|
+
#
|
179
|
+
# Or, use a hash of pairs:
|
180
|
+
#
|
181
|
+
# doc.search('div#sidebar').set(:class => 'outline', :id => 'topbar')
|
182
|
+
#
|
86
183
|
def set(k, v = nil)
|
87
184
|
case k
|
88
185
|
when Hash
|
@@ -96,9 +193,9 @@ module Hpricot
|
|
96
193
|
end
|
97
194
|
end
|
98
195
|
|
99
|
-
ATTR_RE = %r!\[ *(@)([
|
196
|
+
ATTR_RE = %r!\[ *(?:(@)([\w\(\)-]+)|([\w\(\)-]+\(\))) *([~\!\|\*$\^=]*) *'?"?([^'"]*)'?"? *\]!i
|
100
197
|
BRACK_RE = %r!(\[) *([^\]]*) *\]!i
|
101
|
-
FUNC_RE = %r!(:)([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)
|
198
|
+
FUNC_RE = %r!(:)?([a-zA-Z0-9\*_-]*)\( *[\"']?([^ \)]*?)['\"]? *\)!
|
102
199
|
CATCH_RE = %r!([:\.#]*)([a-zA-Z0-9\*_-]+)!
|
103
200
|
|
104
201
|
def self.filter(nodes, expr, truth = true)
|
@@ -112,16 +209,20 @@ module Hpricot
|
|
112
209
|
m[0] = "@#{m.slice!(2,1)}"
|
113
210
|
end
|
114
211
|
|
212
|
+
if m[0] == '[' && m[1] =~ /^\d+$/
|
213
|
+
m = [":", "nth", m[1].to_i-1]
|
214
|
+
end
|
215
|
+
|
115
216
|
if m[0] == ":" && m[1] == "not"
|
116
217
|
nodes, = Elements.filter(nodes, m[2], false)
|
117
218
|
else
|
118
|
-
meth = "filter[#{m[0]}]"
|
119
|
-
if
|
120
|
-
args = m[
|
219
|
+
meth = "filter[#{m[0]}#{m[1]}]"
|
220
|
+
if Traverse.method_defined? meth
|
221
|
+
args = m[2..-1]
|
121
222
|
else
|
122
|
-
meth = "filter[#{m[0]}
|
123
|
-
if
|
124
|
-
args = m[
|
223
|
+
meth = "filter[#{m[0]}]"
|
224
|
+
if Traverse.method_defined? meth
|
225
|
+
args = m[1..-1]
|
125
226
|
end
|
126
227
|
end
|
127
228
|
i = -1
|
@@ -134,7 +235,19 @@ module Hpricot
|
|
134
235
|
[nodes, expr]
|
135
236
|
end
|
136
237
|
|
137
|
-
def
|
238
|
+
def filter(expr)
|
239
|
+
nodes, = Elements.filter(self, expr)
|
240
|
+
nodes
|
241
|
+
end
|
242
|
+
|
243
|
+
def not(expr)
|
244
|
+
if expr.is_a? Traverse
|
245
|
+
nodes = self - [expr]
|
246
|
+
else
|
247
|
+
nodes, = Elements.filter(self, expr, false)
|
248
|
+
end
|
249
|
+
nodes
|
250
|
+
end
|
138
251
|
|
139
252
|
private
|
140
253
|
def copy_node(node, l)
|
@@ -145,50 +258,51 @@ module Hpricot
|
|
145
258
|
|
146
259
|
end
|
147
260
|
|
148
|
-
module
|
261
|
+
module Traverse
|
149
262
|
def self.filter(tok, &blk)
|
150
263
|
define_method("filter[#{tok.is_a?(String) ? tok : tok.inspect}]", &blk)
|
151
264
|
end
|
152
265
|
|
153
266
|
filter '' do |name,i|
|
154
|
-
name == '*' || self.name.downcase == name.downcase
|
267
|
+
name == '*' || (self.respond_to?(:name) && self.name.downcase == name.downcase)
|
155
268
|
end
|
156
269
|
|
157
270
|
filter '#' do |id,i|
|
158
|
-
get_attribute('id').to_s == id
|
271
|
+
self.elem? and get_attribute('id').to_s == id
|
159
272
|
end
|
160
273
|
|
161
274
|
filter '.' do |name,i|
|
162
|
-
classes.include? name
|
275
|
+
self.elem? and classes.include? name
|
163
276
|
end
|
164
277
|
|
165
278
|
filter :lt do |num,i|
|
166
|
-
|
279
|
+
self.position < num.to_i
|
167
280
|
end
|
168
281
|
|
169
282
|
filter :gt do |num,i|
|
170
|
-
|
283
|
+
self.position > num.to_i
|
171
284
|
end
|
172
285
|
|
173
|
-
nth = proc { |num,i|
|
286
|
+
nth = proc { |num,i| self.position == num.to_i }
|
287
|
+
nth_first = proc { |*a| self.position == 0 }
|
288
|
+
nth_last = proc { |*a| self == parent.children_of_type(self.name).last }
|
174
289
|
|
175
290
|
filter :nth, &nth
|
176
291
|
filter :eq, &nth
|
292
|
+
filter ":nth-of-type", &nth
|
177
293
|
|
178
|
-
filter :first
|
179
|
-
|
180
|
-
end
|
294
|
+
filter :first, &nth_first
|
295
|
+
filter ":first-of-type", &nth_first
|
181
296
|
|
182
|
-
filter :last
|
183
|
-
|
184
|
-
end
|
297
|
+
filter :last, &nth_last
|
298
|
+
filter ":last-of-type", &nth_last
|
185
299
|
|
186
300
|
filter :even do |num,i|
|
187
|
-
|
301
|
+
self.position % 2 == 0
|
188
302
|
end
|
189
303
|
|
190
304
|
filter :odd do |num,i|
|
191
|
-
|
305
|
+
self.position % 2 == 1
|
192
306
|
end
|
193
307
|
|
194
308
|
filter ':first-child' do |i|
|
@@ -204,32 +318,19 @@ module Hpricot
|
|
204
318
|
end
|
205
319
|
|
206
320
|
filter ":last-child" do |i|
|
207
|
-
self == parent.containers.
|
321
|
+
self == parent.containers.last
|
208
322
|
end
|
209
323
|
|
210
324
|
filter ":nth-last-child" do |arg,i|
|
211
325
|
self == parent.containers[-1-arg.to_i]
|
212
326
|
end
|
213
327
|
|
214
|
-
filter ":
|
215
|
-
self == parent.
|
216
|
-
end
|
217
|
-
|
218
|
-
filter ":nth-of-type" do |arg,i|
|
219
|
-
self == parent.containers.find_all { |x| x.name == arg }[arg.to_i]
|
220
|
-
end
|
221
|
-
|
222
|
-
filter ":last-of-type" do |i|
|
223
|
-
self == parent.containers.find_all { |x| x.name == self.name }.last
|
224
|
-
end
|
225
|
-
|
226
|
-
filter :"nth-last-of-type" do |arg,i|
|
227
|
-
self == parent.containers.find_all { |x| x.name == arg }[-1-arg.to_i]
|
328
|
+
filter ":nth-last-of-type" do |arg,i|
|
329
|
+
self == parent.children_of_type(self.name)[-1-arg.to_i]
|
228
330
|
end
|
229
331
|
|
230
332
|
filter ":only-of-type" do |arg,i|
|
231
|
-
|
232
|
-
of_type.length == 1
|
333
|
+
parent.children_of_type(self.name).length == 1
|
233
334
|
end
|
234
335
|
|
235
336
|
filter ":only-child" do |arg,i|
|
@@ -237,55 +338,61 @@ module Hpricot
|
|
237
338
|
end
|
238
339
|
|
239
340
|
filter :parent do
|
240
|
-
|
341
|
+
containers.length > 0
|
241
342
|
end
|
242
343
|
|
243
344
|
filter :empty do
|
244
|
-
|
345
|
+
containers.length == 0
|
245
346
|
end
|
246
347
|
|
247
348
|
filter :root do
|
248
349
|
self.is_a? Hpricot::Doc
|
249
350
|
end
|
250
351
|
|
251
|
-
filter
|
252
|
-
|
253
|
-
end
|
254
|
-
|
255
|
-
filter '@=' do |attr,val,i|
|
256
|
-
get_attribute(attr).to_s == val
|
257
|
-
end
|
258
|
-
|
259
|
-
filter '@!=' do |attr,val,i|
|
260
|
-
get_attribute(attr).to_s != val
|
261
|
-
end
|
262
|
-
|
263
|
-
filter '@~=' do |attr,val,i|
|
264
|
-
get_attribute(attr).to_s.split(/\s+/).include? val
|
352
|
+
filter 'text' do
|
353
|
+
self.text?
|
265
354
|
end
|
266
355
|
|
267
|
-
filter '
|
268
|
-
|
356
|
+
filter 'comment' do
|
357
|
+
self.comment?
|
269
358
|
end
|
270
359
|
|
271
|
-
filter
|
272
|
-
|
360
|
+
filter :contains do |arg,|
|
361
|
+
html.include? arg
|
273
362
|
end
|
274
363
|
|
275
|
-
|
276
|
-
|
364
|
+
pred_procs =
|
365
|
+
{'text()' => proc { |ele, *_| ele.inner_text.strip },
|
366
|
+
'@' => proc { |ele, attr, *_| ele.get_attribute(attr).to_s if ele.elem? }}
|
367
|
+
|
368
|
+
oper_procs =
|
369
|
+
{'=' => proc { |a,b| a == b },
|
370
|
+
'!=' => proc { |a,b| a != b },
|
371
|
+
'~=' => proc { |a,b| a.split(/\s+/).include?(b) },
|
372
|
+
'|=' => proc { |a,b| a =~ /^#{Regexp::quote b}(-|$)/ },
|
373
|
+
'^=' => proc { |a,b| a.index(b) == 0 },
|
374
|
+
'$=' => proc { |a,b| a =~ /#{Regexp::quote b}$/ },
|
375
|
+
'*=' => proc { |a,b| idx = a.index(b) }}
|
376
|
+
|
377
|
+
pred_procs.each do |pred_n, pred_f|
|
378
|
+
oper_procs.each do |oper_n, oper_f|
|
379
|
+
filter "#{pred_n}#{oper_n}" do |*a|
|
380
|
+
qual = pred_f[self, *a]
|
381
|
+
oper_f[qual, a[-2]] if qual
|
382
|
+
end
|
383
|
+
end
|
277
384
|
end
|
278
385
|
|
279
|
-
filter '
|
280
|
-
|
386
|
+
filter 'text()' do |val,i|
|
387
|
+
!self.inner_text.strip.empty?
|
281
388
|
end
|
282
389
|
|
283
390
|
filter '@' do |attr,val,i|
|
284
|
-
has_attribute? attr
|
391
|
+
self.elem? and has_attribute? attr
|
285
392
|
end
|
286
393
|
|
287
394
|
filter '[' do |val,i|
|
288
|
-
search(val).length > 0
|
395
|
+
self.elem? and search(val).length > 0
|
289
396
|
end
|
290
397
|
|
291
398
|
end
|