adamh-hpricot 0.6.171 → 0.6.210
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +3 -2
- data/ext/fast_xs/fast_xs.c +11 -5
- data/ext/hpricot_scan/hpricot_css.c +2109 -2105
- data/ext/hpricot_scan/hpricot_scan.c +188 -139
- data/ext/hpricot_scan/hpricot_scan.rl +65 -14
- data/lib/hpricot/builder.rb +17 -10
- data/lib/hpricot/elements.rb +10 -10
- data/lib/hpricot/htmlinfo.rb +19 -0
- data/lib/hpricot/parse.rb +3 -1
- data/lib/hpricot/tag.rb +13 -1
- data/test/files/boingboing.html +1 -1
- data/test/test_builder.rb +4 -4
- data/test/test_parser.rb +18 -9
- data/test/test_preserved.rb +6 -2
- metadata +2 -2
@@ -19,10 +19,10 @@ VALUE hpricot_css(VALUE, VALUE, VALUE, VALUE, VALUE);
|
|
19
19
|
#define NO_WAY_SERIOUSLY "*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!"
|
20
20
|
|
21
21
|
static VALUE sym_xmldecl, sym_doctype, sym_procins, sym_stag, sym_etag, sym_emptytag, sym_comment,
|
22
|
-
sym_cdata, sym_text, sym_EMPTY;
|
22
|
+
sym_cdata, sym_text, sym_EMPTY, sym_CDATA;
|
23
23
|
static VALUE mHpricot, rb_eHpricotParseError;
|
24
24
|
static VALUE cBaseEle, cBogusETag, cCData, cComment, cDoc, cDocType, cElem, cETag, cText,
|
25
|
-
cXMLDecl, cProcIns;
|
25
|
+
cXMLDecl, cProcIns, symAllow, symDeny;
|
26
26
|
static ID s_ElementContent;
|
27
27
|
static ID s_downcase, s_new, s_parent, s_read, s_to_str;
|
28
28
|
static ID iv_parent;
|
@@ -30,7 +30,7 @@ static VALUE reProcInsParse;
|
|
30
30
|
|
31
31
|
typedef struct {
|
32
32
|
int name;
|
33
|
-
VALUE tag, attr, etag, raw;
|
33
|
+
VALUE tag, attr, etag, raw, EC;
|
34
34
|
VALUE parent, children;
|
35
35
|
} hpricot_ele;
|
36
36
|
|
@@ -249,6 +249,7 @@ hpricot_ele_clear_raw(VALUE self)
|
|
249
249
|
he->tag = tag; \
|
250
250
|
he->attr = attr; \
|
251
251
|
he->raw = Qnil; \
|
252
|
+
he->EC = ec; \
|
252
253
|
he->etag = he->parent = he->children = Qnil; \
|
253
254
|
if (raw != NULL && (sym == sym_emptytag || sym == sym_stag || sym == sym_etag || sym == sym_doctype)) { \
|
254
255
|
he->raw = rb_str_new(raw, rawlen); \
|
@@ -262,7 +263,7 @@ hpricot_ele_alloc(VALUE klass)
|
|
262
263
|
VALUE ele;
|
263
264
|
hpricot_ele *he = ALLOC(hpricot_ele);
|
264
265
|
he->name = 0;
|
265
|
-
he->tag = he->attr = he->raw = Qnil;
|
266
|
+
he->tag = he->attr = he->raw = he->EC = Qnil;
|
266
267
|
he->etag = he->parent = he->children = Qnil;
|
267
268
|
ele = Data_Wrap_Struct(klass, hpricot_ele_mark, hpricot_ele_free, he);
|
268
269
|
return ele;
|
@@ -272,7 +273,7 @@ hpricot_ele_alloc(VALUE klass)
|
|
272
273
|
// the swift, compact parser logic. most of the complicated stuff is done
|
273
274
|
// in the lexer. this step just pairs up the start and end tags.
|
274
275
|
//
|
275
|
-
|
276
|
+
void
|
276
277
|
rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw, int rawlen, int taint)
|
277
278
|
{
|
278
279
|
VALUE ele, ec = Qnil;
|
@@ -281,6 +282,16 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
281
282
|
// in html mode, fix up start tags incorrectly formed as empty tags
|
282
283
|
//
|
283
284
|
if (!S->xml) {
|
285
|
+
hpricot_ele *last;
|
286
|
+
Data_Get_Struct(S->focus, hpricot_ele, last);
|
287
|
+
if (last->EC == sym_CDATA &&
|
288
|
+
(sym != sym_procins && sym != sym_comment && sym != sym_cdata && sym != sym_text) &&
|
289
|
+
!(sym == sym_etag && rb_str_hash(tag) == last->name))
|
290
|
+
{
|
291
|
+
sym = sym_text;
|
292
|
+
tag = rb_str_new(raw, rawlen);
|
293
|
+
}
|
294
|
+
|
284
295
|
if (sym == sym_emptytag || sym == sym_stag || sym == sym_etag) {
|
285
296
|
ec = rb_hash_aref(S->EC, tag);
|
286
297
|
if (NIL_P(ec)) {
|
@@ -300,6 +311,37 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
300
311
|
if (sym == sym_emptytag || sym == sym_stag) {
|
301
312
|
H_ELE(cElem);
|
302
313
|
he->name = rb_str_hash(tag);
|
314
|
+
|
315
|
+
if (!S->xml) {
|
316
|
+
VALUE match = Qnil, e = S->focus;
|
317
|
+
while (e != S->doc)
|
318
|
+
{
|
319
|
+
hpricot_ele *hee;
|
320
|
+
Data_Get_Struct(e, hpricot_ele, hee);
|
321
|
+
|
322
|
+
if (TYPE(hee->EC) == T_HASH)
|
323
|
+
{
|
324
|
+
VALUE has = rb_hash_lookup(hee->EC, INT2NUM(he->name));
|
325
|
+
if (has != Qnil) {
|
326
|
+
if (has == Qtrue) {
|
327
|
+
if (match == Qnil)
|
328
|
+
match = e;
|
329
|
+
} else if (has == symAllow) {
|
330
|
+
match = S->focus;
|
331
|
+
} else if (has == symDeny) {
|
332
|
+
match = Qnil;
|
333
|
+
}
|
334
|
+
}
|
335
|
+
}
|
336
|
+
|
337
|
+
e = hee->parent;
|
338
|
+
}
|
339
|
+
|
340
|
+
if (match == Qnil)
|
341
|
+
match = S->focus;
|
342
|
+
S->focus = match;
|
343
|
+
}
|
344
|
+
|
303
345
|
rb_hpricot_add(S->focus, ele);
|
304
346
|
|
305
347
|
//
|
@@ -326,6 +368,8 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
326
368
|
// another optimization will be to improve this very simple
|
327
369
|
// O(n) tag search, where n is the depth of the focused tag.
|
328
370
|
//
|
371
|
+
// (see also: the search above for fixups)
|
372
|
+
//
|
329
373
|
name = rb_str_hash(tag);
|
330
374
|
while (e != S->doc)
|
331
375
|
{
|
@@ -371,8 +415,10 @@ rb_hpricot_token(hpricot_state *S, VALUE sym, VALUE tag, VALUE attr, char *raw,
|
|
371
415
|
VALUE match = rb_funcall(tag, rb_intern("match"), 1, reProcInsParse);
|
372
416
|
tag = rb_reg_nth_match(1, match);
|
373
417
|
attr = rb_reg_nth_match(2, match);
|
374
|
-
|
375
|
-
|
418
|
+
{
|
419
|
+
H_ELE(cProcIns);
|
420
|
+
rb_hpricot_add(S->focus, ele);
|
421
|
+
}
|
376
422
|
} else if (sym == sym_text) {
|
377
423
|
// TODO: add raw_string as well?
|
378
424
|
if (!NIL_P(S->last) && RBASIC(S->last)->klass == cText) {
|
@@ -420,8 +466,8 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
420
466
|
|
421
467
|
if (!rb_block_given_p())
|
422
468
|
{
|
423
|
-
S = ALLOC(hpricot_state);
|
424
469
|
hpricot_ele *he = ALLOC(hpricot_ele);
|
470
|
+
S = ALLOC(hpricot_state);
|
425
471
|
MEMZERO(he, hpricot_ele, 1);
|
426
472
|
he->tag = he->attr = he->etag = he->parent = he->children = Qnil;
|
427
473
|
S->doc = Data_Wrap_Struct(cDoc, hpricot_ele_mark, hpricot_ele_free, he);
|
@@ -432,6 +478,7 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
432
478
|
S->strict = OPT(opts, xhtml_strict);
|
433
479
|
S->fixup = OPT(opts, fixup_tags);
|
434
480
|
if (S->strict) S->fixup = 1;
|
481
|
+
rb_ivar_set(S->doc, rb_intern("@options"), opts);
|
435
482
|
|
436
483
|
S->EC = rb_const_get(mHpricot, s_ElementContent);
|
437
484
|
}
|
@@ -463,7 +510,7 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
463
510
|
mark_aval_diff = mark_aval - buf;
|
464
511
|
|
465
512
|
buffer_size += BUFSIZE;
|
466
|
-
|
513
|
+
REALLOC_N(buf, char, buffer_size);
|
467
514
|
|
468
515
|
space = buffer_size - have;
|
469
516
|
|
@@ -477,16 +524,17 @@ VALUE hpricot_scan(int argc, VALUE *argv, VALUE self)
|
|
477
524
|
|
478
525
|
if ( rb_respond_to( port, s_read ) )
|
479
526
|
{
|
480
|
-
str = rb_funcall(
|
527
|
+
str = rb_funcall(port, s_read, 1, INT2FIX(space));
|
528
|
+
len = RSTRING_LEN(str);
|
529
|
+
memcpy(p, StringValuePtr(str), len);
|
481
530
|
}
|
482
531
|
else
|
483
532
|
{
|
484
|
-
|
533
|
+
len = RSTRING_LEN(port) - nread;
|
534
|
+
if (len > space) len = space;
|
535
|
+
memcpy(p, StringValuePtr(port) + nread, len);
|
485
536
|
}
|
486
537
|
|
487
|
-
StringValue(str);
|
488
|
-
memcpy( p, RSTRING_PTR(str), RSTRING_LEN(str) );
|
489
|
-
len = RSTRING_LEN(str);
|
490
538
|
nread += len;
|
491
539
|
|
492
540
|
/* If this is the last buffer, tack on an EOF. */
|
@@ -627,6 +675,8 @@ void Init_hpricot_scan()
|
|
627
675
|
rb_define_method(cProcIns, "content=", hpricot_ele_set_attr, 1);
|
628
676
|
|
629
677
|
s_ElementContent = rb_intern("ElementContent");
|
678
|
+
symAllow = ID2SYM(rb_intern("allow"));
|
679
|
+
symDeny = ID2SYM(rb_intern("deny"));
|
630
680
|
s_downcase = rb_intern("downcase");
|
631
681
|
s_new = rb_intern("new");
|
632
682
|
s_parent = rb_intern("parent");
|
@@ -643,6 +693,7 @@ void Init_hpricot_scan()
|
|
643
693
|
sym_cdata = ID2SYM(rb_intern("cdata"));
|
644
694
|
sym_text = ID2SYM(rb_intern("text"));
|
645
695
|
sym_EMPTY = ID2SYM(rb_intern("EMPTY"));
|
696
|
+
sym_CDATA = ID2SYM(rb_intern("CDATA"));
|
646
697
|
|
647
698
|
rb_const_set(mHpricot, rb_intern("ProcInsParse"),
|
648
699
|
reProcInsParse = rb_eval_string("/\\A<\\?(\\S+)\\s+(.+)/m"));
|
data/lib/hpricot/builder.rb
CHANGED
@@ -16,7 +16,7 @@ module Hpricot
|
|
16
16
|
assigns.each do |k, v|
|
17
17
|
ele.instance_variable_set("@#{k}", v)
|
18
18
|
end
|
19
|
-
ele.instance_eval
|
19
|
+
ele.instance_eval(&blk)
|
20
20
|
ele
|
21
21
|
end
|
22
22
|
|
@@ -38,14 +38,21 @@ module Hpricot
|
|
38
38
|
@@default[option] = value
|
39
39
|
end
|
40
40
|
|
41
|
+
def add_child ele
|
42
|
+
ele.parent = self
|
43
|
+
self.children ||= []
|
44
|
+
self.children << ele
|
45
|
+
ele
|
46
|
+
end
|
47
|
+
|
41
48
|
# Write a +string+ to the HTML stream, making sure to escape it.
|
42
49
|
def text!(string)
|
43
|
-
|
50
|
+
add_child Text.new(string.fast_xs)
|
44
51
|
end
|
45
52
|
|
46
53
|
# Write a +string+ to the HTML stream without escaping it.
|
47
54
|
def text(string)
|
48
|
-
|
55
|
+
add_child Text.new(string)
|
49
56
|
nil
|
50
57
|
end
|
51
58
|
alias_method :<<, :text
|
@@ -60,11 +67,11 @@ module Hpricot
|
|
60
67
|
raise InvalidXhtmlError, "no element `#{tag}' for #{tagset.doctype}"
|
61
68
|
elsif args.last.respond_to?(:to_hash)
|
62
69
|
attrs = args.last.to_hash
|
63
|
-
|
70
|
+
|
64
71
|
if @tagset.forms.include?(tag) and attrs[:id]
|
65
72
|
attrs[:name] ||= attrs[:id]
|
66
73
|
end
|
67
|
-
|
74
|
+
|
68
75
|
attrs.each do |k, v|
|
69
76
|
atname = k.to_s.downcase.intern
|
70
77
|
unless k =~ /:/ or @tagset.tagset[tag].include? atname
|
@@ -106,7 +113,7 @@ module Hpricot
|
|
106
113
|
build(f, &block)
|
107
114
|
end
|
108
115
|
|
109
|
-
|
116
|
+
add_child f
|
110
117
|
f
|
111
118
|
end
|
112
119
|
|
@@ -139,11 +146,11 @@ module Hpricot
|
|
139
146
|
end
|
140
147
|
|
141
148
|
def doctype(target, pub, sys)
|
142
|
-
|
149
|
+
add_child DocType.new(target, pub, sys)
|
143
150
|
end
|
144
151
|
|
145
152
|
remove_method :head
|
146
|
-
|
153
|
+
|
147
154
|
# Builds a head tag. Adds a <tt>meta</tt> tag inside with Content-Type
|
148
155
|
# set to <tt>text/html; charset=utf-8</tt>.
|
149
156
|
def head(*args, &block)
|
@@ -187,7 +194,7 @@ module Hpricot
|
|
187
194
|
def initialize(builder, sym)
|
188
195
|
@builder, @sym, @attrs = builder, sym, {}
|
189
196
|
end
|
190
|
-
|
197
|
+
|
191
198
|
# Adds attributes to an element. Bang methods set the :id attribute.
|
192
199
|
# Other methods add to the :class attribute.
|
193
200
|
def method_missing(id_or_class, *args, &block)
|
@@ -201,7 +208,7 @@ module Hpricot
|
|
201
208
|
args.push(@attrs)
|
202
209
|
return @builder.tag!(@sym, *args, &block)
|
203
210
|
end
|
204
|
-
|
211
|
+
|
205
212
|
return self
|
206
213
|
end
|
207
214
|
|
data/lib/hpricot/elements.rb
CHANGED
@@ -275,7 +275,7 @@ module Hpricot
|
|
275
275
|
expr = $'
|
276
276
|
m.compact!
|
277
277
|
if m[0] == '@'
|
278
|
-
m[0] = "@#{m.slice!(2,1)}"
|
278
|
+
m[0] = "@#{m.slice!(2,1).join}"
|
279
279
|
end
|
280
280
|
|
281
281
|
if m[0] == '[' && m[1] =~ /^\d+$/
|
@@ -300,10 +300,10 @@ module Hpricot
|
|
300
300
|
args = m[1..-1]
|
301
301
|
end
|
302
302
|
end
|
303
|
-
|
303
|
+
args << -1
|
304
304
|
nodes = Elements[*nodes.find_all do |x|
|
305
|
-
|
306
|
-
x.send(meth, *
|
305
|
+
args[-1] += 1
|
306
|
+
x.send(meth, *args) ? truth : !truth
|
307
307
|
end]
|
308
308
|
end
|
309
309
|
end
|
@@ -446,23 +446,23 @@ module Hpricot
|
|
446
446
|
parent.containers.length == 1
|
447
447
|
end
|
448
448
|
|
449
|
-
filter :parent do
|
449
|
+
filter :parent do |*a|
|
450
450
|
containers.length > 0
|
451
451
|
end
|
452
452
|
|
453
|
-
filter :empty do
|
453
|
+
filter :empty do |*a|
|
454
454
|
containers.length == 0
|
455
455
|
end
|
456
456
|
|
457
|
-
filter :root do
|
457
|
+
filter :root do |*a|
|
458
458
|
self.is_a? Hpricot::Doc
|
459
459
|
end
|
460
460
|
|
461
|
-
filter 'text' do
|
461
|
+
filter 'text' do |*a|
|
462
462
|
self.text?
|
463
463
|
end
|
464
464
|
|
465
|
-
filter 'comment' do
|
465
|
+
filter 'comment' do |*a|
|
466
466
|
self.comment?
|
467
467
|
end
|
468
468
|
|
@@ -495,7 +495,7 @@ module Hpricot
|
|
495
495
|
end
|
496
496
|
|
497
497
|
filter 'text()' do |val,i|
|
498
|
-
|
498
|
+
self.children.grep(Hpricot::Text).detect { |x| x.content =~ /\S/ } if self.children
|
499
499
|
end
|
500
500
|
|
501
501
|
filter '@' do |attr,val,i|
|
data/lib/hpricot/htmlinfo.rb
CHANGED
@@ -473,9 +473,23 @@ module Hpricot
|
|
473
473
|
"menu", "noframes", "noscript", "object", "ol", "p", "pre", "q", "s",
|
474
474
|
"samp", "script", "select", "small", "span", "strike", "strong", "sub",
|
475
475
|
"sup", "table", "textarea", "tt", "u", "ul", "var"]}
|
476
|
+
ElementContent.keys.each do |k|
|
477
|
+
v = ElementContent[k]
|
478
|
+
if v.is_a? Array
|
479
|
+
ElementContent[k] = v.inject({}) do |h, name|
|
480
|
+
h[name.hash] = true
|
481
|
+
h
|
482
|
+
end
|
483
|
+
end
|
484
|
+
end
|
476
485
|
|
477
486
|
ElementInclusions =
|
478
487
|
{"head"=>["link", "meta", "object", "script", "style"], "body"=>["del", "ins"]}
|
488
|
+
ElementInclusions.each do |k, v|
|
489
|
+
v.each do |name|
|
490
|
+
ElementContent[k][name.hash] = :allow
|
491
|
+
end
|
492
|
+
end
|
479
493
|
|
480
494
|
ElementExclusions =
|
481
495
|
{"button"=>
|
@@ -496,6 +510,11 @@ module Hpricot
|
|
496
510
|
"h1", "h2", "h3", "h4", "h5", "h6", "hr", "isindex", "menu", "noframes",
|
497
511
|
"noscript", "ol", "p", "pre", "table", "ul"],
|
498
512
|
"label"=>["label"]}
|
513
|
+
ElementExclusions.each do |k, v|
|
514
|
+
v.each do |name|
|
515
|
+
ElementContent[k][name.hash] = :deny
|
516
|
+
end
|
517
|
+
end
|
499
518
|
|
500
519
|
OmittedAttrName =
|
501
520
|
{"h6"=>
|
data/lib/hpricot/parse.rb
CHANGED
data/lib/hpricot/tag.rb
CHANGED
@@ -12,6 +12,9 @@ module Hpricot
|
|
12
12
|
Hpricot.make(input, @options, &blk).children
|
13
13
|
end
|
14
14
|
def altered!; end
|
15
|
+
def inspect_tree
|
16
|
+
children.map { |x| x.inspect_tree }.join if children
|
17
|
+
end
|
15
18
|
end
|
16
19
|
|
17
20
|
class BaseEle
|
@@ -29,6 +32,9 @@ module Hpricot
|
|
29
32
|
def altered!
|
30
33
|
clear_raw
|
31
34
|
end
|
35
|
+
def inspect_tree(depth = 0)
|
36
|
+
%{#{" " * depth}} + self.class.name.split(/::/).last.downcase + "\n"
|
37
|
+
end
|
32
38
|
end
|
33
39
|
|
34
40
|
class Elem
|
@@ -43,6 +49,8 @@ module Hpricot
|
|
43
49
|
hsh[k] = Hpricot.uxs(v)
|
44
50
|
hsh
|
45
51
|
end
|
52
|
+
else
|
53
|
+
{}
|
46
54
|
end
|
47
55
|
end
|
48
56
|
def to_plain_text
|
@@ -87,6 +95,10 @@ module Hpricot
|
|
87
95
|
end.join
|
88
96
|
end
|
89
97
|
end
|
98
|
+
def inspect_tree(depth = 0)
|
99
|
+
%{#{" " * depth}} + name + "\n" +
|
100
|
+
(children ? children.map { |x| x.inspect_tree(depth + 1) }.join : "")
|
101
|
+
end
|
90
102
|
end
|
91
103
|
|
92
104
|
class ETag
|
@@ -115,7 +127,7 @@ module Hpricot
|
|
115
127
|
def output(out, opts = {})
|
116
128
|
out <<
|
117
129
|
if_output(opts) do
|
118
|
-
content
|
130
|
+
content.to_s
|
119
131
|
end
|
120
132
|
end
|
121
133
|
end
|
data/test/files/boingboing.html
CHANGED
data/test/test_builder.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
#!/usr/bin/env ruby
|
2
3
|
|
3
4
|
require 'test/unit'
|
@@ -17,9 +18,9 @@ class TestBuilder < Test::Unit::TestCase
|
|
17
18
|
end
|
18
19
|
|
19
20
|
def test_latin1_entities
|
20
|
-
doc = Hpricot() { b "
|
21
|
+
doc = Hpricot() { b "ۥ" }
|
21
22
|
assert_equal "<b>ۥ</b>", doc.to_html
|
22
|
-
assert_equal "
|
23
|
+
assert_equal "ۥ", doc.at("text()").to_s
|
23
24
|
end
|
24
25
|
|
25
26
|
def test_escaping_attrs
|
@@ -29,8 +30,7 @@ class TestBuilder < Test::Unit::TestCase
|
|
29
30
|
end
|
30
31
|
|
31
32
|
def test_korean_utf8_entities
|
32
|
-
|
33
|
-
a = "\xed\x95\x9c\xea\xb8\x80"
|
33
|
+
a = '한글'
|
34
34
|
doc = Hpricot() { b a }
|
35
35
|
assert_equal "<b>한글</b>", doc.to_html
|
36
36
|
end
|
data/test/test_parser.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
#!/usr/bin/env ruby
|
2
3
|
|
3
4
|
require 'test/unit'
|
@@ -121,7 +122,7 @@ class TestParser < Test::Unit::TestCase
|
|
121
122
|
assert_equal 60, @boingboing.search("h3").length
|
122
123
|
assert_equal 59, @boingboing.search("h3[text()!='College kids reportedly taking more smart drugs']").length
|
123
124
|
assert_equal 17, @boingboing.search("h3[text()$='s']").length
|
124
|
-
assert_equal
|
125
|
+
assert_equal 116, @boingboing.search("p[text()]").length
|
125
126
|
assert_equal 211, @boingboing.search("p").length
|
126
127
|
end
|
127
128
|
|
@@ -161,10 +162,10 @@ class TestParser < Test::Unit::TestCase
|
|
161
162
|
assert_equal 60, @boingboing.search("/*/body//p[@class='posted']").length
|
162
163
|
assert_equal 18, @boingboing.search("//script").length
|
163
164
|
divs = @boingboing.search("//script/../div")
|
164
|
-
assert_equal
|
165
|
+
assert_equal 2, divs.length
|
165
166
|
imgs = @boingboing.search('//div/p/a/img')
|
166
|
-
assert_equal
|
167
|
-
assert_equal
|
167
|
+
assert_equal 16, imgs.length
|
168
|
+
assert_equal 16, @boingboing.search('//div').search('p/a/img').length
|
168
169
|
assert imgs.all? { |x| x.name == 'img' }
|
169
170
|
end
|
170
171
|
|
@@ -172,10 +173,10 @@ class TestParser < Test::Unit::TestCase
|
|
172
173
|
@boingboing = Hpricot.parse(TestFiles::BOINGBOING)
|
173
174
|
assert_equal 2, @boingboing.search('//link[@rel="alternate"]').length
|
174
175
|
p_imgs = @boingboing.search('//div/p[/a/img]')
|
175
|
-
assert_equal
|
176
|
+
assert_equal 16, p_imgs.length
|
176
177
|
assert p_imgs.all? { |x| x.name == 'p' }
|
177
178
|
p_imgs = @boingboing.search('//div/p[a/img]')
|
178
|
-
assert_equal
|
179
|
+
assert_equal 16, p_imgs.length
|
179
180
|
assert p_imgs.all? { |x| x.name == 'p' }
|
180
181
|
assert_equal 1, @boingboing.search('//input[@checked]').length
|
181
182
|
end
|
@@ -218,7 +219,7 @@ class TestParser < Test::Unit::TestCase
|
|
218
219
|
def test_many_paths
|
219
220
|
@boingboing = Hpricot.parse(TestFiles::BOINGBOING)
|
220
221
|
assert_equal 62, @boingboing.search('p.posted, link[@rel="alternate"]').length
|
221
|
-
assert_equal
|
222
|
+
assert_equal 18, @boingboing.search('//div/p[a/img]|//link[@rel="alternate"]').length
|
222
223
|
end
|
223
224
|
|
224
225
|
def test_stacked_search
|
@@ -391,10 +392,18 @@ class TestParser < Test::Unit::TestCase
|
|
391
392
|
end
|
392
393
|
|
393
394
|
def test_uxs_handles_numeric_values
|
394
|
-
|
395
|
+
if String.method_defined? :encoding
|
396
|
+
assert_equal "é", Hpricot.uxs('é')
|
397
|
+
else
|
398
|
+
assert_equal "\303\251", Hpricot.uxs('é')
|
399
|
+
end
|
395
400
|
end
|
396
401
|
|
397
402
|
def test_uxs_handles_entities
|
398
|
-
|
403
|
+
if String.method_defined? :encoding
|
404
|
+
assert_equal "é", Hpricot.uxs('é')
|
405
|
+
else
|
406
|
+
assert_equal "\303\251", Hpricot.uxs('é')
|
407
|
+
end
|
399
408
|
end
|
400
409
|
end
|
data/test/test_preserved.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
#!/usr/bin/env ruby
|
2
3
|
|
3
4
|
require 'test/unit'
|
@@ -9,7 +10,10 @@ class TestPreserved < Test::Unit::TestCase
|
|
9
10
|
doc = Hpricot(str)
|
10
11
|
yield doc if block_given?
|
11
12
|
str2 = doc.to_original_html
|
12
|
-
|
13
|
+
if RUBY_VERSION =~ /^1.9/
|
14
|
+
str2.force_encoding('UTF-8')
|
15
|
+
end
|
16
|
+
str.lines.zip(str2.lines).each do |s1, s2|
|
13
17
|
assert_equal s1, s2
|
14
18
|
end
|
15
19
|
end
|
@@ -40,7 +44,7 @@ class TestPreserved < Test::Unit::TestCase
|
|
40
44
|
|
41
45
|
def test_escaping_of_contents
|
42
46
|
doc = Hpricot(TestFiles::BOINGBOING)
|
43
|
-
assert_equal "Fukuda
|
47
|
+
assert_equal "Fukuda’s Automatic Door opens around your body as you pass through it. The idea is to save energy and keep the room clean.", doc.at("img[@alt='200606131240']").next.to_s.strip
|
44
48
|
end
|
45
49
|
|
46
50
|
def test_files
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: adamh-hpricot
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.171
|
4
|
+
version: 0.6.210
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- why the lucky stiff
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2009-02-07 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|