hpricot 0.5-mswin32 → 0.6-mswin32
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +13 -1
- data/README +4 -1
- data/Rakefile +111 -75
- data/ext/hpricot_scan/HpricotScanService.java +1340 -0
- data/ext/hpricot_scan/hpricot_common.rl +76 -0
- data/ext/hpricot_scan/hpricot_scan.c +2435 -2181
- data/ext/hpricot_scan/hpricot_scan.java.rl +363 -0
- data/ext/hpricot_scan/hpricot_scan.rl +3 -70
- data/lib/hpricot.rb +1 -0
- data/lib/hpricot/blankslate.rb +63 -0
- data/lib/hpricot/builder.rb +200 -0
- data/lib/hpricot/elements.rb +143 -32
- data/lib/hpricot/inspect.rb +7 -3
- data/lib/hpricot/parse.rb +128 -101
- data/lib/hpricot/tag.rb +23 -15
- data/lib/hpricot/tags.rb +164 -0
- data/lib/hpricot/traverse.rb +93 -33
- data/lib/hpricot/xchar.rb +94 -0
- data/lib/i686-linux/hpricot_scan.so +0 -0
- data/test/files/pace_application.html +1320 -0
- data/test/files/tenderlove.html +16 -0
- data/test/files/why.xml +19 -0
- data/test/load_files.rb +1 -1
- data/test/test_alter.rb +65 -0
- data/test/test_builder.rb +24 -0
- data/test/test_parser.rb +92 -4
- data/test/test_preserved.rb +20 -0
- data/test/test_xml.rb +13 -0
- metadata +34 -14
- data/lib/hpricot/text.rb +0 -115
- data/lib/hpricot_scan.so +0 -0
@@ -0,0 +1,16 @@
|
|
1
|
+
<html>
|
2
|
+
<HEAD>
|
3
|
+
<meta http-equiv="Refresh" content="0; url=http://tenderlovemaking.com">
|
4
|
+
<META http-equiv="Refresh" content="0; url=http://tenderlovemaking.com">
|
5
|
+
</head>
|
6
|
+
<body>
|
7
|
+
<a href ="http://tenderlovemaking.com/">My Site!</a>
|
8
|
+
<A href ="http://whytheluckystiff.net/">Your Site!</A>
|
9
|
+
<MAP>
|
10
|
+
<area HREF="http://whytheluckystiff.net/" COORDS="1,2,3,4"></area>
|
11
|
+
<AREA HREF="http://tenderlovemaking.com/" COORDS="1,2,3,4">
|
12
|
+
</area>
|
13
|
+
<AREA HREF="http://tenderlovemaking.com/" COORDS="5,5,10,10" />
|
14
|
+
</MAP>
|
15
|
+
</body>
|
16
|
+
</html>
|
data/test/files/why.xml
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
<?xml version='1.0'?><rss xmlns:admin='http://webns.net/mvcb/' version='2.0' xmlns:sy='http://purl.org/rss/1.0/modules/syndication/' xmlns:dc='http://purl.org/dc/elements/1.1/' xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'>
|
2
|
+
<channel>
|
3
|
+
<title>why the lucky stiff</title>
|
4
|
+
<link>http://whytheluckystiff.net</link>
|
5
|
+
<description>hex-editing reality to give us infinite grenades!!</description>
|
6
|
+
<dc:language>en-us</dc:language>
|
7
|
+
<dc:creator/>
|
8
|
+
<dc:date>2007-01-16T22:39:04+00:00</dc:date>
|
9
|
+
<admin:generatorAgent rdf:resource='http://hobix.com/?v=0.4'/>
|
10
|
+
<sy:updatePeriod>hourly</sy:updatePeriod>
|
11
|
+
<sy:updateFrequency>1</sy:updateFrequency>
|
12
|
+
<sy:updateBase>2000-01-01T12:00+00:00</sy:updateBase>
|
13
|
+
<item><title>1.3</title><link>http://whytheluckystiff.net/quatrains/1.3.html</link><guid isPermaLink='false'>quatrains/1.3@http://whytheluckystiff.net</guid><dc:subject>quatrains</dc:subject><dc:subject>quatrains</dc:subject><dc:creator>why the lucky stiff</dc:creator><dc:date>2007-01-14T08:47:05+00:00</dc:date><description><blockquote>
|
14
|
+
<p>That cadillac of yours and that driver of yours!<br />You and your teacups rattling away in the back seat!<br />You always took the mike, oh, and all those cowboys you shot!<br />I held your hand! And I&#8217;ll shoot a cowboy one day!</p>
|
15
|
+
</blockquote>
|
16
|
+
<blockquote>
|
17
|
+
<p>You said, &#8220;Let&#8217;s run into the woods like kids!&#8221; <br />You said, &#8220;Let&#8217;s rub our hands together super-hot!&#8221; <br />And we scalded the trees and left octagons, I think that was you and<br />You threw parties on the roof!</p>
|
18
|
+
</blockquote></description></item></channel>
|
19
|
+
</rss>
|
data/test/load_files.rb
CHANGED
data/test/test_alter.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'hpricot'
|
5
|
+
require 'load_files'
|
6
|
+
|
7
|
+
class TestAlter < Test::Unit::TestCase
|
8
|
+
def setup
|
9
|
+
@basic = Hpricot.parse(TestFiles::BASIC)
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_before
|
13
|
+
test0 = "<link rel='stylesheet' href='test0.css' />"
|
14
|
+
@basic.at("link").before(test0)
|
15
|
+
assert_equal 'test0.css', @basic.at("link").attributes['href']
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_after
|
19
|
+
test_inf = "<link rel='stylesheet' href='test_inf.css' />"
|
20
|
+
@basic.search("link")[-1].after(test_inf)
|
21
|
+
assert_equal 'test_inf.css', @basic.search("link")[-1].attributes['href']
|
22
|
+
end
|
23
|
+
|
24
|
+
def test_wrap
|
25
|
+
ohmy = (@basic/"p.ohmy").wrap("<div id='wrapper'></div>")
|
26
|
+
assert_equal 'wrapper', ohmy[0].parent['id']
|
27
|
+
assert_equal 'ohmy', Hpricot(@basic.to_html).at("#wrapper").children[0]['class']
|
28
|
+
end
|
29
|
+
|
30
|
+
def test_add_class
|
31
|
+
first_p = (@basic/"p:first").add_class("testing123")
|
32
|
+
assert first_p[0].get_attribute("class").split(" ").include?("testing123")
|
33
|
+
assert (Hpricot(@basic.to_html)/"p:first")[0].attributes["class"].split(" ").include?("testing123")
|
34
|
+
assert !(Hpricot(@basic.to_html)/"p:gt(0)")[0].attributes["class"].split(" ").include?("testing123")
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_change_attributes
|
38
|
+
all_ps = (@basic/"p").attr("title", "Some Title")
|
39
|
+
all_as = (@basic/"a").attr("href", "http://my_new_href.com")
|
40
|
+
all_lb = (@basic/"link").attr("href") { |e| e.name }
|
41
|
+
assert_changed(@basic, "p", all_ps) {|p| p.attributes["title"] == "Some Title"}
|
42
|
+
assert_changed(@basic, "a", all_as) {|a| a.attributes["href"] == "http://my_new_href.com"}
|
43
|
+
assert_changed(@basic, "link", all_lb) {|a| a.attributes["href"] == "link" }
|
44
|
+
end
|
45
|
+
|
46
|
+
def test_remove_attr
|
47
|
+
all_rl = (@basic/"link").remove_attr("href")
|
48
|
+
assert_changed(@basic, "link", all_rl) { |link| link['href'].nil? }
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_remove_class
|
52
|
+
all_c1 = (@basic/"p[@class*='last']").remove_class("last")
|
53
|
+
assert_changed(@basic, "p[@class*='last']", all_c1) { |p| p['class'] == 'final' }
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_remove_all_classes
|
57
|
+
all_c2 = (@basic/"p[@class]").remove_class
|
58
|
+
assert_changed(@basic, "p[@class]", all_c2) { |p| p['class'].nil? }
|
59
|
+
end
|
60
|
+
|
61
|
+
def assert_changed original, selector, set, &block
|
62
|
+
assert set.all?(&block)
|
63
|
+
assert Hpricot(original.to_html).search(selector).all?(&block)
|
64
|
+
end
|
65
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'hpricot'
|
5
|
+
|
6
|
+
class TestBuilder < Test::Unit::TestCase
|
7
|
+
def test_escaping_text
|
8
|
+
doc = Hpricot() { b "<a\"b>" }
|
9
|
+
assert_equal "<b>&lt;a&quot;b&gt;</b>", doc.to_html
|
10
|
+
assert_equal %{<a"b>}, doc.at("text()").to_s
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_no_escaping_text
|
14
|
+
doc = Hpricot() { div.test.me! { text "<a\"b>" } }
|
15
|
+
assert_equal %{<div class="test" id="me"><a"b></div>}, doc.to_html
|
16
|
+
assert_equal %{<a"b>}, doc.at("text()").to_s
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_latin1_entities
|
20
|
+
doc = Hpricot() { b "\200\225" }
|
21
|
+
assert_equal "<b>&#8364;&#8226;</b>", doc.to_html
|
22
|
+
assert_equal "\342\202\254\342\200\242", doc.at("text()").to_s
|
23
|
+
end
|
24
|
+
end
|
data/test/test_parser.rb
CHANGED
@@ -23,6 +23,19 @@ class TestParser < Test::Unit::TestCase
|
|
23
23
|
assert_equal 'FOO', Hpricot.make("FOO").first.content
|
24
24
|
end
|
25
25
|
|
26
|
+
def test_filter_by_attr
|
27
|
+
@boingboing = Hpricot.parse(TestFiles::BOINGBOING)
|
28
|
+
|
29
|
+
# this link is escaped in the doc
|
30
|
+
link = 'http://www.youtube.com/watch?v=TvSNXyNw26g&search=chris%20ware'
|
31
|
+
assert_equal link, @boingboing.at("a[@href='#{link}']")['href']
|
32
|
+
end
|
33
|
+
|
34
|
+
def test_filter_contains
|
35
|
+
@basic = Hpricot.parse(TestFiles::BASIC)
|
36
|
+
assert_equal '<title>Sample XHTML</title>', @basic.search("title:contains('Sample')").to_s
|
37
|
+
end
|
38
|
+
|
26
39
|
def test_get_element_by_id
|
27
40
|
@basic = Hpricot.parse(TestFiles::BASIC)
|
28
41
|
assert_equal 'link1', @basic.get_element_by_id('link1')['id']
|
@@ -84,6 +97,12 @@ class TestParser < Test::Unit::TestCase
|
|
84
97
|
assert_equal "<p>one</p>", h.search("//div/p:first()").to_s
|
85
98
|
end
|
86
99
|
|
100
|
+
def test_pace
|
101
|
+
doc = Hpricot(TestFiles::PACE_APPLICATION)
|
102
|
+
assert_equal 'get', doc.at('form[@name=frmSect11]')['method']
|
103
|
+
# assert_equal '2', doc.at('#hdnSpouse')['value']
|
104
|
+
end
|
105
|
+
|
87
106
|
def test_scan_boingboing
|
88
107
|
@boingboing = Hpricot.parse(TestFiles::BOINGBOING)
|
89
108
|
assert_equal 60, (@boingboing/'p.posted').length
|
@@ -95,7 +114,7 @@ class TestParser < Test::Unit::TestCase
|
|
95
114
|
assert_equal 60, @boingboing.search("h3").length
|
96
115
|
assert_equal 59, @boingboing.search("h3[text()!='College kids reportedly taking more smart drugs']").length
|
97
116
|
assert_equal 17, @boingboing.search("h3[text()$='s']").length
|
98
|
-
assert_equal
|
117
|
+
assert_equal 129, @boingboing.search("p[text()]").length
|
99
118
|
assert_equal 211, @boingboing.search("p").length
|
100
119
|
end
|
101
120
|
|
@@ -135,8 +154,7 @@ class TestParser < Test::Unit::TestCase
|
|
135
154
|
assert_equal 60, @boingboing.search("/*/body//p[@class='posted']").length
|
136
155
|
assert_equal 18, @boingboing.search("//script").length
|
137
156
|
divs = @boingboing.search("//script/../div")
|
138
|
-
assert_equal
|
139
|
-
assert_equal 1, divs.search('a').length
|
157
|
+
assert_equal 1, divs.length
|
140
158
|
imgs = @boingboing.search('//div/p/a/img')
|
141
159
|
assert_equal 15, imgs.length
|
142
160
|
assert_equal 17, @boingboing.search('//div').search('p/a/img').length
|
@@ -155,9 +173,16 @@ class TestParser < Test::Unit::TestCase
|
|
155
173
|
assert_equal 1, @boingboing.search('//input[@checked]').length
|
156
174
|
end
|
157
175
|
|
176
|
+
def test_tag_case
|
177
|
+
@tenderlove = Hpricot.parse(TestFiles::TENDERLOVE)
|
178
|
+
assert_equal 2, @tenderlove.search('//a').length
|
179
|
+
assert_equal 3, @tenderlove.search('//area').length
|
180
|
+
assert_equal 2, @tenderlove.search('//meta').length
|
181
|
+
end
|
182
|
+
|
158
183
|
def test_alt_predicates
|
159
184
|
@boingboing = Hpricot.parse(TestFiles::BOINGBOING)
|
160
|
-
assert_equal
|
185
|
+
assert_equal 1, @boingboing.search('//table/tr:last').length
|
161
186
|
|
162
187
|
@basic = Hpricot.parse(TestFiles::BASIC)
|
163
188
|
assert_equal "<p>The third paragraph</p>",
|
@@ -167,6 +192,22 @@ class TestParser < Test::Unit::TestCase
|
|
167
192
|
assert_equal 'last final', @basic.search('//p:last-of-type').first.get_attribute('class')
|
168
193
|
end
|
169
194
|
|
195
|
+
def test_insert_after # ticket #63
|
196
|
+
doc = Hpricot('<html><body><div id="a-div"></div></body></html>')
|
197
|
+
(doc/'div').each do |element|
|
198
|
+
element.after('<p>Paragraph 1</p><p>Paragraph 2</p>')
|
199
|
+
end
|
200
|
+
assert_equal doc.to_html, '<html><body><div id="a-div"></div><p>Paragraph 1</p><p>Paragraph 2</p></body></html>'
|
201
|
+
end
|
202
|
+
|
203
|
+
def test_insert_before # ticket #61
|
204
|
+
doc = Hpricot('<html><body><div id="a-div"></div></body></html>')
|
205
|
+
(doc/'div').each do |element|
|
206
|
+
element.before('<p>Paragraph 1</p><p>Paragraph 2</p>')
|
207
|
+
end
|
208
|
+
assert_equal doc.to_html, '<html><body><p>Paragraph 1</p><p>Paragraph 2</p><div id="a-div"></div></body></html>'
|
209
|
+
end
|
210
|
+
|
170
211
|
def test_many_paths
|
171
212
|
@boingboing = Hpricot.parse(TestFiles::BOINGBOING)
|
172
213
|
assert_equal 62, @boingboing.search('p.posted, link[@rel="alternate"]').length
|
@@ -264,6 +305,53 @@ class TestParser < Test::Unit::TestCase
|
|
264
305
|
end
|
265
306
|
end
|
266
307
|
|
308
|
+
def test_youtube_attr
|
309
|
+
str = <<-edoc
|
310
|
+
<html><body>
|
311
|
+
Lorem ipsum. Jolly roger, ding-dong sing-a-long
|
312
|
+
<object width="425" height="350">
|
313
|
+
<param name="movie" value="http://www.youtube.com/v/NbDQ4M_cuwA"></param>
|
314
|
+
<param name="wmode" value="transparent"></param>
|
315
|
+
<embed src="http://www.youtube.com/v/NbDQ4M_cuwA"
|
316
|
+
type="application/x-shockwave-flash" wmode="transparent" width="425" height="350">
|
317
|
+
</embed>
|
318
|
+
</object>
|
319
|
+
Check out my posting, I have bright mice in large clown cars.
|
320
|
+
<object width="425" height="350">
|
321
|
+
<param name="movie" value="http://www.youtube.com/v/foobar"></param>
|
322
|
+
<param name="wmode" value="transparent"></param>
|
323
|
+
<embed src="http://www.youtube.com/v/foobar"
|
324
|
+
type="application/x-shockwave-flash" wmode="transparent" width="425" height="350">
|
325
|
+
</embed>
|
326
|
+
</object>
|
327
|
+
</body></html?
|
328
|
+
edoc
|
329
|
+
doc = Hpricot(str)
|
330
|
+
assert_equal "http://www.youtube.com/v/NbDQ4M_cuwA",
|
331
|
+
doc.at("//object/param[@value='http://www.youtube.com/v/NbDQ4M_cuwA']")['value']
|
332
|
+
end
|
333
|
+
|
334
|
+
# ticket #84 by jamezilla
|
335
|
+
def test_screwed_xmlns
|
336
|
+
doc = Hpricot(<<-edoc)
|
337
|
+
<?xml:namespace prefix = cwi />
|
338
|
+
<html><body>HAI</body></html>
|
339
|
+
edoc
|
340
|
+
assert_equal "HAI", doc.at("body").inner_text
|
341
|
+
end
|
342
|
+
|
343
|
+
# Reported by Jonathan Nichols on the Hpricot list (24 May 2007)
|
344
|
+
def test_self_closed_form
|
345
|
+
doc = Hpricot(<<-edoc)
|
346
|
+
<body>
|
347
|
+
<form action="/loginRegForm" name="regForm" method="POST" />
|
348
|
+
<input type="button">
|
349
|
+
</form>
|
350
|
+
</body>
|
351
|
+
edoc
|
352
|
+
assert_equal "button", doc.at("//form/input")['type']
|
353
|
+
end
|
354
|
+
|
267
355
|
def test_filters
|
268
356
|
@basic = Hpricot.parse(TestFiles::BASIC)
|
269
357
|
assert_equal 0, (@basic/"title:parent").size
|
data/test/test_preserved.rb
CHANGED
@@ -38,9 +38,29 @@ class TestPreserved < Test::Unit::TestCase
|
|
38
38
|
end
|
39
39
|
end
|
40
40
|
|
41
|
+
def test_escaping_of_contents
|
42
|
+
doc = Hpricot(TestFiles::BOINGBOING)
|
43
|
+
assert_equal "Fukuda\342\200\231s Automatic Door opens around your body as you pass through it. The idea is to save energy and keep the room clean.", doc.at("img[@alt='200606131240']").next.to_s.strip
|
44
|
+
end
|
45
|
+
|
41
46
|
def test_files
|
42
47
|
assert_roundtrip TestFiles::BASIC
|
43
48
|
assert_roundtrip TestFiles::BOINGBOING
|
44
49
|
assert_roundtrip TestFiles::CY0
|
45
50
|
end
|
51
|
+
|
52
|
+
def test_escaping_of_attrs
|
53
|
+
# ampersands in URLs
|
54
|
+
str = %{<a href="http://google.com/search?q=hpricot&amp;l=en">Google</a>}
|
55
|
+
link = (doc = Hpricot(str)).at(:a)
|
56
|
+
assert_equal "http://google.com/search?q=hpricot&l=en", link['href']
|
57
|
+
assert_equal "http://google.com/search?q=hpricot&l=en", link.attributes['href']
|
58
|
+
assert_equal "http://google.com/search?q=hpricot&l=en", link.get_attribute('href')
|
59
|
+
assert_equal "http://google.com/search?q=hpricot&amp;l=en", link.raw_attributes['href']
|
60
|
+
assert_equal str, doc.to_html
|
61
|
+
|
62
|
+
# alter the url
|
63
|
+
link['href'] = "javascript:alert(\"AGGA-KA-BOO!\")"
|
64
|
+
assert_equal %{<a href="javascript:alert(&quot;AGGA-KA-BOO!&quot;)">Google</a>}, doc.to_html
|
65
|
+
end
|
46
66
|
end
|
data/test/test_xml.rb
CHANGED
@@ -12,4 +12,17 @@ class TestParser < Test::Unit::TestCase
|
|
12
12
|
assert_equal "this is title", (doc/:rss/:channel/:title).text
|
13
13
|
assert_equal "http://fake.com", (doc/:rss/:channel/:link).text
|
14
14
|
end
|
15
|
+
|
16
|
+
# make sure XML doesn't get downcased
|
17
|
+
def test_casing
|
18
|
+
doc = Hpricot::XML(TestFiles::WHY)
|
19
|
+
assert_equal "hourly", (doc.at "sy:updatePeriod").inner_html
|
20
|
+
assert_equal 1, (doc/"guid[@isPermaLink]").length
|
21
|
+
end
|
22
|
+
|
23
|
+
# be sure tags named "text" are ok
|
24
|
+
def test_text_tags
|
25
|
+
doc = Hpricot::XML("<feed><title>City Poisoned</title><text>Rita Lee has poisoned Brazil.</text></feed>")
|
26
|
+
assert_equal "City Poisoned", (doc/"title").text
|
27
|
+
end
|
15
28
|
end
|
metadata
CHANGED
@@ -3,10 +3,11 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: hpricot
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: "0.
|
7
|
-
date: 2007-
|
6
|
+
version: "0.6"
|
7
|
+
date: 2007-06-15 00:00:00 -07:00
|
8
8
|
summary: a swift, liberal HTML parser with a fantastic library
|
9
9
|
require_paths:
|
10
|
+
- lib/i686-linux
|
10
11
|
- lib
|
11
12
|
email: why@ruby-lang.org
|
12
13
|
homepage: http://code.whytheluckystiff.net/hpricot/
|
@@ -15,7 +16,7 @@ description: a swift, liberal HTML parser with a fantastic library
|
|
15
16
|
autorequire:
|
16
17
|
default_executable:
|
17
18
|
bindir: bin
|
18
|
-
has_rdoc:
|
19
|
+
has_rdoc: true
|
19
20
|
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
21
|
requirements:
|
21
22
|
- - ">"
|
@@ -29,41 +30,60 @@ post_install_message:
|
|
29
30
|
authors:
|
30
31
|
- why the lucky stiff
|
31
32
|
files:
|
33
|
+
- CHANGELOG
|
34
|
+
- COPYING
|
35
|
+
- README
|
36
|
+
- Rakefile
|
37
|
+
- test/files
|
32
38
|
- test/test_preserved.rb
|
33
39
|
- test/test_paths.rb
|
34
40
|
- test/load_files.rb
|
35
41
|
- test/test_xml.rb
|
36
42
|
- test/test_parser.rb
|
43
|
+
- test/test_alter.rb
|
44
|
+
- test/test_builder.rb
|
45
|
+
- test/files/why.xml
|
37
46
|
- test/files/boingboing.html
|
38
47
|
- test/files/uswebgen.html
|
39
48
|
- test/files/immob.html
|
40
49
|
- test/files/week9.html
|
41
50
|
- test/files/utf8.html
|
42
|
-
- test/files/cy0.html
|
43
51
|
- test/files/basic.xhtml
|
52
|
+
- test/files/cy0.html
|
53
|
+
- test/files/tenderlove.html
|
54
|
+
- test/files/pace_application.html
|
55
|
+
- lib/hpricot
|
44
56
|
- lib/hpricot.rb
|
57
|
+
- lib/i686-linux
|
58
|
+
- lib/hpricot/builder.rb
|
45
59
|
- lib/hpricot/htmlinfo.rb
|
46
|
-
- lib/hpricot/
|
60
|
+
- lib/hpricot/xchar.rb
|
47
61
|
- lib/hpricot/inspect.rb
|
48
62
|
- lib/hpricot/modules.rb
|
49
63
|
- lib/hpricot/parse.rb
|
50
64
|
- lib/hpricot/tag.rb
|
51
65
|
- lib/hpricot/traverse.rb
|
52
66
|
- lib/hpricot/elements.rb
|
67
|
+
- lib/hpricot/tags.rb
|
68
|
+
- lib/hpricot/blankslate.rb
|
69
|
+
- extras/mingw-rbconfig.rb
|
70
|
+
- ext/hpricot_scan/hpricot_scan.h
|
71
|
+
- ext/hpricot_scan/HpricotScanService.java
|
53
72
|
- ext/hpricot_scan/hpricot_scan.c
|
54
73
|
- ext/hpricot_scan/extconf.rb
|
55
|
-
- ext/hpricot_scan/
|
74
|
+
- ext/hpricot_scan/hpricot_common.rl
|
56
75
|
- ext/hpricot_scan/hpricot_scan.rl
|
57
|
-
-
|
58
|
-
-
|
59
|
-
- Rakefile
|
60
|
-
- COPYING
|
61
|
-
- extras/mingw-rbconfig.rb
|
62
|
-
- lib/hpricot_scan.so
|
76
|
+
- ext/hpricot_scan/hpricot_scan.java.rl
|
77
|
+
- lib/i686-linux/hpricot_scan.so
|
63
78
|
test_files: []
|
64
79
|
|
65
|
-
rdoc_options:
|
66
|
-
|
80
|
+
rdoc_options:
|
81
|
+
- --quiet
|
82
|
+
- --title
|
83
|
+
- The Hpricot Reference
|
84
|
+
- --main
|
85
|
+
- README
|
86
|
+
- --inline-source
|
67
87
|
extra_rdoc_files:
|
68
88
|
- README
|
69
89
|
- CHANGELOG
|
data/lib/hpricot/text.rb
DELETED
@@ -1,115 +0,0 @@
|
|
1
|
-
require 'hpricot/modules'
|
2
|
-
require 'hpricot/raw_string'
|
3
|
-
require 'hpricot/htmlinfo'
|
4
|
-
require 'hpricot/encoder'
|
5
|
-
require 'hpricot/fstr'
|
6
|
-
require 'iconv'
|
7
|
-
|
8
|
-
module Hpricot
|
9
|
-
class Text
|
10
|
-
# :stopdoc:
|
11
|
-
class << self
|
12
|
-
alias new_internal new
|
13
|
-
end
|
14
|
-
# :startdoc:
|
15
|
-
|
16
|
-
def Text.new(arg)
|
17
|
-
arg = arg.to_node if Hpricot::Location === arg
|
18
|
-
if Text === arg
|
19
|
-
new_internal arg.rcdata, arg.normalized_rcdata
|
20
|
-
elsif String === arg
|
21
|
-
arg2 = arg.gsub(/&/, '&amp;')
|
22
|
-
arg = arg2.freeze if arg != arg2
|
23
|
-
new_internal arg
|
24
|
-
else
|
25
|
-
raise TypeError, "cannot initialize Text with #{arg.inspect}"
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
def initialize(rcdata, normalized_rcdata=internal_normalize(rcdata)) # :notnew:
|
30
|
-
init_raw_string
|
31
|
-
@rcdata = rcdata && Hpricot.frozen_string(rcdata)
|
32
|
-
@normalized_rcdata = @rcdata == normalized_rcdata ? @rcdata : normalized_rcdata
|
33
|
-
end
|
34
|
-
attr_reader :rcdata, :normalized_rcdata
|
35
|
-
|
36
|
-
def internal_normalize(rcdata)
|
37
|
-
# - character references are decoded as much as possible.
|
38
|
-
# - undecodable character references are converted to decimal numeric character refereces.
|
39
|
-
result = rcdata.gsub(/&(?:#([0-9]+)|#x([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));/o) {|s|
|
40
|
-
u = nil
|
41
|
-
if $1
|
42
|
-
u = $1.to_i
|
43
|
-
elsif $2
|
44
|
-
u = $2.hex
|
45
|
-
elsif $3
|
46
|
-
u = NamedCharacters[$3]
|
47
|
-
end
|
48
|
-
if !u || u < 0 || 0x7fffffff < u
|
49
|
-
'?'
|
50
|
-
elsif u == 38 # '&' character.
|
51
|
-
'&#38;'
|
52
|
-
elsif u <= 0x7f
|
53
|
-
[u].pack("C")
|
54
|
-
else
|
55
|
-
begin
|
56
|
-
Iconv.conv(Encoder.internal_charset, 'UTF-8', [u].pack("U"))
|
57
|
-
rescue Iconv::Failure
|
58
|
-
"&##{u};"
|
59
|
-
end
|
60
|
-
end
|
61
|
-
}
|
62
|
-
Hpricot.frozen_string(result)
|
63
|
-
end
|
64
|
-
private :internal_normalize
|
65
|
-
|
66
|
-
# Hpricot::Text#to_s converts the text to a string.
|
67
|
-
# - character references are decoded as much as possible.
|
68
|
-
# - undecodable character reference are converted to `?' character.
|
69
|
-
def to_s
|
70
|
-
@normalized_rcdata.gsub(/&(?:#([0-9]+));/o) {|s|
|
71
|
-
u = $1.to_i
|
72
|
-
if 0 <= u && u <= 0x7f
|
73
|
-
[u].pack("C")
|
74
|
-
else
|
75
|
-
'?'
|
76
|
-
end
|
77
|
-
}
|
78
|
-
end
|
79
|
-
|
80
|
-
def empty?
|
81
|
-
@normalized_rcdata.empty?
|
82
|
-
end
|
83
|
-
|
84
|
-
def strip
|
85
|
-
rcdata = @normalized_rcdata.dup
|
86
|
-
rcdata.sub!(/\A(?:\s|&nbsp;)+/, '')
|
87
|
-
rcdata.sub!(/(?:\s|&nbsp;)+\z/, '')
|
88
|
-
if rcdata == @normalized_rcdata
|
89
|
-
self
|
90
|
-
else
|
91
|
-
rcdata.freeze
|
92
|
-
Text.new_internal(rcdata, rcdata)
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
# Hpricot::Text.concat returns a text which is concatenation of arguments.
|
97
|
-
#
|
98
|
-
# An argument should be one of follows.
|
99
|
-
# - String
|
100
|
-
# - Hpricot::Text
|
101
|
-
# - Hpricot::Location which points Hpricot::Text
|
102
|
-
def Text.concat(*args)
|
103
|
-
rcdata = ''
|
104
|
-
args.each {|arg|
|
105
|
-
arg = arg.to_node if Hpricot::Location === arg
|
106
|
-
if Text === arg
|
107
|
-
rcdata << arg.rcdata
|
108
|
-
else
|
109
|
-
rcdata << arg.gsub(/&/, '&amp;')
|
110
|
-
end
|
111
|
-
}
|
112
|
-
new_internal rcdata
|
113
|
-
end
|
114
|
-
end
|
115
|
-
end
|