hpricot 0.5-mswin32 → 0.6-mswin32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +13 -1
- data/README +4 -1
- data/Rakefile +111 -75
- data/ext/hpricot_scan/HpricotScanService.java +1340 -0
- data/ext/hpricot_scan/hpricot_common.rl +76 -0
- data/ext/hpricot_scan/hpricot_scan.c +2435 -2181
- data/ext/hpricot_scan/hpricot_scan.java.rl +363 -0
- data/ext/hpricot_scan/hpricot_scan.rl +3 -70
- data/lib/hpricot.rb +1 -0
- data/lib/hpricot/blankslate.rb +63 -0
- data/lib/hpricot/builder.rb +200 -0
- data/lib/hpricot/elements.rb +143 -32
- data/lib/hpricot/inspect.rb +7 -3
- data/lib/hpricot/parse.rb +128 -101
- data/lib/hpricot/tag.rb +23 -15
- data/lib/hpricot/tags.rb +164 -0
- data/lib/hpricot/traverse.rb +93 -33
- data/lib/hpricot/xchar.rb +94 -0
- data/lib/i686-linux/hpricot_scan.so +0 -0
- data/test/files/pace_application.html +1320 -0
- data/test/files/tenderlove.html +16 -0
- data/test/files/why.xml +19 -0
- data/test/load_files.rb +1 -1
- data/test/test_alter.rb +65 -0
- data/test/test_builder.rb +24 -0
- data/test/test_parser.rb +92 -4
- data/test/test_preserved.rb +20 -0
- data/test/test_xml.rb +13 -0
- metadata +34 -14
- data/lib/hpricot/text.rb +0 -115
- data/lib/hpricot_scan.so +0 -0
@@ -0,0 +1,16 @@
|
|
1
|
+
<html>
|
2
|
+
<HEAD>
|
3
|
+
<meta http-equiv="Refresh" content="0; url=http://tenderlovemaking.com">
|
4
|
+
<META http-equiv="Refresh" content="0; url=http://tenderlovemaking.com">
|
5
|
+
</head>
|
6
|
+
<body>
|
7
|
+
<a href ="http://tenderlovemaking.com/">My Site!</a>
|
8
|
+
<A href ="http://whytheluckystiff.net/">Your Site!</A>
|
9
|
+
<MAP>
|
10
|
+
<area HREF="http://whytheluckystiff.net/" COORDS="1,2,3,4"></area>
|
11
|
+
<AREA HREF="http://tenderlovemaking.com/" COORDS="1,2,3,4">
|
12
|
+
</area>
|
13
|
+
<AREA HREF="http://tenderlovemaking.com/" COORDS="5,5,10,10" />
|
14
|
+
</MAP>
|
15
|
+
</body>
|
16
|
+
</html>
|
data/test/files/why.xml
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
<?xml version='1.0'?><rss xmlns:admin='http://webns.net/mvcb/' version='2.0' xmlns:sy='http://purl.org/rss/1.0/modules/syndication/' xmlns:dc='http://purl.org/dc/elements/1.1/' xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'>
|
2
|
+
<channel>
|
3
|
+
<title>why the lucky stiff</title>
|
4
|
+
<link>http://whytheluckystiff.net</link>
|
5
|
+
<description>hex-editing reality to give us infinite grenades!!</description>
|
6
|
+
<dc:language>en-us</dc:language>
|
7
|
+
<dc:creator/>
|
8
|
+
<dc:date>2007-01-16T22:39:04+00:00</dc:date>
|
9
|
+
<admin:generatorAgent rdf:resource='http://hobix.com/?v=0.4'/>
|
10
|
+
<sy:updatePeriod>hourly</sy:updatePeriod>
|
11
|
+
<sy:updateFrequency>1</sy:updateFrequency>
|
12
|
+
<sy:updateBase>2000-01-01T12:00+00:00</sy:updateBase>
|
13
|
+
<item><title>1.3</title><link>http://whytheluckystiff.net/quatrains/1.3.html</link><guid isPermaLink='false'>quatrains/1.3@http://whytheluckystiff.net</guid><dc:subject>quatrains</dc:subject><dc:subject>quatrains</dc:subject><dc:creator>why the lucky stiff</dc:creator><dc:date>2007-01-14T08:47:05+00:00</dc:date><description><blockquote>
|
14
|
+
<p>That cadillac of yours and that driver of yours!<br />You and your teacups rattling away in the back seat!<br />You always took the mike, oh, and all those cowboys you shot!<br />I held your hand! And I&#8217;ll shoot a cowboy one day!</p>
|
15
|
+
</blockquote>
|
16
|
+
<blockquote>
|
17
|
+
<p>You said, &#8220;Let&#8217;s run into the woods like kids!&#8221; <br />You said, &#8220;Let&#8217;s rub our hands together super-hot!&#8221; <br />And we scalded the trees and left octagons, I think that was you and<br />You threw parties on the roof!</p>
|
18
|
+
</blockquote></description></item></channel>
|
19
|
+
</rss>
|
data/test/load_files.rb
CHANGED
data/test/test_alter.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'hpricot'
|
5
|
+
require 'load_files'
|
6
|
+
|
7
|
+
class TestAlter < Test::Unit::TestCase
|
8
|
+
def setup
|
9
|
+
@basic = Hpricot.parse(TestFiles::BASIC)
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_before
|
13
|
+
test0 = "<link rel='stylesheet' href='test0.css' />"
|
14
|
+
@basic.at("link").before(test0)
|
15
|
+
assert_equal 'test0.css', @basic.at("link").attributes['href']
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_after
|
19
|
+
test_inf = "<link rel='stylesheet' href='test_inf.css' />"
|
20
|
+
@basic.search("link")[-1].after(test_inf)
|
21
|
+
assert_equal 'test_inf.css', @basic.search("link")[-1].attributes['href']
|
22
|
+
end
|
23
|
+
|
24
|
+
def test_wrap
|
25
|
+
ohmy = (@basic/"p.ohmy").wrap("<div id='wrapper'></div>")
|
26
|
+
assert_equal 'wrapper', ohmy[0].parent['id']
|
27
|
+
assert_equal 'ohmy', Hpricot(@basic.to_html).at("#wrapper").children[0]['class']
|
28
|
+
end
|
29
|
+
|
30
|
+
def test_add_class
|
31
|
+
first_p = (@basic/"p:first").add_class("testing123")
|
32
|
+
assert first_p[0].get_attribute("class").split(" ").include?("testing123")
|
33
|
+
assert (Hpricot(@basic.to_html)/"p:first")[0].attributes["class"].split(" ").include?("testing123")
|
34
|
+
assert !(Hpricot(@basic.to_html)/"p:gt(0)")[0].attributes["class"].split(" ").include?("testing123")
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_change_attributes
|
38
|
+
all_ps = (@basic/"p").attr("title", "Some Title")
|
39
|
+
all_as = (@basic/"a").attr("href", "http://my_new_href.com")
|
40
|
+
all_lb = (@basic/"link").attr("href") { |e| e.name }
|
41
|
+
assert_changed(@basic, "p", all_ps) {|p| p.attributes["title"] == "Some Title"}
|
42
|
+
assert_changed(@basic, "a", all_as) {|a| a.attributes["href"] == "http://my_new_href.com"}
|
43
|
+
assert_changed(@basic, "link", all_lb) {|a| a.attributes["href"] == "link" }
|
44
|
+
end
|
45
|
+
|
46
|
+
def test_remove_attr
|
47
|
+
all_rl = (@basic/"link").remove_attr("href")
|
48
|
+
assert_changed(@basic, "link", all_rl) { |link| link['href'].nil? }
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_remove_class
|
52
|
+
all_c1 = (@basic/"p[@class*='last']").remove_class("last")
|
53
|
+
assert_changed(@basic, "p[@class*='last']", all_c1) { |p| p['class'] == 'final' }
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_remove_all_classes
|
57
|
+
all_c2 = (@basic/"p[@class]").remove_class
|
58
|
+
assert_changed(@basic, "p[@class]", all_c2) { |p| p['class'].nil? }
|
59
|
+
end
|
60
|
+
|
61
|
+
def assert_changed original, selector, set, &block
|
62
|
+
assert set.all?(&block)
|
63
|
+
assert Hpricot(original.to_html).search(selector).all?(&block)
|
64
|
+
end
|
65
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'hpricot'
|
5
|
+
|
6
|
+
class TestBuilder < Test::Unit::TestCase
|
7
|
+
def test_escaping_text
|
8
|
+
doc = Hpricot() { b "<a\"b>" }
|
9
|
+
assert_equal "<b>&lt;a&quot;b&gt;</b>", doc.to_html
|
10
|
+
assert_equal %{<a"b>}, doc.at("text()").to_s
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_no_escaping_text
|
14
|
+
doc = Hpricot() { div.test.me! { text "<a\"b>" } }
|
15
|
+
assert_equal %{<div class="test" id="me"><a"b></div>}, doc.to_html
|
16
|
+
assert_equal %{<a"b>}, doc.at("text()").to_s
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_latin1_entities
|
20
|
+
doc = Hpricot() { b "\200\225" }
|
21
|
+
assert_equal "<b>&#8364;&#8226;</b>", doc.to_html
|
22
|
+
assert_equal "\342\202\254\342\200\242", doc.at("text()").to_s
|
23
|
+
end
|
24
|
+
end
|
data/test/test_parser.rb
CHANGED
@@ -23,6 +23,19 @@ class TestParser < Test::Unit::TestCase
|
|
23
23
|
assert_equal 'FOO', Hpricot.make("FOO").first.content
|
24
24
|
end
|
25
25
|
|
26
|
+
def test_filter_by_attr
|
27
|
+
@boingboing = Hpricot.parse(TestFiles::BOINGBOING)
|
28
|
+
|
29
|
+
# this link is escaped in the doc
|
30
|
+
link = 'http://www.youtube.com/watch?v=TvSNXyNw26g&search=chris%20ware'
|
31
|
+
assert_equal link, @boingboing.at("a[@href='#{link}']")['href']
|
32
|
+
end
|
33
|
+
|
34
|
+
def test_filter_contains
|
35
|
+
@basic = Hpricot.parse(TestFiles::BASIC)
|
36
|
+
assert_equal '<title>Sample XHTML</title>', @basic.search("title:contains('Sample')").to_s
|
37
|
+
end
|
38
|
+
|
26
39
|
def test_get_element_by_id
|
27
40
|
@basic = Hpricot.parse(TestFiles::BASIC)
|
28
41
|
assert_equal 'link1', @basic.get_element_by_id('link1')['id']
|
@@ -84,6 +97,12 @@ class TestParser < Test::Unit::TestCase
|
|
84
97
|
assert_equal "<p>one</p>", h.search("//div/p:first()").to_s
|
85
98
|
end
|
86
99
|
|
100
|
+
def test_pace
|
101
|
+
doc = Hpricot(TestFiles::PACE_APPLICATION)
|
102
|
+
assert_equal 'get', doc.at('form[@name=frmSect11]')['method']
|
103
|
+
# assert_equal '2', doc.at('#hdnSpouse')['value']
|
104
|
+
end
|
105
|
+
|
87
106
|
def test_scan_boingboing
|
88
107
|
@boingboing = Hpricot.parse(TestFiles::BOINGBOING)
|
89
108
|
assert_equal 60, (@boingboing/'p.posted').length
|
@@ -95,7 +114,7 @@ class TestParser < Test::Unit::TestCase
|
|
95
114
|
assert_equal 60, @boingboing.search("h3").length
|
96
115
|
assert_equal 59, @boingboing.search("h3[text()!='College kids reportedly taking more smart drugs']").length
|
97
116
|
assert_equal 17, @boingboing.search("h3[text()$='s']").length
|
98
|
-
assert_equal
|
117
|
+
assert_equal 129, @boingboing.search("p[text()]").length
|
99
118
|
assert_equal 211, @boingboing.search("p").length
|
100
119
|
end
|
101
120
|
|
@@ -135,8 +154,7 @@ class TestParser < Test::Unit::TestCase
|
|
135
154
|
assert_equal 60, @boingboing.search("/*/body//p[@class='posted']").length
|
136
155
|
assert_equal 18, @boingboing.search("//script").length
|
137
156
|
divs = @boingboing.search("//script/../div")
|
138
|
-
assert_equal
|
139
|
-
assert_equal 1, divs.search('a').length
|
157
|
+
assert_equal 1, divs.length
|
140
158
|
imgs = @boingboing.search('//div/p/a/img')
|
141
159
|
assert_equal 15, imgs.length
|
142
160
|
assert_equal 17, @boingboing.search('//div').search('p/a/img').length
|
@@ -155,9 +173,16 @@ class TestParser < Test::Unit::TestCase
|
|
155
173
|
assert_equal 1, @boingboing.search('//input[@checked]').length
|
156
174
|
end
|
157
175
|
|
176
|
+
def test_tag_case
|
177
|
+
@tenderlove = Hpricot.parse(TestFiles::TENDERLOVE)
|
178
|
+
assert_equal 2, @tenderlove.search('//a').length
|
179
|
+
assert_equal 3, @tenderlove.search('//area').length
|
180
|
+
assert_equal 2, @tenderlove.search('//meta').length
|
181
|
+
end
|
182
|
+
|
158
183
|
def test_alt_predicates
|
159
184
|
@boingboing = Hpricot.parse(TestFiles::BOINGBOING)
|
160
|
-
assert_equal
|
185
|
+
assert_equal 1, @boingboing.search('//table/tr:last').length
|
161
186
|
|
162
187
|
@basic = Hpricot.parse(TestFiles::BASIC)
|
163
188
|
assert_equal "<p>The third paragraph</p>",
|
@@ -167,6 +192,22 @@ class TestParser < Test::Unit::TestCase
|
|
167
192
|
assert_equal 'last final', @basic.search('//p:last-of-type').first.get_attribute('class')
|
168
193
|
end
|
169
194
|
|
195
|
+
def test_insert_after # ticket #63
|
196
|
+
doc = Hpricot('<html><body><div id="a-div"></div></body></html>')
|
197
|
+
(doc/'div').each do |element|
|
198
|
+
element.after('<p>Paragraph 1</p><p>Paragraph 2</p>')
|
199
|
+
end
|
200
|
+
assert_equal doc.to_html, '<html><body><div id="a-div"></div><p>Paragraph 1</p><p>Paragraph 2</p></body></html>'
|
201
|
+
end
|
202
|
+
|
203
|
+
def test_insert_before # ticket #61
|
204
|
+
doc = Hpricot('<html><body><div id="a-div"></div></body></html>')
|
205
|
+
(doc/'div').each do |element|
|
206
|
+
element.before('<p>Paragraph 1</p><p>Paragraph 2</p>')
|
207
|
+
end
|
208
|
+
assert_equal doc.to_html, '<html><body><p>Paragraph 1</p><p>Paragraph 2</p><div id="a-div"></div></body></html>'
|
209
|
+
end
|
210
|
+
|
170
211
|
def test_many_paths
|
171
212
|
@boingboing = Hpricot.parse(TestFiles::BOINGBOING)
|
172
213
|
assert_equal 62, @boingboing.search('p.posted, link[@rel="alternate"]').length
|
@@ -264,6 +305,53 @@ class TestParser < Test::Unit::TestCase
|
|
264
305
|
end
|
265
306
|
end
|
266
307
|
|
308
|
+
def test_youtube_attr
|
309
|
+
str = <<-edoc
|
310
|
+
<html><body>
|
311
|
+
Lorem ipsum. Jolly roger, ding-dong sing-a-long
|
312
|
+
<object width="425" height="350">
|
313
|
+
<param name="movie" value="http://www.youtube.com/v/NbDQ4M_cuwA"></param>
|
314
|
+
<param name="wmode" value="transparent"></param>
|
315
|
+
<embed src="http://www.youtube.com/v/NbDQ4M_cuwA"
|
316
|
+
type="application/x-shockwave-flash" wmode="transparent" width="425" height="350">
|
317
|
+
</embed>
|
318
|
+
</object>
|
319
|
+
Check out my posting, I have bright mice in large clown cars.
|
320
|
+
<object width="425" height="350">
|
321
|
+
<param name="movie" value="http://www.youtube.com/v/foobar"></param>
|
322
|
+
<param name="wmode" value="transparent"></param>
|
323
|
+
<embed src="http://www.youtube.com/v/foobar"
|
324
|
+
type="application/x-shockwave-flash" wmode="transparent" width="425" height="350">
|
325
|
+
</embed>
|
326
|
+
</object>
|
327
|
+
</body></html?
|
328
|
+
edoc
|
329
|
+
doc = Hpricot(str)
|
330
|
+
assert_equal "http://www.youtube.com/v/NbDQ4M_cuwA",
|
331
|
+
doc.at("//object/param[@value='http://www.youtube.com/v/NbDQ4M_cuwA']")['value']
|
332
|
+
end
|
333
|
+
|
334
|
+
# ticket #84 by jamezilla
|
335
|
+
def test_screwed_xmlns
|
336
|
+
doc = Hpricot(<<-edoc)
|
337
|
+
<?xml:namespace prefix = cwi />
|
338
|
+
<html><body>HAI</body></html>
|
339
|
+
edoc
|
340
|
+
assert_equal "HAI", doc.at("body").inner_text
|
341
|
+
end
|
342
|
+
|
343
|
+
# Reported by Jonathan Nichols on the Hpricot list (24 May 2007)
|
344
|
+
def test_self_closed_form
|
345
|
+
doc = Hpricot(<<-edoc)
|
346
|
+
<body>
|
347
|
+
<form action="/loginRegForm" name="regForm" method="POST" />
|
348
|
+
<input type="button">
|
349
|
+
</form>
|
350
|
+
</body>
|
351
|
+
edoc
|
352
|
+
assert_equal "button", doc.at("//form/input")['type']
|
353
|
+
end
|
354
|
+
|
267
355
|
def test_filters
|
268
356
|
@basic = Hpricot.parse(TestFiles::BASIC)
|
269
357
|
assert_equal 0, (@basic/"title:parent").size
|
data/test/test_preserved.rb
CHANGED
@@ -38,9 +38,29 @@ class TestPreserved < Test::Unit::TestCase
|
|
38
38
|
end
|
39
39
|
end
|
40
40
|
|
41
|
+
def test_escaping_of_contents
|
42
|
+
doc = Hpricot(TestFiles::BOINGBOING)
|
43
|
+
assert_equal "Fukuda\342\200\231s Automatic Door opens around your body as you pass through it. The idea is to save energy and keep the room clean.", doc.at("img[@alt='200606131240']").next.to_s.strip
|
44
|
+
end
|
45
|
+
|
41
46
|
def test_files
|
42
47
|
assert_roundtrip TestFiles::BASIC
|
43
48
|
assert_roundtrip TestFiles::BOINGBOING
|
44
49
|
assert_roundtrip TestFiles::CY0
|
45
50
|
end
|
51
|
+
|
52
|
+
def test_escaping_of_attrs
|
53
|
+
# ampersands in URLs
|
54
|
+
str = %{<a href="http://google.com/search?q=hpricot&amp;l=en">Google</a>}
|
55
|
+
link = (doc = Hpricot(str)).at(:a)
|
56
|
+
assert_equal "http://google.com/search?q=hpricot&l=en", link['href']
|
57
|
+
assert_equal "http://google.com/search?q=hpricot&l=en", link.attributes['href']
|
58
|
+
assert_equal "http://google.com/search?q=hpricot&l=en", link.get_attribute('href')
|
59
|
+
assert_equal "http://google.com/search?q=hpricot&amp;l=en", link.raw_attributes['href']
|
60
|
+
assert_equal str, doc.to_html
|
61
|
+
|
62
|
+
# alter the url
|
63
|
+
link['href'] = "javascript:alert(\"AGGA-KA-BOO!\")"
|
64
|
+
assert_equal %{<a href="javascript:alert(&quot;AGGA-KA-BOO!&quot;)">Google</a>}, doc.to_html
|
65
|
+
end
|
46
66
|
end
|
data/test/test_xml.rb
CHANGED
@@ -12,4 +12,17 @@ class TestParser < Test::Unit::TestCase
|
|
12
12
|
assert_equal "this is title", (doc/:rss/:channel/:title).text
|
13
13
|
assert_equal "http://fake.com", (doc/:rss/:channel/:link).text
|
14
14
|
end
|
15
|
+
|
16
|
+
# make sure XML doesn't get downcased
|
17
|
+
def test_casing
|
18
|
+
doc = Hpricot::XML(TestFiles::WHY)
|
19
|
+
assert_equal "hourly", (doc.at "sy:updatePeriod").inner_html
|
20
|
+
assert_equal 1, (doc/"guid[@isPermaLink]").length
|
21
|
+
end
|
22
|
+
|
23
|
+
# be sure tags named "text" are ok
|
24
|
+
def test_text_tags
|
25
|
+
doc = Hpricot::XML("<feed><title>City Poisoned</title><text>Rita Lee has poisoned Brazil.</text></feed>")
|
26
|
+
assert_equal "City Poisoned", (doc/"title").text
|
27
|
+
end
|
15
28
|
end
|
metadata
CHANGED
@@ -3,10 +3,11 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: hpricot
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: "0.
|
7
|
-
date: 2007-
|
6
|
+
version: "0.6"
|
7
|
+
date: 2007-06-15 00:00:00 -07:00
|
8
8
|
summary: a swift, liberal HTML parser with a fantastic library
|
9
9
|
require_paths:
|
10
|
+
- lib/i686-linux
|
10
11
|
- lib
|
11
12
|
email: why@ruby-lang.org
|
12
13
|
homepage: http://code.whytheluckystiff.net/hpricot/
|
@@ -15,7 +16,7 @@ description: a swift, liberal HTML parser with a fantastic library
|
|
15
16
|
autorequire:
|
16
17
|
default_executable:
|
17
18
|
bindir: bin
|
18
|
-
has_rdoc:
|
19
|
+
has_rdoc: true
|
19
20
|
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
21
|
requirements:
|
21
22
|
- - ">"
|
@@ -29,41 +30,60 @@ post_install_message:
|
|
29
30
|
authors:
|
30
31
|
- why the lucky stiff
|
31
32
|
files:
|
33
|
+
- CHANGELOG
|
34
|
+
- COPYING
|
35
|
+
- README
|
36
|
+
- Rakefile
|
37
|
+
- test/files
|
32
38
|
- test/test_preserved.rb
|
33
39
|
- test/test_paths.rb
|
34
40
|
- test/load_files.rb
|
35
41
|
- test/test_xml.rb
|
36
42
|
- test/test_parser.rb
|
43
|
+
- test/test_alter.rb
|
44
|
+
- test/test_builder.rb
|
45
|
+
- test/files/why.xml
|
37
46
|
- test/files/boingboing.html
|
38
47
|
- test/files/uswebgen.html
|
39
48
|
- test/files/immob.html
|
40
49
|
- test/files/week9.html
|
41
50
|
- test/files/utf8.html
|
42
|
-
- test/files/cy0.html
|
43
51
|
- test/files/basic.xhtml
|
52
|
+
- test/files/cy0.html
|
53
|
+
- test/files/tenderlove.html
|
54
|
+
- test/files/pace_application.html
|
55
|
+
- lib/hpricot
|
44
56
|
- lib/hpricot.rb
|
57
|
+
- lib/i686-linux
|
58
|
+
- lib/hpricot/builder.rb
|
45
59
|
- lib/hpricot/htmlinfo.rb
|
46
|
-
- lib/hpricot/
|
60
|
+
- lib/hpricot/xchar.rb
|
47
61
|
- lib/hpricot/inspect.rb
|
48
62
|
- lib/hpricot/modules.rb
|
49
63
|
- lib/hpricot/parse.rb
|
50
64
|
- lib/hpricot/tag.rb
|
51
65
|
- lib/hpricot/traverse.rb
|
52
66
|
- lib/hpricot/elements.rb
|
67
|
+
- lib/hpricot/tags.rb
|
68
|
+
- lib/hpricot/blankslate.rb
|
69
|
+
- extras/mingw-rbconfig.rb
|
70
|
+
- ext/hpricot_scan/hpricot_scan.h
|
71
|
+
- ext/hpricot_scan/HpricotScanService.java
|
53
72
|
- ext/hpricot_scan/hpricot_scan.c
|
54
73
|
- ext/hpricot_scan/extconf.rb
|
55
|
-
- ext/hpricot_scan/
|
74
|
+
- ext/hpricot_scan/hpricot_common.rl
|
56
75
|
- ext/hpricot_scan/hpricot_scan.rl
|
57
|
-
-
|
58
|
-
-
|
59
|
-
- Rakefile
|
60
|
-
- COPYING
|
61
|
-
- extras/mingw-rbconfig.rb
|
62
|
-
- lib/hpricot_scan.so
|
76
|
+
- ext/hpricot_scan/hpricot_scan.java.rl
|
77
|
+
- lib/i686-linux/hpricot_scan.so
|
63
78
|
test_files: []
|
64
79
|
|
65
|
-
rdoc_options:
|
66
|
-
|
80
|
+
rdoc_options:
|
81
|
+
- --quiet
|
82
|
+
- --title
|
83
|
+
- The Hpricot Reference
|
84
|
+
- --main
|
85
|
+
- README
|
86
|
+
- --inline-source
|
67
87
|
extra_rdoc_files:
|
68
88
|
- README
|
69
89
|
- CHANGELOG
|
data/lib/hpricot/text.rb
DELETED
@@ -1,115 +0,0 @@
|
|
1
|
-
require 'hpricot/modules'
|
2
|
-
require 'hpricot/raw_string'
|
3
|
-
require 'hpricot/htmlinfo'
|
4
|
-
require 'hpricot/encoder'
|
5
|
-
require 'hpricot/fstr'
|
6
|
-
require 'iconv'
|
7
|
-
|
8
|
-
module Hpricot
|
9
|
-
class Text
|
10
|
-
# :stopdoc:
|
11
|
-
class << self
|
12
|
-
alias new_internal new
|
13
|
-
end
|
14
|
-
# :startdoc:
|
15
|
-
|
16
|
-
def Text.new(arg)
|
17
|
-
arg = arg.to_node if Hpricot::Location === arg
|
18
|
-
if Text === arg
|
19
|
-
new_internal arg.rcdata, arg.normalized_rcdata
|
20
|
-
elsif String === arg
|
21
|
-
arg2 = arg.gsub(/&/, '&amp;')
|
22
|
-
arg = arg2.freeze if arg != arg2
|
23
|
-
new_internal arg
|
24
|
-
else
|
25
|
-
raise TypeError, "cannot initialize Text with #{arg.inspect}"
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
def initialize(rcdata, normalized_rcdata=internal_normalize(rcdata)) # :notnew:
|
30
|
-
init_raw_string
|
31
|
-
@rcdata = rcdata && Hpricot.frozen_string(rcdata)
|
32
|
-
@normalized_rcdata = @rcdata == normalized_rcdata ? @rcdata : normalized_rcdata
|
33
|
-
end
|
34
|
-
attr_reader :rcdata, :normalized_rcdata
|
35
|
-
|
36
|
-
def internal_normalize(rcdata)
|
37
|
-
# - character references are decoded as much as possible.
|
38
|
-
# - undecodable character references are converted to decimal numeric character refereces.
|
39
|
-
result = rcdata.gsub(/&(?:#([0-9]+)|#x([0-9a-fA-F]+)|([A-Za-z][A-Za-z0-9]*));/o) {|s|
|
40
|
-
u = nil
|
41
|
-
if $1
|
42
|
-
u = $1.to_i
|
43
|
-
elsif $2
|
44
|
-
u = $2.hex
|
45
|
-
elsif $3
|
46
|
-
u = NamedCharacters[$3]
|
47
|
-
end
|
48
|
-
if !u || u < 0 || 0x7fffffff < u
|
49
|
-
'?'
|
50
|
-
elsif u == 38 # '&' character.
|
51
|
-
'&amp;'
|
52
|
-
elsif u <= 0x7f
|
53
|
-
[u].pack("C")
|
54
|
-
else
|
55
|
-
begin
|
56
|
-
Iconv.conv(Encoder.internal_charset, 'UTF-8', [u].pack("U"))
|
57
|
-
rescue Iconv::Failure
|
58
|
-
"&##{u};"
|
59
|
-
end
|
60
|
-
end
|
61
|
-
}
|
62
|
-
Hpricot.frozen_string(result)
|
63
|
-
end
|
64
|
-
private :internal_normalize
|
65
|
-
|
66
|
-
# Hpricot::Text#to_s converts the text to a string.
|
67
|
-
# - character references are decoded as much as possible.
|
68
|
-
# - undecodable character reference are converted to `?' character.
|
69
|
-
def to_s
|
70
|
-
@normalized_rcdata.gsub(/&(?:#([0-9]+));/o) {|s|
|
71
|
-
u = $1.to_i
|
72
|
-
if 0 <= u && u <= 0x7f
|
73
|
-
[u].pack("C")
|
74
|
-
else
|
75
|
-
'?'
|
76
|
-
end
|
77
|
-
}
|
78
|
-
end
|
79
|
-
|
80
|
-
def empty?
|
81
|
-
@normalized_rcdata.empty?
|
82
|
-
end
|
83
|
-
|
84
|
-
def strip
|
85
|
-
rcdata = @normalized_rcdata.dup
|
86
|
-
rcdata.sub!(/\A(?:\s|&nbsp;)+/, '')
|
87
|
-
rcdata.sub!(/(?:\s|&nbsp;)+\z/, '')
|
88
|
-
if rcdata == @normalized_rcdata
|
89
|
-
self
|
90
|
-
else
|
91
|
-
rcdata.freeze
|
92
|
-
Text.new_internal(rcdata, rcdata)
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
# Hpricot::Text.concat returns a text which is concatenation of arguments.
|
97
|
-
#
|
98
|
-
# An argument should be one of follows.
|
99
|
-
# - String
|
100
|
-
# - Hpricot::Text
|
101
|
-
# - Hpricot::Location which points Hpricot::Text
|
102
|
-
def Text.concat(*args)
|
103
|
-
rcdata = ''
|
104
|
-
args.each {|arg|
|
105
|
-
arg = arg.to_node if Hpricot::Location === arg
|
106
|
-
if Text === arg
|
107
|
-
rcdata << arg.rcdata
|
108
|
-
else
|
109
|
-
rcdata << arg.gsub(/&/, '&amp;')
|
110
|
-
end
|
111
|
-
}
|
112
|
-
new_internal rcdata
|
113
|
-
end
|
114
|
-
end
|
115
|
-
end
|