stepheneb-hpricot 0.8.265
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +75 -0
- data/COPYING +18 -0
- data/README +284 -0
- data/Rakefile +264 -0
- data/ext/fast_xs/FastXsService.java +1018 -0
- data/ext/fast_xs/extconf.rb +4 -0
- data/ext/fast_xs/fast_xs.c +200 -0
- data/ext/hpricot_scan/HpricotScanService.java +2090 -0
- data/ext/hpricot_scan/extconf.rb +6 -0
- data/ext/hpricot_scan/hpricot_common.rl +76 -0
- data/ext/hpricot_scan/hpricot_css.c +3506 -0
- data/ext/hpricot_scan/hpricot_scan.c +6931 -0
- data/ext/hpricot_scan/hpricot_scan.h +79 -0
- data/ext/hpricot_scan/hpricot_scan.java.rl +1152 -0
- data/ext/hpricot_scan/hpricot_scan.rl +788 -0
- data/extras/mingw-rbconfig.rb +176 -0
- data/lib/hpricot/blankslate.rb +63 -0
- data/lib/hpricot/builder.rb +216 -0
- data/lib/hpricot/elements.rb +510 -0
- data/lib/hpricot/htmlinfo.rb +691 -0
- data/lib/hpricot/inspect.rb +103 -0
- data/lib/hpricot/modules.rb +40 -0
- data/lib/hpricot/parse.rb +38 -0
- data/lib/hpricot/tag.rb +219 -0
- data/lib/hpricot/tags.rb +164 -0
- data/lib/hpricot/traverse.rb +839 -0
- data/lib/hpricot/xchar.rb +94 -0
- data/lib/hpricot.rb +26 -0
- data/test/files/basic.xhtml +17 -0
- data/test/files/boingboing.html +2266 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/immob.html +400 -0
- data/test/files/pace_application.html +1320 -0
- data/test/files/tenderlove.html +16 -0
- data/test/files/uswebgen.html +220 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/files/why.xml +19 -0
- data/test/load_files.rb +7 -0
- data/test/test_alter.rb +95 -0
- data/test/test_builder.rb +37 -0
- data/test/test_parser.rb +428 -0
- data/test/test_paths.rb +25 -0
- data/test/test_preserved.rb +79 -0
- data/test/test_xml.rb +28 -0
- metadata +108 -0
@@ -0,0 +1,94 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# The XChar library is provided courtesy of Sam Ruby (See
|
4
|
+
# http://intertwingly.net/stories/2005/09/28/xchar.rb)
|
5
|
+
|
6
|
+
# --------------------------------------------------------------------
|
7
|
+
|
8
|
+
######################################################################
|
9
|
+
module Hpricot

  ####################################################################
  # XML Character converter, from Sam Ruby:
  # (see http://intertwingly.net/stories/2005/09/28/xchar.rb).
  #
  module XChar # :nodoc:

    # Maps Windows-1252 byte values (128..159) to the Unicode code points
    # they stand for.  See
    # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
    # for details.
    CP1252 = { # :nodoc:
      128 => 8364, # euro sign
      130 => 8218, # single low-9 quotation mark
      131 =>  402, # latin small letter f with hook
      132 => 8222, # double low-9 quotation mark
      133 => 8230, # horizontal ellipsis
      134 => 8224, # dagger
      135 => 8225, # double dagger
      136 =>  710, # modifier letter circumflex accent
      137 => 8240, # per mille sign
      138 =>  352, # latin capital letter s with caron
      139 => 8249, # single left-pointing angle quotation mark
      140 =>  338, # latin capital ligature oe
      142 =>  381, # latin capital letter z with caron
      145 => 8216, # left single quotation mark
      146 => 8217, # right single quotation mark
      147 => 8220, # left double quotation mark
      148 => 8221, # right double quotation mark
      149 => 8226, # bullet
      150 => 8211, # en dash
      151 => 8212, # em dash
      152 =>  732, # small tilde
      153 => 8482, # trade mark sign
      154 =>  353, # latin small letter s with caron
      155 => 8250, # single right-pointing angle quotation mark
      156 =>  339, # latin small ligature oe
      158 =>  382, # latin small letter z with caron
      159 =>  376, # latin capital letter y with diaeresis
    }

    # Code point => XML entity reference for the characters that must be
    # escaped in XML text.  The values have to be the entity references
    # themselves (not the raw characters), otherwise Hpricot.xs escapes
    # nothing and Hpricot.uxs cannot decode them.
    # See http://www.w3.org/TR/REC-xml/#dt-chardata for details.
    PREDEFINED = {
      34 => '&quot;', # quotation mark
      38 => '&amp;',  # ampersand
      60 => '&lt;',   # left angle bracket
      62 => '&gt;'    # right angle bracket
    }

    # Reverse lookup: XML entity reference => code point.
    PREDEFINED_U = PREDEFINED.invert

    # Code points (and code point ranges) that are legal in an XML document.
    # See http://www.w3.org/TR/REC-xml/#charsets for details.
    VALID = [
      0x9, 0xA, 0xD,
      (0x20..0xD7FF),
      (0xE000..0xFFFD),
      (0x10000..0x10FFFF)
    ]
  end

  class << self
    # XML escaped version of chr.
    #
    # +str+ is an integer code point (the name is historical).  Values found
    # in XChar::CP1252 are first translated to their Unicode code point.
    # Returns the predefined entity reference if there is one, the plain
    # character for other code points below 128, a decimal character
    # reference (<tt>&#NNN;</tt>) for the rest, and <tt>'*'</tt> for anything
    # that does not match XChar::VALID.
    def xchr(str)
      n = XChar::CP1252[str] || str
      case n
      when *XChar::VALID
        XChar::PREDEFINED[n] || (n < 128 ? n.chr : "&##{n};")
      else
        '*'
      end
    end

    # XML escaped version of to_s.
    #
    # The string is first decoded as UTF-8; if that raises, it is decoded
    # byte by byte instead, and every value is escaped with xchr.
    def xs(str)
      str.to_s.unpack('U*').map { |n| xchr(n) }.join # ASCII, UTF-8
    rescue
      str.to_s.unpack('C*').map { |n| xchr(n) }.join # ISO-8859-1, WIN-1252
    end

    # XML unescape.
    #
    # Named entity references listed in XChar::PREDEFINED_U are replaced by
    # their character; any other <tt>&name;</tt> becomes <tt>'?'</tt>.
    # Decimal character references (<tt>&#NNN;</tt>) are then replaced by the
    # UTF-8 encoding of that code point.
    #
    # NOTE: the named pass runs before the numeric pass, so input such as
    # <tt>&amp;#60;</tt> is unescaped twice and ends up as <tt><</tt>.
    def uxs(str)
      named_decoded = str.to_s.gsub(/\&\w+;/) do |entity|
        code = XChar::PREDEFINED_U[entity]
        code ? code.chr : '?'
      end
      named_decoded.gsub(/\&\#(\d+);/) { [Regexp.last_match(1).to_i].pack("U*") }
    end
  end
end
|
94
|
+
|
data/lib/hpricot.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# == About hpricot.rb
|
2
|
+
#
|
3
|
+
# All of Hpricot's various parts are loaded when you use <tt>require 'hpricot'</tt>.
|
4
|
+
#
|
5
|
+
# * hpricot_scan: the scanner (a C extension for Ruby) which turns an HTML stream into tokens.
|
6
|
+
# * hpricot/parse.rb: uses the scanner to sort through tokens and give you back a complete document object.
|
7
|
+
# * hpricot/tag.rb: sets up objects for the various types of elements in an HTML document.
|
8
|
+
# * hpricot/modules.rb: categorizes the various elements using mixins.
|
9
|
+
# * hpricot/traverse.rb: methods for searching documents.
|
10
|
+
# * hpricot/elements.rb: methods for dealing with a group of elements as an Hpricot::Elements list.
|
11
|
+
# * hpricot/inspect.rb: methods for displaying documents in a readable form.
|
12
|
+
|
13
|
+
# If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
|
14
|
+
# See http://git.bitwi.se/ruby-character-encodings.git/.
|
15
|
+
begin
|
16
|
+
require 'encoding/character/utf-8'
|
17
|
+
rescue LoadError
|
18
|
+
end
|
19
|
+
|
20
|
+
require 'hpricot_scan'
|
21
|
+
require 'hpricot/tag'
|
22
|
+
require 'hpricot/modules'
|
23
|
+
require 'hpricot/traverse'
|
24
|
+
require 'hpricot/inspect'
|
25
|
+
require 'hpricot/parse'
|
26
|
+
require 'hpricot/builder'
|
@@ -0,0 +1,17 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "DTD/xhtml1-strict.dtd">
|
3
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
4
|
+
<head>
|
5
|
+
<title>Sample XHTML</title>
|
6
|
+
<link rel='stylesheet' href='test1.css' />
|
7
|
+
<link rel='stylesheet' href='test2.css' />
|
8
|
+
<link rel='stylesheet' href='test3.css' />
|
9
|
+
</head>
|
10
|
+
<body id='body1'>
|
11
|
+
<p>Sample XHTML for <a id="link1" href="http://code.whytheluckystiff.net/mouseHole/">MouseHole 2</a>.</p>
|
12
|
+
<p class='ohmy'>Please filter <a id="link2" href="http://hobix.com/">me</a>!</p>
|
13
|
+
<p>The third paragraph</p>
|
14
|
+
<p class="last final"><b>THE FINAL PARAGRAPH</b></p>
|
15
|
+
</body>
|
16
|
+
</html>
|
17
|
+
|