webtranslateit-hpricot 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/CHANGELOG +122 -0
- data/COPYING +18 -0
- data/README.md +295 -0
- data/Rakefile +237 -0
- data/ext/fast_xs/FastXsService.java +1123 -0
- data/ext/fast_xs/extconf.rb +4 -0
- data/ext/fast_xs/fast_xs.c +210 -0
- data/ext/hpricot_scan/HpricotCss.java +850 -0
- data/ext/hpricot_scan/HpricotScanService.java +2085 -0
- data/ext/hpricot_scan/MANIFEST +0 -0
- data/ext/hpricot_scan/extconf.rb +9 -0
- data/ext/hpricot_scan/hpricot_common.rl +76 -0
- data/ext/hpricot_scan/hpricot_css.c +3511 -0
- data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
- data/ext/hpricot_scan/hpricot_css.rl +120 -0
- data/ext/hpricot_scan/hpricot_scan.c +6848 -0
- data/ext/hpricot_scan/hpricot_scan.h +79 -0
- data/ext/hpricot_scan/hpricot_scan.java.rl +1173 -0
- data/ext/hpricot_scan/hpricot_scan.rl +911 -0
- data/extras/hpricot.png +0 -0
- data/hpricot.gemspec +18 -0
- data/lib/hpricot/blankslate.rb +63 -0
- data/lib/hpricot/builder.rb +217 -0
- data/lib/hpricot/elements.rb +514 -0
- data/lib/hpricot/htmlinfo.rb +691 -0
- data/lib/hpricot/inspect.rb +103 -0
- data/lib/hpricot/modules.rb +40 -0
- data/lib/hpricot/parse.rb +38 -0
- data/lib/hpricot/tag.rb +219 -0
- data/lib/hpricot/tags.rb +164 -0
- data/lib/hpricot/traverse.rb +839 -0
- data/lib/hpricot/xchar.rb +95 -0
- data/lib/hpricot.rb +26 -0
- data/setup.rb +1585 -0
- data/test/files/basic.xhtml +17 -0
- data/test/files/boingboing.html +2266 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/immob.html +400 -0
- data/test/files/pace_application.html +1320 -0
- data/test/files/tenderlove.html +16 -0
- data/test/files/uswebgen.html +220 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/files/why.xml +19 -0
- data/test/load_files.rb +7 -0
- data/test/nokogiri-bench.rb +64 -0
- data/test/test_alter.rb +96 -0
- data/test/test_builder.rb +37 -0
- data/test/test_parser.rb +496 -0
- data/test/test_paths.rb +25 -0
- data/test/test_preserved.rb +88 -0
- data/test/test_xml.rb +28 -0
- metadata +106 -0
@@ -0,0 +1,95 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# The XChar library is provided courtesy of Sam Ruby (See
|
4
|
+
# http://intertwingly.net/stories/2005/09/28/xchar.rb)
|
5
|
+
|
6
|
+
# --------------------------------------------------------------------
|
7
|
+
|
8
|
+
######################################################################
|
9
|
+
module Hpricot

  ####################################################################
  # XML Character converter, from Sam Ruby:
  # (see http://intertwingly.net/stories/2005/09/28/xchar.rb).
  #
  # Holds the lookup tables used by Hpricot.xchr, Hpricot.xs and
  # Hpricot.uxs to escape and unescape text for XML.
  #
  module XChar # :nodoc:

    # Maps Windows-1252 byte values in the 128..159 range to the Unicode
    # code points they stand for.  See
    # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
    # for details.
    CP1252 = { # :nodoc:
      128 => 8364, # euro sign
      130 => 8218, # single low-9 quotation mark
      131 => 402,  # latin small letter f with hook
      132 => 8222, # double low-9 quotation mark
      133 => 8230, # horizontal ellipsis
      134 => 8224, # dagger
      135 => 8225, # double dagger
      136 => 710,  # modifier letter circumflex accent
      137 => 8240, # per mille sign
      138 => 352,  # latin capital letter s with caron
      139 => 8249, # single left-pointing angle quotation mark
      140 => 338,  # latin capital ligature oe
      142 => 381,  # latin capital letter z with caron
      145 => 8216, # left single quotation mark
      146 => 8217, # right single quotation mark
      147 => 8220, # left double quotation mark
      148 => 8221, # right double quotation mark
      149 => 8226, # bullet
      150 => 8211, # en dash
      151 => 8212, # em dash
      152 => 732,  # small tilde
      153 => 8482, # trade mark sign
      154 => 353,  # latin small letter s with caron
      155 => 8250, # single right-pointing angle quotation mark
      156 => 339,  # latin small ligature oe
      158 => 382,  # latin small letter z with caron
      159 => 376,  # latin capital letter y with diaeresis
    }

    # Code point => entity reference for the characters that must be
    # escaped in XML character data.
    # See http://www.w3.org/TR/REC-xml/#dt-chardata for details.
    #
    # The values must be the full entity strings: Hpricot.uxs looks up
    # whole "&name;" matches in the inverse table below.
    PREDEFINED = {
      34 => '&quot;', # quotation mark
      38 => '&amp;',  # ampersand
      60 => '&lt;',   # left angle bracket
      62 => '&gt;'    # right angle bracket
    }

    # Inverse of PREDEFINED: entity reference => code point.
    PREDEFINED_U = PREDEFINED.invert

    # Code points (and ranges of code points) that are legal in XML.
    # See http://www.w3.org/TR/REC-xml/#charsets for details.
    VALID = [
      0x9, 0xA, 0xD,
      (0x20..0xD7FF),
      (0xE000..0xFFFD),
      (0x10000..0x10FFFF)
    ]

    # Matches one entity or character reference.  Capture 1 holds the
    # digits of a hexadecimal reference (&#x..;), capture 2 the digits of
    # a decimal reference (&#..;); neither is set for a named entity.
    ENTITY = /&(?:#x([0-9a-fA-F]+)|#(\d+)|\w+);/
  end

  class << self
    # XML escaped version of chr.
    #
    # +str+ is an integer character code (despite its name), as produced
    # by String#unpack in +xs+.  Codes listed in XChar::CP1252 are first
    # translated to their Unicode code point.  Returns:
    # * the predefined entity for quote, ampersand and angle brackets,
    # * the character itself for other valid codes below 128,
    # * a decimal character reference (&#NNN;) for other valid codes,
    # * '*' for anything that is not matched by XChar::VALID.
    def xchr(str)
      n = XChar::CP1252[str] || str
      case n
      when *XChar::VALID
        XChar::PREDEFINED[n] || (n < 128 ? n.chr : "&##{n};")
      else
        '*'
      end
    end

    # XML escaped version of to_s.
    #
    # The string is first decoded as UTF-8 code points.  If that raises
    # (String#unpack rejects malformed UTF-8), it is decoded byte by byte
    # instead, which together with the CP1252 table in +xchr+ covers
    # ISO-8859-1 and Windows-1252 input.
    def xs(str)
      str.to_s.unpack('U*').map { |n| xchr(n) }.join # ASCII, UTF-8
    rescue
      str.to_s.unpack('C*').map { |n| xchr(n) }.join # ISO-8859-1, WIN-1252
    end

    # XML unescape.
    #
    # Replaces decimal (&#NNN;) and hexadecimal (&#xHH;) character
    # references with the UTF-8 character they denote, and the named
    # entities from XChar::PREDEFINED with their character.  Any other
    # named entity becomes '?'.
    #
    # All references are replaced in a single pass, so text produced by
    # one replacement is never decoded a second time (for example
    # "&amp;#60;" yields "&#60;", not "<").
    def uxs(str)
      str.to_s.gsub(XChar::ENTITY) do |entity|
        hex = Regexp.last_match(1)
        dec = Regexp.last_match(2)
        if hex
          [hex.to_i(16)].pack('U*')
        elsif dec
          [dec.to_i].pack('U*')
        else
          (XChar::PREDEFINED_U[entity] || 63).chr # 63 is '?' (question mark)
        end
      end
    end
  end
end
|
95
|
+
|
data/lib/hpricot.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# == About hpricot.rb
|
2
|
+
#
|
3
|
+
# All of Hpricot's various parts are loaded when you use <tt>require 'hpricot'</tt>.
|
4
|
+
#
|
5
|
+
# * hpricot_scan: the scanner (a C extension for Ruby) which turns an HTML stream into tokens.
|
6
|
+
# * hpricot/parse.rb: uses the scanner to sort through tokens and give you back a complete document object.
|
7
|
+
# * hpricot/tag.rb: sets up objects for the various types of elements in an HTML document.
|
8
|
+
# * hpricot/modules.rb: categorizes the various elements using mixins.
|
9
|
+
# * hpricot/traverse.rb: methods for searching documents.
|
10
|
+
# * hpricot/elements.rb: methods for dealing with a group of elements as an Hpricot::Elements list.
|
11
|
+
# * hpricot/inspect.rb: methods for displaying documents in a readable form.
|
12
|
+
|
13
|
+
# If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
|
14
|
+
# See http://git.bitwi.se/ruby-character-encodings.git/.
|
15
|
+
begin
|
16
|
+
require 'encoding/character/utf-8'
|
17
|
+
rescue LoadError
|
18
|
+
end
|
19
|
+
|
20
|
+
require 'hpricot_scan'
|
21
|
+
require 'hpricot/tag'
|
22
|
+
require 'hpricot/modules'
|
23
|
+
require 'hpricot/traverse'
|
24
|
+
require 'hpricot/inspect'
|
25
|
+
require 'hpricot/parse'
|
26
|
+
require 'hpricot/builder'
|