webtranslateit-hpricot 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/CHANGELOG +122 -0
- data/COPYING +18 -0
- data/README.md +295 -0
- data/Rakefile +237 -0
- data/ext/fast_xs/FastXsService.java +1123 -0
- data/ext/fast_xs/extconf.rb +4 -0
- data/ext/fast_xs/fast_xs.c +210 -0
- data/ext/hpricot_scan/HpricotCss.java +850 -0
- data/ext/hpricot_scan/HpricotScanService.java +2085 -0
- data/ext/hpricot_scan/MANIFEST +0 -0
- data/ext/hpricot_scan/extconf.rb +9 -0
- data/ext/hpricot_scan/hpricot_common.rl +76 -0
- data/ext/hpricot_scan/hpricot_css.c +3511 -0
- data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
- data/ext/hpricot_scan/hpricot_css.rl +120 -0
- data/ext/hpricot_scan/hpricot_scan.c +6848 -0
- data/ext/hpricot_scan/hpricot_scan.h +79 -0
- data/ext/hpricot_scan/hpricot_scan.java.rl +1173 -0
- data/ext/hpricot_scan/hpricot_scan.rl +911 -0
- data/extras/hpricot.png +0 -0
- data/hpricot.gemspec +18 -0
- data/lib/hpricot/blankslate.rb +63 -0
- data/lib/hpricot/builder.rb +217 -0
- data/lib/hpricot/elements.rb +514 -0
- data/lib/hpricot/htmlinfo.rb +691 -0
- data/lib/hpricot/inspect.rb +103 -0
- data/lib/hpricot/modules.rb +40 -0
- data/lib/hpricot/parse.rb +38 -0
- data/lib/hpricot/tag.rb +219 -0
- data/lib/hpricot/tags.rb +164 -0
- data/lib/hpricot/traverse.rb +839 -0
- data/lib/hpricot/xchar.rb +95 -0
- data/lib/hpricot.rb +26 -0
- data/setup.rb +1585 -0
- data/test/files/basic.xhtml +17 -0
- data/test/files/boingboing.html +2266 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/immob.html +400 -0
- data/test/files/pace_application.html +1320 -0
- data/test/files/tenderlove.html +16 -0
- data/test/files/uswebgen.html +220 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/files/why.xml +19 -0
- data/test/load_files.rb +7 -0
- data/test/nokogiri-bench.rb +64 -0
- data/test/test_alter.rb +96 -0
- data/test/test_builder.rb +37 -0
- data/test/test_parser.rb +496 -0
- data/test/test_paths.rb +25 -0
- data/test/test_preserved.rb +88 -0
- data/test/test_xml.rb +28 -0
- metadata +106 -0
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
# The XChar library is provided courtesy of Sam Ruby (See
|
|
4
|
+
# http://intertwingly.net/stories/2005/09/28/xchar.rb)
|
|
5
|
+
|
|
6
|
+
# --------------------------------------------------------------------
|
|
7
|
+
|
|
8
|
+
######################################################################
|
|
9
|
+
module Hpricot

  ####################################################################
  # XML Character converter, from Sam Ruby:
  # (see http://intertwingly.net/stories/2005/09/28/xchar.rb).
  #
  module XChar # :nodoc:

    # Remaps the code points 128..159 (which Windows-1252 uses for
    # printable characters) to their Unicode equivalents. See
    # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
    # for details.
    CP1252 = { # :nodoc:
      128 => 8364, # euro sign
      130 => 8218, # single low-9 quotation mark
      131 => 402,  # latin small letter f with hook
      132 => 8222, # double low-9 quotation mark
      133 => 8230, # horizontal ellipsis
      134 => 8224, # dagger
      135 => 8225, # double dagger
      136 => 710,  # modifier letter circumflex accent
      137 => 8240, # per mille sign
      138 => 352,  # latin capital letter s with caron
      139 => 8249, # single left-pointing angle quotation mark
      140 => 338,  # latin capital ligature oe
      142 => 381,  # latin capital letter z with caron
      145 => 8216, # left single quotation mark
      146 => 8217, # right single quotation mark
      147 => 8220, # left double quotation mark
      148 => 8221, # right double quotation mark
      149 => 8226, # bullet
      150 => 8211, # en dash
      151 => 8212, # em dash
      152 => 732,  # small tilde
      153 => 8482, # trade mark sign
      154 => 353,  # latin small letter s with caron
      155 => 8250, # single right-pointing angle quotation mark
      156 => 339,  # latin small ligature oe
      158 => 382,  # latin small letter z with caron
      159 => 376,  # latin capital letter y with diaeresis
    }.freeze

    # Code point => entity reference for the characters that must be
    # escaped in XML character data.
    # See http://www.w3.org/TR/REC-xml/#dt-chardata for details.
    PREDEFINED = {
      34 => '&quot;', # quotation mark
      38 => '&amp;',  # ampersand
      60 => '&lt;',   # left angle bracket
      62 => '&gt;'    # right angle bracket
    }.freeze

    # Reverse lookup used when unescaping: entity reference => code point.
    PREDEFINED_U = PREDEFINED.invert.freeze

    # Code points (and ranges of code points) that are legal in XML.
    # See http://www.w3.org/TR/REC-xml/#charsets for details.
    VALID = [
      0x9, 0xA, 0xD,
      (0x20..0xD7FF),
      (0xE000..0xFFFD),
      (0x10000..0x10FFFF)
    ].freeze
  end

  class << self
    # XML escaped version of chr.
    #
    # +str+ is an Integer code point (the name is historical). Code points
    # listed in XChar::CP1252 are first remapped to Unicode. The result is:
    # * the predefined entity for ", &, < and >
    # * the character itself for other valid code points below 128
    # * a decimal character reference (&#NNN;) for other valid code points
    # * '*' for code points that are not legal in XML
    def xchr(str)
      n = XChar::CP1252[str] || str
      case n
      when *XChar::VALID
        XChar::PREDEFINED[n] || (n < 128 ? n.chr : "&##{n};")
      else
        '*'
      end
    end

    # XML escaped version of to_s.
    #
    # The string is first decoded as UTF-8; when it is not well-formed
    # UTF-8 it is escaped byte by byte instead.
    def xs(str)
      str.to_s.unpack('U*').map { |n| xchr(n) }.join # ASCII, UTF-8
    rescue ArgumentError
      # unpack('U*') raises ArgumentError on malformed UTF-8
      str.to_s.unpack('C*').map { |n| xchr(n) }.join # ISO-8859-1, WIN-1252
    end

    # XML unescape.
    #
    # Replaces named entities, decimal references (&#NNN;) and hexadecimal
    # references (&#xHH;) by the characters they stand for. Named entities
    # that are not one of the predefined ones become '?'.
    #
    # Everything is decoded in a single pass so that the output of one
    # replacement is never decoded a second time (e.g. "&amp;#60;" must
    # yield "&#60;", not "<").
    def uxs(str)
      str.to_s.gsub(/&(?:(\w+)|\#(\d+)|\#x([0-9a-fA-F]+));/) do |entity|
        match = Regexp.last_match
        if match[1]
          (XChar::PREDEFINED_U[entity] || 63).chr # 63 is '?'
        elsif match[2]
          [match[2].to_i].pack('U*')
        else
          [match[3].to_i(16)].pack('U*')
        end
      end
    end
  end
end
|
|
95
|
+
|
data/lib/hpricot.rb
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# == About hpricot.rb
|
|
2
|
+
#
|
|
3
|
+
# All of Hpricot's various parts are loaded when you use <tt>require 'hpricot'</tt>.
|
|
4
|
+
#
|
|
5
|
+
# * hpricot_scan: the scanner (a C extension for Ruby) which turns an HTML stream into tokens.
|
|
6
|
+
# * hpricot/parse.rb: uses the scanner to sort through tokens and give you back a complete document object.
|
|
7
|
+
# * hpricot/tag.rb: sets up objects for the various types of elements in an HTML document.
|
|
8
|
+
# * hpricot/modules.rb: categorizes the various elements using mixins.
|
|
9
|
+
# * hpricot/traverse.rb: methods for searching documents.
|
|
10
|
+
# * hpricot/elements.rb: methods for dealing with a group of elements as an Hpricot::Elements list.
|
|
11
|
+
# * hpricot/inspect.rb: methods for displaying documents in a readable form.
|
|
12
|
+
|
|
13
|
+
# If available, Nikolai's UTF-8 library will ease use of utf-8 documents.
|
|
14
|
+
# See http://git.bitwi.se/ruby-character-encodings.git/.
|
|
15
|
+
begin
|
|
16
|
+
require 'encoding/character/utf-8'
|
|
17
|
+
rescue LoadError
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
require 'hpricot_scan'
|
|
21
|
+
require 'hpricot/tag'
|
|
22
|
+
require 'hpricot/modules'
|
|
23
|
+
require 'hpricot/traverse'
|
|
24
|
+
require 'hpricot/inspect'
|
|
25
|
+
require 'hpricot/parse'
|
|
26
|
+
require 'hpricot/builder'
|