htree 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +4 -0
- data/Makefile +20 -0
- data/Manifest +58 -0
- data/README +61 -0
- data/Rakefile +37 -0
- data/htree.gemspec +32 -0
- data/init.rb +1 -0
- data/install.rb +112 -0
- data/lib/htree.rb +97 -0
- data/lib/htree/container.rb +8 -0
- data/lib/htree/context.rb +69 -0
- data/lib/htree/display.rb +46 -0
- data/lib/htree/doc.rb +149 -0
- data/lib/htree/elem.rb +262 -0
- data/lib/htree/encoder.rb +217 -0
- data/lib/htree/equality.rb +219 -0
- data/lib/htree/extract_text.rb +37 -0
- data/lib/htree/fstr.rb +32 -0
- data/lib/htree/gencode.rb +193 -0
- data/lib/htree/htmlinfo.rb +672 -0
- data/lib/htree/inspect.rb +108 -0
- data/lib/htree/leaf.rb +92 -0
- data/lib/htree/loc.rb +369 -0
- data/lib/htree/modules.rb +49 -0
- data/lib/htree/name.rb +122 -0
- data/lib/htree/output.rb +212 -0
- data/lib/htree/parse.rb +410 -0
- data/lib/htree/raw_string.rb +127 -0
- data/lib/htree/regexp-util.rb +19 -0
- data/lib/htree/rexml.rb +131 -0
- data/lib/htree/scan.rb +176 -0
- data/lib/htree/tag.rb +113 -0
- data/lib/htree/template.rb +961 -0
- data/lib/htree/text.rb +115 -0
- data/lib/htree/traverse.rb +497 -0
- data/test-all.rb +5 -0
- data/test/assign.html +1 -0
- data/test/template.html +4 -0
- data/test/test-attr.rb +67 -0
- data/test/test-charset.rb +79 -0
- data/test/test-context.rb +29 -0
- data/test/test-display_xml.rb +45 -0
- data/test/test-elem-new.rb +101 -0
- data/test/test-encoder.rb +53 -0
- data/test/test-equality.rb +55 -0
- data/test/test-extract_text.rb +18 -0
- data/test/test-gencode.rb +27 -0
- data/test/test-leaf.rb +25 -0
- data/test/test-loc.rb +60 -0
- data/test/test-namespace.rb +147 -0
- data/test/test-output.rb +133 -0
- data/test/test-parse.rb +115 -0
- data/test/test-raw_string.rb +17 -0
- data/test/test-rexml.rb +70 -0
- data/test/test-scan.rb +153 -0
- data/test/test-security.rb +37 -0
- data/test/test-subnode.rb +142 -0
- data/test/test-template.rb +313 -0
- data/test/test-text.rb +43 -0
- data/test/test-traverse.rb +69 -0
- metadata +166 -0
- metadata.gz.sig +1 -0
@@ -0,0 +1,217 @@
|
|
1
|
+
require 'iconv'
|
2
|
+
|
3
|
+
module HTree
|
4
|
+
class Encoder
|
5
|
+
# HTree::Encoder.internal_charset returns the MIME charset name assumed
# for strings handled inside HTree.
#
# When the Encoding constant is defined, the name of
# Encoding.default_external is returned.  Otherwise the charset is looked
# up from $KCODE:
#
# - 'ISO-8859-1' when $KCODE=='NONE'
# - 'UTF-8' when $KCODE=='UTF8'
# - 'EUC-JP' when $KCODE=='EUC'
# - 'Shift_JIS' when $KCODE=='SJIS'
#
# The $KCODE mapping ignores EUC-KR and the various single byte charsets
# other than ISO-8859-1, at least.
# This should be fixed when Ruby is m17nized.
def Encoder.internal_charset
  if Object.const_defined? :Encoding
    Encoding.default_external.name
  else
    # Returns nil when $KCODE has no entry in KcodeCharset.
    KcodeCharset[$KCODE]
  end
end
|
21
|
+
|
22
|
+
# Creates an encoder that converts text from internal_encoding to
# output_encoding and accumulates the converted bytes in a buffer.
#
# output_encoding::   charset name passed to Iconv as the target.
# internal_encoding:: charset name of the strings given to the encoder;
#                     defaults to HTree::Encoder.internal_charset.
#
# One extra Iconv converter is created per entry of
# SubCharset[output_encoding].  output_string and finish drop a converter
# as soon as its result differs from the main conversion, so the ones
# remaining at the end name charsets that encode the whole output
# identically (see minimal_charset).
def initialize(output_encoding, internal_encoding=HTree::Encoder.internal_charset)
  @buf = ''
  @internal_encoding = internal_encoding
  @output_encoding = output_encoding
  @ic = Iconv.new(output_encoding, @internal_encoding)
  # NOTE(review): @charpat is stored but not read by any method visible
  # in this file.
  @charpat = FirstCharPattern[internal_encoding]
  @subcharset_list = SubCharset[output_encoding] || []
  @subcharset_ic = {}
  @subcharset_list.each {|subcharset|
    @subcharset_ic[subcharset] = Iconv.new(subcharset, @internal_encoding)
  }
  # XML style output is the default.
  @html_output = false
end
|
35
|
+
|
36
|
+
# :stopdoc:
|
37
|
+
# Returns the HTML output flag: false after initialize, otherwise
# whatever was last assigned through html_output=.
def html_output?
  @html_output
end
|
40
|
+
|
41
|
+
# Sets the HTML output flag.  The flag is consulted by
# output_cdata_content_do, output_slash_if_xml and output_cdata_content.
def html_output=(flag)
  @html_output = flag
end
|
44
|
+
|
45
|
+
# Runs the callables that emit the content of a CDATA element.
#
# In HTML mode pre, body and post are called in that order; post
# receives out as its argument.  Otherwise only body is called.
#
# Returns out in both cases.
def output_cdata_content_do(out, pre, body, post)
  if @html_output
    pre.call
    body.call
    post.call(out)
  else
    body.call
  end
  out
end
|
55
|
+
|
56
|
+
# Outputs a '/' unless the encoder is in HTML mode.
def output_slash_if_xml
  output_string('/') unless @html_output
end
|
61
|
+
|
62
|
+
# Outputs the children of a CDATA element.
#
# content:: collection of child nodes.
# context:: passed through to each node's output method.
#
# In HTML mode only the HTree::Text nodes of content are kept; they are
# concatenated and written with Text#output_cdata.  Otherwise every node
# is written with its own output method.
def output_cdata_content(content, context)
  if @html_output
    # xxx: should raise an error for non-text node?
    texts = content.grep(HTree::Text)
    text = HTree::Text.concat(*texts)
    text.output_cdata(self)
  else
    content.each {|n| n.output(self, context) }
  end
end
|
72
|
+
|
73
|
+
# Joins args into one string and outputs it without escaping.
#
# Raises ArgumentError when the joined string contains '</'.
def output_cdata_for_html(*args)
  str = args.join('')
  if %r{</} =~ str
    raise ArgumentError, "cdata contains '</' : #{str.inspect}"
  end
  output_string str
end
|
80
|
+
|
81
|
+
# Appends external_str to the output buffer.
#
# internal_str:: the text in the internal encoding.
# external_str:: the same text already converted to the output encoding;
#                defaults to converting internal_str with the main
#                converter.
#
# Every subcharset converter that fails on internal_str, or converts it
# to something other than external_str, is discarded.
#
# Returns nil.
def output_string(internal_str, external_str=@ic.iconv(internal_str))
  # xxx: should be fixed Ruby itself
  if @buf.empty? && @buf.respond_to?(:force_encoding)
    @buf.force_encoding(external_str.encoding)
  end
  @buf << external_str
  @subcharset_ic.reject! {|subcharset, ic|
    begin
      ic.iconv(internal_str) != external_str
    rescue Iconv::Failure
      true
    end
  }
  nil
end
|
93
|
+
|
94
|
+
# Converts string to the output encoding and outputs it, substituting
# whatever the converter rejects.
#
# When the main converter raises Iconv::IllegalSequence or
# Iconv::InvalidCharacter, the part converted so far is output, the
# first character of the failed part is replaced, and conversion is
# retried on the remainder:
#
# - if no first character can be matched, the first element of the
#   failed part is dropped and '?' is output;
# - otherwise the character is output as a numeric character reference
#   ("&#NNN;") built from its code obtained via a conversion to UTF-8;
# - if that conversion fails too, '?' is output.
#
# Each retry works on a strictly shorter string.
def output_text(string)
  begin
    output_string string, @ic.iconv(string)
  rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => e
    success = e.success
    # Flush the prefix that was converted before the failure.
    output_string string[0, string.length - e.failed.length], success
    unless /\A./m =~ e.failed
      # xxx: should be configurable?
      #raise ArgumentError, "cannot extract first character: #{e.failed.dump}"
      string = e.failed[1, e.failed.length-1]
      output_string '?'
      retry
    end
    # $& and $' come from the match on e.failed just above.
    char = $&
    rest = $'
    begin
      ucode = Iconv.conv("UTF-8", @internal_encoding, char).unpack("U")[0]
      char = "&##{ucode};"
    rescue Iconv::IllegalSequence, Iconv::InvalidCharacter
      # xxx: should be configurable?
      char = '?'
    end
    output_string char
    string = rest
    retry
  end
end
|
121
|
+
|
122
|
+
# Maps each character that must be escaped in dynamic text or attribute
# values to the character reference emitted in its place.  Used by
# output_dynamic_text and output_dynamic_attvalue.
ChRef = {
  '&' => '&amp;',
  '<' => '&lt;',
  '>' => '&gt;',
  '"' => '&quot;',
}
|
128
|
+
|
129
|
+
# Outputs a dynamically supplied value as element text.
#
# An object responding to rcdata contributes its rcdata with only '<'
# and '>' replaced; '&' is left alone in that case.  Any other object is
# converted with to_s and has '&', '<' and '>' replaced.  Replacements
# come from ChRef.
def output_dynamic_text(string)
  if string.respond_to? :rcdata
    output_text(string.rcdata.gsub(/[<>]/) { ChRef[$&] })
  else
    output_text(string.to_s.gsub(/[&<>]/) { ChRef[$&] })
  end
end
|
136
|
+
|
137
|
+
# Outputs a dynamically supplied value as an attribute value.
#
# Works like output_dynamic_text but additionally replaces '"':
# an object responding to rcdata has '<', '>' and '"' replaced in its
# rcdata; any other object is converted with to_s and has '&', '<', '>'
# and '"' replaced.  Replacements come from ChRef.
def output_dynamic_attvalue(string)
  if string.respond_to? :rcdata
    output_text(string.rcdata.gsub(/[<>"]/) { ChRef[$&] })
  else
    output_text(string.to_s.gsub(/[&<>"]/) { ChRef[$&] })
  end
end
|
144
|
+
|
145
|
+
# :startdoc:
|
146
|
+
|
147
|
+
# Closes the main converter, appends whatever it returns on close to the
# buffer, and returns the buffer.
#
# Subcharset converters that fail on close, or return something other
# than the main converter did, are discarded.
def finish
  external_str = @ic.close
  @buf << external_str
  @subcharset_ic.reject! {|subcharset, ic|
    begin
      ic.close != external_str
    rescue Iconv::Failure
      true
    end
  }
  @buf
end
|
159
|
+
|
160
|
+
# Finishes the output and returns it prefixed with an XML declaration.
#
# The declaration names minimal_charset as the encoding and is itself
# converted from US-ASCII to the output encoding.  finish runs first, so
# minimal_charset sees the subcharset converters that survived the whole
# output.
def finish_with_xmldecl
  content = finish
  xmldecl = Iconv.conv(@output_encoding, 'US-ASCII',
    "<?xml version=\"1.0\" encoding=\"#{minimal_charset}\"?>")
  xmldecl + content
end
|
166
|
+
|
167
|
+
# Returns the first charset in the subcharset list whose converter has
# not been discarded, or the output encoding when none remains.
def minimal_charset
  @subcharset_list.each {|subcharset|
    return subcharset if @subcharset_ic.include? subcharset
  }
  @output_encoding
end
|
175
|
+
|
176
|
+
# :stopdoc:
|
177
|
+
|
178
|
+
# Maps a $KCODE value to the MIME charset name used for it.
# Read by Encoder.internal_charset when Encoding is not defined.
KcodeCharset = {
  'EUC' => 'EUC-JP',
  'SJIS' => 'Shift_JIS',
  'UTF8' => 'UTF-8',
  'NONE' => 'ISO-8859-1',
}
|
184
|
+
|
185
|
+
# Maps a charset name to a byte-oriented pattern (/n) anchored at the
# start of the string that matches one leading character in that
# charset.  Each alternative of a pattern spells out one allowed byte
# sequence.
FirstCharPattern = {
  'EUC-JP' => /\A(?:
     [\x00-\x7f]
    |[\xa1-\xfe][\xa1-\xfe]
    |\x8e[\xa1-\xfe]
    |\x8f[\xa1-\xfe][\xa1-\xfe])/nx,
  'Shift_JIS' => /\A(?:
     [\x00-\x7f]
    |[\x81-\x9f][\x40-\x7e\x80-\xfc]
    |[\xa1-\xdf]
    |[\xe0-\xfc][\x40-\x7e\x80-\xfc])/nx,
  'UTF-8' => /\A(?:
     [\x00-\x7f]
    |[\xc0-\xdf][\x80-\xbf]
    |[\xe0-\xef][\x80-\xbf][\x80-\xbf]
    |[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]
    |[\xf8-\xfb][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf]
    |[\xfc-\xfd][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf])/nx,
  'ISO-8859-1' => /\A[\x00-\xff]/n
}
|
205
|
+
|
206
|
+
# Maps an output charset to the list of charsets tried, in order, as a
# smaller charset able to represent the same output (see
# minimal_charset).  Charsets without an entry get ['US-ASCII'] through
# the hash default.
SubCharset = {
  'ISO-2022-JP-2' => ['US-ASCII', 'ISO-2022-JP'],
  'ISO-2022-JP-3' => ['US-ASCII', 'ISO-2022-JP'],
  'UTF-16BE' => [],
  'UTF-16LE' => [],
  'UTF-16' => [],
}
SubCharset.default = ['US-ASCII']
|
214
|
+
|
215
|
+
# :startdoc:
|
216
|
+
end
|
217
|
+
end
|
@@ -0,0 +1,219 @@
|
|
1
|
+
require 'htree/doc'
|
2
|
+
require 'htree/elem'
|
3
|
+
require 'htree/leaf'
|
4
|
+
require 'htree/tag'
|
5
|
+
require 'htree/raw_string'
|
6
|
+
require 'htree/context'
|
7
|
+
|
8
|
+
module HTree
|
9
|
+
# compare tree structures.
#
# Two nodes are equal when check_equality succeeds on them using their
# usual_equal_object representations.  eql? is the same method, so it
# stays consistent with hash, which is also derived from
# usual_equal_object.
def ==(other)
  check_equality(self, other, :usual_equal_object)
end
alias eql? ==
|
14
|
+
|
15
|
+
# hash value for the tree structure.
#
# Computed from usual_equal_object on first use and cached in
# @hash_code afterwards.
def hash
  return @hash_code if defined? @hash_code
  @hash_code = usual_equal_object.hash
end
|
20
|
+
|
21
|
+
# :stopdoc:
|
22
|
+
|
23
|
+
# Returns the object that represents this node in == comparisons.
# Built by make_usual_equal_object on first use, then cached (a nil
# result is cached too, because the check uses defined?).
def usual_equal_object
  return @usual_equal_object if defined? @usual_equal_object
  @usual_equal_object = make_usual_equal_object
end
|
27
|
+
|
28
|
+
# Builds the representation used by usual_equal_object.  This default
# always raises NotImplementedError; classes that take part in
# comparisons define their own version.
def make_usual_equal_object
  raise NotImplementedError
end
|
31
|
+
|
32
|
+
# Returns the object that represents this node in exact_equal?
# comparisons.  Built by make_exact_equal_object on first use, then
# cached (a nil result is cached too, because the check uses defined?).
def exact_equal_object
  return @exact_equal_object if defined? @exact_equal_object
  @exact_equal_object = make_exact_equal_object
end
|
36
|
+
|
37
|
+
# Builds the representation used by exact_equal_object.  This default
# always raises NotImplementedError; classes that take part in
# comparisons define their own version.
def make_exact_equal_object
  raise NotImplementedError
end
|
40
|
+
|
41
|
+
# Compares this node with other using the exact_equal_object
# representations instead of the usual_equal_object ones used by ==.
def exact_equal?(other)
  check_equality(self, other, :exact_equal_object)
end
|
44
|
+
|
45
|
+
# Recursively compares obj1 and obj2.
#
# equal_object_method:: name of the method (:usual_equal_object or
#                       :exact_equal_object) that yields the
#                       representation to compare.
#
# - Objects of different classes are never equal.
# - Arrays are equal when they have the same length and all elements at
#   the same index are equal under this method.
# - An object responding to equal_object_method is compared through the
#   representations returned by that method.
# - Anything else is compared with ==.
def check_equality(obj1, obj2, equal_object_method)
  return false unless obj1.class == obj2.class
  if obj1.class == Array
    return false unless obj1.length == obj2.length
    # Compare the element classes first, before recursing into any of
    # the elements.
    obj1.each_with_index {|c1, i|
      return false unless c1.class == obj2[i].class
    }
    obj1.each_with_index {|c1, i|
      return false unless check_equality(c1, obj2[i], equal_object_method)
    }
    true
  elsif obj1.respond_to? equal_object_method
    o1 = obj1.send(equal_object_method)
    o2 = obj2.send(equal_object_method)
    check_equality(o1, o2, equal_object_method)
  else
    obj1 == obj2
  end
end
|
64
|
+
|
65
|
+
class Doc
  # A document is compared through its children in both comparison
  # modes.  The aliases replace the memoizing accessors, so children is
  # returned directly.
  alias exact_equal_object children
  alias usual_equal_object children
end
|
69
|
+
|
70
|
+
class Elem
  # Exact comparison covers the start tag, the children, the empty flag
  # and the end tag.
  def make_exact_equal_object
    [@stag, @children, @empty, @etag]
  end

  # Usual comparison covers only the start tag and the children.
  def make_usual_equal_object
    [@stag, @children]
  end
end
|
79
|
+
|
80
|
+
class Name
  # Exact comparison covers the prefix, the namespace URI and the local
  # name.
  def make_exact_equal_object
    [@namespace_prefix, @namespace_uri, @local_name]
  end

  # Usual comparison ignores the prefix: a name for which xmlns? is true
  # is represented by its local name alone, any other name by the
  # namespace URI and the local name.
  def make_usual_equal_object
    xmlns? ? @local_name : [@namespace_uri, @local_name]
  end
end
|
89
|
+
|
90
|
+
module Util
  module_function

  # Three-way comparison that also accepts nil on either side.
  #
  # nil sorts before every non-nil value and equals another nil; two
  # non-nil values are compared with <=>.
  #
  # Returns 0, -1 or 1 when nil is involved, otherwise the result of
  # a <=> b.
  def cmp_with_nil(a, b)
    return(b.nil? ? 0 : -1) if a.nil?
    return 1 if b.nil?
    a <=> b
  end
end
|
108
|
+
|
109
|
+
class Context
  # Represents the context as a list of [prefix, value] pairs taken from
  # @namespaces, ordered by prefix with a nil prefix first.
  def make_exact_equal_object
    @namespaces.keys.sort {|prefix1, prefix2|
      Util.cmp_with_nil(prefix1, prefix2)
    }.map {|prefix| [prefix, @namespaces[prefix]] }
  end

  # make_usual_equal_object is not used through STag#make_usual_equal_object
  # NotImplementedError is suitable?
  alias make_usual_equal_object make_exact_equal_object
end
|
120
|
+
|
121
|
+
class STag
  # Exact comparison covers the raw string, the name, all attributes and
  # the inherited context.  Attributes are ordered by the prefix, the
  # namespace URI and the local name of their names (nil first) so that
  # attribute order does not matter.
  def make_exact_equal_object
    [@raw_string,
     @name,
     @attributes.sort {|(n1, _t1), (n2, _t2)|
       Util.cmp_with_nil(n1.namespace_prefix, n2.namespace_prefix).nonzero? ||
       Util.cmp_with_nil(n1.namespace_uri, n2.namespace_uri).nonzero? ||
       Util.cmp_with_nil(n1.local_name, n2.local_name)
     },
     @inherited_context
    ]
  end

  # Usual comparison covers the name and the attributes whose names are
  # not xmlns?, ordered the same way as for the exact comparison.
  def make_usual_equal_object
    [@name,
     @attributes.find_all {|n, _t| !n.xmlns? }.sort {|(n1, _t1), (n2, _t2)|
       Util.cmp_with_nil(n1.namespace_prefix, n2.namespace_prefix).nonzero? ||
       Util.cmp_with_nil(n1.namespace_uri, n2.namespace_uri).nonzero? ||
       Util.cmp_with_nil(n1.local_name, n2.local_name)
     }
    ]
  end

end
|
145
|
+
|
146
|
+
class ETag
  # Exact comparison covers the raw string and the qualified name.
  def make_exact_equal_object
    [@raw_string, @qualified_name]
  end

  # Usual comparison uses the qualified name only.
  alias usual_equal_object qualified_name
end
|
153
|
+
|
154
|
+
class Text
  # Exact comparison covers the raw string and the rcdata.
  def make_exact_equal_object
    [@raw_string, @rcdata]
  end

  # Usual comparison uses the normalized rcdata only.
  def make_usual_equal_object
    @normalized_rcdata
  end
end
|
163
|
+
|
164
|
+
class XMLDecl
  # Exact comparison covers the raw string in addition to the parsed
  # fields.
  def make_exact_equal_object
    [@raw_string, @version, @encoding, @standalone]
  end

  # Usual comparison covers the version, encoding and standalone fields.
  def make_usual_equal_object
    [@version, @encoding, @standalone]
  end
end
|
173
|
+
|
174
|
+
class DocType
  # Exact comparison covers the raw string in addition to the parsed
  # fields.
  def make_exact_equal_object
    [@raw_string, @root_element_name, @system_identifier, @public_identifier]
  end

  # Usual comparison covers the root element name and the system and
  # public identifiers.
  def make_usual_equal_object
    [@root_element_name, @system_identifier, @public_identifier]
  end
end
|
183
|
+
|
184
|
+
class ProcIns
  # Exact comparison covers the raw string in addition to the target and
  # the content.
  def make_exact_equal_object
    [@raw_string, @target, @content]
  end

  # Usual comparison covers the target and the content.
  def make_usual_equal_object
    [@target, @content]
  end
end
|
193
|
+
|
194
|
+
class Comment
  # Exact comparison covers the raw string and the content.
  def make_exact_equal_object
    [@raw_string, @content]
  end

  # Usual comparison uses the content only.
  alias usual_equal_object content
end
|
201
|
+
|
202
|
+
class BogusETag
  # A bogus end tag is represented by its end tag alone.
  def make_exact_equal_object
    [@etag]
  end

  # The usual representation is the same as the exact one; the alias
  # bypasses the memoizing usual_equal_object.
  alias usual_equal_object make_exact_equal_object
end
|
209
|
+
|
210
|
+
class Location
  # A location is represented by its parent, its index and its node.
  def make_exact_equal_object
    [@parent, @index, @node]
  end

  # The usual representation is the same as the exact one; the alias
  # bypasses the memoizing usual_equal_object.
  alias usual_equal_object make_exact_equal_object
end
|
217
|
+
|
218
|
+
# :startdoc:
|
219
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'htree/text'
|
2
|
+
require 'htree/doc'
|
3
|
+
require 'htree/elem'
|
4
|
+
|
5
|
+
module HTree
|
6
|
+
module Node
  # Extracts the text of the node.  This default always raises
  # NotImplementedError; Container, Leaf, Text and Location provide
  # their own versions.
  def extract_text
    raise NotImplementedError
  end
end
|
11
|
+
|
12
|
+
class Location
  # Returns the extracted text of the node obtained with to_node.
  def extract_text
    to_node.extract_text
  end
end
|
17
|
+
|
18
|
+
# :stopdoc:
|
19
|
+
module Container
  # Returns the concatenation (Text.concat) of the texts extracted from
  # each child, in order.
  def extract_text
    Text.concat(*@children.map {|n| n.extract_text })
  end
end
|
24
|
+
|
25
|
+
module Leaf
  # A leaf contributes no text: returns a new Text built from the empty
  # string.
  def extract_text
    Text.new('')
  end
end
|
30
|
+
|
31
|
+
class Text
  # A text node is its own extracted text: returns self.
  def extract_text
    self
  end
end
|
36
|
+
# :startdoc:
|
37
|
+
end
|