htree 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. data.tar.gz.sig +4 -0
  2. data/Makefile +20 -0
  3. data/Manifest +58 -0
  4. data/README +61 -0
  5. data/Rakefile +37 -0
  6. data/htree.gemspec +32 -0
  7. data/init.rb +1 -0
  8. data/install.rb +112 -0
  9. data/lib/htree.rb +97 -0
  10. data/lib/htree/container.rb +8 -0
  11. data/lib/htree/context.rb +69 -0
  12. data/lib/htree/display.rb +46 -0
  13. data/lib/htree/doc.rb +149 -0
  14. data/lib/htree/elem.rb +262 -0
  15. data/lib/htree/encoder.rb +217 -0
  16. data/lib/htree/equality.rb +219 -0
  17. data/lib/htree/extract_text.rb +37 -0
  18. data/lib/htree/fstr.rb +32 -0
  19. data/lib/htree/gencode.rb +193 -0
  20. data/lib/htree/htmlinfo.rb +672 -0
  21. data/lib/htree/inspect.rb +108 -0
  22. data/lib/htree/leaf.rb +92 -0
  23. data/lib/htree/loc.rb +369 -0
  24. data/lib/htree/modules.rb +49 -0
  25. data/lib/htree/name.rb +122 -0
  26. data/lib/htree/output.rb +212 -0
  27. data/lib/htree/parse.rb +410 -0
  28. data/lib/htree/raw_string.rb +127 -0
  29. data/lib/htree/regexp-util.rb +19 -0
  30. data/lib/htree/rexml.rb +131 -0
  31. data/lib/htree/scan.rb +176 -0
  32. data/lib/htree/tag.rb +113 -0
  33. data/lib/htree/template.rb +961 -0
  34. data/lib/htree/text.rb +115 -0
  35. data/lib/htree/traverse.rb +497 -0
  36. data/test-all.rb +5 -0
  37. data/test/assign.html +1 -0
  38. data/test/template.html +4 -0
  39. data/test/test-attr.rb +67 -0
  40. data/test/test-charset.rb +79 -0
  41. data/test/test-context.rb +29 -0
  42. data/test/test-display_xml.rb +45 -0
  43. data/test/test-elem-new.rb +101 -0
  44. data/test/test-encoder.rb +53 -0
  45. data/test/test-equality.rb +55 -0
  46. data/test/test-extract_text.rb +18 -0
  47. data/test/test-gencode.rb +27 -0
  48. data/test/test-leaf.rb +25 -0
  49. data/test/test-loc.rb +60 -0
  50. data/test/test-namespace.rb +147 -0
  51. data/test/test-output.rb +133 -0
  52. data/test/test-parse.rb +115 -0
  53. data/test/test-raw_string.rb +17 -0
  54. data/test/test-rexml.rb +70 -0
  55. data/test/test-scan.rb +153 -0
  56. data/test/test-security.rb +37 -0
  57. data/test/test-subnode.rb +142 -0
  58. data/test/test-template.rb +313 -0
  59. data/test/test-text.rb +43 -0
  60. data/test/test-traverse.rb +69 -0
  61. metadata +166 -0
  62. metadata.gz.sig +1 -0
@@ -0,0 +1,217 @@
1
+ require 'iconv'
2
+
3
+ module HTree
4
+ class Encoder
5
# HTree::Encoder.internal_charset returns the MIME charset name used as the
# default internal encoding.
#
# When the Encoding class is defined, the result is the name of
# Encoding.default_external.  Otherwise the charset is looked up from $KCODE:
#
# - 'ISO-8859-1' when $KCODE=='NONE'
# - 'UTF-8' when $KCODE=='UTF8'
# - 'EUC-JP' when $KCODE=='EUC'
# - 'Shift_JIS' when $KCODE=='SJIS'
#
# The $KCODE mapping ignores EUC-KR and the various single byte charsets
# other than ISO-8859-1.
def Encoder.internal_charset
  return KcodeCharset[$KCODE] unless Object.const_defined?(:Encoding)
  Encoding.default_external.name
end
21
+
22
# Creates an encoder which converts text from +internal_encoding+ to
# +output_encoding+ with Iconv.
#
# Besides the main converter, one Iconv converter is created for every
# charset that SubCharset lists for +output_encoding+; minimal_charset
# later checks which of them are still registered.  HTML output mode
# starts disabled.
def initialize(output_encoding, internal_encoding=HTree::Encoder.internal_charset)
  @html_output = false
  @buf = ''
  @output_encoding = output_encoding
  @internal_encoding = internal_encoding
  @charpat = FirstCharPattern[internal_encoding]
  @ic = Iconv.new(output_encoding, @internal_encoding)
  @subcharset_list = SubCharset[output_encoding] || []
  @subcharset_ic = {}
  @subcharset_list.each do |name|
    @subcharset_ic[name] = Iconv.new(name, @internal_encoding)
  end
end
35
+
36
+ # :stopdoc:
37
# Returns the HTML output flag: the value last stored through
# html_output=, which the constructor initializes to false.
def html_output?
  @html_output
end
40
+
41
# Stores +flag+ as the HTML output flag.  The other output_* methods of
# this class branch on it to choose between HTML and XML style output.
def html_output=(flag)
  @html_output = flag
end
44
+
45
# Runs the callbacks for a CDATA content section and returns +out+.
#
# In HTML output mode +pre+ and +body+ are called without arguments and
# then +post+ is called with +out+.  Otherwise only +body+ is called.
def output_cdata_content_do(out, pre, body, post)
  unless @html_output
    body.call
    return out
  end
  pre.call
  body.call
  post.call(out)
  out
end
55
+
56
# Outputs a '/' through output_string unless HTML output mode is enabled.
def output_slash_if_xml
  output_string('/') unless @html_output
end
61
+
62
# Outputs the nodes of +content+.
#
# Outside HTML output mode every node is asked to output itself with
# +context+.  In HTML output mode only the HTree::Text nodes are used:
# they are concatenated and written with Text#output_cdata.
def output_cdata_content(content, context)
  return content.each {|node| node.output(self, context) } unless @html_output
  # xxx: should raise an error for non-text node?
  text_nodes = content.grep(HTree::Text)
  HTree::Text.concat(*text_nodes).output_cdata(self)
end
72
+
73
# Joins +args+ into one string and outputs it with output_string.
#
# Raises ArgumentError when the joined string contains '</'.
def output_cdata_for_html(*args)
  str = args.join('')
  raise ArgumentError, "cdata contains '</' : #{str.inspect}" if str =~ %r{</}
  output_string str
end
80
+
81
# Appends +external_str+, the converted form of +internal_str+, to the
# output buffer.  By default +external_str+ is produced by the main Iconv
# converter.
#
# Every subcharset converter which fails on +internal_str+, or which
# produces something different from +external_str+, is unregistered.
# Always returns nil.
def output_string(internal_str, external_str=@ic.iconv(internal_str))
  # xxx: should be fixed Ruby itself
  if @buf.respond_to?(:force_encoding) && @buf.empty?
    @buf.force_encoding(external_str.encoding)
  end
  @buf << external_str
  @subcharset_ic.delete_if do |_name, converter|
    begin
      converter.iconv(internal_str) != external_str
    rescue Iconv::Failure
      true
    end
  end
  nil
end
93
+
94
# Converts +string+ and appends it to the output.
#
# When Iconv stops at a character it cannot convert, the part converted so
# far is output, the first character of the unconverted rest is replaced by
# a numeric character reference (or by '?' when no character or code point
# can be obtained), and the conversion is retried with what follows.
def output_text(string)
  begin
    output_string string, @ic.iconv(string)
  rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => e
    failed = e.failed
    # Output the prefix which was converted before the failure.
    output_string string[0, string.length - failed.length], e.success
    if /\A./m =~ failed
      char = $&
      rest = $'
      begin
        ucode = Iconv.conv("UTF-8", @internal_encoding, char).unpack("U")[0]
        char = "&##{ucode};"
      rescue Iconv::IllegalSequence, Iconv::InvalidCharacter
        # xxx: should be configurable?
        char = '?'
      end
      output_string char
      string = rest
    else
      # xxx: should be configurable?
      #raise ArgumentError, "cannot extract first character: #{e.failed.dump}"
      string = failed[1, failed.length-1]
      output_string '?'
    end
    retry
  end
end
121
+
122
# Character references which output_dynamic_text and
# output_dynamic_attvalue substitute for markup characters.
ChRef = {
  '&' => '&amp;',
  '<' => '&lt;',
  '>' => '&gt;',
  '"' => '&quot;',
}
128
+
129
# Escapes +string+ for use as text content and outputs it with output_text.
#
# An object responding to +rcdata+ supplies its rcdata, in which only '<'
# and '>' are replaced.  Anything else is converted with to_s and '&',
# '<' and '>' are replaced.
def output_dynamic_text(string)
  escaped =
    if string.respond_to?(:rcdata)
      string.rcdata.gsub(/[<>]/) {|ch| ChRef[ch] }
    else
      string.to_s.gsub(/[&<>]/) {|ch| ChRef[ch] }
    end
  output_text(escaped)
end
136
+
137
# Escapes +string+ for use as an attribute value and outputs it with
# output_text.
#
# An object responding to +rcdata+ supplies its rcdata, in which '<', '>'
# and '"' are replaced.  Anything else is converted with to_s and '&',
# '<', '>' and '"' are replaced.
def output_dynamic_attvalue(string)
  escaped =
    if string.respond_to?(:rcdata)
      string.rcdata.gsub(/[<>"]/) {|ch| ChRef[ch] }
    else
      string.to_s.gsub(/[&<>"]/) {|ch| ChRef[ch] }
    end
  output_text(escaped)
end
144
+
145
+ # :startdoc:
146
+
147
# Closes the main converter, appends whatever it returns to the output
# buffer and returns the buffer.
#
# Every subcharset converter whose close fails, or returns something
# different from the main converter, is unregistered.
def finish
  tail = @ic.close
  @buf << tail
  @subcharset_ic.delete_if do |_name, converter|
    begin
      converter.close != tail
    rescue Iconv::Failure
      true
    end
  end
  @buf
end
159
+
160
# Finishes the output and returns it prefixed with an XML declaration.
#
# The declaration names minimal_charset as the encoding and is itself
# converted from US-ASCII to the output encoding.  finish is called before
# minimal_charset is evaluated.
def finish_with_xmldecl
  content = finish
  declaration = %Q{<?xml version="1.0" encoding="#{minimal_charset}"?>}
  Iconv.conv(@output_encoding, 'US-ASCII', declaration) + content
end
166
+
167
# Returns the first charset of the subcharset list whose converter is still
# registered, or the output encoding when none is left.
def minimal_charset
  @subcharset_list.find {|name| @subcharset_ic.include?(name) } || @output_encoding
end
175
+
176
+ # :stopdoc:
177
+
178
# MIME charset name for each $KCODE value; read by
# Encoder.internal_charset when the Encoding class is not defined.
KcodeCharset = {
  'EUC'  => 'EUC-JP',
  'SJIS' => 'Shift_JIS',
  'UTF8' => 'UTF-8',
  'NONE' => 'ISO-8859-1',
}
184
+
185
# Byte-oriented patterns, keyed by charset name, which match one character
# at the beginning of a string.
FirstCharPattern = {
  'EUC-JP' => /\A(?:
      [\x00-\x7f]
     |[\xa1-\xfe][\xa1-\xfe]
     |\x8e[\xa1-\xfe]
     |\x8f[\xa1-\xfe][\xa1-\xfe])/nx,
  'Shift_JIS' => /\A(?:
      [\x00-\x7f]
     |[\x81-\x9f][\x40-\x7e\x80-\xfc]
     |[\xa1-\xdf]
     |[\xe0-\xfc][\x40-\x7e\x80-\xfc])/nx,
  'UTF-8' => /\A(?:
      [\x00-\x7f]
     |[\xc0-\xdf][\x80-\xbf]
     |[\xe0-\xef][\x80-\xbf][\x80-\xbf]
     |[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]
     |[\xf8-\xfb][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf]
     |[\xfc-\xfd][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf])/nx,
  'ISO-8859-1' => /\A[\x00-\xff]/n
}
205
+
206
# Charsets tried as smaller alternatives to an output encoding, in order of
# preference.  Output encodings which are not listed get ['US-ASCII']
# through the hash default.
SubCharset = {
  'ISO-2022-JP-2' => ['US-ASCII', 'ISO-2022-JP'],
  'ISO-2022-JP-3' => ['US-ASCII', 'ISO-2022-JP'],
  'UTF-16BE'      => [],
  'UTF-16LE'      => [],
  'UTF-16'        => [],
}
SubCharset.default = ['US-ASCII']
214
+
215
+ # :startdoc:
216
+ end
217
+ end
@@ -0,0 +1,219 @@
1
+ require 'htree/doc'
2
+ require 'htree/elem'
3
+ require 'htree/leaf'
4
+ require 'htree/tag'
5
+ require 'htree/raw_string'
6
+ require 'htree/context'
7
+
8
+ module HTree
9
# Compares tree structures.
#
# Two objects are equal when check_equality accepts the objects returned
# by their usual_equal_object methods.  eql? is the same comparison.
def ==(other)
  check_equality(self, other, :usual_equal_object)
end
alias eql? ==
14
+
15
# Hash value for the tree structure.
#
# It is the hash of usual_equal_object, computed on the first call and
# kept in @hash_code afterwards.
def hash
  @hash_code = usual_equal_object.hash unless defined? @hash_code
  @hash_code
end
20
+
21
+ # :stopdoc:
22
+
23
# Returns the object used for == comparison.  It is built by
# make_usual_equal_object on the first call and memoized, even when the
# built value is nil or false.
def usual_equal_object
  @usual_equal_object = make_usual_equal_object unless defined? @usual_equal_object
  @usual_equal_object
end
27
+
28
# Builds the object used for == comparison.  This default always raises
# NotImplementedError; the node classes in this file define their own.
def make_usual_equal_object
  raise NotImplementedError
end
31
+
32
# Returns the object used for exact_equal? comparison.  It is built by
# make_exact_equal_object on the first call and memoized, even when the
# built value is nil or false.
def exact_equal_object
  @exact_equal_object = make_exact_equal_object unless defined? @exact_equal_object
  @exact_equal_object
end
36
+
37
# Builds the object used for exact_equal? comparison.  This default always
# raises NotImplementedError; the node classes in this file define their own.
def make_exact_equal_object
  raise NotImplementedError
end
40
+
41
# Like ==, but the comparison is made on the objects returned by
# exact_equal_object instead of usual_equal_object.
def exact_equal?(other)
  check_equality(self, other, :exact_equal_object)
end
44
+
45
# Recursively compares +obj1+ and +obj2+.
#
# Objects of different classes are never equal.  Arrays (instances of
# exactly Array) must have the same length; the classes of all element
# pairs are checked first and then each pair is compared recursively.
# Objects responding to +equal_object_method+ are compared through the
# objects that method returns.  Everything else is compared with ==.
def check_equality(obj1, obj2, equal_object_method)
  return false unless obj1.class == obj2.class
  if obj1.class == Array
    return false unless obj1.length == obj2.length
    pairs = obj1.zip(obj2)
    # Cheap class check over all pairs before any deep comparison.
    pairs.all? {|c1, c2| c1.class == c2.class } &&
      pairs.all? {|c1, c2| check_equality(c1, c2, equal_object_method) }
  elsif obj1.respond_to? equal_object_method
    check_equality(obj1.send(equal_object_method),
                   obj2.send(equal_object_method),
                   equal_object_method)
  else
    obj1 == obj2
  end
end
64
+
65
class Doc
  # Both kinds of equality compare documents through their children.
  alias_method :exact_equal_object, :children
  alias_method :usual_equal_object, :children
end
69
+
70
class Elem
  # Exact comparison uses the start tag, the children, the empty flag and
  # the end tag.
  def make_exact_equal_object
    [@stag, @children, @empty, @etag]
  end

  # Usual comparison uses the start tag and the children only.
  def make_usual_equal_object
    [@stag, @children]
  end
end
79
+
80
class Name
  # Exact comparison uses the prefix, the namespace URI and the local name.
  def make_exact_equal_object
    [@namespace_prefix, @namespace_uri, @local_name]
  end

  # Usual comparison ignores the prefix.  A name for which xmlns? is true is
  # compared by its local name alone.
  def make_usual_equal_object
    return @local_name if xmlns?
    [@namespace_uri, @local_name]
  end
end
89
+
90
module Util
  module_function

  # Three-way comparison which also accepts nil.
  #
  # nil sorts before every other value and two nils compare as equal (0).
  # When neither argument is nil the result is <tt>a <=> b</tt>.
  #
  # nil is detected with nil? rather than <tt>== nil</tt>, so the outcome
  # does not depend on how the arguments define ==.
  def cmp_with_nil(a, b)
    if a.nil?
      b.nil? ? 0 : -1
    elsif b.nil?
      1
    else
      a <=> b
    end
  end
end
108
+
109
class Context
  # Returns the [prefix, uri] pairs of @namespaces sorted by prefix, with a
  # nil prefix first.
  def make_exact_equal_object
    @namespaces.keys.sort do |prefix1, prefix2|
      Util.cmp_with_nil(prefix1, prefix2)
    end.map do |prefix|
      [prefix, @namespaces[prefix]]
    end
  end

  # make_usual_equal_object is not used through STag#make_usual_equal_object
  # NotImplementedError is suitable?
  alias_method :make_usual_equal_object, :make_exact_equal_object
end
120
+
121
class STag
  # Exact comparison uses the raw string, the name, all attributes sorted
  # by name and the inherited context.
  def make_exact_equal_object
    [@raw_string, @name, sort_attributes_for_equality(@attributes), @inherited_context]
  end

  # Usual comparison uses the name and the attributes whose name is not an
  # xmlns name, sorted by name.
  def make_usual_equal_object
    plain = @attributes.reject {|name, _text| name.xmlns? }
    [@name, sort_attributes_for_equality(plain)]
  end

  # Sorts [name, text] pairs by the prefix, then the namespace URI, then the
  # local name of the attribute name; nil sorts first in each step.
  def sort_attributes_for_equality(attributes)
    attributes.sort do |(n1, _t1), (n2, _t2)|
      Util.cmp_with_nil(n1.namespace_prefix, n2.namespace_prefix).nonzero? ||
        Util.cmp_with_nil(n1.namespace_uri, n2.namespace_uri).nonzero? ||
        Util.cmp_with_nil(n1.local_name, n2.local_name)
    end
  end
  private :sort_attributes_for_equality
end
145
+
146
class ETag
  # Exact comparison uses the raw string and the qualified name.
  def make_exact_equal_object
    [@raw_string, @qualified_name]
  end

  # Usual comparison uses the qualified name alone.
  alias_method :usual_equal_object, :qualified_name
end
153
+
154
class Text
  # Exact comparison uses the raw string and the rcdata.
  def make_exact_equal_object
    [@raw_string, @rcdata]
  end

  # Usual comparison uses the normalized rcdata.
  def make_usual_equal_object
    @normalized_rcdata
  end
end
163
+
164
class XMLDecl
  # Exact comparison uses the raw string in addition to the fields used by
  # the usual comparison.
  def make_exact_equal_object
    [@raw_string, @version, @encoding, @standalone]
  end

  # Usual comparison uses the version, the encoding and the standalone value.
  def make_usual_equal_object
    [@version, @encoding, @standalone]
  end
end
173
+
174
class DocType
  # Exact comparison uses the raw string in addition to the fields used by
  # the usual comparison.
  def make_exact_equal_object
    [@raw_string, @root_element_name, @system_identifier, @public_identifier]
  end

  # Usual comparison uses the root element name, the system identifier and
  # the public identifier.
  def make_usual_equal_object
    [@root_element_name, @system_identifier, @public_identifier]
  end
end
183
+
184
class ProcIns
  # Exact comparison uses the raw string, the target and the content.
  def make_exact_equal_object
    [@raw_string, @target, @content]
  end

  # Usual comparison uses the target and the content.
  def make_usual_equal_object
    [@target, @content]
  end
end
193
+
194
class Comment
  # Exact comparison uses the raw string and the content.
  def make_exact_equal_object
    [@raw_string, @content]
  end

  # Usual comparison uses the content alone.
  alias_method :usual_equal_object, :content
end
201
+
202
class BogusETag
  # Both comparisons use a one-element array holding the end tag.
  def make_exact_equal_object
    [@etag]
  end

  # usual_equal_object is aliased directly, so it returns a freshly built
  # array on every call.
  alias_method :usual_equal_object, :make_exact_equal_object
end
209
+
210
class Location
  # Both comparisons use the parent, the index and the node.
  def make_exact_equal_object
    [@parent, @index, @node]
  end

  # usual_equal_object is aliased directly, so it returns a freshly built
  # array on every call.
  alias_method :usual_equal_object, :make_exact_equal_object
end
217
+
218
+ # :startdoc:
219
+ end
@@ -0,0 +1,37 @@
1
+ require 'htree/text'
2
+ require 'htree/doc'
3
+ require 'htree/elem'
4
+
5
+ module HTree
6
module Node
  # Default extract_text; always raises NotImplementedError.  Container,
  # Leaf and Text in this file supply the working versions.
  def extract_text
    raise NotImplementedError
  end
end
11
+
12
class Location
  # Returns the text extracted from the node obtained with to_node.
  def extract_text
    node = to_node
    node.extract_text
  end
end
17
+
18
+ # :stopdoc:
19
module Container
  # Extracts the text of every child and joins the results with Text.concat.
  def extract_text
    texts = @children.map {|child| child.extract_text }
    Text.concat(*texts)
  end
end
24
+
25
module Leaf
  # A leaf contributes no text: returns a new Text built from ''.
  def extract_text
    Text.new('')
  end
end
30
+
31
class Text
  # A text node is its own extracted text: returns the receiver.
  def extract_text
    self
  end
end
36
+ # :startdoc:
37
+ end