htree 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data.tar.gz.sig +4 -0
  2. data/Makefile +20 -0
  3. data/Manifest +58 -0
  4. data/README +61 -0
  5. data/Rakefile +37 -0
  6. data/htree.gemspec +32 -0
  7. data/init.rb +1 -0
  8. data/install.rb +112 -0
  9. data/lib/htree.rb +97 -0
  10. data/lib/htree/container.rb +8 -0
  11. data/lib/htree/context.rb +69 -0
  12. data/lib/htree/display.rb +46 -0
  13. data/lib/htree/doc.rb +149 -0
  14. data/lib/htree/elem.rb +262 -0
  15. data/lib/htree/encoder.rb +217 -0
  16. data/lib/htree/equality.rb +219 -0
  17. data/lib/htree/extract_text.rb +37 -0
  18. data/lib/htree/fstr.rb +32 -0
  19. data/lib/htree/gencode.rb +193 -0
  20. data/lib/htree/htmlinfo.rb +672 -0
  21. data/lib/htree/inspect.rb +108 -0
  22. data/lib/htree/leaf.rb +92 -0
  23. data/lib/htree/loc.rb +369 -0
  24. data/lib/htree/modules.rb +49 -0
  25. data/lib/htree/name.rb +122 -0
  26. data/lib/htree/output.rb +212 -0
  27. data/lib/htree/parse.rb +410 -0
  28. data/lib/htree/raw_string.rb +127 -0
  29. data/lib/htree/regexp-util.rb +19 -0
  30. data/lib/htree/rexml.rb +131 -0
  31. data/lib/htree/scan.rb +176 -0
  32. data/lib/htree/tag.rb +113 -0
  33. data/lib/htree/template.rb +961 -0
  34. data/lib/htree/text.rb +115 -0
  35. data/lib/htree/traverse.rb +497 -0
  36. data/test-all.rb +5 -0
  37. data/test/assign.html +1 -0
  38. data/test/template.html +4 -0
  39. data/test/test-attr.rb +67 -0
  40. data/test/test-charset.rb +79 -0
  41. data/test/test-context.rb +29 -0
  42. data/test/test-display_xml.rb +45 -0
  43. data/test/test-elem-new.rb +101 -0
  44. data/test/test-encoder.rb +53 -0
  45. data/test/test-equality.rb +55 -0
  46. data/test/test-extract_text.rb +18 -0
  47. data/test/test-gencode.rb +27 -0
  48. data/test/test-leaf.rb +25 -0
  49. data/test/test-loc.rb +60 -0
  50. data/test/test-namespace.rb +147 -0
  51. data/test/test-output.rb +133 -0
  52. data/test/test-parse.rb +115 -0
  53. data/test/test-raw_string.rb +17 -0
  54. data/test/test-rexml.rb +70 -0
  55. data/test/test-scan.rb +153 -0
  56. data/test/test-security.rb +37 -0
  57. data/test/test-subnode.rb +142 -0
  58. data/test/test-template.rb +313 -0
  59. data/test/test-text.rb +43 -0
  60. data/test/test-traverse.rb +69 -0
  61. metadata +166 -0
  62. metadata.gz.sig +1 -0
@@ -0,0 +1,127 @@
1
+ require 'htree/modules'
2
+ require 'htree/fstr'
3
+
4
+ module HTree
5
module Node
  # raw_string returns a source string recorded by parsing.
  # It returns +nil+ if the node is constructed not via parsing.
  #
  # The text is collected by handing a fresh buffer to
  # +raw_string_internal+, which appends to it and throws
  # +:raw_string_tag+ as soon as a piece without recorded source is met.
  def raw_string
    buffer = ''.dup
    catch(:raw_string_tag) {
      # The return value of raw_string_internal is deliberately ignored:
      # implementations are only required to append to the buffer, and
      # some of them end with an iterator or a conditional whose value is
      # not the buffer.  Returning the buffer itself guarantees a String.
      raw_string_internal(buffer)
      return buffer
    }
    # Reached only when :raw_string_tag was thrown.
    nil
  end
end
15
+
16
+ # :stopdoc:
17
class Doc
  # Appends the recorded source text of every child, in order, to +result+
  # and returns +result+.
  #
  # A child without recorded source aborts the traversal by throwing
  # +:raw_string_tag+ (see the child implementations).
  def raw_string_internal(result)
    @children.each {|n|
      n.raw_string_internal(result)
    }
    # Array#each evaluates to the receiver; return the buffer instead so
    # the caller gets the accumulated string.
    result
  end
end
24
+
25
class Elem
  # Appends the recorded source of the start tag, of every child and, when
  # present, of the end tag to +result+; returns +result+.
  #
  # Any part lacking recorded source throws +:raw_string_tag+.
  def raw_string_internal(result)
    @stag.raw_string_internal(result)
    @children.each {|n| n.raw_string_internal(result) }
    @etag.raw_string_internal(result) if @etag
    # Without an end tag the modifier-if above evaluates to nil, so the
    # buffer is returned explicitly.
    result
  end
end
32
+
33
module Tag
  # Marks the tag as having no recorded source text.
  def init_raw_string
    @raw_string = nil
  end

  # Records +arg+ as the source text, stored via HTree.frozen_string.
  def raw_string=(arg)
    @raw_string = HTree.frozen_string(arg)
  end

  # Appends the recorded source text to +result+.
  # Throws +:raw_string_tag+ when no source text has been recorded.
  def raw_string_internal(result)
    throw :raw_string_tag unless @raw_string
    result << @raw_string
  end
end
41
+
42
module Leaf
  # Marks the leaf as having no recorded source text.
  def init_raw_string
    @raw_string = nil
  end

  # Records +arg+ as the source text, stored via HTree.frozen_string.
  def raw_string=(arg)
    @raw_string = HTree.frozen_string(arg)
  end

  # Appends the recorded source text to +result+.
  # Throws +:raw_string_tag+ when no source text has been recorded.
  def raw_string_internal(result)
    throw :raw_string_tag unless @raw_string
    result << @raw_string
  end
end
50
+
51
class Text
  # Records the source text of this text node.
  #
  # When +arg+ equals @rcdata the @rcdata object itself is stored, so no
  # second copy of the same text is kept; otherwise the inherited writer
  # handles it.
  def raw_string=(arg)
    return super unless arg == @rcdata
    @raw_string = @rcdata
  end
end
60
+ # :startdoc:
61
+
62
module Node
  # Fallback for node kinds that do not provide their own
  # eliminate_raw_string: always raises NotImplementedError.
  def eliminate_raw_string
    raise NotImplementedError
  end
end
67
+
68
+ # :stopdoc:
69
+
70
class Doc
  # Returns a new Doc built from the result of calling
  # eliminate_raw_string on each child.
  def eliminate_raw_string
    stripped_children = @children.map {|child| child.eliminate_raw_string }
    Doc.new(stripped_children)
  end
end
75
+
76
class Elem
  # Returns a new element, created through Elem.new!, from the results of
  # eliminate_raw_string on the start tag, the children and the end tag.
  #
  # The children argument is nil when @empty is set; the end tag argument
  # keeps the falsy value of @etag when there is no end tag.
  def eliminate_raw_string
    stag = @stag.eliminate_raw_string
    children = @empty ? nil : @children.map {|child| child.eliminate_raw_string }
    etag = @etag && @etag.eliminate_raw_string
    Elem.new!(stag, children, etag)
  end
end
84
+
85
class Text
  # Returns a text node created by Text.new_internal from @rcdata alone.
  def eliminate_raw_string
    rcdata = @rcdata
    Text.new_internal(rcdata)
  end
end
90
+
91
class STag
  # Returns a new STag constructed from the qualified name, the attributes
  # and the inherited context of this one.
  def eliminate_raw_string
    ctor_args = [@qualified_name, @attributes, @inherited_context]
    STag.new(*ctor_args)
  end
end
96
+
97
class ETag
  # Returns a new object of the receiver's own class (so subclasses are
  # preserved) constructed from the qualified name only.
  def eliminate_raw_string
    klass = self.class
    klass.new(@qualified_name)
  end
end
102
+
103
class XMLDecl
  # Returns a new XMLDecl constructed from the version, encoding and
  # standalone values of this one.
  def eliminate_raw_string
    ctor_args = [@version, @encoding, @standalone]
    XMLDecl.new(*ctor_args)
  end
end
108
+
109
class DocType
  # Returns a new DocType constructed from the root element name and the
  # public and system identifiers of this one.
  def eliminate_raw_string
    ctor_args = [@root_element_name, @public_identifier, @system_identifier]
    DocType.new(*ctor_args)
  end
end
114
+
115
class ProcIns
  # Returns a new ProcIns constructed from the target and content of this
  # one.
  def eliminate_raw_string
    target, content = @target, @content
    ProcIns.new(target, content)
  end
end
120
+
121
class Comment
  # Returns a new Comment constructed from the content of this one.
  def eliminate_raw_string
    content = @content
    Comment.new(content)
  end
end
126
+ # :startdoc:
127
+ end
@@ -0,0 +1,19 @@
1
class Regexp
  # Returns a new Regexp that matches like the receiver but in which every
  # plain capturing group "(...)" has become a non-capturing group
  # "(?:...)".  Options (and encoding, where available) are carried over.
  #
  # Left untouched:
  # - escaped characters such as "\(",
  # - groups that already start with "(?" (non-capturing groups,
  #   look-arounds, named groups, ...),
  # - a literal "(" inside a bracket expression such as "[(]"; rewriting
  #   it would silently add "?" and ":" to the character set.
  #
  # NOTE: the source is tokenized textually, so an unescaped "[" inside a
  # regexp comment is taken for the start of a bracket expression.
  def disable_capture
    re = ''.dup
    class_depth = 0 # nesting level of [...] at the current position
    # Tokens, in order of precedence: an escaped character; the opening of
    # a bracket expression, including an optional "^" and a "]" that is
    # literal because it comes first; a closing "]"; "(?"; a bare "(";
    # a run of anything else.
    self.source.scan(/\\.|\[\^?\]?|\]|\(\?|\(|[^\\\[\]\(]+/m) {|s|
      case s
      when '('
        re << (class_depth.zero? ? '(?:' : s)
      when ']'
        class_depth -= 1 if class_depth > 0
        re << s
      else
        # Only the bracket-opening token can start with "[".
        class_depth += 1 if s[0, 1] == '['
        re << s
      end
    }
    if re.respond_to? :force_encoding
      re.force_encoding(self.encoding)
      Regexp.new(re, self.options)
    else
      # Strings without force_encoding (presumably very old Ruby versions):
      # hand the receiver's kcode to the constructor instead.
      Regexp.new(re, self.options, self.kcode)
    end
  end
end
19
+
@@ -0,0 +1,131 @@
1
+ # = REXML Tree Generator
2
+ #
3
+ # HTree::Node#to_rexml is used for converting HTree to REXML.
4
+ #
5
+ # == Method Summary
6
+ #
7
+ # - HTree::Node#to_rexml -> REXML::Child
8
+ #
9
+ # == Example
10
+ #
11
+ # HTree.parse(...).to_rexml #=> REXML::Document
12
+ #
13
+ # == Comparison between HTree and REXML.
14
+ #
15
+ # - HTree parser is permissive HTML/XML parser.
16
+ # REXML parser is strict XML parser.
17
+ # HTree is recommended if you need to parse realworld HTML.
18
+ # REXML is recommended if you need strict error checking.
19
+ # - HTree object is immutable.
20
+ # REXML object is mutable.
21
+ # REXML should be used if you need modification.
22
+ #
23
+ require 'htree/modules'
24
+ require 'htree/output' # HTree::DocType#generate_content
25
+
26
+ module HTree
27
module Node
  # Converts this node to a REXML tree.
  #
  # rexml/document is required on first use rather than at load time.
  # The conversion starts without a parent and with DefaultContext.
  def to_rexml
    require 'rexml/document'
    root_parent = nil
    to_rexml_internal(root_parent, DefaultContext)
  end
end
34
+
35
+ # :stopdoc:
36
+
37
class Doc
  # Builds a REXML::Document and lets every child attach its own REXML
  # counterpart to it, passing +context+ through unchanged.
  #
  # Raises ArgumentError unless +parent+ is nil.
  # Returns the new REXML::Document.
  def to_rexml_internal(parent, context)
    raise ArgumentError, "parent must be nil" unless parent.nil?
    document = REXML::Document.new
    children.each do |child|
      child.to_rexml_internal(document, context)
    end
    document
  end
end
47
+
48
class Elem
  # Creates a REXML::Element for this element under +parent+, copies the
  # attributes, declares every namespace binding that +context+ does not
  # already provide, and converts the children below the new element.
  #
  # Children are converted with a context extended (subst_namespaces) by
  # the bindings declared here.  Returns the new REXML::Element.
  def to_rexml_internal(parent, context)
    ename = self.element_name
    prefix = ename.namespace_prefix
    pending_ns = {} # prefix (nil for the default namespace) => URI

    if context.namespace_uri(prefix) != ename.namespace_uri
      pending_ns[prefix] = ename.namespace_uri
    end

    qualified = prefix ? "#{prefix}:#{ename.local_name}" : ename.local_name
    result = REXML::Element.new(qualified, parent)

    self.each_attribute {|aname, atext|
      aprefix = aname.namespace_prefix
      unless aprefix
        result.add_attribute(aname.local_name, atext.to_s)
        next
      end
      if context.namespace_uri(aprefix) != aname.namespace_uri
        pending_ns[aprefix] = aname.namespace_uri
      end
      result.add_attribute("#{aprefix}:#{aname.local_name}", atext.to_s)
    }

    pending_ns.each {|ns_prefix, uri|
      # A nil prefix stands for the default namespace, which
      # add_namespace takes as its single argument.
      ns_prefix ? result.add_namespace(ns_prefix, uri) : result.add_namespace(uri)
    }
    inner_context = context.subst_namespaces(pending_ns)

    self.children.each {|child|
      child.to_rexml_internal(result, inner_context)
    }
    result
  end
end
88
+
89
class Text
  # Creates a REXML::Text under +parent+ from this node's rcdata.
  #
  # "<" and ">" are first replaced through Encoder::ChRef; the text is
  # then handed to REXML in raw mode (last argument true), i.e. as
  # already-escaped content.
  def to_rexml_internal(parent, context)
    escaped = self.rcdata.gsub(/[<>]/) {|ch| Encoder::ChRef[ch] }
    REXML::Text.new(escaped, true, parent, true)
  end
end
95
+
96
class XMLDecl
  # Creates a REXML::XMLDecl from version, encoding and standalone,
  # appends it to +parent+ when one is given, and returns it.
  def to_rexml_internal(parent, context)
    decl = REXML::XMLDecl.new(self.version, self.encoding, self.standalone)
    return decl unless parent
    parent << decl
    decl
  end
end
103
+
104
class DocType
  # Creates a REXML::DocType under +parent+ from the root element name
  # and the text produced by generate_content.
  def to_rexml_internal(parent, context)
    name_and_id = [self.root_element_name, self.generate_content]
    REXML::DocType.new(name_and_id, parent)
  end
end
109
+
110
class ProcIns
  # Creates a REXML::Instruction from target and content, appends it to
  # +parent+ when one is given, and returns it.
  def to_rexml_internal(parent, context)
    instruction = REXML::Instruction.new(self.target, self.content)
    return instruction unless parent
    parent << instruction
    instruction
  end
end
117
+
118
class Comment
  # Creates a REXML::Comment with this comment's content under +parent+.
  def to_rexml_internal(parent, context)
    text = self.content
    REXML::Comment.new(text, parent)
  end
end
123
+
124
class BogusETag
  # Always returns nil and attaches nothing to +parent+.
  def to_rexml_internal(parent, context)
    # intentionally empty: an empty method body evaluates to nil
  end
end
129
+
130
+ # :startdoc:
131
+ end
@@ -0,0 +1,176 @@
1
+ require 'htree/htmlinfo'
2
+ require 'htree/regexp-util'
3
+ require 'htree/fstr'
4
+
5
+ module HTree
6
+ # :stopdoc:
7
module Pat
  # Naming convention used throughout: a constant ending in _C keeps its
  # capturing groups; the constant of the same name without the suffix is
  # the same pattern passed through Regexp#disable_capture, which makes it
  # safe to embed in larger patterns without shifting group numbers.

  # -- names ---------------------------------------------------------
  NameChar = /[-A-Za-z0-9._:]/
  Name = /[A-Za-z_:]#{NameChar}*/
  Nmtoken = /#{NameChar}+/

  # -- comments and CDATA sections -----------------------------------
  Comment_C = /<!--(.*?)-->/m
  Comment = Comment_C.disable_capture
  CDATA_C = /<!\[CDATA\[(.*?)\]\]>/m
  CDATA = CDATA_C.disable_capture

  # -- attributes ----------------------------------------------------
  # Quoted: name = "value" or name = 'value'.
  QuotedAttr_C = /(#{Name})\s*=\s*(?:"([^"]*)"|'([^']*)')/
  QuotedAttr = QuotedAttr_C.disable_capture
  # Valid: additionally an unquoted value or a bare token.
  ValidAttr_C = /(#{Name})\s*=\s*(?:"([^"]*)"|'([^']*)'|(#{NameChar}*))|(#{Nmtoken})/
  ValidAttr = ValidAttr_C.disable_capture
  # Invalid: looser value syntax; the ...End variant has an opening quote
  # that is never closed.
  InvalidAttr1_C = /(#{Name})\s*=\s*(?:'([^'<>]*)'|"([^"<>]*)"|([^\s<>"']*(?![^\s<>"'])))|(#{Nmtoken})/
  InvalidAttr1 = InvalidAttr1_C.disable_capture
  InvalidAttr1End_C = /(#{Name})(?:\s*=\s*(?:'([^'<>]*)|"([^"<>]*)))/
  InvalidAttr1End = InvalidAttr1End_C.disable_capture

  # -- start tags ----------------------------------------------------
  QuotedStartTag_C = /<(#{Name})((?:\s+#{QuotedAttr})*)\s*>/
  QuotedStartTag = QuotedStartTag_C.disable_capture
  ValidStartTag_C = /<(#{Name})((?:\s+#{ValidAttr})*)\s*>/
  ValidStartTag = ValidStartTag_C.disable_capture
  InvalidStartTag_C = /<(#{Name})((?:(?:\b|\s+)#{InvalidAttr1})*)((?:\b|\s+)#{InvalidAttr1End})?\s*>/
  InvalidStartTag = InvalidStartTag_C.disable_capture
  StartTag = /#{QuotedStartTag}|#{ValidStartTag}|#{InvalidStartTag}/

  # -- empty-element tags --------------------------------------------
  QuotedEmptyTag_C = %r{<(#{Name})((?:\s+#{QuotedAttr})*)\s*/>}
  QuotedEmptyTag = QuotedEmptyTag_C.disable_capture
  ValidEmptyTag_C = %r{<(#{Name})((?:\s+#{ValidAttr})*)\s*/>}
  ValidEmptyTag = ValidEmptyTag_C.disable_capture
  InvalidEmptyTag_C = %r{<(#{Name})((?:(?:\b|\s+)#{InvalidAttr1})*)((?:\b|\s+)#{InvalidAttr1End})?\s*/>}
  InvalidEmptyTag = InvalidEmptyTag_C.disable_capture
  EmptyTag = /#{QuotedEmptyTag}|#{ValidEmptyTag}|#{InvalidEmptyTag}/

  # -- end tags ------------------------------------------------------
  EndTag_C = %r{</(#{Name})\s*>}
  EndTag = EndTag_C.disable_capture

  # -- XML declaration -----------------------------------------------
  XmlVersionNum = /[a-zA-Z0-9_.:-]+/
  XmlVersionInfo_C = /\s+version\s*=\s*(?:'(#{XmlVersionNum})'|"(#{XmlVersionNum})")/
  XmlVersionInfo = XmlVersionInfo_C.disable_capture
  XmlEncName = /[A-Za-z][A-Za-z0-9._-]*/
  XmlEncodingDecl_C = /\s+encoding\s*=\s*(?:"(#{XmlEncName})"|'(#{XmlEncName})')/
  XmlEncodingDecl = XmlEncodingDecl_C.disable_capture
  XmlSDDecl_C = /\s+standalone\s*=\s*(?:'(yes|no)'|"(yes|no)")/
  XmlSDDecl = XmlSDDecl_C.disable_capture
  XmlDecl_C = /<\?xml#{XmlVersionInfo_C}#{XmlEncodingDecl_C}?#{XmlSDDecl_C}?\s*\?>/
  XmlDecl = /<\?xml#{XmlVersionInfo}#{XmlEncodingDecl}?#{XmlSDDecl}?\s*\?>/

  # -- document type declaration -------------------------------------
  # xxx: the internal DTD subset is not parsed; DocType_C merely skips a
  # bracketed "[...]" section.
  SystemLiteral_C = /"([^"]*)"|'([^']*)'/
  PubidLiteral_C = %r{"([\sa-zA-Z0-9\-'()+,./:=?;!*\#@$_%]*)"|'([\sa-zA-Z0-9\-()+,./:=?;!*\#@$_%]*)'}
  ExternalID_C = /(?:SYSTEM|PUBLIC\s+#{PubidLiteral_C})(?:\s+#{SystemLiteral_C})?/
  DocType_C = /<!DOCTYPE\s+(#{Name})(?:\s+#{ExternalID_C})?\s*(?:\[.*?\]\s*)?>/m
  DocType = DocType_C.disable_capture

  # -- processing instructions ---------------------------------------
  XmlProcIns_C = /<\?(#{Name})(?:\s+(.*?))?\?>/m
  XmlProcIns = XmlProcIns_C.disable_capture
  #ProcIns = /<\?([^>]*)>/m
end
67
+
68
# Splits +input+ into markup tokens and yields each one as a two-element
# array [kind, frozen_source_string].  The kinds yielded are
# :text_pcdata, :text_cdata_content, :text_cdata_section, :xmldecl,
# :doctype, :procins, :stag, :etag, :emptytag and :comment.
#
# While scanning, the method guesses whether the input is XML and/or
# HTML: an XML declaration or an XHTML public identifier marks it as XML,
# an "html" doctype root marks it as HTML, and the first start tag decides
# by its name.  Unless the input is considered XML, the content of an
# element whose ElementContent entry is :CDATA is collected verbatim up
# to the end tag of the same name.
#
# +is_xml+ is the initial XML guess.  Returns [is_xml, is_html].
def HTree.scan(input, is_xml=false)
  is_html = false
  raw_elem = nil   # name of the CDATA-content element being read, if any
  raw_text = nil   # verbatim content collected for that element
  text = ''
  seen_first_element = false

  # Capture-group numbers of +pat+ below, in order of appearance.
  g_leading, g_token, g_xmldecl, g_doctype, g_procins,
    g_qstag, g_qemptytag, g_stag, g_etag, g_emptytag,
    g_comment, g_cdata, g_eos = (1..13).to_a

  pat = /\G(.*?)((#{Pat::XmlDecl})
    |(#{Pat::DocType})
    |(#{Pat::XmlProcIns})
    |(#{Pat::QuotedStartTag})
    |(#{Pat::QuotedEmptyTag})
    |(#{Pat::StartTag})
    |(#{Pat::EndTag})
    |(#{Pat::EmptyTag})
    |(#{Pat::Comment})
    |(#{Pat::CDATA})
    |(\z))
    /oxm

  input.scan(pat) {
    # Keep the MatchData: later pattern matches overwrite the globals.
    m = Regexp.last_match
    leading = m[g_leading]
    token = m[g_token]

    if raw_elem
      # Inside a CDATA-content element everything is text until the
      # matching end tag or the end of input.
      raw_text << leading
      if m[g_etag] && token[Pat::Name] == raw_elem
        yield [:text_cdata_content, HTree.frozen_string(raw_text)] unless raw_text.empty?
        yield [:etag, HTree.frozen_string(token)]
        raw_elem = nil
        raw_text = nil
      elsif m[g_eos]
        raw_text << token
        yield [:text_cdata_content, HTree.frozen_string(raw_text)] unless raw_text.empty?
        raw_elem = nil
        raw_text = nil
      else
        raw_text << token
      end
      next
    end

    text << leading
    unless text.empty?
      yield [:text_pcdata, HTree.frozen_string(text)]
      text = ''
    end

    if m[g_xmldecl]
      yield [:xmldecl, HTree.frozen_string(token)]
      is_xml = true
    elsif m[g_doctype]
      Pat::DocType_C =~ token
      root_element_name = Regexp.last_match(1)
      public_identifier = Regexp.last_match(2) || Regexp.last_match(3)
      is_html = true if /\Ahtml\z/i =~ root_element_name
      is_xml = true if public_identifier && %r{\A-//W3C//DTD XHTML } =~ public_identifier
      yield [:doctype, HTree.frozen_string(token)]
    elsif m[g_procins]
      yield [:procins, HTree.frozen_string(token)]
    elsif m[g_stag] || m[g_qstag]
      yield [:stag, HTree.frozen_string(token)]
      tagname = token[Pat::Name]
      unless seen_first_element
        if /\A(?:html|head|title|isindex|base|script|style|meta|link|object)\z/i =~ tagname
          is_html = true
        else
          is_xml = true
        end
        seen_first_element = true
      end
      if !is_xml && ElementContent[tagname] == :CDATA
        raw_elem = tagname
        raw_text = ''
      end
    elsif m[g_etag]
      yield [:etag, HTree.frozen_string(token)]
    elsif m[g_emptytag] || m[g_qemptytag]
      yield [:emptytag, HTree.frozen_string(token)]
      seen_first_element = true
      #is_xml = true
    elsif m[g_comment]
      yield [:comment, HTree.frozen_string(token)]
    elsif m[g_cdata]
      yield [:text_cdata_section, HTree.frozen_string(token)]
    elsif m[g_eos]
      # end of input: nothing left to report
    else
      raise Exception, "unknown match [bug]"
    end
  }
  [is_xml, is_html]
end
175
+ # :startdoc:
176
+ end