htree 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. data.tar.gz.sig +4 -0
  2. data/Makefile +20 -0
  3. data/Manifest +58 -0
  4. data/README +61 -0
  5. data/Rakefile +37 -0
  6. data/htree.gemspec +32 -0
  7. data/init.rb +1 -0
  8. data/install.rb +112 -0
  9. data/lib/htree.rb +97 -0
  10. data/lib/htree/container.rb +8 -0
  11. data/lib/htree/context.rb +69 -0
  12. data/lib/htree/display.rb +46 -0
  13. data/lib/htree/doc.rb +149 -0
  14. data/lib/htree/elem.rb +262 -0
  15. data/lib/htree/encoder.rb +217 -0
  16. data/lib/htree/equality.rb +219 -0
  17. data/lib/htree/extract_text.rb +37 -0
  18. data/lib/htree/fstr.rb +32 -0
  19. data/lib/htree/gencode.rb +193 -0
  20. data/lib/htree/htmlinfo.rb +672 -0
  21. data/lib/htree/inspect.rb +108 -0
  22. data/lib/htree/leaf.rb +92 -0
  23. data/lib/htree/loc.rb +369 -0
  24. data/lib/htree/modules.rb +49 -0
  25. data/lib/htree/name.rb +122 -0
  26. data/lib/htree/output.rb +212 -0
  27. data/lib/htree/parse.rb +410 -0
  28. data/lib/htree/raw_string.rb +127 -0
  29. data/lib/htree/regexp-util.rb +19 -0
  30. data/lib/htree/rexml.rb +131 -0
  31. data/lib/htree/scan.rb +176 -0
  32. data/lib/htree/tag.rb +113 -0
  33. data/lib/htree/template.rb +961 -0
  34. data/lib/htree/text.rb +115 -0
  35. data/lib/htree/traverse.rb +497 -0
  36. data/test-all.rb +5 -0
  37. data/test/assign.html +1 -0
  38. data/test/template.html +4 -0
  39. data/test/test-attr.rb +67 -0
  40. data/test/test-charset.rb +79 -0
  41. data/test/test-context.rb +29 -0
  42. data/test/test-display_xml.rb +45 -0
  43. data/test/test-elem-new.rb +101 -0
  44. data/test/test-encoder.rb +53 -0
  45. data/test/test-equality.rb +55 -0
  46. data/test/test-extract_text.rb +18 -0
  47. data/test/test-gencode.rb +27 -0
  48. data/test/test-leaf.rb +25 -0
  49. data/test/test-loc.rb +60 -0
  50. data/test/test-namespace.rb +147 -0
  51. data/test/test-output.rb +133 -0
  52. data/test/test-parse.rb +115 -0
  53. data/test/test-raw_string.rb +17 -0
  54. data/test/test-rexml.rb +70 -0
  55. data/test/test-scan.rb +153 -0
  56. data/test/test-security.rb +37 -0
  57. data/test/test-subnode.rb +142 -0
  58. data/test/test-template.rb +313 -0
  59. data/test/test-text.rb +43 -0
  60. data/test/test-traverse.rb +69 -0
  61. metadata +166 -0
  62. metadata.gz.sig +1 -0
@@ -0,0 +1,127 @@
1
+ require 'htree/modules'
2
+ require 'htree/fstr'
3
+
4
module HTree
  module Node
    # raw_string returns the source string recorded by parsing.
    # It returns +nil+ if the node was not constructed via parsing.
    #
    # The text is collected by raw_string_internal, which appends each
    # recorded fragment to the buffer passed to it and throws
    # :raw_string_tag as soon as it meets a tag or leaf without a recorded
    # fragment.
    def raw_string
      catch(:raw_string_tag) {
        # Return the buffer itself rather than the value of
        # raw_string_internal: Doc's implementation evaluates to its
        # children array and Elem's evaluates to nil when there is no end
        # tag, so neither can be used as the result.
        result = ''
        raw_string_internal(result)
        return result
      }
      # Reached only when :raw_string_tag was thrown.
      nil
    end
  end

  # :stopdoc:
  class Doc
    # Appends the raw string of every child, in order, to +result+.
    def raw_string_internal(result)
      @children.each {|n|
        n.raw_string_internal(result)
      }
    end
  end

  class Elem
    # Appends the start tag, every child and, when present, the end tag
    # to +result+.
    def raw_string_internal(result)
      @stag.raw_string_internal(result)
      @children.each {|n| n.raw_string_internal(result) }
      @etag.raw_string_internal(result) if @etag
    end
  end

  module Tag
    # Marks the tag as having no recorded source text.
    def init_raw_string() @raw_string = nil end

    # Records +arg+ (passed through HTree.frozen_string) as the source text.
    def raw_string=(arg) @raw_string = HTree.frozen_string(arg) end

    # Appends the recorded source text to +result+; throws :raw_string_tag
    # when nothing was recorded so that Node#raw_string can answer nil.
    def raw_string_internal(result)
      throw :raw_string_tag if !@raw_string
      result << @raw_string
    end
  end

  module Leaf
    # Marks the leaf as having no recorded source text.
    def init_raw_string() @raw_string = nil end

    # Records +arg+ (passed through HTree.frozen_string) as the source text.
    def raw_string=(arg) @raw_string = HTree.frozen_string(arg) end

    # Appends the recorded source text to +result+; throws :raw_string_tag
    # when nothing was recorded so that Node#raw_string can answer nil.
    def raw_string_internal(result)
      throw :raw_string_tag if !@raw_string
      result << @raw_string
    end
  end

  class Text
    # Records the source text. When it equals @rcdata the existing @rcdata
    # object is reused instead of storing a second copy.
    def raw_string=(arg)
      if arg == @rcdata then
        @raw_string = @rcdata
      else
        super
      end
    end
  end
  # :startdoc:

  module Node
    # Returns a copy of the node built without recorded source text.
    # Concrete node classes override this; the default raises
    # NotImplementedError.
    def eliminate_raw_string
      raise NotImplementedError
    end
  end

  # :stopdoc:

  class Doc
    def eliminate_raw_string
      Doc.new(@children.map {|c| c.eliminate_raw_string })
    end
  end

  class Elem
    def eliminate_raw_string
      Elem.new!(
        @stag.eliminate_raw_string,
        # An empty element is rebuilt with nil instead of a child list.
        @empty ? nil : @children.map {|c| c.eliminate_raw_string },
        @etag && @etag.eliminate_raw_string)
    end
  end

  class Text
    def eliminate_raw_string
      Text.new_internal(@rcdata)
    end
  end

  class STag
    def eliminate_raw_string
      STag.new(@qualified_name, @attributes, @inherited_context)
    end
  end

  class ETag
    def eliminate_raw_string
      # self.class (not ETag) so that the copy has the receiver's own class.
      self.class.new(@qualified_name)
    end
  end

  class XMLDecl
    def eliminate_raw_string
      XMLDecl.new(@version, @encoding, @standalone)
    end
  end

  class DocType
    def eliminate_raw_string
      DocType.new(@root_element_name, @public_identifier, @system_identifier)
    end
  end

  class ProcIns
    def eliminate_raw_string
      ProcIns.new(@target, @content)
    end
  end

  class Comment
    def eliminate_raw_string
      Comment.new(@content)
    end
  end
  # :startdoc:
end
@@ -0,0 +1,19 @@
1
+ class Regexp
2
+ def disable_capture
3
+ re = ''
4
+ self.source.scan(/\\.|[^\\\(]+|\(\?|\(/m) {|s|
5
+ if s == '('
6
+ re << '(?:'
7
+ else
8
+ re << s
9
+ end
10
+ }
11
+ if re.respond_to? :force_encoding
12
+ re.force_encoding(self.encoding)
13
+ Regexp.new(re, self.options)
14
+ else
15
+ Regexp.new(re, self.options, self.kcode)
16
+ end
17
+ end
18
+ end
19
+
@@ -0,0 +1,131 @@
1
+ # = REXML Tree Generator
2
+ #
3
+ # HTree::Node#to_rexml is used for converting HTree to REXML.
4
+ #
5
+ # == Method Summary
6
+ #
7
+ # - HTree::Node#to_rexml -> REXML::Child
8
+ #
9
+ # == Example
10
+ #
11
+ # HTree.parse(...).to_rexml #=> REXML::Document
12
+ #
13
+ # == Comparison between HTree and REXML.
14
+ #
15
+ # - The HTree parser is a permissive HTML/XML parser.
16
+ # The REXML parser is a strict XML parser.
17
+ # HTree is recommended if you need to parse real-world HTML.
18
+ # REXML is recommended if you need strict error checking.
19
+ # - HTree objects are immutable.
20
+ # REXML objects are mutable.
21
+ # REXML should be used if you need modification.
22
+ #
23
+ require 'htree/modules'
24
+ require 'htree/output' # HTree::DocType#generate_content
25
+
26
module HTree
  module Node
    # Converts this node into the corresponding REXML object.
    # rexml/document is loaded on first use.
    def to_rexml
      require 'rexml/document'
      to_rexml_internal(nil, DefaultContext)
    end
  end

  # :stopdoc:

  class Doc
    # Builds a REXML::Document and converts every child into it.
    # A document has no parent, so anything but nil is rejected.
    def to_rexml_internal(parent, context)
      raise ArgumentError, "parent must be nil" unless parent == nil
      document = REXML::Document.new
      self.children.each do |child|
        child.to_rexml_internal(document, context)
      end
      document
    end
  end

  class Elem
    # Builds a REXML::Element under +parent+, copies the attributes,
    # declares every namespace binding that +context+ does not already
    # provide, and converts the children with the extended context.
    def to_rexml_internal(parent, context)
      ename = self.element_name

      # prefix => URI bindings that must be declared on this element.
      ns_decl = {}
      unless context.namespace_uri(ename.namespace_prefix) == ename.namespace_uri
        ns_decl[ename.namespace_prefix] = ename.namespace_uri
      end

      tag_name =
        if ename.namespace_prefix
          "#{ename.namespace_prefix}:#{ename.local_name}"
        else
          ename.local_name
        end
      result = REXML::Element.new(tag_name, parent)

      self.each_attribute do |aname, atext|
        if aname.namespace_prefix
          unless context.namespace_uri(aname.namespace_prefix) == aname.namespace_uri
            ns_decl[aname.namespace_prefix] = aname.namespace_uri
          end
          attr_name = "#{aname.namespace_prefix}:#{aname.local_name}"
        else
          attr_name = aname.local_name
        end
        result.add_attribute(attr_name, atext.to_s)
      end

      ns_decl.each do |prefix, uri|
        # A nil prefix is declared with the one-argument form.
        prefix ? result.add_namespace(prefix, uri) : result.add_namespace(uri)
      end
      inner_context = context.subst_namespaces(ns_decl)

      self.children.each do |child|
        child.to_rexml_internal(result, inner_context)
      end
      result
    end
  end

  class Text
    # Builds a REXML::Text from rcdata after replacing "<" and ">" via
    # Encoder::ChRef.
    def to_rexml_internal(parent, context)
      escaped = self.rcdata.gsub(/[<>]/) {|ch| Encoder::ChRef[ch] }
      REXML::Text.new(escaped, true, parent, true)
    end
  end

  class XMLDecl
    # Builds a REXML::XMLDecl and appends it to +parent+ when one is given.
    def to_rexml_internal(parent, context)
      decl = REXML::XMLDecl.new(self.version, self.encoding, self.standalone)
      parent << decl if parent
      decl
    end
  end

  class DocType
    # Builds a REXML::DocType from the root element name and generate_content.
    def to_rexml_internal(parent, context)
      source = [self.root_element_name, self.generate_content]
      REXML::DocType.new(source, parent)
    end
  end

  class ProcIns
    # Builds a REXML::Instruction and appends it to +parent+ when one is given.
    def to_rexml_internal(parent, context)
      instruction = REXML::Instruction.new(self.target, self.content)
      parent << instruction if parent
      instruction
    end
  end

  class Comment
    # Builds a REXML::Comment under +parent+.
    def to_rexml_internal(parent, context)
      REXML::Comment.new(self.content, parent)
    end
  end

  class BogusETag
    # Produces no REXML node.
    def to_rexml_internal(parent, context)
      nil
    end
  end

  # :startdoc:
end
@@ -0,0 +1,176 @@
1
+ require 'htree/htmlinfo'
2
+ require 'htree/regexp-util'
3
+ require 'htree/fstr'
4
+
5
module HTree
  # :stopdoc:
  # Regular expressions used by HTree.scan.
  #
  # A constant ending in _C keeps its capturing groups; the constant of the
  # same name without the suffix is that pattern passed through
  # Regexp#disable_capture, so it can be embedded in a larger pattern
  # without shifting that pattern's group numbers.
  module Pat
    NameChar = /[-A-Za-z0-9._:]/
    Name     = /[A-Za-z_:]#{NameChar}*/
    Nmtoken  = /#{NameChar}+/

    Comment_C = /<!--(.*?)-->/m
    Comment   = Comment_C.disable_capture
    CDATA_C   = /<!\[CDATA\[(.*?)\]\]>/m
    CDATA     = CDATA_C.disable_capture

    # Attribute forms, from fully quoted to progressively more lenient.
    QuotedAttr_C      = /(#{Name})\s*=\s*(?:"([^"]*)"|'([^']*)')/
    QuotedAttr        = QuotedAttr_C.disable_capture
    ValidAttr_C       = /(#{Name})\s*=\s*(?:"([^"]*)"|'([^']*)'|(#{NameChar}*))|(#{Nmtoken})/
    ValidAttr         = ValidAttr_C.disable_capture
    InvalidAttr1_C    = /(#{Name})\s*=\s*(?:'([^'<>]*)'|"([^"<>]*)"|([^\s<>"']*(?![^\s<>"'])))|(#{Nmtoken})/
    InvalidAttr1      = InvalidAttr1_C.disable_capture
    InvalidAttr1End_C = /(#{Name})(?:\s*=\s*(?:'([^'<>]*)|"([^"<>]*)))/
    InvalidAttr1End   = InvalidAttr1End_C.disable_capture

    # Start tags.
    QuotedStartTag_C  = /<(#{Name})((?:\s+#{QuotedAttr})*)\s*>/
    QuotedStartTag    = QuotedStartTag_C.disable_capture
    ValidStartTag_C   = /<(#{Name})((?:\s+#{ValidAttr})*)\s*>/
    ValidStartTag     = ValidStartTag_C.disable_capture
    InvalidStartTag_C = /<(#{Name})((?:(?:\b|\s+)#{InvalidAttr1})*)((?:\b|\s+)#{InvalidAttr1End})?\s*>/
    InvalidStartTag   = InvalidStartTag_C.disable_capture
    StartTag          = /#{QuotedStartTag}|#{ValidStartTag}|#{InvalidStartTag}/

    # Empty-element tags (same shapes as the start tags, closed by "/>").
    QuotedEmptyTag_C  = %r{<(#{Name})((?:\s+#{QuotedAttr})*)\s*/>}
    QuotedEmptyTag    = QuotedEmptyTag_C.disable_capture
    ValidEmptyTag_C   = %r{<(#{Name})((?:\s+#{ValidAttr})*)\s*/>}
    ValidEmptyTag     = ValidEmptyTag_C.disable_capture
    InvalidEmptyTag_C = %r{<(#{Name})((?:(?:\b|\s+)#{InvalidAttr1})*)((?:\b|\s+)#{InvalidAttr1End})?\s*/>}
    InvalidEmptyTag   = InvalidEmptyTag_C.disable_capture
    EmptyTag          = /#{QuotedEmptyTag}|#{ValidEmptyTag}|#{InvalidEmptyTag}/

    EndTag_C = %r{</(#{Name})\s*>}
    EndTag   = EndTag_C.disable_capture

    # XML declaration and its pseudo-attributes.
    XmlVersionNum     = /[a-zA-Z0-9_.:-]+/
    XmlVersionInfo_C  = /\s+version\s*=\s*(?:'(#{XmlVersionNum})'|"(#{XmlVersionNum})")/
    XmlVersionInfo    = XmlVersionInfo_C.disable_capture
    XmlEncName        = /[A-Za-z][A-Za-z0-9._-]*/
    XmlEncodingDecl_C = /\s+encoding\s*=\s*(?:"(#{XmlEncName})"|'(#{XmlEncName})')/
    XmlEncodingDecl   = XmlEncodingDecl_C.disable_capture
    XmlSDDecl_C       = /\s+standalone\s*=\s*(?:'(yes|no)'|"(yes|no)")/
    XmlSDDecl         = XmlSDDecl_C.disable_capture
    XmlDecl_C         = /<\?xml#{XmlVersionInfo_C}#{XmlEncodingDecl_C}?#{XmlSDDecl_C}?\s*\?>/
    XmlDecl           = /<\?xml#{XmlVersionInfo}#{XmlEncodingDecl}?#{XmlSDDecl}?\s*\?>/

    # Document type declaration. In DocType_C, group 1 is the root element
    # name, groups 2/3 the public identifier and groups 4/5 the system
    # identifier (one group per quoting style).
    # xxx: internal DTD subset is not recognized: '[' (markupdecl | DeclSep)* ']' S?)?
    SystemLiteral_C = /"([^"]*)"|'([^']*)'/
    PubidLiteral_C  = %r{"([\sa-zA-Z0-9\-'()+,./:=?;!*\#@$_%]*)"|'([\sa-zA-Z0-9\-()+,./:=?;!*\#@$_%]*)'}
    ExternalID_C    = /(?:SYSTEM|PUBLIC\s+#{PubidLiteral_C})(?:\s+#{SystemLiteral_C})?/
    DocType_C       = /<!DOCTYPE\s+(#{Name})(?:\s+#{ExternalID_C})?\s*(?:\[.*?\]\s*)?>/m
    DocType         = DocType_C.disable_capture

    XmlProcIns_C = /<\?(#{Name})(?:\s+(.*?))?\?>/m
    XmlProcIns   = XmlProcIns_C.disable_capture
    #ProcIns = /<\?([^>]*)>/m
  end

  # Splits +input+ into markup tokens and yields each one as a two-element
  # array [kind, string], where the string has been passed through
  # HTree.frozen_string. Kinds yielded: :xmldecl, :doctype, :procins, :stag,
  # :etag, :emptytag, :comment, :text_pcdata, :text_cdata_section and
  # :text_cdata_content.
  #
  # +is_xml+ is the caller's initial guess. It is switched on by an XML
  # declaration, by a DOCTYPE whose public identifier starts with
  # "-//W3C//DTD XHTML ", or by a first start tag that is not one of a fixed
  # list of HTML element names.
  #
  # Returns the pair [is_xml, is_html].
  def HTree.scan(input, is_xml=false)
    is_html = false
    first_element = true
    pcdata = ''

    # While inside an element whose content is CDATA (only when not XML):
    # the element's tag name and the content gathered so far.
    cdata_content = nil
    cdata_content_string = nil

    # Capture-group numbers of +pat+ below.
    index_otherstring, index_str = 1, 2
    index_xmldecl, index_doctype, index_xmlprocins = 3, 4, 5
    index_quotedstarttag, index_quotedemptytag = 6, 7
    index_starttag, index_endtag, index_emptytag = 8, 9, 10
    index_comment, index_cdata, index_end = 11, 12, 13

    # Group 1 is the text before the next piece of markup, group 2 the
    # markup itself, groups 3-13 tell which alternative matched.
    pat = /\G(.*?)((#{Pat::XmlDecl})
                      |(#{Pat::DocType})
                      |(#{Pat::XmlProcIns})
                      |(#{Pat::QuotedStartTag})
                      |(#{Pat::QuotedEmptyTag})
                      |(#{Pat::StartTag})
                      |(#{Pat::EndTag})
                      |(#{Pat::EmptyTag})
                      |(#{Pat::Comment})
                      |(#{Pat::CDATA})
                      |(\z))
             /oxm

    input.scan(pat) do
      # Grab the match data first: the regexp operations below replace $~.
      match = $~

      if cdata_content
        cdata_content_string << match[index_otherstring]
        str = match[index_str]
        closing = match[index_endtag] && str[Pat::Name] == cdata_content
        if closing || match[index_end]
          # At end of input the matched markup is part of the content.
          cdata_content_string << str unless closing
          unless cdata_content_string.empty?
            yield [:text_cdata_content, HTree.frozen_string(cdata_content_string)]
          end
          yield [:etag, HTree.frozen_string(str)] if closing
          cdata_content = nil
          cdata_content_string = nil
        else
          # Any other markup inside the element is plain content.
          cdata_content_string << str
        end
        next
      end

      pcdata << match[index_otherstring]
      str = match[index_str]
      unless pcdata.empty?
        yield [:text_pcdata, HTree.frozen_string(pcdata)]
        pcdata = ''
      end

      if match[index_xmldecl]
        yield [:xmldecl, HTree.frozen_string(str)]
        is_xml = true
      elsif match[index_doctype]
        Pat::DocType_C =~ str
        root_element_name = $1
        public_identifier = $2 || $3
        is_html = true if /\Ahtml\z/i =~ root_element_name
        is_xml = true if public_identifier && %r{\A-//W3C//DTD XHTML } =~ public_identifier
        yield [:doctype, HTree.frozen_string(str)]
      elsif match[index_xmlprocins]
        yield [:procins, HTree.frozen_string(str)]
      elsif match[index_starttag] || match[index_quotedstarttag]
        yield [:stag, HTree.frozen_string(str)]
        tagname = str[Pat::Name]
        if first_element
          # The first start tag decides between HTML and XML.
          if /\A(?:html|head|title|isindex|base|script|style|meta|link|object)\z/i =~ tagname
            is_html = true
          else
            is_xml = true
          end
          first_element = false
        end
        if !is_xml && ElementContent[tagname] == :CDATA
          cdata_content = tagname
          cdata_content_string = ''
        end
      elsif match[index_endtag]
        yield [:etag, HTree.frozen_string(str)]
      elsif match[index_emptytag] || match[index_quotedemptytag]
        yield [:emptytag, HTree.frozen_string(str)]
        first_element = false
        #is_xml = true
      elsif match[index_comment]
        yield [:comment, HTree.frozen_string(str)]
      elsif match[index_cdata]
        yield [:text_cdata_section, HTree.frozen_string(str)]
      elsif match[index_end]
        # pass
      else
        raise Exception, "unknown match [bug]"
      end
    end

    return is_xml, is_html
  end
  # :startdoc:
end