thbar-hpricot 0.8.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. data/CHANGELOG +104 -0
  2. data/COPYING +18 -0
  3. data/README.md +276 -0
  4. data/Rakefile +234 -0
  5. data/ext/fast_xs/FastXsService.java +1123 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +210 -0
  8. data/ext/hpricot_scan/HpricotCss.java +850 -0
  9. data/ext/hpricot_scan/HpricotScanService.java +2099 -0
  10. data/ext/hpricot_scan/extconf.rb +9 -0
  11. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  12. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  13. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  14. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  15. data/ext/hpricot_scan/hpricot_scan.c +7045 -0
  16. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  17. data/ext/hpricot_scan/hpricot_scan.java.rl +1161 -0
  18. data/ext/hpricot_scan/hpricot_scan.rl +902 -0
  19. data/extras/hpricot.png +0 -0
  20. data/lib/hpricot.rb +26 -0
  21. data/lib/hpricot/blankslate.rb +63 -0
  22. data/lib/hpricot/builder.rb +216 -0
  23. data/lib/hpricot/elements.rb +514 -0
  24. data/lib/hpricot/htmlinfo.rb +691 -0
  25. data/lib/hpricot/inspect.rb +103 -0
  26. data/lib/hpricot/modules.rb +40 -0
  27. data/lib/hpricot/parse.rb +38 -0
  28. data/lib/hpricot/tag.rb +219 -0
  29. data/lib/hpricot/tags.rb +164 -0
  30. data/lib/hpricot/traverse.rb +839 -0
  31. data/lib/hpricot/xchar.rb +94 -0
  32. data/test/files/basic.xhtml +17 -0
  33. data/test/files/boingboing.html +2266 -0
  34. data/test/files/cy0.html +3653 -0
  35. data/test/files/immob.html +400 -0
  36. data/test/files/pace_application.html +1320 -0
  37. data/test/files/tenderlove.html +16 -0
  38. data/test/files/uswebgen.html +220 -0
  39. data/test/files/utf8.html +1054 -0
  40. data/test/files/week9.html +1723 -0
  41. data/test/files/why.xml +19 -0
  42. data/test/load_files.rb +7 -0
  43. data/test/nokogiri-bench.rb +64 -0
  44. data/test/test_alter.rb +96 -0
  45. data/test/test_builder.rb +37 -0
  46. data/test/test_parser.rb +457 -0
  47. data/test/test_paths.rb +25 -0
  48. data/test/test_preserved.rb +88 -0
  49. data/test/test_xml.rb +28 -0
  50. metadata +124 -0
@@ -0,0 +1,103 @@
1
+ require 'pp'
2
+
3
+ module Hpricot
4
+ # :stopdoc:
5
# Pretty-printing support for Hpricot::Elements.
class Elements
  # Renders the collection inside PP's standard "#<ClassName ...>" object
  # group and delegates the contents to the inherited pretty_print.
  # NOTE(review): relies on the superclass (declared outside this file)
  # providing a suitable pretty_print -- confirm.
  #
  # q:: the PP printer object supplied by the pp library.
  def pretty_print(q)
    q.object_group(self) do
      super
    end
  end

  # #inspect reuses the pretty-printer in single-line mode.
  alias_method :inspect, :pretty_print_inspect
end
11
+
12
# Pretty-printing support for Hpricot::Doc.
class Doc
  # Prints the document as "#<ClassName child child ...>", pretty-printing
  # every child in order.  A document without children prints as an empty
  # object group.
  #
  # q:: the PP printer object supplied by the pp library.
  def pretty_print(q)
    q.object_group(self) do
      (children || []).each do |node|
        q.breakable
        q.pp(node)
      end
    end
  end

  # #inspect reuses the pretty-printer in single-line mode.
  alias_method :inspect, :pretty_print_inspect
end
18
+
19
# Pretty-printing support shared by the leaf node classes.
module Leaf
  # Prints "{<lowercased class name> ...}".  When the node has a raw_string,
  # each line of it (line terminator included) is pretty-printed as a
  # separate breakable item; otherwise the node's to_s text is emitted.
  #
  # q:: the PP printer object supplied by the pp library.
  def pretty_print(q)
    q.group(1, '{', '}') do
      # Strip any namespace prefix, e.g. "Hpricot::Comment" -> "comment".
      q.text self.class.name.sub(/.*::/,'').downcase
      raw = raw_string
      if raw
        raw.scan(/[^\r\n]*(?:\r\n?|\n|[^\r\n]\z)/).each do |chunk|
          q.breakable
          q.pp chunk
        end
      elsif respond_to?(:to_s)
        q.breakable
        q.text to_s
      end
    end
  end

  # #inspect reuses the pretty-printer in single-line mode.
  alias inspect pretty_print_inspect
end
36
+
37
# Pretty-printing support for Hpricot::Elem.
class Elem
  # Prints "{emptyelem <stag>}" for an element that reports empty?, and
  # "{elem <stag> child ... etag}" otherwise (children and the end tag are
  # only printed when present).
  #
  # q:: the PP printer object supplied by the pp library.
  def pretty_print(q)
    if empty?
      q.group(1, '{emptyelem', '}') do
        q.breakable
        pretty_print_stag(q)
      end
    else
      q.group(1, "{elem", "}") do
        q.breakable
        pretty_print_stag(q)
        (children || []).each do |node|
          q.breakable
          q.pp(node)
        end
        if etag
          q.breakable
          q.text(etag)
        end
      end
    end
  end

  # Prints the start tag as "<name attr="value" flag>".  Attribute values
  # are passed through Hpricot.uxs before display; attributes whose value
  # is nil are printed as a bare name.
  #
  # q:: the PP printer object supplied by the pp library.
  def pretty_print_stag(q)
    q.group(1, '<', '>') do
      q.text(name)

      attrs = raw_attributes
      if attrs
        attrs.each do |key, raw_value|
          q.breakable
          q.text(raw_value ? "#{key}=\"#{Hpricot.uxs(raw_value)}\"" : key)
        end
      end
    end
  end

  # #inspect reuses the pretty-printer in single-line mode.
  alias_method :inspect, :pretty_print_inspect
end
73
+
74
# Pretty-printing support for Hpricot::ETag.
class ETag
  # Prints the end tag as "</name>".
  #
  # q:: the PP printer object supplied by the pp library.
  def pretty_print(q)
    q.group(1, '</', '>') { q.text(name) }
  end

  # #inspect reuses the pretty-printer in single-line mode.
  alias_method :inspect, :pretty_print_inspect
end
82
+
83
# Pretty-printing support for Hpricot::Text.
class Text
  # Prints the node's content as an escaped, double-quoted string
  # (String#dump form).
  #
  # q:: the PP printer object supplied by the pp library.
  def pretty_print(q)
    dumped = content.dump
    q.text(dumped)
  end
end
88
+
89
# Pretty-printing support for Hpricot::BogusETag.
class BogusETag
  # Prints "{<lowercased class name> <raw text>}" when a raw_string is
  # available; otherwise appends a synthesized "</name>" directly after the
  # class name (no separator).
  #
  # q:: the PP printer object supplied by the pp library.
  def pretty_print(q)
    q.group(1, '{', '}') do
      # Strip any namespace prefix, e.g. "Hpricot::BogusETag" -> "bogusetag".
      q.text self.class.name.sub(/.*::/,'').downcase
      raw = raw_string
      if raw
        q.breakable
        q.text raw
      else
        q.text "</#{name}>"
      end
    end
  end
end
102
+ # :startdoc:
103
+ end
@@ -0,0 +1,40 @@
1
# Declares the mixin hierarchy of the Hpricot node classes.  Only the
# module/class relationships are set up here; behaviour is added by other
# files.  The order of the include statements determines each class's
# ancestor chain and must not be changed.
module Hpricot
  class Name
    include Hpricot
  end

  class Context
    include Hpricot
  end

  # :stopdoc:
  module Tag
    include Hpricot
  end

  class ETag
    include Tag
  end
  # :startdoc:

  # Every node in a document tree mixes in Node.
  module Node
    include Hpricot
  end

  class ETag
    include Node
  end

  # Nodes that may hold children.
  module Container
    include Node
  end

  class Doc
    include Container
  end

  class Elem
    include Container
  end

  # Nodes without children.
  module Leaf
    include Node
  end

  class CData
    include Leaf
  end

  class Text
    include Leaf
  end

  class XMLDecl
    include Leaf
  end

  class DocType
    include Leaf
  end

  class ProcIns
    include Leaf
  end

  class Comment
    include Leaf
  end

  class BogusETag
    include Leaf
  end

  # Traversal mixins: each concrete class gets its own nested Trav module
  # that pulls in the container- or leaf-level traversal module.
  module Traverse
  end

  module Container::Trav
    include Traverse
  end

  module Leaf::Trav
    include Traverse
  end

  class Doc
    module Trav
      include Container::Trav
    end
    include Trav
  end

  class Elem
    module Trav
      include Container::Trav
    end
    include Trav
  end

  class CData
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class Text
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class XMLDecl
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class DocType
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class ProcIns
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class Comment
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class BogusETag
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  # Base error class of the library.
  class Error < StandardError
  end
end
40
+
@@ -0,0 +1,38 @@
1
+ require 'hpricot/htmlinfo'
2
+
3
# Top-level convenience method: Hpricot(input, opts) { ... } is shorthand
# for Hpricot.make with the same arguments and block.
#
# input:: the markup to parse (handed to Hpricot.make unchanged).
# opts::  options hash (handed to Hpricot.make unchanged).
# blk::   optional block, forwarded to Hpricot.make.
#
# Returns whatever Hpricot.make returns.
def Hpricot(input = nil, opts = {}, &blk)
  Hpricot.make(input, opts, &blk)
end
6
+
7
module Hpricot
  # Exception class used for any errors related to deficiencies in the system when
  # handling the character encodings of a document.
  class EncodingError < StandardError; end

  # Hpricot.parse parses <i>input</i> and returns a document tree
  # represented by Hpricot::Doc.
  #
  # input:: the markup to parse.
  # opts::  options hash, passed through untouched.
  # blk::   optional builder block (see Hpricot.make).
  def Hpricot.parse(input = nil, opts = {}, &blk)
    make(input, opts, &blk)
  end

  # Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
  # and returning a document tree.
  #
  # The <tt>:xml => true</tt> flag is added to a copy of <i>opts</i>, so the
  # caller's hash is never modified (and may be frozen).
  def Hpricot.XML(input = nil, opts = {}, &blk)
    make(input, opts.merge(:xml => true), &blk)
  end

  # :stopdoc:

  # Shared implementation behind parse, XML and the top-level Hpricot()
  # method.  With a block, the document is produced by Hpricot.build and
  # <i>opts</i> is stored on it as @options; without one, <i>input</i> is
  # handed to Hpricot.scan together with <i>opts</i>.
  def Hpricot.make(input = nil, opts = {}, &blk)
    return Hpricot.scan(input, opts) unless blk

    doc = Hpricot.build(&blk)
    doc.instance_variable_set("@options", opts)
    doc
  end

  # :startdoc:
end
@@ -0,0 +1,219 @@
1
+ module Hpricot
2
+ # :stopdoc:
3
+
4
# Output and tree-inspection behaviour of a document root.
class Doc
  # Appends the serialized form of every child to <i>out</i> (anything
  # responding to <<) and returns <i>out</i>.
  #
  # opts:: output options, passed to each child's output unchanged.
  def output(out, opts = {})
    (children || []).each { |node| node.output(out, opts) }
    out
  end

  # Parses <i>input</i> (or runs the builder block) with the options stored
  # in @options and returns the children of the resulting document.
  def make(input = nil, &blk)
    Hpricot.make(input, @options, &blk).children
  end

  # No-op on the document itself.
  def altered!; end

  # Concatenates the inspect_tree output of all children; returns nil when
  # the document has no children collection.
  def inspect_tree
    return unless children
    children.map { |node| node.inspect_tree }.join
  end
end
19
+
20
# Behaviour shared by every node type.
module Node
  # Wraps <i>str</i> in double quotes for use as an attribute value.
  # Embedded double quotes are written as the character reference
  # <tt>&quot;</tt>; a backslash is not an escape character in HTML/XML, so
  # backslash-escaping would terminate the attribute value early.
  def html_quote(str)
    "\"" + str.gsub('"', '&quot;') + "\""
  end

  # Hook for discarding cached raw markup; a no-op by default.
  def clear_raw; end

  # Returns the node's raw_string when <tt>opts[:preserve]</tt> is set and a
  # raw string is available; otherwise yields <i>opts</i> and returns the
  # block's result.
  def if_output(opts)
    if opts[:preserve] && !raw_string.nil?
      raw_string
    else
      yield opts
    end
  end

  # The path segment used for this node: its name.
  def pathname; self.name end

  # Marks the node as modified by clearing its raw markup.
  def altered!
    clear_raw
  end

  # One line describing this node: <i>depth</i> spaces of indentation, then
  # the lowercased class name without its namespace, then a newline.
  def inspect_tree(depth = 0)
    %{#{" " * depth}} + self.class.name.split(/::/).last.downcase + "\n"
  end
end
40
+
41
# Hash-like view over an element's raw_attributes.  Values are stored in
# escaped form (String#fast_xs on write) and unescaped with Hpricot.uxs on
# read.
class Attributes
  # The element whose attributes are exposed.
  attr_accessor :element

  def initialize(e)
    @element = e
  end

  # Returns the unescaped value stored under <i>k</i>; the lookup result is
  # handed to Hpricot.uxs even when the attribute is absent.
  def [](k)
    raw = @element.raw_attributes || {}
    Hpricot.uxs(raw[k])
  end

  # Stores the escaped form of <i>v</i> under <i>k</i>, creating the
  # element's attribute hash first if it has none.
  def []=(k, v)
    store = (@element.raw_attributes ||= {})
    store[k] = v.fast_xs
  end

  # Returns a new Hash of attribute name => unescaped value (empty when the
  # element has no attributes).
  def to_hash
    raw = @element.raw_attributes
    return {} unless raw

    decoded = {}
    raw.each { |key, value| decoded[key] = Hpricot.uxs(value) }
    decoded
  end

  def to_s
    to_hash.to_s
  end

  def inspect
    to_hash.inspect
  end
end
69
+
70
# Construction, serialization and inspection of an element node.
class Elem
  # tag::      element name.
  # attrs::    raw (escaped) attribute hash, or nil.
  # children:: child nodes, or nil.
  # etag::     end tag, or nil when the element has none.
  def initialize(tag, attrs = nil, children = nil, etag = nil)
    self.name = tag
    self.raw_attributes = attrs
    self.children = children
    self.etag = etag
  end

  # True when the element has no children collection or an empty one.
  def empty?
    children.nil? || children.empty?
  end

  # A fresh Attributes view over this element.
  def attributes
    Attributes.new self
  end

  # Plain-text rendering with special cases for br, p, a[href] and
  # img[src]; everything else defers to the inherited implementation.
  def to_plain_text
    tag = self.name
    return "\n" if tag == 'br'
    return "\n\n" + super + "\n\n" if tag == 'p'
    return "#{super} [#{self['href']}]" if tag == 'a' && self.has_attribute?('href')
    return "[img:#{self['src']}]" if tag == 'img' && self.has_attribute?('src')
    super
  end

  def pathname; self.name end

  # Appends the element's markup to <i>out</i> and returns <i>out</i>.
  # The start tag goes through if_output; children are written in order.
  # With opts[:preserve] the stored etag is appended as-is (if any);
  # otherwise "</name>" is written when there is an end tag or content.
  def output(out, opts = {})
    start_tag = if_output(opts) do
      # Self-close only elements with neither content nor an end tag.
      closer = (empty? && !etag) ? " /" : ""
      "<#{name}#{attributes_as_html}#{closer}>"
    end
    out << start_tag

    children.each { |node| node.output(out, opts) } if children

    if opts[:preserve]
      out << etag if etag
    elsif etag || !empty?
      out << "</#{name}>"
    end
    out
  end

  # Serializes the raw attributes as ' name="value"' pairs (bare ' name'
  # when the value is nil).  Returns nil when there is no attribute hash.
  def attributes_as_html
    return unless raw_attributes
    raw_attributes.map { |aname, aval|
      aval ? " #{aname}=#{html_quote aval}" : " #{aname}"
    }.join
  end

  # Indented outline of this element and its subtree, one node per line.
  def inspect_tree(depth = 0)
    header = %{#{" " * depth}} + name + "\n"
    return header unless children
    header + children.map { |node| node.inspect_tree(depth + 1) }.join
  end
end
123
+
124
# An end tag without a matching start tag.
class BogusETag
  def initialize(name)
    self.name = name
  end

  # Appends the result of if_output to <i>out</i>; the generated form is
  # the empty string, so only a preserved raw string (as decided by
  # if_output) produces any text.
  def output(out, opts = {})
    rendered = if_output(opts) { "" }
    out << rendered
  end
end
130
+
131
# A regular end tag; serialized the same way as BogusETag.
class ETag < BogusETag
  # Appends the result of if_output to <i>out</i>; the generated form is
  # the empty string.
  def output(out, opts = {})
    rendered = if_output(opts) { '' }
    out << rendered
  end
end
134
+
135
# A run of character data.
class Text
  def initialize(content)
    self.content = content
  end

  def pathname; "text()" end

  # The content passed through Hpricot.uxs.
  def to_s
    Hpricot.uxs(content)
  end
  alias_method :inner_text, :to_s
  alias_method :to_plain_text, :to_s

  # Appends <i>str</i> to the stored content in place.
  def <<(str)
    self.content << str
  end

  # Appends the content (raw, not unescaped) to <i>out</i>, or whatever
  # if_output chooses to preserve instead.
  def output(out, opts = {})
    out << if_output(opts) { content.to_s }
  end
end
151
+
152
# A CDATA section.
class CData
  def initialize(content)
    self.content = content
  end

  # All textual views of a CDATA section are its content, unchanged.
  alias_method :to_s, :content
  alias_method :to_plain_text, :content
  alias_method :inner_text, :content

  # The section rebuilt from its content.
  def raw_string
    "<![CDATA[#{content}]]>"
  end

  # Appends the section markup to <i>out</i> (subject to if_output).
  def output(out, opts = {})
    out << if_output(opts) { "<![CDATA[#{content}]]>" }
  end
end
165
+
166
# The <?xml ...?> declaration.
class XMLDecl
  def pathname; "xmldecl()" end

  # Appends the declaration to <i>out</i> (subject to if_output).  The
  # encoding pseudo-attribute is written only when set; standalone is
  # written as "yes"/"no" whenever it is not nil.
  def output(out, opts = {})
    decl = if_output(opts) do
      parts = ["<?xml version=\"#{version}\""]
      parts << " encoding=\"#{encoding}\"" if encoding
      parts << " standalone=\"#{standalone ? 'yes' : 'no'}\"" unless standalone.nil?
      parts << "?>"
      parts.join
    end
    out << decl
  end
end
178
+
179
# A <!DOCTYPE ...> declaration.
class DocType
  # target:: the document type name.
  # pub::    public identifier, or nil.
  # sys::    system identifier, or nil.
  def initialize(target, pub, sys)
    self.target = target
    self.public_id = pub
    self.system_id = sys
  end

  def pathname; "doctype()" end

  # Appends the declaration to <i>out</i> (subject to if_output): the
  # target, then PUBLIC "id" or the SYSTEM keyword, then the quoted system
  # identifier when one is present.
  def output(out, opts = {})
    decl = if_output(opts) do
      external = public_id ? "PUBLIC \"#{public_id}\"" : "SYSTEM"
      location = system_id ? " #{html_quote(system_id)}" : ""
      "<!DOCTYPE #{target} " + external + location + ">"
    end
    out << decl
  end
end
193
+
194
# A processing instruction (<?target content?>).
class ProcIns
  def pathname; "procins()" end

  # The instruction as generated by #output with default options.
  def raw_string; output("") end

  # Appends "<?target content?>" to <i>out</i> (subject to if_output); the
  # space and content are left out when content is nil.
  def output(out, opts = {})
    instruction = if_output(opts) do
      body = content ? " #{content}" : ""
      "<?#{target}" + body + "?>"
    end
    out << instruction
  end
end
206
+
207
# A markup comment.
class Comment
  def pathname; "comment()" end

  # The comment rebuilt from its content.
  def raw_string
    "<!--#{content}-->"
  end

  # Appends the comment markup to <i>out</i> (subject to if_output).
  def output(out, opts = {})
    out << if_output(opts) { "<!--#{content}-->" }
  end
end
217
+
218
+ # :startdoc:
219
+ end
@@ -0,0 +1,164 @@
1
# Tag and attribute tables for the XHTML 1.0 Strict and Transitional
# document types.
module Hpricot

  # Tags that belong to forms / tags written without a separate end tag.
  FORM_TAGS = [ :form, :input, :select, :textarea ]
  SELF_CLOSING_TAGS = [ :base, :meta, :link, :hr, :br, :param, :img, :area, :input, :col ]

  # Common sets of attributes.
  AttrCore = [:id, :class, :style, :title]
  AttrI18n = [:lang, 'xml:lang'.intern, :dir]
  AttrEvents = [:onclick, :ondblclick, :onmousedown, :onmouseup, :onmouseover, :onmousemove,
                :onmouseout, :onkeypress, :onkeydown, :onkeyup]
  AttrFocus = [:accesskey, :tabindex, :onfocus, :onblur]
  AttrHAlign = [:align, :char, :charoff]
  AttrVAlign = [:valign]
  Attrs = AttrCore + AttrI18n + AttrEvents

  # All the tags and attributes from XHTML 1.0 Strict
  class XHTMLStrict
    class << self
      # tags::         all tag names (keys of tagset).
      # tagset::       tag name => list of permitted attribute names.
      # forms::        the subset of tags listed in FORM_TAGS.
      # self_closing:: the subset of tags listed in SELF_CLOSING_TAGS.
      # doctype::      [public identifier, system identifier].
      attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
    end
    @doctype = ["-//W3C//DTD XHTML 1.0 Strict//EN", "DTD/xhtml1-strict.dtd"]
    @tagset = {
      :html => AttrI18n + [:id, :xmlns],
      :head => AttrI18n + [:id, :profile],
      :title => AttrI18n + [:id],
      :base => [:href, :id],
      :meta => AttrI18n + [:id, :http, :name, :content, :scheme, 'http-equiv'.intern],
      :link => Attrs + [:charset, :href, :hreflang, :type, :rel, :rev, :media],
      :style => AttrI18n + [:id, :type, :media, :title, 'xml:space'.intern],
      :script => [:id, :charset, :type, :src, :defer, 'xml:space'.intern],
      :noscript => Attrs,
      :body => Attrs + [:onload, :onunload],
      :div => Attrs,
      :p => Attrs,
      :ul => Attrs,
      :ol => Attrs,
      :li => Attrs,
      :dl => Attrs,
      :dt => Attrs,
      :dd => Attrs,
      :address => Attrs,
      :hr => Attrs,
      :pre => Attrs + ['xml:space'.intern],
      :blockquote => Attrs + [:cite],
      :ins => Attrs + [:cite, :datetime],
      :del => Attrs + [:cite, :datetime],
      :a => Attrs + AttrFocus + [:charset, :type, :name, :href, :hreflang, :rel, :rev, :shape, :coords],
      :span => Attrs,
      :bdo => AttrCore + AttrEvents + [:lang, 'xml:lang'.intern, :dir],
      :br => AttrCore,
      :em => Attrs,
      :strong => Attrs,
      :dfn => Attrs,
      :code => Attrs,
      :samp => Attrs,
      :kbd => Attrs,
      :var => Attrs,
      :cite => Attrs,
      :abbr => Attrs,
      :acronym => Attrs,
      :q => Attrs + [:cite],
      :sub => Attrs,
      :sup => Attrs,
      :tt => Attrs,
      :i => Attrs,
      :b => Attrs,
      :big => Attrs,
      :small => Attrs,
      :object => Attrs + [:declare, :classid, :codebase, :data, :type, :codetype, :archive, :standby, :height, :width, :usemap, :name, :tabindex],
      :param => [:id, :name, :value, :valuetype, :type],
      :img => Attrs + [:src, :alt, :longdesc, :height, :width, :usemap, :ismap],
      :map => AttrI18n + AttrEvents + [:id, :class, :style, :title, :name],
      :area => Attrs + AttrFocus + [:shape, :coords, :href, :nohref, :alt],
      # form takes both accept and accept-charset; the latter needs
      # String#intern because of the hyphen (same as http-equiv above).
      :form => Attrs + [:action, :method, :enctype, :onsubmit, :onreset, :accept, 'accept-charset'.intern],
      :label => Attrs + [:for, :accesskey, :onfocus, :onblur],
      :input => Attrs + AttrFocus + [:type, :name, :value, :checked, :disabled, :readonly, :size, :maxlength, :src, :alt, :usemap, :onselect, :onchange, :accept],
      :select => Attrs + [:name, :size, :multiple, :disabled, :tabindex, :onfocus, :onblur, :onchange],
      :optgroup => Attrs + [:disabled, :label],
      :option => Attrs + [:selected, :disabled, :label, :value],
      :textarea => Attrs + AttrFocus + [:name, :rows, :cols, :disabled, :readonly, :onselect, :onchange],
      :fieldset => Attrs,
      :legend => Attrs + [:accesskey],
      :button => Attrs + AttrFocus + [:name, :value, :type, :disabled],
      :table => Attrs + [:summary, :width, :border, :frame, :rules, :cellspacing, :cellpadding],
      :caption => Attrs,
      :colgroup => Attrs + AttrHAlign + AttrVAlign + [:span, :width],
      :col => Attrs + AttrHAlign + AttrVAlign + [:span, :width],
      :thead => Attrs + AttrHAlign + AttrVAlign,
      :tfoot => Attrs + AttrHAlign + AttrVAlign,
      :tbody => Attrs + AttrHAlign + AttrVAlign,
      :tr => Attrs + AttrHAlign + AttrVAlign,
      :th => Attrs + AttrHAlign + AttrVAlign + [:abbr, :axis, :headers, :scope, :rowspan, :colspan],
      :td => Attrs + AttrHAlign + AttrVAlign + [:abbr, :axis, :headers, :scope, :rowspan, :colspan],
      :h1 => Attrs,
      :h2 => Attrs,
      :h3 => Attrs,
      :h4 => Attrs,
      :h5 => Attrs,
      :h6 => Attrs
    }

    @tags = @tagset.keys
    @forms = @tags & FORM_TAGS
    @self_closing = @tags & SELF_CLOSING_TAGS
  end

  # Additional tags found in XHTML 1.0 Transitional
  class XHTMLTransitional
    class << self
      # Same accessors as XHTMLStrict.
      attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
    end
    @doctype = ["-//W3C//DTD XHTML 1.0 Transitional//EN", "DTD/xhtml1-transitional.dtd"]
    # merge returns a new Hash, so the Strict table itself is not modified.
    @tagset = XHTMLStrict.tagset.merge(
      :strike => Attrs,
      :center => Attrs,
      :dir => Attrs + [:compact],
      :noframes => Attrs,
      :basefont => [:id, :size, :color, :face],
      :u => Attrs,
      :menu => Attrs + [:compact],
      :iframe => AttrCore + [:longdesc, :name, :src, :frameborder, :marginwidth, :marginheight, :scrolling, :align, :height, :width],
      :font => AttrCore + AttrI18n + [:size, :color, :face],
      :s => Attrs,
      :applet => AttrCore + [:codebase, :archive, :code, :object, :alt, :name, :width, :height, :align, :hspace, :vspace],
      :isindex => AttrCore + AttrI18n + [:prompt]
    )

    # Additional attributes found in XHTML 1.0 Transitional
    { :script => [:language],
      :a => [:target],
      :td => [:bgcolor, :nowrap, :width, :height],
      :p => [:align],
      :h5 => [:align],
      :h3 => [:align],
      :li => [:type, :value],
      :div => [:align],
      :pre => [:width],
      :body => [:background, :bgcolor, :text, :link, :vlink, :alink],
      :ol => [:type, :compact, :start],
      :h4 => [:align],
      :h2 => [:align],
      :object => [:align, :border, :hspace, :vspace],
      :img => [:name, :align, :border, :hspace, :vspace],
      :link => [:target],
      :legend => [:align],
      :dl => [:compact],
      :input => [:align],
      :h6 => [:align],
      :hr => [:align, :noshade, :size, :width],
      :base => [:target],
      :ul => [:type, :compact],
      :br => [:clear],
      :form => [:name, :target],
      :area => [:target],
      :h1 => [:align]
    }.each do |k, v|
      # += builds a new array, leaving the arrays shared with XHTMLStrict
      # (and the Attrs constant) untouched.
      @tagset[k] += v
    end

    @tags = @tagset.keys
    @forms = @tags & FORM_TAGS
    @self_closing = @tags & SELF_CLOSING_TAGS
  end

end