martiantim-hpricot 0.8.236

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. data/CHANGELOG +75 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +260 -0
  5. data/ext/fast_xs/FastXsService.java +1018 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +201 -0
  8. data/ext/hpricot_scan/HpricotScanService.java +1305 -0
  9. data/ext/hpricot_scan/extconf.rb +6 -0
  10. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  11. data/ext/hpricot_scan/hpricot_css.c +3502 -0
  12. data/ext/hpricot_scan/hpricot_scan.c +6776 -0
  13. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  14. data/ext/hpricot_scan/hpricot_scan.java.rl +373 -0
  15. data/ext/hpricot_scan/hpricot_scan.rl +793 -0
  16. data/extras/mingw-rbconfig.rb +176 -0
  17. data/lib/hpricot.rb +26 -0
  18. data/lib/hpricot/blankslate.rb +63 -0
  19. data/lib/hpricot/builder.rb +216 -0
  20. data/lib/hpricot/elements.rb +510 -0
  21. data/lib/hpricot/htmlinfo.rb +691 -0
  22. data/lib/hpricot/inspect.rb +103 -0
  23. data/lib/hpricot/modules.rb +40 -0
  24. data/lib/hpricot/parse.rb +38 -0
  25. data/lib/hpricot/tag.rb +200 -0
  26. data/lib/hpricot/tags.rb +164 -0
  27. data/lib/hpricot/traverse.rb +838 -0
  28. data/lib/hpricot/xchar.rb +94 -0
  29. data/test/files/basic.xhtml +17 -0
  30. data/test/files/boingboing.html +2266 -0
  31. data/test/files/cy0.html +3653 -0
  32. data/test/files/immob.html +400 -0
  33. data/test/files/pace_application.html +1320 -0
  34. data/test/files/tenderlove.html +16 -0
  35. data/test/files/uswebgen.html +220 -0
  36. data/test/files/utf8.html +1054 -0
  37. data/test/files/week9.html +1723 -0
  38. data/test/files/why.xml +19 -0
  39. data/test/load_files.rb +7 -0
  40. data/test/test_alter.rb +77 -0
  41. data/test/test_builder.rb +37 -0
  42. data/test/test_parser.rb +420 -0
  43. data/test/test_paths.rb +25 -0
  44. data/test/test_preserved.rb +70 -0
  45. data/test/test_xml.rb +28 -0
  46. metadata +107 -0
@@ -0,0 +1,103 @@
1
+ require 'pp'
2
+
3
+ module Hpricot
4
+ # :stopdoc:
5
class Elements
  # Pretty-printer hook: emits the superclass's own pretty_print output
  # wrapped in a PP object group for this object.
  def pretty_print(q)
    q.object_group(self) do
      super
    end
  end

  # #inspect reuses the pretty-printed representation.
  alias inspect pretty_print_inspect
end
11
+
12
class Doc
  # Pretty-printer hook: prints the document as a PP object group that
  # contains every child node, each preceded by a breakable separator.
  # Nothing is printed inside the group when +children+ is nil.
  def pretty_print(q)
    q.object_group(self) do
      if children
        children.each do |node|
          q.breakable
          q.pp node
        end
      end
    end
  end

  # #inspect reuses the pretty-printed representation.
  alias inspect pretty_print_inspect
end
18
+
19
module Leaf
  # Pretty-printer hook for leaf nodes. Prints "{kind ...}" where kind is the
  # lowercased class name with any namespace prefix removed. When raw_string
  # is set, each of its lines is pretty-printed after a breakable separator;
  # otherwise the node's to_s text is printed.
  def pretty_print(q)
    kind = self.class.name.sub(/.*::/,'').downcase
    q.group(1, '{', '}') do
      q.text kind
      raw = raw_string
      if raw
        # Walk the raw text line by line, keeping each line terminator
        # attached to its line.
        raw.scan(/[^\r\n]*(?:\r\n?|\n|[^\r\n]\z)/) do |line|
          q.breakable
          q.pp line
        end
      elsif respond_to? :to_s
        q.breakable
        q.text to_s
      end
    end
  end

  # #inspect reuses the pretty-printed representation.
  alias inspect pretty_print_inspect
end
36
+
37
class Elem
  # Pretty-printer hook. An element for which empty? is true prints as
  # "{emptyelem <stag>}"; any other element prints as "{elem <stag> ...}"
  # followed by its children and, when present, its end tag.
  def pretty_print(q)
    if empty?
      q.group(1, '{emptyelem', '}') do
        q.breakable
        pretty_print_stag q
      end
    else
      q.group(1, "{elem", "}") do
        q.breakable
        pretty_print_stag q
        kids = children
        kids.each { |node| q.breakable; q.pp node } if kids
        closing = etag
        if closing
          q.breakable
          q.text closing
        end
      end
    end
  end

  # Prints the start tag as "<name attr="value" ...>". Attribute values are
  # passed through Hpricot.uxs; attributes with no value print as a bare name.
  def pretty_print_stag(q)
    q.group(1, '<', '>') do
      q.text name

      attrs = raw_attributes
      if attrs
        attrs.each do |attr_name, raw_value|
          q.breakable
          q.text(raw_value ? "#{attr_name}=\"#{Hpricot.uxs(raw_value)}\"" : attr_name)
        end
      end
    end
  end

  # #inspect reuses the pretty-printed representation.
  alias inspect pretty_print_inspect
end
73
+
74
class ETag
  # Pretty-printer hook: prints the end tag as "</name>".
  def pretty_print(q)
    q.group(1, '</', '>') do
      q.text name
    end
  end

  # #inspect reuses the pretty-printed representation.
  alias inspect pretty_print_inspect
end
82
+
83
class Text
  # Pretty-printer hook: prints the node's content in its String#dump form
  # (double-quoted, with special characters escaped).
  def pretty_print(q)
    dumped = content.dump
    q.text(dumped)
  end
end
88
+
89
class BogusETag
  # Pretty-printer hook: prints "{bogusetag <text>}", where the label is the
  # lowercased class name with any namespace prefix removed and <text> is
  # raw_string when available, or a reconstructed "</name>" otherwise.
  #
  # The breakable separator is emitted in both cases so the label and the
  # tag text are never glued together.
  def pretty_print(q)
    q.group(1, '{', '}') do
      q.text self.class.name.sub(/.*::/,'').downcase
      q.breakable
      if rs = raw_string
        q.text rs
      else
        q.text "</#{name}>"
      end
    end
  end
end
102
+ # :startdoc:
103
+ end
@@ -0,0 +1,40 @@
1
# Wires up the mixin hierarchy of the Hpricot classes. The order of the
# include calls is significant: it determines each class's ancestor chain.
module Hpricot
  class Name
    include Hpricot
  end

  class Context
    include Hpricot
  end

  # :stopdoc:
  module Tag
    include Hpricot
  end

  class ETag
    include Tag
  end
  # :startdoc:

  module Node
    include Hpricot
  end

  class ETag
    include Node
  end

  # Nodes that hold children.
  module Container
    include Node
  end

  class Doc
    include Container
  end

  class Elem
    include Container
  end

  # Nodes without children.
  module Leaf
    include Node
  end

  class CData
    include Leaf
  end

  class Text
    include Leaf
  end

  class XMLDecl
    include Leaf
  end

  class DocType
    include Leaf
  end

  class ProcIns
    include Leaf
  end

  class Comment
    include Leaf
  end

  class BogusETag
    include Leaf
  end

  module Traverse
  end

  module Container::Trav
    include Traverse
  end

  module Leaf::Trav
    include Traverse
  end

  # Every node class gets its own nested Trav module, built on the
  # container or leaf flavour, and mixes it in.
  class Doc
    module Trav; include Container::Trav end
    include Trav
  end

  class Elem
    module Trav; include Container::Trav end
    include Trav
  end

  class CData
    module Trav; include Leaf::Trav end
    include Trav
  end

  class Text
    module Trav; include Leaf::Trav end
    include Trav
  end

  class XMLDecl
    module Trav; include Leaf::Trav end
    include Trav
  end

  class DocType
    module Trav; include Leaf::Trav end
    include Trav
  end

  class ProcIns
    module Trav; include Leaf::Trav end
    include Trav
  end

  class Comment
    module Trav; include Leaf::Trav end
    include Trav
  end

  class BogusETag
    module Trav; include Leaf::Trav end
    include Trav
  end

  class Error < StandardError; end
end
40
+
@@ -0,0 +1,38 @@
1
+ require 'hpricot/htmlinfo'
2
+
3
# Global shortcut: Hpricot(input, opts) { ... } forwards its arguments and
# block unchanged to Hpricot.make and returns whatever that returns.
def Hpricot(input = nil, opts = {}, &blk)
  Hpricot.make input, opts, &blk
end
6
+
7
module Hpricot
  # Exception class used for any errors related to deficiencies in the system when
  # handling the character encodings of a document.
  class EncodingError < StandardError; end

  # Hpricot.parse parses <i>input</i> and returns a document tree
  # represented by Hpricot::Doc.
  def Hpricot.parse(input = nil, opts = {}, &blk)
    make(input, opts, &blk)
  end

  # Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
  # and returning a document tree.
  #
  # The caller's <i>opts</i> hash is never modified: a copy with
  # <tt>:xml => true</tt> added is handed to Hpricot.make, so frozen or
  # shared option hashes can be passed safely.
  def Hpricot.XML(input = nil, opts = {}, &blk)
    make(input, opts.merge(:xml => true), &blk)
  end

  # :stopdoc:

  # Common back end of Hpricot(), Hpricot.parse and Hpricot.XML. With a block,
  # the document is produced by Hpricot.build and <i>opts</i> is stored on it
  # as @options; without one, <i>input</i> and <i>opts</i> go to Hpricot.scan.
  def Hpricot.make(input = nil, opts = {}, &blk)
    if blk
      doc = Hpricot.build(&blk)
      doc.instance_variable_set("@options", opts)
      doc
    else
      Hpricot.scan(input, opts)
    end
  end

  # :startdoc:
end
@@ -0,0 +1,200 @@
1
+ module Hpricot
2
+ # :stopdoc:
3
+
4
class Doc
  # Asks every child to write itself to +out+ (passing +opts+ along) and
  # returns +out+. A nil +children+ writes nothing.
  def output(out, opts = {})
    if children
      children.each { |node| node.output(out, opts) }
    end
    out
  end

  # Runs Hpricot.make on +input+ (or the block) with this document's
  # @options and returns the children of the result.
  def make(input = nil, &blk)
    built = Hpricot.make(input, @options, &blk)
    built.children
  end

  # Intentionally does nothing for documents.
  def altered!; end

  # Concatenates the inspect_tree output of all children; returns nil when
  # +children+ is nil.
  def inspect_tree
    return unless children
    children.map { |node| node.inspect_tree }.join
  end
end
19
+
20
module Node
  # Wraps +str+ in double quotes for use as an attribute value or quoted
  # identifier in markup. Literal double quotes inside +str+ are written as
  # the &quot; entity: a backslash is not an escape character in HTML/XML,
  # so backslash-escaping would end the value early when parsed.
  def html_quote(str)
    "\"" + str.gsub('"', '&quot;') + "\""
  end

  # Hook called by altered!; does nothing by default.
  def clear_raw; end

  # Returns raw_string when opts[:preserve] is set and raw_string is not
  # nil; otherwise yields +opts+ and returns the block's value.
  def if_output(opts)
    if opts[:preserve] and not raw_string.nil?
      raw_string
    else
      yield opts
    end
  end

  # Path component for this node; defaults to the node's name.
  def pathname; self.name end

  # Signals that the node was modified by invoking clear_raw.
  def altered!
    clear_raw
  end

  # One-line tree dump: +depth+ spaces of indentation followed by the
  # lowercased, unqualified class name and a newline.
  def inspect_tree(depth = 0)
    %{#{" " * depth}} + self.class.name.split(/::/).last.downcase + "\n"
  end
end
40
+
41
class Elem
  # Stores the tag name, raw attributes, children and end tag through the
  # corresponding writers.
  def initialize(tag, attrs = nil, children = nil, etag = nil)
    self.name = tag
    self.raw_attributes = attrs
    self.children = children
    self.etag = etag
  end

  # True when the element has no children (nil or an empty collection).
  def empty?
    children.nil? || children.empty?
  end

  # Returns a fresh Hash of attribute names to values passed through
  # Hpricot.uxs; an empty Hash when there are no raw attributes.
  def attributes
    return {} unless raw_attributes
    decoded = {}
    raw_attributes.each { |key, raw| decoded[key] = Hpricot.uxs(raw) }
    decoded
  end

  # Plain-text rendering with special cases for br, p, a[href] and
  # img[src]; every other element defers to the inherited implementation.
  def to_plain_text
    tag = self.name
    if tag == 'br'
      "\n"
    elsif tag == 'p'
      "\n\n" + super + "\n\n"
    elsif tag == 'a' && has_attribute?('href')
      "#{super} [#{self['href']}]"
    elsif tag == 'img' && has_attribute?('src')
      "[img:#{self['src']}]"
    else
      super
    end
  end

  def pathname
    name
  end

  # Writes the start tag, every child, and the end tag to +out+; returns
  # +out+. With opts[:preserve], the stored end tag is written as-is (or
  # omitted when there is none); otherwise "</name>" is generated whenever
  # an end tag exists or the element has children.
  def output(out, opts = {})
    start_tag = if_output(opts) do
      head = "<#{name}#{attributes_as_html}"
      head << " /" if empty? && !etag   # self-closing form
      head << ">"
    end
    out << start_tag

    kids = children
    kids.each { |node| node.output(out, opts) } if kids

    if opts[:preserve]
      closing = etag
      out << closing if closing
    elsif etag || !empty?
      out << "</#{name}>"
    end
    out
  end

  # Serializes the raw attributes as ' name="value"' pairs (bare ' name'
  # when the value is nil), joined together. Returns nil when there are no
  # raw attributes.
  def attributes_as_html
    return unless raw_attributes
    raw_attributes.map { |aname, aval|
      aval ? " #{aname}=#{html_quote aval}" : " #{aname}"
    }.join
  end

  # Tree dump: this element's name indented by +depth+ spaces, followed by
  # the dumps of its children one level deeper.
  def inspect_tree(depth = 0)
    tree = (" " * depth) + name + "\n"
    tree += children.map { |kid| kid.inspect_tree(depth + 1) }.join if children
    tree
  end
end
101
+
102
class BogusETag
  # Stores the tag name through the +name+ writer.
  def initialize(name)
    self.name = name
  end

  # Appends the result of if_output to +out+; the fallback text is the
  # reconstructed end tag "</name>".
  def output(out, opts = {})
    rendered = if_output(opts) { "</#{name}>" }
    out << rendered
  end
end
111
+
112
class ETag < BogusETag
  # Appends the result of if_output to +out+; the fallback text is the
  # empty string, so nothing is generated for the tag itself.
  def output(out, opts = {})
    rendered = if_output(opts) { '' }
    out << rendered
  end
end
115
+
116
class Text
  # Stores the text through the +content+ writer.
  def initialize(content)
    self.content = content
  end

  def pathname
    "text()"
  end

  # The content passed through Hpricot.uxs.
  def to_s
    Hpricot.uxs(content)
  end
  alias_method :inner_text, :to_s
  alias_method :to_plain_text, :to_s

  # Appends +str+ to the stored content in place.
  def <<(str)
    self.content << str
  end

  # Appends the result of if_output to +out+; the fallback text is the
  # content converted with to_s (not passed through Hpricot.uxs).
  def output(out, opts = {})
    rendered = if_output(opts) { content.to_s }
    out << rendered
  end
end
132
+
133
class CData
  # Stores the section text through the +content+ writer.
  def initialize(content)
    self.content = content
  end

  # All textual views of a CDATA section are its content, unmodified.
  alias_method :to_s, :content
  alias_method :to_plain_text, :content
  alias_method :inner_text, :content

  # The section as it appears in markup.
  def raw_string
    "<![CDATA[#{content}]]>"
  end

  # Appends the result of if_output to +out+; the fallback text is the
  # content wrapped in CDATA delimiters.
  def output(out, opts = {})
    rendered = if_output(opts) { "<![CDATA[#{content}]]>" }
    out << rendered
  end
end
146
+
147
class XMLDecl
  def pathname
    "xmldecl()"
  end

  # Appends the result of if_output to +out+. The fallback text is an XML
  # declaration with the version, the encoding when one is set, and
  # standalone="yes"/"no" when +standalone+ is not nil.
  def output(out, opts = {})
    decl = if_output(opts) do
      text = "<?xml version=\"#{version}\""
      text << " encoding=\"#{encoding}\"" if encoding
      text << " standalone=\"#{standalone ? 'yes' : 'no'}\"" unless standalone.nil?
      text << "?>"
    end
    out << decl
  end
end
159
+
160
class DocType
  # Stores the root element name (+target+) and the public and system
  # identifiers through their writers.
  def initialize target, pub, sys
    self.target, self.public_id, self.system_id = target, pub, sys
  end

  def pathname; "doctype()" end

  # Appends the result of if_output to +out+. The fallback text is a
  # DOCTYPE declaration:
  #
  #   public and system id  ->  <!DOCTYPE t PUBLIC "pub" "sys">
  #   public id only        ->  <!DOCTYPE t PUBLIC "pub">
  #   system id only        ->  <!DOCTYPE t SYSTEM "sys">
  #   neither               ->  <!DOCTYPE t>
  #
  # The SYSTEM keyword is only written when a system identifier follows it,
  # so a bare doctype such as <!DOCTYPE html> is written back unchanged in
  # form rather than with a dangling keyword.
  def output(out, opts = {})
    out <<
      if_output(opts) do
        decl = "<!DOCTYPE #{target}"
        if public_id
          decl << " PUBLIC \"#{public_id}\""
        elsif system_id
          decl << " SYSTEM"
        end
        decl << " #{html_quote(system_id)}" if system_id
        decl << ">"
      end
  end
end
174
+
175
class ProcIns
  def pathname
    "procins()"
  end

  # The markup form of the instruction, produced by running output with
  # default options into a new string.
  def raw_string
    output("")
  end

  # Appends the result of if_output to +out+. The fallback text is
  # "<?target content?>", or "<?target?>" when there is no content.
  def output(out, opts = {})
    instruction = if_output(opts) do
      content ? "<?#{target} #{content}?>" : "<?#{target}?>"
    end
    out << instruction
  end
end
187
+
188
class Comment
  def pathname
    "comment()"
  end

  # The comment as it appears in markup.
  def raw_string
    "<!--#{content}-->"
  end

  # Appends the result of if_output to +out+; the fallback text is the
  # content wrapped in comment delimiters.
  def output(out, opts = {})
    out << if_output(opts) { "<!--#{content}-->" }
  end
end
198
+
199
+ # :startdoc:
200
+ end
@@ -0,0 +1,164 @@
1
module Hpricot

  # Tag names that are intersected with each tag set to produce its
  # +forms+ and +self_closing+ lists.
  FORM_TAGS = [ :form, :input, :select, :textarea ]
  SELF_CLOSING_TAGS = [ :base, :meta, :link, :hr, :br, :param, :img, :area, :input, :col ]

  # Common sets of attributes.
  AttrCore = [:id, :class, :style, :title]
  AttrI18n = [:lang, 'xml:lang'.intern, :dir]
  AttrEvents = [:onclick, :ondblclick, :onmousedown, :onmouseup, :onmouseover, :onmousemove,
                :onmouseout, :onkeypress, :onkeydown, :onkeyup]
  AttrFocus = [:accesskey, :tabindex, :onfocus, :onblur]
  AttrHAlign = [:align, :char, :charoff]
  AttrVAlign = [:valign]
  Attrs = AttrCore + AttrI18n + AttrEvents

  # All the tags and attributes from XHTML 1.0 Strict
  #
  # Class-level accessors:
  # +doctype+:: two-element array holding the public and system identifiers
  # +tagset+:: Hash mapping each tag name to the array of its attribute names
  # +tags+:: the keys of +tagset+
  # +forms+:: the tags that also appear in FORM_TAGS
  # +self_closing+:: the tags that also appear in SELF_CLOSING_TAGS
  class XHTMLStrict
    class << self
      attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
    end
    @doctype = ["-//W3C//DTD XHTML 1.0 Strict//EN", "DTD/xhtml1-strict.dtd"]
    @tagset = {
      :html => AttrI18n + [:id, :xmlns],
      :head => AttrI18n + [:id, :profile],
      :title => AttrI18n + [:id],
      :base => [:href, :id],
      :meta => AttrI18n + [:id, :http, :name, :content, :scheme, 'http-equiv'.intern],
      :link => Attrs + [:charset, :href, :hreflang, :type, :rel, :rev, :media],
      :style => AttrI18n + [:id, :type, :media, :title, 'xml:space'.intern],
      :script => [:id, :charset, :type, :src, :defer, 'xml:space'.intern],
      :noscript => Attrs,
      :body => Attrs + [:onload, :onunload],
      :div => Attrs,
      :p => Attrs,
      :ul => Attrs,
      :ol => Attrs,
      :li => Attrs,
      :dl => Attrs,
      :dt => Attrs,
      :dd => Attrs,
      :address => Attrs,
      :hr => Attrs,
      :pre => Attrs + ['xml:space'.intern],
      :blockquote => Attrs + [:cite],
      :ins => Attrs + [:cite, :datetime],
      :del => Attrs + [:cite, :datetime],
      :a => Attrs + AttrFocus + [:charset, :type, :name, :href, :hreflang, :rel, :rev, :shape, :coords],
      :span => Attrs,
      :bdo => AttrCore + AttrEvents + [:lang, 'xml:lang'.intern, :dir],
      :br => AttrCore,
      :em => Attrs,
      :strong => Attrs,
      :dfn => Attrs,
      :code => Attrs,
      :samp => Attrs,
      :kbd => Attrs,
      :var => Attrs,
      :cite => Attrs,
      :abbr => Attrs,
      :acronym => Attrs,
      :q => Attrs + [:cite],
      :sub => Attrs,
      :sup => Attrs,
      :tt => Attrs,
      :i => Attrs,
      :b => Attrs,
      :big => Attrs,
      :small => Attrs,
      :object => Attrs + [:declare, :classid, :codebase, :data, :type, :codetype, :archive, :standby, :height, :width, :usemap, :name, :tabindex],
      :param => [:id, :name, :value, :valuetype, :type],
      :img => Attrs + [:src, :alt, :longdesc, :height, :width, :usemap, :ismap],
      :map => AttrI18n + AttrEvents + [:id, :class, :style, :title, :name],
      :area => Attrs + AttrFocus + [:shape, :coords, :href, :nohref, :alt],
      # The DTD lists both "accept" and "accept-charset" for form; the
      # latter needs String#intern because of the hyphen.
      :form => Attrs + [:action, :method, :enctype, :onsubmit, :onreset, :accept, 'accept-charset'.intern],
      :label => Attrs + [:for, :accesskey, :onfocus, :onblur],
      :input => Attrs + AttrFocus + [:type, :name, :value, :checked, :disabled, :readonly, :size, :maxlength, :src, :alt, :usemap, :onselect, :onchange, :accept],
      :select => Attrs + [:name, :size, :multiple, :disabled, :tabindex, :onfocus, :onblur, :onchange],
      :optgroup => Attrs + [:disabled, :label],
      :option => Attrs + [:selected, :disabled, :label, :value],
      :textarea => Attrs + AttrFocus + [:name, :rows, :cols, :disabled, :readonly, :onselect, :onchange],
      :fieldset => Attrs,
      :legend => Attrs + [:accesskey],
      :button => Attrs + AttrFocus + [:name, :value, :type, :disabled],
      :table => Attrs + [:summary, :width, :border, :frame, :rules, :cellspacing, :cellpadding],
      :caption => Attrs,
      :colgroup => Attrs + AttrHAlign + AttrVAlign + [:span, :width],
      :col => Attrs + AttrHAlign + AttrVAlign + [:span, :width],
      :thead => Attrs + AttrHAlign + AttrVAlign,
      :tfoot => Attrs + AttrHAlign + AttrVAlign,
      :tbody => Attrs + AttrHAlign + AttrVAlign,
      :tr => Attrs + AttrHAlign + AttrVAlign,
      :th => Attrs + AttrHAlign + AttrVAlign + [:abbr, :axis, :headers, :scope, :rowspan, :colspan],
      :td => Attrs + AttrHAlign + AttrVAlign + [:abbr, :axis, :headers, :scope, :rowspan, :colspan],
      :h1 => Attrs,
      :h2 => Attrs,
      :h3 => Attrs,
      :h4 => Attrs,
      :h5 => Attrs,
      :h6 => Attrs
    }

    @tags = @tagset.keys
    @forms = @tags & FORM_TAGS
    @self_closing = @tags & SELF_CLOSING_TAGS
  end

  # Additional tags found in XHTML 1.0 Transitional
  #
  # Starts from a merged copy of the Strict tag set, adds the extra tags,
  # then extends selected tags with extra attributes. Each extension uses
  # += and so builds a new array, leaving the Strict arrays untouched.
  class XHTMLTransitional
    class << self
      attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
    end
    @doctype = ["-//W3C//DTD XHTML 1.0 Transitional//EN", "DTD/xhtml1-transitional.dtd"]
    @tagset = XHTMLStrict.tagset.merge \
      :strike => Attrs,
      :center => Attrs,
      :dir => Attrs + [:compact],
      :noframes => Attrs,
      :basefont => [:id, :size, :color, :face],
      :u => Attrs,
      :menu => Attrs + [:compact],
      :iframe => AttrCore + [:longdesc, :name, :src, :frameborder, :marginwidth, :marginheight, :scrolling, :align, :height, :width],
      :font => AttrCore + AttrI18n + [:size, :color, :face],
      :s => Attrs,
      :applet => AttrCore + [:codebase, :archive, :code, :object, :alt, :name, :width, :height, :align, :hspace, :vspace],
      :isindex => AttrCore + AttrI18n + [:prompt]

    # Additional attributes found in XHTML 1.0 Transitional
    { :script => [:language],
      :a => [:target],
      :td => [:bgcolor, :nowrap, :width, :height],
      :p => [:align],
      :h5 => [:align],
      :h3 => [:align],
      :li => [:type, :value],
      :div => [:align],
      :pre => [:width],
      :body => [:background, :bgcolor, :text, :link, :vlink, :alink],
      :ol => [:type, :compact, :start],
      :h4 => [:align],
      :h2 => [:align],
      :object => [:align, :border, :hspace, :vspace],
      :img => [:name, :align, :border, :hspace, :vspace],
      :link => [:target],
      :legend => [:align],
      :dl => [:compact],
      :input => [:align],
      :h6 => [:align],
      :hr => [:align, :noshade, :size, :width],
      :base => [:target],
      :ul => [:type, :compact],
      :br => [:clear],
      :form => [:name, :target],
      :area => [:target],
      :h1 => [:align]
    }.each do |k, v|
      @tagset[k] += v
    end

    @tags = @tagset.keys
    @forms = @tags & FORM_TAGS
    @self_closing = @tags & SELF_CLOSING_TAGS
  end

end