hpricot 0.7-x86-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/CHANGELOG +68 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +260 -0
  5. data/ext/fast_xs/FastXsService.java +1018 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +200 -0
  8. data/ext/hpricot_scan/HpricotScanService.java +1305 -0
  9. data/ext/hpricot_scan/extconf.rb +6 -0
  10. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  11. data/ext/hpricot_scan/hpricot_css.c +3502 -0
  12. data/ext/hpricot_scan/hpricot_css.rl +115 -0
  13. data/ext/hpricot_scan/hpricot_scan.c +6704 -0
  14. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  15. data/ext/hpricot_scan/hpricot_scan.java.rl +373 -0
  16. data/ext/hpricot_scan/hpricot_scan.rl +722 -0
  17. data/ext/hpricot_scan/test.rb +4 -0
  18. data/extras/mingw-rbconfig.rb +176 -0
  19. data/lib/fast_xs.so +0 -0
  20. data/lib/hpricot.rb +26 -0
  21. data/lib/hpricot/blankslate.rb +63 -0
  22. data/lib/hpricot/builder.rb +216 -0
  23. data/lib/hpricot/elements.rb +510 -0
  24. data/lib/hpricot/htmlinfo.rb +691 -0
  25. data/lib/hpricot/inspect.rb +103 -0
  26. data/lib/hpricot/modules.rb +38 -0
  27. data/lib/hpricot/parse.rb +38 -0
  28. data/lib/hpricot/tag.rb +198 -0
  29. data/lib/hpricot/tags.rb +164 -0
  30. data/lib/hpricot/traverse.rb +838 -0
  31. data/lib/hpricot/xchar.rb +94 -0
  32. data/lib/hpricot_scan.so +0 -0
  33. data/test/files/basic.xhtml +17 -0
  34. data/test/files/boingboing.html +2266 -0
  35. data/test/files/cy0.html +3653 -0
  36. data/test/files/immob.html +400 -0
  37. data/test/files/pace_application.html +1320 -0
  38. data/test/files/tenderlove.html +16 -0
  39. data/test/files/uswebgen.html +220 -0
  40. data/test/files/utf8.html +1054 -0
  41. data/test/files/week9.html +1723 -0
  42. data/test/files/why.xml +19 -0
  43. data/test/load_files.rb +7 -0
  44. data/test/nokogiri-bench.rb +64 -0
  45. data/test/test_alter.rb +77 -0
  46. data/test/test_builder.rb +37 -0
  47. data/test/test_parser.rb +409 -0
  48. data/test/test_paths.rb +25 -0
  49. data/test/test_preserved.rb +70 -0
  50. data/test/test_xml.rb +28 -0
  51. metadata +111 -0
@@ -0,0 +1,103 @@
1
+ require 'pp'
2
+
3
+ module Hpricot
4
+ # :stopdoc:
5
# Pretty-printing support for Hpricot::Elements.
class Elements
  # Wraps the inherited pretty_print output in an object group, i.e.
  # "#<ClassName ...>".
  #
  # q:: the PP printer object the output is written to.
  def pretty_print(q)
    q.object_group(self) do
      super
    end
  end

  # inspect is routed through pretty_print (requires 'pp').
  alias inspect pretty_print_inspect
end
11
+
12
# Pretty-printing support for Hpricot::Doc.
class Doc
  # Renders the document as "#<ClassName child child ...>".
  #
  # A document's +children+ may be nil (Doc#output and Doc#inspect_tree both
  # guard for that case), so a missing child list is treated as empty instead
  # of raising NoMethodError from inside #inspect.
  #
  # q:: the PP printer object the output is written to.
  def pretty_print(q)
    q.object_group(self) do
      (children || []).each do |elt|
        q.breakable
        q.pp elt
      end
    end
  end

  # inspect is routed through pretty_print (requires 'pp').
  alias inspect pretty_print_inspect
end
18
+
19
# Pretty-printing support shared by leaf nodes.
module Leaf
  # Renders the node as "{kind ...}", where kind is the downcased class name
  # with any namespace prefix removed.
  #
  # When raw_string is set, each line of it (line terminator included) is
  # pretty-printed as a separate item; otherwise the node's to_s is printed.
  #
  # q:: the PP printer object the output is written to.
  def pretty_print(q)
    q.group(1, '{', '}') do
      kind = self.class.name.sub(/.*::/,'').downcase
      q.text kind
      raw = raw_string
      if raw
        raw.scan(/[^\r\n]*(?:\r\n?|\n|[^\r\n]\z)/) do |line|
          q.breakable
          q.pp line
        end
      elsif respond_to?(:to_s)
        q.breakable
        q.text to_s
      end
    end
  end

  # inspect is routed through pretty_print (requires 'pp').
  alias inspect pretty_print_inspect
end
36
+
37
# Pretty-printing support for Hpricot::Elem.
class Elem
  # Renders "{emptyelem <stag>}" when #empty? is true, otherwise
  # "{elem <stag> child ... etag}".
  #
  # q:: the PP printer object the output is written to.
  def pretty_print(q)
    hollow = empty?
    q.group(1, hollow ? '{emptyelem' : '{elem', '}') do
      q.breakable
      pretty_print_stag q
      unless hollow
        if children
          children.each do |elt|
            q.breakable
            q.pp elt
          end
        end
        if etag
          q.breakable
          q.pp etag
        end
      end
    end
  end

  # Renders the start tag as "<name attr="value" bare_attr ...>".
  # Attribute values are passed through Hpricot.uxs before printing;
  # attributes whose value is nil/false are printed by name only.
  #
  # q:: the PP printer object the output is written to.
  def pretty_print_stag(q)
    q.group(1, '<', '>') do
      q.text name

      attrs = raw_attributes
      if attrs
        attrs.each do |n, t|
          q.breakable
          q.text(t ? "#{n}=\"#{Hpricot.uxs(t)}\"" : n)
        end
      end
    end
  end

  # inspect is routed through pretty_print (requires 'pp').
  alias inspect pretty_print_inspect
end
73
+
74
# Pretty-printing support for Hpricot::ETag.
class ETag
  # Renders the end tag as "</name>".
  #
  # q:: the PP printer object the output is written to.
  def pretty_print(q)
    q.group(1, '</', '>') do
      q.text name
    end
  end

  # inspect is routed through pretty_print (requires 'pp').
  alias inspect pretty_print_inspect
end
82
+
83
# Pretty-printing support for Hpricot::Text.
class Text
  # Prints the text content as a quoted, escaped string (String#dump).
  #
  # q:: the PP printer object the output is written to.
  def pretty_print(q)
    dumped = content.dump
    q.text dumped
  end
end
88
+
89
# Pretty-printing support for Hpricot::BogusETag.
class BogusETag
  # Renders "{kind raw}" when raw_string is set, where kind is the downcased
  # class name without namespace. Without a raw string the synthesized
  # "</name>" is appended directly after the kind, with no separator.
  #
  # q:: the PP printer object the output is written to.
  def pretty_print(q)
    q.group(1, '{', '}') do
      kind = self.class.name.sub(/.*::/,'').downcase
      q.text kind
      raw = raw_string
      if raw
        q.breakable
        q.text raw
      else
        q.text "</#{name}>"
      end
    end
  end
end
102
+ # :startdoc:
103
+ end
@@ -0,0 +1,38 @@
1
# Declares the Hpricot node classes and the mixin modules each one includes.
# Only the include relationships are set up here; no methods are defined.
module Hpricot
  # Name and Context mix in the Hpricot module itself.
  class Name
    include Hpricot
  end

  class Context
    include Hpricot
  end

  # :stopdoc:
  module Tag
    include Hpricot
  end

  class ETag
    include Tag
  end
  # :startdoc:

  # Node is the common mixin; Container and Leaf both build on it.
  module Node
    include Hpricot
  end

  module Container
    include Node
  end

  class Doc
    include Container
  end

  class Elem
    include Container
  end

  module Leaf
    include Node
  end

  class Text
    include Leaf
  end

  class XMLDecl
    include Leaf
  end

  class DocType
    include Leaf
  end

  class ProcIns
    include Leaf
  end

  class Comment
    include Leaf
  end

  class BogusETag
    include Leaf
  end

  # Traversal mixins: every node class gets its own Trav module, which in
  # turn includes the Container or Leaf flavour of Traverse.
  module Traverse
  end

  module Container::Trav
    include Traverse
  end

  module Leaf::Trav
    include Traverse
  end

  class Doc
    module Trav
      include Container::Trav
    end
    include Trav
  end

  class Elem
    module Trav
      include Container::Trav
    end
    include Trav
  end

  class CData
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class Text
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class XMLDecl
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class DocType
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class ProcIns
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class Comment
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class BogusETag
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  # Base error class for the library.
  class Error < StandardError
  end
end
38
+
@@ -0,0 +1,38 @@
1
+ require 'hpricot/htmlinfo'
2
+
3
# Convenience entry point: Hpricot(input, opts) { ... } forwards all of its
# arguments, including the optional block, to Hpricot.make and returns
# whatever that returns.
def Hpricot(input = nil, opts = {}, &blk)
  Hpricot.make(input, opts, &blk)
end
6
+
7
module Hpricot
  # Exception class used for any errors related to deficiencies in the system when
  # handling the character encodings of a document.
  class EncodingError < StandardError; end

  # Hpricot.parse parses <i>input</i> and returns a document tree
  # represented by Hpricot::Doc.
  #
  # input:: the document source, handed through to Hpricot.make.
  # opts::  options hash, handed through unchanged.
  # blk::   optional builder block; see Hpricot.make.
  def Hpricot.parse(input = nil, opts = {}, &blk)
    make(input, opts, &blk)
  end

  # Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
  # and returning a document tree.
  #
  # The <tt>:xml => true</tt> flag is merged into a copy of <i>opts</i>, so
  # the caller's hash is never modified (and may be frozen or reused for a
  # later HTML parse).
  def Hpricot.XML(input = nil, opts = {}, &blk)
    make(input, opts.merge(:xml => true), &blk)
  end

  # :stopdoc:

  # Shared implementation behind Hpricot(), Hpricot.parse and Hpricot.XML.
  #
  # With a block, the document is produced by Hpricot.build and <i>opts</i>
  # is stored on it as @options; <i>input</i> is not used in that case.
  # Without a block, <i>input</i> and <i>opts</i> are handed to Hpricot.scan.
  def Hpricot.make(input = nil, opts = {}, &blk)
    if blk
      doc = Hpricot.build(&blk)
      doc.instance_variable_set("@options", opts)
      doc
    else
      Hpricot.scan(input, opts)
    end
  end

  # :startdoc:
end
@@ -0,0 +1,198 @@
1
+ module Hpricot
2
+ # :stopdoc:
3
+
4
# Serialization helpers for Hpricot::Doc.
class Doc
  # Appends the serialized form of every child to +out+ and returns +out+.
  # A document without children leaves +out+ untouched.
  def output(out, opts = {})
    if children
      children.each { |node| node.output(out, opts) }
    end
    out
  end

  # Parses +input+ (or runs the builder block) through Hpricot.make using
  # this document's stored @options, and returns the children of the result.
  def make(input = nil, &blk)
    parsed = Hpricot.make(input, @options, &blk)
    parsed.children
  end

  # No-op for documents.
  def altered!; end

  # Concatenates the inspect_tree output of every child.
  # Returns nil when there are no children.
  def inspect_tree
    return unless children
    children.map { |node| node.inspect_tree }.join
  end
end
19
+
20
# Helpers shared by the node classes that re-serialize themselves.
class BaseEle
  # Wraps +str+ in double quotes for use as an attribute value.
  #
  # Embedded double quotes are written as the character reference
  # <tt>&quot;</tt>. HTML and XML have no backslash escape, so emitting
  # <tt>\"</tt> would terminate the value early and let the remainder of the
  # string be read as further attributes.
  def html_quote(str)
    "\"" + str.gsub('"', '&quot;') + "\""
  end

  # Returns the node's original source text when <tt>opts[:preserve]</tt> is
  # set and a raw_string is available; otherwise yields +opts+ and returns
  # the block's result (the regenerated markup).
  def if_output(opts)
    if opts[:preserve] && !raw_string.nil?
      raw_string
    else
      yield opts
    end
  end

  # Path component for this node: its name.
  def pathname; self.name end

  # Marks the node as modified by calling clear_raw (defined outside this
  # file; presumably it drops the cached raw_string -- confirm against the
  # extension source).
  def altered!
    clear_raw
  end

  # One line of an indented tree dump: +depth+ spaces followed by the
  # downcased class name (namespace stripped) and a newline.
  def inspect_tree(depth = 0)
    (" " * depth) + self.class.name.split(/::/).last.downcase + "\n"
  end
end
39
+
40
# Construction and serialization for Hpricot::Elem.
class Elem
  # tag::      element name
  # attrs::    raw (still escaped) attributes, or nil
  # children:: child nodes, or nil
  # etag::     end-tag node, or nil
  def initialize(tag, attrs = nil, children = nil, etag = nil)
    self.name = tag
    self.raw_attributes = attrs
    self.children = children
    self.etag = etag
  end

  # True when the element has no child list or an empty one.
  def empty?
    children.nil? || children.empty?
  end

  # Returns a new Hash of attribute name => value, with every raw value
  # passed through Hpricot.uxs. Empty Hash when there are no raw attributes.
  def attributes
    decoded = {}
    if raw_attributes
      raw_attributes.each do |key, value|
        decoded[key] = Hpricot.uxs(value)
      end
    end
    decoded
  end

  # Plain-text rendering with a few element-specific rules:
  # br becomes a newline, p is surrounded by blank lines, a[href] gets its
  # target appended in brackets, img[src] becomes "[img:src]"; everything
  # else uses the inherited rendering.
  def to_plain_text
    case name
    when 'br'
      "\n"
    when 'p'
      "\n\n" + super + "\n\n"
    when 'a'
      has_attribute?('href') ? "#{super} [#{self['href']}]" : super
    when 'img'
      has_attribute?('src') ? "[img:#{self['src']}]" : super
    else
      super
    end
  end

  # Path component for this node: its name.
  def pathname
    name
  end

  # Appends the element's markup to +out+ and returns +out+.
  #
  # The start tag is self-closed (" /") only when the element is empty and
  # has no end tag. An explicit end tag is written through etag; otherwise a
  # closing tag is synthesized for non-empty elements unless
  # <tt>opts[:preserve]</tt> is set.
  def output(out, opts = {})
    stag = if_output(opts) {
      closer = (empty? && !etag) ? " /" : ""
      "<#{name}#{attributes_as_html}" + closer + ">"
    }
    out << stag

    children.each { |node| node.output(out, opts) } if children

    if etag
      etag.output(out, opts)
    elsif !opts[:preserve] && !empty?
      out << if_output(opts) { "</#{name}>" }
    end
    out
  end

  # Serializes the raw attributes as ' name="value"' pairs (value-less
  # attributes as ' name'). Returns nil when there are no raw attributes.
  def attributes_as_html
    return unless raw_attributes
    raw_attributes.map { |aname, aval|
      aval ? " #{aname}=#{html_quote aval}" : " #{aname}"
    }.join
  end

  # Indented tree dump: this element's name at +depth+ spaces, followed by
  # the dump of each child one level deeper.
  def inspect_tree(depth = 0)
    header = (" " * depth) + name + "\n"
    subtree = children ? children.map { |node| node.inspect_tree(depth + 1) }.join : ""
    header + subtree
  end
end
103
+
104
# Construction and serialization for Hpricot::ETag (an end tag).
class ETag
  # name:: the tag name this end tag closes.
  def initialize(name)
    self.name = name
  end

  # Appends "</name>" to +out+, or the preserved source text when
  # if_output decides to keep it. Returns the result of <tt>out <<</tt>.
  def output(out, opts = {})
    closing = if_output(opts) { "</#{name}>" }
    out << closing
  end
end
113
+
114
# Serialization for Hpricot::BogusETag.
class BogusETag
  # Appends nothing (an empty string) to +out+ unless if_output returns the
  # preserved source text instead. Returns the result of <tt>out <<</tt>.
  def output(out, opts = {})
    emitted = if_output(opts) { '' }
    out << emitted
  end
end
117
+
118
# Construction and serialization for Hpricot::Text.
class Text
  # content:: the text content as it is stored on the node.
  def initialize(content)
    self.content = content
  end

  # Path component used for text nodes.
  def pathname
    "text()"
  end

  # The content passed through Hpricot.uxs.
  def to_s
    Hpricot.uxs(content)
  end
  alias_method :inner_text, :to_s
  alias_method :to_plain_text, :to_s

  # Appends +str+ to the stored content in place.
  def <<(str)
    self.content << str
  end

  # Appends the stored content (as a String) to +out+, or the preserved
  # source text when if_output decides to keep it.
  def output(out, opts = {})
    text = if_output(opts) { content.to_s }
    out << text
  end
end
134
+
135
# Construction and serialization for Hpricot::CData.
class CData
  # content:: the text held inside the CDATA section.
  def initialize(content)
    self.content = content
  end

  # Both string conversions return the content as stored. The +content+
  # reader is not defined in this file and must already exist when these
  # aliases are created.
  alias_method :to_s, :content
  alias_method :to_plain_text, :content

  # Appends "<![CDATA[content]]>" to +out+, or the preserved source text
  # when if_output decides to keep it.
  def output(out, opts = {})
    section = if_output(opts) { "<![CDATA[#{content}]]>" }
    out << section
  end
end
146
+
147
# Serialization for Hpricot::XMLDecl.
class XMLDecl
  # Path component used for XML declarations.
  def pathname
    "xmldecl()"
  end

  # Appends the XML declaration to +out+, or the preserved source text when
  # if_output decides to keep it.
  #
  # The encoding pseudo-attribute is written only when encoding is set;
  # standalone is written as "yes"/"no" whenever it is not nil.
  def output(out, opts = {})
    decl = if_output(opts) {
      text = "<?xml version=\"#{version}\""
      text += " encoding=\"#{encoding}\"" if encoding
      text += " standalone=\"#{standalone ? 'yes' : 'no'}\"" unless standalone.nil?
      text + "?>"
    }
    out << decl
  end
end
159
+
160
# Construction and serialization for Hpricot::DocType.
class DocType
  # target:: the document type name (e.g. "html")
  # pub::    the public identifier, or nil
  # sys::    the system identifier, or nil
  def initialize(target, pub, sys)
    self.target, self.public_id, self.system_id = target, pub, sys
  end

  # Path component used for doctype nodes.
  def pathname; "doctype()" end

  # Appends the DOCTYPE declaration to +out+, or the preserved source text
  # when if_output decides to keep it.
  #
  # * public_id set:       <!DOCTYPE target PUBLIC "pub" ["sys"]>
  # * only system_id set:  <!DOCTYPE target SYSTEM "sys">
  # * neither set:         <!DOCTYPE target>
  #
  # The SYSTEM keyword is only written together with a system identifier;
  # a bare "SYSTEM" with no literal after it is not a valid external ID.
  def output(out, opts = {})
    out <<
      if_output(opts) do
        external_id =
          if public_id
            " PUBLIC \"#{public_id}\"" +
              (system_id ? " #{html_quote(system_id)}" : "")
          elsif system_id
            " SYSTEM #{html_quote(system_id)}"
          else
            ""
          end
        "<!DOCTYPE #{target}" + external_id + ">"
      end
  end
end
174
+
175
# Serialization for Hpricot::ProcIns (processing instructions).
class ProcIns
  # Path component used for processing instructions.
  def pathname
    "procins()"
  end

  # Appends "<?target content?>" to +out+ (the content and its leading space
  # are omitted when content is nil/false), or the preserved source text
  # when if_output decides to keep it.
  def output(out, opts = {})
    instruction = if_output(opts) {
      body = content ? " #{content}" : ""
      "<?#{target}" + body + "?>"
    }
    out << instruction
  end
end
186
+
187
# Serialization for Hpricot::Comment.
class Comment
  # Path component used for comments.
  def pathname
    "comment()"
  end

  # Appends "<!--content-->" to +out+, or the preserved source text when
  # if_output decides to keep it.
  def output(out, opts = {})
    markup = if_output(opts) { "<!--#{content}-->" }
    out << markup
  end
end
196
+
197
+ # :startdoc:
198
+ end
@@ -0,0 +1,164 @@
1
# Tag and attribute tables for the XHTML 1.0 Strict and Transitional
# document types.
module Hpricot

  # Tags that belong to forms, and tags that are written self-closed.
  FORM_TAGS = [ :form, :input, :select, :textarea ]
  SELF_CLOSING_TAGS = [ :base, :meta, :link, :hr, :br, :param, :img, :area, :input, :col ]

  # Common sets of attributes.
  AttrCore = [:id, :class, :style, :title]
  AttrI18n = [:lang, 'xml:lang'.intern, :dir]
  AttrEvents = [:onclick, :ondblclick, :onmousedown, :onmouseup, :onmouseover, :onmousemove,
                :onmouseout, :onkeypress, :onkeydown, :onkeyup]
  AttrFocus = [:accesskey, :tabindex, :onfocus, :onblur]
  AttrHAlign = [:align, :char, :charoff]
  AttrVAlign = [:valign]
  Attrs = AttrCore + AttrI18n + AttrEvents

  # All the tags and attributes from XHTML 1.0 Strict
  #
  # Class-level accessors:
  # tags::         all tag names (the keys of tagset)
  # tagset::       Hash of tag name => Array of permitted attribute names
  # forms::        the tags that also appear in FORM_TAGS
  # self_closing:: the tags that also appear in SELF_CLOSING_TAGS
  # doctype::      [public identifier, system identifier]
  class XHTMLStrict
    class << self
      attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
    end
    @doctype = ["-//W3C//DTD XHTML 1.0 Strict//EN", "DTD/xhtml1-strict.dtd"]
    @tagset = {
      :html => AttrI18n + [:id, :xmlns],
      :head => AttrI18n + [:id, :profile],
      :title => AttrI18n + [:id],
      :base => [:href, :id],
      # NOTE(review): :http looks like a leftover fragment of http-equiv
      # (which is listed as well); it is kept so the table stays
      # backward-compatible.
      :meta => AttrI18n + [:id, :http, :name, :content, :scheme, 'http-equiv'.intern],
      :link => Attrs + [:charset, :href, :hreflang, :type, :rel, :rev, :media],
      :style => AttrI18n + [:id, :type, :media, :title, 'xml:space'.intern],
      :script => [:id, :charset, :type, :src, :defer, 'xml:space'.intern],
      :noscript => Attrs,
      :body => Attrs + [:onload, :onunload],
      :div => Attrs,
      :p => Attrs,
      :ul => Attrs,
      :ol => Attrs,
      :li => Attrs,
      :dl => Attrs,
      :dt => Attrs,
      :dd => Attrs,
      :address => Attrs,
      :hr => Attrs,
      :pre => Attrs + ['xml:space'.intern],
      :blockquote => Attrs + [:cite],
      :ins => Attrs + [:cite, :datetime],
      :del => Attrs + [:cite, :datetime],
      :a => Attrs + AttrFocus + [:charset, :type, :name, :href, :hreflang, :rel, :rev, :shape, :coords],
      :span => Attrs,
      :bdo => AttrCore + AttrEvents + [:lang, 'xml:lang'.intern, :dir],
      :br => AttrCore,
      :em => Attrs,
      :strong => Attrs,
      :dfn => Attrs,
      :code => Attrs,
      :samp => Attrs,
      :kbd => Attrs,
      :var => Attrs,
      :cite => Attrs,
      :abbr => Attrs,
      :acronym => Attrs,
      :q => Attrs + [:cite],
      :sub => Attrs,
      :sup => Attrs,
      :tt => Attrs,
      :i => Attrs,
      :b => Attrs,
      :big => Attrs,
      :small => Attrs,
      :object => Attrs + [:declare, :classid, :codebase, :data, :type, :codetype, :archive, :standby, :height, :width, :usemap, :name, :tabindex],
      :param => [:id, :name, :value, :valuetype, :type],
      :img => Attrs + [:src, :alt, :longdesc, :height, :width, :usemap, :ismap],
      :map => AttrI18n + AttrEvents + [:id, :class, :style, :title, :name],
      :area => Attrs + AttrFocus + [:shape, :coords, :href, :nohref, :alt],
      # The form element takes both accept and accept-charset; the table
      # previously listed :accept twice and had no accept-charset entry.
      :form => Attrs + [:action, :method, :enctype, :onsubmit, :onreset, :accept, 'accept-charset'.intern],
      :label => Attrs + [:for, :accesskey, :onfocus, :onblur],
      :input => Attrs + AttrFocus + [:type, :name, :value, :checked, :disabled, :readonly, :size, :maxlength, :src, :alt, :usemap, :onselect, :onchange, :accept],
      :select => Attrs + [:name, :size, :multiple, :disabled, :tabindex, :onfocus, :onblur, :onchange],
      :optgroup => Attrs + [:disabled, :label],
      :option => Attrs + [:selected, :disabled, :label, :value],
      :textarea => Attrs + AttrFocus + [:name, :rows, :cols, :disabled, :readonly, :onselect, :onchange],
      :fieldset => Attrs,
      :legend => Attrs + [:accesskey],
      :button => Attrs + AttrFocus + [:name, :value, :type, :disabled],
      :table => Attrs + [:summary, :width, :border, :frame, :rules, :cellspacing, :cellpadding],
      :caption => Attrs,
      :colgroup => Attrs + AttrHAlign + AttrVAlign + [:span, :width],
      :col => Attrs + AttrHAlign + AttrVAlign + [:span, :width],
      :thead => Attrs + AttrHAlign + AttrVAlign,
      :tfoot => Attrs + AttrHAlign + AttrVAlign,
      :tbody => Attrs + AttrHAlign + AttrVAlign,
      :tr => Attrs + AttrHAlign + AttrVAlign,
      :th => Attrs + AttrHAlign + AttrVAlign + [:abbr, :axis, :headers, :scope, :rowspan, :colspan],
      :td => Attrs + AttrHAlign + AttrVAlign + [:abbr, :axis, :headers, :scope, :rowspan, :colspan],
      :h1 => Attrs,
      :h2 => Attrs,
      :h3 => Attrs,
      :h4 => Attrs,
      :h5 => Attrs,
      :h6 => Attrs
    }

    @tags = @tagset.keys
    @forms = @tags & FORM_TAGS
    @self_closing = @tags & SELF_CLOSING_TAGS
  end

  # Additional tags found in XHTML 1.0 Transitional
  #
  # Starts from a copy of the Strict table (Hash#merge), adds the extra
  # tags, then appends the extra attributes to existing tags. The append
  # uses += and therefore builds new arrays, leaving XHTMLStrict untouched.
  class XHTMLTransitional
    class << self
      attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
    end
    @doctype = ["-//W3C//DTD XHTML 1.0 Transitional//EN", "DTD/xhtml1-transitional.dtd"]
    @tagset = XHTMLStrict.tagset.merge(
      :strike => Attrs,
      :center => Attrs,
      :dir => Attrs + [:compact],
      :noframes => Attrs,
      :basefont => [:id, :size, :color, :face],
      :u => Attrs,
      :menu => Attrs + [:compact],
      :iframe => AttrCore + [:longdesc, :name, :src, :frameborder, :marginwidth, :marginheight, :scrolling, :align, :height, :width],
      :font => AttrCore + AttrI18n + [:size, :color, :face],
      :s => Attrs,
      :applet => AttrCore + [:codebase, :archive, :code, :object, :alt, :name, :width, :height, :align, :hspace, :vspace],
      :isindex => AttrCore + AttrI18n + [:prompt]
    )

    # Additional attributes found in XHTML 1.0 Transitional
    { :script => [:language],
      :a => [:target],
      :td => [:bgcolor, :nowrap, :width, :height],
      :p => [:align],
      :h5 => [:align],
      :h3 => [:align],
      :li => [:type, :value],
      :div => [:align],
      :pre => [:width],
      :body => [:background, :bgcolor, :text, :link, :vlink, :alink],
      :ol => [:type, :compact, :start],
      :h4 => [:align],
      :h2 => [:align],
      :object => [:align, :border, :hspace, :vspace],
      :img => [:name, :align, :border, :hspace, :vspace],
      :link => [:target],
      :legend => [:align],
      :dl => [:compact],
      :input => [:align],
      :h6 => [:align],
      :hr => [:align, :noshade, :size, :width],
      :base => [:target],
      :ul => [:type, :compact],
      :br => [:clear],
      :form => [:name, :target],
      :area => [:target],
      :h1 => [:align]
    }.each do |k, v|
      @tagset[k] += v
    end

    @tags = @tagset.keys
    @forms = @tags & FORM_TAGS
    @self_closing = @tags & SELF_CLOSING_TAGS
  end

end