hpricot 0.7-x86-mswin32

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/CHANGELOG +68 -0
  2. data/COPYING +18 -0
  3. data/README +284 -0
  4. data/Rakefile +260 -0
  5. data/ext/fast_xs/FastXsService.java +1018 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +200 -0
  8. data/ext/hpricot_scan/HpricotScanService.java +1305 -0
  9. data/ext/hpricot_scan/extconf.rb +6 -0
  10. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  11. data/ext/hpricot_scan/hpricot_css.c +3502 -0
  12. data/ext/hpricot_scan/hpricot_css.rl +115 -0
  13. data/ext/hpricot_scan/hpricot_scan.c +6704 -0
  14. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  15. data/ext/hpricot_scan/hpricot_scan.java.rl +373 -0
  16. data/ext/hpricot_scan/hpricot_scan.rl +722 -0
  17. data/ext/hpricot_scan/test.rb +4 -0
  18. data/extras/mingw-rbconfig.rb +176 -0
  19. data/lib/fast_xs.so +0 -0
  20. data/lib/hpricot.rb +26 -0
  21. data/lib/hpricot/blankslate.rb +63 -0
  22. data/lib/hpricot/builder.rb +216 -0
  23. data/lib/hpricot/elements.rb +510 -0
  24. data/lib/hpricot/htmlinfo.rb +691 -0
  25. data/lib/hpricot/inspect.rb +103 -0
  26. data/lib/hpricot/modules.rb +38 -0
  27. data/lib/hpricot/parse.rb +38 -0
  28. data/lib/hpricot/tag.rb +198 -0
  29. data/lib/hpricot/tags.rb +164 -0
  30. data/lib/hpricot/traverse.rb +838 -0
  31. data/lib/hpricot/xchar.rb +94 -0
  32. data/lib/hpricot_scan.so +0 -0
  33. data/test/files/basic.xhtml +17 -0
  34. data/test/files/boingboing.html +2266 -0
  35. data/test/files/cy0.html +3653 -0
  36. data/test/files/immob.html +400 -0
  37. data/test/files/pace_application.html +1320 -0
  38. data/test/files/tenderlove.html +16 -0
  39. data/test/files/uswebgen.html +220 -0
  40. data/test/files/utf8.html +1054 -0
  41. data/test/files/week9.html +1723 -0
  42. data/test/files/why.xml +19 -0
  43. data/test/load_files.rb +7 -0
  44. data/test/nokogiri-bench.rb +64 -0
  45. data/test/test_alter.rb +77 -0
  46. data/test/test_builder.rb +37 -0
  47. data/test/test_parser.rb +409 -0
  48. data/test/test_paths.rb +25 -0
  49. data/test/test_preserved.rb +70 -0
  50. data/test/test_xml.rb +28 -0
  51. metadata +111 -0
@@ -0,0 +1,103 @@
1
+ require 'pp'
2
+
3
module Hpricot
  # :stopdoc:

  # Pretty-printing hooks for the Hpricot node classes. Every +pretty_print+
  # below receives +q+, the printer object handed over by the +pp+ library,
  # and each +inspect+ is aliased to pp's +pretty_print_inspect+.

  class Elements
    # Wraps the inherited pretty_print output in an object group.
    def pretty_print(q)
      q.object_group(self) { super }
    end
    alias inspect pretty_print_inspect
  end

  class Doc
    # Prints every child node, each preceded by a break, inside an object group.
    def pretty_print(q)
      q.object_group(self) { children.each {|elt| q.breakable; q.pp elt } }
    end
    alias inspect pretty_print_inspect
  end

  module Leaf
    # Prints <tt>{classname ...}</tt>: the downcased, un-namespaced class name
    # followed by the raw source text split into lines, or by +to_s+ when no
    # raw text is available.
    def pretty_print(q)
      q.group(1, '{', '}') {
        q.text self.class.name.sub(/.*::/,'').downcase
        if rs = raw_string
          # One chunk per source line, line terminator included.
          rs.scan(/[^\r\n]*(?:\r\n?|\n|[^\r\n]\z)/) {|line|
            q.breakable
            q.pp line
          }
        elsif self.respond_to? :to_s
          q.breakable
          q.text self.to_s
        end
      }
    end
    alias inspect pretty_print_inspect
  end

  class Elem
    # Prints <tt>{emptyelem <tag ...>}</tt> for empty elements, otherwise
    # <tt>{elem <tag ...> children... </tag>}</tt>.
    def pretty_print(q)
      if empty?
        q.group(1, '{emptyelem', '}') {
          q.breakable; pretty_print_stag q
        }
      else
        q.group(1, "{elem", "}") {
          q.breakable; pretty_print_stag q
          if children
            children.each {|elt| q.breakable; q.pp elt }
          end
          if etag
            q.breakable; q.pp etag
          end
        }
      end
    end

    # Prints the start tag: the element name followed by each raw attribute.
    # Attributes with a value are shown as <tt>name="value"</tt> (the value is
    # passed through Hpricot.uxs); valueless attributes are shown bare.
    def pretty_print_stag(q)
      q.group(1, '<', '>') {
        q.text name

        if raw_attributes
          raw_attributes.each {|n, t|
            q.breakable
            if t
              q.text "#{n}=\"#{Hpricot.uxs(t)}\""
            else
              q.text n
            end
          }
        end
      }
    end
    alias inspect pretty_print_inspect
  end

  class ETag
    # Prints the end tag as <tt></name></tt>.
    def pretty_print(q)
      q.group(1, '</', '>') {
        q.text name
      }
    end
    alias inspect pretty_print_inspect
  end

  class Text
    # Prints the text content as an escaped, quoted string (String#dump).
    def pretty_print(q)
      q.text content.dump
    end
  end

  class BogusETag
    # Prints <tt>{bogusetag ...}</tt> followed by the raw source text, or by a
    # reconstructed <tt></name></tt> when no raw text is available. A break
    # separates the label from the tag text in both cases.
    def pretty_print(q)
      q.group(1, '{', '}') {
        q.text self.class.name.sub(/.*::/,'').downcase
        q.breakable
        q.text(raw_string || "</#{name}>")
      }
    end
  end
  # :startdoc:
end
@@ -0,0 +1,38 @@
1
# Declares the Hpricot node classes/modules and wires up their mixins.
# Statement order matters here because it determines each class's ancestors.
module Hpricot
  class Name
    include Hpricot
  end

  class Context
    include Hpricot
  end

  # :stopdoc:
  module Tag
    include Hpricot
  end

  class ETag
    include Tag
  end
  # :startdoc:

  # Node is the common mixin; Container and Leaf specialise it.
  module Node
    include Hpricot
  end

  module Container
    include Node
  end

  class Doc
    include Container
  end

  class Elem
    include Container
  end

  module Leaf
    include Node
  end

  class Text
    include Leaf
  end

  class XMLDecl
    include Leaf
  end

  class DocType
    include Leaf
  end

  class ProcIns
    include Leaf
  end

  class Comment
    include Leaf
  end

  class BogusETag
    include Leaf
  end

  # Traversal mixins: every concrete class gets its own Trav module, which
  # pulls in either Container::Trav or Leaf::Trav (both include Traverse).
  module Traverse
  end

  module Container::Trav
    include Traverse
  end

  module Leaf::Trav
    include Traverse
  end

  class Doc
    module Trav
      include Container::Trav
    end
    include Trav
  end

  class Elem
    module Trav
      include Container::Trav
    end
    include Trav
  end

  class CData
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class Text
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class XMLDecl
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class DocType
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class ProcIns
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class Comment
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class BogusETag
    module Trav
      include Leaf::Trav
    end
    include Trav
  end

  class Error < StandardError
  end
end
38
+
@@ -0,0 +1,38 @@
1
+ require 'hpricot/htmlinfo'
2
+
3
# Top-level convenience entry point: <tt>Hpricot(input, opts)</tt>, with or
# without a block, hands all of its arguments straight to Hpricot.make.
def Hpricot(input = nil, opts = {}, &block)
  Hpricot.make(input, opts, &block)
end
6
+
7
module Hpricot
  # Exception class used for any errors related to deficiencies in the system when
  # handling the character encodings of a document.
  class EncodingError < StandardError; end

  # Hpricot.parse parses <i>input</i> and returns a document tree
  # represented by Hpricot::Doc.
  def Hpricot.parse(input = nil, opts = {}, &blk)
    make(input, opts, &blk)
  end

  # Hpricot::XML parses <i>input</i>, disregarding all the HTML rules
  # and returning a document tree.
  #
  # The <tt>:xml => true</tt> flag is merged into a copy of <i>opts</i>, so the
  # hash passed in by the caller is never modified (and may be frozen).
  def Hpricot.XML(input = nil, opts = {}, &blk)
    make(input, opts.merge(:xml => true), &blk)
  end

  # :stopdoc:

  # Shared implementation behind Hpricot(), Hpricot.parse and Hpricot.XML.
  # With a block, the document is produced by Hpricot.build and <i>opts</i> is
  # stored on it as @options; without one, <i>input</i> is handed to
  # Hpricot.scan together with <i>opts</i>.
  def Hpricot.make(input = nil, opts = {}, &blk)
    if blk
      doc = Hpricot.build(&blk)
      doc.instance_variable_set("@options", opts)
      doc
    else
      Hpricot.scan(input, opts)
    end
  end

  # :startdoc:
end
@@ -0,0 +1,198 @@
1
+ module Hpricot
2
+ # :stopdoc:
3
+
4
class Doc
  # Serialises every child node into +out+ (anything responding to <<, as
  # used by the children) and returns +out+.
  def output(out, opts = {})
    kids = children
    kids.each { |node| node.output(out, opts) } if kids
    out
  end

  # Builds a fragment through Hpricot.make using this document's @options
  # and returns the children of the result.
  def make(input = nil, &blk)
    fragment = Hpricot.make(input, @options, &blk)
    fragment.children
  end

  # Intentionally a no-op for documents.
  def altered!
  end

  # Concatenates the inspect_tree output of every child; nil when the
  # document has no children collection.
  def inspect_tree
    kids = children
    return nil unless kids
    kids.map { |node| node.inspect_tree }.join
  end
end
19
+
20
class BaseEle
  # Wraps +str+ in double quotes for use as an attribute value (or DOCTYPE
  # literal). Embedded double quotes are written as <tt>&quot;</tt>: markup
  # has no backslash escapes, so <tt>\"</tt> would end the value early.
  def html_quote(str)
    "\"" + str.gsub('"', '&quot;') + "\""
  end

  # Returns the node's raw source text when <tt>opts[:preserve]</tt> is set
  # and raw text is available; otherwise yields +opts+ and returns whatever
  # the block generates.
  def if_output(opts)
    if opts[:preserve] && !raw_string.nil?
      raw_string
    else
      yield opts
    end
  end

  # Path component for this node: its name.
  def pathname; self.name end

  # Marks the node as modified by discarding its raw source text.
  def altered!
    clear_raw
  end

  # One line of tree output: +depth+ spaces of indentation followed by the
  # downcased, un-namespaced class name.
  def inspect_tree(depth = 0)
    %{#{" " * depth}} + self.class.name.split(/::/).last.downcase + "\n"
  end
end
39
+
40
class Elem
  # Stores the tag name, raw attribute collection, child list and end tag.
  def initialize(tag, attrs = nil, children = nil, etag = nil)
    self.name = tag
    self.raw_attributes = attrs
    self.children = children
    self.etag = etag
  end

  # True when the element has no children collection or an empty one.
  def empty?
    kids = children
    kids.nil? || kids.empty?
  end

  # Hash of attribute name => value, each raw value passed through
  # Hpricot.uxs. Empty hash when the element has no raw attributes.
  def attributes
    decoded = {}
    return decoded unless raw_attributes
    raw_attributes.each { |key, raw| decoded[key] = Hpricot.uxs(raw) }
    decoded
  end

  # Plain-text rendering with special cases for br, p, a[href] and img[src];
  # every other element falls through to the inherited implementation.
  def to_plain_text
    tag = self.name
    return "\n" if tag == 'br'
    return "\n\n" + super + "\n\n" if tag == 'p'
    return "#{super} [#{self['href']}]" if tag == 'a' && has_attribute?('href')
    return "[img:#{self['src']}]" if tag == 'img' && has_attribute?('src')
    super
  end

  # Path component for this node: its tag name.
  def pathname
    self.name
  end

  # Appends the start tag, all children and the end tag to +out+ and
  # returns +out+.
  def output(out, opts = {})
    start_tag = if_output(opts) do
      # An element with neither content nor an end tag is written self-closed.
      slash = (empty? && !etag) ? " /" : ""
      "<#{name}#{attributes_as_html}#{slash}>"
    end
    out << start_tag

    kids = children
    kids.each { |node| node.output(out, opts) } if kids

    if etag
      etag.output(out, opts)
    elsif !opts[:preserve] && !empty?
      # No parsed end tag: synthesise one unless preserving the source.
      end_tag = if_output(opts) { "</#{name}>" }
      out << end_tag
    end
    out
  end

  # Raw attributes as markup (each with a leading space); nil when the
  # element has no raw attributes.
  def attributes_as_html
    return nil unless raw_attributes
    pieces = raw_attributes.map do |aname, aval|
      aval ? " #{aname}=#{html_quote aval}" : " #{aname}"
    end
    pieces.join
  end

  # Indented tag name followed by the tree output of every child, one
  # level deeper.
  def inspect_tree(depth = 0)
    header = (" " * depth) + name + "\n"
    kids = children
    return header unless kids
    header + kids.map { |node| node.inspect_tree(depth + 1) }.join
  end
end
103
+
104
class ETag
  # Stores the tag name.
  def initialize(name)
    self.name = name
  end

  # Appends the end tag (raw text or generated <tt></name></tt>, as decided
  # by if_output) to +out+.
  def output(out, opts = {})
    closing = if_output(opts) { "</#{name}>" }
    out << closing
  end
end
113
+
114
class BogusETag
  # Appends whatever if_output returns to +out+; the generated fallback for
  # a bogus end tag is the empty string.
  def output(out, opts = {})
    emitted = if_output(opts) { '' }
    out << emitted
  end
end
117
+
118
class Text
  # Stores the text content.
  def initialize(content)
    self.content = content
  end

  # Path component used for text nodes.
  def pathname
    "text()"
  end

  # The content passed through Hpricot.uxs.
  def to_s
    Hpricot.uxs(content)
  end
  alias_method :inner_text, :to_s
  alias_method :to_plain_text, :to_s

  # Appends +str+ to the stored content.
  def <<(str)
    self.content << str
  end

  # Appends the raw text or the stored content (as decided by if_output)
  # to +out+.
  def output(out, opts = {})
    rendered = if_output(opts) { content.to_s }
    out << rendered
  end
end
134
+
135
class CData
  # Stores the CDATA content.
  def initialize(content)
    self.content = content
  end

  # Both string conversions return the content unchanged.
  alias_method :to_s, :content
  alias_method :to_plain_text, :content

  # Appends the raw text or a generated <tt><![CDATA[...]]></tt> section
  # (as decided by if_output) to +out+.
  def output(out, opts = {})
    section = if_output(opts) { "<![CDATA[#{content}]]>" }
    out << section
  end
end
146
+
147
class XMLDecl
  # Path component used for XML declarations.
  def pathname
    "xmldecl()"
  end

  # Appends the raw text or a generated <tt><?xml ...?></tt> declaration
  # (as decided by if_output) to +out+. The encoding attribute is written
  # only when set; standalone is written whenever it is not nil.
  def output(out, opts = {})
    declaration = if_output(opts) do
      encoding_attr = encoding ? " encoding=\"#{encoding}\"" : ""
      standalone_attr =
        if standalone.nil?
          ""
        else
          " standalone=\"#{standalone ? 'yes' : 'no'}\""
        end
      "<?xml version=\"#{version}\"#{encoding_attr}#{standalone_attr}?>"
    end
    out << declaration
  end
end
159
+
160
class DocType
  # Stores the doctype target and its public / system identifiers.
  def initialize(target, pub, sys)
    self.target = target
    self.public_id = pub
    self.system_id = sys
  end

  # Path component used for doctype nodes.
  def pathname
    "doctype()"
  end

  # Appends the raw text or a generated <tt><!DOCTYPE ...></tt> (as decided
  # by if_output) to +out+. Without a public id the SYSTEM keyword is used;
  # the system id, when present, is quoted with html_quote.
  def output(out, opts = {})
    declaration = if_output(opts) do
      external = public_id ? "PUBLIC \"#{public_id}\"" : "SYSTEM"
      location = system_id ? " #{html_quote(system_id)}" : ""
      "<!DOCTYPE #{target} #{external}#{location}>"
    end
    out << declaration
  end
end
174
+
175
class ProcIns
  # Path component used for processing instructions.
  def pathname
    "procins()"
  end

  # Appends the raw text or a generated <tt><?target content?></tt> (as
  # decided by if_output) to +out+; the content part is omitted when unset.
  def output(out, opts = {})
    instruction = if_output(opts) do
      body = content ? " #{content}" : ""
      "<?#{target}#{body}?>"
    end
    out << instruction
  end
end
186
+
187
class Comment
  # Path component used for comment nodes.
  def pathname
    "comment()"
  end

  # Appends the raw text or a generated <tt><!--content--></tt> (as decided
  # by if_output) to +out+.
  def output(out, opts = {})
    markup = if_output(opts) { "<!--#{content}-->" }
    out << markup
  end
end
196
+
197
+ # :startdoc:
198
+ end
@@ -0,0 +1,164 @@
1
# Tag and attribute tables for the XHTML 1.0 Strict and Transitional doctypes.
module Hpricot

  FORM_TAGS = [ :form, :input, :select, :textarea ]
  SELF_CLOSING_TAGS = [ :base, :meta, :link, :hr, :br, :param, :img, :area, :input, :col ]

  # Common sets of attributes.
  AttrCore = [:id, :class, :style, :title]
  AttrI18n = [:lang, 'xml:lang'.intern, :dir]
  AttrEvents = [:onclick, :ondblclick, :onmousedown, :onmouseup, :onmouseover, :onmousemove,
                :onmouseout, :onkeypress, :onkeydown, :onkeyup]
  AttrFocus = [:accesskey, :tabindex, :onfocus, :onblur]
  AttrHAlign = [:align, :char, :charoff]
  AttrVAlign = [:valign]
  Attrs = AttrCore + AttrI18n + AttrEvents

  # All the tags and attributes from XHTML 1.0 Strict
  #
  # Class-level readers: +tagset+ maps each tag to its list of attribute
  # names, +tags+ is the list of tag names, +forms+ and +self_closing+ are the
  # subsets of +tags+ found in FORM_TAGS and SELF_CLOSING_TAGS, and +doctype+
  # holds the public and system identifiers.
  class XHTMLStrict
    class << self
      attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
    end
    @doctype = ["-//W3C//DTD XHTML 1.0 Strict//EN", "DTD/xhtml1-strict.dtd"]
    @tagset = {
      :html => AttrI18n + [:id, :xmlns],
      :head => AttrI18n + [:id, :profile],
      :title => AttrI18n + [:id],
      :base => [:href, :id],
      :meta => AttrI18n + [:id, :http, :name, :content, :scheme, 'http-equiv'.intern],
      :link => Attrs + [:charset, :href, :hreflang, :type, :rel, :rev, :media],
      :style => AttrI18n + [:id, :type, :media, :title, 'xml:space'.intern],
      :script => [:id, :charset, :type, :src, :defer, 'xml:space'.intern],
      :noscript => Attrs,
      :body => Attrs + [:onload, :onunload],
      :div => Attrs,
      :p => Attrs,
      :ul => Attrs,
      :ol => Attrs,
      :li => Attrs,
      :dl => Attrs,
      :dt => Attrs,
      :dd => Attrs,
      :address => Attrs,
      :hr => Attrs,
      :pre => Attrs + ['xml:space'.intern],
      :blockquote => Attrs + [:cite],
      :ins => Attrs + [:cite, :datetime],
      :del => Attrs + [:cite, :datetime],
      :a => Attrs + AttrFocus + [:charset, :type, :name, :href, :hreflang, :rel, :rev, :shape, :coords],
      :span => Attrs,
      :bdo => AttrCore + AttrEvents + [:lang, 'xml:lang'.intern, :dir],
      :br => AttrCore,
      :em => Attrs,
      :strong => Attrs,
      :dfn => Attrs,
      :code => Attrs,
      :samp => Attrs,
      :kbd => Attrs,
      :var => Attrs,
      :cite => Attrs,
      :abbr => Attrs,
      :acronym => Attrs,
      :q => Attrs + [:cite],
      :sub => Attrs,
      :sup => Attrs,
      :tt => Attrs,
      :i => Attrs,
      :b => Attrs,
      :big => Attrs,
      :small => Attrs,
      :object => Attrs + [:declare, :classid, :codebase, :data, :type, :codetype, :archive, :standby, :height, :width, :usemap, :name, :tabindex],
      :param => [:id, :name, :value, :valuetype, :type],
      :img => Attrs + [:src, :alt, :longdesc, :height, :width, :usemap, :ismap],
      :map => AttrI18n + AttrEvents + [:id, :class, :style, :title, :name],
      :area => Attrs + AttrFocus + [:shape, :coords, :href, :nohref, :alt],
      # The form element takes both accept and accept-charset.
      :form => Attrs + [:action, :method, :enctype, :onsubmit, :onreset, :accept, 'accept-charset'.intern],
      :label => Attrs + [:for, :accesskey, :onfocus, :onblur],
      :input => Attrs + AttrFocus + [:type, :name, :value, :checked, :disabled, :readonly, :size, :maxlength, :src, :alt, :usemap, :onselect, :onchange, :accept],
      :select => Attrs + [:name, :size, :multiple, :disabled, :tabindex, :onfocus, :onblur, :onchange],
      :optgroup => Attrs + [:disabled, :label],
      :option => Attrs + [:selected, :disabled, :label, :value],
      :textarea => Attrs + AttrFocus + [:name, :rows, :cols, :disabled, :readonly, :onselect, :onchange],
      :fieldset => Attrs,
      :legend => Attrs + [:accesskey],
      :button => Attrs + AttrFocus + [:name, :value, :type, :disabled],
      :table => Attrs + [:summary, :width, :border, :frame, :rules, :cellspacing, :cellpadding],
      :caption => Attrs,
      :colgroup => Attrs + AttrHAlign + AttrVAlign + [:span, :width],
      :col => Attrs + AttrHAlign + AttrVAlign + [:span, :width],
      :thead => Attrs + AttrHAlign + AttrVAlign,
      :tfoot => Attrs + AttrHAlign + AttrVAlign,
      :tbody => Attrs + AttrHAlign + AttrVAlign,
      :tr => Attrs + AttrHAlign + AttrVAlign,
      :th => Attrs + AttrHAlign + AttrVAlign + [:abbr, :axis, :headers, :scope, :rowspan, :colspan],
      :td => Attrs + AttrHAlign + AttrVAlign + [:abbr, :axis, :headers, :scope, :rowspan, :colspan],
      :h1 => Attrs,
      :h2 => Attrs,
      :h3 => Attrs,
      :h4 => Attrs,
      :h5 => Attrs,
      :h6 => Attrs
    }

    @tags = @tagset.keys
    @forms = @tags & FORM_TAGS
    @self_closing = @tags & SELF_CLOSING_TAGS
  end

  # Additional tags found in XHTML 1.0 Transitional
  #
  # Starts from a merged copy of XHTMLStrict.tagset, adds the extra tags and
  # then appends the extra attributes to existing tags. Both steps build new
  # hashes/arrays, so the Strict tables are not modified.
  class XHTMLTransitional
    class << self
      attr_accessor :tags, :tagset, :forms, :self_closing, :doctype
    end
    @doctype = ["-//W3C//DTD XHTML 1.0 Transitional//EN", "DTD/xhtml1-transitional.dtd"]
    @tagset = XHTMLStrict.tagset.merge \
      :strike => Attrs,
      :center => Attrs,
      :dir => Attrs + [:compact],
      :noframes => Attrs,
      :basefont => [:id, :size, :color, :face],
      :u => Attrs,
      :menu => Attrs + [:compact],
      :iframe => AttrCore + [:longdesc, :name, :src, :frameborder, :marginwidth, :marginheight, :scrolling, :align, :height, :width],
      :font => AttrCore + AttrI18n + [:size, :color, :face],
      :s => Attrs,
      :applet => AttrCore + [:codebase, :archive, :code, :object, :alt, :name, :width, :height, :align, :hspace, :vspace],
      :isindex => AttrCore + AttrI18n + [:prompt]

    # Additional attributes found in XHTML 1.0 Transitional
    { :script => [:language],
      :a => [:target],
      :td => [:bgcolor, :nowrap, :width, :height],
      :p => [:align],
      :h5 => [:align],
      :h3 => [:align],
      :li => [:type, :value],
      :div => [:align],
      :pre => [:width],
      :body => [:background, :bgcolor, :text, :link, :vlink, :alink],
      :ol => [:type, :compact, :start],
      :h4 => [:align],
      :h2 => [:align],
      :object => [:align, :border, :hspace, :vspace],
      :img => [:name, :align, :border, :hspace, :vspace],
      :link => [:target],
      :legend => [:align],
      :dl => [:compact],
      :input => [:align],
      :h6 => [:align],
      :hr => [:align, :noshade, :size, :width],
      :base => [:target],
      :ul => [:type, :compact],
      :br => [:clear],
      :form => [:name, :target],
      :area => [:target],
      :h1 => [:align]
    }.each do |k, v|
      # += assigns a new array, leaving the array shared with Strict intact.
      @tagset[k] += v
    end

    @tags = @tagset.keys
    @forms = @tags & FORM_TAGS
    @self_closing = @tags & SELF_CLOSING_TAGS
  end

end