miketracy-wwmd 0.2.11
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +3 -0
- data/README +62 -0
- data/README.txt +62 -0
- data/Rakefile +34 -0
- data/examples/config_example.yaml +24 -0
- data/examples/wwmd_example.rb +73 -0
- data/lib/wwmd.rb +78 -0
- data/lib/wwmd/encoding.rb +40 -0
- data/lib/wwmd/form.rb +110 -0
- data/lib/wwmd/form_array.rb +273 -0
- data/lib/wwmd/guid.rb +155 -0
- data/lib/wwmd/hpricot_html2text.rb +76 -0
- data/lib/wwmd/mixins.rb +318 -0
- data/lib/wwmd/mixins_extends.rb +188 -0
- data/lib/wwmd/mixins_external.rb +18 -0
- data/lib/wwmd/nokogiri_html2text.rb +41 -0
- data/lib/wwmd/page.rb +414 -0
- data/lib/wwmd/page/auth.rb +183 -0
- data/lib/wwmd/page/config.rb +44 -0
- data/lib/wwmd/page/constants.rb +60 -0
- data/lib/wwmd/page/headers.rb +107 -0
- data/lib/wwmd/page/inputs.rb +47 -0
- data/lib/wwmd/page/irb_helpers.rb +90 -0
- data/lib/wwmd/page/scrape.rb +202 -0
- data/lib/wwmd/page/spider.rb +127 -0
- data/lib/wwmd/page/urlparse.rb +79 -0
- data/lib/wwmd/page/utils.rb +30 -0
- data/lib/wwmd/viewstate.rb +118 -0
- data/lib/wwmd/viewstate/viewstate_class_helpers.rb +35 -0
- data/lib/wwmd/viewstate/viewstate_deserializer_methods.rb +213 -0
- data/lib/wwmd/viewstate/viewstate_from_xml.rb +126 -0
- data/lib/wwmd/viewstate/viewstate_types.rb +51 -0
- data/lib/wwmd/viewstate/viewstate_utils.rb +157 -0
- data/lib/wwmd/viewstate/viewstate_yaml.rb +25 -0
- data/lib/wwmd/viewstate/vs_array.rb +36 -0
- data/lib/wwmd/viewstate/vs_binary_serialized.rb +28 -0
- data/lib/wwmd/viewstate/vs_hashtable.rb +40 -0
- data/lib/wwmd/viewstate/vs_hybrid_dict.rb +40 -0
- data/lib/wwmd/viewstate/vs_indexed_string.rb +6 -0
- data/lib/wwmd/viewstate/vs_indexed_string_ref.rb +22 -0
- data/lib/wwmd/viewstate/vs_int_enum.rb +25 -0
- data/lib/wwmd/viewstate/vs_list.rb +32 -0
- data/lib/wwmd/viewstate/vs_pair.rb +27 -0
- data/lib/wwmd/viewstate/vs_read_types.rb +11 -0
- data/lib/wwmd/viewstate/vs_read_value.rb +33 -0
- data/lib/wwmd/viewstate/vs_sparse_array.rb +56 -0
- data/lib/wwmd/viewstate/vs_string.rb +29 -0
- data/lib/wwmd/viewstate/vs_string_array.rb +37 -0
- data/lib/wwmd/viewstate/vs_string_formatted.rb +30 -0
- data/lib/wwmd/viewstate/vs_triplet.rb +29 -0
- data/lib/wwmd/viewstate/vs_type.rb +21 -0
- data/lib/wwmd/viewstate/vs_unit.rb +28 -0
- data/lib/wwmd/viewstate/vs_value.rb +33 -0
- data/spec/README +3 -0
- data/spec/form_array.spec +49 -0
- data/spec/spider_csrf_test.spec +28 -0
- data/spec/urlparse_test.spec +89 -0
- data/tasks/ann.rake +80 -0
- data/tasks/bones.rake +20 -0
- data/tasks/gem.rake +201 -0
- data/tasks/git.rake +40 -0
- data/tasks/notes.rake +27 -0
- data/tasks/post_load.rake +34 -0
- data/tasks/rdoc.rake +51 -0
- data/tasks/rubyforge.rake +55 -0
- data/tasks/setup.rb +292 -0
- data/tasks/spec.rake +54 -0
- data/tasks/test.rake +40 -0
- data/tasks/zentest.rake +36 -0
- metadata +164 -0
@@ -0,0 +1,273 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
This is a weird kind of data structure for no other reason than
|
3
|
+
I wanted to keep the form inputs in order when they come in.
|
4
|
+
|
5
|
+
Access this either as a hash or as an array (but => won't work)
|
6
|
+
|
7
|
+
Some of the methods in here are kept for backward compat before the refactor
|
8
|
+
and now everything in this array should be accessed with []= and []
|
9
|
+
=end
|
10
|
+
|
11
|
+
module WWMD
  # Ordered collection of form inputs.  Each element is a two element
  # <tt>[key, value]</tt> Array, so the order in which inputs were added is
  # preserved while <tt>form["key"]</tt> / <tt>form["key"] = value</tt> still
  # give hash-like access.  Integer (and Range) subscripts behave like a
  # plain Array and address whole <tt>[key, value]</tt> pairs.
  class FormArray < Array

    # Build a form from +fields+, which may be:
    #
    # * an Array of field objects; each object must answer <tt>['name']</tt>,
    #   <tt>['type']</tt>, +get_value+ and +to_html+
    # * a Hash of key => value pairs
    # * a String of <tt>"k=v&k2=v2"</tt> style parameters
    #
    # Any other argument (including +nil+) produces an empty form.
    def initialize(fields=nil)
      super()
      # Exact class checks are kept deliberately: subclasses of Array, Hash
      # or String have never been accepted as input here.
      if fields.instance_of?(Array)
        fields.each do |f|
          name = f['name']
          if self.name_exists(name)
            # A repeated name only replaces the stored value for hidden
            # inputs and for checkboxes whose markup mentions "checked".
            if f['type'] == "hidden"
              self.set name,f.get_value
            elsif f['type'] == "checkbox" && f.to_html.to_s =~ /checked/
              self[name] = f.get_value
            end
          else
            self << [ name,f.get_value ]
          end
        end
      elsif fields.instance_of?(Hash)
        fields.each_pair { |k,v| self[k] = v }
      elsif fields.instance_of?(String)
        fields.split("&").each do |f|
          # limit of 2 keeps any further "=" characters inside the value
          k,v = f.split("=",2)
          self[k] = v
        end
      end
    end

    # "deep enough" copy of this object: every [key, value] pair is cloned so
    # that changing a pair in the copy does not touch the original.
    def clone
      ret = self.class.new
      self.each { |r| ret << r.clone }
      return ret
    end

    # remove every pair from the form; returns self
    def clear
      super()
    end

    # check if the passed name exists (as a key) in the form
    def include?(key)
      self.any? { |pair| pair.first == key }
    end

    alias_method :name_exists, :include?#:nodoc:
    alias_method :name_exists?, :include?#:nodoc:
    alias_method :has_key?, :include?#:nodoc:

    # append a key/value pair to the form (no duplicate check); returns self
    def add(key,value)
      self << [key,value]
    end

    # blank out the value of every "__VIEWSTATE" input; returns self
    def clear_viewstate
      self.each { |k,v|
        self[k] = "" if k == "__VIEWSTATE"
      }
    end

    alias_method :extend!, :add #:nodoc (this is here for backward compat)

    # key = Integer  set value at index key
    # key = String   find every pair named key and set its value
    #
    # Returns the <tt>[key, value]</tt> pair.  A String key that is not in the
    # form changes nothing.
    def set_value!(key,value)
      if key.is_a?(Integer)
        self[key][1] = value
        return [self[key][0], value]
      end
      self.each_index do |i|
        if self[i][0] == key
          self[i] = [key,value]
        end
      end
      return [key,value]
    end

    alias_method :old_get, :[]#:nodoc:

    # Integer and Range subscripts behave exactly like Array#[] (all
    # arguments are forwarded); any other key is looked up by name and the
    # value of the first matching pair is returned.
    def [](*args)
      key = args.first
      if key.is_a?(Integer) || key.is_a?(Range)
        self.old_get(*args)
      else
        self.get_value(key)
      end
    end

    alias_method :old_set, :[]=#:nodoc:
    # set a key using its index, array key or add using a new key i.e.:
    # if setting:
    #  form = [['key','value'],['foo','bar']]
    #  form[0] = ["replacekey","newalue"]
    #  form["replacekey"] = "newervalue"
    # if adding:
    #  form["newkey"] = "value"
    #
    def []=(*args)
      key,value = args
      if key.is_a?(Integer) || key.is_a?(Range)
        return self.old_set(*args)
      elsif self.has_key?(key)
        return self.set_value(key,value)
      else
        return self.add(key,value)
      end
    end

    alias_method :set_value, :set_value!
    alias_method :set, :set_value!

    # key = Integer  return the value stored at index key
    # key = String   return the value of the first pair named key
    #
    # Returns nil when a String key is not present.
    def get_value(key)
      if key.is_a?(Integer)
        return self[key][1]
      end
      self.each_index do |i|
        if self[i][0] == key
          return self[i][1]
        end
      end
      return nil
    end

    alias_method :get, :get_value

    # set the value of every pair in the form to +value+; returns self
    def setall!(value)
      self.each_index { |i| self.set_value!(i,value) }
    end

    alias_method :setall, :setall!#:nodoc:
    alias_method :set_all!, :setall!#:nodoc:
    alias_method :set_all, :setall!#:nodoc:

    # delete all key = value pairs from self where key = key
    # (like Array#reject!, returns nil when nothing was removed)
    def delete_key(key)
      self.reject! { |x,y| x == key }
    end

    alias_method :delete_keys!, :delete_key #:nodoc:
    alias_method :delete_key!, :delete_key #:nodoc:

    # escape form keys in place (no-op returning nil when reg is :none)
    def escape_keys!(reg=WWMD::ESCAPE[:url])
      return nil if reg == :none
      self.map! { |x,y| [x.escape(reg),y] }
    end

    # unescape form keys in place (no-op returning nil when reg is :none;
    # reg is otherwise unused)
    def unescape_keys!(reg=WWMD::ESCAPE[:url])
      return nil if reg == :none
      self.map! { |x,y| [x.unescape,y] }
    end

    # escape form values in place (no-op returning nil when reg is :none)
    def escape_all!(reg=WWMD::ESCAPE[:url])
      return nil if reg == :none
      self.map! { |x,y| [x,y.escape(reg)] }
    end

    alias_method :escape_all, :escape_all!#:nodoc:

    # unescape all form values in place
    def unescape_all!
      self.map! { |x,y| [x,y.unescape] }
    end

    alias_method :unescape_all, :unescape_all!#:nodoc:

    # convert form into a post parameters string ("k=v&k2=v2")
    def to_post
      self.map { |pair| pair.join("=") }.join("&")
    end

    # convert form into a get parameters string
    #
    # pass me a base to get a full url to pass to Page.get
    def to_get(base="")
      return base.clip + "?" + self.to_post
    end

    # IRB: puts the form in human readable format
    # if you <tt>form.show(true)</tt> it will show unescaped values
    def show(unescape=false)
      self.each_index do |i|
        value = self[i][1].to_s
        value = value.unescape if unescape
        puts i.to_s + " :: " + self[i][0].to_s + " = " + value
      end
      return nil
    end

    # meh
    # (prepends empty __EVENTVALIDATION, __EVENTTARGET, __EVENTARGUMENT and
    # __VIEWSTATE inputs, in that final order)
    def add_viewstate#:nodoc:
      self.insert(0,[ "__VIEWSTATE","" ])
      self.insert(0,[ "__EVENTARGUMENT","" ])
      self.insert(0,[ "__EVENTTARGET","" ])
      self.insert(0,[ "__EVENTVALIDATION","" ])
      return nil
    end

    # alias_method, :add_state, :add_viewstate#:nodoc:

    # remove form elements with null (nil or empty) values
    def remove_nulls!
      # nil.to_s is "", so a single empty? test covers both cases
      self.delete_if { |x| x[1].to_s.empty? }
    end

    alias_method :squeeze!, :remove_nulls!

    # remove form elements with null keys (for housekeeping returns)
    def remove_null_keys!
      self.delete_if { |x,y| x.to_s.empty? }
    end

    alias_method :squeeze_keys!, :remove_null_keys!

    # dump a web page containing a csrf example of the current FormArray
    #
    # +action+ is interpolated verbatim into the form's action attribute.
    def to_csrf(action)
      ret = ""
      ret << "<html><body>\n"
      ret << "<form method='post' id='wwmdtest' name='wwmdtest' action='#{action}'>\n"
      self.each do |key,val|
        # to_s so that inputs without a value (nil) do not raise
        # NOTE(review): \' is not an HTML attribute escape (&#39; would be);
        # left unchanged here because existing output may be relied upon.
        val = val.to_s.unescape.gsub(/'/) { %q[\'] }
        ret << "<input name='#{key.to_s.unescape}' type='hidden' value='#{val}' />\n"
      end
      ret << "</form>\n"
      ret << "<script>document.wwmdtest.submit()</script>\n"
      ret << "</body></html>\n"
      return ret
    end

    # list of keys in form order (duplicates included)
    def keys
      self.map { |k,v| k }
    end

    # Return a copy of the form with two section-sign markers
    # ("\302\247\302\247") appended to every value whose key does not start
    # with "__", and try to place the copy's post string on the clipboard
    # through pbcopy.
    def burpify #:nodoc:
      ret = self.clone
      ret.each_index do |i|
        next if ret[i][0] =~ /^__/
        ret.set_value!(i,"#{ret.get_value(i)}" + "\302\247" + "\302\247")
      end
      begin
        # write to pbcopy's stdin instead of building a shell command line,
        # so quotes or metacharacters in form data are never seen by a shell
        IO.popen("pbcopy", "w") { |io| io.puts ret.to_post }
      rescue SystemCallError
        # clipboard copy is best effort: pbcopy may not exist on this system
      end
      return ret
    end

    # return md5 hash of sorted list of keys
    def fingerprint
      return self.keys.sort.to_s.md5
    end
    alias_method :fp, :fingerprint #:nodoc:

  end
end
|
data/lib/wwmd/guid.rb
ADDED
@@ -0,0 +1,155 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
Guid - Ruby library for portable GUID/UUID generation.
|
3
|
+
|
4
|
+
Copyright (c) 2004 David Garamond <davegaramond at icqmail com>
|
5
|
+
|
6
|
+
This library is free software; you can redistribute it and/or modify it
|
7
|
+
under the same terms as Ruby itself.
|
8
|
+
|
9
|
+
(small hack to fix for mac mtracy@matasano.com)
|
10
|
+
=end
|
11
|
+
|
12
|
+
# Windows back end: only defined when the platform name contains "win" and is
# not darwin.  (`!~` is required here: `! RUBY_PLATFORM =~ /darwin/i` parses as
# `(!RUBY_PLATFORM) =~ /darwin/i`, which never selects this branch.)
if RUBY_PLATFORM =~ /win/i && RUBY_PLATFORM !~ /darwin/i
  module Guid_Win32_#:nodoc:
    require 'Win32API'

    PROV_RSA_FULL = 1
    CRYPT_VERIFYCONTEXT = 0xF0000000
    FORMAT_MESSAGE_IGNORE_INSERTS = 0x00000200
    FORMAT_MESSAGE_FROM_SYSTEM = 0x00001000

    CryptAcquireContext = Win32API.new("advapi32", "CryptAcquireContext",
                                       'PPPII', 'L')
    CryptGenRandom = Win32API.new("advapi32", "CryptGenRandom",
                                  'LIP', 'L')
    CryptReleaseContext = Win32API.new("advapi32", "CryptReleaseContext",
                                       'LI', 'L')
    GetLastError = Win32API.new("kernel32", "GetLastError", '', 'L')
    FormatMessageA = Win32API.new("kernel32", "FormatMessageA",
                                  'LPLLPLPPPPPPPP', 'L')

    # Text of the last Win32 error, as returned by FormatMessageA for the
    # code reported by GetLastError.
    def lastErrorMessage
      code = GetLastError.call
      msg = "\0" * 1024
      len = FormatMessageA.call(FORMAT_MESSAGE_IGNORE_INSERTS +
                                FORMAT_MESSAGE_FROM_SYSTEM, 0,
                                code, 0, msg, 1024, nil, nil,
                                nil, nil, nil, nil, nil, nil)
      msg[0, len].tr("\r", '').chomp
    end

    # Fill @bytes with 16 bytes from CryptGenRandom.
    # Raises SystemCallError when any of the three CryptoAPI calls returns 0.
    def initialize
      hProvStr = " " * 4
      if CryptAcquireContext.call(hProvStr, nil, nil, PROV_RSA_FULL,
                                  CRYPT_VERIFYCONTEXT) == 0
        raise SystemCallError, "CryptAcquireContext failed: #{lastErrorMessage}"
      end
      hProv, = hProvStr.unpack('L')
      @bytes = " " * 16
      if CryptGenRandom.call(hProv, 16, @bytes) == 0
        raise SystemCallError, "CryptGenRandom failed: #{lastErrorMessage}"
      end
      if CryptReleaseContext.call(hProv, 0) == 0
        raise SystemCallError, "CryptReleaseContext failed: #{lastErrorMessage}"
      end
    end
  end
end

# Unix back end: reads 16 bytes from /dev/urandom (or /dev/random when
# /dev/urandom does not exist).
module Guid_Unix_#:nodoc:
  # opened on first use and then shared by every Guid; never closed
  @@random_device = nil

  # Fill @bytes with 16 bytes from the random device.
  # Raises RuntimeError when no device exists or the read comes back short.
  def initialize
    if !@@random_device
      # File.exist? is used because File.exists? no longer exists in Ruby 3.2+
      if File.exist? "/dev/urandom"
        @@random_device = File.open "/dev/urandom", "r"
      elsif File.exist? "/dev/random"
        @@random_device = File.open "/dev/random", "r"
      else
        raise RuntimeError, "Can't find random device"
      end
    end

    @bytes = @@random_device.read(16)
    unless @bytes && @bytes.length == 16
      raise RuntimeError, "Can't read 16 bytes from random device"
    end
  end
end

# A 16 byte random identifier with hex, dashed-hex and raw representations.
class Guid
  # Guid_Win32_ is only defined by the platform check at the top of this
  # file, so its presence selects the back end.
  if defined?(Guid_Win32_)
    include Guid_Win32_
  else
    include Guid_Unix_
  end

  # 32 character hex string (low nibble first within each byte)
  def hexdigest
    @bytes.unpack("h*")[0]
  end

  # hex string split 8-4-4-4-12 and joined with dashes
  def to_s
    @bytes.unpack("h8 h4 h4 h4 h12").join "-"
  end

  def inspect
    to_s
  end

  # the underlying 16 byte string
  def raw
    @bytes
  end

  # Build a Guid from a hex string, with or without dashes, as produced by
  # to_s or hexdigest.  Raises ArgumentError when +s+ is not 32 hex digits in
  # 8-4-4-4-12 grouping.
  def self.from_s(s)
    raise ArgumentError, "Invalid GUID hexstring" unless
      s =~ /\A[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}\z/i
    # allocate skips initialize, so no random bytes are consumed
    guid = allocate
    guid.instance_eval { @bytes = [s.gsub(/[^0-9a-f]+/i, '')].pack "h*" }
    guid
  end

  # Build a Guid from a 16 byte string.  The string is copied, so later
  # changes to +bytes+ do not affect the Guid.  Raises ArgumentError when
  # +bytes+ is not exactly 16 bytes long.
  def self.from_raw(bytes)
    raise ArgumentError, "Invalid GUID raw bytes, length must be 16 bytes" unless
      bytes.bytesize == 16
    guid = allocate
    copy = bytes.dup
    guid.instance_eval { @bytes = copy }
    guid
  end

  # Two Guids are equal when their raw bytes are equal; anything that does
  # not expose +raw+ is simply not equal.
  def ==(other)
    other.respond_to?(:raw) && @bytes == other.raw
  end

  alias_method :eql?, :==

  # consistent with ==, so Guids can be used as Hash keys
  def hash
    @bytes.hash
  end
end
|
120
|
+
|
121
|
+
# Self-test: only runs when this file is executed directly.
if __FILE__ == $0
  require 'test/unit'

  class GuidTest < Test::Unit::TestCase#:nodoc:
    # a fresh Guid has well formed representations and differs from others
    def test_new
      first = Guid.new

      # different representations of guid: hexdigest, hex+dashes, raw bytes
      dashed = first.to_s
      digest = first.hexdigest
      assert_equal(0, dashed =~ /\A[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\z/)
      assert_equal(16, first.raw.length)
      assert_equal(0, digest =~ /\A[0-9a-f]{32}\z/)
      assert_equal(digest, dashed.gsub(/-/, ''))

      # must be different each time we produce (this is just a simple test)
      second = Guid.new
      assert_equal(true, first != second)
      assert_equal(true, first.to_s != second.to_s)
      assert_equal(true, first.raw != second.raw)
      assert_equal(true, first.hexdigest != second.hexdigest)
      distinct = (1..1000).select { |_| first != Guid.new }
      assert_equal(1000, distinct.length)
    end

    # to_s output can be parsed back into an equal Guid
    def test_from_s
      original = Guid.new
      parsed = Guid.from_s(original.to_s)
      assert_equal(original, parsed)
    end

    # raw bytes can be turned back into an equal Guid
    def test_from_raw
      original = Guid.new
      rebuilt = Guid.from_raw(original.raw)
      assert_equal(original, rebuilt)
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# Geoff Davis geoff at geoffdavis.net
|
2
|
+
# Wed May 2 20:08:44 EDT 2007
|
3
|
+
# http://rubyforge.org/pipermail/raleigh-rb-members/2007-May/000789.html
|
4
|
+
# modified by mtracy at matasano.com for WWMD
|
5
|
+
|
6
|
+
module WWMD
  # Tag name groups used by Page#element_to_text.  Elements whose name is in
  # none of these lists contribute no text at all.
  InlineTags = [
    'a','abbr','acronym','address','b','bdo','big','cite','code','del','dfn',
    'em','font','i','ins','kbd','label','noframes','noscript','q','s','samp',
    'small','span','strike','strong','sub','sup','td','th','tt','u','html',
    'body','table'
  ]
  BlockTags = [
    'blockquote','br','center','dd','div','fieldset','form','h1','h2','h3',
    'h4','h5','h6','hr','p','pre','tr','var'
  ]
  ListTags = ['dir','dl','menu','ol','ul']
  ItemTags = ['li','dt']
#  AsciiEquivalents = {"amp"=>"&","bull"=>"*","copy"=>"(c)","laquo"=>"<<","raquo"=>">>","ge"=> ">=","le"=>"<=","mdash"=>"-","ndash"=>"-","plusmn"=>"+/-","times"=>"x"}

#  NamedCharRegex = Regexp.new("(&("+Hpricot::NamedCharacters.keys.join("|")+");)")

  class Page
    # Text of element +n+: the concatenated text of its children, followed by
    # a newline for block and list tags, or wrapped as "* ...\n" for item
    # tags.  Returns "" for a tag name found in none of the tag lists.
    def element_to_text(n)
      name = (n.etag || n.stag).name.downcase
      breaks_line = BlockTags.include?(name) || ListTags.include?(name)
      bullet      = ItemTags.include?(name)
      return "" unless breaks_line || bullet || InlineTags.include?(name)

      text = ""
      n.each_child { |child| text += node_to_text(child) }
      if breaks_line
        text += "\n"
      elsif bullet
        text = "* " + text + "\n"
      end
      text
    end

    # Text of any node: "" for comments, element_to_text for elements, the
    # inner text for text nodes, otherwise the concatenated text of the
    # node's children.
    def node_to_text(n)
      if n.comment?
        ""
      elsif n.elem?
        element_to_text(n)
      elsif n.text?
        n.inner_text
      else
        collected = ""
        begin
          n.each_child { |child| collected += node_to_text(child) }
        rescue
          # best effort: whatever was collected before the error is kept
          # puts "WARNING: #{e.inspect}"
        end
        collected
      end
    end

#    def lookup_named_char(s)
#      c = Hpricot::NamedCharacters[s[1...-1]]
#      c.chr if c
#    end

    # Plain text rendering of <tt>self.scrape.hdoc</tt>: one stripped,
    # newline terminated line per non-empty line of text; lines made up only
    # of "?" characters are dropped.
    def html2text
      text = node_to_text(self.scrape.hdoc)
#      text.gsub!(NamedCharRegex){|s| "#{lookup_named_char(s)}"}
      # clean up white space
      text.gsub!("\r"," ")
      text.squeeze!(" ")
      text.strip!
      kept = text.split(/\n/).map { |line| line.strip }.reject do |line|
        line == '' || line =~ /^\?+$/
      end
      kept.map { |line| "#{line}\n" }.join
    end
  end
end
|