miketracy-wwmd 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +3 -0
- data/README +62 -0
- data/README.txt +62 -0
- data/Rakefile +34 -0
- data/examples/config_example.yaml +24 -0
- data/examples/wwmd_example.rb +73 -0
- data/lib/wwmd.rb +78 -0
- data/lib/wwmd/encoding.rb +40 -0
- data/lib/wwmd/form.rb +110 -0
- data/lib/wwmd/form_array.rb +273 -0
- data/lib/wwmd/guid.rb +155 -0
- data/lib/wwmd/hpricot_html2text.rb +76 -0
- data/lib/wwmd/mixins.rb +318 -0
- data/lib/wwmd/mixins_extends.rb +188 -0
- data/lib/wwmd/mixins_external.rb +18 -0
- data/lib/wwmd/nokogiri_html2text.rb +41 -0
- data/lib/wwmd/page.rb +414 -0
- data/lib/wwmd/page/auth.rb +183 -0
- data/lib/wwmd/page/config.rb +44 -0
- data/lib/wwmd/page/constants.rb +60 -0
- data/lib/wwmd/page/headers.rb +107 -0
- data/lib/wwmd/page/inputs.rb +47 -0
- data/lib/wwmd/page/irb_helpers.rb +90 -0
- data/lib/wwmd/page/scrape.rb +202 -0
- data/lib/wwmd/page/spider.rb +127 -0
- data/lib/wwmd/page/urlparse.rb +79 -0
- data/lib/wwmd/page/utils.rb +30 -0
- data/lib/wwmd/viewstate.rb +118 -0
- data/lib/wwmd/viewstate/viewstate_class_helpers.rb +35 -0
- data/lib/wwmd/viewstate/viewstate_deserializer_methods.rb +213 -0
- data/lib/wwmd/viewstate/viewstate_from_xml.rb +126 -0
- data/lib/wwmd/viewstate/viewstate_types.rb +51 -0
- data/lib/wwmd/viewstate/viewstate_utils.rb +157 -0
- data/lib/wwmd/viewstate/viewstate_yaml.rb +25 -0
- data/lib/wwmd/viewstate/vs_array.rb +36 -0
- data/lib/wwmd/viewstate/vs_binary_serialized.rb +28 -0
- data/lib/wwmd/viewstate/vs_hashtable.rb +40 -0
- data/lib/wwmd/viewstate/vs_hybrid_dict.rb +40 -0
- data/lib/wwmd/viewstate/vs_indexed_string.rb +6 -0
- data/lib/wwmd/viewstate/vs_indexed_string_ref.rb +22 -0
- data/lib/wwmd/viewstate/vs_int_enum.rb +25 -0
- data/lib/wwmd/viewstate/vs_list.rb +32 -0
- data/lib/wwmd/viewstate/vs_pair.rb +27 -0
- data/lib/wwmd/viewstate/vs_read_types.rb +11 -0
- data/lib/wwmd/viewstate/vs_read_value.rb +33 -0
- data/lib/wwmd/viewstate/vs_sparse_array.rb +56 -0
- data/lib/wwmd/viewstate/vs_string.rb +29 -0
- data/lib/wwmd/viewstate/vs_string_array.rb +37 -0
- data/lib/wwmd/viewstate/vs_string_formatted.rb +30 -0
- data/lib/wwmd/viewstate/vs_triplet.rb +29 -0
- data/lib/wwmd/viewstate/vs_type.rb +21 -0
- data/lib/wwmd/viewstate/vs_unit.rb +28 -0
- data/lib/wwmd/viewstate/vs_value.rb +33 -0
- data/spec/README +3 -0
- data/spec/form_array.spec +49 -0
- data/spec/spider_csrf_test.spec +28 -0
- data/spec/urlparse_test.spec +89 -0
- data/tasks/ann.rake +80 -0
- data/tasks/bones.rake +20 -0
- data/tasks/gem.rake +201 -0
- data/tasks/git.rake +40 -0
- data/tasks/notes.rake +27 -0
- data/tasks/post_load.rake +34 -0
- data/tasks/rdoc.rake +51 -0
- data/tasks/rubyforge.rake +55 -0
- data/tasks/setup.rb +292 -0
- data/tasks/spec.rake +54 -0
- data/tasks/test.rake +40 -0
- data/tasks/zentest.rake +36 -0
- metadata +164 -0
@@ -0,0 +1,273 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
This is a weird kind of data structure for no other reason than
|
3
|
+
I wanted to keep the form inputs in order when they come in.
|
4
|
+
|
5
|
+
Access this either as a hash or as an array (but => won't work).
|
6
|
+
|
7
|
+
Some of the methods in here are kept for backward compat before the refactor
|
8
|
+
and now everything in this array should be accessed with []= and []
|
9
|
+
=end
|
10
|
+
|
11
|
+
module WWMD
  # Ordered collection of form inputs kept as <tt>[name, value]</tt> pairs.
  #
  # It is an Array subclass so the original input order is preserved, but
  # <tt>[]</tt> and <tt>[]=</tt> also accept a field name so it can be used
  # like a hash:
  #
  #   form = WWMD::FormArray.new("a=1&b=2")
  #   form[0]        #=> ["a", "1"]   (positional access)
  #   form["b"]      #=> "2"          (access by field name)
  #   form["c"] = 3  # appends ["c", 3]
  #
  # The escape/unescape/clip/md5 helpers called on keys and values are not
  # defined in this file; they are presumably String extensions provided by
  # the rest of WWMD.
  class FormArray < Array

    # Build a form from +fields+, which may be:
    # * +nil+     - empty form
    # * an Array  - field objects responding to <tt>['name']</tt>,
    #   <tt>['type']</tt>, +get_value+ and +to_html+
    # * a Hash    - each key/value pair is stored via <tt>[]=</tt>
    # * a String  - a <tt>k=v&k2=v2</tt> parameter string
    #
    # Anything else is ignored and yields an empty form.
    def initialize(fields=nil)
      super()
      return if fields.nil?
      if fields.instance_of?(Array)
        # exact class match on purpose: Array subclasses were never treated
        # as lists of field objects
        fields.each do |f|
          name = f['name']
          if include?(name)
            # a repeated name only overrides the stored value for hidden
            # inputs and for checkboxes whose markup mentions "checked"
            if f['type'] == "hidden"
              set_value!(name, f.get_value)
            elsif f['type'] == "checkbox" && f.to_html =~ /checked/
              self[name] = f.get_value
            end
          else
            self << [name, f.get_value]
          end
        end
      elsif fields.is_a?(Hash)
        fields.each_pair { |k, v| self[k] = v }
      elsif fields.is_a?(String)
        fields.split("&").each do |f|
          k, v = f.split("=", 2)
          self[k] = v
        end
      end
    end

    # "deep enough" copy of this object to make it a real copy
    # instead of references to the pairs that already exist
    def clone
      copy = self.class.new
      each { |pair| copy << pair.clone }
      copy
    end

    # remove every pair; returns self
    def clear
      super()
    end

    # check if the passed name exists in the form
    def include?(key)
      any? { |pair| pair.first == key }
    end

    alias_method :name_exists, :include?#:nodoc:
    alias_method :name_exists?, :include?#:nodoc:
    alias_method :has_key?, :include?#:nodoc:

    # add key/value pairs to form (always appends, even if key exists)
    def add(key,value)
      self << [key, value]
    end

    # blank the value of every __VIEWSTATE entry; returns self
    def clear_viewstate
      set_value!("__VIEWSTATE", "")
      self
    end

    alias_method :extend!, :add #:nodoc (this is here for backward compat)

    # key = Integer set value at index key
    # key = String  find key named string and set value (every match)
    #
    # Returns <tt>[name, value]</tt>. A name that is not present is left
    # untouched (nothing is added).
    def set_value!(key,value)
      if key.is_a?(Integer)
        pair = old_get(key)
        pair[1] = value
        return [pair[0], value]
      end
      each_index do |i|
        old_set(i, [key, value]) if old_get(i)[0] == key
      end
      [key, value]
    end

    alias_method :old_get, :[]#:nodoc:

    # Integer index => plain Array#[] (extra arguments are passed through),
    # anything else => value stored under that field name (nil if absent)
    def [](*args)
      return old_get(*args) if args.first.is_a?(Integer)
      get_value(args.first)
    end

    alias_method :old_set, :[]=#:nodoc:

    # set a key using its index, array key or add using a new key i.e.:
    # if setting:
    #  form = [['key','value'],['foo','bar']]
    #  form[0] = ["replacekey","newalue"]
    #  form["replacekey"] = "newervalue"
    # if adding:
    #  form["newkey"] = "value"
    #
    def []=(*args)
      key, value = args
      return old_set(*args) if key.is_a?(Integer)
      return set_value(key, value) if has_key?(key)
      add(key, value)
    end

    alias_method :set_value, :set_value!
    alias_method :set, :set_value!

    # key = Integer value of the pair at that index
    # key = String  value of the first pair with that name, or nil
    def get_value(key)
      return old_get(key)[1] if key.is_a?(Integer)
      pair = find { |candidate| candidate[0] == key }
      pair ? pair[1] : nil
    end

    alias_method :get, :get_value

    # set every value in the form to +value+; returns self
    def setall!(value)
      each_index { |i| set_value!(i, value) }
    end

    alias_method :setall, :setall!#:nodoc:
    alias_method :set_all!, :setall!#:nodoc:
    alias_method :set_all, :setall!#:nodoc:

    # delete all key = value pairs from self where key = key
    # (returns nil when nothing was removed, like Array#reject!)
    def delete_key(key)
      reject! { |name, _value| name == key }
    end

    alias_method :delete_keys!, :delete_key #:nodoc:
    alias_method :delete_key!, :delete_key #:nodoc:

    # escape form keys in place (no-op returning nil when reg is :none)
    def escape_keys!(reg=WWMD::ESCAPE[:url])
      return nil if reg == :none
      map! { |name, value| [name.escape(reg), value] }
    end

    # unescape form keys in place; reg is only checked against :none
    def unescape_keys!(reg=WWMD::ESCAPE[:url])
      return nil if reg == :none
      map! { |name, value| [name.unescape, value] }
    end

    # escape form values in place (no-op returning nil when reg is :none)
    def escape_all!(reg=WWMD::ESCAPE[:url])
      return nil if reg == :none
      map! { |name, value| [name, value.escape(reg)] }
    end

    alias_method :escape_all, :escape_all!#:nodoc:

    # unescape all form values in place
    def unescape_all!
      map! { |name, value| [name, value.unescape] }
    end

    alias_method :unescape_all, :unescape_all!#:nodoc:

    # convert form into a post parameters string
    def to_post
      map { |pair| pair.join("=") }.join("&")
    end

    # convert form into a get parameters string
    #
    # pass me a base to get a full url to pass to Page.get
    def to_get(base="")
      base.clip + "?" + to_post
    end

    # IRB: puts the form in human readable format
    # if you <tt>form.show(true)</tt> it will show unescaped values
    def show(unescape=false)
      each_with_index do |(name, value), i|
        shown = value.to_s
        shown = shown.unescape if unescape
        puts i.to_s + " :: " + name.to_s + " = " + shown
      end
      nil
    end

    # meh
    # Each insert goes to the front, so the resulting order is
    # __EVENTVALIDATION, __EVENTTARGET, __EVENTARGUMENT, __VIEWSTATE.
    def add_viewstate#:nodoc:
      ["__VIEWSTATE", "__EVENTARGUMENT", "__EVENTTARGET", "__EVENTVALIDATION"].each do |name|
        insert(0, [name, ""])
      end
      nil
    end

    # alias_method, :add_state, :add_viewstate#:nodoc:

    # remove form elements with null values (nil or empty after to_s)
    def remove_nulls!
      delete_if { |pair| pair[1].to_s.empty? }
    end

    alias_method :squeeze!, :remove_nulls!

    # remove form elements with null keys (for housekeeping returns)
    def remove_null_keys!
      delete_if { |name, _value| name.to_s.empty? }
    end

    alias_method :squeeze_keys!, :remove_null_keys!

    # dump a web page containing a csrf example of the current FormArray
    #
    # NOTE(review): single quotes are only backslash-prefixed and +action+
    # is interpolated as-is, so nothing here is HTML-escaped; confirm that
    # is acceptable for the pages this is used to generate.
    def to_csrf(action)
      ret = ""
      ret << "<html><body>\n"
      ret << "<form method='post' id='wwmdtest' name='wwmdtest' action='#{action}'>\n"
      each do |key, val|
        # to_s keeps nil values from blowing up, matching how keys are handled
        val = val.to_s.unescape.gsub(/'/) { %q[\'] }
        ret << "<input name='#{key.to_s.unescape}' type='hidden' value='#{val}' />\n"
      end
      ret << "</form>\n"
      ret << "<script>document.wwmdtest.submit()</script>\n"
      ret << "</body></html>\n"
      ret
    end

    # list of field names, in form order
    def keys
      map { |name, _value| name }
    end

    # Return a copy with the bytes "\302\247\302\247" appended to every value
    # whose name does not start with "__", and try to put its post string on
    # the clipboard via pbcopy.
    def burpify #:nodoc:
      ret = self.clone
      ret.each_index do |i|
        next if ret[i][0] =~ /^__/
        ret.set_value!(i, "#{ret.get_value(i)}" + "\302\247" + "\302\247")
      end
      begin
        # write through a pipe instead of building a shell command line so
        # quotes in form data cannot break (or inject into) the command
        IO.popen("pbcopy", "w") { |io| io.puts(ret.to_post) }
      rescue SystemCallError, IOError
        # clipboard copy is best effort; pbcopy may not exist on this system
      end
      ret
    end

    # return md5 hash of sorted list of keys
    def fingerprint
      keys.sort.to_s.md5
    end
    alias_method :fp, :fingerprint #:nodoc:

  end
end
|
data/lib/wwmd/guid.rb
ADDED
@@ -0,0 +1,155 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
Guid - Ruby library for portable GUID/UUID generation.
|
3
|
+
|
4
|
+
Copyright (c) 2004 David Garamond <davegaramond at icqmail com>
|
5
|
+
|
6
|
+
This library is free software; you can redistribute it and/or modify it
|
7
|
+
under the same terms as Ruby itself.
|
8
|
+
|
9
|
+
(small hack to fix for mac mtracy@matasano.com)
|
10
|
+
=end
|
11
|
+
|
12
|
+
# Windows backend for Guid: fills @bytes with 16 random bytes obtained from
# the CryptoAPI. Only defined on Windows Ruby builds.
#
# The platform test names the Windows builds explicitly: a bare /win/ also
# matches "darwin", and "! RUBY_PLATFORM =~ /darwin/i" negates the string
# before matching, so it can never exclude anything.
if RUBY_PLATFORM =~ /mswin|mingw|cygwin/i
  module Guid_Win32_#:nodoc:
    require 'Win32API'

    PROV_RSA_FULL = 1
    CRYPT_VERIFYCONTEXT = 0xF0000000
    FORMAT_MESSAGE_IGNORE_INSERTS = 0x00000200
    FORMAT_MESSAGE_FROM_SYSTEM = 0x00001000

    CryptAcquireContext = Win32API.new("advapi32", "CryptAcquireContext",
                                       'PPPII', 'L')
    CryptGenRandom = Win32API.new("advapi32", "CryptGenRandom",
                                  'LIP', 'L')
    CryptReleaseContext = Win32API.new("advapi32", "CryptReleaseContext",
                                       'LI', 'L')
    GetLastError = Win32API.new("kernel32", "GetLastError", '', 'L')
    FormatMessageA = Win32API.new("kernel32", "FormatMessageA",
                                  'LPLLPLPPPPPPPP', 'L')

    # Text of the calling thread's last Win32 error, without the trailing
    # line break.
    def lastErrorMessage
      code = GetLastError.call
      msg = "\0" * 1024
      len = FormatMessageA.call(FORMAT_MESSAGE_IGNORE_INSERTS +
                                FORMAT_MESSAGE_FROM_SYSTEM, 0,
                                code, 0, msg, 1024, nil, nil,
                                nil, nil, nil, nil, nil, nil)
      msg[0, len].tr("\r", '').chomp
    end

    # Acquire a crypto context, read 16 random bytes into @bytes and release
    # the context again. Raises SystemCallError if any of the three calls
    # reports failure (returns 0).
    def initialize
      # NOTE(review): the handle buffer is 4 bytes and unpacked as 'L';
      # confirm this is wide enough on 64-bit Windows.
      hProvStr = " " * 4
      if CryptAcquireContext.call(hProvStr, nil, nil, PROV_RSA_FULL,
                                  CRYPT_VERIFYCONTEXT) == 0
        raise SystemCallError, "CryptAcquireContext failed: #{lastErrorMessage}"
      end
      hProv, = hProvStr.unpack('L')
      @bytes = " " * 16
      if CryptGenRandom.call(hProv, 16, @bytes) == 0
        raise SystemCallError, "CryptGenRandom failed: #{lastErrorMessage}"
      end
      if CryptReleaseContext.call(hProv, 0) == 0
        raise SystemCallError, "CryptReleaseContext failed: #{lastErrorMessage}"
      end
    end
  end
end
|
58
|
+
|
59
|
+
# Unix backend for Guid: fills @bytes with 16 bytes read from the system
# random device.
module Guid_Unix_#:nodoc:
  # Handle on the random device. It is opened on first use and deliberately
  # kept open so every later Guid reuses it.
  @@random_device = nil

  # Read 16 random bytes into @bytes, preferring /dev/urandom over
  # /dev/random. Raises RuntimeError if neither device exists or the device
  # returns fewer than 16 bytes.
  def initialize
    if !@@random_device
      # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2
      if File.exist? "/dev/urandom"
        @@random_device = File.open "/dev/urandom", "r"
      elsif File.exist? "/dev/random"
        @@random_device = File.open "/dev/random", "r"
      else
        raise RuntimeError, "Can't find random device"
      end
    end

    @bytes = @@random_device.read(16)
    if @bytes.nil? || @bytes.length != 16
      raise RuntimeError, "Short read from random device"
    end
  end
end
|
76
|
+
|
77
|
+
# A 16-byte random identifier. Guid.new fills @bytes through whichever
# platform backend is mixed in below.
class Guid
  # Guid_Win32_ only exists when this file's platform check selected the
  # Windows backend; every other platform reads from the random device.
  # Testing for the constant avoids repeating the platform expression
  # ("! RUBY_PLATFORM =~ ..." negates the string before matching).
  if defined?(Guid_Win32_)
    include Guid_Win32_
  else
    include Guid_Unix_
  end

  # 32 hex digits, no dashes. Each byte is written low nibble first
  # (pack/unpack directive "h").
  def hexdigest
    @bytes.unpack("h*")[0]
  end

  # Dashed 8-4-4-4-12 hex form, same nibble order as hexdigest.
  def to_s
    @bytes.unpack("h8 h4 h4 h4 h12").join "-"
  end

  def inspect
    to_s
  end

  # The raw 16-byte string.
  def raw
    @bytes
  end

  # Build a Guid from its hex form; dashes are optional and case is ignored.
  # Raises ArgumentError if +s+ is not a 32-digit GUID string.
  def self.from_s(s)
    raise ArgumentError, "Invalid GUID hexstring" unless
      s =~ /\A[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}\z/i
    guid = Guid.allocate
    guid.instance_eval { @bytes = [s.gsub(/[^0-9a-f]+/i, '')].pack "h*" }
    guid
  end

  # Build a Guid from 16 raw bytes.
  # Raises ArgumentError if +bytes+ is not exactly 16 bytes long.
  def self.from_raw(bytes)
    # count bytes, not characters: a multibyte-encoded string can hold
    # 16 characters in more than 16 bytes
    size = bytes.respond_to?(:bytesize) ? bytes.bytesize : bytes.length
    raise ArgumentError, "Invalid GUID raw bytes, length must be 16 bytes" unless
      size == 16
    guid = Guid.allocate
    # keep a private copy so later changes to the caller's string do not
    # alter this Guid
    copy = bytes.dup
    guid.instance_eval { @bytes = copy }
    guid
  end

  # Two Guids are equal when their raw bytes are equal. Objects that do not
  # expose +raw+ compare unequal instead of raising.
  def ==(other)
    other.respond_to?(:raw) && @bytes == other.raw
  end
end
|
120
|
+
|
121
|
+
# Self-test: only runs when this file is executed directly.
if __FILE__ == $0
  require 'test/unit'

  class GuidTest < Test::Unit::TestCase#:nodoc:
    # a fresh Guid renders consistently in all three representations and
    # differs from any other fresh Guid
    def test_new
      first = Guid.new

      # different representations of guid: hexdigest, hex+dashes, raw bytes
      assert_match(/\A[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\z/, first.to_s)
      assert_equal(16, first.raw.length)
      assert_match(/\A[0-9a-f]{32}\z/, first.hexdigest)
      assert_equal(first.hexdigest, first.to_s.gsub(/-/, ''))

      # must be different each time we produce (this is just a simple test)
      second = Guid.new
      assert(first != second)
      assert(first.to_s != second.to_s)
      assert(first.raw != second.raw)
      assert(first.hexdigest != second.hexdigest)
      assert((1..1000).all? { first != Guid.new })
    end

    # the dashed string form round-trips through from_s
    def test_from_s
      original = Guid.new
      assert_equal(original, Guid.from_s(original.to_s))
    end

    # the raw byte form round-trips through from_raw
    def test_from_raw
      original = Guid.new
      assert_equal(original, Guid.from_raw(original.raw))
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# Geoff Davis geoff at geoffdavis.net
|
2
|
+
# Wed May 2 20:08:44 EDT 2007
|
3
|
+
# http://rubyforge.org/pipermail/raleigh-rb-members/2007-May/000789.html
|
4
|
+
# modified by mtracy at matasano.com for WWMD
|
5
|
+
|
6
|
+
module WWMD
  # Tag names grouped by how element_to_text renders them.
  InlineTags = %w[a abbr acronym address b bdo big cite code del dfn em font i ins kbd label noframes noscript q s samp small span strike strong sub sup td th tt u html body table]
  BlockTags = %w[blockquote br center dd div fieldset form h1 h2 h3 h4 h5 h6 hr p pre tr var]
  ListTags = %w[dir dl menu ol ul]
  ItemTags = %w[li dt]
  # AsciiEquivalents = {"amp"=>"&","bull"=>"*","copy"=>"(c)","laquo"=>"<<","raquo"=>">>","ge"=> ">=","le"=>"<=","mdash"=>"-","ndash"=>"-","plusmn"=>"+/-","times"=>"x"}

  # NamedCharRegex = Regexp.new("(&("+Hpricot::NamedCharacters.keys.join("|")+");)")

  class Page
    # Text for one element node. Elements whose tag is in none of the four
    # lists produce an empty string (their children are skipped). Block and
    # list elements get a trailing newline; items are rendered as "* text\n".
    def element_to_text(n)
      name = (n.etag || n.stag).name.downcase
      layout =
        if BlockTags.include?(name) || ListTags.include?(name)
          :line
        elsif ItemTags.include?(name)
          :bullet
        elsif InlineTags.include?(name)
          :inline
        end
      return "" unless layout

      body = ""
      n.each_child { |child| body << node_to_text(child) }
      case layout
      when :line   then body + "\n"
      when :bullet then "* " + body + "\n"
      else body
      end
    end

    # Text for any node: comments are dropped, elements go through
    # element_to_text, text nodes yield their inner text. Any other node is
    # treated as a container; an error while walking its children is
    # swallowed and whatever was collected so far is returned.
    def node_to_text(n)
      if n.comment?
        ""
      elsif n.elem?
        element_to_text(n)
      elsif n.text?
        n.inner_text
      else
        collected = ""
        begin
          n.each_child { |child| collected << node_to_text(child) }
        rescue StandardError
          # puts "WARNING: #{e.inspect}"
        end
        collected
      end
    end

    # def lookup_named_char(s)
    #   c = Hpricot::NamedCharacters[s[1...-1]]
    #   c.chr if c
    # end

    # Plain-text rendering of self.scrape.hdoc: carriage returns become
    # spaces, runs of spaces are squeezed, and blank lines or lines made up
    # only of "?" are dropped. Every kept line ends with "\n".
    def html2text
      text = node_to_text(self.scrape.hdoc)
      # text.gsub!(NamedCharRegex){|s| "#{lookup_named_char(s)}"}
      # clean up white space
      text.gsub!("\r"," ")
      text.squeeze!(" ")
      text.strip!
      kept = text.split(/\n/).map { |line| line.strip }.reject do |line|
        line == '' || line =~ /^\?+$/
      end
      kept.inject('') { |out, line| out << "#{line}\n" }
    end
  end
end
|