wwmd 0.2.20.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (80) hide show
  1. data/History.txt +38 -0
  2. data/README.rdoc +87 -0
  3. data/Rakefile +33 -0
  4. data/examples/config_example.yaml +24 -0
  5. data/examples/wwmd_example.rb +73 -0
  6. data/lib/wwmd.rb +84 -0
  7. data/lib/wwmd/class_extensions.rb +4 -0
  8. data/lib/wwmd/class_extensions/extensions_base.rb +251 -0
  9. data/lib/wwmd/class_extensions/extensions_encoding.rb +79 -0
  10. data/lib/wwmd/class_extensions/extensions_external.rb +18 -0
  11. data/lib/wwmd/class_extensions/extensions_nilclass.rb +11 -0
  12. data/lib/wwmd/class_extensions/extensions_rbkb.rb +193 -0
  13. data/lib/wwmd/class_extensions/mixins_string_encoding.rb +40 -0
  14. data/lib/wwmd/guid.rb +155 -0
  15. data/lib/wwmd/page.rb +3 -0
  16. data/lib/wwmd/page/_fa.old +302 -0
  17. data/lib/wwmd/page/auth.rb +17 -0
  18. data/lib/wwmd/page/constants.rb +63 -0
  19. data/lib/wwmd/page/form.rb +99 -0
  20. data/lib/wwmd/page/form_array.rb +304 -0
  21. data/lib/wwmd/page/headers.rb +118 -0
  22. data/lib/wwmd/page/helpers.rb +41 -0
  23. data/lib/wwmd/page/html2text_hpricot.rb +76 -0
  24. data/lib/wwmd/page/html2text_nokogiri.rb +42 -0
  25. data/lib/wwmd/page/inputs.rb +47 -0
  26. data/lib/wwmd/page/irb_helpers.rb +114 -0
  27. data/lib/wwmd/page/page.rb +257 -0
  28. data/lib/wwmd/page/parsing_convenience.rb +98 -0
  29. data/lib/wwmd/page/reporting_helpers.rb +89 -0
  30. data/lib/wwmd/page/scrape.rb +196 -0
  31. data/lib/wwmd/page/spider.rb +127 -0
  32. data/lib/wwmd/urlparse.rb +125 -0
  33. data/lib/wwmd/viewstate.rb +17 -0
  34. data/lib/wwmd/viewstate/viewstate.rb +101 -0
  35. data/lib/wwmd/viewstate/viewstate_deserializer_methods.rb +217 -0
  36. data/lib/wwmd/viewstate/viewstate_from_xml.rb +129 -0
  37. data/lib/wwmd/viewstate/viewstate_types.rb +51 -0
  38. data/lib/wwmd/viewstate/viewstate_utils.rb +164 -0
  39. data/lib/wwmd/viewstate/viewstate_yaml.rb +25 -0
  40. data/lib/wwmd/viewstate/vs_stubs.rb +22 -0
  41. data/lib/wwmd/viewstate/vs_stubs/vs_array.rb +38 -0
  42. data/lib/wwmd/viewstate/vs_stubs/vs_binary_serialized.rb +30 -0
  43. data/lib/wwmd/viewstate/vs_stubs/vs_hashtable.rb +42 -0
  44. data/lib/wwmd/viewstate/vs_stubs/vs_hybrid_dict.rb +42 -0
  45. data/lib/wwmd/viewstate/vs_stubs/vs_indexed_string.rb +6 -0
  46. data/lib/wwmd/viewstate/vs_stubs/vs_indexed_string_ref.rb +24 -0
  47. data/lib/wwmd/viewstate/vs_stubs/vs_int_enum.rb +27 -0
  48. data/lib/wwmd/viewstate/vs_stubs/vs_list.rb +34 -0
  49. data/lib/wwmd/viewstate/vs_stubs/vs_pair.rb +29 -0
  50. data/lib/wwmd/viewstate/vs_stubs/vs_read_types.rb +11 -0
  51. data/lib/wwmd/viewstate/vs_stubs/vs_read_value.rb +35 -0
  52. data/lib/wwmd/viewstate/vs_stubs/vs_sparse_array.rb +58 -0
  53. data/lib/wwmd/viewstate/vs_stubs/vs_string.rb +33 -0
  54. data/lib/wwmd/viewstate/vs_stubs/vs_string_array.rb +39 -0
  55. data/lib/wwmd/viewstate/vs_stubs/vs_string_formatted.rb +32 -0
  56. data/lib/wwmd/viewstate/vs_stubs/vs_stub_helpers.rb +37 -0
  57. data/lib/wwmd/viewstate/vs_stubs/vs_triplet.rb +31 -0
  58. data/lib/wwmd/viewstate/vs_stubs/vs_type.rb +23 -0
  59. data/lib/wwmd/viewstate/vs_stubs/vs_unit.rb +30 -0
  60. data/lib/wwmd/viewstate/vs_stubs/vs_value.rb +35 -0
  61. data/lib/wwmd/wwmd_config.rb +52 -0
  62. data/lib/wwmd/wwmd_puts.rb +9 -0
  63. data/lib/wwmd/wwmd_utils.rb +28 -0
  64. data/spec/README +3 -0
  65. data/spec/form_array.spec +49 -0
  66. data/spec/spider_csrf_test.spec +28 -0
  67. data/spec/urlparse_test.spec +101 -0
  68. data/tasks/ann.rake +80 -0
  69. data/tasks/bones.rake +20 -0
  70. data/tasks/gem.rake +201 -0
  71. data/tasks/git.rake +40 -0
  72. data/tasks/notes.rake +27 -0
  73. data/tasks/post_load.rake +34 -0
  74. data/tasks/rdoc.rake +51 -0
  75. data/tasks/rubyforge.rake +55 -0
  76. data/tasks/setup.rb +292 -0
  77. data/tasks/spec.rake +54 -0
  78. data/tasks/test.rake +40 -0
  79. data/tasks/zentest.rake +36 -0
  80. metadata +222 -0
@@ -0,0 +1,98 @@
1
module WWMD
  class Page
    #:section: Parsing convenience methods
    # methods that help parse and find information on a page including
    # access to forms etc.

    # Return every line of body_data that matches +reg+, with the leading
    # whitespace of each matching line removed.
    #
    # The body is walked with each_line instead of calling grep on the
    # String itself: String is only Enumerable on Ruby 1.8, so the direct
    # call raises NoMethodError on later interpreters.
    def grep(reg)
      body_data.each_line.grep(reg).map { |line| line.gsub(/^\s+/, "") }
    end

    # Return this page's form (at index +id+, default 0) as a FormArray,
    # or nil when the page has no form at that index.
    #
    # The form action falls back to the current url and finally to the
    # literal "PARSE_ERROR"; it is resolved against the current url through
    # @urlparse before being stored on the FormArray.
    def get_form(id=nil)
      id ||= 0
      return nil if forms.empty? || !forms[id]
      f = forms[id]
      action = f.action || cur || "PARSE_ERROR"
      url_action = @urlparse.parse(cur, action).to_s
      form_type = f.type
      FormArray.new do |x|
        x.set_fields(f.fields)
        x.action = url_action
        x.type = form_type
      end
    end

    # Return the complete url to the form action on this page.
    #
    # * returns nil when there is no form at index +id+ (default 0)
    # * returns last_effective_url when the form has no (or an empty) action
    # * otherwise returns the action resolved against last_effective_url
    def action(id=nil)
      id ||= 0
      form = forms[id]
      return nil unless form
      act = form.action
      return last_effective_url if act.nil? || act.empty?
      @urlparse.parse(last_effective_url, act).to_s
    end

    # return an array of Element objects for an xpath search
    def search(xpath)
      scrape.hdoc.search(xpath)
    end

    # Return an array of inner_html for each <script> tag encountered.
    # Script tags whose body is empty or whitespace only (typically
    # <script src="..."> includes) are left out, so the result never
    # contains nil entries.
    def dump_scripts
      get_tags("//script").map { |s| s.inner_html }.reject { |html| html.strip == '' }
    end

    alias_method :scripts, :dump_scripts

    # set link using an integer link from self.report
    #--
    # NOTE: I always use page.get(page.l(1)) anyway.
    #++
    def set_link(index)
      self.url = @links[index]
    end

    # return link at index from @links array
    def get_link(index)
      @links[index]
    end

    alias_method :link, :get_link #:nodoc:
    alias_method :l, :get_link #:nodoc:

    # return the tag name of every element matched by search("*")
    def all_tags#:nodoc:
      search("*").map { |x| x.name }
    end

    # resolve +url+ against base_url and store the result in self.url
    def furl(url)
      self.url = @urlparse.parse(base_url, url).to_s
    end

    # Set self.opts[:base_url] and self.base_url to +url+.
    # Does nothing and returns nil when no url is given.
    def setbase(url=nil)
      return nil unless url
      opts[:base_url] = url
      self.base_url = url
    end

    # Write self.body_data to +filename+ and return a confirmation string.
    # The message is interpolated so that non-String paths (e.g. Pathname)
    # do not raise TypeError.
    def write(filename)
      File.write(filename, body_data)
      "wrote to #{filename}"
    end

    # read self.body_data from file and re-run set_data on the new body
    def read(filename)
      self.body_data = File.read(filename)
      set_data
    end

    # alias_method for body_data
    def raw
      body_data
    end
  end
end
@@ -0,0 +1,89 @@
1
module WWMD
  class Page
    attr_accessor :status
    #:section: Reporting helper methods
    # These are methods that generate data for a parsed page

    # Return (and store in @status) a text representation of the page
    # code: "ERR" when response_code is above 399, "OK" otherwise.
    #
    # override with specific statuses in helper depending on page text
    # etc to include statuses outside 200 = OK and other = ERR
    #--
    # The status is computed in a single expression so that the method
    # returns the string in both cases; a trailing "if" modifier would
    # make the OK path return nil.
    #++
    def page_status
      @status = response_code > 399 ? "ERR" : "OK"
    end

    # alias_method :status, :page_status#:nodoc:

    # return value of @logged_in
    def logged_in?
      @logged_in
    end

    # return a string of flags (upper case = present, lower case = absent):
    # Ll links
    # Jj javascript includes
    # Ff forms
    # Cc comments
    def report_flags
      [
        has_links?    ? "L" : "l",
        has_jlinks?   ? "J" : "j",
        has_form?     ? "F" : "f",
        has_comments? ? "C" : "c"
      ].join
    end

    # true when @links is not empty
    def has_links?; !@links.empty?; end
    # true when @jlinks is not empty
    def has_jlinks?; !@jlinks.empty?; end
    # true when @forms holds at least one entry
    def has_form?; @forms.size >= 1; end
    # true when @comments is not empty
    def has_comments?; !@comments.empty?; end

    # return the size of self.body_data
    def size
      body_data.size
    end

    # return md5sum for self.body_data
    def md5
      body_data.md5
    end

    # Collect the cookies set by this response.
    #
    # Returns a FormArray holding one [name, value] pair for every
    # SET-COOKIE header (header name compared case-insensitively); only
    # the part before the first ";" of each header value is used.  The
    # FormArray is empty when the response sets no cookies.  A header with
    # an empty value is skipped instead of raising.
    def set_cookies?
      ret = FormArray.new
      header_data.each do |x|
        next unless x[0].upcase == "SET-COOKIE"
        pair = x[1].to_s.split(";").first
        next if pair.nil? # empty header value: nothing to record
        ret << pair.split("=", 2)
      end
      ret
    end
    alias_method :set_cookies, :set_cookies?

    # alias_method for total_time
    def time
      total_time
    end

    # return MD5 for DOM fingerprint
    # take all tag names in page.to_s.md5
    def fingerprint
      all_tags.to_s.md5
    end
    alias_method :fp, :fingerprint #:nodoc:

    # alias_method for last_effective_url
    def current_url
      last_effective_url
    end

    alias_method :current, :current_url
    alias_method :cur, :current_url
    alias_method :now, :current_url

    # the last http response code
    def code
      response_code # .to_s
    end

  end
end
@@ -0,0 +1,196 @@
1
+ module WWMD
2
+ LINKS_REGEXP = [
3
+ /window\.open\s*\(([^\)]+)/i,
4
+ /open_window\s*\(([^\)]+)/i,
5
+ /window\.location\s*=\s*(['"][^'"]+['"])/i,
6
+ /.*location.href\s*=\s*(['"][^'"]+['"])/i,
7
+ /document.forms.*action\s*=\s*(['"][^'"]+['"])/i,
8
+ /Ajax\.Request\s*\((['"][^'"]+['"])/i,
9
+ ]
10
+
11
+ class Scrape
12
+
13
+ attr_accessor :debug
14
+ attr_accessor :warn
15
+ attr_accessor :links # links found on page
16
+ attr_accessor :jlinks # links to javascript includes
17
+ attr_reader :hdoc
18
+
19
+ # create a new scrape object using passed HTML
20
+ def initialize(page='<>')
21
+ @page = page
22
+ @hdoc = HDOC.parse(@page)
23
+ @links = Array.new
24
+ @debug = false
25
+ @warn = false
26
+ end
27
+
28
+ # reset this scrape object (called by WWMD::Page)
29
+ def reset(page)
30
+ @page = page
31
+ @hdoc = HDOC.parse(@page)
32
+ @links = Array.new
33
+ end
34
+
35
+ # scan the passed string for the configured regular expressions
36
+ # and return them as an array
37
+ def urls_from_regexp(content,re,split=0)
38
+ ret = []
39
+ scrape = content.scan(re)
40
+ scrape.each do |url|
41
+ # cheat and take split string(,)[split]
42
+ add = url.to_s.split(',')[split].gsub(/['"]/, '')
43
+ next if (add == '' || add.nil?)
44
+ ret << add
45
+ end
46
+ return ret
47
+ end
48
+
49
+ # xpath search for tags and return the passed attribute
50
+ # urls_from_xpath("//a","href")
51
+ def urls_from_xpath(xpath,attr)
52
+ ret = []
53
+ @hdoc.search(xpath).each do |elem|
54
+ url = elem[attr]
55
+ next if url.empty?
56
+ ret << url.strip
57
+ end
58
+ return ret
59
+ end
60
+
61
+ # <b>NEED</b> to move this to external configuration
62
+ #
63
+ # list of urls we don't care to store in our links list
64
+ def reject_links
65
+ putw "WARN: override reject_links in helper script" if @warn
66
+ default_reject_links
67
+ end
68
+
69
+ # default reject links (override using reject_links in helper script)
70
+ def default_reject_links
71
+ @links.reject! do |url|
72
+ url.nil? ||
73
+ url.extname == ".css" ||
74
+ url.extname == ".pdf" ||
75
+ url =~ /javascript:/i ||
76
+ url =~ /mailto:/i ||
77
+ url =~ /[\[\]]/ ||
78
+ url =~ /^#/
79
+ end
80
+ end
81
+
82
+ # return an array of Form objects for forms on page
83
+ def for_forms
84
+ ret = []
85
+ @hdoc.search("//form").each { |f| ret << Form.new(f) }
86
+ ret
87
+ end
88
+
89
+ # use xpath searches to get
90
+ # * //a href
91
+ # * //area href
92
+ # * //frame src
93
+ # * //iframe src
94
+ # * //form action
95
+ # * //meta refresh content urls
96
+ # then get //script tags and regexp out links in javascript function calls
97
+ # from elem.inner_html
98
+ def for_links(reject=true)
99
+ self.urls_from_xpath("//a","href").each { |url| @links << url } # get <a href=""> elements
100
+ self.urls_from_xpath("//area","href").each { |url| @links << url } # get <area href=""> elements
101
+ self.urls_from_xpath("//frame","src").each { |url| @links << url } # get <frame src=""> elements
102
+ self.urls_from_xpath("//iframe","src").each { |url| @links << url } # get <iframe src=""> elements
103
+ self.urls_from_xpath("//form","action").each { |url| @links << url } # get <form action=""> elements
104
+
105
+ # <meta> refresh
106
+ @hdoc.search("//meta").each do |meta|
107
+ next if meta['http-equiv'] != "refresh"
108
+ next if not (content = meta['content'].split(/=/)[1])
109
+ @links << content.strip
110
+ end
111
+
112
+ # add urls from onclick handlers
113
+ @hdoc.search("*[@onclick]").each do |onclick|
114
+ LINKS_REGEXP.each do |re|
115
+ self.urls_from_regexp(onclick['onclick'],re).each do |url|
116
+ @links << url
117
+ end
118
+ end
119
+ end
120
+
121
+ # add urls_from_regexp (limit to <script> tags (elem.inner_html))
122
+ @hdoc.search("//script").each do |scr|
123
+ LINKS_REGEXP.each do |re|
124
+ self.urls_from_regexp(scr.inner_html,re).each { |url| @links << url }
125
+ end
126
+ end
127
+
128
+ # re-define urls_from_helper in what you mix in
129
+ begin
130
+ self.urls_from_helper
131
+ end
132
+
133
+ self.reject_links; # reject links we don't care about
134
+ return @links
135
+ end
136
+
137
+ # scrape the page for <script src=""> tags
138
+ def for_javascript_links
139
+ urls = []
140
+ @hdoc.search("//script[@src]").each { |tag| urls << tag['src'] }
141
+ urls.reject! { |url| File.extname(url).clip != ".js" }
142
+ return urls
143
+ end
144
+
145
+ # scan page for comment fields
146
+ def for_comments
147
+ @page.scan(/\<!\s*--(.*?)--\s*\>/m).map { |x| x.to_s }
148
+ end
149
+
150
+ # scrape the page for a meta refresh tag and return the url from the contents attribute or nil
151
+ def for_meta_refresh
152
+ has_mr = @hdoc.search("//meta").map { |x| x['http-equiv'] }.include?('Refresh')
153
+ if has_mr
154
+ urls = @hdoc.search("//meta[@content]").map { |x| x['content'].split(";",2)[1] }
155
+ if urls.size > 1
156
+ STDERR.puts "PARSE ERROR: more than one meta refresh tag"
157
+ return "ERR"
158
+ end
159
+ k,v = urls.first.split("=",2)
160
+ if k.upcase.strip != "URL"
161
+ STDERR.puts "PARSE ERROR: content attribute of meta refresh does not contain url"
162
+ return "ERR"
163
+ end
164
+ return v.strip
165
+ else
166
+ return nil
167
+ end
168
+ end
169
+
170
+ # scrape the page for a script tag that contains a bare location.href tag (to redirect the page)
171
+ def for_javascript_redirect
172
+ redirs = []
173
+ @hdoc.search("//script").each do |scr|
174
+ scr.inner_html.scan(/.*location.href\s*=\s*['"]([^'"]+)['"]/i).each { |x| redirs += x }
175
+ end
176
+ if redirs.size > 1
177
+ STDERR.puts "PARSE ERROR: more than one javascript redirect"
178
+ return "ERR"
179
+ end
180
+ return redirs.first if not redirs.empty?
181
+ return nil
182
+ end
183
+
184
+ # renamed class variable (for backward compat)
185
+ def warnings#:nodoc:
186
+ return @warn
187
+ end
188
+
189
+ # define an urls_from_helper method in your task specific script
190
+ def urls_from_helper
191
+ putw "WARN: Please set an urls_from_helper override in your helper script" if @warn
192
+ return nil
193
+ end
194
+
195
+ end
196
+ end
@@ -0,0 +1,127 @@
1
module WWMD
  # when a WWMD::Page object is created, it creates its own WWMD::Spider object
  # which can be accessed using <tt>page.spider.method</tt>.  The <tt>page.set_data</tt>
  # method calls <tt>page.spider.add</tt> with the current url and a list of scraped
  # links from the page.  This class doesn't do any real heavy lifting.
  #
  # a simple spider can be written just by recursing through page.spider.next until
  # it's empty.
  class Spider

    attr_accessor :queued
    attr_accessor :visited
    attr_accessor :bypass
    attr_accessor :local_only
    attr_reader :opts
    attr_accessor :ignore
    attr_accessor :csrf_token

    DEFAULT_IGNORE = [
      /logoff/i,
      /logout/i,
    ]

    # pass me opts and an array of regexps to ignore
    # we have a set of sane(ish) defaults here
    #
    # opts[:spider_local_only] (default true) restricts the queue to urls
    # that contain opts[:base_url].
    #--
    # The default ignore list is copied so that <tt>spider.ignore << re</tt>
    # changes this spider only and never the shared DEFAULT_IGNORE constant.
    #++
    def initialize(opts={},ignore=nil)
      @opts = opts
      @visited = []
      @queued = []
      @csrf_token = nil
      @local_only = opts[:spider_local_only].nil? ? true : opts[:spider_local_only]
      @ignore = ignore || DEFAULT_IGNORE.dup
    end

    # Push an url onto the queue.
    #
    # Returns true when the url was queued and false when it was refused:
    # nil url, url matching the ignore list, url outside the base url
    # (when local_only is set), or url already visited or queued.
    def push_url(url)
      return false if url.nil?
      return false if _check_ignore(url)
      return false if @local_only && !_local?(url)
      return false if @visited.include?(url) || @queued.include?(url)
      @queued.push(url)
      true
    end
    alias_method :push, :push_url

    # skip items in the queue
    def skip(tim=1)
      tim.times { @queued.shift }
      true
    end

    # get the next url in the queue
    def get_next
      queued.shift
    end

    alias_method :next, :get_next

    # more elements in the queue?
    def next?
      !queued.empty?
    end

    # Return the most recently visited url that does not contain +url+
    # (nil when there is none).
    #--
    # NOTE: the original author flagged this method as "doesn't look
    # right"; the selection rule is kept as is.  +url+ is compared as
    # plain text rather than interpolated into a Regexp, so characters
    # such as "?" and "." in the url are taken literally.
    #++
    def get_last(url)
      needle = url.to_s
      @visited.reject { |v| v.to_s.include?(needle) }.last
    end

    # show the visited list (or the entry in the list at [id])
    def show_visited(id=nil)
      return @visited[id] unless id.nil?
      @visited.each_with_index { |entry, i| putx i.to_s + " :: " + entry.to_s }
      nil
    end

    alias_method :v, :show_visited

    # show the current queue (or the entry in the queue at [id])
    def show_queue(id=nil)
      return @queued[id] unless id.nil?
      @queued.each_with_index { |entry, i| putx i.to_s + " :: " + entry.to_s }
      nil
    end

    alias_method :q, :show_queue

    # Record +url+ as visited and queue every entry of +links+ (subject
    # to the push_url rules).  Does nothing when +url+ was already
    # visited.  Always returns nil.
    def add(url='',links=[])
      return nil if @visited.include?(url)
      @visited.push(url)
      links.each { |l| push_url(l) }
      nil
    end

    # set up the ignore list
    # ignore list is an array of regexp objects
    # remember to set this up before calling any Page methods
    def set_ignore(arr)
      @ignore = arr
    end

    # Return +url+ with the value of the @csrf_token parameter blanked.
    # The url is returned untouched when no csrf token name is configured
    # or the url does not carry that parameter.
    def _de_csrf(url)
      return url if @csrf_token.nil?
      act,params = url.clopa
      form = params.to_form
      return url unless form.has_key?(@csrf_token)
      form[@csrf_token] = ''
      act + form.to_get
    end

    # true when +url+ matches any regexp in the ignore list
    def _check_ignore(url)
      @ignore.each { |x| return true if (url =~ x) }
      false
    end

    # true when +url+ belongs to opts[:base_url]
    #
    # A String base url is matched literally (as a substring); a Regexp
    # base url is applied as given.  A missing base url accepts every url.
    def _local?(url)#:nodoc:
      base = @opts[:base_url]
      return !(url.to_s =~ base).nil? if base.is_a?(Regexp)
      url.to_s.include?(base.to_s)
    end
  end
end
+ end