iZsh-wwmd 0.2.19

Sign up to get free protection for your applications and to get access to all the features.
Files changed (79) hide show
  1. data/History.txt +24 -0
  2. data/README.rdoc +87 -0
  3. data/Rakefile +34 -0
  4. data/examples/config_example.yaml +24 -0
  5. data/examples/wwmd_example.rb +73 -0
  6. data/lib/wwmd.rb +78 -0
  7. data/lib/wwmd/class_extensions.rb +2 -0
  8. data/lib/wwmd/class_extensions/extensions_base.rb +235 -0
  9. data/lib/wwmd/class_extensions/extensions_encoding.rb +79 -0
  10. data/lib/wwmd/class_extensions/extensions_external.rb +18 -0
  11. data/lib/wwmd/class_extensions/extensions_nilclass.rb +11 -0
  12. data/lib/wwmd/class_extensions/extensions_rbkb.rb +188 -0
  13. data/lib/wwmd/class_extensions/mixins_string_encoding.rb +40 -0
  14. data/lib/wwmd/guid.rb +155 -0
  15. data/lib/wwmd/page.rb +3 -0
  16. data/lib/wwmd/page/auth.rb +17 -0
  17. data/lib/wwmd/page/constants.rb +64 -0
  18. data/lib/wwmd/page/form.rb +99 -0
  19. data/lib/wwmd/page/form_array.rb +296 -0
  20. data/lib/wwmd/page/headers.rb +111 -0
  21. data/lib/wwmd/page/helpers.rb +30 -0
  22. data/lib/wwmd/page/html2text_hpricot.rb +76 -0
  23. data/lib/wwmd/page/html2text_nokogiri.rb +42 -0
  24. data/lib/wwmd/page/inputs.rb +47 -0
  25. data/lib/wwmd/page/irb_helpers.rb +114 -0
  26. data/lib/wwmd/page/page.rb +241 -0
  27. data/lib/wwmd/page/parsing_convenience.rb +94 -0
  28. data/lib/wwmd/page/reporting_helpers.rb +87 -0
  29. data/lib/wwmd/page/scrape.rb +198 -0
  30. data/lib/wwmd/page/spider.rb +127 -0
  31. data/lib/wwmd/urlparse.rb +104 -0
  32. data/lib/wwmd/viewstate.rb +17 -0
  33. data/lib/wwmd/viewstate/viewstate.rb +101 -0
  34. data/lib/wwmd/viewstate/viewstate_deserializer_methods.rb +217 -0
  35. data/lib/wwmd/viewstate/viewstate_from_xml.rb +128 -0
  36. data/lib/wwmd/viewstate/viewstate_types.rb +51 -0
  37. data/lib/wwmd/viewstate/viewstate_utils.rb +162 -0
  38. data/lib/wwmd/viewstate/viewstate_yaml.rb +25 -0
  39. data/lib/wwmd/viewstate/vs_stubs.rb +22 -0
  40. data/lib/wwmd/viewstate/vs_stubs/vs_array.rb +38 -0
  41. data/lib/wwmd/viewstate/vs_stubs/vs_binary_serialized.rb +30 -0
  42. data/lib/wwmd/viewstate/vs_stubs/vs_hashtable.rb +42 -0
  43. data/lib/wwmd/viewstate/vs_stubs/vs_hybrid_dict.rb +42 -0
  44. data/lib/wwmd/viewstate/vs_stubs/vs_indexed_string.rb +6 -0
  45. data/lib/wwmd/viewstate/vs_stubs/vs_indexed_string_ref.rb +24 -0
  46. data/lib/wwmd/viewstate/vs_stubs/vs_int_enum.rb +27 -0
  47. data/lib/wwmd/viewstate/vs_stubs/vs_list.rb +34 -0
  48. data/lib/wwmd/viewstate/vs_stubs/vs_pair.rb +29 -0
  49. data/lib/wwmd/viewstate/vs_stubs/vs_read_types.rb +11 -0
  50. data/lib/wwmd/viewstate/vs_stubs/vs_read_value.rb +35 -0
  51. data/lib/wwmd/viewstate/vs_stubs/vs_sparse_array.rb +58 -0
  52. data/lib/wwmd/viewstate/vs_stubs/vs_string.rb +33 -0
  53. data/lib/wwmd/viewstate/vs_stubs/vs_string_array.rb +39 -0
  54. data/lib/wwmd/viewstate/vs_stubs/vs_string_formatted.rb +32 -0
  55. data/lib/wwmd/viewstate/vs_stubs/vs_stub_helpers.rb +37 -0
  56. data/lib/wwmd/viewstate/vs_stubs/vs_triplet.rb +31 -0
  57. data/lib/wwmd/viewstate/vs_stubs/vs_type.rb +23 -0
  58. data/lib/wwmd/viewstate/vs_stubs/vs_unit.rb +30 -0
  59. data/lib/wwmd/viewstate/vs_stubs/vs_value.rb +35 -0
  60. data/lib/wwmd/wwmd_config.rb +52 -0
  61. data/lib/wwmd/wwmd_puts.rb +9 -0
  62. data/lib/wwmd/wwmd_utils.rb +28 -0
  63. data/spec/README +3 -0
  64. data/spec/form_array.spec +49 -0
  65. data/spec/spider_csrf_test.spec +28 -0
  66. data/spec/urlparse_test.spec +101 -0
  67. data/tasks/ann.rake +80 -0
  68. data/tasks/bones.rake +20 -0
  69. data/tasks/gem.rake +201 -0
  70. data/tasks/git.rake +40 -0
  71. data/tasks/notes.rake +27 -0
  72. data/tasks/post_load.rake +34 -0
  73. data/tasks/rdoc.rake +51 -0
  74. data/tasks/rubyforge.rake +55 -0
  75. data/tasks/setup.rb +292 -0
  76. data/tasks/spec.rake +54 -0
  77. data/tasks/test.rake +40 -0
  78. data/tasks/zentest.rake +36 -0
  79. metadata +174 -0
@@ -0,0 +1,94 @@
1
module WWMD
  class Page
    #:section: Parsing convenience methods
    # methods that help parse and find information on a page including
    # access to forms etc.

    # grep body_data line by line for regexp and remove the leading
    # whitespace from every matching line
    #
    # String is not Enumerable from Ruby 1.9 on (there is no String#grep),
    # so the lines are walked with each_line, which gives the same result
    # on 1.8.7 and on every later version.
    def grep(reg)
      self.body_data.each_line.grep(reg).map { |line| line.gsub(/^\s+/, "") }
    end

    # return this page's form (at index id, default 0) as a FormArray
    #
    # returns nil when the page has no forms or no form at that index.
    # The form action is resolved against the current url before it is
    # stored in the FormArray.
    def get_form(id=nil)
      id ||= 0
      return nil if forms.empty? || !forms[id]
      f = @forms[id]
      url_action = @urlparse.parse(self.cur, f.action).to_s
      type = f.type
      FormArray.new do |x|
        x.set_fields(f.fields)
        x.action = url_action
        x.type = type
      end
    end

    # return the complete url to the form action on this page
    #
    # id is the index of the form (default 0). Returns nil when there is
    # no form at that index (the same answer get_form gives) instead of
    # failing on nil. A form with a missing or empty action posts back to
    # the page itself, so last_effective_url is returned for it.
    def action(id=nil)
      id ||= 0
      form = self.forms[id]
      return nil if form.nil?
      act = form.action
      return self.last_effective_url if (act.nil? || act.empty?)
      @urlparse.parse(self.last_effective_url, act).to_s
    end

    # return the result of an xpath search on the scraped document
    def search(xpath)
      self.scrape.hdoc.search(xpath)
    end

    # return an array with one entry per <script> tag encountered: the
    # tag's inner_html, or nil when the script body is blank
    def dump_scripts
      self.get_tags("//script").map do |tag|
        html = tag.inner_html
        html if html.strip != ''
      end
    end

    alias_method :scripts, :dump_scripts

    # set link using an integer link from self.report
    #--
    # NOTE: I always use page.get(page.l(1)) anyway.
    #++
    def set_link(index)
      self.url = @links[index]
    end

    # return link at index from @links array
    def get_link(index)
      @links[index]
    end

    alias_method :link, :get_link #:nodoc:
    alias_method :l, :get_link #:nodoc:

    # return the name of every element matched by an xpath search for "*"
    def all_tags #:nodoc:
      self.search("*").map { |elem| elem.name }
    end

    # resolve url against base_url and make the result the page's url
    def furl(url)
      self.url = @urlparse.parse(self.base_url, url).to_s
    end

    # set self.opts[:base_url] and self.base_url
    #
    # returns nil (and changes nothing) when no url is passed
    def setbase(url=nil)
      return nil if not url
      self.opts[:base_url] = url
      self.base_url = url
    end

    # write self.body_data to file and return a short confirmation string
    def write(filename)
      File.write(filename, self.body_data)
      "wrote to " + filename
    end

    # read self.body_data from file, then call set_data
    def read(filename)
      self.body_data = File.read(filename)
      self.set_data
    end

    # alias_method for body_data
    def raw
      self.body_data
    end
  end
end
@@ -0,0 +1,87 @@
1
module WWMD
  class Page
    #:section: Reporting helper methods
    # These are methods that generate data for a parsed page

    # text form of the page's response code: "OK" for 200, "ERR" for
    # anything else
    #
    # override in a helper when page text etc. should map to statuses
    # beyond this OK/ERR split
    def page_status
      self.response_code != 200 ? "ERR" : "OK"
    end

    alias_method :status, :page_status #:nodoc:

    # value of @logged_in
    def logged_in?
      @logged_in
    end

    # four character flag string, upper case when the page has the item
    # and lower case when it does not:
    #  Ll links
    #  Jj javascript includes
    #  Ff forms
    #  Cc comments
    def report_flags
      checks = [
        [self.has_links?,    "L", "l"],
        [self.has_jlinks?,   "J", "j"],
        [self.has_form?,     "F", "f"],
        [self.has_comments?, "C", "c"],
      ]
      checks.inject("") { |flags, (present, yes, no)| flags + (present ? yes : no) }
    end

    def has_links?
      !@links.empty?
    end

    def has_jlinks?
      !@jlinks.empty?
    end

    def has_form?
      @forms.size >= 1
    end

    def has_comments?
      !@comments.empty?
    end

    # size of self.body_data
    def size
      self.body_data.size
    end

    # md5sum of self.body_data
    def md5
      self.body_data.md5
    end

    # collect the values of every SET-COOKIE header of this response
    # (header names are compared case-insensitively); the result is an
    # array, empty when no such header is present
    def set_cookies?
      cookies = []
      self.header_data.each do |header|
        cookies << header[1] if header[0].upcase == "SET-COOKIE"
      end
      cookies
    end

    # shorthand for total_time
    def time
      self.total_time
    end

    # DOM fingerprint: md5 of the string form of all tag names on the page
    def fingerprint
      self.all_tags.to_s.md5
    end
    alias_method :fp, :fingerprint #:nodoc:

    # shorthand for last_effective_url
    def current_url
      self.last_effective_url
    end

    alias_method :current, :current_url
    alias_method :cur, :current_url
    alias_method :now, :current_url

    # the last http response code
    def code
      self.response_code # .to_s
    end

  end
end
@@ -0,0 +1,198 @@
1
+ # o hai! I need your help.
2
+
3
module WWMD
  # regular expressions used to pull urls out of javascript (onclick
  # handlers and <script> bodies); each has exactly one capture group
  # holding the argument text the url is taken from
  LINKS_REGEXP = [
    /window\.open\s*\(([^\)]+)/i,
    /open_window\s*\(([^\)]+)/i,
    /window\.location\s*=\s*(['"][^'"]+['"])/i,
    /.*location.href\s*=\s*(['"][^'"]+['"])/i,
    /document.forms.*action\s*=\s*(['"][^'"]+['"])/i,
    /Ajax\.Request\s*\((['"][^'"]+['"])/i,
  ]

  # scrapes links, forms, comments and redirects out of one HTML page
  class Scrape

    attr_accessor :debug
    attr_accessor :warn
    attr_accessor :links    # links found on page
    attr_accessor :jlinks   # links to javascript includes
    attr_reader :hdoc

    # create a new scrape object using passed HTML
    def initialize(page='<>')
      @page = page
      @hdoc = HDOC.parse(@page)
      @links = []
      @debug = false
      @warn = false
    end

    # reset this scrape object (called by WWMD::Page)
    #
    # re-parses the page and empties the link list; debug and warn keep
    # their current values
    def reset(page)
      @page = page
      @hdoc = HDOC.parse(@page)
      @links = []
    end

    # scan the passed string for the regular expression re and return the
    # urls found as an array
    #
    # the captured text is split on "," and element [split] is taken (so
    # split=0 picks the first argument of a javascript call); quotes are
    # removed and empty results are dropped
    def urls_from_regexp(content,re,split=0)
      ret = []
      content.to_s.scan(re).each do |captures|
        # scan yields an array of captures; join it instead of calling
        # to_s, which is Array#inspect from Ruby 1.9 on and would leave
        # brackets and quotes in the url
        piece = Array(captures).join.split(',')[split]
        next if piece.nil?   # no such argument: nothing to strip
        add = piece.gsub(/['"]/, '')
        next if add == ''
        ret << add
      end
      ret
    end

    # xpath search for tags and return the passed attribute
    #  urls_from_xpath("//a","href")
    #
    # elements that lack the attribute, or carry it empty, are skipped
    def urls_from_xpath(xpath,attr)
      ret = []
      @hdoc.search(xpath).each do |elem|
        url = elem[attr]
        next if url.nil? || url.empty?
        ret << url.strip
      end
      ret
    end

    # <b>NEED</b> to move this to external configuration
    #
    # list of urls we don't care to store in our links list
    def reject_links
      putw "WARN: override reject_links in helper script" if @warn
      default_reject_links
    end

    # default reject links (override using reject_links in helper script)
    #
    # drops nil entries, .css and .pdf files, javascript: and mailto:
    # urls, urls containing square brackets and bare #fragment links
    def default_reject_links
      @links.reject! do |url|
        url.nil? ||
        url.extname == ".css" ||
        url.extname == ".pdf" ||
        url =~ /javascript:/i ||
        url =~ /mailto:/i ||
        url =~ /[\[\]]/ ||
        url =~ /^#/
      end
    end

    # return an array of Form objects for forms on page
    def for_forms
      ret = []
      @hdoc.search("//form").each { |f| ret << Form.new(f) }
      ret
    end

    # use xpath searches to get
    # * //a href
    # * //area href
    # * //frame src
    # * //iframe src
    # * //form action
    # * //meta refresh content urls
    # then get onclick handlers and //script tags and regexp out links in
    # javascript function calls
    #
    # the links are appended to @links, which is returned. Pass
    # reject=false to skip the reject_links filtering.
    def for_links(reject=true)
      [
        ["//a",      "href"],
        ["//area",   "href"],
        ["//frame",  "src"],
        ["//iframe", "src"],
        ["//form",   "action"],
      ].each do |xpath, attr|
        self.urls_from_xpath(xpath, attr).each { |url| @links << url }
      end

      # <meta> refresh: http-equiv is case-insensitive in HTML, and the
      # content is split on the first "=" only so that urls carrying a
      # query string are kept whole
      @hdoc.search("//meta").each do |meta|
        next if meta['http-equiv'].to_s.downcase != "refresh"
        target = meta['content'].to_s.split(/=/, 2)[1]
        next if target.nil? || target.strip.empty?
        @links << target.strip
      end

      # add urls from onclick handlers
      @hdoc.search("*[@onclick]").each do |onclick|
        LINKS_REGEXP.each do |re|
          self.urls_from_regexp(onclick['onclick'], re).each { |url| @links << url }
        end
      end

      # add urls_from_regexp (limit to <script> tags (elem.inner_html))
      @hdoc.search("//script").each do |scr|
        LINKS_REGEXP.each do |re|
          self.urls_from_regexp(scr.inner_html, re).each { |url| @links << url }
        end
      end

      # re-define urls_from_helper in what you mix in
      self.urls_from_helper

      self.reject_links if reject   # reject links we don't care about
      @links
    end

    # scrape the page for <script src=""> tags and keep the .js ones
    def for_javascript_links
      urls = []
      @hdoc.search("//script[@src]").each { |tag| urls << tag['src'] }
      urls.reject! { |url| File.extname(url).clip != ".js" }
      urls
    end

    # scan page for comment fields and return the text inside each one
    def for_comments
      # join the capture array; Array#to_s would return its inspect form
      # from Ruby 1.9 on
      @page.scan(/\<!\s*--(.*?)--\s*\>/m).map { |captures| captures.join }
    end

    # scrape the page for a meta refresh tag and return the url from the
    # content attribute
    #
    # returns nil when there is no refresh tag (or the tag names no
    # target), "ERR" when there is more than one refresh tag or the
    # content does not carry url=. Only <meta> tags whose http-equiv is
    # "refresh" (any letter case) are considered, so description,
    # keyword etc. tags do not count as extra refreshes.
    def for_meta_refresh
      refreshes = []
      @hdoc.search("//meta").each do |meta|
        refreshes << meta if meta['http-equiv'].to_s.downcase == "refresh"
      end
      return nil if refreshes.empty?
      if refreshes.size > 1
        STDERR.puts "PARSE ERROR: more than one meta refresh tag"
        return "ERR"
      end
      # content looks like "5; url=http://..."; a bare delay has no target
      directive = refreshes.first['content'].to_s.split(";",2)[1]
      return nil if directive.nil? || directive.strip.empty?
      k,v = directive.split("=",2)
      if v.nil? || k.upcase.strip != "URL"
        STDERR.puts "PARSE ERROR: content attribute of meta refresh does not contain url"
        return "ERR"
      end
      v.strip
    end

    # scrape the page for a script tag that contains a bare location.href
    # assignment (to redirect the page)
    #
    # returns the url, nil when there is none, "ERR" when there are several
    def for_javascript_redirect
      redirs = []
      @hdoc.search("//script").each do |scr|
        scr.inner_html.scan(/.*location.href\s*=\s*['"]([^'"]+)['"]/i).each { |x| redirs += x }
      end
      if redirs.size > 1
        STDERR.puts "PARSE ERROR: more than one javascript redirect"
        return "ERR"
      end
      redirs.first
    end

    # renamed class variable (for backward compat)
    def warnings #:nodoc:
      @warn
    end

    # define an urls_from_helper method in your task specific script
    def urls_from_helper
      putw "WARN: Please set an urls_from_helper override in your helper script" if @warn
      nil
    end

  end
end
@@ -0,0 +1,127 @@
1
module WWMD
  # when a WWMD::Page object is created, it creates its own WWMD::Spider object
  # which can be accessed using <tt>page.spider.method</tt>. The <tt>page.set_data</tt>
  # method calls <tt>page.spider.add</tt> with the current url and a list of scraped
  # links from the page. This class doesn't do any real heavy lifting.
  #
  # a simple spider can be written just by recursing through page.spider.next until
  # it's empty.
  class Spider

    attr_accessor :queued
    attr_accessor :visited
    attr_accessor :bypass
    attr_accessor :local_only
    attr_reader :opts
    attr_accessor :ignore
    attr_accessor :csrf_token

    DEFAULT_IGNORE = [
      /logoff/i,
      /logout/i,
    ]

    # pass me opts and an array of regexps to ignore
    # we have a set of sane(ish) defaults here
    #
    # opts[:spider_local_only] (default true) restricts the queue to urls
    # that contain opts[:base_url]
    def initialize(opts={},ignore=nil)
      @opts = opts
      @visited = []
      @queued = []
      @csrf_token = nil
      local = opts[:spider_local_only]
      @local_only = local.nil? ? true : local
      # copy the defaults: appending to one spider's ignore list must not
      # change the shared constant (and with it every other spider)
      @ignore = ignore || DEFAULT_IGNORE.dup
    end

    # push an url onto the queue
    #
    # returns true when the url was queued and false when it was refused:
    # it matches the ignore list, it is not local (with local_only set),
    # or it has already been visited or queued
    def push_url(url)
      return false if _check_ignore(url)
      return false if @local_only && !_local?(url)
      return false if (@visited.include?(url) || @queued.include?(url))
      @queued.push(url)
      true
    end
    alias_method :push, :push_url

    # skip items in the queue
    def skip(tim=1)
      tim.times { @queued.shift }
      true
    end

    # get the next url in the queue (nil when the queue is empty)
    def get_next
      queued.shift
    end

    alias_method :next, :get_next

    # more elements in the queue?
    def next?
      !queued.empty?
    end

    # return the most recently visited url that does not contain url
    #
    # url is compared as literal text; a Regexp may be passed instead and
    # is then used as the pattern itself
    def get_last(url)
      pattern = url.is_a?(Regexp) ? url : Regexp.new(Regexp.escape(url.to_s))
      others = @visited.reject { |v| v =~ pattern }
      others[-1]
    end

    # show the visited list (or return the entry in the list at [id])
    def show_visited(id=nil)
      return @visited[id] unless id.nil?
      @visited.each_index { |i| putx i.to_s + " :: " + @visited[i].to_s }
      nil
    end

    alias_method :v, :show_visited

    # show the current queue (or return the entry in the queue at [id])
    def show_queue(id=nil)
      return @queued[id] unless id.nil?
      @queued.each_index { |i| putx i.to_s + " :: " + @queued[i].to_s }
      nil
    end

    alias_method :q, :show_queue

    # record url as visited and offer each of its links to the queue
    #
    # does nothing when url was visited before; always returns nil
    def add(url='',links=[])
      return nil if @visited.include?(url)
      @visited.push(url)
      links.each { |l| self.push_url l }
      nil
    end

    # set up the ignore list
    # ignore list is an array of regexp objects
    # remember to set this up before calling any Page methods
    def set_ignore(arr)
      @ignore = arr
    end

    # return url with the value of the csrf_token parameter blanked
    #
    # url is returned unchanged when no csrf_token is configured or the
    # url does not carry that parameter
    def _de_csrf(url)
      return url if @csrf_token.nil?
      act,params = url.clopa
      form = params.to_form
      return url if !form.has_key?(@csrf_token)
      form[@csrf_token] = ''
      act + form.to_get
    end

    # true when url matches any entry of the ignore list
    def _check_ignore(url)
      @ignore.each { |x| return true if (url =~ x) }
      false
    end

    # true when url contains opts[:base_url]
    #
    # plain substring test: a url is text, so "." and "?" in the base url
    # must not be read as regexp metacharacters. With no base_url set
    # every url counts as local; a nil url never does.
    def _local?(url)
      return false if url.nil?
      url.to_s.include?(@opts[:base_url].to_s)
    end
  end
end