wwmd 0.2.20.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. data/History.txt +38 -0
  2. data/README.rdoc +87 -0
  3. data/Rakefile +33 -0
  4. data/examples/config_example.yaml +24 -0
  5. data/examples/wwmd_example.rb +73 -0
  6. data/lib/wwmd.rb +84 -0
  7. data/lib/wwmd/class_extensions.rb +4 -0
  8. data/lib/wwmd/class_extensions/extensions_base.rb +251 -0
  9. data/lib/wwmd/class_extensions/extensions_encoding.rb +79 -0
  10. data/lib/wwmd/class_extensions/extensions_external.rb +18 -0
  11. data/lib/wwmd/class_extensions/extensions_nilclass.rb +11 -0
  12. data/lib/wwmd/class_extensions/extensions_rbkb.rb +193 -0
  13. data/lib/wwmd/class_extensions/mixins_string_encoding.rb +40 -0
  14. data/lib/wwmd/guid.rb +155 -0
  15. data/lib/wwmd/page.rb +3 -0
  16. data/lib/wwmd/page/_fa.old +302 -0
  17. data/lib/wwmd/page/auth.rb +17 -0
  18. data/lib/wwmd/page/constants.rb +63 -0
  19. data/lib/wwmd/page/form.rb +99 -0
  20. data/lib/wwmd/page/form_array.rb +304 -0
  21. data/lib/wwmd/page/headers.rb +118 -0
  22. data/lib/wwmd/page/helpers.rb +41 -0
  23. data/lib/wwmd/page/html2text_hpricot.rb +76 -0
  24. data/lib/wwmd/page/html2text_nokogiri.rb +42 -0
  25. data/lib/wwmd/page/inputs.rb +47 -0
  26. data/lib/wwmd/page/irb_helpers.rb +114 -0
  27. data/lib/wwmd/page/page.rb +257 -0
  28. data/lib/wwmd/page/parsing_convenience.rb +98 -0
  29. data/lib/wwmd/page/reporting_helpers.rb +89 -0
  30. data/lib/wwmd/page/scrape.rb +196 -0
  31. data/lib/wwmd/page/spider.rb +127 -0
  32. data/lib/wwmd/urlparse.rb +125 -0
  33. data/lib/wwmd/viewstate.rb +17 -0
  34. data/lib/wwmd/viewstate/viewstate.rb +101 -0
  35. data/lib/wwmd/viewstate/viewstate_deserializer_methods.rb +217 -0
  36. data/lib/wwmd/viewstate/viewstate_from_xml.rb +129 -0
  37. data/lib/wwmd/viewstate/viewstate_types.rb +51 -0
  38. data/lib/wwmd/viewstate/viewstate_utils.rb +164 -0
  39. data/lib/wwmd/viewstate/viewstate_yaml.rb +25 -0
  40. data/lib/wwmd/viewstate/vs_stubs.rb +22 -0
  41. data/lib/wwmd/viewstate/vs_stubs/vs_array.rb +38 -0
  42. data/lib/wwmd/viewstate/vs_stubs/vs_binary_serialized.rb +30 -0
  43. data/lib/wwmd/viewstate/vs_stubs/vs_hashtable.rb +42 -0
  44. data/lib/wwmd/viewstate/vs_stubs/vs_hybrid_dict.rb +42 -0
  45. data/lib/wwmd/viewstate/vs_stubs/vs_indexed_string.rb +6 -0
  46. data/lib/wwmd/viewstate/vs_stubs/vs_indexed_string_ref.rb +24 -0
  47. data/lib/wwmd/viewstate/vs_stubs/vs_int_enum.rb +27 -0
  48. data/lib/wwmd/viewstate/vs_stubs/vs_list.rb +34 -0
  49. data/lib/wwmd/viewstate/vs_stubs/vs_pair.rb +29 -0
  50. data/lib/wwmd/viewstate/vs_stubs/vs_read_types.rb +11 -0
  51. data/lib/wwmd/viewstate/vs_stubs/vs_read_value.rb +35 -0
  52. data/lib/wwmd/viewstate/vs_stubs/vs_sparse_array.rb +58 -0
  53. data/lib/wwmd/viewstate/vs_stubs/vs_string.rb +33 -0
  54. data/lib/wwmd/viewstate/vs_stubs/vs_string_array.rb +39 -0
  55. data/lib/wwmd/viewstate/vs_stubs/vs_string_formatted.rb +32 -0
  56. data/lib/wwmd/viewstate/vs_stubs/vs_stub_helpers.rb +37 -0
  57. data/lib/wwmd/viewstate/vs_stubs/vs_triplet.rb +31 -0
  58. data/lib/wwmd/viewstate/vs_stubs/vs_type.rb +23 -0
  59. data/lib/wwmd/viewstate/vs_stubs/vs_unit.rb +30 -0
  60. data/lib/wwmd/viewstate/vs_stubs/vs_value.rb +35 -0
  61. data/lib/wwmd/wwmd_config.rb +52 -0
  62. data/lib/wwmd/wwmd_puts.rb +9 -0
  63. data/lib/wwmd/wwmd_utils.rb +28 -0
  64. data/spec/README +3 -0
  65. data/spec/form_array.spec +49 -0
  66. data/spec/spider_csrf_test.spec +28 -0
  67. data/spec/urlparse_test.spec +101 -0
  68. data/tasks/ann.rake +80 -0
  69. data/tasks/bones.rake +20 -0
  70. data/tasks/gem.rake +201 -0
  71. data/tasks/git.rake +40 -0
  72. data/tasks/notes.rake +27 -0
  73. data/tasks/post_load.rake +34 -0
  74. data/tasks/rdoc.rake +51 -0
  75. data/tasks/rubyforge.rake +55 -0
  76. data/tasks/setup.rb +292 -0
  77. data/tasks/spec.rake +54 -0
  78. data/tasks/test.rake +40 -0
  79. data/tasks/zentest.rake +36 -0
  80. metadata +222 -0
@@ -0,0 +1,98 @@
1
module WWMD
  class Page
    #:section: Parsing convenience methods
    # methods that help parse and find information on a page including
    # access to forms etc.

    # Return every line of body_data matching +reg+, with the leading
    # whitespace of each returned line removed.
    #
    # Lines are walked with String#each_line instead of String#grep:
    # String is only Enumerable on Ruby 1.8, so the direct grep call
    # raises NoMethodError on later interpreters.
    def grep(reg)
      body_data.each_line.grep(reg).map { |line| line.gsub(/^\s+/, "") }
    end

    # Return this page's form (at index +id+, default 0) as a FormArray.
    #
    # Returns nil when the page has no forms or there is no form at +id+.
    # The FormArray action is the form's own action resolved against the
    # current url; when the form has no action the current url is used,
    # and "PARSE_ERROR" is the last-resort placeholder.
    def get_form(id=nil)
      id ||= 0
      return nil if forms.empty? || !forms[id]
      f = @forms[id]
      target = f.action || cur || "PARSE_ERROR"
      url_action = @urlparse.parse(self.cur, target).to_s
      type = f.type
      FormArray.new do |x|
        x.set_fields(f.fields)
        x.action = url_action
        x.type = type
      end
    end

    # Return the complete url to the form action on this page for the
    # form at index +id+ (default 0).
    #
    # Returns nil when there is no form at +id+ (same convention as
    # get_form), last_effective_url when the form's action is nil or
    # empty, and otherwise the action resolved against last_effective_url.
    def action(id=nil)
      id ||= 0
      form = self.forms[id]
      return nil unless form
      act = form.action
      return self.last_effective_url if act.nil? || act.empty?
      @urlparse.parse(self.last_effective_url, act).to_s
    end

    # return an array of Element objects for an xpath search
    def search(xpath)
      self.scrape.hdoc.search(xpath)
    end

    # Return an array with the inner_html of each <script> tag found by
    # get_tags. A script whose inner_html is blank contributes nil to
    # the array rather than being dropped.
    def dump_scripts
      self.get_tags("//script").map do |tag|
        tag.inner_html if tag.inner_html.strip != ''
      end
    end

    alias_method :scripts, :dump_scripts

    # set link using an integer link from self.report
    #--
    # NOTE: I always use page.get(page.l(1)) anyway.
    #++
    def set_link(index)
      self.url = @links[index]
    end

    # return link at index from @links array
    def get_link(index)
      @links[index]
    end

    alias_method :link, :get_link #:nodoc:
    alias_method :l, :get_link #:nodoc:

    # Return the tag name of every element matched by search("*").
    def all_tags#:nodoc:
      self.search("*").map { |elem| elem.name }
    end

    # Resolve +url+ against base_url (via @urlparse) and assign the
    # result to self.url.
    def furl(url)
      self.url = @urlparse.parse(self.base_url, url).to_s
    end

    # Set both self.opts[:base_url] and self.base_url to +url+.
    # Returns nil without changing anything when +url+ is not given.
    def setbase(url=nil)
      return nil unless url
      self.opts[:base_url] = url
      self.base_url = url
    end

    # Write self.body_data to +filename+ and return a confirmation string.
    def write(filename)
      File.write(filename, self.body_data)
      "wrote to " + filename
    end

    # Read self.body_data from +filename+, then call set_data.
    def read(filename)
      self.body_data = File.read(filename)
      self.set_data
    end

    # alias_method for body_data
    def raw
      self.body_data
    end
  end
end
@@ -0,0 +1,89 @@
1
module WWMD
  class Page
    attr_accessor :status
    #:section: Reporting helper methods
    # These are methods that generate data for a parsed page

    # Return a text representation of the page response code and store
    # it in @status: "ERR" when response_code is above 399, "OK" otherwise.
    #
    # override with specific statuses in helper depending on page text
    # etc to include statuses outside 200 = OK and other = ERR
    def page_status
      # Build the value first and return it explicitly: ending the method
      # on a modifier-if would return nil whenever the condition is false.
      @status = self.response_code > 399 ? "ERR" : "OK"
      @status
    end

    # alias_method :status, :page_status#:nodoc:

    # return value of @logged_in
    def logged_in?
      @logged_in
    end

    # return a string of flags (upper case = present, lower case = absent):
    # Ll links
    # Jj javascript includes
    # Ff forms
    # Cc comments
    def report_flags
      flags  = (has_links?    ? "L" : "l")
      flags += (has_jlinks?   ? "J" : "j")
      flags += (has_form?     ? "F" : "f")
      flags += (has_comments? ? "C" : "c")
      flags
    end

    # Presence predicates backing report_flags; each one inspects the
    # matching instance variable collection.
    def has_links?; !@links.empty?; end
    def has_jlinks?; !@jlinks.empty?; end
    def has_form?; @forms.size >= 1; end
    def has_comments?; !@comments.empty?; end

    # Return the page size, taken as body_data.size.
    # NOTE(review): String#size counts characters rather than bytes on
    # Ruby >= 1.9 -- confirm whether bytesize is wanted here.
    def size
      self.body_data.size
    end

    # Return md5sum for self.body_data. Relies on body_data responding
    # to +md5+, which is not a core String method.
    def md5
      self.body_data.md5
    end

    # Collect the cookies set by this response.
    #
    # Walks header_data and, for each pair whose name is SET-COOKIE (any
    # case), appends the cookie's [name, value] (the text before the
    # first ";", split once on "=") to a FormArray, which is returned.
    # The FormArray is returned even when no cookie header is present.
    def set_cookies?
      ret = FormArray.new()
      self.header_data.each do |header|
        next unless header[0].upcase == "SET-COOKIE"
        ret << header[1].split(";").first.split("=", 2)
      end
      ret
    end
    alias_method :set_cookies, :set_cookies?

    # alias_method for total_time
    def time
      self.total_time
    end

    # return MD5 for DOM fingerprint
    # take all tag names in page.to_s.md5
    def fingerprint
      self.all_tags.to_s.md5
    end
    alias_method :fp, :fingerprint #:nodoc:

    # alias_method for last_effective_url
    def current_url
      self.last_effective_url
    end

    alias_method :current, :current_url
    alias_method :cur, :current_url
    alias_method :now, :current_url

    # the last http response code
    def code
      self.response_code # .to_s
    end

  end
end
@@ -0,0 +1,196 @@
1
module WWMD
  # Patterns applied to onclick handlers and <script> bodies by
  # Scrape#for_links. Each pattern has one capture group holding the
  # (possibly quoted, possibly comma separated) argument that carries
  # the url.
  LINKS_REGEXP = [
    /window\.open\s*\(([^\)]+)/i,
    /open_window\s*\(([^\)]+)/i,
    /window\.location\s*=\s*(['"][^'"]+['"])/i,
    /.*location.href\s*=\s*(['"][^'"]+['"])/i,
    /document.forms.*action\s*=\s*(['"][^'"]+['"])/i,
    /Ajax\.Request\s*\((['"][^'"]+['"])/i,
  ]

  class Scrape

    attr_accessor :debug
    attr_accessor :warn
    attr_accessor :links # links found on page
    attr_accessor :jlinks # links to javascript includes
    attr_reader :hdoc

    # create a new scrape object using passed HTML
    def initialize(page='<>')
      @page = page
      @hdoc = HDOC.parse(@page)
      @links = Array.new
      @debug = false
      @warn = false
    end

    # reset this scrape object (called by WWMD::Page)
    # re-parses +page+ and empties the collected links
    def reset(page)
      @page = page
      @hdoc = HDOC.parse(@page)
      @links = Array.new
    end

    # Scan +content+ for +re+ and return the urls found as an array.
    #
    # Each match is split on "," and the piece at index +split+ is kept
    # with its quote characters removed; matches that have no such piece,
    # or whose piece is empty once unquoted, are skipped.
    def urls_from_regexp(content,re,split=0)
      ret = []
      content.scan(re).each do |match|
        # scan yields an Array of captures when +re+ has groups and a
        # String otherwise. Joining reproduces Ruby 1.8's Array#to_s;
        # since 1.9 Array#to_s is inspect and would add brackets/quotes.
        text = match.is_a?(Array) ? match.join : match.to_s
        # cheat and take split string(,)[split]
        piece = text.split(',')[split]
        next if piece.nil?
        add = piece.gsub(/['"]/, '')
        next if add == ''
        ret << add
      end
      ret
    end

    # xpath search for tags and return the passed attribute
    # urls_from_xpath("//a","href")
    #
    # Elements that lack the attribute, or carry it empty, are skipped;
    # the remaining values are returned stripped of surrounding whitespace.
    def urls_from_xpath(xpath,attr)
      ret = []
      @hdoc.search(xpath).each do |elem|
        url = elem[attr]
        next if url.nil? || url.empty?
        ret << url.strip
      end
      ret
    end

    # <b>NEED</b> to move this to external configuration
    #
    # list of urls we don't care to store in our links list
    def reject_links
      putw "WARN: override reject_links in helper script" if @warn
      default_reject_links
    end

    # default reject links (override using reject_links in helper script)
    #
    # Removes from @links, in place: nil entries, urls whose extname is
    # .css or .pdf, javascript: and mailto: urls, urls containing square
    # brackets and urls that start with "#".
    def default_reject_links
      @links.reject! do |url|
        url.nil? ||
        url.extname == ".css" ||
        url.extname == ".pdf" ||
        url =~ /javascript:/i ||
        url =~ /mailto:/i ||
        url =~ /[\[\]]/ ||
        url =~ /^#/
      end
    end

    # return an array of Form objects for forms on page
    def for_forms
      ret = []
      @hdoc.search("//form").each { |f| ret << Form.new(f) }
      ret
    end

    # use xpath searches to get
    # * //a href
    # * //area href
    # * //frame src
    # * //iframe src
    # * //form action
    # * //meta refresh content urls
    # then get //script tags and regexp out links in javascript function calls
    # from elem.inner_html
    #
    # Found urls are appended to @links, which is returned. Unwanted links
    # are removed with reject_links unless +reject+ is false.
    def for_links(reject=true)
      [
        ["//a", "href"],       # get <a href=""> elements
        ["//area", "href"],    # get <area href=""> elements
        ["//frame", "src"],    # get <frame src=""> elements
        ["//iframe", "src"],   # get <iframe src=""> elements
        ["//form", "action"],  # get <form action=""> elements
      ].each do |xpath, attr|
        self.urls_from_xpath(xpath, attr).each { |url| @links << url }
      end

      # <meta> refresh
      @hdoc.search("//meta").each do |meta|
        # http-equiv values are matched case-insensitively
        next unless meta['http-equiv'].to_s.downcase == "refresh"
        # split once only, so a "=" inside the url's query string survives
        content = meta['content'].to_s.split(/=/, 2)[1]
        next if not content
        @links << content.strip
      end

      # add urls from onclick handlers
      @hdoc.search("*[@onclick]").each do |onclick|
        LINKS_REGEXP.each do |re|
          self.urls_from_regexp(onclick['onclick'],re).each { |url| @links << url }
        end
      end

      # add urls_from_regexp (limit to <script> tags (elem.inner_html))
      @hdoc.search("//script").each do |scr|
        LINKS_REGEXP.each do |re|
          self.urls_from_regexp(scr.inner_html,re).each { |url| @links << url }
        end
      end

      # re-define urls_from_helper in what you mix in
      self.urls_from_helper

      self.reject_links if reject # reject links we don't care about
      @links
    end

    # scrape the page for <script src=""> tags
    # only srcs whose (clipped) extension is ".js" are returned
    def for_javascript_links
      urls = []
      @hdoc.search("//script[@src]").each { |tag| urls << tag['src'] }
      urls.reject! { |url| File.extname(url).clip != ".js" }
      urls
    end

    # scan page for comment fields
    # returns the text between the comment delimiters of each comment
    def for_comments
      # join the single capture instead of Array#to_s, which is inspect
      # on Ruby >= 1.9
      @page.scan(/\<!\s*--(.*?)--\s*\>/m).map { |captures| captures.join }
    end

    # scrape the page for a meta refresh tag and return the url from the contents attribute or nil
    #
    # Only <meta> tags whose http-equiv is "refresh" (any case) are
    # considered. Returns nil when there is none, and "ERR" (after
    # printing a PARSE ERROR line to STDERR) when there is more than one
    # or when the content attribute has no "url=..." part.
    def for_meta_refresh
      refresh_tags = @hdoc.search("//meta").select do |tag|
        tag['http-equiv'].to_s.downcase == 'refresh'
      end
      return nil if refresh_tags.empty?
      if refresh_tags.size > 1
        STDERR.puts "PARSE ERROR: more than one meta refresh tag"
        return "ERR"
      end
      # content looks like "<seconds>; url=<target>"
      directive = refresh_tags.first['content'].to_s.split(";",2)[1]
      k,v = directive.to_s.split("=",2)
      if k.nil? || v.nil? || k.upcase.strip != "URL"
        STDERR.puts "PARSE ERROR: content attribute of meta refresh does not contain url"
        return "ERR"
      end
      v.strip
    end

    # scrape the page for a script tag that contains a bare location.href tag (to redirect the page)
    #
    # Returns the redirect target, nil when none is found, or "ERR"
    # (after a STDERR message) when more than one is found.
    def for_javascript_redirect
      redirs = []
      @hdoc.search("//script").each do |scr|
        scr.inner_html.scan(/.*location.href\s*=\s*['"]([^'"]+)['"]/i).each { |x| redirs += x }
      end
      if redirs.size > 1
        STDERR.puts "PARSE ERROR: more than one javascript redirect"
        return "ERR"
      end
      redirs.empty? ? nil : redirs.first
    end

    # renamed class variable (for backward compat)
    def warnings#:nodoc:
      @warn
    end

    # define an urls_from_helper method in your task specific script
    def urls_from_helper
      putw "WARN: Please set an urls_from_helper override in your helper script" if @warn
      nil
    end

  end
end
@@ -0,0 +1,127 @@
1
module WWMD
  # when a WWMD::Page object is created, it creates its own WWMD::Spider object
  # which can be accessed using <tt>page.spider.method</tt>. The <tt>page.set_data</tt>
  # method calls <tt>page.spider.add</tt> with the current url and a list of scraped
  # links from the page. This class doesn't do any real heavy lifting.
  #
  # a simple spider can be written just by recursing through page.spider.next until
  # it's empty.
  class Spider

    attr_accessor :queued
    attr_accessor :visited
    attr_accessor :bypass
    attr_accessor :local_only
    attr_reader :opts
    attr_accessor :ignore
    attr_accessor :csrf_token

    DEFAULT_IGNORE = [
      /logoff/i,
      /logout/i,
    ]

    # pass me opts and an array of regexps to ignore
    # we have a set of sane(ish) defaults here
    #
    # opts[:spider_local_only] overrides the default of only queueing
    # urls that contain opts[:base_url].
    def initialize(opts={},ignore=nil)
      @opts = opts
      @visited = []
      @queued = []
      @csrf_token = nil
      @local_only = opts[:spider_local_only].nil? ? true : opts[:spider_local_only]
      @ignore = ignore || DEFAULT_IGNORE
    end

    # push an url onto the queue
    #
    # Returns false (queueing nothing) when the url matches the ignore
    # list, when local_only is set and the url does not contain
    # opts[:base_url], or when the url was already visited or queued.
    # Returns true once the url has been queued.
    def push_url(url)
      return false if _check_ignore(url)
      return false if @local_only && !(url =~ _url_pattern(@opts[:base_url]))
      return false if @visited.include?(url) || @queued.include?(url)
      @queued.push(url)
      true
    end
    alias_method :push, :push_url

    # skip items in the queue
    def skip(tim=1)
      tim.times { @queued.shift }
      true
    end

    # get the next url in the queue
    def get_next
      queued.shift
    end

    alias_method :next, :get_next

    # more elements in the queue?
    def next?
      !queued.empty?
    end

    # get the last url we visited? this doesn't look right
    #
    # Returns the most recently visited entry that does not contain +url+.
    def get_last(url)
      pattern = _url_pattern(url)
      @visited.reject { |v| v =~ pattern }.last
    end

    # show the visited list (or the entry in the list at [id])
    def show_visited(id=nil)
      return @visited[id] unless id.nil?
      @visited.each_index { |i| putx i.to_s + " :: " + @visited[i].to_s }
      nil
    end

    alias_method :v, :show_visited

    # return the current queue (or the entry in the queue at [id])
    def show_queue(id=nil)
      return @queued[id] unless id.nil?
      @queued.each_index { |i| putx i.to_s + " :: " + @queued[i].to_s }
      nil
    end

    alias_method :q, :show_queue

    # Record +url+ as visited and queue each of +links+ through push_url.
    # Does nothing when +url+ was already visited. Always returns nil.
    def add(url='',links=[])
      return nil if @visited.include?(url)
      @visited.push(url)
      links.each { |l| self.push_url l }
      nil
    end

    # set up the ignore list
    # ignore list is an array of regexp objects
    # remember to set this up before calling any Page methods
    def set_ignore(arr)
      @ignore = arr
    end

    # Blank out the value of the csrf_token parameter in +url+.
    # Returns +url+ unchanged when no csrf_token is configured or the
    # url does not carry that parameter.
    def _de_csrf(url)
      return url if @csrf_token.nil?
      act,params = url.clopa
      form = params.to_form
      return url if !form.has_key?(@csrf_token)
      form[@csrf_token] = ''
      act + form.to_get
    end

    # true when +url+ matches any entry of the ignore list
    def _check_ignore(url)
      @ignore.any? { |x| url =~ x }
    end

    # Build the Regexp used to look for +text+ inside an url.
    # Regexp metacharacters in the text are escaped so that "." and "?"
    # are matched literally; a Regexp argument is used as given.
    def _url_pattern(text)
      return text if text.is_a?(Regexp)
      Regexp.new(Regexp.escape(text.to_s))
    end
  end
end