wwmd 0.2.20.3
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +38 -0
- data/README.rdoc +87 -0
- data/Rakefile +33 -0
- data/examples/config_example.yaml +24 -0
- data/examples/wwmd_example.rb +73 -0
- data/lib/wwmd.rb +84 -0
- data/lib/wwmd/class_extensions.rb +4 -0
- data/lib/wwmd/class_extensions/extensions_base.rb +251 -0
- data/lib/wwmd/class_extensions/extensions_encoding.rb +79 -0
- data/lib/wwmd/class_extensions/extensions_external.rb +18 -0
- data/lib/wwmd/class_extensions/extensions_nilclass.rb +11 -0
- data/lib/wwmd/class_extensions/extensions_rbkb.rb +193 -0
- data/lib/wwmd/class_extensions/mixins_string_encoding.rb +40 -0
- data/lib/wwmd/guid.rb +155 -0
- data/lib/wwmd/page.rb +3 -0
- data/lib/wwmd/page/_fa.old +302 -0
- data/lib/wwmd/page/auth.rb +17 -0
- data/lib/wwmd/page/constants.rb +63 -0
- data/lib/wwmd/page/form.rb +99 -0
- data/lib/wwmd/page/form_array.rb +304 -0
- data/lib/wwmd/page/headers.rb +118 -0
- data/lib/wwmd/page/helpers.rb +41 -0
- data/lib/wwmd/page/html2text_hpricot.rb +76 -0
- data/lib/wwmd/page/html2text_nokogiri.rb +42 -0
- data/lib/wwmd/page/inputs.rb +47 -0
- data/lib/wwmd/page/irb_helpers.rb +114 -0
- data/lib/wwmd/page/page.rb +257 -0
- data/lib/wwmd/page/parsing_convenience.rb +98 -0
- data/lib/wwmd/page/reporting_helpers.rb +89 -0
- data/lib/wwmd/page/scrape.rb +196 -0
- data/lib/wwmd/page/spider.rb +127 -0
- data/lib/wwmd/urlparse.rb +125 -0
- data/lib/wwmd/viewstate.rb +17 -0
- data/lib/wwmd/viewstate/viewstate.rb +101 -0
- data/lib/wwmd/viewstate/viewstate_deserializer_methods.rb +217 -0
- data/lib/wwmd/viewstate/viewstate_from_xml.rb +129 -0
- data/lib/wwmd/viewstate/viewstate_types.rb +51 -0
- data/lib/wwmd/viewstate/viewstate_utils.rb +164 -0
- data/lib/wwmd/viewstate/viewstate_yaml.rb +25 -0
- data/lib/wwmd/viewstate/vs_stubs.rb +22 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_array.rb +38 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_binary_serialized.rb +30 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_hashtable.rb +42 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_hybrid_dict.rb +42 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_indexed_string.rb +6 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_indexed_string_ref.rb +24 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_int_enum.rb +27 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_list.rb +34 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_pair.rb +29 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_read_types.rb +11 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_read_value.rb +35 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_sparse_array.rb +58 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_string.rb +33 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_string_array.rb +39 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_string_formatted.rb +32 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_stub_helpers.rb +37 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_triplet.rb +31 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_type.rb +23 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_unit.rb +30 -0
- data/lib/wwmd/viewstate/vs_stubs/vs_value.rb +35 -0
- data/lib/wwmd/wwmd_config.rb +52 -0
- data/lib/wwmd/wwmd_puts.rb +9 -0
- data/lib/wwmd/wwmd_utils.rb +28 -0
- data/spec/README +3 -0
- data/spec/form_array.spec +49 -0
- data/spec/spider_csrf_test.spec +28 -0
- data/spec/urlparse_test.spec +101 -0
- data/tasks/ann.rake +80 -0
- data/tasks/bones.rake +20 -0
- data/tasks/gem.rake +201 -0
- data/tasks/git.rake +40 -0
- data/tasks/notes.rake +27 -0
- data/tasks/post_load.rake +34 -0
- data/tasks/rdoc.rake +51 -0
- data/tasks/rubyforge.rake +55 -0
- data/tasks/setup.rb +292 -0
- data/tasks/spec.rake +54 -0
- data/tasks/test.rake +40 -0
- data/tasks/zentest.rake +36 -0
- metadata +222 -0
@@ -0,0 +1,98 @@
|
|
1
|
+
module WWMD
  class Page
    #:section: Parsing convenience methods
    # methods that help parse and find information on a page including
    # access to forms etc.

    # Return every line of body_data matching +reg+, with leading whitespace
    # removed from each returned line.
    #
    # Lines are enumerated with String#each_line, so this does not depend on
    # String being Enumerable (which was only the case on Ruby 1.8).
    def grep(reg)
      body_data.each_line.grep(reg).map { |line| line.gsub(/^\s+/, "") }
    end

    # Return this page's form (at index +id+, default 0) as a FormArray.
    #
    # Returns nil when the page has no forms or there is no form at +id+.
    # The form action falls back to the current url and then to the literal
    # "PARSE_ERROR" before being resolved against the current url through
    # @urlparse.
    def get_form(id=nil)
      id ||= 0
      return nil if forms.empty? || !forms[id]
      f = @forms[id]
      # named +target+ so the local does not shadow the #action method
      target = f.action || cur || "PARSE_ERROR"
      url_action = @urlparse.parse(self.cur, target).to_s
      type = f.type
      FormArray.new do |x|
        x.set_fields(f.fields)
        x.action = url_action
        x.type = type
      end
    end

    # Return the complete url to the form action on this page.
    #
    # +id+ is the index into forms (default 0). Returns nil when there is no
    # form at that index (mirroring get_form), and last_effective_url when
    # the form has a nil or empty action.
    def action(id=nil)
      id ||= 0
      form = self.forms[id]
      return nil unless form
      act = form.action
      return self.last_effective_url if act.nil? || act.empty?
      @urlparse.parse(self.last_effective_url, act).to_s
    end

    # return an array of Element objects for an xpath search
    def search(xpath)
      self.scrape.hdoc.search(xpath)
    end

    # Return an array holding the inner_html of each <script> tag whose
    # content is not blank. Blank scripts are dropped from the result instead
    # of being reported as nil entries.
    def dump_scripts
      self.get_tags("//script").map { |s| s.inner_html }.reject { |src| src.strip == '' }
    end

    alias_method :scripts, :dump_scripts

    # set link using an integer link from self.report
    #--
    # NOTE: I always use page.get(page.l(1)) anyway.
    #++
    def set_link(index)
      self.url = @links[index]
    end

    # return link at index from @links array
    def get_link(index)
      @links[index]
    end

    alias_method :link, :get_link #:nodoc:
    alias_method :l, :get_link #:nodoc:

    # return the name of every element matched by search("*")
    def all_tags#:nodoc:
      self.search("*").map { |x| x.name }
    end

    # set self.url to +url+ resolved against self.base_url by @urlparse
    def furl(url)
      self.url = @urlparse.parse(self.base_url, url).to_s
    end

    # set self.opts[:base_url] and self.base_url; returns nil (and changes
    # nothing) when +url+ is nil or false
    def setbase(url=nil)
      return nil if not url
      self.opts[:base_url] = url
      self.base_url = url
    end

    # Write self.body_data to +filename+ and return a short confirmation
    # string. The message is interpolated so that any object accepted by
    # File.write (not only a String) can be passed as +filename+.
    def write(filename)
      File.write(filename, self.body_data)
      "wrote to #{filename}"
    end

    # read self.body_data from file, then call set_data
    def read(filename)
      self.body_data = File.read(filename)
      self.set_data
    end

    # alias_method for body_data
    def raw
      self.body_data
    end
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
module WWMD
  class Page
    attr_accessor :status
    #:section: Reporting helper methods
    # These are methods that generate data for a parsed page

    # Return a text representation of the page's response code and store it
    # in @status: "ERR" when response_code is above 399, "OK" otherwise.
    #
    # override with specific statuses in helper depending on page text
    # etc to include statuses outside 200 = OK and other = ERR
    def page_status
      # a single expression so the method returns the status in both cases
      # (a trailing modifier-if would evaluate to nil for the "OK" case)
      @status = self.response_code > 399 ? "ERR" : "OK"
    end

    # alias_method :status, :page_status#:nodoc:

    # return value of @logged_in
    def logged_in?
      @logged_in
    end

    # return a string of flags (upper case = present, lower case = absent):
    # Ll links
    # Jj javascript includes
    # Ff forms
    # Cc comments
    def report_flags
      [
        self.has_links?    ? "L" : "l",
        self.has_jlinks?   ? "J" : "j",
        self.has_form?     ? "F" : "f",
        self.has_comments? ? "C" : "c",
      ].join
    end

    # true when @links is not empty
    def has_links?; !@links.empty?; end
    # true when @jlinks is not empty
    def has_jlinks?; !@jlinks.empty?; end
    # true when @forms holds at least one entry
    def has_form?; @forms.size >= 1; end
    # true when @comments is not empty
    def has_comments?; !@comments.empty?; end

    # return page size (body_data.size)
    def size
      self.body_data.size
    end

    # return md5sum for self.body_data
    def md5
      self.body_data.md5
    end

    # Collect the SET-COOKIE headers of this response.
    #
    # Returns a FormArray with one [name, value] pair per SET-COOKIE header
    # found in header_data; only the text before the first ";" of each
    # header is used. The FormArray is empty when there are no such headers.
    def set_cookies?
      ret = FormArray.new()
      self.header_data.each do |x|
        next unless x[0].upcase == "SET-COOKIE"
        pair = x[1].to_s.split(";").first
        next if pair.nil? # header present but empty: nothing to record
        ret << pair.split("=", 2)
      end
      ret
    end
    alias_method :set_cookies, :set_cookies?

    # alias for total_time
    def time
      self.total_time
    end

    # return MD5 for DOM fingerprint
    # take all tag names in page.to_s.md5
    def fingerprint
      self.all_tags.to_s.md5
    end
    alias_method :fp, :fingerprint #:nodoc:

    # alias_method for last_effective_url
    def current_url
      self.last_effective_url
    end

    alias_method :current, :current_url
    alias_method :cur, :current_url
    alias_method :now, :current_url

    # the last http response code
    def code
      self.response_code # .to_s
    end

  end
end
|
@@ -0,0 +1,196 @@
|
|
1
|
+
module WWMD
  # Regular expressions used to pull urls out of javascript (onclick
  # handlers and <script> bodies). Each has one capture group holding the
  # (possibly quoted, possibly comma separated) argument text.
  LINKS_REGEXP = [
    /window\.open\s*\(([^\)]+)/i,
    /open_window\s*\(([^\)]+)/i,
    /window\.location\s*=\s*(['"][^'"]+['"])/i,
    /.*location\.href\s*=\s*(['"][^'"]+['"])/i,
    /document\.forms.*action\s*=\s*(['"][^'"]+['"])/i,
    /Ajax\.Request\s*\((['"][^'"]+['"])/i,
  ]

  class Scrape

    attr_accessor :debug
    attr_accessor :warn
    attr_accessor :links # links found on page
    attr_accessor :jlinks # links to javascript includes
    attr_reader :hdoc

    # create a new scrape object using passed HTML
    def initialize(page='<>')
      @page = page
      @hdoc = HDOC.parse(@page)
      @links = Array.new
      @debug = false
      @warn = false
    end

    # reset this scrape object (called by WWMD::Page)
    def reset(page)
      @page = page
      @hdoc = HDOC.parse(@page)
      @links = Array.new
    end

    # Scan +content+ for the regular expression +re+ and return the matched
    # urls as an array of strings.
    #
    # Each match is split on "," and the piece at index +split+ is kept with
    # all single and double quotes removed; matches that have no piece at
    # that index, or whose piece is empty, are skipped. A nil +content+
    # yields an empty array.
    def urls_from_regexp(content,re,split=0)
      ret = []
      content.to_s.scan(re).each do |match|
        # scan yields an Array of captures (or a String when +re+ has no
        # groups); join the captures explicitly because Array#to_s returns
        # the bracketed inspect form on Ruby >= 1.9
        piece = Array(match).join.split(',')[split]
        next if piece.nil?
        add = piece.gsub(/['"]/, '')
        next if add == ''
        ret << add
      end
      ret
    end

    # xpath search for tags and return the passed attribute
    # urls_from_xpath("//a","href")
    #
    # Values are stripped; elements without the attribute, or with a blank
    # value, are skipped.
    def urls_from_xpath(xpath,attr)
      ret = []
      @hdoc.search(xpath).each do |elem|
        url = elem[attr]
        next if url.nil?
        url = url.strip
        next if url.empty?
        ret << url
      end
      ret
    end

    # <b>NEED</b> to move this to external configuration
    #
    # list of urls we don't care to store in our links list
    def reject_links
      putw "WARN: override reject_links in helper script" if @warn
      default_reject_links
    end

    # default reject links (override using reject_links in helper script)
    #
    # Removes from @links, in place: nil entries, urls whose extname is
    # ".css" or ".pdf", urls containing "javascript:" or "mailto:", urls
    # containing square brackets, and urls starting with "#".
    def default_reject_links
      @links.reject! do |url|
        url.nil? ||
        url.extname == ".css" ||
        url.extname == ".pdf" ||
        url =~ /javascript:/i ||
        url =~ /mailto:/i ||
        url =~ /[\[\]]/ ||
        url =~ /^#/
      end
    end

    # return an array of Form objects for forms on page
    def for_forms
      @hdoc.search("//form").map { |f| Form.new(f) }
    end

    # use xpath searches to get
    # * //a href
    # * //area href
    # * //frame src
    # * //iframe src
    # * //form action
    # * //meta refresh content urls
    # then get onclick handlers and //script tags and regexp out links in
    # javascript function calls from elem.inner_html
    #
    # Found urls are appended to @links, which is returned. When +reject+ is
    # true (the default) reject_links is applied before returning; pass
    # false to get the unfiltered list.
    def for_links(reject=true)
      [
        ["//a", "href"],
        ["//area", "href"],
        ["//frame", "src"],
        ["//iframe", "src"],
        ["//form", "action"],
      ].each do |xpath, attr|
        @links.concat(self.urls_from_xpath(xpath, attr))
      end

      # <meta> refresh (http-equiv compared case-insensitively)
      @hdoc.search("//meta").each do |meta|
        next unless meta['http-equiv'].to_s.casecmp("refresh") == 0
        # split only on the first "=" so urls with a query string stay whole
        target = meta['content'].to_s.split("=", 2)[1]
        next if target.nil? || target.strip.empty?
        @links << target.strip
      end

      # add urls from onclick handlers
      @hdoc.search("*[@onclick]").each do |onclick|
        LINKS_REGEXP.each do |re|
          @links.concat(self.urls_from_regexp(onclick['onclick'], re))
        end
      end

      # add urls_from_regexp (limit to <script> tags (elem.inner_html))
      @hdoc.search("//script").each do |scr|
        LINKS_REGEXP.each do |re|
          @links.concat(self.urls_from_regexp(scr.inner_html, re))
        end
      end

      # re-define urls_from_helper in what you mix in
      self.urls_from_helper

      self.reject_links if reject # reject links we don't care about
      @links
    end

    # scrape the page for <script src=""> tags
    def for_javascript_links
      urls = @hdoc.search("//script[@src]").map { |tag| tag['src'] }
      urls.reject! { |url| File.extname(url).clip != ".js" }
      urls
    end

    # scan page for comment fields; returns the text between the comment
    # delimiters of each HTML comment as an array of strings
    def for_comments
      # each scan result is a one-element capture Array; join it rather than
      # relying on Array#to_s, which is the inspect form on Ruby >= 1.9
      @page.scan(/\<!\s*--(.*?)--\s*\>/m).map { |captures| captures.join }
    end

    # Scrape the page for a meta refresh tag and return the url from its
    # content attribute.
    #
    # Returns nil when the page has no meta refresh tag, and the string
    # "ERR" (after printing a message to STDERR) when there is more than one
    # refresh tag or the content attribute has no "url=" part. Only <meta>
    # tags whose http-equiv is "refresh" (any case) are considered.
    def for_meta_refresh
      refreshes = @hdoc.search("//meta").select do |meta|
        meta['http-equiv'].to_s.casecmp("refresh") == 0
      end
      return nil if refreshes.empty?
      if refreshes.size > 1
        STDERR.puts "PARSE ERROR: more than one meta refresh tag"
        return "ERR"
      end
      # content looks like "<seconds>; url=<target>"
      k,v = refreshes.first['content'].to_s.split(";",2)[1].to_s.split("=",2)
      if k.nil? || v.nil? || k.upcase.strip != "URL"
        STDERR.puts "PARSE ERROR: content attribute of meta refresh does not contain url"
        return "ERR"
      end
      v.strip
    end

    # scrape the page for a script tag that contains a bare location.href tag (to redirect the page)
    #
    # Returns the redirect target, nil when there is none, or "ERR" (after
    # printing a message to STDERR) when more than one is found.
    def for_javascript_redirect
      redirs = []
      @hdoc.search("//script").each do |scr|
        scr.inner_html.scan(/.*location\.href\s*=\s*['"]([^'"]+)['"]/i).each { |x| redirs.concat(x) }
      end
      if redirs.size > 1
        STDERR.puts "PARSE ERROR: more than one javascript redirect"
        return "ERR"
      end
      redirs.first
    end

    # renamed class variable (for backward compat)
    def warnings#:nodoc:
      @warn
    end

    # define an urls_from_helper method in your task specific script
    def urls_from_helper
      putw "WARN: Please set an urls_from_helper override in your helper script" if @warn
      nil
    end

  end
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
module WWMD
  # when a WWMD::Page object is created, it creates its own WWMD::Spider object
  # which can be accessed using <tt>page.spider.method</tt>. The <tt>page.set_data</tt>
  # method calls <tt>page.spider.add</tt> with the current url and a list of scraped
  # links from the page. This class doesn't do any real heavy lifting.
  #
  # a simple spider can be written just by recursing through page.spider.next until
  # it's empty.
  class Spider

    attr_accessor :queued
    attr_accessor :visited
    attr_accessor :bypass
    attr_accessor :local_only
    attr_reader :opts
    attr_accessor :ignore
    attr_accessor :csrf_token

    DEFAULT_IGNORE = [
      /logoff/i,
      /logout/i,
    ]

    # pass me opts and an array of regexps to ignore
    # we have a set of sane(ish) defaults here
    #
    # opts[:spider_local_only] (default true) restricts the queue to urls
    # that contain opts[:base_url]. When +ignore+ is nil a private copy of
    # DEFAULT_IGNORE is used, so appending to spider.ignore never modifies
    # the shared constant.
    def initialize(opts={},ignore=nil)
      @opts = opts
      @visited = []
      @queued = []
      @csrf_token = nil
      @local_only = opts[:spider_local_only].nil? ? true : opts[:spider_local_only]
      @ignore = ignore || DEFAULT_IGNORE.dup
    end

    # Push an url onto the queue.
    #
    # Returns false (and queues nothing) when the url matches the ignore
    # list, when local_only is set and the url does not contain
    # opts[:base_url], or when the url was already visited or queued.
    # Returns true once the url has been queued.
    def push_url(url)
      return false if _check_ignore(url)
      # base_url is matched literally: regexp metacharacters that are
      # common in urls (".", "?", "+") must not change the match
      return false if @local_only && url !~ _literal_pattern(@opts[:base_url])
      return false if @visited.include?(url) || @queued.include?(url)
      @queued.push(url)
      true
    end
    alias_method :push, :push_url

    # skip (drop) the first +tim+ items in the queue; always returns true
    def skip(tim=1)
      tim.times { @queued.shift }
      true
    end

    # get the next url in the queue (removing it), or nil when it is empty
    def get_next
      queued.shift
    end

    alias_method :next, :get_next

    # more elements in the queue?
    def next?
      !queued.empty?
    end

    # get the last url we visited that does not contain +url+ (matched
    # literally, or as a pattern when a Regexp is passed)
    #--
    # NOTE: original author's remark: "this doesn't look right"
    #++
    def get_last(url)
      pattern = _literal_pattern(url)
      @visited.reject { |v| v =~ pattern }.last
    end

    # show the visited list (or the entry in the list at [id])
    def show_visited(id=nil)
      return @visited[id] unless id.nil?
      @visited.each_with_index { |entry, i| putx "#{i} :: #{entry}" }
      nil
    end

    alias_method :v, :show_visited

    # return the current queue (or the entry in the queue at [id])
    def show_queue(id=nil)
      return @queued[id] unless id.nil?
      @queued.each_with_index { |entry, i| putx "#{i} :: #{entry}" }
      nil
    end

    alias_method :q, :show_queue

    # Mark +url+ as visited and push each of +links+ onto the queue.
    # Does nothing when +url+ was already visited. Always returns nil.
    def add(url='',links=[])
      return nil if @visited.include?(url)
      @visited.push(url)
      links.each { |l| self.push_url l }
      nil
    end

    # set up the ignore list
    # ignore list is an array of regexp objects
    # remember to set this up before calling any Page methods
    def set_ignore(arr)
      @ignore = arr
    end

    # Return +url+ with the value of the @csrf_token parameter blanked.
    # The url is returned unchanged when no csrf_token is configured or the
    # url does not carry that parameter.
    def _de_csrf(url)
      return url if @csrf_token.nil?
      act,params = url.clopa
      form = params.to_form
      return url if !form.has_key?(@csrf_token)
      form[@csrf_token] = ''
      act + form.to_get
    end

    # true when +url+ matches any regexp in the ignore list
    def _check_ignore(url)
      @ignore.any? { |pattern| url =~ pattern }
    end

    private

    # Build a regexp that matches +text+ literally. A Regexp argument is
    # passed through untouched; nil becomes the empty pattern, which
    # matches every string.
    def _literal_pattern(text)
      return text if text.is_a?(Regexp)
      Regexp.new(Regexp.escape(text.to_s))
    end
  end
end
|