scrubyt 0.3.4 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +31 -0
- data/README +1 -1
- data/Rakefile +4 -9
- data/lib/scrubyt.rb +37 -56
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +2 -183
- data/lib/scrubyt/core/navigation/navigation_actions.rb +30 -48
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +6 -6
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +6 -6
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +0 -4
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +0 -4
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +1 -5
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +0 -8
- data/lib/scrubyt/core/scraping/pattern.rb +6 -27
- data/lib/scrubyt/core/scraping/result_indexer.rb +0 -4
- data/lib/scrubyt/core/shared/extractor.rb +15 -1
- data/lib/scrubyt/output/result_node.rb +42 -6
- data/lib/scrubyt/output/scrubyt_result.rb +35 -30
- data/lib/scrubyt/utils/ruby_extensions.rb +0 -53
- data/lib/scrubyt/utils/xpathutils.rb +2 -1
- metadata +84 -119
- data/lib/scrubyt/output/export.rb +0 -157
@@ -8,103 +8,13 @@ module Scrubyt
|
|
8
8
|
#which is loading a document (even by submitting a form or clicking a link)
|
9
9
|
#and related things like setting a proxy etc. you should find it here.
|
10
10
|
module FetchAction
|
11
|
-
|
12
11
|
@@current_doc_url = nil
|
13
12
|
@@current_doc_protocol = nil
|
14
13
|
@@base_dir = nil
|
15
14
|
@@host_name = nil
|
16
|
-
@@agent = WWW::Mechanize.new
|
17
15
|
@@history = []
|
18
|
-
|
19
|
-
|
20
|
-
#Action to fetch a document (either a file or a http address)
|
21
|
-
#
|
22
|
-
#*parameters*
|
23
|
-
#
|
24
|
-
#_doc_url_ - the url or file name to fetch
|
25
|
-
def self.fetch(doc_url, *args)
|
26
|
-
#Refactor this crap!!! with option_accessor stuff
|
27
|
-
|
28
|
-
if args.size > 0
|
29
|
-
proxy = args[0][:proxy]
|
30
|
-
mechanize_doc = args[0][:mechanize_doc]
|
31
|
-
resolve = args[0][:resolve]
|
32
|
-
basic_auth = args[0][:basic_auth]
|
33
|
-
user_agent = args[0][:user_agent] || "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)"
|
34
|
-
#Refactor this whole stuff as well!!! It looks awful...
|
35
|
-
parse_and_set_proxy(proxy) if proxy
|
36
|
-
set_user_agent(user_agent)
|
37
|
-
parse_and_set_basic_auth(basic_auth) if basic_auth
|
38
|
-
else
|
39
|
-
mechanize_doc = nil
|
40
|
-
resolve = :full
|
41
|
-
end
|
42
|
-
|
43
|
-
@@current_doc_url = doc_url
|
44
|
-
@@current_doc_protocol = determine_protocol
|
45
|
-
|
46
|
-
if mechanize_doc.nil? && @@current_doc_protocol != 'file'
|
47
|
-
handle_relative_path(doc_url)
|
48
|
-
handle_relative_url(doc_url, resolve)
|
49
|
-
|
50
|
-
Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
|
51
|
-
|
52
|
-
unless 'file' == @@current_doc_protocol
|
53
|
-
@@mechanize_doc = @@agent.get(@@current_doc_url)
|
54
|
-
end
|
55
|
-
else
|
56
|
-
@@mechanize_doc = mechanize_doc
|
57
|
-
end
|
58
|
-
|
59
|
-
if @@current_doc_protocol == 'file'
|
60
|
-
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
|
61
|
-
else
|
62
|
-
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
|
63
|
-
store_host_name(self.get_current_doc_url) # in case we're on a new host
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
##
|
68
|
-
#Submit the last form;
|
69
|
-
def self.submit(current_form, button=nil, type=nil)
|
70
|
-
Scrubyt.log :ACTION, 'Submitting form...'
|
71
|
-
if button == nil
|
72
|
-
result_page = @@agent.submit(current_form)
|
73
|
-
elsif type
|
74
|
-
result_page = current_form.submit(button)
|
75
|
-
else
|
76
|
-
result_page = @@agent.submit(current_form, button)
|
77
|
-
end
|
78
|
-
@@current_doc_url = result_page.uri.to_s
|
79
|
-
Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
|
80
|
-
fetch(@@current_doc_url, :mechanize_doc => result_page)
|
81
|
-
end
|
82
|
-
|
83
|
-
##
|
84
|
-
#Click the link specified by the text
|
85
|
-
def self.click_link(link_spec,index = 0)
|
86
|
-
Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
|
87
|
-
if link_spec.is_a? Hash
|
88
|
-
clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
|
89
|
-
else
|
90
|
-
clicked_elem = SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
|
91
|
-
end
|
92
|
-
clicked_elem = XPathUtils.find_nearest_node_with_attribute(clicked_elem, 'href')
|
93
|
-
result_page = @@agent.click(clicked_elem)
|
94
|
-
@@current_doc_url = result_page.uri.to_s
|
95
|
-
Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
|
96
|
-
fetch(@@current_doc_url, :mechanize_doc => result_page)
|
97
|
-
end
|
98
|
-
|
99
|
-
def self.click_image_map(index = 0)
|
100
|
-
Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
|
101
|
-
uri = @@mechanize_doc.search("//area")[index]['href']
|
102
|
-
result_page = @@agent.get(uri)
|
103
|
-
@@current_doc_url = result_page.uri.to_s
|
104
|
-
Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
|
105
|
-
fetch(@@current_doc_url, :mechanize_doc => result_page)
|
106
|
-
end
|
107
|
-
|
16
|
+
@@current_form = nil
|
17
|
+
|
108
18
|
##
|
109
19
|
# At any given point, the current document can be queried with this method; Typically used
|
110
20
|
# when the navigation is over and the result document is passed to the wrapper
|
@@ -140,96 +50,5 @@ module Scrubyt
|
|
140
50
|
def store_host_name(doc_url)
|
141
51
|
FetchAction.store_host_name(doc_url)
|
142
52
|
end
|
143
|
-
|
144
|
-
def self.store_host_name(doc_url)
|
145
|
-
@@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
|
146
|
-
@@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
|
147
|
-
@@host_name = doc_url if @@host_name == nil
|
148
|
-
@@host_name = @@host_name[0..-2] if @@host_name[-1].chr == '/'
|
149
|
-
@@original_host_name ||= @@host_name
|
150
|
-
end #end of method store_host_name
|
151
|
-
|
152
|
-
def self.determine_protocol
|
153
|
-
old_protocol = @@current_doc_protocol
|
154
|
-
new_protocol = case @@current_doc_url
|
155
|
-
when /^https/
|
156
|
-
'https'
|
157
|
-
when /^http/
|
158
|
-
'http'
|
159
|
-
when /^www/
|
160
|
-
'http'
|
161
|
-
else
|
162
|
-
'file'
|
163
|
-
end
|
164
|
-
return 'http' if ((old_protocol == 'http') && new_protocol == 'file')
|
165
|
-
return 'https' if ((old_protocol == 'https') && new_protocol == 'file')
|
166
|
-
new_protocol
|
167
|
-
end
|
168
|
-
|
169
|
-
def self.parse_and_set_proxy(proxy)
|
170
|
-
if proxy.downcase == 'localhost'
|
171
|
-
@@host = 'localhost'
|
172
|
-
@@port = proxy.split(':').last
|
173
|
-
else
|
174
|
-
parts = proxy.split(':')
|
175
|
-
@@port = parts.delete_at(-1)
|
176
|
-
@@host = parts.join(':')
|
177
|
-
if (@@host == nil || @@port == nil)# !@@host =~ /^http/)
|
178
|
-
Scrubyt.log :ERROR, "Invalid proxy specification! Neither host nor port can be nil!"
|
179
|
-
exit
|
180
|
-
end
|
181
|
-
end
|
182
|
-
Scrubyt.log :ACTION, "Setting proxy: host=<#{@@host}>, port=<#{@@port}>"
|
183
|
-
@@agent.set_proxy(@@host, @@port)
|
184
|
-
end
|
185
|
-
|
186
|
-
def self.parse_and_set_basic_auth(basic_auth)
|
187
|
-
login, pass = basic_auth.split('@')
|
188
|
-
Scrubyt.log :ACTION, "Basic authentication: login=<#{login}>, pass=<#{pass}>"
|
189
|
-
@@agent.basic_auth(login, pass)
|
190
|
-
end
|
191
|
-
|
192
|
-
def self.set_user_agent(user_agent)
|
193
|
-
Scrubyt.log :ACTION, "Setting user-agent to #{user_agent}"
|
194
|
-
@@agent.user_agent = user_agent
|
195
|
-
end
|
196
|
-
|
197
|
-
def self.handle_relative_path(doc_url)
|
198
|
-
if @@base_dir == nil
|
199
|
-
@@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
|
200
|
-
else
|
201
|
-
@@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
|
202
|
-
end
|
203
|
-
end
|
204
|
-
|
205
|
-
def self.handle_relative_url(doc_url, resolve)
|
206
|
-
return if doc_url =~ /^http/
|
207
|
-
if doc_url !~ /^\//
|
208
|
-
first_char = doc_url[0..0]
|
209
|
-
doc_url = ( first_char == '?' ? '' : '/' ) + doc_url
|
210
|
-
if first_char == '?' #This is an ugly hack... really have to throw this shit out and go with mechanize's
|
211
|
-
current_uri = @@mechanize_doc.uri.to_s
|
212
|
-
current_uri = @@agent.history.first.uri.to_s if current_uri =~ /\/popup\//
|
213
|
-
if (current_uri.include? '?')
|
214
|
-
current_uri = current_uri.scan(/.+\//)[0]
|
215
|
-
else
|
216
|
-
current_uri += '/' unless current_uri[-1..-1] == '/'
|
217
|
-
end
|
218
|
-
@@current_doc_url = current_uri + doc_url
|
219
|
-
return
|
220
|
-
end
|
221
|
-
end
|
222
|
-
case resolve
|
223
|
-
when :full
|
224
|
-
@@current_doc_url = (@@host_name + doc_url) if ( @@host_name != nil && (doc_url !~ /#{@@host_name}/))
|
225
|
-
@@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
|
226
|
-
when :host
|
227
|
-
base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
|
228
|
-
@@current_doc_url = base_host_name + doc_url
|
229
|
-
else
|
230
|
-
#custom resilving
|
231
|
-
@@current_doc_url = resolve + doc_url
|
232
|
-
end
|
233
|
-
end
|
234
53
|
end
|
235
54
|
end
|
@@ -23,35 +23,32 @@ module Scrubyt
|
|
23
23
|
#textfield is 'q'
|
24
24
|
#
|
25
25
|
#_query_string_ - the string that should be entered into the textfield
|
26
|
-
def fill_textfield(textfield_name, query_string)
|
27
|
-
|
28
|
-
|
26
|
+
def fill_textfield(textfield_name, query_string, use_value = nil)
|
27
|
+
FetchAction.fill_textfield(textfield_name, query_string, 0, use_value)
|
28
|
+
end
|
29
|
+
|
30
|
+
def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
|
31
|
+
FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
|
29
32
|
end
|
30
33
|
|
31
34
|
##
|
32
35
|
#Action to fill a textarea with text
|
33
36
|
def fill_textarea(textarea_name, text)
|
34
|
-
|
35
|
-
eval("@current_form['#{textarea_name}'] = '#{text}'")
|
37
|
+
FetchAction.fill_textarea(textarea_name, text)
|
36
38
|
end
|
37
39
|
|
38
40
|
##
|
39
41
|
#Action for selecting an option from a dropdown box
|
40
42
|
def select_option(selectlist_name, option)
|
41
|
-
|
42
|
-
select_list = @current_form.fields.find {|f| f.name == selectlist_name}
|
43
|
-
searched_option = select_list.options.find{|f| f.text.strip == option}
|
44
|
-
searched_option.click
|
43
|
+
FetchAction.select_option(selectlist_name, option)
|
45
44
|
end
|
46
45
|
|
47
46
|
def check_checkbox(checkbox_name)
|
48
|
-
|
49
|
-
@current_form.checkboxes.name(checkbox_name).check
|
47
|
+
FetchAction.check_checkbox(checkbox_name)
|
50
48
|
end
|
51
49
|
|
52
50
|
def check_radiobutton(checkbox_name, index=0)
|
53
|
-
|
54
|
-
@current_form.radiobuttons.name(checkbox_name)[index].check
|
51
|
+
FetchAction.check_radiobutton(checkbox_name, index=0)
|
55
52
|
end
|
56
53
|
|
57
54
|
##
|
@@ -62,52 +59,37 @@ module Scrubyt
|
|
62
59
|
##
|
63
60
|
#Submit the current form
|
64
61
|
def submit(index=nil, type=nil)
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
FetchAction.submit(@current_form, button,type)
|
71
|
-
#-----------------------------------------
|
72
|
-
else
|
73
|
-
FetchAction.submit(@current_form, @current_form.buttons[index])
|
74
|
-
end
|
62
|
+
FetchAction.submit(nil, index, type)
|
63
|
+
end
|
64
|
+
|
65
|
+
def submit_and_wait(sleep_time, index=nil, type=nil)
|
66
|
+
FetchAction.submit(index, sleep_time, type)
|
75
67
|
end
|
76
68
|
|
77
69
|
##
|
78
70
|
#Click the link specified by the text
|
79
71
|
def click_link(link_spec,index=0)
|
80
|
-
FetchAction.click_link(link_spec,index)
|
72
|
+
FetchAction.click_link(link_spec,index, 0)
|
73
|
+
end
|
74
|
+
|
75
|
+
def click_link_and_wait(link_spec, sleep_secs=0)
|
76
|
+
FetchAction.click_link(link_spec, 0, sleep_secs)
|
77
|
+
end
|
78
|
+
|
79
|
+
def click_by_xpath(xpath)
|
80
|
+
FetchAction.click_by_xpath(xpath)
|
81
81
|
end
|
82
82
|
|
83
83
|
def click_image_map(index=0)
|
84
84
|
FetchAction.click_image_map(index)
|
85
85
|
end
|
86
86
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
end
|
94
|
-
|
95
|
-
def find_form_based_on_tag(tag, possible_attrs)
|
96
|
-
lookup_attribute_name = nil
|
97
|
-
lookup_attribute_value = nil
|
98
|
-
|
99
|
-
possible_attrs.each { |a|
|
100
|
-
lookup_attribute_name = a
|
101
|
-
lookup_attribute_value = tag.attributes[a]
|
102
|
-
break if lookup_attribute_value != nil
|
103
|
-
}
|
104
|
-
i = 0
|
105
|
-
loop do
|
106
|
-
@current_form = FetchAction.get_mechanize_doc.forms[i]
|
107
|
-
return nil if @current_form == nil
|
108
|
-
break if @current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
|
109
|
-
i+= 1
|
110
|
-
end
|
87
|
+
def frame(attribute,value)
|
88
|
+
FetchAction.frame(attribute,value)
|
89
|
+
end
|
90
|
+
|
91
|
+
def wait(time=1)
|
92
|
+
FetchAction.wait(time)
|
111
93
|
end
|
112
94
|
end
|
113
95
|
end
|
@@ -53,7 +53,6 @@ module Scrubyt
|
|
53
53
|
:constraints, :xpath, :regexp, :example, :final_result)
|
54
54
|
|
55
55
|
def self.create(parent_pattern, example=nil)
|
56
|
-
|
57
56
|
filter_name = (parent_pattern.type.to_s.split("_").map!{|e| e.capitalize }.join) + 'Filter'
|
58
57
|
if filter_name == 'RootFilter'
|
59
58
|
BaseFilter.new(parent_pattern, example)
|
@@ -76,14 +75,15 @@ module Scrubyt
|
|
76
75
|
end
|
77
76
|
end
|
78
77
|
|
79
|
-
def to_sexp
|
80
|
-
nil
|
81
|
-
end
|
82
|
-
|
83
78
|
private
|
84
79
|
#We don't want this to be accessible from outside
|
85
80
|
def initialize(parent_pattern, example)
|
86
|
-
|
81
|
+
case parent_pattern.example_type
|
82
|
+
when :xpath
|
83
|
+
@example_type = EXAMPLE_TYPE_XPATH
|
84
|
+
else
|
85
|
+
@example_type = BaseFilter.determine_example_type(example)
|
86
|
+
end
|
87
87
|
@parent_pattern = parent_pattern
|
88
88
|
@example = example
|
89
89
|
@xpath = nil #The xpath to evaluate this filter
|
@@ -7,12 +7,16 @@ module Scrubyt
|
|
7
7
|
else
|
8
8
|
url = XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
|
9
9
|
end
|
10
|
-
|
11
10
|
@parent_pattern.extractor.store_page
|
12
11
|
original_host_name = @parent_pattern.extractor.get_host_name
|
13
12
|
@parent_pattern.extractor.restore_host_name
|
14
13
|
|
15
|
-
|
14
|
+
begin
|
15
|
+
FetchAction.fetch url, :resolve => @parent_pattern.resolve
|
16
|
+
rescue
|
17
|
+
Scrubyt.log :ERROR, "Couldn't get page, probably returned 404 or 500 status code"
|
18
|
+
end
|
19
|
+
|
16
20
|
|
17
21
|
if @detail_extractor.nil?
|
18
22
|
@detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
|
@@ -29,9 +33,5 @@ module Scrubyt
|
|
29
33
|
root_results
|
30
34
|
end
|
31
35
|
|
32
|
-
def get_detail_sexp
|
33
|
-
[:block, *@detail_extractor.result.root_patterns.to_sexp_array]
|
34
|
-
end
|
35
|
-
|
36
36
|
end
|
37
37
|
end
|
@@ -9,7 +9,6 @@ module Scrubyt
|
|
9
9
|
index = @example.scan(/\]:(.+)/).flatten
|
10
10
|
index = 0 if index.empty?
|
11
11
|
index = index[0].to_i unless index[0] == "all"
|
12
|
-
|
13
12
|
result = (index.is_a? Fixnum) ? (SharedUtils.traverse_for_match(source,/#{text}/)[index]) : (SharedUtils.traverse_for_match(source,/#{text}/))
|
14
13
|
return "" unless result
|
15
14
|
|
@@ -22,7 +21,7 @@ module Scrubyt
|
|
22
21
|
|
23
22
|
def find_string(source)
|
24
23
|
str = @example.scan(/find\((.+)\)/).flatten[0]
|
25
|
-
strings_to_find = str.include?
|
24
|
+
strings_to_find = str.include?('|') ? str.split('|') : [str]
|
26
25
|
strings_to_find.each do |s|
|
27
26
|
result = SharedUtils.traverse_for_match(source,/#{s}/i)
|
28
27
|
return [s] unless result.empty?
|
@@ -30,9 +29,6 @@ module Scrubyt
|
|
30
29
|
return []
|
31
30
|
end
|
32
31
|
|
33
|
-
def to_sexp
|
34
|
-
[:str, @example]
|
35
|
-
end #end of method to_sexp
|
36
32
|
end #End of class TextFilter
|
37
33
|
end #End of module Scrubyt
|
38
34
|
|