scrubyt 0.3.4 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +31 -0
- data/README +1 -1
- data/Rakefile +4 -9
- data/lib/scrubyt.rb +37 -56
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +2 -183
- data/lib/scrubyt/core/navigation/navigation_actions.rb +30 -48
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +6 -6
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +6 -6
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +0 -4
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +0 -4
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +1 -5
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +0 -8
- data/lib/scrubyt/core/scraping/pattern.rb +6 -27
- data/lib/scrubyt/core/scraping/result_indexer.rb +0 -4
- data/lib/scrubyt/core/shared/extractor.rb +15 -1
- data/lib/scrubyt/output/result_node.rb +42 -6
- data/lib/scrubyt/output/scrubyt_result.rb +35 -30
- data/lib/scrubyt/utils/ruby_extensions.rb +0 -53
- data/lib/scrubyt/utils/xpathutils.rb +2 -1
- metadata +84 -119
- data/lib/scrubyt/output/export.rb +0 -157
@@ -8,103 +8,13 @@ module Scrubyt
|
|
8
8
|
#which is loading a document (even by submitting a form or clicking a link)
|
9
9
|
#and related things like setting a proxy etc. you should find it here.
|
10
10
|
module FetchAction
|
11
|
-
|
12
11
|
@@current_doc_url = nil
|
13
12
|
@@current_doc_protocol = nil
|
14
13
|
@@base_dir = nil
|
15
14
|
@@host_name = nil
|
16
|
-
@@agent = WWW::Mechanize.new
|
17
15
|
@@history = []
|
18
|
-
|
19
|
-
|
20
|
-
#Action to fetch a document (either a file or a http address)
|
21
|
-
#
|
22
|
-
#*parameters*
|
23
|
-
#
|
24
|
-
#_doc_url_ - the url or file name to fetch
|
25
|
-
def self.fetch(doc_url, *args)
|
26
|
-
#Refactor this crap!!! with option_accessor stuff
|
27
|
-
|
28
|
-
if args.size > 0
|
29
|
-
proxy = args[0][:proxy]
|
30
|
-
mechanize_doc = args[0][:mechanize_doc]
|
31
|
-
resolve = args[0][:resolve]
|
32
|
-
basic_auth = args[0][:basic_auth]
|
33
|
-
user_agent = args[0][:user_agent] || "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)"
|
34
|
-
#Refactor this whole stuff as well!!! It looks awful...
|
35
|
-
parse_and_set_proxy(proxy) if proxy
|
36
|
-
set_user_agent(user_agent)
|
37
|
-
parse_and_set_basic_auth(basic_auth) if basic_auth
|
38
|
-
else
|
39
|
-
mechanize_doc = nil
|
40
|
-
resolve = :full
|
41
|
-
end
|
42
|
-
|
43
|
-
@@current_doc_url = doc_url
|
44
|
-
@@current_doc_protocol = determine_protocol
|
45
|
-
|
46
|
-
if mechanize_doc.nil? && @@current_doc_protocol != 'file'
|
47
|
-
handle_relative_path(doc_url)
|
48
|
-
handle_relative_url(doc_url, resolve)
|
49
|
-
|
50
|
-
Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
|
51
|
-
|
52
|
-
unless 'file' == @@current_doc_protocol
|
53
|
-
@@mechanize_doc = @@agent.get(@@current_doc_url)
|
54
|
-
end
|
55
|
-
else
|
56
|
-
@@mechanize_doc = mechanize_doc
|
57
|
-
end
|
58
|
-
|
59
|
-
if @@current_doc_protocol == 'file'
|
60
|
-
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
|
61
|
-
else
|
62
|
-
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
|
63
|
-
store_host_name(self.get_current_doc_url) # in case we're on a new host
|
64
|
-
end
|
65
|
-
end
|
66
|
-
|
67
|
-
##
|
68
|
-
#Submit the last form;
|
69
|
-
def self.submit(current_form, button=nil, type=nil)
|
70
|
-
Scrubyt.log :ACTION, 'Submitting form...'
|
71
|
-
if button == nil
|
72
|
-
result_page = @@agent.submit(current_form)
|
73
|
-
elsif type
|
74
|
-
result_page = current_form.submit(button)
|
75
|
-
else
|
76
|
-
result_page = @@agent.submit(current_form, button)
|
77
|
-
end
|
78
|
-
@@current_doc_url = result_page.uri.to_s
|
79
|
-
Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
|
80
|
-
fetch(@@current_doc_url, :mechanize_doc => result_page)
|
81
|
-
end
|
82
|
-
|
83
|
-
##
|
84
|
-
#Click the link specified by the text
|
85
|
-
def self.click_link(link_spec,index = 0)
|
86
|
-
Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
|
87
|
-
if link_spec.is_a? Hash
|
88
|
-
clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
|
89
|
-
else
|
90
|
-
clicked_elem = SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
|
91
|
-
end
|
92
|
-
clicked_elem = XPathUtils.find_nearest_node_with_attribute(clicked_elem, 'href')
|
93
|
-
result_page = @@agent.click(clicked_elem)
|
94
|
-
@@current_doc_url = result_page.uri.to_s
|
95
|
-
Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
|
96
|
-
fetch(@@current_doc_url, :mechanize_doc => result_page)
|
97
|
-
end
|
98
|
-
|
99
|
-
def self.click_image_map(index = 0)
|
100
|
-
Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
|
101
|
-
uri = @@mechanize_doc.search("//area")[index]['href']
|
102
|
-
result_page = @@agent.get(uri)
|
103
|
-
@@current_doc_url = result_page.uri.to_s
|
104
|
-
Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
|
105
|
-
fetch(@@current_doc_url, :mechanize_doc => result_page)
|
106
|
-
end
|
107
|
-
|
16
|
+
@@current_form = nil
|
17
|
+
|
108
18
|
##
|
109
19
|
# At any given point, the current document can be queried with this method; Typically used
|
110
20
|
# when the navigation is over and the result document is passed to the wrapper
|
@@ -140,96 +50,5 @@ module Scrubyt
|
|
140
50
|
def store_host_name(doc_url)
|
141
51
|
FetchAction.store_host_name(doc_url)
|
142
52
|
end
|
143
|
-
|
144
|
-
def self.store_host_name(doc_url)
|
145
|
-
@@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
|
146
|
-
@@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
|
147
|
-
@@host_name = doc_url if @@host_name == nil
|
148
|
-
@@host_name = @@host_name[0..-2] if @@host_name[-1].chr == '/'
|
149
|
-
@@original_host_name ||= @@host_name
|
150
|
-
end #end of method store_host_name
|
151
|
-
|
152
|
-
def self.determine_protocol
|
153
|
-
old_protocol = @@current_doc_protocol
|
154
|
-
new_protocol = case @@current_doc_url
|
155
|
-
when /^https/
|
156
|
-
'https'
|
157
|
-
when /^http/
|
158
|
-
'http'
|
159
|
-
when /^www/
|
160
|
-
'http'
|
161
|
-
else
|
162
|
-
'file'
|
163
|
-
end
|
164
|
-
return 'http' if ((old_protocol == 'http') && new_protocol == 'file')
|
165
|
-
return 'https' if ((old_protocol == 'https') && new_protocol == 'file')
|
166
|
-
new_protocol
|
167
|
-
end
|
168
|
-
|
169
|
-
def self.parse_and_set_proxy(proxy)
|
170
|
-
if proxy.downcase == 'localhost'
|
171
|
-
@@host = 'localhost'
|
172
|
-
@@port = proxy.split(':').last
|
173
|
-
else
|
174
|
-
parts = proxy.split(':')
|
175
|
-
@@port = parts.delete_at(-1)
|
176
|
-
@@host = parts.join(':')
|
177
|
-
if (@@host == nil || @@port == nil)# !@@host =~ /^http/)
|
178
|
-
Scrubyt.log :ERROR, "Invalid proxy specification! Neither host nor port can be nil!"
|
179
|
-
exit
|
180
|
-
end
|
181
|
-
end
|
182
|
-
Scrubyt.log :ACTION, "Setting proxy: host=<#{@@host}>, port=<#{@@port}>"
|
183
|
-
@@agent.set_proxy(@@host, @@port)
|
184
|
-
end
|
185
|
-
|
186
|
-
def self.parse_and_set_basic_auth(basic_auth)
|
187
|
-
login, pass = basic_auth.split('@')
|
188
|
-
Scrubyt.log :ACTION, "Basic authentication: login=<#{login}>, pass=<#{pass}>"
|
189
|
-
@@agent.basic_auth(login, pass)
|
190
|
-
end
|
191
|
-
|
192
|
-
def self.set_user_agent(user_agent)
|
193
|
-
Scrubyt.log :ACTION, "Setting user-agent to #{user_agent}"
|
194
|
-
@@agent.user_agent = user_agent
|
195
|
-
end
|
196
|
-
|
197
|
-
def self.handle_relative_path(doc_url)
|
198
|
-
if @@base_dir == nil
|
199
|
-
@@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
|
200
|
-
else
|
201
|
-
@@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
|
202
|
-
end
|
203
|
-
end
|
204
|
-
|
205
|
-
def self.handle_relative_url(doc_url, resolve)
|
206
|
-
return if doc_url =~ /^http/
|
207
|
-
if doc_url !~ /^\//
|
208
|
-
first_char = doc_url[0..0]
|
209
|
-
doc_url = ( first_char == '?' ? '' : '/' ) + doc_url
|
210
|
-
if first_char == '?' #This is an ugly hack... really have to throw this shit out and go with mechanize's
|
211
|
-
current_uri = @@mechanize_doc.uri.to_s
|
212
|
-
current_uri = @@agent.history.first.uri.to_s if current_uri =~ /\/popup\//
|
213
|
-
if (current_uri.include? '?')
|
214
|
-
current_uri = current_uri.scan(/.+\//)[0]
|
215
|
-
else
|
216
|
-
current_uri += '/' unless current_uri[-1..-1] == '/'
|
217
|
-
end
|
218
|
-
@@current_doc_url = current_uri + doc_url
|
219
|
-
return
|
220
|
-
end
|
221
|
-
end
|
222
|
-
case resolve
|
223
|
-
when :full
|
224
|
-
@@current_doc_url = (@@host_name + doc_url) if ( @@host_name != nil && (doc_url !~ /#{@@host_name}/))
|
225
|
-
@@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
|
226
|
-
when :host
|
227
|
-
base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
|
228
|
-
@@current_doc_url = base_host_name + doc_url
|
229
|
-
else
|
230
|
-
#custom resolving
|
231
|
-
@@current_doc_url = resolve + doc_url
|
232
|
-
end
|
233
|
-
end
|
234
53
|
end
|
235
54
|
end
|
@@ -23,35 +23,32 @@ module Scrubyt
|
|
23
23
|
#textfield is 'q'
|
24
24
|
#
|
25
25
|
#_query_string_ - the string that should be entered into the textfield
|
26
|
-
def fill_textfield(textfield_name, query_string)
|
27
|
-
|
28
|
-
|
26
|
+
def fill_textfield(textfield_name, query_string, use_value = nil)
|
27
|
+
FetchAction.fill_textfield(textfield_name, query_string, 0, use_value)
|
28
|
+
end
|
29
|
+
|
30
|
+
def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
|
31
|
+
FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
|
29
32
|
end
|
30
33
|
|
31
34
|
##
|
32
35
|
#Action to fill a textarea with text
|
33
36
|
def fill_textarea(textarea_name, text)
|
34
|
-
|
35
|
-
eval("@current_form['#{textarea_name}'] = '#{text}'")
|
37
|
+
FetchAction.fill_textarea(textarea_name, text)
|
36
38
|
end
|
37
39
|
|
38
40
|
##
|
39
41
|
#Action for selecting an option from a dropdown box
|
40
42
|
def select_option(selectlist_name, option)
|
41
|
-
|
42
|
-
select_list = @current_form.fields.find {|f| f.name == selectlist_name}
|
43
|
-
searched_option = select_list.options.find{|f| f.text.strip == option}
|
44
|
-
searched_option.click
|
43
|
+
FetchAction.select_option(selectlist_name, option)
|
45
44
|
end
|
46
45
|
|
47
46
|
def check_checkbox(checkbox_name)
|
48
|
-
|
49
|
-
@current_form.checkboxes.name(checkbox_name).check
|
47
|
+
FetchAction.check_checkbox(checkbox_name)
|
50
48
|
end
|
51
49
|
|
52
50
|
def check_radiobutton(checkbox_name, index=0)
|
53
|
-
|
54
|
-
@current_form.radiobuttons.name(checkbox_name)[index].check
|
51
|
+
FetchAction.check_radiobutton(checkbox_name, index=0)
|
55
52
|
end
|
56
53
|
|
57
54
|
##
|
@@ -62,52 +59,37 @@ module Scrubyt
|
|
62
59
|
##
|
63
60
|
#Submit the current form
|
64
61
|
def submit(index=nil, type=nil)
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
FetchAction.submit(@current_form, button,type)
|
71
|
-
#-----------------------------------------
|
72
|
-
else
|
73
|
-
FetchAction.submit(@current_form, @current_form.buttons[index])
|
74
|
-
end
|
62
|
+
FetchAction.submit(nil, index, type)
|
63
|
+
end
|
64
|
+
|
65
|
+
def submit_and_wait(sleep_time, index=nil, type=nil)
|
66
|
+
FetchAction.submit(index, sleep_time, type)
|
75
67
|
end
|
76
68
|
|
77
69
|
##
|
78
70
|
#Click the link specified by the text
|
79
71
|
def click_link(link_spec,index=0)
|
80
|
-
FetchAction.click_link(link_spec,index)
|
72
|
+
FetchAction.click_link(link_spec,index, 0)
|
73
|
+
end
|
74
|
+
|
75
|
+
def click_link_and_wait(link_spec, sleep_secs=0)
|
76
|
+
FetchAction.click_link(link_spec, 0, sleep_secs)
|
77
|
+
end
|
78
|
+
|
79
|
+
def click_by_xpath(xpath)
|
80
|
+
FetchAction.click_by_xpath(xpath)
|
81
81
|
end
|
82
82
|
|
83
83
|
def click_image_map(index=0)
|
84
84
|
FetchAction.click_image_map(index)
|
85
85
|
end
|
86
86
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
end
|
94
|
-
|
95
|
-
def find_form_based_on_tag(tag, possible_attrs)
|
96
|
-
lookup_attribute_name = nil
|
97
|
-
lookup_attribute_value = nil
|
98
|
-
|
99
|
-
possible_attrs.each { |a|
|
100
|
-
lookup_attribute_name = a
|
101
|
-
lookup_attribute_value = tag.attributes[a]
|
102
|
-
break if lookup_attribute_value != nil
|
103
|
-
}
|
104
|
-
i = 0
|
105
|
-
loop do
|
106
|
-
@current_form = FetchAction.get_mechanize_doc.forms[i]
|
107
|
-
return nil if @current_form == nil
|
108
|
-
break if @current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
|
109
|
-
i+= 1
|
110
|
-
end
|
87
|
+
def frame(attribute,value)
|
88
|
+
FetchAction.frame(attribute,value)
|
89
|
+
end
|
90
|
+
|
91
|
+
def wait(time=1)
|
92
|
+
FetchAction.wait(time)
|
111
93
|
end
|
112
94
|
end
|
113
95
|
end
|
@@ -53,7 +53,6 @@ module Scrubyt
|
|
53
53
|
:constraints, :xpath, :regexp, :example, :final_result)
|
54
54
|
|
55
55
|
def self.create(parent_pattern, example=nil)
|
56
|
-
|
57
56
|
filter_name = (parent_pattern.type.to_s.split("_").map!{|e| e.capitalize }.join) + 'Filter'
|
58
57
|
if filter_name == 'RootFilter'
|
59
58
|
BaseFilter.new(parent_pattern, example)
|
@@ -76,14 +75,15 @@ module Scrubyt
|
|
76
75
|
end
|
77
76
|
end
|
78
77
|
|
79
|
-
def to_sexp
|
80
|
-
nil
|
81
|
-
end
|
82
|
-
|
83
78
|
private
|
84
79
|
#We don't want this to be accessible from outside
|
85
80
|
def initialize(parent_pattern, example)
|
86
|
-
|
81
|
+
case parent_pattern.example_type
|
82
|
+
when :xpath
|
83
|
+
@example_type = EXAMPLE_TYPE_XPATH
|
84
|
+
else
|
85
|
+
@example_type = BaseFilter.determine_example_type(example)
|
86
|
+
end
|
87
87
|
@parent_pattern = parent_pattern
|
88
88
|
@example = example
|
89
89
|
@xpath = nil #The xpath to evaluate this filter
|
@@ -7,12 +7,16 @@ module Scrubyt
|
|
7
7
|
else
|
8
8
|
url = XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
|
9
9
|
end
|
10
|
-
|
11
10
|
@parent_pattern.extractor.store_page
|
12
11
|
original_host_name = @parent_pattern.extractor.get_host_name
|
13
12
|
@parent_pattern.extractor.restore_host_name
|
14
13
|
|
15
|
-
|
14
|
+
begin
|
15
|
+
FetchAction.fetch url, :resolve => @parent_pattern.resolve
|
16
|
+
rescue
|
17
|
+
Scrubyt.log :ERROR, "Couldn't get page, probably returned 404 or 500 status code"
|
18
|
+
end
|
19
|
+
|
16
20
|
|
17
21
|
if @detail_extractor.nil?
|
18
22
|
@detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
|
@@ -29,9 +33,5 @@ module Scrubyt
|
|
29
33
|
root_results
|
30
34
|
end
|
31
35
|
|
32
|
-
def get_detail_sexp
|
33
|
-
[:block, *@detail_extractor.result.root_patterns.to_sexp_array]
|
34
|
-
end
|
35
|
-
|
36
36
|
end
|
37
37
|
end
|
@@ -9,7 +9,6 @@ module Scrubyt
|
|
9
9
|
index = @example.scan(/\]:(.+)/).flatten
|
10
10
|
index = 0 if index.empty?
|
11
11
|
index = index[0].to_i unless index[0] == "all"
|
12
|
-
|
13
12
|
result = (index.is_a? Fixnum) ? (SharedUtils.traverse_for_match(source,/#{text}/)[index]) : (SharedUtils.traverse_for_match(source,/#{text}/))
|
14
13
|
return "" unless result
|
15
14
|
|
@@ -22,7 +21,7 @@ module Scrubyt
|
|
22
21
|
|
23
22
|
def find_string(source)
|
24
23
|
str = @example.scan(/find\((.+)\)/).flatten[0]
|
25
|
-
strings_to_find = str.include?
|
24
|
+
strings_to_find = str.include?('|') ? str.split('|') : [str]
|
26
25
|
strings_to_find.each do |s|
|
27
26
|
result = SharedUtils.traverse_for_match(source,/#{s}/i)
|
28
27
|
return [s] unless result.empty?
|
@@ -30,9 +29,6 @@ module Scrubyt
|
|
30
29
|
return []
|
31
30
|
end
|
32
31
|
|
33
|
-
def to_sexp
|
34
|
-
[:str, @example]
|
35
|
-
end #end of method to_sexp
|
36
32
|
end #End of class TextFilter
|
37
33
|
end #End of module Scrubyt
|
38
34
|
|