andyverprauskus-scrubyt 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. data/CHANGELOG +355 -0
  2. data/COPYING +340 -0
  3. data/README.rdoc +121 -0
  4. data/Rakefile +101 -0
  5. data/lib/scrubyt.rb +53 -0
  6. data/lib/scrubyt/core/navigation/agents/firewatir.rb +318 -0
  7. data/lib/scrubyt/core/navigation/agents/mechanize.rb +312 -0
  8. data/lib/scrubyt/core/navigation/fetch_action.rb +63 -0
  9. data/lib/scrubyt/core/navigation/navigation_actions.rb +107 -0
  10. data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
  11. data/lib/scrubyt/core/scraping/constraint.rb +169 -0
  12. data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
  13. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
  14. data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
  15. data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
  16. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
  17. data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
  18. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
  19. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
  20. data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
  21. data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
  22. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
  23. data/lib/scrubyt/core/scraping/pattern.rb +359 -0
  24. data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
  25. data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
  26. data/lib/scrubyt/core/shared/extractor.rb +183 -0
  27. data/lib/scrubyt/logging.rb +154 -0
  28. data/lib/scrubyt/output/post_processor.rb +139 -0
  29. data/lib/scrubyt/output/result.rb +44 -0
  30. data/lib/scrubyt/output/result_dumper.rb +154 -0
  31. data/lib/scrubyt/output/result_node.rb +145 -0
  32. data/lib/scrubyt/output/scrubyt_result.rb +42 -0
  33. data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
  34. data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
  35. data/lib/scrubyt/utils/shared_utils.rb +58 -0
  36. data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
  37. data/lib/scrubyt/utils/xpathutils.rb +202 -0
  38. data/test/blackbox_test.rb +60 -0
  39. data/test/blackbox_tests/basic/multi_root.rb +6 -0
  40. data/test/blackbox_tests/basic/simple.rb +5 -0
  41. data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
  42. data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
  43. data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
  44. data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
  45. metadata +120 -0
@@ -0,0 +1,312 @@
1
+ require 'rubygems'
2
+ require 'mechanize'
3
+ module Scrubyt
4
+ ##
5
+ #=<tt>Fetching pages (and related functionality)</tt>
6
+ #
7
+ #Since a lot of things are happening during (and before)
8
+ #the fetching of a document, I decided to move out fetching related
9
+ #functionality to a separate class - so if you are looking for anything
10
+ #which is loading a document (even by submitting a form or clicking a link)
11
+ #and related things like setting a proxy etc. you should find it here.
12
+ module Navigation
13
+ module Mechanize
14
+
15
+ def self.included(base)
16
+ base.module_eval do
17
+ @@agent = WWW::Mechanize.new
18
+ @@current_doc_url = nil
19
+ @@current_doc_protocol = nil
20
+ @@base_dir = nil
21
+ @@host_name = nil
22
+ @@history = []
23
+
24
+ ##
25
+ #Action to fetch a document (either a file or a http address)
26
+ #
27
+ #*parameters*
28
+ #
29
+ #_doc_url_ - the url or file name to fetch
30
# Loads +doc_url+ and makes it the current document.
#
# doc_url - URL or file name to load.
# args    - optional; only args[0] is consulted and it is indexed like a
#           Hash with these keys:
#           :mechanize_doc - an already fetched page to adopt instead of
#                            issuing a new request
#           :html          - raw HTML, wrapped in a WWW::Mechanize::Page
#           :resolve       - handed to handle_relative_url; defaults to
#                            :full when absent (previously nil, which made
#                            relative URLs fail with nil + String)
#           :basic_auth    - forwarded to parse_and_set_basic_auth
#           :proxy         - forwarded to parse_and_set_proxy
#
# Updates @@current_doc_url, @@current_doc_protocol, @@mechanize_doc and
# @@hpricot_doc; for non-file documents the host name is stored as well.
def self.fetch(doc_url, *args)
  if args.empty?
    mechanize_doc = nil
    resolve = :full
  else
    options = args[0]
    mechanize_doc = options[:mechanize_doc]
    html = options[:html]
    resolve = options[:resolve] || :full
    basic_auth = options[:basic_auth]
    parse_and_set_basic_auth(basic_auth) if basic_auth
    proxy = options[:proxy]
    parse_and_set_proxy(proxy) if proxy
    if html
      # NOTE(review): this 'string' marker is overwritten by determine_protocol
      # a few lines below - confirm whether the :html path is still expected
      # to work with this agent.
      @@current_doc_protocol = 'string'
      mechanize_doc = WWW::Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html)
    end
  end

  @@current_doc_url = doc_url
  @@current_doc_protocol = determine_protocol

  if mechanize_doc.nil? && @@current_doc_protocol != 'file'
    # The two helpers below may rewrite @@current_doc_url before the request.
    handle_relative_path(doc_url)
    handle_relative_url(doc_url, resolve)
    Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
    @@mechanize_doc = @@agent.get(@@current_doc_url)
  else
    @@mechanize_doc = mechanize_doc
  end

  if @@current_doc_protocol == 'file'
    # Block form closes the handle as soon as the content has been read.
    contents = open(@@current_doc_url) { |io| io.read }
    @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(contents))
  else
    @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
    store_host_name(self.get_current_doc_url) # in case we're on a new host
  end
end
71
+
72
+ ##
73
+ #Submit the last form;
74
# Submits @@current_form and fetches the page that comes back.
#
# index      - nil submits the form without a button; a String is treated as
#              the name of the button to press; anything else is used as a
#              position in @@current_form.buttons.
# sleep_time - accepted for signature compatibility; not used in this method.
# type       - only relevant for a named button: when truthy the button is
#              submitted through the form itself (see process_submit).
#
# A String index used to be honoured only when +type+ was given; otherwise it
# was used as an Array index and raised TypeError. It is now always looked up
# by name.
def self.submit(index=nil, sleep_time=nil, type=nil)
  Scrubyt.log :ACTION, 'Submitting form...'
  result_page =
    if index.nil?
      process_submit(@@current_form)
    elsif index.is_a?(String)
      #----- added by nickmerwin@gmail.com -----
      button = @@current_form.buttons.detect { |b| b.name == index }
      process_submit(@@current_form, button, type)
    else
      @@agent.submit(@@current_form, @@current_form.buttons[index])
    end
  @@current_doc_url = result_page.uri.to_s
  Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
  fetch(@@current_doc_url, :mechanize_doc => result_page)
end
92
+
93
+ ##
94
+ #Click the link specified by the text
95
# Follows a link on the current document and fetches the resulting page.
#
# link_spec - a Hash is resolved through CompoundExampleLookup, anything
#             else through SimpleExampleLookup (text lookup).
# index     - forwarded to the lookup helper.
# wait_secs - accepted but not used in this method.
def self.click_link(link_spec,index = 0,wait_secs=0)
  Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
  example_node =
    if link_spec.is_a?(Hash)
      CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
    else
      SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
    end
  # The matched node may not carry the href itself, so ask XPathUtils for the
  # nearest node that does.
  link_node = XPathUtils.find_nearest_node_with_attribute(example_node, 'href')
  landing_page = @@agent.click(link_node)
  @@current_doc_url = landing_page.uri.to_s
  Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
  fetch(@@current_doc_url, :mechanize_doc => landing_page)
end
108
+
109
# Requests the href of the index-th <area> element of the current page and
# fetches the page that is returned.
#
# index - position among the nodes matched by "//area" (default 0).
def self.click_image_map(index = 0)
  Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
  area = @@mechanize_doc.search("//area")[index]
  landing_page = @@agent.get(area['href'])
  @@current_doc_url = landing_page.uri.to_s
  Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
  fetch(@@current_doc_url, :mechanize_doc => landing_page)
end
117
+
118
# Records the scheme and host part of the current page in @@host_name.
#
# doc_url - fallback value used when no host name is known yet.
#
# For http/https documents the host is taken from @@mechanize_doc.uri. The
# capture group (index 1) is used: index 0 is the whole match and already
# contains the scheme, which produced "http://http://host/". When the URI
# does not match, the previously stored host name is kept instead of raising.
# The first host name ever stored is remembered in @@original_host_name.
def self.store_host_name(doc_url)
  protocol = @@current_doc_protocol
  if protocol == 'http' || protocol == 'https'
    authority = @@mechanize_doc.uri.to_s[%r{#{protocol}://([^/]+)}, 1]
    @@host_name = "#{protocol}://#{authority}" if authority
  end
  @@host_name = doc_url if @@host_name.nil?
  # Drop a single trailing slash; guarded so a nil or empty host cannot raise.
  @@host_name = @@host_name[0..-2] if @@host_name && @@host_name[-1, 1] == '/'
  @@original_host_name ||= @@host_name
end #end of method store_host_name
125
+
126
# Splits a proxy specification into its parts and configures @@agent.
#
# proxy - String in one of the forms handled below:
#           anything containing "localhost" -> host 'localhost', port is the
#                                              text after the last ':'
#           user:pass@host:port
#           user@host:port
#           host:port
#
# Stores the parts in @@host, @@port, @@proxy_user and @@proxy_pass. For the
# non-localhost forms a missing host or port prints a message and calls exit.
# The password is no longer written to the log in clear text.
def self.parse_and_set_proxy(proxy)
  @@proxy_user = @@proxy_pass = nil
  if proxy.downcase.include?('localhost')
    @@host = 'localhost'
    @@port = proxy.split(':').last
  else
    parts = proxy.split(':')
    if parts.size > 2
      # user:pass@host:port -> ["user", "pass@host", "port"]
      @@proxy_user = parts[0]
      @@proxy_pass, @@host = parts[1].split('@')
      @@port = parts[2]
    elsif parts[0].include?('@')
      # user@host:port
      @@proxy_user, @@host = parts[0].split('@')
      @@port = parts[1]
    else
      @@host, @@port = parts
    end

    if @@host.nil? || @@port.nil?
      puts "Invalid proxy specification..."
      puts "neither host nor port can be nil!"
      exit
    end
  end
  # Never log the real password; only indicate whether one was supplied.
  masked_pass = @@proxy_pass && '********'
  Scrubyt.log :ACTION, "[ACTION] Setting proxy: host=<#{@@host}>, port=<#{@@port}>, username=<#{@@proxy_user}>, password=<#{masked_pass}>"
  @@agent.set_proxy(@@host, @@port, @@proxy_user, @@proxy_pass)
end
160
+
161
# Classifies @@current_doc_url as 'https', 'http' or 'file'.
#
# URLs starting with "https" are 'https'; those starting with "http" or
# "www" are 'http'; everything else is 'file'. A 'file' result is replaced
# by the previous protocol when that was 'http' or 'https'.
#
# Returns the protocol String; does not modify any state itself.
def self.determine_protocol
  previous = @@current_doc_protocol
  detected = case @@current_doc_url
             when /^https/ then 'https'
             when /^http/, /^www/ then 'http'
             else 'file'
             end
  return detected unless detected == 'file'
  %w[http https].include?(previous) ? previous : detected
end
177
+
178
# Maintains @@base_dir and prefixes it to URLs that do not contain it yet.
#
# doc_url - the URL or file name passed to fetch.
#
# While no base dir is known it is derived from +doc_url+ (everything up to
# the last '/'), but only for the 'file' protocol. Once it is known,
# @@current_doc_url becomes base dir + doc_url unless doc_url already
# contains the base dir. The previous version assigned nil to
# @@current_doc_url in that last case, and matched the base dir as an
# unescaped regular expression.
def self.handle_relative_path(doc_url)
  if @@base_dir.nil?
    @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
  elsif !doc_url.include?(@@base_dir)
    @@current_doc_url = @@base_dir + doc_url
  end
end
185
+
186
# Turns a relative +doc_url+ into an absolute @@current_doc_url.
#
# doc_url - the URL as given by the caller; URLs starting with "http" are
#           left untouched.
# resolve - :full prefixes @@host_name, :host prefixes only the
#           scheme-and-host part of @@host_name; any other value is used
#           as the prefix itself.
def self.handle_relative_url(doc_url, resolve)
  return if doc_url =~ /^http/

  unless doc_url =~ /^\//
    if doc_url[0..0] == '?'
      # Query-only URL: append it to the directory part of the current page.
      # This is an ugly hack; mechanize's own resolution would be preferable.
      base_uri = @@mechanize_doc.uri.to_s
      base_uri = @@agent.history.first.uri.to_s if base_uri =~ /\/popup\//
      if base_uri.include?('?')
        base_uri = base_uri.scan(/.+\//)[0]
      elsif base_uri[-1..-1] != '/'
        base_uri += '/'
      end
      @@current_doc_url = base_uri + doc_url
      return
    end
    doc_url = '/' + doc_url
  end

  case resolve
  when :full
    if !@@host_name.nil? && doc_url !~ /#{@@host_name}/
      @@current_doc_url = @@host_name + doc_url
    end
    # Removes repeated '/'-separated segments from the URL.
    @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
  when :host
    base_host_name =
      if @@host_name.count("/") == 2
        @@host_name
      else
        @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0]
      end
    @@current_doc_url = base_host_name + doc_url
  else
    # custom resolving: the caller supplied the prefix
    @@current_doc_url = resolve + doc_url
  end
end
215
+
216
# Sets the value of a text field in the form that contains it.
#
# textfield_name - value of the field's name attribute.
# query_string   - text to enter.
# unused         - extra positional arguments are accepted and ignored.
#
# The value is assigned directly rather than through eval on an
# interpolated string, which executed arbitrary Ruby for crafted input and
# broke on any value containing a single quote. Both arguments are converted
# with to_s, matching what the interpolation used to do.
def self.fill_textfield(textfield_name, query_string, *unused)
  lookup_form_for_tag('input','textfield',textfield_name,query_string)
  if @@current_form
    @@current_form[textfield_name.to_s] = query_string.to_s
  else
    Scrubyt.log :ERROR, "Couldn't find the form that contains this textfield. Please report a bug!"
  end
end
224
+
225
+ ##
226
+ #Action to fill a textarea with text
227
# Sets the value of a textarea in the form that contains it.
#
# textarea_name - value of the textarea's name attribute.
# text          - text to enter.
#
# Assigns the value directly instead of evaluating an interpolated string
# (code injection, failure on single quotes). A missing form is logged as an
# error, the same way fill_textfield handles it, instead of raising
# NoMethodError on nil.
def self.fill_textarea(textarea_name, text)
  lookup_form_for_tag('textarea','textarea',textarea_name,text)
  if @@current_form
    @@current_form[textarea_name.to_s] = text.to_s
  else
    Scrubyt.log :ERROR, "Couldn't find the form that contains this textarea. Please report a bug!"
  end
end
231
+
232
+ ##
233
+ #Action for selecting an option from a dropdown box
234
# Clicks one option of a select list in the current form.
#
# selectlist_name - value of the select element's name attribute.
# option          - option text to pick; compared against the stripped text
#                   of every option.
def self.select_option(selectlist_name, option)
  lookup_form_for_tag('select','select list',selectlist_name,option)
  dropdown = @@current_form.fields.find { |field| field.name == selectlist_name }
  wanted = dropdown.options.find { |candidate| candidate.text.strip == option }
  wanted.click
end
240
+
241
# Checks the checkbox with the given name in the form that contains it.
#
# checkbox_name - value of the checkbox's name attribute.
def self.check_checkbox(checkbox_name)
  lookup_form_for_tag('input','checkbox',checkbox_name, '')
  named_boxes = @@current_form.checkboxes.name(checkbox_name)
  named_boxes.check
end
245
+
246
# Checks one radio button out of the group sharing the given name.
#
# checkbox_name - value of the radio buttons' name attribute.
# index         - which button of the group to check (default 0); also
#                 handed to lookup_form_for_tag.
def self.check_radiobutton(checkbox_name, index=0)
  lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
  group = @@current_form.radiobuttons.name(checkbox_name)
  group[index].check
end
250
+
251
+ #private
252
# Performs the actual form submission and returns the resulting page.
#
# current_form - the form to submit.
# button       - button to press; nil submits without a button.
# type         - when truthy (and a button is given) the form's own submit
#                is used instead of the agent's.
#
# Does not touch @@current_doc_url or fetch anything; callers do that.
def self.process_submit(current_form, button=nil, type=nil)
  return @@agent.submit(current_form) if button.nil?
  return current_form.submit(button) if type

  @@agent.submit(current_form, button)
end
265
+
266
# Finds the form that encloses a named widget and stores it in
# @@current_form.
#
# tag            - element name of the widget ('input', 'select', ...).
# widget_name    - human readable widget kind, used only in the log line.
# name_attribute - value of the widget's name attribute.
# query_string   - value about to be entered, used only in the log line.
# index          - which match to use when several widgets share the name.
#
# Every form of the mechanize document is assigned to @@current_form in
# turn, stopping at the first one whose form_node equals the looked-up
# element; if none compares equal the last form stays selected.
#
# Changes from the previous version: the debugging banners written to stdout
# are gone, the XPath is generated once instead of twice, and the match list
# is indexed through to_a (map without a block returns an Enumerator on
# newer Rubies, which cannot be indexed).
def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
  Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
  widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").to_a[index]
  form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
  form_xpath = Scrubyt::XPathUtils.generate_XPath(form_tag, nil, true)
  form_element = FetchAction.get_mechanize_doc/form_xpath

  FetchAction.get_mechanize_doc.forms.each do |f|
    @@current_form = f
    break if f.form_node == form_element
  end
end
284
+
285
# Selects the mechanize form whose node carries the same identifying
# attribute value as +tag+ and stores it in @@current_form.
#
# tag            - node whose attributes identify the form.
# possible_attrs - attribute names to try in order; the first one that has a
#                  value on +tag+ is used for the comparison.
#
# Forms are visited in document order. When no form matches,
# @@current_form ends up nil. Always returns nil.
def self.find_form_based_on_tag(tag, possible_attrs)
  attr_name = nil
  attr_value = nil
  possible_attrs.each do |candidate|
    attr_name = candidate
    attr_value = tag.attributes[candidate]
    break unless attr_value.nil?
  end

  position = 0
  while (@@current_form = FetchAction.get_mechanize_doc.forms[position])
    break if @@current_form.form_node.attributes[attr_name].to_s == attr_value
    position += 1
  end
  nil
end
308
+ end
309
+ end
310
+ end
311
+ end
312
+ end
@@ -0,0 +1,63 @@
1
module Scrubyt
  ##
  #=<tt>Fetching pages (and related functionality)</tt>
  #
  #Holds the state shared by everything that loads a document: the current
  #URL and protocol, the host name, a page history, the current form and the
  #extractor. The module-level readers below expose that state; the instance
  #methods are available to whatever mixes this module in.
  #
  #NOTE(review): @@mechanize_doc, @@hpricot_doc and @@original_host_name are
  #read here but never assigned in this module - presumably the navigation
  #agent assigns them; confirm before calling the readers ahead of a fetch.
  module FetchAction
    @@current_doc_url = nil
    @@current_doc_protocol = nil
    @@base_dir = nil
    @@host_name = nil
    @@history = []
    @@current_form = nil
    @@extractor = nil

    # Stores the extractor object.
    def self.extractor=(extractor)
      @@extractor = extractor
    end

    # Returns the extractor stored with extractor= (nil until one is set).
    def self.extractor
      @@extractor
    end

    ##
    # At any given point, the URL of the current document can be queried with
    # this method; typically used when the navigation is over and the result
    # document is passed to the wrapper.
    def self.get_current_doc_url
      @@current_doc_url
    end

    # Returns the value of @@mechanize_doc.
    def self.get_mechanize_doc
      @@mechanize_doc
    end

    # Returns the value of @@hpricot_doc.
    def self.get_hpricot_doc
      @@hpricot_doc
    end

    # Returns the currently stored host name.
    def get_host_name
      @@host_name
    end

    # Resets the host name to @@original_host_name; does nothing for the
    # 'file' protocol.
    def restore_host_name
      @@host_name = @@original_host_name unless @@current_doc_protocol == 'file'
    end

    # Pushes the current hpricot document onto the history stack.
    def store_page
      @@history.push(@@hpricot_doc)
    end

    # Pops the most recently stored document and makes it current again.
    def restore_page
      @@hpricot_doc = @@history.pop
    end

    # Instance-level convenience that delegates to the module-level
    # FetchAction.store_host_name (not defined in this module).
    def store_host_name(doc_url)
      FetchAction.store_host_name(doc_url)
    end
  end
end
@@ -0,0 +1,107 @@
1
module Scrubyt
  ##
  #=<tt>Describing actions which interact with the page</tt>
  #
  #This module contains all the actions that are used to navigate on web
  #pages; first of all, *fetch* for downloading the pages - then various
  #actions like filling textfields, submitting forms, clicking links and more.
  #
  #Every action is a thin wrapper that forwards to the method of the same
  #name on FetchAction.
  module NavigationActions

    # Hook run when an object is extended with this module: gives the object
    # a @current_form instance variable initialised to nil.
    def self.extend_object(obj)
      super(obj)
      obj.instance_eval do
        @current_form = nil
      end
    end

    ##
    #Action to fill a textfield with a query string
    #
    #*parameters*
    #
    #_textfield_name_ - the name of the textfield (e.g. the name of the google
    #search textfield is 'q')
    #
    #_query_string_ - the string that should be entered into the textfield
    #
    #_use_value_ - forwarded unchanged to FetchAction.fill_textfield
    def fill_textfield(textfield_name, query_string, use_value = nil)
      FetchAction.fill_textfield(textfield_name, query_string, 0, use_value)
    end

    # Same as fill_textfield, but also forwards +sleep_secs+.
    def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
      FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
    end

    ##
    #Action to fill a textarea with text
    def fill_textarea(textarea_name, text)
      FetchAction.fill_textarea(textarea_name, text)
    end

    ##
    #Action for selecting an option from a dropdown box
    def select_option(selectlist_name, option)
      FetchAction.select_option(selectlist_name, option)
    end

    # Checks the checkbox with the given name.
    def check_checkbox(checkbox_name)
      FetchAction.check_checkbox(checkbox_name)
    end

    # Checks the index-th radio button with the given name. The index is now
    # really forwarded; it used to be passed as `index=0`, an assignment that
    # always sent 0.
    def check_radiobutton(checkbox_name, index=0)
      FetchAction.check_radiobutton(checkbox_name, index)
    end

    ##
    #Fetch the document
    def fetch(*args)
      FetchAction.fetch(*args)
    end

    # Forwards to FetchAction.use_current_page.
    def use_current_page
      FetchAction.use_current_page
    end

    ##
    #Submit the current form
    #
    #_index_ - which button to use (nil for none)
    #
    #_type_ - forwarded unchanged
    #
    #Arguments are passed as (index, sleep_time, type), the same order
    #submit_and_wait uses; previously the index ended up in the sleep_time
    #slot and was ignored.
    def submit(index=nil, type=nil)
      FetchAction.submit(index, nil, type)
    end

    # Like submit, with an additional +sleep_time+ that is forwarded.
    def submit_and_wait(sleep_time, index=nil, type=nil)
      FetchAction.submit(index, sleep_time, type)
    end

    ##
    #Click the link specified by the text
    def click_link(link_spec,index=0)
      FetchAction.click_link(link_spec,index, 0)
    end

    # Like click_link for the first match, forwarding +sleep_secs+.
    def click_link_and_wait(link_spec, sleep_secs=0)
      FetchAction.click_link(link_spec, 0, sleep_secs)
    end

    # Forwards to FetchAction.click_by_xpath_if_exists.
    def click_by_xpath_if_exists(xpath, sleep_secs=0)
      FetchAction.click_by_xpath_if_exists(xpath, sleep_secs)
    end

    # Forwards to FetchAction.click_by_xpath.
    def click_by_xpath(xpath)
      FetchAction.click_by_xpath(xpath)
    end

    # Forwards to FetchAction.click_by_xpath together with +secs+.
    def click_by_xpath_and_wait(xpath, secs)
      FetchAction.click_by_xpath(xpath, secs)
    end

    # Clicks the index-th area of an image map.
    def click_image_map(index=0)
      FetchAction.click_image_map(index)
    end

    # Forwards to FetchAction.frame.
    def frame(attribute,value)
      FetchAction.frame(attribute,value)
    end

    # Forwards to FetchAction.wait (default argument 1).
    def wait(time=1)
      FetchAction.wait(time)
    end
  end
end