jspradlin-scrubyt 0.4.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data/CHANGELOG +343 -0
  2. data/COPYING +340 -0
  3. data/README +120 -0
  4. data/Rakefile +101 -0
  5. data/lib/scrubyt.rb +45 -0
  6. data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
  7. data/lib/scrubyt/core/navigation/agents/mechanize.rb +289 -0
  8. data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
  9. data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
  10. data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
  11. data/lib/scrubyt/core/scraping/constraint.rb +169 -0
  12. data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
  13. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
  14. data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
  15. data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
  16. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
  17. data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
  18. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
  19. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
  20. data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
  21. data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
  22. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
  23. data/lib/scrubyt/core/scraping/pattern.rb +359 -0
  24. data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
  25. data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
  26. data/lib/scrubyt/core/shared/extractor.rb +167 -0
  27. data/lib/scrubyt/logging.rb +154 -0
  28. data/lib/scrubyt/output/post_processor.rb +139 -0
  29. data/lib/scrubyt/output/result.rb +44 -0
  30. data/lib/scrubyt/output/result_dumper.rb +154 -0
  31. data/lib/scrubyt/output/result_node.rb +142 -0
  32. data/lib/scrubyt/output/scrubyt_result.rb +42 -0
  33. data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
  34. data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
  35. data/lib/scrubyt/utils/shared_utils.rb +58 -0
  36. data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
  37. data/lib/scrubyt/utils/xpathutils.rb +202 -0
  38. data/test/blackbox_test.rb +60 -0
  39. data/test/blackbox_tests/basic/multi_root.rb +6 -0
  40. data/test/blackbox_tests/basic/simple.rb +5 -0
  41. data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
  42. data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
  43. data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
  44. data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
  45. metadata +117 -0
@@ -0,0 +1,289 @@
1
+ require 'rubygems'
2
+ require 'mechanize'
3
+ module Scrubyt
4
+ ##
5
+ #=<tt>Fetching pages (and related functionality)</tt>
6
+ #
7
+ #Since a lot of things are happening during (and before)
8
+ #the fetching of a document, I decided to move out fetching related
9
+ #functionality to a separate class - so if you are looking for anything
10
+ #which is loading a document (even by submitting a form or clicking a link)
11
+ #and related things like setting a proxy etc. you should find it here.
12
+ module Navigation
13
+ module Mechanize
14
+
15
+ def self.included(base)
16
+ base.module_eval do
17
+ @@agent = WWW::Mechanize.new
18
+ @@current_doc_url = nil
19
+ @@current_doc_protocol = nil
20
+ @@base_dir = nil
21
+ @@host_name = nil
22
+ @@history = []
23
+
24
##
# Action to fetch a document (either a file or an http address) and make it
# the current document.
#
# *parameters*
#
# _doc_url_ - the url or file name to fetch
#
# _args_ - optional; when present, the first element is read as an options
# hash with the keys :mechanize_doc (an already loaded page that is used
# instead of downloading), :html (a string that is wrapped into a page),
# :resolve (:full, :host or a custom prefix for relative urls; defaults to
# :full), :basic_auth and :proxy
def self.fetch(doc_url, *args)
  options = args.first || {}
  mechanize_doc = options[:mechanize_doc]
  # Default to :full even when other options are given; otherwise a relative
  # url combined with e.g. only :proxy would be resolved with a nil prefix.
  resolve = options[:resolve] || :full

  basic_auth = options[:basic_auth]
  parse_and_set_basic_auth(basic_auth) if basic_auth
  proxy = options[:proxy]
  parse_and_set_proxy(proxy) if proxy

  html = options[:html]
  if html
    @@current_doc_protocol = 'string'
    mechanize_doc = WWW::Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html)
  end

  @@current_doc_url = doc_url
  @@current_doc_protocol = determine_protocol

  if mechanize_doc.nil? && @@current_doc_protocol != 'file'
    handle_relative_path(doc_url)
    handle_relative_url(doc_url, resolve)
    Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
    @@mechanize_doc = @@agent.get(@@current_doc_url)
  else
    @@mechanize_doc = mechanize_doc
  end

  if @@current_doc_protocol == 'file'
    # NOTE(review): when :html is passed together with a doc_url that does not
    # look like an http(s)/www address, the protocol ends up as 'file' and this
    # branch reads doc_url from disk instead of using the html - confirm intent.
    # NOTE(review): Kernel#open interprets a leading '|' as a command; do not
    # pass untrusted file names here.
    # The block form closes the handle as soon as the content has been read.
    source = open(@@current_doc_url) { |io| io.read }
    @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(source))
  else
    @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
    store_host_name(self.get_current_doc_url) # in case we're on a new host
  end
end
71
+
72
##
# Submit the current form and fetch the page the submission leads to.
#
# _index_ - nil submits the form without a button; a String (together with
# a non-nil _type_) selects the button by its name; any other value is used
# to index into the button list of the form
#
# _sleep_time_ - accepted for interface compatibility, not used here
#
# _type_ - when given together with a String _index_, the form is submitted
# through the form object itself instead of through the agent
def self.submit(index=nil, sleep_time=nil, type=nil)
  Scrubyt.log :ACTION, 'Submitting form...'
  # Submit exactly once per call. Additionally delegating to process_submit
  # would send the request a second time, because process_submit performs
  # its own submission and fetch.
  result_page =
    if index.nil?
      @@agent.submit(@@current_form)
    elsif index.is_a?(String) && !type.nil?
      # button lookup by name (contributed by nickmerwin@gmail.com)
      button = @@current_form.buttons.detect { |b| b.name == index }
      @@current_form.submit(button)
    else
      @@agent.submit(@@current_form, @@current_form.buttons[index])
    end
  @@current_doc_url = result_page.uri.to_s
  Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
  fetch(@@current_doc_url, :mechanize_doc => result_page)
end
92
+
93
##
# Follow the link identified by _link_spec_ and make the target page the
# current document.
#
# _link_spec_ - a Hash is looked up as a compound example, anything else is
# looked up as the text of the link
#
# _index_ - passed on to the example lookup
#
# _wait_secs_ - not used by this implementation
def self.click_link(link_spec,index = 0,wait_secs=0)
  Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
  example_node = if link_spec.is_a?(Hash)
    CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
  else
    SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
  end
  # the agent is handed the node XPathUtils returns for the 'href' attribute
  link_node = XPathUtils.find_nearest_node_with_attribute(example_node, 'href')
  target_page = @@agent.click(link_node)
  @@current_doc_url = target_page.uri.to_s
  Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
  fetch(@@current_doc_url, :mechanize_doc => target_page)
end
108
+
109
##
# Load the target of an image map area and make it the current document.
#
# _index_ - position of the <area> element among all areas of the current
# page
def self.click_image_map(index = 0)
  Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
  area = @@mechanize_doc.search("//area")[index]
  target_page = @@agent.get(area['href'])
  @@current_doc_url = target_page.uri.to_s
  Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
  fetch(@@current_doc_url, :mechanize_doc => target_page)
end
117
+
118
##
# Remember the host part (scheme plus everything up to the first slash of
# the path, without a trailing slash) of the current document.
#
# _doc_url_ - used as the host name if no host name has been stored yet and
# none could be derived from the current page
def self.store_host_name(doc_url)
  if ['http', 'https'].include?(@@current_doc_protocol)
    # Take only the captured authority: the whole match already contains the
    # scheme and would produce values like 'http://http://host/'. No slash is
    # required after the host, so urls without a path are handled as well.
    authority = @@mechanize_doc.uri.to_s[%r{\A#{@@current_doc_protocol}://([^/]+)}, 1]
    @@host_name = "#{@@current_doc_protocol}://#{authority}" if authority
  end
  @@host_name = doc_url if @@host_name.nil?
  # to_s keeps the check safe when neither a host nor a doc_url is available
  @@host_name = @@host_name[0..-2] if @@host_name.to_s[-1, 1] == '/'
  @@original_host_name ||= @@host_name
end #end of method store_host_name
125
+
126
##
# Parse a proxy specification and configure the agent with it.
#
# _proxy_ - one of 'host:port', 'user@host:port' or 'user:pass@host:port'.
# A specification containing 'localhost' is treated specially: the host is
# 'localhost' and the port is whatever follows the last colon.
#
# Prints a message and exits the process if host or port are missing.
def self.parse_and_set_proxy(proxy)
  @@proxy_user = @@proxy_pass = nil
  if proxy.downcase.include?('localhost')
    # NOTE(review): credentials in a localhost specification are ignored and a
    # specification without a colon yields the port 'localhost' - confirm.
    @@host = 'localhost'
    @@port = proxy.split(':').last
  else
    parts = proxy.split(':')
    if parts.size > 2
      # user:pass@host:port
      password, host = parts[1].split('@')
      @@proxy_user = parts[0]
      @@proxy_pass = password
      @@host = host
      @@port = parts[2]
    elsif parts[0].to_s.include?('@')
      # user@host:port; to_s lets an empty specification reach the check below
      @@proxy_user, @@host = parts[0].split('@')
      @@port = parts[1]
    else
      # host:port
      @@host = parts[0]
      @@port = parts[1]
    end

    if @@host.nil? || @@port.nil?
      puts "Invalid proxy specification..."
      puts "neither host nor port can be nil!"
      exit
    end
  end
  # never write the real password to the log
  logged_pass = @@proxy_pass.nil? ? nil : '********'
  Scrubyt.log :ACTION, "[ACTION] Setting proxy: host=<#{@@host}>, port=<#{@@port}>, username=<#{@@proxy_user}>, password=<#{logged_pass}>"
  @@agent.set_proxy(@@host, @@port, @@proxy_user, @@proxy_pass)
end
160
+
161
##
# Work out the protocol of the current document url.
#
# Returns 'https' or 'http' for urls starting with https, http or www, and
# 'file' for everything else - unless the previous protocol was 'http' or
# 'https', in which case that protocol is returned again.
def self.determine_protocol
  previous = @@current_doc_protocol
  url = @@current_doc_url
  detected =
    if /^https/ === url
      'https'
    elsif /^http/ === url || /^www/ === url
      'http'
    else
      'file'
    end
  if detected == 'file'
    # a url without a scheme keeps the web protocol that was active before
    sticky = ['http', 'https'].find { |web| previous == web }
    return sticky if sticky
  end
  detected
end
177
+
178
##
# Resolve _doc_url_ against the stored base directory.
#
# While no base directory is stored, it is taken from _doc_url_ (everything
# up to and including the last slash) if the current protocol is 'file'.
# Once a base directory exists, it is prepended to _doc_url_ unless
# _doc_url_ already contains it.
def self.handle_relative_path(doc_url)
  if @@base_dir.nil?
    @@base_dir = doc_url[/.+\//] if @@current_doc_protocol == 'file'
  elsif !doc_url.include?(@@base_dir)
    # Only assign when there is something to prepend; the current url must
    # not be reset to nil when doc_url already contains the base directory.
    # A literal substring test is used so that characters like '.' or '(' in
    # the directory name are not interpreted as a regular expression.
    @@current_doc_url = @@base_dir + doc_url
  end
end
185
+
186
##
# Turn a relative _doc_url_ into the url stored as the current document url.
#
# Nothing happens for urls starting with http. A url starting with '?' is
# appended to the url of the current page. Every other url gets a leading
# slash if it has none and is then resolved according to _resolve_:
# :full prepends the stored host name, :host prepends only the scheme and
# host part of it, any other value is used as a custom prefix.
def self.handle_relative_url(doc_url, resolve)
  return if doc_url =~ /^http/

  if doc_url[0..0] == '?'
    # query-only url: resolve it against the page we are currently on
    base_uri = @@mechanize_doc.uri.to_s
    base_uri = @@agent.history.first.uri.to_s if base_uri =~ /\/popup\//
    if base_uri.include?('?')
      base_uri = base_uri.scan(/.+\//)[0]
    elsif base_uri[-1..-1] != '/'
      base_uri += '/'
    end
    @@current_doc_url = base_uri + doc_url
    return
  end

  doc_url = '/' + doc_url if doc_url !~ /^\//

  if resolve == :full
    unless @@host_name.nil? || doc_url =~ /#{@@host_name}/
      @@current_doc_url = @@host_name + doc_url
    end
    # drops repeated segments of the url
    @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
  elsif resolve == :host
    scheme_and_host = if @@host_name.count("/") == 2
      @@host_name
    else
      @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0]
    end
    @@current_doc_url = scheme_and_host + doc_url
  else
    # custom resolving: the caller supplied the prefix
    @@current_doc_url = resolve + doc_url
  end
end
215
+
216
##
# Action to fill a textfield with a query string
#
# _textfield_name_ - value of the name attribute of the textfield
#
# _query_string_ - the text to enter
#
# _unused_ - further positional arguments are accepted and ignored
def self.fill_textfield(textfield_name, query_string, *unused)
  lookup_form_for_tag('input','textfield',textfield_name,query_string)
  # Assign directly instead of building source code for eval: a value that
  # contains a quote would break the generated code or be executed as Ruby.
  @@current_form[textfield_name.to_s] = query_string.to_s
end
220
+
221
##
# Action to fill a textarea with text
#
# _textarea_name_ - value of the name attribute of the textarea
#
# _text_ - the text to enter
def self.fill_textarea(textarea_name, text)
  lookup_form_for_tag('textarea','textarea',textarea_name,text)
  # Assign directly instead of building source code for eval: a text that
  # contains a quote would break the generated code or be executed as Ruby.
  @@current_form[textarea_name.to_s] = text.to_s
end
227
+
228
##
# Action for selecting an option from a dropdown box
#
# _selectlist_name_ - value of the name attribute of the select list
#
# _option_ - text of the option to select, compared against the stripped
# text of each option
def self.select_option(selectlist_name, option)
  lookup_form_for_tag('select','select list',selectlist_name,option)
  dropdown = @@current_form.fields.find { |field| field.name == selectlist_name }
  dropdown.options.find { |candidate| candidate.text.strip == option }.click
end
236
+
237
##
# Action for checking the checkbox with the given name; the form containing
# it becomes the current form
def self.check_checkbox(checkbox_name)
  lookup_form_for_tag('input','checkbox',checkbox_name, '')
  named_boxes = @@current_form.checkboxes.name(checkbox_name)
  named_boxes.check
end
241
+
242
##
# Action for checking a radiobutton
#
# _checkbox_name_ - value of the name attribute of the radiobutton group
#
# _index_ - position of the button inside the group
def self.check_radiobutton(checkbox_name, index=0)
  lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
  group = @@current_form.radiobuttons.name(checkbox_name)
  group[index].check
end
246
+
247
+ #private
248
##
# Submit _current_form_ and fetch the resulting page.
#
# Without a _button_ the agent submits the plain form; with a _button_ and
# a _type_ the form submits itself with that button; with a _button_ only
# the agent submits the form using the button.
def self.process_submit(current_form, button=nil, type=nil)
  response = if button.nil?
    @@agent.submit(current_form)
  elsif type
    current_form.submit(button)
  else
    @@agent.submit(current_form, button)
  end
  @@current_doc_url = response.uri.to_s
  Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
  fetch(@@current_doc_url, :mechanize_doc => response)
end
260
+
261
##
# Locate the form that contains a given widget and make it the current form.
#
# _tag_ - tag name of the widget (e.g. 'input')
#
# _widget_name_ - human readable widget type, only used for logging
#
# _name_attribute_ - value of the name attribute of the widget
#
# _query_string_ - the value about to be entered, only used for logging
#
# _index_ - which one of several widgets with the same name to use
def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
  Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
  matches = FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]"
  # to_a instead of a blockless map: the latter returns an Enumerator on
  # Ruby 1.9 and later, which cannot be indexed
  widget = matches.to_a[index]
  form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
  find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
end
267
+
268
##
# Set the current form to the form of the current page that corresponds to
# the given form tag.
#
# _tag_ - the form element found in the parsed document
#
# _possible_attrs_ - attribute names to identify the form by; the first one
# that has a value on _tag_ is used, the last one if none has a value
#
# The current form is nil afterwards if no form matches. Always returns nil.
def self.find_form_based_on_tag(tag, possible_attrs)
  attr_name = possible_attrs.find { |candidate| !tag.attributes[candidate].nil? }
  attr_name = possible_attrs.last if attr_name.nil?
  attr_value = attr_name.nil? ? nil : tag.attributes[attr_name]

  position = 0
  # walk the forms of the page until one matches or the list is exhausted
  while (@@current_form = FetchAction.get_mechanize_doc.forms[position])
    break if @@current_form.form_node.attributes[attr_name] == attr_value
    position += 1
  end
  nil
end
285
+ end
286
+ end
287
+ end
288
+ end
289
+ end
@@ -0,0 +1,54 @@
1
module Scrubyt
  ##
  #=<tt>Fetching pages (and related functionality)</tt>
  #
  # Holds the state shared by everything that loads a document (fetching a
  # url, submitting a form, clicking a link): the current url and protocol,
  # the base directory, the host name, the current form and a stack of
  # previously stored documents.
  module FetchAction
    @@current_doc_url = nil
    @@current_doc_protocol = nil
    @@base_dir = nil
    @@host_name = nil
    @@history = []
    @@current_form = nil

    ##
    # At any given point, the url of the current document can be queried with
    # this method; typically used when the navigation is over and the result
    # document is passed to the wrapper
    def self.get_current_doc_url
      @@current_doc_url
    end

    ##
    # The current document as stored in @@mechanize_doc
    def self.get_mechanize_doc
      @@mechanize_doc
    end

    ##
    # The current document as stored in @@hpricot_doc
    def self.get_hpricot_doc
      @@hpricot_doc
    end

    ##
    # The host name stored for the current document
    def get_host_name
      @@host_name
    end

    ##
    # Go back to the host name that was stored first; does nothing while
    # working with files
    def restore_host_name
      @@host_name = @@original_host_name unless @@current_doc_protocol == 'file'
    end

    ##
    # Put the current parsed document on the history stack
    def store_page
      @@history << @@hpricot_doc
    end

    ##
    # Make the most recently stored document the current parsed document again
    def restore_page
      @@hpricot_doc = @@history.pop
    end

    ##
    # Instance level shortcut for FetchAction.store_host_name
    def store_host_name(doc_url)
      FetchAction.store_host_name(doc_url)
    end
  end
end
@@ -0,0 +1,95 @@
1
module Scrubyt
  ##
  #=<tt>Describing actions which interact with the page</tt>
  #
  # This module contains all the actions that are used to navigate on web
  # pages; first of all, *fetch* for downloading the pages - then various
  # actions like filling textfields, submitting forms, clicking links and
  # more. Every action is forwarded to FetchAction.
  module NavigationActions

    ##
    # Gives every object that is extended with this module an empty
    # @current_form instance variable
    def self.extend_object(obj)
      super(obj)
      obj.instance_eval do
        @current_form = nil
      end
    end

    ##
    # Action to fill a textfield with a query string
    #
    # *parameters*
    #
    # _textfield_name_ - the name of the textfield (e.g. the name of the google
    # search textfield is 'q')
    #
    # _query_string_ - the string that should be entered into the textfield
    #
    # _use_value_ - passed through to FetchAction unchanged
    def fill_textfield(textfield_name, query_string, use_value = nil)
      FetchAction.fill_textfield(textfield_name, query_string, 0, use_value)
    end

    ##
    # Like fill_textfield, additionally passing _sleep_secs_ to FetchAction
    def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
      FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
    end

    ##
    # Action to fill a textarea with text
    def fill_textarea(textarea_name, text)
      FetchAction.fill_textarea(textarea_name, text)
    end

    ##
    # Action for selecting an option from a dropdown box
    def select_option(selectlist_name, option)
      FetchAction.select_option(selectlist_name, option)
    end

    ##
    # Action for checking a checkbox
    def check_checkbox(checkbox_name)
      FetchAction.check_checkbox(checkbox_name)
    end

    ##
    # Action for checking the radiobutton at position _index_ of the group
    # named _checkbox_name_
    def check_radiobutton(checkbox_name, index=0)
      # forward the index given by the caller; writing 'index=0' at the call
      # site would reassign it and always select the first button
      FetchAction.check_radiobutton(checkbox_name, index)
    end

    ##
    # Fetch the document
    def fetch(*args)
      FetchAction.fetch(*args)
    end

    ##
    # Submit the current form
    #
    # _index_ - which button to use; nil submits without a button
    #
    # _type_ - passed through to FetchAction unchanged
    def submit(index=nil, type=nil)
      # same argument order as in submit_and_wait: index, sleep time, type
      FetchAction.submit(index, nil, type)
    end

    ##
    # Like submit, additionally passing _sleep_time_ to FetchAction
    def submit_and_wait(sleep_time, index=nil, type=nil)
      FetchAction.submit(index, sleep_time, type)
    end

    ##
    # Click the link specified by the text
    def click_link(link_spec,index=0)
      FetchAction.click_link(link_spec,index, 0)
    end

    ##
    # Click the first link matching _link_spec_, passing _sleep_secs_ to
    # FetchAction
    def click_link_and_wait(link_spec, sleep_secs=0)
      FetchAction.click_link(link_spec, 0, sleep_secs)
    end

    ##
    # Forwards to FetchAction.click_by_xpath
    def click_by_xpath(xpath)
      FetchAction.click_by_xpath(xpath)
    end

    ##
    # Forwards to FetchAction.click_image_map
    def click_image_map(index=0)
      FetchAction.click_image_map(index)
    end

    ##
    # Forwards to FetchAction.frame
    def frame(attribute,value)
      FetchAction.frame(attribute,value)
    end

    ##
    # Forwards to FetchAction.wait
    def wait(time=1)
      FetchAction.wait(time)
    end
  end
end
@@ -0,0 +1,30 @@
1
module Scrubyt
  ##
  #=<tt>Represents a compound example</tt>
  #
  # There are two types of string examples in scRUBYt! right now:
  # the simple example and the compound example. The simple example
  # is specified by a string, and a compound example is specified with
  # :contains, :begins_with and :ends_with descriptors - which can be
  # both regexps or strings
  class CompoundExample

    # the only keys a compound example descriptor hash may have
    DESCRIPTORS = [:contains, :begins_with, :ends_with]

    attr_accessor :descriptor_hash

    def initialize(descriptor_hash)
      @descriptor_hash = descriptor_hash
    end

    ##
    # Is the hash passed to this function a compound example descriptor hash?
    # Need to decide this when parsing pattern parameters.
    #
    # Returns true if every key is one of DESCRIPTORS (so also for an empty
    # hash), false otherwise.
    def self.compound_example?(hash)
      hash.all? { |key, _value| DESCRIPTORS.include?(key) }
    end
  end
end