scrubyt 0.3.4 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -8,103 +8,13 @@ module Scrubyt
8
8
  #which is loading a document (even by submitting a form or clicking a link)
9
9
  #and related things like setting a proxy etc. you should find it here.
10
10
  module FetchAction
11
-
12
11
  @@current_doc_url = nil
13
12
  @@current_doc_protocol = nil
14
13
  @@base_dir = nil
15
14
  @@host_name = nil
16
- @@agent = WWW::Mechanize.new
17
15
  @@history = []
18
-
19
- ##
20
- #Action to fetch a document (either a file or a http address)
21
- #
22
- #*parameters*
23
- #
24
- #_doc_url_ - the url or file name to fetch
25
- def self.fetch(doc_url, *args)
26
- #Refactor this crap!!! with option_accessor stuff
27
-
28
- if args.size > 0
29
- proxy = args[0][:proxy]
30
- mechanize_doc = args[0][:mechanize_doc]
31
- resolve = args[0][:resolve]
32
- basic_auth = args[0][:basic_auth]
33
- user_agent = args[0][:user_agent] || "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)"
34
- #Refactor this whole stuff as well!!! It looks awful...
35
- parse_and_set_proxy(proxy) if proxy
36
- set_user_agent(user_agent)
37
- parse_and_set_basic_auth(basic_auth) if basic_auth
38
- else
39
- mechanize_doc = nil
40
- resolve = :full
41
- end
42
-
43
- @@current_doc_url = doc_url
44
- @@current_doc_protocol = determine_protocol
45
-
46
- if mechanize_doc.nil? && @@current_doc_protocol != 'file'
47
- handle_relative_path(doc_url)
48
- handle_relative_url(doc_url, resolve)
49
-
50
- Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
51
-
52
- unless 'file' == @@current_doc_protocol
53
- @@mechanize_doc = @@agent.get(@@current_doc_url)
54
- end
55
- else
56
- @@mechanize_doc = mechanize_doc
57
- end
58
-
59
- if @@current_doc_protocol == 'file'
60
- @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
61
- else
62
- @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
63
- store_host_name(self.get_current_doc_url) # in case we're on a new host
64
- end
65
- end
66
-
67
- ##
68
- #Submit the last form;
69
- def self.submit(current_form, button=nil, type=nil)
70
- Scrubyt.log :ACTION, 'Submitting form...'
71
- if button == nil
72
- result_page = @@agent.submit(current_form)
73
- elsif type
74
- result_page = current_form.submit(button)
75
- else
76
- result_page = @@agent.submit(current_form, button)
77
- end
78
- @@current_doc_url = result_page.uri.to_s
79
- Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
80
- fetch(@@current_doc_url, :mechanize_doc => result_page)
81
- end
82
-
83
- ##
84
- #Click the link specified by the text
85
- def self.click_link(link_spec,index = 0)
86
- Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
87
- if link_spec.is_a? Hash
88
- clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
89
- else
90
- clicked_elem = SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
91
- end
92
- clicked_elem = XPathUtils.find_nearest_node_with_attribute(clicked_elem, 'href')
93
- result_page = @@agent.click(clicked_elem)
94
- @@current_doc_url = result_page.uri.to_s
95
- Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
96
- fetch(@@current_doc_url, :mechanize_doc => result_page)
97
- end
98
-
99
- def self.click_image_map(index = 0)
100
- Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
101
- uri = @@mechanize_doc.search("//area")[index]['href']
102
- result_page = @@agent.get(uri)
103
- @@current_doc_url = result_page.uri.to_s
104
- Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
105
- fetch(@@current_doc_url, :mechanize_doc => result_page)
106
- end
107
-
16
+ @@current_form = nil
17
+
108
18
  ##
109
19
  # At any given point, the current document can be queried with this method; Typically used
110
20
  # when the navigation is over and the result document is passed to the wrapper
@@ -140,96 +50,5 @@ module Scrubyt
140
50
  def store_host_name(doc_url)
141
51
  FetchAction.store_host_name(doc_url)
142
52
  end
143
-
144
- def self.store_host_name(doc_url)
145
- @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
146
- @@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
147
- @@host_name = doc_url if @@host_name == nil
148
- @@host_name = @@host_name[0..-2] if @@host_name[-1].chr == '/'
149
- @@original_host_name ||= @@host_name
150
- end #end of method store_host_name
151
-
152
- def self.determine_protocol
153
- old_protocol = @@current_doc_protocol
154
- new_protocol = case @@current_doc_url
155
- when /^https/
156
- 'https'
157
- when /^http/
158
- 'http'
159
- when /^www/
160
- 'http'
161
- else
162
- 'file'
163
- end
164
- return 'http' if ((old_protocol == 'http') && new_protocol == 'file')
165
- return 'https' if ((old_protocol == 'https') && new_protocol == 'file')
166
- new_protocol
167
- end
168
-
169
- def self.parse_and_set_proxy(proxy)
170
- if proxy.downcase == 'localhost'
171
- @@host = 'localhost'
172
- @@port = proxy.split(':').last
173
- else
174
- parts = proxy.split(':')
175
- @@port = parts.delete_at(-1)
176
- @@host = parts.join(':')
177
- if (@@host == nil || @@port == nil)# !@@host =~ /^http/)
178
- Scrubyt.log :ERROR, "Invalid proxy specification! Neither host nor port can be nil!"
179
- exit
180
- end
181
- end
182
- Scrubyt.log :ACTION, "Setting proxy: host=<#{@@host}>, port=<#{@@port}>"
183
- @@agent.set_proxy(@@host, @@port)
184
- end
185
-
186
- def self.parse_and_set_basic_auth(basic_auth)
187
- login, pass = basic_auth.split('@')
188
- Scrubyt.log :ACTION, "Basic authentication: login=<#{login}>, pass=<#{pass}>"
189
- @@agent.basic_auth(login, pass)
190
- end
191
-
192
- def self.set_user_agent(user_agent)
193
- Scrubyt.log :ACTION, "Setting user-agent to #{user_agent}"
194
- @@agent.user_agent = user_agent
195
- end
196
-
197
- def self.handle_relative_path(doc_url)
198
- if @@base_dir == nil
199
- @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
200
- else
201
- @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
202
- end
203
- end
204
-
205
- def self.handle_relative_url(doc_url, resolve)
206
- return if doc_url =~ /^http/
207
- if doc_url !~ /^\//
208
- first_char = doc_url[0..0]
209
- doc_url = ( first_char == '?' ? '' : '/' ) + doc_url
210
- if first_char == '?' #This is an ugly hack... really have to throw this shit out and go with mechanize's
211
- current_uri = @@mechanize_doc.uri.to_s
212
- current_uri = @@agent.history.first.uri.to_s if current_uri =~ /\/popup\//
213
- if (current_uri.include? '?')
214
- current_uri = current_uri.scan(/.+\//)[0]
215
- else
216
- current_uri += '/' unless current_uri[-1..-1] == '/'
217
- end
218
- @@current_doc_url = current_uri + doc_url
219
- return
220
- end
221
- end
222
- case resolve
223
- when :full
224
- @@current_doc_url = (@@host_name + doc_url) if ( @@host_name != nil && (doc_url !~ /#{@@host_name}/))
225
- @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
226
- when :host
227
- base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
228
- @@current_doc_url = base_host_name + doc_url
229
- else
230
- #custom resilving
231
- @@current_doc_url = resolve + doc_url
232
- end
233
- end
234
53
  end
235
54
  end
@@ -23,35 +23,32 @@ module Scrubyt
23
23
  #textfield is 'q'
24
24
  #
25
25
  #_query_string_ - the string that should be entered into the textfield
26
- def fill_textfield(textfield_name, query_string)
27
- lookup_form_for_tag('input','textfield',textfield_name,query_string)
28
- eval("@current_form['#{textfield_name}'] = '#{query_string}'")
26
+ def fill_textfield(textfield_name, query_string, use_value = nil)
27
+ FetchAction.fill_textfield(textfield_name, query_string, 0, use_value)
28
+ end
29
+
30
+ def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
31
+ FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
29
32
  end
30
33
 
31
34
  ##
32
35
  #Action to fill a textarea with text
33
36
  def fill_textarea(textarea_name, text)
34
- lookup_form_for_tag('textarea','textarea',textarea_name,text)
35
- eval("@current_form['#{textarea_name}'] = '#{text}'")
37
+ FetchAction.fill_textarea(textarea_name, text)
36
38
  end
37
39
 
38
40
  ##
39
41
  #Action for selecting an option from a dropdown box
40
42
  def select_option(selectlist_name, option)
41
- lookup_form_for_tag('select','select list',selectlist_name,option)
42
- select_list = @current_form.fields.find {|f| f.name == selectlist_name}
43
- searched_option = select_list.options.find{|f| f.text.strip == option}
44
- searched_option.click
43
+ FetchAction.select_option(selectlist_name, option)
45
44
  end
46
45
 
47
46
  def check_checkbox(checkbox_name)
48
- lookup_form_for_tag('input','checkbox',checkbox_name, '')
49
- @current_form.checkboxes.name(checkbox_name).check
47
+ FetchAction.check_checkbox(checkbox_name)
50
48
  end
51
49
 
52
50
  def check_radiobutton(checkbox_name, index=0)
53
- lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
54
- @current_form.radiobuttons.name(checkbox_name)[index].check
51
+ FetchAction.check_radiobutton(checkbox_name, index=0)
55
52
  end
56
53
 
57
54
  ##
@@ -62,52 +59,37 @@ module Scrubyt
62
59
  ##
63
60
  #Submit the current form
64
61
  def submit(index=nil, type=nil)
65
- if index == nil
66
- FetchAction.submit(@current_form)
67
- #----- added by nickmerwin@gmail.com -----
68
- elsif index.class == String
69
- button = @current_form.buttons.detect{|b| b.name == index}
70
- FetchAction.submit(@current_form, button,type)
71
- #-----------------------------------------
72
- else
73
- FetchAction.submit(@current_form, @current_form.buttons[index])
74
- end
62
+ FetchAction.submit(nil, index, type)
63
+ end
64
+
65
+ def submit_and_wait(sleep_time, index=nil, type=nil)
66
+ FetchAction.submit(index, sleep_time, type)
75
67
  end
76
68
 
77
69
  ##
78
70
  #Click the link specified by the text
79
71
  def click_link(link_spec,index=0)
80
- FetchAction.click_link(link_spec,index)
72
+ FetchAction.click_link(link_spec,index, 0)
73
+ end
74
+
75
+ def click_link_and_wait(link_spec, sleep_secs=0)
76
+ FetchAction.click_link(link_spec, 0, sleep_secs)
77
+ end
78
+
79
+ def click_by_xpath(xpath)
80
+ FetchAction.click_by_xpath(xpath)
81
81
  end
82
82
 
83
83
  def click_image_map(index=0)
84
84
  FetchAction.click_image_map(index)
85
85
  end
86
86
 
87
- private
88
- def lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
89
- Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
90
- widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
91
- form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
92
- find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
93
- end
94
-
95
- def find_form_based_on_tag(tag, possible_attrs)
96
- lookup_attribute_name = nil
97
- lookup_attribute_value = nil
98
-
99
- possible_attrs.each { |a|
100
- lookup_attribute_name = a
101
- lookup_attribute_value = tag.attributes[a]
102
- break if lookup_attribute_value != nil
103
- }
104
- i = 0
105
- loop do
106
- @current_form = FetchAction.get_mechanize_doc.forms[i]
107
- return nil if @current_form == nil
108
- break if @current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
109
- i+= 1
110
- end
87
+ def frame(attribute,value)
88
+ FetchAction.frame(attribute,value)
89
+ end
90
+
91
+ def wait(time=1)
92
+ FetchAction.wait(time)
111
93
  end
112
94
  end
113
95
  end
@@ -10,8 +10,5 @@ module Scrubyt
10
10
  end
11
11
  end
12
12
 
13
- def to_sexp
14
- [:str, @example]
15
- end #end of method to_sexp
16
13
  end #End of class AttributeFilter
17
14
  end #End of module Scrubyt
@@ -53,7 +53,6 @@ module Scrubyt
53
53
  :constraints, :xpath, :regexp, :example, :final_result)
54
54
 
55
55
  def self.create(parent_pattern, example=nil)
56
-
57
56
  filter_name = (parent_pattern.type.to_s.split("_").map!{|e| e.capitalize }.join) + 'Filter'
58
57
  if filter_name == 'RootFilter'
59
58
  BaseFilter.new(parent_pattern, example)
@@ -76,14 +75,15 @@ module Scrubyt
76
75
  end
77
76
  end
78
77
 
79
- def to_sexp
80
- nil
81
- end
82
-
83
78
  private
84
79
  #We don't want this to be accessible from outside
85
80
  def initialize(parent_pattern, example)
86
- @example_type = @parent_pattern.example_type ? @parent_pattern.example_type : BaseFilter.determine_example_type(example)
81
+ case parent_pattern.example_type
82
+ when :xpath
83
+ @example_type = EXAMPLE_TYPE_XPATH
84
+ else
85
+ @example_type = BaseFilter.determine_example_type(example)
86
+ end
87
87
  @parent_pattern = parent_pattern
88
88
  @example = example
89
89
  @xpath = nil #The xpath to evaluate this filter
@@ -5,8 +5,5 @@ module Scrubyt
5
5
  return @example
6
6
  end
7
7
 
8
- def to_sexp
9
- [:str, @example]
10
- end #end of method to_sexp
11
8
  end #End of class ConstantFilter
12
9
  end #End of module Scrubyt
@@ -7,12 +7,16 @@ module Scrubyt
7
7
  else
8
8
  url = XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
9
9
  end
10
-
11
10
  @parent_pattern.extractor.store_page
12
11
  original_host_name = @parent_pattern.extractor.get_host_name
13
12
  @parent_pattern.extractor.restore_host_name
14
13
 
15
- FetchAction.fetch url, :resolve => @parent_pattern.resolve
14
+ begin
15
+ FetchAction.fetch url, :resolve => @parent_pattern.resolve
16
+ rescue
17
+ Scrubyt.log :ERROR, "Couldn't get page, probably returned 404 or 500 status code"
18
+ end
19
+
16
20
 
17
21
  if @detail_extractor.nil?
18
22
  @detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
@@ -29,9 +33,5 @@ module Scrubyt
29
33
  root_results
30
34
  end
31
35
 
32
- def get_detail_sexp
33
- [:block, *@detail_extractor.result.root_patterns.to_sexp_array]
34
- end
35
-
36
36
  end
37
37
  end
@@ -8,10 +8,6 @@ module Scrubyt
8
8
  download_file(source)
9
9
  end #end of method
10
10
 
11
- def to_sexp
12
- [:str, @example]
13
- end #end of method to_sexp
14
-
15
11
  private
16
12
  def download_file(source)
17
13
  return '' if source.size < 4
@@ -5,8 +5,5 @@ module Scrubyt
5
5
  source.inner_html
6
6
  end
7
7
 
8
- def to_sexp
9
- nil
10
- end #end of method
11
8
  end #End of class TreeFilter
12
9
  end #End of module Scrubyt
@@ -9,9 +9,5 @@ module Scrubyt
9
9
  end
10
10
  end
11
11
 
12
- def to_sexp
13
- [:lit, @example]
14
- end
15
-
16
12
  end #End of class TreeFilter
17
13
  end #End of module Scrubyt
@@ -7,8 +7,5 @@ module Scrubyt
7
7
  @example.call param
8
8
  end
9
9
 
10
- def to_sexp
11
- [:str, "FIXME!!! Can't dump Proc"]
12
- end #end of method to_sexp
13
10
  end #End of class ConstantFilter
14
11
  end #End of module Scrubyt
@@ -9,7 +9,6 @@ module Scrubyt
9
9
  index = @example.scan(/\]:(.+)/).flatten
10
10
  index = 0 if index.empty?
11
11
  index = index[0].to_i unless index[0] == "all"
12
-
13
12
  result = (index.is_a? Fixnum) ? (SharedUtils.traverse_for_match(source,/#{text}/)[index]) : (SharedUtils.traverse_for_match(source,/#{text}/))
14
13
  return "" unless result
15
14
 
@@ -22,7 +21,7 @@ module Scrubyt
22
21
 
23
22
  def find_string(source)
24
23
  str = @example.scan(/find\((.+)\)/).flatten[0]
25
- strings_to_find = str.include? ('|') ? str.split('|') : [str]
24
+ strings_to_find = str.include?('|') ? str.split('|') : [str]
26
25
  strings_to_find.each do |s|
27
26
  result = SharedUtils.traverse_for_match(source,/#{s}/i)
28
27
  return [s] unless result.empty?
@@ -30,9 +29,6 @@ module Scrubyt
30
29
  return []
31
30
  end
32
31
 
33
- def to_sexp
34
- [:str, @example]
35
- end #end of method to_sexp
36
32
  end #End of class TextFilter
37
33
  end #End of module Scrubyt
38
34