scrubyt 0.3.4 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,103 +8,13 @@ module Scrubyt
8
8
  #which is loading a document (even by submitting a form or clicking a link)
9
9
  #and related things like setting a proxy etc. you should find it here.
10
10
  module FetchAction
11
-
12
11
  @@current_doc_url = nil
13
12
  @@current_doc_protocol = nil
14
13
  @@base_dir = nil
15
14
  @@host_name = nil
16
- @@agent = WWW::Mechanize.new
17
15
  @@history = []
18
-
19
- ##
20
- #Action to fetch a document (either a file or a http address)
21
- #
22
- #*parameters*
23
- #
24
- #_doc_url_ - the url or file name to fetch
25
- def self.fetch(doc_url, *args)
26
- #Refactor this crap!!! with option_accessor stuff
27
-
28
- if args.size > 0
29
- proxy = args[0][:proxy]
30
- mechanize_doc = args[0][:mechanize_doc]
31
- resolve = args[0][:resolve]
32
- basic_auth = args[0][:basic_auth]
33
- user_agent = args[0][:user_agent] || "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)"
34
- #Refactor this whole stuff as well!!! It looks awful...
35
- parse_and_set_proxy(proxy) if proxy
36
- set_user_agent(user_agent)
37
- parse_and_set_basic_auth(basic_auth) if basic_auth
38
- else
39
- mechanize_doc = nil
40
- resolve = :full
41
- end
42
-
43
- @@current_doc_url = doc_url
44
- @@current_doc_protocol = determine_protocol
45
-
46
- if mechanize_doc.nil? && @@current_doc_protocol != 'file'
47
- handle_relative_path(doc_url)
48
- handle_relative_url(doc_url, resolve)
49
-
50
- Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
51
-
52
- unless 'file' == @@current_doc_protocol
53
- @@mechanize_doc = @@agent.get(@@current_doc_url)
54
- end
55
- else
56
- @@mechanize_doc = mechanize_doc
57
- end
58
-
59
- if @@current_doc_protocol == 'file'
60
- @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
61
- else
62
- @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
63
- store_host_name(self.get_current_doc_url) # in case we're on a new host
64
- end
65
- end
66
-
67
- ##
68
- #Submit the last form;
69
- def self.submit(current_form, button=nil, type=nil)
70
- Scrubyt.log :ACTION, 'Submitting form...'
71
- if button == nil
72
- result_page = @@agent.submit(current_form)
73
- elsif type
74
- result_page = current_form.submit(button)
75
- else
76
- result_page = @@agent.submit(current_form, button)
77
- end
78
- @@current_doc_url = result_page.uri.to_s
79
- Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
80
- fetch(@@current_doc_url, :mechanize_doc => result_page)
81
- end
82
-
83
- ##
84
- #Click the link specified by the text
85
- def self.click_link(link_spec,index = 0)
86
- Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
87
- if link_spec.is_a? Hash
88
- clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
89
- else
90
- clicked_elem = SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
91
- end
92
- clicked_elem = XPathUtils.find_nearest_node_with_attribute(clicked_elem, 'href')
93
- result_page = @@agent.click(clicked_elem)
94
- @@current_doc_url = result_page.uri.to_s
95
- Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
96
- fetch(@@current_doc_url, :mechanize_doc => result_page)
97
- end
98
-
99
- def self.click_image_map(index = 0)
100
- Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
101
- uri = @@mechanize_doc.search("//area")[index]['href']
102
- result_page = @@agent.get(uri)
103
- @@current_doc_url = result_page.uri.to_s
104
- Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
105
- fetch(@@current_doc_url, :mechanize_doc => result_page)
106
- end
107
-
16
+ @@current_form = nil
17
+
108
18
  ##
109
19
  # At any given point, the current document can be queried with this method; Typically used
110
20
  # when the navigation is over and the result document is passed to the wrapper
@@ -140,96 +50,5 @@ module Scrubyt
140
50
  def store_host_name(doc_url)
141
51
  FetchAction.store_host_name(doc_url)
142
52
  end
143
-
144
- def self.store_host_name(doc_url)
145
- @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
146
- @@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
147
- @@host_name = doc_url if @@host_name == nil
148
- @@host_name = @@host_name[0..-2] if @@host_name[-1].chr == '/'
149
- @@original_host_name ||= @@host_name
150
- end #end of method store_host_name
151
-
152
- def self.determine_protocol
153
- old_protocol = @@current_doc_protocol
154
- new_protocol = case @@current_doc_url
155
- when /^https/
156
- 'https'
157
- when /^http/
158
- 'http'
159
- when /^www/
160
- 'http'
161
- else
162
- 'file'
163
- end
164
- return 'http' if ((old_protocol == 'http') && new_protocol == 'file')
165
- return 'https' if ((old_protocol == 'https') && new_protocol == 'file')
166
- new_protocol
167
- end
168
-
169
- def self.parse_and_set_proxy(proxy)
170
- if proxy.downcase == 'localhost'
171
- @@host = 'localhost'
172
- @@port = proxy.split(':').last
173
- else
174
- parts = proxy.split(':')
175
- @@port = parts.delete_at(-1)
176
- @@host = parts.join(':')
177
- if (@@host == nil || @@port == nil)# !@@host =~ /^http/)
178
- Scrubyt.log :ERROR, "Invalid proxy specification! Neither host nor port can be nil!"
179
- exit
180
- end
181
- end
182
- Scrubyt.log :ACTION, "Setting proxy: host=<#{@@host}>, port=<#{@@port}>"
183
- @@agent.set_proxy(@@host, @@port)
184
- end
185
-
186
- def self.parse_and_set_basic_auth(basic_auth)
187
- login, pass = basic_auth.split('@')
188
- Scrubyt.log :ACTION, "Basic authentication: login=<#{login}>, pass=<#{pass}>"
189
- @@agent.basic_auth(login, pass)
190
- end
191
-
192
- def self.set_user_agent(user_agent)
193
- Scrubyt.log :ACTION, "Setting user-agent to #{user_agent}"
194
- @@agent.user_agent = user_agent
195
- end
196
-
197
- def self.handle_relative_path(doc_url)
198
- if @@base_dir == nil
199
- @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
200
- else
201
- @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
202
- end
203
- end
204
-
205
- def self.handle_relative_url(doc_url, resolve)
206
- return if doc_url =~ /^http/
207
- if doc_url !~ /^\//
208
- first_char = doc_url[0..0]
209
- doc_url = ( first_char == '?' ? '' : '/' ) + doc_url
210
- if first_char == '?' #This is an ugly hack... really have to throw this shit out and go with mechanize's
211
- current_uri = @@mechanize_doc.uri.to_s
212
- current_uri = @@agent.history.first.uri.to_s if current_uri =~ /\/popup\//
213
- if (current_uri.include? '?')
214
- current_uri = current_uri.scan(/.+\//)[0]
215
- else
216
- current_uri += '/' unless current_uri[-1..-1] == '/'
217
- end
218
- @@current_doc_url = current_uri + doc_url
219
- return
220
- end
221
- end
222
- case resolve
223
- when :full
224
- @@current_doc_url = (@@host_name + doc_url) if ( @@host_name != nil && (doc_url !~ /#{@@host_name}/))
225
- @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
226
- when :host
227
- base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
228
- @@current_doc_url = base_host_name + doc_url
229
- else
230
- #custom resilving
231
- @@current_doc_url = resolve + doc_url
232
- end
233
- end
234
53
  end
235
54
  end
@@ -23,35 +23,32 @@ module Scrubyt
23
23
  #textfield is 'q'
24
24
  #
25
25
  #_query_string_ - the string that should be entered into the textfield
26
- def fill_textfield(textfield_name, query_string)
27
- lookup_form_for_tag('input','textfield',textfield_name,query_string)
28
- eval("@current_form['#{textfield_name}'] = '#{query_string}'")
26
+ def fill_textfield(textfield_name, query_string, use_value = nil)
27
+ FetchAction.fill_textfield(textfield_name, query_string, 0, use_value)
28
+ end
29
+
30
+ def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
31
+ FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
29
32
  end
30
33
 
31
34
  ##
32
35
  #Action to fill a textarea with text
33
36
  def fill_textarea(textarea_name, text)
34
- lookup_form_for_tag('textarea','textarea',textarea_name,text)
35
- eval("@current_form['#{textarea_name}'] = '#{text}'")
37
+ FetchAction.fill_textarea(textarea_name, text)
36
38
  end
37
39
 
38
40
  ##
39
41
  #Action for selecting an option from a dropdown box
40
42
  def select_option(selectlist_name, option)
41
- lookup_form_for_tag('select','select list',selectlist_name,option)
42
- select_list = @current_form.fields.find {|f| f.name == selectlist_name}
43
- searched_option = select_list.options.find{|f| f.text.strip == option}
44
- searched_option.click
43
+ FetchAction.select_option(selectlist_name, option)
45
44
  end
46
45
 
47
46
  def check_checkbox(checkbox_name)
48
- lookup_form_for_tag('input','checkbox',checkbox_name, '')
49
- @current_form.checkboxes.name(checkbox_name).check
47
+ FetchAction.check_checkbox(checkbox_name)
50
48
  end
51
49
 
52
50
  def check_radiobutton(checkbox_name, index=0)
53
- lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
54
- @current_form.radiobuttons.name(checkbox_name)[index].check
51
+ FetchAction.check_radiobutton(checkbox_name, index=0)
55
52
  end
56
53
 
57
54
  ##
@@ -62,52 +59,37 @@ module Scrubyt
62
59
  ##
63
60
  #Submit the current form
64
61
  def submit(index=nil, type=nil)
65
- if index == nil
66
- FetchAction.submit(@current_form)
67
- #----- added by nickmerwin@gmail.com -----
68
- elsif index.class == String
69
- button = @current_form.buttons.detect{|b| b.name == index}
70
- FetchAction.submit(@current_form, button,type)
71
- #-----------------------------------------
72
- else
73
- FetchAction.submit(@current_form, @current_form.buttons[index])
74
- end
62
+ FetchAction.submit(nil, index, type)
63
+ end
64
+
65
+ def submit_and_wait(sleep_time, index=nil, type=nil)
66
+ FetchAction.submit(index, sleep_time, type)
75
67
  end
76
68
 
77
69
  ##
78
70
  #Click the link specified by the text
79
71
  def click_link(link_spec,index=0)
80
- FetchAction.click_link(link_spec,index)
72
+ FetchAction.click_link(link_spec,index, 0)
73
+ end
74
+
75
+ def click_link_and_wait(link_spec, sleep_secs=0)
76
+ FetchAction.click_link(link_spec, 0, sleep_secs)
77
+ end
78
+
79
+ def click_by_xpath(xpath)
80
+ FetchAction.click_by_xpath(xpath)
81
81
  end
82
82
 
83
83
  def click_image_map(index=0)
84
84
  FetchAction.click_image_map(index)
85
85
  end
86
86
 
87
- private
88
- def lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
89
- Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
90
- widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
91
- form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
92
- find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
93
- end
94
-
95
- def find_form_based_on_tag(tag, possible_attrs)
96
- lookup_attribute_name = nil
97
- lookup_attribute_value = nil
98
-
99
- possible_attrs.each { |a|
100
- lookup_attribute_name = a
101
- lookup_attribute_value = tag.attributes[a]
102
- break if lookup_attribute_value != nil
103
- }
104
- i = 0
105
- loop do
106
- @current_form = FetchAction.get_mechanize_doc.forms[i]
107
- return nil if @current_form == nil
108
- break if @current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
109
- i+= 1
110
- end
87
+ def frame(attribute,value)
88
+ FetchAction.frame(attribute,value)
89
+ end
90
+
91
+ def wait(time=1)
92
+ FetchAction.wait(time)
111
93
  end
112
94
  end
113
95
  end
@@ -10,8 +10,5 @@ module Scrubyt
10
10
  end
11
11
  end
12
12
 
13
- def to_sexp
14
- [:str, @example]
15
- end #end of method to_sexp
16
13
  end #End of class AttributeFilter
17
14
  end #End of module Scrubyt
@@ -53,7 +53,6 @@ module Scrubyt
53
53
  :constraints, :xpath, :regexp, :example, :final_result)
54
54
 
55
55
  def self.create(parent_pattern, example=nil)
56
-
57
56
  filter_name = (parent_pattern.type.to_s.split("_").map!{|e| e.capitalize }.join) + 'Filter'
58
57
  if filter_name == 'RootFilter'
59
58
  BaseFilter.new(parent_pattern, example)
@@ -76,14 +75,15 @@ module Scrubyt
76
75
  end
77
76
  end
78
77
 
79
- def to_sexp
80
- nil
81
- end
82
-
83
78
  private
84
79
  #We don't want this to be accessible from outside
85
80
  def initialize(parent_pattern, example)
86
- @example_type = @parent_pattern.example_type ? @parent_pattern.example_type : BaseFilter.determine_example_type(example)
81
+ case parent_pattern.example_type
82
+ when :xpath
83
+ @example_type = EXAMPLE_TYPE_XPATH
84
+ else
85
+ @example_type = BaseFilter.determine_example_type(example)
86
+ end
87
87
  @parent_pattern = parent_pattern
88
88
  @example = example
89
89
  @xpath = nil #The xpath to evaluate this filter
@@ -5,8 +5,5 @@ module Scrubyt
5
5
  return @example
6
6
  end
7
7
 
8
- def to_sexp
9
- [:str, @example]
10
- end #end of method to_sexp
11
8
  end #End of class ConstantFilter
12
9
  end #End of module Scrubyt
@@ -7,12 +7,16 @@ module Scrubyt
7
7
  else
8
8
  url = XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
9
9
  end
10
-
11
10
  @parent_pattern.extractor.store_page
12
11
  original_host_name = @parent_pattern.extractor.get_host_name
13
12
  @parent_pattern.extractor.restore_host_name
14
13
 
15
- FetchAction.fetch url, :resolve => @parent_pattern.resolve
14
+ begin
15
+ FetchAction.fetch url, :resolve => @parent_pattern.resolve
16
+ rescue
17
+ Scrubyt.log :ERROR, "Couldn't get page, probably returned 404 or 500 status code"
18
+ end
19
+
16
20
 
17
21
  if @detail_extractor.nil?
18
22
  @detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
@@ -29,9 +33,5 @@ module Scrubyt
29
33
  root_results
30
34
  end
31
35
 
32
- def get_detail_sexp
33
- [:block, *@detail_extractor.result.root_patterns.to_sexp_array]
34
- end
35
-
36
36
  end
37
37
  end
@@ -8,10 +8,6 @@ module Scrubyt
8
8
  download_file(source)
9
9
  end #end of method
10
10
 
11
- def to_sexp
12
- [:str, @example]
13
- end #end of method to_sexp
14
-
15
11
  private
16
12
  def download_file(source)
17
13
  return '' if source.size < 4
@@ -5,8 +5,5 @@ module Scrubyt
5
5
  source.inner_html
6
6
  end
7
7
 
8
- def to_sexp
9
- nil
10
- end #end of method
11
8
  end #End of class TreeFilter
12
9
  end #End of module Scrubyt
@@ -9,9 +9,5 @@ module Scrubyt
9
9
  end
10
10
  end
11
11
 
12
- def to_sexp
13
- [:lit, @example]
14
- end
15
-
16
12
  end #End of class TreeFilter
17
13
  end #End of module Scrubyt
@@ -7,8 +7,5 @@ module Scrubyt
7
7
  @example.call param
8
8
  end
9
9
 
10
- def to_sexp
11
- [:str, "FIXME!!! Can't dump Proc"]
12
- end #end of method to_sexp
13
10
  end #End of class ConstantFilter
14
11
  end #End of module Scrubyt
@@ -9,7 +9,6 @@ module Scrubyt
9
9
  index = @example.scan(/\]:(.+)/).flatten
10
10
  index = 0 if index.empty?
11
11
  index = index[0].to_i unless index[0] == "all"
12
-
13
12
  result = (index.is_a? Fixnum) ? (SharedUtils.traverse_for_match(source,/#{text}/)[index]) : (SharedUtils.traverse_for_match(source,/#{text}/))
14
13
  return "" unless result
15
14
 
@@ -22,7 +21,7 @@ module Scrubyt
22
21
 
23
22
  def find_string(source)
24
23
  str = @example.scan(/find\((.+)\)/).flatten[0]
25
- strings_to_find = str.include? ('|') ? str.split('|') : [str]
24
+ strings_to_find = str.include?('|') ? str.split('|') : [str]
26
25
  strings_to_find.each do |s|
27
26
  result = SharedUtils.traverse_for_match(source,/#{s}/i)
28
27
  return [s] unless result.empty?
@@ -30,9 +29,6 @@ module Scrubyt
30
29
  return []
31
30
  end
32
31
 
33
- def to_sexp
34
- [:str, @example]
35
- end #end of method to_sexp
36
32
  end #End of class TextFilter
37
33
  end #End of module Scrubyt
38
34