scrubyt 0.4.1 → 0.4.05

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,20 +1,15 @@
1
1
  = scRUBYt! Changelog
2
2
 
3
3
  == 0.4.05
4
- == 20th October
4
+ == 14th November
5
5
 
6
6
  =<tt>changes:</tt>
7
- - [NEW] possibility to use FireWatir as the agent for scraping (credit: Glenn Gillen, Glen Gillen and... did I mention Glenn already?)
7
+ - [NEW] possibility to use FireWatir as the agent for scraping (credit: Glen Gillen)
8
8
  - [FIX] navigation doesn't crash if a 404/500 is returned (credit: Glen Gillen)
9
- - [NEW] navigation action: click_by_xpath to click arbitrary elements
9
+ - [NEW] navigation actions: click_by_xpath, click_link_and_wait
10
10
  - [MOD] dropped dependencies: RubyInline, ParseTree, Ruby2Ruby (hooray for win32 users)
11
- - [NEW] scraping through frames (e.g. google analytics)
12
11
  - [MOD] exporting temporarily doesn't work - for now, generated XPaths are printed to the screen
13
- - [MOD] possibility to wait after clicking link/filling textfield (to be able to scrape inserted AJAX stuff)
14
- - [NEW] possibility to fetch from a string, by specifying nil as the url and the html string with the :html option
15
- - [FIX] firewatir slowness (credit: jak4)
16
12
  - [FIX] lot of bugfixes and stability fixes
17
- -
18
13
 
19
14
  == 0.4.0 (unofficial)
20
15
  === 31st October, 2007
@@ -27,10 +22,6 @@
27
22
  == 0.3.4
28
23
  === 26th September, 2007
29
24
 
30
- =<tt>changes:</tt>
31
- It seems I have been too busy to update the changelog ;)
32
-
33
-
34
25
  == 0.3.1
35
26
  === 29th May, 2007
36
27
 
data/Rakefile CHANGED
@@ -17,7 +17,7 @@ task "cleanup_readme" => ["rdoc"]
17
17
 
18
18
  gem_spec = Gem::Specification.new do |s|
19
19
  s.name = 'scrubyt'
20
- s.version = '0.4.1'
20
+ s.version = '0.4.05'
21
21
  s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
22
22
  s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
23
23
  # Files containing Test::Unit test cases.
@@ -94,8 +94,8 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
94
94
  pkg.need_tar = false
95
95
  end
96
96
 
97
- #Rake::PackageTask.new('scrubyt-examples', '0.4.03') do |pkg|
98
- # pkg.need_zip = true
99
- # pkg.need_tar = true
100
- # pkg.package_files.include("examples/**/*")
101
- #end
97
+ Rake::PackageTask.new('scrubyt-examples', gem_spec.version) do |pkg|
98
+ pkg.need_zip = true
99
+ pkg.need_tar = true
100
+ pkg.package_files.include("examples/**/*")
101
+ end
@@ -21,7 +21,6 @@ module Scrubyt
21
21
  @@host_name = nil
22
22
  @@history = []
23
23
  @@current_form = nil
24
- @@current_frame = nil
25
24
 
26
25
  ##
27
26
  #Action to fetch a document (either a file or a http address)
@@ -59,33 +58,12 @@ module Scrubyt
59
58
  @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
60
59
  store_host_name(@@agent.url) # in case we're on a new host
61
60
  end
62
-
63
- def self.frame(attribute, value)
64
- if @@current_frame
65
- @@current_frame.frame(attribute, value)
66
- else
67
- @@current_frame = @@agent.frame(attribute, value)
68
- end
69
- end
70
61
 
71
62
  ##
72
63
  #Submit the last form;
73
- def self.submit(current_form, sleep_time=nil, button=nil, type=nil)
74
- if @@current_frame
75
- #BRUTAL hax but FW is such a shitty piece of software
76
- #this sucks FAIL omg
77
- @@current_frame.locate
78
- form = Document.new(@@current_frame).all.find{|t| t.tagName=="FORM"}
79
- form.submit
80
- else
81
- @@agent.element_by_xpath(@@current_form).submit
82
- end
83
-
84
- if sleep_time
85
- sleep sleep_time
86
- @@agent.wait
87
- end
88
-
64
+ def self.submit(current_form, button=nil, type=nil)
65
+ @@agent.element_by_xpath(@@current_form).submit
66
+ @@agent.wait
89
67
  @@current_doc_url = @@agent.url
90
68
  @@mechanize_doc = "<html>#{@@agent.html}</html>"
91
69
  @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
@@ -196,18 +174,9 @@ module Scrubyt
196
174
  end
197
175
  end
198
176
 
199
- def self.fill_textfield(textfield_name, query_string, wait_secs, useValue)
177
+ def self.fill_textfield(textfield_name, query_string)
200
178
  @@current_form = "//input[@name='#{textfield_name}']/ancestor::form"
201
- target = @@current_frame || @@agent
202
- if useValue
203
- target.text_field(:name,textfield_name).value = query_string
204
- else
205
- target.text_field(:name,textfield_name).set(query_string)
206
- end
207
- sleep(wait_secs) if wait_secs > 0
208
- @@mechanize_doc = "<html>#{@@agent.html}</html>"
209
- @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
210
-
179
+ @@agent.text_field(:name,textfield_name).set(query_string)
211
180
  end
212
181
 
213
182
  ##
@@ -32,21 +32,16 @@ module Scrubyt
32
32
 
33
33
  if args.size > 0
34
34
  mechanize_doc = args[0][:mechanize_doc]
35
- html = args[0][:html]
36
35
  resolve = args[0][:resolve]
37
36
  basic_auth = args[0][:basic_auth]
38
37
  parse_and_set_basic_auth(basic_auth) if basic_auth
39
- if html
40
- @@current_doc_protocol = 'string'
41
- mechanize_doc = page = WWW::Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html)
42
- end
43
38
  else
44
39
  mechanize_doc = nil
45
40
  resolve = :full
46
41
  end
47
42
 
48
43
  @@current_doc_url = doc_url
49
- @@current_doc_protocol ||= determine_protocol
44
+ @@current_doc_protocol = determine_protocol
50
45
 
51
46
  if mechanize_doc.nil? && @@current_doc_protocol != 'file'
52
47
  handle_relative_path(doc_url)
@@ -64,13 +59,13 @@ module Scrubyt
64
59
  @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
65
60
  else
66
61
  @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
67
- store_host_name(self.get_current_doc_url) if self.get_current_doc_url # in case we're on a new host
62
+ store_host_name(self.get_current_doc_url) # in case we're on a new host
68
63
  end
69
64
  end
70
65
 
71
66
  ##
72
67
  #Submit the last form;
73
- def self.submit(index=nil, sleep_time=nil, type=nil)
68
+ def self.submit(index=nil, type=nil)
74
69
  Scrubyt.log :ACTION, 'Submitting form...'
75
70
  if index == nil
76
71
  result_page = @@agent.submit(@@current_form)
@@ -91,7 +86,7 @@ module Scrubyt
91
86
 
92
87
  ##
93
88
  #Click the link specified by the text
94
- def self.click_link(link_spec,index = 0,wait_secs=0)
89
+ def self.click_link(link_spec,index = 0)
95
90
  Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
96
91
  if link_spec.is_a? Hash
97
92
  clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
@@ -177,7 +172,7 @@ module Scrubyt
177
172
  end
178
173
  end
179
174
 
180
- def self.fill_textfield(textfield_name, query_string, *unused)
175
+ def self.fill_textfield(textfield_name, query_string)
181
176
  lookup_form_for_tag('input','textfield',textfield_name,query_string)
182
177
  eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
183
178
  end
@@ -23,12 +23,8 @@ module Scrubyt
23
23
  #textfield is 'q'
24
24
  #
25
25
  #_query_string_ - the string that should be entered into the textfield
26
- def fill_textfield(textfield_name, query_string, use_value = nil)
27
- FetchAction.fill_textfield(textfield_name, query_string, 0, use_value)
28
- end
29
-
30
- def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
31
- FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
26
+ def fill_textfield(textfield_name, query_string)
27
+ FetchAction.fill_textfield(textfield_name, query_string)
32
28
  end
33
29
 
34
30
  ##
@@ -59,17 +55,13 @@ module Scrubyt
59
55
  ##
60
56
  #Submit the current form
61
57
  def submit(index=nil, type=nil)
62
- FetchAction.submit(nil, index, type)
63
- end
64
-
65
- def submit_and_wait(sleep_time, index=nil, type=nil)
66
- FetchAction.submit(index, sleep_time, type)
58
+ FetchAction.submit(index, type)
67
59
  end
68
60
 
69
61
  ##
70
62
  #Click the link specified by the text
71
- def click_link(link_spec,index=0)
72
- FetchAction.click_link(link_spec,index, 0)
63
+ def click_link(link_spec,index=0, sleep_secs=0)
64
+ FetchAction.click_link(link_spec,index, sleep_secs)
73
65
  end
74
66
 
75
67
  def click_link_and_wait(link_spec, sleep_secs=0)
@@ -84,10 +76,6 @@ module Scrubyt
84
76
  FetchAction.click_image_map(index)
85
77
  end
86
78
 
87
- def frame(attribute,value)
88
- FetchAction.frame(attribute,value)
89
- end
90
-
91
79
  def wait(time=1)
92
80
  FetchAction.wait(time)
93
81
  end
@@ -10,5 +10,8 @@ module Scrubyt
10
10
  end
11
11
  end
12
12
 
13
+ def to_sexp
14
+ [:str, @example]
15
+ end #end of method to_sexp
13
16
  end #End of class AttributeFilter
14
17
  end #End of module Scrubyt
@@ -5,5 +5,8 @@ module Scrubyt
5
5
  return @example
6
6
  end
7
7
 
8
+ def to_sexp
9
+ [:str, @example]
10
+ end #end of method to_sexp
8
11
  end #End of class ConstantFilter
9
12
  end #End of module Scrubyt
@@ -33,5 +33,9 @@ module Scrubyt
33
33
  root_results
34
34
  end
35
35
 
36
+ def get_detail_sexp
37
+ [:block, *@detail_extractor.result.root_patterns.to_sexp_array]
38
+ end
39
+
36
40
  end
37
41
  end
@@ -8,6 +8,10 @@ module Scrubyt
8
8
  download_file(source)
9
9
  end #end of method
10
10
 
11
+ def to_sexp
12
+ [:str, @example]
13
+ end #end of method to_sexp
14
+
11
15
  private
12
16
  def download_file(source)
13
17
  return '' if source.size < 4
@@ -5,5 +5,8 @@ module Scrubyt
5
5
  source.inner_html
6
6
  end
7
7
 
8
+ def to_sexp
9
+ nil
10
+ end #end of method
8
11
  end #End of class TreeFilter
9
12
  end #End of module Scrubyt
@@ -9,5 +9,9 @@ module Scrubyt
9
9
  end
10
10
  end
11
11
 
12
+ def to_sexp
13
+ [:lit, @example]
14
+ end
15
+
12
16
  end #End of class TreeFilter
13
17
  end #End of module Scrubyt
@@ -7,5 +7,8 @@ module Scrubyt
7
7
  @example.call param
8
8
  end
9
9
 
10
+ def to_sexp
11
+ [:str, "FIXME!!! Can't dump Proc"]
12
+ end #end of method to_sexp
10
13
  end #End of class ConstantFilter
11
14
  end #End of module Scrubyt
@@ -29,6 +29,9 @@ module Scrubyt
29
29
  return []
30
30
  end
31
31
 
32
+ def to_sexp
33
+ [:str, @example]
34
+ end #end of method to_sexp
32
35
  end #End of class TextFilter
33
36
  end #End of module Scrubyt
34
37
 
@@ -134,5 +134,13 @@ module Scrubyt
134
134
  @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
135
135
  end
136
136
 
137
+ def to_sexp
138
+ if @example =~ /.+\[@.+\]$/
139
+ [:str, "#{@xpath}/@#{@example.scan(/\[@(.+?)\]/)[0][0]}"]
140
+ else
141
+ [:str, @xpath]
142
+ end
143
+ end
144
+
137
145
  end #End of class TreeFilter
138
146
  end #End of module Scrubyt
@@ -78,7 +78,6 @@ module Scrubyt
78
78
 
79
79
  #grab any examples that are defined
80
80
  examples = look_for_examples(args)
81
-
82
81
  #parse the options hash if provided
83
82
  parse_options_hash(args[-1]) if args[-1].is_a? Hash
84
83
 
@@ -309,6 +308,32 @@ module Scrubyt
309
308
  end
310
309
  end
311
310
 
311
+ def to_sexp
312
+ #collect arguments
313
+ args = []
314
+ args.push(*@filters.to_sexp_array) if type != :detail_page #TODO: this if shouldn't be there
315
+ args.push(@options.to_sexp) if !@options.empty?
316
+
317
+ #build main call
318
+ sexp = [:fcall, @name, [:array, *args]]
319
+
320
+ if type == :detail_page
321
+ #add detail page extractor
322
+ sexp = [:iter, sexp, nil, @filters[0].get_detail_sexp]
323
+ else
324
+ #add child block if the pattern has children
325
+ sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
326
+ end
327
+
328
+ #add modifier calls - TODO: remove when everything is exported to the options hash
329
+ @modifier_calls.each do |modifier_sexp|
330
+ sexp = [:call, sexp, *modifier_sexp]
331
+ end
332
+
333
+ #return complete sexp
334
+ sexp
335
+ end
336
+
312
337
  private
313
338
  def parse_options_hash(hash)
314
339
  #merge provided hash
@@ -48,6 +48,10 @@ module Scrubyt
48
48
  ary
49
49
  end
50
50
 
51
+ # def to_sexp
52
+ # [:array, *@indices_to_extract.collect { |index| [:lit, index] }]
53
+ # end
54
+
51
55
  private
52
56
  ##
53
57
  #Do not return the whole result set, just specified indices - like
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrubyt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.4.05
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Szinek
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-12-10 00:00:00 +01:00
12
+ date: 2008-11-15 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
100
100
  requirements: []
101
101
 
102
102
  rubyforge_project:
103
- rubygems_version: 1.3.1
103
+ rubygems_version: 1.2.0
104
104
  signing_key:
105
105
  specification_version: 2
106
106
  summary: A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)