scrubyt 0.4.1 → 0.4.05

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,20 +1,15 @@
1
1
  = scRUBYt! Changelog
2
2
 
3
3
  == 0.4.05
4
- == 20th October
4
+ == 14th November
5
5
 
6
6
  =<tt>changes:</tt>
7
- - [NEW] possibility to use FireWatir as the agent for scraping (credit: Glenn Gillen, Glen Gillen and... did I mention Glenn already?)
7
+ - [NEW] possibility to use FireWatir as the agent for scraping (credit: Glen Gillen)
8
8
  - [FIX] navigation doesn't crash if a 404/500 is returned (credit: Glen Gillen)
9
- - [NEW] navigation action: click_by_xpath to click arbitrary elements
9
+ - [NEW] navigation actions: click_by_xpath, click_link_and_wait
10
10
  - [MOD] dropped dependencies: RubyInline, ParseTree, Ruby2Ruby (hooray for win32 users)
11
- - [NEW] scraping through frames (e.g. google analytics)
12
11
  - [MOD] exporting temporarily doesn't work - for now, generated XPaths are printed to the screen
13
- - [MOD] possibility to wait after clicking link/filling textfield (to be able to scrape inserted AJAX stuff)
14
- - [NEW] possibility to fetch from a string, by specifying nil as the url and the html string with the :html option
15
- - [FIX] firewatir slowness (credit: jak4)
16
12
  - [FIX] lot of bugfixes and stability fixes
17
- -
18
13
 
19
14
  == 0.4.0 (unofficial)
20
15
  === 31st October, 2007
@@ -27,10 +22,6 @@
27
22
  == 0.3.4
28
23
  === 26th September, 2007
29
24
 
30
- =<tt>changes:</tt>
31
- It seems I have been too busy to update the changelog ;)
32
-
33
-
34
25
  == 0.3.1
35
26
  === 29th May, 2007
36
27
 
data/Rakefile CHANGED
@@ -17,7 +17,7 @@ task "cleanup_readme" => ["rdoc"]
17
17
 
18
18
  gem_spec = Gem::Specification.new do |s|
19
19
  s.name = 'scrubyt'
20
- s.version = '0.4.1'
20
+ s.version = '0.4.05'
21
21
  s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
22
22
  s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
23
23
  # Files containing Test::Unit test cases.
@@ -94,8 +94,8 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
94
94
  pkg.need_tar = false
95
95
  end
96
96
 
97
- #Rake::PackageTask.new('scrubyt-examples', '0.4.03') do |pkg|
98
- # pkg.need_zip = true
99
- # pkg.need_tar = true
100
- # pkg.package_files.include("examples/**/*")
101
- #end
97
+ Rake::PackageTask.new('scrubyt-examples', gem_spec.version) do |pkg|
98
+ pkg.need_zip = true
99
+ pkg.need_tar = true
100
+ pkg.package_files.include("examples/**/*")
101
+ end
@@ -21,7 +21,6 @@ module Scrubyt
21
21
  @@host_name = nil
22
22
  @@history = []
23
23
  @@current_form = nil
24
- @@current_frame = nil
25
24
 
26
25
  ##
27
26
  #Action to fetch a document (either a file or a http address)
@@ -59,33 +58,12 @@ module Scrubyt
59
58
  @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
60
59
  store_host_name(@@agent.url) # in case we're on a new host
61
60
  end
62
-
63
- def self.frame(attribute, value)
64
- if @@current_frame
65
- @@current_frame.frame(attribute, value)
66
- else
67
- @@current_frame = @@agent.frame(attribute, value)
68
- end
69
- end
70
61
 
71
62
  ##
72
63
  #Submit the last form;
73
- def self.submit(current_form, sleep_time=nil, button=nil, type=nil)
74
- if @@current_frame
75
- #BRUTAL hax but FW is such a shitty piece of software
76
- #this sucks FAIL omg
77
- @@current_frame.locate
78
- form = Document.new(@@current_frame).all.find{|t| t.tagName=="FORM"}
79
- form.submit
80
- else
81
- @@agent.element_by_xpath(@@current_form).submit
82
- end
83
-
84
- if sleep_time
85
- sleep sleep_time
86
- @@agent.wait
87
- end
88
-
64
+ def self.submit(current_form, button=nil, type=nil)
65
+ @@agent.element_by_xpath(@@current_form).submit
66
+ @@agent.wait
89
67
  @@current_doc_url = @@agent.url
90
68
  @@mechanize_doc = "<html>#{@@agent.html}</html>"
91
69
  @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
@@ -196,18 +174,9 @@ module Scrubyt
196
174
  end
197
175
  end
198
176
 
199
- def self.fill_textfield(textfield_name, query_string, wait_secs, useValue)
177
+ def self.fill_textfield(textfield_name, query_string)
200
178
  @@current_form = "//input[@name='#{textfield_name}']/ancestor::form"
201
- target = @@current_frame || @@agent
202
- if useValue
203
- target.text_field(:name,textfield_name).value = query_string
204
- else
205
- target.text_field(:name,textfield_name).set(query_string)
206
- end
207
- sleep(wait_secs) if wait_secs > 0
208
- @@mechanize_doc = "<html>#{@@agent.html}</html>"
209
- @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
210
-
179
+ @@agent.text_field(:name,textfield_name).set(query_string)
211
180
  end
212
181
 
213
182
  ##
@@ -32,21 +32,16 @@ module Scrubyt
32
32
 
33
33
  if args.size > 0
34
34
  mechanize_doc = args[0][:mechanize_doc]
35
- html = args[0][:html]
36
35
  resolve = args[0][:resolve]
37
36
  basic_auth = args[0][:basic_auth]
38
37
  parse_and_set_basic_auth(basic_auth) if basic_auth
39
- if html
40
- @@current_doc_protocol = 'string'
41
- mechanize_doc = page = WWW::Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html)
42
- end
43
38
  else
44
39
  mechanize_doc = nil
45
40
  resolve = :full
46
41
  end
47
42
 
48
43
  @@current_doc_url = doc_url
49
- @@current_doc_protocol ||= determine_protocol
44
+ @@current_doc_protocol = determine_protocol
50
45
 
51
46
  if mechanize_doc.nil? && @@current_doc_protocol != 'file'
52
47
  handle_relative_path(doc_url)
@@ -64,13 +59,13 @@ module Scrubyt
64
59
  @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
65
60
  else
66
61
  @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
67
- store_host_name(self.get_current_doc_url) if self.get_current_doc_url # in case we're on a new host
62
+ store_host_name(self.get_current_doc_url) # in case we're on a new host
68
63
  end
69
64
  end
70
65
 
71
66
  ##
72
67
  #Submit the last form;
73
- def self.submit(index=nil, sleep_time=nil, type=nil)
68
+ def self.submit(index=nil, type=nil)
74
69
  Scrubyt.log :ACTION, 'Submitting form...'
75
70
  if index == nil
76
71
  result_page = @@agent.submit(@@current_form)
@@ -91,7 +86,7 @@ module Scrubyt
91
86
 
92
87
  ##
93
88
  #Click the link specified by the text
94
- def self.click_link(link_spec,index = 0,wait_secs=0)
89
+ def self.click_link(link_spec,index = 0)
95
90
  Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
96
91
  if link_spec.is_a? Hash
97
92
  clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
@@ -177,7 +172,7 @@ module Scrubyt
177
172
  end
178
173
  end
179
174
 
180
- def self.fill_textfield(textfield_name, query_string, *unused)
175
+ def self.fill_textfield(textfield_name, query_string)
181
176
  lookup_form_for_tag('input','textfield',textfield_name,query_string)
182
177
  eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
183
178
  end
@@ -23,12 +23,8 @@ module Scrubyt
23
23
  #textfield is 'q'
24
24
  #
25
25
  #_query_string_ - the string that should be entered into the textfield
26
- def fill_textfield(textfield_name, query_string, use_value = nil)
27
- FetchAction.fill_textfield(textfield_name, query_string, 0, use_value)
28
- end
29
-
30
- def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
31
- FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
26
+ def fill_textfield(textfield_name, query_string)
27
+ FetchAction.fill_textfield(textfield_name, query_string)
32
28
  end
33
29
 
34
30
  ##
@@ -59,17 +55,13 @@ module Scrubyt
59
55
  ##
60
56
  #Submit the current form
61
57
  def submit(index=nil, type=nil)
62
- FetchAction.submit(nil, index, type)
63
- end
64
-
65
- def submit_and_wait(sleep_time, index=nil, type=nil)
66
- FetchAction.submit(index, sleep_time, type)
58
+ FetchAction.submit(index, type)
67
59
  end
68
60
 
69
61
  ##
70
62
  #Click the link specified by the text
71
- def click_link(link_spec,index=0)
72
- FetchAction.click_link(link_spec,index, 0)
63
+ def click_link(link_spec,index=0, sleep_secs=0)
64
+ FetchAction.click_link(link_spec,index, sleep_secs)
73
65
  end
74
66
 
75
67
  def click_link_and_wait(link_spec, sleep_secs=0)
@@ -84,10 +76,6 @@ module Scrubyt
84
76
  FetchAction.click_image_map(index)
85
77
  end
86
78
 
87
- def frame(attribute,value)
88
- FetchAction.frame(attribute,value)
89
- end
90
-
91
79
  def wait(time=1)
92
80
  FetchAction.wait(time)
93
81
  end
@@ -10,5 +10,8 @@ module Scrubyt
10
10
  end
11
11
  end
12
12
 
13
+ def to_sexp
14
+ [:str, @example]
15
+ end #end of method to_sexp
13
16
  end #End of class AttributeFilter
14
17
  end #End of module Scrubyt
@@ -5,5 +5,8 @@ module Scrubyt
5
5
  return @example
6
6
  end
7
7
 
8
+ def to_sexp
9
+ [:str, @example]
10
+ end #end of method to_sexp
8
11
  end #End of class ConstantFilter
9
12
  end #End of module Scrubyt
@@ -33,5 +33,9 @@ module Scrubyt
33
33
  root_results
34
34
  end
35
35
 
36
+ def get_detail_sexp
37
+ [:block, *@detail_extractor.result.root_patterns.to_sexp_array]
38
+ end
39
+
36
40
  end
37
41
  end
@@ -8,6 +8,10 @@ module Scrubyt
8
8
  download_file(source)
9
9
  end #end of method
10
10
 
11
+ def to_sexp
12
+ [:str, @example]
13
+ end #end of method to_sexp
14
+
11
15
  private
12
16
  def download_file(source)
13
17
  return '' if source.size < 4
@@ -5,5 +5,8 @@ module Scrubyt
5
5
  source.inner_html
6
6
  end
7
7
 
8
+ def to_sexp
9
+ nil
10
+ end #end of method
8
11
  end #End of class TreeFilter
9
12
  end #End of module Scrubyt
@@ -9,5 +9,9 @@ module Scrubyt
9
9
  end
10
10
  end
11
11
 
12
+ def to_sexp
13
+ [:lit, @example]
14
+ end
15
+
12
16
  end #End of class TreeFilter
13
17
  end #End of module Scrubyt
@@ -7,5 +7,8 @@ module Scrubyt
7
7
  @example.call param
8
8
  end
9
9
 
10
+ def to_sexp
11
+ [:str, "FIXME!!! Can't dump Proc"]
12
+ end #end of method to_sexp
10
13
  end #End of class ConstantFilter
11
14
  end #End of module Scrubyt
@@ -29,6 +29,9 @@ module Scrubyt
29
29
  return []
30
30
  end
31
31
 
32
+ def to_sexp
33
+ [:str, @example]
34
+ end #end of method to_sexp
32
35
  end #End of class TextFilter
33
36
  end #End of module Scrubyt
34
37
 
@@ -134,5 +134,13 @@ module Scrubyt
134
134
  @xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
135
135
  end
136
136
 
137
+ def to_sexp
138
+ if @example =~ /.+\[@.+\]$/
139
+ [:str, "#{@xpath}/@#{@example.scan(/\[@(.+?)\]/)[0][0]}"]
140
+ else
141
+ [:str, @xpath]
142
+ end
143
+ end
144
+
137
145
  end #End of class TreeFilter
138
146
  end #End of module Scrubyt
@@ -78,7 +78,6 @@ module Scrubyt
78
78
 
79
79
  #grab any examples that are defined
80
80
  examples = look_for_examples(args)
81
-
82
81
  #parse the options hash if provided
83
82
  parse_options_hash(args[-1]) if args[-1].is_a? Hash
84
83
 
@@ -309,6 +308,32 @@ module Scrubyt
309
308
  end
310
309
  end
311
310
 
311
+ def to_sexp
312
+ #collect arguments
313
+ args = []
314
+ args.push(*@filters.to_sexp_array) if type != :detail_page #TODO: this if shouldn't be there
315
+ args.push(@options.to_sexp) if !@options.empty?
316
+
317
+ #build main call
318
+ sexp = [:fcall, @name, [:array, *args]]
319
+
320
+ if type == :detail_page
321
+ #add detail page extractor
322
+ sexp = [:iter, sexp, nil, @filters[0].get_detail_sexp]
323
+ else
324
+ #add child block if the pattern has children
325
+ sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
326
+ end
327
+
328
+ #add modifier calls - TODO: remove when everything is exported to the options hash
329
+ @modifier_calls.each do |modifier_sexp|
330
+ sexp = [:call, sexp, *modifier_sexp]
331
+ end
332
+
333
+ #return complete sexp
334
+ sexp
335
+ end
336
+
312
337
  private
313
338
  def parse_options_hash(hash)
314
339
  #merge provided hash
@@ -48,6 +48,10 @@ module Scrubyt
48
48
  ary
49
49
  end
50
50
 
51
+ # def to_sexp
52
+ # [:array, *@indices_to_extract.collect { |index| [:lit, index] }]
53
+ # end
54
+
51
55
  private
52
56
  ##
53
57
  #Do not return the whole result set, just specified indices - like
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrubyt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.4.05
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Szinek
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-12-10 00:00:00 +01:00
12
+ date: 2008-11-15 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
100
100
  requirements: []
101
101
 
102
102
  rubyforge_project:
103
- rubygems_version: 1.3.1
103
+ rubygems_version: 1.2.0
104
104
  signing_key:
105
105
  specification_version: 2
106
106
  summary: A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)