scrubyt 0.3.0 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,7 +1,7 @@
1
1
  = scRUBYt! Changelog
2
2
 
3
- == 0.3.0
4
- === 21st May, 2007
3
+ == 0.3.1
4
+ === 29th May, 2007
5
5
 
6
6
  =<tt>changes:</tt>
7
7
 
@@ -17,19 +17,26 @@
17
17
  but it did not work for all cases)
18
18
  [NEW] possibility to click button with its text (instead of its index)
19
19
  (credit: Nick Merwin)
20
+ [NEW] clicking radio buttons
20
21
  [NEW] can click on image buttons (by specifying the name of the button)
21
22
  [NEW] possibility to extract a URL with one step, like so:
22
23
  link 'The Difference/@href'
23
- i.e. give me the href attribute of the element matched by the example 'The Difference'
24
+ i.e. give me the href attribute of the element matched by the example 'The Difference'
24
25
  [NEW] new way to match an element of the page:
25
26
  div 'div[The Difference]'
26
27
  means 'return the div which contains the string "The Difference"'. This is
27
- useful if the XPath of the element is non-constant across the same site (e.g.
28
- sometimes a banner or add is added, sometimes not etc.)
29
- [FIX] Replacing \240 (&nbsp;) with space in the preprocessing phase automatically
28
+ useful if the XPath of the element is non-constant across the same site
29
+ (e.g. sometimes a banner or ad is added, sometimes not etc.)
30
+ [NEW] Clicking image maps; At the moment this is achieved by specifying an
31
+ index, like
32
+ click_image_map 3
33
+ which means click the 4th link in the image map
34
+ [FIX] Replacing \240 (&nbsp;) with space in the preprocessing phase
35
+ automatically
30
36
  [FIX] Fixed: correctly downloading image if the src
31
37
  attribute had a leading space, as in
32
38
  <img src=' /files/downloads/images/image.jpg'/>
39
+ [FIX] Other misc fixes - a ton of them!
33
40
 
34
41
  == 0.2.7
35
42
  === 12th April, 2007
data/Rakefile CHANGED
@@ -17,8 +17,8 @@ task "cleanup_readme" => ["rdoc"]
17
17
 
18
18
  gem_spec = Gem::Specification.new do |s|
19
19
  s.name = 'scrubyt'
20
- s.version = '0.3.0'
21
- s.summary = 'A powerful Web-scraping framework'
20
+ s.version = '0.3.4'
21
+ s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot'
22
22
  s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
23
23
  # Files containing Test::Unit test cases.
24
24
  s.test_files = FileList['test/unittests/**/*']
@@ -29,9 +29,12 @@ gem_spec = Gem::Specification.new do |s|
29
29
  s.homepage = 'http://www.scrubyt.org'
30
30
  s.add_dependency('hpricot', '>= 0.5')
31
31
  s.add_dependency('mechanize', '>= 0.6.3')
32
- #s.add_dependency('parsetree', '>= 1.7.0')
33
- #s.add_dependency('ruby2ruby', '>= 1.1.5')
34
- s.has_rdoc = 'true'
32
+ s.add_dependency('ParseTreeReloaded')
33
+ s.add_dependency('RubyInlineAcceleration')
34
+ s.add_dependency('RubyInline', '= 3.6.3')
35
+ s.add_dependency('ParseTree', '= 1.7.1')
36
+ s.add_dependency('ruby2ruby', '= 1.1.6')
37
+ #s.has_rdoc = 'true'
35
38
  end
36
39
 
37
40
  ###################################################
@@ -56,10 +59,19 @@ Rake::TestTask.new(:test_blackbox) do |task|
56
59
  task.test_files = ['test/blackbox_test.rb']
57
60
  end
58
61
 
62
+ task "test_specific" do
63
+ ruby "test/blackbox_test.rb #{ARGV[1]}"
64
+ end
65
+
59
66
  Rake::TestTask.new(:test_non_blackbox) do |task|
60
67
  task.test_files = FileList['test/*_test.rb'] - ['test/blackbox_test.rb']
61
68
  end
62
69
 
70
+ task "rcov" do
71
+ sh 'rcov --xrefs test/*.rb'
72
+ puts 'Report done.'
73
+ end
74
+
63
75
  task "cleanup_readme" do
64
76
  puts "Cleaning up README..."
65
77
  readme_in = open('./doc/files/README.html')
@@ -87,8 +99,8 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
87
99
  pkg.need_tar = false
88
100
  end
89
101
 
90
- Rake::PackageTask.new('scrubyt-examples', '0.3.0') do |pkg|
91
- pkg.need_zip = true
92
- pkg.need_tar = true
93
- pkg.package_files.include("examples/**/*")
94
- end
102
+ #Rake::PackageTask.new('scrubyt-examples', '0.3.0') do |pkg|
103
+ # pkg.need_zip = true
104
+ # pkg.need_tar = true
105
+ # pkg.package_files.include("examples/**/*")
106
+ #end
@@ -1,3 +1,6 @@
1
+ $KCODE = 'u'
2
+ require 'jcode'
3
+
1
4
  #ruby core
2
5
  require 'open-uri'
3
6
  require 'erb'
@@ -7,6 +10,7 @@ require 'rubygems'
7
10
  require 'mechanize'
8
11
  require 'hpricot'
9
12
  require 'parse_tree_reloaded'
13
+ require 'rexml/text'
10
14
 
11
15
  #little hack to avoid that ruby2ruby tries to load the original parse_tree
12
16
  if Gem
@@ -42,16 +46,17 @@ require 'scrubyt/core/scraping/compound_example.rb'
42
46
  require 'scrubyt/output/result_node.rb'
43
47
  require 'scrubyt/output/scrubyt_result.rb'
44
48
  require 'scrubyt/output/export.rb'
49
+ require 'scrubyt/core/navigation/navigation_actions.rb'
50
+ require 'scrubyt/core/navigation/fetch_action.rb'
45
51
  require 'scrubyt/core/shared/extractor.rb'
46
52
  require 'scrubyt/core/scraping/filters/base_filter.rb'
47
53
  require 'scrubyt/core/scraping/filters/attribute_filter.rb'
54
+ require 'scrubyt/core/scraping/filters/constant_filter.rb'
55
+ require 'scrubyt/core/scraping/filters/script_filter.rb'
56
+ require 'scrubyt/core/scraping/filters/text_filter.rb'
48
57
  require 'scrubyt/core/scraping/filters/detail_page_filter.rb'
49
58
  require 'scrubyt/core/scraping/filters/download_filter.rb'
50
59
  require 'scrubyt/core/scraping/filters/html_subtree_filter.rb'
51
60
  require 'scrubyt/core/scraping/filters/regexp_filter.rb'
52
61
  require 'scrubyt/core/scraping/filters/tree_filter.rb'
53
62
  require 'scrubyt/core/scraping/pattern.rb'
54
- require 'scrubyt/core/navigation/navigation_actions.rb'
55
- require 'scrubyt/core/navigation/fetch_action.rb'
56
- require 'scrubyt/core/shared/evaluation_context.rb'
57
- require 'scrubyt/core/shared/u_r_i_builder.rb'
@@ -7,7 +7,7 @@ module Scrubyt
7
7
  #functionality to a separate class - so if you are looking for anything
8
8
  #which is loading a document (even by submitting a form or clicking a link)
9
9
  #and related things like setting a proxy etc. you should find it here.
10
- class FetchAction
10
+ module FetchAction
11
11
 
12
12
  @@current_doc_url = nil
13
13
  @@current_doc_protocol = nil
@@ -30,7 +30,7 @@ module Scrubyt
30
30
  mechanize_doc = args[0][:mechanize_doc]
31
31
  resolve = args[0][:resolve]
32
32
  basic_auth = args[0][:basic_auth]
33
- user_agent = args[0][:user_agent] || "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)"
33
+ user_agent = args[0][:user_agent] || "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)"
34
34
  #Refactor this whole stuff as well!!! It looks awful...
35
35
  parse_and_set_proxy(proxy) if proxy
36
36
  set_user_agent(user_agent)
@@ -120,23 +120,27 @@ module Scrubyt
120
120
  @@hpricot_doc
121
121
  end
122
122
 
123
- def self.get_host_name
123
+ def get_host_name
124
124
  @@host_name
125
125
  end
126
126
 
127
- def self.restore_host_name
127
+ def restore_host_name
128
128
  return if @@current_doc_protocol == 'file'
129
129
  @@host_name = @@original_host_name
130
130
  end
131
131
 
132
- def self.store_page
132
+ def store_page
133
133
  @@history.push @@hpricot_doc
134
134
  end
135
135
 
136
- def self.restore_page
136
+ def restore_page
137
137
  @@hpricot_doc = @@history.pop
138
138
  end
139
139
 
140
+ def store_host_name(doc_url)
141
+ FetchAction.store_host_name(doc_url)
142
+ end
143
+
140
144
  def self.store_host_name(doc_url)
141
145
  @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
142
146
  @@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
@@ -171,8 +175,7 @@ module Scrubyt
171
175
  @@port = parts.delete_at(-1)
172
176
  @@host = parts.join(':')
173
177
  if (@@host == nil || @@port == nil)# !@@host =~ /^http/)
174
- puts "Invalid proxy specification..."
175
- puts "neither host nor port can be nil!"
178
+ Scrubyt.log :ERROR, "Invalid proxy specification! Neither host nor port can be nil!"
176
179
  exit
177
180
  end
178
181
  end
@@ -227,6 +230,6 @@ module Scrubyt
227
230
  #custom resolving
228
231
  @@current_doc_url = resolve + doc_url
229
232
  end
230
- end #end of function handle_relative_url
231
- end #end of class FetchAction
232
- end #end of module Scrubyt
233
+ end
234
+ end
235
+ end
@@ -5,25 +5,15 @@ module Scrubyt
5
5
  #This class contains all the actions that are used to navigate on web pages;
6
6
  #first of all, *fetch* for downloading the pages - then various actions
7
7
  #like filling textfields, submitting forms, clicking links and more
8
- class NavigationActions
9
- #These are reserved keywords - they can not be the name of any pattern
10
- #since they are reserved for describing the navigation
11
- KEYWORDS = ['fetch',
12
- 'fill_textfield',
13
- 'fill_textarea',
14
- 'submit',
15
- 'click_link',
16
- 'click_image_map',
17
- 'select_option',
18
- 'check_checkbox',
19
- 'check_radiobutton',
20
- 'end']
21
-
22
- def initialize
23
- @@current_form = nil
24
- FetchAction.new
8
+ module NavigationActions
9
+
10
+ def self.extend_object(obj)
11
+ super(obj)
12
+ obj.instance_eval do
13
+ @current_form = nil
14
+ end
25
15
  end
26
-
16
+
27
17
  ##
28
18
  #Action to fill a textfield with a query string
29
19
  #
@@ -33,90 +23,76 @@ module Scrubyt
33
23
  #textfield is 'q'
34
24
  #
35
25
  #_query_string_ - the string that should be entered into the textfield
36
- def self.fill_textfield(textfield_name, query_string)
26
+ def fill_textfield(textfield_name, query_string)
37
27
  lookup_form_for_tag('input','textfield',textfield_name,query_string)
38
- eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
28
+ eval("@current_form['#{textfield_name}'] = '#{query_string}'")
39
29
  end
40
-
30
+
41
31
  ##
42
32
  #Action to fill a textarea with text
43
- def self.fill_textarea(textarea_name, text)
33
+ def fill_textarea(textarea_name, text)
44
34
  lookup_form_for_tag('textarea','textarea',textarea_name,text)
45
- eval("@@current_form['#{textarea_name}'] = '#{text}'")
35
+ eval("@current_form['#{textarea_name}'] = '#{text}'")
46
36
  end
47
-
37
+
48
38
  ##
49
39
  #Action for selecting an option from a dropdown box
50
- def self.select_option(selectlist_name, option)
40
+ def select_option(selectlist_name, option)
51
41
  lookup_form_for_tag('select','select list',selectlist_name,option)
52
- select_list = @@current_form.fields.find {|f| f.name == selectlist_name}
42
+ select_list = @current_form.fields.find {|f| f.name == selectlist_name}
53
43
  searched_option = select_list.options.find{|f| f.text.strip == option}
54
44
  searched_option.click
55
45
  end
56
-
57
- def self.check_checkbox(checkbox_name)
46
+
47
+ def check_checkbox(checkbox_name)
58
48
  lookup_form_for_tag('input','checkbox',checkbox_name, '')
59
- @@current_form.checkboxes.name(checkbox_name).check
49
+ @current_form.checkboxes.name(checkbox_name).check
60
50
  end
61
-
62
- def self.check_radiobutton(checkbox_name, index=0)
51
+
52
+ def check_radiobutton(checkbox_name, index=0)
63
53
  lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
64
- @@current_form.radiobuttons.name(checkbox_name)[index].check
54
+ @current_form.radiobuttons.name(checkbox_name)[index].check
65
55
  end
66
-
56
+
67
57
  ##
68
58
  #Fetch the document
69
- def self.fetch(*args)
59
+ def fetch(*args)
70
60
  FetchAction.fetch(*args)
71
61
  end
72
62
  ##
73
- #Submit the current form (delegate it to NavigationActions)
74
- def self.submit(index=nil, type=nil)
63
+ #Submit the current form
64
+ def submit(index=nil, type=nil)
75
65
  if index == nil
76
- FetchAction.submit(@@current_form)
77
- #----- added by nickmerwin@gmail.com -----
66
+ FetchAction.submit(@current_form)
67
+ #----- added by nickmerwin@gmail.com -----
78
68
  elsif index.class == String
79
- button = @@current_form.buttons.detect{|b| b.name == index}
80
- FetchAction.submit(@@current_form, button,type)
81
- #-----------------------------------------
69
+ button = @current_form.buttons.detect{|b| b.name == index}
70
+ FetchAction.submit(@current_form, button,type)
71
+ #-----------------------------------------
82
72
  else
83
- FetchAction.submit(@@current_form, @@current_form.buttons[index])
73
+ FetchAction.submit(@current_form, @current_form.buttons[index])
84
74
  end
85
75
  end
86
-
76
+
87
77
  ##
88
- #Click the link specified by the text ((delegate it to NavigationActions)
89
- def self.click_link(link_spec,index=0)
78
+ #Click the link specified by the text
79
+ def click_link(link_spec,index=0)
90
80
  FetchAction.click_link(link_spec,index)
91
81
  end
92
-
93
- def self.click_image_map(index=0)
82
+
83
+ def click_image_map(index=0)
94
84
  FetchAction.click_image_map(index)
95
85
  end
96
-
97
- def self.get_hpricot_doc
98
- FetchAction.get_hpricot_doc
99
- end
100
-
101
- def self.get_current_doc_url
102
- FetchAction.get_current_doc_url
103
- end
104
-
105
- def self.get_host_name
106
- FetchAction.get_host_name
107
- end
108
-
109
- private
110
- def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
86
+
87
+ private
88
+ def lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
111
89
  Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
112
90
  widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
113
- p widget
114
91
  form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
115
- p form_tag
116
92
  find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
117
93
  end
118
94
 
119
- def self.find_form_based_on_tag(tag, possible_attrs)
95
+ def find_form_based_on_tag(tag, possible_attrs)
120
96
  lookup_attribute_name = nil
121
97
  lookup_attribute_value = nil
122
98
 
@@ -127,12 +103,11 @@ private
127
103
  }
128
104
  i = 0
129
105
  loop do
130
- @@current_form = FetchAction.get_mechanize_doc.forms[i]
131
- return nil if @@current_form == nil
132
- puts i
133
- break if @@current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
106
+ @current_form = FetchAction.get_mechanize_doc.forms[i]
107
+ return nil if @current_form == nil
108
+ break if @current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
134
109
  i+= 1
135
110
  end
136
- end#find_form_based_on_tag
137
- end#end of class NavigationActions
138
- end#end of module Scrubyt
111
+ end
112
+ end
113
+ end
@@ -66,12 +66,13 @@ module Scrubyt
66
66
  #should not be called directly
67
67
 
68
68
  #TODO still used?
69
+ alias_method :throw_method_missing, :method_missing
69
70
  def method_missing(method_name, *args, &block)
70
71
  case method_name.to_s
71
72
  when /^ensure.+/
72
73
  constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
73
74
  else
74
- raise NoMethodError.new(method_name.to_s, method_name.to_s, args)
75
+ throw_method_missing(method_name, *args, &block)
75
76
  end
76
77
  end
77
78
 
@@ -82,7 +83,7 @@ module Scrubyt
82
83
  private
83
84
  #We don't want this to be accessible from outside
84
85
  def initialize(parent_pattern, example)
85
- @example_type = BaseFilter.determine_example_type(example)
86
+ @example_type = @parent_pattern.example_type ? @parent_pattern.example_type : BaseFilter.determine_example_type(example)
86
87
  @parent_pattern = parent_pattern
87
88
  @example = example
88
89
  @xpath = nil #The xpath to evaluate this filter
@@ -0,0 +1,12 @@
1
+ module Scrubyt
2
+ class ConstantFilter < BaseFilter
3
+
4
+ def evaluate(source)
5
+ return @example
6
+ end
7
+
8
+ def to_sexp
9
+ [:str, @example]
10
+ end #end of method to_sexp
11
+ end #End of class ConstantFilter
12
+ end #End of module Scrubyt
@@ -2,13 +2,36 @@ module Scrubyt
2
2
  class DetailPageFilter < BaseFilter
3
3
 
4
4
  def evaluate(source)
5
- if source.is_a? String
6
- @parent_pattern.evaluation_context.extractor.evaluate_subextractor(source, @parent_pattern, @parent_pattern.resolve)
5
+ if source.is_a?(String)
6
+ url = source
7
7
  else
8
- @parent_pattern.evaluation_context.extractor.evaluate_subextractor(
9
- XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href'],
10
- @parent_pattern, @parent_pattern.resolve)
8
+ url = XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
11
9
  end
12
- end #end of method
13
- end #End of class DetailPageFilter
14
- end #End of module Scrubyt
10
+
11
+ @parent_pattern.extractor.store_page
12
+ original_host_name = @parent_pattern.extractor.get_host_name
13
+ @parent_pattern.extractor.restore_host_name
14
+
15
+ FetchAction.fetch url, :resolve => @parent_pattern.resolve
16
+
17
+ if @detail_extractor.nil?
18
+ @detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
19
+ root_results = @detail_extractor.result
20
+ else
21
+ root_results = @detail_extractor.evaluate_extractor
22
+ end
23
+
24
+
25
+
26
+ @parent_pattern.extractor.restore_page
27
+ @parent_pattern.extractor.store_host_name original_host_name
28
+
29
+ root_results
30
+ end
31
+
32
+ def get_detail_sexp
33
+ [:block, *@detail_extractor.result.root_patterns.to_sexp_array]
34
+ end
35
+
36
+ end
37
+ end