scrubyt 0.3.0 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,7 +1,7 @@
1
1
  = scRUBYt! Changelog
2
2
 
3
- == 0.3.0
4
- === 21st May, 2007
3
+ == 0.3.1
4
+ === 29th May, 2007
5
5
 
6
6
  =<tt>changes:</tt>
7
7
 
@@ -17,19 +17,26 @@
17
17
  but it did not work for all cases)
18
18
  [NEW] possibility to click button with its text (instead of its index)
19
19
  (credit: Nick Merwin)
20
+ [NEW] clicking radio buttons
20
21
  [NEW] can click on image buttons (by specifying the name of the button)
21
22
  [NEW] possibility to extract an URL with one step, like so:
22
23
  link 'The Difference/@href'
23
- i.e. give me the href attribute of the element matched by the example 'The Difference'
24
+ i.e. give me the href attribute of the element matched by the example 'The Difference'
24
25
  [NEW] new way to match an element of the page:
25
26
  div 'div[The Difference]'
26
27
  means 'return the div which contains the string "The Difference"'. This is
27
- useful if the XPath of the element is non-constant across the same site (e.g.
28
- sometimes a banner or add is added, sometimes not etc.)
29
- [FIX] Replacing \240 (&nbsp;) with space in the preprocessing phase automatically
28
+ useful if the XPath of the element is non-constant across the same site
29
+ (e.g. sometimes a banner or ad is added, sometimes not etc.)
30
+ [NEW] Clicking image maps; At the moment this is achieved by specifying an
31
+ index, like
32
+ click_image_map 3
33
+ which means click the 4th link in the image map
34
+ [FIX] Replacing \240 (&nbsp;) with space in the preprocessing phase
35
+ automatically
30
36
  [FIX] Fixed: correctly downloading image if the src
31
37
  attribute had a leading space, as in
32
38
  <img src=' /files/downloads/images/image.jpg'/>
39
+ [FIX] Other misc fixes - a ton of them!
33
40
 
34
41
  == 0.2.7
35
42
  === 12th April, 2007
data/Rakefile CHANGED
@@ -17,8 +17,8 @@ task "cleanup_readme" => ["rdoc"]
17
17
 
18
18
  gem_spec = Gem::Specification.new do |s|
19
19
  s.name = 'scrubyt'
20
- s.version = '0.3.0'
21
- s.summary = 'A powerful Web-scraping framework'
20
+ s.version = '0.3.4'
21
+ s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot'
22
22
  s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
23
23
  # Files containing Test::Unit test cases.
24
24
  s.test_files = FileList['test/unittests/**/*']
@@ -29,9 +29,12 @@ gem_spec = Gem::Specification.new do |s|
29
29
  s.homepage = 'http://www.scrubyt.org'
30
30
  s.add_dependency('hpricot', '>= 0.5')
31
31
  s.add_dependency('mechanize', '>= 0.6.3')
32
- #s.add_dependency('parsetree', '>= 1.7.0')
33
- #s.add_dependency('ruby2ruby', '>= 1.1.5')
34
- s.has_rdoc = 'true'
32
+ s.add_dependency('ParseTreeReloaded')
33
+ s.add_dependency('RubyInlineAcceleration')
34
+ s.add_dependency('RubyInline', '= 3.6.3')
35
+ s.add_dependency('ParseTree', '= 1.7.1')
36
+ s.add_dependency('ruby2ruby', '= 1.1.6')
37
+ #s.has_rdoc = 'true'
35
38
  end
36
39
 
37
40
  ###################################################
@@ -56,10 +59,19 @@ Rake::TestTask.new(:test_blackbox) do |task|
56
59
  task.test_files = ['test/blackbox_test.rb']
57
60
  end
58
61
 
62
+ task "test_specific" do
63
+ ruby "test/blackbox_test.rb #{ARGV[1]}"
64
+ end
65
+
59
66
  Rake::TestTask.new(:test_non_blackbox) do |task|
60
67
  task.test_files = FileList['test/*_test.rb'] - ['test/blackbox_test.rb']
61
68
  end
62
69
 
70
+ task "rcov" do
71
+ sh 'rcov --xrefs test/*.rb'
72
+ puts 'Report done.'
73
+ end
74
+
63
75
  task "cleanup_readme" do
64
76
  puts "Cleaning up README..."
65
77
  readme_in = open('./doc/files/README.html')
@@ -87,8 +99,8 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
87
99
  pkg.need_tar = false
88
100
  end
89
101
 
90
- Rake::PackageTask.new('scrubyt-examples', '0.3.0') do |pkg|
91
- pkg.need_zip = true
92
- pkg.need_tar = true
93
- pkg.package_files.include("examples/**/*")
94
- end
102
+ #Rake::PackageTask.new('scrubyt-examples', '0.3.0') do |pkg|
103
+ # pkg.need_zip = true
104
+ # pkg.need_tar = true
105
+ # pkg.package_files.include("examples/**/*")
106
+ #end
@@ -1,3 +1,6 @@
1
+ $KCODE = 'u'
2
+ require 'jcode'
3
+
1
4
  #ruby core
2
5
  require 'open-uri'
3
6
  require 'erb'
@@ -7,6 +10,7 @@ require 'rubygems'
7
10
  require 'mechanize'
8
11
  require 'hpricot'
9
12
  require 'parse_tree_reloaded'
13
+ require 'rexml/text'
10
14
 
11
15
  #little hack to avoid that ruby2ruby tries to load the original parse_tree
12
16
  if Gem
@@ -42,16 +46,17 @@ require 'scrubyt/core/scraping/compound_example.rb'
42
46
  require 'scrubyt/output/result_node.rb'
43
47
  require 'scrubyt/output/scrubyt_result.rb'
44
48
  require 'scrubyt/output/export.rb'
49
+ require 'scrubyt/core/navigation/navigation_actions.rb'
50
+ require 'scrubyt/core/navigation/fetch_action.rb'
45
51
  require 'scrubyt/core/shared/extractor.rb'
46
52
  require 'scrubyt/core/scraping/filters/base_filter.rb'
47
53
  require 'scrubyt/core/scraping/filters/attribute_filter.rb'
54
+ require 'scrubyt/core/scraping/filters/constant_filter.rb'
55
+ require 'scrubyt/core/scraping/filters/script_filter.rb'
56
+ require 'scrubyt/core/scraping/filters/text_filter.rb'
48
57
  require 'scrubyt/core/scraping/filters/detail_page_filter.rb'
49
58
  require 'scrubyt/core/scraping/filters/download_filter.rb'
50
59
  require 'scrubyt/core/scraping/filters/html_subtree_filter.rb'
51
60
  require 'scrubyt/core/scraping/filters/regexp_filter.rb'
52
61
  require 'scrubyt/core/scraping/filters/tree_filter.rb'
53
62
  require 'scrubyt/core/scraping/pattern.rb'
54
- require 'scrubyt/core/navigation/navigation_actions.rb'
55
- require 'scrubyt/core/navigation/fetch_action.rb'
56
- require 'scrubyt/core/shared/evaluation_context.rb'
57
- require 'scrubyt/core/shared/u_r_i_builder.rb'
@@ -7,7 +7,7 @@ module Scrubyt
7
7
  #functionality to a separate class - so if you are looking for anything
8
8
  #which is loading a document (even by submitting a form or clicking a link)
9
9
  #and related things like setting a proxy etc. you should find it here.
10
- class FetchAction
10
+ module FetchAction
11
11
 
12
12
  @@current_doc_url = nil
13
13
  @@current_doc_protocol = nil
@@ -30,7 +30,7 @@ module Scrubyt
30
30
  mechanize_doc = args[0][:mechanize_doc]
31
31
  resolve = args[0][:resolve]
32
32
  basic_auth = args[0][:basic_auth]
33
- user_agent = args[0][:user_agent] || "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)"
33
+ user_agent = args[0][:user_agent] || "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)"
34
34
  #Refactor this whole stuff as well!!! It looks awful...
35
35
  parse_and_set_proxy(proxy) if proxy
36
36
  set_user_agent(user_agent)
@@ -120,23 +120,27 @@ module Scrubyt
120
120
  @@hpricot_doc
121
121
  end
122
122
 
123
- def self.get_host_name
123
+ def get_host_name
124
124
  @@host_name
125
125
  end
126
126
 
127
- def self.restore_host_name
127
+ def restore_host_name
128
128
  return if @@current_doc_protocol == 'file'
129
129
  @@host_name = @@original_host_name
130
130
  end
131
131
 
132
- def self.store_page
132
+ def store_page
133
133
  @@history.push @@hpricot_doc
134
134
  end
135
135
 
136
- def self.restore_page
136
+ def restore_page
137
137
  @@hpricot_doc = @@history.pop
138
138
  end
139
139
 
140
+ def store_host_name(doc_url)
141
+ FetchAction.store_host_name(doc_url)
142
+ end
143
+
140
144
  def self.store_host_name(doc_url)
141
145
  @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
142
146
  @@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
@@ -171,8 +175,7 @@ module Scrubyt
171
175
  @@port = parts.delete_at(-1)
172
176
  @@host = parts.join(':')
173
177
  if (@@host == nil || @@port == nil)# !@@host =~ /^http/)
174
- puts "Invalid proxy specification..."
175
- puts "neither host nor port can be nil!"
178
+ Scrubyt.log :ERROR, "Invalid proxy specification! Neither host nor port can be nil!"
176
179
  exit
177
180
  end
178
181
  end
@@ -227,6 +230,6 @@ module Scrubyt
227
230
  #custom resolving
228
231
  @@current_doc_url = resolve + doc_url
229
232
  end
230
- end #end of function handle_relative_url
231
- end #end of class FetchAction
232
- end #end of module Scrubyt
233
+ end
234
+ end
235
+ end
@@ -5,25 +5,15 @@ module Scrubyt
5
5
  #This class contains all the actions that are used to navigate on web pages;
6
6
  #first of all, *fetch* for downloading the pages - then various actions
7
7
  #like filling textfields, submitting forms, clicking links and more
8
- class NavigationActions
9
- #These are reserved keywords - they can not be the name of any pattern
10
- #since they are reserved for describing the navigation
11
- KEYWORDS = ['fetch',
12
- 'fill_textfield',
13
- 'fill_textarea',
14
- 'submit',
15
- 'click_link',
16
- 'click_image_map',
17
- 'select_option',
18
- 'check_checkbox',
19
- 'check_radiobutton',
20
- 'end']
21
-
22
- def initialize
23
- @@current_form = nil
24
- FetchAction.new
8
+ module NavigationActions
9
+
10
+ def self.extend_object(obj)
11
+ super(obj)
12
+ obj.instance_eval do
13
+ @current_form = nil
14
+ end
25
15
  end
26
-
16
+
27
17
  ##
28
18
  #Action to fill a textfield with a query string
29
19
  #
@@ -33,90 +23,76 @@ module Scrubyt
33
23
  #textfield is 'q'
34
24
  #
35
25
  #_query_string_ - the string that should be entered into the textfield
36
- def self.fill_textfield(textfield_name, query_string)
26
+ def fill_textfield(textfield_name, query_string)
37
27
  lookup_form_for_tag('input','textfield',textfield_name,query_string)
38
- eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
28
+ eval("@current_form['#{textfield_name}'] = '#{query_string}'")
39
29
  end
40
-
30
+
41
31
  ##
42
32
  #Action to fill a textarea with text
43
- def self.fill_textarea(textarea_name, text)
33
+ def fill_textarea(textarea_name, text)
44
34
  lookup_form_for_tag('textarea','textarea',textarea_name,text)
45
- eval("@@current_form['#{textarea_name}'] = '#{text}'")
35
+ eval("@current_form['#{textarea_name}'] = '#{text}'")
46
36
  end
47
-
37
+
48
38
  ##
49
39
  #Action for selecting an option from a dropdown box
50
- def self.select_option(selectlist_name, option)
40
+ def select_option(selectlist_name, option)
51
41
  lookup_form_for_tag('select','select list',selectlist_name,option)
52
- select_list = @@current_form.fields.find {|f| f.name == selectlist_name}
42
+ select_list = @current_form.fields.find {|f| f.name == selectlist_name}
53
43
  searched_option = select_list.options.find{|f| f.text.strip == option}
54
44
  searched_option.click
55
45
  end
56
-
57
- def self.check_checkbox(checkbox_name)
46
+
47
+ def check_checkbox(checkbox_name)
58
48
  lookup_form_for_tag('input','checkbox',checkbox_name, '')
59
- @@current_form.checkboxes.name(checkbox_name).check
49
+ @current_form.checkboxes.name(checkbox_name).check
60
50
  end
61
-
62
- def self.check_radiobutton(checkbox_name, index=0)
51
+
52
+ def check_radiobutton(checkbox_name, index=0)
63
53
  lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
64
- @@current_form.radiobuttons.name(checkbox_name)[index].check
54
+ @current_form.radiobuttons.name(checkbox_name)[index].check
65
55
  end
66
-
56
+
67
57
  ##
68
58
  #Fetch the document
69
- def self.fetch(*args)
59
+ def fetch(*args)
70
60
  FetchAction.fetch(*args)
71
61
  end
72
62
  ##
73
- #Submit the current form (delegate it to NavigationActions)
74
- def self.submit(index=nil, type=nil)
63
+ #Submit the current form
64
+ def submit(index=nil, type=nil)
75
65
  if index == nil
76
- FetchAction.submit(@@current_form)
77
- #----- added by nickmerwin@gmail.com -----
66
+ FetchAction.submit(@current_form)
67
+ #----- added by nickmerwin@gmail.com -----
78
68
  elsif index.class == String
79
- button = @@current_form.buttons.detect{|b| b.name == index}
80
- FetchAction.submit(@@current_form, button,type)
81
- #-----------------------------------------
69
+ button = @current_form.buttons.detect{|b| b.name == index}
70
+ FetchAction.submit(@current_form, button,type)
71
+ #-----------------------------------------
82
72
  else
83
- FetchAction.submit(@@current_form, @@current_form.buttons[index])
73
+ FetchAction.submit(@current_form, @current_form.buttons[index])
84
74
  end
85
75
  end
86
-
76
+
87
77
  ##
88
- #Click the link specified by the text ((delegate it to NavigationActions)
89
- def self.click_link(link_spec,index=0)
78
+ #Click the link specified by the text
79
+ def click_link(link_spec,index=0)
90
80
  FetchAction.click_link(link_spec,index)
91
81
  end
92
-
93
- def self.click_image_map(index=0)
82
+
83
+ def click_image_map(index=0)
94
84
  FetchAction.click_image_map(index)
95
85
  end
96
-
97
- def self.get_hpricot_doc
98
- FetchAction.get_hpricot_doc
99
- end
100
-
101
- def self.get_current_doc_url
102
- FetchAction.get_current_doc_url
103
- end
104
-
105
- def self.get_host_name
106
- FetchAction.get_host_name
107
- end
108
-
109
- private
110
- def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
86
+
87
+ private
88
+ def lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
111
89
  Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
112
90
  widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
113
- p widget
114
91
  form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
115
- p form_tag
116
92
  find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
117
93
  end
118
94
 
119
- def self.find_form_based_on_tag(tag, possible_attrs)
95
+ def find_form_based_on_tag(tag, possible_attrs)
120
96
  lookup_attribute_name = nil
121
97
  lookup_attribute_value = nil
122
98
 
@@ -127,12 +103,11 @@ private
127
103
  }
128
104
  i = 0
129
105
  loop do
130
- @@current_form = FetchAction.get_mechanize_doc.forms[i]
131
- return nil if @@current_form == nil
132
- puts i
133
- break if @@current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
106
+ @current_form = FetchAction.get_mechanize_doc.forms[i]
107
+ return nil if @current_form == nil
108
+ break if @current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
134
109
  i+= 1
135
110
  end
136
- end#find_form_based_on_tag
137
- end#end of class NavigationActions
138
- end#end of module Scrubyt
111
+ end
112
+ end
113
+ end
@@ -66,12 +66,13 @@ module Scrubyt
66
66
  #should not be called directly
67
67
 
68
68
  #TODO still used?
69
+ alias_method :throw_method_missing, :method_missing
69
70
  def method_missing(method_name, *args, &block)
70
71
  case method_name.to_s
71
72
  when /^ensure.+/
72
73
  constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
73
74
  else
74
- raise NoMethodError.new(method_name.to_s, method_name.to_s, args)
75
+ throw_method_missing(method_name, *args, &block)
75
76
  end
76
77
  end
77
78
 
@@ -82,7 +83,7 @@ module Scrubyt
82
83
  private
83
84
  #We don't want this to be accessible from outside
84
85
  def initialize(parent_pattern, example)
85
- @example_type = BaseFilter.determine_example_type(example)
86
+ @example_type = @parent_pattern.example_type ? @parent_pattern.example_type : BaseFilter.determine_example_type(example)
86
87
  @parent_pattern = parent_pattern
87
88
  @example = example
88
89
  @xpath = nil #The xpath to evaluate this filter
@@ -0,0 +1,12 @@
1
+ module Scrubyt
2
+ class ConstantFilter < BaseFilter
3
+
4
+ def evaluate(source)
5
+ return @example
6
+ end
7
+
8
+ def to_sexp
9
+ [:str, @example]
10
+ end #end of method to_sexp
11
+ end #End of class ConstantFilter
12
+ end #End of module Scrubyt
@@ -2,13 +2,36 @@ module Scrubyt
2
2
  class DetailPageFilter < BaseFilter
3
3
 
4
4
  def evaluate(source)
5
- if source.is_a? String
6
- @parent_pattern.evaluation_context.extractor.evaluate_subextractor(source, @parent_pattern, @parent_pattern.resolve)
5
+ if source.is_a?(String)
6
+ url = source
7
7
  else
8
- @parent_pattern.evaluation_context.extractor.evaluate_subextractor(
9
- XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href'],
10
- @parent_pattern, @parent_pattern.resolve)
8
+ url = XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
11
9
  end
12
- end #end of method
13
- end #End of class DetailPageFilter
14
- end #End of module Scrubyt
10
+
11
+ @parent_pattern.extractor.store_page
12
+ original_host_name = @parent_pattern.extractor.get_host_name
13
+ @parent_pattern.extractor.restore_host_name
14
+
15
+ FetchAction.fetch url, :resolve => @parent_pattern.resolve
16
+
17
+ if @detail_extractor.nil?
18
+ @detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
19
+ root_results = @detail_extractor.result
20
+ else
21
+ root_results = @detail_extractor.evaluate_extractor
22
+ end
23
+
24
+
25
+
26
+ @parent_pattern.extractor.restore_page
27
+ @parent_pattern.extractor.store_host_name original_host_name
28
+
29
+ root_results
30
+ end
31
+
32
+ def get_detail_sexp
33
+ [:block, *@detail_extractor.result.root_patterns.to_sexp_array]
34
+ end
35
+
36
+ end
37
+ end