scrubyt 0.3.0 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +13 -6
- data/Rakefile +22 -10
- data/lib/scrubyt.rb +9 -4
- data/lib/scrubyt/core/navigation/fetch_action.rb +14 -11
- data/lib/scrubyt/core/navigation/navigation_actions.rb +47 -72
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +3 -2
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +12 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +31 -8
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +29 -11
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +38 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/pattern.rb +25 -18
- data/lib/scrubyt/core/shared/extractor.rb +109 -128
- data/lib/scrubyt/logging.rb +146 -8
- data/lib/scrubyt/output/export.rb +60 -44
- data/lib/scrubyt/output/result_node.rb +34 -3
- data/lib/scrubyt/output/scrubyt_result.rb +18 -9
- data/lib/scrubyt/utils/compound_example_lookup.rb +1 -1
- data/lib/scrubyt/utils/shared_utils.rb +1 -1
- data/lib/scrubyt/utils/simple_example_lookup.rb +9 -5
- metadata +52 -6
- data/lib/scrubyt/core/shared/evaluation_context.rb +0 -57
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +0 -67
data/CHANGELOG
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
= scRUBYt! Changelog
|
2
2
|
|
3
|
-
== 0.3.
|
4
|
-
===
|
3
|
+
== 0.3.1
|
4
|
+
=== 29th May, 2007
|
5
5
|
|
6
6
|
=<tt>changes:</tt>
|
7
7
|
|
@@ -17,19 +17,26 @@
|
|
17
17
|
but it did not work for all cases)
|
18
18
|
[NEW] possibility to click button with its text (instead of its index)
|
19
19
|
(credit: Nick Merwin)
|
20
|
+
[NEW] clicking radio buttons
|
20
21
|
[NEW] can click on image buttons (by specifying the name of the button)
|
21
22
|
[NEW] possibility to extract an URL with one step, like so:
|
22
23
|
link 'The Difference/@href'
|
23
|
-
i.e. give me the href attribute of the element matched by the example 'The
|
24
|
+
i.e. give me the href attribute of the element matched by the example 'The Difference'
|
24
25
|
[NEW] new way to match an element of the page:
|
25
26
|
div 'div[The Difference]'
|
26
27
|
means 'return the div which contains the string "The Difference"'. This is
|
27
|
-
useful if the XPath of the element is non-constant across the same site
|
28
|
-
sometimes a banner or add is added, sometimes not etc.)
|
29
|
-
[
|
28
|
+
useful if the XPath of the element is non-constant across the same site
|
29
|
+
(e.g. sometimes a banner or ad is added, sometimes not, etc.)
|
30
|
+
[NEW] Clicking image maps; At the moment this is achieved by specifying an
|
31
|
+
index, like
|
32
|
+
click_image_map 3
|
33
|
+
which means click the 4th link in the image map
|
34
|
+
[FIX] Replacing \240 (&nbsp;) with space in the preprocessing phase
|
35
|
+
automatically
|
30
36
|
[FIX] Fixed: correctly downloading image if the src
|
31
37
|
attribute had a leading space, as in
|
32
38
|
<img src=' /files/downloads/images/image.jpg'/>
|
39
|
+
[FIX] Other misc fixes - a ton of them!
|
33
40
|
|
34
41
|
== 0.2.7
|
35
42
|
=== 12th April, 2007
|
data/Rakefile
CHANGED
@@ -17,8 +17,8 @@ task "cleanup_readme" => ["rdoc"]
|
|
17
17
|
|
18
18
|
gem_spec = Gem::Specification.new do |s|
|
19
19
|
s.name = 'scrubyt'
|
20
|
-
s.version = '0.3.
|
21
|
-
s.summary = 'A powerful Web-scraping framework'
|
20
|
+
s.version = '0.3.4'
|
21
|
+
s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot'
|
22
22
|
s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
|
23
23
|
# Files containing Test::Unit test cases.
|
24
24
|
s.test_files = FileList['test/unittests/**/*']
|
@@ -29,9 +29,12 @@ gem_spec = Gem::Specification.new do |s|
|
|
29
29
|
s.homepage = 'http://www.scrubyt.org'
|
30
30
|
s.add_dependency('hpricot', '>= 0.5')
|
31
31
|
s.add_dependency('mechanize', '>= 0.6.3')
|
32
|
-
|
33
|
-
|
34
|
-
s.
|
32
|
+
s.add_dependency('ParseTreeReloaded')
|
33
|
+
s.add_dependency('RubyInlineAcceleration')
|
34
|
+
s.add_dependency('RubyInline', '= 3.6.3')
|
35
|
+
s.add_dependency('ParseTree', '= 1.7.1')
|
36
|
+
s.add_dependency('ruby2ruby', '= 1.1.6')
|
37
|
+
#s.has_rdoc = 'true'
|
35
38
|
end
|
36
39
|
|
37
40
|
###################################################
|
@@ -56,10 +59,19 @@ Rake::TestTask.new(:test_blackbox) do |task|
|
|
56
59
|
task.test_files = ['test/blackbox_test.rb']
|
57
60
|
end
|
58
61
|
|
62
|
+
task "test_specific" do
|
63
|
+
ruby "test/blackbox_test.rb #{ARGV[1]}"
|
64
|
+
end
|
65
|
+
|
59
66
|
Rake::TestTask.new(:test_non_blackbox) do |task|
|
60
67
|
task.test_files = FileList['test/*_test.rb'] - ['test/blackbox_test.rb']
|
61
68
|
end
|
62
69
|
|
70
|
+
task "rcov" do
|
71
|
+
sh 'rcov --xrefs test/*.rb'
|
72
|
+
puts 'Report done.'
|
73
|
+
end
|
74
|
+
|
63
75
|
task "cleanup_readme" do
|
64
76
|
puts "Cleaning up README..."
|
65
77
|
readme_in = open('./doc/files/README.html')
|
@@ -87,8 +99,8 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
|
|
87
99
|
pkg.need_tar = false
|
88
100
|
end
|
89
101
|
|
90
|
-
Rake::PackageTask.new('scrubyt-examples', '0.3.0') do |pkg|
|
91
|
-
pkg.need_zip = true
|
92
|
-
pkg.need_tar = true
|
93
|
-
pkg.package_files.include("examples/**/*")
|
94
|
-
end
|
102
|
+
#Rake::PackageTask.new('scrubyt-examples', '0.3.0') do |pkg|
|
103
|
+
# pkg.need_zip = true
|
104
|
+
# pkg.need_tar = true
|
105
|
+
# pkg.package_files.include("examples/**/*")
|
106
|
+
#end
|
data/lib/scrubyt.rb
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
$KCODE = 'u'
|
2
|
+
require 'jcode'
|
3
|
+
|
1
4
|
#ruby core
|
2
5
|
require 'open-uri'
|
3
6
|
require 'erb'
|
@@ -7,6 +10,7 @@ require 'rubygems'
|
|
7
10
|
require 'mechanize'
|
8
11
|
require 'hpricot'
|
9
12
|
require 'parse_tree_reloaded'
|
13
|
+
require 'rexml/text'
|
10
14
|
|
11
15
|
#little hack to avoid that ruby2ruby tries to load the original parse_tree
|
12
16
|
if Gem
|
@@ -42,16 +46,17 @@ require 'scrubyt/core/scraping/compound_example.rb'
|
|
42
46
|
require 'scrubyt/output/result_node.rb'
|
43
47
|
require 'scrubyt/output/scrubyt_result.rb'
|
44
48
|
require 'scrubyt/output/export.rb'
|
49
|
+
require 'scrubyt/core/navigation/navigation_actions.rb'
|
50
|
+
require 'scrubyt/core/navigation/fetch_action.rb'
|
45
51
|
require 'scrubyt/core/shared/extractor.rb'
|
46
52
|
require 'scrubyt/core/scraping/filters/base_filter.rb'
|
47
53
|
require 'scrubyt/core/scraping/filters/attribute_filter.rb'
|
54
|
+
require 'scrubyt/core/scraping/filters/constant_filter.rb'
|
55
|
+
require 'scrubyt/core/scraping/filters/script_filter.rb'
|
56
|
+
require 'scrubyt/core/scraping/filters/text_filter.rb'
|
48
57
|
require 'scrubyt/core/scraping/filters/detail_page_filter.rb'
|
49
58
|
require 'scrubyt/core/scraping/filters/download_filter.rb'
|
50
59
|
require 'scrubyt/core/scraping/filters/html_subtree_filter.rb'
|
51
60
|
require 'scrubyt/core/scraping/filters/regexp_filter.rb'
|
52
61
|
require 'scrubyt/core/scraping/filters/tree_filter.rb'
|
53
62
|
require 'scrubyt/core/scraping/pattern.rb'
|
54
|
-
require 'scrubyt/core/navigation/navigation_actions.rb'
|
55
|
-
require 'scrubyt/core/navigation/fetch_action.rb'
|
56
|
-
require 'scrubyt/core/shared/evaluation_context.rb'
|
57
|
-
require 'scrubyt/core/shared/u_r_i_builder.rb'
|
@@ -7,7 +7,7 @@ module Scrubyt
|
|
7
7
|
#functionality to a separate class - so if you are looking for anything
|
8
8
|
#which is loading a document (even by submitting a form or clicking a link)
|
9
9
|
#and related things like setting a proxy etc. you should find it here.
|
10
|
-
|
10
|
+
module FetchAction
|
11
11
|
|
12
12
|
@@current_doc_url = nil
|
13
13
|
@@current_doc_protocol = nil
|
@@ -30,7 +30,7 @@ module Scrubyt
|
|
30
30
|
mechanize_doc = args[0][:mechanize_doc]
|
31
31
|
resolve = args[0][:resolve]
|
32
32
|
basic_auth = args[0][:basic_auth]
|
33
|
-
user_agent = args[0][:user_agent] || "Mozilla/5.0 (
|
33
|
+
user_agent = args[0][:user_agent] || "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)"
|
34
34
|
#Refactor this whole stuff as well!!! It looks awful...
|
35
35
|
parse_and_set_proxy(proxy) if proxy
|
36
36
|
set_user_agent(user_agent)
|
@@ -120,23 +120,27 @@ module Scrubyt
|
|
120
120
|
@@hpricot_doc
|
121
121
|
end
|
122
122
|
|
123
|
-
def
|
123
|
+
def get_host_name
|
124
124
|
@@host_name
|
125
125
|
end
|
126
126
|
|
127
|
-
def
|
127
|
+
def restore_host_name
|
128
128
|
return if @@current_doc_protocol == 'file'
|
129
129
|
@@host_name = @@original_host_name
|
130
130
|
end
|
131
131
|
|
132
|
-
def
|
132
|
+
def store_page
|
133
133
|
@@history.push @@hpricot_doc
|
134
134
|
end
|
135
135
|
|
136
|
-
def
|
136
|
+
def restore_page
|
137
137
|
@@hpricot_doc = @@history.pop
|
138
138
|
end
|
139
139
|
|
140
|
+
def store_host_name(doc_url)
|
141
|
+
FetchAction.store_host_name(doc_url)
|
142
|
+
end
|
143
|
+
|
140
144
|
def self.store_host_name(doc_url)
|
141
145
|
@@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
|
142
146
|
@@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
|
@@ -171,8 +175,7 @@ module Scrubyt
|
|
171
175
|
@@port = parts.delete_at(-1)
|
172
176
|
@@host = parts.join(':')
|
173
177
|
if (@@host == nil || @@port == nil)# !@@host =~ /^http/)
|
174
|
-
|
175
|
-
puts "neither host nor port can be nil!"
|
178
|
+
Scrubyt.log :ERROR, "Invalid proxy specification! Neither host nor port can be nil!"
|
176
179
|
exit
|
177
180
|
end
|
178
181
|
end
|
@@ -227,6 +230,6 @@ module Scrubyt
|
|
227
230
|
#custom resolving
|
228
231
|
@@current_doc_url = resolve + doc_url
|
229
232
|
end
|
230
|
-
end
|
231
|
-
end
|
232
|
-
end
|
233
|
+
end
|
234
|
+
end
|
235
|
+
end
|
@@ -5,25 +5,15 @@ module Scrubyt
|
|
5
5
|
#This class contains all the actions that are used to navigate on web pages;
|
6
6
|
#first of all, *fetch* for downloading the pages - then various actions
|
7
7
|
#like filling textfields, submitting forms, clicking links and more
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
'click_link',
|
16
|
-
'click_image_map',
|
17
|
-
'select_option',
|
18
|
-
'check_checkbox',
|
19
|
-
'check_radiobutton',
|
20
|
-
'end']
|
21
|
-
|
22
|
-
def initialize
|
23
|
-
@@current_form = nil
|
24
|
-
FetchAction.new
|
8
|
+
module NavigationActions
|
9
|
+
|
10
|
+
def self.extend_object(obj)
|
11
|
+
super(obj)
|
12
|
+
obj.instance_eval do
|
13
|
+
@current_form = nil
|
14
|
+
end
|
25
15
|
end
|
26
|
-
|
16
|
+
|
27
17
|
##
|
28
18
|
#Action to fill a textfield with a query string
|
29
19
|
#
|
@@ -33,90 +23,76 @@ module Scrubyt
|
|
33
23
|
#textfield is 'q'
|
34
24
|
#
|
35
25
|
#_query_string_ - the string that should be entered into the textfield
|
36
|
-
def
|
26
|
+
def fill_textfield(textfield_name, query_string)
|
37
27
|
lookup_form_for_tag('input','textfield',textfield_name,query_string)
|
38
|
-
eval("
|
28
|
+
eval("@current_form['#{textfield_name}'] = '#{query_string}'")
|
39
29
|
end
|
40
|
-
|
30
|
+
|
41
31
|
##
|
42
32
|
#Action to fill a textarea with text
|
43
|
-
def
|
33
|
+
def fill_textarea(textarea_name, text)
|
44
34
|
lookup_form_for_tag('textarea','textarea',textarea_name,text)
|
45
|
-
eval("
|
35
|
+
eval("@current_form['#{textarea_name}'] = '#{text}'")
|
46
36
|
end
|
47
|
-
|
37
|
+
|
48
38
|
##
|
49
39
|
#Action for selecting an option from a dropdown box
|
50
|
-
def
|
40
|
+
def select_option(selectlist_name, option)
|
51
41
|
lookup_form_for_tag('select','select list',selectlist_name,option)
|
52
|
-
select_list =
|
42
|
+
select_list = @current_form.fields.find {|f| f.name == selectlist_name}
|
53
43
|
searched_option = select_list.options.find{|f| f.text.strip == option}
|
54
44
|
searched_option.click
|
55
45
|
end
|
56
|
-
|
57
|
-
def
|
46
|
+
|
47
|
+
def check_checkbox(checkbox_name)
|
58
48
|
lookup_form_for_tag('input','checkbox',checkbox_name, '')
|
59
|
-
|
49
|
+
@current_form.checkboxes.name(checkbox_name).check
|
60
50
|
end
|
61
|
-
|
62
|
-
def
|
51
|
+
|
52
|
+
def check_radiobutton(checkbox_name, index=0)
|
63
53
|
lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
|
64
|
-
|
54
|
+
@current_form.radiobuttons.name(checkbox_name)[index].check
|
65
55
|
end
|
66
|
-
|
56
|
+
|
67
57
|
##
|
68
58
|
#Fetch the document
|
69
|
-
def
|
59
|
+
def fetch(*args)
|
70
60
|
FetchAction.fetch(*args)
|
71
61
|
end
|
72
62
|
##
|
73
|
-
#Submit the current form
|
74
|
-
def
|
63
|
+
#Submit the current form
|
64
|
+
def submit(index=nil, type=nil)
|
75
65
|
if index == nil
|
76
|
-
FetchAction.submit(
|
77
|
-
|
66
|
+
FetchAction.submit(@current_form)
|
67
|
+
#----- added by nickmerwin@gmail.com -----
|
78
68
|
elsif index.class == String
|
79
|
-
button =
|
80
|
-
FetchAction.submit(
|
81
|
-
|
69
|
+
button = @current_form.buttons.detect{|b| b.name == index}
|
70
|
+
FetchAction.submit(@current_form, button,type)
|
71
|
+
#-----------------------------------------
|
82
72
|
else
|
83
|
-
FetchAction.submit(
|
73
|
+
FetchAction.submit(@current_form, @current_form.buttons[index])
|
84
74
|
end
|
85
75
|
end
|
86
|
-
|
76
|
+
|
87
77
|
##
|
88
|
-
#Click the link specified by the text
|
89
|
-
def
|
78
|
+
#Click the link specified by the text
|
79
|
+
def click_link(link_spec,index=0)
|
90
80
|
FetchAction.click_link(link_spec,index)
|
91
81
|
end
|
92
|
-
|
93
|
-
def
|
82
|
+
|
83
|
+
def click_image_map(index=0)
|
94
84
|
FetchAction.click_image_map(index)
|
95
85
|
end
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
end
|
100
|
-
|
101
|
-
def self.get_current_doc_url
|
102
|
-
FetchAction.get_current_doc_url
|
103
|
-
end
|
104
|
-
|
105
|
-
def self.get_host_name
|
106
|
-
FetchAction.get_host_name
|
107
|
-
end
|
108
|
-
|
109
|
-
private
|
110
|
-
def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
|
86
|
+
|
87
|
+
private
|
88
|
+
def lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
|
111
89
|
Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
|
112
90
|
widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
|
113
|
-
p widget
|
114
91
|
form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
|
115
|
-
p form_tag
|
116
92
|
find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
|
117
93
|
end
|
118
94
|
|
119
|
-
def
|
95
|
+
def find_form_based_on_tag(tag, possible_attrs)
|
120
96
|
lookup_attribute_name = nil
|
121
97
|
lookup_attribute_value = nil
|
122
98
|
|
@@ -127,12 +103,11 @@ private
|
|
127
103
|
}
|
128
104
|
i = 0
|
129
105
|
loop do
|
130
|
-
|
131
|
-
return nil if
|
132
|
-
|
133
|
-
break if @@current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
|
106
|
+
@current_form = FetchAction.get_mechanize_doc.forms[i]
|
107
|
+
return nil if @current_form == nil
|
108
|
+
break if @current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
|
134
109
|
i+= 1
|
135
110
|
end
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
@@ -66,12 +66,13 @@ module Scrubyt
|
|
66
66
|
#should not be called directly
|
67
67
|
|
68
68
|
#TODO still used?
|
69
|
+
alias_method :throw_method_missing, :method_missing
|
69
70
|
def method_missing(method_name, *args, &block)
|
70
71
|
case method_name.to_s
|
71
72
|
when /^ensure.+/
|
72
73
|
constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
|
73
74
|
else
|
74
|
-
|
75
|
+
throw_method_missing(method_name, *args, &block)
|
75
76
|
end
|
76
77
|
end
|
77
78
|
|
@@ -82,7 +83,7 @@ module Scrubyt
|
|
82
83
|
private
|
83
84
|
#We don't want this to be accessible from outside
|
84
85
|
def initialize(parent_pattern, example)
|
85
|
-
@example_type = BaseFilter.determine_example_type(example)
|
86
|
+
@example_type = @parent_pattern.example_type ? @parent_pattern.example_type : BaseFilter.determine_example_type(example)
|
86
87
|
@parent_pattern = parent_pattern
|
87
88
|
@example = example
|
88
89
|
@xpath = nil #The xpath to evaluate this filter
|
@@ -2,13 +2,36 @@ module Scrubyt
|
|
2
2
|
class DetailPageFilter < BaseFilter
|
3
3
|
|
4
4
|
def evaluate(source)
|
5
|
-
if source.is_a?
|
6
|
-
|
5
|
+
if source.is_a?(String)
|
6
|
+
url = source
|
7
7
|
else
|
8
|
-
|
9
|
-
XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href'],
|
10
|
-
@parent_pattern, @parent_pattern.resolve)
|
8
|
+
url = XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
|
11
9
|
end
|
12
|
-
|
13
|
-
|
14
|
-
|
10
|
+
|
11
|
+
@parent_pattern.extractor.store_page
|
12
|
+
original_host_name = @parent_pattern.extractor.get_host_name
|
13
|
+
@parent_pattern.extractor.restore_host_name
|
14
|
+
|
15
|
+
FetchAction.fetch url, :resolve => @parent_pattern.resolve
|
16
|
+
|
17
|
+
if @detail_extractor.nil?
|
18
|
+
@detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
|
19
|
+
root_results = @detail_extractor.result
|
20
|
+
else
|
21
|
+
root_results = @detail_extractor.evaluate_extractor
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
@parent_pattern.extractor.restore_page
|
27
|
+
@parent_pattern.extractor.store_host_name original_host_name
|
28
|
+
|
29
|
+
root_results
|
30
|
+
end
|
31
|
+
|
32
|
+
def get_detail_sexp
|
33
|
+
[:block, *@detail_extractor.result.root_patterns.to_sexp_array]
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
end
|