scrubyt 0.3.0 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +13 -6
- data/Rakefile +22 -10
- data/lib/scrubyt.rb +9 -4
- data/lib/scrubyt/core/navigation/fetch_action.rb +14 -11
- data/lib/scrubyt/core/navigation/navigation_actions.rb +47 -72
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +3 -2
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +12 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +31 -8
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +29 -11
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +38 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/pattern.rb +25 -18
- data/lib/scrubyt/core/shared/extractor.rb +109 -128
- data/lib/scrubyt/logging.rb +146 -8
- data/lib/scrubyt/output/export.rb +60 -44
- data/lib/scrubyt/output/result_node.rb +34 -3
- data/lib/scrubyt/output/scrubyt_result.rb +18 -9
- data/lib/scrubyt/utils/compound_example_lookup.rb +1 -1
- data/lib/scrubyt/utils/shared_utils.rb +1 -1
- data/lib/scrubyt/utils/simple_example_lookup.rb +9 -5
- metadata +52 -6
- data/lib/scrubyt/core/shared/evaluation_context.rb +0 -57
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +0 -67
data/CHANGELOG
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
= scRUBYt! Changelog
|
2
2
|
|
3
|
-
== 0.3.
|
4
|
-
===
|
3
|
+
== 0.3.1
|
4
|
+
=== 29th May, 2007
|
5
5
|
|
6
6
|
=<tt>changes:</tt>
|
7
7
|
|
@@ -17,19 +17,26 @@
|
|
17
17
|
but it did not work for all cases)
|
18
18
|
[NEW] possibility to click button with its text (instead of its index)
|
19
19
|
(credit: Nick Merwin)
|
20
|
+
[NEW] clicking radio buttons
|
20
21
|
[NEW] can click on image buttons (by specifying the name of the button)
|
21
22
|
[NEW] possibility to extract a URL in one step, like so:
|
22
23
|
link 'The Difference/@href'
|
23
|
-
i.e. give me the href attribute of the element matched by the example 'The
|
24
|
+
i.e. give me the href attribute of the element matched by the example 'The Difference'
|
24
25
|
[NEW] new way to match an element of the page:
|
25
26
|
div 'div[The Difference]'
|
26
27
|
means 'return the div which contains the string "The Difference"'. This is
|
27
|
-
useful if the XPath of the element is non-constant across the same site
|
28
|
-
sometimes a banner or add is added, sometimes not etc.)
|
29
|
-
[
|
28
|
+
useful if the XPath of the element is non-constant across the same site
|
29
|
+
(e.g. sometimes a banner or ad is added, sometimes not, etc.)
|
30
|
+
[NEW] Clicking image maps; At the moment this is achieved by specifying an
|
31
|
+
index, like
|
32
|
+
click_image_map 3
|
33
|
+
which means click the 4th link in the image map
|
34
|
+
[FIX] Replacing \240 (&nbsp;) with space in the preprocessing phase
|
35
|
+
automatically
|
30
36
|
[FIX] Fixed: correctly downloading image if the src
|
31
37
|
attribute had a leading space, as in
|
32
38
|
<img src=' /files/downloads/images/image.jpg'/>
|
39
|
+
[FIX] Other misc fixes - a ton of them!
|
33
40
|
|
34
41
|
== 0.2.7
|
35
42
|
=== 12th April, 2007
|
data/Rakefile
CHANGED
@@ -17,8 +17,8 @@ task "cleanup_readme" => ["rdoc"]
|
|
17
17
|
|
18
18
|
gem_spec = Gem::Specification.new do |s|
|
19
19
|
s.name = 'scrubyt'
|
20
|
-
s.version = '0.3.
|
21
|
-
s.summary = 'A powerful Web-scraping framework'
|
20
|
+
s.version = '0.3.4'
|
21
|
+
s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot'
|
22
22
|
s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
|
23
23
|
# Files containing Test::Unit test cases.
|
24
24
|
s.test_files = FileList['test/unittests/**/*']
|
@@ -29,9 +29,12 @@ gem_spec = Gem::Specification.new do |s|
|
|
29
29
|
s.homepage = 'http://www.scrubyt.org'
|
30
30
|
s.add_dependency('hpricot', '>= 0.5')
|
31
31
|
s.add_dependency('mechanize', '>= 0.6.3')
|
32
|
-
|
33
|
-
|
34
|
-
s.
|
32
|
+
s.add_dependency('ParseTreeReloaded')
|
33
|
+
s.add_dependency('RubyInlineAcceleration')
|
34
|
+
s.add_dependency('RubyInline', '= 3.6.3')
|
35
|
+
s.add_dependency('ParseTree', '= 1.7.1')
|
36
|
+
s.add_dependency('ruby2ruby', '= 1.1.6')
|
37
|
+
#s.has_rdoc = 'true'
|
35
38
|
end
|
36
39
|
|
37
40
|
###################################################
|
@@ -56,10 +59,19 @@ Rake::TestTask.new(:test_blackbox) do |task|
|
|
56
59
|
task.test_files = ['test/blackbox_test.rb']
|
57
60
|
end
|
58
61
|
|
62
|
+
task "test_specific" do
|
63
|
+
ruby "test/blackbox_test.rb #{ARGV[1]}"
|
64
|
+
end
|
65
|
+
|
59
66
|
Rake::TestTask.new(:test_non_blackbox) do |task|
|
60
67
|
task.test_files = FileList['test/*_test.rb'] - ['test/blackbox_test.rb']
|
61
68
|
end
|
62
69
|
|
70
|
+
task "rcov" do
|
71
|
+
sh 'rcov --xrefs test/*.rb'
|
72
|
+
puts 'Report done.'
|
73
|
+
end
|
74
|
+
|
63
75
|
task "cleanup_readme" do
|
64
76
|
puts "Cleaning up README..."
|
65
77
|
readme_in = open('./doc/files/README.html')
|
@@ -87,8 +99,8 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
|
|
87
99
|
pkg.need_tar = false
|
88
100
|
end
|
89
101
|
|
90
|
-
Rake::PackageTask.new('scrubyt-examples', '0.3.0') do |pkg|
|
91
|
-
pkg.need_zip = true
|
92
|
-
pkg.need_tar = true
|
93
|
-
pkg.package_files.include("examples/**/*")
|
94
|
-
end
|
102
|
+
#Rake::PackageTask.new('scrubyt-examples', '0.3.0') do |pkg|
|
103
|
+
# pkg.need_zip = true
|
104
|
+
# pkg.need_tar = true
|
105
|
+
# pkg.package_files.include("examples/**/*")
|
106
|
+
#end
|
data/lib/scrubyt.rb
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
$KCODE = 'u'
|
2
|
+
require 'jcode'
|
3
|
+
|
1
4
|
#ruby core
|
2
5
|
require 'open-uri'
|
3
6
|
require 'erb'
|
@@ -7,6 +10,7 @@ require 'rubygems'
|
|
7
10
|
require 'mechanize'
|
8
11
|
require 'hpricot'
|
9
12
|
require 'parse_tree_reloaded'
|
13
|
+
require 'rexml/text'
|
10
14
|
|
11
15
|
#little hack to avoid that ruby2ruby tries to load the original parse_tree
|
12
16
|
if Gem
|
@@ -42,16 +46,17 @@ require 'scrubyt/core/scraping/compound_example.rb'
|
|
42
46
|
require 'scrubyt/output/result_node.rb'
|
43
47
|
require 'scrubyt/output/scrubyt_result.rb'
|
44
48
|
require 'scrubyt/output/export.rb'
|
49
|
+
require 'scrubyt/core/navigation/navigation_actions.rb'
|
50
|
+
require 'scrubyt/core/navigation/fetch_action.rb'
|
45
51
|
require 'scrubyt/core/shared/extractor.rb'
|
46
52
|
require 'scrubyt/core/scraping/filters/base_filter.rb'
|
47
53
|
require 'scrubyt/core/scraping/filters/attribute_filter.rb'
|
54
|
+
require 'scrubyt/core/scraping/filters/constant_filter.rb'
|
55
|
+
require 'scrubyt/core/scraping/filters/script_filter.rb'
|
56
|
+
require 'scrubyt/core/scraping/filters/text_filter.rb'
|
48
57
|
require 'scrubyt/core/scraping/filters/detail_page_filter.rb'
|
49
58
|
require 'scrubyt/core/scraping/filters/download_filter.rb'
|
50
59
|
require 'scrubyt/core/scraping/filters/html_subtree_filter.rb'
|
51
60
|
require 'scrubyt/core/scraping/filters/regexp_filter.rb'
|
52
61
|
require 'scrubyt/core/scraping/filters/tree_filter.rb'
|
53
62
|
require 'scrubyt/core/scraping/pattern.rb'
|
54
|
-
require 'scrubyt/core/navigation/navigation_actions.rb'
|
55
|
-
require 'scrubyt/core/navigation/fetch_action.rb'
|
56
|
-
require 'scrubyt/core/shared/evaluation_context.rb'
|
57
|
-
require 'scrubyt/core/shared/u_r_i_builder.rb'
|
@@ -7,7 +7,7 @@ module Scrubyt
|
|
7
7
|
#functionality to a separate class - so if you are looking for anything
|
8
8
|
#which is loading a document (even by submitting a form or clicking a link)
|
9
9
|
#and related things like setting a proxy etc. you should find it here.
|
10
|
-
|
10
|
+
module FetchAction
|
11
11
|
|
12
12
|
@@current_doc_url = nil
|
13
13
|
@@current_doc_protocol = nil
|
@@ -30,7 +30,7 @@ module Scrubyt
|
|
30
30
|
mechanize_doc = args[0][:mechanize_doc]
|
31
31
|
resolve = args[0][:resolve]
|
32
32
|
basic_auth = args[0][:basic_auth]
|
33
|
-
user_agent = args[0][:user_agent] || "Mozilla/5.0 (
|
33
|
+
user_agent = args[0][:user_agent] || "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.4) Gecko/20061201 Firefox/2.0.0.4 (Ubuntu-feisty)"
|
34
34
|
#Refactor this whole stuff as well!!! It looks awful...
|
35
35
|
parse_and_set_proxy(proxy) if proxy
|
36
36
|
set_user_agent(user_agent)
|
@@ -120,23 +120,27 @@ module Scrubyt
|
|
120
120
|
@@hpricot_doc
|
121
121
|
end
|
122
122
|
|
123
|
-
def
|
123
|
+
def get_host_name
|
124
124
|
@@host_name
|
125
125
|
end
|
126
126
|
|
127
|
-
def
|
127
|
+
def restore_host_name
|
128
128
|
return if @@current_doc_protocol == 'file'
|
129
129
|
@@host_name = @@original_host_name
|
130
130
|
end
|
131
131
|
|
132
|
-
def
|
132
|
+
def store_page
|
133
133
|
@@history.push @@hpricot_doc
|
134
134
|
end
|
135
135
|
|
136
|
-
def
|
136
|
+
def restore_page
|
137
137
|
@@hpricot_doc = @@history.pop
|
138
138
|
end
|
139
139
|
|
140
|
+
def store_host_name(doc_url)
|
141
|
+
FetchAction.store_host_name(doc_url)
|
142
|
+
end
|
143
|
+
|
140
144
|
def self.store_host_name(doc_url)
|
141
145
|
@@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
|
142
146
|
@@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
|
@@ -171,8 +175,7 @@ module Scrubyt
|
|
171
175
|
@@port = parts.delete_at(-1)
|
172
176
|
@@host = parts.join(':')
|
173
177
|
if (@@host == nil || @@port == nil)# !@@host =~ /^http/)
|
174
|
-
|
175
|
-
puts "neither host nor port can be nil!"
|
178
|
+
Scrubyt.log :ERROR, "Invalid proxy specification! Neither host nor port can be nil!"
|
176
179
|
exit
|
177
180
|
end
|
178
181
|
end
|
@@ -227,6 +230,6 @@ module Scrubyt
|
|
227
230
|
#custom resolving
|
228
231
|
@@current_doc_url = resolve + doc_url
|
229
232
|
end
|
230
|
-
end
|
231
|
-
end
|
232
|
-
end
|
233
|
+
end
|
234
|
+
end
|
235
|
+
end
|
@@ -5,25 +5,15 @@ module Scrubyt
|
|
5
5
|
#This class contains all the actions that are used to navigate on web pages;
|
6
6
|
#first of all, *fetch* for downloading the pages - then various actions
|
7
7
|
#like filling textfields, submitting forms, clicking links and more
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
'click_link',
|
16
|
-
'click_image_map',
|
17
|
-
'select_option',
|
18
|
-
'check_checkbox',
|
19
|
-
'check_radiobutton',
|
20
|
-
'end']
|
21
|
-
|
22
|
-
def initialize
|
23
|
-
@@current_form = nil
|
24
|
-
FetchAction.new
|
8
|
+
module NavigationActions
|
9
|
+
|
10
|
+
def self.extend_object(obj)
|
11
|
+
super(obj)
|
12
|
+
obj.instance_eval do
|
13
|
+
@current_form = nil
|
14
|
+
end
|
25
15
|
end
|
26
|
-
|
16
|
+
|
27
17
|
##
|
28
18
|
#Action to fill a textfield with a query string
|
29
19
|
#
|
@@ -33,90 +23,76 @@ module Scrubyt
|
|
33
23
|
#textfield is 'q'
|
34
24
|
#
|
35
25
|
#_query_string_ - the string that should be entered into the textfield
|
36
|
-
def
|
26
|
+
def fill_textfield(textfield_name, query_string)
|
37
27
|
lookup_form_for_tag('input','textfield',textfield_name,query_string)
|
38
|
-
eval("
|
28
|
+
eval("@current_form['#{textfield_name}'] = '#{query_string}'")
|
39
29
|
end
|
40
|
-
|
30
|
+
|
41
31
|
##
|
42
32
|
#Action to fill a textarea with text
|
43
|
-
def
|
33
|
+
def fill_textarea(textarea_name, text)
|
44
34
|
lookup_form_for_tag('textarea','textarea',textarea_name,text)
|
45
|
-
eval("
|
35
|
+
eval("@current_form['#{textarea_name}'] = '#{text}'")
|
46
36
|
end
|
47
|
-
|
37
|
+
|
48
38
|
##
|
49
39
|
#Action for selecting an option from a dropdown box
|
50
|
-
def
|
40
|
+
def select_option(selectlist_name, option)
|
51
41
|
lookup_form_for_tag('select','select list',selectlist_name,option)
|
52
|
-
select_list =
|
42
|
+
select_list = @current_form.fields.find {|f| f.name == selectlist_name}
|
53
43
|
searched_option = select_list.options.find{|f| f.text.strip == option}
|
54
44
|
searched_option.click
|
55
45
|
end
|
56
|
-
|
57
|
-
def
|
46
|
+
|
47
|
+
def check_checkbox(checkbox_name)
|
58
48
|
lookup_form_for_tag('input','checkbox',checkbox_name, '')
|
59
|
-
|
49
|
+
@current_form.checkboxes.name(checkbox_name).check
|
60
50
|
end
|
61
|
-
|
62
|
-
def
|
51
|
+
|
52
|
+
def check_radiobutton(checkbox_name, index=0)
|
63
53
|
lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
|
64
|
-
|
54
|
+
@current_form.radiobuttons.name(checkbox_name)[index].check
|
65
55
|
end
|
66
|
-
|
56
|
+
|
67
57
|
##
|
68
58
|
#Fetch the document
|
69
|
-
def
|
59
|
+
def fetch(*args)
|
70
60
|
FetchAction.fetch(*args)
|
71
61
|
end
|
72
62
|
##
|
73
|
-
#Submit the current form
|
74
|
-
def
|
63
|
+
#Submit the current form
|
64
|
+
def submit(index=nil, type=nil)
|
75
65
|
if index == nil
|
76
|
-
FetchAction.submit(
|
77
|
-
|
66
|
+
FetchAction.submit(@current_form)
|
67
|
+
#----- added by nickmerwin@gmail.com -----
|
78
68
|
elsif index.class == String
|
79
|
-
button =
|
80
|
-
FetchAction.submit(
|
81
|
-
|
69
|
+
button = @current_form.buttons.detect{|b| b.name == index}
|
70
|
+
FetchAction.submit(@current_form, button,type)
|
71
|
+
#-----------------------------------------
|
82
72
|
else
|
83
|
-
FetchAction.submit(
|
73
|
+
FetchAction.submit(@current_form, @current_form.buttons[index])
|
84
74
|
end
|
85
75
|
end
|
86
|
-
|
76
|
+
|
87
77
|
##
|
88
|
-
#Click the link specified by the text
|
89
|
-
def
|
78
|
+
#Click the link specified by the text
|
79
|
+
def click_link(link_spec,index=0)
|
90
80
|
FetchAction.click_link(link_spec,index)
|
91
81
|
end
|
92
|
-
|
93
|
-
def
|
82
|
+
|
83
|
+
def click_image_map(index=0)
|
94
84
|
FetchAction.click_image_map(index)
|
95
85
|
end
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
end
|
100
|
-
|
101
|
-
def self.get_current_doc_url
|
102
|
-
FetchAction.get_current_doc_url
|
103
|
-
end
|
104
|
-
|
105
|
-
def self.get_host_name
|
106
|
-
FetchAction.get_host_name
|
107
|
-
end
|
108
|
-
|
109
|
-
private
|
110
|
-
def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
|
86
|
+
|
87
|
+
private
|
88
|
+
def lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
|
111
89
|
Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
|
112
90
|
widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
|
113
|
-
p widget
|
114
91
|
form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
|
115
|
-
p form_tag
|
116
92
|
find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
|
117
93
|
end
|
118
94
|
|
119
|
-
def
|
95
|
+
def find_form_based_on_tag(tag, possible_attrs)
|
120
96
|
lookup_attribute_name = nil
|
121
97
|
lookup_attribute_value = nil
|
122
98
|
|
@@ -127,12 +103,11 @@ private
|
|
127
103
|
}
|
128
104
|
i = 0
|
129
105
|
loop do
|
130
|
-
|
131
|
-
return nil if
|
132
|
-
|
133
|
-
break if @@current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
|
106
|
+
@current_form = FetchAction.get_mechanize_doc.forms[i]
|
107
|
+
return nil if @current_form == nil
|
108
|
+
break if @current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
|
134
109
|
i+= 1
|
135
110
|
end
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
@@ -66,12 +66,13 @@ module Scrubyt
|
|
66
66
|
#should not be called directly
|
67
67
|
|
68
68
|
#TODO still used?
|
69
|
+
alias_method :throw_method_missing, :method_missing
|
69
70
|
def method_missing(method_name, *args, &block)
|
70
71
|
case method_name.to_s
|
71
72
|
when /^ensure.+/
|
72
73
|
constraints << Constraint.send("add_#{method_name.to_s}".to_sym, self, *args)
|
73
74
|
else
|
74
|
-
|
75
|
+
throw_method_missing(method_name, *args, &block)
|
75
76
|
end
|
76
77
|
end
|
77
78
|
|
@@ -82,7 +83,7 @@ module Scrubyt
|
|
82
83
|
private
|
83
84
|
#We don't want this to be accessible from outside
|
84
85
|
def initialize(parent_pattern, example)
|
85
|
-
@example_type = BaseFilter.determine_example_type(example)
|
86
|
+
@example_type = @parent_pattern.example_type ? @parent_pattern.example_type : BaseFilter.determine_example_type(example)
|
86
87
|
@parent_pattern = parent_pattern
|
87
88
|
@example = example
|
88
89
|
@xpath = nil #The xpath to evaluate this filter
|
@@ -2,13 +2,36 @@ module Scrubyt
|
|
2
2
|
class DetailPageFilter < BaseFilter
|
3
3
|
|
4
4
|
def evaluate(source)
|
5
|
-
if source.is_a?
|
6
|
-
|
5
|
+
if source.is_a?(String)
|
6
|
+
url = source
|
7
7
|
else
|
8
|
-
|
9
|
-
XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href'],
|
10
|
-
@parent_pattern, @parent_pattern.resolve)
|
8
|
+
url = XPathUtils.find_nearest_node_with_attribute(source, 'href').attributes['href']
|
11
9
|
end
|
12
|
-
|
13
|
-
|
14
|
-
|
10
|
+
|
11
|
+
@parent_pattern.extractor.store_page
|
12
|
+
original_host_name = @parent_pattern.extractor.get_host_name
|
13
|
+
@parent_pattern.extractor.restore_host_name
|
14
|
+
|
15
|
+
FetchAction.fetch url, :resolve => @parent_pattern.resolve
|
16
|
+
|
17
|
+
if @detail_extractor.nil?
|
18
|
+
@detail_extractor = Extractor.new @parent_pattern.extractor.mode, @parent_pattern.referenced_extractor
|
19
|
+
root_results = @detail_extractor.result
|
20
|
+
else
|
21
|
+
root_results = @detail_extractor.evaluate_extractor
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
@parent_pattern.extractor.restore_page
|
27
|
+
@parent_pattern.extractor.store_host_name original_host_name
|
28
|
+
|
29
|
+
root_results
|
30
|
+
end
|
31
|
+
|
32
|
+
def get_detail_sexp
|
33
|
+
[:block, *@detail_extractor.result.root_patterns.to_sexp_array]
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
end
|