scrubyt 0.4.1 → 0.4.05
This diff shows the changes between two publicly released versions of this package, exactly as those versions appear in the public registry they were published to. It is provided for informational purposes only.
- data/CHANGELOG +3 -12
- data/Rakefile +6 -6
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +5 -36
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +5 -10
- data/lib/scrubyt/core/navigation/navigation_actions.rb +5 -17
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +3 -0
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +3 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +4 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +4 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +3 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +4 -0
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +3 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +3 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +8 -0
- data/lib/scrubyt/core/scraping/pattern.rb +26 -1
- data/lib/scrubyt/core/scraping/result_indexer.rb +4 -0
- metadata +3 -3
data/CHANGELOG
CHANGED
@@ -1,20 +1,15 @@
|
|
1
1
|
= scRUBYt! Changelog
|
2
2
|
|
3
3
|
== 0.4.05
|
4
|
-
==
|
4
|
+
== 14th November
|
5
5
|
|
6
6
|
=<tt>changes:</tt>
|
7
|
-
- [NEW] possibility to use FireWatir as the agent for scraping (credit:
|
7
|
+
- [NEW] possibility to use FireWatir as the agent for scraping (credit: Glen Gillen)
|
8
8
|
- [FIX] navigation doesn't crash if a 404/500 is returned (credit: Glen Gillen)
|
9
|
-
- [NEW] navigation
|
9
|
+
- [NEW] navigation actions: click_by_xpath, click_link_and_wait
|
10
10
|
- [MOD] dropped dependencies: RubyInline, ParseTree, Ruby2Ruby (hooray for win32 users)
|
11
|
-
- [NEW] scraping through frames (e.g. google analytics)
|
12
11
|
- [MOD] exporting temporarily doesn't work - for now, generated XPaths are printed to the screen
|
13
|
-
- [MOD] possibility to wait after clicking link/filling textfield (to be able to scrape inserted AJAX stuff)
|
14
|
-
- [NEW] possibility to fetch from a string, by specifying nil as the url and the html string with the :html option
|
15
|
-
- [FIX] firewatir slowness (credit: jak4)
|
16
12
|
- [FIX] lot of bugfixes and stability fixes
|
17
|
-
-
|
18
13
|
|
19
14
|
== 0.4.0 (unofficial)
|
20
15
|
=== 31st October, 2007
|
@@ -27,10 +22,6 @@
|
|
27
22
|
== 0.3.4
|
28
23
|
=== 26th September, 2007
|
29
24
|
|
30
|
-
=<tt>changes:</tt>
|
31
|
-
It seems I have been too busy to update the changelog ;)
|
32
|
-
|
33
|
-
|
34
25
|
== 0.3.1
|
35
26
|
=== 29th May, 2007
|
36
27
|
|
data/Rakefile
CHANGED
@@ -17,7 +17,7 @@ task "cleanup_readme" => ["rdoc"]
|
|
17
17
|
|
18
18
|
gem_spec = Gem::Specification.new do |s|
|
19
19
|
s.name = 'scrubyt'
|
20
|
-
s.version = '0.4.
|
20
|
+
s.version = '0.4.05'
|
21
21
|
s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
|
22
22
|
s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
|
23
23
|
# Files containing Test::Unit test cases.
|
@@ -94,8 +94,8 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
|
|
94
94
|
pkg.need_tar = false
|
95
95
|
end
|
96
96
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
97
|
+
Rake::PackageTask.new('scrubyt-examples', gem_spec.version) do |pkg|
|
98
|
+
pkg.need_zip = true
|
99
|
+
pkg.need_tar = true
|
100
|
+
pkg.package_files.include("examples/**/*")
|
101
|
+
end
|
@@ -21,7 +21,6 @@ module Scrubyt
|
|
21
21
|
@@host_name = nil
|
22
22
|
@@history = []
|
23
23
|
@@current_form = nil
|
24
|
-
@@current_frame = nil
|
25
24
|
|
26
25
|
##
|
27
26
|
#Action to fetch a document (either a file or a http address)
|
@@ -59,33 +58,12 @@ module Scrubyt
|
|
59
58
|
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
|
60
59
|
store_host_name(@@agent.url) # in case we're on a new host
|
61
60
|
end
|
62
|
-
|
63
|
-
def self.frame(attribute, value)
|
64
|
-
if @@current_frame
|
65
|
-
@@current_frame.frame(attribute, value)
|
66
|
-
else
|
67
|
-
@@current_frame = @@agent.frame(attribute, value)
|
68
|
-
end
|
69
|
-
end
|
70
61
|
|
71
62
|
##
|
72
63
|
#Submit the last form;
|
73
|
-
def self.submit(current_form,
|
74
|
-
|
75
|
-
|
76
|
-
#this sucks FAIL omg
|
77
|
-
@@current_frame.locate
|
78
|
-
form = Document.new(@@current_frame).all.find{|t| t.tagName=="FORM"}
|
79
|
-
form.submit
|
80
|
-
else
|
81
|
-
@@agent.element_by_xpath(@@current_form).submit
|
82
|
-
end
|
83
|
-
|
84
|
-
if sleep_time
|
85
|
-
sleep sleep_time
|
86
|
-
@@agent.wait
|
87
|
-
end
|
88
|
-
|
64
|
+
def self.submit(current_form, button=nil, type=nil)
|
65
|
+
@@agent.element_by_xpath(@@current_form).submit
|
66
|
+
@@agent.wait
|
89
67
|
@@current_doc_url = @@agent.url
|
90
68
|
@@mechanize_doc = "<html>#{@@agent.html}</html>"
|
91
69
|
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
|
@@ -196,18 +174,9 @@ module Scrubyt
|
|
196
174
|
end
|
197
175
|
end
|
198
176
|
|
199
|
-
def self.fill_textfield(textfield_name, query_string
|
177
|
+
def self.fill_textfield(textfield_name, query_string)
|
200
178
|
@@current_form = "//input[@name='#{textfield_name}']/ancestor::form"
|
201
|
-
|
202
|
-
if useValue
|
203
|
-
target.text_field(:name,textfield_name).value = query_string
|
204
|
-
else
|
205
|
-
target.text_field(:name,textfield_name).set(query_string)
|
206
|
-
end
|
207
|
-
sleep(wait_secs) if wait_secs > 0
|
208
|
-
@@mechanize_doc = "<html>#{@@agent.html}</html>"
|
209
|
-
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
|
210
|
-
|
179
|
+
@@agent.text_field(:name,textfield_name).set(query_string)
|
211
180
|
end
|
212
181
|
|
213
182
|
##
|
@@ -32,21 +32,16 @@ module Scrubyt
|
|
32
32
|
|
33
33
|
if args.size > 0
|
34
34
|
mechanize_doc = args[0][:mechanize_doc]
|
35
|
-
html = args[0][:html]
|
36
35
|
resolve = args[0][:resolve]
|
37
36
|
basic_auth = args[0][:basic_auth]
|
38
37
|
parse_and_set_basic_auth(basic_auth) if basic_auth
|
39
|
-
if html
|
40
|
-
@@current_doc_protocol = 'string'
|
41
|
-
mechanize_doc = page = WWW::Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html)
|
42
|
-
end
|
43
38
|
else
|
44
39
|
mechanize_doc = nil
|
45
40
|
resolve = :full
|
46
41
|
end
|
47
42
|
|
48
43
|
@@current_doc_url = doc_url
|
49
|
-
@@current_doc_protocol
|
44
|
+
@@current_doc_protocol = determine_protocol
|
50
45
|
|
51
46
|
if mechanize_doc.nil? && @@current_doc_protocol != 'file'
|
52
47
|
handle_relative_path(doc_url)
|
@@ -64,13 +59,13 @@ module Scrubyt
|
|
64
59
|
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
|
65
60
|
else
|
66
61
|
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
|
67
|
-
store_host_name(self.get_current_doc_url)
|
62
|
+
store_host_name(self.get_current_doc_url) # in case we're on a new host
|
68
63
|
end
|
69
64
|
end
|
70
65
|
|
71
66
|
##
|
72
67
|
#Submit the last form;
|
73
|
-
def self.submit(index=nil,
|
68
|
+
def self.submit(index=nil, type=nil)
|
74
69
|
Scrubyt.log :ACTION, 'Submitting form...'
|
75
70
|
if index == nil
|
76
71
|
result_page = @@agent.submit(@@current_form)
|
@@ -91,7 +86,7 @@ module Scrubyt
|
|
91
86
|
|
92
87
|
##
|
93
88
|
#Click the link specified by the text
|
94
|
-
def self.click_link(link_spec,index = 0
|
89
|
+
def self.click_link(link_spec,index = 0)
|
95
90
|
Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
|
96
91
|
if link_spec.is_a? Hash
|
97
92
|
clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
|
@@ -177,7 +172,7 @@ module Scrubyt
|
|
177
172
|
end
|
178
173
|
end
|
179
174
|
|
180
|
-
def self.fill_textfield(textfield_name, query_string
|
175
|
+
def self.fill_textfield(textfield_name, query_string)
|
181
176
|
lookup_form_for_tag('input','textfield',textfield_name,query_string)
|
182
177
|
eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
|
183
178
|
end
|
@@ -23,12 +23,8 @@ module Scrubyt
|
|
23
23
|
#textfield is 'q'
|
24
24
|
#
|
25
25
|
#_query_string_ - the string that should be entered into the textfield
|
26
|
-
def fill_textfield(textfield_name, query_string
|
27
|
-
FetchAction.fill_textfield(textfield_name, query_string
|
28
|
-
end
|
29
|
-
|
30
|
-
def fill_textfield_and_wait(textfield_name, query_string, sleep_secs=0, use_value=nil)
|
31
|
-
FetchAction.fill_textfield(textfield_name, query_string, sleep_secs, use_value)
|
26
|
+
def fill_textfield(textfield_name, query_string)
|
27
|
+
FetchAction.fill_textfield(textfield_name, query_string)
|
32
28
|
end
|
33
29
|
|
34
30
|
##
|
@@ -59,17 +55,13 @@ module Scrubyt
|
|
59
55
|
##
|
60
56
|
#Submit the current form
|
61
57
|
def submit(index=nil, type=nil)
|
62
|
-
FetchAction.submit(
|
63
|
-
end
|
64
|
-
|
65
|
-
def submit_and_wait(sleep_time, index=nil, type=nil)
|
66
|
-
FetchAction.submit(index, sleep_time, type)
|
58
|
+
FetchAction.submit(index, type)
|
67
59
|
end
|
68
60
|
|
69
61
|
##
|
70
62
|
#Click the link specified by the text
|
71
|
-
def click_link(link_spec,index=0)
|
72
|
-
FetchAction.click_link(link_spec,index,
|
63
|
+
def click_link(link_spec,index=0, sleep_secs=0)
|
64
|
+
FetchAction.click_link(link_spec,index, sleep_secs)
|
73
65
|
end
|
74
66
|
|
75
67
|
def click_link_and_wait(link_spec, sleep_secs=0)
|
@@ -84,10 +76,6 @@ module Scrubyt
|
|
84
76
|
FetchAction.click_image_map(index)
|
85
77
|
end
|
86
78
|
|
87
|
-
def frame(attribute,value)
|
88
|
-
FetchAction.frame(attribute,value)
|
89
|
-
end
|
90
|
-
|
91
79
|
def wait(time=1)
|
92
80
|
FetchAction.wait(time)
|
93
81
|
end
|
@@ -134,5 +134,13 @@ module Scrubyt
|
|
134
134
|
@xpath = XPathUtils.generate_relative_XPath_from_XPaths(parent_xpath, @xpath) if (@xpath =~ /^\/html/) #TODO: should not rely on <html> being the root node
|
135
135
|
end
|
136
136
|
|
137
|
+
def to_sexp
|
138
|
+
if @example =~ /.+\[@.+\]$/
|
139
|
+
[:str, "#{@xpath}/@#{@example.scan(/\[@(.+?)\]/)[0][0]}"]
|
140
|
+
else
|
141
|
+
[:str, @xpath]
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
137
145
|
end #End of class TreeFilter
|
138
146
|
end #End of module Scrubyt
|
@@ -78,7 +78,6 @@ module Scrubyt
|
|
78
78
|
|
79
79
|
#grab any examples that are defined
|
80
80
|
examples = look_for_examples(args)
|
81
|
-
|
82
81
|
#parse the options hash if provided
|
83
82
|
parse_options_hash(args[-1]) if args[-1].is_a? Hash
|
84
83
|
|
@@ -309,6 +308,32 @@ module Scrubyt
|
|
309
308
|
end
|
310
309
|
end
|
311
310
|
|
311
|
+
def to_sexp
|
312
|
+
#collect arguments
|
313
|
+
args = []
|
314
|
+
args.push(*@filters.to_sexp_array) if type != :detail_page #TODO: this if shouldn't be there
|
315
|
+
args.push(@options.to_sexp) if !@options.empty?
|
316
|
+
|
317
|
+
#build main call
|
318
|
+
sexp = [:fcall, @name, [:array, *args]]
|
319
|
+
|
320
|
+
if type == :detail_page
|
321
|
+
#add detail page extractor
|
322
|
+
sexp = [:iter, sexp, nil, @filters[0].get_detail_sexp]
|
323
|
+
else
|
324
|
+
#add child block if the pattern has children
|
325
|
+
sexp = [:iter, sexp, nil, [:block, *@children.to_sexp_array ]] if !@children.empty?
|
326
|
+
end
|
327
|
+
|
328
|
+
#add modifier calls - TODO: remove when everything is exported to the options hash
|
329
|
+
@modifier_calls.each do |modifier_sexp|
|
330
|
+
sexp = [:call, sexp, *modifier_sexp]
|
331
|
+
end
|
332
|
+
|
333
|
+
#return complete sexp
|
334
|
+
sexp
|
335
|
+
end
|
336
|
+
|
312
337
|
private
|
313
338
|
def parse_options_hash(hash)
|
314
339
|
#merge provided hash
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrubyt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.
|
4
|
+
version: 0.4.05
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Szinek
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-
|
12
|
+
date: 2008-11-15 00:00:00 +01:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -100,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
100
100
|
requirements: []
|
101
101
|
|
102
102
|
rubyforge_project:
|
103
|
-
rubygems_version: 1.
|
103
|
+
rubygems_version: 1.2.0
|
104
104
|
signing_key:
|
105
105
|
specification_version: 2
|
106
106
|
summary: A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)
|