scrubber-scrubyt 0.4.11 → 0.4.12
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -1
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +34 -1
- metadata +1 -1
data/Rakefile
CHANGED
@@ -17,7 +17,7 @@ task "cleanup_readme" => ["rdoc"]
|
|
17
17
|
|
18
18
|
gem_spec = Gem::Specification.new do |s|
|
19
19
|
s.name = 'scrubyt'
|
20
|
-
s.version = '0.4.11'
|
20
|
+
s.version = '0.4.12'
|
21
21
|
s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
|
22
22
|
s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
|
23
23
|
# Files containing Test::Unit test cases.
|
@@ -29,13 +29,14 @@ module Scrubyt
|
|
29
29
|
#_doc_url_ - the url or file name to fetch
|
30
30
|
def self.fetch(doc_url, *args)
|
31
31
|
#Refactor this crap!!! with option_accessor stuff
|
32
|
-
|
33
32
|
if args.size > 0
|
34
33
|
mechanize_doc = args[0][:mechanize_doc]
|
35
34
|
html = args[0][:html]
|
36
35
|
resolve = args[0][:resolve]
|
37
36
|
basic_auth = args[0][:basic_auth]
|
38
37
|
parse_and_set_basic_auth(basic_auth) if basic_auth
|
38
|
+
proxy = args[0][:proxy]
|
39
|
+
parse_and_set_proxy(proxy) if proxy
|
39
40
|
if html
|
40
41
|
@@current_doc_protocol = 'string'
|
41
42
|
mechanize_doc = page = WWW::Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html)
|
@@ -122,6 +123,38 @@ module Scrubyt
|
|
122
123
|
@@original_host_name ||= @@host_name
|
123
124
|
end #end of method store_host_name
|
124
125
|
|
126
|
+
# Parses a proxy specification and configures the shared agent with it.
#
# The specification is split on ':' and interpreted as one of:
#   "host:port"
#   "user@pass:host:port"  (credentials come first, joined by '@')
#   "user:host:port"       (user name only)
#
# _proxy_ - the specification String, or a Hash holding it under :proxy
#
# Prints an error and exits the process when host or port is missing.
def self.parse_and_set_proxy(proxy)
  # Accept both call styles: indexing a String with a Symbol raises
  # TypeError, so only unwrap when an options Hash was really passed.
  proxy = proxy[:proxy] if proxy.is_a?(Hash)

  # Start from a clean slate: credentials from an earlier call must not
  # leak into this one, and reading a never-assigned class variable in
  # the log line below would raise NameError.
  @@proxy_user = nil
  @@proxy_pass = nil

  if proxy.downcase == 'localhost'
    @@host = 'localhost'
    # NOTE(review): the string contains no ':' in this branch, so the port
    # ends up being the host string itself - behaviour kept, confirm intent.
    @@port = proxy.split(':').last
  else
    parts = proxy.split(':')
    if parts.size > 2
      # Credentials precede the host: "user@pass" or just "user".
      user_pass = parts[0].split('@')
      @@proxy_user = user_pass[0]
      @@proxy_pass = user_pass[1] if user_pass.size > 1
      @@host = parts[1]
      @@port = parts[2]
    else
      @@host = parts[0]
      @@port = parts[1]
    end

    if @@host.nil? || @@port.nil?
      puts "Invalid proxy specification..."
      puts "neither host nor port can be nil!"
      exit
    end
  end

  Scrubyt.log :ACTION, "[ACTION] Setting proxy: host=<#{@@host}>, port=<#{@@port}>, username=<#{@@proxy_user}>, password=<#{@@proxy_pass}>"
  # Hand the parsed credentials to the agent as well; they default to nil
  # when the specification carried none.
  @@agent.set_proxy(@@host, @@port, @@proxy_user, @@proxy_pass)
end
|
157
|
+
|
125
158
|
def self.determine_protocol
|
126
159
|
old_protocol = @@current_doc_protocol
|
127
160
|
new_protocol = case @@current_doc_url
|