scrubber-scrubyt 0.4.11 → 0.4.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +34 -1
- metadata +1 -1
data/Rakefile
CHANGED
|
@@ -17,7 +17,7 @@ task "cleanup_readme" => ["rdoc"]
|
|
|
17
17
|
|
|
18
18
|
gem_spec = Gem::Specification.new do |s|
|
|
19
19
|
s.name = 'scrubyt'
|
|
20
|
-
s.version = '0.4.11'
|
|
20
|
+
s.version = '0.4.12'
|
|
21
21
|
s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
|
|
22
22
|
s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
|
|
23
23
|
# Files containing Test::Unit test cases.
|
|
data/lib/scrubyt/core/navigation/agents/mechanize.rb
CHANGED
|
@@ -29,13 +29,14 @@ module Scrubyt
|
|
|
29
29
|
#_doc_url_ - the url or file name to fetch
|
|
30
30
|
def self.fetch(doc_url, *args)
|
|
31
31
|
#Refactor this crap!!! with option_accessor stuff
|
|
32
|
-
|
|
33
32
|
if args.size > 0
|
|
34
33
|
mechanize_doc = args[0][:mechanize_doc]
|
|
35
34
|
html = args[0][:html]
|
|
36
35
|
resolve = args[0][:resolve]
|
|
37
36
|
basic_auth = args[0][:basic_auth]
|
|
38
37
|
parse_and_set_basic_auth(basic_auth) if basic_auth
|
|
38
|
+
proxy = args[0][:proxy]
|
|
39
|
+
parse_and_set_proxy(proxy) if proxy
|
|
39
40
|
if html
|
|
40
41
|
@@current_doc_protocol = 'string'
|
|
41
42
|
mechanize_doc = page = WWW::Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html)
|
|
@@ -122,6 +123,38 @@ module Scrubyt
|
|
|
122
123
|
@@original_host_name ||= @@host_name
|
|
123
124
|
end #end of method store_host_name
|
|
124
125
|
|
|
126
|
+
def self.parse_and_set_proxy(proxy)
|
|
127
|
+
proxy = proxy[:proxy]
|
|
128
|
+
if proxy.downcase == 'localhost'
|
|
129
|
+
@@host = 'localhost'
|
|
130
|
+
@@port = proxy.split(':').last
|
|
131
|
+
else
|
|
132
|
+
parts = proxy.split(':')
|
|
133
|
+
if (parts.size > 2)
|
|
134
|
+
user_pass = parts[0].split('@')
|
|
135
|
+
if (user.pass.size > 1)
|
|
136
|
+
@@proxy_user = user_pass[0]
|
|
137
|
+
@@proxy_pass = user_pass[1]
|
|
138
|
+
else
|
|
139
|
+
@@proxy_user = user_pass
|
|
140
|
+
end
|
|
141
|
+
@@host = parts[1]
|
|
142
|
+
@@port = parts[2]
|
|
143
|
+
else
|
|
144
|
+
@@host = parts[0]
|
|
145
|
+
@@port = parts[1]
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
if (@@host == nil || @@port == nil)# !@@host =~ /^http/)
|
|
149
|
+
puts "Invalid proxy specification..."
|
|
150
|
+
puts "neither host nor port can be nil!"
|
|
151
|
+
exit
|
|
152
|
+
end
|
|
153
|
+
end
|
|
154
|
+
Scrubyt.log :ACTION, "[ACTION] Setting proxy: host=<#{@@host}>, port=<#{@@port}>, username=<#{@@proxy_user}, password=<#{@@proxy_pass}>"
|
|
155
|
+
@@agent.set_proxy(@@host, @@port)
|
|
156
|
+
end
|
|
157
|
+
|
|
125
158
|
def self.determine_protocol
|
|
126
159
|
old_protocol = @@current_doc_protocol
|
|
127
160
|
new_protocol = case @@current_doc_url
|