pba_crawler 0.0.8 → 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/pba_crawler/generic.rb +73 -12
- data/lib/pba_crawler/version.rb +1 -1
- metadata +2 -2
data/lib/pba_crawler/generic.rb
CHANGED
@@ -6,34 +6,64 @@ module PbaCrawler
|
|
6
6
|
@@cursor = 0
|
7
7
|
@@proxy = nil
|
8
8
|
|
9
|
+
@@proxy_list_reload = false
|
10
|
+
|
11
|
+
|
9
12
|
def initialize
|
10
13
|
#load proxy list
|
11
14
|
@@proxies = Proxy.order(:position)
|
12
15
|
end
|
13
16
|
|
14
|
-
def crawl_url(url)
|
15
|
-
|
17
|
+
def crawl_url(url,use_proxy=true)
|
18
|
+
|
19
|
+
if use_proxy.eql?(true)
|
20
|
+
|
21
|
+
unless @@proxy_list_reload
|
22
|
+
load_proxy_list
|
23
|
+
end
|
16
24
|
|
17
|
-
p "#{url}|#{@@proxy.address}|#{@@proxy.port}"
|
18
|
-
|
19
|
-
agent = Mechanize.new { |a|
|
20
|
-
a.html_parser = Nokogiri::HTML
|
21
|
-
a.open_timeout = 20
|
22
|
-
a.read_timeout = 20
|
23
|
-
}
|
24
25
|
|
25
|
-
|
26
|
-
agent.user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; nl; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13'
|
26
|
+
@@proxy = @@proxies.fetch(@@cursor)
|
27
27
|
|
28
|
-
|
28
|
+
p "#{url}|#{@@proxy.address}|#{@@proxy.port}"
|
29
|
+
|
30
|
+
agent = Mechanize.new { |a|
|
31
|
+
a.html_parser = Nokogiri::HTML
|
32
|
+
a.open_timeout = 20
|
33
|
+
a.read_timeout = 20
|
34
|
+
}
|
35
|
+
|
36
|
+
agent.set_proxy(@@proxy.address,@@proxy.port)
|
37
|
+
agent.user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; nl; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13'
|
38
|
+
|
39
|
+
get_url(agent,url)
|
40
|
+
else
|
41
|
+
agent = Mechanize.new { |a|
|
42
|
+
a.html_parser = Nokogiri::HTML
|
43
|
+
a.open_timeout = 20
|
44
|
+
a.read_timeout = 20
|
45
|
+
}
|
46
|
+
agent.get(url)
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
def next_proxy
|
51
|
+
if @@cursor + 1 < @@proxies.size
|
52
|
+
@@cursor = @@cursor + 1
|
53
|
+
true
|
54
|
+
else
|
55
|
+
false
|
56
|
+
end
|
29
57
|
end
|
30
58
|
|
31
59
|
def get_url(agent,url)
|
32
60
|
begin
|
33
61
|
response = agent.get(url)
|
34
62
|
if response
|
63
|
+
p "ok"
|
35
64
|
@@proxy.update_attributes(:last_request_at => Time.now)
|
36
65
|
else
|
66
|
+
p "nok"
|
37
67
|
response = rescue_get_url(agent,url)
|
38
68
|
end
|
39
69
|
rescue
|
@@ -56,6 +86,37 @@ module PbaCrawler
|
|
56
86
|
@@proxy
|
57
87
|
end
|
58
88
|
|
89
|
+
def load_proxy_list
|
90
|
+
#/proxylist.php?port=&type=&ssl=&country=&latency=1000&reliability=9000&sort=reliability&desc=true&pnum=1#table
|
91
|
+
|
92
|
+
url = "http://www.xroxy.com/proxylist.php?port=&type=&ssl=&country=&latency=1000&reliability=9000"
|
93
|
+
|
94
|
+
last_created_proxy = Proxy.order("created_at DESC").first
|
95
|
+
|
96
|
+
if last_created_proxy.nil? || ((Time.now - last_created_proxy.created_at) / 1.hour)>2
|
97
|
+
Proxy.delete_all
|
98
|
+
begin
|
99
|
+
(1..4).each do |index|
|
100
|
+
agent = crawl_url("#{url}&pnum=#{index}",false)
|
101
|
+
agent.parser.xpath("//a[@title='View this Proxy details']").each do |a|
|
102
|
+
address = a.content.strip
|
103
|
+
port = a.parent.next.next.children.first.content.strip
|
104
|
+
Proxy.create(:address=> address, :port=> port)
|
105
|
+
end
|
106
|
+
sleep 3
|
107
|
+
end
|
108
|
+
rescue
|
109
|
+
|
110
|
+
end
|
111
|
+
@@proxy_list_reload = true
|
112
|
+
else
|
113
|
+
@@proxy_list_reload = true
|
114
|
+
end
|
115
|
+
|
116
|
+
end
|
117
|
+
|
118
|
+
|
119
|
+
|
59
120
|
def load_sample
|
60
121
|
list = [
|
61
122
|
["101.0.6.87","31280"],
|
data/lib/pba_crawler/version.rb
CHANGED
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: pba_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.0.8
|
5
|
+
version: 0.0.11
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Pierre BASILE
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-
|
13
|
+
date: 2011-05-14 00:00:00 +02:00
|
14
14
|
default_executable:
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|