pba_crawler 0.0.8 → 0.0.11

@@ -6,34 +6,64 @@ module PbaCrawler
   @@cursor = 0
   @@proxy = nil

+  @@proxy_list_reload = false
+
+
   def initialize
     #load proxy list
     @@proxies = Proxy.order(:position)
   end

-  def crawl_url(url)
-    @@proxy = @@proxies.fetch(@@cursor)
+  def crawl_url(url,use_proxy=true)
+
+    if use_proxy.eql?(true)
+
+      unless @@proxy_list_reload
+        load_proxy_list
+      end

-    p "#{url}|#{@@proxy.address}|#{@@proxy.port}"
-
-    agent = Mechanize.new { |a|
-      a.html_parser = Nokogiri::HTML
-      a.open_timeout = 20
-      a.read_timeout = 20
-    }

-    agent.set_proxy(@@proxy.address,@@proxy.port)
-    agent.user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; nl; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13'
+      @@proxy = @@proxies.fetch(@@cursor)

-    get_url(agent,url)
+      p "#{url}|#{@@proxy.address}|#{@@proxy.port}"
+
+      agent = Mechanize.new { |a|
+        a.html_parser = Nokogiri::HTML
+        a.open_timeout = 20
+        a.read_timeout = 20
+      }
+
+      agent.set_proxy(@@proxy.address,@@proxy.port)
+      agent.user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; nl; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13'
+
+      get_url(agent,url)
+    else
+      agent = Mechanize.new { |a|
+        a.html_parser = Nokogiri::HTML
+        a.open_timeout = 20
+        a.read_timeout = 20
+      }
+      agent.get(url)
+    end
+  end
+
+  def next_proxy
+    if @@cursor + 1 < @@proxies.size
+      @@cursor = @@cursor + 1
+      true
+    else
+      false
+    end
   end

   def get_url(agent,url)
     begin
       response = agent.get(url)
       if response
+        p "ok"
         @@proxy.update_attributes(:last_request_at => Time.now)
       else
+        p "nok"
         response = rescue_get_url(agent,url)
       end
     rescue
@@ -56,6 +86,37 @@ module PbaCrawler
     @@proxy
   end

+  def load_proxy_list
+    #/proxylist.php?port=&type=&ssl=&country=&latency=1000&reliability=9000&sort=reliability&desc=true&pnum=1#table
+
+    url = "http://www.xroxy.com/proxylist.php?port=&type=&ssl=&country=&latency=1000&reliability=9000"
+
+    last_created_proxy = Proxy.order("created_at DESC").first
+
+    if last_created_proxy.nil? || ((Time.now - last_created_proxy.created_at) / 1.hour)>2
+      Proxy.delete_all
+      begin
+        (1..4).each do |index|
+          agent = crawl_url("#{url}&pnum=#{index}",false)
+          agent.parser.xpath("//a[@title='View this Proxy details']").each do |a|
+            address = a.content.strip
+            port = a.parent.next.next.children.first.content.strip
+            Proxy.create(:address=> address, :port=> port)
+          end
+          sleep 3
+        end
+      rescue
+
+      end
+      @@proxy_list_reload = true
+    else
+      @@proxy_list_reload = true
+    end
+
+  end
+
+
+
   def load_sample
     list = [
       ["101.0.6.87","31280"],
@@ -1,3 +1,3 @@
 module PbaCrawler
-  VERSION = "0.0.8"
+  VERSION = "0.0.11"
 end
metadata CHANGED
@@ -2,7 +2,7 @@
 name: pba_crawler
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.0.8
+  version: 0.0.11
 platform: ruby
 authors:
 - Pierre BASILE
@@ -10,7 +10,7 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2011-04-19 00:00:00 +02:00
+date: 2011-05-14 00:00:00 +02:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency