pba_crawler 0.0.8 → 0.0.11

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -6,34 +6,64 @@ module PbaCrawler
6
6
  @@cursor = 0
7
7
  @@proxy = nil
8
8
 
9
+ @@proxy_list_reload = false
10
+
11
+
9
12
  def initialize
10
13
  #load proxy list
11
14
  @@proxies = Proxy.order(:position)
12
15
  end
13
16
 
14
- def crawl_url(url)
15
- @@proxy = @@proxies.fetch(@@cursor)
17
+ def crawl_url(url,use_proxy=true)
18
+
19
+ if use_proxy.eql?(true)
20
+
21
+ unless @@proxy_list_reload
22
+ load_proxy_list
23
+ end
16
24
 
17
- p "#{url}|#{@@proxy.address}|#{@@proxy.port}"
18
-
19
- agent = Mechanize.new { |a|
20
- a.html_parser = Nokogiri::HTML
21
- a.open_timeout = 20
22
- a.read_timeout = 20
23
- }
24
25
 
25
- agent.set_proxy(@@proxy.address,@@proxy.port)
26
- agent.user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; nl; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13'
26
+ @@proxy = @@proxies.fetch(@@cursor)
27
27
 
28
- get_url(agent,url)
28
+ p "#{url}|#{@@proxy.address}|#{@@proxy.port}"
29
+
30
+ agent = Mechanize.new { |a|
31
+ a.html_parser = Nokogiri::HTML
32
+ a.open_timeout = 20
33
+ a.read_timeout = 20
34
+ }
35
+
36
+ agent.set_proxy(@@proxy.address,@@proxy.port)
37
+ agent.user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; nl; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13'
38
+
39
+ get_url(agent,url)
40
+ else
41
+ agent = Mechanize.new { |a|
42
+ a.html_parser = Nokogiri::HTML
43
+ a.open_timeout = 20
44
+ a.read_timeout = 20
45
+ }
46
+ agent.get(url)
47
+ end
48
+ end
49
+
50
+ def next_proxy
51
+ if @@cursor + 1 < @@proxies.size
52
+ @@cursor = @@cursor + 1
53
+ true
54
+ else
55
+ false
56
+ end
29
57
  end
30
58
 
31
59
  def get_url(agent,url)
32
60
  begin
33
61
  response = agent.get(url)
34
62
  if response
63
+ p "ok"
35
64
  @@proxy.update_attributes(:last_request_at => Time.now)
36
65
  else
66
+ p "nok"
37
67
  response = rescue_get_url(agent,url)
38
68
  end
39
69
  rescue
@@ -56,6 +86,37 @@ module PbaCrawler
56
86
  @@proxy
57
87
  end
58
88
 
89
+ def load_proxy_list
90
+ #/proxylist.php?port=&type=&ssl=&country=&latency=1000&reliability=9000&sort=reliability&desc=true&pnum=1#table
91
+
92
+ url = "http://www.xroxy.com/proxylist.php?port=&type=&ssl=&country=&latency=1000&reliability=9000"
93
+
94
+ last_created_proxy = Proxy.order("created_at DESC").first
95
+
96
+ if last_created_proxy.nil? || ((Time.now - last_created_proxy.created_at) / 1.hour)>2
97
+ Proxy.delete_all
98
+ begin
99
+ (1..4).each do |index|
100
+ agent = crawl_url("#{url}&pnum=#{index}",false)
101
+ agent.parser.xpath("//a[@title='View this Proxy details']").each do |a|
102
+ address = a.content.strip
103
+ port = a.parent.next.next.children.first.content.strip
104
+ Proxy.create(:address=> address, :port=> port)
105
+ end
106
+ sleep 3
107
+ end
108
+ rescue
109
+
110
+ end
111
+ @@proxy_list_reload = true
112
+ else
113
+ @@proxy_list_reload = true
114
+ end
115
+
116
+ end
117
+
118
+
119
+
59
120
  def load_sample
60
121
  list = [
61
122
  ["101.0.6.87","31280"],
@@ -1,3 +1,3 @@
1
1
  module PbaCrawler
2
- VERSION = "0.0.8"
2
+ VERSION = "0.0.11"
3
3
  end
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: pba_crawler
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.0.8
5
+ version: 0.0.11
6
6
  platform: ruby
7
7
  authors:
8
8
  - Pierre BASILE
@@ -10,7 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-04-19 00:00:00 +02:00
13
+ date: 2011-05-14 00:00:00 +02:00
14
14
  default_executable:
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency