scraptory 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +7 -0
  2. data/lib/proxy.rb +48 -0
  3. data/lib/scraptory.rb +184 -0
  4. metadata +87 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a7ecdad23a953fc93146eb88a01ea5216d47dfac
4
+ data.tar.gz: 2289f29c61f00c6db5abf6fef5a71d7d9c0b3fa5
5
+ SHA512:
6
+ metadata.gz: de07b7725d9b99217e0c9896649579f88276b504c60f51deb79f7ddb320420b1de5971e704fcacca5fbe292d13579fe04141f1bde3aa38b9fb3e92b971762cbe
7
+ data.tar.gz: 052389b0c605fce0ba5fc4bdee75cad29d3f6fd8f6be2d9a5ba7a9114f0a22851773f876fd5173f21561fda842810faaf7ad8e67fc2ce810e212f73070f02021
@@ -0,0 +1,48 @@
1
+ # encoding: UTF-8
2
+ require "tor"
3
+ require "typhoeus"
4
+ require "useragents"
5
+
6
# Describes one outgoing proxy (optionally a Tor node) that requests can be
# routed through.
#
# Option keys are symbols:
#   options[:type]     proxy type handed to Typhoeus as :proxytype (default "http")
#   options[:timeout]  request timeout, coerced with to_i (default 10)
#   options[:tor]      truthy when the proxy is a Tor node that accepts NEWNYM
#
# Credential keys (only read by change_ip):
#   credentials[:telnet_host], credentials[:telnet_port], credentials[:telnet_passwd]
class Proxy
  # Timeout used when none (or a non-positive one) is supplied.
  DEFAULT_TIMEOUT = 10

  attr_reader :type
  attr_reader :timeout

  def initialize(host="localhost",port=8080,options={},credentials={})
    # Tolerate an explicit nil for either hash.
    options ||= {}
    credentials ||= {}

    @host = host
    @port = port
    @type = options[:type] || "http"

    # nil.to_i is 0 and 0 is truthy in Ruby, so `to_i || 10` can never fall
    # back to the default. Compare explicitly instead; values below 1 are
    # treated as "not set", matching how Scraptory validates its own timeout.
    requested_timeout = options[:timeout].to_i
    @timeout = requested_timeout > 0 ? requested_timeout : DEFAULT_TIMEOUT

    @credentials = credentials
    @config = options
  end

  # Returns the proxy address as "host:port".
  def url
    "#{@host}:#{@port}"
  end

  # Sends the NEWNYM signal through the Tor control connection.
  # Does nothing (and returns nil) unless the proxy was created with
  # options[:tor] set.
  def change_ip
    return unless @config[:tor]

    # The controller is created once and reused across calls.
    @node ||= Tor::Controller.new(:host => @credentials[:telnet_host],
                                  :port => @credentials[:telnet_port])
    @node.authenticate(@credentials[:telnet_passwd]) unless @node.authenticated?

    @node.signal("NEWNYM")
  end

  # Fetches http://checkip.amazonaws.com/ through this proxy and returns the
  # response body with newlines and surrounding whitespace removed.
  # Returns an empty string when the response carries no body.
  def get_ip
    Typhoeus::Config.user_agent = UserAgents.rand()

    response = Typhoeus::Request.new("http://checkip.amazonaws.com/",
                                     timeout: @timeout,
                                     proxy: url,
                                     proxytype: @type).run

    # to_s guards against a nil body on a failed request.
    response.response_body.to_s.delete("\n").strip
  end
end
@@ -0,0 +1,184 @@
1
+ # encoding: UTF-8
2
+
3
+ # Requests
4
+ require 'typhoeus'
5
+ require 'useragents'
6
+ # For file/dir manipulation
7
+ require 'fileutils'
8
+ # Logs
9
+ require 'logger'
10
+ # For Tor
11
+ require 'net/telnet'
12
+ require_relative "proxy"
13
+
14
# Queues URLs on a Typhoeus::Hydra and hands every successful response to a
# callback. Failed requests are logged, counted and optionally retried; once
# too many failures have accumulated the active proxy is asked for a new IP
# and the next proxy in the list is selected.
#
# Configuration keys are strings:
#   "debug"               enable logging to STDOUT (default false)
#   "debug_file"          log to this file instead; implies "debug"
#   "nthreads"            Hydra max_concurrency, must be an Integer (default 1)
#   "timeout"             timeout for direct requests, values < 1 are replaced (default 10)
#   "retry_on_error"      re-queue retryable failures (default false)
#   "use_clearconnection" include the direct connection in the rotation (default false)
#   "err_before_chg_ip"   failures tolerated before rotating, values < 1 are replaced (default 100)
class Scraptory
  DEFAULT_HYDRA_TIMEOUT = 10
  DEFAULT_NTHREADS = 1
  DEFAULT_ERR_BEFORE_CHG_IP = 100
  # Levels accepted by _debug; anything else is logged as a warning.
  LOG_LEVELS = %w[warn info debug error].freeze

  attr_reader :config

  def initialize(config={})
    @proxies = []
    # -1 designates the direct (proxy-less) connection.
    @proxy_cursor = -1
    # Kept per instance: proxies are per instance, so failures of one scraper
    # must not trigger a rotation in another one.
    @count_connect_errors = 0

    set_config(config)
  end

  # Validates +config+, fills in defaults (the hash passed in is modified in
  # place) and stores it. A new Hydra and logger are built on every call.
  def set_config(config={})
    config["debug"] = false unless config.has_key?("debug")

    # A debug file always wins and always enables logging. Logger creates the
    # file when it is missing and appends when it already exists, so logging
    # no longer silently stops on the second run with the same file.
    if config["debug_file"]
      config["debug"] = true
      @logger = Logger.new(config["debug_file"])
    elsif config["debug"]
      @logger = Logger.new(STDOUT)
    else
      @logger = nil
    end

    # Only an Integer is accepted for nthreads; anything else means default.
    nthreads = config["nthreads"].is_a?(Integer) ? config["nthreads"] : DEFAULT_NTHREADS
    @hydra = Typhoeus::Hydra.new(max_concurrency: nthreads)

    # nil.to_i is 0, so a missing key is covered by the same comparison.
    config["timeout"] = DEFAULT_HYDRA_TIMEOUT if config["timeout"].to_i < 1

    config["retry_on_error"] = false unless config.has_key?("retry_on_error")

    # Switch between proxies and clear connection
    config["use_clearconnection"] = false if config["use_clearconnection"].nil?

    # The previous check was inverted and replaced every user-supplied value
    # with the default. The value is also stored as an Integer because it is
    # compared against the failure counter later on.
    threshold = config["err_before_chg_ip"].to_i
    config["err_before_chg_ip"] = threshold < 1 ? DEFAULT_ERR_BEFORE_CHG_IP : threshold

    @config = config
  end

  # Queues a single URL; +callback+ must respond to call(response) and is
  # invoked for successful responses only.
  def queue(url,callback)
    request = build_request(url)

    request.on_complete do |response|
      on_request_complete(response,request,callback)
    end

    @hydra.queue(request)
  end

  # Queues every URL of +urls+ with the same callback.
  def queues(urls=Array.new,callback)
    urls.each { |url| queue(url,callback) }
  end

  # Builds the request for +url+ with a random user agent, routed through the
  # currently selected proxy when there is one and sent directly otherwise.
  def build_request(url)
    Typhoeus::Config.user_agent = UserAgents.rand()

    proxy = current_proxy
    if proxy
      Typhoeus::Request.new(url,
                            :timeout => proxy.timeout,
                            :proxy => proxy.url,
                            :proxytype => proxy.type)
    else
      Typhoeus::Request.new(url, :timeout => @config['timeout'])
    end
  end

  # Completion handler attached to every queued request.
  # Success: the callback is called and nothing is logged.
  # Failure: the reason is logged; retryable failures are counted, may rotate
  # the proxy, and are re-queued when "retry_on_error" is set.
  def on_request_complete(response,request,callback)
    if response.success?
      callback.call(response)
      return
    end

    error_msg, add_to_queue = describe_failure(response, request)
    _debug(error_msg)
    return unless add_to_queue

    @count_connect_errors += 1
    if @count_connect_errors > @config['err_before_chg_ip']
      rotate_proxy
      @count_connect_errors = 0
    end

    # A fresh request is built for the retry so that it picks up the proxy
    # selected by a rotation; the old request object keeps its old proxy.
    # NOTE(review): retries are unbounded, as before.
    queue(request.url, callback) if @config['retry_on_error']
  end

  # Runs every queued request.
  def scrap()
    @hydra.run
  end

  # Registers a proxy; it must respond to url, type, timeout and change_ip.
  def add_proxy(proxy)
    @proxies << proxy
  end

  # Logs +msg+ at level +lvl+ ("warn", "info", "debug" or "error"; any other
  # value is logged as a warning). Does nothing when debugging is disabled.
  def _debug(msg,lvl="warn")
    return unless @config["debug"] && @logger

    level = LOG_LEVELS.include?(lvl) ? lvl : "warn"
    @logger.public_send(level, msg)
  end

  private

  # Returns the proxy new requests should use, or nil for a direct connection.
  def current_proxy
    return nil if @proxies.empty?

    # The direct slot (-1) is only a valid choice when the clear connection
    # is part of the rotation; otherwise start with the first proxy.
    @proxy_cursor = 0 if @proxy_cursor < 0 && !@config["use_clearconnection"]

    @proxy_cursor < 0 ? nil : @proxies[@proxy_cursor]
  end

  # Asks the active proxy for a new IP and advances the cursor to the next
  # slot, wrapping to the direct connection or to the first proxy depending
  # on "use_clearconnection". Does nothing when no proxy is registered.
  def rotate_proxy
    return if @proxies.empty?

    _debug("Changing Proxy","info")

    active = current_proxy
    active.change_ip if active

    if @proxy_cursor < @proxies.length - 1
      @proxy_cursor += 1
    elsif @config["use_clearconnection"]
      @proxy_cursor = -1
    else
      @proxy_cursor = 0
    end
  end

  # Maps a failed response to [log message, retryable?].
  def describe_failure(response, request)
    if response.timed_out?
      ["Timed out (#{request.url})", true]
    elsif response.code == 404
      ["404 Page not found (#{request.url})", false]
    elsif response.code == 301 || response.code == 302
      ["301/302 Redirection not followed (#{request.url})", false]
    elsif response.code == 0
      # Could not get an http response, something's wrong.
      ["Could not get an http response, something's wrong (#{request.url}) : #{response.return_message}", true]
    else
      # Received a non-successful http response.
      ["Received a non-successful http response (#{request.url}) : #{response.code}", true]
    end
  end
end
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scraptory
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - AlexMili
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-01-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: typhoeus
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: 1.0.1
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: 1.0.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: useragents
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: 0.1.4
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: 0.1.4
41
+ - !ruby/object:Gem::Dependency
42
+ name: tor2
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: 0.1.2
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: 0.1.2
55
+ description: A simple scraping gem using tor
56
+ email: /
57
+ executables: []
58
+ extensions: []
59
+ extra_rdoc_files: []
60
+ files:
61
+ - lib/scraptory.rb
62
+ - lib/proxy.rb
63
+ homepage: https://github.com/AlexMili/scraptory
64
+ licenses:
65
+ - MIT
66
+ metadata: {}
67
+ post_install_message:
68
+ rdoc_options: []
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 2.0.14.1
84
+ signing_key:
85
+ specification_version: 4
86
+ summary: Scraper over Tor in ruby
87
+ test_files: []