scraptory 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/proxy.rb +48 -0
- data/lib/scraptory.rb +184 -0
- metadata +87 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a7ecdad23a953fc93146eb88a01ea5216d47dfac
|
4
|
+
data.tar.gz: 2289f29c61f00c6db5abf6fef5a71d7d9c0b3fa5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: de07b7725d9b99217e0c9896649579f88276b504c60f51deb79f7ddb320420b1de5971e704fcacca5fbe292d13579fe04141f1bde3aa38b9fb3e92b971762cbe
|
7
|
+
data.tar.gz: 052389b0c605fce0ba5fc4bdee75cad29d3f6fd8f6be2d9a5ba7a9114f0a22851773f876fd5173f21561fda842810faaf7ad8e67fc2ce810e212f73070f02021
|
data/lib/proxy.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require "tor"
|
3
|
+
require "typhoeus"
|
4
|
+
require "useragents"
|
5
|
+
|
6
|
+
# Describes a single proxy endpoint (host, port, protocol type, timeout) and,
# when the proxy is backed by Tor, drives the Tor control port to request a
# new identity.
class Proxy
  # Timeout (in seconds) used when options[:timeout] is not supplied.
  DEFAULT_TIMEOUT = 10

  attr_reader :type
  attr_reader :timeout

  # host        - proxy host name or address.
  # port        - proxy port.
  # options     - Hash with optional keys:
  #               :type    proxy type handed to Typhoeus (defaults to "http"),
  #               :timeout request timeout (defaults to DEFAULT_TIMEOUT),
  #               :tor     truthy when the proxy is a Tor node.
  # credentials - Hash read by #change_ip: :telnet_host, :telnet_port,
  #               :telnet_passwd.
  def initialize(host="localhost", port=8080, options={}, credentials={})
    @host = host
    @port = port
    @type = options[:type] || "http"
    # The default must be applied BEFORE the integer conversion: nil.to_i is 0
    # and 0 is truthy in Ruby, so `value.to_i || 10` can never yield 10.
    @timeout = (options[:timeout] || DEFAULT_TIMEOUT).to_i
    @credentials = credentials
    @config = options
  end

  # Returns the proxy address as "host:port".
  def url
    "#{@host}:#{@port}"
  end

  # Sends the NEWNYM signal through the Tor controller. Does nothing (and
  # returns nil) unless the proxy was created with options[:tor].
  # The controller is created on first use and authenticated only when it
  # reports that it is not authenticated yet.
  def change_ip
    return unless @config[:tor]

    @node ||= Tor::Controller.new(:host => @credentials[:telnet_host],
                                  :port => @credentials[:telnet_port])
    @node.authenticate(@credentials[:telnet_passwd]) unless @node.authenticated?
    @node.signal("NEWNYM")
  end

  # Fetches http://checkip.amazonaws.com/ through this proxy and returns the
  # response body with newlines and surrounding whitespace removed.
  # Returns an empty string when the response carries no body.
  def get_ip
    Typhoeus::Config.user_agent = UserAgents.rand

    response = Typhoeus::Request.new("http://checkip.amazonaws.com/",
                                     timeout: @timeout,
                                     proxy: url,
                                     proxytype: @type).run

    # to_s guards against a nil body (e.g. the request never completed).
    response.response_body.to_s.gsub("\n", '').strip
  end
end
|
data/lib/scraptory.rb
ADDED
@@ -0,0 +1,184 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Requests
|
4
|
+
require 'typhoeus'
|
5
|
+
require 'useragents'
|
6
|
+
# For file/dir manipulation
|
7
|
+
require 'fileutils'
|
8
|
+
# Logs
|
9
|
+
require 'logger'
|
10
|
+
# For Tor
|
11
|
+
require 'net/telnet'
|
12
|
+
require_relative "proxy"
|
13
|
+
|
14
|
+
# Queues URLs on a Typhoeus::Hydra, optionally routing them through a list of
# proxies, and rotates to the next proxy after a configurable number of
# retryable failures.
#
# Configuration keys (all String keys):
#   "debug"               - enable logging (default false).
#   "debug_file"          - log to this file instead of STDOUT; implies "debug".
#   "nthreads"            - Hydra max_concurrency; must be a positive Integer.
#   "timeout"             - timeout for requests made without a proxy.
#   "retry_on_error"      - re-queue a request after a retryable failure.
#   "use_clearconnection" - include the direct (no proxy) connection in the
#                           rotation.
#   "err_before_chg_ip"   - failures tolerated before rotating the proxy.
class Scraptory
  DEFAULT_HYDRA_TIMEOUT = 10
  DEFAULT_NTHREADS = 1
  DEFAULT_ERR_BEFORE_CHG_IP = 100

  attr_reader :config

  def initialize(config={})
    @proxies = []
    # -1 means "direct connection, no proxy".
    @proxy_cursor = -1
    # Kept per instance so that two scrapers do not trigger each other's
    # proxy rotation.
    @count_connect_errors = 0

    set_config(config)
  end

  # Normalizes +config+, fills in defaults and (re)creates the logger and the
  # Hydra. The caller's hash is left untouched; the normalized copy is exposed
  # through #config.
  def set_config(config={})
    config = config.dup

    config["debug"] = false unless config.key?("debug")

    if config["debug_file"]
      # A debug file always enables logging, whether or not the file already
      # exists; it is created when missing.
      config["debug"] = true
      FileUtils.touch(config["debug_file"]) unless File.exist?(config["debug_file"])
      @logger = Logger.new(config["debug_file"])
    elsif config["debug"]
      @logger = Logger.new(STDOUT)
    end

    # Anything other than a positive Integer falls back to the default.
    nthreads = config["nthreads"]
    nthreads = DEFAULT_NTHREADS unless nthreads.is_a?(Integer) && nthreads >= 1
    @hydra = Typhoeus::Hydra.new(max_concurrency: nthreads)

    # A missing or invalid timeout is replaced by the default (nil.to_i is 0).
    config["timeout"] = DEFAULT_HYDRA_TIMEOUT if config["timeout"].to_i < 1

    config["retry_on_error"] = false unless config.key?("retry_on_error")
    config["use_clearconnection"] = false if config["use_clearconnection"].nil?

    # Keep a valid user-supplied threshold; only a missing or non-positive
    # value is replaced by the default.
    threshold = config["err_before_chg_ip"].to_i
    config["err_before_chg_ip"] = threshold < 1 ? DEFAULT_ERR_BEFORE_CHG_IP : threshold

    @config = config
  end

  # Queues a single URL. +callback+ is called with the response when the
  # request succeeds.
  def queue(url, callback)
    request = build_request(url)

    request.on_complete do |response|
      on_request_complete(response, request, callback)
    end

    @hydra.queue(request)
  end

  # Queues every URL of +urls+ with the same +callback+.
  def queues(urls=Array.new, callback)
    urls.each { |url| queue(url, callback) }
  end

  # Builds the Typhoeus request for +url+, going through the current proxy
  # when there is one and through the direct connection otherwise.
  def build_request(url)
    Typhoeus::Config.user_agent = UserAgents.rand

    proxy = current_proxy
    if proxy
      Typhoeus::Request.new(url,
                            :timeout => proxy.timeout,
                            :proxy => proxy.url,
                            :proxytype => proxy.type)
    else
      Typhoeus::Request.new(url, :timeout => @config['timeout'])
    end
  end

  # Completion handler: hands successful responses to +callback+; logs
  # failures, counts the retryable ones, rotates the proxy once the error
  # threshold is exceeded and re-queues the request when "retry_on_error" is
  # set.
  def on_request_complete(response, request, callback)
    if response.success?
      callback.call(response)
      # Nothing to log or count for a successful response.
      return
    end

    error_msg, retryable = describe_failure(response, request)
    _debug(error_msg)
    return unless retryable

    @count_connect_errors += 1

    if @count_connect_errors > @config['err_before_chg_ip']
      rotate_proxy
      @count_connect_errors = 0
    end

    # NOTE(review): the re-queued request keeps the proxy settings it was
    # built with; only requests built afterwards use the rotated proxy.
    @hydra.queue(request) if @config['retry_on_error']
  end

  # Runs every queued request.
  def scrap()
    @hydra.run
  end

  # Appends +proxy+ to the rotation.
  def add_proxy(proxy)
    @proxies << proxy
  end

  # Logs +msg+ at level +lvl+ ("warn", "info", "debug" or "error"; any other
  # value is logged as a warning). Does nothing unless debugging is enabled.
  def _debug(msg, lvl="warn")
    return unless @config["debug"] && @logger

    case lvl
    when "info"  then @logger.info msg
    when "debug" then @logger.debug msg
    when "error" then @logger.error msg
    else @logger.warn msg
    end
  end

  private

  # Returns the proxy new requests must use, or nil for a direct connection.
  # When proxies exist and the direct connection is not part of the rotation,
  # a cursor still sitting on -1 is moved to the first proxy.
  def current_proxy
    return nil if @proxies.empty?

    @proxy_cursor = 0 if @proxy_cursor < 0 && !@config["use_clearconnection"]
    @proxy_cursor > -1 ? @proxies[@proxy_cursor] : nil
  end

  # Asks the proxy in use for a new IP, then advances the cursor: next proxy,
  # or past the last one back to the direct connection (-1) when
  # "use_clearconnection" is set and to the first proxy otherwise.
  # Does nothing when no proxy has been added.
  def rotate_proxy
    return if @proxies.empty?

    _debug("Changing Proxy", "info")

    active = current_proxy
    active.change_ip if active

    last_index = @proxies.length - 1
    @proxy_cursor =
      if @proxy_cursor < last_index
        @proxy_cursor + 1
      elsif @config["use_clearconnection"]
        -1
      else
        0
      end
  end

  # Maps a failed response to [log message, retryable?].
  def describe_failure(response, request)
    url = request.url

    if response.timed_out?
      ["Timed out (#{url})", true]
    elsif response.code == 404
      ["404 Page not found (#{url})", false]
    elsif response.code == 301 || response.code == 302
      ["301/302 Redirection not followed (#{url})", false]
    elsif response.code == 0
      ["Could not get an http response, something's wrong (#{url}) : #{response.return_message}", true]
    else
      ["Received a non-successful http response (#{url}) : #{response.code}", true]
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scraptory
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- AlexMili
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-01-02 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: typhoeus
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.0.1
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.0.1
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: useragents
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.1.4
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.1.4
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: tor2
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.1.2
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.1.2
|
55
|
+
description: A simple scraping gem using tor
|
56
|
+
email: /
|
57
|
+
executables: []
|
58
|
+
extensions: []
|
59
|
+
extra_rdoc_files: []
|
60
|
+
files:
|
61
|
+
- lib/scraptory.rb
|
62
|
+
- lib/proxy.rb
|
63
|
+
homepage: https://github.com/AlexMili/scraptory
|
64
|
+
licenses:
|
65
|
+
- MIT
|
66
|
+
metadata: {}
|
67
|
+
post_install_message:
|
68
|
+
rdoc_options: []
|
69
|
+
require_paths:
|
70
|
+
- lib
|
71
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - '>='
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: '0'
|
81
|
+
requirements: []
|
82
|
+
rubyforge_project:
|
83
|
+
rubygems_version: 2.0.14.1
|
84
|
+
signing_key:
|
85
|
+
specification_version: 4
|
86
|
+
summary: Scraper over Tor in ruby
|
87
|
+
test_files: []
|