scraptory 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/proxy.rb +48 -0
- data/lib/scraptory.rb +184 -0
- metadata +87 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a7ecdad23a953fc93146eb88a01ea5216d47dfac
|
4
|
+
data.tar.gz: 2289f29c61f00c6db5abf6fef5a71d7d9c0b3fa5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: de07b7725d9b99217e0c9896649579f88276b504c60f51deb79f7ddb320420b1de5971e704fcacca5fbe292d13579fe04141f1bde3aa38b9fb3e92b971762cbe
|
7
|
+
data.tar.gz: 052389b0c605fce0ba5fc4bdee75cad29d3f6fd8f6be2d9a5ba7a9114f0a22851773f876fd5173f21561fda842810faaf7ad8e67fc2ce810e212f73070f02021
|
data/lib/proxy.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require "tor"
|
3
|
+
require "typhoeus"
|
4
|
+
require "useragents"
|
5
|
+
|
6
|
+
# Connection details for a single outbound proxy. When the options mark the
# proxy as a Tor node (+options[:tor]+), the exit IP can be renewed through
# the Tor control port described by +credentials+.
class Proxy
  # Timeout applied when +options+ carries no :timeout entry.
  DEFAULT_TIMEOUT = 10

  attr_reader :type
  attr_reader :timeout

  # host        - proxy host name or address.
  # port        - proxy port.
  # options     - Hash; reads :type (defaults to "http"), :timeout and :tor.
  # credentials - Hash; reads :telnet_host, :telnet_port and :telnet_passwd
  #               when talking to the Tor control port.
  def initialize(host="localhost",port=8080,options={},credentials={})
    @host = host
    @port = port
    @type = options[:type] || "http"
    # `nil.to_i` is 0 and 0 is truthy in Ruby, so `value.to_i || 10` can never
    # fall back to the default; test for a missing value explicitly instead.
    # An explicitly supplied value (even 0) is kept as given.
    requested_timeout = options[:timeout]
    @timeout = requested_timeout.nil? ? DEFAULT_TIMEOUT : requested_timeout.to_i
    @credentials = credentials
    @config = options
  end

  # Returns the proxy address as "host:port".
  def url
    "#{@host}:#{@port}"
  end

  # Asks the Tor controller for a new identity by sending the NEWNYM signal.
  # Does nothing (and returns nil) unless the proxy was created with a truthy
  # options[:tor].
  def change_ip
    return unless @config[:tor]

    # The controller is created once and reused on later calls.
    @node ||= Tor::Controller.new(:host => @credentials[:telnet_host], :port => @credentials[:telnet_port])
    @node.authenticate(@credentials[:telnet_passwd]) unless @node.authenticated?

    @node.signal("NEWNYM")
  end

  # Fetches http://checkip.amazonaws.com/ through this proxy and returns the
  # response body with newlines and surrounding whitespace removed.
  # Side effect: replaces the global Typhoeus user agent with a random one.
  def get_ip
    Typhoeus::Config.user_agent = UserAgents.rand()

    response = Typhoeus::Request.new("http://checkip.amazonaws.com/",
                                     timeout: @timeout,
                                     proxy: self.url,
                                     proxytype: @type).run

    response.response_body.gsub("\n", '').strip
  end
end
|
data/lib/scraptory.rb
ADDED
@@ -0,0 +1,184 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Requests
|
4
|
+
require 'typhoeus'
|
5
|
+
require 'useragents'
|
6
|
+
# For file/dir manipulation
|
7
|
+
require 'fileutils'
|
8
|
+
# Logs
|
9
|
+
require 'logger'
|
10
|
+
# For Tor
|
11
|
+
require 'net/telnet'
|
12
|
+
require_relative "proxy"
|
13
|
+
|
14
|
+
# Queues URLs on a Typhoeus::Hydra and fetches them either directly or through
# the registered proxies, moving to the next proxy after too many failed
# requests.
#
# Recognised config keys (all Strings): "debug", "debug_file", "nthreads",
# "timeout", "retry_on_error", "use_clearconnection", "err_before_chg_ip".
class Scraptory
  # Request timeout used when config["timeout"] is missing or below 1.
  DEFAULT_HYDRA_TIMEOUT = 10
  # Hydra concurrency used when config["nthreads"] is not a positive Integer.
  DEFAULT_NTHREADS = 1
  # Failure threshold used when config["err_before_chg_ip"] is missing or below 1.
  DEFAULT_ERR_BEFORE_CHG_IP = 100

  attr_reader :config

  def initialize(config={})
    @proxies = []
    # -1 means "no proxy": requests go out over the clear connection.
    @proxy_cursor = -1
    # Failures counted since the last rotation; kept per instance so that two
    # scrapers do not influence each other's rotation.
    @count_connect_errors = 0

    set_config(config)
  end

  # Normalises +config+, fills in defaults and (re)creates the logger and the
  # hydra. The caller's hash is left untouched; the normalised copy is
  # available through #config.
  def set_config(config={})
    config = config.dup

    # Use debug output
    config["debug"] = false unless config.has_key?("debug")

    # A debug file forces debug output and becomes the log target, whether or
    # not the file already exists (it is created when missing).
    if config["debug_file"]
      config["debug"] = true
      FileUtils.touch(config["debug_file"]) unless File.exist?(config["debug_file"])
      @logger = Logger.new(config["debug_file"])
    elsif config["debug"]
      @logger = Logger.new(STDOUT)
    end

    # nthreads is honoured only when it is a positive Integer
    nthreads = config["nthreads"]
    nthreads = DEFAULT_NTHREADS unless nthreads.is_a?(Integer) && nthreads > 0
    @hydra = Typhoeus::Hydra.new(max_concurrency: nthreads)

    # If wrong data are in timeout config, we set it to default
    config["timeout"] = DEFAULT_HYDRA_TIMEOUT if config["timeout"].to_i < 1

    config["retry_on_error"] = false unless config.has_key?("retry_on_error")

    # Switch between proxies and clear connection
    config["use_clearconnection"] = false if config["use_clearconnection"].nil?

    # Keep a valid user threshold; only missing or non-positive values fall
    # back to the default. Stored as an Integer because it is compared with
    # the failure counter.
    err_limit = config["err_before_chg_ip"].to_i
    config["err_before_chg_ip"] = err_limit < 1 ? DEFAULT_ERR_BEFORE_CHG_IP : err_limit

    @config = config
  end

  # Builds a request for +url+ and queues it; +callback+ is called with the
  # response when the request succeeds.
  def queue(url,callback)
    request = build_request(url)

    request.on_complete do |response|
      on_request_complete(response,request,callback)
    end

    @hydra.queue(request)
  end

  # Queues every URL of +urls+ with the same +callback+.
  def queues(urls=Array.new,callback)
    urls.each do |url|
      queue(url,callback)
    end
  end

  # Returns a Typhoeus::Request for +url+, routed through the proxy under the
  # cursor when there is one and over the clear connection otherwise.
  # Side effect: replaces the global Typhoeus user agent with a random one.
  def build_request(url)
    Typhoeus::Config.user_agent = UserAgents.rand()

    # proxy_cursor is -1 when not using any proxy
    proxy = @proxy_cursor > -1 ? @proxies[@proxy_cursor] : nil

    if proxy
      Typhoeus::Request.new(url,
                            :timeout => proxy.timeout,
                            :proxy => proxy.url,
                            :proxytype => proxy.type)
    else
      Typhoeus::Request.new(url, :timeout => @config['timeout'])
    end
  end

  # Completion handler: hands successful responses to +callback+; logs every
  # failure, counts the retryable ones, rotates the proxy once the count
  # exceeds config["err_before_chg_ip"] and re-queues the request when
  # config["retry_on_error"] is set.
  def on_request_complete(response,request,callback)
    if response.success?
      callback.call(response)
      return
    end

    error_msg, add_to_queue = describe_failure(response, request)
    _debug(error_msg)
    return unless add_to_queue

    @count_connect_errors += 1

    if @count_connect_errors > @config['err_before_chg_ip']
      rotate_proxy if @proxies.any?
      @count_connect_errors = 0
    end

    # NOTE(review): the request is re-queued as built, so it presumably keeps
    # the proxy it was created with even after a rotation - confirm.
    @hydra.queue(request) if @config['retry_on_error']
  end

  # Runs every queued request.
  def scrap()
    @hydra.run
  end

  # Registers +proxy+ and returns the list of proxies. The first proxy becomes
  # the active one straight away unless the clear connection is part of the
  # rotation (config["use_clearconnection"]).
  def add_proxy(proxy)
    @proxies << proxy
    @proxy_cursor = 0 if @proxy_cursor < 0 && !@config["use_clearconnection"]
    @proxies
  end

  # Logs +msg+ at level +lvl+ ("warn", "info", "debug" or "error"; anything
  # else is logged as a warning). Does nothing unless debug output is enabled.
  def _debug(msg,lvl="warn")
    return unless @config["debug"]

    case lvl
    when "info"  then @logger.info msg
    when "debug" then @logger.debug msg
    when "error" then @logger.error msg
    else              @logger.warn msg
    end
  end

  private

  # Returns [message, retryable] describing a failed +response+.
  def describe_failure(response, request)
    if response.timed_out?
      ["Timed out (#{request.url})", true]
    elsif response.code == 404
      ["404 Page not found (#{request.url})", false]
    elsif response.code == 301 or response.code == 302
      ["301/302 Redirection not followed (#{request.url})", false]
    elsif response.code == 0
      # Could not get an http response, something's wrong.
      ["Could not get an http response, something's wrong (#{request.url}) : #{response.return_message}", true]
    else
      # Received a non-successful http response.
      ["Received a non-successful http response (#{request.url}) : #{response.code}", true]
    end
  end

  # Renews the IP of the proxy in use (if any) and advances the cursor. After
  # the last proxy the cursor goes to -1 (clear connection) when
  # config["use_clearconnection"] is set, otherwise back to the first proxy.
  def rotate_proxy
    _debug("Changing Proxy","info")

    current = @proxy_cursor > -1 ? @proxies[@proxy_cursor] : nil
    current.change_ip if current

    next_cursor = @proxy_cursor + 1
    next_cursor = (@config["use_clearconnection"] ? -1 : 0) if next_cursor >= @proxies.length
    @proxy_cursor = next_cursor
  end
end
|
metadata
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scraptory
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- AlexMili
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-01-02 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: typhoeus
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.0.1
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.0.1
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: useragents
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.1.4
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.1.4
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: tor2
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.1.2
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.1.2
|
55
|
+
description: A simple scraping gem using tor
|
56
|
+
email: /
|
57
|
+
executables: []
|
58
|
+
extensions: []
|
59
|
+
extra_rdoc_files: []
|
60
|
+
files:
|
61
|
+
- lib/scraptory.rb
|
62
|
+
- lib/proxy.rb
|
63
|
+
homepage: https://github.com/AlexMili/scraptory
|
64
|
+
licenses:
|
65
|
+
- MIT
|
66
|
+
metadata: {}
|
67
|
+
post_install_message:
|
68
|
+
rdoc_options: []
|
69
|
+
require_paths:
|
70
|
+
- lib
|
71
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - '>='
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: '0'
|
81
|
+
requirements: []
|
82
|
+
rubyforge_project:
|
83
|
+
rubygems_version: 2.0.14.1
|
84
|
+
signing_key:
|
85
|
+
specification_version: 4
|
86
|
+
summary: Scraper over Tor in ruby
|
87
|
+
test_files: []
|