scraptory 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. checksums.yaml +7 -0
  2. data/lib/proxy.rb +48 -0
  3. data/lib/scraptory.rb +184 -0
  4. metadata +87 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a7ecdad23a953fc93146eb88a01ea5216d47dfac
4
+ data.tar.gz: 2289f29c61f00c6db5abf6fef5a71d7d9c0b3fa5
5
+ SHA512:
6
+ metadata.gz: de07b7725d9b99217e0c9896649579f88276b504c60f51deb79f7ddb320420b1de5971e704fcacca5fbe292d13579fe04141f1bde3aa38b9fb3e92b971762cbe
7
+ data.tar.gz: 052389b0c605fce0ba5fc4bdee75cad29d3f6fd8f6be2d9a5ba7a9114f0a22851773f876fd5173f21561fda842810faaf7ad8e67fc2ce810e212f73070f02021
@@ -0,0 +1,48 @@
1
+ # encoding: UTF-8
2
+ require "tor"
3
+ require "typhoeus"
4
+ require "useragents"
5
+
6
# Describes one proxy endpoint (host, port, proxy type, timeout) and, when the
# proxy is a Tor node, can ask that node for a new identity through its
# control port.
#
# +options+ is a Hash with Symbol keys:
#   :type    - proxy type handed to Typhoeus as :proxytype (default "http")
#   :timeout - request timeout handed to Typhoeus as :timeout (default 10)
#   :tor     - truthy when the proxy is a Tor node (enables change_ip)
#
# +credentials+ is a Hash with Symbol keys, only read by change_ip:
#   :telnet_host, :telnet_port - address of the Tor control port
#   :telnet_passwd             - password used to authenticate on it
class Proxy
  # Timeout used when the options carry no :timeout entry.
  DEFAULT_TIMEOUT = 10

  attr_reader :type
  attr_reader :timeout

  def initialize(host="localhost",port=8080,options={},credentials={})
    @host = host
    @port = port
    @type = options[:type] || "http"
    # `options[:timeout].to_i || 10` can never reach the fallback: nil.to_i is
    # 0 and 0 is truthy in Ruby. Test for a missing value explicitly instead.
    @timeout = options[:timeout].nil? ? DEFAULT_TIMEOUT : options[:timeout].to_i
    @credentials = credentials
    @config = options
  end

  # Returns the proxy address as "host:port".
  def url
    "#{@host}:#{@port}"
  end

  # Sends the NEWNYM signal to the Tor controller so the node switches to a
  # new identity. Does nothing (and returns nil) unless options[:tor] is set.
  # The controller connection is created on first use and then reused.
  def change_ip
    return unless @config[:tor]

    @node = Tor::Controller.new(:host => @credentials[:telnet_host], :port => @credentials[:telnet_port]) if @node.nil?
    @node.authenticate(@credentials[:telnet_passwd]) unless @node.authenticated?
    @node.signal("NEWNYM")
  end

  # Fetches http://checkip.amazonaws.com/ through this proxy and returns the
  # response body with newlines and surrounding whitespace removed.
  # Note: this overwrites the global Typhoeus user agent with a random one.
  def get_ip
    Typhoeus::Config.user_agent = UserAgents.rand()

    response = Typhoeus::Request.new("http://checkip.amazonaws.com/",
                                     timeout: @timeout,
                                     proxy: url,
                                     proxytype: @type).run

    # to_s guards against a missing body on a failed request.
    response.response_body.to_s.delete("\n").strip
  end
end
@@ -0,0 +1,184 @@
1
+ # encoding: UTF-8
2
+
3
+ # Requests
4
+ require 'typhoeus'
5
+ require 'useragents'
6
+ # For file/dir manipulation
7
+ require 'fileutils'
8
+ # Logs
9
+ require 'logger'
10
+ # For Tor
11
+ require 'net/telnet'
12
+ require_relative "proxy"
13
+
14
# Queues HTTP requests on a Typhoeus::Hydra, optionally through Proxy objects,
# and moves on to the next proxy once too many retryable failures occurred.
#
# The configuration is a Hash with String keys:
#   "debug"               - enable logging through _debug (default false)
#   "debug_file"          - log to this file instead of STDOUT; implies "debug"
#   "nthreads"            - Hydra max_concurrency, positive Integer (default 1)
#   "timeout"             - timeout of requests sent without proxy (default 10)
#   "retry_on_error"      - re-queue requests that failed with a retryable
#                           error (default false)
#   "use_clearconnection" - make the proxy-less connection part of the proxy
#                           rotation (default false)
#   "err_before_chg_ip"   - number of retryable failures tolerated before the
#                           proxy is changed (default 100)
#
# Note: set_config fills the defaults into the Hash it is given, and the
# failure counter is a class variable, i.e. shared by every Scraptory instance.
class Scraptory
  attr_reader :config
  # NOTE(review): @@chg_ip_fater_nfails and @@default_tor_timeout are not read
  # anywhere in this class; kept untouched in case other code reaches for them.
  @@chg_ip_fater_nfails = 10
  @@default_hydra_timeout = 10
  @@default_tor_timeout = 10
  @@default_nthreads = 1
  @@default_err_before_chg_ip = 100
  @@count_connect_errors = 0

  def initialize(config={})
    @proxies = []
    # -1 means "no proxy" (clear connection); otherwise an index in @proxies.
    @proxy_cursor = -1

    set_config(config)
  end

  # Validates +config+, fills in defaults, (re)creates the logger and the
  # Hydra, and stores the result as the active configuration.
  def set_config(config={})
    config["debug"] = false unless config.has_key?("debug")

    if config["debug_file"]
      # A debug file always turns logging on. The file is created when it is
      # missing, but an already existing file is used as well: previously it
      # was ignored, so file logging only worked on the very first run.
      config["debug"] = true
      FileUtils.touch(config["debug_file"]) unless File.exist?(config["debug_file"])
      @logger = Logger.new(config["debug_file"])
    elsif config["debug"]
      @logger = Logger.new(STDOUT)
    end

    # Only a positive Integer is accepted as concurrency level.
    nthreads = config["nthreads"]
    nthreads = @@default_nthreads unless nthreads.is_a?(Integer) && nthreads > 0
    @hydra = Typhoeus::Hydra.new(max_concurrency: nthreads)

    # Missing or unusable timeout: fall back to the default (nil.to_i is 0).
    config["timeout"] = @@default_hydra_timeout if config["timeout"].to_i < 1

    config["retry_on_error"] = false unless config.has_key?("retry_on_error")

    # Switch between proxies and clear connection
    config["use_clearconnection"] = false if config["use_clearconnection"].nil?

    # The original test was inverted and replaced every user supplied value
    # with the default. The value is stored as Integer because it is compared
    # against the failure counter later on.
    threshold = config["err_before_chg_ip"].to_i
    config["err_before_chg_ip"] = threshold < 1 ? @@default_err_before_chg_ip : threshold

    @config = config
  end

  # Queues one request for +url+; +callback+ (anything responding to #call)
  # receives the response when the request succeeded.
  def queue(url,callback)
    request = build_request(url)

    request.on_complete do |response|
      on_request_complete(response,request,callback)
    end

    @hydra.queue(request)
  end

  # Queues every url of +urls+ with the same +callback+.
  def queues(urls=[],callback)
    urls.each { |url| queue(url,callback) }
  end

  # Builds the Typhoeus request for +url+, routed through the proxy the
  # cursor points at, or sent without proxy when there is none.
  # Note: this overwrites the global Typhoeus user agent with a random one.
  def build_request(url)
    Typhoeus::Config.user_agent = UserAgents.rand()

    proxy = current_proxy
    if proxy
      Typhoeus::Request.new(url,
                            :timeout => proxy.timeout,
                            :proxy => proxy.url,
                            :proxytype => proxy.type)
    else
      Typhoeus::Request.new(url, :timeout => @config['timeout'])
    end
  end

  # Completion handler: hands successful responses to +callback+; logs every
  # failure, counts the retryable ones, changes proxy once more than
  # "err_before_chg_ip" of them piled up, and re-queues the request when
  # "retry_on_error" is set.
  def on_request_complete(response,request,callback)
    if response.success?
      # Nothing to log here: the original emitted an "Unknow error" warning
      # for every successful response.
      callback.call(response)
      return
    end

    error_msg, add_to_queue = describe_failure(response, request)
    _debug(error_msg)
    return unless add_to_queue

    @@count_connect_errors += 1

    if @@count_connect_errors > @config['err_before_chg_ip']
      _debug("Changing Proxy","info")
      switch_proxy
      @@count_connect_errors = 0
    end

    # The very same request object is queued again.
    @hydra.queue(request) if @config['retry_on_error']
  end

  # Runs every queued request.
  def scrap()
    @hydra.run
  end

  # Registers a proxy for the rotation.
  def add_proxy(proxy)
    # When the clear connection is not part of the rotation, start on the
    # first proxy right away instead of staying on "no proxy" until the
    # first rotation happens.
    @proxy_cursor = 0 if @proxy_cursor < 0 && !@config["use_clearconnection"]
    @proxies << proxy
  end

  # Logs +msg+ at level +lvl+ ("warn", "info", "debug" or "error"; anything
  # else is logged as a warning). Silent unless "debug" is enabled.
  def _debug(msg,lvl="warn")
    return unless @config["debug"] && @logger

    case lvl
    when "info"  then @logger.info msg
    when "debug" then @logger.debug msg
    when "error" then @logger.error msg
    else @logger.warn msg
    end
  end

  private

  # Proxy the cursor points at, or nil for the clear connection.
  def current_proxy
    @proxy_cursor > -1 ? @proxies[@proxy_cursor] : nil
  end

  # Asks the current proxy (if any) for a new IP, then advances the cursor,
  # wrapping to the clear connection or to the first proxy at the end.
  def switch_proxy
    proxy = current_proxy
    proxy.change_ip() if proxy

    if @proxies.empty?
      @proxy_cursor = -1
    elsif @proxy_cursor >= @proxies.length - 1
      @proxy_cursor = @config["use_clearconnection"] ? -1 : 0
    else
      @proxy_cursor += 1
    end
  end

  # Returns [log message, retryable?] for an unsuccessful response.
  def describe_failure(response, request)
    if response.timed_out?
      ["Timed out (#{request.url})", true]
    elsif response.code == 404
      ["404 Page not found (#{request.url})", false]
    elsif response.code == 301 || response.code == 302
      ["301/302 Redirection not followed (#{request.url})", false]
    elsif response.code == 0
      # Could not get an http response, something's wrong.
      ["Could not get an http response, something's wrong (#{request.url}) : #{response.return_message}", true]
    else
      # Received a non-successful http response.
      ["Received a non-successful http response (#{request.url}) : #{response.code}", true]
    end
  end
end
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scraptory
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - AlexMili
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-01-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: typhoeus
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: 1.0.1
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: 1.0.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: useragents
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: 0.1.4
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: 0.1.4
41
+ - !ruby/object:Gem::Dependency
42
+ name: tor2
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: 0.1.2
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: 0.1.2
55
+ description: A simple scraping gem using tor
56
+ email: /
57
+ executables: []
58
+ extensions: []
59
+ extra_rdoc_files: []
60
+ files:
61
+ - lib/scraptory.rb
62
+ - lib/proxy.rb
63
+ homepage: https://github.com/AlexMili/scraptory
64
+ licenses:
65
+ - MIT
66
+ metadata: {}
67
+ post_install_message:
68
+ rdoc_options: []
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 2.0.14.1
84
+ signing_key:
85
+ specification_version: 4
86
+ summary: Scraper over Tor in ruby
87
+ test_files: []