the_mask 0.2.2 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d67508982b071a8217ee5ca318f3ad041dc601d1
4
- data.tar.gz: c29f73ee46620975f3671caf575a9c34726a1047
3
+ metadata.gz: 81556b7f9b9fb2c1bdf439166e30b19c53fefb31
4
+ data.tar.gz: 8a63e7102e20188ca209d1141524891241b963b9
5
5
  SHA512:
6
- metadata.gz: d6181d81ef0efb3c02f36f70a4731bf8f55ae62e186ba09f3cbd996300e2a288ac0af2568ae1df0746b109866dd3580fa7de09c666e3ef9a08c0a5576ebc8266
7
- data.tar.gz: e6b3d53f7d9a8465d825e0273194ac89e55299f5443a7e33e91d674fb48fadc69c2f7f9543a7bd145a98bd739988964752c07efac847530ed8bc98693e6f2259
6
+ metadata.gz: 43edf63a4c770bc64852dd9c7bd832792b7fc3e051ff8d4f70e4a1e908e7aa7f3e532b7df666869de15f9b59135bc159558767120ddadf9e250328a6d08077aa
7
+ data.tar.gz: 3d4918317db7ff14b716818d16bddf0c73585aa70a1d98fc97af2f91f3858168426427408f4fd0214e3d40e1a17700980479fec13cae84db088b3665919425ba
data/Gemfile CHANGED
@@ -2,3 +2,5 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in the_mask.gemspec
4
4
  gemspec
5
+
6
+ gem 'socksify', git: 'git@github.com:saouddk/socksify-ruby.git'
@@ -7,5 +7,6 @@ end
7
7
 
8
8
  require 'the_mask/user_agents'
9
9
  require 'the_mask/proxy_list'
10
+ require 'the_mask/mechanize_socks_support'
10
11
  require 'the_mask/socket'
11
12
  require 'the_mask/connect'
@@ -0,0 +1,38 @@
1
+ require 'socksify'
2
+ require 'socksify/http'
3
+
4
+ module MechanizeSOCKSSupport
5
+ def set_proxy(addr, port, user = nil, pass = nil)
6
+ @socks = false
7
+ super
8
+ end
9
+
10
+ def fetch(uri, method = :get, headers = {}, params = [], referer = current_page, redirects = 0)
11
+ if @socks && !@socks_http.nil?
12
+ html = @socks_http.get URI(uri)
13
+ page = Struct.new(:uri, :body)
14
+ page.new(uri, html)
15
+ else
16
+ super
17
+ end
18
+ end
19
+
20
+ class Mechanize::HTTP::Agent
21
+ prepend MechanizeSOCKSSupport
22
+ attr_accessor :http, :socks
23
+
24
+ public
25
+ def set_socks(addr, port)
26
+ set_http unless @http
27
+ @socks_http = Net::HTTP::SOCKSProxy(addr, port)
28
+ @socks = true
29
+ end
30
+
31
+ private
32
+ def set_http
33
+ @http = Net::HTTP::Persistent.new 'mechanize'
34
+ @http.idle_timeout = 5
35
+ @http.keep_alive = 300
36
+ end
37
+ end
38
+ end
@@ -1,13 +1,30 @@
1
1
  module TheMask
2
2
  class ProxyList
3
- #TheMask::ProxyList::Proxy class
4
3
  class Proxy
5
- attr_accessor :ip, :port, :username, :password
4
+ attr_accessor :ip, :port, :username, :password, :type, :socks_version
5
+ HTTP_PROXY = :http
6
+ SOCKS_PROXY = :socks
7
+ SUPPORTED_SOCKS_VERSIONS = ['4', '5']
6
8
 
7
- def initialize(string = '')
8
- unless string.empty?
9
+ def initialize(input_string = '')
10
+ unless input_string.empty?
9
11
  # Proxy string format = ip:port:username:password
10
- split_str = string.split ":"
12
+ split_str = input_string.split(':')
13
+ last_element = split_str[-1].to_s.downcase
14
+
15
+ # Check if the proxy has the SOCKS parameter enabled; by default a proxy is always an HTTP/S proxy
16
+ if last_element.eql?(SOCKS_PROXY.to_s) && (split_str.length == 3 || split_str.length == 5)
17
+ @type = SOCKS_PROXY
18
+ socks_version = last_element[-1]
19
+ if SUPPORTED_SOCKS_VERSIONS.include?(socks_version)
20
+ @socks_version = socks_version.to_i
21
+ else
22
+ # Default version is the latest SOCKS, in this case ver. 5
23
+ @socks_version = SUPPORTED_SOCKS_VERSIONS[-1].to_i
24
+ end
25
+ else
26
+ @type = HTTP_PROXY
27
+ end
11
28
  @ip = split_str[0].to_s
12
29
  @port = split_str[1].to_i
13
30
  @username = split_str[2].to_s unless split_str[2].nil?
@@ -20,8 +37,14 @@ module TheMask
20
37
  end
21
38
  end
22
39
 
40
+ def is_SOCKS?
41
+ @type == SOCKS_PROXY
42
+ end
43
+
44
+ def is_HTTP?
45
+ @type == HTTP_PROXY
46
+ end
23
47
  end
24
- #ProxyList class
25
48
  attr_accessor :proxy_list
26
49
 
27
50
  def initialize(arr = [])
@@ -1,17 +1,22 @@
1
+ require_relative 'mechanize_socks_support'
2
+
1
3
  module TheMask
4
+ include MechanizeSOCKSSupport
5
+
2
6
  class Socket
3
- #TODO: Move from Mechanize to native Sockets ;)
4
- DEFAULT_OPEN_TIMEOUT = 3 #seconds
5
- DEFAULT_READ_TIMEOUT = 3 #seconds
6
- GENERAL_TIMEOUT = 5 #seconds
7
+ DEFAULT_OPEN_TIMEOUT = 3 # seconds
8
+ DEFAULT_READ_TIMEOUT = 3 # seconds
9
+ GENERAL_TIMEOUT = 5 # seconds
7
10
  MAXIMUM_TRIES = 3
8
- MINIMUM_PAGE_LENGTH = 100 #bytes
11
+ MINIMUM_PAGE_LENGTH = 100 # bytes
9
12
  FORCE_READ = false
10
13
  RESET_USER_AGENT = true
11
- MIN_PROXY_RESPONSE_TIME = nil #seconds, default: nil = do not remove proxies
14
+ MIN_PROXY_RESPONSE_TIME = nil # seconds, default: nil = do not remove proxies
15
+ SOCKS_INCREASE_TIMEOUTS = 0.5 # increase timeout duration by specified magnitude if proxy used for retrieval is SOCKS4/5
12
16
 
13
17
  def initialize(options = {})
14
18
  @proxies = nil
19
+ @socks_increase_timeouts = options[:socks_increase_timeouts] || SOCKS_INCREASE_TIMEOUTS
15
20
  @timeout = options[:timeout] || GENERAL_TIMEOUT
16
21
  @max_tries = options[:max_tries] || MAXIMUM_TRIES
17
22
  @force = options[:force] || FORCE_READ
@@ -20,20 +25,15 @@ module TheMask
20
25
  @min_proxy_response_time = options[:min_proxy_response_time] || MIN_PROXY_RESPONSE_TIME
21
26
 
22
27
  @agent = Mechanize.new
28
+ @agent.history.max_size = 0
23
29
 
24
- @agent.open_timeout = options[:open_timeout] || DEFAULT_OPEN_TIMEOUT
25
- @agent.read_timeout = options[:read_timeout] || DEFAULT_READ_TIMEOUT
30
+ @open_timeout = options[:open_timeout] || DEFAULT_OPEN_TIMEOUT
31
+ @read_timeout = options[:read_timeout] || DEFAULT_READ_TIMEOUT
26
32
 
27
- unless options[:proxies]
28
- if options[:proxy]
29
- if options[:proxy][:username] && options[:proxy][:password]
30
- @agent.set_proxy options[:proxy][:ip], options[:proxy][:port], options[:proxy][:username], options[:proxy][:password]
31
- else
32
- @agent.set_proxy options[:proxy][:ip], options[:proxy][:port]
33
- end
34
- end
35
- else
33
+ if options[:proxies]
36
34
  @proxies = TheMask::ProxyList.new(options[:proxies])
35
+ else
36
+ @proxies = TheMask::ProxyList.new([options[:proxy]])
37
37
  end
38
38
 
39
39
  @agent.user_agent = TheMask.get_random_user_agent_str unless @reset_user_agent
@@ -41,11 +41,12 @@ module TheMask
41
41
 
42
42
  def open_url(url)
43
43
  read_proc = Proc.new do
44
- proxy = nil #Selected proxy
45
- tries = 0 #Total URL retrieval tries
46
- page_data = nil #Retrieved page html data
44
+ proxy = nil # Selected proxy
45
+ tries = 0 # Total URL retrieval tries
46
+ page_data = nil # Retrieved page html data
47
+ timeout_adjustments = nil # Adjustments to timeouts based on proxy type
47
48
 
48
- #Variables for timing the GET request
49
+ # Variables for timing the GET request
49
50
  end_time = nil
50
51
  start_time = nil
51
52
 
@@ -63,30 +64,42 @@ module TheMask
63
64
  begin
64
65
  proxy = @proxies.get_proxy
65
66
 
66
- if proxy.username && proxy.password
67
- @agent.set_proxy proxy.ip, proxy.port, proxy.username, proxy.password
67
+ if proxy.is_SOCKS?
68
+ @agent.agent.set_socks proxy.ip, proxy.port
69
+ timeout_adjustments = calculate_timeouts @socks_increase_timeouts
70
+ elsif proxy.is_HTTP?
71
+ if proxy.username && proxy.password
72
+ @agent.set_proxy proxy.ip, proxy.port, proxy.username, proxy.password
73
+ else
74
+ @agent.set_proxy proxy.ip, proxy.port
75
+ end
76
+ timeout_adjustments = calculate_timeouts
68
77
  else
69
- @agent.set_proxy proxy.ip, proxy.port
78
+ raise "TheMask: unknown proxy type '#{proxy.type}'."
70
79
  end
71
80
  end
72
81
  end
73
82
  rescue Timeout::ExitException => e
74
- #Exception timeout from mechanize
83
+ # Exception timeout from mechanize
75
84
  @proxies.remove_proxy!(proxy)
76
85
  retry
77
86
  end
78
-
79
- Timeout::timeout(@timeout) do
87
+ @agent.open_timeout = timeout_adjustments[:open_timeout]
88
+ @agent.read_timeout = timeout_adjustments[:read_timeout]
89
+ Timeout::timeout(timeout_adjustments[:timeout]) do
80
90
  start_time = Time.now
81
91
  page_data = @agent.get url
82
92
  end_time = Time.now
83
93
  end
94
+
84
95
  rescue Errno::ETIMEDOUT => e
85
96
  retry
86
97
  rescue Net::HTTP::Persistent::Error => e
87
98
  retry
88
99
  rescue Timeout::Error => e
89
100
  retry
101
+ rescue SOCKSError => e
102
+ retry
90
103
  rescue SignalException => e
91
104
  retry
92
105
  rescue Net::HTTPNotFound => e
@@ -104,7 +117,7 @@ module TheMask
104
117
  end
105
118
 
106
119
  unless @min_proxy_response_time.nil? || start_time.nil? || end_time.nil?
107
- #Remove proxy from list if response time is longer than the minimum response time provided in options
120
+ # Remove proxy from list if response time is longer than the minimum response time provided in options
108
121
  response_time = end_time - start_time
109
122
  @proxies.remove_proxy!(proxy) if response_time > @min_proxy_response_time
110
123
  end
@@ -115,7 +128,6 @@ module TheMask
115
128
  if @force
116
129
  while true
117
130
  data = read_proc.call
118
-
119
131
  unless data.nil? || data.body.to_s.empty? || data.body.to_s.length < @min_page_length
120
132
  return data.body
121
133
  end
@@ -124,5 +136,14 @@ module TheMask
124
136
 
125
137
  read_proc.call.body
126
138
  end
139
+
140
+ private
141
+ def calculate_timeouts(increase_magnitude = 0)
142
+ {
143
+ timeout: (@timeout + (@timeout * increase_magnitude)),
144
+ open_timeout: (@open_timeout + (@open_timeout * increase_magnitude)),
145
+ read_timeout: (@read_timeout + (@read_timeout * increase_magnitude))
146
+ }
147
+ end
127
148
  end
128
149
  end
@@ -1,3 +1,3 @@
1
1
  module TheMask
2
- VERSION = "0.2.2"
2
+ VERSION = "0.2.5"
3
3
  end
@@ -23,5 +23,6 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency 'rake', '~> 10.0'
24
24
  spec.add_development_dependency 'rspec', '~> 0'
25
25
 
26
- spec.add_dependency 'mechanize', '~> 2.7'
26
+ spec.add_dependency 'mechanize', '~> 2.7.4'
27
+ spec.add_dependency 'socksify', '~> 1.7.0'
27
28
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: the_mask
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Saoud Khalifah
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-11-29 00:00:00.000000000 Z
11
+ date: 2016-06-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -58,14 +58,28 @@ dependencies:
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '2.7'
61
+ version: 2.7.4
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '2.7'
68
+ version: 2.7.4
69
+ - !ruby/object:Gem::Dependency
70
+ name: socksify
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 1.7.0
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 1.7.0
69
83
  description: TheMask provides functionality for the purposes of data mining.
70
84
  email:
71
85
  - saouddk@gmail.com
@@ -85,6 +99,7 @@ files:
85
99
  - bin/setup
86
100
  - lib/the_mask.rb
87
101
  - lib/the_mask/connect.rb
102
+ - lib/the_mask/mechanize_socks_support.rb
88
103
  - lib/the_mask/proxy_list.rb
89
104
  - lib/the_mask/socket.rb
90
105
  - lib/the_mask/user_agents.rb
@@ -115,3 +130,4 @@ signing_key:
115
130
  specification_version: 4
116
131
  summary: Socket obfuscation for purposes of data mining/gathering.
117
132
  test_files: []
133
+ has_rdoc: