the_mask 0.2.2 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d67508982b071a8217ee5ca318f3ad041dc601d1
4
- data.tar.gz: c29f73ee46620975f3671caf575a9c34726a1047
3
+ metadata.gz: 81556b7f9b9fb2c1bdf439166e30b19c53fefb31
4
+ data.tar.gz: 8a63e7102e20188ca209d1141524891241b963b9
5
5
  SHA512:
6
- metadata.gz: d6181d81ef0efb3c02f36f70a4731bf8f55ae62e186ba09f3cbd996300e2a288ac0af2568ae1df0746b109866dd3580fa7de09c666e3ef9a08c0a5576ebc8266
7
- data.tar.gz: e6b3d53f7d9a8465d825e0273194ac89e55299f5443a7e33e91d674fb48fadc69c2f7f9543a7bd145a98bd739988964752c07efac847530ed8bc98693e6f2259
6
+ metadata.gz: 43edf63a4c770bc64852dd9c7bd832792b7fc3e051ff8d4f70e4a1e908e7aa7f3e532b7df666869de15f9b59135bc159558767120ddadf9e250328a6d08077aa
7
+ data.tar.gz: 3d4918317db7ff14b716818d16bddf0c73585aa70a1d98fc97af2f91f3858168426427408f4fd0214e3d40e1a17700980479fec13cae84db088b3665919425ba
data/Gemfile CHANGED
@@ -2,3 +2,5 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in the_mask.gemspec
4
4
  gemspec
5
+
6
+ gem 'socksify', git: 'git@github.com:saouddk/socksify-ruby.git'
@@ -7,5 +7,6 @@ end
7
7
 
8
8
  require 'the_mask/user_agents'
9
9
  require 'the_mask/proxy_list'
10
+ require 'the_mask/mechanize_socks_support'
10
11
  require 'the_mask/socket'
11
12
  require 'the_mask/connect'
@@ -0,0 +1,38 @@
1
+ require 'socksify'
2
+ require 'socksify/http'
3
+
4
+ module MechanizeSOCKSSupport
5
+ def set_proxy(addr, port, user = nil, pass = nil)
6
+ @socks = false
7
+ super
8
+ end
9
+
10
+ def fetch(uri, method = :get, headers = {}, params = [], referer = current_page, redirects = 0)
11
+ if @socks && !@socks_http.nil?
12
+ html = @socks_http.get URI(uri)
13
+ page = Struct.new(:uri, :body)
14
+ page.new(uri, html)
15
+ else
16
+ super
17
+ end
18
+ end
19
+
20
+ class Mechanize::HTTP::Agent
21
+ prepend MechanizeSOCKSSupport
22
+ attr_accessor :http, :socks
23
+
24
+ public
25
+ def set_socks(addr, port)
26
+ set_http unless @http
27
+ @socks_http = Net::HTTP::SOCKSProxy(addr, port)
28
+ @socks = true
29
+ end
30
+
31
+ private
32
+ def set_http
33
+ @http = Net::HTTP::Persistent.new 'mechanize'
34
+ @http.idle_timeout = 5
35
+ @http.keep_alive = 300
36
+ end
37
+ end
38
+ end
@@ -1,13 +1,30 @@
1
1
  module TheMask
2
2
  class ProxyList
3
- #TheMask::ProxyList::Proxy class
4
3
  class Proxy
5
- attr_accessor :ip, :port, :username, :password
4
+ attr_accessor :ip, :port, :username, :password, :type, :socks_version
5
+ HTTP_PROXY = :http
6
+ SOCKS_PROXY = :socks
7
+ SUPPORTED_SOCKS_VERSIONS = ['4', '5']
6
8
 
7
- def initialize(string = '')
8
- unless string.empty?
9
+ def initialize(input_string = '')
10
+ unless input_string.empty?
9
11
  # Proxy string format = ip:port:username:password
10
- split_str = string.split ":"
12
+ split_str = input_string.split(':')
13
+ last_element = split_str[-1].to_s.downcase
14
+
15
+ # Check if the proxy has the SOCKS parameter enabled; by default a proxy is always an HTTP/S proxy
16
+ if last_element.eql?(SOCKS_PROXY.to_s) && (split_str.length == 3 || split_str.length == 5)
17
+ @type = SOCKS_PROXY
18
+ socks_version = last_element[-1]
19
+ if SUPPORTED_SOCKS_VERSIONS.include?(socks_version)
20
+ @socks_version = socks_version.to_i
21
+ else
22
+ # Default version is the latest SOCKS version, in this case ver. 5
23
+ @socks_version = SUPPORTED_SOCKS_VERSIONS[-1].to_i
24
+ end
25
+ else
26
+ @type = HTTP_PROXY
27
+ end
11
28
  @ip = split_str[0].to_s
12
29
  @port = split_str[1].to_i
13
30
  @username = split_str[2].to_s unless split_str[2].nil?
@@ -20,8 +37,14 @@ module TheMask
20
37
  end
21
38
  end
22
39
 
40
+ def is_SOCKS?
41
+ @type == SOCKS_PROXY
42
+ end
43
+
44
+ def is_HTTP?
45
+ @type == HTTP_PROXY
46
+ end
23
47
  end
24
- #ProxyList class
25
48
  attr_accessor :proxy_list
26
49
 
27
50
  def initialize(arr = [])
@@ -1,17 +1,22 @@
1
+ require_relative 'mechanize_socks_support'
2
+
1
3
  module TheMask
4
+ include MechanizeSOCKSSupport
5
+
2
6
  class Socket
3
- #TODO: Move from Mechanize to native Sockets ;)
4
- DEFAULT_OPEN_TIMEOUT = 3 #seconds
5
- DEFAULT_READ_TIMEOUT = 3 #seconds
6
- GENERAL_TIMEOUT = 5 #seconds
7
+ DEFAULT_OPEN_TIMEOUT = 3 # seconds
8
+ DEFAULT_READ_TIMEOUT = 3 # seconds
9
+ GENERAL_TIMEOUT = 5 # seconds
7
10
  MAXIMUM_TRIES = 3
8
- MINIMUM_PAGE_LENGTH = 100 #bytes
11
+ MINIMUM_PAGE_LENGTH = 100 # bytes
9
12
  FORCE_READ = false
10
13
  RESET_USER_AGENT = true
11
- MIN_PROXY_RESPONSE_TIME = nil #seconds, default: nil = do not remove proxies
14
+ MIN_PROXY_RESPONSE_TIME = nil # seconds, default: nil = do not remove proxies
15
+ SOCKS_INCREASE_TIMEOUTS = 0.5 # increase timeout duration by specified magnitude if proxy used for retrieval is SOCKS4/5
12
16
 
13
17
  def initialize(options = {})
14
18
  @proxies = nil
19
+ @socks_increase_timeouts = options[:socks_increase_timeouts] || SOCKS_INCREASE_TIMEOUTS
15
20
  @timeout = options[:timeout] || GENERAL_TIMEOUT
16
21
  @max_tries = options[:max_tries] || MAXIMUM_TRIES
17
22
  @force = options[:force] || FORCE_READ
@@ -20,20 +25,15 @@ module TheMask
20
25
  @min_proxy_response_time = options[:min_proxy_response_time] || MIN_PROXY_RESPONSE_TIME
21
26
 
22
27
  @agent = Mechanize.new
28
+ @agent.history.max_size = 0
23
29
 
24
- @agent.open_timeout = options[:open_timeout] || DEFAULT_OPEN_TIMEOUT
25
- @agent.read_timeout = options[:read_timeout] || DEFAULT_READ_TIMEOUT
30
+ @open_timeout = options[:open_timeout] || DEFAULT_OPEN_TIMEOUT
31
+ @read_timeout = options[:read_timeout] || DEFAULT_READ_TIMEOUT
26
32
 
27
- unless options[:proxies]
28
- if options[:proxy]
29
- if options[:proxy][:username] && options[:proxy][:password]
30
- @agent.set_proxy options[:proxy][:ip], options[:proxy][:port], options[:proxy][:username], options[:proxy][:password]
31
- else
32
- @agent.set_proxy options[:proxy][:ip], options[:proxy][:port]
33
- end
34
- end
35
- else
33
+ if options[:proxies]
36
34
  @proxies = TheMask::ProxyList.new(options[:proxies])
35
+ else
36
+ @proxies = TheMask::ProxyList.new([options[:proxy]])
37
37
  end
38
38
 
39
39
  @agent.user_agent = TheMask.get_random_user_agent_str unless @reset_user_agent
@@ -41,11 +41,12 @@ module TheMask
41
41
 
42
42
  def open_url(url)
43
43
  read_proc = Proc.new do
44
- proxy = nil #Selected proxy
45
- tries = 0 #Total URL retrieval tries
46
- page_data = nil #Retrieved page html data
44
+ proxy = nil # Selected proxy
45
+ tries = 0 # Total URL retrieval tries
46
+ page_data = nil # Retrieved page html data
47
+ timeout_adjustments = nil # Adjustments to timeouts based on proxy type
47
48
 
48
- #Variables for timing the GET request
49
+ # Variables for timing the GET request
49
50
  end_time = nil
50
51
  start_time = nil
51
52
 
@@ -63,30 +64,42 @@ module TheMask
63
64
  begin
64
65
  proxy = @proxies.get_proxy
65
66
 
66
- if proxy.username && proxy.password
67
- @agent.set_proxy proxy.ip, proxy.port, proxy.username, proxy.password
67
+ if proxy.is_SOCKS?
68
+ @agent.agent.set_socks proxy.ip, proxy.port
69
+ timeout_adjustments = calculate_timeouts @socks_increase_timeouts
70
+ elsif proxy.is_HTTP?
71
+ if proxy.username && proxy.password
72
+ @agent.set_proxy proxy.ip, proxy.port, proxy.username, proxy.password
73
+ else
74
+ @agent.set_proxy proxy.ip, proxy.port
75
+ end
76
+ timeout_adjustments = calculate_timeouts
68
77
  else
69
- @agent.set_proxy proxy.ip, proxy.port
78
+ raise "TheMask: unknown proxy type '#{proxy.type}'."
70
79
  end
71
80
  end
72
81
  end
73
82
  rescue Timeout::ExitException => e
74
- #Exception timeout from mechanize
83
+ # Exception timeout from mechanize
75
84
  @proxies.remove_proxy!(proxy)
76
85
  retry
77
86
  end
78
-
79
- Timeout::timeout(@timeout) do
87
+ @agent.open_timeout = timeout_adjustments[:open_timeout]
88
+ @agent.read_timeout = timeout_adjustments[:read_timeout]
89
+ Timeout::timeout(timeout_adjustments[:timeout]) do
80
90
  start_time = Time.now
81
91
  page_data = @agent.get url
82
92
  end_time = Time.now
83
93
  end
94
+
84
95
  rescue Errno::ETIMEDOUT => e
85
96
  retry
86
97
  rescue Net::HTTP::Persistent::Error => e
87
98
  retry
88
99
  rescue Timeout::Error => e
89
100
  retry
101
+ rescue SOCKSError => e
102
+ retry
90
103
  rescue SignalException => e
91
104
  retry
92
105
  rescue Net::HTTPNotFound => e
@@ -104,7 +117,7 @@ module TheMask
104
117
  end
105
118
 
106
119
  unless @min_proxy_response_time.nil? || start_time.nil? || end_time.nil?
107
- #Remove proxy from list if response time is longer than the minimum response time provided in options
120
+ # Remove proxy from list if response time is longer than the minimum response time provided in options
108
121
  response_time = end_time - start_time
109
122
  @proxies.remove_proxy!(proxy) if response_time > @min_proxy_response_time
110
123
  end
@@ -115,7 +128,6 @@ module TheMask
115
128
  if @force
116
129
  while true
117
130
  data = read_proc.call
118
-
119
131
  unless data.nil? || data.body.to_s.empty? || data.body.to_s.length < @min_page_length
120
132
  return data.body
121
133
  end
@@ -124,5 +136,14 @@ module TheMask
124
136
 
125
137
  read_proc.call.body
126
138
  end
139
+
140
+ private
141
+ def calculate_timeouts(increase_magnitude = 0)
142
+ {
143
+ timeout: (@timeout + (@timeout * increase_magnitude)),
144
+ open_timeout: (@open_timeout + (@open_timeout * increase_magnitude)),
145
+ read_timeout: (@read_timeout + (@read_timeout * increase_magnitude))
146
+ }
147
+ end
127
148
  end
128
149
  end
@@ -1,3 +1,3 @@
1
1
  module TheMask
2
- VERSION = "0.2.2"
2
+ VERSION = "0.2.5"
3
3
  end
@@ -23,5 +23,6 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency 'rake', '~> 10.0'
24
24
  spec.add_development_dependency 'rspec', '~> 0'
25
25
 
26
- spec.add_dependency 'mechanize', '~> 2.7'
26
+ spec.add_dependency 'mechanize', '~> 2.7.4'
27
+ spec.add_dependency 'socksify', '~> 1.7.0'
27
28
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: the_mask
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Saoud Khalifah
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-11-29 00:00:00.000000000 Z
11
+ date: 2016-06-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -58,14 +58,28 @@ dependencies:
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '2.7'
61
+ version: 2.7.4
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '2.7'
68
+ version: 2.7.4
69
+ - !ruby/object:Gem::Dependency
70
+ name: socksify
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 1.7.0
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 1.7.0
69
83
  description: TheMask provides functionality for the purposes of data mining.
70
84
  email:
71
85
  - saouddk@gmail.com
@@ -85,6 +99,7 @@ files:
85
99
  - bin/setup
86
100
  - lib/the_mask.rb
87
101
  - lib/the_mask/connect.rb
102
+ - lib/the_mask/mechanize_socks_support.rb
88
103
  - lib/the_mask/proxy_list.rb
89
104
  - lib/the_mask/socket.rb
90
105
  - lib/the_mask/user_agents.rb
@@ -115,3 +130,4 @@ signing_key:
115
130
  specification_version: 4
116
131
  summary: Socket obfuscation for purposes of data mining/gathering.
117
132
  test_files: []
133
+ has_rdoc: