the_mask 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +27 -16
- data/lib/the_mask/socket.rb +16 -2
- data/lib/the_mask/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d67508982b071a8217ee5ca318f3ad041dc601d1
|
4
|
+
data.tar.gz: c29f73ee46620975f3671caf575a9c34726a1047
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d6181d81ef0efb3c02f36f70a4731bf8f55ae62e186ba09f3cbd996300e2a288ac0af2568ae1df0746b109866dd3580fa7de09c666e3ef9a08c0a5576ebc8266
|
7
|
+
data.tar.gz: e6b3d53f7d9a8465d825e0273194ac89e55299f5443a7e33e91d674fb48fadc69c2f7f9543a7bd145a98bd739988964752c07efac847530ed8bc98693e6f2259
|
data/README.md
CHANGED
@@ -4,6 +4,13 @@
|
|
4
4
|
|
5
5
|
Tired of issues involved with data mining? Put on The Mask and try data mining designed for the next generation.
|
6
6
|
|
7
|
+
## Features
|
8
|
+
|
9
|
+
- A powerful internal proxy list manager that prioritizes successful mining and retrieval of data.
|
10
|
+
- Full control over the data mining process through a range of configuration options (see below).
|
11
|
+
- Socket obfuscation (in progress)
|
12
|
+
|
13
|
+
|
7
14
|
## Installation
|
8
15
|
|
9
16
|
Add this line to your application's Gemfile:
|
@@ -12,36 +19,40 @@ Add this line to your application's Gemfile:
|
|
12
19
|
gem 'the_mask'
|
13
20
|
```
|
14
21
|
|
15
|
-
And then execute:
|
16
|
-
|
17
|
-
$ bundle
|
18
|
-
|
19
22
|
Or install it yourself as:
|
20
23
|
|
21
24
|
$ gem install the_mask
|
22
25
|
|
23
26
|
## Usage
|
24
27
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
+
```ruby
|
29
|
+
mask_connect = TheMask::Connect.new(read_timeout: 4, open_timeout: 4, max_tries: 4)
|
30
|
+
mask_connect.open_url('http://www.abcdefg.com')
|
31
|
+
```
|
28
32
|
|
29
33
|
This will return the body data from the supplied URL.
|
30
34
|
|
31
35
|
Available options:
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
36
|
+
```
|
37
|
+
read_timeout = Read timeout in seconds (default: 3)
|
38
|
+
open_timeout = Open timeout in seconds (default: 3)
|
39
|
+
timeout = Timeout for whole procedure in seconds (default: 5)
|
40
|
+
max_tries = Maximum number of attempts to read the page (default: 3)
|
41
|
+
min_page_length = Minimum page length in bytes; if the retrieved page is shorter, retrieval is reattempted (default: 100 bytes)
|
42
|
+
reset_ua = Reset user agent on every request. (default: true)
|
43
|
+
force = Force continuous opening of page until data is retrieved (default: false)
|
44
|
+
min_proxy_response_time = Response time threshold for proxies, in seconds. After open_url is executed, a proxy whose response time exceeded this threshold is removed from the internal proxy list (default: no threshold, proxies are not removed)
|
45
|
+
```
|
39
46
|
|
40
47
|
Proxy options example:
|
41
|
-
|
48
|
+
```ruby
|
49
|
+
mask_connect = TheMask::Connect.new(proxy: { ip: '127.0.0.1', port: 8080, username: 'asd333', password: 'asd333' })
|
50
|
+
```
|
42
51
|
|
43
52
|
Or supply multiple proxies with an array:
|
44
|
-
|
53
|
+
```ruby
|
54
|
+
mask_connect = TheMask::Connect.new(proxies: ['111.11.1.1:80', '10.10.101.10:800', '192.10.10.1:80:sdad:asdasd'])
|
55
|
+
```
|
45
56
|
|
46
57
|
|
47
58
|
## Development
|
data/lib/the_mask/socket.rb
CHANGED
@@ -8,6 +8,7 @@ module TheMask
|
|
8
8
|
MINIMUM_PAGE_LENGTH = 100 #bytes
|
9
9
|
FORCE_READ = false
|
10
10
|
RESET_USER_AGENT = true
|
11
|
+
MIN_PROXY_RESPONSE_TIME = nil #seconds, default: nil = do not remove proxies
|
11
12
|
|
12
13
|
def initialize(options = {})
|
13
14
|
@proxies = nil
|
@@ -16,6 +17,7 @@ module TheMask
|
|
16
17
|
@force = options[:force] || FORCE_READ
|
17
18
|
@min_page_length = options[:min_page_length] || MINIMUM_PAGE_LENGTH
|
18
19
|
@reset_user_agent = options[:reset_ua] || RESET_USER_AGENT
|
20
|
+
@min_proxy_response_time = options[:min_proxy_response_time] || MIN_PROXY_RESPONSE_TIME
|
19
21
|
|
20
22
|
@agent = Mechanize.new
|
21
23
|
|
@@ -39,9 +41,14 @@ module TheMask
|
|
39
41
|
|
40
42
|
def open_url(url)
|
41
43
|
read_proc = Proc.new do
|
44
|
+
proxy = nil #Selected proxy
|
42
45
|
tries = 0 #Total URL retrieval tries
|
43
46
|
page_data = nil #Retrieved page html data
|
44
47
|
|
48
|
+
#Variables for timing the GET request
|
49
|
+
end_time = nil
|
50
|
+
start_time = nil
|
51
|
+
|
45
52
|
begin
|
46
53
|
tries += 1
|
47
54
|
|
@@ -51,8 +58,6 @@ module TheMask
|
|
51
58
|
|
52
59
|
@agent.user_agent = TheMask.get_random_user_agent_str if @reset_user_agent
|
53
60
|
|
54
|
-
proxy = nil
|
55
|
-
|
56
61
|
begin
|
57
62
|
unless @proxies.nil?
|
58
63
|
begin
|
@@ -72,7 +77,9 @@ module TheMask
|
|
72
77
|
end
|
73
78
|
|
74
79
|
Timeout::timeout(@timeout) do
|
80
|
+
start_time = Time.now
|
75
81
|
page_data = @agent.get url
|
82
|
+
end_time = Time.now
|
76
83
|
end
|
77
84
|
rescue Errno::ETIMEDOUT => e
|
78
85
|
retry
|
@@ -95,6 +102,13 @@ module TheMask
|
|
95
102
|
rescue
|
96
103
|
retry
|
97
104
|
end
|
105
|
+
|
106
|
+
unless @min_proxy_response_time.nil? || start_time.nil? || end_time.nil?
|
107
|
+
#Remove proxy from list if response time is longer than the minimum response time provided in options
|
108
|
+
response_time = end_time - start_time
|
109
|
+
@proxies.remove_proxy!(proxy) if response_time > @min_proxy_response_time
|
110
|
+
end
|
111
|
+
|
98
112
|
page_data
|
99
113
|
end
|
100
114
|
|
data/lib/the_mask/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: the_mask
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Saoud Khalifah
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-11-
|
11
|
+
date: 2015-11-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|