the_mask 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +27 -16
- data/lib/the_mask/socket.rb +16 -2
- data/lib/the_mask/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d67508982b071a8217ee5ca318f3ad041dc601d1
|
4
|
+
data.tar.gz: c29f73ee46620975f3671caf575a9c34726a1047
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d6181d81ef0efb3c02f36f70a4731bf8f55ae62e186ba09f3cbd996300e2a288ac0af2568ae1df0746b109866dd3580fa7de09c666e3ef9a08c0a5576ebc8266
|
7
|
+
data.tar.gz: e6b3d53f7d9a8465d825e0273194ac89e55299f5443a7e33e91d674fb48fadc69c2f7f9543a7bd145a98bd739988964752c07efac847530ed8bc98693e6f2259
|
data/README.md
CHANGED
@@ -4,6 +4,13 @@
|
|
4
4
|
|
5
5
|
Tired of issues involved with data mining? Put on The Mask and try data mining designed for the next generation.
|
6
6
|
|
7
|
+
## Features
|
8
|
+
|
9
|
+
- A powerful internal proxy list manager that prioritizes successful mining and retrieval of data.
|
10
|
+
- Full control over the data mining process through various configurations available (see below for options).
|
11
|
+
- Socket obfuscation (in-progress)
|
12
|
+
|
13
|
+
|
7
14
|
## Installation
|
8
15
|
|
9
16
|
Add this line to your application's Gemfile:
|
@@ -12,36 +19,40 @@ Add this line to your application's Gemfile:
|
|
12
19
|
gem 'the_mask'
|
13
20
|
```
|
14
21
|
|
15
|
-
And then execute:
|
16
|
-
|
17
|
-
$ bundle
|
18
|
-
|
19
22
|
Or install it yourself as:
|
20
23
|
|
21
24
|
$ gem install the_mask
|
22
25
|
|
23
26
|
## Usage
|
24
27
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
+
```ruby
|
29
|
+
mask_connect = TheMask::Connect.new(read_timeout: 4, open_timeout: 4, max_tries: 4)
|
30
|
+
mask_connect.open_url('http://www.abcdefg.com')
|
31
|
+
```
|
28
32
|
|
29
33
|
This will return the body data from the supplied URL.
|
30
34
|
|
31
35
|
Available options:
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
36
|
+
```
|
37
|
+
read_timeout = Read timeout in seconds (default: 3)
|
38
|
+
open_timeout = Open timeout in seconds (default: 3)
|
39
|
+
timeout = Timeout for whole procedure in seconds (default: 5)
|
40
|
+
max_tries = Maximum attempts in reading the page (default: 3)
|
41
|
+
min_page_length = Minimum page length in bytes, if not satisfied, reattempt retrieval (default: 100 bytes)
|
42
|
+
reset_ua = Reset user agent on every request. (default: true)
|
43
|
+
force = Force continuous opening of page until data is retrieved (default: false)
|
44
|
+
min_proxy_response_time = Minimum response time for proxies in seconds. After executing open_url, if proxy response time is over set minimum, proxy will be removed from internal proxy list (default: no minimum response time)
|
45
|
+
```
|
39
46
|
|
40
47
|
Proxy options example:
|
41
|
-
|
48
|
+
```ruby
|
49
|
+
mask_connect = TheMask::Connect.new(proxy: { ip: '127.0.0.1', port: 8080, username: 'asd333', password: 'asd333' })
|
50
|
+
```
|
42
51
|
|
43
52
|
Or supply multiple proxies with an array:
|
44
|
-
|
53
|
+
```ruby
|
54
|
+
mask_connect = TheMask::Connect.new(proxies: ['111.11.1.1:80', '10.10.101.10:800', '192.10.10.1:80:sdad:asdasd'])
|
55
|
+
```
|
45
56
|
|
46
57
|
|
47
58
|
## Development
|
data/lib/the_mask/socket.rb
CHANGED
@@ -8,6 +8,7 @@ module TheMask
|
|
8
8
|
MINIMUM_PAGE_LENGTH = 100 #bytes
|
9
9
|
FORCE_READ = false
|
10
10
|
RESET_USER_AGENT = true
|
11
|
+
MIN_PROXY_RESPONSE_TIME = nil #seconds, default: nil = do not remove proxies
|
11
12
|
|
12
13
|
def initialize(options = {})
|
13
14
|
@proxies = nil
|
@@ -16,6 +17,7 @@ module TheMask
|
|
16
17
|
@force = options[:force] || FORCE_READ
|
17
18
|
@min_page_length = options[:min_page_length] || MINIMUM_PAGE_LENGTH
|
18
19
|
@reset_user_agent = options[:reset_ua] || RESET_USER_AGENT
|
20
|
+
@min_proxy_response_time = options[:min_proxy_response_time] || MIN_PROXY_RESPONSE_TIME
|
19
21
|
|
20
22
|
@agent = Mechanize.new
|
21
23
|
|
@@ -39,9 +41,14 @@ module TheMask
|
|
39
41
|
|
40
42
|
def open_url(url)
|
41
43
|
read_proc = Proc.new do
|
44
|
+
proxy = nil #Selected proxy
|
42
45
|
tries = 0 #Total URL retrieval tries
|
43
46
|
page_data = nil #Retrieved page html data
|
44
47
|
|
48
|
+
#Variables for timing the GET request
|
49
|
+
end_time = nil
|
50
|
+
start_time = nil
|
51
|
+
|
45
52
|
begin
|
46
53
|
tries += 1
|
47
54
|
|
@@ -51,8 +58,6 @@ module TheMask
|
|
51
58
|
|
52
59
|
@agent.user_agent = TheMask.get_random_user_agent_str if @reset_user_agent
|
53
60
|
|
54
|
-
proxy = nil
|
55
|
-
|
56
61
|
begin
|
57
62
|
unless @proxies.nil?
|
58
63
|
begin
|
@@ -72,7 +77,9 @@ module TheMask
|
|
72
77
|
end
|
73
78
|
|
74
79
|
Timeout::timeout(@timeout) do
|
80
|
+
start_time = Time.now
|
75
81
|
page_data = @agent.get url
|
82
|
+
end_time = Time.now
|
76
83
|
end
|
77
84
|
rescue Errno::ETIMEDOUT => e
|
78
85
|
retry
|
@@ -95,6 +102,13 @@ module TheMask
|
|
95
102
|
rescue
|
96
103
|
retry
|
97
104
|
end
|
105
|
+
|
106
|
+
unless @min_proxy_response_time.nil? || start_time.nil? || end_time.nil?
|
107
|
+
#Remove proxy from list if response time is longer than the minimum response time provided in options
|
108
|
+
response_time = end_time - start_time
|
109
|
+
@proxies.remove_proxy!(proxy) if response_time > @min_proxy_response_time
|
110
|
+
end
|
111
|
+
|
98
112
|
page_data
|
99
113
|
end
|
100
114
|
|
data/lib/the_mask/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: the_mask
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Saoud Khalifah
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-11-
|
11
|
+
date: 2015-11-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|