arachnid2 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +12 -0
- data/lib/arachnid2.rb +24 -14
- data/lib/arachnid2/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 35ba8ebca2f7ed2459bdf6db38053835a6977bb51b34627fd2f9642bb3b7abb3
|
4
|
+
data.tar.gz: f3ed7cd8b5cbabb1643bb2fee9121a8df97e9ba05c4d5452f53e32fa91bd8090
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b99ae55035e1064fa2140d8d5c13e924a47ba52905611586090c0e35650e31d77067c39a00245d0be5364945f24a025821f18b45917b8584fd3aec72fd0190b3
|
7
|
+
data.tar.gz: ebf1e8ca73ed4a964b0b33de3ca396ba48b59df1777402ac9cc8d3ba1126cd5ea63b559c5d55a88f51595af8f58acefa592cefebaae8f590d697586b5ac27461
|
data/README.md
CHANGED
@@ -67,6 +67,12 @@ opts = {
|
|
67
67
|
max_urls: 50,
|
68
68
|
language: "en-UK",
|
69
69
|
user_agent: "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"
|
70
|
+
proxy: {
|
71
|
+
ip: "1.2.3.4",
|
72
|
+
port: "1234",
|
73
|
+
username: "sam",
|
74
|
+
password: "coolcoolcool",
|
75
|
+
}
|
70
76
|
}
|
71
77
|
responses = []
|
72
78
|
|
@@ -100,6 +106,12 @@ This user agent is a string mapped to the HTTP header User-Agent. The
|
|
100
106
|
default is
|
101
107
|
`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15`
|
102
108
|
|
109
|
+
#### `proxy`
|
110
|
+
|
111
|
+
Provide the IP address and port of your proxy. If required, provide credentials for
|
112
|
+
authenticating to that proxy. Proxy options and handling are done
|
113
|
+
by Typhoeus.
|
114
|
+
|
103
115
|
### Memory use in Docker
|
104
116
|
|
105
117
|
In case you are operating the crawler within a container, Arachnid2
|
data/lib/arachnid2.rb
CHANGED
@@ -65,8 +65,16 @@ class Arachnid2
|
|
65
65
|
#
|
66
66
|
# opts = {
|
67
67
|
# :time_box => 30,
|
68
|
-
# :
|
69
|
-
#
|
68
|
+
# :headers => {
|
69
|
+
# 'Accept-Language' => "en-UK",
|
70
|
+
# 'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
|
71
|
+
# },
|
72
|
+
# :proxy => {
|
73
|
+
# :ip => "1.2.3.4",
|
74
|
+
# :port => "1234",
|
75
|
+
# :username => "sam",
|
76
|
+
# :password => "coolcoolcool",
|
77
|
+
# }
|
70
78
|
# }
|
71
79
|
# responses = []
|
72
80
|
# spider.crawl(opts) { |response|
|
@@ -175,25 +183,27 @@ class Arachnid2
|
|
175
183
|
followlocation: true,
|
176
184
|
cookiefile: @cookie_file.path,
|
177
185
|
cookiejar: @cookie_file.path,
|
178
|
-
headers: {
|
179
|
-
'Accept-Language' => "#{language}",
|
180
|
-
'User-Agent' => "#{user_agent}"
|
181
|
-
}
|
186
|
+
headers: @options[:headers]
|
182
187
|
}
|
183
188
|
|
189
|
+
@request_options[:headers] ||= {}
|
190
|
+
@request_options[:headers]['Accept-Language'] ||= DEFAULT_LANGUAGE
|
191
|
+
@request_options[:headers]['User-Agent'] ||= DEFAULT_USER_AGENT
|
192
|
+
|
184
193
|
@request_options
|
185
194
|
end
|
186
195
|
|
187
|
-
def
|
188
|
-
@
|
189
|
-
end
|
196
|
+
def crawl_options
|
197
|
+
@crawl_options ||= nil
|
190
198
|
|
191
|
-
|
192
|
-
|
193
|
-
end
|
199
|
+
if !@crawl_options
|
200
|
+
@crawl_options = { :max_urls => max_urls, :time_limit => time_limit }
|
194
201
|
|
195
|
-
|
196
|
-
|
202
|
+
@crawl_options[:proxy] = "#{@options[:proxy][:ip]}:#{@options[:proxy][:port]}" if @options.dig(:proxy, :ip)
|
203
|
+
@crawl_options[:proxyuserpwd] = "#{@options[:proxy][:username]}:#{@options[:proxy][:password]}" if @options.dig(:proxy, :username)
|
204
|
+
end
|
205
|
+
|
206
|
+
@crawl_options
|
197
207
|
end
|
198
208
|
|
199
209
|
def max_urls
|
data/lib/arachnid2/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arachnid2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Nissen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-06-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|