arachnid2 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +12 -0
- data/lib/arachnid2.rb +24 -14
- data/lib/arachnid2/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 35ba8ebca2f7ed2459bdf6db38053835a6977bb51b34627fd2f9642bb3b7abb3
|
|
4
|
+
data.tar.gz: f3ed7cd8b5cbabb1643bb2fee9121a8df97e9ba05c4d5452f53e32fa91bd8090
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b99ae55035e1064fa2140d8d5c13e924a47ba52905611586090c0e35650e31d77067c39a00245d0be5364945f24a025821f18b45917b8584fd3aec72fd0190b3
|
|
7
|
+
data.tar.gz: ebf1e8ca73ed4a964b0b33de3ca396ba48b59df1777402ac9cc8d3ba1126cd5ea63b559c5d55a88f51595af8f58acefa592cefebaae8f590d697586b5ac27461
|
data/README.md
CHANGED
|
@@ -67,6 +67,12 @@ opts = {
|
|
|
67
67
|
max_urls: 50,
|
|
68
68
|
language: "en-UK",
|
|
69
69
|
user_agent: "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
|
|
70
|
+
proxy: {
|
|
71
|
+
ip: "1.2.3.4",
|
|
72
|
+
port: "1234",
|
|
73
|
+
username: "sam",
|
|
74
|
+
password: "coolcoolcool",
|
|
75
|
+
}
|
|
70
76
|
}
|
|
71
77
|
responses = []
|
|
72
78
|
|
|
@@ -100,6 +106,12 @@ This user agent is a string mapped to the HTTP header User-Agent. The
|
|
|
100
106
|
default is
|
|
101
107
|
`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15`
|
|
102
108
|
|
|
109
|
+
#### `proxy`
|
|
110
|
+
|
|
111
|
+
Provide the IP address and port of your proxy. If required, provide credentials for
|
|
112
|
+
authenticating to that proxy. Proxy options and handling are done
|
|
113
|
+
by Typhoeus.
|
|
114
|
+
|
|
103
115
|
### Memory use in Docker
|
|
104
116
|
|
|
105
117
|
In case you are operating the crawler within a container, Arachnid2
|
data/lib/arachnid2.rb
CHANGED
|
@@ -65,8 +65,16 @@ class Arachnid2
|
|
|
65
65
|
#
|
|
66
66
|
# opts = {
|
|
67
67
|
# :time_box => 30,
|
|
68
|
-
# :
|
|
69
|
-
#
|
|
68
|
+
# :headers => {
|
|
69
|
+
# 'Accept-Language' => "en-UK",
|
|
70
|
+
# 'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
|
|
71
|
+
# },
|
|
72
|
+
# :proxy => {
|
|
73
|
+
# :ip => "1.2.3.4",
|
|
74
|
+
# :port => "1234",
|
|
75
|
+
# :username => "sam",
|
|
76
|
+
# :password => "coolcoolcool",
|
|
77
|
+
# }
|
|
70
78
|
# }
|
|
71
79
|
# responses = []
|
|
72
80
|
# spider.crawl(opts) { |response|
|
|
@@ -175,25 +183,27 @@ class Arachnid2
|
|
|
175
183
|
followlocation: true,
|
|
176
184
|
cookiefile: @cookie_file.path,
|
|
177
185
|
cookiejar: @cookie_file.path,
|
|
178
|
-
headers:
|
|
179
|
-
'Accept-Language' => "#{language}",
|
|
180
|
-
'User-Agent' => "#{user_agent}"
|
|
181
|
-
}
|
|
186
|
+
headers: @options[:headers]
|
|
182
187
|
}
|
|
183
188
|
|
|
189
|
+
@request_options[:headers] ||= {}
|
|
190
|
+
@request_options[:headers]['Accept-Language'] ||= DEFAULT_LANGUAGE
|
|
191
|
+
@request_options[:headers]['User-Agent'] ||= DEFAULT_USER_AGENT
|
|
192
|
+
|
|
184
193
|
@request_options
|
|
185
194
|
end
|
|
186
195
|
|
|
187
|
-
def
|
|
188
|
-
@
|
|
189
|
-
end
|
|
196
|
+
def crawl_options
|
|
197
|
+
@crawl_options ||= nil
|
|
190
198
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
end
|
|
199
|
+
if !@crawl_options
|
|
200
|
+
@crawl_options = { :max_urls => max_urls, :time_limit => time_limit }
|
|
194
201
|
|
|
195
|
-
|
|
196
|
-
|
|
202
|
+
@crawl_options[:proxy] = "#{@options[:proxy][:ip]}:#{@options[:proxy][:port]}" if @options.dig(:proxy, :ip)
|
|
203
|
+
@crawl_options[:proxyuserpwd] = "#{@options[:proxy][:username]}:#{@options[:proxy][:password]}" if @options.dig(:proxy, :username)
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
@crawl_options
|
|
197
207
|
end
|
|
198
208
|
|
|
199
209
|
def max_urls
|
data/lib/arachnid2/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: arachnid2
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Sam Nissen
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2018-
|
|
11
|
+
date: 2018-06-12 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|