arachnid2 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7e0c363d07cd32c0cbef2130ca5bb5a340ffbb2ea41c9ada00e0e292e5058753
4
- data.tar.gz: cdc27f16c8f9b0b84878b9336b6101fb8fa5961e890136b5aa891e3d7fe3cc73
3
+ metadata.gz: 35ba8ebca2f7ed2459bdf6db38053835a6977bb51b34627fd2f9642bb3b7abb3
4
+ data.tar.gz: f3ed7cd8b5cbabb1643bb2fee9121a8df97e9ba05c4d5452f53e32fa91bd8090
5
5
  SHA512:
6
- metadata.gz: a299e7db051a591bd131b1f633476e70d7dd45a7216c57e02bc51705b7e216be87c8d9b98f497fb084bbae1b06b0a07188884a486b79da951202e6e582bed5f9
7
- data.tar.gz: 4148aa53935e6c52c45ce5a349624366a524ac2783f2615346fed73c04b968b46596500ffe1dc7f12389820506748ce65c70ee5f38c0dacbcc3c8e94cf0701f1
6
+ metadata.gz: b99ae55035e1064fa2140d8d5c13e924a47ba52905611586090c0e35650e31d77067c39a00245d0be5364945f24a025821f18b45917b8584fd3aec72fd0190b3
7
+ data.tar.gz: ebf1e8ca73ed4a964b0b33de3ca396ba48b59df1777402ac9cc8d3ba1126cd5ea63b559c5d55a88f51595af8f58acefa592cefebaae8f590d697586b5ac27461
data/README.md CHANGED
@@ -67,6 +67,12 @@ opts = {
67
67
  max_urls: 50,
68
68
  language: "en-UK",
69
69
  user_agent: "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
70
+ proxy: {
71
+ ip: "1.2.3.4",
72
+ port: "1234",
73
+ username: "sam",
74
+ password: "coolcoolcool",
75
+ }
70
76
  }
71
77
  responses = []
72
78
 
@@ -100,6 +106,12 @@ This user agent is a string mapped to the HTTP header User-Agent. The
100
106
  default is
101
107
  `Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15`
102
108
 
109
+ #### `proxy`
110
+
111
+ Provide the IP address and port of your proxy. If required, provide credentials for
112
+ authenticating to that proxy. Proxy options and handling are done
113
+ by Typhoeus.
114
+
103
115
  ### Memory use in Docker
104
116
 
105
117
  In case you are operating the crawler within a container, Arachnid2
data/lib/arachnid2.rb CHANGED
@@ -65,8 +65,16 @@ class Arachnid2
65
65
  #
66
66
  # opts = {
67
67
  # :time_box => 30,
68
- # :language => "es-IO",
69
- # :user_agent => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"
68
+ # :headers => {
69
+ # 'Accept-Language' => "en-UK",
70
+ # 'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
71
+ # },
72
+ # :proxy => {
73
+ # :ip => "1.2.3.4",
74
+ # :port => "1234",
75
+ # :username => "sam",
76
+ # :password => "coolcoolcool",
77
+ # }
70
78
  # }
71
79
  # responses = []
72
80
  # spider.crawl(opts) { |response|
@@ -175,25 +183,27 @@ class Arachnid2
175
183
  followlocation: true,
176
184
  cookiefile: @cookie_file.path,
177
185
  cookiejar: @cookie_file.path,
178
- headers: {
179
- 'Accept-Language' => "#{language}",
180
- 'User-Agent' => "#{user_agent}"
181
- }
186
+ headers: @options[:headers]
182
187
  }
183
188
 
189
+ @request_options[:headers] ||= {}
190
+ @request_options[:headers]['Accept-Language'] ||= DEFAULT_LANGUAGE
191
+ @request_options[:headers]['User-Agent'] ||= DEFAULT_USER_AGENT
192
+
184
193
  @request_options
185
194
  end
186
195
 
187
- def language
188
- @options[:language] || DEFAULT_LANGUAGE
189
- end
196
+ def crawl_options
197
+ @crawl_options ||= nil
190
198
 
191
- def user_agent
192
- @options[:user_agent] || DEFAULT_USER_AGENT
193
- end
199
+ if !@crawl_options
200
+ @crawl_options = { :max_urls => max_urls, :time_limit => time_limit }
194
201
 
195
- def crawl_options
196
- { :max_urls => max_urls, :time_limit => time_limit }
202
+ @crawl_options[:proxy] = "#{@options[:proxy][:ip]}:#{@options[:proxy][:port]}" if @options.dig(:proxy, :ip)
203
+ @crawl_options[:proxyuserpwd] = "#{@options[:proxy][:username]}:#{@options[:proxy][:password]}" if @options.dig(:proxy, :username)
204
+ end
205
+
206
+ @crawl_options
197
207
  end
198
208
 
199
209
  def max_urls
@@ -1,3 +1,3 @@
1
1
  class Arachnid2
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arachnid2
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Nissen
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-05-30 00:00:00.000000000 Z
11
+ date: 2018-06-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler