arachnid2 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 612ad5683bd7211e9be13fd1be854875d9ea54cd4e96680298185f3fe5a474cb
4
+ data.tar.gz: da911439fbd3d4be7741cb34a44b2adeeecf8fe920981f10621e992e203ec1be
5
+ SHA512:
6
+ metadata.gz: 4a27c953917e51b2541ad9bdc3639d2273a7b4541933f9eb7151e6c0f684da366f4a17e366f25e4872013d5d22bd033f835d5d423bafbc085082d77cb968c104
7
+ data.tar.gz: f6fa3992a716a2cb2fa6cee3420629cd21069648e086846d5cd190a7fa3523e24a8adda0b7c779ecbe36b1d9effdde2903d8c07e5bcb4b9bc3f85cd1c4b9c987
data/.gitignore ADDED
@@ -0,0 +1,13 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
12
+
13
+ arachnid2-*.gem
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.4.1
5
+ before_install: gem install bundler -v 1.16.1
@@ -0,0 +1,74 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to making participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, gender identity and expression, level of experience,
9
+ nationality, personal appearance, race, religion, or sexual identity and
10
+ orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies both within project spaces and in public spaces
49
+ when an individual is representing the project or its community. Examples of
50
+ representing a project or community include using an official project e-mail
51
+ address, posting via an official social media account, or acting as an appointed
52
+ representative at an online or offline event. Representation of a project may be
53
+ further defined and clarified by project maintainers.
54
+
55
+ ## Enforcement
56
+
57
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
58
+ reported by contacting the project team at samuel.nissen@rakuten.com. All
59
+ complaints will be reviewed and investigated and will result in a response that
60
+ is deemed necessary and appropriate to the circumstances. The project team is
61
+ obligated to maintain confidentiality with regard to the reporter of an incident.
62
+ Further details of specific enforcement policies may be posted separately.
63
+
64
+ Project maintainers who do not follow or enforce the Code of Conduct in good
65
+ faith may face temporary or permanent repercussions as determined by other
66
+ members of the project's leadership.
67
+
68
+ ## Attribution
69
+
70
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71
+ available at [http://contributor-covenant.org/version/1/4][version]
72
+
73
+ [homepage]: http://contributor-covenant.org
74
+ [version]: http://contributor-covenant.org/version/1/4/
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in arachnid2.gemspec
6
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,56 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ arachnid2 (0.1.0)
5
+ addressable
6
+ adomain
7
+ bloomfilter-rb
8
+ nokogiri
9
+ typhoeus
10
+
11
+ GEM
12
+ remote: https://rubygems.org/
13
+ specs:
14
+ addressable (2.5.2)
15
+ public_suffix (>= 2.0.2, < 4.0)
16
+ adomain (0.1.1)
17
+ addressable (~> 2.5)
18
+ bloomfilter-rb (2.1.1)
19
+ redis
20
+ diff-lcs (1.3)
21
+ ethon (0.11.0)
22
+ ffi (>= 1.3.0)
23
+ ffi (1.9.23)
24
+ mini_portile2 (2.3.0)
25
+ nokogiri (1.8.2)
26
+ mini_portile2 (~> 2.3.0)
27
+ public_suffix (3.0.2)
28
+ rake (10.5.0)
29
+ redis (4.0.1)
30
+ rspec (3.7.0)
31
+ rspec-core (~> 3.7.0)
32
+ rspec-expectations (~> 3.7.0)
33
+ rspec-mocks (~> 3.7.0)
34
+ rspec-core (3.7.1)
35
+ rspec-support (~> 3.7.0)
36
+ rspec-expectations (3.7.0)
37
+ diff-lcs (>= 1.2.0, < 2.0)
38
+ rspec-support (~> 3.7.0)
39
+ rspec-mocks (3.7.0)
40
+ diff-lcs (>= 1.2.0, < 2.0)
41
+ rspec-support (~> 3.7.0)
42
+ rspec-support (3.7.1)
43
+ typhoeus (1.3.0)
44
+ ethon (>= 0.9.0)
45
+
46
+ PLATFORMS
47
+ ruby
48
+
49
+ DEPENDENCIES
50
+ arachnid2!
51
+ bundler (~> 1.16)
52
+ rake (~> 10.0)
53
+ rspec (~> 3.0)
54
+
55
+ BUNDLED WITH
56
+ 1.16.2
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2018 Sam Nissen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,139 @@
1
+ # Arachnid2
2
+
3
+ ## About
4
+
5
+ Arachnid2 is a simple, fast web-crawler written in Ruby.
6
+ It uses [typhoeus](https://github.com/typhoeus/typhoeus)
7
+ to get HTTP requests,
8
+ [bloomfilter-rb](https://github.com/igrigorik/bloomfilter-rb)
9
+ to store the URLs it will get and has gotten,
10
+ and [nokogiri](https://github.com/sparklemotion/nokogiri)
11
+ to find the URLs on each webpage.
12
+
13
+ Arachnid2 is a successor to [Arachnid](https://github.com/dchuk/Arachnid),
14
+ and was abstracted out of the [Tellurion Bot](https://github.com/samnissen/tellurion_bot).
15
+
16
+ ## Usage
17
+
18
+ The basic use of Arachnid2 is surfacing the responses from a domains'
19
+ URLs by visiting a URL, collecting any links to the same domain
20
+ on that page, and visiting those to do the same.
21
+
22
+ Hence, the simplest output would be to collect all of the responses
23
+ while spidering from some URL.
24
+
25
+ ```ruby
26
+ require "arachnid2"
27
+
28
+ url = "http://www.maximumfun.org"
29
+ spider = Arachnid2.new(url)
30
+ responses = []
31
+
32
+ spider.crawl { |response|
33
+ responses << response
34
+ }
35
+ ```
36
+
37
+ Obviously this could become unwieldy,
38
+ so you can execute logic within the spidering to collect a narrow subset
39
+ of the responses, transform or dissect the response,
40
+ or both (or whatever you want).
41
+
42
+ ```ruby
43
+ require "arachnid2"
44
+ require "nokogiri"
45
+
46
+ url = "https://daringfireball.net"
47
+ spider = Arachnid2.new(url)
48
+ responses = []
49
+
50
+ spider.crawl { |response|
51
+ responses << Nokogiri::HTML(response.body) if response.effective_url =~ /.*amazon.*/
52
+ print '*'
53
+ }
54
+ ```
55
+
56
+ `Arachnid2#crawl` will return always `nil`.
57
+
58
+ ### Options
59
+
60
+ ```ruby
61
+ require "arachnid2"
62
+
63
+ url = "http://sixcolours.com"
64
+ spider = Arachnid2.new(url)
65
+ opts = {
66
+ time_box: 60,
67
+ max_urls: 50,
68
+ language: "en-UK",
69
+ user_agent: "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"
70
+ }
71
+ responses = []
72
+
73
+ spider.crawl(opts) { |response|
74
+ responses << response
75
+ }
76
+ ```
77
+
78
+ #### `time_box`
79
+
80
+ The crawler will time-bound your spidering. If no valid integer is provided,
81
+ it will crawl for 15 seconds before exiting. 600 seconds (10 minutes)
82
+ is the current maximum, and any value above it will be reduced to 600.
83
+
84
+ #### `max_urls`
85
+
86
+ The crawler will crawl a limited number of URLs before stopping.
87
+ If no valid integer is provided, it will crawl for 50 URLs before exiting.
88
+ 10000 seconds is the current maximum,
89
+ and any value above it will be reduced to 10000.
90
+
91
+ #### `language`
92
+
93
+ The language is a string mapped to the HTTP header Accept-Language. The
94
+ default is
95
+ `en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, \*;0.4`
96
+
97
+ #### `user_agent`
98
+
99
+ This user agent is a string mapped to the HTTP header User-Agent. The
100
+ default is
101
+ `Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15`
102
+
103
+ ### Memory use in Docker
104
+
105
+ In case you are operating the crawler within a container, Arachnid2
106
+ will attempt to prevent the container from running out of memory,
107
+ and crawls will end when the container has <= 20% of its available memory
108
+ free.
109
+
110
+ ### Non-HTML links
111
+
112
+ The crawler attempts to stop itself from returning data from
113
+ links that are not indicative of HTML, as detailed in
114
+ `Arachnid2::NON_HTML_EXTENSIONS`.
115
+
116
+ ## Development
117
+
118
+ TODO: this
119
+
120
+ ## Contributing
121
+
122
+ Bug reports and pull requests are welcome on GitHub at
123
+ https://github.com/samnissen/arachnid2.
124
+ This project is intended to be a safe,
125
+ welcoming space for collaboration,
126
+ and contributors are expected to adhere to the
127
+ [Contributor Covenant](http://contributor-covenant.org) code of conduct.
128
+
129
+ ## License
130
+
131
+ The gem is available as open source under the terms of the
132
+ [MIT License](https://opensource.org/licenses/MIT).
133
+
134
+ ## Code of Conduct
135
+
136
+ Everyone interacting in the Arachnid2 project’s codebases,
137
+ issue trackers, chat rooms and mailing lists is expected
138
+ to follow the
139
+ [code of conduct](https://github.com/samnissen/arachnid2/blob/master/CODE_OF_CONDUCT.md).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/arachnid2.gemspec ADDED
@@ -0,0 +1,33 @@
1
+
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "arachnid2/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "arachnid2"
8
+ spec.version = Arachnid2::VERSION
9
+ spec.authors = ["Sam Nissen"]
10
+ spec.email = ["scnissen@gmail.com"]
11
+
12
+ spec.summary = %q{A simple, fast web crawler}
13
+ # spec.description = %q{TODO: Write a longer description or delete this line.}
14
+ spec.homepage = "https://github.com/samnissen/arachnid2"
15
+ spec.license = "MIT"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject do |f|
18
+ f.match(%r{^(test|spec|features)/})
19
+ end
20
+ spec.bindir = "exe"
21
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
22
+ spec.require_paths = ["lib"]
23
+
24
+ spec.add_development_dependency "bundler", "~> 1.16"
25
+ spec.add_development_dependency "rake", "~> 10.0"
26
+ spec.add_development_dependency "rspec", "~> 3.0"
27
+
28
+ spec.add_dependency "typhoeus"
29
+ spec.add_dependency "bloomfilter-rb"
30
+ spec.add_dependency "adomain"
31
+ spec.add_dependency "addressable"
32
+ spec.add_dependency "nokogiri"
33
+ end
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "arachnid2"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,3 @@
1
+ class Arachnid2
2
+ VERSION = "0.1.0"
3
+ end
data/lib/arachnid2.rb ADDED
@@ -0,0 +1,237 @@
1
+ require "arachnid2/version"
2
+
3
+ require 'tempfile'
4
+ require "typhoeus"
5
+ require "bloomfilter-rb"
6
+ require "adomain"
7
+ require "addressable/uri"
8
+ require "nokogiri"
9
+
10
+ class Arachnid2
11
+
12
+ # META:
13
+ # About the origins of this crawling approach
14
+ # The Crawler is heavily borrowed from Arachnid.
15
+ # Original: https://github.com/dchuk/Arachnid
16
+ # Other iterations I've borrowed liberally from:
17
+ # - https://github.com/matstc/Arachnid
18
+ # - https://github.com/intrigueio/Arachnid
19
+ # - https://github.com/jhulme/Arachnid
20
+ # And this was originally written as a part of Tellurion's bot
21
+ # https://github.com/samnissen/tellurion_bot
22
+
23
+ MAX_CRAWL_TIME = 600
24
+ BASE_CRAWL_TIME = 15
25
+ MAX_URLS = 10000
26
+ BASE_URLS = 50
27
+ DEFAULT_LANGUAGE = "en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, *;0.4"
28
+ DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"
29
+
30
+ NON_HTML_EXTENSIONS = {
31
+ 3 => ['.gz'],
32
+ 4 => ['.jpg', '.png', '.m4a', '.mp3', '.mp4', '.pdf', '.zip',
33
+ '.wmv', '.gif', '.doc', '.xls', '.pps', '.ppt', '.tar',
34
+ '.iso', '.dmg', '.bin', '.ics', '.exe', '.wav', '.mid'],
35
+ 5 => ['.xlsx', '.docx', '.pptx', '.tiff', '.zipx'],
36
+ 8 => ['.torrent']
37
+ }
38
+ MEMORY_USE_FILE = "/sys/fs/cgroup/memory/memory.usage_in_bytes"
39
+ MEMORY_LIMIT_FILE = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
40
+ MAXIMUM_LOAD_RATE = 79.9
41
+
42
+ #
43
+ # Creates the object to execute the crawl
44
+ #
45
+ # @example
46
+ # url = "https://daringfireball.net"
47
+ # spider = Arachnid2.new(url)
48
+ #
49
+ # @param [String] url
50
+ #
51
+ # @return [Arachnid2] self
52
+ #
53
+ def initialize(url)
54
+ @url = url
55
+ @domain = Adomain[@url]
56
+ end
57
+
58
+ #
59
+ # Visits a URL, gathering links and visiting them,
60
+ # until running out of time, memory or attempts.
61
+ #
62
+ # @example
63
+ # url = "https://daringfireball.net"
64
+ # spider = Arachnid2.new(url)
65
+ #
66
+ # opts = {
67
+ # :time_box => 30,
68
+ # :language => "es-IO",
69
+ # :user_agent => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"
70
+ # }
71
+ # responses = []
72
+ # spider.crawl(opts) { |response|
73
+ # responses << response
74
+ # }
75
+ #
76
+ # @param [Hash] opts
77
+ #
78
+ # @return nil
79
+ #
80
+ def crawl(opts = {})
81
+ preflight(opts)
82
+
83
+ until @global_queue.empty?
84
+ @global_queue.size.times do
85
+ begin
86
+ q = @global_queue.shift
87
+
88
+ break if @global_visited.size >= @crawl_options[:max_urls]
89
+ break if Time.now > @crawl_options[:time_limit]
90
+ break if memory_danger?
91
+
92
+ @global_visited.insert(q)
93
+
94
+ request = Typhoeus::Request.new(q, request_options)
95
+
96
+ request.on_complete do |response|
97
+ links = process(response)
98
+ next unless links
99
+
100
+ yield response
101
+
102
+ vacuum(links, response)
103
+ end
104
+
105
+ request.run
106
+ ensure
107
+ @cookie_file.close! if @cookie_file
108
+ end
109
+ end
110
+ end
111
+ end
112
+
113
+ private
114
+ def process(response)
115
+ return false unless Adomain["#{response.effective_url}"].include? @domain
116
+
117
+ elements = Nokogiri::HTML.parse(response.body).css('a')
118
+ return elements.map {|link| link.attribute('href').to_s}.uniq.sort.delete_if {|href| href.empty? }
119
+ end
120
+
121
+ def vacuum(links, response)
122
+ links.each do |link|
123
+ next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$|^about:/)
124
+
125
+ begin
126
+ absolute_link = make_absolute(link, response.effective_url)
127
+
128
+ next if skip_link?(absolute_link)
129
+
130
+ @global_queue << absolute_link
131
+ rescue Addressable::URI::InvalidURIError
132
+ end
133
+ end
134
+ end
135
+
136
+ def skip_link?(absolute_link)
137
+ internal = internal_link?(absolute_link)
138
+ visited = @global_visited.include?(absolute_link)
139
+ ignored = extension_ignored?(absolute_link)
140
+ known = @global_queue.include?(absolute_link)
141
+
142
+ !internal || visited || ignored || known
143
+ end
144
+
145
+ def preflight(opts)
146
+ @options = opts
147
+ @crawl_options = crawl_options
148
+ # TODO: write looping to take advantage of Hydra
149
+ # @hydra = Typhoeus::Hydra.new(:max_concurrency => 1)
150
+ @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true)
151
+ @global_queue = [@url]
152
+ end
153
+
154
+ def bound_time
155
+ boundary = "#{@options[:time_box]}".to_i
156
+ boundary = BASE_CRAWL_TIME if boundary <= 0
157
+ boundary = MAX_CRAWL_TIME if boundary > MAX_CRAWL_TIME
158
+
159
+ return Time.now + boundary
160
+ end
161
+
162
+ def bound_urls
163
+ amount = "#{@options[:max_urls]}".to_i
164
+ amount = BASE_URLS if amount <= 0
165
+ amount = MAX_URLS if amount > MAX_URLS
166
+
167
+ amount
168
+ end
169
+
170
+ def request_options
171
+ @cookie_file ||= Tempfile.new('cookies')
172
+
173
+ @request_options = {
174
+ timeout: 10000,
175
+ followlocation: true,
176
+ cookiefile: @cookie_file.path,
177
+ cookiejar: @cookie_file.path,
178
+ headers: {
179
+ 'Accept-Language' => "#{language}",
180
+ 'User-Agent' => "#{user_agent}"
181
+ }
182
+ }
183
+
184
+ @request_options
185
+ end
186
+
187
+ def language
188
+ @options[:language] || DEFAULT_LANGUAGE
189
+ end
190
+
191
+ def user_agent
192
+ @options[:user_agent] || DEFAULT_USER_AGENT
193
+ end
194
+
195
+ def crawl_options
196
+ { :max_urls => max_urls, :time_limit => time_limit }
197
+ end
198
+
199
+ def max_urls
200
+ bound_urls
201
+ end
202
+
203
+ def time_limit
204
+ bound_time
205
+ end
206
+
207
+ def make_absolute(href, root)
208
+ Addressable::URI.parse(root).join(Addressable::URI.parse(href)).to_s
209
+ end
210
+
211
+ def internal_link?(absolute_url)
212
+ Adomain[absolute_url].include? @domain
213
+ end
214
+
215
+ def extension_ignored?(url)
216
+ return false if url.empty?
217
+
218
+ !NON_HTML_EXTENSIONS.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil?
219
+ end
220
+
221
+ def memory_danger?
222
+ return false unless in_docker?
223
+
224
+ use = "#{File.open(MEMORY_USE_FILE, "rb").read}".to_f
225
+ @limit ||= "#{File.open(MEMORY_LIMIT_FILE, "rb").read}".to_f
226
+
227
+ return false unless ( (use > 0.0) && (@limit > 0.0) )
228
+
229
+ return ( ( (use / @limit) * 100.0 ) >= MAXIMUM_LOAD_RATE )
230
+ end
231
+
232
+ def in_docker?
233
+ return false unless File.file?(MEMORY_USE_FILE)
234
+ true
235
+ end
236
+
237
+ end
metadata ADDED
@@ -0,0 +1,170 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: arachnid2
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Sam Nissen
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-05-29 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: typhoeus
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bloomfilter-rb
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: adomain
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: addressable
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: nokogiri
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description:
126
+ email:
127
+ - scnissen@gmail.com
128
+ executables: []
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - ".gitignore"
133
+ - ".rspec"
134
+ - ".travis.yml"
135
+ - CODE_OF_CONDUCT.md
136
+ - Gemfile
137
+ - Gemfile.lock
138
+ - LICENSE.txt
139
+ - README.md
140
+ - Rakefile
141
+ - arachnid2.gemspec
142
+ - bin/console
143
+ - bin/setup
144
+ - lib/arachnid2.rb
145
+ - lib/arachnid2/version.rb
146
+ homepage: https://github.com/samnissen/arachnid2
147
+ licenses:
148
+ - MIT
149
+ metadata: {}
150
+ post_install_message:
151
+ rdoc_options: []
152
+ require_paths:
153
+ - lib
154
+ required_ruby_version: !ruby/object:Gem::Requirement
155
+ requirements:
156
+ - - ">="
157
+ - !ruby/object:Gem::Version
158
+ version: '0'
159
+ required_rubygems_version: !ruby/object:Gem::Requirement
160
+ requirements:
161
+ - - ">="
162
+ - !ruby/object:Gem::Version
163
+ version: '0'
164
+ requirements: []
165
+ rubyforge_project:
166
+ rubygems_version: 2.7.7
167
+ signing_key:
168
+ specification_version: 4
169
+ summary: A simple, fast web crawler
170
+ test_files: []