cobweb 1.1.0 → 1.2.0
- checksums.yaml +5 -5
- data/README.textile +2 -1
- data/lib/cobweb.rb +0 -1
- data/lib/cobweb_crawler.rb +23 -22
- data/lib/cobweb_version.rb +1 -1
- metadata +19 -5
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 
-  data.tar.gz: 
+SHA256:
+  metadata.gz: 3b21efd1b03e9f4515045bdd69439fc8a7a4bf3f5aa896edebdbccd12d421d51
+  data.tar.gz: 4744738dfb7eaca5c5b6c1cabdebea8899e0f18cadb5532f1bb43870e98ff4e2
 SHA512:
-  metadata.gz: 
-  data.tar.gz: 
+  metadata.gz: 55b7f9878fcf6c3d97bd935fa59e61c54d745dd37cc75405e9af0e787d56e6543d79a2dedfca43912ed791fd59b51193b12c58722be9ce8c5d65428b6ea0af48
+  data.tar.gz: 8f767e79d76787cd84edbecd967e3136615a32723210a1abe149f8c126e4c8e1c6b3608f631f4ea7d3ad4fe66aa39702b18f73e28c0091ab02b2defdb28868d2
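
Both digest families change with the repack: the first block moves to SHA256 (the label newer RubyGems releases emit), and the SHA512 digests are regenerated for the new archive. As a minimal sketch of checking the published values with Ruby's standard Digest library, assuming the two artifacts from cobweb-1.2.0.gem have been unpacked locally (the paths are hypothetical):

require "digest"

# Hypothetical local paths to the files unpacked from cobweb-1.2.0.gem.
puts Digest::SHA256.file("metadata.gz").hexdigest   # compare against the SHA256 entry above
puts Digest::SHA512.file("data.tar.gz").hexdigest   # compare against the SHA512 entry above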
|
data/README.textile CHANGED

@@ -6,7 +6,6 @@ h1. Cobweb v1.1.0
 !https://gemnasium.com/stewartmckee/cobweb.png!
 !https://coveralls.io/repos/stewartmckee/cobweb/badge.png?branch=master(Coverage Status)!:https://coveralls.io/r/stewartmckee/cobweb
 
-
 h2. Intro
 
 CobWeb has three methods of running. Firstly it is a http client that allows get and head requests returning a hash of data relating to the requested resource. The second main function is to utilize this combined with the power of Resque to cluster the crawls allowing you crawl quickly. Lastly you can run the crawler with a block that uses each of the pages found in the crawl.

@@ -37,6 +36,7 @@ h3. Data Returned For Each Page
 
 * @:url@ - url of the resource requested
 * @:status_code@ - status code of the resource requested
+* @:response_time@ - response time of the resource requested
 * @:mime_type@ - content type of the resource
 * @:character_set@ - character set of content determined from content type
 * @:length@ - length of the content returned

@@ -99,6 +99,7 @@ Creates a new crawler object based on a base_url
 
 ** @:follow_redirects@ - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
 ** @:redirect_limit@ - sets the limit to be used for concurrent redirects (Default: 10)
+** @:queue_system@ - sets the queue system :resque or :sidekiq (Default: :resque)
 ** @:processing_queue@ - specifies the processing queue for content to be sent to (Default: 'CobwebProcessJob' when using resque, 'CrawlProcessWorker' when using sidekiq)
 ** @:crawl_finished_queue@ - specifies the processing queue for statistics to be sent to after finishing crawling (Default: 'CobwebFinishedJob' when using resque, 'CrawlFinishedWorker' when using sidekiq)
 ** @:debug@ - enables debug output (Default: false)
data/lib/cobweb.rb CHANGED
data/lib/cobweb_crawler.rb CHANGED

@@ -4,13 +4,13 @@ require 'redis-namespace'
 
 # CobwebCrawler is a standalone crawler, it includes a built in statistics monitor using Sinatra.
 class CobwebCrawler
-
+
   # See README for more information on options available
   def initialize(options={})
     @options = options
-
+
     @statistic = {}
-
+
     @options[:redis_options] = {:host => "127.0.0.1"} unless @options.has_key? :redis_options
     if @options.has_key? :crawl_id
       @crawl_id = @options[:crawl_id]

@@ -18,7 +18,7 @@ class CobwebCrawler
       @crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
       @options[:crawl_id] = @crawl_id
     end
-
+
     @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => RedisConnection.new(@options[:redis_options]))
     @options[:internal_urls] = [] if @options[:internal_urls].nil?
     @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}

@@ -27,27 +27,28 @@ class CobwebCrawler
 
     @options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
 
-    @options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http
+    @options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http
     @debug = @options[:debug]
-
+
     @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
     if @options[:web_statistics]
+      require "server"
       Server.start(@options)
     end
-
+
     @cobweb = Cobweb.new(@options)
   end
-
+
   # Initiates a crawl starting at the base_url and applying the options supplied. Can also take a block that is executed and passed content hash and statistic hash'
   def crawl(base_url, crawl_options = {}, &block)
     @options[:base_url] = base_url unless @options.has_key? :base_url
     @options[:thread_count] = 1 unless @options.has_key? :thread_count
-
+
     @options[:internal_urls] << base_url if @options[:internal_urls].empty?
     @redis.sadd("internal_urls", base_url) if @options[:internal_urls].empty?
-
+
     @crawl_options = crawl_options
-
+
     @redis.sadd("queued", base_url) unless base_url.nil? || @redis.sismember("crawled", base_url) || @redis.sismember("queued", base_url)
     @crawl_counter = @redis.scard("crawled").to_i
     @queue_counter = @redis.scard("queued").to_i

@@ -55,7 +56,7 @@ class CobwebCrawler
     @threads = []
     begin
       @stats.start_crawl(@options)
-
+
       @threads << Thread.new do
         Thread.abort_on_exception = true
         spawn_thread(&block)

@@ -73,7 +74,7 @@ class CobwebCrawler
         end
         sleep 1
       end
-
+
     ensure
       @stats.end_crawl(@options)
     end

@@ -96,8 +97,8 @@ class CobwebCrawler
       @stats.update_status("Processing #{url}...")
 
       @redis.sadd "crawled", url.to_s
-      @redis.incr "crawl-counter"
-
+      @redis.incr "crawl-counter"
+
       document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
 
 

@@ -106,18 +107,18 @@ class CobwebCrawler
 
       internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
 
-      # if the site has the same content for http and https then normalize to http
-
+      # if the site has the same content for http and https then normalize to http
+
       if @options[:treat_https_as_http]
         internal_links.map!{|link| link.gsub(/^https/, "http")}
       end
-
+
 
       # reject the link if we've crawled it or queued it
       internal_links.reject!{|link| @redis.sismember("crawled", link)}
       internal_links.reject!{|link| @redis.sismember("queued", link)}
       internal_links.reject!{|link| link.nil? || link.empty?}
-
+
       internal_links.each do |link|
         puts "Added #{link.to_s} to queue" if @debug
         @redis.sadd "queued", link unless link.nil?

@@ -134,10 +135,10 @@ class CobwebCrawler
         @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(target_uri.to_s)}", UriHelper.parse(url).to_s)
       end
     end
-
+
     @crawl_counter = @redis.scard("crawled").to_i
     @queue_counter = @redis.scard("queued").to_i
-
+
     @stats.update_statistics(content, @crawl_counter, @queue_counter)
     @stats.update_status("Completed #{url}.")
     yield content, @stats.get_statistics if block_given?

@@ -161,7 +162,7 @@ class CobwebCrawler
   def running_thread_count
     @threads.map{|t| t.status}.select{|status| status=="run" || status == "sleep"}.count
   end
-
+
 end
 
 # Monkey patch into String a starts_with method
data/lib/cobweb_version.rb CHANGED
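
The version-file hunk is not expanded in this view, but given the Cobweb.version call in the crawler above, the +1 -1 change presumably just bumps a returned string. A hypothetical sketch (the class and method layout are assumed, not taken from the diff):

class Cobweb
  # Hypothetical layout: the release bump changes this string, which also
  # feeds the Redis namespace "cobweb-#{Cobweb.version}-#{@crawl_id}".
  def self.version
    "1.2.0"
  end
end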
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.2.0
 platform: ruby
 authors:
 - Stewart McKee
 autorequire: 
 bindir: bin
 cert_chain: []
-date: 
+date: 2019-02-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake

@@ -44,14 +44,14 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.6.
+        version: 1.6.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 1.6.
+        version: 1.6.0
 - !ruby/object:Gem::Dependency
   name: addressable
   requirement: !ruby/object:Gem::Requirement

@@ -220,6 +220,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: bundle-audit
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
   crawl extremely large sites which is much more performant than multi-threaded crawlers. It
   is also a standalone crawler that has a sophisticated statistics monitoring interface

@@ -616,7 +630,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project: 
-rubygems_version: 2.
+rubygems_version: 2.7.7
 signing_key: 
 specification_version: 4
 summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly