cobweb 1.1.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 854165929dc7a5e3e16d138515723cfd2d3f95b5
4
- data.tar.gz: 8444c545e80547e41dcaeae817a81183ed4a8d54
2
+ SHA256:
3
+ metadata.gz: 3b21efd1b03e9f4515045bdd69439fc8a7a4bf3f5aa896edebdbccd12d421d51
4
+ data.tar.gz: 4744738dfb7eaca5c5b6c1cabdebea8899e0f18cadb5532f1bb43870e98ff4e2
5
5
  SHA512:
6
- metadata.gz: 181edda4c8fc822f52729a645046c6e3a8d11ad0ff3d5540cfcc055528563605c82eec5a5615074fe86f97c8655694dc75c29d61b8e324fed02291372e5f107a
7
- data.tar.gz: 830a4aceeb58d6d5b43d284622a9054aea66a5a46aadabec319bf9eebeb3e20444e476a3a0db42afcc2344a6a5f7dae210aaed024055d6d7d25042f3cbd38f79
6
+ metadata.gz: 55b7f9878fcf6c3d97bd935fa59e61c54d745dd37cc75405e9af0e787d56e6543d79a2dedfca43912ed791fd59b51193b12c58722be9ce8c5d65428b6ea0af48
7
+ data.tar.gz: 8f767e79d76787cd84edbecd967e3136615a32723210a1abe149f8c126e4c8e1c6b3608f631f4ea7d3ad4fe66aa39702b18f73e28c0091ab02b2defdb28868d2
@@ -6,7 +6,6 @@ h1. Cobweb v1.1.0
6
6
  !https://gemnasium.com/stewartmckee/cobweb.png!
7
7
  !https://coveralls.io/repos/stewartmckee/cobweb/badge.png?branch=master(Coverage Status)!:https://coveralls.io/r/stewartmckee/cobweb
8
8
 
9
-
10
9
  h2. Intro
11
10
 
12
11
  CobWeb has three methods of running. Firstly, it is an HTTP client that allows get and head requests, returning a hash of data relating to the requested resource. The second main function is to utilize this combined with the power of Resque to cluster the crawls, allowing you to crawl quickly. Lastly, you can run the crawler with a block that uses each of the pages found in the crawl.
@@ -37,6 +36,7 @@ h3. Data Returned For Each Page
37
36
 
38
37
  * @:url@ - url of the resource requested
39
38
  * @:status_code@ - status code of the resource requested
39
+ * @:response_time@ - response time of the resource requested
40
40
  * @:mime_type@ - content type of the resource
41
41
  * @:character_set@ - character set of content determined from content type
42
42
  * @:length@ - length of the content returned
@@ -99,6 +99,7 @@ Creates a new crawler object based on a base_url
99
99
 
100
100
  ** @:follow_redirects@ - transparently follows redirects and populates the :redirect_through key in the content hash (Default: true)
101
101
  ** @:redirect_limit@ - sets the limit to be used for concurrent redirects (Default: 10)
102
+ ** @:queue_system@ - sets the queue system to use, either :resque or :sidekiq (Default: :resque)
102
103
  ** @:processing_queue@ - specifies the processing queue for content to be sent to (Default: 'CobwebProcessJob' when using resque, 'CrawlProcessWorker' when using sidekiq)
103
104
  ** @:crawl_finished_queue@ - specifies the processing queue for statistics to be sent to after finishing crawling (Default: 'CobwebFinishedJob' when using resque, 'CrawlFinishedWorker' when using sidekiq)
104
105
  ** @:debug@ - enables debug output (Default: false)
@@ -10,7 +10,6 @@ end
10
10
 
11
11
  puts Gem::Specification.find_all_by_name("sidekiq", ">=3.0.0")
12
12
 
13
-
14
13
  # Cobweb class is used to perform get and head requests. You can use this on its own if you wish without the crawler
15
14
  class Cobweb
16
15
 
@@ -4,13 +4,13 @@ require 'redis-namespace'
4
4
 
5
5
  # CobwebCrawler is a standalone crawler, it includes a built in statistics monitor using Sinatra.
6
6
  class CobwebCrawler
7
-
7
+
8
8
  # See README for more information on options available
9
9
  def initialize(options={})
10
10
  @options = options
11
-
11
+
12
12
  @statistic = {}
13
-
13
+
14
14
  @options[:redis_options] = {:host => "127.0.0.1"} unless @options.has_key? :redis_options
15
15
  if @options.has_key? :crawl_id
16
16
  @crawl_id = @options[:crawl_id]
@@ -18,7 +18,7 @@ class CobwebCrawler
18
18
  @crawl_id = Digest::MD5.hexdigest(DateTime.now.inspect.to_s)
19
19
  @options[:crawl_id] = @crawl_id
20
20
  end
21
-
21
+
22
22
  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => RedisConnection.new(@options[:redis_options]))
23
23
  @options[:internal_urls] = [] if @options[:internal_urls].nil?
24
24
  @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
@@ -27,27 +27,28 @@ class CobwebCrawler
27
27
 
28
28
  @options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
29
29
 
30
- @options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http
30
+ @options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http
31
31
  @debug = @options[:debug]
32
-
32
+
33
33
  @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
34
34
  if @options[:web_statistics]
35
+ require "server"
35
36
  Server.start(@options)
36
37
  end
37
-
38
+
38
39
  @cobweb = Cobweb.new(@options)
39
40
  end
40
-
41
+
41
42
  # Initiates a crawl starting at the base_url and applying the options supplied. Can also take a block that is executed and passed the content hash and the statistics hash.
42
43
  def crawl(base_url, crawl_options = {}, &block)
43
44
  @options[:base_url] = base_url unless @options.has_key? :base_url
44
45
  @options[:thread_count] = 1 unless @options.has_key? :thread_count
45
-
46
+
46
47
  @options[:internal_urls] << base_url if @options[:internal_urls].empty?
47
48
  @redis.sadd("internal_urls", base_url) if @options[:internal_urls].empty?
48
-
49
+
49
50
  @crawl_options = crawl_options
50
-
51
+
51
52
  @redis.sadd("queued", base_url) unless base_url.nil? || @redis.sismember("crawled", base_url) || @redis.sismember("queued", base_url)
52
53
  @crawl_counter = @redis.scard("crawled").to_i
53
54
  @queue_counter = @redis.scard("queued").to_i
@@ -55,7 +56,7 @@ class CobwebCrawler
55
56
  @threads = []
56
57
  begin
57
58
  @stats.start_crawl(@options)
58
-
59
+
59
60
  @threads << Thread.new do
60
61
  Thread.abort_on_exception = true
61
62
  spawn_thread(&block)
@@ -73,7 +74,7 @@ class CobwebCrawler
73
74
  end
74
75
  sleep 1
75
76
  end
76
-
77
+
77
78
  ensure
78
79
  @stats.end_crawl(@options)
79
80
  end
@@ -96,8 +97,8 @@ class CobwebCrawler
96
97
  @stats.update_status("Processing #{url}...")
97
98
 
98
99
  @redis.sadd "crawled", url.to_s
99
- @redis.incr "crawl-counter"
100
-
100
+ @redis.incr "crawl-counter"
101
+
101
102
  document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
102
103
 
103
104
 
@@ -106,18 +107,18 @@ class CobwebCrawler
106
107
 
107
108
  internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
108
109
 
109
- # if the site has the same content for http and https then normalize to http
110
-
110
+ # if the site has the same content for http and https then normalize to http
111
+
111
112
  if @options[:treat_https_as_http]
112
113
  internal_links.map!{|link| link.gsub(/^https/, "http")}
113
114
  end
114
-
115
+
115
116
 
116
117
  # reject the link if we've crawled it or queued it
117
118
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
118
119
  internal_links.reject!{|link| @redis.sismember("queued", link)}
119
120
  internal_links.reject!{|link| link.nil? || link.empty?}
120
-
121
+
121
122
  internal_links.each do |link|
122
123
  puts "Added #{link.to_s} to queue" if @debug
123
124
  @redis.sadd "queued", link unless link.nil?
@@ -134,10 +135,10 @@ class CobwebCrawler
134
135
  @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(target_uri.to_s)}", UriHelper.parse(url).to_s)
135
136
  end
136
137
  end
137
-
138
+
138
139
  @crawl_counter = @redis.scard("crawled").to_i
139
140
  @queue_counter = @redis.scard("queued").to_i
140
-
141
+
141
142
  @stats.update_statistics(content, @crawl_counter, @queue_counter)
142
143
  @stats.update_status("Completed #{url}.")
143
144
  yield content, @stats.get_statistics if block_given?
@@ -161,7 +162,7 @@ class CobwebCrawler
161
162
  def running_thread_count
162
163
  @threads.map{|t| t.status}.select{|status| status=="run" || status == "sleep"}.count
163
164
  end
164
-
165
+
165
166
  end
166
167
 
167
168
  # Monkey patch into String a starts_with method
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.1.0"
6
+ "1.2.0"
7
7
  end
8
8
 
9
9
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stewart McKee
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-06 00:00:00.000000000 Z
11
+ date: 2019-02-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: 1.6.6.2
47
+ version: 1.6.0
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: 1.6.6.2
54
+ version: 1.6.0
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: addressable
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -220,6 +220,20 @@ dependencies:
220
220
  - - ">="
221
221
  - !ruby/object:Gem::Version
222
222
  version: '0'
223
+ - !ruby/object:Gem::Dependency
224
+ name: bundle-audit
225
+ requirement: !ruby/object:Gem::Requirement
226
+ requirements:
227
+ - - ">="
228
+ - !ruby/object:Gem::Version
229
+ version: '0'
230
+ type: :development
231
+ prerelease: false
232
+ version_requirements: !ruby/object:Gem::Requirement
233
+ requirements:
234
+ - - ">="
235
+ - !ruby/object:Gem::Version
236
+ version: '0'
223
237
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
224
238
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
225
239
  is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -616,7 +630,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
616
630
  version: '0'
617
631
  requirements: []
618
632
  rubyforge_project:
619
- rubygems_version: 2.4.5.1
633
+ rubygems_version: 2.7.7
620
634
  signing_key:
621
635
  specification_version: 4
622
636
  summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly