cobweb 1.0.25 → 1.0.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.textile +1 -1
- data/bin/cobweb +11 -9
- data/lib/cobweb.rb +4 -7
- data/lib/cobweb_crawler.rb +1 -5
- data/lib/cobweb_links.rb +3 -3
- data/lib/cobweb_version.rb +1 -1
- data/lib/export_command.rb +11 -3
- data/lib/report_command.rb +10 -1
- data/lib/sidekiq/cobweb_helper.rb +2 -2
- data/spec/cobweb/cobweb_crawler_spec.rb +0 -33
- data/spec/cobweb/cobweb_spec.rb +4 -4
- data/spec/samples/sample_site/index.html +0 -2
- data/spec/spec_helper.rb +3 -4
- metadata +30 -44
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b39481d4cdb68d7f602e63a3919e51c126648a2d
|
4
|
+
data.tar.gz: bc29d59ae32beadf047c6e6c73503770d4ca842e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fe824099c8329662036c3df2ec67688ba46f8a3337d63b4942cf93e5dd6de452f3215ddc5a80968dad552e29ff4ba445456112c39cec7e0c1c3166d4aadb91eb
|
7
|
+
data.tar.gz: 43fbe93aa454e77cfdb8f95d4c580dc7a8f6c526025839db75f71ccce02c4f38a2a03e91a2c8417c39311367435c2b6a03168eb48b28eb893014b8e67663ef01
|
data/README.textile
CHANGED
data/bin/cobweb
CHANGED
@@ -17,12 +17,13 @@ opts = Slop.parse(:help => true) do
|
|
17
17
|
|
18
18
|
on 'output=', 'Path to output data to'
|
19
19
|
on 'script=', "Script to generate report"
|
20
|
-
|
20
|
+
|
21
21
|
on 'url=', 'URL to start crawl from'
|
22
|
-
on 'internal_urls=', '
|
23
|
-
on 'external_urls=', '
|
24
|
-
on 'seed_urls=', "
|
25
|
-
on '
|
22
|
+
on 'internal_urls=', 'Comma separated list of URL patterns to include (* is wildcard)', :as => Array
|
23
|
+
on 'external_urls=', 'Comma separated list of URL patterns to exclude (* is wildcard)', :as => Array
|
24
|
+
on 'seed_urls=', "CSV list of seed urls to crawl", :as => Array
|
25
|
+
on 'seed_url_file=', "File with URL per line to add to seed list"
|
26
|
+
on 'crawl_limit=', 'Maximum number of URLs to crawl', :as => Integer
|
26
27
|
on 'thread_count=', "Set the number of threads used", :as => Integer
|
27
28
|
on 'timeout=', "Sets the timeout for http requests", :as => Integer
|
28
29
|
on 'v', 'verbose', 'Display crawl information'
|
@@ -38,10 +39,11 @@ opts = Slop.parse(:help => true) do
|
|
38
39
|
banner 'Usage: cobweb export [options]'
|
39
40
|
|
40
41
|
on 'url=', 'URL to start crawl from'
|
41
|
-
on 'internal_urls=', '
|
42
|
-
on 'external_urls=', '
|
43
|
-
on 'seed_urls=', "
|
44
|
-
on '
|
42
|
+
on 'internal_urls=', 'Comma separated list of URL patterns to include (* is wildcard)', :as => Array
|
43
|
+
on 'external_urls=', 'Comma separated list of URL patterns to exclude (* is wildcard)', :as => Array
|
44
|
+
on 'seed_urls=', "CSV list of seed urls to crawl", :as => Array
|
45
|
+
on 'seed_url_file=', "File with URL per line to add to seed list"
|
46
|
+
on 'crawl_limit=', 'Maximum number of URLs to crawl', :as => Integer
|
45
47
|
on 'thread_count=', "Set the number of threads used", :as => Integer
|
46
48
|
on 'timeout=', "Sets the timeout for http requests", :as => Integer
|
47
49
|
on 'v', 'verbose', 'Display crawl information'
|
data/lib/cobweb.rb
CHANGED
@@ -250,10 +250,7 @@ class Cobweb
|
|
250
250
|
end
|
251
251
|
end
|
252
252
|
rescue RedirectError => e
|
253
|
-
if @options[:raise_exceptions]
|
254
|
-
puts "Re-Raising error #{e.message} on #{uri.to_s}"
|
255
|
-
raise e
|
256
|
-
end
|
253
|
+
raise e if @options[:raise_exceptions]
|
257
254
|
puts "ERROR RedirectError: #{e.message}"
|
258
255
|
|
259
256
|
## generate a blank content
|
@@ -456,9 +453,9 @@ class Cobweb
|
|
456
453
|
pattern = pattern.gsub(".", "\\.")
|
457
454
|
pattern = pattern.gsub("?", "\\?")
|
458
455
|
pattern = pattern.gsub("+", "\\\\+")
|
459
|
-
pattern = pattern.gsub("*", ".*?")
|
460
|
-
if options
|
461
|
-
pattern = pattern.gsub("
|
456
|
+
pattern = pattern.gsub("*", ".*?")
|
457
|
+
if !options.has_key?(:treat_https_as_http) || options[:treat_https_as_http]
|
458
|
+
pattern = pattern.gsub("https", "https?")
|
462
459
|
end
|
463
460
|
pattern
|
464
461
|
end
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -27,8 +27,7 @@ class CobwebCrawler
|
|
27
27
|
@options[:seed_urls].map{|link| @redis.sadd "queued", link }
|
28
28
|
|
29
29
|
@options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
|
30
|
-
|
31
|
-
@options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http
|
30
|
+
|
32
31
|
@debug = @options[:debug]
|
33
32
|
|
34
33
|
@stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
|
@@ -101,19 +100,16 @@ class CobwebCrawler
|
|
101
100
|
|
102
101
|
document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
|
103
102
|
|
104
|
-
|
105
103
|
# select the link if its internal (eliminate external before expensive lookups in queued and crawled)
|
106
104
|
cobweb_links = CobwebLinks.new(@options)
|
107
105
|
|
108
106
|
internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
|
109
107
|
|
110
108
|
# if the site has the same content for http and https then normalize to http
|
111
|
-
|
112
109
|
if @options[:treat_https_as_http]
|
113
110
|
internal_links.map!{|link| link.gsub(/^https/, "http")}
|
114
111
|
end
|
115
112
|
|
116
|
-
|
117
113
|
# reject the link if we've crawled it or queued it
|
118
114
|
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
119
115
|
internal_links.reject!{|link| @redis.sismember("queued", link)}
|
data/lib/cobweb_links.rb
CHANGED
@@ -12,9 +12,9 @@ class CobwebLinks
|
|
12
12
|
@options[:external_urls] = [] unless @options.has_key? :external_urls
|
13
13
|
@options[:debug] = false unless @options.has_key? :debug
|
14
14
|
|
15
|
-
@internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern,
|
16
|
-
@external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern,
|
17
|
-
|
15
|
+
@internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
|
16
|
+
@external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
|
17
|
+
|
18
18
|
end
|
19
19
|
|
20
20
|
def allowed?(link)
|
data/lib/cobweb_version.rb
CHANGED
data/lib/export_command.rb
CHANGED
@@ -13,8 +13,16 @@ class ExportCommand
|
|
13
13
|
:raise_exceptions => true,
|
14
14
|
:root_path => default_root_path
|
15
15
|
}.merge(opts)
|
16
|
-
|
17
16
|
|
17
|
+
if options.has_key?(:seed_url_file)
|
18
|
+
filename = options.delete(:seed_url_file)
|
19
|
+
options[:seed_urls] = []
|
20
|
+
File.open(filename, "r") do |f|
|
21
|
+
f.each_line do |line|
|
22
|
+
options[:seed_urls] << line
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
18
26
|
|
19
27
|
statistics = CobwebCrawler.new(options).crawl(options[:url]) do |page|
|
20
28
|
begin
|
@@ -28,7 +36,7 @@ class ExportCommand
|
|
28
36
|
|
29
37
|
uri.path.split("/")[0..-2].each do |dir|
|
30
38
|
path+="/" unless path.ends_with?("/")
|
31
|
-
path+=dir
|
39
|
+
path+=dir
|
32
40
|
if File.exist?(options[:root_path] + path) && !File.directory?(options[:root_path] + path)
|
33
41
|
FileUtils.mv(options[:root_path] + path, options[:root_path] + path + ".tmp")
|
34
42
|
Dir.mkdir(options[:root_path] + path)
|
@@ -48,7 +56,7 @@ class ExportCommand
|
|
48
56
|
doc = Nokogiri::HTML.parse(page[:body])
|
49
57
|
|
50
58
|
if doc.search("title").first
|
51
|
-
title = doc.search("title").first.content.gsub(" - ", " ")
|
59
|
+
title = doc.search("title").first.content.gsub(" - ", " ")
|
52
60
|
else
|
53
61
|
title = uri.path.split("/")[-1]
|
54
62
|
end
|
data/lib/report_command.rb
CHANGED
@@ -5,6 +5,16 @@ class ReportCommand
|
|
5
5
|
options = opts.to_hash.delete_if { |k, v| v.nil?}
|
6
6
|
options[:quiet] = !opts[:verbose]
|
7
7
|
|
8
|
+
if options.has_key?(:seed_url_file)
|
9
|
+
filename = options.delete(:seed_url_file)
|
10
|
+
options[:seed_urls] = []
|
11
|
+
File.open(filename, "r") do |f|
|
12
|
+
f.each_line do |line|
|
13
|
+
options[:seed_urls] << line
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
8
18
|
@crawler = CobwebCrawler.new({:cache_type => :full, :raise_exceptions => true}.merge(options))
|
9
19
|
|
10
20
|
columns = nil
|
@@ -23,7 +33,6 @@ class ReportCommand
|
|
23
33
|
page["img without alt count"] = scope.img_tags.select{|node| node[:alt].nil? || node[:alt].strip().empty?}.count
|
24
34
|
page["img alt"] = scope.img_tags_with_alt.map{|node| node[:alt]}.uniq
|
25
35
|
|
26
|
-
|
27
36
|
if !columns
|
28
37
|
columns = page.keys.reject{|k| k==:body || k==:links}
|
29
38
|
csv << columns.map{|k| k.to_s}
|
@@ -4,14 +4,14 @@ if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").count >= 1
|
|
4
4
|
require 'sidekiq'
|
5
5
|
else
|
6
6
|
SIDEKIQ_INSTALLED = false
|
7
|
-
puts "sidekiq gem not installed, skipping crawl_worker specs"
|
7
|
+
puts "sidekiq gem not installed, skipping crawl_worker specs"
|
8
8
|
end
|
9
9
|
if Gem::Specification.find_all_by_name("resque", ">=1.0.0").count >= 1
|
10
10
|
RESQUE_INSTALLED = true
|
11
11
|
require 'resque'
|
12
12
|
else
|
13
13
|
RESQUE_INSTALLED = false
|
14
|
-
puts "resque gem not installed, skipping crawl_job specs"
|
14
|
+
puts "resque gem not installed, skipping crawl_job specs"
|
15
15
|
end
|
16
16
|
|
17
17
|
module Sidekiq
|
@@ -53,39 +53,6 @@ describe CobwebCrawler do
|
|
53
53
|
|
54
54
|
end
|
55
55
|
|
56
|
-
context "internal_links" do
|
57
|
-
it "should match internal links without being explicitly set" do
|
58
|
-
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
|
59
|
-
crawler.crawl(@base_url)
|
60
|
-
queued_links = @redis_mock_object.smembers("queued")
|
61
|
-
queued_links.should_not include("http://themeforest.net/item/cleandream/490140")
|
62
|
-
queued_links.should include("http://localhost:3532/secure")
|
63
|
-
end
|
64
|
-
context "with https" do
|
65
|
-
it "should match https by default" do
|
66
|
-
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
|
67
|
-
crawler.crawl(@base_url)
|
68
|
-
queued_links = @redis_mock_object.smembers("queued")
|
69
|
-
queued_links.should_not include("https://localhost:3532/secure")
|
70
|
-
queued_links.should include("http://localhost:3532/secure")
|
71
|
-
end
|
72
|
-
it "should not define https as different if treat_https_as_http is true" do
|
73
|
-
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => true})
|
74
|
-
crawler.crawl(@base_url)
|
75
|
-
queued_links = @redis_mock_object.smembers("queued")
|
76
|
-
queued_links.should_not include("https://localhost:3532/secure")
|
77
|
-
queued_links.should include("http://localhost:3532/secure")
|
78
|
-
end
|
79
|
-
it "should define https as different if treat_https_as_http is false" do
|
80
|
-
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => false})
|
81
|
-
crawler.crawl(@base_url)
|
82
|
-
queued_links = @redis_mock_object.smembers("queued")
|
83
|
-
queued_links.should_not include("https://localhost:3532/secure")
|
84
|
-
queued_links.should_not include("http://localhost:3532/secure")
|
85
|
-
end
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
56
|
context "storing inbound links" do
|
90
57
|
|
91
58
|
before(:each) do
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -61,19 +61,19 @@ describe Cobweb do
|
|
61
61
|
|
62
62
|
context "with https ignored" do
|
63
63
|
it "should ignore https" do
|
64
|
-
result = Cobweb.escape_pattern_for_regex("
|
64
|
+
result = Cobweb.escape_pattern_for_regex("https://asdf.com")
|
65
65
|
result.should eql "https?://asdf\\.com"
|
66
66
|
end
|
67
67
|
it "should ignore https" do
|
68
|
-
result = Cobweb.escape_pattern_for_regex("
|
68
|
+
result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => true)
|
69
69
|
result.should eql "https?://asdf\\.com"
|
70
70
|
end
|
71
71
|
end
|
72
72
|
|
73
73
|
context "without https ignored" do
|
74
74
|
it "should ignore https" do
|
75
|
-
result = Cobweb.escape_pattern_for_regex("
|
76
|
-
result.should eql "
|
75
|
+
result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => false)
|
76
|
+
result.should eql "https://asdf\\.com"
|
77
77
|
end
|
78
78
|
end
|
79
79
|
|
@@ -711,8 +711,6 @@
|
|
711
711
|
<a href="gfx/photos/07xl.jpg" class="zoom"><img src="gfx/photos/07.jpg" class="shadow" alt="Photo" /></a>
|
712
712
|
<a href="gfx/photos/08xl.jpg" class="zoom"><img src="gfx/photos/08.jpg" class="shadow" alt="Photo" /></a>
|
713
713
|
<a href="gfx/photos/09xl.jpg" class="zoom"><img src="gfx/photos/09.jpg" class="shadow" alt="Photo" /></a>
|
714
|
-
|
715
|
-
<a href="https://localhost:3532/secure">HTTPS Link</a>
|
716
714
|
|
717
715
|
<a href="#"><img src="gfx/photos/11.jpg" class="shadow" alt="Photo" /></a>
|
718
716
|
<a href="#"><img src="gfx/photos/12.jpg" class="shadow" alt="Photo" /></a>
|
data/spec/spec_helper.rb
CHANGED
@@ -37,11 +37,10 @@ RSpec.configure do |config|
|
|
37
37
|
|
38
38
|
config.before(:each) {
|
39
39
|
|
40
|
-
|
41
|
-
|
42
|
-
Redis::Namespace.stub(:new).and_return(@redis_mock_object)
|
40
|
+
#redis_mock = double("redis")
|
41
|
+
#redis_mock.stub(:new).and_return(@redis_mock_object)
|
43
42
|
|
44
|
-
|
43
|
+
#redis_mock.flushdb
|
45
44
|
|
46
45
|
}
|
47
46
|
|
metadata
CHANGED
@@ -1,127 +1,113 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.26
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stewart McKee
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-03-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redis
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
19
|
+
version: '3.0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
26
|
+
version: '3.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: nokogiri
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '1.6'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '1.6'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: addressable
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :runtime
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - ">="
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: awesome_print
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ">="
|
45
|
+
- - "~>"
|
60
46
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
47
|
+
version: '2.3'
|
62
48
|
type: :runtime
|
63
49
|
prerelease: false
|
64
50
|
version_requirements: !ruby/object:Gem::Requirement
|
65
51
|
requirements:
|
66
|
-
- - "
|
52
|
+
- - "~>"
|
67
53
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
54
|
+
version: '2.3'
|
69
55
|
- !ruby/object:Gem::Dependency
|
70
56
|
name: sinatra
|
71
57
|
requirement: !ruby/object:Gem::Requirement
|
72
58
|
requirements:
|
73
|
-
- - "
|
59
|
+
- - "~>"
|
74
60
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
61
|
+
version: '1.4'
|
76
62
|
type: :runtime
|
77
63
|
prerelease: false
|
78
64
|
version_requirements: !ruby/object:Gem::Requirement
|
79
65
|
requirements:
|
80
|
-
- - "
|
66
|
+
- - "~>"
|
81
67
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
68
|
+
version: '1.4'
|
83
69
|
- !ruby/object:Gem::Dependency
|
84
70
|
name: haml
|
85
71
|
requirement: !ruby/object:Gem::Requirement
|
86
72
|
requirements:
|
87
|
-
- - "
|
73
|
+
- - "~>"
|
88
74
|
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
75
|
+
version: '4.0'
|
90
76
|
type: :runtime
|
91
77
|
prerelease: false
|
92
78
|
version_requirements: !ruby/object:Gem::Requirement
|
93
79
|
requirements:
|
94
|
-
- - "
|
80
|
+
- - "~>"
|
95
81
|
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
82
|
+
version: '4.0'
|
97
83
|
- !ruby/object:Gem::Dependency
|
98
84
|
name: redis-namespace
|
99
85
|
requirement: !ruby/object:Gem::Requirement
|
100
86
|
requirements:
|
101
|
-
- - "
|
87
|
+
- - "~>"
|
102
88
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
89
|
+
version: '1.3'
|
104
90
|
type: :runtime
|
105
91
|
prerelease: false
|
106
92
|
version_requirements: !ruby/object:Gem::Requirement
|
107
93
|
requirements:
|
108
|
-
- - "
|
94
|
+
- - "~>"
|
109
95
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
96
|
+
version: '1.3'
|
111
97
|
- !ruby/object:Gem::Dependency
|
112
98
|
name: json
|
113
99
|
requirement: !ruby/object:Gem::Requirement
|
114
100
|
requirements:
|
115
|
-
- - "
|
101
|
+
- - "~>"
|
116
102
|
- !ruby/object:Gem::Version
|
117
|
-
version: '
|
103
|
+
version: '1.8'
|
118
104
|
type: :runtime
|
119
105
|
prerelease: false
|
120
106
|
version_requirements: !ruby/object:Gem::Requirement
|
121
107
|
requirements:
|
122
|
-
- - "
|
108
|
+
- - "~>"
|
123
109
|
- !ruby/object:Gem::Version
|
124
|
-
version: '
|
110
|
+
version: '1.8'
|
125
111
|
- !ruby/object:Gem::Dependency
|
126
112
|
name: slop
|
127
113
|
requirement: !ruby/object:Gem::Requirement
|