cobweb 1.0.25 → 1.0.26
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.textile +1 -1
- data/bin/cobweb +11 -9
- data/lib/cobweb.rb +4 -7
- data/lib/cobweb_crawler.rb +1 -5
- data/lib/cobweb_links.rb +3 -3
- data/lib/cobweb_version.rb +1 -1
- data/lib/export_command.rb +11 -3
- data/lib/report_command.rb +10 -1
- data/lib/sidekiq/cobweb_helper.rb +2 -2
- data/spec/cobweb/cobweb_crawler_spec.rb +0 -33
- data/spec/cobweb/cobweb_spec.rb +4 -4
- data/spec/samples/sample_site/index.html +0 -2
- data/spec/spec_helper.rb +3 -4
- metadata +30 -44
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b39481d4cdb68d7f602e63a3919e51c126648a2d
|
4
|
+
data.tar.gz: bc29d59ae32beadf047c6e6c73503770d4ca842e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fe824099c8329662036c3df2ec67688ba46f8a3337d63b4942cf93e5dd6de452f3215ddc5a80968dad552e29ff4ba445456112c39cec7e0c1c3166d4aadb91eb
|
7
|
+
data.tar.gz: 43fbe93aa454e77cfdb8f95d4c580dc7a8f6c526025839db75f71ccce02c4f38a2a03e91a2c8417c39311367435c2b6a03168eb48b28eb893014b8e67663ef01
|
data/README.textile
CHANGED
data/bin/cobweb
CHANGED
@@ -17,12 +17,13 @@ opts = Slop.parse(:help => true) do
|
|
17
17
|
|
18
18
|
on 'output=', 'Path to output data to'
|
19
19
|
on 'script=', "Script to generate report"
|
20
|
-
|
20
|
+
|
21
21
|
on 'url=', 'URL to start crawl from'
|
22
|
-
on 'internal_urls=', '
|
23
|
-
on 'external_urls=', '
|
24
|
-
on 'seed_urls=', "
|
25
|
-
on '
|
22
|
+
on 'internal_urls=', 'Comma separated list of URL patterns to include (* is wildcard)', :as => Array
|
23
|
+
on 'external_urls=', 'Comma separated list of URL patterns to exclude (* is wildcard)', :as => Array
|
24
|
+
on 'seed_urls=', "CSV list of seed urls to crawl", :as => Array
|
25
|
+
on 'seed_url_file=', "File with URL per line to add to seed list"
|
26
|
+
on 'crawl_limit=', 'Maximum number of URLs to crawl', :as => Integer
|
26
27
|
on 'thread_count=', "Set the number of threads used", :as => Integer
|
27
28
|
on 'timeout=', "Sets the timeout for http requests", :as => Integer
|
28
29
|
on 'v', 'verbose', 'Display crawl information'
|
@@ -38,10 +39,11 @@ opts = Slop.parse(:help => true) do
|
|
38
39
|
banner 'Usage: cobweb export [options]'
|
39
40
|
|
40
41
|
on 'url=', 'URL to start crawl from'
|
41
|
-
on 'internal_urls=', '
|
42
|
-
on 'external_urls=', '
|
43
|
-
on 'seed_urls=', "
|
44
|
-
on '
|
42
|
+
on 'internal_urls=', 'Comma separated list of URL patterns to include (* is wildcard)', :as => Array
|
43
|
+
on 'external_urls=', 'Comma separated list of URL patterns to exclude (* is wildcard)', :as => Array
|
44
|
+
on 'seed_urls=', "CSV list of seed urls to crawl", :as => Array
|
45
|
+
on 'seed_url_file=', "File with URL per line to add to seed list"
|
46
|
+
on 'crawl_limit=', 'Maximum number of URLs to crawl', :as => Integer
|
45
47
|
on 'thread_count=', "Set the number of threads used", :as => Integer
|
46
48
|
on 'timeout=', "Sets the timeout for http requests", :as => Integer
|
47
49
|
on 'v', 'verbose', 'Display crawl information'
|
data/lib/cobweb.rb
CHANGED
@@ -250,10 +250,7 @@ class Cobweb
|
|
250
250
|
end
|
251
251
|
end
|
252
252
|
rescue RedirectError => e
|
253
|
-
if @options[:raise_exceptions]
|
254
|
-
puts "Re-Raising error #{e.message} on #{uri.to_s}"
|
255
|
-
raise e
|
256
|
-
end
|
253
|
+
raise e if @options[:raise_exceptions]
|
257
254
|
puts "ERROR RedirectError: #{e.message}"
|
258
255
|
|
259
256
|
## generate a blank content
|
@@ -456,9 +453,9 @@ class Cobweb
|
|
456
453
|
pattern = pattern.gsub(".", "\\.")
|
457
454
|
pattern = pattern.gsub("?", "\\?")
|
458
455
|
pattern = pattern.gsub("+", "\\\\+")
|
459
|
-
pattern = pattern.gsub("*", ".*?")
|
460
|
-
if options
|
461
|
-
pattern = pattern.gsub("
|
456
|
+
pattern = pattern.gsub("*", ".*?")
|
457
|
+
if !options.has_key?(:treat_https_as_http) || options[:treat_https_as_http]
|
458
|
+
pattern = pattern.gsub("https", "https?")
|
462
459
|
end
|
463
460
|
pattern
|
464
461
|
end
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -27,8 +27,7 @@ class CobwebCrawler
|
|
27
27
|
@options[:seed_urls].map{|link| @redis.sadd "queued", link }
|
28
28
|
|
29
29
|
@options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
|
30
|
-
|
31
|
-
@options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http
|
30
|
+
|
32
31
|
@debug = @options[:debug]
|
33
32
|
|
34
33
|
@stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
|
@@ -101,19 +100,16 @@ class CobwebCrawler
|
|
101
100
|
|
102
101
|
document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
|
103
102
|
|
104
|
-
|
105
103
|
# select the link if its internal (eliminate external before expensive lookups in queued and crawled)
|
106
104
|
cobweb_links = CobwebLinks.new(@options)
|
107
105
|
|
108
106
|
internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
|
109
107
|
|
110
108
|
# if the site has the same content for http and https then normalize to http
|
111
|
-
|
112
109
|
if @options[:treat_https_as_http]
|
113
110
|
internal_links.map!{|link| link.gsub(/^https/, "http")}
|
114
111
|
end
|
115
112
|
|
116
|
-
|
117
113
|
# reject the link if we've crawled it or queued it
|
118
114
|
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
119
115
|
internal_links.reject!{|link| @redis.sismember("queued", link)}
|
data/lib/cobweb_links.rb
CHANGED
@@ -12,9 +12,9 @@ class CobwebLinks
|
|
12
12
|
@options[:external_urls] = [] unless @options.has_key? :external_urls
|
13
13
|
@options[:debug] = false unless @options.has_key? :debug
|
14
14
|
|
15
|
-
@internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern,
|
16
|
-
@external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern,
|
17
|
-
|
15
|
+
@internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
|
16
|
+
@external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
|
17
|
+
|
18
18
|
end
|
19
19
|
|
20
20
|
def allowed?(link)
|
data/lib/cobweb_version.rb
CHANGED
data/lib/export_command.rb
CHANGED
@@ -13,8 +13,16 @@ class ExportCommand
|
|
13
13
|
:raise_exceptions => true,
|
14
14
|
:root_path => default_root_path
|
15
15
|
}.merge(opts)
|
16
|
-
|
17
16
|
|
17
|
+
if options.has_key?(:seed_url_file)
|
18
|
+
filename = options.delete(:seed_url_file)
|
19
|
+
options[:seed_urls] = []
|
20
|
+
File.open(filename, "r") do |f|
|
21
|
+
f.each_line do |line|
|
22
|
+
options[:seed_urls] << line
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
18
26
|
|
19
27
|
statistics = CobwebCrawler.new(options).crawl(options[:url]) do |page|
|
20
28
|
begin
|
@@ -28,7 +36,7 @@ class ExportCommand
|
|
28
36
|
|
29
37
|
uri.path.split("/")[0..-2].each do |dir|
|
30
38
|
path+="/" unless path.ends_with?("/")
|
31
|
-
path+=dir
|
39
|
+
path+=dir
|
32
40
|
if File.exist?(options[:root_path] + path) && !File.directory?(options[:root_path] + path)
|
33
41
|
FileUtils.mv(options[:root_path] + path, options[:root_path] + path + ".tmp")
|
34
42
|
Dir.mkdir(options[:root_path] + path)
|
@@ -48,7 +56,7 @@ class ExportCommand
|
|
48
56
|
doc = Nokogiri::HTML.parse(page[:body])
|
49
57
|
|
50
58
|
if doc.search("title").first
|
51
|
-
title = doc.search("title").first.content.gsub(" - ", " ")
|
59
|
+
title = doc.search("title").first.content.gsub(" - ", " ")
|
52
60
|
else
|
53
61
|
title = uri.path.split("/")[-1]
|
54
62
|
end
|
data/lib/report_command.rb
CHANGED
@@ -5,6 +5,16 @@ class ReportCommand
|
|
5
5
|
options = opts.to_hash.delete_if { |k, v| v.nil?}
|
6
6
|
options[:quiet] = !opts[:verbose]
|
7
7
|
|
8
|
+
if options.has_key?(:seed_url_file)
|
9
|
+
filename = options.delete(:seed_url_file)
|
10
|
+
options[:seed_urls] = []
|
11
|
+
File.open(filename, "r") do |f|
|
12
|
+
f.each_line do |line|
|
13
|
+
options[:seed_urls] << line
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
8
18
|
@crawler = CobwebCrawler.new({:cache_type => :full, :raise_exceptions => true}.merge(options))
|
9
19
|
|
10
20
|
columns = nil
|
@@ -23,7 +33,6 @@ class ReportCommand
|
|
23
33
|
page["img without alt count"] = scope.img_tags.select{|node| node[:alt].nil? || node[:alt].strip().empty?}.count
|
24
34
|
page["img alt"] = scope.img_tags_with_alt.map{|node| node[:alt]}.uniq
|
25
35
|
|
26
|
-
|
27
36
|
if !columns
|
28
37
|
columns = page.keys.reject{|k| k==:body || k==:links}
|
29
38
|
csv << columns.map{|k| k.to_s}
|
@@ -4,14 +4,14 @@ if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").count >= 1
|
|
4
4
|
require 'sidekiq'
|
5
5
|
else
|
6
6
|
SIDEKIQ_INSTALLED = false
|
7
|
-
puts "sidekiq gem not installed, skipping crawl_worker specs"
|
7
|
+
puts "sidekiq gem not installed, skipping crawl_worker specs"
|
8
8
|
end
|
9
9
|
if Gem::Specification.find_all_by_name("resque", ">=1.0.0").count >= 1
|
10
10
|
RESQUE_INSTALLED = true
|
11
11
|
require 'resque'
|
12
12
|
else
|
13
13
|
RESQUE_INSTALLED = false
|
14
|
-
puts "resque gem not installed, skipping crawl_job specs"
|
14
|
+
puts "resque gem not installed, skipping crawl_job specs"
|
15
15
|
end
|
16
16
|
|
17
17
|
module Sidekiq
|
@@ -53,39 +53,6 @@ describe CobwebCrawler do
|
|
53
53
|
|
54
54
|
end
|
55
55
|
|
56
|
-
context "internal_links" do
|
57
|
-
it "should match internal links without being explicitly set" do
|
58
|
-
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
|
59
|
-
crawler.crawl(@base_url)
|
60
|
-
queued_links = @redis_mock_object.smembers("queued")
|
61
|
-
queued_links.should_not include("http://themeforest.net/item/cleandream/490140")
|
62
|
-
queued_links.should include("http://localhost:3532/secure")
|
63
|
-
end
|
64
|
-
context "with https" do
|
65
|
-
it "should match https by default" do
|
66
|
-
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
|
67
|
-
crawler.crawl(@base_url)
|
68
|
-
queued_links = @redis_mock_object.smembers("queued")
|
69
|
-
queued_links.should_not include("https://localhost:3532/secure")
|
70
|
-
queued_links.should include("http://localhost:3532/secure")
|
71
|
-
end
|
72
|
-
it "should not define https as different if treat_https_as_http is true" do
|
73
|
-
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => true})
|
74
|
-
crawler.crawl(@base_url)
|
75
|
-
queued_links = @redis_mock_object.smembers("queued")
|
76
|
-
queued_links.should_not include("https://localhost:3532/secure")
|
77
|
-
queued_links.should include("http://localhost:3532/secure")
|
78
|
-
end
|
79
|
-
it "should define https as different if treat_https_as_http is false" do
|
80
|
-
crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => false})
|
81
|
-
crawler.crawl(@base_url)
|
82
|
-
queued_links = @redis_mock_object.smembers("queued")
|
83
|
-
queued_links.should_not include("https://localhost:3532/secure")
|
84
|
-
queued_links.should_not include("http://localhost:3532/secure")
|
85
|
-
end
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
56
|
context "storing inbound links" do
|
90
57
|
|
91
58
|
before(:each) do
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -61,19 +61,19 @@ describe Cobweb do
|
|
61
61
|
|
62
62
|
context "with https ignored" do
|
63
63
|
it "should ignore https" do
|
64
|
-
result = Cobweb.escape_pattern_for_regex("
|
64
|
+
result = Cobweb.escape_pattern_for_regex("https://asdf.com")
|
65
65
|
result.should eql "https?://asdf\\.com"
|
66
66
|
end
|
67
67
|
it "should ignore https" do
|
68
|
-
result = Cobweb.escape_pattern_for_regex("
|
68
|
+
result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => true)
|
69
69
|
result.should eql "https?://asdf\\.com"
|
70
70
|
end
|
71
71
|
end
|
72
72
|
|
73
73
|
context "without https ignored" do
|
74
74
|
it "should ignore https" do
|
75
|
-
result = Cobweb.escape_pattern_for_regex("
|
76
|
-
result.should eql "
|
75
|
+
result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => false)
|
76
|
+
result.should eql "https://asdf\\.com"
|
77
77
|
end
|
78
78
|
end
|
79
79
|
|
@@ -711,8 +711,6 @@
|
|
711
711
|
<a href="gfx/photos/07xl.jpg" class="zoom"><img src="gfx/photos/07.jpg" class="shadow" alt="Photo" /></a>
|
712
712
|
<a href="gfx/photos/08xl.jpg" class="zoom"><img src="gfx/photos/08.jpg" class="shadow" alt="Photo" /></a>
|
713
713
|
<a href="gfx/photos/09xl.jpg" class="zoom"><img src="gfx/photos/09.jpg" class="shadow" alt="Photo" /></a>
|
714
|
-
|
715
|
-
<a href="https://localhost:3532/secure">HTTPS Link</a>
|
716
714
|
|
717
715
|
<a href="#"><img src="gfx/photos/11.jpg" class="shadow" alt="Photo" /></a>
|
718
716
|
<a href="#"><img src="gfx/photos/12.jpg" class="shadow" alt="Photo" /></a>
|
data/spec/spec_helper.rb
CHANGED
@@ -37,11 +37,10 @@ RSpec.configure do |config|
|
|
37
37
|
|
38
38
|
config.before(:each) {
|
39
39
|
|
40
|
-
|
41
|
-
|
42
|
-
Redis::Namespace.stub(:new).and_return(@redis_mock_object)
|
40
|
+
#redis_mock = double("redis")
|
41
|
+
#redis_mock.stub(:new).and_return(@redis_mock_object)
|
43
42
|
|
44
|
-
|
43
|
+
#redis_mock.flushdb
|
45
44
|
|
46
45
|
}
|
47
46
|
|
metadata
CHANGED
@@ -1,127 +1,113 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.26
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stewart McKee
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-03-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redis
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
19
|
+
version: '3.0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
26
|
+
version: '3.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: nokogiri
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '1.6'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '1.6'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: addressable
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :runtime
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - ">="
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: awesome_print
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ">="
|
45
|
+
- - "~>"
|
60
46
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
47
|
+
version: '2.3'
|
62
48
|
type: :runtime
|
63
49
|
prerelease: false
|
64
50
|
version_requirements: !ruby/object:Gem::Requirement
|
65
51
|
requirements:
|
66
|
-
- - "
|
52
|
+
- - "~>"
|
67
53
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
54
|
+
version: '2.3'
|
69
55
|
- !ruby/object:Gem::Dependency
|
70
56
|
name: sinatra
|
71
57
|
requirement: !ruby/object:Gem::Requirement
|
72
58
|
requirements:
|
73
|
-
- - "
|
59
|
+
- - "~>"
|
74
60
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
61
|
+
version: '1.4'
|
76
62
|
type: :runtime
|
77
63
|
prerelease: false
|
78
64
|
version_requirements: !ruby/object:Gem::Requirement
|
79
65
|
requirements:
|
80
|
-
- - "
|
66
|
+
- - "~>"
|
81
67
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
68
|
+
version: '1.4'
|
83
69
|
- !ruby/object:Gem::Dependency
|
84
70
|
name: haml
|
85
71
|
requirement: !ruby/object:Gem::Requirement
|
86
72
|
requirements:
|
87
|
-
- - "
|
73
|
+
- - "~>"
|
88
74
|
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
75
|
+
version: '4.0'
|
90
76
|
type: :runtime
|
91
77
|
prerelease: false
|
92
78
|
version_requirements: !ruby/object:Gem::Requirement
|
93
79
|
requirements:
|
94
|
-
- - "
|
80
|
+
- - "~>"
|
95
81
|
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
82
|
+
version: '4.0'
|
97
83
|
- !ruby/object:Gem::Dependency
|
98
84
|
name: redis-namespace
|
99
85
|
requirement: !ruby/object:Gem::Requirement
|
100
86
|
requirements:
|
101
|
-
- - "
|
87
|
+
- - "~>"
|
102
88
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
89
|
+
version: '1.3'
|
104
90
|
type: :runtime
|
105
91
|
prerelease: false
|
106
92
|
version_requirements: !ruby/object:Gem::Requirement
|
107
93
|
requirements:
|
108
|
-
- - "
|
94
|
+
- - "~>"
|
109
95
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
96
|
+
version: '1.3'
|
111
97
|
- !ruby/object:Gem::Dependency
|
112
98
|
name: json
|
113
99
|
requirement: !ruby/object:Gem::Requirement
|
114
100
|
requirements:
|
115
|
-
- - "
|
101
|
+
- - "~>"
|
116
102
|
- !ruby/object:Gem::Version
|
117
|
-
version: '
|
103
|
+
version: '1.8'
|
118
104
|
type: :runtime
|
119
105
|
prerelease: false
|
120
106
|
version_requirements: !ruby/object:Gem::Requirement
|
121
107
|
requirements:
|
122
|
-
- - "
|
108
|
+
- - "~>"
|
123
109
|
- !ruby/object:Gem::Version
|
124
|
-
version: '
|
110
|
+
version: '1.8'
|
125
111
|
- !ruby/object:Gem::Dependency
|
126
112
|
name: slop
|
127
113
|
requirement: !ruby/object:Gem::Requirement
|