cobweb 1.0.25 → 1.0.26

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 433da726316611ac2835723ff04e645fb00a3dc4
4
- data.tar.gz: e85d23955c5ddbb02cf69baef0c5e240ff7d8204
3
+ metadata.gz: b39481d4cdb68d7f602e63a3919e51c126648a2d
4
+ data.tar.gz: bc29d59ae32beadf047c6e6c73503770d4ca842e
5
5
  SHA512:
6
- metadata.gz: 23b0e6707b07bcad8621f8c547f48c3c8d0abf946e6454ffa3fee599fdfac5ab01942579c3f4a16ce9e743034225e68695f77df752032acc0c9fbfdfeb7e43ce
7
- data.tar.gz: 702b5ed7c93e56f3994c7bee735ef1a44ceabd0cf349e1976b3692a094465b949ca618c106f1896e33551caeb64fe5383b57f53b8a8b4f18144b41255c092ad7
6
+ metadata.gz: fe824099c8329662036c3df2ec67688ba46f8a3337d63b4942cf93e5dd6de452f3215ddc5a80968dad552e29ff4ba445456112c39cec7e0c1c3166d4aadb91eb
7
+ data.tar.gz: 43fbe93aa454e77cfdb8f95d4c580dc7a8f6c526025839db75f71ccce02c4f38a2a03e91a2c8417c39311367435c2b6a03168eb48b28eb893014b8e67663ef01
data/README.textile CHANGED
@@ -1,4 +1,4 @@
1
- h1. Cobweb v1.0.25
1
+ h1. Cobweb v1.0.26
2
2
 
3
3
  "@cobweb_gem":https://twitter.com/cobweb_gem
4
4
  !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
data/bin/cobweb CHANGED
@@ -17,12 +17,13 @@ opts = Slop.parse(:help => true) do
17
17
 
18
18
  on 'output=', 'Path to output data to'
19
19
  on 'script=', "Script to generate report"
20
-
20
+
21
21
  on 'url=', 'URL to start crawl from'
22
- on 'internal_urls=', 'Url patterns to include', :as => Array
23
- on 'external_urls=', 'Url patterns to exclude', :as => Array
24
- on 'seed_urls=', "Seed urls", :as => Array
25
- on 'crawl_limit=', 'Limit the crawl to a number of urls', :as => Integer
22
+ on 'internal_urls=', 'Comma separated list of URL patterns to include (* is wildcard)', :as => Array
23
+ on 'external_urls=', 'Comma separated list of URL patterns to exclude (* is wildcard)', :as => Array
24
+ on 'seed_urls=', "CSV list of seed urls to crawl", :as => Array
25
+ on 'seed_url_file=', "File with URL per line to add to seed list"
26
+ on 'crawl_limit=', 'Maximum number of URLs to crawl', :as => Integer
26
27
  on 'thread_count=', "Set the number of threads used", :as => Integer
27
28
  on 'timeout=', "Sets the timeout for http requests", :as => Integer
28
29
  on 'v', 'verbose', 'Display crawl information'
@@ -38,10 +39,11 @@ opts = Slop.parse(:help => true) do
38
39
  banner 'Usage: cobweb export [options]'
39
40
 
40
41
  on 'url=', 'URL to start crawl from'
41
- on 'internal_urls=', 'Url patterns to include', :as => Array
42
- on 'external_urls=', 'Url patterns to exclude', :as => Array
43
- on 'seed_urls=', "Seed urls", :as => Array
44
- on 'crawl_limit=', 'Limit the crawl to a number of urls', :as => Integer
42
+ on 'internal_urls=', 'Comma separated list of URL patterns to include (* is wildcard)', :as => Array
43
+ on 'external_urls=', 'Comma separated list of URL patterns to exclude (* is wildcard)', :as => Array
44
+ on 'seed_urls=', "CSV list of seed urls to crawl", :as => Array
45
+ on 'seed_url_file=', "File with URL per line to add to seed list"
46
+ on 'crawl_limit=', 'Maximum number of URLs to crawl', :as => Integer
45
47
  on 'thread_count=', "Set the number of threads used", :as => Integer
46
48
  on 'timeout=', "Sets the timeout for http requests", :as => Integer
47
49
  on 'v', 'verbose', 'Display crawl information'
data/lib/cobweb.rb CHANGED
@@ -250,10 +250,7 @@ class Cobweb
250
250
  end
251
251
  end
252
252
  rescue RedirectError => e
253
- if @options[:raise_exceptions]
254
- puts "Re-Raising error #{e.message} on #{uri.to_s}"
255
- raise e
256
- end
253
+ raise e if @options[:raise_exceptions]
257
254
  puts "ERROR RedirectError: #{e.message}"
258
255
 
259
256
  ## generate a blank content
@@ -456,9 +453,9 @@ class Cobweb
456
453
  pattern = pattern.gsub(".", "\\.")
457
454
  pattern = pattern.gsub("?", "\\?")
458
455
  pattern = pattern.gsub("+", "\\\\+")
459
- pattern = pattern.gsub("*", ".*?")
460
- if options[:treat_https_as_http] || !options.has_key?(:treat_https_as_http)
461
- pattern = pattern.gsub("http:", "https?:")
456
+ pattern = pattern.gsub("*", ".*?")
457
+ if !options.has_key?(:treat_https_as_http) || options[:treat_https_as_http]
458
+ pattern = pattern.gsub("https", "https?")
462
459
  end
463
460
  pattern
464
461
  end
@@ -27,8 +27,7 @@ class CobwebCrawler
27
27
  @options[:seed_urls].map{|link| @redis.sadd "queued", link }
28
28
 
29
29
  @options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
30
-
31
- @options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http
30
+
32
31
  @debug = @options[:debug]
33
32
 
34
33
  @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
@@ -101,19 +100,16 @@ class CobwebCrawler
101
100
 
102
101
  document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
103
102
 
104
-
105
103
  # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
106
104
  cobweb_links = CobwebLinks.new(@options)
107
105
 
108
106
  internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
109
107
 
110
108
  # if the site has the same content for http and https then normalize to http
111
-
112
109
  if @options[:treat_https_as_http]
113
110
  internal_links.map!{|link| link.gsub(/^https/, "http")}
114
111
  end
115
112
 
116
-
117
113
  # reject the link if we've crawled it or queued it
118
114
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
119
115
  internal_links.reject!{|link| @redis.sismember("queued", link)}
data/lib/cobweb_links.rb CHANGED
@@ -12,9 +12,9 @@ class CobwebLinks
12
12
  @options[:external_urls] = [] unless @options.has_key? :external_urls
13
13
  @options[:debug] = false unless @options.has_key? :debug
14
14
 
15
- @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, @options)}")}
16
- @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, @options)}")}
17
-
15
+ @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
16
+ @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
17
+
18
18
  end
19
19
 
20
20
  def allowed?(link)
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.25"
6
+ "1.0.26"
7
7
  end
8
8
 
9
9
  end
@@ -13,8 +13,16 @@ class ExportCommand
13
13
  :raise_exceptions => true,
14
14
  :root_path => default_root_path
15
15
  }.merge(opts)
16
-
17
16
 
17
+ if options.has_key?(:seed_url_file)
18
+ filename = options.delete(:seed_url_file)
19
+ options[:seed_urls] = []
20
+ File.open(filename, "r") do |f|
21
+ f.each_line do |line|
22
+ options[:seed_urls] << line
23
+ end
24
+ end
25
+ end
18
26
 
19
27
  statistics = CobwebCrawler.new(options).crawl(options[:url]) do |page|
20
28
  begin
@@ -28,7 +36,7 @@ class ExportCommand
28
36
 
29
37
  uri.path.split("/")[0..-2].each do |dir|
30
38
  path+="/" unless path.ends_with?("/")
31
- path+=dir
39
+ path+=dir
32
40
  if File.exist?(options[:root_path] + path) && !File.directory?(options[:root_path] + path)
33
41
  FileUtils.mv(options[:root_path] + path, options[:root_path] + path + ".tmp")
34
42
  Dir.mkdir(options[:root_path] + path)
@@ -48,7 +56,7 @@ class ExportCommand
48
56
  doc = Nokogiri::HTML.parse(page[:body])
49
57
 
50
58
  if doc.search("title").first
51
- title = doc.search("title").first.content.gsub(" - ", " ")
59
+ title = doc.search("title").first.content.gsub(" - ", " ")
52
60
  else
53
61
  title = uri.path.split("/")[-1]
54
62
  end
@@ -5,6 +5,16 @@ class ReportCommand
5
5
  options = opts.to_hash.delete_if { |k, v| v.nil?}
6
6
  options[:quiet] = !opts[:verbose]
7
7
 
8
+ if options.has_key?(:seed_url_file)
9
+ filename = options.delete(:seed_url_file)
10
+ options[:seed_urls] = []
11
+ File.open(filename, "r") do |f|
12
+ f.each_line do |line|
13
+ options[:seed_urls] << line
14
+ end
15
+ end
16
+ end
17
+
8
18
  @crawler = CobwebCrawler.new({:cache_type => :full, :raise_exceptions => true}.merge(options))
9
19
 
10
20
  columns = nil
@@ -23,7 +33,6 @@ class ReportCommand
23
33
  page["img without alt count"] = scope.img_tags.select{|node| node[:alt].nil? || node[:alt].strip().empty?}.count
24
34
  page["img alt"] = scope.img_tags_with_alt.map{|node| node[:alt]}.uniq
25
35
 
26
-
27
36
  if !columns
28
37
  columns = page.keys.reject{|k| k==:body || k==:links}
29
38
  csv << columns.map{|k| k.to_s}
@@ -4,14 +4,14 @@ if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").count >= 1
4
4
  require 'sidekiq'
5
5
  else
6
6
  SIDEKIQ_INSTALLED = false
7
- puts "sidekiq gem not installed, skipping crawl_worker specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
7
+ puts "sidekiq gem not installed, skipping crawl_worker specs"
8
8
  end
9
9
  if Gem::Specification.find_all_by_name("resque", ">=1.0.0").count >= 1
10
10
  RESQUE_INSTALLED = true
11
11
  require 'resque'
12
12
  else
13
13
  RESQUE_INSTALLED = false
14
- puts "resque gem not installed, skipping crawl_job specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
14
+ puts "resque gem not installed, skipping crawl_job specs"
15
15
  end
16
16
 
17
17
  module Sidekiq
@@ -53,39 +53,6 @@ describe CobwebCrawler do
53
53
 
54
54
  end
55
55
 
56
- context "internal_links" do
57
- it "should match internal links without being explicitly set" do
58
- crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
59
- crawler.crawl(@base_url)
60
- queued_links = @redis_mock_object.smembers("queued")
61
- queued_links.should_not include("http://themeforest.net/item/cleandream/490140")
62
- queued_links.should include("http://localhost:3532/secure")
63
- end
64
- context "with https" do
65
- it "should match https by default" do
66
- crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
67
- crawler.crawl(@base_url)
68
- queued_links = @redis_mock_object.smembers("queued")
69
- queued_links.should_not include("https://localhost:3532/secure")
70
- queued_links.should include("http://localhost:3532/secure")
71
- end
72
- it "should not define https as different if treat_https_as_http is true" do
73
- crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => true})
74
- crawler.crawl(@base_url)
75
- queued_links = @redis_mock_object.smembers("queued")
76
- queued_links.should_not include("https://localhost:3532/secure")
77
- queued_links.should include("http://localhost:3532/secure")
78
- end
79
- it "should define https as different if treat_https_as_http is false" do
80
- crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => false})
81
- crawler.crawl(@base_url)
82
- queued_links = @redis_mock_object.smembers("queued")
83
- queued_links.should_not include("https://localhost:3532/secure")
84
- queued_links.should_not include("http://localhost:3532/secure")
85
- end
86
- end
87
- end
88
-
89
56
  context "storing inbound links" do
90
57
 
91
58
  before(:each) do
@@ -61,19 +61,19 @@ describe Cobweb do
61
61
 
62
62
  context "with https ignored" do
63
63
  it "should ignore https" do
64
- result = Cobweb.escape_pattern_for_regex("http://asdf.com")
64
+ result = Cobweb.escape_pattern_for_regex("https://asdf.com")
65
65
  result.should eql "https?://asdf\\.com"
66
66
  end
67
67
  it "should ignore https" do
68
- result = Cobweb.escape_pattern_for_regex("http://asdf.com", :treat_https_as_http => true)
68
+ result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => true)
69
69
  result.should eql "https?://asdf\\.com"
70
70
  end
71
71
  end
72
72
 
73
73
  context "without https ignored" do
74
74
  it "should ignore https" do
75
- result = Cobweb.escape_pattern_for_regex("http://asdf.com", :treat_https_as_http => false)
76
- result.should eql "http://asdf\\.com"
75
+ result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => false)
76
+ result.should eql "https://asdf\\.com"
77
77
  end
78
78
  end
79
79
 
@@ -711,8 +711,6 @@
711
711
  <a href="gfx/photos/07xl.jpg" class="zoom"><img src="gfx/photos/07.jpg" class="shadow" alt="Photo" /></a>
712
712
  <a href="gfx/photos/08xl.jpg" class="zoom"><img src="gfx/photos/08.jpg" class="shadow" alt="Photo" /></a>
713
713
  <a href="gfx/photos/09xl.jpg" class="zoom"><img src="gfx/photos/09.jpg" class="shadow" alt="Photo" /></a>
714
-
715
- <a href="https://localhost:3532/secure">HTTPS Link</a>
716
714
 
717
715
  <a href="#"><img src="gfx/photos/11.jpg" class="shadow" alt="Photo" /></a>
718
716
  <a href="#"><img src="gfx/photos/12.jpg" class="shadow" alt="Photo" /></a>
data/spec/spec_helper.rb CHANGED
@@ -37,11 +37,10 @@ RSpec.configure do |config|
37
37
 
38
38
  config.before(:each) {
39
39
 
40
- @redis_mock_object = MockRedis.new
41
- Redis.stub(:new).and_return(@redis_mock_object)
42
- Redis::Namespace.stub(:new).and_return(@redis_mock_object)
40
+ #redis_mock = double("redis")
41
+ #redis_mock.stub(:new).and_return(@redis_mock_object)
43
42
 
44
- @redis_mock_object.flushdb
43
+ #redis_mock.flushdb
45
44
 
46
45
  }
47
46
 
metadata CHANGED
@@ -1,127 +1,113 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.25
4
+ version: 1.0.26
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stewart McKee
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-24 00:00:00.000000000 Z
11
+ date: 2015-03-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redis
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ">="
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: '3.0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ">="
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: '3.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: nokogiri
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ">="
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: '1.6'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ">="
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: '1.6'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: addressable
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :runtime
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
- - !ruby/object:Gem::Dependency
56
- name: awesome_print
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - ">="
45
+ - - "~>"
60
46
  - !ruby/object:Gem::Version
61
- version: '0'
47
+ version: '2.3'
62
48
  type: :runtime
63
49
  prerelease: false
64
50
  version_requirements: !ruby/object:Gem::Requirement
65
51
  requirements:
66
- - - ">="
52
+ - - "~>"
67
53
  - !ruby/object:Gem::Version
68
- version: '0'
54
+ version: '2.3'
69
55
  - !ruby/object:Gem::Dependency
70
56
  name: sinatra
71
57
  requirement: !ruby/object:Gem::Requirement
72
58
  requirements:
73
- - - ">="
59
+ - - "~>"
74
60
  - !ruby/object:Gem::Version
75
- version: '0'
61
+ version: '1.4'
76
62
  type: :runtime
77
63
  prerelease: false
78
64
  version_requirements: !ruby/object:Gem::Requirement
79
65
  requirements:
80
- - - ">="
66
+ - - "~>"
81
67
  - !ruby/object:Gem::Version
82
- version: '0'
68
+ version: '1.4'
83
69
  - !ruby/object:Gem::Dependency
84
70
  name: haml
85
71
  requirement: !ruby/object:Gem::Requirement
86
72
  requirements:
87
- - - ">="
73
+ - - "~>"
88
74
  - !ruby/object:Gem::Version
89
- version: '0'
75
+ version: '4.0'
90
76
  type: :runtime
91
77
  prerelease: false
92
78
  version_requirements: !ruby/object:Gem::Requirement
93
79
  requirements:
94
- - - ">="
80
+ - - "~>"
95
81
  - !ruby/object:Gem::Version
96
- version: '0'
82
+ version: '4.0'
97
83
  - !ruby/object:Gem::Dependency
98
84
  name: redis-namespace
99
85
  requirement: !ruby/object:Gem::Requirement
100
86
  requirements:
101
- - - ">="
87
+ - - "~>"
102
88
  - !ruby/object:Gem::Version
103
- version: '0'
89
+ version: '1.3'
104
90
  type: :runtime
105
91
  prerelease: false
106
92
  version_requirements: !ruby/object:Gem::Requirement
107
93
  requirements:
108
- - - ">="
94
+ - - "~>"
109
95
  - !ruby/object:Gem::Version
110
- version: '0'
96
+ version: '1.3'
111
97
  - !ruby/object:Gem::Dependency
112
98
  name: json
113
99
  requirement: !ruby/object:Gem::Requirement
114
100
  requirements:
115
- - - ">="
101
+ - - "~>"
116
102
  - !ruby/object:Gem::Version
117
- version: '0'
103
+ version: '1.8'
118
104
  type: :runtime
119
105
  prerelease: false
120
106
  version_requirements: !ruby/object:Gem::Requirement
121
107
  requirements:
122
- - - ">="
108
+ - - "~>"
123
109
  - !ruby/object:Gem::Version
124
- version: '0'
110
+ version: '1.8'
125
111
  - !ruby/object:Gem::Dependency
126
112
  name: slop
127
113
  requirement: !ruby/object:Gem::Requirement