cobweb 1.0.25 → 1.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 433da726316611ac2835723ff04e645fb00a3dc4
4
- data.tar.gz: e85d23955c5ddbb02cf69baef0c5e240ff7d8204
3
+ metadata.gz: b39481d4cdb68d7f602e63a3919e51c126648a2d
4
+ data.tar.gz: bc29d59ae32beadf047c6e6c73503770d4ca842e
5
5
  SHA512:
6
- metadata.gz: 23b0e6707b07bcad8621f8c547f48c3c8d0abf946e6454ffa3fee599fdfac5ab01942579c3f4a16ce9e743034225e68695f77df752032acc0c9fbfdfeb7e43ce
7
- data.tar.gz: 702b5ed7c93e56f3994c7bee735ef1a44ceabd0cf349e1976b3692a094465b949ca618c106f1896e33551caeb64fe5383b57f53b8a8b4f18144b41255c092ad7
6
+ metadata.gz: fe824099c8329662036c3df2ec67688ba46f8a3337d63b4942cf93e5dd6de452f3215ddc5a80968dad552e29ff4ba445456112c39cec7e0c1c3166d4aadb91eb
7
+ data.tar.gz: 43fbe93aa454e77cfdb8f95d4c580dc7a8f6c526025839db75f71ccce02c4f38a2a03e91a2c8417c39311367435c2b6a03168eb48b28eb893014b8e67663ef01
data/README.textile CHANGED
@@ -1,4 +1,4 @@
1
- h1. Cobweb v1.0.25
1
+ h1. Cobweb v1.0.26
2
2
 
3
3
  "@cobweb_gem":https://twitter.com/cobweb_gem
4
4
  !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
data/bin/cobweb CHANGED
@@ -17,12 +17,13 @@ opts = Slop.parse(:help => true) do
17
17
 
18
18
  on 'output=', 'Path to output data to'
19
19
  on 'script=', "Script to generate report"
20
-
20
+
21
21
  on 'url=', 'URL to start crawl from'
22
- on 'internal_urls=', 'Url patterns to include', :as => Array
23
- on 'external_urls=', 'Url patterns to exclude', :as => Array
24
- on 'seed_urls=', "Seed urls", :as => Array
25
- on 'crawl_limit=', 'Limit the crawl to a number of urls', :as => Integer
22
+ on 'internal_urls=', 'Comma separated list of URL patterns to include (* is wildcard)', :as => Array
23
+ on 'external_urls=', 'Comma separated list of URL patterns to exclude (* is wildcard)', :as => Array
24
+ on 'seed_urls=', "CSV list of seed urls to crawl", :as => Array
25
+ on 'seed_url_file=', "File with URL per line to add to seed list"
26
+ on 'crawl_limit=', 'Maximum number of URLs to crawl', :as => Integer
26
27
  on 'thread_count=', "Set the number of threads used", :as => Integer
27
28
  on 'timeout=', "Sets the timeout for http requests", :as => Integer
28
29
  on 'v', 'verbose', 'Display crawl information'
@@ -38,10 +39,11 @@ opts = Slop.parse(:help => true) do
38
39
  banner 'Usage: cobweb export [options]'
39
40
 
40
41
  on 'url=', 'URL to start crawl from'
41
- on 'internal_urls=', 'Url patterns to include', :as => Array
42
- on 'external_urls=', 'Url patterns to exclude', :as => Array
43
- on 'seed_urls=', "Seed urls", :as => Array
44
- on 'crawl_limit=', 'Limit the crawl to a number of urls', :as => Integer
42
+ on 'internal_urls=', 'Comma separated list of URL patterns to include (* is wildcard)', :as => Array
43
+ on 'external_urls=', 'Comma separated list of URL patterns to exclude (* is wildcard)', :as => Array
44
+ on 'seed_urls=', "CSV list of seed urls to crawl", :as => Array
45
+ on 'seed_url_file=', "File with URL per line to add to seed list"
46
+ on 'crawl_limit=', 'Maximum number of URLs to crawl', :as => Integer
45
47
  on 'thread_count=', "Set the number of threads used", :as => Integer
46
48
  on 'timeout=', "Sets the timeout for http requests", :as => Integer
47
49
  on 'v', 'verbose', 'Display crawl information'
data/lib/cobweb.rb CHANGED
@@ -250,10 +250,7 @@ class Cobweb
250
250
  end
251
251
  end
252
252
  rescue RedirectError => e
253
- if @options[:raise_exceptions]
254
- puts "Re-Raising error #{e.message} on #{uri.to_s}"
255
- raise e
256
- end
253
+ raise e if @options[:raise_exceptions]
257
254
  puts "ERROR RedirectError: #{e.message}"
258
255
 
259
256
  ## generate a blank content
@@ -456,9 +453,9 @@ class Cobweb
456
453
  pattern = pattern.gsub(".", "\\.")
457
454
  pattern = pattern.gsub("?", "\\?")
458
455
  pattern = pattern.gsub("+", "\\\\+")
459
- pattern = pattern.gsub("*", ".*?")
460
- if options[:treat_https_as_http] || !options.has_key?(:treat_https_as_http)
461
- pattern = pattern.gsub("http:", "https?:")
456
+ pattern = pattern.gsub("*", ".*?")
457
+ if !options.has_key?(:treat_https_as_http) || options[:treat_https_as_http]
458
+ pattern = pattern.gsub("https", "https?")
462
459
  end
463
460
  pattern
464
461
  end
@@ -27,8 +27,7 @@ class CobwebCrawler
27
27
  @options[:seed_urls].map{|link| @redis.sadd "queued", link }
28
28
 
29
29
  @options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
30
-
31
- @options[:treat_https_as_http] = true unless @options.has_key? :treat_https_as_http
30
+
32
31
  @debug = @options[:debug]
33
32
 
34
33
  @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
@@ -101,19 +100,16 @@ class CobwebCrawler
101
100
 
102
101
  document_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https]).uniq
103
102
 
104
-
105
103
  # select the link if its internal (eliminate external before expensive lookups in queued and crawled)
106
104
  cobweb_links = CobwebLinks.new(@options)
107
105
 
108
106
  internal_links = document_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s) && !cobweb_links.matches_external?(link))}
109
107
 
110
108
  # if the site has the same content for http and https then normalize to http
111
-
112
109
  if @options[:treat_https_as_http]
113
110
  internal_links.map!{|link| link.gsub(/^https/, "http")}
114
111
  end
115
112
 
116
-
117
113
  # reject the link if we've crawled it or queued it
118
114
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
119
115
  internal_links.reject!{|link| @redis.sismember("queued", link)}
data/lib/cobweb_links.rb CHANGED
@@ -12,9 +12,9 @@ class CobwebLinks
12
12
  @options[:external_urls] = [] unless @options.has_key? :external_urls
13
13
  @options[:debug] = false unless @options.has_key? :debug
14
14
 
15
- @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, @options)}")}
16
- @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, @options)}")}
17
-
15
+ @internal_patterns = @options[:internal_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
16
+ @external_patterns = @options[:external_urls].map{|pattern| Regexp.new("^#{Cobweb.escape_pattern_for_regex(pattern, options)}")}
17
+
18
18
  end
19
19
 
20
20
  def allowed?(link)
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.25"
6
+ "1.0.26"
7
7
  end
8
8
 
9
9
  end
@@ -13,8 +13,16 @@ class ExportCommand
13
13
  :raise_exceptions => true,
14
14
  :root_path => default_root_path
15
15
  }.merge(opts)
16
-
17
16
 
17
+ if options.has_key?(:seed_url_file)
18
+ filename = options.delete(:seed_url_file)
19
+ options[:seed_urls] = []
20
+ File.open(filename, "r") do |f|
21
+ f.each_line do |line|
22
+ options[:seed_urls] << line
23
+ end
24
+ end
25
+ end
18
26
 
19
27
  statistics = CobwebCrawler.new(options).crawl(options[:url]) do |page|
20
28
  begin
@@ -28,7 +36,7 @@ class ExportCommand
28
36
 
29
37
  uri.path.split("/")[0..-2].each do |dir|
30
38
  path+="/" unless path.ends_with?("/")
31
- path+=dir
39
+ path+=dir
32
40
  if File.exist?(options[:root_path] + path) && !File.directory?(options[:root_path] + path)
33
41
  FileUtils.mv(options[:root_path] + path, options[:root_path] + path + ".tmp")
34
42
  Dir.mkdir(options[:root_path] + path)
@@ -48,7 +56,7 @@ class ExportCommand
48
56
  doc = Nokogiri::HTML.parse(page[:body])
49
57
 
50
58
  if doc.search("title").first
51
- title = doc.search("title").first.content.gsub(" - ", " ")
59
+ title = doc.search("title").first.content.gsub(" - ", " ")
52
60
  else
53
61
  title = uri.path.split("/")[-1]
54
62
  end
@@ -5,6 +5,16 @@ class ReportCommand
5
5
  options = opts.to_hash.delete_if { |k, v| v.nil?}
6
6
  options[:quiet] = !opts[:verbose]
7
7
 
8
+ if options.has_key?(:seed_url_file)
9
+ filename = options.delete(:seed_url_file)
10
+ options[:seed_urls] = []
11
+ File.open(filename, "r") do |f|
12
+ f.each_line do |line|
13
+ options[:seed_urls] << line
14
+ end
15
+ end
16
+ end
17
+
8
18
  @crawler = CobwebCrawler.new({:cache_type => :full, :raise_exceptions => true}.merge(options))
9
19
 
10
20
  columns = nil
@@ -23,7 +33,6 @@ class ReportCommand
23
33
  page["img without alt count"] = scope.img_tags.select{|node| node[:alt].nil? || node[:alt].strip().empty?}.count
24
34
  page["img alt"] = scope.img_tags_with_alt.map{|node| node[:alt]}.uniq
25
35
 
26
-
27
36
  if !columns
28
37
  columns = page.keys.reject{|k| k==:body || k==:links}
29
38
  csv << columns.map{|k| k.to_s}
@@ -4,14 +4,14 @@ if Gem::Specification.find_all_by_name("sidekiq", ">=1.0.0").count >= 1
4
4
  require 'sidekiq'
5
5
  else
6
6
  SIDEKIQ_INSTALLED = false
7
- puts "sidekiq gem not installed, skipping crawl_worker specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
7
+ puts "sidekiq gem not installed, skipping crawl_worker specs"
8
8
  end
9
9
  if Gem::Specification.find_all_by_name("resque", ">=1.0.0").count >= 1
10
10
  RESQUE_INSTALLED = true
11
11
  require 'resque'
12
12
  else
13
13
  RESQUE_INSTALLED = false
14
- puts "resque gem not installed, skipping crawl_job specs" if defined?(ENVIRONMENT) && ENVIRONMENT=="test"
14
+ puts "resque gem not installed, skipping crawl_job specs"
15
15
  end
16
16
 
17
17
  module Sidekiq
@@ -53,39 +53,6 @@ describe CobwebCrawler do
53
53
 
54
54
  end
55
55
 
56
- context "internal_links" do
57
- it "should match internal links without being explicitly set" do
58
- crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
59
- crawler.crawl(@base_url)
60
- queued_links = @redis_mock_object.smembers("queued")
61
- queued_links.should_not include("http://themeforest.net/item/cleandream/490140")
62
- queued_links.should include("http://localhost:3532/secure")
63
- end
64
- context "with https" do
65
- it "should match https by default" do
66
- crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1})
67
- crawler.crawl(@base_url)
68
- queued_links = @redis_mock_object.smembers("queued")
69
- queued_links.should_not include("https://localhost:3532/secure")
70
- queued_links.should include("http://localhost:3532/secure")
71
- end
72
- it "should not define https as different if treat_https_as_http is true" do
73
- crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => true})
74
- crawler.crawl(@base_url)
75
- queued_links = @redis_mock_object.smembers("queued")
76
- queued_links.should_not include("https://localhost:3532/secure")
77
- queued_links.should include("http://localhost:3532/secure")
78
- end
79
- it "should define https as different if treat_https_as_http is false" do
80
- crawler = CobwebCrawler.new({:cache => false, :crawl_limit => 1, :treat_https_as_http => false})
81
- crawler.crawl(@base_url)
82
- queued_links = @redis_mock_object.smembers("queued")
83
- queued_links.should_not include("https://localhost:3532/secure")
84
- queued_links.should_not include("http://localhost:3532/secure")
85
- end
86
- end
87
- end
88
-
89
56
  context "storing inbound links" do
90
57
 
91
58
  before(:each) do
@@ -61,19 +61,19 @@ describe Cobweb do
61
61
 
62
62
  context "with https ignored" do
63
63
  it "should ignore https" do
64
- result = Cobweb.escape_pattern_for_regex("http://asdf.com")
64
+ result = Cobweb.escape_pattern_for_regex("https://asdf.com")
65
65
  result.should eql "https?://asdf\\.com"
66
66
  end
67
67
  it "should ignore https" do
68
- result = Cobweb.escape_pattern_for_regex("http://asdf.com", :treat_https_as_http => true)
68
+ result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => true)
69
69
  result.should eql "https?://asdf\\.com"
70
70
  end
71
71
  end
72
72
 
73
73
  context "without https ignored" do
74
74
  it "should ignore https" do
75
- result = Cobweb.escape_pattern_for_regex("http://asdf.com", :treat_https_as_http => false)
76
- result.should eql "http://asdf\\.com"
75
+ result = Cobweb.escape_pattern_for_regex("https://asdf.com", :treat_https_as_http => false)
76
+ result.should eql "https://asdf\\.com"
77
77
  end
78
78
  end
79
79
 
@@ -711,8 +711,6 @@
711
711
  <a href="gfx/photos/07xl.jpg" class="zoom"><img src="gfx/photos/07.jpg" class="shadow" alt="Photo" /></a>
712
712
  <a href="gfx/photos/08xl.jpg" class="zoom"><img src="gfx/photos/08.jpg" class="shadow" alt="Photo" /></a>
713
713
  <a href="gfx/photos/09xl.jpg" class="zoom"><img src="gfx/photos/09.jpg" class="shadow" alt="Photo" /></a>
714
-
715
- <a href="https://localhost:3532/secure">HTTPS Link</a>
716
714
 
717
715
  <a href="#"><img src="gfx/photos/11.jpg" class="shadow" alt="Photo" /></a>
718
716
  <a href="#"><img src="gfx/photos/12.jpg" class="shadow" alt="Photo" /></a>
data/spec/spec_helper.rb CHANGED
@@ -37,11 +37,10 @@ RSpec.configure do |config|
37
37
 
38
38
  config.before(:each) {
39
39
 
40
- @redis_mock_object = MockRedis.new
41
- Redis.stub(:new).and_return(@redis_mock_object)
42
- Redis::Namespace.stub(:new).and_return(@redis_mock_object)
40
+ #redis_mock = double("redis")
41
+ #redis_mock.stub(:new).and_return(@redis_mock_object)
43
42
 
44
- @redis_mock_object.flushdb
43
+ #redis_mock.flushdb
45
44
 
46
45
  }
47
46
 
metadata CHANGED
@@ -1,127 +1,113 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.25
4
+ version: 1.0.26
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stewart McKee
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-24 00:00:00.000000000 Z
11
+ date: 2015-03-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redis
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ">="
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: '3.0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ">="
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: '3.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: nokogiri
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ">="
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: '1.6'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ">="
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: '1.6'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: addressable
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :runtime
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
- - !ruby/object:Gem::Dependency
56
- name: awesome_print
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - ">="
45
+ - - "~>"
60
46
  - !ruby/object:Gem::Version
61
- version: '0'
47
+ version: '2.3'
62
48
  type: :runtime
63
49
  prerelease: false
64
50
  version_requirements: !ruby/object:Gem::Requirement
65
51
  requirements:
66
- - - ">="
52
+ - - "~>"
67
53
  - !ruby/object:Gem::Version
68
- version: '0'
54
+ version: '2.3'
69
55
  - !ruby/object:Gem::Dependency
70
56
  name: sinatra
71
57
  requirement: !ruby/object:Gem::Requirement
72
58
  requirements:
73
- - - ">="
59
+ - - "~>"
74
60
  - !ruby/object:Gem::Version
75
- version: '0'
61
+ version: '1.4'
76
62
  type: :runtime
77
63
  prerelease: false
78
64
  version_requirements: !ruby/object:Gem::Requirement
79
65
  requirements:
80
- - - ">="
66
+ - - "~>"
81
67
  - !ruby/object:Gem::Version
82
- version: '0'
68
+ version: '1.4'
83
69
  - !ruby/object:Gem::Dependency
84
70
  name: haml
85
71
  requirement: !ruby/object:Gem::Requirement
86
72
  requirements:
87
- - - ">="
73
+ - - "~>"
88
74
  - !ruby/object:Gem::Version
89
- version: '0'
75
+ version: '4.0'
90
76
  type: :runtime
91
77
  prerelease: false
92
78
  version_requirements: !ruby/object:Gem::Requirement
93
79
  requirements:
94
- - - ">="
80
+ - - "~>"
95
81
  - !ruby/object:Gem::Version
96
- version: '0'
82
+ version: '4.0'
97
83
  - !ruby/object:Gem::Dependency
98
84
  name: redis-namespace
99
85
  requirement: !ruby/object:Gem::Requirement
100
86
  requirements:
101
- - - ">="
87
+ - - "~>"
102
88
  - !ruby/object:Gem::Version
103
- version: '0'
89
+ version: '1.3'
104
90
  type: :runtime
105
91
  prerelease: false
106
92
  version_requirements: !ruby/object:Gem::Requirement
107
93
  requirements:
108
- - - ">="
94
+ - - "~>"
109
95
  - !ruby/object:Gem::Version
110
- version: '0'
96
+ version: '1.3'
111
97
  - !ruby/object:Gem::Dependency
112
98
  name: json
113
99
  requirement: !ruby/object:Gem::Requirement
114
100
  requirements:
115
- - - ">="
101
+ - - "~>"
116
102
  - !ruby/object:Gem::Version
117
- version: '0'
103
+ version: '1.8'
118
104
  type: :runtime
119
105
  prerelease: false
120
106
  version_requirements: !ruby/object:Gem::Requirement
121
107
  requirements:
122
- - - ">="
108
+ - - "~>"
123
109
  - !ruby/object:Gem::Version
124
- version: '0'
110
+ version: '1.8'
125
111
  - !ruby/object:Gem::Dependency
126
112
  name: slop
127
113
  requirement: !ruby/object:Gem::Requirement