cohesion 1.0.2 → 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/bin/cohesion +21 -9
- data/lib/cohesion.rb +2 -137
- data/lib/cohesion/cache.rb +15 -0
- data/lib/cohesion/check.rb +138 -0
- data/lib/cohesion/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MmZhYjJjNDYwY2FlYmU0MjljNDg0M2JkYWUyMmJhMWQ3MjQ3MzQ4Yw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ODRmNzUwNjRjMjAxZmQ0YmI4NTg1ODEyZmY2ODQ0NWI4YjQ3MWIyMw==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NGU5Yjg3YjQ4ZmQyYmNlYzAwMDQ4MGY3NjYyZmY2ZDA1MTc5ZmU3M2ZmMTQ4
|
10
|
+
OGNiOTFmMmI0YjI0NDRmYmMwYzE3NzQ4NGMyNDAwODc4ODBmYWI4YjhhMTRl
|
11
|
+
YWI1Y2ZmMjFmMzExOGNkNmQ5NWU2Yjk2YzliMjVjYjZmN2U4ZTc=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZWZjMTU3YzRmMzc1M2YyNjA3OGNmZDhhNzk1ODhiNzE0N2IyZGM2ZThkMjIw
|
14
|
+
ZjZlM2EzZDE4Y2QwNmYwOTAzOGQxNGRjOTIwMDk2NGVjNzljZDQ3ZGI0YjZi
|
15
|
+
N2EyMDc5YWUwOTdjMGYyMjMyYjQ4NDg3YjhlMTA5MDVmNWYxYTA=
|
data/bin/cohesion
CHANGED
@@ -25,36 +25,40 @@ opts = Slop.parse(:help => true) do
|
|
25
25
|
on 'output=', 'Path to output data to'
|
26
26
|
on 'output_format=', "Output format, csv or json"
|
27
27
|
|
28
|
+
on 'c', 'clearcache', "Clear the cache"
|
28
29
|
on 'v', 'verbose', 'Display crawl information'
|
29
30
|
on 'd', 'debug', 'Display debug information'
|
30
31
|
on 'w', 'web_statistics', 'Start web stats server'
|
31
32
|
end
|
32
33
|
|
34
|
+
if opts[:clearcache]
|
35
|
+
Cohesion::Cache.clear
|
36
|
+
end
|
33
37
|
if opts[:url]
|
34
38
|
|
35
39
|
options = opts.to_hash.delete_if { |k, v| v.nil? || k == :url}
|
36
40
|
|
41
|
+
options[:output_format] = "json" unless options.has_key? :output_format
|
37
42
|
options[:seed_urls] = File.open(options[:seed_urls][0]).read.split("\n") if options[:seed_urls] && File.exists?(options[:seed_urls][0])
|
38
43
|
options[:internal_urls] = File.open(options[:internal_urls][0]).read.split("\n") if options[:internal_urls] && File.exists?(options[:internal_urls][0])
|
39
44
|
options[:external_urls] = File.open(options[:external_urls][0]).read.split("\n") if options[:external_urls] && File.exists?(options[:external_urls][0])
|
40
45
|
|
41
46
|
failures = Cohesion::Check.site(opts[:url], options)
|
42
|
-
if failures.count == 0
|
47
|
+
if failures[:missing].count == 0
|
43
48
|
exit(true)
|
44
49
|
else
|
45
|
-
if
|
50
|
+
if options[:output]
|
46
51
|
output = []
|
47
|
-
failures.each do |failure|
|
52
|
+
failures[:missing].each do |failure|
|
48
53
|
output << {:error_page => failure[:issue][:url], :inbound_links => failure[:inbound]}
|
49
54
|
end
|
50
55
|
|
51
|
-
|
52
|
-
|
53
|
-
File.open(opts[:output], 'w') do |f|
|
56
|
+
if options[:output_format] == "json"
|
57
|
+
File.open(options[:output], 'w') do |f|
|
54
58
|
f.write output.to_json
|
55
59
|
end
|
56
|
-
elsif
|
57
|
-
CSV.open(
|
60
|
+
elsif options[:output_format] == "csv"
|
61
|
+
CSV.open("missing_#{options[:output]}", "wb") do |csv|
|
58
62
|
csv << ["404 Url", "Page that contains link"]
|
59
63
|
output.each do |line|
|
60
64
|
line[:inbound_links].each do |link|
|
@@ -62,10 +66,18 @@ if opts[:url]
|
|
62
66
|
end
|
63
67
|
end
|
64
68
|
end
|
69
|
+
CSV.open("duplicate_#{options[:output]}", "wb") do |csv|
|
70
|
+
csv << ["Hash of Content", "Pages with duplicate content"]
|
71
|
+
failures[:duplicate].each do |md5, pages|
|
72
|
+
pages.each do |link|
|
73
|
+
csv << [md5, link]
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
65
77
|
end
|
66
78
|
end
|
67
79
|
exit(false)
|
68
80
|
end
|
69
81
|
else
|
70
|
-
puts
|
82
|
+
puts opts
|
71
83
|
end
|
data/lib/cohesion.rb
CHANGED
@@ -4,145 +4,10 @@ require 'cobweb'
|
|
4
4
|
require 'ptools'
|
5
5
|
require 'digest/md5'
|
6
6
|
|
7
|
+
require 'cohesion/check'
|
8
|
+
require 'cohesion/cache'
|
7
9
|
require 'cohesion/railtie' if defined?(Rails)
|
8
10
|
|
9
11
|
module Cohesion
|
10
|
-
class Check
|
11
12
|
|
12
|
-
def self.rails_text
|
13
|
-
puts "WARNING - not working yet..."
|
14
|
-
root_path = Rails.root.to_s
|
15
|
-
Dir.glob("**/*").each do |filename|
|
16
|
-
unless File.directory?(filename) || File.binary?(filename) || filename.ends_with?(".rdb")
|
17
|
-
f = File.open(filename, "r")
|
18
|
-
content = f.read()
|
19
|
-
f.close
|
20
|
-
if content =~ /(https?:\/\/[a-zA-Z0-9\.\/\-_%&\?]+)/
|
21
|
-
print "Checking #{$1} "
|
22
|
-
begin
|
23
|
-
status_code = Cobweb.new(:raise_exceptions => true).head($1)[:status_code].to_i
|
24
|
-
if status_code != 200
|
25
|
-
puts " [#{status_code}] \e[31m\u2717\e[0m"
|
26
|
-
else
|
27
|
-
puts "\e[32m\u2713\e[0m"
|
28
|
-
end
|
29
|
-
rescue SocketError
|
30
|
-
status_code = 0
|
31
|
-
puts " [DNS Failed] \e[31m\u2717\e[0m"
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
def self.rails_object
|
39
|
-
puts "WARNING - not working yet..."
|
40
|
-
root_path = Rails.root.to_s
|
41
|
-
#app_name = Rails.application.name
|
42
|
-
#puts "Checking #{app_name}..."
|
43
|
-
app = CobwebSample::Application
|
44
|
-
app.routes.default_url_options = { :host => 'xxx.com' }
|
45
|
-
|
46
|
-
Dir.glob("app/controllers/**/*").each do |filename|
|
47
|
-
controller_name = filename.gsub(".rb","").split("/")[-1].classify
|
48
|
-
unless controller_name == "ApplicationController"
|
49
|
-
puts "Processing #{controller_name}"
|
50
|
-
controller = controller_name.constantize.new
|
51
|
-
|
52
|
-
view = ActionView::Base.new(ActionController::Base.view_paths, {}, controller)
|
53
|
-
|
54
|
-
view.view_paths = ActionController::Base.view_paths
|
55
|
-
view.extend ApplicationHelper
|
56
|
-
view.controller = controller
|
57
|
-
view.class_eval do
|
58
|
-
include ApplicationHelper
|
59
|
-
include app.routes.url_helpers
|
60
|
-
end
|
61
|
-
begin
|
62
|
-
puts view.render(:template => '/tests/index.html.erb')
|
63
|
-
rescue => e
|
64
|
-
puts "Error rendering view: #{e.message}"
|
65
|
-
end
|
66
|
-
end
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
def self.site(url, options={})
|
71
|
-
errors = []
|
72
|
-
failures = []
|
73
|
-
|
74
|
-
pages = {}
|
75
|
-
|
76
|
-
options[:cache] = options[:cache].to_i if options[:cache]
|
77
|
-
crawler_options = {:cache_type => :full, :crawl_linked_external => true, :store_inbound_links => true}.merge(options)
|
78
|
-
|
79
|
-
statistics = CobwebCrawler.new(crawler_options).crawl(url) do |page|
|
80
|
-
print page[:url]
|
81
|
-
|
82
|
-
duplicate = !pages[Digest::MD5.hexdigest(page[:body])].nil?
|
83
|
-
pages[Digest::MD5.hexdigest(page[:body])] = [] unless pages[Digest::MD5.hexdigest(page[:body])]
|
84
|
-
pages[Digest::MD5.hexdigest(page[:body])] << page[:url]
|
85
|
-
|
86
|
-
# if it was a 404 before, just check again not using the cache this time
|
87
|
-
if page[:status_code] == 404
|
88
|
-
page = Cobweb.new(crawler_options.merge(:cache => nil)).get(page[:url])
|
89
|
-
end
|
90
|
-
|
91
|
-
if page[:status_code] == 404 || duplicate
|
92
|
-
if duplicate
|
93
|
-
puts " [duplicate] \e[31m\u2717\e[0m"
|
94
|
-
else
|
95
|
-
puts " [#{page[:status_code]}] \e[31m\u2717\e[0m"
|
96
|
-
end
|
97
|
-
failures << page
|
98
|
-
else
|
99
|
-
puts " \e[32m\u2713\e[0m"
|
100
|
-
end
|
101
|
-
end
|
102
|
-
|
103
|
-
puts statistics.redis.namespace
|
104
|
-
puts statistics.get_statistics
|
105
|
-
|
106
|
-
total_inbound_failures = 0
|
107
|
-
total_failures = 0
|
108
|
-
|
109
|
-
issues = []
|
110
|
-
if failures.count == 0
|
111
|
-
puts "All links working!"
|
112
|
-
else
|
113
|
-
puts "Failed urls:"
|
114
|
-
failures.each do |f|
|
115
|
-
inbound_links = statistics.inbound_links_for(f[:url])
|
116
|
-
issues << {:issue => f, :inbound => inbound_links}
|
117
|
-
|
118
|
-
total_inbound_failures += inbound_links.count
|
119
|
-
total_failures += 1
|
120
|
-
|
121
|
-
puts ""
|
122
|
-
puts "#{f[:url]} [ #{f[:status_code]} ]"
|
123
|
-
inbound_links.each do |inbound_link|
|
124
|
-
puts " - #{inbound_link}"
|
125
|
-
end
|
126
|
-
end
|
127
|
-
|
128
|
-
puts ""
|
129
|
-
puts "Duplicate Content"
|
130
|
-
puts ""
|
131
|
-
pages.select{|k,v| v.count > 1}.each do |k,v|
|
132
|
-
puts "Duplicate: #{k}"
|
133
|
-
v.map{|x| puts " - #{x}" }
|
134
|
-
end
|
135
|
-
|
136
|
-
|
137
|
-
puts ""
|
138
|
-
puts "Total Failed URLs: #{total_failures}"
|
139
|
-
puts "Total Duplicates: #{pages.map{|d| d[1]}.select{|d| d.count > 1}.inject{|total, d| total + d.count}.count}"
|
140
|
-
puts "Total Inbound Failures (Pages linking to a 404): #{total_inbound_failures}"
|
141
|
-
puts ""
|
142
|
-
end
|
143
|
-
puts
|
144
|
-
|
145
|
-
return issues
|
146
|
-
end
|
147
|
-
end
|
148
13
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Cohesion
|
2
|
+
class Cache
|
3
|
+
def self.clear_public
|
4
|
+
Cobweb::Cache.flush_public
|
5
|
+
end
|
6
|
+
def self.clear_crawls
|
7
|
+
Cobweb::Cache.flush_all_private
|
8
|
+
end
|
9
|
+
def self.clear_crawl(crawl_id)
|
10
|
+
Cobweb::Cache.flush_crawl(crawl_id)
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
14
|
+
|
15
|
+
end
|
@@ -0,0 +1,138 @@
|
|
1
|
+
module Cohesion
|
2
|
+
class Check
|
3
|
+
|
4
|
+
def self.rails_text
|
5
|
+
puts "WARNING - not working yet..."
|
6
|
+
root_path = Rails.root.to_s
|
7
|
+
Dir.glob("**/*").each do |filename|
|
8
|
+
unless File.directory?(filename) || File.binary?(filename) || filename.ends_with?(".rdb")
|
9
|
+
f = File.open(filename, "r")
|
10
|
+
content = f.read()
|
11
|
+
f.close
|
12
|
+
if content =~ /(https?:\/\/[a-zA-Z0-9\.\/\-_%&\?]+)/
|
13
|
+
print "Checking #{$1} "
|
14
|
+
begin
|
15
|
+
status_code = Cobweb.new(:raise_exceptions => true).head($1)[:status_code].to_i
|
16
|
+
if status_code != 200
|
17
|
+
puts " [#{status_code}] \e[31m\u2717\e[0m"
|
18
|
+
else
|
19
|
+
puts "\e[32m\u2713\e[0m"
|
20
|
+
end
|
21
|
+
rescue SocketError
|
22
|
+
status_code = 0
|
23
|
+
puts " [DNS Failed] \e[31m\u2717\e[0m"
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def self.rails_object
|
31
|
+
puts "WARNING - not working yet..."
|
32
|
+
root_path = Rails.root.to_s
|
33
|
+
#app_name = Rails.application.name
|
34
|
+
#puts "Checking #{app_name}..."
|
35
|
+
app = CobwebSample::Application
|
36
|
+
app.routes.default_url_options = { :host => 'xxx.com' }
|
37
|
+
|
38
|
+
Dir.glob("app/controllers/**/*").each do |filename|
|
39
|
+
controller_name = filename.gsub(".rb","").split("/")[-1].classify
|
40
|
+
unless controller_name == "ApplicationController"
|
41
|
+
puts "Processing #{controller_name}"
|
42
|
+
controller = controller_name.constantize.new
|
43
|
+
|
44
|
+
view = ActionView::Base.new(ActionController::Base.view_paths, {}, controller)
|
45
|
+
|
46
|
+
view.view_paths = ActionController::Base.view_paths
|
47
|
+
view.extend ApplicationHelper
|
48
|
+
view.controller = controller
|
49
|
+
view.class_eval do
|
50
|
+
include ApplicationHelper
|
51
|
+
include app.routes.url_helpers
|
52
|
+
end
|
53
|
+
begin
|
54
|
+
puts view.render(:template => '/tests/index.html.erb')
|
55
|
+
rescue => e
|
56
|
+
puts "Error rendering view: #{e.message}"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def self.site(url, options={})
|
63
|
+
errors = []
|
64
|
+
failures = []
|
65
|
+
|
66
|
+
pages = {}
|
67
|
+
|
68
|
+
options[:cache] = options[:cache].to_i if options[:cache]
|
69
|
+
crawler_options = {:cache_type => :full, :crawl_linked_external => true, :store_inbound_links => true}.merge(options)
|
70
|
+
|
71
|
+
statistics = CobwebCrawler.new(crawler_options).crawl(url) do |page|
|
72
|
+
print page[:url]
|
73
|
+
|
74
|
+
#duplicate = !pages[Digest::MD5.hexdigest(page[:body])].nil?
|
75
|
+
#pages[Digest::MD5.hexdigest(page[:body])] = [] unless pages[Digest::MD5.hexdigest(page[:body])]
|
76
|
+
#pages[Digest::MD5.hexdigest(page[:body])] << page[:url]
|
77
|
+
|
78
|
+
# if it was a 404 before, just check again not using the cache this time
|
79
|
+
if page[:status_code] == 404
|
80
|
+
page = Cobweb.new(crawler_options.merge(:cache => nil)).get(page[:url])
|
81
|
+
end
|
82
|
+
|
83
|
+
if page[:status_code] == 404 #|| duplicate
|
84
|
+
#if duplicate
|
85
|
+
# puts " [duplicate] \e[31m\u2717\e[0m"
|
86
|
+
#else
|
87
|
+
puts " [#{page[:status_code]}] \e[31m\u2717\e[0m"
|
88
|
+
#end
|
89
|
+
failures << page.select{|k,v| [:url, :status_code].include?(k)}
|
90
|
+
else
|
91
|
+
puts " \e[32m\u2713\e[0m"
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
total_inbound_failures = 0
|
96
|
+
total_failures = 0
|
97
|
+
|
98
|
+
issues = {:missing => [], :duplicate => []}#pages.select{|k,v| v.count > 1}}
|
99
|
+
if failures.count > 0
|
100
|
+
puts "Failed urls:"
|
101
|
+
failures.each do |f|
|
102
|
+
inbound_links = statistics.inbound_links_for(f[:url])
|
103
|
+
issues[:missing] << {:issue => f, :inbound => inbound_links}
|
104
|
+
|
105
|
+
total_inbound_failures += inbound_links.count
|
106
|
+
total_failures += 1
|
107
|
+
|
108
|
+
puts ""
|
109
|
+
puts "#{f[:url]} [ #{f[:status_code]} ]"
|
110
|
+
inbound_links.each do |inbound_link|
|
111
|
+
puts " - #{inbound_link}"
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
duplicate_page_count = 0#pages.map{|d| d[1].count}.select{|count| count > 1}.reduce(:+).to_i
|
117
|
+
if duplicate_page_count > 0
|
118
|
+
|
119
|
+
puts ""
|
120
|
+
puts "Duplicate Content"
|
121
|
+
puts ""
|
122
|
+
pages.select{|k,v| v.count > 1}.each do |k,v|
|
123
|
+
puts "Duplicate: #{k}"
|
124
|
+
v.map{|x| puts " - #{x}" }
|
125
|
+
end
|
126
|
+
|
127
|
+
end
|
128
|
+
|
129
|
+
puts ""
|
130
|
+
puts "Total Failed URLs: #{total_failures}"
|
131
|
+
#puts "Total Duplicates: #{duplicate_page_count}"
|
132
|
+
puts "Total Inbound Failures (Pages linking to a 404): #{total_inbound_failures}"
|
133
|
+
puts ""
|
134
|
+
|
135
|
+
return issues
|
136
|
+
end
|
137
|
+
end
|
138
|
+
end
|
data/lib/cohesion/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cohesion
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stewart McKee
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-12-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cobweb
|
@@ -59,6 +59,8 @@ files:
|
|
59
59
|
- bin/cohesion
|
60
60
|
- cohesion.gemspec
|
61
61
|
- lib/cohesion.rb
|
62
|
+
- lib/cohesion/cache.rb
|
63
|
+
- lib/cohesion/check.rb
|
62
64
|
- lib/cohesion/railtie.rb
|
63
65
|
- lib/cohesion/version.rb
|
64
66
|
- lib/tasks/cohesion.rake
|