cohesion 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/bin/cohesion +21 -9
- data/lib/cohesion.rb +2 -137
- data/lib/cohesion/cache.rb +15 -0
- data/lib/cohesion/check.rb +138 -0
- data/lib/cohesion/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MmZhYjJjNDYwY2FlYmU0MjljNDg0M2JkYWUyMmJhMWQ3MjQ3MzQ4Yw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ODRmNzUwNjRjMjAxZmQ0YmI4NTg1ODEyZmY2ODQ0NWI4YjQ3MWIyMw==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NGU5Yjg3YjQ4ZmQyYmNlYzAwMDQ4MGY3NjYyZmY2ZDA1MTc5ZmU3M2ZmMTQ4
|
10
|
+
OGNiOTFmMmI0YjI0NDRmYmMwYzE3NzQ4NGMyNDAwODc4ODBmYWI4YjhhMTRl
|
11
|
+
YWI1Y2ZmMjFmMzExOGNkNmQ5NWU2Yjk2YzliMjVjYjZmN2U4ZTc=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZWZjMTU3YzRmMzc1M2YyNjA3OGNmZDhhNzk1ODhiNzE0N2IyZGM2ZThkMjIw
|
14
|
+
ZjZlM2EzZDE4Y2QwNmYwOTAzOGQxNGRjOTIwMDk2NGVjNzljZDQ3ZGI0YjZi
|
15
|
+
N2EyMDc5YWUwOTdjMGYyMjMyYjQ4NDg3YjhlMTA5MDVmNWYxYTA=
|
data/bin/cohesion
CHANGED
@@ -25,36 +25,40 @@ opts = Slop.parse(:help => true) do
|
|
25
25
|
on 'output=', 'Path to output data to'
|
26
26
|
on 'output_format=', "Output format, csv or json"
|
27
27
|
|
28
|
+
on 'c', 'clearcache', "Clear the cache"
|
28
29
|
on 'v', 'verbose', 'Display crawl information'
|
29
30
|
on 'd', 'debug', 'Display debug information'
|
30
31
|
on 'w', 'web_statistics', 'Start web stats server'
|
31
32
|
end
|
32
33
|
|
34
|
+
if opts[:clearcache]
|
35
|
+
Cohesion::Cache.clear
|
36
|
+
end
|
33
37
|
if opts[:url]
|
34
38
|
|
35
39
|
options = opts.to_hash.delete_if { |k, v| v.nil? || k == :url}
|
36
40
|
|
41
|
+
options[:output_format] = "json" unless options.has_key? :output_format
|
37
42
|
options[:seed_urls] = File.open(options[:seed_urls][0]).read.split("\n") if options[:seed_urls] && File.exists?(options[:seed_urls][0])
|
38
43
|
options[:internal_urls] = File.open(options[:internal_urls][0]).read.split("\n") if options[:internal_urls] && File.exists?(options[:internal_urls][0])
|
39
44
|
options[:external_urls] = File.open(options[:external_urls][0]).read.split("\n") if options[:external_urls] && File.exists?(options[:external_urls][0])
|
40
45
|
|
41
46
|
failures = Cohesion::Check.site(opts[:url], options)
|
42
|
-
if failures.count == 0
|
47
|
+
if failures[:missing].count == 0
|
43
48
|
exit(true)
|
44
49
|
else
|
45
|
-
if
|
50
|
+
if options[:output]
|
46
51
|
output = []
|
47
|
-
failures.each do |failure|
|
52
|
+
failures[:missing].each do |failure|
|
48
53
|
output << {:error_page => failure[:issue][:url], :inbound_links => failure[:inbound]}
|
49
54
|
end
|
50
55
|
|
51
|
-
|
52
|
-
|
53
|
-
File.open(opts[:output], 'w') do |f|
|
56
|
+
if options[:output_format] == "json"
|
57
|
+
File.open(options[:output], 'w') do |f|
|
54
58
|
f.write output.to_json
|
55
59
|
end
|
56
|
-
elsif
|
57
|
-
CSV.open(
|
60
|
+
elsif options[:output_format] == "csv"
|
61
|
+
CSV.open("missing_#{options[:output]}", "wb") do |csv|
|
58
62
|
csv << ["404 Url", "Page that contains link"]
|
59
63
|
output.each do |line|
|
60
64
|
line[:inbound_links].each do |link|
|
@@ -62,10 +66,18 @@ if opts[:url]
|
|
62
66
|
end
|
63
67
|
end
|
64
68
|
end
|
69
|
+
CSV.open("duplicate_#{options[:output]}", "wb") do |csv|
|
70
|
+
csv << ["Hash of Content", "Pages with duplicate content"]
|
71
|
+
failures[:duplicate].each do |md5, pages|
|
72
|
+
pages.each do |link|
|
73
|
+
csv << [md5, link]
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
65
77
|
end
|
66
78
|
end
|
67
79
|
exit(false)
|
68
80
|
end
|
69
81
|
else
|
70
|
-
puts
|
82
|
+
puts opts
|
71
83
|
end
|
data/lib/cohesion.rb
CHANGED
@@ -4,145 +4,10 @@ require 'cobweb'
|
|
4
4
|
require 'ptools'
|
5
5
|
require 'digest/md5'
|
6
6
|
|
7
|
+
require 'cohesion/check'
|
8
|
+
require 'cohesion/cache'
|
7
9
|
require 'cohesion/railtie' if defined?(Rails)
|
8
10
|
|
9
11
|
module Cohesion
|
10
|
-
class Check
|
11
12
|
|
12
|
-
def self.rails_text
|
13
|
-
puts "WARNING - not working yet..."
|
14
|
-
root_path = Rails.root.to_s
|
15
|
-
Dir.glob("**/*").each do |filename|
|
16
|
-
unless File.directory?(filename) || File.binary?(filename) || filename.ends_with?(".rdb")
|
17
|
-
f = File.open(filename, "r")
|
18
|
-
content = f.read()
|
19
|
-
f.close
|
20
|
-
if content =~ /(https?:\/\/[a-zA-Z0-9\.\/\-_%&\?]+)/
|
21
|
-
print "Checking #{$1} "
|
22
|
-
begin
|
23
|
-
status_code = Cobweb.new(:raise_exceptions => true).head($1)[:status_code].to_i
|
24
|
-
if status_code != 200
|
25
|
-
puts " [#{status_code}] \e[31m\u2717\e[0m"
|
26
|
-
else
|
27
|
-
puts "\e[32m\u2713\e[0m"
|
28
|
-
end
|
29
|
-
rescue SocketError
|
30
|
-
status_code = 0
|
31
|
-
puts " [DNS Failed] \e[31m\u2717\e[0m"
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
def self.rails_object
|
39
|
-
puts "WARNING - not working yet..."
|
40
|
-
root_path = Rails.root.to_s
|
41
|
-
#app_name = Rails.application.name
|
42
|
-
#puts "Checking #{app_name}..."
|
43
|
-
app = CobwebSample::Application
|
44
|
-
app.routes.default_url_options = { :host => 'xxx.com' }
|
45
|
-
|
46
|
-
Dir.glob("app/controllers/**/*").each do |filename|
|
47
|
-
controller_name = filename.gsub(".rb","").split("/")[-1].classify
|
48
|
-
unless controller_name == "ApplicationController"
|
49
|
-
puts "Processing #{controller_name}"
|
50
|
-
controller = controller_name.constantize.new
|
51
|
-
|
52
|
-
view = ActionView::Base.new(ActionController::Base.view_paths, {}, controller)
|
53
|
-
|
54
|
-
view.view_paths = ActionController::Base.view_paths
|
55
|
-
view.extend ApplicationHelper
|
56
|
-
view.controller = controller
|
57
|
-
view.class_eval do
|
58
|
-
include ApplicationHelper
|
59
|
-
include app.routes.url_helpers
|
60
|
-
end
|
61
|
-
begin
|
62
|
-
puts view.render(:template => '/tests/index.html.erb')
|
63
|
-
rescue => e
|
64
|
-
puts "Error rendering view: #{e.message}"
|
65
|
-
end
|
66
|
-
end
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
def self.site(url, options={})
|
71
|
-
errors = []
|
72
|
-
failures = []
|
73
|
-
|
74
|
-
pages = {}
|
75
|
-
|
76
|
-
options[:cache] = options[:cache].to_i if options[:cache]
|
77
|
-
crawler_options = {:cache_type => :full, :crawl_linked_external => true, :store_inbound_links => true}.merge(options)
|
78
|
-
|
79
|
-
statistics = CobwebCrawler.new(crawler_options).crawl(url) do |page|
|
80
|
-
print page[:url]
|
81
|
-
|
82
|
-
duplicate = !pages[Digest::MD5.hexdigest(page[:body])].nil?
|
83
|
-
pages[Digest::MD5.hexdigest(page[:body])] = [] unless pages[Digest::MD5.hexdigest(page[:body])]
|
84
|
-
pages[Digest::MD5.hexdigest(page[:body])] << page[:url]
|
85
|
-
|
86
|
-
# if it was a 404 before, just check again not using the cache this time
|
87
|
-
if page[:status_code] == 404
|
88
|
-
page = Cobweb.new(crawler_options.merge(:cache => nil)).get(page[:url])
|
89
|
-
end
|
90
|
-
|
91
|
-
if page[:status_code] == 404 || duplicate
|
92
|
-
if duplicate
|
93
|
-
puts " [duplicate] \e[31m\u2717\e[0m"
|
94
|
-
else
|
95
|
-
puts " [#{page[:status_code]}] \e[31m\u2717\e[0m"
|
96
|
-
end
|
97
|
-
failures << page
|
98
|
-
else
|
99
|
-
puts " \e[32m\u2713\e[0m"
|
100
|
-
end
|
101
|
-
end
|
102
|
-
|
103
|
-
puts statistics.redis.namespace
|
104
|
-
puts statistics.get_statistics
|
105
|
-
|
106
|
-
total_inbound_failures = 0
|
107
|
-
total_failures = 0
|
108
|
-
|
109
|
-
issues = []
|
110
|
-
if failures.count == 0
|
111
|
-
puts "All links working!"
|
112
|
-
else
|
113
|
-
puts "Failed urls:"
|
114
|
-
failures.each do |f|
|
115
|
-
inbound_links = statistics.inbound_links_for(f[:url])
|
116
|
-
issues << {:issue => f, :inbound => inbound_links}
|
117
|
-
|
118
|
-
total_inbound_failures += inbound_links.count
|
119
|
-
total_failures += 1
|
120
|
-
|
121
|
-
puts ""
|
122
|
-
puts "#{f[:url]} [ #{f[:status_code]} ]"
|
123
|
-
inbound_links.each do |inbound_link|
|
124
|
-
puts " - #{inbound_link}"
|
125
|
-
end
|
126
|
-
end
|
127
|
-
|
128
|
-
puts ""
|
129
|
-
puts "Duplicate Content"
|
130
|
-
puts ""
|
131
|
-
pages.select{|k,v| v.count > 1}.each do |k,v|
|
132
|
-
puts "Duplicate: #{k}"
|
133
|
-
v.map{|x| puts " - #{x}" }
|
134
|
-
end
|
135
|
-
|
136
|
-
|
137
|
-
puts ""
|
138
|
-
puts "Total Failed URLs: #{total_failures}"
|
139
|
-
puts "Total Duplicates: #{pages.map{|d| d[1]}.select{|d| d.count > 1}.inject{|total, d| total + d.count}.count}"
|
140
|
-
puts "Total Inbound Failures (Pages linking to a 404): #{total_inbound_failures}"
|
141
|
-
puts ""
|
142
|
-
end
|
143
|
-
puts
|
144
|
-
|
145
|
-
return issues
|
146
|
-
end
|
147
|
-
end
|
148
13
|
end
|
data/lib/cohesion/cache.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
module Cohesion
|
2
|
+
class Cache
|
3
|
+
def self.clear_public
|
4
|
+
Cobweb::Cache.flush_public
|
5
|
+
end
|
6
|
+
def self.clear_crawls
|
7
|
+
Cobweb::Cache.flush_all_private
|
8
|
+
end
|
9
|
+
def self.clear_crawl(crawl_id)
|
10
|
+
Cobweb::Cache.flush_crawl(crawl_id)
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
14
|
+
|
15
|
+
end
|
data/lib/cohesion/check.rb
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
module Cohesion
|
2
|
+
class Check
|
3
|
+
|
4
|
+
def self.rails_text
|
5
|
+
puts "WARNING - not working yet..."
|
6
|
+
root_path = Rails.root.to_s
|
7
|
+
Dir.glob("**/*").each do |filename|
|
8
|
+
unless File.directory?(filename) || File.binary?(filename) || filename.ends_with?(".rdb")
|
9
|
+
f = File.open(filename, "r")
|
10
|
+
content = f.read()
|
11
|
+
f.close
|
12
|
+
if content =~ /(https?:\/\/[a-zA-Z0-9\.\/\-_%&\?]+)/
|
13
|
+
print "Checking #{$1} "
|
14
|
+
begin
|
15
|
+
status_code = Cobweb.new(:raise_exceptions => true).head($1)[:status_code].to_i
|
16
|
+
if status_code != 200
|
17
|
+
puts " [#{status_code}] \e[31m\u2717\e[0m"
|
18
|
+
else
|
19
|
+
puts "\e[32m\u2713\e[0m"
|
20
|
+
end
|
21
|
+
rescue SocketError
|
22
|
+
status_code = 0
|
23
|
+
puts " [DNS Failed] \e[31m\u2717\e[0m"
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def self.rails_object
|
31
|
+
puts "WARNING - not working yet..."
|
32
|
+
root_path = Rails.root.to_s
|
33
|
+
#app_name = Rails.application.name
|
34
|
+
#puts "Checking #{app_name}..."
|
35
|
+
app = CobwebSample::Application
|
36
|
+
app.routes.default_url_options = { :host => 'xxx.com' }
|
37
|
+
|
38
|
+
Dir.glob("app/controllers/**/*").each do |filename|
|
39
|
+
controller_name = filename.gsub(".rb","").split("/")[-1].classify
|
40
|
+
unless controller_name == "ApplicationController"
|
41
|
+
puts "Processing #{controller_name}"
|
42
|
+
controller = controller_name.constantize.new
|
43
|
+
|
44
|
+
view = ActionView::Base.new(ActionController::Base.view_paths, {}, controller)
|
45
|
+
|
46
|
+
view.view_paths = ActionController::Base.view_paths
|
47
|
+
view.extend ApplicationHelper
|
48
|
+
view.controller = controller
|
49
|
+
view.class_eval do
|
50
|
+
include ApplicationHelper
|
51
|
+
include app.routes.url_helpers
|
52
|
+
end
|
53
|
+
begin
|
54
|
+
puts view.render(:template => '/tests/index.html.erb')
|
55
|
+
rescue => e
|
56
|
+
puts "Error rendering view: #{e.message}"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def self.site(url, options={})
|
63
|
+
errors = []
|
64
|
+
failures = []
|
65
|
+
|
66
|
+
pages = {}
|
67
|
+
|
68
|
+
options[:cache] = options[:cache].to_i if options[:cache]
|
69
|
+
crawler_options = {:cache_type => :full, :crawl_linked_external => true, :store_inbound_links => true}.merge(options)
|
70
|
+
|
71
|
+
statistics = CobwebCrawler.new(crawler_options).crawl(url) do |page|
|
72
|
+
print page[:url]
|
73
|
+
|
74
|
+
#duplicate = !pages[Digest::MD5.hexdigest(page[:body])].nil?
|
75
|
+
#pages[Digest::MD5.hexdigest(page[:body])] = [] unless pages[Digest::MD5.hexdigest(page[:body])]
|
76
|
+
#pages[Digest::MD5.hexdigest(page[:body])] << page[:url]
|
77
|
+
|
78
|
+
# if it was a 404 before, just check again not using the cache this time
|
79
|
+
if page[:status_code] == 404
|
80
|
+
page = Cobweb.new(crawler_options.merge(:cache => nil)).get(page[:url])
|
81
|
+
end
|
82
|
+
|
83
|
+
if page[:status_code] == 404 #|| duplicate
|
84
|
+
#if duplicate
|
85
|
+
# puts " [duplicate] \e[31m\u2717\e[0m"
|
86
|
+
#else
|
87
|
+
puts " [#{page[:status_code]}] \e[31m\u2717\e[0m"
|
88
|
+
#end
|
89
|
+
failures << page.select{|k,v| [:url, :status_code].include?(k)}
|
90
|
+
else
|
91
|
+
puts " \e[32m\u2713\e[0m"
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
total_inbound_failures = 0
|
96
|
+
total_failures = 0
|
97
|
+
|
98
|
+
issues = {:missing => [], :duplicate => []}#pages.select{|k,v| v.count > 1}}
|
99
|
+
if failures.count > 0
|
100
|
+
puts "Failed urls:"
|
101
|
+
failures.each do |f|
|
102
|
+
inbound_links = statistics.inbound_links_for(f[:url])
|
103
|
+
issues[:missing] << {:issue => f, :inbound => inbound_links}
|
104
|
+
|
105
|
+
total_inbound_failures += inbound_links.count
|
106
|
+
total_failures += 1
|
107
|
+
|
108
|
+
puts ""
|
109
|
+
puts "#{f[:url]} [ #{f[:status_code]} ]"
|
110
|
+
inbound_links.each do |inbound_link|
|
111
|
+
puts " - #{inbound_link}"
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
duplicate_page_count = 0#pages.map{|d| d[1].count}.select{|count| count > 1}.reduce(:+).to_i
|
117
|
+
if duplicate_page_count > 0
|
118
|
+
|
119
|
+
puts ""
|
120
|
+
puts "Duplicate Content"
|
121
|
+
puts ""
|
122
|
+
pages.select{|k,v| v.count > 1}.each do |k,v|
|
123
|
+
puts "Duplicate: #{k}"
|
124
|
+
v.map{|x| puts " - #{x}" }
|
125
|
+
end
|
126
|
+
|
127
|
+
end
|
128
|
+
|
129
|
+
puts ""
|
130
|
+
puts "Total Failed URLs: #{total_failures}"
|
131
|
+
#puts "Total Duplicates: #{duplicate_page_count}"
|
132
|
+
puts "Total Inbound Failures (Pages linking to a 404): #{total_inbound_failures}"
|
133
|
+
puts ""
|
134
|
+
|
135
|
+
return issues
|
136
|
+
end
|
137
|
+
end
|
138
|
+
end
|
data/lib/cohesion/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cohesion
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.2
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stewart McKee
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-12-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cobweb
|
@@ -59,6 +59,8 @@ files:
|
|
59
59
|
- bin/cohesion
|
60
60
|
- cohesion.gemspec
|
61
61
|
- lib/cohesion.rb
|
62
|
+
- lib/cohesion/cache.rb
|
63
|
+
- lib/cohesion/check.rb
|
62
64
|
- lib/cohesion/railtie.rb
|
63
65
|
- lib/cohesion/version.rb
|
64
66
|
- lib/tasks/cohesion.rake
|