cohesion 1.0.2 → 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- YjRmZDJmNTNjNDJmYjRlODcxNDA1MWQ4YTgzYmNlOWM0NWEzMDRiMw==
4
+ MmZhYjJjNDYwY2FlYmU0MjljNDg0M2JkYWUyMmJhMWQ3MjQ3MzQ4Yw==
5
5
  data.tar.gz: !binary |-
6
- YWM0M2I0NGNhNjQwMWVlYWU1ZGNlNmQ2MmE1N2NiYWE1OWY5YjYyYw==
6
+ ODRmNzUwNjRjMjAxZmQ0YmI4NTg1ODEyZmY2ODQ0NWI4YjQ3MWIyMw==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- NTZhODQ4ZDYyYzU2Yjg4ODZiZDUxMmZjYzliZjg1ZDk3YTlhOTg2ZGNlYzZm
10
- ZDNkNzg1YmYxNzI3M2IzMGQ2OGZjMzgzNGVhMjNjYTQyMGNiMjc3YzQ3NmFk
11
- OWFiMTI5YjhlMGM4YzVkMWJjMjZjZTNjZWMyMThlZjhiZjFiZjc=
9
+ NGU5Yjg3YjQ4ZmQyYmNlYzAwMDQ4MGY3NjYyZmY2ZDA1MTc5ZmU3M2ZmMTQ4
10
+ OGNiOTFmMmI0YjI0NDRmYmMwYzE3NzQ4NGMyNDAwODc4ODBmYWI4YjhhMTRl
11
+ YWI1Y2ZmMjFmMzExOGNkNmQ5NWU2Yjk2YzliMjVjYjZmN2U4ZTc=
12
12
  data.tar.gz: !binary |-
13
- ZmY0ZGFjODA4YzcyZmZkYzJiZDcxYWEwZmI2NzFhYzBjY2YyMTRjMjMzNmE0
14
- NGJiZjkwMzE1NTBlZmVlZjI0ZDY0NjhkNDU5NzJlYjY3YmRhODY5ZjgyNWRi
15
- MjVlNTQ1ZTg2Y2FkYzRmZGFjNzgyY2IwOTU0ZGEwYjYyYWE4YTU=
13
+ ZWZjMTU3YzRmMzc1M2YyNjA3OGNmZDhhNzk1ODhiNzE0N2IyZGM2ZThkMjIw
14
+ ZjZlM2EzZDE4Y2QwNmYwOTAzOGQxNGRjOTIwMDk2NGVjNzljZDQ3ZGI0YjZi
15
+ N2EyMDc5YWUwOTdjMGYyMjMyYjQ4NDg3YjhlMTA5MDVmNWYxYTA=
@@ -25,36 +25,40 @@ opts = Slop.parse(:help => true) do
25
25
  on 'output=', 'Path to output data to'
26
26
  on 'output_format=', "Output format, csv or json"
27
27
 
28
+ on 'c', 'clearcache', "Clear the cache"
28
29
  on 'v', 'verbose', 'Display crawl information'
29
30
  on 'd', 'debug', 'Display debug information'
30
31
  on 'w', 'web_statistics', 'Start web stats server'
31
32
  end
32
33
 
34
+ if opts[:clearcache]
35
+ Cohesion::Cache.clear
36
+ end
33
37
  if opts[:url]
34
38
 
35
39
  options = opts.to_hash.delete_if { |k, v| v.nil? || k == :url}
36
40
 
41
+ options[:output_format] = "json" unless options.has_key? :output_format
37
42
  options[:seed_urls] = File.open(options[:seed_urls][0]).read.split("\n") if options[:seed_urls] && File.exists?(options[:seed_urls][0])
38
43
  options[:internal_urls] = File.open(options[:internal_urls][0]).read.split("\n") if options[:internal_urls] && File.exists?(options[:internal_urls][0])
39
44
  options[:external_urls] = File.open(options[:external_urls][0]).read.split("\n") if options[:external_urls] && File.exists?(options[:external_urls][0])
40
45
 
41
46
  failures = Cohesion::Check.site(opts[:url], options)
42
- if failures.count == 0
47
+ if failures[:missing].count == 0
43
48
  exit(true)
44
49
  else
45
- if opts[:output]
50
+ if options[:output]
46
51
  output = []
47
- failures.each do |failure|
52
+ failures[:missing].each do |failure|
48
53
  output << {:error_page => failure[:issue][:url], :inbound_links => failure[:inbound]}
49
54
  end
50
55
 
51
- opts[:output_format] = "json" unless opts[:output_format]
52
- if opts[:output_format] == "json"
53
- File.open(opts[:output], 'w') do |f|
56
+ if options[:output_format] == "json"
57
+ File.open(options[:output], 'w') do |f|
54
58
  f.write output.to_json
55
59
  end
56
- elsif opts[:output_format] == "csv"
57
- CSV.open(opts[:output], "wb") do |csv|
60
+ elsif options[:output_format] == "csv"
61
+ CSV.open("missing_#{options[:output]}", "wb") do |csv|
58
62
  csv << ["404 Url", "Page that contains link"]
59
63
  output.each do |line|
60
64
  line[:inbound_links].each do |link|
@@ -62,10 +66,18 @@ if opts[:url]
62
66
  end
63
67
  end
64
68
  end
69
+ CSV.open("duplicate_#{options[:output]}", "wb") do |csv|
70
+ csv << ["Hash of Content", "Pages with duplicate content"]
71
+ failures[:duplicate].each do |md5, pages|
72
+ pages.each do |link|
73
+ csv << [md5, link]
74
+ end
75
+ end
76
+ end
65
77
  end
66
78
  end
67
79
  exit(false)
68
80
  end
69
81
  else
70
- puts
82
+ puts opts
71
83
  end
@@ -4,145 +4,10 @@ require 'cobweb'
4
4
  require 'ptools'
5
5
  require 'digest/md5'
6
6
 
7
+ require 'cohesion/check'
8
+ require 'cohesion/cache'
7
9
  require 'cohesion/railtie' if defined?(Rails)
8
10
 
9
11
  module Cohesion
10
- class Check
11
12
 
12
- def self.rails_text
13
- puts "WARNING - not working yet..."
14
- root_path = Rails.root.to_s
15
- Dir.glob("**/*").each do |filename|
16
- unless File.directory?(filename) || File.binary?(filename) || filename.ends_with?(".rdb")
17
- f = File.open(filename, "r")
18
- content = f.read()
19
- f.close
20
- if content =~ /(https?:\/\/[a-zA-Z0-9\.\/\-_%&\?]+)/
21
- print "Checking #{$1} "
22
- begin
23
- status_code = Cobweb.new(:raise_exceptions => true).head($1)[:status_code].to_i
24
- if status_code != 200
25
- puts " [#{status_code}] \e[31m\u2717\e[0m"
26
- else
27
- puts "\e[32m\u2713\e[0m"
28
- end
29
- rescue SocketError
30
- status_code = 0
31
- puts " [DNS Failed] \e[31m\u2717\e[0m"
32
- end
33
- end
34
- end
35
- end
36
- end
37
-
38
- def self.rails_object
39
- puts "WARNING - not working yet..."
40
- root_path = Rails.root.to_s
41
- #app_name = Rails.application.name
42
- #puts "Checking #{app_name}..."
43
- app = CobwebSample::Application
44
- app.routes.default_url_options = { :host => 'xxx.com' }
45
-
46
- Dir.glob("app/controllers/**/*").each do |filename|
47
- controller_name = filename.gsub(".rb","").split("/")[-1].classify
48
- unless controller_name == "ApplicationController"
49
- puts "Processing #{controller_name}"
50
- controller = controller_name.constantize.new
51
-
52
- view = ActionView::Base.new(ActionController::Base.view_paths, {}, controller)
53
-
54
- view.view_paths = ActionController::Base.view_paths
55
- view.extend ApplicationHelper
56
- view.controller = controller
57
- view.class_eval do
58
- include ApplicationHelper
59
- include app.routes.url_helpers
60
- end
61
- begin
62
- puts view.render(:template => '/tests/index.html.erb')
63
- rescue => e
64
- puts "Error rendering view: #{e.message}"
65
- end
66
- end
67
- end
68
- end
69
-
70
- def self.site(url, options={})
71
- errors = []
72
- failures = []
73
-
74
- pages = {}
75
-
76
- options[:cache] = options[:cache].to_i if options[:cache]
77
- crawler_options = {:cache_type => :full, :crawl_linked_external => true, :store_inbound_links => true}.merge(options)
78
-
79
- statistics = CobwebCrawler.new(crawler_options).crawl(url) do |page|
80
- print page[:url]
81
-
82
- duplicate = !pages[Digest::MD5.hexdigest(page[:body])].nil?
83
- pages[Digest::MD5.hexdigest(page[:body])] = [] unless pages[Digest::MD5.hexdigest(page[:body])]
84
- pages[Digest::MD5.hexdigest(page[:body])] << page[:url]
85
-
86
- # if it was a 404 before, just check again not using the cache this time
87
- if page[:status_code] == 404
88
- page = Cobweb.new(crawler_options.merge(:cache => nil)).get(page[:url])
89
- end
90
-
91
- if page[:status_code] == 404 || duplicate
92
- if duplicate
93
- puts " [duplicate] \e[31m\u2717\e[0m"
94
- else
95
- puts " [#{page[:status_code]}] \e[31m\u2717\e[0m"
96
- end
97
- failures << page
98
- else
99
- puts " \e[32m\u2713\e[0m"
100
- end
101
- end
102
-
103
- puts statistics.redis.namespace
104
- puts statistics.get_statistics
105
-
106
- total_inbound_failures = 0
107
- total_failures = 0
108
-
109
- issues = []
110
- if failures.count == 0
111
- puts "All links working!"
112
- else
113
- puts "Failed urls:"
114
- failures.each do |f|
115
- inbound_links = statistics.inbound_links_for(f[:url])
116
- issues << {:issue => f, :inbound => inbound_links}
117
-
118
- total_inbound_failures += inbound_links.count
119
- total_failures += 1
120
-
121
- puts ""
122
- puts "#{f[:url]} [ #{f[:status_code]} ]"
123
- inbound_links.each do |inbound_link|
124
- puts " - #{inbound_link}"
125
- end
126
- end
127
-
128
- puts ""
129
- puts "Duplicate Content"
130
- puts ""
131
- pages.select{|k,v| v.count > 1}.each do |k,v|
132
- puts "Duplicate: #{k}"
133
- v.map{|x| puts " - #{x}" }
134
- end
135
-
136
-
137
- puts ""
138
- puts "Total Failed URLs: #{total_failures}"
139
- puts "Total Duplicates: #{pages.map{|d| d[1]}.select{|d| d.count > 1}.inject{|total, d| total + d.count}.count}"
140
- puts "Total Inbound Failures (Pages linking to a 404): #{total_inbound_failures}"
141
- puts ""
142
- end
143
- puts
144
-
145
- return issues
146
- end
147
- end
148
13
  end
@@ -0,0 +1,15 @@
1
+ module Cohesion
2
+ class Cache
3
+ def self.clear_public
4
+ Cobweb::Cache.flush_public
5
+ end
6
+ def self.clear_crawls
7
+ Cobweb::Cache.flush_all_private
8
+ end
9
+ def self.clear_crawl(crawl_id)
10
+ Cobweb::Cache.flush_crawl(crawl_id)
11
+ end
12
+
13
+ end
14
+
15
+ end
@@ -0,0 +1,138 @@
1
+ module Cohesion
2
+ class Check
3
+
4
+ def self.rails_text
5
+ puts "WARNING - not working yet..."
6
+ root_path = Rails.root.to_s
7
+ Dir.glob("**/*").each do |filename|
8
+ unless File.directory?(filename) || File.binary?(filename) || filename.ends_with?(".rdb")
9
+ f = File.open(filename, "r")
10
+ content = f.read()
11
+ f.close
12
+ if content =~ /(https?:\/\/[a-zA-Z0-9\.\/\-_%&\?]+)/
13
+ print "Checking #{$1} "
14
+ begin
15
+ status_code = Cobweb.new(:raise_exceptions => true).head($1)[:status_code].to_i
16
+ if status_code != 200
17
+ puts " [#{status_code}] \e[31m\u2717\e[0m"
18
+ else
19
+ puts "\e[32m\u2713\e[0m"
20
+ end
21
+ rescue SocketError
22
+ status_code = 0
23
+ puts " [DNS Failed] \e[31m\u2717\e[0m"
24
+ end
25
+ end
26
+ end
27
+ end
28
+ end
29
+
30
+ def self.rails_object
31
+ puts "WARNING - not working yet..."
32
+ root_path = Rails.root.to_s
33
+ #app_name = Rails.application.name
34
+ #puts "Checking #{app_name}..."
35
+ app = CobwebSample::Application
36
+ app.routes.default_url_options = { :host => 'xxx.com' }
37
+
38
+ Dir.glob("app/controllers/**/*").each do |filename|
39
+ controller_name = filename.gsub(".rb","").split("/")[-1].classify
40
+ unless controller_name == "ApplicationController"
41
+ puts "Processing #{controller_name}"
42
+ controller = controller_name.constantize.new
43
+
44
+ view = ActionView::Base.new(ActionController::Base.view_paths, {}, controller)
45
+
46
+ view.view_paths = ActionController::Base.view_paths
47
+ view.extend ApplicationHelper
48
+ view.controller = controller
49
+ view.class_eval do
50
+ include ApplicationHelper
51
+ include app.routes.url_helpers
52
+ end
53
+ begin
54
+ puts view.render(:template => '/tests/index.html.erb')
55
+ rescue => e
56
+ puts "Error rendering view: #{e.message}"
57
+ end
58
+ end
59
+ end
60
+ end
61
+
62
+ def self.site(url, options={})
63
+ errors = []
64
+ failures = []
65
+
66
+ pages = {}
67
+
68
+ options[:cache] = options[:cache].to_i if options[:cache]
69
+ crawler_options = {:cache_type => :full, :crawl_linked_external => true, :store_inbound_links => true}.merge(options)
70
+
71
+ statistics = CobwebCrawler.new(crawler_options).crawl(url) do |page|
72
+ print page[:url]
73
+
74
+ #duplicate = !pages[Digest::MD5.hexdigest(page[:body])].nil?
75
+ #pages[Digest::MD5.hexdigest(page[:body])] = [] unless pages[Digest::MD5.hexdigest(page[:body])]
76
+ #pages[Digest::MD5.hexdigest(page[:body])] << page[:url]
77
+
78
+ # if it was a 404 before, just check again not using the cache this time
79
+ if page[:status_code] == 404
80
+ page = Cobweb.new(crawler_options.merge(:cache => nil)).get(page[:url])
81
+ end
82
+
83
+ if page[:status_code] == 404 #|| duplicate
84
+ #if duplicate
85
+ # puts " [duplicate] \e[31m\u2717\e[0m"
86
+ #else
87
+ puts " [#{page[:status_code]}] \e[31m\u2717\e[0m"
88
+ #end
89
+ failures << page.select{|k,v| [:url, :status_code].include?(k)}
90
+ else
91
+ puts " \e[32m\u2713\e[0m"
92
+ end
93
+ end
94
+
95
+ total_inbound_failures = 0
96
+ total_failures = 0
97
+
98
+ issues = {:missing => [], :duplicate => []}#pages.select{|k,v| v.count > 1}}
99
+ if failures.count > 0
100
+ puts "Failed urls:"
101
+ failures.each do |f|
102
+ inbound_links = statistics.inbound_links_for(f[:url])
103
+ issues[:missing] << {:issue => f, :inbound => inbound_links}
104
+
105
+ total_inbound_failures += inbound_links.count
106
+ total_failures += 1
107
+
108
+ puts ""
109
+ puts "#{f[:url]} [ #{f[:status_code]} ]"
110
+ inbound_links.each do |inbound_link|
111
+ puts " - #{inbound_link}"
112
+ end
113
+ end
114
+ end
115
+
116
+ duplicate_page_count = 0#pages.map{|d| d[1].count}.select{|count| count > 1}.reduce(:+).to_i
117
+ if duplicate_page_count > 0
118
+
119
+ puts ""
120
+ puts "Duplicate Content"
121
+ puts ""
122
+ pages.select{|k,v| v.count > 1}.each do |k,v|
123
+ puts "Duplicate: #{k}"
124
+ v.map{|x| puts " - #{x}" }
125
+ end
126
+
127
+ end
128
+
129
+ puts ""
130
+ puts "Total Failed URLs: #{total_failures}"
131
+ #puts "Total Duplicates: #{duplicate_page_count}"
132
+ puts "Total Inbound Failures (Pages linking to a 404): #{total_inbound_failures}"
133
+ puts ""
134
+
135
+ return issues
136
+ end
137
+ end
138
+ end
@@ -1,3 +1,3 @@
1
1
  module Cohesion
2
- VERSION = "1.0.2"
2
+ VERSION = "1.0.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cohesion
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stewart McKee
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-08-23 00:00:00.000000000 Z
11
+ date: 2013-12-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: cobweb
@@ -59,6 +59,8 @@ files:
59
59
  - bin/cohesion
60
60
  - cohesion.gemspec
61
61
  - lib/cohesion.rb
62
+ - lib/cohesion/cache.rb
63
+ - lib/cohesion/check.rb
62
64
  - lib/cohesion/railtie.rb
63
65
  - lib/cohesion/version.rb
64
66
  - lib/tasks/cohesion.rake