cohesion 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- YjRmZDJmNTNjNDJmYjRlODcxNDA1MWQ4YTgzYmNlOWM0NWEzMDRiMw==
4
+ MmZhYjJjNDYwY2FlYmU0MjljNDg0M2JkYWUyMmJhMWQ3MjQ3MzQ4Yw==
5
5
  data.tar.gz: !binary |-
6
- YWM0M2I0NGNhNjQwMWVlYWU1ZGNlNmQ2MmE1N2NiYWE1OWY5YjYyYw==
6
+ ODRmNzUwNjRjMjAxZmQ0YmI4NTg1ODEyZmY2ODQ0NWI4YjQ3MWIyMw==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- NTZhODQ4ZDYyYzU2Yjg4ODZiZDUxMmZjYzliZjg1ZDk3YTlhOTg2ZGNlYzZm
10
- ZDNkNzg1YmYxNzI3M2IzMGQ2OGZjMzgzNGVhMjNjYTQyMGNiMjc3YzQ3NmFk
11
- OWFiMTI5YjhlMGM4YzVkMWJjMjZjZTNjZWMyMThlZjhiZjFiZjc=
9
+ NGU5Yjg3YjQ4ZmQyYmNlYzAwMDQ4MGY3NjYyZmY2ZDA1MTc5ZmU3M2ZmMTQ4
10
+ OGNiOTFmMmI0YjI0NDRmYmMwYzE3NzQ4NGMyNDAwODc4ODBmYWI4YjhhMTRl
11
+ YWI1Y2ZmMjFmMzExOGNkNmQ5NWU2Yjk2YzliMjVjYjZmN2U4ZTc=
12
12
  data.tar.gz: !binary |-
13
- ZmY0ZGFjODA4YzcyZmZkYzJiZDcxYWEwZmI2NzFhYzBjY2YyMTRjMjMzNmE0
14
- NGJiZjkwMzE1NTBlZmVlZjI0ZDY0NjhkNDU5NzJlYjY3YmRhODY5ZjgyNWRi
15
- MjVlNTQ1ZTg2Y2FkYzRmZGFjNzgyY2IwOTU0ZGEwYjYyYWE4YTU=
13
+ ZWZjMTU3YzRmMzc1M2YyNjA3OGNmZDhhNzk1ODhiNzE0N2IyZGM2ZThkMjIw
14
+ ZjZlM2EzZDE4Y2QwNmYwOTAzOGQxNGRjOTIwMDk2NGVjNzljZDQ3ZGI0YjZi
15
+ N2EyMDc5YWUwOTdjMGYyMjMyYjQ4NDg3YjhlMTA5MDVmNWYxYTA=
@@ -25,36 +25,40 @@ opts = Slop.parse(:help => true) do
25
25
  on 'output=', 'Path to output data to'
26
26
  on 'output_format=', "Output format, csv or json"
27
27
 
28
+ on 'c', 'clearcache', "Clear the cache"
28
29
  on 'v', 'verbose', 'Display crawl information'
29
30
  on 'd', 'debug', 'Display debug information'
30
31
  on 'w', 'web_statistics', 'Start web stats server'
31
32
  end
32
33
 
34
+ if opts[:clearcache]
35
+ Cohesion::Cache.clear
36
+ end
33
37
  if opts[:url]
34
38
 
35
39
  options = opts.to_hash.delete_if { |k, v| v.nil? || k == :url}
36
40
 
41
+ options[:output_format] = "json" unless options.has_key? :output_format
37
42
  options[:seed_urls] = File.open(options[:seed_urls][0]).read.split("\n") if options[:seed_urls] && File.exists?(options[:seed_urls][0])
38
43
  options[:internal_urls] = File.open(options[:internal_urls][0]).read.split("\n") if options[:internal_urls] && File.exists?(options[:internal_urls][0])
39
44
  options[:external_urls] = File.open(options[:external_urls][0]).read.split("\n") if options[:external_urls] && File.exists?(options[:external_urls][0])
40
45
 
41
46
  failures = Cohesion::Check.site(opts[:url], options)
42
- if failures.count == 0
47
+ if failures[:missing].count == 0
43
48
  exit(true)
44
49
  else
45
- if opts[:output]
50
+ if options[:output]
46
51
  output = []
47
- failures.each do |failure|
52
+ failures[:missing].each do |failure|
48
53
  output << {:error_page => failure[:issue][:url], :inbound_links => failure[:inbound]}
49
54
  end
50
55
 
51
- opts[:output_format] = "json" unless opts[:output_format]
52
- if opts[:output_format] == "json"
53
- File.open(opts[:output], 'w') do |f|
56
+ if options[:output_format] == "json"
57
+ File.open(options[:output], 'w') do |f|
54
58
  f.write output.to_json
55
59
  end
56
- elsif opts[:output_format] == "csv"
57
- CSV.open(opts[:output], "wb") do |csv|
60
+ elsif options[:output_format] == "csv"
61
+ CSV.open("missing_#{options[:output]}", "wb") do |csv|
58
62
  csv << ["404 Url", "Page that contains link"]
59
63
  output.each do |line|
60
64
  line[:inbound_links].each do |link|
@@ -62,10 +66,18 @@ if opts[:url]
62
66
  end
63
67
  end
64
68
  end
69
+ CSV.open("duplicate_#{options[:output]}", "wb") do |csv|
70
+ csv << ["Hash of Content", "Pages with duplicate content"]
71
+ failures[:duplicate].each do |md5, pages|
72
+ pages.each do |link|
73
+ csv << [md5, link]
74
+ end
75
+ end
76
+ end
65
77
  end
66
78
  end
67
79
  exit(false)
68
80
  end
69
81
  else
70
- puts
82
+ puts opts
71
83
  end
@@ -4,145 +4,10 @@ require 'cobweb'
4
4
  require 'ptools'
5
5
  require 'digest/md5'
6
6
 
7
+ require 'cohesion/check'
8
+ require 'cohesion/cache'
7
9
  require 'cohesion/railtie' if defined?(Rails)
8
10
 
9
11
  module Cohesion
10
- class Check
11
12
 
12
- def self.rails_text
13
- puts "WARNING - not working yet..."
14
- root_path = Rails.root.to_s
15
- Dir.glob("**/*").each do |filename|
16
- unless File.directory?(filename) || File.binary?(filename) || filename.ends_with?(".rdb")
17
- f = File.open(filename, "r")
18
- content = f.read()
19
- f.close
20
- if content =~ /(https?:\/\/[a-zA-Z0-9\.\/\-_%&\?]+)/
21
- print "Checking #{$1} "
22
- begin
23
- status_code = Cobweb.new(:raise_exceptions => true).head($1)[:status_code].to_i
24
- if status_code != 200
25
- puts " [#{status_code}] \e[31m\u2717\e[0m"
26
- else
27
- puts "\e[32m\u2713\e[0m"
28
- end
29
- rescue SocketError
30
- status_code = 0
31
- puts " [DNS Failed] \e[31m\u2717\e[0m"
32
- end
33
- end
34
- end
35
- end
36
- end
37
-
38
- def self.rails_object
39
- puts "WARNING - not working yet..."
40
- root_path = Rails.root.to_s
41
- #app_name = Rails.application.name
42
- #puts "Checking #{app_name}..."
43
- app = CobwebSample::Application
44
- app.routes.default_url_options = { :host => 'xxx.com' }
45
-
46
- Dir.glob("app/controllers/**/*").each do |filename|
47
- controller_name = filename.gsub(".rb","").split("/")[-1].classify
48
- unless controller_name == "ApplicationController"
49
- puts "Processing #{controller_name}"
50
- controller = controller_name.constantize.new
51
-
52
- view = ActionView::Base.new(ActionController::Base.view_paths, {}, controller)
53
-
54
- view.view_paths = ActionController::Base.view_paths
55
- view.extend ApplicationHelper
56
- view.controller = controller
57
- view.class_eval do
58
- include ApplicationHelper
59
- include app.routes.url_helpers
60
- end
61
- begin
62
- puts view.render(:template => '/tests/index.html.erb')
63
- rescue => e
64
- puts "Error rendering view: #{e.message}"
65
- end
66
- end
67
- end
68
- end
69
-
70
- def self.site(url, options={})
71
- errors = []
72
- failures = []
73
-
74
- pages = {}
75
-
76
- options[:cache] = options[:cache].to_i if options[:cache]
77
- crawler_options = {:cache_type => :full, :crawl_linked_external => true, :store_inbound_links => true}.merge(options)
78
-
79
- statistics = CobwebCrawler.new(crawler_options).crawl(url) do |page|
80
- print page[:url]
81
-
82
- duplicate = !pages[Digest::MD5.hexdigest(page[:body])].nil?
83
- pages[Digest::MD5.hexdigest(page[:body])] = [] unless pages[Digest::MD5.hexdigest(page[:body])]
84
- pages[Digest::MD5.hexdigest(page[:body])] << page[:url]
85
-
86
- # if it was a 404 before, just check again not using the cache this time
87
- if page[:status_code] == 404
88
- page = Cobweb.new(crawler_options.merge(:cache => nil)).get(page[:url])
89
- end
90
-
91
- if page[:status_code] == 404 || duplicate
92
- if duplicate
93
- puts " [duplicate] \e[31m\u2717\e[0m"
94
- else
95
- puts " [#{page[:status_code]}] \e[31m\u2717\e[0m"
96
- end
97
- failures << page
98
- else
99
- puts " \e[32m\u2713\e[0m"
100
- end
101
- end
102
-
103
- puts statistics.redis.namespace
104
- puts statistics.get_statistics
105
-
106
- total_inbound_failures = 0
107
- total_failures = 0
108
-
109
- issues = []
110
- if failures.count == 0
111
- puts "All links working!"
112
- else
113
- puts "Failed urls:"
114
- failures.each do |f|
115
- inbound_links = statistics.inbound_links_for(f[:url])
116
- issues << {:issue => f, :inbound => inbound_links}
117
-
118
- total_inbound_failures += inbound_links.count
119
- total_failures += 1
120
-
121
- puts ""
122
- puts "#{f[:url]} [ #{f[:status_code]} ]"
123
- inbound_links.each do |inbound_link|
124
- puts " - #{inbound_link}"
125
- end
126
- end
127
-
128
- puts ""
129
- puts "Duplicate Content"
130
- puts ""
131
- pages.select{|k,v| v.count > 1}.each do |k,v|
132
- puts "Duplicate: #{k}"
133
- v.map{|x| puts " - #{x}" }
134
- end
135
-
136
-
137
- puts ""
138
- puts "Total Failed URLs: #{total_failures}"
139
- puts "Total Duplicates: #{pages.map{|d| d[1]}.select{|d| d.count > 1}.inject{|total, d| total + d.count}.count}"
140
- puts "Total Inbound Failures (Pages linking to a 404): #{total_inbound_failures}"
141
- puts ""
142
- end
143
- puts
144
-
145
- return issues
146
- end
147
- end
148
13
  end
@@ -0,0 +1,15 @@
1
+ module Cohesion
2
+ class Cache
3
+ def self.clear_public
4
+ Cobweb::Cache.flush_public
5
+ end
6
+ def self.clear_crawls
7
+ Cobweb::Cache.flush_all_private
8
+ end
9
+ def self.clear_crawl(crawl_id)
10
+ Cobweb::Cache.flush_crawl(crawl_id)
11
+ end
12
+
13
+ end
14
+
15
+ end
@@ -0,0 +1,138 @@
1
+ module Cohesion
2
+ class Check
3
+
4
+ def self.rails_text
5
+ puts "WARNING - not working yet..."
6
+ root_path = Rails.root.to_s
7
+ Dir.glob("**/*").each do |filename|
8
+ unless File.directory?(filename) || File.binary?(filename) || filename.ends_with?(".rdb")
9
+ f = File.open(filename, "r")
10
+ content = f.read()
11
+ f.close
12
+ if content =~ /(https?:\/\/[a-zA-Z0-9\.\/\-_%&\?]+)/
13
+ print "Checking #{$1} "
14
+ begin
15
+ status_code = Cobweb.new(:raise_exceptions => true).head($1)[:status_code].to_i
16
+ if status_code != 200
17
+ puts " [#{status_code}] \e[31m\u2717\e[0m"
18
+ else
19
+ puts "\e[32m\u2713\e[0m"
20
+ end
21
+ rescue SocketError
22
+ status_code = 0
23
+ puts " [DNS Failed] \e[31m\u2717\e[0m"
24
+ end
25
+ end
26
+ end
27
+ end
28
+ end
29
+
30
+ def self.rails_object
31
+ puts "WARNING - not working yet..."
32
+ root_path = Rails.root.to_s
33
+ #app_name = Rails.application.name
34
+ #puts "Checking #{app_name}..."
35
+ app = CobwebSample::Application
36
+ app.routes.default_url_options = { :host => 'xxx.com' }
37
+
38
+ Dir.glob("app/controllers/**/*").each do |filename|
39
+ controller_name = filename.gsub(".rb","").split("/")[-1].classify
40
+ unless controller_name == "ApplicationController"
41
+ puts "Processing #{controller_name}"
42
+ controller = controller_name.constantize.new
43
+
44
+ view = ActionView::Base.new(ActionController::Base.view_paths, {}, controller)
45
+
46
+ view.view_paths = ActionController::Base.view_paths
47
+ view.extend ApplicationHelper
48
+ view.controller = controller
49
+ view.class_eval do
50
+ include ApplicationHelper
51
+ include app.routes.url_helpers
52
+ end
53
+ begin
54
+ puts view.render(:template => '/tests/index.html.erb')
55
+ rescue => e
56
+ puts "Error rendering view: #{e.message}"
57
+ end
58
+ end
59
+ end
60
+ end
61
+
62
+ def self.site(url, options={})
63
+ errors = []
64
+ failures = []
65
+
66
+ pages = {}
67
+
68
+ options[:cache] = options[:cache].to_i if options[:cache]
69
+ crawler_options = {:cache_type => :full, :crawl_linked_external => true, :store_inbound_links => true}.merge(options)
70
+
71
+ statistics = CobwebCrawler.new(crawler_options).crawl(url) do |page|
72
+ print page[:url]
73
+
74
+ #duplicate = !pages[Digest::MD5.hexdigest(page[:body])].nil?
75
+ #pages[Digest::MD5.hexdigest(page[:body])] = [] unless pages[Digest::MD5.hexdigest(page[:body])]
76
+ #pages[Digest::MD5.hexdigest(page[:body])] << page[:url]
77
+
78
+ # if it was a 404 before, just check again not using the cache this time
79
+ if page[:status_code] == 404
80
+ page = Cobweb.new(crawler_options.merge(:cache => nil)).get(page[:url])
81
+ end
82
+
83
+ if page[:status_code] == 404 #|| duplicate
84
+ #if duplicate
85
+ # puts " [duplicate] \e[31m\u2717\e[0m"
86
+ #else
87
+ puts " [#{page[:status_code]}] \e[31m\u2717\e[0m"
88
+ #end
89
+ failures << page.select{|k,v| [:url, :status_code].include?(k)}
90
+ else
91
+ puts " \e[32m\u2713\e[0m"
92
+ end
93
+ end
94
+
95
+ total_inbound_failures = 0
96
+ total_failures = 0
97
+
98
+ issues = {:missing => [], :duplicate => []}#pages.select{|k,v| v.count > 1}}
99
+ if failures.count > 0
100
+ puts "Failed urls:"
101
+ failures.each do |f|
102
+ inbound_links = statistics.inbound_links_for(f[:url])
103
+ issues[:missing] << {:issue => f, :inbound => inbound_links}
104
+
105
+ total_inbound_failures += inbound_links.count
106
+ total_failures += 1
107
+
108
+ puts ""
109
+ puts "#{f[:url]} [ #{f[:status_code]} ]"
110
+ inbound_links.each do |inbound_link|
111
+ puts " - #{inbound_link}"
112
+ end
113
+ end
114
+ end
115
+
116
+ duplicate_page_count = 0#pages.map{|d| d[1].count}.select{|count| count > 1}.reduce(:+).to_i
117
+ if duplicate_page_count > 0
118
+
119
+ puts ""
120
+ puts "Duplicate Content"
121
+ puts ""
122
+ pages.select{|k,v| v.count > 1}.each do |k,v|
123
+ puts "Duplicate: #{k}"
124
+ v.map{|x| puts " - #{x}" }
125
+ end
126
+
127
+ end
128
+
129
+ puts ""
130
+ puts "Total Failed URLs: #{total_failures}"
131
+ #puts "Total Duplicates: #{duplicate_page_count}"
132
+ puts "Total Inbound Failures (Pages linking to a 404): #{total_inbound_failures}"
133
+ puts ""
134
+
135
+ return issues
136
+ end
137
+ end
138
+ end
@@ -1,3 +1,3 @@
1
1
  module Cohesion
2
- VERSION = "1.0.2"
2
+ VERSION = "1.0.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cohesion
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stewart McKee
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-08-23 00:00:00.000000000 Z
11
+ date: 2013-12-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: cobweb
@@ -59,6 +59,8 @@ files:
59
59
  - bin/cohesion
60
60
  - cohesion.gemspec
61
61
  - lib/cohesion.rb
62
+ - lib/cohesion/cache.rb
63
+ - lib/cohesion/check.rb
62
64
  - lib/cohesion/railtie.rb
63
65
  - lib/cohesion/version.rb
64
66
  - lib/tasks/cohesion.rake