cohesion 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- NDQwNDM2N2Y4ZDczY2ZmYWRhMDliMTE0MDIxMDcxN2Y1ZDEzNDhlYQ==
4
+ YTAwMDdjZWJkOGNiMjZhZTU4YjNlYWE3Mzc1NGJlODFkNjc0MmM4Mw==
5
5
  data.tar.gz: !binary |-
6
- OWM0ODU0OWNiNjFjODcwNGUwMTk4ZGIyY2ZhZWY0OTA0NWI1Y2FiNA==
6
+ OGYxMWU0YjdjYzc4ZmU3Y2YxNzg2NzJjYTRmN2UwZGIwMWFkYWNjMg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- OGFlOGY1MDg0MjFlYmViMTMwNzZiMmM2NDIzMGY3MjEyMTFmZDkwZjA4NjVk
10
- NmExNDc5ODhhNTVkZDBjYjZkOTgwOGJhMjlhY2M0YzMzMWE4OTk0ZTIxNDg2
11
- YWVmMWQ1NTA1M2E2ODExMWFlZDNlN2MzODZlNzk1MTNkOTAzZTI=
9
+ NzkzYjI4MDY5ODYzZTBhOWNmNWVlODNlYjEwZmI2NTdkNTgzZGQ4MjA3M2Jl
10
+ NGQ0NzE5YTBhMGU1MjU4YmY3YmNhYzNhMDk2ZTNlZmIwYmI3MDAwYzM5NTQz
11
+ ZTJiYjA2MGQ5OTM5YmJiNjIyMmRjZWZlMTc1MjM4YmM0OGJjOWI=
12
12
  data.tar.gz: !binary |-
13
- ODIyNzcxZjA1MTBiZDkwYjMwY2E1ZGZkZjVjN2RjNjRiNWIwZWE3NDkwZGI1
14
- ODBlOGIxMjkyNWE3NzkwZWU5MzM3YWI5ZmE2ZjI4OTY1YzM3ZDcxMzNkMmQx
15
- OGI2YzlhNTA5NzU5MTA0NDVhZTZmYTI3MzhiYTc2MTI5ZGVlMzc=
13
+ YzFiODMyMDRlMjMyNjAzNTlmYzVkNzMyYjQ4MTIwY2VkY2U0MzQ0Y2M1NmY1
14
+ MDYwOTdiYmNkOTkwY2EyNjk5NjEyOTZhODYwNGU4OGM3N2IwMTk5YTBhMTgw
15
+ MmY3OTczOTIyMjU2ZGM2ZmRlZDFmMDBkYTczYzE5OTU1ODI2YWY=
data/Gemfile CHANGED
@@ -3,7 +3,7 @@ source 'https://rubygems.org'
3
3
  # Specify your gem's dependencies in cohesion.gemspec
4
4
  gemspec
5
5
 
6
- gem "cobweb", ">= 1.0.12"
6
+ gem "cobweb", ">= 1.0.15"
7
7
 
8
8
  gem 'rspec'
9
9
  gem 'resque'
@@ -16,4 +16,4 @@ gem "utf8cleaner"
16
16
 
17
17
  group :test do
18
18
  gem 'coveralls', require: false
19
- end
19
+ end
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- cohesion (0.0.7)
4
+ cohesion (1.0.1)
5
5
  cobweb
6
6
  ptools
7
7
 
@@ -12,7 +12,7 @@ GEM
12
12
  awesome_print (1.1.0)
13
13
  celluloid (0.14.1)
14
14
  timers (>= 1.0.0)
15
- cobweb (1.0.12)
15
+ cobweb (1.0.15)
16
16
  addressable
17
17
  awesome_print
18
18
  haml
@@ -101,7 +101,7 @@ PLATFORMS
101
101
 
102
102
  DEPENDENCIES
103
103
  awesome_print
104
- cobweb (>= 1.0.12)
104
+ cobweb (>= 1.0.15)
105
105
  cohesion!
106
106
  coveralls
107
107
  ptools
@@ -34,20 +34,20 @@ h4. Options
34
34
  Cohesion allows you to control the crawl through various command line options. These are available through help (see below) and are listed here.
35
35
 
36
36
 
37
- * --url <start_url> Url to start crawl from
38
- * --internal_urls <url1,url2> List of url patterns to include
39
- * --external_urls <exclude_url1, exclude_url2> List of url patterns to exclude
40
- * --seed_urls <seed_url1,seed_url2> Seed urls
41
- * --crawl_limit <number_of_urls> Limit the crawl to a number of urls
42
- * --thread_count <number_of_threads> Set the number of threads used
43
- * --cache <number_of_seconds> Sets the timeout for the cache, leave blank for no cache
44
- * --timeout <timeout_in_seconds> Sets the timeout for http requests
45
- * --output <filename> Path to output data to
46
- * --output_format <format_for_output_file> Output format, csv or json
47
-
48
- * -v, --verbose Display crawl information
49
- * -d, --debug Display debug information
50
- * -w, --web_statistics Start web stats server
37
+ * --url <start_url> Url to start crawl from
38
+ * --internal_urls <url1,url2 or filename> List of url patterns to include (comma-separated list or filename with url per line)
39
+ * --external_urls <exclude_url1, exclude_url2 or filename> List of url patterns to exclude (comma-separated list or filename with url per line)
40
+ * --seed_urls <seed_url1,seed_url2 or filename> Seed urls (comma-separated list or filename with url per line)
41
+ * --crawl_limit <number_of_urls> Limit the crawl to a number of urls
42
+ * --thread_count <number_of_threads> Set the number of threads used
43
+ * --cache <number_of_seconds> Sets the timeout for the cache, leave blank for no cache
44
+ * --timeout <timeout_in_seconds> Sets the timeout for http requests
45
+ * --output <filename> Path to output data to
46
+ * --output_format <format_for_output_file> Output format, csv or json
47
+
48
+ * -v, --verbose Display crawl information
49
+ * -d, --debug Display debug information
50
+ * -w, --web_statistics Start web stats server (http://localhost:4567)
51
51
 
52
52
 
53
53
  h4. Help
@@ -31,6 +31,10 @@ if opts[:url]
31
31
 
32
32
  options = opts.to_hash.delete_if { |k, v| v.nil? || k == :url}
33
33
 
34
+ options[:seed_urls] = File.open(options[:seed_urls][0]).read.split("\n") if options[:seed_urls] && File.exists?(options[:seed_urls][0])
35
+ options[:internal_urls] = File.open(options[:internal_urls][0]).read.split("\n") if options[:internal_urls] && File.exists?(options[:internal_urls][0])
36
+ options[:external_urls] = File.open(options[:external_urls][0]).read.split("\n") if options[:external_urls] && File.exists?(options[:external_urls][0])
37
+
34
38
  failures = Cohesion::Check.site(opts[:url], options)
35
39
  if failures.count == 0
36
40
  exit(true)
@@ -1,6 +1,8 @@
1
+ require 'bundler/setup'
1
2
  require "cohesion/version"
2
3
  require 'cobweb'
3
4
  require 'ptools'
5
+ require 'digest/md5'
4
6
 
5
7
  require 'cohesion/railtie' if defined?(Rails)
6
8
 
@@ -69,17 +71,29 @@ module Cohesion
69
71
  errors = []
70
72
  failures = []
71
73
 
74
+ pages = {}
75
+
72
76
  options[:cache] = options[:cache].to_i if options[:cache]
73
77
  crawler_options = {:cache_type => :full, :crawl_linked_external => true, :store_inbound_links => true}.merge(options)
74
- puts crawler_options
75
78
 
76
79
  statistics = CobwebCrawler.new(crawler_options).crawl(url) do |page|
77
80
  print page[:url]
81
+
82
+ duplicate = !pages[Digest::MD5.hexdigest(page[:body])].nil?
83
+ pages[Digest::MD5.hexdigest(page[:body])] = [] unless pages[Digest::MD5.hexdigest(page[:body])]
84
+ pages[Digest::MD5.hexdigest(page[:body])] << page[:url]
85
+
86
+ # if it was a 404 before, just check again not using the cache this time
78
87
  if page[:status_code] == 404
79
88
  page = Cobweb.new(crawler_options.merge(:cache => nil)).get(page[:url])
80
89
  end
81
- if page[:status_code] > 399
82
- puts " [#{page[:status_code]}] \e[31m\u2717\e[0m"
90
+
91
+ if page[:status_code] == 404 || duplicate
92
+ if duplicate
93
+ puts " [duplicate] \e[31m\u2717\e[0m"
94
+ else
95
+ puts " [#{page[:status_code]}] \e[31m\u2717\e[0m"
96
+ end
83
97
  failures << page
84
98
  else
85
99
  puts " \e[32m\u2713\e[0m"
@@ -111,8 +125,18 @@ module Cohesion
111
125
  end
112
126
  end
113
127
 
128
+ puts ""
129
+ puts "Duplicate Content"
130
+ puts ""
131
+ pages.select{|k,v| v.count > 1}.each do |k,v|
132
+ puts "Duplicate: #{k}"
133
+ v.map{|x| puts " - #{x}" }
134
+ end
135
+
136
+
114
137
  puts ""
115
138
  puts "Total Failed URLs: #{total_failures}"
139
+ puts "Total Duplicates: #{pages.map{|d| d[1]}.select{|d| d.count > 1}.inject{|total, d| total + d.count}.count}"
116
140
  puts "Total Inbound Failures (Pages linking to a 404): #{total_inbound_failures}"
117
141
  puts ""
118
142
  end
@@ -1,3 +1,3 @@
1
1
  module Cohesion
2
- VERSION = "1.0.0"
2
+ VERSION = "1.0.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cohesion
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stewart McKee
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-08-15 00:00:00.000000000 Z
11
+ date: 2013-08-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: cobweb