cohesion 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/Gemfile +2 -2
- data/Gemfile.lock +3 -3
- data/README.textile +14 -14
- data/bin/cohesion +4 -0
- data/lib/cohesion.rb +27 -3
- data/lib/cohesion/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YTAwMDdjZWJkOGNiMjZhZTU4YjNlYWE3Mzc1NGJlODFkNjc0MmM4Mw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OGYxMWU0YjdjYzc4ZmU3Y2YxNzg2NzJjYTRmN2UwZGIwMWFkYWNjMg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NzkzYjI4MDY5ODYzZTBhOWNmNWVlODNlYjEwZmI2NTdkNTgzZGQ4MjA3M2Jl
|
10
|
+
NGQ0NzE5YTBhMGU1MjU4YmY3YmNhYzNhMDk2ZTNlZmIwYmI3MDAwYzM5NTQz
|
11
|
+
ZTJiYjA2MGQ5OTM5YmJiNjIyMmRjZWZlMTc1MjM4YmM0OGJjOWI=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YzFiODMyMDRlMjMyNjAzNTlmYzVkNzMyYjQ4MTIwY2VkY2U0MzQ0Y2M1NmY1
|
14
|
+
MDYwOTdiYmNkOTkwY2EyNjk5NjEyOTZhODYwNGU4OGM3N2IwMTk5YTBhMTgw
|
15
|
+
MmY3OTczOTIyMjU2ZGM2ZmRlZDFmMDBkYTczYzE5OTU1ODI2YWY=
|
data/Gemfile
CHANGED
@@ -3,7 +3,7 @@ source 'https://rubygems.org'
|
|
3
3
|
# Specify your gem's dependencies in cohesion.gemspec
|
4
4
|
gemspec
|
5
5
|
|
6
|
-
gem "cobweb", ">= 1.0.
|
6
|
+
gem "cobweb", ">= 1.0.15"
|
7
7
|
|
8
8
|
gem 'rspec'
|
9
9
|
gem 'resque'
|
@@ -16,4 +16,4 @@ gem "utf8cleaner"
|
|
16
16
|
|
17
17
|
group :test do
|
18
18
|
gem 'coveralls', require: false
|
19
|
-
end
|
19
|
+
end
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
cohesion (1.0.0)
|
4
|
+
cohesion (1.0.1)
|
5
5
|
cobweb
|
6
6
|
ptools
|
7
7
|
|
@@ -12,7 +12,7 @@ GEM
|
|
12
12
|
awesome_print (1.1.0)
|
13
13
|
celluloid (0.14.1)
|
14
14
|
timers (>= 1.0.0)
|
15
|
-
cobweb (1.0.
|
15
|
+
cobweb (1.0.15)
|
16
16
|
addressable
|
17
17
|
awesome_print
|
18
18
|
haml
|
@@ -101,7 +101,7 @@ PLATFORMS
|
|
101
101
|
|
102
102
|
DEPENDENCIES
|
103
103
|
awesome_print
|
104
|
-
cobweb (>= 1.0.
|
104
|
+
cobweb (>= 1.0.15)
|
105
105
|
cohesion!
|
106
106
|
coveralls
|
107
107
|
ptools
|
data/README.textile
CHANGED
@@ -34,20 +34,20 @@ h4. Options
|
|
34
34
|
Cohesion allows you to control the crawl through various command line options. These are available through help (see below) and are listed here.
|
35
35
|
|
36
36
|
|
37
|
-
* --url <start_url>
|
38
|
-
* --internal_urls <url1,url2> List of url patterns to include
|
39
|
-
* --external_urls <exclude_url1, exclude_url2> List of url patterns to exclude
|
40
|
-
* --seed_urls <seed_url1,seed_url2> Seed urls
|
41
|
-
* --crawl_limit <number_of_urls>
|
42
|
-
* --thread_count <number_of_threads>
|
43
|
-
* --cache <number_of_seconds>
|
44
|
-
* --timeout <timeout_in_seconds>
|
45
|
-
* --output <filename>
|
46
|
-
* --output_format <format_for_output_file>
|
47
|
-
|
48
|
-
* -v, --verbose
|
49
|
-
* -d, --debug
|
50
|
-
* -w, --web_statistics
|
37
|
+
* --url <start_url> Url to start crawl from
|
38
|
+
* --internal_urls <url1,url2 or filename> List of url patterns to include (comma separated list or filename with url per line)
|
39
|
+
* --external_urls <exclude_url1, exclude_url2 or filename> List of url patterns to exclude (comma separated list or filename with url per line)
|
40
|
+
* --seed_urls <seed_url1,seed_url2 or filename> Seed urls (comma separated list or filename with url per line)
|
41
|
+
* --crawl_limit <number_of_urls> Limit the crawl to a number of urls
|
42
|
+
* --thread_count <number_of_threads> Set the number of threads used
|
43
|
+
* --cache <number_of_seconds> Sets the timeout for the cache, leave blank for no cache
|
44
|
+
* --timeout <timeout_in_seconds> Sets the timeout for http requests
|
45
|
+
* --output <filename> Path to output data to
|
46
|
+
* --output_format <format_for_output_file> Output format, csv or json
|
47
|
+
|
48
|
+
* -v, --verbose Display crawl information
|
49
|
+
* -d, --debug Display debug information
|
50
|
+
* -w, --web_statistics Start web stats server (http://localhost:4567)
|
51
51
|
|
52
52
|
|
53
53
|
h4. Help
|
data/bin/cohesion
CHANGED
@@ -31,6 +31,10 @@ if opts[:url]
|
|
31
31
|
|
32
32
|
options = opts.to_hash.delete_if { |k, v| v.nil? || k == :url}
|
33
33
|
|
34
|
+
options[:seed_urls] = File.open(options[:seed_urls][0]).read.split("\n") if options[:seed_urls] && File.exists?(options[:seed_urls][0])
|
35
|
+
options[:internal_urls] = File.open(options[:internal_urls][0]).read.split("\n") if options[:internal_urls] && File.exists?(options[:internal_urls][0])
|
36
|
+
options[:external_urls] = File.open(options[:external_urls][0]).read.split("\n") if options[:external_urls] && File.exists?(options[:external_urls][0])
|
37
|
+
|
34
38
|
failures = Cohesion::Check.site(opts[:url], options)
|
35
39
|
if failures.count == 0
|
36
40
|
exit(true)
|
data/lib/cohesion.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
+
require 'bundler/setup'
|
1
2
|
require "cohesion/version"
|
2
3
|
require 'cobweb'
|
3
4
|
require 'ptools'
|
5
|
+
require 'digest/md5'
|
4
6
|
|
5
7
|
require 'cohesion/railtie' if defined?(Rails)
|
6
8
|
|
@@ -69,17 +71,29 @@ module Cohesion
|
|
69
71
|
errors = []
|
70
72
|
failures = []
|
71
73
|
|
74
|
+
pages = {}
|
75
|
+
|
72
76
|
options[:cache] = options[:cache].to_i if options[:cache]
|
73
77
|
crawler_options = {:cache_type => :full, :crawl_linked_external => true, :store_inbound_links => true}.merge(options)
|
74
|
-
puts crawler_options
|
75
78
|
|
76
79
|
statistics = CobwebCrawler.new(crawler_options).crawl(url) do |page|
|
77
80
|
print page[:url]
|
81
|
+
|
82
|
+
duplicate = !pages[Digest::MD5.hexdigest(page[:body])].nil?
|
83
|
+
pages[Digest::MD5.hexdigest(page[:body])] = [] unless pages[Digest::MD5.hexdigest(page[:body])]
|
84
|
+
pages[Digest::MD5.hexdigest(page[:body])] << page[:url]
|
85
|
+
|
86
|
+
# if it was a 404 before, just check again not using the cache this time
|
78
87
|
if page[:status_code] == 404
|
79
88
|
page = Cobweb.new(crawler_options.merge(:cache => nil)).get(page[:url])
|
80
89
|
end
|
81
|
-
|
82
|
-
|
90
|
+
|
91
|
+
if page[:status_code] == 404 || duplicate
|
92
|
+
if duplicate
|
93
|
+
puts " [duplicate] \e[31m\u2717\e[0m"
|
94
|
+
else
|
95
|
+
puts " [#{page[:status_code]}] \e[31m\u2717\e[0m"
|
96
|
+
end
|
83
97
|
failures << page
|
84
98
|
else
|
85
99
|
puts " \e[32m\u2713\e[0m"
|
@@ -111,8 +125,18 @@ module Cohesion
|
|
111
125
|
end
|
112
126
|
end
|
113
127
|
|
128
|
+
puts ""
|
129
|
+
puts "Duplicate Content"
|
130
|
+
puts ""
|
131
|
+
pages.select{|k,v| v.count > 1}.each do |k,v|
|
132
|
+
puts "Duplicate: #{k}"
|
133
|
+
v.map{|x| puts " - #{x}" }
|
134
|
+
end
|
135
|
+
|
136
|
+
|
114
137
|
puts ""
|
115
138
|
puts "Total Failed URLs: #{total_failures}"
|
139
|
+
puts "Total Duplicates: #{pages.map{|d| d[1]}.select{|d| d.count > 1}.inject{|total, d| total + d.count}.count}"
|
116
140
|
puts "Total Inbound Failures (Pages linking to a 404): #{total_inbound_failures}"
|
117
141
|
puts ""
|
118
142
|
end
|
data/lib/cohesion/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cohesion
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stewart McKee
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-08-
|
11
|
+
date: 2013-08-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cobweb
|