cohesion 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/Gemfile +2 -2
- data/Gemfile.lock +3 -3
- data/README.textile +14 -14
- data/bin/cohesion +4 -0
- data/lib/cohesion.rb +27 -3
- data/lib/cohesion/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YTAwMDdjZWJkOGNiMjZhZTU4YjNlYWE3Mzc1NGJlODFkNjc0MmM4Mw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OGYxMWU0YjdjYzc4ZmU3Y2YxNzg2NzJjYTRmN2UwZGIwMWFkYWNjMg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NzkzYjI4MDY5ODYzZTBhOWNmNWVlODNlYjEwZmI2NTdkNTgzZGQ4MjA3M2Jl
|
10
|
+
NGQ0NzE5YTBhMGU1MjU4YmY3YmNhYzNhMDk2ZTNlZmIwYmI3MDAwYzM5NTQz
|
11
|
+
ZTJiYjA2MGQ5OTM5YmJiNjIyMmRjZWZlMTc1MjM4YmM0OGJjOWI=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YzFiODMyMDRlMjMyNjAzNTlmYzVkNzMyYjQ4MTIwY2VkY2U0MzQ0Y2M1NmY1
|
14
|
+
MDYwOTdiYmNkOTkwY2EyNjk5NjEyOTZhODYwNGU4OGM3N2IwMTk5YTBhMTgw
|
15
|
+
MmY3OTczOTIyMjU2ZGM2ZmRlZDFmMDBkYTczYzE5OTU1ODI2YWY=
|
data/Gemfile
CHANGED
@@ -3,7 +3,7 @@ source 'https://rubygems.org'
|
|
3
3
|
# Specify your gem's dependencies in cohesion.gemspec
|
4
4
|
gemspec
|
5
5
|
|
6
|
-
gem "cobweb", ">= 1.0.
|
6
|
+
gem "cobweb", ">= 1.0.15"
|
7
7
|
|
8
8
|
gem 'rspec'
|
9
9
|
gem 'resque'
|
@@ -16,4 +16,4 @@ gem "utf8cleaner"
|
|
16
16
|
|
17
17
|
group :test do
|
18
18
|
gem 'coveralls', require: false
|
19
|
-
end
|
19
|
+
end
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
cohesion (
|
4
|
+
cohesion (1.0.1)
|
5
5
|
cobweb
|
6
6
|
ptools
|
7
7
|
|
@@ -12,7 +12,7 @@ GEM
|
|
12
12
|
awesome_print (1.1.0)
|
13
13
|
celluloid (0.14.1)
|
14
14
|
timers (>= 1.0.0)
|
15
|
-
cobweb (1.0.
|
15
|
+
cobweb (1.0.15)
|
16
16
|
addressable
|
17
17
|
awesome_print
|
18
18
|
haml
|
@@ -101,7 +101,7 @@ PLATFORMS
|
|
101
101
|
|
102
102
|
DEPENDENCIES
|
103
103
|
awesome_print
|
104
|
-
cobweb (>= 1.0.
|
104
|
+
cobweb (>= 1.0.15)
|
105
105
|
cohesion!
|
106
106
|
coveralls
|
107
107
|
ptools
|
data/README.textile
CHANGED
@@ -34,20 +34,20 @@ h4. Options
|
|
34
34
|
Cohesion allows you to control the crawl through various command line options. These are available through help (see below) and are listed here.
|
35
35
|
|
36
36
|
|
37
|
-
* --url <start_url>
|
38
|
-
* --internal_urls <url1,url2> List of url patterns to include
|
39
|
-
* --external_urls <exclude_url1, exclude_url2> List of url patterns to exclude
|
40
|
-
* --seed_urls <seed_url1,seed_url2> Seed urls
|
41
|
-
* --crawl_limit <number_of_urls>
|
42
|
-
* --thread_count <number_of_threads>
|
43
|
-
* --cache <number_of_seconds>
|
44
|
-
* --timeout <timeout_in_seconds>
|
45
|
-
* --output <filename>
|
46
|
-
* --output_format <format_for_output_file>
|
47
|
-
|
48
|
-
* -v, --verbose
|
49
|
-
* -d, --debug
|
50
|
-
* -w, --web_statistics
|
37
|
+
* --url <start_url> Url to start crawl from
|
38
|
+
* --internal_urls <url1,url2 or filename> List of url patterns to include (comma separated list or filename with url per line)
|
39
|
+
* --external_urls <exclude_url1, exclude_url2 or filename> List of url patterns to exclude (comma separated list or filename with url per line)
|
40
|
+
* --seed_urls <seed_url1,seed_url2 or filename> Seed urls (comma separated list or filename with url per line)
|
41
|
+
* --crawl_limit <number_of_urls> Limit the crawl to a number of urls
|
42
|
+
* --thread_count <number_of_threads> Set the number of threads used
|
43
|
+
* --cache <number_of_seconds> Sets the timeout for the cache, leave blank for no cache
|
44
|
+
* --timeout <timeout_in_seconds> Sets the timeout for http requests
|
45
|
+
* --output <filename> Path to output data to
|
46
|
+
* --output_format <format_for_output_file> Output format, csv or json
|
47
|
+
|
48
|
+
* -v, --verbose Display crawl information
|
49
|
+
* -d, --debug Display debug information
|
50
|
+
* -w, --web_statistics Start web stats server (http://localhost:4567)
|
51
51
|
|
52
52
|
|
53
53
|
h4. Help
|
data/bin/cohesion
CHANGED
@@ -31,6 +31,10 @@ if opts[:url]
|
|
31
31
|
|
32
32
|
options = opts.to_hash.delete_if { |k, v| v.nil? || k == :url}
|
33
33
|
|
34
|
+
options[:seed_urls] = File.open(options[:seed_urls][0]).read.split("\n") if options[:seed_urls] && File.exists?(options[:seed_urls][0])
|
35
|
+
options[:internal_urls] = File.open(options[:internal_urls][0]).read.split("\n") if options[:internal_urls] && File.exists?(options[:internal_urls][0])
|
36
|
+
options[:external_urls] = File.open(options[:external_urls][0]).read.split("\n") if options[:external_urls] && File.exists?(options[:external_urls][0])
|
37
|
+
|
34
38
|
failures = Cohesion::Check.site(opts[:url], options)
|
35
39
|
if failures.count == 0
|
36
40
|
exit(true)
|
data/lib/cohesion.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
+
require 'bundler/setup'
|
1
2
|
require "cohesion/version"
|
2
3
|
require 'cobweb'
|
3
4
|
require 'ptools'
|
5
|
+
require 'digest/md5'
|
4
6
|
|
5
7
|
require 'cohesion/railtie' if defined?(Rails)
|
6
8
|
|
@@ -69,17 +71,29 @@ module Cohesion
|
|
69
71
|
errors = []
|
70
72
|
failures = []
|
71
73
|
|
74
|
+
pages = {}
|
75
|
+
|
72
76
|
options[:cache] = options[:cache].to_i if options[:cache]
|
73
77
|
crawler_options = {:cache_type => :full, :crawl_linked_external => true, :store_inbound_links => true}.merge(options)
|
74
|
-
puts crawler_options
|
75
78
|
|
76
79
|
statistics = CobwebCrawler.new(crawler_options).crawl(url) do |page|
|
77
80
|
print page[:url]
|
81
|
+
|
82
|
+
duplicate = !pages[Digest::MD5.hexdigest(page[:body])].nil?
|
83
|
+
pages[Digest::MD5.hexdigest(page[:body])] = [] unless pages[Digest::MD5.hexdigest(page[:body])]
|
84
|
+
pages[Digest::MD5.hexdigest(page[:body])] << page[:url]
|
85
|
+
|
86
|
+
# if it was a 404 before, just check again not using the cache this time
|
78
87
|
if page[:status_code] == 404
|
79
88
|
page = Cobweb.new(crawler_options.merge(:cache => nil)).get(page[:url])
|
80
89
|
end
|
81
|
-
|
82
|
-
|
90
|
+
|
91
|
+
if page[:status_code] == 404 || duplicate
|
92
|
+
if duplicate
|
93
|
+
puts " [duplicate] \e[31m\u2717\e[0m"
|
94
|
+
else
|
95
|
+
puts " [#{page[:status_code]}] \e[31m\u2717\e[0m"
|
96
|
+
end
|
83
97
|
failures << page
|
84
98
|
else
|
85
99
|
puts " \e[32m\u2713\e[0m"
|
@@ -111,8 +125,18 @@ module Cohesion
|
|
111
125
|
end
|
112
126
|
end
|
113
127
|
|
128
|
+
puts ""
|
129
|
+
puts "Duplicate Content"
|
130
|
+
puts ""
|
131
|
+
pages.select{|k,v| v.count > 1}.each do |k,v|
|
132
|
+
puts "Duplicate: #{k}"
|
133
|
+
v.map{|x| puts " - #{x}" }
|
134
|
+
end
|
135
|
+
|
136
|
+
|
114
137
|
puts ""
|
115
138
|
puts "Total Failed URLs: #{total_failures}"
|
139
|
+
puts "Total Duplicates: #{pages.map{|d| d[1]}.select{|d| d.count > 1}.inject{|total, d| total + d.count}.count}"
|
116
140
|
puts "Total Inbound Failures (Pages linking to a 404): #{total_inbound_failures}"
|
117
141
|
puts ""
|
118
142
|
end
|
data/lib/cohesion/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cohesion
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stewart McKee
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-08-
|
11
|
+
date: 2013-08-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: cobweb
|