cobweb 1.0.10 → 1.0.11
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +10 -1
- data/lib/cobweb_dsl.rb +32 -0
- data/lib/cobweb_version.rb +1 -1
- data/lib/document.rb +66 -0
- data/lib/export_command.rb +77 -0
- data/lib/report_command.rb +37 -0
- data/lib/string.rb +12 -0
- metadata +40 -24
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
h1. Cobweb v1.0.10
|
2
|
+
h1. Cobweb v1.0.11
|
3
3
|
|
4
4
|
"@cobweb_gem":https://twitter.com/cobweb_gem
|
5
5
|
|
@@ -22,6 +22,15 @@ h3. Standalone
|
|
22
22
|
|
23
23
|
While the crawler is running, you can view statistics on http://localhost:4567
|
24
24
|
|
25
|
+
h3. Command Line
|
26
|
+
|
27
|
+
Cobweb can also be run from the command line to perform various pre-canned tasks.
|
28
|
+
|
29
|
+
* report - outputs a CSV file with data from the crawl
|
30
|
+
* export - creates a local copy of the data on the server, laid out according to the URL structure. Text data is stored in YAML format.
|
31
|
+
|
32
|
+
Run "cobweb --help" for more info
|
33
|
+
|
25
34
|
h3. Data Returned For Each Page
|
26
35
|
The data available in the returned hash are:
|
27
36
|
|
data/lib/cobweb_dsl.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
module CobwebDSL

  # Wraps the document currently held in @doc in a new DocumentScope.
  #
  # A new scope object is built on every call, so each query chain started
  # from +scope+ begins with its own DocumentScope instance.
  def scope
    document = @doc
    DocumentScope.new(document)
  end

  # Legacy dynamic finders, kept commented out for reference.
  #
  # def method_missing(m, *args, &block)
  #   if m.to_s =~ /^(.*?)_tags$/
  #     tag_name = $1
  #     @doc.search($1)
  #   elsif m.to_s =~ /^(.*?)_tag$/
  #     tag_name = $1
  #     @doc.at($1)
  #   elsif m.to_s =~ /^(.*?)_tags_used\?$/
  #     tag_name = $1
  #     !@doc.search(tag_name).empty?
  #   elsif m.to_s =~ /^(.*?)_tags_with_(.*?)$/
  #     tag_name = $1
  #     attribute_name = $2
  #     attribute_value = "=#{args[0]}" unless args[0].nil?
  #     @doc.search("#{tag_name}[#{attribute_name}#{attribute_value}]")
  #   elsif m.to_s =~ /^(.*?)_tag_with_(.*?)$/
  #     tag_name = $1
  #     attribute_name = $2
  #     attribute_value = "=#{args[0]}" unless args[0].nil?
  #     @doc.at("#{tag_name}[#{attribute_name}#{attribute_value}]")
  #   else
  #     super
  #   end
  # end

end
|
data/lib/cobweb_version.rb
CHANGED
data/lib/document.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# Chainable query wrapper around a parsed HTML document.
#
# Dynamic finder methods narrow the current context and return +self+ so
# calls can be chained (e.g. <tt>scope.head_tag.title_tag.contents</tt>):
#
#   <name>_tags                  -> context.search("<name>")
#   <name>_tag                   -> context.at("<name>")
#   <name>_tags_with_<attr>(val) -> context.search("<name>[<attr>='val']")
#   <name>_tag_with_<attr>(val)  -> context.at("<name>[<attr>='val']")
#
# When the value argument is omitted, the selector only tests for the
# presence of the attribute. Once a finder yields nil the context stays nil
# and every reader returns an empty value instead of raising.
#
# Note that finders mutate the receiver; start each independent query from a
# fresh DocumentScope.
class DocumentScope

  # Matches every method name handled by method_missing.
  DYNAMIC_FINDER = /_tags?(?:_with_.*)?$/

  # body - HTML source, parsed with Nokogiri::HTML.parse.
  def initialize(body)
    @context = Nokogiri::HTML.parse(body)
  end

  # Dispatches the dynamic finders described on the class. The order of the
  # patterns is significant and is the same as it has always been. Unknown
  # names fall through to the default NoMethodError.
  def method_missing(m, *args, &block)
    case m.to_s
    when /^(.*?)_tags$/
      narrow(:search, $1)
    when /^(.*?)_tag$/
      narrow(:at, $1)
    when /^(.*?)_tags_with_(.*?)$/
      narrow(:search, attribute_selector($1, $2, args[0]))
    when /^(.*?)_tag_with_(.*?)$/
      narrow(:at, attribute_selector($1, $2, args[0]))
    else
      super
    end
  end

  # Keeps respond_to? truthful for the dynamic finders.
  def respond_to_missing?(m, include_private = false)
    (m.to_s =~ DYNAMIC_FINDER) ? true : super
  end

  # Iterates over the current context; a nil context behaves as empty.
  def each(&block)
    nodes.each(&block)
  end

  # Maps over the current context; a nil context behaves as empty.
  def map(&block)
    nodes.map(&block)
  end

  # Filters the current context; a nil context behaves as empty.
  def select(&block)
    nodes.select(&block)
  end

  # Delegates to the context's [] (e.g. an attribute lookup on a node).
  # Returns "" when there is no context.
  def [](value)
    @context ? @context[value] : ""
  end

  # Text of the current context with newlines removed, or "" without context.
  def contents
    @context ? @context.text.gsub("\n","") : ""
  end
  alias :text :contents

  # Number of items in the current context, or 0 without context.
  def count
    @context ? @context.count : 0
  end

  # String form of the current context with newlines removed, or "".
  def to_s
    @context ? @context.to_s.gsub("\n","") : ""
  end

  private

  # Replaces the context with the result of +finder+ (:search or :at) for
  # +selector+, unless the context is already nil. Returns self for chaining.
  def narrow(finder, selector)
    @context = @context.send(finder, selector) if @context
    self
  end

  # Builds "tag[attr]" or "tag[attr='value']". The value is always quoted so
  # that values containing spaces or punctuation form a valid selector; a
  # value the caller already wrapped in quotes is used as given.
  def attribute_selector(tag_name, attribute_name, value)
    return "#{tag_name}[#{attribute_name}]" if value.nil?

    value = value.to_s
    unless value =~ /\A(['"]).*\1\z/m
      quote = value.include?("'") ? '"' : "'"
      value = "#{quote}#{value}#{quote}"
    end
    "#{tag_name}[#{attribute_name}=#{value}]"
  end

  # Current context, or an empty list when a finder found nothing.
  def nodes
    @context || []
  end

end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# Crawls a site and mirrors it onto the local filesystem, using the URL path
# of every crawled page as the directory structure.
class ExportCommand

  require 'yaml'

  # Crawls opts[:url] and writes every page below the export root.
  #
  # opts - crawler options; :url is required. :root_path overrides the
  #        export root, which defaults to "<current dir>/<host of :url>".
  #        Defaults for :cache, :crawl_limit and :raise_exceptions are
  #        supplied when absent.
  # path - accepted for interface compatibility; it is not read.
  #
  # Pages flagged :text_content are stored as a YAML dump of the page hash
  # (augmented with :description, :keywords and :meta_title taken from the
  # matching <meta> tags); all other pages have their Base64 body decoded and
  # written as binary. Errors for a single page are printed and do not stop
  # the crawl.
  #
  # Returns whatever CobwebCrawler#crawl returns.
  def self.start(opts, path)
    base_uri = URI.parse(opts[:url])
    default_root_path = File.join(Dir.pwd, base_uri.host)

    options = {
      :cache => 600,
      :crawl_limit => 1000000,
      :raise_exceptions => true,
      :root_path => default_root_path
    }.merge(opts)

    statistics = CobwebCrawler.new(options).crawl(options[:url]) do |page|
      begin
        puts "Just crawled #{page[:url]} and got a status of #{page[:status_code]}."
        export_page(page, options[:root_path])
      rescue => e
        # best effort: report the failure and carry on with the next page
        puts e.message
        puts e.backtrace
      end
    end

    # Reported once the crawl has returned; inside the block the statistics
    # are not available yet.
    puts "Finished Crawl with #{statistics[:page_count]} pages and #{statistics[:asset_count]} assets." if statistics
    statistics
  end

  # Writes a single crawled page to its location below +root+.
  def self.export_page(page, root)
    uri = URI.parse(page[:url])
    target = target_path(root, uri)

    if page[:text_content]
      doc = Nokogiri::HTML.parse(page[:body])
      [[:description, "description"], [:keywords, "keywords"], [:meta_title, "title"]].each do |key, meta_name|
        node = doc.search("meta[name=#{meta_name}]").first
        page[key] = node.content if node
      end
      File.open(target, "w+") { |f| f.write(page.to_yaml) }
    else
      File.open(target, "wb") { |f| f.write(Base64.decode64(page[:body])) }
    end
  end

  # Creates the directories for +uri+ below +root+ and returns the file path
  # the page should be written to.
  def self.target_path(root, uri)
    FileUtils.mkdir_p(root)

    segments = uri.path.split("/")
    dir = root
    segments[0..-2].each do |segment|
      dir = File.join(dir, segment)
      if File.exist?(dir) && !File.directory?(dir)
        # An earlier page was saved under this name; turn it into a
        # directory and keep the earlier page as its index.html.
        FileUtils.mv(dir, dir + ".tmp")
        Dir.mkdir(dir)
        FileUtils.mv(dir + ".tmp", File.join(dir, "index.html"))
      else
        Dir.mkdir(dir) unless Dir.exist?(dir)
      end
    end

    filename = segments.last
    filename = "index.html" if filename.nil? || filename.empty?
    filename = filename + "_" + uri.query.gsub("/", "%2F") unless uri.query.nil?

    target = File.join(dir, filename)
    # The mirror case of the above: a deeper page already created a
    # directory with this name, so store this page as its index.html.
    target = File.join(target, "index.html") if File.directory?(target)
    target
  end

  private_class_method :export_page, :target_path

end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Crawls a site and writes one CSV row of page data per crawled page.
class ReportCommand

  # Runs the report. Does nothing (and returns nil) unless opts[:output] is
  # set.
  #
  # opts - hash-like options (must respond to [] and to_hash). :url is the
  #        address to crawl, :output the CSV file to write, :verbose turns
  #        off the crawler's :quiet flag. Remaining non-nil entries are
  #        passed on to CobwebCrawler.
  #
  # The column set is taken from the first page's keys (minus :body and
  # :links) and reused for every later row.
  #
  # NOTE(review): +scope+ is not defined in this class; presumably
  # CobwebDSL#scope is mixed in elsewhere (it reads @doc) - confirm.
  def self.start(opts)
    return unless opts[:output]

    # Keep the URL aside: it is stripped from the crawler options below but
    # is still needed to start the crawl.
    url = opts[:url]

    # reject (not delete_if) so a plain Hash passed as opts is not modified
    options = opts.to_hash.reject { |key, value| value.nil? || key == :url }
    options[:quiet] = !opts[:verbose]

    @crawler = CobwebCrawler.new({:cache_type => :full, :raise_exceptions => true}.merge(options))

    columns = nil

    CSV.open(options[:output], "wb", :force_quotes => true) do |csv|
      @crawler.crawl(url) do |page|
        puts "Reporting on #{page[:url]}"
        @doc = page[:body]

        # every query starts from a new scope because finders narrow the
        # scope object they are called on
        page["link_rel"] = scope.link_tag_with_rel("canonical")["href"]
        page["title"] = scope.head_tag.title_tag.contents
        page["description"] = scope.meta_tag_with_name("description")["content"]
        page["keywords"] = scope.meta_tag_with_name("keywords")["content"]
        page["img tag count"] = scope.img_tags.count
        page["scripts in body"] = scope.body_tag.script_tags.count
        page["img without alt count"] = scope.img_tags.select{|node| node[:alt].nil? || node[:alt].strip().empty?}.count
        page["img alt"] = scope.img_tags_with_alt.map{|node| node[:alt]}.uniq

        if !columns
          columns = page.keys.reject{|k| k==:body || k==:links}
          csv << columns.map{|k| k.to_s}
        end
        csv << columns.map{|k| page[k]}
      end
    end
  end
end
|
data/lib/string.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
class String

  # Adds ends_with? support if the method is missing. When a real
  # ends_with? is defined on String, this hook is never reached for it.
  #
  # Returns true when the receiver ends with the given suffix, false
  # otherwise (including when the argument is absent or not string-like).
  # Every other unknown method falls through to the default behaviour.
  def method_missing(m, *args, &block)
    if m == :ends_with?
      suffix = args[0]
      suffix.respond_to?(:to_str) && end_with?(suffix)
    else
      super
    end
  end

  # Keeps respond_to?(:ends_with?) consistent with method_missing above.
  def respond_to_missing?(m, include_private = false)
    m == :ends_with? || super
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.10
|
4
|
+
version: 1.0.11
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-02-
|
12
|
+
date: 2013-02-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70274619912400 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70274619912400
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70274619906680 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70274619906680
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70274619897540 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70274619897540
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: addressable
|
49
|
-
requirement: &
|
49
|
+
requirement: &70274619888180 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70274619888180
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70274619880820 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70274619880820
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: awesome_print
|
71
|
-
requirement: &
|
71
|
+
requirement: &70274619877520 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70274619877520
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: sinatra
|
82
|
-
requirement: &
|
82
|
+
requirement: &70274619876520 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70274619876520
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: thin
|
93
|
-
requirement: &
|
93
|
+
requirement: &70274619875960 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70274619875960
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: haml
|
104
|
-
requirement: &
|
104
|
+
requirement: &70274619875340 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70274619875340
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: namespaced_redis
|
115
|
-
requirement: &
|
115
|
+
requirement: &70274619874800 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,10 +120,10 @@ dependencies:
|
|
120
120
|
version: '0'
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70274619874800
|
124
124
|
- !ruby/object:Gem::Dependency
|
125
125
|
name: json
|
126
|
-
requirement: &
|
126
|
+
requirement: &70274619874180 !ruby/object:Gem::Requirement
|
127
127
|
none: false
|
128
128
|
requirements:
|
129
129
|
- - ! '>='
|
@@ -131,7 +131,18 @@ dependencies:
|
|
131
131
|
version: '0'
|
132
132
|
type: :runtime
|
133
133
|
prerelease: false
|
134
|
-
version_requirements: *
|
134
|
+
version_requirements: *70274619874180
|
135
|
+
- !ruby/object:Gem::Dependency
|
136
|
+
name: slop
|
137
|
+
requirement: &70274619873340 !ruby/object:Gem::Requirement
|
138
|
+
none: false
|
139
|
+
requirements:
|
140
|
+
- - ! '>='
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: '0'
|
143
|
+
type: :runtime
|
144
|
+
prerelease: false
|
145
|
+
version_requirements: *70274619873340
|
135
146
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
136
147
|
crawl extremely large sites which is much more performant than multi-threaded crawlers. It
|
137
148
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|
@@ -325,6 +336,7 @@ files:
|
|
325
336
|
- lib/cobweb.rb
|
326
337
|
- lib/cobweb_crawl_helper.rb
|
327
338
|
- lib/cobweb_crawler.rb
|
339
|
+
- lib/cobweb_dsl.rb
|
328
340
|
- lib/cobweb_finished_job.rb
|
329
341
|
- lib/cobweb_links.rb
|
330
342
|
- lib/cobweb_process_job.rb
|
@@ -333,12 +345,16 @@ files:
|
|
333
345
|
- lib/crawl.rb
|
334
346
|
- lib/crawl_job.rb
|
335
347
|
- lib/crawl_object.rb
|
348
|
+
- lib/document.rb
|
336
349
|
- lib/encoding_safe_process_job.rb
|
350
|
+
- lib/export_command.rb
|
337
351
|
- lib/hash_util.rb
|
338
352
|
- lib/redirect_error.rb
|
353
|
+
- lib/report_command.rb
|
339
354
|
- lib/robots.rb
|
340
355
|
- lib/server.rb
|
341
356
|
- lib/stats.rb
|
357
|
+
- lib/string.rb
|
342
358
|
- lib/uri_helper.rb
|
343
359
|
- views/home.haml
|
344
360
|
- views/layout.haml
|