cobweb 1.0.10 → 1.0.11

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v1.0.10
2
+ h1. Cobweb v1.0.11
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
@@ -22,6 +22,15 @@ h3. Standalone
22
22
 
23
23
  While the crawler is running, you can view statistics on http://localhost:4567
24
24
 
25
+ h3. Command Line
26
+
27
+ Cobweb can also be run from the command line to perform various pre-canned tasks.
28
+
29
+ * report - output a csv with data from the crawl
30
+ * export - creates a local copy of the crawled site, with files laid out according to the URL structure. Text data is stored in YAML format.
31
+
32
+ Run "cobweb --help" for more info
33
+
25
34
  h3. Data Returned For Each Page
26
35
  The data available in the returned hash are:
27
36
 
@@ -0,0 +1,32 @@
1
# Mixin that gives the including object a tag-oriented query DSL over an HTML
# document.
#
# The host object is expected to store the raw HTML in @doc; that value is
# handed to DocumentScope.new, which parses it. The dynamic finders
# (<tag>_tags, <tag>_tag, <tag>_tags_with_<attr>, <tag>_tag_with_<attr>) are
# answered by DocumentScope itself, so this module only has to supply the
# entry point.
module CobwebDSL

  # Builds a new DocumentScope from @doc.
  #
  # A fresh scope is returned on every call because DocumentScope narrows its
  # own context in place as finders are chained; reusing one scope across
  # independent queries would carry the previous narrowing over.
  #
  # @return [DocumentScope] a scope positioned at the document root
  def scope
    DocumentScope.new(@doc)
  end

end
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.10"
6
+ "1.0.11"
7
7
  end
8
8
 
9
9
  end
@@ -0,0 +1,66 @@
1
# Chainable query object over a parsed HTML document.
#
# The scope starts at the document root and is narrowed in place by dynamic
# finder methods, each of which returns the scope itself so calls can be
# chained:
#
#   <tag>_tags                     -> all matching elements   (context.search)
#   <tag>_tag                      -> first matching element  (context.at)
#   <tag>_tags_with_<attr>(value)  -> all elements carrying the attribute
#   <tag>_tag_with_<attr>(value)   -> first element carrying the attribute
#
# The value argument is optional; when omitted only the presence of the
# attribute is required. Once a lookup yields nothing (context becomes nil)
# further finders are no-ops and the readers return empty values.
class DocumentScope

  # Any method name the dynamic finders answer to (see method_missing).
  DYNAMIC_FINDER = /_tags?(?:_with_.*)?\z/

  # @param body [String] raw HTML, parsed with Nokogiri::HTML.parse
  def initialize(body)
    @context = Nokogiri::HTML.parse(body)
  end

  # Dispatches the dynamic finders described on the class. The patterns are
  # tried in this fixed order, so a name ending in "_tags" is always treated
  # as a plain multi-element lookup. Unknown names fall through to super and
  # raise NoMethodError as usual.
  #
  # @return [DocumentScope] self, narrowed to the lookup result
  def method_missing(m, *args, &block)
    case m.to_s
    when /\A(.*?)_tags\z/
      narrow(:search, $1)
    when /\A(.*?)_tag\z/
      narrow(:at, $1)
    when /\A(.*?)_tags_with_(.*?)\z/
      narrow(:search, attribute_selector($1, $2, args[0]))
    when /\A(.*?)_tag_with_(.*?)\z/
      narrow(:at, attribute_selector($1, $2, args[0]))
    else
      super
    end
  end

  # Keeps respond_to? / method(...) truthful for the dynamic finders.
  def respond_to_missing?(m, include_private = false)
    !(m.to_s =~ DYNAMIC_FINDER).nil? || super
  end

  # Iterates the current context; does nothing when the scope is empty.
  def each(&block)
    @context.each(&block) if @context
  end

  # Maps over the current context; an empty scope maps to [].
  def map(&block)
    @context ? @context.map(&block) : []
  end

  # Filters the current context; an empty scope selects to [].
  def select(&block)
    @context ? @context.select(&block) : []
  end

  # Reads an attribute (or index) from the current context, "" when empty.
  def [](value)
    @context ? @context[value] : ""
  end

  # Text of the current context with newlines stripped, "" when empty.
  def contents
    @context ? @context.text.gsub("\n","") : ""
  end
  alias :text :contents

  # Number of items in the current context, 0 when empty.
  def count
    @context ? @context.count : 0
  end

  # Markup of the current context with newlines stripped, "" when empty.
  def to_s
    @context ? @context.to_s.gsub("\n","") : ""
  end

  private

  # Replaces the context with the result of +finder+ (:search or :at) unless
  # an earlier lookup already came back empty, then returns self for chaining.
  def narrow(finder, selector)
    @context = @context.send(finder, selector) if @context
    self
  end

  # Builds "tag[attr]" or "tag[attr='value']". The value is always quoted so
  # that values containing spaces still form a valid selector, for both the
  # single- and multi-element finders.
  def attribute_selector(tag_name, attribute_name, value)
    condition = value.nil? ? attribute_name : "#{attribute_name}='#{value}'"
    "#{tag_name}[#{condition}]"
  end

end
@@ -0,0 +1,77 @@
1
require 'fileutils'
require 'uri'
require 'yaml'

# Command-line task that crawls a site and mirrors it on local disk, laying
# files out according to each page's URL path.
#
# Text pages are written as a YAML dump of the page hash (with description,
# keywords and meta_title added when the matching <meta> tags exist); all
# other content is written as the Base64-decoded body.
class ExportCommand

  # Runs the crawl and writes every crawled page below options[:root_path]
  # (default: <current directory>/<host of opts[:url]>).
  #
  # Failures while exporting a single page are printed and the crawl goes on.
  #
  # @param opts [Hash, #to_hash] crawl options; must contain :url, may
  #   override :cache, :crawl_limit, :raise_exceptions and :root_path
  # @param path [Object] not used by the export; kept so existing callers
  #   that pass it keep working
  # @return the statistics object returned by CobwebCrawler#crawl
  def self.start(opts, path)
    site_uri = URI.parse(opts[:url])

    options = {
      :cache => 600,
      :crawl_limit => 1000000,
      :raise_exceptions => true,
      :root_path => File.join(Dir.pwd, site_uri.host)
    }.merge(opts)

    statistics = CobwebCrawler.new(options).crawl(options[:url]) do |page|
      begin
        puts "Just crawled #{page[:url]} and got a status of #{page[:status_code]}."
        export_page(page, options[:root_path])
      rescue => e
        # Best effort: report the failure and continue with the next page.
        puts e.message
        puts e.backtrace
      end
    end

    # Printed after the crawl: inside the block the statistics are not
    # available yet, so the summary could never be shown from there.
    puts "Finished Crawl with #{statistics[:page_count]} pages and #{statistics[:asset_count]} assets." if statistics
    statistics
  end

  # Writes one crawled page to its place below root_path.
  def self.export_page(page, root_path)
    page_uri = URI.parse(page[:url])
    segments = page_uri.path.to_s.split("/")

    FileUtils.mkdir_p(root_path)
    directory = ensure_directories(root_path, segments[0..-2])

    filename = segments.last
    filename = "index.html" if filename.nil? || filename.empty?
    filename = filename + "_" + page_uri.query.gsub("/", "%2F") unless page_uri.query.nil?

    target = File.join(directory, filename)
    # A deeper page may already have turned this name into a directory; the
    # page itself then becomes that directory's index.
    target = File.join(target, "index.html") if File.directory?(target)

    if page[:text_content]
      add_meta_fields(page)
      File.open(target, "w+") { |f| f.write(page.to_yaml) }
    else
      # unpack("m") is the Base64 decoding Base64.decode64 performs, without
      # requiring the base64 library to be loaded.
      File.open(target, "wb") { |f| f.write(page[:body].unpack("m").first) }
    end
  end

  # Creates the directory chain for the given path segments below root_path
  # and returns the deepest directory. When a segment already exists as a
  # file (a page saved earlier under that name), the file is moved to
  # index.html inside the newly created directory.
  def self.ensure_directories(root_path, segments)
    segments.reject { |segment| segment.empty? }.inject(root_path) do |parent, segment|
      current = File.join(parent, segment)
      if File.exist?(current) && !File.directory?(current)
        placeholder = current + ".tmp"
        FileUtils.mv(current, placeholder)
        Dir.mkdir(current)
        FileUtils.mv(placeholder, File.join(current, "index.html"))
      else
        Dir.mkdir(current) unless Dir.exist?(current)
      end
      current
    end
  end

  # Copies the description / keywords / title <meta> contents of a text page
  # into the page hash, skipping any tag that is absent.
  def self.add_meta_fields(page)
    doc = Nokogiri::HTML.parse(page[:body])
    [[:description, "description"], [:keywords, "keywords"], [:meta_title, "title"]].each do |key, meta_name|
      node = doc.at("meta[name=#{meta_name}]")
      page[key] = node.content if node
    end
  end

  private_class_method :export_page, :ensure_directories, :add_meta_fields

end
@@ -0,0 +1,37 @@
1
require 'csv'

# Command-line task that crawls a site and writes one CSV row per crawled
# page, combining the crawler's page data with a few values extracted from
# the page markup.
class ReportCommand

  # Crawls opts[:url] and writes the report to opts[:output]. Does nothing
  # (returns nil) when no :output is given.
  #
  # The CSV header is taken from the keys of the first reported page, minus
  # :body and :links; later rows reuse that column list.
  #
  # @param opts [Hash, #to_hash] command options; :url and :output are
  #   required for a report, :verbose switches off the crawler's :quiet flag,
  #   the remaining non-nil entries are passed to CobwebCrawler
  # @return the value returned by CobwebCrawler#crawl, or nil without :output
  def self.start(opts)
    return unless opts[:output]

    # reject (not delete_if) so a plain Hash passed by the caller is not
    # stripped of its entries.
    options = opts.to_hash.reject { |k, v| v.nil? || k == :url }
    options[:quiet] = !opts[:verbose]

    crawler = CobwebCrawler.new({:cache_type => :full, :raise_exceptions => true}.merge(options))
    columns = nil

    CSV.open(options[:output], "wb", :force_quotes => true) do |csv|
      # :url is deliberately absent from options (it is not a crawler
      # setting), so the start URL has to come from opts.
      crawler.crawl(opts[:url]) do |page|
        puts "Reporting on #{page[:url]}"
        add_report_fields(page)

        unless columns
          columns = page.keys.reject { |k| k == :body || k == :links }
          csv << columns.map { |k| k.to_s }
        end
        csv << columns.map { |k| page[k] }
      end
    end
  end

  # Adds the markup-derived report columns to the page hash.
  #
  # Every value is read through its own DocumentScope because a scope narrows
  # itself in place while finders are chained.
  def self.add_report_fields(page)
    scope = lambda { DocumentScope.new(page[:body]) }

    page["link_rel"] = scope.call.link_tag_with_rel("canonical")["href"]
    page["title"] = scope.call.head_tag.title_tag.contents
    page["description"] = scope.call.meta_tag_with_name("description")["content"]
    page["keywords"] = scope.call.meta_tag_with_name("keywords")["content"]
    page["img tag count"] = scope.call.img_tags.count
    page["scripts in body"] = scope.call.body_tag.script_tags.count
    page["img without alt count"] = scope.call.img_tags.select { |node| node[:alt].nil? || node[:alt].strip.empty? }.count
    page["img alt"] = scope.call.img_tags_with_alt.map { |node| node[:alt] }.uniq
  end

  private_class_method :add_report_fields

end
@@ -0,0 +1,12 @@
1
class String

  # Adds ends_with? when nothing else (e.g. ActiveSupport) has defined it.
  #
  # Defined as a real method instead of going through method_missing, so that
  # respond_to?(:ends_with?) is truthful and String#method_missing is left
  # alone for everything else.
  unless method_defined?(:ends_with?)
    # @param suffixes [Array<#to_str>] candidate endings
    # @return [Boolean] true when the string ends with any given suffix;
    #   false for arguments that are not string-like and when called with no
    #   argument
    def ends_with?(*suffixes)
      suffixes.any? { |suffix| suffix.respond_to?(:to_str) && end_with?(suffix) }
    end
  end

end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.10
4
+ version: 1.0.11
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-14 00:00:00.000000000 Z
12
+ date: 2013-02-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70308514080820 !ruby/object:Gem::Requirement
16
+ requirement: &70274619912400 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70308514080820
24
+ version_requirements: *70274619912400
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70308514074360 !ruby/object:Gem::Requirement
27
+ requirement: &70274619906680 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70308514074360
35
+ version_requirements: *70274619906680
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70308514071160 !ruby/object:Gem::Requirement
38
+ requirement: &70274619897540 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70308514071160
46
+ version_requirements: *70274619897540
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70308514067820 !ruby/object:Gem::Requirement
49
+ requirement: &70274619888180 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70308514067820
57
+ version_requirements: *70274619888180
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70308514065940 !ruby/object:Gem::Requirement
60
+ requirement: &70274619880820 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70308514065940
68
+ version_requirements: *70274619880820
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70308514063800 !ruby/object:Gem::Requirement
71
+ requirement: &70274619877520 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70308514063800
79
+ version_requirements: *70274619877520
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70308514059740 !ruby/object:Gem::Requirement
82
+ requirement: &70274619876520 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70308514059740
90
+ version_requirements: *70274619876520
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70308514055360 !ruby/object:Gem::Requirement
93
+ requirement: &70274619875960 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70308514055360
101
+ version_requirements: *70274619875960
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70308514051040 !ruby/object:Gem::Requirement
104
+ requirement: &70274619875340 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70308514051040
112
+ version_requirements: *70274619875340
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70308514046200 !ruby/object:Gem::Requirement
115
+ requirement: &70274619874800 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70308514046200
123
+ version_requirements: *70274619874800
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: json
126
- requirement: &70308514042680 !ruby/object:Gem::Requirement
126
+ requirement: &70274619874180 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,7 +131,18 @@ dependencies:
131
131
  version: '0'
132
132
  type: :runtime
133
133
  prerelease: false
134
- version_requirements: *70308514042680
134
+ version_requirements: *70274619874180
135
+ - !ruby/object:Gem::Dependency
136
+ name: slop
137
+ requirement: &70274619873340 !ruby/object:Gem::Requirement
138
+ none: false
139
+ requirements:
140
+ - - ! '>='
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
143
+ type: :runtime
144
+ prerelease: false
145
+ version_requirements: *70274619873340
135
146
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
147
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
148
  is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -325,6 +336,7 @@ files:
325
336
  - lib/cobweb.rb
326
337
  - lib/cobweb_crawl_helper.rb
327
338
  - lib/cobweb_crawler.rb
339
+ - lib/cobweb_dsl.rb
328
340
  - lib/cobweb_finished_job.rb
329
341
  - lib/cobweb_links.rb
330
342
  - lib/cobweb_process_job.rb
@@ -333,12 +345,16 @@ files:
333
345
  - lib/crawl.rb
334
346
  - lib/crawl_job.rb
335
347
  - lib/crawl_object.rb
348
+ - lib/document.rb
336
349
  - lib/encoding_safe_process_job.rb
350
+ - lib/export_command.rb
337
351
  - lib/hash_util.rb
338
352
  - lib/redirect_error.rb
353
+ - lib/report_command.rb
339
354
  - lib/robots.rb
340
355
  - lib/server.rb
341
356
  - lib/stats.rb
357
+ - lib/string.rb
342
358
  - lib/uri_helper.rb
343
359
  - views/home.haml
344
360
  - views/layout.haml