cobweb 1.0.10 → 1.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v1.0.10
2
+ h1. Cobweb v1.0.11
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
@@ -22,6 +22,15 @@ h3. Standalone
22
22
 
23
23
  While the crawler is running, you can view statistics on http://localhost:4567
24
24
 
25
+ h3. Command Line
26
+
27
+ Cobweb can also be run from the command line to perform various pre-canned tasks.
28
+
29
+ * report - output a csv with data from the crawl
30
+ * export - creates a local copy of the data on the server, laid out according to the URL structure. Text data is stored in YAML format.
31
+
32
+ Run "cobweb --help" for more info
33
+
25
34
  h3. Data Returned For Each Page
26
35
  The data available in the returned hash are:
27
36
 
@@ -0,0 +1,32 @@
1
# Mixin that gives the including object a chainable query entry point over
# the document it holds in @doc.
module CobwebDSL

  # Wraps the current value of @doc in a new DocumentScope. A fresh scope is
  # built on every call, so each query chain starts from the whole document.
  def scope
    document = @doc
    DocumentScope.new(document)
  end

  # Earlier, non-chainable draft of the dynamic tag finders, retained
  # (disabled) for reference. DocumentScope#method_missing provides the
  # chainable equivalents of the *_tag(s) and *_tag(s)_with_* lookups.
  #
  # def method_missing(m, *args, &block)
  #   if m.to_s =~ /^(.*?)_tags$/
  #     tag_name = $1
  #     @doc.search($1)
  #   elsif m.to_s =~ /^(.*?)_tag$/
  #     tag_name = $1
  #     @doc.at($1)
  #   elsif m.to_s =~ /^(.*?)_tags_used\?$/
  #     tag_name = $1
  #     !@doc.search(tag_name).empty?
  #   elsif m.to_s =~ /^(.*?)_tags_with_(.*?)$/
  #     tag_name = $1
  #     attribute_name = $2
  #     attribute_value = "=#{args[0]}" unless args[0].nil?
  #     @doc.search("#{tag_name}[#{attribute_name}#{attribute_value}]")
  #   elsif m.to_s =~ /^(.*?)_tag_with_(.*?)$/
  #     tag_name = $1
  #     attribute_name = $2
  #     attribute_value = "=#{args[0]}" unless args[0].nil?
  #     @doc.at("#{tag_name}[#{attribute_name}#{attribute_value}]")
  #   else
  #     super
  #   end
  # end

end
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.10"
6
+ "1.0.11"
7
7
  end
8
8
 
9
9
  end
@@ -0,0 +1,66 @@
1
# Chainable query wrapper around a parsed HTML document.
#
# Dynamic finder methods narrow the current context and return the scope
# itself so calls can be chained:
#
#   scope.head_tag.title_tag.contents
#   scope.meta_tag_with_name("description")["content"]
#   scope.img_tags_with_alt.map { |node| node[:alt] }
#
# Supported dynamic method shapes (checked in this order):
#   <tag>_tags               -> context.search("<tag>")
#   <tag>_tag                -> context.at("<tag>")
#   <tag>_tags_with_<attr>   -> context.search("<tag>[<attr>]") or "<tag>[<attr>='value']"
#   <tag>_tag_with_<attr>    -> context.at("<tag>[<attr>]")     or "<tag>[<attr>='value']"
#
# Once a lookup yields nil the context stays nil; further finders are no-ops
# and the accessors below return empty values instead of raising.
class DocumentScope

  # Matches every method name handled by method_missing.
  DYNAMIC_FINDER = /^(.*?)_tags?(_with_(.*?))?$/

  # body - HTML source handed to Nokogiri::HTML.parse.
  def initialize(body)
    @context = Nokogiri::HTML.parse(body)
  end

  # Dispatches the dynamic finder methods described in the class comment.
  # Any other method name falls through to the default NoMethodError.
  def method_missing(m, *args, &block)
    case m.to_s
    when /^(.*?)_tags$/
      narrow(:search, $1)
    when /^(.*?)_tag$/
      narrow(:at, $1)
    when /^(.*?)_tags_with_(.*?)$/
      narrow(:search, attribute_selector($1, $2, args[0]))
    when /^(.*?)_tag_with_(.*?)$/
      narrow(:at, attribute_selector($1, $2, args[0]))
    else
      super
    end
  end

  # Keeps respond_to? truthful for the names method_missing handles.
  def respond_to_missing?(m, include_private = false)
    (m.to_s =~ DYNAMIC_FINDER) ? true : super
  end

  # Iterates the current context; an empty context yields nothing.
  def each(&block)
    (@context || []).each(&block)
  end

  # Maps over the current context; an empty context gives [].
  def map(&block)
    (@context || []).map(&block)
  end

  # Filters the current context; an empty context gives [].
  def select(&block)
    (@context || []).select(&block)
  end

  # Delegates to the context's [] (e.g. an attribute lookup on a node), or
  # returns "" when the context is empty.
  def [](value)
    @context ? @context[value] : ""
  end

  # Text of the current context with newlines removed, or "" when empty.
  def contents
    @context ? @context.text.gsub("\n","") : ""
  end
  alias :text :contents

  # Number of items in the current context, or 0 when empty.
  def count
    @context ? @context.count : 0
  end

  # String form of the current context with newlines removed, or "" when empty.
  def to_s
    @context ? @context.to_s.gsub("\n","") : ""
  end

  private

  # Replaces the context with the result of calling finder (:search or :at)
  # on it with selector; does nothing once the context is nil. Returns self
  # so finder calls chain.
  def narrow(finder, selector)
    @context = @context.send(finder, selector) if @context
    self
  end

  # Builds "tag[attr]" when no value is given, otherwise "tag[attr='value']".
  def attribute_selector(tag_name, attribute_name, value)
    return "#{tag_name}[#{attribute_name}]" if value.nil?
    "#{tag_name}[#{attribute_name}=#{quote_attribute_value(value)}]"
  end

  # Wraps value in single quotes so values containing spaces form a valid
  # selector. A value the caller already wrapped in matching quotes is
  # passed through untouched.
  def quote_attribute_value(value)
    text = value.to_s
    return text if text =~ /\A(['"]).*\1\z/m
    "'#{text}'"
  end

end
@@ -0,0 +1,77 @@
1
# Command that crawls a site and writes every crawled page to disk in a
# directory tree mirroring the URL paths.
class ExportCommand

  require 'yaml'

  # Crawls opts[:url] and exports each page beneath options[:root_path].
  #
  # opts - hash-like options; must contain :url. Merged over these defaults:
  #        :cache => 600, :crawl_limit => 1000000, :raise_exceptions => true,
  #        :root_path => "<current directory>/<host of :url>".
  # path - accepted for interface compatibility only; it is never read (the
  #        original implementation overwrote it before use).
  #
  # Pages flagged :text_content are written as a YAML dump of the page hash
  # (with :description, :keywords and :meta_title added from matching meta
  # tags when present); all other pages are Base64-decoded and written as
  # binary. Errors raised while exporting one page are printed and do not
  # abort the crawl.
  #
  # Returns whatever CobwebCrawler#crawl returns (the crawl statistics).
  def self.start(opts, path)
    options = {
      :cache => 600,
      :crawl_limit => 1000000,
      :raise_exceptions => true,
      :root_path => File.join(Dir.pwd, URI.parse(opts[:url]).host)
    }.merge(opts)

    statistics = CobwebCrawler.new(options).crawl(options[:url]) do |page|
      begin
        puts "Just crawled #{page[:url]} and got a status of #{page[:status_code]}."
        export_page(page, options[:root_path])
      rescue => e
        # Best effort: report the failure and carry on with the next page.
        puts e.message
        puts e.backtrace
      end
    end

    # Printed after the crawl: inside the crawl block the statistics are not
    # available yet, so the summary could never be shown from there.
    puts "Finished Crawl with #{statistics[:page_count]} pages and #{statistics[:asset_count]} assets." if statistics
    statistics
  end

  # Writes a single crawled page below root_path.
  def self.export_page(page, root_path)
    uri = URI.parse(page[:url])

    FileUtils.mkdir_p(root_path)
    target = root_path + prepare_directories(root_path, uri) + filename_for(uri)

    if page[:text_content]
      doc = Nokogiri::HTML.parse(page[:body])
      [[:description, "description"], [:keywords, "keywords"], [:meta_title, "title"]].each do |key, meta_name|
        node = doc.search("meta[name=#{meta_name}]").first
        page[key] = node.content if node
      end
      File.open(target, "w+") { |f| f.write(page.to_yaml) }
    else
      File.open(target, "wb") { |f| f.write(Base64.decode64(page[:body])) }
    end
  end

  # Creates one directory under root_path for every path segment of uri
  # except the last, and returns the relative directory path with a leading
  # and trailing "/".
  def self.prepare_directories(root_path, uri)
    relative = ""
    uri.path.split("/")[0..-2].each do |segment|
      relative += "/" unless relative.end_with?("/")
      relative += segment
      full_path = root_path + relative

      if File.exist?(full_path) && !File.directory?(full_path)
        # A page was already saved under this name as a plain file; turn it
        # into a directory and keep the old content as its index.html.
        FileUtils.mv(full_path, full_path + ".tmp")
        Dir.mkdir(full_path)
        FileUtils.mv(full_path + ".tmp", full_path + "/index.html")
      else
        Dir.mkdir(full_path) unless Dir.exist?(full_path)
      end
    end
    relative += "/" unless relative.end_with?("/")
    relative
  end

  # File name for uri: its last path segment ("index.html" when there is
  # none), followed by "_" and the query string with "/" escaped as "%2F"
  # when the URL has a query.
  def self.filename_for(uri)
    filename = uri.path.split("/")[-1]
    filename = "index.html" if filename.nil? || filename.empty?
    filename = filename + "_" + uri.query.gsub("/", "%2F") unless uri.query.nil?
    filename
  end

  private_class_method :export_page, :prepare_directories, :filename_for

end
@@ -0,0 +1,37 @@
1
# Command that crawls a site and writes one CSV row of page data per page.
class ReportCommand

  # Crawls opts[:url] and writes a report to the file named by opts[:output].
  # Does nothing (returns nil) when opts[:output] is not set.
  #
  # opts - options object responding to [] and to_hash. Entries with nil
  #        values and the :url entry are left out of the crawler options;
  #        :quiet is set to the negation of opts[:verbose].
  #
  # The first crawled page fixes the CSV columns: all of its keys except
  # :body and :links, including the string-keyed values added below. The
  # header row is written once, then one row per page.
  #
  # NOTE(review): `scope` is not defined in this class; it presumably comes
  # from CobwebDSL being mixed in at class level elsewhere - confirm.
  def self.start(opts)
    return unless opts[:output]

    # :url is deliberately kept out of the crawler options, so it has to be
    # read from opts; options[:url] is always nil. reject (rather than
    # delete_if) leaves the caller's hash untouched.
    url = opts[:url]
    options = opts.to_hash.reject { |k, v| v.nil? || k == :url }
    options[:quiet] = !opts[:verbose]

    @crawler = CobwebCrawler.new({:cache_type => :full, :raise_exceptions => true}.merge(options))

    columns = nil

    CSV.open(options[:output], "wb", :force_quotes => true) do |csv|
      @crawler.crawl(url) do |page|
        puts "Reporting on #{page[:url]}"
        # scope reads @doc, so point it at the current page before querying.
        @doc = page[:body]

        page["link_rel"] = scope.link_tag_with_rel("canonical")["href"]
        page["title"] = scope.head_tag.title_tag.contents
        page["description"] = scope.meta_tag_with_name("description")["content"]
        page["keywords"] = scope.meta_tag_with_name("keywords")["content"]
        page["img tag count"] = scope.img_tags.count
        page["scripts in body"] = scope.body_tag.script_tags.count
        page["img without alt count"] = scope.img_tags.select{|node| node[:alt].nil? || node[:alt].strip().empty?}.count
        page["img alt"] = scope.img_tags_with_alt.map{|node| node[:alt]}.uniq

        if !columns
          columns = page.keys.reject{|k| k==:body || k==:links}
          csv << columns.map{|k| k.to_s}
        end
        csv << columns.map{|k| page[k]}
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
# Compatibility shim: answers String#ends_with? when nothing else has
# defined it. Because it works through method_missing, a real ends_with?
# definition takes precedence automatically.
class String

  # Handles :ends_with? by delegating to the built-in String#end_with?.
  # Only the first argument is considered; the result is false when that
  # argument is missing or is not string-like (does not respond to to_str).
  # Every other missing method goes to the default handling.
  def method_missing(m, *args, &block)
    if m == :ends_with?
      suffix = args[0]
      suffix.respond_to?(:to_str) && end_with?(suffix)
    else
      super
    end
  end

  # Keeps respond_to?(:ends_with?) consistent with method_missing.
  def respond_to_missing?(m, include_private = false)
    m == :ends_with? || super
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.10
4
+ version: 1.0.11
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-14 00:00:00.000000000 Z
12
+ date: 2013-02-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70308514080820 !ruby/object:Gem::Requirement
16
+ requirement: &70274619912400 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70308514080820
24
+ version_requirements: *70274619912400
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70308514074360 !ruby/object:Gem::Requirement
27
+ requirement: &70274619906680 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70308514074360
35
+ version_requirements: *70274619906680
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70308514071160 !ruby/object:Gem::Requirement
38
+ requirement: &70274619897540 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70308514071160
46
+ version_requirements: *70274619897540
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70308514067820 !ruby/object:Gem::Requirement
49
+ requirement: &70274619888180 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70308514067820
57
+ version_requirements: *70274619888180
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70308514065940 !ruby/object:Gem::Requirement
60
+ requirement: &70274619880820 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70308514065940
68
+ version_requirements: *70274619880820
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70308514063800 !ruby/object:Gem::Requirement
71
+ requirement: &70274619877520 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70308514063800
79
+ version_requirements: *70274619877520
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70308514059740 !ruby/object:Gem::Requirement
82
+ requirement: &70274619876520 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70308514059740
90
+ version_requirements: *70274619876520
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70308514055360 !ruby/object:Gem::Requirement
93
+ requirement: &70274619875960 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70308514055360
101
+ version_requirements: *70274619875960
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70308514051040 !ruby/object:Gem::Requirement
104
+ requirement: &70274619875340 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70308514051040
112
+ version_requirements: *70274619875340
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70308514046200 !ruby/object:Gem::Requirement
115
+ requirement: &70274619874800 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70308514046200
123
+ version_requirements: *70274619874800
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: json
126
- requirement: &70308514042680 !ruby/object:Gem::Requirement
126
+ requirement: &70274619874180 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,7 +131,18 @@ dependencies:
131
131
  version: '0'
132
132
  type: :runtime
133
133
  prerelease: false
134
- version_requirements: *70308514042680
134
+ version_requirements: *70274619874180
135
+ - !ruby/object:Gem::Dependency
136
+ name: slop
137
+ requirement: &70274619873340 !ruby/object:Gem::Requirement
138
+ none: false
139
+ requirements:
140
+ - - ! '>='
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
143
+ type: :runtime
144
+ prerelease: false
145
+ version_requirements: *70274619873340
135
146
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
147
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
148
  is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -325,6 +336,7 @@ files:
325
336
  - lib/cobweb.rb
326
337
  - lib/cobweb_crawl_helper.rb
327
338
  - lib/cobweb_crawler.rb
339
+ - lib/cobweb_dsl.rb
328
340
  - lib/cobweb_finished_job.rb
329
341
  - lib/cobweb_links.rb
330
342
  - lib/cobweb_process_job.rb
@@ -333,12 +345,16 @@ files:
333
345
  - lib/crawl.rb
334
346
  - lib/crawl_job.rb
335
347
  - lib/crawl_object.rb
348
+ - lib/document.rb
336
349
  - lib/encoding_safe_process_job.rb
350
+ - lib/export_command.rb
337
351
  - lib/hash_util.rb
338
352
  - lib/redirect_error.rb
353
+ - lib/report_command.rb
339
354
  - lib/robots.rb
340
355
  - lib/server.rb
341
356
  - lib/stats.rb
357
+ - lib/string.rb
342
358
  - lib/uri_helper.rb
343
359
  - views/home.haml
344
360
  - views/layout.haml