web_crawler 0.5.0 → 0.5.2

data/Gemfile CHANGED
@@ -4,11 +4,15 @@ gem 'thor', '>=0.14.6'
  gem 'mime-types', '>=1.16'
  gem 'parallel', '>=0.5.5'
  gem 'activesupport'
+ gem "rake", '0.8.7'
+ gem 'i18n'
+ gem 'hpricot'

  # Specify your gem's dependencies in web_crawler.gemspec
  gemspec

  group :development, :test do
+   gem "rake", '0.8.7'
    gem "rspec", ">=2.6"
    gem "autotest"
    gem "autotest-growl"
data/README CHANGED
@@ -1,22 +1,32 @@
  Web crawler help you with parse and collect data from the web

- #TODO
+ ==How it works.

- Base web crawler class for API present
- Its showld work like this:
+ class StackoverflowCrawler < WebCrawler::Base

- class MyCrawler < WebCrawler::Base
+ target "http://stackoverflow.com/questions/tagged/:tag", :tag=> %w{ruby ruby-on-rails ruby-on-rails-3}
+ logger "path/to/log/file" # or Logger.new(...)

- target "www.example.com"
- target "www.example.com/page2"
- target %[www.example.com/contacts www.example.com/about]
- target "www.example.com/category_:category/page:page/", :categories => [1,2,3,4], :page => 1..100
+ cache_to '/tmp/cache/stackoverflow'

- target { call_advanced_logic_for_url_generating }
+ context "#questions .question-summary", :jobs do

- logger "path/to/log/file" # or Logger.new(...)
+ #TODO: defaults :format => lambda{ |v| v.to_i }

+ map '.vote-count-post strong', :to => :vote_count, :format => lambda{ |v| v.to_i }
+ map '.views', :to => :view_count, :format => lambda{ |v| v.match(/\d+/)[0].to_i }
+ map '.status strong', :to => :answer_count, :format => lambda{ |v| v.to_i }
+ map '.summary h3 a', :to => :title, :format => lambda{ |v| v.to_i }
+ map '.summary .excerpt', :to => :excerpt, :format => lambda{ |v| v.to_i }
+ map '.user-action-time .relativetime', :to => :posted_at, :on => [:attr, :title]
+ map '.tags .post-tag', :to => :tags, :format => lambda{ |v| v.to_i }

-
+ end
+ end

- end
+
+ #TODO
+ 1. Add documentation
+ 2. ...
+ 3. PROFIT!!!1
+ (:
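Worth flagging in the new README example: `:format => lambda{ |v| v.to_i }` is applied to `:title`, `:excerpt`, and `:tags`, which would coerce those text fields into integers; that looks like a copy-and-paste slip from the numeric mappings. A corrected sketch of the same class, assuming `map` hands the matched node's text to the `:format` lambda and that string fields simply need no coercion:

    class StackoverflowCrawler < WebCrawler::Base
      target "http://stackoverflow.com/questions/tagged/:tag", :tag => %w{ruby ruby-on-rails ruby-on-rails-3}
      logger "path/to/log/file"
      cache_to '/tmp/cache/stackoverflow'

      context "#questions .question-summary", :jobs do
        # Numeric fields: coerce the extracted text to integers.
        map '.vote-count-post strong', :to => :vote_count,   :format => lambda { |v| v.to_i }
        map '.views',                  :to => :view_count,   :format => lambda { |v| v.match(/\d+/)[0].to_i }
        map '.status strong',          :to => :answer_count, :format => lambda { |v| v.to_i }

        # Text fields: leave as strings, no :format needed.
        map '.summary h3 a',     :to => :title
        map '.summary .excerpt', :to => :excerpt
        map '.tags .post-tag',   :to => :tags

        # The timestamp lives in the element's title attribute.
        map '.user-action-time .relativetime', :to => :posted_at, :on => [:attr, :title]
      end
    end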
data/bin/wcrawler CHANGED
@@ -1,5 +1,4 @@
  #!/usr/bin/env ruby
-
  $:.unshift File.expand_path("../../lib", __FILE__)

  # Check if an older version of bundler is installed
data/gem_graph.png ADDED
Binary file
data/lib/web_crawler/application.rb CHANGED
@@ -1,6 +1,9 @@
  module WebCrawler
  class Application < CLI

+ map '-V' => 'version'
+ map '-v' => 'version'
+
  desc "test", "Test task"

  def test
@@ -72,6 +75,11 @@ module WebCrawler
  end
  end

+ desc '-v or -V or version', 'Show gem version'
+ def version
+   WebCrawler::VERSION::STRING
+ end
+
  protected
  def allow_format(*allow)
    allow.flatten.select { |f| f == @options[:format] }.first
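One caveat on the new `version` task: Thor discards a task's return value, so `WebCrawler::VERSION::STRING` on its own prints nothing. Assuming `CLI` is a Thor subclass (the `desc`/`map` DSL suggests it is), the command would normally write to stdout explicitly, along these lines:

    desc '-v or -V or version', 'Show gem version'
    def version
      # Thor ignores return values, so print explicitly.
      puts WebCrawler::VERSION::STRING
    end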
data/lib/web_crawler/version.rb CHANGED
@@ -2,7 +2,7 @@ module WebCrawler
  module VERSION
    MAJOR = 0
    MINOR = 5
-   TINY = 0
+   TINY = 2

    STRING = [MAJOR, MINOR, TINY].join('.')
  end
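For reference, the constants above assemble the release string like so:

    WebCrawler::VERSION::STRING # => "0.5.2"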
data/lib/web_crawler/view/base.rb CHANGED
@@ -17,6 +17,8 @@ module WebCrawler::View
  class Base
    attr_reader :input

+   delegate :logger, :to => WebCrawler.logger
+
    class << self
      attr_accessor :default_options

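A note on the new `delegate` line: ActiveSupport's `delegate` interpolates the `:to` value into generated code, so it expects the *name* of the delegation target rather than the target object itself; passing `WebCrawler.logger` (a logger instance) would interpolate that object's `to_s` into the method body. A sketch of the likely intent, assuming `WebCrawler.logger` returns the shared logger:

    # Generates (roughly): def logger; WebCrawler.logger; end,
    # so view instances can call logger.info(...) directly.
    delegate :logger, :to => :WebCrawler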
@@ -52,11 +54,22 @@ module WebCrawler::View
  @present_output = if override && override.respond_to?(:puts)
    override
  elsif @options['output'].is_a?(String)
-   File.open(@options['output'], 'w+')
+   output_to_file(@options['output'])
  elsif @options['output'].respond_to? :puts
    @options['output']
  end
  end
+
+ def output_to_file(filename)
+   path = Pathname.new(filename)
+
+   unless path.dirname.exist?
+     info("#{path.dirname} not exist, try to create...")
+     path.dirname.mkpath
+   end
+
+   path.open('w+')
+ end
  end

  end
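The new `output_to_file` leans on Ruby's stdlib `Pathname` to create missing parent directories before opening the output file (`mkpath` behaves like `mkdir -p`). The same pattern in isolation, as a hypothetical `open_for_writing` helper with a plain `puts` standing in for the gem's `info` logging call:

    require 'pathname'

    def open_for_writing(filename)
      path = Pathname.new(filename)

      unless path.dirname.exist?
        puts "#{path.dirname} does not exist, creating..."
        path.dirname.mkpath # creates intermediate directories, like mkdir -p
      end

      path.open('w+')
    end

    io = open_for_writing('/tmp/reports/2011/output.csv')
    io.puts 'crawled data'
    io.close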
data/lib/web_crawler.rb CHANGED
@@ -7,6 +7,7 @@ require "ext/hash"
  require "ext/array"
  require "ext/http_response"
  require "active_support/core_ext"
+ require "web_crawler/version"

  module WebCrawler
    autoload :Request, 'web_crawler/request'
data/web_crawler.gemspec CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |s|
  s.platform = Gem::Platform::RUBY
  s.authors = ["Anton Sozontov"]
  s.email = ["a.sozontov@gmail.com"]
- s.homepage = ""
+ s.homepage = "https://github.com/webgago/web_crawler"
  s.summary = %q{Web crawler help you with parse and collect data from the web}
  s.description = %q{Web crawler help you with parse and collect data from the web}

@@ -26,7 +26,7 @@ Gem::Specification.new do |s|
  s.add_dependency 'thor', '>=0.14.6'
  s.add_dependency 'mime-types', '>=1.16'
  s.add_dependency 'parallel', '>=0.5.5'
- s.add_dependency 'activesupport'
+ s.add_dependency 'activesupport', '>=3.0'

  s.add_development_dependency(%q<rspec>, [">=2.6"])
  s.add_development_dependency(%q<fakeweb>)
metadata CHANGED
@@ -1,12 +1,8 @@
  --- !ruby/object:Gem::Specification
  name: web_crawler
  version: !ruby/object:Gem::Version
- prerelease: false
- segments:
- - 0
- - 5
- - 0
- version: 0.5.0
+ prerelease:
+ version: 0.5.2
  platform: ruby
  authors:
  - Anton Sozontov
@@ -14,8 +10,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2011-06-14 00:00:00 +04:00
- default_executable:
+ date: 2011-06-24 00:00:00 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: thor
@@ -25,10 +20,6 @@ dependencies:
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- segments:
- - 0
- - 14
- - 6
  version: 0.14.6
  type: :runtime
  version_requirements: *id001
@@ -40,9 +31,6 @@ dependencies:
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- segments:
- - 1
- - 16
  version: "1.16"
  type: :runtime
  version_requirements: *id002
@@ -54,10 +42,6 @@ dependencies:
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- segments:
- - 0
- - 5
- - 5
  version: 0.5.5
  type: :runtime
  version_requirements: *id003
@@ -69,9 +53,7 @@ dependencies:
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- segments:
- - 0
- version: "0"
+ version: "3.0"
  type: :runtime
  version_requirements: *id004
  - !ruby/object:Gem::Dependency
@@ -82,9 +64,6 @@ dependencies:
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- segments:
- - 2
- - 6
  version: "2.6"
  type: :development
  version_requirements: *id005
@@ -96,8 +75,6 @@ dependencies:
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- segments:
- - 0
  version: "0"
  type: :development
  version_requirements: *id006
@@ -117,6 +94,7 @@ files:
  - README
  - Rakefile
  - bin/wcrawler
+ - gem_graph.png
  - lib/ext/array.rb
  - lib/ext/hash.rb
  - lib/ext/http_response.rb
@@ -167,8 +145,7 @@ files:
  - spec/web_crawler/view_spec.rb
  - spec/web_crawler/web_crawler_api_base_class_spec.rb
  - web_crawler.gemspec
- has_rdoc: false
- homepage: ""
+ homepage: https://github.com/webgago/web_crawler
  licenses: []

  post_install_message:
@@ -181,21 +158,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- segments:
- - 0
  version: "0"
  required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ">="
  - !ruby/object:Gem::Version
- segments:
- - 0
  version: "0"
  requirements: []

  rubyforge_project: web_crawler
- rubygems_version: 1.3.7
+ rubygems_version: 1.8.5
  signing_key:
  specification_version: 3
  summary: Web crawler help you with parse and collect data from the web