web_crawler 0.5.0 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -0
- data/README +22 -12
- data/bin/wcrawler +0 -1
- data/gem_graph.png +0 -0
- data/lib/web_crawler/application.rb +8 -0
- data/lib/web_crawler/version.rb +1 -1
- data/lib/web_crawler/view.rb +14 -1
- data/lib/web_crawler.rb +1 -0
- data/web_crawler.gemspec +2 -2
- metadata +7 -34
data/Gemfile
CHANGED
@@ -4,11 +4,15 @@ gem 'thor', '>=0.14.6'
|
|
4
4
|
gem 'mime-types', '>=1.16'
|
5
5
|
gem 'parallel', '>=0.5.5'
|
6
6
|
gem 'activesupport'
|
7
|
+
gem "rake", '0.8.7'
|
8
|
+
gem 'i18n'
|
9
|
+
gem 'hpricot'
|
7
10
|
|
8
11
|
# Specify your gem's dependencies in web_crawler.gemspec
|
9
12
|
gemspec
|
10
13
|
|
11
14
|
group :development, :test do
|
15
|
+
gem "rake", '0.8.7'
|
12
16
|
gem "rspec", ">=2.6"
|
13
17
|
gem "autotest"
|
14
18
|
gem "autotest-growl"
|
data/README
CHANGED
@@ -1,22 +1,32 @@
|
|
1
1
|
Web crawler help you with parse and collect data from the web
|
2
2
|
|
3
|
-
|
3
|
+
==How it works.
|
4
4
|
|
5
|
-
|
6
|
-
Its showld work like this:
|
5
|
+
class StackoverflowCrawler < WebCrawler::Base
|
7
6
|
|
8
|
-
|
7
|
+
target "http://stackoverflow.com/questions/tagged/:tag", :tag=> %w{ruby ruby-on-rails ruby-on-rails-3}
|
8
|
+
logger "path/to/log/file" # or Logger.new(...)
|
9
9
|
|
10
|
-
|
11
|
-
target "www.example.com/page2"
|
12
|
-
target %[www.example.com/contacts www.example.com/about]
|
13
|
-
target "www.example.com/category_:category/page:page/", :categories => [1,2,3,4], :page => 1..100
|
10
|
+
cache_to '/tmp/cache/stackoverflow'
|
14
11
|
|
15
|
-
|
12
|
+
context "#questions .question-summary", :jobs do
|
16
13
|
|
17
|
-
|
14
|
+
#TODO: defaults :format => lambda{ |v| v.to_i }
|
18
15
|
|
16
|
+
map '.vote-count-post strong', :to => :vote_count, :format => lambda{ |v| v.to_i }
|
17
|
+
map '.views', :to => :view_count, :format => lambda{ |v| v.match(/\d+/)[0].to_i }
|
18
|
+
map '.status strong', :to => :answer_count, :format => lambda{ |v| v.to_i }
|
19
|
+
map '.summary h3 a', :to => :title, :format => lambda{ |v| v.to_i }
|
20
|
+
map '.summary .excerpt', :to => :excerpt, :format => lambda{ |v| v.to_i }
|
21
|
+
map '.user-action-time .relativetime', :to => :posted_at, :on => [:attr, :title]
|
22
|
+
map '.tags .post-tag', :to => :tags, :format => lambda{ |v| v.to_i }
|
19
23
|
|
20
|
-
|
24
|
+
end
|
25
|
+
end
|
21
26
|
|
22
|
-
|
27
|
+
|
28
|
+
#TODO
|
29
|
+
1. Add documentation
|
30
|
+
2. ...
|
31
|
+
3. PROFIT!!!1
|
32
|
+
(:
|
data/bin/wcrawler
CHANGED
data/gem_graph.png
ADDED
Binary file
|
@@ -1,6 +1,9 @@
|
|
1
1
|
module WebCrawler
|
2
2
|
class Application < CLI
|
3
3
|
|
4
|
+
map '-V' => 'version'
|
5
|
+
map '-v' => 'version'
|
6
|
+
|
4
7
|
desc "test", "Test task"
|
5
8
|
|
6
9
|
def test
|
@@ -72,6 +75,11 @@ module WebCrawler
|
|
72
75
|
end
|
73
76
|
end
|
74
77
|
|
78
|
+
desc '-v or -V or version', 'Show gem version'
|
79
|
+
def version
|
80
|
+
WebCrawler::VERSION::STRING
|
81
|
+
end
|
82
|
+
|
75
83
|
protected
|
76
84
|
def allow_format(*allow)
|
77
85
|
allow.flatten.select { |f| f == @options[:format] }.first
|
data/lib/web_crawler/version.rb
CHANGED
data/lib/web_crawler/view.rb
CHANGED
@@ -17,6 +17,8 @@ module WebCrawler::View
|
|
17
17
|
class Base
|
18
18
|
attr_reader :input
|
19
19
|
|
20
|
+
delegate :logger, :to => WebCrawler.logger
|
21
|
+
|
20
22
|
class << self
|
21
23
|
attr_accessor :default_options
|
22
24
|
|
@@ -52,11 +54,22 @@ module WebCrawler::View
|
|
52
54
|
@present_output = if override && override.respond_to?(:puts)
|
53
55
|
override
|
54
56
|
elsif @options['output'].is_a?(String)
|
55
|
-
|
57
|
+
output_to_file(@options['output'])
|
56
58
|
elsif @options['output'].respond_to? :puts
|
57
59
|
@options['output']
|
58
60
|
end
|
59
61
|
end
|
62
|
+
|
63
|
+
def output_to_file(filename)
|
64
|
+
path = Pathname.new(filename)
|
65
|
+
|
66
|
+
unless path.dirname.exist?
|
67
|
+
info("#{path.dirname} not exist, try to create...")
|
68
|
+
path.dirname.mkpath
|
69
|
+
end
|
70
|
+
|
71
|
+
path.open('w+')
|
72
|
+
end
|
60
73
|
end
|
61
74
|
|
62
75
|
end
|
data/lib/web_crawler.rb
CHANGED
data/web_crawler.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |s|
|
|
8
8
|
s.platform = Gem::Platform::RUBY
|
9
9
|
s.authors = ["Anton Sozontov"]
|
10
10
|
s.email = ["a.sozontov@gmail.com"]
|
11
|
-
s.homepage = ""
|
11
|
+
s.homepage = "https://github.com/webgago/web_crawler"
|
12
12
|
s.summary = %q{Web crawler help you with parse and collect data from the web}
|
13
13
|
s.description = %q{Web crawler help you with parse and collect data from the web}
|
14
14
|
|
@@ -26,7 +26,7 @@ Gem::Specification.new do |s|
|
|
26
26
|
s.add_dependency 'thor', '>=0.14.6'
|
27
27
|
s.add_dependency 'mime-types', '>=1.16'
|
28
28
|
s.add_dependency 'parallel', '>=0.5.5'
|
29
|
-
s.add_dependency 'activesupport'
|
29
|
+
s.add_dependency 'activesupport', '>=3.0'
|
30
30
|
|
31
31
|
s.add_development_dependency(%q<rspec>, [">=2.6"])
|
32
32
|
s.add_development_dependency(%q<fakeweb>)
|
metadata
CHANGED
@@ -1,12 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
prerelease:
|
5
|
-
|
6
|
-
- 0
|
7
|
-
- 5
|
8
|
-
- 0
|
9
|
-
version: 0.5.0
|
4
|
+
prerelease:
|
5
|
+
version: 0.5.2
|
10
6
|
platform: ruby
|
11
7
|
authors:
|
12
8
|
- Anton Sozontov
|
@@ -14,8 +10,7 @@ autorequire:
|
|
14
10
|
bindir: bin
|
15
11
|
cert_chain: []
|
16
12
|
|
17
|
-
date: 2011-06-
|
18
|
-
default_executable:
|
13
|
+
date: 2011-06-24 00:00:00 Z
|
19
14
|
dependencies:
|
20
15
|
- !ruby/object:Gem::Dependency
|
21
16
|
name: thor
|
@@ -25,10 +20,6 @@ dependencies:
|
|
25
20
|
requirements:
|
26
21
|
- - ">="
|
27
22
|
- !ruby/object:Gem::Version
|
28
|
-
segments:
|
29
|
-
- 0
|
30
|
-
- 14
|
31
|
-
- 6
|
32
23
|
version: 0.14.6
|
33
24
|
type: :runtime
|
34
25
|
version_requirements: *id001
|
@@ -40,9 +31,6 @@ dependencies:
|
|
40
31
|
requirements:
|
41
32
|
- - ">="
|
42
33
|
- !ruby/object:Gem::Version
|
43
|
-
segments:
|
44
|
-
- 1
|
45
|
-
- 16
|
46
34
|
version: "1.16"
|
47
35
|
type: :runtime
|
48
36
|
version_requirements: *id002
|
@@ -54,10 +42,6 @@ dependencies:
|
|
54
42
|
requirements:
|
55
43
|
- - ">="
|
56
44
|
- !ruby/object:Gem::Version
|
57
|
-
segments:
|
58
|
-
- 0
|
59
|
-
- 5
|
60
|
-
- 5
|
61
45
|
version: 0.5.5
|
62
46
|
type: :runtime
|
63
47
|
version_requirements: *id003
|
@@ -69,9 +53,7 @@ dependencies:
|
|
69
53
|
requirements:
|
70
54
|
- - ">="
|
71
55
|
- !ruby/object:Gem::Version
|
72
|
-
|
73
|
-
- 0
|
74
|
-
version: "0"
|
56
|
+
version: "3.0"
|
75
57
|
type: :runtime
|
76
58
|
version_requirements: *id004
|
77
59
|
- !ruby/object:Gem::Dependency
|
@@ -82,9 +64,6 @@ dependencies:
|
|
82
64
|
requirements:
|
83
65
|
- - ">="
|
84
66
|
- !ruby/object:Gem::Version
|
85
|
-
segments:
|
86
|
-
- 2
|
87
|
-
- 6
|
88
67
|
version: "2.6"
|
89
68
|
type: :development
|
90
69
|
version_requirements: *id005
|
@@ -96,8 +75,6 @@ dependencies:
|
|
96
75
|
requirements:
|
97
76
|
- - ">="
|
98
77
|
- !ruby/object:Gem::Version
|
99
|
-
segments:
|
100
|
-
- 0
|
101
78
|
version: "0"
|
102
79
|
type: :development
|
103
80
|
version_requirements: *id006
|
@@ -117,6 +94,7 @@ files:
|
|
117
94
|
- README
|
118
95
|
- Rakefile
|
119
96
|
- bin/wcrawler
|
97
|
+
- gem_graph.png
|
120
98
|
- lib/ext/array.rb
|
121
99
|
- lib/ext/hash.rb
|
122
100
|
- lib/ext/http_response.rb
|
@@ -167,8 +145,7 @@ files:
|
|
167
145
|
- spec/web_crawler/view_spec.rb
|
168
146
|
- spec/web_crawler/web_crawler_api_base_class_spec.rb
|
169
147
|
- web_crawler.gemspec
|
170
|
-
|
171
|
-
homepage: ""
|
148
|
+
homepage: https://github.com/webgago/web_crawler
|
172
149
|
licenses: []
|
173
150
|
|
174
151
|
post_install_message:
|
@@ -181,21 +158,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
181
158
|
requirements:
|
182
159
|
- - ">="
|
183
160
|
- !ruby/object:Gem::Version
|
184
|
-
segments:
|
185
|
-
- 0
|
186
161
|
version: "0"
|
187
162
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
188
163
|
none: false
|
189
164
|
requirements:
|
190
165
|
- - ">="
|
191
166
|
- !ruby/object:Gem::Version
|
192
|
-
segments:
|
193
|
-
- 0
|
194
167
|
version: "0"
|
195
168
|
requirements: []
|
196
169
|
|
197
170
|
rubyforge_project: web_crawler
|
198
|
-
rubygems_version: 1.
|
171
|
+
rubygems_version: 1.8.5
|
199
172
|
signing_key:
|
200
173
|
specification_version: 3
|
201
174
|
summary: Web crawler help you with parse and collect data from the web
|