web_crawler 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -0
- data/README +22 -12
- data/bin/wcrawler +0 -1
- data/gem_graph.png +0 -0
- data/lib/web_crawler/application.rb +8 -0
- data/lib/web_crawler/version.rb +1 -1
- data/lib/web_crawler/view.rb +14 -1
- data/lib/web_crawler.rb +1 -0
- data/web_crawler.gemspec +2 -2
- metadata +7 -34
data/Gemfile
CHANGED
@@ -4,11 +4,15 @@ gem 'thor', '>=0.14.6'
|
|
4
4
|
gem 'mime-types', '>=1.16'
|
5
5
|
gem 'parallel', '>=0.5.5'
|
6
6
|
gem 'activesupport'
|
7
|
+
gem "rake", '0.8.7'
|
8
|
+
gem 'i18n'
|
9
|
+
gem 'hpricot'
|
7
10
|
|
8
11
|
# Specify your gem's dependencies in web_crawler.gemspec
|
9
12
|
gemspec
|
10
13
|
|
11
14
|
group :development, :test do
|
15
|
+
gem "rake", '0.8.7'
|
12
16
|
gem "rspec", ">=2.6"
|
13
17
|
gem "autotest"
|
14
18
|
gem "autotest-growl"
|
data/README
CHANGED
@@ -1,22 +1,32 @@
|
|
1
1
|
Web crawler help you with parse and collect data from the web
|
2
2
|
|
3
|
-
|
3
|
+
==How it works.
|
4
4
|
|
5
|
-
|
6
|
-
Its showld work like this:
|
5
|
+
class StackoverflowCrawler < WebCrawler::Base
|
7
6
|
|
8
|
-
|
7
|
+
target "http://stackoverflow.com/questions/tagged/:tag", :tag=> %w{ruby ruby-on-rails ruby-on-rails-3}
|
8
|
+
logger "path/to/log/file" # or Logger.new(...)
|
9
9
|
|
10
|
-
|
11
|
-
target "www.example.com/page2"
|
12
|
-
target %[www.example.com/contacts www.example.com/about]
|
13
|
-
target "www.example.com/category_:category/page:page/", :categories => [1,2,3,4], :page => 1..100
|
10
|
+
cache_to '/tmp/cache/stackoverflow'
|
14
11
|
|
15
|
-
|
12
|
+
context "#questions .question-summary", :jobs do
|
16
13
|
|
17
|
-
|
14
|
+
#TODO: defaults :format => lambda{ |v| v.to_i }
|
18
15
|
|
16
|
+
map '.vote-count-post strong', :to => :vote_count, :format => lambda{ |v| v.to_i }
|
17
|
+
map '.views', :to => :view_count, :format => lambda{ |v| v.match(/\d+/)[0].to_i }
|
18
|
+
map '.status strong', :to => :answer_count, :format => lambda{ |v| v.to_i }
|
19
|
+
map '.summary h3 a', :to => :title, :format => lambda{ |v| v.to_i }
|
20
|
+
map '.summary .excerpt', :to => :excerpt, :format => lambda{ |v| v.to_i }
|
21
|
+
map '.user-action-time .relativetime', :to => :posted_at, :on => [:attr, :title]
|
22
|
+
map '.tags .post-tag', :to => :tags, :format => lambda{ |v| v.to_i }
|
19
23
|
|
20
|
-
|
24
|
+
end
|
25
|
+
end
|
21
26
|
|
22
|
-
|
27
|
+
|
28
|
+
#TODO
|
29
|
+
1. Add documentation
|
30
|
+
2. ...
|
31
|
+
3. PROFIT!!!1
|
32
|
+
(:
|
data/bin/wcrawler
CHANGED
data/gem_graph.png
ADDED
Binary file
|
data/lib/web_crawler/application.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
module WebCrawler
|
2
2
|
class Application < CLI
|
3
3
|
|
4
|
+
map '-V' => 'version'
|
5
|
+
map '-v' => 'version'
|
6
|
+
|
4
7
|
desc "test", "Test task"
|
5
8
|
|
6
9
|
def test
|
@@ -72,6 +75,11 @@ module WebCrawler
|
|
72
75
|
end
|
73
76
|
end
|
74
77
|
|
78
|
+
desc '-v or -V or version', 'Show gem version'
|
79
|
+
def version
|
80
|
+
WebCrawler::VERSION::STRING
|
81
|
+
end
|
82
|
+
|
75
83
|
protected
|
76
84
|
def allow_format(*allow)
|
77
85
|
allow.flatten.select { |f| f == @options[:format] }.first
|
data/lib/web_crawler/version.rb
CHANGED
data/lib/web_crawler/view.rb
CHANGED
@@ -17,6 +17,8 @@ module WebCrawler::View
|
|
17
17
|
class Base
|
18
18
|
attr_reader :input
|
19
19
|
|
20
|
+
delegate :logger, :to => WebCrawler.logger
|
21
|
+
|
20
22
|
class << self
|
21
23
|
attr_accessor :default_options
|
22
24
|
|
@@ -52,11 +54,22 @@ module WebCrawler::View
|
|
52
54
|
@present_output = if override && override.respond_to?(:puts)
|
53
55
|
override
|
54
56
|
elsif @options['output'].is_a?(String)
|
55
|
-
|
57
|
+
output_to_file(@options['output'])
|
56
58
|
elsif @options['output'].respond_to? :puts
|
57
59
|
@options['output']
|
58
60
|
end
|
59
61
|
end
|
62
|
+
|
63
|
+
def output_to_file(filename)
|
64
|
+
path = Pathname.new(filename)
|
65
|
+
|
66
|
+
unless path.dirname.exist?
|
67
|
+
info("#{path.dirname} not exist, try to create...")
|
68
|
+
path.dirname.mkpath
|
69
|
+
end
|
70
|
+
|
71
|
+
path.open('w+')
|
72
|
+
end
|
60
73
|
end
|
61
74
|
|
62
75
|
end
|
data/lib/web_crawler.rb
CHANGED
data/web_crawler.gemspec
CHANGED
@@ -8,7 +8,7 @@ Gem::Specification.new do |s|
|
|
8
8
|
s.platform = Gem::Platform::RUBY
|
9
9
|
s.authors = ["Anton Sozontov"]
|
10
10
|
s.email = ["a.sozontov@gmail.com"]
|
11
|
-
s.homepage = ""
|
11
|
+
s.homepage = "https://github.com/webgago/web_crawler"
|
12
12
|
s.summary = %q{Web crawler help you with parse and collect data from the web}
|
13
13
|
s.description = %q{Web crawler help you with parse and collect data from the web}
|
14
14
|
|
@@ -26,7 +26,7 @@ Gem::Specification.new do |s|
|
|
26
26
|
s.add_dependency 'thor', '>=0.14.6'
|
27
27
|
s.add_dependency 'mime-types', '>=1.16'
|
28
28
|
s.add_dependency 'parallel', '>=0.5.5'
|
29
|
-
s.add_dependency 'activesupport'
|
29
|
+
s.add_dependency 'activesupport', '>=3.0'
|
30
30
|
|
31
31
|
s.add_development_dependency(%q<rspec>, [">=2.6"])
|
32
32
|
s.add_development_dependency(%q<fakeweb>)
|
metadata
CHANGED
@@ -1,12 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
prerelease:
|
5
|
-
|
6
|
-
- 0
|
7
|
-
- 5
|
8
|
-
- 0
|
9
|
-
version: 0.5.0
|
4
|
+
prerelease:
|
5
|
+
version: 0.5.2
|
10
6
|
platform: ruby
|
11
7
|
authors:
|
12
8
|
- Anton Sozontov
|
@@ -14,8 +10,7 @@ autorequire:
|
|
14
10
|
bindir: bin
|
15
11
|
cert_chain: []
|
16
12
|
|
17
|
-
date: 2011-06-
|
18
|
-
default_executable:
|
13
|
+
date: 2011-06-24 00:00:00 Z
|
19
14
|
dependencies:
|
20
15
|
- !ruby/object:Gem::Dependency
|
21
16
|
name: thor
|
@@ -25,10 +20,6 @@ dependencies:
|
|
25
20
|
requirements:
|
26
21
|
- - ">="
|
27
22
|
- !ruby/object:Gem::Version
|
28
|
-
segments:
|
29
|
-
- 0
|
30
|
-
- 14
|
31
|
-
- 6
|
32
23
|
version: 0.14.6
|
33
24
|
type: :runtime
|
34
25
|
version_requirements: *id001
|
@@ -40,9 +31,6 @@ dependencies:
|
|
40
31
|
requirements:
|
41
32
|
- - ">="
|
42
33
|
- !ruby/object:Gem::Version
|
43
|
-
segments:
|
44
|
-
- 1
|
45
|
-
- 16
|
46
34
|
version: "1.16"
|
47
35
|
type: :runtime
|
48
36
|
version_requirements: *id002
|
@@ -54,10 +42,6 @@ dependencies:
|
|
54
42
|
requirements:
|
55
43
|
- - ">="
|
56
44
|
- !ruby/object:Gem::Version
|
57
|
-
segments:
|
58
|
-
- 0
|
59
|
-
- 5
|
60
|
-
- 5
|
61
45
|
version: 0.5.5
|
62
46
|
type: :runtime
|
63
47
|
version_requirements: *id003
|
@@ -69,9 +53,7 @@ dependencies:
|
|
69
53
|
requirements:
|
70
54
|
- - ">="
|
71
55
|
- !ruby/object:Gem::Version
|
72
|
-
|
73
|
-
- 0
|
74
|
-
version: "0"
|
56
|
+
version: "3.0"
|
75
57
|
type: :runtime
|
76
58
|
version_requirements: *id004
|
77
59
|
- !ruby/object:Gem::Dependency
|
@@ -82,9 +64,6 @@ dependencies:
|
|
82
64
|
requirements:
|
83
65
|
- - ">="
|
84
66
|
- !ruby/object:Gem::Version
|
85
|
-
segments:
|
86
|
-
- 2
|
87
|
-
- 6
|
88
67
|
version: "2.6"
|
89
68
|
type: :development
|
90
69
|
version_requirements: *id005
|
@@ -96,8 +75,6 @@ dependencies:
|
|
96
75
|
requirements:
|
97
76
|
- - ">="
|
98
77
|
- !ruby/object:Gem::Version
|
99
|
-
segments:
|
100
|
-
- 0
|
101
78
|
version: "0"
|
102
79
|
type: :development
|
103
80
|
version_requirements: *id006
|
@@ -117,6 +94,7 @@ files:
|
|
117
94
|
- README
|
118
95
|
- Rakefile
|
119
96
|
- bin/wcrawler
|
97
|
+
- gem_graph.png
|
120
98
|
- lib/ext/array.rb
|
121
99
|
- lib/ext/hash.rb
|
122
100
|
- lib/ext/http_response.rb
|
@@ -167,8 +145,7 @@ files:
|
|
167
145
|
- spec/web_crawler/view_spec.rb
|
168
146
|
- spec/web_crawler/web_crawler_api_base_class_spec.rb
|
169
147
|
- web_crawler.gemspec
|
170
|
-
|
171
|
-
homepage: ""
|
148
|
+
homepage: https://github.com/webgago/web_crawler
|
172
149
|
licenses: []
|
173
150
|
|
174
151
|
post_install_message:
|
@@ -181,21 +158,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
181
158
|
requirements:
|
182
159
|
- - ">="
|
183
160
|
- !ruby/object:Gem::Version
|
184
|
-
segments:
|
185
|
-
- 0
|
186
161
|
version: "0"
|
187
162
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
188
163
|
none: false
|
189
164
|
requirements:
|
190
165
|
- - ">="
|
191
166
|
- !ruby/object:Gem::Version
|
192
|
-
segments:
|
193
|
-
- 0
|
194
167
|
version: "0"
|
195
168
|
requirements: []
|
196
169
|
|
197
170
|
rubyforge_project: web_crawler
|
198
|
-
rubygems_version: 1.
|
171
|
+
rubygems_version: 1.8.5
|
199
172
|
signing_key:
|
200
173
|
specification_version: 3
|
201
174
|
summary: Web crawler help you with parse and collect data from the web
|