daimon_skycrawlers 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -2
- data/Gemfile +1 -1
- data/README.md +30 -12
- data/Rakefile +3 -8
- data/bin/daimon-skycrawlers +3 -3
- data/daimon_skycrawlers.gemspec +4 -3
- data/lib/daimon_skycrawlers/cli.rb +3 -3
- data/lib/daimon_skycrawlers/config.rb +8 -0
- data/lib/daimon_skycrawlers/consumer/base.rb +16 -0
- data/lib/daimon_skycrawlers/consumer/http_response.rb +47 -0
- data/lib/daimon_skycrawlers/consumer/url.rb +44 -0
- data/lib/daimon_skycrawlers/consumer.rb +4 -0
- data/lib/daimon_skycrawlers/crawler/base.rb +75 -0
- data/lib/daimon_skycrawlers/crawler/default.rb +33 -0
- data/lib/daimon_skycrawlers/crawler.rb +18 -76
- data/lib/daimon_skycrawlers/filter/base.rb +24 -0
- data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +23 -0
- data/lib/daimon_skycrawlers/filter/update_checker.rb +31 -0
- data/lib/daimon_skycrawlers/filter.rb +4 -0
- data/lib/daimon_skycrawlers/generator/new.rb +3 -2
- data/lib/daimon_skycrawlers/generator/templates/new/Gemfile +3 -3
- data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +22 -6
- data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb +5 -5
- data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +6 -6
- data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb +17 -8
- data/lib/daimon_skycrawlers/generator/templates/new/init.rb +20 -0
- data/lib/daimon_skycrawlers/generator/templates/new/processor.rb +2 -0
- data/lib/daimon_skycrawlers/logger.rb +32 -0
- data/lib/daimon_skycrawlers/processor/base.rb +19 -0
- data/lib/daimon_skycrawlers/processor/default.rb +12 -9
- data/lib/daimon_skycrawlers/processor/spider.rb +77 -0
- data/lib/daimon_skycrawlers/processor.rb +23 -9
- data/lib/daimon_skycrawlers/queue.rb +24 -0
- data/lib/daimon_skycrawlers/storage/base.rb +6 -0
- data/lib/daimon_skycrawlers/timer.rb +24 -0
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/lib/daimon_skycrawlers.rb +24 -4
- data/sample/spider/Gemfile +4 -0
- data/sample/spider/README.md +50 -0
- data/sample/spider/Rakefile +1 -0
- data/sample/spider/config/database.yml +26 -0
- data/sample/spider/crawler.rb +14 -0
- data/sample/spider/db/migrate/20160830155803_create_pages.rb +13 -0
- data/sample/spider/db/schema.rb +28 -0
- data/sample/spider/enqueue.rb +24 -0
- data/sample/spider/init.rb +22 -0
- data/sample/spider/processor.rb +34 -0
- metadata +47 -12
- data/lib/daimon_skycrawlers/configure_songkick_queue.rb +0 -12
- data/lib/daimon_skycrawlers/http_response_consumer.rb +0 -38
- data/lib/daimon_skycrawlers/parser/base.rb +0 -13
- data/lib/daimon_skycrawlers/parser/default.rb +0 -50
- data/lib/daimon_skycrawlers/parser.rb +0 -7
- data/lib/daimon_skycrawlers/url_consumer.rb +0 -32
@@ -0,0 +1,34 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "daimon_skycrawlers/processor"
|
4
|
+
require "daimon_skycrawlers/processor/spider"
|
5
|
+
require "daimon_skycrawlers/filter"
|
6
|
+
require "daimon_skycrawlers/filter/duplicate_checker"
|
7
|
+
require "daimon_skycrawlers/filter/update_checker"
|
8
|
+
|
9
|
+
require_relative "./init"
|
10
|
+
|
11
|
+
default_processor = DaimonSkycrawlers::Processor::Default.new
|
12
|
+
spider = DaimonSkycrawlers::Processor::Spider.new
|
13
|
+
#spider.enqueue = false
|
14
|
+
spider.append_filter do |url|
|
15
|
+
uri = URI(url)
|
16
|
+
uri.host.nil? || uri.host == "www.clear-code.com"
|
17
|
+
end
|
18
|
+
spider.append_filter do |url|
|
19
|
+
case url
|
20
|
+
when %r!\A(\.\./|/|#)!
|
21
|
+
false
|
22
|
+
else
|
23
|
+
true
|
24
|
+
end
|
25
|
+
end
|
26
|
+
duplicate_checker = DaimonSkycrawlers::Filter::DuplicateChecker.new(base_url: "http://www.clear-code.com/blog/")
|
27
|
+
spider.append_filter(duplicate_checker)
|
28
|
+
update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(base_url: "http://www.clear-code.com/blog/")
|
29
|
+
spider.append_filter(update_checker)
|
30
|
+
|
31
|
+
DaimonSkycrawlers.register_processor(default_processor)
|
32
|
+
DaimonSkycrawlers.register_processor(spider)
|
33
|
+
|
34
|
+
DaimonSkycrawlers::Processor.run
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: daimon_skycrawlers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryunosuke SATO
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-09-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -108,6 +108,20 @@ dependencies:
|
|
108
108
|
- - ">="
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: timers
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
111
125
|
- !ruby/object:Gem::Dependency
|
112
126
|
name: bundler
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -179,7 +193,7 @@ dependencies:
|
|
179
193
|
- !ruby/object:Gem::Version
|
180
194
|
version: '0'
|
181
195
|
- !ruby/object:Gem::Dependency
|
182
|
-
name:
|
196
|
+
name: pry
|
183
197
|
requirement: !ruby/object:Gem::Requirement
|
184
198
|
requirements:
|
185
199
|
- - ">="
|
@@ -193,7 +207,7 @@ dependencies:
|
|
193
207
|
- !ruby/object:Gem::Version
|
194
208
|
version: '0'
|
195
209
|
- !ruby/object:Gem::Dependency
|
196
|
-
name:
|
210
|
+
name: tapp
|
197
211
|
requirement: !ruby/object:Gem::Requirement
|
198
212
|
requirements:
|
199
213
|
- - ">="
|
@@ -207,7 +221,7 @@ dependencies:
|
|
207
221
|
- !ruby/object:Gem::Version
|
208
222
|
version: '0'
|
209
223
|
- !ruby/object:Gem::Dependency
|
210
|
-
name:
|
224
|
+
name: sqlite3
|
211
225
|
requirement: !ruby/object:Gem::Requirement
|
212
226
|
requirements:
|
213
227
|
- - ">="
|
@@ -239,8 +253,18 @@ files:
|
|
239
253
|
- db/schema.rb
|
240
254
|
- lib/daimon_skycrawlers.rb
|
241
255
|
- lib/daimon_skycrawlers/cli.rb
|
242
|
-
- lib/daimon_skycrawlers/configure_songkick_queue.rb
|
256
|
+
- lib/daimon_skycrawlers/config.rb
|
257
|
+
- lib/daimon_skycrawlers/consumer.rb
|
258
|
+
- lib/daimon_skycrawlers/consumer/base.rb
|
259
|
+
- lib/daimon_skycrawlers/consumer/http_response.rb
|
260
|
+
- lib/daimon_skycrawlers/consumer/url.rb
|
243
261
|
- lib/daimon_skycrawlers/crawler.rb
|
262
|
+
- lib/daimon_skycrawlers/crawler/base.rb
|
263
|
+
- lib/daimon_skycrawlers/crawler/default.rb
|
264
|
+
- lib/daimon_skycrawlers/filter.rb
|
265
|
+
- lib/daimon_skycrawlers/filter/base.rb
|
266
|
+
- lib/daimon_skycrawlers/filter/duplicate_checker.rb
|
267
|
+
- lib/daimon_skycrawlers/filter/update_checker.rb
|
244
268
|
- lib/daimon_skycrawlers/generator/new.rb
|
245
269
|
- lib/daimon_skycrawlers/generator/templates/new/Gemfile
|
246
270
|
- lib/daimon_skycrawlers/generator/templates/new/README.md.erb
|
@@ -249,21 +273,32 @@ files:
|
|
249
273
|
- lib/daimon_skycrawlers/generator/templates/new/crawler.rb
|
250
274
|
- lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb
|
251
275
|
- lib/daimon_skycrawlers/generator/templates/new/enqueue.rb
|
276
|
+
- lib/daimon_skycrawlers/generator/templates/new/init.rb
|
252
277
|
- lib/daimon_skycrawlers/generator/templates/new/processor.rb
|
253
|
-
- lib/daimon_skycrawlers/http_response_consumer.rb
|
254
|
-
- lib/daimon_skycrawlers/parser.rb
|
255
|
-
- lib/daimon_skycrawlers/parser/base.rb
|
256
|
-
- lib/daimon_skycrawlers/parser/default.rb
|
278
|
+
- lib/daimon_skycrawlers/logger.rb
|
257
279
|
- lib/daimon_skycrawlers/processor.rb
|
280
|
+
- lib/daimon_skycrawlers/processor/base.rb
|
258
281
|
- lib/daimon_skycrawlers/processor/default.rb
|
282
|
+
- lib/daimon_skycrawlers/processor/spider.rb
|
283
|
+
- lib/daimon_skycrawlers/queue.rb
|
259
284
|
- lib/daimon_skycrawlers/storage.rb
|
260
285
|
- lib/daimon_skycrawlers/storage/base.rb
|
261
286
|
- lib/daimon_skycrawlers/storage/null.rb
|
262
287
|
- lib/daimon_skycrawlers/storage/rdb.rb
|
263
288
|
- lib/daimon_skycrawlers/tasks.rb
|
264
289
|
- lib/daimon_skycrawlers/tasks/database_tasks.rake
|
265
|
-
- lib/daimon_skycrawlers/url_consumer.rb
|
290
|
+
- lib/daimon_skycrawlers/timer.rb
|
266
291
|
- lib/daimon_skycrawlers/version.rb
|
292
|
+
- sample/spider/Gemfile
|
293
|
+
- sample/spider/README.md
|
294
|
+
- sample/spider/Rakefile
|
295
|
+
- sample/spider/config/database.yml
|
296
|
+
- sample/spider/crawler.rb
|
297
|
+
- sample/spider/db/migrate/20160830155803_create_pages.rb
|
298
|
+
- sample/spider/db/schema.rb
|
299
|
+
- sample/spider/enqueue.rb
|
300
|
+
- sample/spider/init.rb
|
301
|
+
- sample/spider/processor.rb
|
267
302
|
homepage: https://github.com/bm-sms/daimon-skycrawlers
|
268
303
|
licenses:
|
269
304
|
- MIT
|
@@ -284,7 +319,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
284
319
|
version: '0'
|
285
320
|
requirements: []
|
286
321
|
rubyforge_project:
|
287
|
-
rubygems_version: 2.
|
322
|
+
rubygems_version: 2.6.4
|
288
323
|
signing_key:
|
289
324
|
specification_version: 4
|
290
325
|
summary: This is a crawler framework.
|
@@ -1,12 +0,0 @@
|
|
1
|
-
require 'songkick_queue'
|
2
|
-
# TODO Allow to configure from user land
|
3
|
-
SongkickQueue.configure do |config|
|
4
|
-
config.logger = Logger.new(STDOUT)
|
5
|
-
config.host = '127.0.0.1'
|
6
|
-
config.port = 5672
|
7
|
-
# config.username = 'guest'
|
8
|
-
# config.password = 'guest'
|
9
|
-
config.vhost = '/'
|
10
|
-
config.max_reconnect_attempts = 10
|
11
|
-
config.network_recovery_interval = 1.0
|
12
|
-
end
|
@@ -1,38 +0,0 @@
|
|
1
|
-
require "daimon_skycrawlers/processor/default"
|
2
|
-
|
3
|
-
module DaimonSkycrawlers
|
4
|
-
class HTTPResponseConsumer
|
5
|
-
include SongkickQueue::Consumer
|
6
|
-
|
7
|
-
consume_from_queue 'daimon-skycrawler.http-response'
|
8
|
-
|
9
|
-
class << self
|
10
|
-
def register(processor = nil, &block)
|
11
|
-
if block_given?
|
12
|
-
processors << block
|
13
|
-
else
|
14
|
-
processors << processor
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
def processors
|
19
|
-
@processors ||= []
|
20
|
-
end
|
21
|
-
|
22
|
-
def default_processor
|
23
|
-
DaimonSkycrawlers::Processor::Default.new
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
def process(message)
|
28
|
-
if self.class.processors.empty?
|
29
|
-
processors = [self.class.default_processor]
|
30
|
-
else
|
31
|
-
processors = self.class.processors
|
32
|
-
end
|
33
|
-
processors.each do |processor|
|
34
|
-
processor.call(message)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
@@ -1,50 +0,0 @@
|
|
1
|
-
require "nokogiri"
|
2
|
-
|
3
|
-
module DaimonSkycrawlers
|
4
|
-
module Parser
|
5
|
-
class Default < Base
|
6
|
-
def initialize
|
7
|
-
@filters = []
|
8
|
-
end
|
9
|
-
|
10
|
-
def append_filter(filter = nil, &block)
|
11
|
-
if block_given?
|
12
|
-
@filters << block
|
13
|
-
else
|
14
|
-
@filters << filter
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
def parse(html)
|
19
|
-
@html = html
|
20
|
-
@doc = Nokogiri::HTML(html)
|
21
|
-
end
|
22
|
-
|
23
|
-
def links
|
24
|
-
return @links if @links
|
25
|
-
@links = retrieve_links
|
26
|
-
@links
|
27
|
-
end
|
28
|
-
|
29
|
-
private
|
30
|
-
|
31
|
-
def retrieve_links
|
32
|
-
urls = @doc.search("a").map do |element|
|
33
|
-
element["href"]
|
34
|
-
end
|
35
|
-
apply_filters(urls) || []
|
36
|
-
end
|
37
|
-
|
38
|
-
def apply_filters(urls)
|
39
|
-
return if urls.nil?
|
40
|
-
return if urls.empty?
|
41
|
-
@filters.each do |filter|
|
42
|
-
urls = urls.select do |url|
|
43
|
-
filter.call(url)
|
44
|
-
end
|
45
|
-
end
|
46
|
-
urls
|
47
|
-
end
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
@@ -1,32 +0,0 @@
|
|
1
|
-
require 'daimon_skycrawlers/crawler'
|
2
|
-
require 'daimon_skycrawlers/processor'
|
3
|
-
|
4
|
-
module DaimonSkycrawlers
|
5
|
-
class URLConsumer
|
6
|
-
include SongkickQueue::Consumer
|
7
|
-
|
8
|
-
consume_from_queue 'daimon-skycrawler.url'
|
9
|
-
|
10
|
-
class << self
|
11
|
-
def register(crawler)
|
12
|
-
crawlers << crawler
|
13
|
-
end
|
14
|
-
|
15
|
-
def crawlers
|
16
|
-
@crawlers ||= []
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
def process(message)
|
21
|
-
url = message[:url]
|
22
|
-
depth = message[:depth]
|
23
|
-
interval = message[:interval]
|
24
|
-
|
25
|
-
# XXX When several crawlers are registered, how should they behave?
|
26
|
-
self.class.crawlers.each do |crawler|
|
27
|
-
sleep(interval)
|
28
|
-
crawler.fetch(url, depth)
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|