daimon_skycrawlers 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -2
- data/Gemfile +1 -1
- data/README.md +30 -12
- data/Rakefile +3 -8
- data/bin/daimon-skycrawlers +3 -3
- data/daimon_skycrawlers.gemspec +4 -3
- data/lib/daimon_skycrawlers/cli.rb +3 -3
- data/lib/daimon_skycrawlers/config.rb +8 -0
- data/lib/daimon_skycrawlers/consumer/base.rb +16 -0
- data/lib/daimon_skycrawlers/consumer/http_response.rb +47 -0
- data/lib/daimon_skycrawlers/consumer/url.rb +44 -0
- data/lib/daimon_skycrawlers/consumer.rb +4 -0
- data/lib/daimon_skycrawlers/crawler/base.rb +75 -0
- data/lib/daimon_skycrawlers/crawler/default.rb +33 -0
- data/lib/daimon_skycrawlers/crawler.rb +18 -76
- data/lib/daimon_skycrawlers/filter/base.rb +24 -0
- data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +23 -0
- data/lib/daimon_skycrawlers/filter/update_checker.rb +31 -0
- data/lib/daimon_skycrawlers/filter.rb +4 -0
- data/lib/daimon_skycrawlers/generator/new.rb +3 -2
- data/lib/daimon_skycrawlers/generator/templates/new/Gemfile +3 -3
- data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +22 -6
- data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb +5 -5
- data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +6 -6
- data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb +17 -8
- data/lib/daimon_skycrawlers/generator/templates/new/init.rb +20 -0
- data/lib/daimon_skycrawlers/generator/templates/new/processor.rb +2 -0
- data/lib/daimon_skycrawlers/logger.rb +32 -0
- data/lib/daimon_skycrawlers/processor/base.rb +19 -0
- data/lib/daimon_skycrawlers/processor/default.rb +12 -9
- data/lib/daimon_skycrawlers/processor/spider.rb +77 -0
- data/lib/daimon_skycrawlers/processor.rb +23 -9
- data/lib/daimon_skycrawlers/queue.rb +24 -0
- data/lib/daimon_skycrawlers/storage/base.rb +6 -0
- data/lib/daimon_skycrawlers/timer.rb +24 -0
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/lib/daimon_skycrawlers.rb +24 -4
- data/sample/spider/Gemfile +4 -0
- data/sample/spider/README.md +50 -0
- data/sample/spider/Rakefile +1 -0
- data/sample/spider/config/database.yml +26 -0
- data/sample/spider/crawler.rb +14 -0
- data/sample/spider/db/migrate/20160830155803_create_pages.rb +13 -0
- data/sample/spider/db/schema.rb +28 -0
- data/sample/spider/enqueue.rb +24 -0
- data/sample/spider/init.rb +22 -0
- data/sample/spider/processor.rb +34 -0
- metadata +47 -12
- data/lib/daimon_skycrawlers/configure_songkick_queue.rb +0 -12
- data/lib/daimon_skycrawlers/http_response_consumer.rb +0 -38
- data/lib/daimon_skycrawlers/parser/base.rb +0 -13
- data/lib/daimon_skycrawlers/parser/default.rb +0 -50
- data/lib/daimon_skycrawlers/parser.rb +0 -7
- data/lib/daimon_skycrawlers/url_consumer.rb +0 -32
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require "daimon_skycrawlers/processor"
|
|
4
|
+
require "daimon_skycrawlers/processor/spider"
|
|
5
|
+
require "daimon_skycrawlers/filter"
|
|
6
|
+
require "daimon_skycrawlers/filter/duplicate_checker"
|
|
7
|
+
require "daimon_skycrawlers/filter/update_checker"
|
|
8
|
+
|
|
9
|
+
require_relative "./init"
|
|
10
|
+
|
|
11
|
+
default_processor = DaimonSkycrawlers::Processor::Default.new
|
|
12
|
+
spider = DaimonSkycrawlers::Processor::Spider.new
|
|
13
|
+
#spider.enqueue = false
|
|
14
|
+
spider.append_filter do |url|
|
|
15
|
+
uri = URI(url)
|
|
16
|
+
uri.host.nil? || uri.host == "www.clear-code.com"
|
|
17
|
+
end
|
|
18
|
+
spider.append_filter do |url|
|
|
19
|
+
case url
|
|
20
|
+
when %r!\A(\.\./|/|#)!
|
|
21
|
+
false
|
|
22
|
+
else
|
|
23
|
+
true
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
duplicate_checker = DaimonSkycrawlers::Filter::DuplicateChecker.new(base_url: "http://www.clear-code.com/blog/")
|
|
27
|
+
spider.append_filter(duplicate_checker)
|
|
28
|
+
update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(base_url: "http://www.clear-code.com/blog/")
|
|
29
|
+
spider.append_filter(update_checker)
|
|
30
|
+
|
|
31
|
+
DaimonSkycrawlers.register_processor(default_processor)
|
|
32
|
+
DaimonSkycrawlers.register_processor(spider)
|
|
33
|
+
|
|
34
|
+
DaimonSkycrawlers::Processor.run
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: daimon_skycrawlers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ryunosuke SATO
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2016-
|
|
11
|
+
date: 2016-09-15 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: thor
|
|
@@ -108,6 +108,20 @@ dependencies:
|
|
|
108
108
|
- - ">="
|
|
109
109
|
- !ruby/object:Gem::Version
|
|
110
110
|
version: '0'
|
|
111
|
+
- !ruby/object:Gem::Dependency
|
|
112
|
+
name: timers
|
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
|
114
|
+
requirements:
|
|
115
|
+
- - ">="
|
|
116
|
+
- !ruby/object:Gem::Version
|
|
117
|
+
version: '0'
|
|
118
|
+
type: :runtime
|
|
119
|
+
prerelease: false
|
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
121
|
+
requirements:
|
|
122
|
+
- - ">="
|
|
123
|
+
- !ruby/object:Gem::Version
|
|
124
|
+
version: '0'
|
|
111
125
|
- !ruby/object:Gem::Dependency
|
|
112
126
|
name: bundler
|
|
113
127
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -179,7 +193,7 @@ dependencies:
|
|
|
179
193
|
- !ruby/object:Gem::Version
|
|
180
194
|
version: '0'
|
|
181
195
|
- !ruby/object:Gem::Dependency
|
|
182
|
-
name:
|
|
196
|
+
name: pry
|
|
183
197
|
requirement: !ruby/object:Gem::Requirement
|
|
184
198
|
requirements:
|
|
185
199
|
- - ">="
|
|
@@ -193,7 +207,7 @@ dependencies:
|
|
|
193
207
|
- !ruby/object:Gem::Version
|
|
194
208
|
version: '0'
|
|
195
209
|
- !ruby/object:Gem::Dependency
|
|
196
|
-
name:
|
|
210
|
+
name: tapp
|
|
197
211
|
requirement: !ruby/object:Gem::Requirement
|
|
198
212
|
requirements:
|
|
199
213
|
- - ">="
|
|
@@ -207,7 +221,7 @@ dependencies:
|
|
|
207
221
|
- !ruby/object:Gem::Version
|
|
208
222
|
version: '0'
|
|
209
223
|
- !ruby/object:Gem::Dependency
|
|
210
|
-
name:
|
|
224
|
+
name: sqlite3
|
|
211
225
|
requirement: !ruby/object:Gem::Requirement
|
|
212
226
|
requirements:
|
|
213
227
|
- - ">="
|
|
@@ -239,8 +253,18 @@ files:
|
|
|
239
253
|
- db/schema.rb
|
|
240
254
|
- lib/daimon_skycrawlers.rb
|
|
241
255
|
- lib/daimon_skycrawlers/cli.rb
|
|
242
|
-
- lib/daimon_skycrawlers/configure_songkick_queue.rb
|
|
256
|
+
- lib/daimon_skycrawlers/config.rb
|
|
257
|
+
- lib/daimon_skycrawlers/consumer.rb
|
|
258
|
+
- lib/daimon_skycrawlers/consumer/base.rb
|
|
259
|
+
- lib/daimon_skycrawlers/consumer/http_response.rb
|
|
260
|
+
- lib/daimon_skycrawlers/consumer/url.rb
|
|
243
261
|
- lib/daimon_skycrawlers/crawler.rb
|
|
262
|
+
- lib/daimon_skycrawlers/crawler/base.rb
|
|
263
|
+
- lib/daimon_skycrawlers/crawler/default.rb
|
|
264
|
+
- lib/daimon_skycrawlers/filter.rb
|
|
265
|
+
- lib/daimon_skycrawlers/filter/base.rb
|
|
266
|
+
- lib/daimon_skycrawlers/filter/duplicate_checker.rb
|
|
267
|
+
- lib/daimon_skycrawlers/filter/update_checker.rb
|
|
244
268
|
- lib/daimon_skycrawlers/generator/new.rb
|
|
245
269
|
- lib/daimon_skycrawlers/generator/templates/new/Gemfile
|
|
246
270
|
- lib/daimon_skycrawlers/generator/templates/new/README.md.erb
|
|
@@ -249,21 +273,32 @@ files:
|
|
|
249
273
|
- lib/daimon_skycrawlers/generator/templates/new/crawler.rb
|
|
250
274
|
- lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb
|
|
251
275
|
- lib/daimon_skycrawlers/generator/templates/new/enqueue.rb
|
|
276
|
+
- lib/daimon_skycrawlers/generator/templates/new/init.rb
|
|
252
277
|
- lib/daimon_skycrawlers/generator/templates/new/processor.rb
|
|
253
|
-
- lib/daimon_skycrawlers/http_response_consumer.rb
|
|
254
|
-
- lib/daimon_skycrawlers/parser.rb
|
|
255
|
-
- lib/daimon_skycrawlers/parser/base.rb
|
|
256
|
-
- lib/daimon_skycrawlers/parser/default.rb
|
|
278
|
+
- lib/daimon_skycrawlers/logger.rb
|
|
257
279
|
- lib/daimon_skycrawlers/processor.rb
|
|
280
|
+
- lib/daimon_skycrawlers/processor/base.rb
|
|
258
281
|
- lib/daimon_skycrawlers/processor/default.rb
|
|
282
|
+
- lib/daimon_skycrawlers/processor/spider.rb
|
|
283
|
+
- lib/daimon_skycrawlers/queue.rb
|
|
259
284
|
- lib/daimon_skycrawlers/storage.rb
|
|
260
285
|
- lib/daimon_skycrawlers/storage/base.rb
|
|
261
286
|
- lib/daimon_skycrawlers/storage/null.rb
|
|
262
287
|
- lib/daimon_skycrawlers/storage/rdb.rb
|
|
263
288
|
- lib/daimon_skycrawlers/tasks.rb
|
|
264
289
|
- lib/daimon_skycrawlers/tasks/database_tasks.rake
|
|
265
|
-
- lib/daimon_skycrawlers/url_consumer.rb
|
|
290
|
+
- lib/daimon_skycrawlers/timer.rb
|
|
266
291
|
- lib/daimon_skycrawlers/version.rb
|
|
292
|
+
- sample/spider/Gemfile
|
|
293
|
+
- sample/spider/README.md
|
|
294
|
+
- sample/spider/Rakefile
|
|
295
|
+
- sample/spider/config/database.yml
|
|
296
|
+
- sample/spider/crawler.rb
|
|
297
|
+
- sample/spider/db/migrate/20160830155803_create_pages.rb
|
|
298
|
+
- sample/spider/db/schema.rb
|
|
299
|
+
- sample/spider/enqueue.rb
|
|
300
|
+
- sample/spider/init.rb
|
|
301
|
+
- sample/spider/processor.rb
|
|
267
302
|
homepage: https://github.com/bm-sms/daimon-skycrawlers
|
|
268
303
|
licenses:
|
|
269
304
|
- MIT
|
|
@@ -284,7 +319,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
284
319
|
version: '0'
|
|
285
320
|
requirements: []
|
|
286
321
|
rubyforge_project:
|
|
287
|
-
rubygems_version: 2.
|
|
322
|
+
rubygems_version: 2.6.4
|
|
288
323
|
signing_key:
|
|
289
324
|
specification_version: 4
|
|
290
325
|
summary: This is a crawler framework.
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
require 'songkick_queue'
|
|
2
|
-
# TODO Allow to configure from user land
|
|
3
|
-
SongkickQueue.configure do |config|
|
|
4
|
-
config.logger = Logger.new(STDOUT)
|
|
5
|
-
config.host = '127.0.0.1'
|
|
6
|
-
config.port = 5672
|
|
7
|
-
# config.username = 'guest'
|
|
8
|
-
# config.password = 'guest'
|
|
9
|
-
config.vhost = '/'
|
|
10
|
-
config.max_reconnect_attempts = 10
|
|
11
|
-
config.network_recovery_interval = 1.0
|
|
12
|
-
end
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
require "daimon_skycrawlers/processor/default"
|
|
2
|
-
|
|
3
|
-
module DaimonSkycrawlers
|
|
4
|
-
class HTTPResponseConsumer
|
|
5
|
-
include SongkickQueue::Consumer
|
|
6
|
-
|
|
7
|
-
consume_from_queue 'daimon-skycrawler.http-response'
|
|
8
|
-
|
|
9
|
-
class << self
|
|
10
|
-
def register(processor = nil, &block)
|
|
11
|
-
if block_given?
|
|
12
|
-
processors << block
|
|
13
|
-
else
|
|
14
|
-
processors << processor
|
|
15
|
-
end
|
|
16
|
-
end
|
|
17
|
-
|
|
18
|
-
def processors
|
|
19
|
-
@processors ||= []
|
|
20
|
-
end
|
|
21
|
-
|
|
22
|
-
def default_processor
|
|
23
|
-
DaimonSkycrawlers::Processor::Default.new
|
|
24
|
-
end
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
def process(message)
|
|
28
|
-
if self.class.processors.empty?
|
|
29
|
-
processors = [self.class.default_processor]
|
|
30
|
-
else
|
|
31
|
-
processors = self.class.processors
|
|
32
|
-
end
|
|
33
|
-
processors.each do |processor|
|
|
34
|
-
processor.call(message)
|
|
35
|
-
end
|
|
36
|
-
end
|
|
37
|
-
end
|
|
38
|
-
end
|
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
require "nokogiri"
|
|
2
|
-
|
|
3
|
-
module DaimonSkycrawlers
|
|
4
|
-
module Parser
|
|
5
|
-
class Default < Base
|
|
6
|
-
def initialize
|
|
7
|
-
@filters = []
|
|
8
|
-
end
|
|
9
|
-
|
|
10
|
-
def append_filter(filter = nil, &block)
|
|
11
|
-
if block_given?
|
|
12
|
-
@filters << block
|
|
13
|
-
else
|
|
14
|
-
@filters << filter
|
|
15
|
-
end
|
|
16
|
-
end
|
|
17
|
-
|
|
18
|
-
def parse(html)
|
|
19
|
-
@html = html
|
|
20
|
-
@doc = Nokogiri::HTML(html)
|
|
21
|
-
end
|
|
22
|
-
|
|
23
|
-
def links
|
|
24
|
-
return @links if @links
|
|
25
|
-
@links = retrieve_links
|
|
26
|
-
@links
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
private
|
|
30
|
-
|
|
31
|
-
def retrieve_links
|
|
32
|
-
urls = @doc.search("a").map do |element|
|
|
33
|
-
element["href"]
|
|
34
|
-
end
|
|
35
|
-
apply_filters(urls) || []
|
|
36
|
-
end
|
|
37
|
-
|
|
38
|
-
def apply_filters(urls)
|
|
39
|
-
return if urls.nil?
|
|
40
|
-
return if urls.empty?
|
|
41
|
-
@filters.each do |filter|
|
|
42
|
-
urls = urls.select do |url|
|
|
43
|
-
filter.call(url)
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
urls
|
|
47
|
-
end
|
|
48
|
-
end
|
|
49
|
-
end
|
|
50
|
-
end
|
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
require 'daimon_skycrawlers/crawler'
|
|
2
|
-
require 'daimon_skycrawlers/processor'
|
|
3
|
-
|
|
4
|
-
module DaimonSkycrawlers
|
|
5
|
-
class URLConsumer
|
|
6
|
-
include SongkickQueue::Consumer
|
|
7
|
-
|
|
8
|
-
consume_from_queue 'daimon-skycrawler.url'
|
|
9
|
-
|
|
10
|
-
class << self
|
|
11
|
-
def register(crawler)
|
|
12
|
-
crawlers << crawler
|
|
13
|
-
end
|
|
14
|
-
|
|
15
|
-
def crawlers
|
|
16
|
-
@crawlers ||= []
|
|
17
|
-
end
|
|
18
|
-
end
|
|
19
|
-
|
|
20
|
-
def process(message)
|
|
21
|
-
url = message[:url]
|
|
22
|
-
depth = message[:depth]
|
|
23
|
-
interval = message[:interval]
|
|
24
|
-
|
|
25
|
-
# XXX When several crawlers are registered, how should they behave?
|
|
26
|
-
self.class.crawlers.each do |crawler|
|
|
27
|
-
sleep(interval)
|
|
28
|
-
crawler.fetch(url, depth)
|
|
29
|
-
end
|
|
30
|
-
end
|
|
31
|
-
end
|
|
32
|
-
end
|