daimon_skycrawlers 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +2 -2
  3. data/Gemfile +1 -1
  4. data/README.md +30 -12
  5. data/Rakefile +3 -8
  6. data/bin/daimon-skycrawlers +3 -3
  7. data/daimon_skycrawlers.gemspec +4 -3
  8. data/lib/daimon_skycrawlers/cli.rb +3 -3
  9. data/lib/daimon_skycrawlers/config.rb +8 -0
  10. data/lib/daimon_skycrawlers/consumer/base.rb +16 -0
  11. data/lib/daimon_skycrawlers/consumer/http_response.rb +47 -0
  12. data/lib/daimon_skycrawlers/consumer/url.rb +44 -0
  13. data/lib/daimon_skycrawlers/consumer.rb +4 -0
  14. data/lib/daimon_skycrawlers/crawler/base.rb +75 -0
  15. data/lib/daimon_skycrawlers/crawler/default.rb +33 -0
  16. data/lib/daimon_skycrawlers/crawler.rb +18 -76
  17. data/lib/daimon_skycrawlers/filter/base.rb +24 -0
  18. data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +23 -0
  19. data/lib/daimon_skycrawlers/filter/update_checker.rb +31 -0
  20. data/lib/daimon_skycrawlers/filter.rb +4 -0
  21. data/lib/daimon_skycrawlers/generator/new.rb +3 -2
  22. data/lib/daimon_skycrawlers/generator/templates/new/Gemfile +3 -3
  23. data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +22 -6
  24. data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb +5 -5
  25. data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +6 -6
  26. data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb +17 -8
  27. data/lib/daimon_skycrawlers/generator/templates/new/init.rb +20 -0
  28. data/lib/daimon_skycrawlers/generator/templates/new/processor.rb +2 -0
  29. data/lib/daimon_skycrawlers/logger.rb +32 -0
  30. data/lib/daimon_skycrawlers/processor/base.rb +19 -0
  31. data/lib/daimon_skycrawlers/processor/default.rb +12 -9
  32. data/lib/daimon_skycrawlers/processor/spider.rb +77 -0
  33. data/lib/daimon_skycrawlers/processor.rb +23 -9
  34. data/lib/daimon_skycrawlers/queue.rb +24 -0
  35. data/lib/daimon_skycrawlers/storage/base.rb +6 -0
  36. data/lib/daimon_skycrawlers/timer.rb +24 -0
  37. data/lib/daimon_skycrawlers/version.rb +1 -1
  38. data/lib/daimon_skycrawlers.rb +24 -4
  39. data/sample/spider/Gemfile +4 -0
  40. data/sample/spider/README.md +50 -0
  41. data/sample/spider/Rakefile +1 -0
  42. data/sample/spider/config/database.yml +26 -0
  43. data/sample/spider/crawler.rb +14 -0
  44. data/sample/spider/db/migrate/20160830155803_create_pages.rb +13 -0
  45. data/sample/spider/db/schema.rb +28 -0
  46. data/sample/spider/enqueue.rb +24 -0
  47. data/sample/spider/init.rb +22 -0
  48. data/sample/spider/processor.rb +34 -0
  49. metadata +47 -12
  50. data/lib/daimon_skycrawlers/configure_songkick_queue.rb +0 -12
  51. data/lib/daimon_skycrawlers/http_response_consumer.rb +0 -38
  52. data/lib/daimon_skycrawlers/parser/base.rb +0 -13
  53. data/lib/daimon_skycrawlers/parser/default.rb +0 -50
  54. data/lib/daimon_skycrawlers/parser.rb +0 -7
  55. data/lib/daimon_skycrawlers/url_consumer.rb +0 -32
data/sample/spider/processor.rb ADDED
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "daimon_skycrawlers/processor"
4
+ require "daimon_skycrawlers/processor/spider"
5
+ require "daimon_skycrawlers/filter"
6
+ require "daimon_skycrawlers/filter/duplicate_checker"
7
+ require "daimon_skycrawlers/filter/update_checker"
8
+
9
+ require_relative "./init"
10
+
11
+ default_processor = DaimonSkycrawlers::Processor::Default.new
12
+ spider = DaimonSkycrawlers::Processor::Spider.new
13
+ #spider.enqueue = false
14
+ spider.append_filter do |url|
15
+ uri = URI(url)
16
+ uri.host.nil? || uri.host == "www.clear-code.com"
17
+ end
18
+ spider.append_filter do |url|
19
+ case url
20
+ when %r!\A(\.\./|/|#)!
21
+ false
22
+ else
23
+ true
24
+ end
25
+ end
26
+ duplicate_checker = DaimonSkycrawlers::Filter::DuplicateChecker.new(base_url: "http://www.clear-code.com/blog/")
27
+ spider.append_filter(duplicate_checker)
28
+ update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(base_url: "http://www.clear-code.com/blog/")
29
+ spider.append_filter(update_checker)
30
+
31
+ DaimonSkycrawlers.register_processor(default_processor)
32
+ DaimonSkycrawlers.register_processor(spider)
33
+
34
+ DaimonSkycrawlers::Processor.run
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: daimon_skycrawlers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryunosuke SATO
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-27 00:00:00.000000000 Z
11
+ date: 2016-09-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -108,6 +108,20 @@ dependencies:
108
108
  - - ">="
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: timers
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: bundler
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -179,7 +193,7 @@ dependencies:
179
193
  - !ruby/object:Gem::Version
180
194
  version: '0'
181
195
  - !ruby/object:Gem::Dependency
182
- name: cucumber
196
+ name: pry
183
197
  requirement: !ruby/object:Gem::Requirement
184
198
  requirements:
185
199
  - - ">="
@@ -193,7 +207,7 @@ dependencies:
193
207
  - !ruby/object:Gem::Version
194
208
  version: '0'
195
209
  - !ruby/object:Gem::Dependency
196
- name: pry
210
+ name: tapp
197
211
  requirement: !ruby/object:Gem::Requirement
198
212
  requirements:
199
213
  - - ">="
@@ -207,7 +221,7 @@ dependencies:
207
221
  - !ruby/object:Gem::Version
208
222
  version: '0'
209
223
  - !ruby/object:Gem::Dependency
210
- name: tapp
224
+ name: sqlite3
211
225
  requirement: !ruby/object:Gem::Requirement
212
226
  requirements:
213
227
  - - ">="
@@ -239,8 +253,18 @@ files:
239
253
  - db/schema.rb
240
254
  - lib/daimon_skycrawlers.rb
241
255
  - lib/daimon_skycrawlers/cli.rb
242
- - lib/daimon_skycrawlers/configure_songkick_queue.rb
256
+ - lib/daimon_skycrawlers/config.rb
257
+ - lib/daimon_skycrawlers/consumer.rb
258
+ - lib/daimon_skycrawlers/consumer/base.rb
259
+ - lib/daimon_skycrawlers/consumer/http_response.rb
260
+ - lib/daimon_skycrawlers/consumer/url.rb
243
261
  - lib/daimon_skycrawlers/crawler.rb
262
+ - lib/daimon_skycrawlers/crawler/base.rb
263
+ - lib/daimon_skycrawlers/crawler/default.rb
264
+ - lib/daimon_skycrawlers/filter.rb
265
+ - lib/daimon_skycrawlers/filter/base.rb
266
+ - lib/daimon_skycrawlers/filter/duplicate_checker.rb
267
+ - lib/daimon_skycrawlers/filter/update_checker.rb
244
268
  - lib/daimon_skycrawlers/generator/new.rb
245
269
  - lib/daimon_skycrawlers/generator/templates/new/Gemfile
246
270
  - lib/daimon_skycrawlers/generator/templates/new/README.md.erb
@@ -249,21 +273,32 @@ files:
249
273
  - lib/daimon_skycrawlers/generator/templates/new/crawler.rb
250
274
  - lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb
251
275
  - lib/daimon_skycrawlers/generator/templates/new/enqueue.rb
276
+ - lib/daimon_skycrawlers/generator/templates/new/init.rb
252
277
  - lib/daimon_skycrawlers/generator/templates/new/processor.rb
253
- - lib/daimon_skycrawlers/http_response_consumer.rb
254
- - lib/daimon_skycrawlers/parser.rb
255
- - lib/daimon_skycrawlers/parser/base.rb
256
- - lib/daimon_skycrawlers/parser/default.rb
278
+ - lib/daimon_skycrawlers/logger.rb
257
279
  - lib/daimon_skycrawlers/processor.rb
280
+ - lib/daimon_skycrawlers/processor/base.rb
258
281
  - lib/daimon_skycrawlers/processor/default.rb
282
+ - lib/daimon_skycrawlers/processor/spider.rb
283
+ - lib/daimon_skycrawlers/queue.rb
259
284
  - lib/daimon_skycrawlers/storage.rb
260
285
  - lib/daimon_skycrawlers/storage/base.rb
261
286
  - lib/daimon_skycrawlers/storage/null.rb
262
287
  - lib/daimon_skycrawlers/storage/rdb.rb
263
288
  - lib/daimon_skycrawlers/tasks.rb
264
289
  - lib/daimon_skycrawlers/tasks/database_tasks.rake
265
- - lib/daimon_skycrawlers/url_consumer.rb
290
+ - lib/daimon_skycrawlers/timer.rb
266
291
  - lib/daimon_skycrawlers/version.rb
292
+ - sample/spider/Gemfile
293
+ - sample/spider/README.md
294
+ - sample/spider/Rakefile
295
+ - sample/spider/config/database.yml
296
+ - sample/spider/crawler.rb
297
+ - sample/spider/db/migrate/20160830155803_create_pages.rb
298
+ - sample/spider/db/schema.rb
299
+ - sample/spider/enqueue.rb
300
+ - sample/spider/init.rb
301
+ - sample/spider/processor.rb
267
302
  homepage: https://github.com/bm-sms/daimon-skycrawlers
268
303
  licenses:
269
304
  - MIT
@@ -284,7 +319,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
284
319
  version: '0'
285
320
  requirements: []
286
321
  rubyforge_project:
287
- rubygems_version: 2.5.1
322
+ rubygems_version: 2.6.4
288
323
  signing_key:
289
324
  specification_version: 4
290
325
  summary: This is a crawler framework.
data/lib/daimon_skycrawlers/configure_songkick_queue.rb DELETED
@@ -1,12 +0,0 @@
1
- require 'songkick_queue'
2
- # TODO Allow to configure from user land
3
- SongkickQueue.configure do |config|
4
- config.logger = Logger.new(STDOUT)
5
- config.host = '127.0.0.1'
6
- config.port = 5672
7
- # config.username = 'guest'
8
- # config.password = 'guest'
9
- config.vhost = '/'
10
- config.max_reconnect_attempts = 10
11
- config.network_recovery_interval = 1.0
12
- end
data/lib/daimon_skycrawlers/http_response_consumer.rb DELETED
@@ -1,38 +0,0 @@
1
- require "daimon_skycrawlers/processor/default"
2
-
3
- module DaimonSkycrawlers
4
- class HTTPResponseConsumer
5
- include SongkickQueue::Consumer
6
-
7
- consume_from_queue 'daimon-skycrawler.http-response'
8
-
9
- class << self
10
- def register(processor = nil, &block)
11
- if block_given?
12
- processors << block
13
- else
14
- processors << processor
15
- end
16
- end
17
-
18
- def processors
19
- @processors ||= []
20
- end
21
-
22
- def default_processor
23
- DaimonSkycrawlers::Processor::Default.new
24
- end
25
- end
26
-
27
- def process(message)
28
- if self.class.processors.empty?
29
- processors = [self.class.default_processor]
30
- else
31
- processors = self.class.processors
32
- end
33
- processors.each do |processor|
34
- processor.call(message)
35
- end
36
- end
37
- end
38
- end
data/lib/daimon_skycrawlers/parser/base.rb DELETED
@@ -1,13 +0,0 @@
1
- module DaimonSkycrawlers
2
- module Parser
3
- class Base
4
- def initialize(html)
5
- @html = html
6
- end
7
-
8
- def parse
9
- raise "Implement this method in subclass"
10
- end
11
- end
12
- end
13
- end
data/lib/daimon_skycrawlers/parser/default.rb DELETED
@@ -1,50 +0,0 @@
1
- require "nokogiri"
2
-
3
- module DaimonSkycrawlers
4
- module Parser
5
- class Default < Base
6
- def initialize
7
- @filters = []
8
- end
9
-
10
- def append_filter(filter = nil, &block)
11
- if block_given?
12
- @filters << block
13
- else
14
- @filters << filter
15
- end
16
- end
17
-
18
- def parse(html)
19
- @html = html
20
- @doc = Nokogiri::HTML(html)
21
- end
22
-
23
- def links
24
- return @links if @links
25
- @links = retrieve_links
26
- @links
27
- end
28
-
29
- private
30
-
31
- def retrieve_links
32
- urls = @doc.search("a").map do |element|
33
- element["href"]
34
- end
35
- apply_filters(urls) || []
36
- end
37
-
38
- def apply_filters(urls)
39
- return if urls.nil?
40
- return if urls.empty?
41
- @filters.each do |filter|
42
- urls = urls.select do |url|
43
- filter.call(url)
44
- end
45
- end
46
- urls
47
- end
48
- end
49
- end
50
- end
data/lib/daimon_skycrawlers/parser.rb DELETED
@@ -1,7 +0,0 @@
1
- module DaimonSkycrawlers
2
- module Parser
3
- end
4
- end
5
-
6
- require "daimon_skycrawlers/parser/base"
7
- require "daimon_skycrawlers/parser/default"
data/lib/daimon_skycrawlers/url_consumer.rb DELETED
@@ -1,32 +0,0 @@
1
- require 'daimon_skycrawlers/crawler'
2
- require 'daimon_skycrawlers/processor'
3
-
4
- module DaimonSkycrawlers
5
- class URLConsumer
6
- include SongkickQueue::Consumer
7
-
8
- consume_from_queue 'daimon-skycrawler.url'
9
-
10
- class << self
11
- def register(crawler)
12
- crawlers << crawler
13
- end
14
-
15
- def crawlers
16
- @crawlers ||= []
17
- end
18
- end
19
-
20
- def process(message)
21
- url = message[:url]
22
- depth = message[:depth]
23
- interval = message[:interval]
24
-
25
- # XXX When several crawlers are registered, how should they behave?
26
- self.class.crawlers.each do |crawler|
27
- sleep(interval)
28
- crawler.fetch(url, depth)
29
- end
30
- end
31
- end
32
- end