daimon_skycrawlers 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +2 -2
  3. data/Gemfile +1 -1
  4. data/README.md +30 -12
  5. data/Rakefile +3 -8
  6. data/bin/daimon-skycrawlers +3 -3
  7. data/daimon_skycrawlers.gemspec +4 -3
  8. data/lib/daimon_skycrawlers/cli.rb +3 -3
  9. data/lib/daimon_skycrawlers/config.rb +8 -0
  10. data/lib/daimon_skycrawlers/consumer/base.rb +16 -0
  11. data/lib/daimon_skycrawlers/consumer/http_response.rb +47 -0
  12. data/lib/daimon_skycrawlers/consumer/url.rb +44 -0
  13. data/lib/daimon_skycrawlers/consumer.rb +4 -0
  14. data/lib/daimon_skycrawlers/crawler/base.rb +75 -0
  15. data/lib/daimon_skycrawlers/crawler/default.rb +33 -0
  16. data/lib/daimon_skycrawlers/crawler.rb +18 -76
  17. data/lib/daimon_skycrawlers/filter/base.rb +24 -0
  18. data/lib/daimon_skycrawlers/filter/duplicate_checker.rb +23 -0
  19. data/lib/daimon_skycrawlers/filter/update_checker.rb +31 -0
  20. data/lib/daimon_skycrawlers/filter.rb +4 -0
  21. data/lib/daimon_skycrawlers/generator/new.rb +3 -2
  22. data/lib/daimon_skycrawlers/generator/templates/new/Gemfile +3 -3
  23. data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +22 -6
  24. data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb +5 -5
  25. data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +6 -6
  26. data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb +17 -8
  27. data/lib/daimon_skycrawlers/generator/templates/new/init.rb +20 -0
  28. data/lib/daimon_skycrawlers/generator/templates/new/processor.rb +2 -0
  29. data/lib/daimon_skycrawlers/logger.rb +32 -0
  30. data/lib/daimon_skycrawlers/processor/base.rb +19 -0
  31. data/lib/daimon_skycrawlers/processor/default.rb +12 -9
  32. data/lib/daimon_skycrawlers/processor/spider.rb +77 -0
  33. data/lib/daimon_skycrawlers/processor.rb +23 -9
  34. data/lib/daimon_skycrawlers/queue.rb +24 -0
  35. data/lib/daimon_skycrawlers/storage/base.rb +6 -0
  36. data/lib/daimon_skycrawlers/timer.rb +24 -0
  37. data/lib/daimon_skycrawlers/version.rb +1 -1
  38. data/lib/daimon_skycrawlers.rb +24 -4
  39. data/sample/spider/Gemfile +4 -0
  40. data/sample/spider/README.md +50 -0
  41. data/sample/spider/Rakefile +1 -0
  42. data/sample/spider/config/database.yml +26 -0
  43. data/sample/spider/crawler.rb +14 -0
  44. data/sample/spider/db/migrate/20160830155803_create_pages.rb +13 -0
  45. data/sample/spider/db/schema.rb +28 -0
  46. data/sample/spider/enqueue.rb +24 -0
  47. data/sample/spider/init.rb +22 -0
  48. data/sample/spider/processor.rb +34 -0
  49. metadata +47 -12
  50. data/lib/daimon_skycrawlers/configure_songkick_queue.rb +0 -12
  51. data/lib/daimon_skycrawlers/http_response_consumer.rb +0 -38
  52. data/lib/daimon_skycrawlers/parser/base.rb +0 -13
  53. data/lib/daimon_skycrawlers/parser/default.rb +0 -50
  54. data/lib/daimon_skycrawlers/parser.rb +0 -7
  55. data/lib/daimon_skycrawlers/url_consumer.rb +0 -32
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "daimon_skycrawlers/processor"
4
+ require "daimon_skycrawlers/processor/spider"
5
+ require "daimon_skycrawlers/filter"
6
+ require "daimon_skycrawlers/filter/duplicate_checker"
7
+ require "daimon_skycrawlers/filter/update_checker"
8
+
9
+ require_relative "./init"
10
+
11
+ default_processor = DaimonSkycrawlers::Processor::Default.new
12
+ spider = DaimonSkycrawlers::Processor::Spider.new
13
+ #spider.enqueue = false
14
+ spider.append_filter do |url|
15
+ uri = URI(url)
16
+ uri.host.nil? || uri.host == "www.clear-code.com"
17
+ end
18
+ spider.append_filter do |url|
19
+ case url
20
+ when %r!\A(\.\./|/|#)!
21
+ false
22
+ else
23
+ true
24
+ end
25
+ end
26
+ duplicate_checker = DaimonSkycrawlers::Filter::DuplicateChecker.new(base_url: "http://www.clear-code.com/blog/")
27
+ spider.append_filter(duplicate_checker)
28
+ update_checker = DaimonSkycrawlers::Filter::UpdateChecker.new(base_url: "http://www.clear-code.com/blog/")
29
+ spider.append_filter(update_checker)
30
+
31
+ DaimonSkycrawlers.register_processor(default_processor)
32
+ DaimonSkycrawlers.register_processor(spider)
33
+
34
+ DaimonSkycrawlers::Processor.run
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: daimon_skycrawlers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryunosuke SATO
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-27 00:00:00.000000000 Z
11
+ date: 2016-09-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: thor
@@ -108,6 +108,20 @@ dependencies:
108
108
  - - ">="
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: timers
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: bundler
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -179,7 +193,7 @@ dependencies:
179
193
  - !ruby/object:Gem::Version
180
194
  version: '0'
181
195
  - !ruby/object:Gem::Dependency
182
- name: cucumber
196
+ name: pry
183
197
  requirement: !ruby/object:Gem::Requirement
184
198
  requirements:
185
199
  - - ">="
@@ -193,7 +207,7 @@ dependencies:
193
207
  - !ruby/object:Gem::Version
194
208
  version: '0'
195
209
  - !ruby/object:Gem::Dependency
196
- name: pry
210
+ name: tapp
197
211
  requirement: !ruby/object:Gem::Requirement
198
212
  requirements:
199
213
  - - ">="
@@ -207,7 +221,7 @@ dependencies:
207
221
  - !ruby/object:Gem::Version
208
222
  version: '0'
209
223
  - !ruby/object:Gem::Dependency
210
- name: tapp
224
+ name: sqlite3
211
225
  requirement: !ruby/object:Gem::Requirement
212
226
  requirements:
213
227
  - - ">="
@@ -239,8 +253,18 @@ files:
239
253
  - db/schema.rb
240
254
  - lib/daimon_skycrawlers.rb
241
255
  - lib/daimon_skycrawlers/cli.rb
242
- - lib/daimon_skycrawlers/configure_songkick_queue.rb
256
+ - lib/daimon_skycrawlers/config.rb
257
+ - lib/daimon_skycrawlers/consumer.rb
258
+ - lib/daimon_skycrawlers/consumer/base.rb
259
+ - lib/daimon_skycrawlers/consumer/http_response.rb
260
+ - lib/daimon_skycrawlers/consumer/url.rb
243
261
  - lib/daimon_skycrawlers/crawler.rb
262
+ - lib/daimon_skycrawlers/crawler/base.rb
263
+ - lib/daimon_skycrawlers/crawler/default.rb
264
+ - lib/daimon_skycrawlers/filter.rb
265
+ - lib/daimon_skycrawlers/filter/base.rb
266
+ - lib/daimon_skycrawlers/filter/duplicate_checker.rb
267
+ - lib/daimon_skycrawlers/filter/update_checker.rb
244
268
  - lib/daimon_skycrawlers/generator/new.rb
245
269
  - lib/daimon_skycrawlers/generator/templates/new/Gemfile
246
270
  - lib/daimon_skycrawlers/generator/templates/new/README.md.erb
@@ -249,21 +273,32 @@ files:
249
273
  - lib/daimon_skycrawlers/generator/templates/new/crawler.rb
250
274
  - lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb
251
275
  - lib/daimon_skycrawlers/generator/templates/new/enqueue.rb
276
+ - lib/daimon_skycrawlers/generator/templates/new/init.rb
252
277
  - lib/daimon_skycrawlers/generator/templates/new/processor.rb
253
- - lib/daimon_skycrawlers/http_response_consumer.rb
254
- - lib/daimon_skycrawlers/parser.rb
255
- - lib/daimon_skycrawlers/parser/base.rb
256
- - lib/daimon_skycrawlers/parser/default.rb
278
+ - lib/daimon_skycrawlers/logger.rb
257
279
  - lib/daimon_skycrawlers/processor.rb
280
+ - lib/daimon_skycrawlers/processor/base.rb
258
281
  - lib/daimon_skycrawlers/processor/default.rb
282
+ - lib/daimon_skycrawlers/processor/spider.rb
283
+ - lib/daimon_skycrawlers/queue.rb
259
284
  - lib/daimon_skycrawlers/storage.rb
260
285
  - lib/daimon_skycrawlers/storage/base.rb
261
286
  - lib/daimon_skycrawlers/storage/null.rb
262
287
  - lib/daimon_skycrawlers/storage/rdb.rb
263
288
  - lib/daimon_skycrawlers/tasks.rb
264
289
  - lib/daimon_skycrawlers/tasks/database_tasks.rake
265
- - lib/daimon_skycrawlers/url_consumer.rb
290
+ - lib/daimon_skycrawlers/timer.rb
266
291
  - lib/daimon_skycrawlers/version.rb
292
+ - sample/spider/Gemfile
293
+ - sample/spider/README.md
294
+ - sample/spider/Rakefile
295
+ - sample/spider/config/database.yml
296
+ - sample/spider/crawler.rb
297
+ - sample/spider/db/migrate/20160830155803_create_pages.rb
298
+ - sample/spider/db/schema.rb
299
+ - sample/spider/enqueue.rb
300
+ - sample/spider/init.rb
301
+ - sample/spider/processor.rb
267
302
  homepage: https://github.com/bm-sms/daimon-skycrawlers
268
303
  licenses:
269
304
  - MIT
@@ -284,7 +319,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
284
319
  version: '0'
285
320
  requirements: []
286
321
  rubyforge_project:
287
- rubygems_version: 2.5.1
322
+ rubygems_version: 2.6.4
288
323
  signing_key:
289
324
  specification_version: 4
290
325
  summary: This is a crawler framework.
@@ -1,12 +0,0 @@
1
- require 'songkick_queue'
2
- # TODO Allow to configure from user land
3
- SongkickQueue.configure do |config|
4
- config.logger = Logger.new(STDOUT)
5
- config.host = '127.0.0.1'
6
- config.port = 5672
7
- # config.username = 'guest'
8
- # config.password = 'guest'
9
- config.vhost = '/'
10
- config.max_reconnect_attempts = 10
11
- config.network_recovery_interval = 1.0
12
- end
@@ -1,38 +0,0 @@
1
- require "daimon_skycrawlers/processor/default"
2
-
3
- module DaimonSkycrawlers
4
- class HTTPResponseConsumer
5
- include SongkickQueue::Consumer
6
-
7
- consume_from_queue 'daimon-skycrawler.http-response'
8
-
9
- class << self
10
- def register(processor = nil, &block)
11
- if block_given?
12
- processors << block
13
- else
14
- processors << processor
15
- end
16
- end
17
-
18
- def processors
19
- @processors ||= []
20
- end
21
-
22
- def default_processor
23
- DaimonSkycrawlers::Processor::Default.new
24
- end
25
- end
26
-
27
- def process(message)
28
- if self.class.processors.empty?
29
- processors = [self.class.default_processor]
30
- else
31
- processors = self.class.processors
32
- end
33
- processors.each do |processor|
34
- processor.call(message)
35
- end
36
- end
37
- end
38
- end
@@ -1,13 +0,0 @@
1
- module DaimonSkycrawlers
2
- module Parser
3
- class Base
4
- def initialize(html)
5
- @html = html
6
- end
7
-
8
- def parse
9
- raise "Implement this method in subclass"
10
- end
11
- end
12
- end
13
- end
@@ -1,50 +0,0 @@
1
- require "nokogiri"
2
-
3
- module DaimonSkycrawlers
4
- module Parser
5
- class Default < Base
6
- def initialize
7
- @filters = []
8
- end
9
-
10
- def append_filter(filter = nil, &block)
11
- if block_given?
12
- @filters << block
13
- else
14
- @filters << filter
15
- end
16
- end
17
-
18
- def parse(html)
19
- @html = html
20
- @doc = Nokogiri::HTML(html)
21
- end
22
-
23
- def links
24
- return @links if @links
25
- @links = retrieve_links
26
- @links
27
- end
28
-
29
- private
30
-
31
- def retrieve_links
32
- urls = @doc.search("a").map do |element|
33
- element["href"]
34
- end
35
- apply_filters(urls) || []
36
- end
37
-
38
- def apply_filters(urls)
39
- return if urls.nil?
40
- return if urls.empty?
41
- @filters.each do |filter|
42
- urls = urls.select do |url|
43
- filter.call(url)
44
- end
45
- end
46
- urls
47
- end
48
- end
49
- end
50
- end
@@ -1,7 +0,0 @@
1
- module DaimonSkycrawlers
2
- module Parser
3
- end
4
- end
5
-
6
- require "daimon_skycrawlers/parser/base"
7
- require "daimon_skycrawlers/parser/default"
@@ -1,32 +0,0 @@
1
- require 'daimon_skycrawlers/crawler'
2
- require 'daimon_skycrawlers/processor'
3
-
4
- module DaimonSkycrawlers
5
- class URLConsumer
6
- include SongkickQueue::Consumer
7
-
8
- consume_from_queue 'daimon-skycrawler.url'
9
-
10
- class << self
11
- def register(crawler)
12
- crawlers << crawler
13
- end
14
-
15
- def crawlers
16
- @crawlers ||= []
17
- end
18
- end
19
-
20
- def process(message)
21
- url = message[:url]
22
- depth = message[:depth]
23
- interval = message[:interval]
24
-
25
- # XXX When several crawlers are registered, how should they behave?
26
- self.class.crawlers.each do |crawler|
27
- sleep(interval)
28
- crawler.fetch(url, depth)
29
- end
30
- end
31
- end
32
- end