daimon_skycrawlers 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.travis.yml +11 -0
  4. data/Gemfile +3 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +73 -0
  7. data/Rakefile +17 -0
  8. data/bin/daimon-skycrawlers +9 -0
  9. data/daimon_skycrawlers.gemspec +37 -0
  10. data/db/schema.rb +15 -0
  11. data/lib/daimon_skycrawlers/cli.rb +14 -0
  12. data/lib/daimon_skycrawlers/configure_songkick_queue.rb +12 -0
  13. data/lib/daimon_skycrawlers/crawler.rb +90 -0
  14. data/lib/daimon_skycrawlers/generator/new.rb +42 -0
  15. data/lib/daimon_skycrawlers/generator/templates/new/Gemfile +4 -0
  16. data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +34 -0
  17. data/lib/daimon_skycrawlers/generator/templates/new/Rakefile +1 -0
  18. data/lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb +26 -0
  19. data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb +14 -0
  20. data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +13 -0
  21. data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb +15 -0
  22. data/lib/daimon_skycrawlers/generator/templates/new/processor.rb +9 -0
  23. data/lib/daimon_skycrawlers/http_response_consumer.rb +38 -0
  24. data/lib/daimon_skycrawlers/parser/base.rb +13 -0
  25. data/lib/daimon_skycrawlers/parser/default.rb +50 -0
  26. data/lib/daimon_skycrawlers/parser.rb +7 -0
  27. data/lib/daimon_skycrawlers/processor/default.rb +20 -0
  28. data/lib/daimon_skycrawlers/processor.rb +18 -0
  29. data/lib/daimon_skycrawlers/storage/base.rb +13 -0
  30. data/lib/daimon_skycrawlers/storage/null.rb +11 -0
  31. data/lib/daimon_skycrawlers/storage/rdb.rb +30 -0
  32. data/lib/daimon_skycrawlers/storage.rb +8 -0
  33. data/lib/daimon_skycrawlers/tasks/database_tasks.rake +53 -0
  34. data/lib/daimon_skycrawlers/tasks.rb +2 -0
  35. data/lib/daimon_skycrawlers/url_consumer.rb +32 -0
  36. data/lib/daimon_skycrawlers/version.rb +3 -0
  37. data/lib/daimon_skycrawlers.rb +15 -0
  38. metadata +291 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e9ebc710f18b92107a91a3bb5a4c8972051ede8f
4
+ data.tar.gz: 61db84bcdc73557ae9b28cc7894c04a1055dda6d
5
+ SHA512:
6
+ metadata.gz: 76d7b629eefc04a89d5cdce8939a9e83f35c68a9b8d1be3599e3c7d59b815f506eb1c295d1e3095f18ca7ec12169fa5e365cda981561804bbb4d2d070eef3051
7
+ data.tar.gz: 04b326f7a8531d364c3d41d12f99328831b2eda1acefa72869f588750e065abc0a044fd6d87b92f20a6359983dba8b00c42a4ab40d75979f92c6a553ec16f9e9
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.travis.yml ADDED
@@ -0,0 +1,11 @@
1
+ language: ruby
2
+ sudo: false
3
+ cache: bundler
4
+ rvm:
5
+ - 2.2.4
6
+ - 2.3.0
7
+
8
+ before_install: gem install bundler -v 1.11.2
9
+
10
+ services:
11
+ - rabbitmq
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Ryunosuke SATO
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,73 @@
1
+ # DaimonSkycrawlers
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/daimon_skycrawlers`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Requirements
8
+
9
+ - Ruby
10
+ - RabbitMQ
11
+ - RDB
12
+ - PostgreSQL (default)
13
+ - MySQL
14
+ - SQLite3
15
+
16
+ ## Installation
17
+
18
+ Add this line to your application's Gemfile:
19
+
20
+ ```ruby
21
+ gem 'daimon_skycrawlers'
22
+ ```
23
+
24
+ And then execute:
25
+
26
+ $ bundle
27
+
28
+ Or install it yourself as:
29
+
30
+ $ gem install daimon_skycrawlers
31
+
32
+ ## Usage
33
+
34
+ 1. Create project
35
+
36
+ $ bundle exec daimon-skycrawlers new mycrawlers
37
+ $ cd mycrawlers
38
+
39
+ 2. Install dependencies
40
+
41
+ $ bundle install
42
+
43
+ 3. Create database
44
+
45
+ $ bundle exec rake db:create
46
+ $ bundle exec rake db:migrate
47
+
48
+ 4. Open new terminal and run crawler/processor
49
+
50
+ $ bundle exec ruby crawler.rb # on new terminal
51
+ $ bundle exec ruby processor.rb # on new terminal
52
+
53
+ 5. Enqueue task
54
+
55
+ $ bundle exec ruby enqueue.rb http://example.com/
56
+
57
+ 6. You'll see `It works with 'http://example.com/'` in the terminal that runs your processor!
58
+
59
+ ## Development
60
+
61
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
62
+
63
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
64
+
65
+ ## Contributing
66
+
67
+ Bug reports and pull requests are welcome on GitHub at https://github.com/bm-sms/daimon-skycrawlers.
68
+
69
+
70
+ ## License
71
+
72
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
73
+
data/Rakefile ADDED
@@ -0,0 +1,17 @@
1
+ require "bundler/setup"
2
+ require "bundler/gem_tasks"
3
+ require "rake/testtask"
4
+
5
+ Rake::TestTask.new(:test) do |t|
6
+ t.libs << "test"
7
+ t.libs << "lib"
8
+ t.test_files = FileList['test/**/*_test.rb']
9
+ end
10
+
11
+ require "cucumber/rake/task"
12
+
13
+ Cucumber::Rake::Task.new(:features) do |t|
14
+ t.cucumber_opts = "features --format pretty"
15
+ end
16
+
17
+ task :default => [:test, :features]
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ if File.exist?(File.expand_path('../.git', __dir__))
4
+ $LOAD_PATH << File.expand_path('../lib', __dir__)
5
+ end
6
+
7
+ require 'daimon_skycrawlers/cli'
8
+
9
+ DaimonSkycrawlers::CLI.start
@@ -0,0 +1,37 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'daimon_skycrawlers/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "daimon_skycrawlers"
8
+ spec.version = DaimonSkycrawlers::VERSION
9
+ spec.authors = ["Ryunosuke SATO"]
10
+ spec.email = ["tricknotes.rs@gmail.com"]
11
+
12
+ spec.summary = %q{This is a crawler framework.}
13
+ spec.description = %q{This is a crawler framework.}
14
+ spec.homepage = "https://github.com/bm-sms/daimon-skycrawlers"
15
+ spec.license = "MIT"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_dependency "thor"
22
+ spec.add_dependency "songkick_queue"
23
+ spec.add_dependency "faraday"
24
+ spec.add_dependency "faraday_middleware"
25
+ spec.add_dependency "nokogiri"
26
+ spec.add_dependency "activerecord"
27
+ spec.add_dependency "pg"
28
+
29
+ spec.add_development_dependency "bundler", "~> 1.11"
30
+ spec.add_development_dependency "rake", "~> 10.0"
31
+ spec.add_development_dependency "test-unit"
32
+ spec.add_development_dependency "test-unit-rr"
33
+ spec.add_development_dependency "test-unit-notify"
34
+ spec.add_development_dependency "cucumber"
35
+ spec.add_development_dependency "pry"
36
+ spec.add_development_dependency "tapp"
37
+ end
data/db/schema.rb ADDED
@@ -0,0 +1,15 @@
1
+ require "active_record"
2
+
3
+ ActiveRecord::Base.establish_connection(adapter: "sqlite3",
4
+ database: "storage.db")
5
+
6
+ ActiveRecord::Schema.define(version: 1) do
7
+ create_table :pages do |t|
8
+ t.string :url
9
+ t.text :headers
10
+ t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
11
+ t.datetime :last_modified_at
12
+ t.string :etag
13
+ t.timestamps
14
+ end
15
+ end
@@ -0,0 +1,14 @@
1
+ require 'thor'
2
+ require 'daimon_skycrawlers/generator/new'
3
+ require 'daimon_skycrawlers/version'
4
+
5
module DaimonSkycrawlers
  # Thor-based command line interface started by the `daimon-skycrawlers`
  # executable.
  class CLI < Thor
    # `new NAME` is handled by the project generator group.
    register(Generator::New, "new", "new NAME", "Create new project")

    desc "version", "Show version"
    # Writes the gem version to standard output.
    def version
      puts(DaimonSkycrawlers::VERSION)
    end
  end
end
@@ -0,0 +1,12 @@
1
# Logger is referenced below; require it explicitly instead of relying on
# another library having loaded it first.
require 'logger'
require 'songkick_queue'

# Broker settings shared by the crawler and processor workers.
# TODO Allow to configure from user land
SongkickQueue.configure do |config|
  config.logger = Logger.new(STDOUT)
  config.host = '127.0.0.1'
  config.port = 5672
  # config.username = 'guest'
  # config.password = 'guest'
  config.vhost = '/'
  config.max_reconnect_attempts = 10
  config.network_recovery_interval = 1.0
end
@@ -0,0 +1,90 @@
1
+ require 'uri'
2
+
3
+ require 'daimon_skycrawlers'
4
+ require 'daimon_skycrawlers/version'
5
+ require 'daimon_skycrawlers/configure_songkick_queue'
6
+ require 'daimon_skycrawlers/url_consumer'
7
+ require 'daimon_skycrawlers/storage'
8
+ require 'daimon_skycrawlers/parser'
9
+
10
+ require 'faraday'
11
+
12
module DaimonSkycrawlers
  # Fetches a page over HTTP, saves it to storage, announces it on the
  # HTTP-response queue, and publishes the links found in it back onto the
  # URL queue with a decremented depth.
  class Crawler
    class << self
      # Runs a SongkickQueue worker that consumes the URL queue through
      # URLConsumer.
      #
      # @param process_name [String] name handed to the worker
      def run(process_name: 'daimon-skycrawler:url')
        SongkickQueue::Worker.new(process_name, [URLConsumer]).run
      end

      # Publishes a crawl request on the URL queue.
      #
      # @param url [String] URL to fetch
      # @param depth [Integer] remaining link-following depth
      # @param interval [Numeric] value the consumer sleeps for before fetching
      def enqueue_url(url, depth: 3, interval: 1)
        SongkickQueue.publish('daimon-skycrawler.url', url: url, depth: depth, interval: interval)
      end
    end

    attr_writer :storage
    attr_writer :parser

    # @param base_url [String] URL prefix used to build the HTTP connection
    # @param options [Hash] stored as-is; not read anywhere in this class
    def initialize(base_url, options = {})
      @base_url = base_url
      @options = options
    end

    # Builds the Faraday connection used by #get, #post and #fetch.
    #
    # The block, when given, receives the Faraday builder. Calling this
    # method without a block is allowed and uses Faraday's defaults.
    #
    # @param options [Hash] options forwarded to Faraday.new
    # @return [Object] the new connection
    def setup_connection(options = {})
      @connection =
        if block_given?
          Faraday.new(@base_url, options) { |faraday| yield faraday }
        else
          Faraday.new(@base_url, options)
        end
    end

    # @return [Object] the storage backend; defaults to Storage::RDB
    def storage
      @storage ||= Storage::RDB.new
    end

    # @return [Object] the link parser; defaults to Parser::Default
    def parser
      @parser ||= Parser::Default.new
    end

    # Fetches +path+, stores the response, schedules it for processing and
    # enqueues the links found in the body.
    #
    # TODO Support POST when we need
    # TODO `params` should be a part of `path`. such as `path == "/hoi?hi=yoyo"`.
    #
    # @param path [String] path or URL, resolved against the connection prefix
    # @param params [Hash] query parameters sent with the GET request
    # @param depth [Integer] remaining depth; links are enqueued with depth - 1
    # @param interval [Numeric] interval attached to the enqueued links
    # @yield [url, headers, body] the fetched data, before it is saved
    def fetch(path, params = {}, depth: 3, interval: 1)
      # Some callers pass the depth as the second positional argument.
      # Treat a bare Integer as the depth so it is honoured instead of being
      # sent to the HTTP client as query parameters.
      if params.is_a?(Integer)
        depth = params
        params = {}
      end
      params ||= {}

      response = get(path, params)

      url = connection.url_prefix + path

      data = [url.to_s, response.headers, response.body]

      yield(*data) if block_given?

      storage.save(*data)

      schedule_to_process(url.to_s)

      parser.parse(response.body)
      urls = parser.links

      enqueue_next_urls(urls, depth: depth - 1, interval: interval)
    end

    # Issues a GET request through the (lazily created) connection.
    def get(path, params = {})
      connection.get(path, params)
    end

    # Issues a POST request through the (lazily created) connection.
    def post(path, params = {})
      connection.post(path, params)
    end

    private

    # Connection built by #setup_connection, or a default one created on
    # first use so #get and #post work without prior setup.
    def connection
      @connection ||= Faraday.new(@base_url)
    end

    # Announces that +url+ has been stored and can be processed.
    def schedule_to_process(url)
      DaimonSkycrawlers::Processor.enqueue_http_response(url)
    end

    # Publishes every URL unless the depth budget is exhausted.
    def enqueue_next_urls(urls, depth: 3, interval: 1)
      return if depth <= 0

      urls.each do |url|
        self.class.enqueue_url(url, depth: depth, interval: interval)
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'thor'
2
+
3
module DaimonSkycrawlers
  module Generator
    # Thor group behind `daimon-skycrawlers new NAME`; it writes a project
    # skeleton into a directory named after NAME.
    class New < Thor::Group
      include Thor::Actions

      argument :name

      # Directory that holds the project templates, next to this file.
      def self.source_root
        File.join(File.dirname(__FILE__), "templates", "new")
      end

      # Renders the ERB templates into the new project.
      def create_files
        %w[README.md config/database.yml].each do |relative_path|
          template("#{relative_path}.erb", "#{name}/#{relative_path}")
        end
      end

      # Copies the static files, then the migration under a timestamped name.
      def copy_files
        %w[Gemfile Rakefile crawler.rb enqueue.rb processor.rb].each do |relative_path|
          copy_file(relative_path, "#{name}/#{relative_path}")
        end

        %w[db/migrate/create_pages.rb].each do |relative_path|
          timestamp = Time.now.strftime("%Y%m%d%H%M%S")
          migration = "#{timestamp}_#{File.basename(relative_path)}"
          copy_file(relative_path, "#{name}/db/migrate/#{migration}")
        end
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'rake'
4
+ gem 'daimon_skycrawlers'
@@ -0,0 +1,34 @@
1
+ # <%= name %>
2
+
3
+ TODO: Write description.
4
+
5
+ ## Requirements
6
+
7
+ - Ruby
8
+ - RabbitMQ
9
+ - RDB
10
+ - PostgreSQL (default)
11
+ - MySQL
12
+ - SQLite3
13
+
14
+ ## Usage
15
+
16
+ 1. Install dependencies
17
+
18
+ $ bundle install
19
+
20
+ 2. Create database
21
+
22
+ $ bundle exec rake db:create
23
+ $ bundle exec rake db:migrate
24
+
25
+ 3. Open new terminal and run crawler/processor
26
+
27
+ $ bundle exec ruby crawler.rb # on new terminal
28
+ $ bundle exec ruby processor.rb # on new terminal
29
+
30
+ 4. Enqueue task
31
+
32
+ $ bundle exec ruby enqueue.rb http://example.com/
33
+
34
+ 5. You'll see `It works with 'http://example.com/'` in the terminal that runs your processor!
@@ -0,0 +1 @@
1
+ require "daimon_skycrawlers/tasks"
@@ -0,0 +1,26 @@
1
+ # PostgreSQL. Versions 8.2 and up are supported.
2
+ #
3
+ default: &default
4
+ adapter: postgresql
5
+ encoding: unicode
6
+ pool: 5
7
+
8
+ development:
9
+ <<: *default
10
+ database: <%= name %>_development
11
+ #username: <%= name %>
12
+ #password:
13
+ #host: localhost
14
+ #port: 5432
15
+ #schema_search_path: myapp,sharedapp,public
16
+ #min_messages: notice
17
+
18
+ test:
19
+ <<: *default
20
+ database: <%= name %>_test
21
+
22
+ production:
23
+ <<: *default
24
+ database: <%= name %>_production
25
+ username: <%= name %>
26
+ password: <%%= ENV['<%= name.upcase %>_PASSWORD'] %>
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "daimon_skycrawlers/crawler"
4
+
5
+ base_url = 'http://example.com'
6
+
7
+ crawler = DaimonSkycrawlers::Crawler.new(base_url)
8
+ crawler.parser.append_filter do |url|
9
+ url.start_with?(base_url)
10
+ end
11
+
12
+ DaimonSkycrawlers.register_crawler(crawler)
13
+
14
+ DaimonSkycrawlers::Crawler.run
@@ -0,0 +1,13 @@
1
+ class CreatePages < ActiveRecord::Migration
2
+ def change
3
+ create_table :pages do |t|
4
+ t.string :url
5
+ t.text :headers
6
+ t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
7
+ t.datetime :last_modified_at
8
+ t.string :etag
9
+
10
+ t.timestamps null: false
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "daimon_skycrawlers/crawler"
4
+
5
+ USAGE = "Usage: #{$0} [URL]"
6
+
7
+ if ARGV.size < 1
8
+ $stderr.puts "#{$0}: missing URL"
9
+ $stderr.puts USAGE
10
+ exit false
11
+ end
12
+
13
+ url = ARGV[0]
14
+
15
+ DaimonSkycrawlers::Crawler.enqueue_url(url)
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "daimon_skycrawlers/processor"
4
+
5
+ DaimonSkycrawlers.register_processor do |data|
6
+ p "It works with '#{data[:url]}'"
7
+ end
8
+
9
+ DaimonSkycrawlers::Processor.run
@@ -0,0 +1,38 @@
1
+ require "daimon_skycrawlers/processor/default"
2
+
3
module DaimonSkycrawlers
  # Queue consumer that hands each HTTP-response message to the registered
  # processors, or to the default processor when none is registered.
  class HTTPResponseConsumer
    include SongkickQueue::Consumer

    consume_from_queue 'daimon-skycrawler.http-response'

    class << self
      # Registers a processor; a block takes precedence over the argument.
      def register(processor = nil, &block)
        processors << (block_given? ? block : processor)
      end

      # Processors registered so far, in registration order.
      def processors
        @processors ||= []
      end

      # Processor used when nothing has been registered.
      def default_processor
        DaimonSkycrawlers::Processor::Default.new
      end
    end

    # Calls every active processor with the message.
    def process(message)
      registered = self.class.processors
      active = registered.empty? ? [self.class.default_processor] : registered
      active.each { |processor| processor.call(message) }
    end
  end
end
@@ -0,0 +1,13 @@
1
module DaimonSkycrawlers
  module Parser
    # Abstract parser; subclasses provide #parse.
    class Base
      # Keeps the HTML source for the subclass to work on.
      def initialize(html)
        @html = html
      end

      # Always raises; concrete parsers must override this.
      def parse
        raise RuntimeError, "Implement this method in subclass"
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ require "nokogiri"
2
+
3
module DaimonSkycrawlers
  module Parser
    # Extracts the href of every <a> element from an HTML document and keeps
    # only the URLs accepted by all registered filters.
    class Default < Base
      def initialize
        @filters = []
      end

      # Adds a filter. A filter is anything that responds to #call(url) and
      # returns a truthy value for URLs to keep. A block takes precedence
      # over the positional argument.
      def append_filter(filter = nil, &block)
        if block_given?
          @filters << block
        else
          @filters << filter
        end
      end

      # Parses +html+ and makes its links available through #links.
      #
      # The memoized link list is cleared here; otherwise a parser reused
      # for several pages would keep returning the links of the first one.
      def parse(html)
        @html = html
        @links = nil
        @doc = Nokogiri::HTML(html)
      end

      # @return [Array<String>] filtered links of the last parsed document,
      #   or an empty array when nothing has been parsed yet
      def links
        @links ||= retrieve_links
      end

      private

      # Collects href values, dropping anchors without an href so filters
      # and the URL queue never receive nil.
      def retrieve_links
        return [] if @doc.nil?

        hrefs = @doc.search("a").map { |element| element["href"] }.compact
        apply_filters(hrefs) || []
      end

      # Applies every filter in registration order; returns nil for an
      # empty or missing list.
      def apply_filters(urls)
        return if urls.nil? || urls.empty?

        @filters.inject(urls) do |remaining, filter|
          remaining.select { |url| filter.call(url) }
        end
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
+ module DaimonSkycrawlers
2
+ module Parser
3
+ end
4
+ end
5
+
6
+ require "daimon_skycrawlers/parser/base"
7
+ require "daimon_skycrawlers/parser/default"
@@ -0,0 +1,20 @@
1
+ require "daimon_skycrawlers/storage/rdb"
2
+
3
module DaimonSkycrawlers
  class Processor
    # Fallback processor: looks up the stored page for the message URL and
    # prints its URL, body size and headers to standard output.
    class Default
      # JSON is used below; load it here instead of relying on another
      # library having required it.
      require "json"

      # @param message [Hash] queue payload; only message[:url] is read
      # @return [Hash, nil] the parsed headers, or nil when no page is stored
      def call(message)
        url = message[:url]
        storage = DaimonSkycrawlers::Storage::RDB.new
        page = storage.find(url)
        if page.nil?
          # Nothing stored for this URL; report it instead of failing with
          # NoMethodError on nil.
          warn "No stored page found for URL: #{url}"
          return
        end

        headers = JSON.parse(page.headers)
        puts "URL: #{page.url}"
        puts "Body: #{page.body.bytesize} bytes"
        puts "Headers:"
        headers.each do |key, value|
          puts "  #{key}: #{value}"
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'daimon_skycrawlers'
2
+ require 'daimon_skycrawlers/configure_songkick_queue'
3
+ require 'daimon_skycrawlers/url_consumer'
4
+ require 'daimon_skycrawlers/http_response_consumer'
5
+
6
module DaimonSkycrawlers
  # Entry points for the processing side: running the worker and queueing
  # stored pages for processing.
  class Processor
    # Runs a SongkickQueue worker that consumes through HTTPResponseConsumer.
    def self.run(process_name: 'daimon-skycrawler:http-response')
      worker = SongkickQueue::Worker.new(process_name, [HTTPResponseConsumer])
      worker.run
    end

    # Publishes the URL of a stored page on the HTTP-response queue.
    def self.enqueue_http_response(url)
      SongkickQueue.publish('daimon-skycrawler.http-response', url: url)
    end
  end
end
@@ -0,0 +1,13 @@
1
module DaimonSkycrawlers
  module Storage
    # Abstract storage backend. Subclasses implement #save and #find.
    class Base
      # Persists one fetched page.
      #
      # @param url [String]
      # @param headers [Hash]
      # @param body [String]
      def save(url, headers, body)
        raise "Implement this in subclass"
      end

      # Looks up the stored page for +url+. This is the lookup method the
      # concrete backends implement.
      def find(url)
        raise "Implement this in subclass"
      end

      # Kept for backward compatibility; delegates to #find so it works on
      # every subclass that implements the lookup.
      def read(url)
        find(url)
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module DaimonSkycrawlers
  module Storage
    # Storage backend that keeps nothing.
    class Null < Base
      # Discards the page and returns nil.
      def save(url, headers, body)
        nil
      end

      # Never finds anything; returns nil.
      def find(url)
        nil
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
+ require "daimon_skycrawlers/storage/base"
2
+ require "active_record"
3
+
4
module DaimonSkycrawlers
  module Storage
    # Storage backend that keeps fetched pages in the `pages` table through
    # ActiveRecord.
    class RDB < Base
      # Standard libraries used below; required here so the class does not
      # depend on another library having loaded them.
      require "erb"
      require "json"
      require "yaml"

      # Connects ActiveRecord using the entry of the current environment
      # (SKYCRAWLERS_ENV, "development" when unset) in the config file.
      #
      # @param config_path [String] path to the database configuration
      def initialize(config_path = "config/database.yml")
        config = load_config(config_path)
        environment = ENV["SKYCRAWLERS_ENV"] || "development"
        ActiveRecord::Base.establish_connection(config[environment])
      end

      # Stores one fetched page; headers are serialized as JSON.
      def save(url, headers, body)
        Page.create(url: url,
                    headers: JSON.generate(headers),
                    body: body,
                    last_modified_at: headers["last-modified"],
                    etag: headers["etag"])
      end

      # Returns the stored page for +url+ with the greatest
      # last_modified_at, or nil when there is none.
      def find(url)
        Page.where(url: url).order(last_modified_at: :desc).limit(1).first
      end

      # ActiveRecord model backing the `pages` table.
      class Page < ActiveRecord::Base
        self.table_name = "pages"
      end

      private

      # Reads the configuration file, evaluating ERB first because the
      # generated database.yml embeds `<%= ENV[...] %>` for the production
      # password. The file is project-owned and trusted, and it uses YAML
      # aliases (`<<: *default`), so the alias-permitting loader is used
      # where Psych separates it from the safe one.
      def load_config(config_path)
        source = ERB.new(File.read(config_path)).result
        YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(source) : YAML.load(source)
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
+ module DaimonSkycrawlers
2
+ module Storage
3
+ end
4
+ end
5
+
6
+ require "daimon_skycrawlers/storage/base"
7
+ require "daimon_skycrawlers/storage/rdb"
8
+ require "daimon_skycrawlers/storage/null"
@@ -0,0 +1,53 @@
1
+ # Copyright (c) 2012 Janko Marohnić
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person
4
+ # obtaining a copy of this software and associated documentation
5
+ # files (the "Software"), to deal in the Software without
6
+ # restriction, including without limitation the rights to use,
7
+ # copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ # copies of the Software, and to permit persons to whom the
9
+ # Software is furnished to do so, subject to the following
10
+ # conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19
+ # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20
+ # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
+ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ # OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ # https://github.com/janko-m/sinatra-activerecord
25
+ #
26
+
27
require "erb"
require "yaml"

# Seed loader handed to DatabaseTasks; loading seeds is intentionally a
# no-op here.
seed_loader = Class.new do
  def load_seed
    # load "#{ActiveRecord::Tasks::DatabaseTasks.db_dir}/seeds.rb"
  end
end

ActiveRecord::Tasks::DatabaseTasks.tap do |config|
  config.root = Rake.application.original_dir
  config.env = ENV["SKYCRAWLERS_ENV"] || "development"
  config.db_dir = "db"
  config.migrations_paths = ["db/migrate"]
  config.fixtures_path = "test/fixtures"
  config.seed_loader = seed_loader.new
  config.database_configuration = ActiveRecord::Base.configurations
end

# db:load_config can be overridden manually
Rake::Task["db:seed"].enhance(["db:load_config"])
Rake::Task["db:load_config"].clear

Rake::Task.define_task("db:environment")
Rake::Task.define_task("db:load_config") do
  # Evaluate ERB before parsing: the generated database.yml embeds
  # `<%= ENV[...] %>` for the production password. The file is trusted and
  # uses YAML aliases, so use the alias-permitting loader where Psych
  # separates it from the safe one.
  source = ERB.new(File.read("config/database.yml")).result
  ActiveRecord::Base.configurations =
    YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(source) : YAML.load(source)
  environment = ENV["SKYCRAWLERS_ENV"] || "development"
  ActiveRecord::Base.establish_connection(environment.to_sym)
end
Rake::Task["db:test:deprecated"].clear if Rake::Task.task_defined?("db:test:deprecated")
@@ -0,0 +1,2 @@
1
+ load "active_record/railties/databases.rake"
2
+ load "daimon_skycrawlers/tasks/database_tasks.rake"
@@ -0,0 +1,32 @@
1
+ require 'daimon_skycrawlers/crawler'
2
+ require 'daimon_skycrawlers/processor'
3
+
4
module DaimonSkycrawlers
  # Queue consumer that hands each URL message to the registered crawlers.
  class URLConsumer
    include SongkickQueue::Consumer

    consume_from_queue 'daimon-skycrawler.url'

    class << self
      # Registers a crawler to receive every consumed URL.
      def register(crawler)
        crawlers << crawler
      end

      # Crawlers registered so far, in registration order.
      def crawlers
        @crawlers ||= []
      end
    end

    # Fetches the message URL with every registered crawler.
    #
    # @param message [Hash] payload with :url, :depth and :interval; the
    #   defaults of Crawler.enqueue_url (depth 3, interval 1) are used for
    #   missing values
    def process(message)
      url = message[:url]
      depth = message[:depth] || 3
      interval = message[:interval] || 1

      # XXX When several crawlers are registered, how should they behave?
      self.class.crawlers.each do |crawler|
        sleep(interval)
        # depth is a keyword argument of Crawler#fetch; passed positionally
        # it would land in `params` and the queued depth would be ignored.
        crawler.fetch(url, depth: depth)
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module DaimonSkycrawlers
2
+ VERSION = "0.1.0"
3
+ end
@@ -0,0 +1,15 @@
1
+ require 'bundler/setup'
2
+
3
+ require 'daimon_skycrawlers/version'
4
+
5
module DaimonSkycrawlers
  # Registers a processor (object or block) with HTTPResponseConsumer.
  def self.register_processor(processor = nil, &block)
    HTTPResponseConsumer.register(processor, &block)
  end

  # Registers a crawler with URLConsumer.
  def self.register_crawler(crawler)
    URLConsumer.register(crawler)
  end
end
metadata ADDED
@@ -0,0 +1,291 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: daimon_skycrawlers
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ryunosuke SATO
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-01-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: thor
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: songkick_queue
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: faraday
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: faraday_middleware
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: activerecord
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: pg
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: bundler
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '1.11'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '1.11'
125
+ - !ruby/object:Gem::Dependency
126
+ name: rake
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '10.0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '10.0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: test-unit
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: test-unit-rr
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ - !ruby/object:Gem::Dependency
168
+ name: test-unit-notify
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ type: :development
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - ">="
179
+ - !ruby/object:Gem::Version
180
+ version: '0'
181
+ - !ruby/object:Gem::Dependency
182
+ name: cucumber
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - ">="
186
+ - !ruby/object:Gem::Version
187
+ version: '0'
188
+ type: :development
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - ">="
193
+ - !ruby/object:Gem::Version
194
+ version: '0'
195
+ - !ruby/object:Gem::Dependency
196
+ name: pry
197
+ requirement: !ruby/object:Gem::Requirement
198
+ requirements:
199
+ - - ">="
200
+ - !ruby/object:Gem::Version
201
+ version: '0'
202
+ type: :development
203
+ prerelease: false
204
+ version_requirements: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - ">="
207
+ - !ruby/object:Gem::Version
208
+ version: '0'
209
+ - !ruby/object:Gem::Dependency
210
+ name: tapp
211
+ requirement: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - ">="
214
+ - !ruby/object:Gem::Version
215
+ version: '0'
216
+ type: :development
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - ">="
221
+ - !ruby/object:Gem::Version
222
+ version: '0'
223
+ description: This is a crawler framework.
224
+ email:
225
+ - tricknotes.rs@gmail.com
226
+ executables:
227
+ - daimon-skycrawlers
228
+ extensions: []
229
+ extra_rdoc_files: []
230
+ files:
231
+ - ".gitignore"
232
+ - ".travis.yml"
233
+ - Gemfile
234
+ - LICENSE.txt
235
+ - README.md
236
+ - Rakefile
237
+ - bin/daimon-skycrawlers
238
+ - daimon_skycrawlers.gemspec
239
+ - db/schema.rb
240
+ - lib/daimon_skycrawlers.rb
241
+ - lib/daimon_skycrawlers/cli.rb
242
+ - lib/daimon_skycrawlers/configure_songkick_queue.rb
243
+ - lib/daimon_skycrawlers/crawler.rb
244
+ - lib/daimon_skycrawlers/generator/new.rb
245
+ - lib/daimon_skycrawlers/generator/templates/new/Gemfile
246
+ - lib/daimon_skycrawlers/generator/templates/new/README.md.erb
247
+ - lib/daimon_skycrawlers/generator/templates/new/Rakefile
248
+ - lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
249
+ - lib/daimon_skycrawlers/generator/templates/new/crawler.rb
250
+ - lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb
251
+ - lib/daimon_skycrawlers/generator/templates/new/enqueue.rb
252
+ - lib/daimon_skycrawlers/generator/templates/new/processor.rb
253
+ - lib/daimon_skycrawlers/http_response_consumer.rb
254
+ - lib/daimon_skycrawlers/parser.rb
255
+ - lib/daimon_skycrawlers/parser/base.rb
256
+ - lib/daimon_skycrawlers/parser/default.rb
257
+ - lib/daimon_skycrawlers/processor.rb
258
+ - lib/daimon_skycrawlers/processor/default.rb
259
+ - lib/daimon_skycrawlers/storage.rb
260
+ - lib/daimon_skycrawlers/storage/base.rb
261
+ - lib/daimon_skycrawlers/storage/null.rb
262
+ - lib/daimon_skycrawlers/storage/rdb.rb
263
+ - lib/daimon_skycrawlers/tasks.rb
264
+ - lib/daimon_skycrawlers/tasks/database_tasks.rake
265
+ - lib/daimon_skycrawlers/url_consumer.rb
266
+ - lib/daimon_skycrawlers/version.rb
267
+ homepage: https://github.com/bm-sms/daimon-skycrawlers
268
+ licenses:
269
+ - MIT
270
+ metadata: {}
271
+ post_install_message:
272
+ rdoc_options: []
273
+ require_paths:
274
+ - lib
275
+ required_ruby_version: !ruby/object:Gem::Requirement
276
+ requirements:
277
+ - - ">="
278
+ - !ruby/object:Gem::Version
279
+ version: '0'
280
+ required_rubygems_version: !ruby/object:Gem::Requirement
281
+ requirements:
282
+ - - ">="
283
+ - !ruby/object:Gem::Version
284
+ version: '0'
285
+ requirements: []
286
+ rubyforge_project:
287
+ rubygems_version: 2.5.1
288
+ signing_key:
289
+ specification_version: 4
290
+ summary: This is a crawler framework.
291
+ test_files: []