daimon_skycrawlers 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.travis.yml +11 -0
  4. data/Gemfile +3 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +73 -0
  7. data/Rakefile +17 -0
  8. data/bin/daimon-skycrawlers +9 -0
  9. data/daimon_skycrawlers.gemspec +37 -0
  10. data/db/schema.rb +15 -0
  11. data/lib/daimon_skycrawlers/cli.rb +14 -0
  12. data/lib/daimon_skycrawlers/configure_songkick_queue.rb +12 -0
  13. data/lib/daimon_skycrawlers/crawler.rb +90 -0
  14. data/lib/daimon_skycrawlers/generator/new.rb +42 -0
  15. data/lib/daimon_skycrawlers/generator/templates/new/Gemfile +4 -0
  16. data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +34 -0
  17. data/lib/daimon_skycrawlers/generator/templates/new/Rakefile +1 -0
  18. data/lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb +26 -0
  19. data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb +14 -0
  20. data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +13 -0
  21. data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb +15 -0
  22. data/lib/daimon_skycrawlers/generator/templates/new/processor.rb +9 -0
  23. data/lib/daimon_skycrawlers/http_response_consumer.rb +38 -0
  24. data/lib/daimon_skycrawlers/parser/base.rb +13 -0
  25. data/lib/daimon_skycrawlers/parser/default.rb +50 -0
  26. data/lib/daimon_skycrawlers/parser.rb +7 -0
  27. data/lib/daimon_skycrawlers/processor/default.rb +20 -0
  28. data/lib/daimon_skycrawlers/processor.rb +18 -0
  29. data/lib/daimon_skycrawlers/storage/base.rb +13 -0
  30. data/lib/daimon_skycrawlers/storage/null.rb +11 -0
  31. data/lib/daimon_skycrawlers/storage/rdb.rb +30 -0
  32. data/lib/daimon_skycrawlers/storage.rb +8 -0
  33. data/lib/daimon_skycrawlers/tasks/database_tasks.rake +53 -0
  34. data/lib/daimon_skycrawlers/tasks.rb +2 -0
  35. data/lib/daimon_skycrawlers/url_consumer.rb +32 -0
  36. data/lib/daimon_skycrawlers/version.rb +3 -0
  37. data/lib/daimon_skycrawlers.rb +15 -0
  38. metadata +291 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e9ebc710f18b92107a91a3bb5a4c8972051ede8f
4
+ data.tar.gz: 61db84bcdc73557ae9b28cc7894c04a1055dda6d
5
+ SHA512:
6
+ metadata.gz: 76d7b629eefc04a89d5cdce8939a9e83f35c68a9b8d1be3599e3c7d59b815f506eb1c295d1e3095f18ca7ec12169fa5e365cda981561804bbb4d2d070eef3051
7
+ data.tar.gz: 04b326f7a8531d364c3d41d12f99328831b2eda1acefa72869f588750e065abc0a044fd6d87b92f20a6359983dba8b00c42a4ab40d75979f92c6a553ec16f9e9
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.travis.yml ADDED
@@ -0,0 +1,11 @@
1
+ language: ruby
2
+ sudo: false
3
+ cache: bundler
4
+ rvm:
5
+ - 2.2.4
6
+ - 2.3.0
7
+
8
+ before_install: gem install bundler -v 1.11.2
9
+
10
+ services:
11
+ - rabbitmq
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Ryunosuke SATO
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,73 @@
1
+ # DaimonSkycrawlers
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/daimon_skycrawlers`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Requirements
8
+
9
+ - Ruby
10
+ - RabbitMQ
11
+ - RDB
12
+ - PostgreSQL (default)
13
+ - MySQL
14
+ - SQLite3
15
+
16
+ ## Installation
17
+
18
+ Add this line to your application's Gemfile:
19
+
20
+ ```ruby
21
+ gem 'daimon_skycrawlers'
22
+ ```
23
+
24
+ And then execute:
25
+
26
+ $ bundle
27
+
28
+ Or install it yourself as:
29
+
30
+ $ gem install daimon_skycrawlers
31
+
32
+ ## Usage
33
+
34
+ 1. Create project
35
+
36
+ $ bundle exec daimon-skycrawlers new mycrawlers
37
+ $ cd mycrawlers
38
+
39
+ 2. Install dependencies
40
+
41
+ $ bundle install
42
+
43
+ 3. Create database
44
+
45
+ $ bundle exec rake db:create
46
+ $ bundle exec rake db:migrate
47
+
48
+ 4. Open new terminal and run crawler/processor
49
+
50
+ $ bundle exec ruby crawler.rb # on new terminal
51
+ $ bundle exec ruby processor.rb # on new terminal
52
+
53
+ 5. Enqueue task
54
+
55
+ $ bundle exec ruby enqueue.rb http://example.com/
56
+
57
+ 6. You'll see `It works with 'http://example.com/'` in the terminal that is running your processor!
58
+
59
+ ## Development
60
+
61
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
62
+
63
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
64
+
65
+ ## Contributing
66
+
67
+ Bug reports and pull requests are welcome on GitHub at https://github.com/bm-sms/daimon-skycrawlers.
68
+
69
+
70
+ ## License
71
+
72
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
73
+
data/Rakefile ADDED
@@ -0,0 +1,17 @@
1
+ require "bundler/setup"
2
+ require "bundler/gem_tasks"
3
+ require "rake/testtask"
4
+
5
+ Rake::TestTask.new(:test) do |t|
6
+ t.libs << "test"
7
+ t.libs << "lib"
8
+ t.test_files = FileList['test/**/*_test.rb']
9
+ end
10
+
11
+ require "cucumber/rake/task"
12
+
13
+ Cucumber::Rake::Task.new(:features) do |t|
14
+ t.cucumber_opts = "features --format pretty"
15
+ end
16
+
17
+ task :default => [:test, :features]
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ if File.exist?(File.expand_path('../.git', __dir__))
4
+ $LOAD_PATH << File.expand_path('../lib', __dir__)
5
+ end
6
+
7
+ require 'daimon_skycrawlers/cli'
8
+
9
+ DaimonSkycrawlers::CLI.start
@@ -0,0 +1,37 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'daimon_skycrawlers/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "daimon_skycrawlers"
8
+ spec.version = DaimonSkycrawlers::VERSION
9
+ spec.authors = ["Ryunosuke SATO"]
10
+ spec.email = ["tricknotes.rs@gmail.com"]
11
+
12
+ spec.summary = %q{This is a crawler framework.}
13
+ spec.description = %q{This is a crawler framework.}
14
+ spec.homepage = "https://github.com/bm-sms/daimon-skycrawlers"
15
+ spec.license = "MIT"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_dependency "thor"
22
+ spec.add_dependency "songkick_queue"
23
+ spec.add_dependency "faraday"
24
+ spec.add_dependency "faraday_middleware"
25
+ spec.add_dependency "nokogiri"
26
+ spec.add_dependency "activerecord"
27
+ spec.add_dependency "pg"
28
+
29
+ spec.add_development_dependency "bundler", "~> 1.11"
30
+ spec.add_development_dependency "rake", "~> 10.0"
31
+ spec.add_development_dependency "test-unit"
32
+ spec.add_development_dependency "test-unit-rr"
33
+ spec.add_development_dependency "test-unit-notify"
34
+ spec.add_development_dependency "cucumber"
35
+ spec.add_development_dependency "pry"
36
+ spec.add_development_dependency "tapp"
37
+ end
data/db/schema.rb ADDED
@@ -0,0 +1,15 @@
1
+ require "active_record"
2
+
3
+ ActiveRecord::Base.establish_connection(adapter: "sqlite3",
4
+ database: "storage.db")
5
+
6
+ ActiveRecord::Schema.define(version: 1) do
7
+ create_table :pages do |t|
8
+ t.string :url
9
+ t.text :headers
10
+ t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
11
+ t.datetime :last_modified_at
12
+ t.string :etag
13
+ t.timestamps
14
+ end
15
+ end
@@ -0,0 +1,14 @@
1
+ require 'thor'
2
+ require 'daimon_skycrawlers/generator/new'
3
+ require 'daimon_skycrawlers/version'
4
+
5
module DaimonSkycrawlers
  # Command line entry point built on Thor.
  #
  # Exposes two commands:
  # * +new NAME+ - delegates to Generator::New.
  # * +version+  - prints the gem version.
  class CLI < Thor
    register(Generator::New, "new", "new NAME", "Create new project")

    desc "version", "Show version"
    # Writes DaimonSkycrawlers::VERSION to standard output.
    def version
      $stdout.puts(VERSION)
    end
  end
end
@@ -0,0 +1,12 @@
1
+ require 'songkick_queue'
2
+ # TODO Allow to configure from user land
3
+ SongkickQueue.configure do |config|
4
+ config.logger = Logger.new(STDOUT)
5
+ config.host = '127.0.0.1'
6
+ config.port = 5672
7
+ # config.username = 'guest'
8
+ # config.password = 'guest'
9
+ config.vhost = '/'
10
+ config.max_reconnect_attempts = 10
11
+ config.network_recovery_interval = 1.0
12
+ end
@@ -0,0 +1,90 @@
1
+ require 'uri'
2
+
3
+ require 'daimon_skycrawlers'
4
+ require 'daimon_skycrawlers/version'
5
+ require 'daimon_skycrawlers/configure_songkick_queue'
6
+ require 'daimon_skycrawlers/url_consumer'
7
+ require 'daimon_skycrawlers/storage'
8
+ require 'daimon_skycrawlers/parser'
9
+
10
+ require 'faraday'
11
+
12
module DaimonSkycrawlers
  # Fetches a page over HTTP, stores it, schedules it for processing and
  # enqueues the links found on it for further crawling.
  class Crawler
    class << self
      # Runs a SongkickQueue worker that consumes the URL queue.
      #
      # @param process_name [String] name handed to SongkickQueue::Worker
      def run(process_name: 'daimon-skycrawler:url')
        SongkickQueue::Worker.new(process_name, [URLConsumer]).run
      end

      # Publishes a URL to the crawl queue.
      #
      # @param url [String] URL to crawl
      # @param depth [Integer] remaining link depth to follow
      # @param interval [Numeric] value forwarded in the message as +interval+
      def enqueue_url(url, depth: 3, interval: 1)
        SongkickQueue.publish('daimon-skycrawler.url', url: url, depth: depth, interval: interval)
      end
    end

    attr_writer :storage
    attr_writer :parser

    # @param base_url [String] URL prefix used to build the HTTP connection
    # @param options [Hash] stored as-is; not read anywhere in this class
    def initialize(base_url, options = {})
      @base_url = base_url
      @options = options
    end

    # Builds the HTTP connection, optionally letting the caller configure it.
    #
    # The block is optional: previously calling this without a block raised
    # LocalJumpError because the connection block yielded unconditionally.
    #
    # @param options [Hash] options forwarded to Faraday.new
    # @yield [faraday] the connection being built
    def setup_connection(options = {}, &block)
      @connection = Faraday.new(@base_url, options, &block)
    end

    # Storage backend; defaults to Storage::RDB.
    def storage
      @storage ||= Storage::RDB.new
    end

    # Link parser; defaults to Parser::Default.
    def parser
      @parser ||= Parser::Default.new
    end

    # Fetches +path+, saves the response, schedules processing and enqueues
    # the links reported by the parser with +depth - 1+.
    #
    # TODO Support POST when we need
    # TODO `params` should be a part of `path`. such as `path == "/hoi?hi=yoyo"`.
    #
    # @param path [String] path or URL resolved against the connection prefix
    # @param params [Hash] query parameters sent with the GET request
    # @param depth [Integer] remaining link depth
    # @yield [url, headers, body] the fetched data, before it is saved
    def fetch(path, params = {}, depth: 3)
      # Tolerate callers that pass the depth positionally (`fetch(url, 2)`):
      # an Integer can never be a parameter set, so treat it as the depth.
      if params.is_a?(Integer)
        depth = params
        params = {}
      end

      # `params` used to be accepted but silently dropped here.
      response = get(path, params)

      url = connection.url_prefix + path

      data = [url.to_s, response.headers, response.body]

      yield(*data) if block_given?

      storage.save(*data)

      schedule_to_process(url.to_s)

      parser.parse(response.body)
      urls = parser.links

      enqueue_next_urls(urls, depth: depth - 1, interval: 1)
    end

    # Issues a GET request through the (lazily built) connection.
    def get(path, params = {})
      connection.get(path, params)
    end

    # Issues a POST request through the (lazily built) connection.
    def post(path, params = {})
      connection.post(path, params)
    end

    private

    # Returns the configured connection, building a default one on first use
    # so that #get and #post work even when #fetch was not called before.
    def connection
      @connection ||= Faraday.new(@base_url)
    end

    def schedule_to_process(url)
      DaimonSkycrawlers::Processor.enqueue_http_response(url)
    end

    # Enqueues every URL unless the depth budget is exhausted.
    def enqueue_next_urls(urls, depth: 3, interval: 1)
      return if depth <= 0

      urls.each do |url|
        self.class.enqueue_url(url, depth: depth, interval: interval)
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'thor'
2
+
3
module DaimonSkycrawlers
  module Generator
    # Thor generator behind `daimon-skycrawlers new NAME`.
    #
    # The two public instance methods below are the generator steps; they
    # write the project skeleton into a directory named after +name+.
    class New < Thor::Group
      include Thor::Actions

      argument :name

      # Directory holding the project templates, next to this file.
      def self.source_root
        File.join(File.dirname(__FILE__), "templates", "new")
      end

      # Renders the ERB templates into the new project.
      def create_files
        %w[README.md config/database.yml].each do |relative_path|
          template("#{relative_path}.erb", "#{name}/#{relative_path}")
        end
      end

      # Copies the static files, then the migration under a timestamped name.
      def copy_files
        static_files = %w[Gemfile Rakefile crawler.rb enqueue.rb processor.rb]
        static_files.each do |relative_path|
          copy_file(relative_path, "#{name}/#{relative_path}")
        end

        migrations = ["db/migrate/create_pages.rb"]
        migrations.each do |relative_path|
          timestamp = Time.now.strftime("%Y%m%d%H%M%S")
          file_name = "#{timestamp}_#{File.basename(relative_path)}"
          copy_file(relative_path, "#{name}/db/migrate/#{file_name}")
        end
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'rake'
4
+ gem 'daimon_skycrawlers'
@@ -0,0 +1,34 @@
1
+ # <%= name %>
2
+
3
+ TODO: Write description.
4
+
5
+ ## Requirements
6
+
7
+ - Ruby
8
+ - RabbitMQ
9
+ - RDB
10
+ - PostgreSQL (default)
11
+ - MySQL
12
+ - SQLite3
13
+
14
+ ## Usage
15
+
16
+ 1. Install dependencies
17
+
18
+ $ bundle install
19
+
20
+ 2. Create database
21
+
22
+ $ bundle exec rake db:create
23
+ $ bundle exec rake db:migrate
24
+
25
+ 3. Open new terminal and run crawler/processor
26
+
27
+ $ bundle exec ruby crawler.rb # on new terminal
28
+ $ bundle exec ruby processor.rb # on new terminal
29
+
30
+ 4. Enqueue task
31
+
32
+ $ bundle exec ruby enqueue.rb http://example.com/
33
+
34
+ 5. You'll see `It works with 'http://example.com/'` in the terminal that is running your processor!
@@ -0,0 +1 @@
1
+ require "daimon_skycrawlers/tasks"
@@ -0,0 +1,26 @@
1
+ # PostgreSQL. Versions 8.2 and up are supported.
2
+ #
3
+ default: &default
4
+ adapter: postgresql
5
+ encoding: unicode
6
+ pool: 5
7
+
8
+ development:
9
+ <<: *default
10
+ database: <%= name %>_development
11
+ #username: <%= name %>
12
+ #password:
13
+ #host: localhost
14
+ #port: 5432
15
+ #schema_search_path: myapp,sharedapp,public
16
+ #min_messages: notice
17
+
18
+ test:
19
+ <<: *default
20
+ database: <%= name %>_test
21
+
22
+ production:
23
+ <<: *default
24
+ database: <%= name %>_production
25
+ username: <%= name %>
26
+ password: <%%= ENV['<%= name.upcase %>_PASSWORD'] %>
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "daimon_skycrawlers/crawler"
4
+
5
+ base_url = 'http://example.com'
6
+
7
+ crawler = DaimonSkycrawlers::Crawler.new(base_url)
8
+ crawler.parser.append_filter do |url|
9
+ url.start_with?(base_url)
10
+ end
11
+
12
+ DaimonSkycrawlers.register_crawler(crawler)
13
+
14
+ DaimonSkycrawlers::Crawler.run
@@ -0,0 +1,13 @@
1
+ class CreatePages < ActiveRecord::Migration
2
+ def change
3
+ create_table :pages do |t|
4
+ t.string :url
5
+ t.text :headers
6
+ t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
7
+ t.datetime :last_modified_at
8
+ t.string :etag
9
+
10
+ t.timestamps null: false
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "daimon_skycrawlers/crawler"
4
+
5
+ USAGE = "Usage: #{$0} [URL]"
6
+
7
+ if ARGV.size < 1
8
+ $stderr.puts "#{$0}: missing URL"
9
+ $stderr.puts USAGE
10
+ exit false
11
+ end
12
+
13
+ url = ARGV[0]
14
+
15
+ DaimonSkycrawlers::Crawler.enqueue_url(url)
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "daimon_skycrawlers/processor"
4
+
5
+ DaimonSkycrawlers.register_processor do |data|
6
+ p "It works with '#{data[:url]}'"
7
+ end
8
+
9
+ DaimonSkycrawlers::Processor.run
@@ -0,0 +1,38 @@
1
+ require "daimon_skycrawlers/processor/default"
2
+
3
module DaimonSkycrawlers
  # SongkickQueue consumer for the 'daimon-skycrawler.http-response' queue.
  # Each message is handed to every registered processor, or to the default
  # processor when none has been registered.
  class HTTPResponseConsumer
    include SongkickQueue::Consumer

    consume_from_queue 'daimon-skycrawler.http-response'

    class << self
      # Registers a processor. A block takes precedence over +processor+.
      #
      # @param processor [#call, nil] object invoked with each message
      # @yield [message] alternative to passing +processor+
      # @return [Array] the list of registered processors
      # @raise [ArgumentError] when neither a callable nor a block is given;
      #   previously +nil+ was stored and only failed later, inside #process
      def register(processor = nil, &block)
        handler = block || processor
        unless handler.respond_to?(:call)
          raise ArgumentError, "processor must respond to #call (got #{handler.inspect})"
        end
        processors << handler
      end

      # @return [Array] processors registered so far
      def processors
        @processors ||= []
      end

      # @return [DaimonSkycrawlers::Processor::Default] a fresh fallback processor
      def default_processor
        DaimonSkycrawlers::Processor::Default.new
      end
    end

    # Dispatches +message+ to the registered processors in registration order.
    def process(message)
      handlers = self.class.processors
      handlers = [self.class.default_processor] if handlers.empty?
      handlers.each do |handler|
        handler.call(message)
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module DaimonSkycrawlers
  module Parser
    # Abstract parser. Subclasses must implement #parse.
    #
    # Both +html+ arguments are optional so that the base contract agrees
    # with Parser::Default (constructed without arguments, parsed with the
    # document) while staying compatible with `Base.new(html)` / `parse()`.
    class Base
      # @param html [String, nil] document to parse, if known up front
      def initialize(html = nil)
        @html = html
      end

      # @param html [String, nil] document to parse
      # @raise [RuntimeError] always; subclasses provide the implementation
      def parse(html = nil)
        raise "Implement this method in subclass"
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ require "nokogiri"
2
+
3
module DaimonSkycrawlers
  module Parser
    # Extracts the +href+ of every <a> element and keeps the ones accepted
    # by all registered filters.
    class Default < Base
      def initialize
        @filters = []
      end

      # Adds a filter; a URL is kept only when every filter returns truthy.
      # A block takes precedence over +filter+.
      #
      # @param filter [#call, nil] callable receiving a URL string
      # @return [Array] the registered filters
      def append_filter(filter = nil, &block)
        @filters << (block || filter)
      end

      # Parses +html+ and forgets the links of any previously parsed
      # document. Without the reset, a parser reused across pages kept
      # returning the first page's links.
      #
      # @param html [String] document source
      def parse(html)
        @html = html
        @links = nil
        @doc = Nokogiri::HTML(html)
      end

      # @return [Array<String>] filtered links of the last parsed document
      #   (empty when nothing has been parsed yet)
      def links
        @links ||= retrieve_links
      end

      private

      def retrieve_links
        return [] if @doc.nil?

        # Anchors without an href yield nil; drop them so filters only
        # ever see strings.
        urls = @doc.search("a").map { |element| element["href"] }.compact
        apply_filters(urls)
      end

      # Applies every filter in registration order.
      def apply_filters(urls)
        @filters.inject(urls) do |remaining, filter|
          remaining.select { |url| filter.call(url) }
        end
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
+ module DaimonSkycrawlers
2
+ module Parser
3
+ end
4
+ end
5
+
6
+ require "daimon_skycrawlers/parser/base"
7
+ require "daimon_skycrawlers/parser/default"
@@ -0,0 +1,20 @@
1
+ require "daimon_skycrawlers/storage/rdb"
2
+
3
require "json"

module DaimonSkycrawlers
  class Processor
    # Fallback processor: looks the page up in the RDB storage and prints a
    # short summary (URL, body size, headers) to standard output.
    class Default
      # @param message [Hash] queue message; only +:url+ is read
      # @return [void]
      def call(message)
        url = message[:url]
        storage = DaimonSkycrawlers::Storage::RDB.new
        page = storage.find(url)
        # The lookup returns nil when no page is stored for this URL;
        # report it instead of crashing with NoMethodError.
        if page.nil?
          warn "No stored page found for URL: #{url}"
          return
        end

        # Headers are stored as a JSON document by Storage::RDB#save.
        headers = JSON.parse(page.headers)
        puts "URL: #{page.url}"
        puts "Body: #{page.body.bytesize} bytes"
        puts "Headers:"
        headers.each do |key, value|
          puts " #{key}: #{value}"
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ require 'daimon_skycrawlers'
2
+ require 'daimon_skycrawlers/configure_songkick_queue'
3
+ require 'daimon_skycrawlers/url_consumer'
4
+ require 'daimon_skycrawlers/http_response_consumer'
5
+
6
module DaimonSkycrawlers
  # Entry points for the HTTP-response processing side of the pipeline.
  class Processor
    # Runs a SongkickQueue worker that feeds HTTPResponseConsumer.
    #
    # @param process_name [String] name handed to SongkickQueue::Worker
    def self.run(process_name: 'daimon-skycrawler:http-response')
      worker = SongkickQueue::Worker.new(process_name, [HTTPResponseConsumer])
      worker.run
    end

    # Publishes +url+ on the HTTP-response queue.
    #
    # @param url [String] URL of the page to process
    def self.enqueue_http_response(url)
      payload = { url: url }
      SongkickQueue.publish('daimon-skycrawler.http-response', payload)
    end
  end
end
@@ -0,0 +1,13 @@
1
module DaimonSkycrawlers
  module Storage
    # Abstract storage backend. Subclasses implement #save and #find.
    class Base
      # Persists a fetched page.
      #
      # @raise [RuntimeError] always; subclasses provide the implementation
      def save(url, headers, body)
        raise "Implement this in subclass"
      end

      # Looks up the stored page for +url+. This is the lookup name the
      # concrete backends and their callers use, so it is part of the
      # abstract contract.
      #
      # @raise [RuntimeError] always; subclasses provide the implementation
      def find(url)
        raise "Implement this in subclass"
      end

      # Older name for #find, kept for backward compatibility. It delegates
      # so that a subclass only has to implement #find.
      def read(url)
        find(url)
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module DaimonSkycrawlers
  module Storage
    # Storage backend that discards everything: both operations do nothing
    # and return nil.
    class Null < Base
      # Ignores the page.
      def save(url, headers, body)
        nil
      end

      # Never finds anything.
      def find(url)
        nil
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
require "daimon_skycrawlers/storage/base"
require "active_record"
require "erb"
require "json"
require "yaml"

module DaimonSkycrawlers
  module Storage
    # ActiveRecord-backed storage that keeps fetched pages in the +pages+
    # table.
    class RDB < Base
      # Connects ActiveRecord using the section of the YAML configuration
      # named by SKYCRAWLERS_ENV (default "development").
      #
      # @param config_path [String] path to the database configuration
      def initialize(config_path = "config/database.yml")
        config = load_config(config_path)
        environment = ENV["SKYCRAWLERS_ENV"] || "development"
        ActiveRecord::Base.establish_connection(config[environment])
      end

      # Stores a page; headers are serialized as JSON.
      #
      # @param url [String]
      # @param headers [Hash] response headers ("last-modified" and "etag"
      #   are also copied into their own columns)
      # @param body [String]
      def save(url, headers, body)
        Page.create(url: url,
                    headers: JSON.generate(headers),
                    body: body,
                    last_modified_at: headers["last-modified"],
                    etag: headers["etag"])
      end

      # Returns the stored page for +url+ with the greatest
      # +last_modified_at+, or nil when there is none. The id is used as a
      # tie-breaker so the result is deterministic when several rows share
      # the same (or no) last_modified_at.
      def find(url)
        Page.where(url: url).order(last_modified_at: :desc, id: :desc).limit(1).first
      end

      class Page < ActiveRecord::Base
        self.table_name = "pages"
      end

      private

      # Reads the configuration file, evaluating ERB first: the generated
      # config/database.yml contains an ERB tag (the production password),
      # which plain YAML loading would pass through as literal text.
      def load_config(config_path)
        source = ERB.new(File.read(config_path)).result
        # The file is trusted local configuration and uses YAML aliases
        # (`<<: *default`); newer Psych versions reject aliases in
        # YAML.load, so prefer unsafe_load where it exists.
        if YAML.respond_to?(:unsafe_load)
          YAML.unsafe_load(source)
        else
          YAML.load(source)
        end
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
+ module DaimonSkycrawlers
2
+ module Storage
3
+ end
4
+ end
5
+
6
+ require "daimon_skycrawlers/storage/base"
7
+ require "daimon_skycrawlers/storage/rdb"
8
+ require "daimon_skycrawlers/storage/null"
@@ -0,0 +1,53 @@
1
+ # Copyright (c) 2012 Janko Marohnić
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person
4
+ # obtaining a copy of this software and associated documentation
5
+ # files (the "Software"), to deal in the Software without
6
+ # restriction, including without limitation the rights to use,
7
+ # copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ # copies of the Software, and to permit persons to whom the
9
+ # Software is furnished to do so, subject to the following
10
+ # conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19
+ # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20
+ # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
+ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ # OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ # https://github.com/janko-m/sinatra-activerecord
25
+ #
26
+
27
+ seed_loader = Class.new do
28
+ def load_seed
29
+ # load "#{ActiveRecord::Tasks::DatabaseTasks.db_dir}/seeds.rb"
30
+ end
31
+ end
32
+
33
+ ActiveRecord::Tasks::DatabaseTasks.tap do |config|
34
+ config.root = Rake.application.original_dir
35
+ config.env = ENV["SKYCRAWLERS_ENV"] || "development"
36
+ config.db_dir = "db"
37
+ config.migrations_paths = ["db/migrate"]
38
+ config.fixtures_path = "test/fixtures"
39
+ config.seed_loader = seed_loader.new
40
+ config.database_configuration = ActiveRecord::Base.configurations
41
+ end
42
+
43
+ # db:load_config can be overridden manually
44
+ Rake::Task["db:seed"].enhance(["db:load_config"])
45
+ Rake::Task["db:load_config"].clear
46
+
47
+ Rake::Task.define_task("db:environment")
48
+ Rake::Task.define_task("db:load_config") do
49
+ ActiveRecord::Base.configurations = YAML.load_file("config/database.yml")
50
+ environment = ENV["SKYCRAWLERS_ENV"] || "development"
51
+ ActiveRecord::Base.establish_connection(environment.to_sym)
52
+ end
53
+ Rake::Task["db:test:deprecated"].clear if Rake::Task.task_defined?("db:test:deprecated")
@@ -0,0 +1,2 @@
1
+ load "active_record/railties/databases.rake"
2
+ load "daimon_skycrawlers/tasks/database_tasks.rake"
@@ -0,0 +1,32 @@
1
+ require 'daimon_skycrawlers/crawler'
2
+ require 'daimon_skycrawlers/processor'
3
+
4
module DaimonSkycrawlers
  # SongkickQueue consumer for the 'daimon-skycrawler.url' queue. Each
  # message is handed to every registered crawler.
  class URLConsumer
    include SongkickQueue::Consumer

    consume_from_queue 'daimon-skycrawler.url'

    class << self
      # Registers a crawler to be run for every consumed URL.
      #
      # @param crawler [#fetch]
      # @return [Array] the registered crawlers
      def register(crawler)
        crawlers << crawler
      end

      # @return [Array] crawlers registered so far
      def crawlers
        @crawlers ||= []
      end
    end

    # Fetches the message's URL with each registered crawler, sleeping
    # +interval+ seconds before every fetch.
    #
    # @param message [Hash] reads +:url+, +:depth+ and +:interval+
    def process(message)
      url = message[:url]
      # Fall back to the defaults used by Crawler.enqueue_url when a
      # publisher omitted these fields.
      depth = message[:depth] || 3
      interval = message[:interval] || 1

      # XXX When several crawlers are registered, how should they behave?
      self.class.crawlers.each do |crawler|
        sleep(interval)
        # Crawler#fetch takes the depth as a keyword; passing it
        # positionally bound it to `params`, so the depth never decreased.
        crawler.fetch(url, depth: depth)
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ module DaimonSkycrawlers
2
+ VERSION = "0.1.0"
3
+ end
@@ -0,0 +1,15 @@
1
+ require 'bundler/setup'
2
+
3
+ require 'daimon_skycrawlers/version'
4
+
5
module DaimonSkycrawlers
  # Registers a processor (object or block) with HTTPResponseConsumer.
  #
  # @param processor [Object, nil] forwarded unchanged
  # @yield forwarded unchanged
  def self.register_processor(processor = nil, &block)
    HTTPResponseConsumer.register(processor, &block)
  end

  # Registers a crawler with URLConsumer.
  #
  # @param crawler [Object] forwarded unchanged
  def self.register_crawler(crawler)
    URLConsumer.register(crawler)
  end
end
metadata ADDED
@@ -0,0 +1,291 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: daimon_skycrawlers
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ryunosuke SATO
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-01-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: thor
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: songkick_queue
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: faraday
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: faraday_middleware
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: activerecord
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: pg
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: bundler
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '1.11'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '1.11'
125
+ - !ruby/object:Gem::Dependency
126
+ name: rake
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '10.0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '10.0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: test-unit
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: test-unit-rr
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ - !ruby/object:Gem::Dependency
168
+ name: test-unit-notify
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - ">="
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ type: :development
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - ">="
179
+ - !ruby/object:Gem::Version
180
+ version: '0'
181
+ - !ruby/object:Gem::Dependency
182
+ name: cucumber
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - ">="
186
+ - !ruby/object:Gem::Version
187
+ version: '0'
188
+ type: :development
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - ">="
193
+ - !ruby/object:Gem::Version
194
+ version: '0'
195
+ - !ruby/object:Gem::Dependency
196
+ name: pry
197
+ requirement: !ruby/object:Gem::Requirement
198
+ requirements:
199
+ - - ">="
200
+ - !ruby/object:Gem::Version
201
+ version: '0'
202
+ type: :development
203
+ prerelease: false
204
+ version_requirements: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - ">="
207
+ - !ruby/object:Gem::Version
208
+ version: '0'
209
+ - !ruby/object:Gem::Dependency
210
+ name: tapp
211
+ requirement: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - ">="
214
+ - !ruby/object:Gem::Version
215
+ version: '0'
216
+ type: :development
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - ">="
221
+ - !ruby/object:Gem::Version
222
+ version: '0'
223
+ description: This is a crawler framework.
224
+ email:
225
+ - tricknotes.rs@gmail.com
226
+ executables:
227
+ - daimon-skycrawlers
228
+ extensions: []
229
+ extra_rdoc_files: []
230
+ files:
231
+ - ".gitignore"
232
+ - ".travis.yml"
233
+ - Gemfile
234
+ - LICENSE.txt
235
+ - README.md
236
+ - Rakefile
237
+ - bin/daimon-skycrawlers
238
+ - daimon_skycrawlers.gemspec
239
+ - db/schema.rb
240
+ - lib/daimon_skycrawlers.rb
241
+ - lib/daimon_skycrawlers/cli.rb
242
+ - lib/daimon_skycrawlers/configure_songkick_queue.rb
243
+ - lib/daimon_skycrawlers/crawler.rb
244
+ - lib/daimon_skycrawlers/generator/new.rb
245
+ - lib/daimon_skycrawlers/generator/templates/new/Gemfile
246
+ - lib/daimon_skycrawlers/generator/templates/new/README.md.erb
247
+ - lib/daimon_skycrawlers/generator/templates/new/Rakefile
248
+ - lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
249
+ - lib/daimon_skycrawlers/generator/templates/new/crawler.rb
250
+ - lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb
251
+ - lib/daimon_skycrawlers/generator/templates/new/enqueue.rb
252
+ - lib/daimon_skycrawlers/generator/templates/new/processor.rb
253
+ - lib/daimon_skycrawlers/http_response_consumer.rb
254
+ - lib/daimon_skycrawlers/parser.rb
255
+ - lib/daimon_skycrawlers/parser/base.rb
256
+ - lib/daimon_skycrawlers/parser/default.rb
257
+ - lib/daimon_skycrawlers/processor.rb
258
+ - lib/daimon_skycrawlers/processor/default.rb
259
+ - lib/daimon_skycrawlers/storage.rb
260
+ - lib/daimon_skycrawlers/storage/base.rb
261
+ - lib/daimon_skycrawlers/storage/null.rb
262
+ - lib/daimon_skycrawlers/storage/rdb.rb
263
+ - lib/daimon_skycrawlers/tasks.rb
264
+ - lib/daimon_skycrawlers/tasks/database_tasks.rake
265
+ - lib/daimon_skycrawlers/url_consumer.rb
266
+ - lib/daimon_skycrawlers/version.rb
267
+ homepage: https://github.com/bm-sms/daimon-skycrawlers
268
+ licenses:
269
+ - MIT
270
+ metadata: {}
271
+ post_install_message:
272
+ rdoc_options: []
273
+ require_paths:
274
+ - lib
275
+ required_ruby_version: !ruby/object:Gem::Requirement
276
+ requirements:
277
+ - - ">="
278
+ - !ruby/object:Gem::Version
279
+ version: '0'
280
+ required_rubygems_version: !ruby/object:Gem::Requirement
281
+ requirements:
282
+ - - ">="
283
+ - !ruby/object:Gem::Version
284
+ version: '0'
285
+ requirements: []
286
+ rubyforge_project:
287
+ rubygems_version: 2.5.1
288
+ signing_key:
289
+ specification_version: 4
290
+ summary: This is a crawler framework.
291
+ test_files: []