daimon_skycrawlers 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.travis.yml +11 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +21 -0
- data/README.md +73 -0
- data/Rakefile +17 -0
- data/bin/daimon-skycrawlers +9 -0
- data/daimon_skycrawlers.gemspec +37 -0
- data/db/schema.rb +15 -0
- data/lib/daimon_skycrawlers/cli.rb +14 -0
- data/lib/daimon_skycrawlers/configure_songkick_queue.rb +12 -0
- data/lib/daimon_skycrawlers/crawler.rb +90 -0
- data/lib/daimon_skycrawlers/generator/new.rb +42 -0
- data/lib/daimon_skycrawlers/generator/templates/new/Gemfile +4 -0
- data/lib/daimon_skycrawlers/generator/templates/new/README.md.erb +34 -0
- data/lib/daimon_skycrawlers/generator/templates/new/Rakefile +1 -0
- data/lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb +26 -0
- data/lib/daimon_skycrawlers/generator/templates/new/crawler.rb +14 -0
- data/lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb +13 -0
- data/lib/daimon_skycrawlers/generator/templates/new/enqueue.rb +15 -0
- data/lib/daimon_skycrawlers/generator/templates/new/processor.rb +9 -0
- data/lib/daimon_skycrawlers/http_response_consumer.rb +38 -0
- data/lib/daimon_skycrawlers/parser/base.rb +13 -0
- data/lib/daimon_skycrawlers/parser/default.rb +50 -0
- data/lib/daimon_skycrawlers/parser.rb +7 -0
- data/lib/daimon_skycrawlers/processor/default.rb +20 -0
- data/lib/daimon_skycrawlers/processor.rb +18 -0
- data/lib/daimon_skycrawlers/storage/base.rb +13 -0
- data/lib/daimon_skycrawlers/storage/null.rb +11 -0
- data/lib/daimon_skycrawlers/storage/rdb.rb +30 -0
- data/lib/daimon_skycrawlers/storage.rb +8 -0
- data/lib/daimon_skycrawlers/tasks/database_tasks.rake +53 -0
- data/lib/daimon_skycrawlers/tasks.rb +2 -0
- data/lib/daimon_skycrawlers/url_consumer.rb +32 -0
- data/lib/daimon_skycrawlers/version.rb +3 -0
- data/lib/daimon_skycrawlers.rb +15 -0
- metadata +291 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: e9ebc710f18b92107a91a3bb5a4c8972051ede8f
|
4
|
+
data.tar.gz: 61db84bcdc73557ae9b28cc7894c04a1055dda6d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 76d7b629eefc04a89d5cdce8939a9e83f35c68a9b8d1be3599e3c7d59b815f506eb1c295d1e3095f18ca7ec12169fa5e365cda981561804bbb4d2d070eef3051
|
7
|
+
data.tar.gz: 04b326f7a8531d364c3d41d12f99328831b2eda1acefa72869f588750e065abc0a044fd6d87b92f20a6359983dba8b00c42a4ab40d75979f92c6a553ec16f9e9
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2016 Ryunosuke SATO
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
# DaimonSkycrawlers
|
2
|
+
|
3
|
+
DaimonSkycrawlers is a crawler framework. Crawl requests and fetched responses are passed through RabbitMQ queues, and crawled pages are stored in a relational database. The library code lives under `lib/daimon_skycrawlers/`.
|
4
|
+
|
5
|
+
TODO: Delete this and the text above, and describe your gem
|
6
|
+
|
7
|
+
## Requirements
|
8
|
+
|
9
|
+
- Ruby
|
10
|
+
- RabbitMQ
|
11
|
+
- RDB
|
12
|
+
- PostgreSQL (default)
|
13
|
+
- MySQL
|
14
|
+
- SQLite3
|
15
|
+
|
16
|
+
## Installation
|
17
|
+
|
18
|
+
Add this line to your application's Gemfile:
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
gem 'daimon_skycrawlers'
|
22
|
+
```
|
23
|
+
|
24
|
+
And then execute:
|
25
|
+
|
26
|
+
$ bundle
|
27
|
+
|
28
|
+
Or install it yourself as:
|
29
|
+
|
30
|
+
$ gem install daimon_skycrawlers
|
31
|
+
|
32
|
+
## Usage
|
33
|
+
|
34
|
+
1. Create project
|
35
|
+
|
36
|
+
$ bundle exec daimon-skycrawlers new mycrawlers
|
37
|
+
$ cd mycrawlers
|
38
|
+
|
39
|
+
2. Install dependencies
|
40
|
+
|
41
|
+
$ bundle install
|
42
|
+
|
43
|
+
3. Create database
|
44
|
+
|
45
|
+
$ bundle exec rake db:create
|
46
|
+
$ bundle exec rake db:migrate
|
47
|
+
|
48
|
+
4. Open new terminal and run crawler/processor
|
49
|
+
|
50
|
+
$ bundle exec ruby crawler.rb # on new terminal
|
51
|
+
$ bundle exec ruby processor.rb # on new terminal
|
52
|
+
|
53
|
+
5. Enqueue task
|
54
|
+
|
55
|
+
$ bundle exec ruby enqueue.rb http://example.com/
|
56
|
+
|
57
|
+
6. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
|
58
|
+
|
59
|
+
## Development
|
60
|
+
|
61
|
+
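After checking out the repo, run `bundle install` to install dependencies. Then, run `bundle exec rake test` to run the unit tests, or `bundle exec rake` to run both the unit tests and the Cucumber features. You can also run `bundle exec pry` for an interactive prompt that will allow you to experiment.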
|
62
|
+
|
63
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
64
|
+
|
65
|
+
## Contributing
|
66
|
+
|
67
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/bm-sms/daimon-skycrawlers.
|
68
|
+
|
69
|
+
|
70
|
+
## License
|
71
|
+
|
72
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
73
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require "bundler/setup"
|
2
|
+
require "bundler/gem_tasks"
|
3
|
+
require "rake/testtask"
|
4
|
+
|
5
|
+
Rake::TestTask.new(:test) do |t|
|
6
|
+
t.libs << "test"
|
7
|
+
t.libs << "lib"
|
8
|
+
t.test_files = FileList['test/**/*_test.rb']
|
9
|
+
end
|
10
|
+
|
11
|
+
require "cucumber/rake/task"
|
12
|
+
|
13
|
+
Cucumber::Rake::Task.new(:features) do |t|
|
14
|
+
t.cucumber_opts = "features --format pretty"
|
15
|
+
end
|
16
|
+
|
17
|
+
task :default => [:test, :features]
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'daimon_skycrawlers/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "daimon_skycrawlers"
|
8
|
+
spec.version = DaimonSkycrawlers::VERSION
|
9
|
+
spec.authors = ["Ryunosuke SATO"]
|
10
|
+
spec.email = ["tricknotes.rs@gmail.com"]
|
11
|
+
|
12
|
+
spec.summary = %q{This is a crawler framework.}
|
13
|
+
spec.description = %q{This is a crawler framework.}
|
14
|
+
spec.homepage = "https://github.com/bm-sms/daimon-skycrawlers"
|
15
|
+
spec.license = "MIT"
|
16
|
+
|
17
|
+
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.add_dependency "thor"
|
22
|
+
spec.add_dependency "songkick_queue"
|
23
|
+
spec.add_dependency "faraday"
|
24
|
+
spec.add_dependency "faraday_middleware"
|
25
|
+
spec.add_dependency "nokogiri"
|
26
|
+
spec.add_dependency "activerecord"
|
27
|
+
spec.add_dependency "pg"
|
28
|
+
|
29
|
+
spec.add_development_dependency "bundler", "~> 1.11"
|
30
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
31
|
+
spec.add_development_dependency "test-unit"
|
32
|
+
spec.add_development_dependency "test-unit-rr"
|
33
|
+
spec.add_development_dependency "test-unit-notify"
|
34
|
+
spec.add_development_dependency "cucumber"
|
35
|
+
spec.add_development_dependency "pry"
|
36
|
+
spec.add_development_dependency "tapp"
|
37
|
+
end
|
data/db/schema.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require "active_record"
|
2
|
+
|
3
|
+
ActiveRecord::Base.establish_connection(adapter: "sqlite3",
|
4
|
+
database: "storage.db")
|
5
|
+
|
6
|
+
ActiveRecord::Schema.define(version: 1) do
|
7
|
+
create_table :pages do |t|
|
8
|
+
t.string :url
|
9
|
+
t.text :headers
|
10
|
+
t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
|
11
|
+
t.datetime :last_modified_at
|
12
|
+
t.string :etag
|
13
|
+
t.timestamps
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'daimon_skycrawlers/generator/new'
|
3
|
+
require 'daimon_skycrawlers/version'
|
4
|
+
|
5
|
+
module DaimonSkycrawlers
|
6
|
+
class CLI < Thor
|
7
|
+
register(Generator::New, "new", "new NAME", "Create new project")
|
8
|
+
|
9
|
+
desc "version", "Show version"
|
10
|
+
def version
|
11
|
+
puts VERSION
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'songkick_queue'
|
2
|
+
# TODO Allow to configure from user land
|
3
|
+
SongkickQueue.configure do |config|
|
4
|
+
config.logger = Logger.new(STDOUT)
|
5
|
+
config.host = '127.0.0.1'
|
6
|
+
config.port = 5672
|
7
|
+
# config.username = 'guest'
|
8
|
+
# config.password = 'guest'
|
9
|
+
config.vhost = '/'
|
10
|
+
config.max_reconnect_attempts = 10
|
11
|
+
config.network_recovery_interval = 1.0
|
12
|
+
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
3
|
+
require 'daimon_skycrawlers'
|
4
|
+
require 'daimon_skycrawlers/version'
|
5
|
+
require 'daimon_skycrawlers/configure_songkick_queue'
|
6
|
+
require 'daimon_skycrawlers/url_consumer'
|
7
|
+
require 'daimon_skycrawlers/storage'
|
8
|
+
require 'daimon_skycrawlers/parser'
|
9
|
+
|
10
|
+
require 'faraday'
|
11
|
+
|
12
|
+
module DaimonSkycrawlers
|
13
|
+
class Crawler
|
14
|
+
class << self
|
15
|
+
def run(process_name: 'daimon-skycrawler:url')
|
16
|
+
SongkickQueue::Worker.new(process_name, [URLConsumer]).run
|
17
|
+
end
|
18
|
+
|
19
|
+
def enqueue_url(url, depth: 3, interval: 1)
|
20
|
+
SongkickQueue.publish('daimon-skycrawler.url', url: url, depth: depth, interval: interval)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
attr_writer :storage
|
25
|
+
attr_writer :parser
|
26
|
+
|
27
|
+
def initialize(base_url, options = {})
|
28
|
+
@base_url = base_url
|
29
|
+
@options = options
|
30
|
+
end
|
31
|
+
|
32
|
+
def setup_connection(options = {})
|
33
|
+
@connection = Faraday.new(@base_url, options) do |faraday|
|
34
|
+
yield faraday
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def storage
|
39
|
+
@storage ||= Storage::RDB.new
|
40
|
+
end
|
41
|
+
|
42
|
+
def parser
|
43
|
+
@parser ||= Parser::Default.new
|
44
|
+
end
|
45
|
+
|
46
|
+
# TODO Support POST when we need
|
47
|
+
# TODO `params` should be a part of `path`. such as `path == "/hoi?hi=yoyo"`.
|
48
|
+
def fetch(path, params = {}, depth: 3)
|
49
|
+
@connection ||= Faraday.new(@base_url)
|
50
|
+
response = get(path)
|
51
|
+
|
52
|
+
url = @connection.url_prefix + path
|
53
|
+
|
54
|
+
data = [url.to_s, response.headers, response.body]
|
55
|
+
|
56
|
+
yield(*data) if block_given?
|
57
|
+
|
58
|
+
storage.save(*data)
|
59
|
+
|
60
|
+
schedule_to_process(url.to_s)
|
61
|
+
|
62
|
+
parser.parse(response.body)
|
63
|
+
urls = parser.links
|
64
|
+
|
65
|
+
enqueue_next_urls(urls, depth: depth - 1, interval: 1)
|
66
|
+
end
|
67
|
+
|
68
|
+
def get(path, params = {})
|
69
|
+
@connection.get(path, params)
|
70
|
+
end
|
71
|
+
|
72
|
+
def post(path, params = {})
|
73
|
+
@connection.post(path, params)
|
74
|
+
end
|
75
|
+
|
76
|
+
private
|
77
|
+
|
78
|
+
def schedule_to_process(url)
|
79
|
+
DaimonSkycrawlers::Processor.enqueue_http_response(url)
|
80
|
+
end
|
81
|
+
|
82
|
+
def enqueue_next_urls(urls, depth: 3, interval: 1)
|
83
|
+
return if depth <= 0
|
84
|
+
|
85
|
+
urls.each do |url|
|
86
|
+
self.class.enqueue_url(url, depth: depth, interval: interval)
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'thor'
|
2
|
+
|
3
|
+
module DaimonSkycrawlers
|
4
|
+
module Generator
|
5
|
+
class New < Thor::Group
|
6
|
+
include Thor::Actions
|
7
|
+
|
8
|
+
argument :name
|
9
|
+
|
10
|
+
def self.source_root
|
11
|
+
File.join(File.dirname(__FILE__), "templates", "new")
|
12
|
+
end
|
13
|
+
|
14
|
+
def create_files
|
15
|
+
[
|
16
|
+
"README.md",
|
17
|
+
"config/database.yml",
|
18
|
+
].each do |path|
|
19
|
+
template("#{path}.erb", "#{name}/#{path}")
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def copy_files
|
24
|
+
[
|
25
|
+
"Gemfile",
|
26
|
+
"Rakefile",
|
27
|
+
"crawler.rb",
|
28
|
+
"enqueue.rb",
|
29
|
+
"processor.rb",
|
30
|
+
].each do |path|
|
31
|
+
copy_file(path, "#{name}/#{path}")
|
32
|
+
end
|
33
|
+
[
|
34
|
+
"db/migrate/create_pages.rb",
|
35
|
+
].each do |path|
|
36
|
+
migration = "#{Time.now.strftime("%Y%m%d%H%M%S")}_#{File.basename(path)}"
|
37
|
+
copy_file(path, "#{name}/db/migrate/#{migration}")
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# <%= name %>
|
2
|
+
|
3
|
+
TODO: Write description.
|
4
|
+
|
5
|
+
## Requirements
|
6
|
+
|
7
|
+
- Ruby
|
8
|
+
- RabbitMQ
|
9
|
+
- RDB
|
10
|
+
- PostgreSQL (default)
|
11
|
+
- MySQL
|
12
|
+
- SQLite3
|
13
|
+
|
14
|
+
## Usage
|
15
|
+
|
16
|
+
1. Install dependencies
|
17
|
+
|
18
|
+
$ bundle install
|
19
|
+
|
20
|
+
2. Create database
|
21
|
+
|
22
|
+
$ bundle exec rake db:create
|
23
|
+
$ bundle exec rake db:migrate
|
24
|
+
|
25
|
+
3. Open new terminal and run crawler/processor
|
26
|
+
|
27
|
+
$ bundle exec ruby crawler.rb # on new terminal
|
28
|
+
$ bundle exec ruby processor.rb # on new terminal
|
29
|
+
|
30
|
+
4. Enqueue task
|
31
|
+
|
32
|
+
$ bundle exec ruby enqueue.rb http://example.com/
|
33
|
+
|
34
|
+
5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
|
@@ -0,0 +1 @@
|
|
1
|
+
require "daimon_skycrawlers/tasks"
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# PostgreSQL. Versions 8.2 and up are supported.
|
2
|
+
#
|
3
|
+
default: &default
|
4
|
+
adapter: postgresql
|
5
|
+
encoding: unicode
|
6
|
+
pool: 5
|
7
|
+
|
8
|
+
development:
|
9
|
+
<<: *default
|
10
|
+
database: <%= name %>_development
|
11
|
+
#username: <%= name %>
|
12
|
+
#password:
|
13
|
+
#host: localhost
|
14
|
+
#port: 5432
|
15
|
+
#schema_search_path: myapp,sharedapp,public
|
16
|
+
#min_messages: notice
|
17
|
+
|
18
|
+
test:
|
19
|
+
<<: *default
|
20
|
+
database: <%= name %>_test
|
21
|
+
|
22
|
+
production:
|
23
|
+
<<: *default
|
24
|
+
database: <%= name %>_production
|
25
|
+
username: <%= name %>
|
26
|
+
password: <%%= ENV['<%= name.upcase %>_PASSWORD'] %>
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "daimon_skycrawlers/crawler"
|
4
|
+
|
5
|
+
base_url = 'http://example.com'
|
6
|
+
|
7
|
+
crawler = DaimonSkycrawlers::Crawler.new(base_url)
|
8
|
+
crawler.parser.append_filter do |url|
|
9
|
+
url.start_with?(base_url)
|
10
|
+
end
|
11
|
+
|
12
|
+
DaimonSkycrawlers.register_crawler(crawler)
|
13
|
+
|
14
|
+
DaimonSkycrawlers::Crawler.run
|
@@ -0,0 +1,13 @@
|
|
1
|
+
class CreatePages < ActiveRecord::Migration
|
2
|
+
def change
|
3
|
+
create_table :pages do |t|
|
4
|
+
t.string :url
|
5
|
+
t.text :headers
|
6
|
+
t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
|
7
|
+
t.datetime :last_modified_at
|
8
|
+
t.string :etag
|
9
|
+
|
10
|
+
t.timestamps null: false
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "daimon_skycrawlers/crawler"
|
4
|
+
|
5
|
+
USAGE = "Usage: #{$0} [URL]"
|
6
|
+
|
7
|
+
if ARGV.size < 1
|
8
|
+
$stderr.puts "#{$0}: missing URL"
|
9
|
+
$stderr.puts USAGE
|
10
|
+
exit false
|
11
|
+
end
|
12
|
+
|
13
|
+
url = ARGV[0]
|
14
|
+
|
15
|
+
DaimonSkycrawlers::Crawler.enqueue_url(url)
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require "daimon_skycrawlers/processor/default"
|
2
|
+
|
3
|
+
module DaimonSkycrawlers
|
4
|
+
class HTTPResponseConsumer
|
5
|
+
include SongkickQueue::Consumer
|
6
|
+
|
7
|
+
consume_from_queue 'daimon-skycrawler.http-response'
|
8
|
+
|
9
|
+
class << self
|
10
|
+
def register(processor = nil, &block)
|
11
|
+
if block_given?
|
12
|
+
processors << block
|
13
|
+
else
|
14
|
+
processors << processor
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def processors
|
19
|
+
@processors ||= []
|
20
|
+
end
|
21
|
+
|
22
|
+
def default_processor
|
23
|
+
DaimonSkycrawlers::Processor::Default.new
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def process(message)
|
28
|
+
if self.class.processors.empty?
|
29
|
+
processors = [self.class.default_processor]
|
30
|
+
else
|
31
|
+
processors = self.class.processors
|
32
|
+
end
|
33
|
+
processors.each do |processor|
|
34
|
+
processor.call(message)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
|
3
|
+
module DaimonSkycrawlers
|
4
|
+
module Parser
|
5
|
+
class Default < Base
|
6
|
+
def initialize
|
7
|
+
@filters = []
|
8
|
+
end
|
9
|
+
|
10
|
+
def append_filter(filter = nil, &block)
|
11
|
+
if block_given?
|
12
|
+
@filters << block
|
13
|
+
else
|
14
|
+
@filters << filter
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
# Parses an HTML document and makes it the parser's current document.
#
# The same parser instance is reused for every page a crawler fetches, so
# the links memoized by #links for the previous document are discarded here;
# otherwise #links would keep returning the first page's links.
#
# @param html [String] markup handed to Nokogiri::HTML
# @return the document object returned by Nokogiri::HTML
def parse(html)
  @html = html
  @doc = Nokogiri::HTML(html)
  # Invalidate the cache filled by #links.
  @links = nil
  @doc
end
|
22
|
+
|
23
|
+
def links
|
24
|
+
return @links if @links
|
25
|
+
@links = retrieve_links
|
26
|
+
@links
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
def retrieve_links
|
32
|
+
urls = @doc.search("a").map do |element|
|
33
|
+
element["href"]
|
34
|
+
end
|
35
|
+
apply_filters(urls) || []
|
36
|
+
end
|
37
|
+
|
38
|
+
def apply_filters(urls)
|
39
|
+
return if urls.nil?
|
40
|
+
return if urls.empty?
|
41
|
+
@filters.each do |filter|
|
42
|
+
urls = urls.select do |url|
|
43
|
+
filter.call(url)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
urls
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require "daimon_skycrawlers/storage/rdb"
|
2
|
+
|
3
|
+
module DaimonSkycrawlers
|
4
|
+
class Processor
|
5
|
+
class Default
|
6
|
+
def call(message)
|
7
|
+
url = message[:url]
|
8
|
+
storage = DaimonSkycrawlers::Storage::RDB.new
|
9
|
+
page = storage.find(url)
|
10
|
+
headers = JSON.parse(page.headers)
|
11
|
+
puts "URL: #{page.url}"
|
12
|
+
puts "Body: #{page.body.bytesize} bytes"
|
13
|
+
puts "Headers:"
|
14
|
+
headers.each do |key, value|
|
15
|
+
puts " #{key}: #{value}"
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'daimon_skycrawlers'
|
2
|
+
require 'daimon_skycrawlers/configure_songkick_queue'
|
3
|
+
require 'daimon_skycrawlers/url_consumer'
|
4
|
+
require 'daimon_skycrawlers/http_response_consumer'
|
5
|
+
|
6
|
+
module DaimonSkycrawlers
|
7
|
+
class Processor
|
8
|
+
class << self
|
9
|
+
def run(process_name: 'daimon-skycrawler:http-response')
|
10
|
+
SongkickQueue::Worker.new(process_name, [HTTPResponseConsumer]).run
|
11
|
+
end
|
12
|
+
|
13
|
+
def enqueue_http_response(url)
|
14
|
+
SongkickQueue.publish('daimon-skycrawler.http-response', url: url)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require "daimon_skycrawlers/storage/base"
|
2
|
+
require "active_record"
|
3
|
+
|
4
|
+
module DaimonSkycrawlers
|
5
|
+
module Storage
|
6
|
+
class RDB < Base
|
7
|
+
# Connects ActiveRecord to the database described by a YAML config file.
#
# The file is rendered through ERB before it is parsed: the project
# generator writes an ERB tag into config/database.yml (the production
# password is read from an environment variable), and loading the file as
# plain YAML would hand that tag to the database adapter as the literal
# password.
#
# The environment section is selected by SKYCRAWLERS_ENV and defaults to
# "development".
#
# @param config_path [String] path to the database configuration file
def initialize(config_path = "config/database.yml")
  # Required here so the block does not depend on another file having
  # loaded these standard libraries first.
  require "erb"
  require "yaml"

  config = YAML.load(ERB.new(File.read(config_path)).result)
  environment = ENV["SKYCRAWLERS_ENV"] || "development"
  ActiveRecord::Base.establish_connection(config[environment])
end
|
12
|
+
|
13
|
+
def save(url, headers, body)
|
14
|
+
Page.create(url: url,
|
15
|
+
headers: JSON.generate(headers),
|
16
|
+
body: body,
|
17
|
+
last_modified_at: headers["last-modified"],
|
18
|
+
etag: headers["etag"])
|
19
|
+
end
|
20
|
+
|
21
|
+
def find(url)
|
22
|
+
Page.where(url: url).order(last_modified_at: :desc).limit(1).first
|
23
|
+
end
|
24
|
+
|
25
|
+
class Page < ActiveRecord::Base
|
26
|
+
self.table_name = "pages"
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# Copyright (c) 2012 Janko Marohnić
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person
|
4
|
+
# obtaining a copy of this software and associated documentation
|
5
|
+
# files (the "Software"), to deal in the Software without
|
6
|
+
# restriction, including without limitation the rights to use,
|
7
|
+
# copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
+
# copies of the Software, and to permit persons to whom the
|
9
|
+
# Software is furnished to do so, subject to the following
|
10
|
+
# conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
17
|
+
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
19
|
+
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
20
|
+
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
21
|
+
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
22
|
+
# OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
# https://github.com/janko-m/sinatra-activerecord
|
25
|
+
#
|
26
|
+
|
27
|
+
seed_loader = Class.new do
|
28
|
+
def load_seed
|
29
|
+
# load "#{ActiveRecord::Tasks::DatabaseTasks.db_dir}/seeds.rb"
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
ActiveRecord::Tasks::DatabaseTasks.tap do |config|
|
34
|
+
config.root = Rake.application.original_dir
|
35
|
+
config.env = ENV["SKYCRAWLERS_ENV"] || "development"
|
36
|
+
config.db_dir = "db"
|
37
|
+
config.migrations_paths = ["db/migrate"]
|
38
|
+
config.fixtures_path = "test/fixtures"
|
39
|
+
config.seed_loader = seed_loader.new
|
40
|
+
config.database_configuration = ActiveRecord::Base.configurations
|
41
|
+
end
|
42
|
+
|
43
|
+
# db:load_config can be overridden manually
|
44
|
+
Rake::Task["db:seed"].enhance(["db:load_config"])
|
45
|
+
Rake::Task["db:load_config"].clear
|
46
|
+
|
47
|
+
Rake::Task.define_task("db:environment")
|
48
|
+
# Loads config/database.yml and connects ActiveRecord to the environment
# selected by SKYCRAWLERS_ENV (default: "development").
#
# The file is rendered through ERB before parsing because the generated
# config contains an ERB tag for the production password; plain YAML loading
# would keep the tag as the literal password.
Rake::Task.define_task("db:load_config") do
  require "erb"
  require "yaml"

  rendered = ERB.new(File.read("config/database.yml")).result
  ActiveRecord::Base.configurations = YAML.load(rendered)
  environment = ENV["SKYCRAWLERS_ENV"] || "development"
  ActiveRecord::Base.establish_connection(environment.to_sym)
end
|
53
|
+
Rake::Task["db:test:deprecated"].clear if Rake::Task.task_defined?("db:test:deprecated")
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'daimon_skycrawlers/crawler'
|
2
|
+
require 'daimon_skycrawlers/processor'
|
3
|
+
|
4
|
+
module DaimonSkycrawlers
|
5
|
+
class URLConsumer
|
6
|
+
include SongkickQueue::Consumer
|
7
|
+
|
8
|
+
consume_from_queue 'daimon-skycrawler.url'
|
9
|
+
|
10
|
+
class << self
|
11
|
+
def register(crawler)
|
12
|
+
crawlers << crawler
|
13
|
+
end
|
14
|
+
|
15
|
+
def crawlers
|
16
|
+
@crawlers ||= []
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
# Handles one message from the URL queue by asking every registered crawler
# to fetch the URL.
#
# @param message [Hash] payload read with the keys :url, :depth and :interval
def process(message)
  url = message[:url]
  depth = message[:depth]
  interval = message[:interval]

  # Crawler#fetch is declared as fetch(path, params = {}, depth: 3). Passing
  # depth positionally would bind it to `params` and leave the keyword at its
  # default, so the remaining depth would never decrease. When the message
  # carries no depth, the crawler's own default is used.
  fetch_options = depth.nil? ? {} : { depth: depth }

  # XXX When several crawlers are registered, how should they behave?
  self.class.crawlers.each do |crawler|
    sleep(interval)
    crawler.fetch(url, **fetch_options)
  end
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
|
3
|
+
require 'daimon_skycrawlers/version'
|
4
|
+
|
5
|
+
module DaimonSkycrawlers
|
6
|
+
class << self
|
7
|
+
def register_processor(processor = nil, &block)
|
8
|
+
HTTPResponseConsumer.register(processor, &block)
|
9
|
+
end
|
10
|
+
|
11
|
+
def register_crawler(crawler)
|
12
|
+
URLConsumer.register(crawler)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
metadata
ADDED
@@ -0,0 +1,291 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: daimon_skycrawlers
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ryunosuke SATO
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-01-27 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: thor
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: songkick_queue
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: faraday
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: faraday_middleware
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: nokogiri
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: activerecord
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: pg
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: bundler
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '1.11'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '1.11'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: rake
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '10.0'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '10.0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: test-unit
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">="
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: test-unit-rr
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - ">="
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0'
|
160
|
+
type: :development
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - ">="
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '0'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: test-unit-notify
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - ">="
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
174
|
+
type: :development
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - ">="
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: '0'
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: cucumber
|
183
|
+
requirement: !ruby/object:Gem::Requirement
|
184
|
+
requirements:
|
185
|
+
- - ">="
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
version: '0'
|
188
|
+
type: :development
|
189
|
+
prerelease: false
|
190
|
+
version_requirements: !ruby/object:Gem::Requirement
|
191
|
+
requirements:
|
192
|
+
- - ">="
|
193
|
+
- !ruby/object:Gem::Version
|
194
|
+
version: '0'
|
195
|
+
- !ruby/object:Gem::Dependency
|
196
|
+
name: pry
|
197
|
+
requirement: !ruby/object:Gem::Requirement
|
198
|
+
requirements:
|
199
|
+
- - ">="
|
200
|
+
- !ruby/object:Gem::Version
|
201
|
+
version: '0'
|
202
|
+
type: :development
|
203
|
+
prerelease: false
|
204
|
+
version_requirements: !ruby/object:Gem::Requirement
|
205
|
+
requirements:
|
206
|
+
- - ">="
|
207
|
+
- !ruby/object:Gem::Version
|
208
|
+
version: '0'
|
209
|
+
- !ruby/object:Gem::Dependency
|
210
|
+
name: tapp
|
211
|
+
requirement: !ruby/object:Gem::Requirement
|
212
|
+
requirements:
|
213
|
+
- - ">="
|
214
|
+
- !ruby/object:Gem::Version
|
215
|
+
version: '0'
|
216
|
+
type: :development
|
217
|
+
prerelease: false
|
218
|
+
version_requirements: !ruby/object:Gem::Requirement
|
219
|
+
requirements:
|
220
|
+
- - ">="
|
221
|
+
- !ruby/object:Gem::Version
|
222
|
+
version: '0'
|
223
|
+
description: This is a crawler framework.
|
224
|
+
email:
|
225
|
+
- tricknotes.rs@gmail.com
|
226
|
+
executables:
|
227
|
+
- daimon-skycrawlers
|
228
|
+
extensions: []
|
229
|
+
extra_rdoc_files: []
|
230
|
+
files:
|
231
|
+
- ".gitignore"
|
232
|
+
- ".travis.yml"
|
233
|
+
- Gemfile
|
234
|
+
- LICENSE.txt
|
235
|
+
- README.md
|
236
|
+
- Rakefile
|
237
|
+
- bin/daimon-skycrawlers
|
238
|
+
- daimon_skycrawlers.gemspec
|
239
|
+
- db/schema.rb
|
240
|
+
- lib/daimon_skycrawlers.rb
|
241
|
+
- lib/daimon_skycrawlers/cli.rb
|
242
|
+
- lib/daimon_skycrawlers/configure_songkick_queue.rb
|
243
|
+
- lib/daimon_skycrawlers/crawler.rb
|
244
|
+
- lib/daimon_skycrawlers/generator/new.rb
|
245
|
+
- lib/daimon_skycrawlers/generator/templates/new/Gemfile
|
246
|
+
- lib/daimon_skycrawlers/generator/templates/new/README.md.erb
|
247
|
+
- lib/daimon_skycrawlers/generator/templates/new/Rakefile
|
248
|
+
- lib/daimon_skycrawlers/generator/templates/new/config/database.yml.erb
|
249
|
+
- lib/daimon_skycrawlers/generator/templates/new/crawler.rb
|
250
|
+
- lib/daimon_skycrawlers/generator/templates/new/db/migrate/create_pages.rb
|
251
|
+
- lib/daimon_skycrawlers/generator/templates/new/enqueue.rb
|
252
|
+
- lib/daimon_skycrawlers/generator/templates/new/processor.rb
|
253
|
+
- lib/daimon_skycrawlers/http_response_consumer.rb
|
254
|
+
- lib/daimon_skycrawlers/parser.rb
|
255
|
+
- lib/daimon_skycrawlers/parser/base.rb
|
256
|
+
- lib/daimon_skycrawlers/parser/default.rb
|
257
|
+
- lib/daimon_skycrawlers/processor.rb
|
258
|
+
- lib/daimon_skycrawlers/processor/default.rb
|
259
|
+
- lib/daimon_skycrawlers/storage.rb
|
260
|
+
- lib/daimon_skycrawlers/storage/base.rb
|
261
|
+
- lib/daimon_skycrawlers/storage/null.rb
|
262
|
+
- lib/daimon_skycrawlers/storage/rdb.rb
|
263
|
+
- lib/daimon_skycrawlers/tasks.rb
|
264
|
+
- lib/daimon_skycrawlers/tasks/database_tasks.rake
|
265
|
+
- lib/daimon_skycrawlers/url_consumer.rb
|
266
|
+
- lib/daimon_skycrawlers/version.rb
|
267
|
+
homepage: https://github.com/bm-sms/daimon-skycrawlers
|
268
|
+
licenses:
|
269
|
+
- MIT
|
270
|
+
metadata: {}
|
271
|
+
post_install_message:
|
272
|
+
rdoc_options: []
|
273
|
+
require_paths:
|
274
|
+
- lib
|
275
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
276
|
+
requirements:
|
277
|
+
- - ">="
|
278
|
+
- !ruby/object:Gem::Version
|
279
|
+
version: '0'
|
280
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
281
|
+
requirements:
|
282
|
+
- - ">="
|
283
|
+
- !ruby/object:Gem::Version
|
284
|
+
version: '0'
|
285
|
+
requirements: []
|
286
|
+
rubyforge_project:
|
287
|
+
rubygems_version: 2.5.1
|
288
|
+
signing_key:
|
289
|
+
specification_version: 4
|
290
|
+
summary: This is a crawler framework.
|
291
|
+
test_files: []
|