daimon_skycrawlers 0.11.0 → 0.11.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7c8534d5ada19111a31249998b4ff7c935dc44c4
4
- data.tar.gz: 77a7c9d78db52e9d40e412c9ced08d024caa1499
3
+ metadata.gz: 23b99f272ecd7f75af615fbfb696bb375d333736
4
+ data.tar.gz: 3f3df3ae6b5812f6b9cb676d82e09e7ecd08cd10
5
5
  SHA512:
6
- metadata.gz: 205234df54f6c985aa14f97fa5e5a967e18c923d7bf0fc4b29fe3d3c40c85d88667987b6bc7ca9d389dd513b9cb235b699b4741f228ab1151aa7dcb1a5802ebb
7
- data.tar.gz: e2090d4983e1ffb7434eef92b76c30795e495f79c89fe9fe3567ba4e7e8f8b650b33870b0ab718d5d1afd799de45d35d7791fbc29e43f4f76bfb23ce510b32e9
6
+ metadata.gz: 146c4a77653a4ad277bb573b2c6e9aae53b4a743884d7f21d1ded62bf9120e917c8ce1c7e36a5f2487dd652407df209f2facd050f47fc24885e3d7681e6dfdbc
7
+ data.tar.gz: 4b8512401a17ad800e4067483d28f46bbb90b96686f4b175ffa485fae3c03f6e3d7a8b15e87761d50547b0896421324715b9cc7716acd06a507831c16356194c
data/.gitignore CHANGED
@@ -7,3 +7,4 @@
7
7
  /pkg/
8
8
  /spec/reports/
9
9
  /tmp/
10
+ /sample/*/docker-cache/
@@ -1,3 +1,3 @@
1
1
  module DaimonSkycrawlers
2
- VERSION = "0.11.0"
2
+ VERSION = "0.11.1"
3
3
  end
@@ -0,0 +1,5 @@
1
+ SKYCRAWLERS_RABBITMQ_HOST=amazon-ranking-rabbitmq
2
+ SKYCRAWLERS_RABBITMQ_PORT=5672
3
+ DATABASE_HOST=amazon-ranking-db
4
+ DATABASE_PORT=5432
5
+ DATABASE_URL=postgres://crawler:jSygSCX-TOoWH08AtawoxQ@amazon-ranking-db/amazon-ranking_development
@@ -0,0 +1,5 @@
1
+ POSTGRES_USER=postgres
2
+ POSTGRES_PASSWORD=jSygSCX-TOoWH08AtawoxQ
3
+ DATABASE_USER=crawler
4
+ DATABASE_PASSWORD=jSygSCX-TOoWH08AtawoxQ
5
+ DATABASE_PREFIX=amazon-ranking
@@ -0,0 +1,30 @@
1
+ FROM ruby:2.3.1-alpine
2
+
3
+ RUN apk --no-cache --update add build-base ruby-dev libxml2-dev postgresql-dev libcurl openssl git
4
+
5
+ RUN adduser -D -h /home/crawler -g "DaimonSkycrawlers user" -s /bin/sh crawler crawler
6
+
7
+ ARG SKYCRAWLERS_ENV=production
8
+ ARG SKYCRAWLERS_MAIN=crawler
9
+ ENV SKYCRAWLERS_ENV=$SKYCRAWLERS_ENV \
10
+ SKYCRAWLERS_MAIN=$SKYCRAWLERS_MAIN \
11
+ BUNDLE_JOBS=4 \
12
+ DOCKERIZE_VERSION=v0.3.0
13
+
14
+ RUN wget https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz \
15
+ && tar -C /usr/local/bin -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz \
16
+ && rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz
17
+
18
+ USER crawler
19
+ WORKDIR /home/crawler
20
+ COPY ./Gemfile* ./
21
+
22
+ RUN if [ "$SKYCRAWLERS_ENV" = "production" ]; then \
23
+ bundle install --without development:test; \
24
+ fi
25
+
26
+ COPY . .
27
+
28
+ ADD services/common/docker-entrypoint.sh /docker-entrypoint.sh
29
+ ENTRYPOINT ["/docker-entrypoint.sh"]
30
+ CMD ["$SKYCRAWLERS_MAIN"]
@@ -0,0 +1,6 @@
1
+ FROM postgres:9.5.4
2
+
3
+ RUN localedef -i ja_JP -c -f UTF-8 -A /usr/share/locale/locale.alias ja_JP.UTF-8
4
+ ENV LANG ja_JP.UTF-8
5
+
6
+ ADD services/db/init-user-db.sh /docker-entrypoint-initdb.d/init-user-db.sh
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ gem "rake"
4
+ gem "daimon_skycrawlers"
@@ -0,0 +1,101 @@
1
+ GEM
2
+ remote: https://rubygems.org/
3
+ specs:
4
+ actionpack (5.0.0.1)
5
+ actionview (= 5.0.0.1)
6
+ activesupport (= 5.0.0.1)
7
+ rack (~> 2.0)
8
+ rack-test (~> 0.6.3)
9
+ rails-dom-testing (~> 2.0)
10
+ rails-html-sanitizer (~> 1.0, >= 1.0.2)
11
+ actionview (5.0.0.1)
12
+ activesupport (= 5.0.0.1)
13
+ builder (~> 3.1)
14
+ erubis (~> 2.7.0)
15
+ rails-dom-testing (~> 2.0)
16
+ rails-html-sanitizer (~> 1.0, >= 1.0.2)
17
+ activemodel (5.0.0.1)
18
+ activesupport (= 5.0.0.1)
19
+ activerecord (5.0.0.1)
20
+ activemodel (= 5.0.0.1)
21
+ activesupport (= 5.0.0.1)
22
+ arel (~> 7.0)
23
+ activesupport (5.0.0.1)
24
+ concurrent-ruby (~> 1.0, >= 1.0.2)
25
+ i18n (~> 0.7)
26
+ minitest (~> 5.1)
27
+ tzinfo (~> 1.1)
28
+ amq-protocol (2.0.1)
29
+ arel (7.1.4)
30
+ builder (3.2.2)
31
+ bunny (2.6.1)
32
+ amq-protocol (>= 2.0.1)
33
+ concurrent-ruby (1.0.2)
34
+ daimon_skycrawlers (0.10.0)
35
+ activerecord
36
+ faraday
37
+ faraday_middleware
38
+ nokogiri
39
+ pg
40
+ railties
41
+ songkick_queue
42
+ thor
43
+ timers
44
+ typhoeus
45
+ webrobots
46
+ erubis (2.7.0)
47
+ ethon (0.10.1)
48
+ ffi (>= 1.3.0)
49
+ faraday (0.10.0)
50
+ multipart-post (>= 1.2, < 3)
51
+ faraday_middleware (0.10.1)
52
+ faraday (>= 0.7.4, < 1.0)
53
+ ffi (1.9.14)
54
+ hitimes (1.2.4)
55
+ i18n (0.7.0)
56
+ loofah (2.0.3)
57
+ nokogiri (>= 1.5.9)
58
+ method_source (0.8.2)
59
+ mini_portile2 (2.1.0)
60
+ minitest (5.10.1)
61
+ multipart-post (2.0.0)
62
+ nokogiri (1.6.8.1)
63
+ mini_portile2 (~> 2.1.0)
64
+ pg (0.19.0)
65
+ rack (2.0.1)
66
+ rack-test (0.6.3)
67
+ rack (>= 1.0)
68
+ rails-dom-testing (2.0.1)
69
+ activesupport (>= 4.2.0, < 6.0)
70
+ nokogiri (~> 1.6.0)
71
+ rails-html-sanitizer (1.0.3)
72
+ loofah (~> 2.0)
73
+ railties (5.0.0.1)
74
+ actionpack (= 5.0.0.1)
75
+ activesupport (= 5.0.0.1)
76
+ method_source
77
+ rake (>= 0.8.7)
78
+ thor (>= 0.18.1, < 2.0)
79
+ rake (11.3.0)
80
+ songkick_queue (1.0.0)
81
+ activesupport (>= 3.0.0)
82
+ bunny (~> 2.2)
83
+ thor (0.19.4)
84
+ thread_safe (0.3.5)
85
+ timers (4.1.2)
86
+ hitimes
87
+ typhoeus (1.1.2)
88
+ ethon (>= 0.9.0)
89
+ tzinfo (1.2.2)
90
+ thread_safe (~> 0.1)
91
+ webrobots (0.1.2)
92
+
93
+ PLATFORMS
94
+ ruby
95
+
96
+ DEPENDENCIES
97
+ daimon_skycrawlers
98
+ rake
99
+
100
+ BUNDLED WITH
101
+ 1.13.5
@@ -0,0 +1,86 @@
1
+ # amazon-ranking
2
+
3
+ https://www.amazon.co.jp/gp/bestsellers/
4
+
5
+ amazonの全てのカテゴリの売れ筋ランキングを1位から20位まで抽出する。
6
+
7
+ ## Requirements
8
+
9
+ - Ruby
10
+ - RabbitMQ
11
+ - RDB
12
+ - PostgreSQL (default)
13
+ - MySQL
14
+ - SQLite3
15
+
16
+ ## Usage
17
+
18
+ 1. Install dependencies
19
+
20
+ ```
21
+ $ bundle install
22
+ ```
23
+
24
+ 2. Create database
25
+
26
+ ```
27
+ $ bundle exec rake db:create
28
+ $ bundle exec rake db:migrate
29
+ ```
30
+
31
+ 3. Open new terminal and run crawler/processor
32
+
33
+ ```
34
+ $ bin/crawler # on new terminal
35
+ $ bin/processor # on new terminal
36
+ ```
37
+
38
+ 4. Enqueue task
39
+
40
+ ```
41
+ $ bin/enqueue url http://example.com/
42
+ ```
43
+
44
+ 5. You'll see `It works with 'http://example.com'` in the terminal that runs your processor!
45
+
46
+ 6. You can re-enqueue task for processor
47
+
48
+ ```
49
+ $ bin/enqueue response http://example.com/
50
+ ```
51
+
52
+ Display `It works with 'http://example.com'` again in the terminal that runs your processor.
53
+
54
+ ## Usage with docker-compose
55
+
56
+ 1. Build docker images
57
+
58
+ ```
59
+ $ docker-compose build
60
+ ```
61
+
62
+ 2. Run docker containers
63
+
64
+ ```
65
+ $ docker-compose up -d
66
+ ```
67
+
68
+ 3. Run a command on docker containers
69
+
70
+ ```
71
+ $ docker-compose exec <service name> <command>
72
+ ```
73
+
74
+ For example,
75
+
76
+ ```
77
+ $ docker-compose exec amazon-ranking-db bash
78
+ $ docker-compose exec amazon-ranking-crawler sh
79
+ ```
80
+
81
+ 4. Shutdown docker containers
82
+
83
+ ```
84
+ $ docker-compose down
85
+ $ docker-compose down --rmi all # Remove all related images
86
+ ```
@@ -0,0 +1 @@
1
+ require "daimon_skycrawlers/tasks"
@@ -0,0 +1,8 @@
1
+ require "daimon_skycrawlers/crawler"
2
+ require "daimon_skycrawlers/crawler/default"
3
+
4
+ base_url = "http://example.com"
5
+
6
+ crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
7
+
8
+ DaimonSkycrawlers.register_crawler(crawler)
@@ -0,0 +1,9 @@
1
+ require "daimon_skycrawlers/filter/base"
2
+
3
+ class SampleFilter < DaimonSkycrawlers::Filter::Base
4
+ def call(message)
5
+ # Implement your filter here.
6
+ # If you want to crawl `url`, return true; otherwise return false.
7
+ true
8
+ end
9
+ end
@@ -0,0 +1,37 @@
1
+ require "daimon_skycrawlers"
2
+ require "daimon_skycrawlers/processor"
3
+ require "daimon_skycrawlers/processor/base"
4
+ require "daimon_skycrawlers/processor/spider"
5
+
6
+ class AmazonRanking < DaimonSkycrawlers::Processor::Base
7
+ Item = Struct.new(:rank, :name, :url, :star, :review)
8
+ def call(message)
9
+ url = message[:url]
10
+ page = storage.find(url)
11
+ doc = Nokogiri::HTML(page.body)
12
+ ranking = []
13
+ doc.search(".zg_itemRow").each do |item|
14
+ rank = item.at(".zg_rankNumber").inner_text
15
+ link = item.at(".zg_rankLine+a")
16
+ star, review = item.search(".zg_rankLine+a+div a")
17
+ ranking << Item.new(rank, link.inner_text, link[:href], star[:title], review.inner_text)
18
+ end
19
+ p ranking
20
+ end
21
+ end
22
+
23
+ spider = DaimonSkycrawlers::Processor::Spider.new
24
+ spider.configure do |s|
25
+ s.link_rules = "ul#zg_browseRoot li a"
26
+ s.link_message = { next_processor: "AmazonRanking" }
27
+ s.before_process do |message|
28
+ message[:next_processor] != "AmazonRanking"
29
+ end
30
+ end
31
+ DaimonSkycrawlers.register_processor(spider)
32
+
33
+ processor = AmazonRanking.new
34
+ processor.before_process do |message|
35
+ message[:next_processor] == "AmazonRanking"
36
+ end
37
+ DaimonSkycrawlers.register_processor(processor)
@@ -0,0 +1,5 @@
1
+ require "daimon_skycrawlers/processor"
2
+
3
+ DaimonSkycrawlers.register_processor do |data|
4
+ p "It works with '#{data[:url]}'"
5
+ end
@@ -0,0 +1,27 @@
1
+ # PostgreSQL. Versions 8.2 and up are supported.
2
+ #
3
+ default: &default
4
+ adapter: postgresql
5
+ encoding: unicode
6
+ pool: 5
7
+ url: <%= ENV['DATABASE_URL'] %>
8
+
9
+ development:
10
+ <<: *default
11
+ database: amazon-ranking_development
12
+ #username: amazon-ranking
13
+ #password:
14
+ #host: localhost
15
+ #port: 5432
16
+ #schema_search_path: myapp,sharedapp,public
17
+ #min_messages: notice
18
+
19
+ test:
20
+ <<: *default
21
+ database: amazon-ranking_test
22
+
23
+ production:
24
+ <<: *default
25
+ database: amazon-ranking_production
26
+ username: amazon-ranking
27
+ password: <%= ENV['AMAZON-RANKING_PASSWORD'] %>
@@ -0,0 +1,27 @@
1
+ require "bundler/setup"
2
+ require "daimon_skycrawlers"
3
+ require "daimon_skycrawlers/logger"
4
+ require "daimon_skycrawlers/queue"
5
+
6
+ DaimonSkycrawlers.configure do |config|
7
+ config.logger = DaimonSkycrawlers::Logger.default
8
+ config.crawler_interval = 1
9
+ config.shutdown_interval = 300
10
+ end
11
+
12
+ DaimonSkycrawlers::Queue.configure do |config|
13
+ if ENV["CLOUDAMQP_URL"]
14
+ amqp_uri = URI(ENV["CLOUDAMQP_URL"])
15
+ config.host = amqp_uri.host
16
+ config.username = amqp_uri.user
17
+ config.password = amqp_uri.password
18
+ config.vhost = amqp_uri.user
19
+ else
20
+ config.port = 5672
21
+ config.host = ENV["SKYCRAWLERS_RABBITMQ_HOST"] || "localhost"
22
+ config.vhost = "/"
23
+ end
24
+ config.logger = DaimonSkycrawlers.configuration.logger
25
+ config.max_reconnect_attempts = 10
26
+ config.network_recovery_interval = 1.0
27
+ end
@@ -0,0 +1,16 @@
1
+ class CreatePages < ActiveRecord::Migration[5.0]
2
+ def change
3
+ create_table :pages do |t|
4
+ t.string :url
5
+ t.text :headers
6
+ t.binary :body
7
+ t.datetime :last_modified_at
8
+ t.string :etag
9
+
10
+ t.timestamps
11
+
12
+ t.index [:url]
13
+ t.index [:url, :updated_at]
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,30 @@
1
+ # This file is auto-generated from the current state of the database. Instead
2
+ # of editing this file, please use the migrations feature of Active Record to
3
+ # incrementally modify your database, and then regenerate this schema definition.
4
+ #
5
+ # Note that this schema.rb definition is the authoritative source for your
6
+ # database schema. If you need to create the application database on another
7
+ # system, you should be using db:schema:load, not running all the migrations
8
+ # from scratch. The latter is a flawed and unsustainable approach (the more migrations
9
+ # you'll amass, the slower it'll run and the greater likelihood for issues).
10
+ #
11
+ # It's strongly recommended that you check this file into your version control system.
12
+
13
+ ActiveRecord::Schema.define(version: 20161206061241) do
14
+
15
+ # These are extensions that must be enabled in order to support this database
16
+ enable_extension "plpgsql"
17
+
18
+ create_table "pages", force: :cascade do |t|
19
+ t.string "url"
20
+ t.text "headers"
21
+ t.binary "body"
22
+ t.datetime "last_modified_at"
23
+ t.string "etag"
24
+ t.datetime "created_at", null: false
25
+ t.datetime "updated_at", null: false
26
+ t.index ["url", "updated_at"], name: "index_pages_on_url_and_updated_at", using: :btree
27
+ t.index ["url"], name: "index_pages_on_url", using: :btree
28
+ end
29
+
30
+ end
@@ -0,0 +1,43 @@
1
+ version: "2"
2
+ services:
3
+ amazon-ranking-rabbitmq:
4
+ image: rabbitmq
5
+ volumes:
6
+ - amazon-ranking-rabbitmq-storage:/var/lib/rabbitmq
7
+
8
+ amazon-ranking-db:
9
+ build:
10
+ context: .
11
+ dockerfile: Dockerfile.db
12
+ volumes:
13
+ - amazon-ranking-db-storage:/var/lib/postgresql/data
14
+ env_file: .env.db
15
+
16
+ amazon-ranking-common: &common
17
+ build:
18
+ context: .
19
+ args:
20
+ - SKYCRAWLERS_ENV=development
21
+ links:
22
+ - amazon-ranking-rabbitmq
23
+ - amazon-ranking-db
24
+ volumes:
25
+ - ./:/home/crawler
26
+ - ./docker-cache/.bundle:/home/crawler/.bundle
27
+ - ./docker-cache/bundle:/home/crawler/vendor/bundle
28
+ working_dir: /home/crawler
29
+ env_file: .env
30
+
31
+ amazon-ranking-crawler:
32
+ <<: *common
33
+ command: crawler
34
+
35
+ amazon-ranking-processor:
36
+ <<: *common
37
+ depends_on:
38
+ - amazon-ranking-crawler
39
+ command: processor
40
+
41
+ volumes:
42
+ amazon-ranking-db-storage:
43
+ amazon-ranking-rabbitmq-storage:
@@ -0,0 +1,36 @@
1
+ #!/bin/sh
2
+
3
+ set -x
4
+
5
+ MAIN=$1
6
+ dockerize -timeout 60s \
7
+ -wait tcp://${DATABASE_HOST}:${DATABASE_PORT} \
8
+ -wait tcp://${SKYCRAWLERS_RABBITMQ_HOST}:${SKYCRAWLERS_RABBITMQ_PORT}
9
+ case $MAIN in
10
+ crawler)
11
+ bundle check || bundle install --retry=3 --path=vendor/bundle \
12
+ && bundle exec rake db:migrate
13
+ bundle exec daimon_skycrawlers exec $MAIN
14
+ ;;
15
+ processor)
16
+ while [ ! -e Gemfile.lock ]; do
17
+ sleep 5
18
+ done
19
+ bundle check || bundle install --retry=3 --path=vendor/bundle \
20
+ && bundle exec rake db:migrate
21
+ bundle exec daimon_skycrawlers exec $MAIN
22
+ ;;
23
+ setup)
24
+ bundle install --retry=3 --path=vendor/bundle
25
+ bundle exec rake db:schema:load
26
+ ;;
27
+ migrate)
28
+ bundle exec rake db:migrate
29
+ ;;
30
+ none)
31
+ echo NOP
32
+ ;;
33
+ sleep)
34
+ sleep 1d
35
+ ;;
36
+ esac
@@ -0,0 +1,9 @@
1
+ #!/bin/bash
2
+
3
+ set -e
4
+
5
+ createuser -e -U $POSTGRES_USER -d $DATABASE_USER
6
+ createdb -e -U $POSTGRES_USER -E UTF-8 -O $DATABASE_USER $DATABASE_USER
7
+ createdb -e -U $POSTGRES_USER -E UTF-8 -O $DATABASE_USER ${DATABASE_PREFIX}_development
8
+ createdb -e -U $POSTGRES_USER -E UTF-8 -O $DATABASE_USER ${DATABASE_PREFIX}_test
9
+ psql -U postgres -c "ALTER ROLE $DATABASE_USER WITH PASSWORD '$DATABASE_PASSWORD';"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: daimon_skycrawlers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.11.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - daimon developers
@@ -376,6 +376,25 @@ files:
376
376
  - lib/daimon_skycrawlers/tasks/database_tasks.rake
377
377
  - lib/daimon_skycrawlers/timer.rb
378
378
  - lib/daimon_skycrawlers/version.rb
379
+ - sample/amazon-ranking/.env
380
+ - sample/amazon-ranking/.env.db
381
+ - sample/amazon-ranking/Dockerfile
382
+ - sample/amazon-ranking/Dockerfile.db
383
+ - sample/amazon-ranking/Gemfile
384
+ - sample/amazon-ranking/Gemfile.lock
385
+ - sample/amazon-ranking/README.md
386
+ - sample/amazon-ranking/Rakefile
387
+ - sample/amazon-ranking/app/crawlers/sample_crawler.rb
388
+ - sample/amazon-ranking/app/filters/sample_filter.rb
389
+ - sample/amazon-ranking/app/processors/amazon_ranking.rb
390
+ - sample/amazon-ranking/app/processors/sample_processor.rb
391
+ - sample/amazon-ranking/config/database.yml
392
+ - sample/amazon-ranking/config/init.rb
393
+ - sample/amazon-ranking/db/migrate/20161206061241_create_pages.rb
394
+ - sample/amazon-ranking/db/schema.rb
395
+ - sample/amazon-ranking/docker-compose.yml
396
+ - sample/amazon-ranking/services/common/docker-entrypoint.sh
397
+ - sample/amazon-ranking/services/db/init-user-db.sh
379
398
  - sample/itp-crawler/Gemfile
380
399
  - sample/itp-crawler/Gemfile.lock
381
400
  - sample/itp-crawler/README.md