kimurai 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +1923 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/kimurai +6 -0
- data/kimurai.gemspec +48 -0
- data/lib/kimurai.rb +53 -0
- data/lib/kimurai/automation/deploy.yml +54 -0
- data/lib/kimurai/automation/setup.yml +44 -0
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
- data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
- data/lib/kimurai/base.rb +249 -0
- data/lib/kimurai/base/simple_saver.rb +98 -0
- data/lib/kimurai/base/uniq_checker.rb +22 -0
- data/lib/kimurai/base_helper.rb +22 -0
- data/lib/kimurai/browser_builder.rb +32 -0
- data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
- data/lib/kimurai/capybara_configuration.rb +10 -0
- data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
- data/lib/kimurai/capybara_ext/session.rb +150 -0
- data/lib/kimurai/capybara_ext/session/config.rb +18 -0
- data/lib/kimurai/cli.rb +157 -0
- data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
- data/lib/kimurai/cli/generator.rb +57 -0
- data/lib/kimurai/core_ext/array.rb +14 -0
- data/lib/kimurai/core_ext/numeric.rb +19 -0
- data/lib/kimurai/core_ext/string.rb +7 -0
- data/lib/kimurai/pipeline.rb +25 -0
- data/lib/kimurai/runner.rb +72 -0
- data/lib/kimurai/template/.gitignore +18 -0
- data/lib/kimurai/template/.ruby-version +1 -0
- data/lib/kimurai/template/Gemfile +20 -0
- data/lib/kimurai/template/README.md +3 -0
- data/lib/kimurai/template/config/application.rb +32 -0
- data/lib/kimurai/template/config/automation.yml +13 -0
- data/lib/kimurai/template/config/boot.rb +22 -0
- data/lib/kimurai/template/config/initializers/.keep +0 -0
- data/lib/kimurai/template/config/schedule.rb +57 -0
- data/lib/kimurai/template/db/.keep +0 -0
- data/lib/kimurai/template/helpers/application_helper.rb +3 -0
- data/lib/kimurai/template/lib/.keep +0 -0
- data/lib/kimurai/template/log/.keep +0 -0
- data/lib/kimurai/template/pipelines/saver.rb +11 -0
- data/lib/kimurai/template/pipelines/validator.rb +24 -0
- data/lib/kimurai/template/spiders/application_spider.rb +104 -0
- data/lib/kimurai/template/tmp/.keep +0 -0
- data/lib/kimurai/version.rb +3 -0
- metadata +349 -0
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Development console: sets up Bundler, loads the gem and opens an IRB session.
require "bundler/setup"
require "kimurai"
require "irb"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/exe/kimurai
ADDED
data/kimurai.gemspec
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
|
2
|
+
lib = File.expand_path("../lib", __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require "kimurai/version"
|
5
|
+
|
6
|
+
# Gem specification for kimurai: metadata, packaged files and dependencies.
Gem::Specification.new do |spec|
  spec.name     = "kimurai"
  spec.version  = Kimurai::VERSION
  spec.authors  = ["Victor Afanasev"]
  spec.email    = ["vicfreefly@gmail.com"]

  spec.summary  = "Modern web scraping framework written in Ruby and based on Capybara/Nokogiri"
  spec.homepage = "https://github.com/vifreefly/kimurai"
  spec.license  = "MIT"

  # Package every file tracked by git (listed from the gemspec's directory),
  # leaving out anything under test/, spec/ or features/.
  gem_root = File.expand_path('..', __FILE__)
  tracked_files = Dir.chdir(gem_root) { `git ls-files -z`.split("\x0") }
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |path| File.basename(path) }
  spec.require_paths = ["lib"]
  spec.required_ruby_version = ">= 2.5.0"

  # Runtime dependencies in declaration order: [name, *version constraints].
  [
    ["thor"],
    ["cliver"],
    ["activesupport"],
    ["murmurhash3"],
    ["nokogiri"],
    ["capybara", ">= 2.15", "< 4.0"],
    ["capybara-mechanize"],
    ["poltergeist"],
    ["selenium-webdriver"],
    ["headless"],
    ["pmap"],
    ["whenever"],
    ["rbcat", "~> 0.2"],
    ["pry"]
  ].each { |gem_name, *constraints| spec.add_dependency(gem_name, *constraints) }

  # Development-only dependencies.
  [
    ["bundler", "~> 1.16"],
    ["rake", "~> 10.0"],
    ["minitest", "~> 5.0"]
  ].each { |gem_name, constraint| spec.add_development_dependency(gem_name, constraint) }
end
|
data/lib/kimurai.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'ostruct'
|
2
|
+
require 'logger'
|
3
|
+
require 'json'
|
4
|
+
require 'active_support'
|
5
|
+
require 'active_support/core_ext'
|
6
|
+
require 'rbcat'
|
7
|
+
|
8
|
+
require_relative 'kimurai/version'
|
9
|
+
|
10
|
+
require_relative 'kimurai/core_ext/numeric'
|
11
|
+
require_relative 'kimurai/core_ext/string'
|
12
|
+
require_relative 'kimurai/core_ext/array'
|
13
|
+
|
14
|
+
require_relative 'kimurai/browser_builder'
|
15
|
+
require_relative 'kimurai/base_helper'
|
16
|
+
require_relative 'kimurai/pipeline'
|
17
|
+
require_relative 'kimurai/base'
|
18
|
+
|
19
|
+
# Top-level namespace: process-wide configuration, environment helpers and
# lookup of Base descendants by name.
module Kimurai
  # Returns the shared settings object, creating an empty OpenStruct on first use.
  def self.configuration
    @configuration ||= OpenStruct.new
  end

  # Yields the shared settings object to the caller's block.
  def self.configure
    yield configuration
  end

  # Returns the value of KIMURAI_ENV, or "development" when it is not set.
  def self.env
    ENV.fetch("KIMURAI_ENV", "development")
  end

  # Returns the current value of the TZ environment variable (nil when unset).
  def self.time_zone
    ENV["TZ"]
  end

  # Writes +value+ to the TZ environment variable.
  def self.time_zone=(value)
    ENV["TZ"] = value
  end

  # Returns a Hash mapping name => class for every Base descendant whose
  # +name+ is truthy; descendants without a name are left out.
  def self.list
    Base.descendants.each_with_object({}) do |klass, by_name|
      klass_name = klass.name
      by_name[klass_name] = klass if klass_name
    end
  end

  # Returns the first Base descendant whose +name+ equals the argument,
  # or nil when the argument is nil/false or nothing matches.
  def self.find_by_name(name)
    Base.descendants.find { |klass| klass.name == name } if name
  end
end
|
data/lib/kimurai/automation/deploy.yml
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
---
# Deploy playbook: checks the project repository out into the remote user's
# home directory, runs `bundle install` and updates the crontab via whenever.
- hosts: all
  vars:
    rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
    rbenv_shims_path: "{{ rbenv_root_path }}/shims"
    # Declared empty (null) here; presumably passed in as extra vars by the
    # caller - TODO confirm against the command builder.
    repo_url:
    repo_name:
    # Optional: local path of a private ssh key to use for the git checkout.
    repo_key_path:

  tasks:
    - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using ssh-agent forwarding or https)
      when: repo_key_path is none
      git:
        repo: "{{ repo_url }}"
        dest: "~/{{ repo_name }}"
        force: true
        accept_hostkey: true

    # Custom-key checkout. The key is removed in `always`, so it does not stay
    # in /tmp when the copy or the clone fails.
    - when: repo_key_path is not none
      block:
        - name: Copy custom git ssh key to /tmp/private_key (if provided)
          copy:
            src: "{{ repo_key_path }}"
            dest: /tmp/private_key
            # Quoted so the mode is always read as an octal string.
            mode: "0600"

        - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using custom git ssh key)
          git:
            repo: "{{ repo_url }}"
            dest: "~/{{ repo_name }}"
            force: true
            accept_hostkey: true
            key_file: /tmp/private_key
      always:
        - name: Delete custom git ssh key from /tmp/private_key (if provided)
          file:
            state: absent
            path: /tmp/private_key

    - name: Run bundle install
      command: bundle install
      args:
        chdir: ~/{{ repo_name }}
      environment:
        # Put rbenv's bin and shims first so the rbenv-managed Ruby is used.
        PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

    - name: Run whenever to update crontab
      command: whenever --update-crontab
      args:
        chdir: ~/{{ repo_name }}
      environment:
        PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
|
54
|
+
|
data/lib/kimurai/automation/setup.yml
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
---
# Server setup playbook: installs base apt packages, then imports the task
# files for the Ruby environment, PhantomJS, Firefox/geckodriver and
# Chromium/chromedriver.
- hosts: all
  vars:
    # Version numbers are quoted so YAML keeps them as strings; an unquoted
    # value such as 2.40 would be parsed as the float 2.4 and produce a wrong
    # download URL.
    ruby: "2.5.1"
    rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
    rbenv_shims_path: "{{ rbenv_root_path }}/shims"
    ruby_versions_path: "{{ rbenv_root_path }}/versions"
    # check latest here http://phantomjs.org/download.html
    phantomjs: "2.1.1"
    # check latest here https://github.com/mozilla/geckodriver/releases/
    geckodriver: "0.21.0"
    # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
    chromedriver: "2.39"

  tasks:
    - name: Update apt cache
      become: true
      apt: update_cache=yes cache_valid_time=86400

    - name: Install base packages
      become: true
      apt:
        pkg: "{{ item }}"
        state: present
      with_items:
        - xvfb
        - libsqlite3-dev
        - sqlite3
        - mongodb-clients
        - mysql-client
        - libmysqlclient-dev
        - postgresql-client
        - libpq-dev

    # Imported without `become` at this level; the imported file sets it on
    # the tasks that need it.
    - import_tasks: setup/ruby_environment.yml

    - import_tasks: setup/phantomjs.yml
      become: true

    - import_tasks: setup/firefox_geckodriver.yml
      become: true

    - import_tasks: setup/chromium_chromedriver.yml
      become: true
|
data/lib/kimurai/automation/setup/chromium_chromedriver.yml
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
---
# Tasks: install the Chromium browser and place the chromedriver binary of the
# version given by the `chromedriver` variable into /usr/local/bin.
- name: Install chromium browser
  apt:
    pkg: chromium-browser
    state: present

# Probe only: never reports a change, and a failure (e.g. chromedriver not
# installed yet) is ignored so the download below can run.
- name: Get current chromedriver version
  shell: chromedriver --version
  args:
    executable: /bin/bash
  register: current_chromedriver_version
  changed_when: false
  ignore_errors: true

- name: Install unzip tool to unarchive chromedriver archive
  apt:
    pkg: unzip
    state: present

- name: Download chromedriver binary archive and unarchive it to /usr/local/bin
  unarchive:
    src: https://chromedriver.storage.googleapis.com/{{ chromedriver }}/chromedriver_linux64.zip
    dest: /usr/local/bin
    remote_src: true
    mode: a+x
  # Substring check against the version output. Testing membership in
  # `stdout_lines` (a list of whole lines) never matched, so the archive was
  # fetched on every run. `|string` keeps the test valid when the variable is
  # an unquoted YAML number.
  when: chromedriver|string not in current_chromedriver_version.stdout
|
data/lib/kimurai/automation/setup/firefox_geckodriver.yml
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
---
# Tasks: install Firefox and place the geckodriver binary of the version given
# by the `geckodriver` variable into /usr/local/bin.
- name: Install firefox
  apt:
    state: present
    pkg: firefox

# Probe only: never reports a change, and a failure is ignored so the
# download below can still run.
- name: Get current geckodriver version
  register: current_geckodriver_version
  changed_when: false
  ignore_errors: true
  shell: geckodriver --version
  args:
    executable: /bin/bash

# Skipped when the requested version string already appears in the probe output.
- name: Download geckodriver binary archive and unarchive it to /usr/local/bin
  when: geckodriver not in current_geckodriver_version.stdout
  unarchive:
    remote_src: true
    src: https://github.com/mozilla/geckodriver/releases/download/v{{ geckodriver }}/geckodriver-v{{ geckodriver }}-linux64.tar.gz
    dest: /usr/local/bin
|
data/lib/kimurai/automation/setup/phantomjs.yml
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
---
# Tasks: install PhantomJS of the version given by the `phantomjs` variable
# under /usr/local/lib and link its binary into /usr/local/bin.
- name: Install dependencies for PhantomJS
  with_items:
    - chrpath
    - libxft-dev
    - libfreetype6
    - libfreetype6-dev
    - libfontconfig1
    - libfontconfig1-dev
  apt:
    state: present
    pkg: "{{ item }}"

# Probe only: never reports a change, and a failure is ignored so the
# download below can still run.
- name: Get current phantomjs version
  register: current_phantomjs_version
  changed_when: false
  ignore_errors: true
  shell: phantomjs -v
  args:
    executable: /bin/bash

# Skipped when the requested version string already appears in the probe output.
- name: Download phantomJS archive and unarchive it to /usr/local/lib
  when: phantomjs not in current_phantomjs_version.stdout
  unarchive:
    remote_src: true
    src: https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-{{ phantomjs }}-linux-x86_64.tar.bz2
    dest: /usr/local/lib

- name: Link PhantomJS binary to /usr/local/bin/phantomjs
  file:
    state: link
    src: /usr/local/lib/phantomjs-{{ phantomjs }}-linux-x86_64/bin/phantomjs
    dest: /usr/local/bin/phantomjs
|
data/lib/kimurai/automation/setup/ruby_environment.yml
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
---
# Tasks: install rbenv and ruby-build for the connecting user, build the Ruby
# version given by the `ruby` variable, make it the global version, and install
# the bundler, whenever and kimurai gems.
- name: Install dependencies for ruby-build
  become: true
  apt:
    pkg: "{{ item }}"
    state: present
  with_items:
    - zlib1g-dev
    - build-essential
    - libssl-dev
    - libreadline-dev
    - libreadline6-dev
    - libyaml-dev
    - libxml2-dev
    - libxslt1-dev
    - libcurl4-openssl-dev
    - libffi-dev

- name: Clone Rbenv repository to the {{ ansible_user_id }} user directory
  git:
    repo: https://github.com/sstephenson/rbenv.git
    dest: "{{ rbenv_root_path }}"

- name: Clone ruby-build repo to the {{ ansible_user_id }} user directory
  git:
    repo: https://github.com/sstephenson/ruby-build.git
    dest: "{{ rbenv_root_path }}/plugins/ruby-build"

# The regexp matches an existing rbenv PATH line so it is replaced rather than
# appended again.
- name: Add Rbenv path to the .bashrc
  lineinfile:
    dest: ~/.bashrc
    regexp: '^export PATH="\$HOME\/\.rbenv'
    line: export PATH="$HOME/.rbenv/bin:$PATH"
    state: present

- name: Add Rbenv init to the .bashrc
  lineinfile:
    dest: ~/.bashrc
    regexp: '^eval "\$\(rbenv'
    line: eval "$(rbenv init -)"
    state: present

- name: Check if desired Ruby version already installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}"
  register: ruby_present

- name: Install desired Ruby version using ruby-build (this can take a while)
  command: rbenv install {{ ruby }}
  when: not ruby_present.stat.exists
  environment:
    CONFIGURE_OPTS: "--disable-install-doc"
    # rbenv's bin and shims go first so the rbenv-managed tools are found.
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

# Probe only: never reports a change, and a failure is ignored.
- name: Get current Ruby version
  command: "ruby -v"
  register: current_ruby_version
  changed_when: false
  ignore_errors: true
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Set desired Ruby version as a global version
  command: "rbenv global {{ ruby }}"
  when: ruby not in current_ruby_version.stdout
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
  register: set_ruby

- name: Execute `rbenv rehash` command
  command: rbenv rehash
  when: set_ruby.changed
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Create ~/.gemrc file to skip docs
  copy:
    dest: ~/.gemrc
    # `--no-document` replaces `--no-ri --no-rdoc`, which RubyGems 3 no longer
    # accepts; with the old flags `gem install` fails once a newer Ruby is used.
    content: "gem: --no-document"

- name: Create ~/.bundle directory
  file:
    dest: ~/.bundle
    state: directory

- name: Create ~/.bundle/config file with default settings `BUNDLE_GIT__ALLOW_INSECURE true` and `BUNDLE_JOBS 4`
  copy:
    dest: ~/.bundle/config
    content: |
      BUNDLE_GIT__ALLOW_INSECURE: "true"
      BUNDLE_JOBS: "4"

# Each gem below is installed only when its executable is missing from the
# selected Ruby version's bin directory.
- name: Check if Bundler gem installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/bundler"
  register: bundler_gem_present

- name: Install Bundler gem
  command: gem install bundler
  when: not bundler_gem_present.stat.exists
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Check if Whenever gem installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/whenever"
  register: whenever_gem_present

- name: Install Whenever gem
  command: gem install whenever
  when: not whenever_gem_present.stat.exists
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Check if Kimurai gem installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/kimurai"
  register: kimurai_gem_present

- name: Install Kimurai gem
  command: gem install kimurai
  when: not kimurai_gem_present.stat.exists
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
|