kimurai 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/LICENSE.txt +21 -0
- data/README.md +1923 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/kimurai +6 -0
- data/kimurai.gemspec +48 -0
- data/lib/kimurai.rb +53 -0
- data/lib/kimurai/automation/deploy.yml +54 -0
- data/lib/kimurai/automation/setup.yml +44 -0
- data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
- data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
- data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
- data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
- data/lib/kimurai/base.rb +249 -0
- data/lib/kimurai/base/simple_saver.rb +98 -0
- data/lib/kimurai/base/uniq_checker.rb +22 -0
- data/lib/kimurai/base_helper.rb +22 -0
- data/lib/kimurai/browser_builder.rb +32 -0
- data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
- data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
- data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
- data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
- data/lib/kimurai/capybara_configuration.rb +10 -0
- data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
- data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
- data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
- data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
- data/lib/kimurai/capybara_ext/session.rb +150 -0
- data/lib/kimurai/capybara_ext/session/config.rb +18 -0
- data/lib/kimurai/cli.rb +157 -0
- data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
- data/lib/kimurai/cli/generator.rb +57 -0
- data/lib/kimurai/core_ext/array.rb +14 -0
- data/lib/kimurai/core_ext/numeric.rb +19 -0
- data/lib/kimurai/core_ext/string.rb +7 -0
- data/lib/kimurai/pipeline.rb +25 -0
- data/lib/kimurai/runner.rb +72 -0
- data/lib/kimurai/template/.gitignore +18 -0
- data/lib/kimurai/template/.ruby-version +1 -0
- data/lib/kimurai/template/Gemfile +20 -0
- data/lib/kimurai/template/README.md +3 -0
- data/lib/kimurai/template/config/application.rb +32 -0
- data/lib/kimurai/template/config/automation.yml +13 -0
- data/lib/kimurai/template/config/boot.rb +22 -0
- data/lib/kimurai/template/config/initializers/.keep +0 -0
- data/lib/kimurai/template/config/schedule.rb +57 -0
- data/lib/kimurai/template/db/.keep +0 -0
- data/lib/kimurai/template/helpers/application_helper.rb +3 -0
- data/lib/kimurai/template/lib/.keep +0 -0
- data/lib/kimurai/template/log/.keep +0 -0
- data/lib/kimurai/template/pipelines/saver.rb +11 -0
- data/lib/kimurai/template/pipelines/validator.rb +24 -0
- data/lib/kimurai/template/spiders/application_spider.rb +104 -0
- data/lib/kimurai/template/tmp/.keep +0 -0
- data/lib/kimurai/version.rb +3 -0
- metadata +349 -0
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "kimurai"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/exe/kimurai
ADDED
data/kimurai.gemspec
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
|
2
|
+
lib = File.expand_path("../lib", __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require "kimurai/version"
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "kimurai"
|
8
|
+
spec.version = Kimurai::VERSION
|
9
|
+
spec.authors = ["Victor Afanasev"]
|
10
|
+
spec.email = ["vicfreefly@gmail.com"]
|
11
|
+
|
12
|
+
spec.summary = "Modern web scraping framework written in Ruby and based on Capybara/Nokogiri"
|
13
|
+
spec.homepage = "https://github.com/vifreefly/kimurai"
|
14
|
+
spec.license = "MIT"
|
15
|
+
|
16
|
+
# Specify which files should be added to the gem when it is released.
|
17
|
+
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
18
|
+
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
19
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
20
|
+
end
|
21
|
+
spec.bindir = "exe"
|
22
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
23
|
+
spec.require_paths = ["lib"]
|
24
|
+
spec.required_ruby_version = ">= 2.5.0"
|
25
|
+
|
26
|
+
spec.add_dependency "thor"
|
27
|
+
spec.add_dependency "cliver"
|
28
|
+
spec.add_dependency "activesupport"
|
29
|
+
spec.add_dependency "murmurhash3"
|
30
|
+
spec.add_dependency "nokogiri"
|
31
|
+
|
32
|
+
spec.add_dependency "capybara", ">= 2.15", "< 4.0"
|
33
|
+
spec.add_dependency "capybara-mechanize"
|
34
|
+
spec.add_dependency "poltergeist"
|
35
|
+
spec.add_dependency "selenium-webdriver"
|
36
|
+
|
37
|
+
spec.add_dependency "headless"
|
38
|
+
spec.add_dependency "pmap"
|
39
|
+
|
40
|
+
spec.add_dependency "whenever"
|
41
|
+
|
42
|
+
spec.add_dependency "rbcat", "~> 0.2"
|
43
|
+
spec.add_dependency "pry"
|
44
|
+
|
45
|
+
spec.add_development_dependency "bundler", "~> 1.16"
|
46
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
47
|
+
spec.add_development_dependency "minitest", "~> 5.0"
|
48
|
+
end
|
data/lib/kimurai.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'ostruct'
|
2
|
+
require 'logger'
|
3
|
+
require 'json'
|
4
|
+
require 'active_support'
|
5
|
+
require 'active_support/core_ext'
|
6
|
+
require 'rbcat'
|
7
|
+
|
8
|
+
require_relative 'kimurai/version'
|
9
|
+
|
10
|
+
require_relative 'kimurai/core_ext/numeric'
|
11
|
+
require_relative 'kimurai/core_ext/string'
|
12
|
+
require_relative 'kimurai/core_ext/array'
|
13
|
+
|
14
|
+
require_relative 'kimurai/browser_builder'
|
15
|
+
require_relative 'kimurai/base_helper'
|
16
|
+
require_relative 'kimurai/pipeline'
|
17
|
+
require_relative 'kimurai/base'
|
18
|
+
|
19
|
+
# Top-level namespace of the gem. The singleton methods below expose
# process-wide settings (configuration object, environment name, time zone)
# and lookups over the classes returned by `Base.descendants`.
# NOTE(review): `descendants` is presumably ActiveSupport's Class#descendants
# (active_support/core_ext is required above) - confirm.
module Kimurai
|
20
|
+
class << self
|
21
|
+
# Returns the shared settings object, creating an empty OpenStruct on
# first access and reusing it afterwards.
def configuration
|
22
|
+
@configuration ||= OpenStruct.new
|
23
|
+
end
|
24
|
+
|
25
|
+
# Yields the shared configuration object to the caller's block so settings
# can be assigned on it. A block is required (`yield` is unconditional).
def configure
|
26
|
+
yield(configuration)
|
27
|
+
end
|
28
|
+
|
29
|
+
# Returns the value of the KIMURAI_ENV environment variable, or
# "development" when that variable is not set.
def env
|
30
|
+
ENV.fetch("KIMURAI_ENV") { "development" }
|
31
|
+
end
|
32
|
+
|
33
|
+
# Returns the current value of the TZ environment variable (nil if unset).
def time_zone
|
34
|
+
ENV["TZ"]
|
35
|
+
end
|
36
|
+
|
37
|
+
# Stores `value` in the TZ environment variable of the current process.
def time_zone=(value)
|
38
|
+
ENV.store("TZ", value)
|
39
|
+
end
|
40
|
+
|
41
|
+
# Builds a Hash mapping `klass.name` to the class for every entry of
# `Base.descendants`; entries whose `name` is nil/false are left out.
def list
|
42
|
+
Base.descendants.map do |klass|
|
43
|
+
# `next` yields nil for this entry; the nils are dropped by `compact` below.
next unless klass.name
|
44
|
+
[klass.name, klass]
|
45
|
+
end.compact.to_h
|
46
|
+
end
|
47
|
+
|
48
|
+
# Returns the first entry of `Base.descendants` whose `name` equals the
# given `name`, or nil when `name` is nil/false or nothing matches.
def find_by_name(name)
|
49
|
+
return unless name
|
50
|
+
Base.descendants.find { |klass| klass.name == name }
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
---
|
2
|
+
- hosts: all
|
3
|
+
vars:
|
4
|
+
rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
|
5
|
+
rbenv_shims_path: "{{ rbenv_root_path }}/shims"
|
6
|
+
repo_url:
|
7
|
+
repo_name:
|
8
|
+
repo_key_path:
|
9
|
+
|
10
|
+
tasks:
|
11
|
+
- name: Copy custom git ssh key to /tmp/private_key (if provided)
|
12
|
+
when: repo_key_path is not none
|
13
|
+
copy:
|
14
|
+
src: "{{ repo_key_path }}"
|
15
|
+
dest: /tmp/private_key
|
16
|
+
mode: 0600
|
17
|
+
|
18
|
+
- name: Clone/pull project repo to ~/{{ repo_name }} user directory (using ssh-agent forwarding or https)
|
19
|
+
when: repo_key_path is none
|
20
|
+
git:
|
21
|
+
repo: "{{ repo_url }}"
|
22
|
+
dest: "~/{{ repo_name }}"
|
23
|
+
force: true
|
24
|
+
accept_hostkey: true
|
25
|
+
|
26
|
+
- name: Clone/pull project repo to ~/{{ repo_name }} user directory (using custom git ssh key)
|
27
|
+
when: repo_key_path is not none
|
28
|
+
git:
|
29
|
+
repo: "{{ repo_url }}"
|
30
|
+
dest: "~/{{ repo_name }}"
|
31
|
+
force: true
|
32
|
+
accept_hostkey: true
|
33
|
+
key_file: /tmp/private_key
|
34
|
+
|
35
|
+
- name: Delete custom git ssh key from /tmp/private_key (if provided)
|
36
|
+
when: repo_key_path is not none
|
37
|
+
file:
|
38
|
+
state: absent
|
39
|
+
path: /tmp/private_key
|
40
|
+
|
41
|
+
- name: Run bundle install
|
42
|
+
command: bundle install
|
43
|
+
args:
|
44
|
+
chdir: ~/{{ repo_name }}
|
45
|
+
environment:
|
46
|
+
PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
|
47
|
+
|
48
|
+
- name: Run whenever to update crontab
|
49
|
+
command: whenever --update-crontab
|
50
|
+
args:
|
51
|
+
chdir: ~/{{ repo_name }}
|
52
|
+
environment:
|
53
|
+
PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
|
54
|
+
|
@@ -0,0 +1,44 @@
|
|
1
|
+
---
|
2
|
+
- hosts: all
|
3
|
+
vars:
|
4
|
+
ruby: 2.5.1
|
5
|
+
rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
|
6
|
+
rbenv_shims_path: "{{ rbenv_root_path }}/shims"
|
7
|
+
ruby_versions_path: "{{ rbenv_root_path }}/versions"
|
8
|
+
# check latest here http://phantomjs.org/download.html
|
9
|
+
phantomjs: 2.1.1
|
10
|
+
# check latest here https://github.com/mozilla/geckodriver/releases/
|
11
|
+
geckodriver: 0.21.0
|
12
|
+
# check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
|
13
|
+
# Quoted so YAML keeps the version a string; unquoted, a value such as 2.40
# would load as the float 2.4 and produce a wrong download URL.
chromedriver: "2.39"
|
14
|
+
|
15
|
+
tasks:
|
16
|
+
- name: Update apt cache
|
17
|
+
become: true
|
18
|
+
apt: update_cache=yes cache_valid_time=86400
|
19
|
+
|
20
|
+
- name: Install base packages
|
21
|
+
become: true
|
22
|
+
apt:
|
23
|
+
pkg: "{{ item }}"
|
24
|
+
state: present
|
25
|
+
with_items:
|
26
|
+
- xvfb
|
27
|
+
- libsqlite3-dev
|
28
|
+
- sqlite3
|
29
|
+
- mongodb-clients
|
30
|
+
- mysql-client
|
31
|
+
- libmysqlclient-dev
|
32
|
+
- postgresql-client
|
33
|
+
- libpq-dev
|
34
|
+
|
35
|
+
- import_tasks: setup/ruby_environment.yml
|
36
|
+
|
37
|
+
- import_tasks: setup/phantomjs.yml
|
38
|
+
become: true
|
39
|
+
|
40
|
+
- import_tasks: setup/firefox_geckodriver.yml
|
41
|
+
become: true
|
42
|
+
|
43
|
+
- import_tasks: setup/chromium_chromedriver.yml
|
44
|
+
become: true
|
@@ -0,0 +1,26 @@
|
|
1
|
+
---
|
2
|
+
- name: Install chromium browser
|
3
|
+
apt:
|
4
|
+
pkg: chromium-browser
|
5
|
+
state: present
|
6
|
+
|
7
|
+
- name: Get current chromedriver version
|
8
|
+
shell: chromedriver --version
|
9
|
+
args:
|
10
|
+
executable: /bin/bash
|
11
|
+
register: current_chromedriver_version
|
12
|
+
changed_when: false
|
13
|
+
ignore_errors: true
|
14
|
+
|
15
|
+
- name: Install unzip tool to unarchive chromedriver archive
|
16
|
+
apt:
|
17
|
+
pkg: unzip
|
18
|
+
state: present
|
19
|
+
|
20
|
+
- name: Download chromedriver binary archive and unarchive it to /usr/local/bin
|
21
|
+
unarchive:
|
22
|
+
src: https://chromedriver.storage.googleapis.com/{{ chromedriver }}/chromedriver_linux64.zip
|
23
|
+
dest: /usr/local/bin
|
24
|
+
remote_src: true
|
25
|
+
mode: a+x
|
26
|
+
# Substring check against the raw `chromedriver --version` output (as the
# geckodriver/phantomjs tasks do). Testing membership in `stdout_lines` never
# matched, so the archive was re-downloaded on every run. `| string` keeps the
# test valid when the version variable was loaded as a YAML float.
when: chromedriver | string not in current_chromedriver_version.stdout
|
@@ -0,0 +1,20 @@
|
|
1
|
+
---
|
2
|
+
- name: Install firefox
|
3
|
+
apt:
|
4
|
+
pkg: firefox
|
5
|
+
state: present
|
6
|
+
|
7
|
+
- name: Get current geckodriver version
|
8
|
+
shell: geckodriver --version
|
9
|
+
args:
|
10
|
+
executable: /bin/bash
|
11
|
+
register: current_geckodriver_version
|
12
|
+
changed_when: false
|
13
|
+
ignore_errors: true
|
14
|
+
|
15
|
+
- name: Download geckodriver binary archive and unarchive it to /usr/local/bin
|
16
|
+
unarchive:
|
17
|
+
src: https://github.com/mozilla/geckodriver/releases/download/v{{ geckodriver }}/geckodriver-v{{ geckodriver }}-linux64.tar.gz
|
18
|
+
dest: /usr/local/bin
|
19
|
+
remote_src: true
|
20
|
+
when: geckodriver not in current_geckodriver_version.stdout
|
@@ -0,0 +1,33 @@
|
|
1
|
+
---
|
2
|
+
- name: Install dependencies for PhantomJS
|
3
|
+
apt:
|
4
|
+
pkg: "{{ item }}"
|
5
|
+
state: present
|
6
|
+
with_items:
|
7
|
+
- chrpath
|
8
|
+
- libxft-dev
|
9
|
+
- libfreetype6
|
10
|
+
- libfreetype6-dev
|
11
|
+
- libfontconfig1
|
12
|
+
- libfontconfig1-dev
|
13
|
+
|
14
|
+
- name: Get current phantomjs version
|
15
|
+
shell: phantomjs -v
|
16
|
+
args:
|
17
|
+
executable: /bin/bash
|
18
|
+
register: current_phantomjs_version
|
19
|
+
changed_when: false
|
20
|
+
ignore_errors: true
|
21
|
+
|
22
|
+
- name: Download phantomJS archive and unarchive it to /usr/local/lib
|
23
|
+
unarchive:
|
24
|
+
src: https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-{{ phantomjs }}-linux-x86_64.tar.bz2
|
25
|
+
dest: /usr/local/lib
|
26
|
+
remote_src: true
|
27
|
+
when: phantomjs not in current_phantomjs_version.stdout
|
28
|
+
|
29
|
+
- name: Link PhantomJS binary to /usr/local/bin/phantomjs
|
30
|
+
file:
|
31
|
+
src: /usr/local/lib/phantomjs-{{ phantomjs }}-linux-x86_64/bin/phantomjs
|
32
|
+
dest: /usr/local/bin/phantomjs
|
33
|
+
state: link
|
@@ -0,0 +1,124 @@
|
|
1
|
+
---
|
2
|
+
- name: Install dependencies for ruby-build
|
3
|
+
become: true
|
4
|
+
apt:
|
5
|
+
pkg: "{{ item }}"
|
6
|
+
state: present
|
7
|
+
with_items:
|
8
|
+
- zlib1g-dev
|
9
|
+
- build-essential
|
10
|
+
- libssl-dev
|
11
|
+
- libreadline-dev
|
12
|
+
- libreadline6-dev
|
13
|
+
- libyaml-dev
|
14
|
+
- libxml2-dev
|
15
|
+
- libxslt1-dev
|
16
|
+
- libcurl4-openssl-dev
|
17
|
+
- libffi-dev
|
18
|
+
|
19
|
+
- name: Clone Rbenv repository to the {{ ansible_user_id }} user directory
|
20
|
+
git:
|
21
|
+
repo: https://github.com/sstephenson/rbenv.git
|
22
|
+
dest: "{{ rbenv_root_path }}"
|
23
|
+
|
24
|
+
- name: Clone ruby-build repo to the {{ ansible_user_id }} user directory
|
25
|
+
git:
|
26
|
+
repo: https://github.com/sstephenson/ruby-build.git
|
27
|
+
dest: "{{ rbenv_root_path }}/plugins/ruby-build"
|
28
|
+
|
29
|
+
- name: Add Rbenv path to the .bashrc
|
30
|
+
lineinfile:
|
31
|
+
dest: ~/.bashrc
|
32
|
+
regexp: '^export PATH="\$HOME\/\.rbenv'
|
33
|
+
line: export PATH="$HOME/.rbenv/bin:$PATH"
|
34
|
+
state: present
|
35
|
+
|
36
|
+
- name: Add Rbenv init to the .bashrc
|
37
|
+
lineinfile:
|
38
|
+
dest: ~/.bashrc
|
39
|
+
regexp: '^eval "\$\(rbenv'
|
40
|
+
line: eval "$(rbenv init -)"
|
41
|
+
state: present
|
42
|
+
|
43
|
+
- name: Check if desired Ruby version already installed
|
44
|
+
stat:
|
45
|
+
path: "{{ ruby_versions_path }}/{{ ruby }}"
|
46
|
+
register: ruby_present
|
47
|
+
|
48
|
+
- name: Install desired Ruby version using ruby-build (this can take a while)
|
49
|
+
command: rbenv install {{ ruby }}
|
50
|
+
when: not ruby_present.stat.exists
|
51
|
+
environment:
|
52
|
+
CONFIGURE_OPTS: "--disable-install-doc"
|
53
|
+
PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
|
54
|
+
|
55
|
+
- name: Get current Ruby version
|
56
|
+
command: "ruby -v"
|
57
|
+
register: current_ruby_version
|
58
|
+
changed_when: false
|
59
|
+
ignore_errors: true
|
60
|
+
environment:
|
61
|
+
PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
|
62
|
+
|
63
|
+
- name: Set desired Ruby version as a global version
|
64
|
+
command: "rbenv global {{ ruby }}"
|
65
|
+
when: ruby not in current_ruby_version.stdout
|
66
|
+
environment:
|
67
|
+
PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
|
68
|
+
register: set_ruby
|
69
|
+
|
70
|
+
- name: Execute `rbenv rehash` command
|
71
|
+
command: rbenv rehash
|
72
|
+
when: set_ruby.changed
|
73
|
+
environment:
|
74
|
+
PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
|
75
|
+
|
76
|
+
- name: Create ~/.gemrc file to skip docs
|
77
|
+
copy:
|
78
|
+
dest: ~/.gemrc
|
79
|
+
# `--no-ri`/`--no-rdoc` were removed in RubyGems 3.0 and make `gem install`
# fail there; `--no-document` is the equivalent flag (accepted since RubyGems 2.0).
content: "gem: --no-document"
|
80
|
+
|
81
|
+
- name: Create ~/.bundle directory
|
82
|
+
file:
|
83
|
+
dest: ~/.bundle
|
84
|
+
state: directory
|
85
|
+
|
86
|
+
- name: Create ~/.bundle/config file with default settings `BUNDLE_GIT__ALLOW_INSECURE true` and `BUNDLE_JOBS 4`
|
87
|
+
copy:
|
88
|
+
dest: ~/.bundle/config
|
89
|
+
content: |
|
90
|
+
BUNDLE_GIT__ALLOW_INSECURE: "true"
|
91
|
+
BUNDLE_JOBS: "4"
|
92
|
+
|
93
|
+
- name: Check if Bundler gem installed
|
94
|
+
stat:
|
95
|
+
path: "{{ ruby_versions_path }}/{{ ruby }}/bin/bundler"
|
96
|
+
register: bundler_gem_present
|
97
|
+
|
98
|
+
- name: Install Bundler gem
|
99
|
+
command: gem install bundler
|
100
|
+
when: not bundler_gem_present.stat.exists
|
101
|
+
environment:
|
102
|
+
PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
|
103
|
+
|
104
|
+
- name: Check if Whenever gem installed
|
105
|
+
stat:
|
106
|
+
path: "{{ ruby_versions_path }}/{{ ruby }}/bin/whenever"
|
107
|
+
register: whenever_gem_present
|
108
|
+
|
109
|
+
- name: Install Whenever gem
|
110
|
+
command: gem install whenever
|
111
|
+
when: not whenever_gem_present.stat.exists
|
112
|
+
environment:
|
113
|
+
PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
|
114
|
+
|
115
|
+
- name: Check if Kimurai gem installed
|
116
|
+
stat:
|
117
|
+
path: "{{ ruby_versions_path }}/{{ ruby }}/bin/kimurai"
|
118
|
+
register: kimurai_gem_present
|
119
|
+
|
120
|
+
- name: Install Kimurai gem
|
121
|
+
command: gem install kimurai
|
122
|
+
when: not kimurai_gem_present.stat.exists
|
123
|
+
environment:
|
124
|
+
PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
|