kimurai 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +1923 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai.rb +53 -0
  14. data/lib/kimurai/automation/deploy.yml +54 -0
  15. data/lib/kimurai/automation/setup.yml +44 -0
  16. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  17. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  18. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  19. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  20. data/lib/kimurai/base.rb +249 -0
  21. data/lib/kimurai/base/simple_saver.rb +98 -0
  22. data/lib/kimurai/base/uniq_checker.rb +22 -0
  23. data/lib/kimurai/base_helper.rb +22 -0
  24. data/lib/kimurai/browser_builder.rb +32 -0
  25. data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
  26. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
  27. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
  28. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
  29. data/lib/kimurai/capybara_configuration.rb +10 -0
  30. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  31. data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
  32. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  33. data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
  34. data/lib/kimurai/capybara_ext/session.rb +150 -0
  35. data/lib/kimurai/capybara_ext/session/config.rb +18 -0
  36. data/lib/kimurai/cli.rb +157 -0
  37. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  38. data/lib/kimurai/cli/generator.rb +57 -0
  39. data/lib/kimurai/core_ext/array.rb +14 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +25 -0
  43. data/lib/kimurai/runner.rb +72 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/.ruby-version +1 -0
  46. data/lib/kimurai/template/Gemfile +20 -0
  47. data/lib/kimurai/template/README.md +3 -0
  48. data/lib/kimurai/template/config/application.rb +32 -0
  49. data/lib/kimurai/template/config/automation.yml +13 -0
  50. data/lib/kimurai/template/config/boot.rb +22 -0
  51. data/lib/kimurai/template/config/initializers/.keep +0 -0
  52. data/lib/kimurai/template/config/schedule.rb +57 -0
  53. data/lib/kimurai/template/db/.keep +0 -0
  54. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  55. data/lib/kimurai/template/lib/.keep +0 -0
  56. data/lib/kimurai/template/log/.keep +0 -0
  57. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  58. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  59. data/lib/kimurai/template/spiders/application_spider.rb +104 -0
  60. data/lib/kimurai/template/tmp/.keep +0 -0
  61. data/lib/kimurai/version.rb +3 -0
  62. metadata +349 -0
@@ -0,0 +1,10 @@
# Rake tasks for the gem: Bundler's gem tasks plus a `test` task that runs
# every file matching test/**/*_test.rb. `test` is also the default task.
require "bundler/gem_tasks"
require "rake/testtask"

Rake::TestTask.new(:test) do |test_task|
  # Make both the test helpers and the library itself requirable from tests.
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
end

task default: :test
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby

# Development console: loads the gem through Bundler and opens an IRB session.
%w[bundler/setup kimurai].each { |feature| require feature }

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# To use Pry instead (don't forget to add pry to your Gemfile!):
# require "pry"
# Pry.start

require "irb"
IRB.start(__FILE__)
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Development bootstrap: installs the gem's dependencies with Bundler.

# Abort on command failure, on use of an unset variable, and when any stage
# of a pipeline fails.
set -o errexit -o nounset -o pipefail
# Split words on newlines and tabs only, never on spaces.
IFS=$'\n\t'
# Print each line as it is read and each command as it is executed.
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,6 @@
#!/usr/bin/env ruby

# Command-line entry point: loads the framework and its CLI, then passes the
# process arguments to the CLI.
%w[kimurai kimurai/cli].each { |feature| require feature }

Kimurai::CLI.start(ARGV)
@@ -0,0 +1,48 @@
# Gem specification for kimurai: metadata, packaged files and dependencies.

# Put the gem's own lib/ directory on the load path so the version constant
# can be required straight from the source tree.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "kimurai/version"

Gem::Specification.new do |spec|
  spec.name     = "kimurai"
  spec.version  = Kimurai::VERSION
  spec.authors  = ["Victor Afanasev"]
  spec.email    = ["vicfreefly@gmail.com"]

  spec.summary  = "Modern web scraping framework written in Ruby and based on Capybara/Nokogiri"
  spec.homepage = "https://github.com/vifreefly/kimurai"
  spec.license  = "MIT"

  # Package every file tracked by git (`git ls-files -z` lists them
  # NUL-separated), leaving out the test/, spec/ and features/ directories.
  gem_root = File.expand_path('..', __FILE__)
  spec.files = Dir.chdir(gem_root) do
    tracked_paths = `git ls-files -z`.split("\x0")
    tracked_paths.reject { |path| path.match?(%r{^(test|spec|features)/}) }
  end

  # Every packaged file under exe/ is exposed as an executable.
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths = ["lib"]
  spec.required_ruby_version = ">= 2.5.0"

  # Runtime dependencies, declared in the original order.
  %w[thor cliver activesupport murmurhash3 nokogiri].each do |gem_name|
    spec.add_dependency gem_name
  end

  spec.add_dependency "capybara", ">= 2.15", "< 4.0"

  %w[capybara-mechanize poltergeist selenium-webdriver headless pmap whenever].each do |gem_name|
    spec.add_dependency gem_name
  end

  spec.add_dependency "rbcat", "~> 0.2"
  spec.add_dependency "pry"

  # Development-only dependencies.
  {
    "bundler"  => "~> 1.16",
    "rake"     => "~> 10.0",
    "minitest" => "~> 5.0"
  }.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end
end
@@ -0,0 +1,53 @@
1
+ require 'ostruct'
2
+ require 'logger'
3
+ require 'json'
4
+ require 'active_support'
5
+ require 'active_support/core_ext'
6
+ require 'rbcat'
7
+
8
+ require_relative 'kimurai/version'
9
+
10
+ require_relative 'kimurai/core_ext/numeric'
11
+ require_relative 'kimurai/core_ext/string'
12
+ require_relative 'kimurai/core_ext/array'
13
+
14
+ require_relative 'kimurai/browser_builder'
15
+ require_relative 'kimurai/base_helper'
16
+ require_relative 'kimurai/pipeline'
17
+ require_relative 'kimurai/base'
18
+
# Top-level namespace of the framework. Its singleton methods expose the
# shared configuration object, the current environment name, the process
# time zone, and lookups over the named descendants of Base.
module Kimurai
  class << self
    # Returns the shared configuration object, creating an empty OpenStruct
    # on first access.
    def configuration
      @configuration ||= OpenStruct.new
    end

    # Yields the shared configuration object to the given block.
    def configure
      yield(configuration)
    end

    # Returns the value of the KIMURAI_ENV environment variable, or
    # "development" when it is not set.
    def env
      ENV.fetch("KIMURAI_ENV") { "development" }
    end

    # Returns the value of the TZ environment variable (nil when unset).
    def time_zone
      ENV["TZ"]
    end

    # Stores +value+ in the TZ environment variable.
    def time_zone=(value)
      ENV.store("TZ", value)
    end

    # Returns a Hash mapping class name => class for every descendant of
    # Base that has a name; anonymous classes are left out.
    def list
      Base.descendants.each_with_object({}) do |spider, registry|
        registry[spider.name] = spider if spider.name
      end
    end

    # Returns the first descendant of Base whose class name equals +name+,
    # or nil when +name+ is nil/false or nothing matches.
    def find_by_name(name)
      Base.descendants.find { |spider| spider.name == name } if name
    end
  end
end
@@ -0,0 +1,54 @@
---
# Deploy playbook: fetches the project repository into ~/<repo_name> on the
# target hosts, installs its gems with Bundler and refreshes the crontab
# with whenever.
#
# repo_url, repo_name and repo_key_path default to null here and are
# presumably supplied by the caller (e.g. as extra vars) -- confirm.
- hosts: all
  vars:
    rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
    rbenv_shims_path: "{{ rbenv_root_path }}/shims"
    repo_url:
    repo_name:
    repo_key_path:

  tasks:
    - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using ssh-agent forwarding or https)
      when: repo_key_path is none
      git:
        repo: "{{ repo_url }}"
        dest: "~/{{ repo_name }}"
        force: true
        accept_hostkey: true

    # The custom-key steps are grouped in a block so that the `always`
    # section removes the private key from /tmp even when the clone fails.
    - when: repo_key_path is not none
      block:
        - name: Copy custom git ssh key to /tmp/private_key (if provided)
          copy:
            src: "{{ repo_key_path }}"
            dest: /tmp/private_key
            mode: "0600"

        - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using custom git ssh key)
          git:
            repo: "{{ repo_url }}"
            dest: "~/{{ repo_name }}"
            force: true
            accept_hostkey: true
            key_file: /tmp/private_key
      always:
        - name: Delete custom git ssh key from /tmp/private_key (if provided)
          file:
            state: absent
            path: /tmp/private_key

    # Both commands run inside the project directory with the rbenv bin and
    # shims directories prepended to PATH.
    - name: Run bundle install
      command: bundle install
      args:
        chdir: ~/{{ repo_name }}
      environment:
        PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

    - name: Run whenever to update crontab
      command: whenever --update-crontab
      args:
        chdir: ~/{{ repo_name }}
      environment:
        PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
@@ -0,0 +1,44 @@
---
# Setup playbook: installs base system packages, then imports the task files
# for the Ruby environment, PhantomJS, Firefox/geckodriver and
# Chromium/chromedriver.
- hosts: all
  vars:
    # Version values are quoted so YAML always reads them as strings; an
    # unquoted two-part version such as 2.40 would be parsed as the float 2.4.
    ruby: "2.5.1"
    rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
    rbenv_shims_path: "{{ rbenv_root_path }}/shims"
    ruby_versions_path: "{{ rbenv_root_path }}/versions"
    # check latest here http://phantomjs.org/download.html
    phantomjs: "2.1.1"
    # check latest here https://github.com/mozilla/geckodriver/releases/
    geckodriver: "0.21.0"
    # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
    chromedriver: "2.39"

  tasks:
    - name: Update apt cache
      become: true
      apt:
        update_cache: yes
        cache_valid_time: 86400

    # The package list is passed to apt directly (one transaction) instead
    # of looping over items.
    - name: Install base packages
      become: true
      apt:
        pkg:
          - xvfb
          - libsqlite3-dev
          - sqlite3
          - mongodb-clients
          - mysql-client
          - libmysqlclient-dev
          - postgresql-client
          - libpq-dev
        state: present

    - import_tasks: setup/ruby_environment.yml

    - import_tasks: setup/phantomjs.yml
      become: true

    - import_tasks: setup/firefox_geckodriver.yml
      become: true

    - import_tasks: setup/chromium_chromedriver.yml
      become: true
@@ -0,0 +1,26 @@
---
# Installs the Chromium browser and places the chromedriver binary for the
# version given by the `chromedriver` variable in /usr/local/bin.
- name: Install chromium browser
  apt:
    pkg: chromium-browser
    state: present

# Failure is ignored so that a host without chromedriver simply continues to
# the download step.
- name: Get current chromedriver version
  shell: chromedriver --version
  args:
    executable: /bin/bash
  register: current_chromedriver_version
  changed_when: false
  ignore_errors: true

- name: Install unzip tool to unarchive chromedriver archive
  apt:
    pkg: unzip
    state: present

# Substring test against the command output: the download is skipped when
# the desired version already appears in it. The version is cast to a string
# because an unquoted value such as 2.39 is parsed by YAML as a float.
- name: Download chromedriver binary archive and unarchive it to /usr/local/bin
  unarchive:
    src: https://chromedriver.storage.googleapis.com/{{ chromedriver }}/chromedriver_linux64.zip
    dest: /usr/local/bin
    remote_src: true
    mode: a+x
  when: (chromedriver | string) not in current_chromedriver_version.stdout
@@ -0,0 +1,20 @@
---
# Installs Firefox and places the geckodriver binary for the version given by
# the `geckodriver` variable in /usr/local/bin.
- name: Install firefox
  apt:
    state: present
    pkg: firefox

# Failure is ignored so that a host without geckodriver simply continues to
# the download step.
- name: Get current geckodriver version
  register: current_geckodriver_version
  shell: geckodriver --version
  args:
    executable: /bin/bash
  ignore_errors: true
  changed_when: false

# Skipped when the desired version already appears in the command output.
- name: Download geckodriver binary archive and unarchive it to /usr/local/bin
  when: geckodriver not in current_geckodriver_version.stdout
  unarchive:
    remote_src: true
    src: "https://github.com/mozilla/geckodriver/releases/download/v{{ geckodriver }}/geckodriver-v{{ geckodriver }}-linux64.tar.gz"
    dest: /usr/local/bin
@@ -0,0 +1,33 @@
---
# Installs PhantomJS: its library dependencies, the release archive for the
# version given by the `phantomjs` variable (unpacked under /usr/local/lib),
# and a symlink to the binary in /usr/local/bin.

# The package list is passed to apt directly (one transaction) instead of
# looping over items.
- name: Install dependencies for PhantomJS
  apt:
    pkg:
      - chrpath
      - libxft-dev
      - libfreetype6
      - libfreetype6-dev
      - libfontconfig1
      - libfontconfig1-dev
    state: present

# Failure is ignored so that a host without phantomjs simply continues to
# the download step.
- name: Get current phantomjs version
  shell: phantomjs -v
  args:
    executable: /bin/bash
  register: current_phantomjs_version
  changed_when: false
  ignore_errors: true

# Skipped when the desired version already appears in the command output.
- name: Download phantomJS archive and unarchive it to /usr/local/lib
  unarchive:
    src: https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-{{ phantomjs }}-linux-x86_64.tar.bz2
    dest: /usr/local/lib
    remote_src: true
  when: phantomjs not in current_phantomjs_version.stdout

- name: Link PhantomJS binary to /usr/local/bin/phantomjs
  file:
    src: /usr/local/lib/phantomjs-{{ phantomjs }}-linux-x86_64/bin/phantomjs
    dest: /usr/local/bin/phantomjs
    state: link
@@ -0,0 +1,124 @@
---
# Prepares the Ruby environment for the connecting user: build dependencies,
# rbenv with the ruby-build plugin, the Ruby version given by the `ruby`
# variable (set as rbenv's global version), gem/bundler defaults, and the
# bundler, whenever and kimurai gems.
#
# Tasks that call rbenv, ruby or gem prepend the rbenv bin and shims
# directories to PATH through `environment`.

# The package list is passed to apt directly (one transaction) instead of
# looping over items.
- name: Install dependencies for ruby-build
  become: true
  apt:
    pkg:
      - zlib1g-dev
      - build-essential
      - libssl-dev
      - libreadline-dev
      - libreadline6-dev
      - libyaml-dev
      - libxml2-dev
      - libxslt1-dev
      - libcurl4-openssl-dev
      - libffi-dev
    state: present

- name: Clone Rbenv repository to the {{ ansible_user_id }} user directory
  git:
    repo: https://github.com/sstephenson/rbenv.git
    dest: "{{ rbenv_root_path }}"

- name: Clone ruby-build repo to the {{ ansible_user_id }} user directory
  git:
    repo: https://github.com/sstephenson/ruby-build.git
    dest: "{{ rbenv_root_path }}/plugins/ruby-build"

- name: Add Rbenv path to the .bashrc
  lineinfile:
    dest: ~/.bashrc
    regexp: '^export PATH="\$HOME\/\.rbenv'
    line: export PATH="$HOME/.rbenv/bin:$PATH"
    state: present

- name: Add Rbenv init to the .bashrc
  lineinfile:
    dest: ~/.bashrc
    regexp: '^eval "\$\(rbenv'
    line: eval "$(rbenv init -)"
    state: present

- name: Check if desired Ruby version already installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}"
  register: ruby_present

- name: Install desired Ruby version using ruby-build (this can take a while)
  command: rbenv install {{ ruby }}
  when: not ruby_present.stat.exists
  environment:
    CONFIGURE_OPTS: "--disable-install-doc"
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

# Failure is ignored so that a host where `ruby` is not runnable yet still
# reaches the `rbenv global` step.
- name: Get current Ruby version
  command: "ruby -v"
  register: current_ruby_version
  changed_when: false
  ignore_errors: true
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Set desired Ruby version as a global version
  command: "rbenv global {{ ruby }}"
  when: ruby not in current_ruby_version.stdout
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
  register: set_ruby

- name: Execute `rbenv rehash` command
  command: rbenv rehash
  when: set_ruby.changed
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

# `--no-document` replaces the older `--no-ri --no-rdoc` pair, which
# RubyGems 3 no longer accepts.
- name: Create ~/.gemrc file to skip docs
  copy:
    dest: ~/.gemrc
    content: "gem: --no-document"

- name: Create ~/.bundle directory
  file:
    dest: ~/.bundle
    state: directory

- name: Create ~/.bundle/config file with default settings `BUNDLE_GIT__ALLOW_INSECURE true` and `BUNDLE_JOBS 4`
  copy:
    dest: ~/.bundle/config
    content: |
      BUNDLE_GIT__ALLOW_INSECURE: "true"
      BUNDLE_JOBS: "4"

# Each gem below is installed only when its executable is missing from the
# selected Ruby version's bin directory.
- name: Check if Bundler gem installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/bundler"
  register: bundler_gem_present

- name: Install Bundler gem
  command: gem install bundler
  when: not bundler_gem_present.stat.exists
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Check if Whenever gem installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/whenever"
  register: whenever_gem_present

- name: Install Whenever gem
  command: gem install whenever
  when: not whenever_gem_present.stat.exists
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Check if Kimurai gem installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/kimurai"
  register: kimurai_gem_present

- name: Install Kimurai gem
  command: gem install kimurai
  when: not kimurai_gem_present.stat.exists
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"