kimurai 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62)
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CODE_OF_CONDUCT.md +74 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +1923 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai.rb +53 -0
  14. data/lib/kimurai/automation/deploy.yml +54 -0
  15. data/lib/kimurai/automation/setup.yml +44 -0
  16. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  17. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  18. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  19. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  20. data/lib/kimurai/base.rb +249 -0
  21. data/lib/kimurai/base/simple_saver.rb +98 -0
  22. data/lib/kimurai/base/uniq_checker.rb +22 -0
  23. data/lib/kimurai/base_helper.rb +22 -0
  24. data/lib/kimurai/browser_builder.rb +32 -0
  25. data/lib/kimurai/browser_builder/mechanize_builder.rb +140 -0
  26. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +156 -0
  27. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +178 -0
  28. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +185 -0
  29. data/lib/kimurai/capybara_configuration.rb +10 -0
  30. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  31. data/lib/kimurai/capybara_ext/mechanize/driver.rb +55 -0
  32. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  33. data/lib/kimurai/capybara_ext/selenium/driver.rb +24 -0
  34. data/lib/kimurai/capybara_ext/session.rb +150 -0
  35. data/lib/kimurai/capybara_ext/session/config.rb +18 -0
  36. data/lib/kimurai/cli.rb +157 -0
  37. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  38. data/lib/kimurai/cli/generator.rb +57 -0
  39. data/lib/kimurai/core_ext/array.rb +14 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +25 -0
  43. data/lib/kimurai/runner.rb +72 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/.ruby-version +1 -0
  46. data/lib/kimurai/template/Gemfile +20 -0
  47. data/lib/kimurai/template/README.md +3 -0
  48. data/lib/kimurai/template/config/application.rb +32 -0
  49. data/lib/kimurai/template/config/automation.yml +13 -0
  50. data/lib/kimurai/template/config/boot.rb +22 -0
  51. data/lib/kimurai/template/config/initializers/.keep +0 -0
  52. data/lib/kimurai/template/config/schedule.rb +57 -0
  53. data/lib/kimurai/template/db/.keep +0 -0
  54. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  55. data/lib/kimurai/template/lib/.keep +0 -0
  56. data/lib/kimurai/template/log/.keep +0 -0
  57. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  58. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  59. data/lib/kimurai/template/spiders/application_spider.rb +104 -0
  60. data/lib/kimurai/template/tmp/.keep +0 -0
  61. data/lib/kimurai/version.rb +3 -0
  62. metadata +349 -0
@@ -0,0 +1,10 @@
# Rake tasks for the gem: Bundler's gem tasks plus a test runner.
require "bundler/gem_tasks"
require "rake/testtask"

# `rake test` runs every file matching test/**/*_test.rb with both the
# test/ and lib/ directories added to the load path.
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
end

# Running bare `rake` executes the test task.
task default: :test
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby

# Development console: loads the gem through Bundler and opens an IRB session.
require "bundler/setup"
require "kimurai"
require "irb"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier.
#
# A different console can be used instead of IRB. For Pry, add pry to your
# Gemfile and replace the IRB.start call below with:
#   require "pry"
#   Pry.start

IRB.start(__FILE__)
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Development bootstrap script: installs the gem's dependencies with Bundler.

# Abort on command failure, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail
# Split words on newlines and tabs only, never on spaces.
IFS=$'\n\t'
# Print each input line as it is read and each command as it is executed.
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,6 @@
#!/usr/bin/env ruby

# Executable entry point of the gem: loads the library and its CLI class,
# then passes the command-line arguments to Kimurai::CLI.
require "kimurai"
require "kimurai/cli"

Kimurai::CLI.start(ARGV)
@@ -0,0 +1,48 @@
# Gem specification for kimurai.

# Make lib/ loadable so the version constant can be read below.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "kimurai/version"

Gem::Specification.new do |spec|
  spec.name     = "kimurai"
  spec.version  = Kimurai::VERSION
  spec.authors  = ["Victor Afanasev"]
  spec.email    = ["vicfreefly@gmail.com"]

  spec.summary  = "Modern web scraping framework written in Ruby and based on Capybara/Nokogiri"
  spec.homepage = "https://github.com/vifreefly/kimurai"
  spec.license  = "MIT"

  # Files packaged into the gem: everything tracked by git (`git ls-files -z`
  # lists the tracked files, NUL-separated) except test, spec and features
  # directories.
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").grep_v(%r{^(test|spec|features)/})
  end
  spec.bindir                = "exe"
  # Every file under exe/ is exposed as an executable, by base name.
  spec.executables           = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths         = ["lib"]
  spec.required_ruby_version = ">= 2.5.0"

  # Runtime dependencies, declared in the same order as before.
  %w[thor cliver activesupport murmurhash3 nokogiri].each do |gem_name|
    spec.add_dependency gem_name
  end

  spec.add_dependency "capybara", ">= 2.15", "< 4.0"

  %w[capybara-mechanize poltergeist selenium-webdriver headless pmap whenever].each do |gem_name|
    spec.add_dependency gem_name
  end

  spec.add_dependency "rbcat", "~> 0.2"
  spec.add_dependency "pry"

  # Development-only dependencies.
  {
    "bundler"  => "~> 1.16",
    "rake"     => "~> 10.0",
    "minitest" => "~> 5.0"
  }.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end
end
@@ -0,0 +1,53 @@
1
+ require 'ostruct'
2
+ require 'logger'
3
+ require 'json'
4
+ require 'active_support'
5
+ require 'active_support/core_ext'
6
+ require 'rbcat'
7
+
8
+ require_relative 'kimurai/version'
9
+
10
+ require_relative 'kimurai/core_ext/numeric'
11
+ require_relative 'kimurai/core_ext/string'
12
+ require_relative 'kimurai/core_ext/array'
13
+
14
+ require_relative 'kimurai/browser_builder'
15
+ require_relative 'kimurai/base_helper'
16
+ require_relative 'kimurai/pipeline'
17
+ require_relative 'kimurai/base'
18
+
# Top-level namespace. Its module-level methods expose a shared configuration
# object, environment/time-zone helpers and lookups over the subclasses of Base.
module Kimurai
  # Returns the shared configuration object, an OpenStruct created on first use.
  def self.configuration
    @configuration ||= OpenStruct.new
  end

  # Yields the shared configuration object to the given block.
  def self.configure
    yield(configuration)
  end

  # Returns the value of the KIMURAI_ENV environment variable,
  # or "development" when it is not set.
  def self.env
    ENV.fetch("KIMURAI_ENV", "development")
  end

  # Returns the value of the TZ environment variable (nil when unset).
  def self.time_zone
    ENV["TZ"]
  end

  # Stores +value+ in the TZ environment variable.
  def self.time_zone=(value)
    ENV.store("TZ", value)
  end

  # Returns a Hash mapping class name => class for every descendant of Base
  # that has a name; descendants whose name is nil are left out.
  def self.list
    Base.descendants.each_with_object({}) do |klass, by_name|
      by_name[klass.name] = klass if klass.name
    end
  end

  # Returns the descendant of Base whose name equals +name+.
  # Returns nil when +name+ is nil/false or no descendant matches.
  def self.find_by_name(name)
    Base.descendants.find { |klass| klass.name == name } if name
  end
end
@@ -0,0 +1,54 @@
---
# Deploy playbook: clones/pulls the project repository into the remote user's
# home directory, runs `bundle install` there and updates the crontab with
# `whenever`.
- hosts: all
  vars:
    rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
    rbenv_shims_path: "{{ rbenv_root_path }}/shims"
    # Null by default. The tasks below read these values, so they are
    # presumably supplied by the caller (e.g. as extra vars) - confirm against
    # the code that invokes this playbook. repo_key_path may stay null, in
    # which case ssh-agent forwarding or https is used for the clone.
    repo_url:
    repo_name:
    repo_key_path:

  tasks:
    - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using ssh-agent forwarding or https)
      when: repo_key_path is none
      git:
        repo: "{{ repo_url }}"
        dest: "~/{{ repo_name }}"
        force: true
        accept_hostkey: true

    # The private key must only exist on the remote host while the clone runs.
    # `always` removes it even when the copy or the clone fails; as separate
    # sequential tasks, a failed clone would have left the key in /tmp.
    - when: repo_key_path is not none
      block:
        - name: Copy custom git ssh key to /tmp/private_key (if provided)
          copy:
            src: "{{ repo_key_path }}"
            dest: /tmp/private_key
            mode: "0600"

        - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using custom git ssh key)
          git:
            repo: "{{ repo_url }}"
            dest: "~/{{ repo_name }}"
            force: true
            accept_hostkey: true
            key_file: /tmp/private_key
      always:
        - name: Delete custom git ssh key from /tmp/private_key (if provided)
          file:
            state: absent
            path: /tmp/private_key

    # rbenv's bin and shims directories are prepended to PATH so that the
    # rbenv-managed Ruby executables are found first.
    - name: Run bundle install
      command: bundle install
      args:
        chdir: "~/{{ repo_name }}"
      environment:
        PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

    - name: Run whenever to update crontab
      command: whenever --update-crontab
      args:
        chdir: "~/{{ repo_name }}"
      environment:
        PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
@@ -0,0 +1,44 @@
---
# Setup playbook: installs base apt packages, then imports the task files that
# set up the Ruby environment, PhantomJS, Firefox/geckodriver and
# Chromium/chromedriver.
- hosts: all
  vars:
    # Version values are quoted so YAML always reads them as strings. Unquoted,
    # a two-part version is parsed as a float (2.39), and a value such as 2.40
    # would silently become 2.4 and produce a wrong download URL.
    ruby: "2.5.1"
    rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
    rbenv_shims_path: "{{ rbenv_root_path }}/shims"
    ruby_versions_path: "{{ rbenv_root_path }}/versions"
    # check latest here http://phantomjs.org/download.html
    phantomjs: "2.1.1"
    # check latest here https://github.com/mozilla/geckodriver/releases/
    geckodriver: "0.21.0"
    # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
    chromedriver: "2.39"

  tasks:
    # cache_valid_time: skip the update when the cache is younger than a day.
    - name: Update apt cache
      become: true
      apt:
        update_cache: yes
        cache_valid_time: 86400

    - name: Install base packages
      become: true
      apt:
        pkg: "{{ item }}"
        state: present
      with_items:
        - xvfb
        - libsqlite3-dev
        - sqlite3
        - mongodb-clients
        - mysql-client
        - libmysqlclient-dev
        - postgresql-client
        - libpq-dev

    # Imported without `become` at this level; the task file escalates
    # privileges itself only for its apt task.
    - import_tasks: setup/ruby_environment.yml

    # The remaining task files run entirely with privilege escalation.
    - import_tasks: setup/phantomjs.yml
      become: true

    - import_tasks: setup/firefox_geckodriver.yml
      become: true

    - import_tasks: setup/chromium_chromedriver.yml
      become: true
@@ -0,0 +1,26 @@
---
# Installs the Chromium browser and the chromedriver release named by the
# `chromedriver` variable, which this file reads but does not define.
- name: Install chromium browser
  apt:
    pkg: chromium-browser
    state: present

# Read-only probe: never reported as a change, and a failure (e.g. chromedriver
# not installed yet) is ignored so the download task below can still run.
- name: Get current chromedriver version
  shell: chromedriver --version
  args:
    executable: /bin/bash
  register: current_chromedriver_version
  changed_when: false
  ignore_errors: true

- name: Install unzip tool to unarchive chromedriver archive
  apt:
    pkg: unzip
    state: present

- name: Download chromedriver binary archive and unarchive it to /usr/local/bin
  unarchive:
    src: https://chromedriver.storage.googleapis.com/{{ chromedriver }}/chromedriver_linux64.zip
    dest: /usr/local/bin
    remote_src: true
    mode: a+x
  # Substring test against the probe output, like the geckodriver and phantomjs
  # task files. Testing membership in `stdout_lines` compared the version with
  # whole output lines, so it never matched and the archive was downloaded on
  # every run. `| string` keeps the test valid when the version was parsed as a
  # YAML float (e.g. 2.39).
  when: (chromedriver | string) not in current_chromedriver_version.stdout
@@ -0,0 +1,20 @@
---
# Installs Firefox and the geckodriver release named by the `geckodriver`
# variable, which this file reads but does not define.
- name: Install firefox
  apt:
    state: present
    pkg: firefox

# Read-only probe of the installed geckodriver: never counted as a change,
# and a failing command is ignored.
- name: Get current geckodriver version
  register: current_geckodriver_version
  changed_when: false
  ignore_errors: true
  shell: geckodriver --version
  args:
    executable: /bin/bash

# Skipped when the wanted version string already appears in the probe output.
- name: Download geckodriver binary archive and unarchive it to /usr/local/bin
  when: geckodriver not in current_geckodriver_version.stdout
  unarchive:
    remote_src: true
    src: https://github.com/mozilla/geckodriver/releases/download/v{{ geckodriver }}/geckodriver-v{{ geckodriver }}-linux64.tar.gz
    dest: /usr/local/bin
@@ -0,0 +1,33 @@
---
# Installs the PhantomJS release named by the `phantomjs` variable (read here,
# not defined here) under /usr/local/lib and links its binary into /usr/local/bin.
- name: Install dependencies for PhantomJS
  with_items:
    - chrpath
    - libxft-dev
    - libfreetype6
    - libfreetype6-dev
    - libfontconfig1
    - libfontconfig1-dev
  apt:
    state: present
    pkg: "{{ item }}"

# Read-only probe of the installed PhantomJS: never counted as a change,
# and a failing command is ignored.
- name: Get current phantomjs version
  register: current_phantomjs_version
  changed_when: false
  ignore_errors: true
  shell: phantomjs -v
  args:
    executable: /bin/bash

# Skipped when the wanted version string already appears in the probe output.
- name: Download phantomJS archive and unarchive it to /usr/local/lib
  when: phantomjs not in current_phantomjs_version.stdout
  unarchive:
    remote_src: true
    src: https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-{{ phantomjs }}-linux-x86_64.tar.bz2
    dest: /usr/local/lib

- name: Link PhantomJS binary to /usr/local/bin/phantomjs
  file:
    state: link
    src: /usr/local/lib/phantomjs-{{ phantomjs }}-linux-x86_64/bin/phantomjs
    dest: /usr/local/bin/phantomjs
@@ -0,0 +1,124 @@
---
# Sets up a per-user Ruby environment with rbenv and ruby-build, installs the
# Ruby version named by the `ruby` variable and then the bundler, whenever and
# kimurai gems. Reads `ruby`, `rbenv_root_path` and `ruby_versions_path`,
# none of which are defined in this file.
- name: Install dependencies for ruby-build
  become: true
  apt:
    pkg: "{{ item }}"
    state: present
  with_items:
    - zlib1g-dev
    - build-essential
    - libssl-dev
    - libreadline-dev
    - libreadline6-dev
    - libyaml-dev
    - libxml2-dev
    - libxslt1-dev
    - libcurl4-openssl-dev
    - libffi-dev

- name: Clone Rbenv repository to the {{ ansible_user_id }} user directory
  git:
    repo: https://github.com/sstephenson/rbenv.git
    dest: "{{ rbenv_root_path }}"

- name: Clone ruby-build repo to the {{ ansible_user_id }} user directory
  git:
    repo: https://github.com/sstephenson/ruby-build.git
    dest: "{{ rbenv_root_path }}/plugins/ruby-build"

# lineinfile with a regexp replaces an existing matching line instead of
# appending a duplicate on repeated runs.
- name: Add Rbenv path to the .bashrc
  lineinfile:
    dest: ~/.bashrc
    regexp: '^export PATH="\$HOME\/\.rbenv'
    line: export PATH="$HOME/.rbenv/bin:$PATH"
    state: present

- name: Add Rbenv init to the .bashrc
  lineinfile:
    dest: ~/.bashrc
    regexp: '^eval "\$\(rbenv'
    line: eval "$(rbenv init -)"
    state: present

- name: Check if desired Ruby version already installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}"
  register: ruby_present

# The tasks below prepend rbenv's bin and shims directories to PATH so the
# rbenv-managed executables are found.
- name: Install desired Ruby version using ruby-build (this can take a while)
  command: rbenv install {{ ruby }}
  when: not ruby_present.stat.exists
  environment:
    CONFIGURE_OPTS: "--disable-install-doc"
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

# Read-only probe: never counted as a change, and a failing command is ignored.
- name: Get current Ruby version
  command: "ruby -v"
  register: current_ruby_version
  changed_when: false
  ignore_errors: true
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Set desired Ruby version as a global version
  command: "rbenv global {{ ruby }}"
  when: ruby not in current_ruby_version.stdout
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
  register: set_ruby

- name: Execute `rbenv rehash` command
  command: rbenv rehash
  when: set_ruby.changed
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

# `--no-document` replaces the former `--no-ri --no-rdoc` pair: RubyGems 3
# removed those two flags, so with a newer `ruby` value every `gem install`
# below would fail on an invalid option. `--no-document` is accepted by
# RubyGems 2.x as well.
- name: Create ~/.gemrc file to skip docs
  copy:
    dest: ~/.gemrc
    content: "gem: --no-document"

- name: Create ~/.bundle directory
  file:
    dest: ~/.bundle
    state: directory

- name: Create ~/.bundle/config file with default settings `BUNDLE_GIT__ALLOW_INSECURE true` and `BUNDLE_JOBS 4`
  copy:
    dest: ~/.bundle/config
    content: |
      BUNDLE_GIT__ALLOW_INSECURE: "true"
      BUNDLE_JOBS: "4"

# Each gem is detected by the presence of its executable in the bin directory
# of the selected Ruby version, and installed only when that file is missing.
- name: Check if Bundler gem installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/bundler"
  register: bundler_gem_present

- name: Install Bundler gem
  command: gem install bundler
  when: not bundler_gem_present.stat.exists
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Check if Whenever gem installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/whenever"
  register: whenever_gem_present

- name: Install Whenever gem
  command: gem install whenever
  when: not whenever_gem_present.stat.exists
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Check if Kimurai gem installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/kimurai"
  register: kimurai_gem_present

- name: Install Kimurai gem
  command: gem install kimurai
  when: not kimurai_gem_present.stat.exists
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"