kimurai 1.4.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -0
  3. data/CHANGELOG.md +21 -0
  4. data/Gemfile +2 -2
  5. data/README.md +476 -648
  6. data/Rakefile +6 -6
  7. data/bin/console +3 -4
  8. data/exe/kimurai +0 -1
  9. data/kimurai.gemspec +38 -37
  10. data/lib/kimurai/base/saver.rb +15 -19
  11. data/lib/kimurai/base/storage.rb +1 -1
  12. data/lib/kimurai/base.rb +38 -38
  13. data/lib/kimurai/base_helper.rb +5 -4
  14. data/lib/kimurai/browser_builder/mechanize_builder.rb +121 -119
  15. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +160 -152
  16. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +162 -160
  17. data/lib/kimurai/browser_builder.rb +1 -7
  18. data/lib/kimurai/capybara_configuration.rb +1 -1
  19. data/lib/kimurai/capybara_ext/driver/base.rb +50 -46
  20. data/lib/kimurai/capybara_ext/mechanize/driver.rb +51 -50
  21. data/lib/kimurai/capybara_ext/selenium/driver.rb +33 -29
  22. data/lib/kimurai/capybara_ext/session.rb +31 -38
  23. data/lib/kimurai/cli/generator.rb +15 -15
  24. data/lib/kimurai/cli.rb +49 -86
  25. data/lib/kimurai/core_ext/array.rb +2 -2
  26. data/lib/kimurai/core_ext/hash.rb +1 -1
  27. data/lib/kimurai/core_ext/numeric.rb +4 -4
  28. data/lib/kimurai/pipeline.rb +2 -1
  29. data/lib/kimurai/runner.rb +6 -6
  30. data/lib/kimurai/template/Gemfile +2 -2
  31. data/lib/kimurai/template/config/boot.rb +4 -4
  32. data/lib/kimurai/template/config/schedule.rb +15 -15
  33. data/lib/kimurai/template/spiders/application_spider.rb +8 -14
  34. data/lib/kimurai/version.rb +1 -1
  35. data/lib/kimurai.rb +7 -3
  36. metadata +58 -65
  37. data/.travis.yml +0 -5
  38. data/lib/kimurai/automation/deploy.yml +0 -54
  39. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +0 -26
  40. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +0 -20
  41. data/lib/kimurai/automation/setup/phantomjs.yml +0 -33
  42. data/lib/kimurai/automation/setup/ruby_environment.yml +0 -124
  43. data/lib/kimurai/automation/setup.yml +0 -44
  44. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +0 -175
  45. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +0 -13
  46. data/lib/kimurai/cli/ansible_command_builder.rb +0 -71
  47. data/lib/kimurai/template/config/automation.yml +0 -13
@@ -1,124 +0,0 @@
1
- ---
2
- - name: Install dependencies for ruby-build
3
- become: true
4
- apt:
5
- pkg: "{{ item }}"
6
- state: present
7
- with_items:
8
- - zlib1g-dev
9
- - build-essential
10
- - libssl-dev
11
- - libreadline-dev
12
- - libreadline6-dev
13
- - libyaml-dev
14
- - libxml2-dev
15
- - libxslt1-dev
16
- - libcurl4-openssl-dev
17
- - libffi-dev
18
-
19
- - name: Clone Rbenv repository to the {{ ansible_user_id }} user directory
20
- git:
21
- repo: https://github.com/sstephenson/rbenv.git
22
- dest: "{{ rbenv_root_path }}"
23
-
24
- - name: Clone ruby-build repo to the {{ ansible_user_id }} user directory
25
- git:
26
- repo: https://github.com/sstephenson/ruby-build.git
27
- dest: "{{ rbenv_root_path }}/plugins/ruby-build"
28
-
29
- - name: Add Rbenv path to the .bashrc
30
- lineinfile:
31
- dest: ~/.bashrc
32
- regexp: '^export PATH="\$HOME\/\.rbenv'
33
- line: export PATH="$HOME/.rbenv/bin:$PATH"
34
- state: present
35
-
36
- - name: Add Rbenv init to the .bashrc
37
- lineinfile:
38
- dest: ~/.bashrc
39
- regexp: '^eval "\$\(rbenv'
40
- line: eval "$(rbenv init -)"
41
- state: present
42
-
43
- - name: Check if desired Ruby version already installed
44
- stat:
45
- path: "{{ ruby_versions_path }}/{{ ruby }}"
46
- register: ruby_present
47
-
48
- - name: Install desired Ruby version using ruby-build (this can take a while)
49
- command: rbenv install {{ ruby }}
50
- when: not ruby_present.stat.exists
51
- environment:
52
- CONFIGURE_OPTS: "--disable-install-doc"
53
- PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
54
-
55
- - name: Get current Ruby version
56
- command: "ruby -v"
57
- register: current_ruby_version
58
- changed_when: false
59
- ignore_errors: true
60
- environment:
61
- PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
62
-
63
- - name: Set desired Ruby version as a global version
64
- command: "rbenv global {{ ruby }}"
65
- when: ruby not in current_ruby_version.stdout
66
- environment:
67
- PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
68
- register: set_ruby
69
-
70
- - name: Execute `rbenv rehash` command
71
- command: rbenv rehash
72
- when: set_ruby.changed
73
- environment:
74
- PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
75
-
76
- - name: Create ~/.gemrc file to skip docs
77
- copy:
78
- dest: ~/.gemrc
79
- content: "gem: --no-ri --no-rdoc"
80
-
81
- - name: Create ~/.bundle directory
82
- file:
83
- dest: ~/.bundle
84
- state: directory
85
-
86
- - name: Create ~/.bundle/config file with default settings `BUNDLE_GIT__ALLOW_INSECURE true` and `BUNDLE_JOBS 4`
87
- copy:
88
- dest: ~/.bundle/config
89
- content: |
90
- BUNDLE_GIT__ALLOW_INSECURE: "true"
91
- BUNDLE_JOBS: "4"
92
-
93
- - name: Check if Bundler gem installed
94
- stat:
95
- path: "{{ ruby_versions_path }}/{{ ruby }}/bin/bundler"
96
- register: bundler_gem_present
97
-
98
- - name: Install Bundler gem
99
- command: gem install bundler
100
- when: not bundler_gem_present.stat.exists
101
- environment:
102
- PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
103
-
104
- - name: Check if Whenever gem installed
105
- stat:
106
- path: "{{ ruby_versions_path }}/{{ ruby }}/bin/whenever"
107
- register: whenever_gem_present
108
-
109
- - name: Install Whenever gem
110
- command: gem install whenever
111
- when: not whenever_gem_present.stat.exists
112
- environment:
113
- PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
114
-
115
- - name: Check if Kimurai gem installed
116
- stat:
117
- path: "{{ ruby_versions_path }}/{{ ruby }}/bin/kimurai"
118
- register: kimurai_gem_present
119
-
120
- - name: Install Kimurai gem
121
- command: gem install kimurai
122
- when: not kimurai_gem_present.stat.exists
123
- environment:
124
- PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
@@ -1,44 +0,0 @@
1
- ---
2
- - hosts: all
3
- vars:
4
- ruby: 2.5.3
5
- rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
6
- rbenv_shims_path: "{{ rbenv_root_path }}/shims"
7
- ruby_versions_path: "{{ rbenv_root_path }}/versions"
8
- # check latest here http://phantomjs.org/download.html
9
- phantomjs: 2.1.1
10
- # check latest here https://github.com/mozilla/geckodriver/releases/
11
- geckodriver: 0.23.0
12
- # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
13
- chromedriver: 2.44
14
-
15
- tasks:
16
- - name: Update apt cache
17
- become: true
18
- apt: update_cache=yes cache_valid_time=86400
19
-
20
- - name: Install base packages
21
- become: true
22
- apt:
23
- pkg: "{{ item }}"
24
- state: present
25
- with_items:
26
- - xvfb
27
- - libsqlite3-dev
28
- - sqlite3
29
- - mongodb-clients
30
- - mysql-client
31
- - libmysqlclient-dev
32
- - postgresql-client
33
- - libpq-dev
34
-
35
- - import_tasks: setup/ruby_environment.yml
36
-
37
- - import_tasks: setup/phantomjs.yml
38
- become: true
39
-
40
- - import_tasks: setup/firefox_geckodriver.yml
41
- become: true
42
-
43
- - import_tasks: setup/chromium_chromedriver.yml
44
- become: true
@@ -1,175 +0,0 @@
1
- require 'capybara'
2
- require 'capybara/poltergeist'
3
- require_relative '../capybara_configuration'
4
- require_relative '../capybara_ext/poltergeist/driver'
5
- require_relative '../capybara_ext/session'
6
-
7
- module Kimurai::BrowserBuilder
8
- class PoltergeistPhantomjsBuilder
9
- attr_reader :logger, :spider
10
-
11
- def initialize(config, spider:)
12
- @config = config
13
- @spider = spider
14
- @logger = spider.logger
15
- end
16
-
17
- def build
18
- # Register driver
19
- Capybara.register_driver :poltergeist_phantomjs do |app|
20
- # Create driver options
21
- driver_options = {
22
- js_errors: false, debug: false, inspector: false, phantomjs_options: []
23
- }
24
-
25
- if extensions = @config[:extensions].presence
26
- driver_options[:extensions] = extensions
27
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled extensions"
28
- end
29
-
30
- # Window size
31
- if size = @config[:window_size].presence
32
- driver_options[:window_size] = size
33
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled window_size"
34
- end
35
-
36
- # SSL
37
- if ssl_cert_path = @config[:ssl_cert_path].presence
38
- driver_options[:phantomjs_options] << "--ssl-certificates-path=#{ssl_cert_path}"
39
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom ssl_cert"
40
- end
41
-
42
- if @config[:ignore_ssl_errors].present?
43
- driver_options[:phantomjs_options].push("--ignore-ssl-errors=yes", "--ssl-protocol=any")
44
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled ignore_ssl_errors"
45
- end
46
-
47
- # Disable images
48
- if @config[:disable_images].present?
49
- driver_options[:phantomjs_options] << "--load-images=no"
50
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled disable_images"
51
- end
52
-
53
- Capybara::Poltergeist::Driver.new(app, driver_options)
54
- end
55
-
56
- # Create browser instance (Capybara session)
57
- @browser = Capybara::Session.new(:poltergeist_phantomjs)
58
- @browser.spider = spider
59
- logger.debug "BrowserBuilder (poltergeist_phantomjs): created browser instance"
60
-
61
- # Proxy
62
- if proxy = @config[:proxy].presence
63
- proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
64
- ip, port, type = proxy_string.split(":")
65
-
66
- if %w(http socks5).include?(type)
67
- @browser.driver.set_proxy(*proxy_string.split(":"))
68
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled #{type} proxy, ip: #{ip}, port: #{port}"
69
- else
70
- logger.error "BrowserBuilder (poltergeist_phantomjs): wrong type of proxy: #{type}, skipped"
71
- end
72
- end
73
-
74
- # Headers
75
- if headers = @config[:headers].presence
76
- @browser.driver.headers = headers
77
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom headers"
78
- end
79
-
80
- if user_agent = @config[:user_agent].presence
81
- user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
82
-
83
- @browser.driver.add_header("User-Agent", user_agent_string)
84
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom user_agent"
85
- end
86
-
87
- # Cookies
88
- if cookies = @config[:cookies].presence
89
- cookies.each do |cookie|
90
- @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
91
- end
92
-
93
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom cookies"
94
- end
95
-
96
- # Browser instance options
97
- # skip_request_errors
98
- if skip_errors = @config[:skip_request_errors].presence
99
- @browser.config.skip_request_errors = skip_errors
100
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled skip_request_errors"
101
- end
102
-
103
- # retry_request_errors
104
- if retry_errors = @config[:retry_request_errors].presence
105
- @browser.config.retry_request_errors = retry_errors
106
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled retry_request_errors"
107
- end
108
-
109
- # restart_if
110
- if requests_limit = @config.dig(:restart_if, :requests_limit).presence
111
- @browser.config.restart_if[:requests_limit] = requests_limit
112
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.requests_limit >= #{requests_limit}"
113
- end
114
-
115
- if memory_limit = @config.dig(:restart_if, :memory_limit).presence
116
- @browser.config.restart_if[:memory_limit] = memory_limit
117
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.memory_limit >= #{memory_limit}"
118
- end
119
-
120
- # before_request clear_cookies
121
- if @config.dig(:before_request, :clear_cookies)
122
- @browser.config.before_request[:clear_cookies] = true
123
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_cookies"
124
- end
125
-
126
- # before_request clear_and_set_cookies
127
- if @config.dig(:before_request, :clear_and_set_cookies)
128
- if cookies = @config[:cookies].presence
129
- @browser.config.cookies = cookies
130
- @browser.config.before_request[:clear_and_set_cookies] = true
131
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_and_set_cookies"
132
- else
133
- logger.error "BrowserBuilder (poltergeist_phantomjs): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
134
- end
135
- end
136
-
137
- # before_request change_user_agent
138
- if @config.dig(:before_request, :change_user_agent)
139
- if @config[:user_agent].present? && @config[:user_agent].class == Proc
140
- @browser.config.user_agent = @config[:user_agent]
141
- @browser.config.before_request[:change_user_agent] = true
142
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_user_agent"
143
- else
144
- logger.error "BrowserBuilder (poltergeist_phantomjs): user_agent should be present and has lambda format to enable before_request.change_user_agent, skipped"
145
- end
146
- end
147
-
148
- # before_request change_proxy
149
- if @config.dig(:before_request, :change_proxy)
150
- if @config[:proxy].present? && @config[:proxy].class == Proc
151
- @browser.config.proxy = @config[:proxy]
152
- @browser.config.before_request[:change_proxy] = true
153
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_proxy"
154
- else
155
- logger.error "BrowserBuilder (poltergeist_phantomjs): proxy should be present and has lambda format to enable before_request.change_proxy, skipped"
156
- end
157
- end
158
-
159
- # before_request delay
160
- if delay = @config.dig(:before_request, :delay).presence
161
- @browser.config.before_request[:delay] = delay
162
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.delay"
163
- end
164
-
165
- # encoding
166
- if encoding = @config[:encoding]
167
- @browser.config.encoding = encoding
168
- logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled encoding: #{encoding}"
169
- end
170
-
171
- # return Capybara session instance
172
- @browser
173
- end
174
- end
175
- end
@@ -1,13 +0,0 @@
1
- require_relative '../driver/base'
2
-
3
- module Capybara::Poltergeist
4
- class Driver
5
- def pid
6
- client_pid
7
- end
8
-
9
- def port
10
- server.port
11
- end
12
- end
13
- end
@@ -1,71 +0,0 @@
1
- require 'cliver'
2
-
3
- module Kimurai
4
- class CLI
5
- class AnsibleCommandBuilder
6
- def initialize(user_host, options, playbook:, vars: {})
7
- @user_host = user_host
8
- @options = options
9
- @playbook = playbook
10
- @vars = vars
11
- end
12
-
13
- def get
14
- unless Cliver.detect("ansible-playbook")
15
- raise "Can't find `ansible-playbook` executable, to install: " \
16
- "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
17
- end
18
-
19
- user = @user_host[/(.*?)\@/, 1]
20
- host = @user_host[/\@(.+)/, 1] || @user_host
21
- inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"
22
-
23
- gem_dir = Gem::Specification.find_by_name("kimurai").gem_dir
24
- playbook_path = gem_dir + "/lib/kimurai/automation/" + "#{@playbook}.yml"
25
-
26
- command = [
27
- "ansible-playbook", playbook_path,
28
- "--inventory", inventory,
29
- "--ssh-extra-args", "-oForwardAgent=yes",
30
- "--connection", @options["local"] ? "local" : "smart",
31
- "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
32
- ]
33
-
34
- if File.exists? "config/automation.yml"
35
- require 'yaml'
36
- if config = YAML.load_file("config/automation.yml").dig(@playbook)
37
- config.each { |key, value| @vars[key] = value unless @vars[key] }
38
- end
39
- end
40
-
41
- @vars.each do |key, value|
42
- next unless value.present?
43
- command.push "--extra-vars", "#{key}=#{value}"
44
- end
45
-
46
- if user
47
- command.push "--user", user
48
- end
49
-
50
- if @options["ask-sudo"]
51
- command.push "--ask-become-pass"
52
- end
53
-
54
- if @options["ask-auth-pass"]
55
- unless Cliver.detect("sshpass")
56
- raise "Can't find `sshpass` executable for password authentication, to install: " \
57
- "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
58
- end
59
-
60
- command.push "--ask-pass"
61
- end
62
-
63
- if ssh_key_path = @options["ssh-key-path"]
64
- command.push "--private-key", ssh_key_path
65
- end
66
-
67
- command
68
- end
69
- end
70
- end
71
- end
@@ -1,13 +0,0 @@
1
- # software versions to install for `setup` command
2
- setup:
3
- ruby: 2.5.1
4
- # check latest here http://phantomjs.org/download.html
5
- phantomjs: 2.1.1
6
- # check latest here https://github.com/mozilla/geckodriver/releases/
7
- geckodriver: 0.21.0
8
- # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
9
- chromedriver: 2.39
10
- # settings for deploy command, you can use cli options as well (--repo-url, --git-key-path)
11
- deploy:
12
- # repo_url: git@bitbucket.org:username/repo_name.git
13
- # repo_key_path: ~/.ssh/id_rsa