tanakai 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +12 -0
  3. data/.travis.yml +5 -0
  4. data/CHANGELOG.md +118 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +2038 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/tanakai +6 -0
  12. data/lib/tanakai/automation/deploy.yml +54 -0
  13. data/lib/tanakai/automation/setup/chromium_chromedriver.yml +26 -0
  14. data/lib/tanakai/automation/setup/firefox_geckodriver.yml +20 -0
  15. data/lib/tanakai/automation/setup/phantomjs.yml +33 -0
  16. data/lib/tanakai/automation/setup/ruby_environment.yml +124 -0
  17. data/lib/tanakai/automation/setup.yml +45 -0
  18. data/lib/tanakai/base/saver.rb +106 -0
  19. data/lib/tanakai/base/storage.rb +54 -0
  20. data/lib/tanakai/base.rb +326 -0
  21. data/lib/tanakai/base_helper.rb +22 -0
  22. data/lib/tanakai/browser_builder/apparition_builder.rb +58 -0
  23. data/lib/tanakai/browser_builder/cuprite_builder.rb +54 -0
  24. data/lib/tanakai/browser_builder/mechanize_builder.rb +154 -0
  25. data/lib/tanakai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
  26. data/lib/tanakai/browser_builder/selenium_chrome_builder.rb +199 -0
  27. data/lib/tanakai/browser_builder/selenium_firefox_builder.rb +204 -0
  28. data/lib/tanakai/browser_builder.rb +20 -0
  29. data/lib/tanakai/capybara_configuration.rb +10 -0
  30. data/lib/tanakai/capybara_ext/apparition/driver.rb +13 -0
  31. data/lib/tanakai/capybara_ext/cuprite/driver.rb +13 -0
  32. data/lib/tanakai/capybara_ext/driver/base.rb +62 -0
  33. data/lib/tanakai/capybara_ext/mechanize/driver.rb +71 -0
  34. data/lib/tanakai/capybara_ext/poltergeist/driver.rb +13 -0
  35. data/lib/tanakai/capybara_ext/selenium/driver.rb +34 -0
  36. data/lib/tanakai/capybara_ext/session/config.rb +22 -0
  37. data/lib/tanakai/capybara_ext/session.rb +249 -0
  38. data/lib/tanakai/cli/ansible_command_builder.rb +71 -0
  39. data/lib/tanakai/cli/generator.rb +57 -0
  40. data/lib/tanakai/cli.rb +183 -0
  41. data/lib/tanakai/core_ext/array.rb +14 -0
  42. data/lib/tanakai/core_ext/hash.rb +5 -0
  43. data/lib/tanakai/core_ext/numeric.rb +19 -0
  44. data/lib/tanakai/core_ext/string.rb +7 -0
  45. data/lib/tanakai/pipeline.rb +33 -0
  46. data/lib/tanakai/runner.rb +60 -0
  47. data/lib/tanakai/template/.gitignore +18 -0
  48. data/lib/tanakai/template/Gemfile +28 -0
  49. data/lib/tanakai/template/README.md +3 -0
  50. data/lib/tanakai/template/config/application.rb +37 -0
  51. data/lib/tanakai/template/config/automation.yml +13 -0
  52. data/lib/tanakai/template/config/boot.rb +22 -0
  53. data/lib/tanakai/template/config/initializers/.keep +0 -0
  54. data/lib/tanakai/template/config/schedule.rb +57 -0
  55. data/lib/tanakai/template/db/.keep +0 -0
  56. data/lib/tanakai/template/helpers/application_helper.rb +3 -0
  57. data/lib/tanakai/template/lib/.keep +0 -0
  58. data/lib/tanakai/template/log/.keep +0 -0
  59. data/lib/tanakai/template/pipelines/saver.rb +11 -0
  60. data/lib/tanakai/template/pipelines/validator.rb +24 -0
  61. data/lib/tanakai/template/spiders/application_spider.rb +143 -0
  62. data/lib/tanakai/template/tmp/.keep +0 -0
  63. data/lib/tanakai/version.rb +3 -0
  64. data/lib/tanakai.rb +54 -0
  65. data/tanakai.gemspec +50 -0
  66. metadata +382 -0
data/Rakefile ADDED
@@ -0,0 +1,10 @@
require "bundler/gem_tasks"
require "rake/testtask"

# `rake test` runs every file matching test/**/*_test.rb with the test/ and
# lib/ directories added to the load path.
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
end

# Plain `rake` runs the test suite.
task default: :test
data/bin/console ADDED
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby

# Interactive console with the gem already loaded.

require "bundler/setup"
require "tanakai"

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier.
#
# To use Pry instead of IRB, add pry to the Gemfile and swap the two lines
# at the bottom for:
#   require "pry"
#   Pry.start

require "irb"
IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Development bootstrap: installs the gem's dependencies with Bundler.

# Abort on command errors, on unset variables and on failures inside pipelines.
set -euo pipefail
# Split words on newlines and tabs only.
IFS=$'\n\t'
# Print each line as it is read (-v) and each command as it is executed (-x).
set -vx

bundle install

# Do any other automated setup that you need to do here
data/exe/tanakai ADDED
@@ -0,0 +1,6 @@
#!/usr/bin/env ruby

# Command-line entry point: loads the library and its CLI class, then passes
# the process arguments to the CLI.

require 'tanakai'
require 'tanakai/cli'

Tanakai::CLI.start(ARGV)
@@ -0,0 +1,54 @@
---
# Deploy playbook: clones (or updates) the project repository into the remote
# user's home directory, runs `bundle install` there and then
# `whenever --update-crontab`.
#
# repo_url, repo_name and repo_key_path are declared empty below; presumably
# they are supplied as extra vars by the caller. repo_key_path is optional.
- hosts: all
  vars:
    rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
    rbenv_shims_path: "{{ rbenv_root_path }}/shims"
    repo_url:
    repo_name:
    repo_key_path:

  tasks:
    - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using ssh-agent forwarding or https)
      when: repo_key_path is none
      git:
        repo: "{{ repo_url }}"
        dest: "~/{{ repo_name }}"
        force: true
        accept_hostkey: true

    # The key is handled inside a block so that the `always` section removes
    # it from the host even when the clone fails; otherwise a failed clone
    # would leave the private key behind in /tmp.
    - name: Clone/pull project repo using custom git ssh key (if provided)
      when: repo_key_path is not none
      block:
        - name: Copy custom git ssh key to /tmp/private_key
          copy:
            src: "{{ repo_key_path }}"
            dest: /tmp/private_key
            # Quoted so the mode is never reinterpreted as a plain number.
            mode: "0600"

        - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using custom git ssh key)
          git:
            repo: "{{ repo_url }}"
            dest: "~/{{ repo_name }}"
            force: true
            accept_hostkey: true
            key_file: /tmp/private_key
      always:
        - name: Delete custom git ssh key from /tmp/private_key
          file:
            state: absent
            path: /tmp/private_key

    - name: Run bundle install
      command: bundle install
      args:
        chdir: ~/{{ repo_name }}
      environment:
        # Put rbenv and its shims first so the rbenv-managed Ruby is used.
        PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

    - name: Run whenever to update crontab
      command: whenever --update-crontab
      args:
        chdir: ~/{{ repo_name }}
      environment:
        PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
54
+
@@ -0,0 +1,26 @@
---
# Installs the Chromium browser and the chromedriver release named by the
# `chromedriver` variable (not defined in this file; it must come from the
# importing play).
- name: Install chromium browser
  apt:
    pkg: chromium-browser
    state: present

# A failure here (for example chromedriver not installed yet) is ignored;
# the registered result is only used to decide whether to download.
- name: Get current chromedriver version
  shell: chromedriver --version
  args:
    executable: /bin/bash
  register: current_chromedriver_version
  changed_when: false
  ignore_errors: true

- name: Install unzip tool to unarchive chromedriver archive
  apt:
    pkg: unzip
    state: present

- name: Download chromedriver binary archive and unarchive it to /usr/local/bin
  unarchive:
    src: https://chromedriver.storage.googleapis.com/{{ chromedriver }}/chromedriver_linux64.zip
    dest: /usr/local/bin
    remote_src: true
    mode: a+x
  # Substring test against the full version output. Testing membership in
  # stdout_lines compared the version with whole lines, which never matched,
  # so the archive was downloaded on every run. The cast to string keeps the
  # test valid when the version is given as an unquoted number.
  when: (chromedriver | string) not in current_chromedriver_version.stdout
@@ -0,0 +1,20 @@
---
# Installs Firefox and the geckodriver release named by the `geckodriver`
# variable (not defined in this file; it must come from the importing play).
- name: Install firefox
  apt:
    pkg: firefox
    state: present

# A failure here (for example geckodriver not installed yet) is ignored;
# the registered result is only used to decide whether to download.
- name: Get current geckodriver version
  shell: geckodriver --version
  args:
    executable: /bin/bash
  register: current_geckodriver_version
  changed_when: false
  ignore_errors: true

- name: Download geckodriver binary archive and unarchive it to /usr/local/bin
  unarchive:
    src: https://github.com/mozilla/geckodriver/releases/download/v{{ geckodriver }}/geckodriver-v{{ geckodriver }}-linux64.tar.gz
    dest: /usr/local/bin
    remote_src: true
  # Cast to string so the substring test stays valid even when the version
  # is supplied as an unquoted number.
  when: (geckodriver | string) not in current_geckodriver_version.stdout
@@ -0,0 +1,33 @@
---
# Installs the PhantomJS release named by the `phantomjs` variable (not
# defined in this file; it must come from the importing play) and links its
# binary into /usr/local/bin.
- name: Install dependencies for PhantomJS
  apt:
    # Passed as one list so apt installs everything in a single transaction
    # instead of relying on a per-item loop.
    pkg:
      - chrpath
      - libxft-dev
      - libfreetype6
      - libfreetype6-dev
      - libfontconfig1
      - libfontconfig1-dev
    state: present

# A failure here (for example phantomjs not installed yet) is ignored;
# the registered result is only used to decide whether to download.
- name: Get current phantomjs version
  shell: phantomjs -v
  args:
    executable: /bin/bash
  register: current_phantomjs_version
  changed_when: false
  ignore_errors: true

- name: Download phantomJS archive and unarchive it to /usr/local/lib
  unarchive:
    src: https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-{{ phantomjs }}-linux-x86_64.tar.bz2
    dest: /usr/local/lib
    remote_src: true
  # Cast to string so the substring test stays valid even when the version
  # is supplied as an unquoted number.
  when: (phantomjs | string) not in current_phantomjs_version.stdout

- name: Link PhantomJS binary to /usr/local/bin/phantomjs
  file:
    src: /usr/local/lib/phantomjs-{{ phantomjs }}-linux-x86_64/bin/phantomjs
    dest: /usr/local/bin/phantomjs
    state: link
@@ -0,0 +1,124 @@
---
# Sets up a per-user Ruby environment with rbenv and ruby-build, then installs
# the bundler, whenever and tanakai gems.
#
# Uses the variables `ruby`, `rbenv_root_path` and `ruby_versions_path`, none
# of which are defined in this file; they must come from the importing play.
- name: Install dependencies for ruby-build
  become: true
  apt:
    # Passed as one list so apt installs everything in a single transaction
    # instead of relying on a per-item loop.
    pkg:
      - zlib1g-dev
      - build-essential
      - libssl-dev
      - libreadline-dev
      - libreadline6-dev
      - libyaml-dev
      - libxml2-dev
      - libxslt1-dev
      - libcurl4-openssl-dev
      - libffi-dev
    state: present

- name: Clone Rbenv repository to the {{ ansible_user_id }} user directory
  git:
    repo: https://github.com/sstephenson/rbenv.git
    dest: "{{ rbenv_root_path }}"

- name: Clone ruby-build repo to the {{ ansible_user_id }} user directory
  git:
    repo: https://github.com/sstephenson/ruby-build.git
    dest: "{{ rbenv_root_path }}/plugins/ruby-build"

- name: Add Rbenv path to the .bashrc
  lineinfile:
    dest: ~/.bashrc
    regexp: '^export PATH="\$HOME\/\.rbenv'
    line: export PATH="$HOME/.rbenv/bin:$PATH"
    state: present

- name: Add Rbenv init to the .bashrc
  lineinfile:
    dest: ~/.bashrc
    regexp: '^eval "\$\(rbenv'
    line: eval "$(rbenv init -)"
    state: present

- name: Check if desired Ruby version already installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}"
  register: ruby_present

- name: Install desired Ruby version using ruby-build (this can take a while)
  command: rbenv install {{ ruby }}
  when: not ruby_present.stat.exists
  environment:
    CONFIGURE_OPTS: "--disable-install-doc"
    # rbenv and its shims are not on the default PATH of the task, so they
    # are prepended explicitly here and in the tasks below.
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Get current Ruby version
  command: "ruby -v"
  register: current_ruby_version
  changed_when: false
  ignore_errors: true
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Set desired Ruby version as a global version
  command: "rbenv global {{ ruby }}"
  when: ruby not in current_ruby_version.stdout
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
  register: set_ruby

- name: Execute `rbenv rehash` command
  command: rbenv rehash
  when: set_ruby.changed
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Create ~/.gemrc file to skip docs
  copy:
    dest: ~/.gemrc
    # --no-document replaces the --no-ri/--no-rdoc pair, which newer RubyGems
    # releases no longer accept.
    content: "gem: --no-document"

- name: Create ~/.bundle directory
  file:
    dest: ~/.bundle
    state: directory

- name: Create ~/.bundle/config file with default settings `BUNDLE_GIT__ALLOW_INSECURE true` and `BUNDLE_JOBS 4`
  copy:
    dest: ~/.bundle/config
    content: |
      BUNDLE_GIT__ALLOW_INSECURE: "true"
      BUNDLE_JOBS: "4"

# Each gem below is treated as installed when its executable exists in the
# bin directory of the selected Ruby version.
- name: Check if Bundler gem installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/bundler"
  register: bundler_gem_present

- name: Install Bundler gem
  command: gem install bundler
  when: not bundler_gem_present.stat.exists
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Check if Whenever gem installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/whenever"
  register: whenever_gem_present

- name: Install Whenever gem
  command: gem install whenever
  when: not whenever_gem_present.stat.exists
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"

- name: Check if Tanakai gem installed
  stat:
    path: "{{ ruby_versions_path }}/{{ ruby }}/bin/tanakai"
  register: tanakai_gem_present

- name: Install Tanakai gem
  command: gem install tanakai
  when: not tanakai_gem_present.stat.exists
  environment:
    PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
@@ -0,0 +1,45 @@
---
# Server setup playbook: installs base system packages, then imports the task
# files that set up the Ruby environment, PhantomJS, Firefox/geckodriver and
# Chromium/chromedriver.
- hosts: all
  vars:
    # Versions are quoted so they always stay strings. An unquoted two-part
    # version such as 2.44 is read as a number, which can be reformatted
    # (2.40 becomes 2.4) and cannot be substring-matched against command
    # output.
    ruby: "2.5.3"
    rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
    rbenv_shims_path: "{{ rbenv_root_path }}/shims"
    ruby_versions_path: "{{ rbenv_root_path }}/versions"
    # check latest here http://phantomjs.org/download.html
    phantomjs: "2.1.1"
    # check latest here https://github.com/mozilla/geckodriver/releases/
    geckodriver: "0.23.0"
    # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
    chromedriver: "2.44"

  tasks:
    - name: Update apt cache
      become: true
      apt:
        update_cache: true
        # Skip the refresh when the cache is younger than one day (seconds).
        cache_valid_time: 86400

    - name: Install base packages
      become: true
      apt:
        # Passed as one list so apt installs everything in a single
        # transaction instead of relying on a per-item loop.
        pkg:
          - git
          - xvfb
          - libsqlite3-dev
          - sqlite3
          - mongodb-clients
          - mysql-client
          - libmysqlclient-dev
          - postgresql-client
          - libpq-dev
        state: present

    # The Ruby task file sets `become` itself only where it needs it; the
    # browser task files run entirely with privilege escalation.
    - import_tasks: setup/ruby_environment.yml

    - import_tasks: setup/phantomjs.yml
      become: true

    - import_tasks: setup/firefox_geckodriver.yml
      become: true

    - import_tasks: setup/chromium_chromedriver.yml
      become: true
@@ -0,0 +1,106 @@
require 'json'
require 'csv'

module Tanakai
  class Base
    # Writes items to a single file, one item per #save call, in one of the
    # supported formats (see FORMATS). Every #save call runs inside a mutex,
    # so calls on one instance are serialized.
    class Saver
      # Output formats accepted by #initialize.
      FORMATS = %i[json pretty_json jsonlines csv].freeze

      attr_reader :format, :path, :position, :append

      # @param path [String] file to write to
      # @param format [Symbol] one of FORMATS
      # @param position [Boolean] when truthy, every saved item gets a
      #   :position key holding its 1-based index within this saver
      # @param append [Boolean] when truthy and the file already exists, the
      #   first saved item is added to the existing file instead of
      #   replacing it
      # @raise [RuntimeError] if format is not one of FORMATS
      def initialize(path, format:, position: true, append: false)
        unless FORMATS.include?(format)
          raise "SimpleSaver: wrong type of format: #{format}"
        end

        @path = path
        @format = format
        @position = position
        @index = 0
        @append = append
        @mutex = Mutex.new
      end

      # Writes one item to the file in the configured format.
      #
      # Note that the item itself is modified: when position is enabled a
      # :position key is added to it before it is written.
      #
      # @param item [Hash] the item to persist
      def save(item)
        @mutex.synchronize do
          @index += 1
          item[:position] = @index if position

          case format
          when :json
            save_to_json(item)
          when :pretty_json
            save_to_pretty_json(item)
          when :jsonlines
            save_to_jsonlines(item)
          when :csv
            save_to_csv(item)
          end
        end
      end

      private

      # True when the current item must be added to existing file content:
      # either an earlier item was already written by this saver, or append
      # mode is on and the file is already there.
      #
      # Uses File.exist?; the File.exists? alias is not available on Ruby 3.2
      # and later.
      def append_to_file?
        @index > 1 || (append && File.exist?(path))
      end

      # Keeps the file a single JSON array. When adding to existing content,
      # the whole file is read, its closing "}]" is turned into "}," and the
      # new element is written after it without its opening "[".
      def save_to_json(item)
        data = JSON.generate([item])

        if append_to_file?
          file_content = File.read(path).sub(/\}\]\Z/, "},")
          File.write(path, file_content + data.sub(/\A\[/, ""))
        else
          File.write(path, data)
        end
      end

      # Same approach as #save_to_json, using the pretty-printed layout where
      # the closing brace and bracket sit on separate lines.
      def save_to_pretty_json(item)
        data = JSON.pretty_generate([item])

        if append_to_file?
          file_content = File.read(path).sub(/\}\n\]\Z/, "},\n")
          File.write(path, file_content + data.sub(/\A\[\n/, ""))
        else
          File.write(path, data)
        end
      end

      # One JSON document per line; lines are separated by (not terminated
      # with) a newline.
      def save_to_jsonlines(item)
        data = JSON.generate(item)

        if append_to_file?
          File.open(path, "a") { |file| file.write("\n" + data) }
        else
          File.open(path, "w") { |file| file.write(data) }
        end
      end

      # Writes the flattened item as a fully quoted CSV row. The header row
      # is written once, from the keys of the first item written to a new
      # file; later rows contain values only.
      def save_to_csv(item)
        data = flatten_hash(item)

        if append_to_file?
          CSV.open(path, "a+", force_quotes: true) do |csv|
            csv << data.values
          end
        else
          CSV.open(path, "w", force_quotes: true) do |csv|
            csv << data.keys
            csv << data.values
          end
        end
      end

      # Flattens nested hashes into one level, joining nested keys with a
      # dot, e.g. { a: { b: 1 } } becomes { "a.b" => 1 }. Top-level keys are
      # converted to strings (a nil key stays nil).
      def flatten_hash(hash)
        hash.each_with_object({}) do |(k, v), h|
          if v.is_a?(Hash)
            flatten_hash(v).each { |h_k, h_v| h["#{k}.#{h_k}"] = h_v }
          else
            h[k&.to_s] = v
          end
        end
      end
    end
  end
end
105
+
106
+
@@ -0,0 +1,54 @@
module Tanakai
  class Base
    # In-memory store of values grouped by scope. Each scope maps to an
    # array that holds every value at most once. All public methods run
    # inside a mutex.
    class Storage
      attr_reader :database

      def initialize
        @mutex = Mutex.new
        @database = {}
      end

      # Returns the values stored under +scope+ (an empty array for an
      # unknown scope), or the whole scope => values hash when no scope is
      # given.
      def all(scope = nil)
        @mutex.synchronize do
          if scope
            database.fetch(scope, [])
          else
            database
          end
        end
      end

      # Tells whether +value+ is already stored under +scope+. The scope is
      # created (empty) if it did not exist yet.
      def include?(scope, value)
        @mutex.synchronize do
          bucket = (database[scope] ||= [])
          bucket.include?(value)
        end
      end

      # Stores +value+ under +scope+. An array argument is merged element by
      # element; duplicates are dropped either way.
      def add(scope, value)
        @mutex.synchronize do
          bucket = (database[scope] ||= [])
          if value.is_a?(Array)
            # Builds a new array for the scope rather than changing the
            # existing one in place.
            database[scope] = bucket + value
            database[scope].uniq!
          else
            bucket.push(value) unless bucket.include?(value)
          end
        end
      end

      ###

      # Returns true and stores +value+ when it was not yet present under
      # +scope+; returns false (storing nothing) when it was.
      def unique?(scope, value)
        @mutex.synchronize do
          bucket = (database[scope] ||= [])
          if bucket.include?(value)
            false
          else
            bucket.push(value)
            true
          end
        end
      end

      ###

      # Drops every scope and value.
      def clear!
        @mutex.synchronize do
          @database = {}
        end
      end
    end
  end
end