kimurai_dynamic 1.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.travis.yml +5 -0
  4. data/CHANGELOG.md +111 -0
  5. data/Gemfile +6 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +2038 -0
  8. data/Rakefile +10 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/exe/kimurai +6 -0
  12. data/kimurai.gemspec +48 -0
  13. data/lib/kimurai/automation/deploy.yml +54 -0
  14. data/lib/kimurai/automation/setup/chromium_chromedriver.yml +26 -0
  15. data/lib/kimurai/automation/setup/firefox_geckodriver.yml +20 -0
  16. data/lib/kimurai/automation/setup/phantomjs.yml +33 -0
  17. data/lib/kimurai/automation/setup/ruby_environment.yml +124 -0
  18. data/lib/kimurai/automation/setup.yml +45 -0
  19. data/lib/kimurai/base/saver.rb +106 -0
  20. data/lib/kimurai/base/storage.rb +54 -0
  21. data/lib/kimurai/base.rb +330 -0
  22. data/lib/kimurai/base_helper.rb +22 -0
  23. data/lib/kimurai/browser_builder/mechanize_builder.rb +154 -0
  24. data/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb +175 -0
  25. data/lib/kimurai/browser_builder/selenium_chrome_builder.rb +199 -0
  26. data/lib/kimurai/browser_builder/selenium_firefox_builder.rb +204 -0
  27. data/lib/kimurai/browser_builder.rb +20 -0
  28. data/lib/kimurai/capybara_configuration.rb +10 -0
  29. data/lib/kimurai/capybara_ext/driver/base.rb +62 -0
  30. data/lib/kimurai/capybara_ext/mechanize/driver.rb +71 -0
  31. data/lib/kimurai/capybara_ext/poltergeist/driver.rb +13 -0
  32. data/lib/kimurai/capybara_ext/selenium/driver.rb +34 -0
  33. data/lib/kimurai/capybara_ext/session/config.rb +22 -0
  34. data/lib/kimurai/capybara_ext/session.rb +249 -0
  35. data/lib/kimurai/cli/ansible_command_builder.rb +71 -0
  36. data/lib/kimurai/cli/generator.rb +57 -0
  37. data/lib/kimurai/cli.rb +183 -0
  38. data/lib/kimurai/core_ext/array.rb +14 -0
  39. data/lib/kimurai/core_ext/hash.rb +5 -0
  40. data/lib/kimurai/core_ext/numeric.rb +19 -0
  41. data/lib/kimurai/core_ext/string.rb +7 -0
  42. data/lib/kimurai/pipeline.rb +33 -0
  43. data/lib/kimurai/runner.rb +60 -0
  44. data/lib/kimurai/template/.gitignore +18 -0
  45. data/lib/kimurai/template/Gemfile +28 -0
  46. data/lib/kimurai/template/README.md +3 -0
  47. data/lib/kimurai/template/config/application.rb +37 -0
  48. data/lib/kimurai/template/config/automation.yml +13 -0
  49. data/lib/kimurai/template/config/boot.rb +22 -0
  50. data/lib/kimurai/template/config/initializers/.keep +0 -0
  51. data/lib/kimurai/template/config/schedule.rb +57 -0
  52. data/lib/kimurai/template/db/.keep +0 -0
  53. data/lib/kimurai/template/helpers/application_helper.rb +3 -0
  54. data/lib/kimurai/template/lib/.keep +0 -0
  55. data/lib/kimurai/template/log/.keep +0 -0
  56. data/lib/kimurai/template/pipelines/saver.rb +11 -0
  57. data/lib/kimurai/template/pipelines/validator.rb +24 -0
  58. data/lib/kimurai/template/spiders/application_spider.rb +143 -0
  59. data/lib/kimurai/template/tmp/.keep +0 -0
  60. data/lib/kimurai/version.rb +3 -0
  61. data/lib/kimurai.rb +54 -0
  62. metadata +349 -0
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
require "bundler/gem_tasks"
require "rake/testtask"

# Registers the `test` task: puts test/ and lib/ on the load path and runs
# every file matching test/**/*_test.rb.
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
end

# A bare `rake` invocation runs the test task.
task default: :test
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "kimurai"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/exe/kimurai ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'kimurai'
4
+ require 'kimurai/cli'
5
+
6
+ Kimurai::CLI.start(ARGV)
data/kimurai.gemspec ADDED
@@ -0,0 +1,48 @@
1
+
2
# Put this gem's lib/ directory on the load path so the version constant
# can be required below.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "kimurai/version"

# Runtime dependencies as [name, *version requirements], in declaration order.
runtime_dependencies = [
  ["thor"],
  ["cliver"],
  ["activesupport"],
  ["murmurhash3"],
  ["nokogiri"],

  ["capybara", ">= 2.15", "< 4.0"],
  ["capybara-mechanize"],
  ["poltergeist"],
  ["selenium-webdriver"],

  ["headless"],
  ["pmap"],

  ["whenever"],

  ["rbcat", "~> 0.2"],
  ["pry"]
]

# Development-only dependencies, same layout as above.
development_dependencies = [
  ["bundler", "~> 1.16"],
  ["rake", "~> 10.0"],
  ["minitest", "~> 5.0"]
]

Gem::Specification.new do |gem|
  gem.name          = "kimurai_dynamic"
  gem.version       = Kimurai::VERSION
  gem.authors       = ["Mehmet Celik"]
  gem.email         = ['mehmetcelik4@gmail.com']

  gem.summary       = "Modern web scraping framework written in Ruby and based on Capybara/Nokogiri, with dynamic crawl option"
  gem.homepage      = "https://github.com/MehmetCelik4/kimuraframework"
  gem.license       = "MIT"

  # Package every file tracked by git, except anything under test/, spec/
  # or features/.
  gem.files = Dir.chdir(File.expand_path('..', __FILE__)) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |file| file.match(%r{^(test|spec|features)/}) }
  end
  gem.bindir        = "exe"
  gem.executables   = "kimurai"
  gem.require_paths = ["lib"]
  gem.required_ruby_version = ">= 2.5.0"

  runtime_dependencies.each do |name, *requirements|
    gem.add_dependency(name, *requirements)
  end

  development_dependencies.each do |name, *requirements|
    gem.add_development_dependency(name, *requirements)
  end
end
@@ -0,0 +1,54 @@
1
+ ---
2
+ - hosts: all
3
+ vars:
4
+ rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
5
+ rbenv_shims_path: "{{ rbenv_root_path }}/shims"
6
+ repo_url:
7
+ repo_name:
8
+ repo_key_path:
9
+
10
+ tasks:
11
+ - name: Copy custom git ssh key to /tmp/private_key (if provided)
12
+ when: repo_key_path is not none
13
+ copy:
14
+ src: "{{ repo_key_path }}"
15
+ dest: /tmp/private_key
16
+ mode: 0600
17
+
18
+ - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using ssh-agent forwarding or https)
19
+ when: repo_key_path is none
20
+ git:
21
+ repo: "{{ repo_url }}"
22
+ dest: "~/{{ repo_name }}"
23
+ force: true
24
+ accept_hostkey: true
25
+
26
+ - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using custom git ssh key)
27
+ when: repo_key_path is not none
28
+ git:
29
+ repo: "{{ repo_url }}"
30
+ dest: "~/{{ repo_name }}"
31
+ force: true
32
+ accept_hostkey: true
33
+ key_file: /tmp/private_key
34
+
35
+ - name: Delete custom git ssh key from /tmp/private_key (if provided)
36
+ when: repo_key_path is not none
37
+ file:
38
+ state: absent
39
+ path: /tmp/private_key
40
+
41
+ - name: Run bundle install
42
+ command: bundle install
43
+ args:
44
+ chdir: ~/{{ repo_name }}
45
+ environment:
46
+ PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
47
+
48
+ - name: Run whenever to update crontab
49
+ command: whenever --update-crontab
50
+ args:
51
+ chdir: ~/{{ repo_name }}
52
+ environment:
53
+ PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
54
+
@@ -0,0 +1,26 @@
1
+ ---
2
+ - name: Install chromium browser
3
+ apt:
4
+ pkg: chromium-browser
5
+ state: present
6
+
7
+ - name: Get current chromedriver version
8
+ shell: chromedriver --version
9
+ args:
10
+ executable: /bin/bash
11
+ register: current_chromedriver_version
12
+ changed_when: false
13
+ ignore_errors: true
14
+
15
+ - name: Install unzip tool to unarchive chromedriver archive
16
+ apt:
17
+ pkg: unzip
18
+ state: present
19
+
20
# Fetch the chromedriver release pinned by the `chromedriver` variable and
# unpack it into /usr/local/bin, unless the version string reported by the
# earlier `chromedriver --version` task already contains the pinned version.
- name: Download chromedriver binary archive and unarchive it to /usr/local/bin
  unarchive:
    src: https://chromedriver.storage.googleapis.com/{{ chromedriver }}/chromedriver_linux64.zip
    dest: /usr/local/bin
    remote_src: true
    mode: a+x
  # Substring match against the whole output. Testing membership in
  # `stdout_lines` compares against entire lines, so a bare version number
  # never matched and the archive was fetched on every run. The `string`
  # filter keeps the test valid if the variable was given as a YAML number.
  when: (chromedriver | string) not in current_chromedriver_version.stdout
@@ -0,0 +1,20 @@
1
+ ---
2
+ - name: Install firefox
3
+ apt:
4
+ pkg: firefox
5
+ state: present
6
+
7
+ - name: Get current geckodriver version
8
+ shell: geckodriver --version
9
+ args:
10
+ executable: /bin/bash
11
+ register: current_geckodriver_version
12
+ changed_when: false
13
+ ignore_errors: true
14
+
15
+ - name: Download geckodriver binary archive and unarchive it to /usr/local/bin
16
+ unarchive:
17
+ src: https://github.com/mozilla/geckodriver/releases/download/v{{ geckodriver }}/geckodriver-v{{ geckodriver }}-linux64.tar.gz
18
+ dest: /usr/local/bin
19
+ remote_src: true
20
+ when: geckodriver not in current_geckodriver_version.stdout
@@ -0,0 +1,33 @@
1
+ ---
2
+ - name: Install dependencies for PhantomJS
3
+ apt:
4
+ pkg: "{{ item }}"
5
+ state: present
6
+ with_items:
7
+ - chrpath
8
+ - libxft-dev
9
+ - libfreetype6
10
+ - libfreetype6-dev
11
+ - libfontconfig1
12
+ - libfontconfig1-dev
13
+
14
+ - name: Get current phantomjs version
15
+ shell: phantomjs -v
16
+ args:
17
+ executable: /bin/bash
18
+ register: current_phantomjs_version
19
+ changed_when: false
20
+ ignore_errors: true
21
+
22
+ - name: Download phantomJS archive and unarchive it to /usr/local/lib
23
+ unarchive:
24
+ src: https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-{{ phantomjs }}-linux-x86_64.tar.bz2
25
+ dest: /usr/local/lib
26
+ remote_src: true
27
+ when: phantomjs not in current_phantomjs_version.stdout
28
+
29
+ - name: Link PhantomJS binary to /usr/local/bin/phantomjs
30
+ file:
31
+ src: /usr/local/lib/phantomjs-{{ phantomjs }}-linux-x86_64/bin/phantomjs
32
+ dest: /usr/local/bin/phantomjs
33
+ state: link
@@ -0,0 +1,124 @@
1
+ ---
2
+ - name: Install dependencies for ruby-build
3
+ become: true
4
+ apt:
5
+ pkg: "{{ item }}"
6
+ state: present
7
+ with_items:
8
+ - zlib1g-dev
9
+ - build-essential
10
+ - libssl-dev
11
+ - libreadline-dev
12
+ - libreadline6-dev
13
+ - libyaml-dev
14
+ - libxml2-dev
15
+ - libxslt1-dev
16
+ - libcurl4-openssl-dev
17
+ - libffi-dev
18
+
19
+ - name: Clone Rbenv repository to the {{ ansible_user_id }} user directory
20
+ git:
21
+ repo: https://github.com/sstephenson/rbenv.git
22
+ dest: "{{ rbenv_root_path }}"
23
+
24
+ - name: Clone ruby-build repo to the {{ ansible_user_id }} user directory
25
+ git:
26
+ repo: https://github.com/sstephenson/ruby-build.git
27
+ dest: "{{ rbenv_root_path }}/plugins/ruby-build"
28
+
29
+ - name: Add Rbenv path to the .bashrc
30
+ lineinfile:
31
+ dest: ~/.bashrc
32
+ regexp: '^export PATH="\$HOME\/\.rbenv'
33
+ line: export PATH="$HOME/.rbenv/bin:$PATH"
34
+ state: present
35
+
36
+ - name: Add Rbenv init to the .bashrc
37
+ lineinfile:
38
+ dest: ~/.bashrc
39
+ regexp: '^eval "\$\(rbenv'
40
+ line: eval "$(rbenv init -)"
41
+ state: present
42
+
43
+ - name: Check if desired Ruby version already installed
44
+ stat:
45
+ path: "{{ ruby_versions_path }}/{{ ruby }}"
46
+ register: ruby_present
47
+
48
+ - name: Install desired Ruby version using ruby-build (this can take a while)
49
+ command: rbenv install {{ ruby }}
50
+ when: not ruby_present.stat.exists
51
+ environment:
52
+ CONFIGURE_OPTS: "--disable-install-doc"
53
+ PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
54
+
55
+ - name: Get current Ruby version
56
+ command: "ruby -v"
57
+ register: current_ruby_version
58
+ changed_when: false
59
+ ignore_errors: true
60
+ environment:
61
+ PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
62
+
63
+ - name: Set desired Ruby version as a global version
64
+ command: "rbenv global {{ ruby }}"
65
+ when: ruby not in current_ruby_version.stdout
66
+ environment:
67
+ PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
68
+ register: set_ruby
69
+
70
+ - name: Execute `rbenv rehash` command
71
+ command: rbenv rehash
72
+ when: set_ruby.changed
73
+ environment:
74
+ PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
75
+
76
# Write a ~/.gemrc so `gem install` skips documentation generation.
# `--no-document` replaces the deprecated `--no-ri --no-rdoc` pair, which
# newer RubyGems releases reject as invalid options.
- name: Create ~/.gemrc file to skip docs
  copy:
    dest: ~/.gemrc
    content: "gem: --no-document"
80
+
81
+ - name: Create ~/.bundle directory
82
+ file:
83
+ dest: ~/.bundle
84
+ state: directory
85
+
86
+ - name: Create ~/.bundle/config file with default settings `BUNDLE_GIT__ALLOW_INSECURE true` and `BUNDLE_JOBS 4`
87
+ copy:
88
+ dest: ~/.bundle/config
89
+ content: |
90
+ BUNDLE_GIT__ALLOW_INSECURE: "true"
91
+ BUNDLE_JOBS: "4"
92
+
93
+ - name: Check if Bundler gem installed
94
+ stat:
95
+ path: "{{ ruby_versions_path }}/{{ ruby }}/bin/bundler"
96
+ register: bundler_gem_present
97
+
98
+ - name: Install Bundler gem
99
+ command: gem install bundler
100
+ when: not bundler_gem_present.stat.exists
101
+ environment:
102
+ PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
103
+
104
+ - name: Check if Whenever gem installed
105
+ stat:
106
+ path: "{{ ruby_versions_path }}/{{ ruby }}/bin/whenever"
107
+ register: whenever_gem_present
108
+
109
+ - name: Install Whenever gem
110
+ command: gem install whenever
111
+ when: not whenever_gem_present.stat.exists
112
+ environment:
113
+ PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
114
+
115
+ - name: Check if Kimurai gem installed
116
+ stat:
117
+ path: "{{ ruby_versions_path }}/{{ ruby }}/bin/kimurai"
118
+ register: kimurai_gem_present
119
+
120
+ - name: Install Kimurai gem
121
+ command: gem install kimurai
122
+ when: not kimurai_gem_present.stat.exists
123
+ environment:
124
+ PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
@@ -0,0 +1,45 @@
1
---
# Provisioning play: refreshes the apt cache, installs base packages, then
# imports the Ruby environment, PhantomJS, Firefox/geckodriver and
# Chromium/chromedriver task files.
- hosts: all
  vars:
    # Version pins are quoted so YAML always reads them as strings; an
    # unquoted two-part version such as 2.40 is parsed as a float and would
    # be rendered as "2.4" inside download URLs.
    ruby: "2.5.3"
    rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
    rbenv_shims_path: "{{ rbenv_root_path }}/shims"
    ruby_versions_path: "{{ rbenv_root_path }}/versions"
    # check latest here http://phantomjs.org/download.html
    phantomjs: "2.1.1"
    # check latest here https://github.com/mozilla/geckodriver/releases/
    geckodriver: "0.23.0"
    # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
    chromedriver: "2.44"

  tasks:
    - name: Update apt cache
      become: true
      apt: update_cache=yes cache_valid_time=86400

    - name: Install base packages
      become: true
      apt:
        pkg: "{{ item }}"
        state: present
      with_items:
        - git
        - xvfb
        - libsqlite3-dev
        - sqlite3
        - mongodb-clients
        - mysql-client
        - libmysqlclient-dev
        - postgresql-client
        - libpq-dev

    # Imported without play-level `become`; the task file sets it per task
    # where needed.
    - import_tasks: setup/ruby_environment.yml

    - import_tasks: setup/phantomjs.yml
      become: true

    - import_tasks: setup/firefox_geckodriver.yml
      become: true

    - import_tasks: setup/chromium_chromedriver.yml
      become: true
@@ -0,0 +1,106 @@
1
+ require 'json'
2
+ require 'csv'
3
+
4
module Kimurai
  class Base
    # Writes items to a file one at a time in one of the supported formats
    # (:json, :pretty_json, :jsonlines, :csv). Calls to #save are serialized
    # through a mutex, so one instance can be shared between threads.
    class Saver
      # Formats accepted by the constructor.
      FORMATS = %i[json pretty_json jsonlines csv].freeze

      attr_reader :format, :path, :position, :append

      # @param path [String] file to write to
      # @param format [Symbol] one of FORMATS
      # @param position [Boolean] when true, each saved item gets a
      #   :position key holding its 1-based save index
      # @param append [Boolean] when true, the first save continues an
      #   existing non-empty file instead of overwriting it
      # @raise [RuntimeError] if format is not one of FORMATS
      def initialize(path, format:, position: true, append: false)
        unless FORMATS.include?(format)
          raise "SimpleSaver: wrong type of format: #{format}"
        end

        @path = path
        @format = format
        @position = position
        @index = 0
        @append = append
        @mutex = Mutex.new
      end

      # Persists a single item to the file.
      #
      # Note: when `position` is enabled, the given hash itself is mutated
      # (a :position key is added).
      #
      # @param item [Hash] data to write
      def save(item)
        @mutex.synchronize do
          @index += 1
          item[:position] = @index if position

          case format
          when :json
            save_to_json(item)
          when :pretty_json
            save_to_pretty_json(item)
          when :jsonlines
            save_to_jsonlines(item)
          when :csv
            save_to_csv(item)
          end
        end
      end

      private

      # True when the current item must be added to content already in the
      # file: either this saver has written before, or append mode is on and
      # the file already holds data. A missing or zero-byte file is treated
      # as fresh, so the first write produces a valid document (opening
      # bracket / CSV header) rather than a continuation fragment.
      def continue_file?
        @index > 1 || (append && File.exist?(path) && !File.zero?(path))
      end

      # Stores items as a single compact JSON array.
      def save_to_json(item)
        data = JSON.generate([item])

        if continue_file?
          # Re-open the array: turn the trailing "}]" into "}," and append
          # the new element without its leading "[".
          file_content = File.read(path).sub(/\}\]\Z/, "},")
          File.write(path, file_content + data.sub(/\A\[/, ""))
        else
          File.write(path, data)
        end
      end

      # Stores items as a single pretty-printed JSON array.
      def save_to_pretty_json(item)
        data = JSON.pretty_generate([item])

        if continue_file?
          # Same splice as #save_to_json, accounting for the newline that
          # pretty printing places around the array brackets.
          file_content = File.read(path).sub(/\}\n\]\Z/, "},\n")
          File.write(path, file_content + data.sub(/\A\[\n/, ""))
        else
          File.write(path, data)
        end
      end

      # Stores items as one JSON document per line.
      def save_to_jsonlines(item)
        data = JSON.generate(item)

        if continue_file?
          File.open(path, "a") { |file| file.write("\n" + data) }
        else
          File.open(path, "w") { |file| file.write(data) }
        end
      end

      # Stores items as CSV rows; the header row is taken from the keys of
      # the first item written to a fresh file.
      def save_to_csv(item)
        data = flatten_hash(item)

        if continue_file?
          CSV.open(path, "a+", force_quotes: true) do |csv|
            csv << data.values
          end
        else
          CSV.open(path, "w", force_quotes: true) do |csv|
            csv << data.keys
            csv << data.values
          end
        end
      end

      # Flattens nested hashes into one level with dot-joined string keys,
      # e.g. { a: { b: 1 } } => { "a.b" => 1 }.
      def flatten_hash(hash)
        hash.each_with_object({}) do |(key, value), flat|
          if value.is_a?(Hash)
            flatten_hash(value).each do |sub_key, sub_value|
              flat["#{key}.#{sub_key}"] = sub_value
            end
          else
            flat[key&.to_s] = value
          end
        end
      end
    end
  end
end
105
+
106
+
@@ -0,0 +1,54 @@
1
module Kimurai
  class Base
    # In-memory store of unique values grouped by scope. Every public
    # operation runs under a mutex.
    class Storage
      attr_reader :database

      def initialize
        @mutex = Mutex.new
        @database = {}
      end

      # @param scope [Object, nil] scope key, or nil for everything
      # @return [Array, Hash] the values stored under scope (an empty array
      #   when the scope is unknown), or the whole scope => values hash when
      #   no scope is given
      def all(scope = nil)
        @mutex.synchronize do
          scope ? database.fetch(scope, []) : database
        end
      end

      # @return [Boolean] whether value is already stored under scope.
      #   Note: an unknown scope is created (empty) as a side effect.
      def include?(scope, value)
        @mutex.synchronize do
          database[scope] ||= []
          database[scope].include?(value)
        end
      end

      # Stores value under scope, skipping duplicates. An Array argument is
      # merged element by element.
      def add(scope, value)
        @mutex.synchronize do
          database[scope] ||= []
          if value.kind_of?(Array)
            database[scope] += value
            database[scope].uniq!
          else
            database[scope].push(value) unless database[scope].include?(value)
          end
        end
      end

      ###

      # Check-and-add in a single locked step.
      #
      # @return [Boolean] false if value was already stored under scope;
      #   otherwise stores it and returns true
      def unique?(scope, value)
        @mutex.synchronize do
          values = (database[scope] ||= [])

          if values.include?(value)
            false
          else
            values.push(value)
            true
          end
        end
      end

      ###

      # Drops every scope and value.
      def clear!
        @mutex.synchronize do
          @database = {}
        end
      end
    end
  end
end