dbtools 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +333 -0
- data/Thorfile +1 -0
- data/bin/dbtools +5 -0
- data/config/client_secret_dbtools.json +1 -0
- data/config/config.yml +1 -0
- data/config/database_config.yml +12 -0
- data/config/databases.txt +5 -0
- data/config/schedule.rb +8 -0
- data/dbtools.gemspec +37 -0
- data/lib/dbtools.rb +47 -0
- data/lib/dbtools/constants.rb +847 -0
- data/lib/dbtools/converter/csv2rdf_converter.rb +68 -0
- data/lib/dbtools/converter/csv_importer.rb +107 -0
- data/lib/dbtools/converter/excel2csv_converter.rb +40 -0
- data/lib/dbtools/converter/google_drive2_rdf_converter.rb +97 -0
- data/lib/dbtools/database/database_data.rb +146 -0
- data/lib/dbtools/database/db_connection.rb +236 -0
- data/lib/dbtools/database/mysql_connection.rb +78 -0
- data/lib/dbtools/database/postgresql_connection.rb +132 -0
- data/lib/dbtools/database/violation.rb +45 -0
- data/lib/dbtools/google_drive/google_drive_api.rb +211 -0
- data/lib/dbtools/google_drive/google_drive_entity.rb +22 -0
- data/lib/dbtools/google_drive/google_drive_file.rb +10 -0
- data/lib/dbtools/google_drive/google_drive_folder.rb +9 -0
- data/lib/dbtools/plsql_functions/connect_server.sql +30 -0
- data/lib/dbtools/plsql_functions/link.sql +17 -0
- data/lib/dbtools/plsql_functions/unlink.sql +15 -0
- data/lib/dbtools/rdf/rdf_reader.rb +136 -0
- data/lib/dbtools/version.rb +3 -0
- data/lib/rdf/geophy.rb +27 -0
- data/lib/tasks/aws.rb +43 -0
- data/lib/tasks/backup.rb +107 -0
- data/lib/tasks/check.rb +220 -0
- data/lib/tasks/ckan.rb +151 -0
- data/lib/tasks/convert.rb +139 -0
- data/lib/tasks/dump.rb +110 -0
- data/lib/tasks/googledrivetool.rb +252 -0
- data/lib/tasks/import.rb +142 -0
- data/lib/tasks/postgres.rb +29 -0
- metadata +307 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 836a2bf1d0432d2fd21cc831cecdccf8767336fb
|
4
|
+
data.tar.gz: f4a8477d5824a0ba5f620a199bfc63ade084ea23
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ba133c70b316391c025118b2b84d4244699bde0d1882483c2afea1696a9ea2dfc7473b52683ca7b4c47bece03961d3b9414833d7706246cbc26af4cc75aedd42
|
7
|
+
data.tar.gz: 2d0838799a72beec6ec0cf683da9af3d49e1f94e5309407f409c1defd271623f75e1530a010cc2e1949788eca4974848027f61079d21d9b9ebc17f8efa14b1f7
|
data/README.md
ADDED
@@ -0,0 +1,333 @@
|
|
1
|
+
# DB tools
|
2
|
+
# Usage
|
3
|
+
|
4
|
+
This sections describes how to use this tool. For development, go to the Development section.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
This application is a Ruby gem that can be installed, and run as an executable.
|
9
|
+
Clone this repo and install the latest version gem by running:
|
10
|
+
|
11
|
+
gem install rdf-reasoner -v 0.4.1
|
12
|
+
gem install specific_install
|
13
|
+
gem specific_install -l ssh://git@git.geophy.com:9922/coredb/coredb-application-dbtools.git
|
14
|
+
|
15
|
+
Depending on how Ruby is installed on your machine, you might have to run this command using `sudo` (not recommended).
|
16
|
+
|
17
|
+
## Configuration
|
18
|
+
|
19
|
+
All configuration files are stored in `~/.dbtools/`.
|
20
|
+
The configuration for the databases can be set in `database_config.yml`.
|
21
|
+
Run `dbtools google_drive list` to invoke the Google Drive authorization.
|
22
|
+
|
23
|
+
## Running the application
|
24
|
+
After installing the gem, you should be able to run this application from the command line:
|
25
|
+
|
26
|
+
$ dbtools
|
27
|
+
|
28
|
+
Using `thor`, you can also run this application without installing the gem. Install `thor` using:
|
29
|
+
|
30
|
+
$ gem install thor
|
31
|
+
|
32
|
+
Run `thor` from the same folder where this source code is located.
|
33
|
+
|
34
|
+
$ thor dbtools
|
35
|
+
|
36
|
+
## Commands
|
37
|
+
|
38
|
+
This gem allows you to run a few tasks from the command line, using the `dbtools` executable. The same commands are available using `thor`.
|
39
|
+
|
40
|
+
+ `dbtools check casing [URL]`
|
41
|
+
+ `dbtools check completeness [URL]`
|
42
|
+
+ `dbtools check compression [URL] `
|
43
|
+
+ `dbtools check database_comments [URL] `
|
44
|
+
+ `dbtools check help [COMMAND] `
|
45
|
+
+ `dbtools check indexes [URL] `
|
46
|
+
+ `dbtools check inverse_functional_property [URL] `
|
47
|
+
+ `dbtools check keywords [URL] `
|
48
|
+
+ `dbtools check output_indexes [URL] `
|
49
|
+
+ `dbtools check spelling [URL] `
|
50
|
+
+ `dbtools check table_comments [URL] `
|
51
|
+
+ `dbtools ckan help [COMMAND] `
|
52
|
+
+ `dbtools ckan list_databases `
|
53
|
+
+ `dbtools ckan load_dataset [dataset] `
|
54
|
+
+ `dbtools ckan load_rdf_in_desc [target_database, ckan_dataset] `
|
55
|
+
+ `dbtools ckan load_resource [dataset, resource] `
|
56
|
+
+ `dbtools dump help [COMMAND] `
|
57
|
+
+ `dbtools dump schema [URL, PATH] `
|
58
|
+
+ `dbtools google_drive download [File_id, target_dir(optional)] `
|
59
|
+
+ `dbtools google_drive export [File_id, format, target_dir(optional)] `
|
60
|
+
+ `dbtools google_drive help [COMMAND] `
|
61
|
+
+ `dbtools google_drive list `
|
62
|
+
+ `dbtools import convert_to_utf8 [file] `
|
63
|
+
+ `dbtools import create_schema [csv_file, table_name] `
|
64
|
+
+ `dbtools import csv_in postgres [csv_file, database_name, table_name] `
|
65
|
+
+ `dbtools import excel [database_name, file] `
|
66
|
+
+ `dbtools import excel2csv [database_name, file] `
|
67
|
+
+ `dbtools import help [COMMAND] `
|
68
|
+
+ `dbtools import mysql_dump [database_name. dump_file] `
|
69
|
+
+ `dbtools import postgres_dump [database_name. dump_file] `
|
70
|
+
+ `dbtools postgres help [COMMAND] `
|
71
|
+
+ `dbtools postgres load_functions [database_url] `
|
72
|
+
|
73
|
+
### Namespaces
|
74
|
+
All commands are separated by the type of tasks the command performs.
|
75
|
+
|
76
|
+
#### Check
|
77
|
+
This describes all database checking related tasks. The output of these tasks are printed to STDOUT.
|
78
|
+
All tasks take a URL as argument. This should be the url to the database in the following format: adapter://username:password@host/database.
|
79
|
+
Both MySQL and Postgres are supported.
|
80
|
+
|
81
|
+
+ **all [URL]**
|
82
|
+
|
83
|
+
This runs the following tasks on the specified database:
|
84
|
+
+ indexes
|
85
|
+
+ keywords
|
86
|
+
+ casing
|
87
|
+
+ completeness
|
88
|
+
+ compression
|
89
|
+
+ inverse_functional_property
|
90
|
+
|
91
|
+
+ **all_databases**
|
92
|
+
|
93
|
+
This runs the task `all` on all databases it can find. Database credentials must be specified in `~/.dbtools/database_config.yml`.
|
94
|
+
|
95
|
+
+ **casing [URL]**
|
96
|
+
|
97
|
+
This task outputs all columns that have a column name with uppercase characters in it.
|
98
|
+
|
99
|
+
+ **completeness [URL]**
|
100
|
+
|
101
|
+
This task outputs the completeness of every column. For every column, it counts the amount of empty/null entries and the total entries.
|
102
|
+
|
103
|
+
+ **compression [URL]**
|
104
|
+
|
105
|
+
This task counts, for every column, the amount of distinct lowercased entries and the amount of distinct entries.
|
106
|
+
|
107
|
+
+ **database_comments [URL]**
|
108
|
+
|
109
|
+
This task checks whether a database has a comment or not. This does not work on MySQL because MySQL does not have support for database comments.
|
110
|
+
|
111
|
+
+ **indexes [URL]**
|
112
|
+
|
113
|
+
This task checks whether a database column has missing indexes. It looks for all columns with name *_id and does not have an index.
|
114
|
+
|
115
|
+
+ **inverse_functional_property [URL]**
|
116
|
+
|
117
|
+
This task checks the inverse functional property of the database. It checks for distinct values and total value.
|
118
|
+
|
119
|
+
+ **keywords [URL]**
|
120
|
+
|
121
|
+
This task checks for column names with reserved keywords in them.
|
122
|
+
|
123
|
+
+ **output_indexes [URL]**
|
124
|
+
|
125
|
+
This outputs the command to create the missing index on the column.
|
126
|
+
|
127
|
+
+ **spelling [URL]**
|
128
|
+
|
129
|
+
This task checks for spelling mistakes in column names.
|
130
|
+
|
131
|
+
+ **table_comments [URL]**
|
132
|
+
|
133
|
+
This task checks whether a table has a comment or not.
|
134
|
+
|
135
|
+
#### Google Drive
|
136
|
+
This describes all Google Drive related tasks. The tool will prompt for Google Drive credentials if they don't exist yet.
|
137
|
+
|
138
|
+
+ **download [File_id, target_dest(optional)]**
|
139
|
+
|
140
|
+
`download` will download a file stored on Google Drive.
|
141
|
+
An optional target directory can be given. This will download the file into the directory,
|
142
|
+
using the same name as on Google Drive. If a Google Doc file is given, it will automatically
|
143
|
+
export it to the most commonly used format.
|
144
|
+
If no target directory is given, the file will be streamed to STDOUT.
|
145
|
+
|
146
|
+
Examples:
|
147
|
+
\x5$ dbtools google_drive download 0B67ew1eLtcXxeUVmTndialhTRTA /tmp/target_destination/"
|
148
|
+
\x5$ dbtools google_drive download 0B67ew1eLtcXxeUVmTndialhTRTA > /tmp/image.jpg"
|
149
|
+
|
150
|
+
+ **export [File_id, format, target_dir(optional)]**
|
151
|
+
|
152
|
+
`export` will download a Google document stored on Google Drive, like Google Spreadsheets, Google slides etc.
|
153
|
+
Download doesn't work for these files. You need to specify the export format.
|
154
|
+
The supported conversion formats can be found here:
|
155
|
+
https://developers.google.com/drive/v3/web/manage-downloads#downloading_a_file
|
156
|
+
|
157
|
+
An optional target directory can be given. This will download the file into the directory,
|
158
|
+
using the same name as on Google Drive.
|
159
|
+
If no target directory is given, the file will be streamed to STDOUT.
|
160
|
+
|
161
|
+
Examples:
|
162
|
+
\x5$ dbtools google_drive export 0B67ew1eLtcXxeUVmTndialhTRTA 'text/plain' /tmp/target_dir/"
|
163
|
+
\x5$ dbtools google_drive export 0B67ew1eLtcXxeUVmTndialhTRTA 'application/pdf' > /tmp/test.pdf"
|
164
|
+
|
165
|
+
#### Import
|
166
|
+
This describes all import related tasks. The target databases must be specified in `~/.dbtools/database_config.yml`.
|
167
|
+
|
168
|
+
+ **convert_to_utf8 [file]**
|
169
|
+
|
170
|
+
This task converts a file to UTF-8 encoding. It will create a temporary copy of the file, while converting every line to UTF-8.
|
171
|
+
After it is done, it will replace the original file with the UTF-8 encoding copy.
|
172
|
+
|
173
|
+
+ **create_schema [csv_file, table_name]**
|
174
|
+
|
175
|
+
This will output a table schema for a csv file.
|
176
|
+
|
177
|
+
+ **csv_in postgres [csv_file, database_name, table_name]**
|
178
|
+
|
179
|
+
This will import a csv file into the Postgres database. You have to specify the target database name, and target table name.
|
180
|
+
|
181
|
+
+ **excel [database_name, file]**
|
182
|
+
|
183
|
+
This will import an excel file into the Postgres database. It will transform an excel file to csv files. Every sheet will be a new csv table, and thus a new table.
|
184
|
+
You only have to specify the target database name. The table name will be created using the excel file name and sheet name.
|
185
|
+
|
186
|
+
+ **excel2csv [file, target_dir]**
|
187
|
+
|
188
|
+
This will export an excel file to csv file(s). Every sheet will be a new table name.
|
189
|
+
|
190
|
+
+ **mysql_dump [database_name. dump_file]**
|
191
|
+
|
192
|
+
This will import a MySQL dump into the MySQL database.
|
193
|
+
|
194
|
+
+ **postgres_dump [database_name. dump_file]**
|
195
|
+
|
196
|
+
This will import a Postgres dump into the Postgres database.
|
197
|
+
|
198
|
+
#### Postgres
|
199
|
+
|
200
|
+
+ **load_functions [database_url]**
|
201
|
+
|
202
|
+
This will load all pl/sql functions in the `lib/dbtools/plsql_functions` folder into the target Postgres database.
|
203
|
+
|
204
|
+
##### PL/SQL functions
|
205
|
+
|
206
|
+
The following PL/SQL functions are included:
|
207
|
+
|
208
|
+
+ **connect_server(servername, host, port, dbname, server_user, server_password)**
|
209
|
+
|
210
|
+
Creates a server with name `servername` connected to `host:port` and database `dbname`. Credentials of the `host:port`
|
211
|
+
must be given in `server_user` and `server_password`. It will also create a user mapping for the current user.
|
212
|
+
|
213
|
+
+ **link(servername, schema, table)**
|
214
|
+
|
215
|
+
After creating a server, you can simply call this function with the schema name and table name of the foreign server.
|
216
|
+
This will create a foreign table in the current database.
|
217
|
+
|
218
|
+
+ **unlink(schema, table)**
|
219
|
+
|
220
|
+
This will drop the foreign table.
|
221
|
+
|
222
|
+
#### Convert
|
223
|
+
|
224
|
+
+ **csv2rdf [csv_file, rdf_uri]**
|
225
|
+
|
226
|
+
`csv2rdf csv_file uri` will convert a csv file to a RDF NTriples file.
|
227
|
+
The URI will be the subject in the resulting RDF file.
|
228
|
+
|
229
|
+
You can optionally specify a third parameter, which will write
|
230
|
+
the output to a file. You can also specify a fourth parameter to compress
|
231
|
+
the file to .gz format.
|
232
|
+
|
233
|
+
Example:
|
234
|
+
\x5$ dbtools convert csv2rdf data.csv http://geophy.data output.nt
|
235
|
+
|
236
|
+
Resulting triples will look like:
|
237
|
+
<http://geophy.data#ROWNUMBER> <http://geophy.data/COLUMNNAME> VALUE .
|
238
|
+
<http://geophy.data#1> <http://geophy.data/name> "Bob" .
|
239
|
+
|
240
|
+
# Development
|
241
|
+
|
242
|
+
For development, install `bundler` to manage the gem dependencies. Clone this repository and run:
|
243
|
+
|
244
|
+
$ bundle install
|
245
|
+
|
246
|
+
To install all dependencies.
|
247
|
+
|
248
|
+
## Components
|
249
|
+
|
250
|
+
#### CSV importer
|
251
|
+
+ `lib/dbtools/csv_importer.rb`
|
252
|
+
##### Delimiters
|
253
|
+
The csv importer allows you to prepare a table schema for the csv, so the csv can be loaded into Postgres using the `copy` command. When
|
254
|
+
invoking the class, you can specify the delimiter. If no delimiter is specified, it will attempt to detect the delimiter by counting
|
255
|
+
the most frequently occurring delimiter in the first 10 lines. It is able to detect the following delimiters: `[',', '|', "\t", ';']`.
|
256
|
+
If all occurrences are zero, it will raise an exception.
|
257
|
+
|
258
|
+
##### SQL Type detection
|
259
|
+
To detect the type of the column, it will read the first 10.000 rows of the csv file. For every column, it will then iterate through every entry and
|
260
|
+
keep track of which type it is. If both integers and floats occur, float will take precedence. If a string has occurred, then that will take precedence
|
261
|
+
over every other type. Date type has been disabled, because the Ruby date parser accepts any string that contains a date, as Date type. E.g. sentences that
|
262
|
+
contain a date are accepted by the date parser, but are invalid in SQL.
|
263
|
+
Integers will become BigInt in SQL, Floats will become Float and everything else will be a varchar.
|
264
|
+
Improvements that can be made:
|
265
|
+
+ Iterate (per line) through all entries instead of just the first 10.000. Currently it loads it all into memory. If you iterate it line-by-line, you can
|
266
|
+
check all entries. You probably won't be able to iterate by column though.
|
267
|
+
+ Support for more datatypes. Currently BigInt is used for integer type to prevent errors when integers get too large. If you would check all entries,
|
268
|
+
you can keep track of the largest entry that occurred. Based on that, you have to specify the type as either smallInt, Int or BigInt.
|
269
|
+
+ Conversion to SQL commands. For each row in the csv, you can change it to a SQL insert command. That way, you could run a command to create an entire table.
|
270
|
+
|
271
|
+
#### Excel to csv converter
|
272
|
+
+ `lib/dbtools/excel2csv.rb`
|
273
|
+
The Excel to csv converter uses the `roo` and `roo-xls` gems to provide Excel reading support. The `roo` gem provides supports modern files, such as `.ods`, `.xlsx` files while the `roo-xls` gem
|
274
|
+
provides support for older Excel files, such as `xls` or `xml` in SpreadsheetML format.
|
275
|
+
The output folder has to be specified. The output csv file name(s) are combined from the excel file name, and the sheet name.
|
276
|
+
|
277
|
+
#### Dynamic SQL queries
|
278
|
+
+ `lib/dbtools/database.rb`
|
279
|
+
+ `lib/dbtools/db_connection.rb`
|
280
|
+
+ `lib/dbtools/postgres_connection.rb`
|
281
|
+
+ `lib/dbtools/mysql_connection.rb`
|
282
|
+
|
283
|
+
To create the metrics, a query has to be created taking in account all the tables, columns etc. To represent this structure, classes are used to store the data.
|
284
|
+
The results can also be stored in the classes. The Table class allows you to construct the query. The result can be stored in the Column class. Another process should
|
285
|
+
run the query and store the result in the Column classes. The classes contain no logic to connect and query databases itself.
|
286
|
+
All queries that are constructed use quotes around the name identifiers. This is to prevent errors when identifiers use spaces or reserved keywords in their name.
|
287
|
+
|
288
|
+
The Postgres dialect supports quotes for identifiers. MySQL uses backticks for identifiers. In the code, if the MySQL connection is used,
|
289
|
+
it will replace all quotes with backticks.
|
290
|
+
Using an ActiveRecord connection, queries are executed. The Postgres adapter, by default, returns a hash. The MySQL adapter needs another option to turn it into a Hash.
|
291
|
+
If you use the `execute_query` wrapper command, it will do this automatically.
|
292
|
+
|
293
|
+
|
294
|
+
#### Importer
|
295
|
+
+ `lib/tasks/import.rb`
|
296
|
+
|
297
|
+
##### UTF-8 Conversion
|
298
|
+
Since all databases are in UTF-8 encoding format, the csv file that has to be loaded must be in UTF-8 encoding too. If this is not done, the copy command of
|
299
|
+
Postgres might fail if there's an invalid character sequence detected.
|
300
|
+
That is why before a csv file is loaded into the database, it is converted to UTF-8 encoding.
|
301
|
+
|
302
|
+
##### Names
|
303
|
+
For loading Excel and csv files, a database name must be specified. This must be a valid database name. For loading a single csv file, the table name must also be specified.
|
304
|
+
Excel files use the file name + sheet name as table name. For dumps, the database name can be specified, but depending on how the dump is created,
|
305
|
+
it might create its own database.
|
306
|
+
|
307
|
+
##### Process
|
308
|
+
The entire excel and csv loading process is as follows:
|
309
|
+
excel: convert every sheet to a single csv file => see csv process
|
310
|
+
csv: convert to UTF-8 => create table schema command => connect to database using an ActiveRecord connection => run create database command => run table schema command =>
|
311
|
+
use copy to load the csv into the database.
|
312
|
+
|
313
|
+
#### Google Drive
|
314
|
+
+ `lib/dbtools/google_drive_api.rb`
|
315
|
+
+ `lib/tasks/googledrivetool.rb`
|
316
|
+
|
317
|
+
The `google_drive_api.rb` class connects this client to Google Drive with authorization with read only scope. The `client_secret_geophy.json` file specifies to which
|
318
|
+
Google Drive it should connect. It will prompt for authorization if it tries to authenticate without credentials available.
|
319
|
+
The credentials are stored in `~/.credentials/dbtools_geophy.yml`.
|
320
|
+
|
321
|
+
##### Downloading files
|
322
|
+
Downloading files is done using the `get_file` function. This does not work for Google Documents however. These files have to be called using the `export_file` command.
|
323
|
+
When using the `export_file` function, the output format has to be specified. There does not seem to be a way to know whether the file is a Google Document or a
|
324
|
+
regular file by looking at just the metadata(?). When no download destination is specified, it will return the metadata.
|
325
|
+
|
326
|
+
#### RDF
|
327
|
+
The RDF reader must be used with CKAN with the ckanext-dcat plugin. This allows you to query the RDF graph, and return metadata about the dataset and/or resources.
|
328
|
+
This way, it knows where a Google Drive file is located, what the database name should be, etc.
|
329
|
+
The CKAN RDF catalog uses pagination, so the RDF reader will load all pages. It can then run SPARQL queries to find datasets.
|
330
|
+
To improve this, you might be able to use the CKAN API instead. Parsing the RDF graph seems to be a bit slow, and you have to specify the full url of
|
331
|
+
the dataset/resource(unless you use a regex filter).
|
332
|
+
|
333
|
+
|
data/Thorfile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'dbtools'
|
data/bin/dbtools
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
{"installed":{"client_id":"494523368434-74cacnqnkqrj6nbhcfqluajqrv9d5uuf.apps.googleusercontent.com","project_id":"sacred-archway-138523","auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://accounts.google.com/o/oauth2/token","auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs","client_secret":"d1aJVQiyDhNX4cPn6395ez1a","redirect_uris":["urn:ietf:wg:oauth:2.0:oob","http://localhost"]}}
|
data/config/config.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
defaults:
|
data/config/schedule.rb
ADDED
data/dbtools.gemspec
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require "dbtools/version"
|
5
|
+
|
6
|
+
|
7
|
+
Gem::Specification.new do |spec|
|
8
|
+
spec.name = "dbtools"
|
9
|
+
spec.version = Dbtools::VERSION
|
10
|
+
spec.authors = ["Hung Nguyen"]
|
11
|
+
spec.email = ["h.nguyen@geophy.com"]
|
12
|
+
|
13
|
+
spec.summary = "Tool for database maintenance."
|
14
|
+
|
15
|
+
spec.files = Dir['Rakefile', '{bin,lib,man,test,spec,tasks,config}/**/*', 'README*', 'LICENSE*', '*.gemspec', 'Thorfile*']
|
16
|
+
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.add_dependency 'rdf-reasoner', '~> 0.4.1'
|
22
|
+
spec.add_dependency 'thor', '~> 0.19.1'
|
23
|
+
spec.add_dependency 'activerecord', '~> 4.2'
|
24
|
+
spec.add_dependency 'activesupport', '~> 4.2'
|
25
|
+
spec.add_dependency 'activemodel', '~> 4.2'
|
26
|
+
spec.add_dependency 'google-api-client', '~> 0.9'
|
27
|
+
spec.add_dependency 'linkeddata', '~> 2.0'
|
28
|
+
spec.add_dependency 'whenever'
|
29
|
+
spec.add_dependency 'roo', '~> 2.4'
|
30
|
+
spec.add_dependency 'roo-xls', '~> 1.0'
|
31
|
+
spec.add_dependency 'ru_bee', '~> 0.1'
|
32
|
+
spec.add_dependency 'mysql2', '~> 0.4'
|
33
|
+
spec.add_dependency 'pg', '~> 0.18'
|
34
|
+
spec.add_dependency 'spira', '~> 2.0.0'
|
35
|
+
spec.add_dependency 'aws-sdk', '~>2.7'
|
36
|
+
spec.add_development_dependency "bundler", "~> 1.12"
|
37
|
+
end
|
data/lib/dbtools.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'dbtools/version'
|
3
|
+
require_relative 'rdf/geophy'
|
4
|
+
require 'tasks/aws'
|
5
|
+
require 'tasks/check'
|
6
|
+
require 'tasks/dump'
|
7
|
+
require 'tasks/googledrivetool'
|
8
|
+
require 'tasks/ckan'
|
9
|
+
require 'tasks/import'
|
10
|
+
require 'tasks/postgres'
|
11
|
+
require 'tasks/convert'
|
12
|
+
require 'tasks/backup'
|
13
|
+
require 'fileutils'
|
14
|
+
|
15
|
+
module Dbtools
|
16
|
+
class Main < Thor
|
17
|
+
# Alias module GoogleDrive => Google_Drive
|
18
|
+
Dbtools::GoogleDrive = Dbtools::Google_Drive
|
19
|
+
|
20
|
+
register(Check, 'check', 'check [COMMAND]', 'Runs database checking tasks')
|
21
|
+
register(Dump, 'dump', 'dump [COMMAND]', 'Runs schema dumping tasks')
|
22
|
+
register(Google_drive, 'google_drive', 'google_drive [COMMAND]', 'Runs Google Drive related tasks')
|
23
|
+
register(Google_drive, 'googledrive', 'googledrive [COMMAND]', 'Runs Google Drive related tasks')
|
24
|
+
register(Aws, 'aws', 'aws [COMMAND]', 'Runs AWS related tasks')
|
25
|
+
register(Backup, 'backup', 'backup [COMMAND]', 'Runs backup tasks')
|
26
|
+
register(Import, 'import', 'import [COMMAND]', 'Runs database import related tasks')
|
27
|
+
register(Postgres, 'postgres', 'postgres [COMMAND]', 'Runs Postgres related tasks')
|
28
|
+
register(Convert, 'convert', 'convert [COMMAND]', 'Runs converting related tasks')
|
29
|
+
|
30
|
+
def initialize(*args)
|
31
|
+
super
|
32
|
+
init_config
|
33
|
+
end
|
34
|
+
|
35
|
+
desc 'init_config', 'Inits DbTools configuration'
|
36
|
+
def init_config
|
37
|
+
begin
|
38
|
+
FileUtils.mkdir_p(File.join(Dir.home, '.dbtools'))
|
39
|
+
FileUtils.copy(File.join(Dir.pwd, 'config', 'config.yml'), Dbtools::Constants::CONFIG_PATH ) unless File.exists?(Dbtools::Constants::CONFIG_PATH)
|
40
|
+
FileUtils.copy(File.join(Dir.pwd, 'config', 'database_config.yml'), Dbtools::Constants::DB_TARGET_CONFIG_PATH ) unless File.exists?(Dbtools::Constants::DB_TARGET_CONFIG_PATH)
|
41
|
+
rescue StandardError => e
|
42
|
+
STDERR.puts e.message
|
43
|
+
STDERR.puts "Warning: not all functions might work."
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|