dbtools 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +7 -0
  2. data/README.md +333 -0
  3. data/Thorfile +1 -0
  4. data/bin/dbtools +5 -0
  5. data/config/client_secret_dbtools.json +1 -0
  6. data/config/config.yml +1 -0
  7. data/config/database_config.yml +12 -0
  8. data/config/databases.txt +5 -0
  9. data/config/schedule.rb +8 -0
  10. data/dbtools.gemspec +37 -0
  11. data/lib/dbtools.rb +47 -0
  12. data/lib/dbtools/constants.rb +847 -0
  13. data/lib/dbtools/converter/csv2rdf_converter.rb +68 -0
  14. data/lib/dbtools/converter/csv_importer.rb +107 -0
  15. data/lib/dbtools/converter/excel2csv_converter.rb +40 -0
  16. data/lib/dbtools/converter/google_drive2_rdf_converter.rb +97 -0
  17. data/lib/dbtools/database/database_data.rb +146 -0
  18. data/lib/dbtools/database/db_connection.rb +236 -0
  19. data/lib/dbtools/database/mysql_connection.rb +78 -0
  20. data/lib/dbtools/database/postgresql_connection.rb +132 -0
  21. data/lib/dbtools/database/violation.rb +45 -0
  22. data/lib/dbtools/google_drive/google_drive_api.rb +211 -0
  23. data/lib/dbtools/google_drive/google_drive_entity.rb +22 -0
  24. data/lib/dbtools/google_drive/google_drive_file.rb +10 -0
  25. data/lib/dbtools/google_drive/google_drive_folder.rb +9 -0
  26. data/lib/dbtools/plsql_functions/connect_server.sql +30 -0
  27. data/lib/dbtools/plsql_functions/link.sql +17 -0
  28. data/lib/dbtools/plsql_functions/unlink.sql +15 -0
  29. data/lib/dbtools/rdf/rdf_reader.rb +136 -0
  30. data/lib/dbtools/version.rb +3 -0
  31. data/lib/rdf/geophy.rb +27 -0
  32. data/lib/tasks/aws.rb +43 -0
  33. data/lib/tasks/backup.rb +107 -0
  34. data/lib/tasks/check.rb +220 -0
  35. data/lib/tasks/ckan.rb +151 -0
  36. data/lib/tasks/convert.rb +139 -0
  37. data/lib/tasks/dump.rb +110 -0
  38. data/lib/tasks/googledrivetool.rb +252 -0
  39. data/lib/tasks/import.rb +142 -0
  40. data/lib/tasks/postgres.rb +29 -0
  41. metadata +307 -0
data/lib/dbtools/database/db_connection.rb
@@ -0,0 +1,236 @@
+ require 'active_record'
+ require 'ru_bee'
+ require 'dbtools/constants'
+ require 'dbtools/database/database_data'
+ require 'dbtools/database/violation'
+
+ module Dbtools::Database
+   class DbConnection
+     attr_accessor :connection
+
+     # Creates a connection to a database using the given URL.
+     def initialize(url)
+       @connection = ActiveRecord::Base.establish_connection(url).connection
+       @database = get_current_database
+     end
+
+     def close
+       @connection.close
+     end
+
+     def database_name
+       @connection.current_database
+     end
+
+     # Returns an object representing the current database structure.
+     def get_current_database
+       query = %{select c.table_catalog as "table_catalog", c.table_schema as "table_schema", c.table_name as "table_name", c.column_name as "column_name", c.data_type as "data_type"
+                 from information_schema.columns as c
+                 join information_schema.tables as t
+                   on c.table_catalog = t.table_catalog
+                   and c.table_schema = t.table_schema
+                   and c.table_name = t.table_name
+                 where c.data_type not like '%text%'
+                   and c.table_schema not in ('information_schema', 'performance_schema', 'mysql', 'sys', 'pg_catalog')}
+       database = DatabaseData.new(database_name)
+       execute_query(query).each do |h|
+         table_name = h['table_name']
+         table_schema = h['table_schema']
+         column_name = h['column_name']
+         data_type = h['data_type']
+         database.add_table(table_name, table_schema).add_column(column_name, data_type)
+       end
+       return database
+     end
+
+     def get_all_columns
+       query = %{select c.COLUMN_NAME
+                 from information_schema.COLUMNS as c
+                 where c.TABLE_SCHEMA not in ('information_schema', 'performance_schema', 'mysql', 'sys', 'pg_catalog')}
+       execute_query(query)
+     end
+
+     # Checks whether reserved keywords occur in schema, table or column names.
+     def check_reserved_keywords
+       reserved_keywords = "("
+       Dbtools::Constants::RESERVED_KEYWORDS.each do |keyword|
+         reserved_keywords << "'#{keyword}', "
+       end
+       reserved_keywords = reserved_keywords[0..-3]
+       reserved_keywords << ")"
+       sql = %{
+         select c.TABLE_CATALOG as "table_catalog", c.TABLE_SCHEMA as "table_schema", c.TABLE_NAME as "table_name", c.COLUMN_NAME as "column_name"
+         from information_schema.COLUMNS as c
+         where c.TABLE_SCHEMA not in ('information_schema', 'performance_schema', 'mysql', 'sys', 'pg_catalog')
+           and (upper(c.COLUMN_NAME) in #{reserved_keywords}
+             or upper(c.TABLE_CATALOG) in #{reserved_keywords}
+             or upper(c.TABLE_NAME) in #{reserved_keywords})
+       }
+       violations = []
+       execute_query(sql).each do |h|
+         table_schema = h['table_schema']
+         table_name = h['table_name']
+         column_name = h['column_name']
+         violations << Violation.new(database: database_name,
+                                     metric: "Reserved keywords",
+                                     offender: "#{table_schema}.#{table_name}.#{column_name}")
+       end
+       return violations
+     end
+
+     # Gets the completeness of the columns.
+     def get_completeness
+       database = @database
+
+       database.tables.values.each do |table|
+         # This query counts all null entries (or entries containing '' for string columns) in every column.
+         query = table.query_empty_records
+         execute_query(query).each do |res|
+           table.columns.each { |k, col| col.missing_entries = res[k] }
+         end
+         # This query counts the total number of records in the table.
+         # The query should only return a single hashmap/dictionary as result.
+         # Since the total records for all columns should be equal, taking just the first value
+         # should be fine.
+         query = table.query_total_records
+         execute_query(query).each do |res|
+           table.columns.each { |k, col| col.total_entries = res.values.first }
+         end
+       end
+
+       violations = []
+       database.tables.each do |table_name, table|
+         table.columns.each do |col_name, col|
+           next if col.missing_entries.to_i.zero?
+           violations << Violation.new(database: database_name,
+                                       metric: "Completeness",
+                                       offender: "#{col.full_name.delete('"')}",
+                                       violating_records: col.missing_entries.to_i,
+                                       total_records: col.total_entries.to_i)
+         end
+       end
+       return violations
+     end
+
+     # Gets the syntax compression of the entries, grouped by column: values that
+     # are distinct only because of their casing are reported as violations.
+     def get_syntax_compression
+       database = @database
+
+       database.tables.values.each do |table|
+         # This query counts all lowercased distinct values in every column.
+         query = table.query_distinct_lowercased_entries
+         execute_query(query).each do |res|
+           table.columns.each { |k, col| col.distinct_lower_entries = res[k] }
+         end
+         # This query counts all distinct values in every column.
+         query = table.query_distinct_entries
+         execute_query(query).each do |res|
+           table.columns.each { |k, col| col.distinct_entries = res[k] }
+         end
+       end
+
+       violations = []
+       database.tables.each do |table_name, table|
+         table.columns.each do |col_name, col|
+           next if col.distinct_lower_entries == col.distinct_entries
+           violations << Violation.new(database: database_name,
+                                       metric: "Syntax compression",
+                                       offender: "#{col.full_name.delete('"')}",
+                                       violating_records: col.distinct_entries.to_i - col.distinct_lower_entries.to_i,
+                                       total_records: col.distinct_entries.to_i)
+         end
+       end
+       return violations
+     end
+
+     # Calculates the inverse functional property of the database.
+     def get_inverse_functional_property
+       database = @database
+
+       # Run the query on every table.
+       database.tables.values.each do |table|
+         # This query counts all distinct values in every column.
+         # The query should only return a single hashmap/dictionary as result.
+         query = table.query_distinct_entries
+         execute_query(query).each do |res|
+           table.columns.each { |k, col| col.distinct_entries = res[k] }
+         end
+         # This query counts the total number of records in the table.
+         # The query should only return a single hashmap/dictionary as result.
+         # Since the total records for all columns should be equal, taking just the first value
+         # should be fine.
+         query = table.query_total_records
+         execute_query(query).each do |res|
+           table.columns.each { |k, col| col.total_entries = res.values.first }
+         end
+       end
+
+       violations = []
+       database.tables.each do |table_name, table|
+         # puts "Table: #{table_name}:"
+         table.columns.each do |col_name, col|
+           # puts "\t #{col_name}: #{col.distinct_entries}/#{col.total_entries}" unless col.distinct_entries.to_i.zero?
+           violating_records = col.distinct_entries.to_i
+           next if violating_records.zero?
+           violations << Violation.new(database: database_name,
+                                       metric: "Inverse functional property",
+                                       offender: "#{col.full_name.delete('"')}",
+                                       violating_records: violating_records,
+                                       total_records: col.total_entries.to_i)
+         end
+       end
+       return violations
+     end
+
+     # Gets the columns whose names are not fully lowercase.
+     def get_uppercase_columns
+       database = @database
+       violations = []
+
+       database.tables.each do |table_name, table|
+         table.columns.each do |col_name, col|
+           next if col.name.downcase.eql?(col.name)
+           violations << Violation.new(database: database_name,
+                                       metric: "Uppercase column names",
+                                       offender: "#{col.full_name.delete('"')}",
+                                       solution: "#{table.schema}.#{table.name}.#{col.name.downcase.delete('"')}")
+         end
+       end
+       return violations
+     end
+
+     # Checks the spelling of all column names.
+     def check_spelling
+       database = @database
+       violations = []
+
+       database.tables.each do |table_name, table|
+         table.columns.each do |col_name, col|
+           next if col.name.correct?
+           violations << Violation.new(database: database_name,
+                                       metric: "Spelling",
+                                       offender: "#{col.full_name.delete('"')}")
+         end
+       end
+       return violations
+     end
+
+     # Creates a new database.
+     def create_database(name)
+       @connection.create_database(name)
+     end
+
+     # Runs all SQL files in the specified directory.
+     def execute_files(directory)
+       Dir.foreach(directory) do |file|
+         # Skip anything that is not an .sql file.
+         next if file == '.' || file == '..' || File.extname(file) != '.sql'
+         file_path = File.join(directory, file)
+         content = File.read(file_path)
+         execute_query(content)
+       end
+     end
+
+   end
+ end
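DbConnection itself defines no execute_query; the MySQL and PostgreSQL subclasses below supply it. As a rough usage sketch (not taken from the gem's documentation; the connection URL and database name are invented), the quality checks could be driven like this:

  require 'dbtools/database/postgresql_connection'

  # Hypothetical connection URL; adjust adapter, credentials and database name.
  conn = Dbtools::Database::PostgresqlConnection.new('postgresql://user:secret@localhost:5432/warehouse')

  # Each check returns an array of Dbtools::Database::Violation objects.
  violations = conn.check_reserved_keywords + conn.get_completeness + conn.get_uppercase_columns
  violations.each { |v| puts v }  # Violation#to_s renders JSON

  conn.close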
data/lib/dbtools/database/mysql_connection.rb
@@ -0,0 +1,78 @@
+ require 'dbtools/database/db_connection'
+
+ module Dbtools::Database
+   class MysqlConnection < DbConnection
+
+     # Executes a SQL statement on the connected database.
+     def execute_query(query)
+       # Replace double quotes with backticks, so queries are compatible with MySQL.
+       query.gsub!("\"", "`")
+       # The MySQL ActiveRecord adapter throws an error when the query is empty.
+       result = begin
+         @connection.exec_query(query).to_hash
+       rescue
+         {}
+       end
+       return result
+     end
+
+     # Queries all columns ending in _id that are not covered by an index and
+     # outputs a query to create an index for each of them.
+     def check_indexes
+       sql = %{
+         select cols.table_schema as "table_schema", cols.table_name as "table_name", cols.column_name as "column_name"
+         from information_schema.columns as cols
+         left join information_schema.statistics as stats
+           on cols.table_schema = stats.table_schema
+           and cols.table_name = stats.table_name
+           and cols.column_name = stats.column_name
+         where cols.table_schema not in ('information_schema', 'pg_catalog', 'performance_schema', 'mysql', 'sys')
+           and cols.column_name like '%\_id'
+           and stats.column_name IS NULL}
+
+       violations = []
+       execute_query(sql).each do |h|
+         table_schema = h['table_schema']
+         table_name = h['table_name']
+         column_name = h['column_name']
+         violations << Violation.new(database: database_name,
+                                     metric: "Missing indexes",
+                                     offender: "#{table_schema}.#{table_name}.#{column_name}",
+                                     solution: "CREATE INDEX idx_#{table_schema}_#{table_name}_#{column_name} ON #{table_schema}.#{table_name} (#{column_name}); ")
+       end
+       return violations
+     end
+
+     # Adds a description to a table by adding a comment.
+     def set_description_table(comment, object_name)
+       query = %{ALTER TABLE `#{object_name}` COMMENT = '#{comment}'}
+       execute_query(query)
+     end
+
+     # Returns all databases on the system.
+     def get_all_databases
+       sql = %q{show databases}
+       execute_query(sql).map { |v| v.values }.flatten.to_set
+     end
+
+     # Queries for all tables that don't have comment metadata.
+     def get_tables_without_comments
+       query = %{select t.table_schema as "table_schema", t.table_name as "table_name"
+                 from information_schema.tables as t
+                 where t.TABLE_SCHEMA not in ('information_schema', 'performance_schema', 'mysql', 'sys', 'pg_catalog')
+                   and t.table_comment = ''}
+
+       violations = []
+       execute_query(query).each do |h|
+         table_schema = h['table_schema']
+         table_name = h['table_name']
+         violations << Violation.new(database: database_name,
+                                     metric: "Table without comments",
+                                     offender: "#{table_schema}.#{table_name}")
+       end
+       return violations
+     end
+
+   end
+ end
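For illustration of the MySQL adapter's missing-index report (the schema, table and column names below are made up, and the URL is a placeholder), the solution field of each violation carries a ready-to-run statement:

  require 'dbtools/database/mysql_connection'

  conn = Dbtools::Database::MysqlConnection.new('mysql2://user:secret@localhost/shop')  # placeholder URL
  conn.check_indexes.each do |violation|
    # violation.solution holds a statement such as:
    #   CREATE INDEX idx_shop_orders_customer_id ON shop.orders (customer_id);
    puts violation.solution
  end
  conn.close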
data/lib/dbtools/database/postgresql_connection.rb
@@ -0,0 +1,132 @@
+ require 'dbtools/database/db_connection'
+
+ module Dbtools::Database
+   class PostgresqlConnection < DbConnection
+
+     # Executes a SQL query on the connected database.
+     def execute_query(query)
+       result = begin
+         @connection.exec_query(query)
+       rescue
+         {}
+       end
+       return result
+     end
+
+     # Analyzes the tables and checks if there are more sequential scans
+     # than index scans. Suggests where indexes could be created.
+     def analyze_missing_indexes
+       sql = %{
+         SELECT relname, seq_scan-idx_scan AS too_much_seq, case when seq_scan-idx_scan>0 THEN 'Missing Index?' ELSE 'OK' END, pg_relation_size(relname::regclass) AS rel_size, seq_scan, idx_scan
+         FROM pg_stat_all_tables
+         WHERE schemaname ='public' AND pg_relation_size(relname::regclass)>80000 ORDER BY too_much_seq DESC;
+       }
+       execute_query(sql).each do |index|
+         puts index
+       end
+     end
+
+     # Queries all columns ending in _id that are not covered by an index and
+     # outputs a query to create an index for each of them.
+     def check_indexes
+       sql = %{
+         select cols.table_schema as "table_schema", cols.table_name as "table_name", cols.column_name as "column_name"
+         from
+           (select c.table_schema, c.table_name, c.column_name
+            from information_schema.columns as c
+            where c.column_name != 'ID' and c.column_name != 'id' and c.column_name like '%\\_id'
+              and c.table_schema not in ('information_schema', 'pg_catalog', 'performance_schema', 'mysql', 'sys')) as cols
+         left join
+           (SELECT UNNEST(ARRAY(
+              SELECT pg_get_indexdef(idx.indexrelid, k + 1, true)
+              FROM generate_subscripts(idx.indkey, 1) as k
+              ORDER BY k
+            )) as indkey_names
+            FROM pg_index as idx
+            JOIN pg_class as i ON i.oid = idx.indexrelid
+            JOIN pg_am as am ON i.relam = am.oid) as indexes
+           on cols.column_name = indexes.indkey_names
+         where indexes.indkey_names IS NULL}
+       violations = []
+       execute_query(sql).each do |h|
+         table_schema = h['table_schema']
+         table_name = h['table_name']
+         column_name = h['column_name']
+         violations << Violation.new(database: database_name,
+                                     metric: "Missing indexes",
+                                     offender: "#{table_schema}.#{table_name}.#{column_name}",
+                                     solution: "CREATE INDEX idx_#{table_schema}_#{table_name}_#{column_name} ON #{table_schema}.#{table_name} (#{column_name}); ")
+       end
+       return violations
+     end
+
+     # Adds a description to the current database by adding a comment.
+     def set_description_database(comment)
+       query = %{COMMENT ON DATABASE #{@connection.current_database} IS '#{comment}'}
+       execute_query(query)
+     end
+
+     # Adds a description to a table by adding a comment.
+     def set_description_table(comment, object_name)
+       query = %{COMMENT ON TABLE #{object_name} IS '#{comment}'}
+       execute_query(query)
+     end
+
+     # Queries for all tables that don't have comment metadata.
+     def get_tables_without_comments
+       query = %{select t.table_catalog, t.table_schema, t.table_name, d.description
+                 from information_schema.tables as t
+                 join pg_class as c on c.relname = t.table_name
+                 join pg_namespace as n on c.relnamespace = n.oid
+                 left join pg_description as d on c.oid = d.objoid
+                 where t.table_schema not in ('information_schema', 'pg_catalog', 'performance_schema', 'mysql', 'sys')
+                   and d.objoid is null}
+       violations = []
+       execute_query(query).each do |h|
+         table_catalog = h['table_catalog']
+         table_schema = h['table_schema']
+         table_name = h['table_name']
+         violations << Violation.new(database: database_name,
+                                     metric: "Table without comments",
+                                     offender: "#{table_catalog}.#{table_schema}.#{table_name}")
+       end
+       return violations
+     end
+
+     # Returns all databases on the PostgreSQL server.
+     def get_all_databases
+       sql = %q{SELECT datname FROM pg_database WHERE datistemplate = false;}
+       execute_query(sql).map { |v| v.values }.flatten.to_set
+     end
+
+     # Queries for all databases that don't have comment metadata.
+     def get_databases_without_comments
+       query = %{select db.datname
+                 from pg_database as db
+                 left join pg_shdescription as sd on sd.objoid = db.oid
+                 where sd.objoid is null}
+       violations = []
+       execute_query(query).each do |h|
+         datname = h['datname']
+         violations << Violation.new(database: database_name,
+                                     metric: "Database without comment",
+                                     offender: datname)
+       end
+       return violations
+     end
+
+     # Runs the COPY command, streaming the file through STDIN.
+     def copy_from_file(file, table_name, delimiter)
+       raw_connection = @connection.raw_connection
+
+       raw_connection.copy_data(%(COPY "#{table_name}" FROM STDIN DELIMITER '#{delimiter}' CSV HEADER;)) do
+         File.foreach(file) do |line|
+           raw_connection.put_copy_data(line)
+         end
+       end
+     end
+   end
+ end
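A brief sketch of driving copy_from_file above (the file path, table name and URL are placeholders); the CSV is expected to start with a header row, matching the HEADER option in the COPY statement:

  require 'dbtools/database/postgresql_connection'

  conn = Dbtools::Database::PostgresqlConnection.new('postgresql://user:secret@localhost/warehouse')  # placeholder URL
  # Streams data/products.csv into the "products" table via COPY ... FROM STDIN.
  conn.copy_from_file('data/products.csv', 'products', ',')
  conn.close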
data/lib/dbtools/database/violation.rb
@@ -0,0 +1,45 @@
+ require 'json'
+ require 'date'
+ require 'time' # for Time#iso8601
+
+ module Dbtools::Database
+   class Violation
+     attr_reader :metric, :database, :offender, :violating_records, :total_records, :solution, :schema, :table, :column
+
+     def initialize(metric:, database:, offender:,
+                    schema: nil, table: nil, column: nil,
+                    violating_records: nil, total_records: nil, solution: nil)
+       @metric = metric
+       @database = database
+       @offender = offender
+       @violating_records = violating_records
+       @total_records = total_records
+       @solution = solution
+       @timestamp = Time.now.utc.iso8601
+
+       # Save everything in a Hash to make JSON serialization straightforward.
+       @violation = Hash.new
+       @violation['metric'] = @metric
+       @violation['database'] = @database
+       @violation['offender'] = @offender
+
+       @schema, @table, @column = @offender.split(".")
+       @schema ||= schema
+       @table ||= table
+       @column ||= column
+
+       @violation['schema'] = @schema
+       @violation['table'] = @table
+       @violation['column'] = @column
+
+       @violation['violating_records'] = @violating_records unless violating_records.nil?
+       @violation['total_records'] = @total_records unless total_records.nil?
+       @violation['measure'] = @violating_records.to_f / total_records.to_f unless (violating_records.nil? || total_records.to_i.zero?)
+       @violation['solution'] = @solution unless solution.nil?
+       @violation['timestamp'] = @timestamp
+     end
+
+     def to_s
+       @violation.to_json
+     end
+   end
+ end
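To make the serialized shape concrete, a Violation built with invented values renders through to_s as follows (timestamp abbreviated):

  require 'dbtools/database/violation'

  v = Dbtools::Database::Violation.new(database: 'warehouse',
                                       metric: 'Completeness',
                                       offender: 'public.products.description',
                                       violating_records: 12,
                                       total_records: 480)
  puts v
  # => {"metric":"Completeness","database":"warehouse","offender":"public.products.description",
  #     "schema":"public","table":"products","column":"description",
  #     "violating_records":12,"total_records":480,"measure":0.025,"timestamp":"..."}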