dbtools 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +333 -0
- data/Thorfile +1 -0
- data/bin/dbtools +5 -0
- data/config/client_secret_dbtools.json +1 -0
- data/config/config.yml +1 -0
- data/config/database_config.yml +12 -0
- data/config/databases.txt +5 -0
- data/config/schedule.rb +8 -0
- data/dbtools.gemspec +37 -0
- data/lib/dbtools.rb +47 -0
- data/lib/dbtools/constants.rb +847 -0
- data/lib/dbtools/converter/csv2rdf_converter.rb +68 -0
- data/lib/dbtools/converter/csv_importer.rb +107 -0
- data/lib/dbtools/converter/excel2csv_converter.rb +40 -0
- data/lib/dbtools/converter/google_drive2_rdf_converter.rb +97 -0
- data/lib/dbtools/database/database_data.rb +146 -0
- data/lib/dbtools/database/db_connection.rb +236 -0
- data/lib/dbtools/database/mysql_connection.rb +78 -0
- data/lib/dbtools/database/postgresql_connection.rb +132 -0
- data/lib/dbtools/database/violation.rb +45 -0
- data/lib/dbtools/google_drive/google_drive_api.rb +211 -0
- data/lib/dbtools/google_drive/google_drive_entity.rb +22 -0
- data/lib/dbtools/google_drive/google_drive_file.rb +10 -0
- data/lib/dbtools/google_drive/google_drive_folder.rb +9 -0
- data/lib/dbtools/plsql_functions/connect_server.sql +30 -0
- data/lib/dbtools/plsql_functions/link.sql +17 -0
- data/lib/dbtools/plsql_functions/unlink.sql +15 -0
- data/lib/dbtools/rdf/rdf_reader.rb +136 -0
- data/lib/dbtools/version.rb +3 -0
- data/lib/rdf/geophy.rb +27 -0
- data/lib/tasks/aws.rb +43 -0
- data/lib/tasks/backup.rb +107 -0
- data/lib/tasks/check.rb +220 -0
- data/lib/tasks/ckan.rb +151 -0
- data/lib/tasks/convert.rb +139 -0
- data/lib/tasks/dump.rb +110 -0
- data/lib/tasks/googledrivetool.rb +252 -0
- data/lib/tasks/import.rb +142 -0
- data/lib/tasks/postgres.rb +29 -0
- metadata +307 -0
data/lib/rdf/geophy.rb
ADDED
@@ -0,0 +1,27 @@
+require 'rdf'
+module RDF
+  # Quick and dirty way to put the RDF vocabulary for the Spira models in one place.
+  class Geophy
+    def self.vocab
+      return RDF::URI.new('http://geophy.io/google_drive#')
+    end
+
+    def self.GoogleDriveEntity
+      return self.vocab.join('google_drive/entity')
+    end
+
+    def self.GoogleDriveFile
+      return self.vocab.join('google_drive/file')
+    end
+
+    def self.GoogleDriveFolder
+      return self.vocab.join('google_drive/folder')
+    end
+
+    def self.ChangesPageToken
+      return self.vocab.join('google_drive/changes_page_token')
+    end
+
+  end
+end
+
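For context, a minimal sketch of how these vocabulary helpers resolve (illustrative usage, not part of the gem; it only assumes the `rdf` gem and the file above on the load path):

require 'rdf'
require 'rdf/geophy'

RDF::Geophy.vocab             # => RDF::URI for 'http://geophy.io/google_drive#'
RDF::Geophy.GoogleDriveFile   # => the vocabulary base joined with 'google_drive/file'
RDF::Geophy.ChangesPageToken  # => the vocabulary base joined with 'google_drive/changes_page_token'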
data/lib/tasks/aws.rb
ADDED
@@ -0,0 +1,43 @@
+require 'thor'
+require 'aws-sdk'
+
+class Dbtools::Aws < Thor
+  package_name "dbtools"
+
+  def initialize(*args)
+    super
+    credentials_path = File.join(Dir.home, '.aws', 'credentials')
+    if !File.exists?(credentials_path) && (ENV['AWS_ACCESS_KEY_ID'].nil? || ENV['AWS_SECRET_ACCESS_KEY'].nil?)
+      STDERR.puts "No credentials for AWS found. You might want to configure them. " +
+        "Your credentials should be configured in ~/.aws/credentials, or in the environmental variables ENV['AWS_ACCESS_KEY_ID'] and ENV['AWS_SECRET_ACCESS_KEY']." +
+        "\nSee https://aws.amazon.com/blogs/security/a-new-and-standardized-way-to-manage-credentials-in-the-aws-sdks/ for more info. "
+    end
+  end
+
+  desc 'upload_to_s3 [file, bucket, key]', 'Uploads a file to S3. This requires your credentials to be configured in ~/.aws/credentials.'
+  long_desc <<-LONGDESC
+    `upload_to_s3 [file, bucket, key]` uploads a file to S3. You must specify the bucket name, along with
+    the key.
+    This task requires your credentials to be configured in ~/.aws/credentials,
+    or in the following environmental variables: ENV['AWS_ACCESS_KEY_ID'] and ENV['AWS_SECRET_ACCESS_KEY']
+
+    Example ~/.aws/credentials:
+
+    [default]
+    aws_access_key_id=ABCDEF123
+    aws_secret_access_key=+aBcDeF123
+    region=eu-central-1
+
+    Example usage:
+    \x5$ dbtools aws upload_to_s3 /tmp/file.txt bucket backups/something.txt
+  LONGDESC
+  method_option :prefix => :string, :default => nil
+  def upload_to_s3(file, bucket, key)
+    client = Aws::S3::Client.new
+    resource = Aws::S3::Resource.new(client: client)
+    s3_bucket = resource.bucket(bucket)
+    key = File.join(options[:prefix], key) if !options[:prefix].nil? && !options[:prefix].empty?
+    obj = s3_bucket.object(key)
+    obj.upload_file(file)
+  end
+end
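The Thor task above is a thin wrapper around the aws-sdk resource API; a minimal sketch of the equivalent direct SDK calls (bucket, key and file path are placeholders, and a configured credential chain is assumed):

require 'aws-sdk'

client   = Aws::S3::Client.new                 # picks up ~/.aws/credentials or the AWS_* environment variables
resource = Aws::S3::Resource.new(client: client)
resource.bucket('example-bucket')
        .object('backups/something.txt')
        .upload_file('/tmp/file.txt')          # the SDK handles multipart upload for large files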
data/lib/tasks/backup.rb
ADDED
@@ -0,0 +1,107 @@
+require 'fileutils'
+require 'thor'
+require 'rdf'
+require 'sparql/client'
+
+module Dbtools
+  class Backup < Thor
+    package_name "dbtools"
+
+    # Backs up a rdf graph at a sparql endpoint
+    desc 'rdf [sparql_endpoint]', 'Backup a RDF store at a SPARQL endpoint.'
+    long_desc <<-LONGDESC
+      `rdf [sparql_endpoint]` will create a dump containing all ntriples
+      located at the SPARQL endpoint, and upload it automatically to GoogleDrive or AWS S3.
+      The backup location can be specified with the arguments --googledrive and --aws_s3.
+
+      For Google Drive backups, the folder where the backup will be stored can be given with the --folder=folder_id argument.
+
+      The name can be specified with the optional argument --filename. The default
+      is "metatools_rdf_backup_#{Time.now.strftime('%Y%m%d-%H%M')}.nt".
+
+      Argument --nagios will run the `geophy-nagios-report` command.
+
+      Example:
+      \x5$ dbtools backup rdf http://localhost:9999/blazegraph/namespace/test/sparql --googledrive
+      \x5$ dbtools backup rdf http://localhost:9999/blazegraph/namespace/test/sparql --aws_s3 --bucket=example-bucket --key=example.txt
+    LONGDESC
+    method_option :filename, :type => :string, :default => "blazegraph_#{Time.now.strftime('%Y%m%d-%H%M')}.nt.gz"
+    method_option :folder, :type => :array, :default => ['0Byv6wMVo_JE4MElLLVJUS1U1RE0']
+    method_option :googledrive, :type => :boolean, :default => false
+    method_option :aws_s3, :type => :boolean, :default => false
+    method_option :nagios, :type => :boolean, :default => false
+    method_options :bucket => :string, :key => :string
+    def rdf(sparql_endpoint)
+      if !options[:googledrive] && !options[:aws_s3]
+        error_message = "You must choose either Google Drive or AWS S3 as backup location. " +
+          "Pass --googledrive or --aws_s3 as an argument."
+        STDERR.puts error_message
+        `sudo /usr/local/bin/geophy-nagios-report -s blazegraph-backup -m "#{error_message}" -c "#{$PROGRAM_NAME}" ERROR` if options[:nagios]
+        return
+      end
+
+      if options[:aws_s3]
+        if options[:bucket].nil? || options[:key].nil?
+          error_message = "Bucket and key must be specified. Use the arguments --bucket= and --key="
+          STDERR.puts error_message
+          `sudo /usr/local/bin/geophy-nagios-report -s blazegraph-backup -m "#{error_message}" -c "#{$PROGRAM_NAME}" ERROR` if options[:nagios]
+          return
+        end
+      end
+
+      filename = options[:filename]
+      tempfile = Tempfile.new(['backup', '.nt.gz'])
+      begin
+        invoke "dbtools:dump:rdf", [sparql_endpoint, tempfile.path], :compress => true
+        invoke "dbtools:google_drive:upload", [tempfile.path], :filename => filename, :folder => options[:folder] if options[:googledrive]
+        invoke "dbtools:aws:upload_to_s3", [tempfile.path, options[:bucket], options[:key]], :attributes => false if options[:aws_s3]
+        `sudo /usr/local/bin/geophy-nagios-report -s blazegraph-backup -m "Backup successful" -c "#{$PROGRAM_NAME}" OK` if options[:nagios]
+      rescue Exception => e
+        `sudo /usr/local/bin/geophy-nagios-report -s blazegraph-backup -m "#{e.message}" -c "#{$PROGRAM_NAME}" ERROR` if options[:nagios]
+      ensure
+        tempfile.close
+        tempfile.unlink
+      end
+    end
+
+    desc 'blazegraph [url]', 'Dumps a blazegraph database to a jnl file.'
+    long_desc <<-LONGDESC
+      `blazegraph [sparql_endpoint]` will create a backup of a blazegraph database,
+      using the built in backup function, and upload it to GoogleDrive or AWS S3.
+      The backup location can be specified with the arguments --googledrive and --aws_s3.
+
+      For Google Drive backups, the folder of the backup can be given with the --folder=folder_id argument.
+      The name can be specified with the optional argument --filename. The default
+      is "metatools_rdf_backup_#{Time.now.strftime('%Y%m%d-%H%M')}.jnl.gz".
+
+      Example:
+      \x5$ dbtools backup rdf http://localhost:9999/blazegraph/backup --googledrive
+      \x5$ dbtools backup rdf http://localhost:9999/blazegraph/backup --aws_s3 --bucket=example-bucket --key=example.txt
+    LONGDESC
+    option :filename, :type => :string, :default => "metatools_rdf_backup_#{Time.now.strftime('%Y%m%d-%H%M')}.jnl.gz"
+    option :folder, :type => :array, :default => ['0Byv6wMVo_JE4MElLLVJUS1U1RE0']
+    option :googledrive, :type => :boolean, :default => false
+    option :aws_s3, :type => :boolean, :default => false
+    options :bucket => :string, :key => :string
+    def blazegraph(url)
+      if !options[:googledrive] && !options[:aws_s3]
+        STDERR.puts "You must choose either Google Drive or AWS S3 as backup location. " +
+          "Pass --googledrive or --aws_s3 as an argument."
+        return
+      end
+      if options[:aws_s3]
+        STDERR.puts "Bucket and key must be specified. Use the arguments --bucket= and --key=." if options[:bucket].nil? || options[:key].nil?
+        return
+      end
+
+      filename = File.join(Dir.pwd, options[:filename])
+      begin
+        invoke "dbtools:dump:blazegraph", [url, filename], :compress => true
+        invoke "dbtools:google_drive:upload", [filename], :folder => options[:folder]
+        invoke "dbtools:aws:upload_to_s3", [filename, options[:bucket], options[:key]] if options[:aws_s3]
+      ensure
+        File.delete(filename) if File.exists?(filename)
+      end
+    end
+  end
+end
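Both tasks are normally driven from the command line, as in the long_desc examples above; a hedged sketch of invoking the same Thor task from Ruby instead (endpoint, bucket and key are placeholders, and lib/ is assumed to be on the load path, matching the gem's own `require 'tasks/import'` style):

require 'tasks/backup'

# Thor's class-level `start` parses an ARGV-style array, just like the CLI would.
Dbtools::Backup.start(%w[
  rdf http://localhost:9999/blazegraph/namespace/test/sparql
  --aws_s3 --bucket=example-bucket --key=backups/example.nt.gz
])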
data/lib/tasks/check.rb
ADDED
@@ -0,0 +1,220 @@
+require 'dbtools/database/db_connection'
+require 'dbtools/database/postgresql_connection'
+require 'dbtools/database/mysql_connection'
+require 'thor'
+# require 'slack-notifier'
+require 'dbtools/constants'
+
+module Dbtools
+  class Check < Thor
+    package_name "dbtools"
+
+    desc "all [url]", "Run all tasks on this database."
+    def all(url)
+      @url = url
+      db = check_adapter(url)
+      return if db.nil?
+
+      output_result(db.check_indexes)
+      output_result(db.check_reserved_keywords)
+      output_result(db.get_uppercase_columns)
+      output_result(db.get_completeness)
+      output_result(db.get_syntax_compression)
+      output_result(db.get_inverse_functional_property)
+      db.close
+    end
+
+    desc "all_databases", "Run all tasks on all databases it finds. Specify the credentials in ~/.dbtools/database_config.yml"
+    def all_databases()
+      load_config
+      @config.each do |k, db_credentials|
+        begin
+          db_connection = check_adapter(db_credentials)
+          databases = db_connection.get_all_databases
+          databases.each do |database|
+            next if Dbtools::Constants::IGNORE_DATABASES.include?(database)
+            db_credentials['database'] = database
+            self.all(db_credentials)
+          end
+        rescue Exception
+          nil
+        end
+      end
+    end
+
+    desc "indexes [URL]", "This task runs the function \'create_indexes\' on the database. Works on a mysql and postgres database. "
+    def indexes(url)
+      @url = url
+      db = check_adapter(url)
+      return if db.nil?
+
+      result = db.check_indexes
+      output_result(result)
+    end
+
+    desc "output_indexes [URL]", "This task runs the function \'create_indexes\' on the database and outputs the result. Works on a mysql and postgres database. Outputs the queries."
+    def output_indexes(url)
+      @url = url
+      db = check_adapter(url)
+      return if db.nil?
+
+      violations = db.check_indexes
+      queries = violations.map do |violation|
+        violation.solution
+      end
+      puts queries.join("\n") unless queries.empty?
+    end
+
+    # Checks if column names or table names include reserved keywords.
+    desc 'keywords [URL]', 'Checks if column names or table names include reserved keywords.'
+    def keywords(url)
+      @url = url
+      db = check_adapter(url)
+      return if db.nil?
+
+      result = db.check_reserved_keywords
+      output_result(result)
+    end
+
+    desc 'completeness [URL]', 'Checks the amount of empty/null entries in the database.'
+    def completeness(url)
+      @url = url
+      db = check_adapter(url)
+      return if db.nil?
+
+      result = db.get_completeness
+      output_result(result)
+    end
+
+    desc 'compression [URL]', 'Checks the amount of entries that can be compressed in the database.'
+    def compression(url)
+      @url = url
+      db = check_adapter(url)
+      return if db.nil?
+
+      result = db.get_syntax_compression
+      output_result(result)
+    end
+
+    desc 'casing [URL]', 'Checks whether all column names are lowercase. '
+    def casing(url)
+      @url = url
+      db = check_adapter(url)
+      return if db.nil?
+
+      result = db.get_uppercase_columns
+      output_result(result)
+    end
+
+    desc 'spelling [URL]', 'Checks whether all column names are correctly spelled. '
+    def spelling(url)
+      @url = url
+      db = check_adapter(url)
+      return if db.nil?
+
+      result = db.check_spelling
+      output_result(result)
+    end
+
+    desc 'table_comments [URL]', 'Checks for table without comment metadata. '
+    def table_comments(url)
+      @url = url
+      db = check_adapter(url)
+      return if db.nil?
+
+      result = db.get_tables_without_comments
+      output_result(result)
+    end
+
+    desc 'database_comments [URL]', 'Checks for databases without comment metadata. '
+    def database_comments(url)
+      @url = url
+      db = check_adapter(url)
+      return if db.nil?
+
+      result = db.get_databases_without_comments
+      output_result(result)
+    end
+
+    desc 'inverse_functional_property [URL]', 'Gets the inverse functional property of the database. '
+    def inverse_functional_property(url)
+      @url = url
+      db = check_adapter(url)
+      return if db.nil?
+
+      result = db.get_inverse_functional_property
+      output_result(result)
+    end
+
+    private
+    # Check if the url is a postgres or mysql connection.
+    def check_adapter(url)
+      adapter = if url.is_a?(Hash)
+                  url['adapter']
+                else
+                  url.match("^([a-zA-Z0-9]+):\/\/(.+)@(.+)\/(.+)").captures[0]
+                end
+      case adapter
+      when "postgres", "postgresql"
+        db = Dbtools::Database::PostgresqlConnection.new(url)
+      when "mysql2"
+        db = Dbtools::Database::MysqlConnection.new(url)
+      else
+        puts "Invalid url"
+        return nil
+      end
+      return db
+    end
+
+    # Loads the config file
+    def load_config()
+      @config = YAML.load_file(Dbtools::Constants::DB_CONFIG_PATH)
+    end
+
+    def output_result(result)
+      result = result.join("\n")
+      #notifier = init_slack_notifier(@url)
+      #notifier.ping(result) unless result.empty?
+      if not result.empty?
+        puts result
+        # Only write if directory is writable
+        if File.writable?(File.dirname(Dbtools::Constants::OUTPUT_FILE))
+          File.open(Dbtools::Constants::OUTPUT_FILE, 'a') { |f| f.puts(result) }
+        end
+      end
+    end
+  end
+  class NoOpHTTPClient
+    def self.post uri, params={}
+    end
+  end
+
+  # "Database name": [
+  #   "metric": {
+  #     "name": "name"
+  #     "counter": 0
+  #     "Violations": [
+  #       "offender": {
+  #         "total_records": 132
+  #         "violating_records": 123
+  #         "measure": 12
+  #         "solution":
+  #       }
+  #     ]
+  #   }]
+  #
+  # {
+  #   "metrics": [{
+  #     "metric": "metric name",
+  #     "counter": "number of violations",
+  #     "violations": {
+  #       "database.schema?.table?.col?": {
+  #         "total_records": "a number",
+  #         "violating_records": "a number",
+  #         "measure": "a number",
+  #         "solution": " a query/text with a proposed solution"
+  #       }
+  #     }
+  #   }]
+  # }
+end
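check_adapter accepts either a connection URL of the form adapter://user:password@host/database, or a credentials Hash with an 'adapter' key (as loaded from ~/.dbtools/database_config.yml). A brief illustrative sketch with placeholder credentials, again assuming lib/ is on the load path:

require 'tasks/check'

# Run every check against a single database; the URL scheme selects postgres vs. mysql2.
Dbtools::Check.start(%w[all postgres://user:secret@localhost/example_db])

# Run every check against every database listed in the credentials file.
Dbtools::Check.start(%w[all_databases])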
data/lib/tasks/ckan.rb
ADDED
@@ -0,0 +1,151 @@
+require 'yaml'
+require 'thor'
+require 'find'
+require 'rdf'
+require 'open-uri'
+require 'dbtools/rdf/rdf_reader'
+require 'dbtools/constants'
+require 'dbtools/converter/csv_importer'
+require 'dbtools/converter/excel2csv_converter'
+require 'dbtools/database/mysql_connection'
+require 'dbtools/database/postgresql_connection'
+require 'dbtools/google_drive/google_drive_api'
+require 'tasks/import'
+require 'fileutils'
+
+module Dbtools
+  class Ckan < Thor
+    package_name "dbtools"
+
+    def initialize(*args)
+      super
+      load_config
+      @gdrive = Dbtools::Google_Drive::Google_drive_api.new
+      @service = @gdrive.service
+      @rdf_graph = @gdrive.rdf_graph
+      @import = Import.new
+    end
+
+    desc 'load_dataset [dataset]', 'Loads a dataset from a CKAN source by querying the rdf graph. Accepts an url containing the UUID or the UUID itself. '
+    def load_dataset(dataset)
+      datasets_metadata = @rdf_graph.get_metadata(dataset)
+      raise "Dataset not found. " if datasets_metadata.empty?
+      datasets_metadata.values.each do |metadata|
+        load_dataset_resource(metadata)
+      end
+    end
+
+    desc 'load_resource [dataset, resource]', 'Loads a single resource from a ckan dataset. Accepts urls containing the UUID or the UUID itself. '
+    def load_resource(dataset, resource)
+      datasets_metadata = @rdf_graph.get_metadata(dataset)
+      puts datasets_metadata.inspect
+      resource_metadata = datasets_metadata.select { |k, v| v['resource'].to_s.include?(resource) }
+
+      raise "Resource not found." if resource_metadata.empty?
+      raise "Multiple resources found." if resource_metadata.length > 1
+
+      load_dataset_resource(resource_metadata.values.first)
+    end
+
+    #desc 'check_missing_databases', 'Checks for databases that are listed in the RDF, but not loaded on the system. Prints the result.'
+    #def check_missing_databases
+      #postgres_databases = PostgresqlConnection.new(@postgres_connection_url).get_all_databases
+      #mysql_databases = MysqlConnection.new(@mysql_connection_url).get_all_databases
+      #installed_databases = postgres_databases + mysql_databases
+      #rdf_databases = @rdf_graph.get_available_databases.map {|k, v| [v['database_title'].gsub(/[^0-9a-zA-Z_]/,'_'), v]}.to_h
+      #missing_databases = rdf_databases.keys.to_set - installed_databases
+      #puts missing_databases.inspect
+      #return missing_databases.map {|title| [title, rdf_databases[title]]}.to_h
+    #end
+
+    #desc 'load_missing_databases', 'Loads all databases that are listed in the RDF, but missing on the system.'
+    #def load_missing_databases
+      #missing_databases = check_missing_databases
+      #missing_databases.each do |database_title, metadata|
+        #load_dataset(metadata['dataset'])
+      #end
+    #end
+
+    desc 'list_databases', 'Lists all databases by querying the rdf graph'
+    def list_databases
+      databases = @rdf_graph.get_available_databases
+      databases.each do |index, res|
+        puts "#{index}. #{res['dataset_title']}"
+      end
+      selection = ask("Which data set do you want to load? ").to_i
+      unless databases.key?(selection)
+        puts 'Data set not found. '
+        return
+      end
+      load_dataset(databases[selection]['dataset'])
+      return databases
+    end
+
+    desc 'load_rdf_in_desc [target_database, ckan_dataset]', 'Loads the RDF metadata into the database description. '
+    def load_rdf_in_desc(target_database, dataset)
+      begin
+        # Open the rdf of the file.
+        description = open("#{dataset}.ttl").read
+
+        # Put the rdf in the comments
+        psql = Dbtools::Database::PostgresqlConnection.new(target_database)
+        psql.set_description_database(description)
+        psql.close
+      rescue
+        puts "Could not open rdf from dataset: #{dataset}"
+      end
+
+    end
+
+    private
+    def load_dataset_resource(metadata)
+      dataset = metadata['dataset'].to_s
+      table_name = metadata['resource_title'].gsub(/[^0-9a-zA-Z_]/,'_')
+      database_name = metadata['database_title'].gsub(/[^0-9a-zA-Z_]/,'_')
+      format = metadata['format'].gsub(/[^0-9a-zA-Z_]/,'_')
+      folder = "/tmp/#{database_name}"
+
+      # Create folder if it doesn't exist
+      FileUtils.mkdir_p(folder)
+
+      begin
+        file_id = @gdrive.get_file_id(metadata['access_url'])
+        file_name = @service.get_file(file_id).name
+        destination = File.join(folder, file_name)
+        @service.get_file(file_id, download_dest: destination)
+        connection = load_database(database_name, destination, format, table_name: table_name)
+        load_rdf_in_desc(connection, dataset)
+      end
+    end
+
+
+    # Loads a database into either postgres or mysql, depending on the format.
+    def load_database(database_name, file, format, table_name: '')
+      return case format.downcase
+      when /postgres/
+        @import.postgres_dump(database_name, file)
+      when /mysql/
+        @import.mysql_dump(database_name, file)
+      when /csv/, /txt/
+        @import.csv_in_postgres(file, database_name, table_name)
+      when /xls/
+        @import.excel(database_name, file)
+      else
+        puts "Can't load #{format} file."
+        return nil
+      end
+    end
+
+    # Loads all configurations needed
+    def load_config
+      config = YAML.load(File.read(Dbtools::Constants::DB_TARGET_CONFIG_PATH))
+      postgres_config = config['postgres']
+      mysql_config = config['mysql']
+      @postgres_connection_url = "postgres://#{postgres_config['username']}:#{postgres_config['password']}@#{postgres_config['host']}/"
+      @mysql_connection_url = "mysql2://#{mysql_config['username']}:#{mysql_config['password']}@#{mysql_config['host']}/"
+      @postgres_connection_options = "--username=#{postgres_config['username']} --host=#{postgres_config['host']} --port=#{postgres_config['port']}"
+      @postgres_connection_command = "psql #{@postgres_connection_options}"
+      @mysql_connection_command = "mysql -u #{mysql_config['username']} -p#{mysql_config['password']} -h #{mysql_config['host']}"
+    end
+  end
+end
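load_config reads the target-database credentials from the YAML file at Dbtools::Constants::DB_TARGET_CONFIG_PATH. Judging only from the keys the method accesses, that file is expected to look roughly like the sketch below (hosts and credentials are placeholders; any further keys are not shown here because the code above does not read them):

postgres:
  username: dbtools
  password: secret
  host: localhost
  port: 5432
mysql:
  username: dbtools
  password: secret
  host: localhost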