RubyGems - dbtools - Versions diffs - 0.5.2 - Mend

dbtools 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

checksums.yaml +7 -0
data/README.md +333 -0
data/Thorfile +1 -0
data/bin/dbtools +5 -0
data/config/client_secret_dbtools.json +1 -0
data/config/config.yml +1 -0
data/config/database_config.yml +12 -0
data/config/databases.txt +5 -0
data/config/schedule.rb +8 -0
data/dbtools.gemspec +37 -0
data/lib/dbtools.rb +47 -0
data/lib/dbtools/constants.rb +847 -0
data/lib/dbtools/converter/csv2rdf_converter.rb +68 -0
data/lib/dbtools/converter/csv_importer.rb +107 -0
data/lib/dbtools/converter/excel2csv_converter.rb +40 -0
data/lib/dbtools/converter/google_drive2_rdf_converter.rb +97 -0
data/lib/dbtools/database/database_data.rb +146 -0
data/lib/dbtools/database/db_connection.rb +236 -0
data/lib/dbtools/database/mysql_connection.rb +78 -0
data/lib/dbtools/database/postgresql_connection.rb +132 -0
data/lib/dbtools/database/violation.rb +45 -0
data/lib/dbtools/google_drive/google_drive_api.rb +211 -0
data/lib/dbtools/google_drive/google_drive_entity.rb +22 -0
data/lib/dbtools/google_drive/google_drive_file.rb +10 -0
data/lib/dbtools/google_drive/google_drive_folder.rb +9 -0
data/lib/dbtools/plsql_functions/connect_server.sql +30 -0
data/lib/dbtools/plsql_functions/link.sql +17 -0
data/lib/dbtools/plsql_functions/unlink.sql +15 -0
data/lib/dbtools/rdf/rdf_reader.rb +136 -0
data/lib/dbtools/version.rb +3 -0
data/lib/rdf/geophy.rb +27 -0
data/lib/tasks/aws.rb +43 -0
data/lib/tasks/backup.rb +107 -0
data/lib/tasks/check.rb +220 -0
data/lib/tasks/ckan.rb +151 -0
data/lib/tasks/convert.rb +139 -0
data/lib/tasks/dump.rb +110 -0
data/lib/tasks/googledrivetool.rb +252 -0
data/lib/tasks/import.rb +142 -0
data/lib/tasks/postgres.rb +29 -0
metadata +307 -0

data/lib/tasks/convert.rb ADDED

@@ -0,0 +1,139 @@
+require 'rdf'
+require 'linkeddata'
+require 'dbtools/converter/csv2rdf_converter'
+module Dbtools
+  class Convert < Thor
+    package_name "dbtools"
+    def initialize(*args) # accepts whatever arguments the caller supplies
+      super # bare super forwards the same arguments to the parent initializer
+    end
+    # desc 'csv2rdf [csv_file, format, metadata]', 'Converts a csv file to a RDF file. Output file has the same name as input file, with a different extension.'
+    # def csv2rdf(csv_file, format = 'ttl', metadata_file = nil)
+    #   if csv_file
+    #     # Output as the same file, but with a different extension.
+    #     rdf_file = change_filename_extension(csv_file, format)
+    #     RDF::Writer.open(rdf_file) do |f|
+    #       # Load the csv into rdf graph, and write output to file.
+    #       options = {}
+    #       options[:metadata] = metadata_file if metadata_file
+    #       graph = RDF::Graph.load(csv_file, options)
+    #       f << graph
+    #     end
+    #   end
+    # end
+    desc 'excel2csv [excel_file, sheet_index, output_file(optional)]', 'Converts an excel file to CSV file.'
+    long_desc <<-LONGDESC
+      `excel2csv` converts an excel file to csv format. The sheet index should be specified.
+      It will default to the first sheet by default.
+    LONGDESC
+    # option :sheet,
+    def excel2csv(excel_file, sheet_index = 0, output_file = nil)
+      converter = Dbtools::Converter::Excel2csv_converter.new(excel_file)
+      if output_file.nil?
+        STDOUT << converter.sheet2csv(sheet_index)
+      else
+        File.open(output_file, 'a') do |f|
+          f << converter.sheet2csv(sheet_index)
+        end
+      end
+    end
+    desc 'csv2rdf [csv_file, uri, compressed(optional)]', 'Converts a csv file to a RDF ntriples file.'
+    long_desc <<-LONGDESC
+      `csv2rdf csv_file uri` will convert a csv file to a RDF NTriples file.
+      The URI will be the subject in the resulting RDF file.
+      You can optionally specify a third parameter, which will write
+      the output to a file. You can also specify a fourth parameter to compress
+      the file to .gz format.
+      Example:
+      \x5$ dbtools convert csv2rdf data.csv http://geophy.io output.nt
+      Resulting triples will look like:
+      <http://geophy.io#ROWNUMBER> <http://geophy.io/COLUMNNAME> VALUE .
+      <http://geophy.io#1> <http://geophy.io/name> "Bob" .
+    LONGDESC
+    def csv2rdf(csv_file, uri, output_file=nil, compressed=false)
+      csv_rdf = Dbtools::Converter::Csv2rdf_converter.new(csv_file, uri)
+      if output_file.nil?
+        csv_rdf.each_triple do |triple|
+          puts triple
+        end
+      else
+        begin
+          file = if compressed
+                   Zlib::GzipWriter.open(output_file + '.gz')
+                 else
+                   File.open(output_file, 'w')
+                 end
+          csv_rdf.each_triple do |triple|
+            file.write(triple << "\n")
+          end
+        ensure
+          file.close unless file.nil?
+        end
+      end
+    end
+    desc 'googledrive2rdf [file_id]', 'Converts a csv file from Google Drive to a RDF ntriples file. '
+    long_desc <<-LONGDESC
+      `googledrive2rdf file_id` will download a file from Google Drive and
+      convert that file to a RDF NTriples file.
+      The Google Drive file id will be the subject in the resulting RDF file.
+      You can optionally specify a second parameter, which will write
+      the output to a file. To compress the file to .gz, you can specify a third parameter.
+      Example:
+      \x5$ dbtools convert googledrive2rdf 0Byv6wMVo_JE4WGR6QWc2S3NiQjg output.nt true
+    LONGDESC
+    def googledrive2rdf(file_id, output_file=nil, compressed=false)
+      output_dir = File.join('/tmp', 'dbtools_googledrive/')
+      file_path = invoke("dbtools:google_drive:download", [file_id, output_dir])
+      uri = "https://drive.google.com/open?id=" << file_id
+      invoke "dbtools:convert:csv2rdf", [file_path, uri, output_file, compressed]
+    ensure
+      FileUtils.remove_entry_secure(output_dir)
+    end
+    # desc 'excel2rdf [excel_file, format, metadata]', 'Converts an excel file to an RDF file. Output file has the same name as input file, with a different extension.'
+    # def excel2rdf(excel_file, format = 'ttl', metadata_file = nil)
+    #   if excel_file
+    #     # Use tmp dir to output csv files.
+    #     output_dir = File.join('/tmp', 'excel2csv')
+    #     files = Dbtools::Import.new.excel2csv(excel_file, output_dir)
+    #     # Output as the same file, but with a different extension.
+    #     rdf_file = change_filename_extension(excel_file, format)
+    #     RDF::Writer.open(rdf_file) do |f|
+    #       options = {}
+    #       # Use metadata file for mapping if specified.
+    #       options[:metadata] = metadata_file if metadata_file
+    #       graph = RDF::Graph.new
+    #       files.each do |sheetname, csv_file|
+    #         # Load all sheets in the graph.
+    #         graph.load(csv_file, options)
+    #       end
+    #       f << graph
+    #     end
+    #     # Remove tmp dir
+    #     FileUtils.remove_entry_secure(output_dir)
+    #   end
+    # end
+    private
+    # Change the extension of the given filename with the new extension
+    def change_filename_extension(file, extension)
+      # Strip current extension from file name
+      file_extless = File.join(File.dirname(file), File.basename(file, ".*"))
+      ext = extension.delete(".")
+      return file_extless << "." << ext
+    end
+  end
+end

data/lib/tasks/dump.rb ADDED

@@ -0,0 +1,110 @@
+require 'fileutils'
+require 'thor'
+require 'rdf'
+require 'sparql/client'
+module Dbtools
+  class Dump < Thor
+    package_name "dbtools"
+    # Backs up a rdf graph at a sparql endpoint
+    desc 'rdf [sparql_endpoint, filename]', 'Dumps a rdf database to a file.'
+    long_desc <<-LONGDESC
+      `rdf [sparql_endpoint, filename]` will write all ntriple statements
+      located in the RDF repository to the specified file.
+      Example:
+      \x5$ dbtools dump rdf http://localhost:9999/blazegraph/namespace/test/sparql /tmp/repository.nt --compress=false
+      \x5$ dbtools dump rdf http://localhost:9999/blazegraph/namespace/test/sparql /tmp/repository.nt.gz --compress
+    LONGDESC
+    option :compress, :default => false, :type => :boolean
+    def rdf(sparql_endpoint, filename)
+      repo = SPARQL::Client::Repository.new(uri: sparql_endpoint)
+      # Use temp name when compressing
+      if options[:compress]
+        target = filename + '_tmp.nt'
+        STDERR.puts %q[Warning: compress option is selected, but filename doesn't end with .gz.
+You should change the name to end in .gz if you want to open it.] if File.extname(filename) != '.gz'
+      else
+        target = filename
+      end
+      RDF::Writer.open(target, format: :ntriples) do |w|
+        repo.each {|stmt| w << stmt}
+      end
+      # Zlib::GzipWriter can't wrap around RDF::Writer, because RDF::Writer is not a IO-like object...
+      if options[:compress]
+        # Compress the file using gz
+        Zlib::GzipWriter.open(filename) do |gz|
+          # Write in chunks.
+          File.open(target) do |fp|
+            while chunk = fp.read(16 * 1024) do
+              gz.write chunk
+            end
+          end
+          gz.close
+        end
+        # Delete the original file after generating the compressed version.
+        File.delete(target) if File.exists?(target)
+      end
+    end
+    desc 'blazegraph [url, file]', 'Dumps a blazegraph database to a jnl file.'
+    long_desc <<-LONGDESC
+      `blazegraph [url, file]` will create a backup of a blazegraph database,
+      using the built in backup function, and output it to the specified file.
+      It is recommended to use absolute filepaths.
+      Note: the resulting file will reside on the same server as where blazegraph
+            is running.
+      Example:
+      \x5$ dbtools backup blazegraph http://localhost:9999/blazegraph/backup /tmp/dump.jnl
+    LONGDESC
+    option :compress, :default => false, :type => :boolean
+    def blazegraph(url, file)
+      uri = URI(url)
+      params = { :compress => options[:compress],
+                 :file => file }
+      uri.query = URI.encode_www_form(params)
+      res = Net::HTTP.get_response(uri)
+    end
+    # Creates a schema dump of the database. Specify the database with an url.
+    desc 'schema URL PATH', 'Creates a schema dump of the database. Specify the database with an url.'
+    def schema(url, path)
+      adapter, user, password, host, database = url.match("^([a-zA-Z0-9]+):\/\/(.+):(.+)@(.+)\/(.+)").captures
+      case adapter
+        when "mysql2"
+          dump_mysql_schema(user, password, database, host, path)
+        when "postgres"
+          dump_postgresql_schema(user, password, database, host, path)
+        else
+          puts "Not supported database"
+      end
+    end
+    private
+    # Dumps a mysql database schema to a file.
+    def dump_mysql_schema(user, password, database, host, path)
+      dump_path = File.join(path, 'mysql', host)
+      FileUtils::mkdir_p(dump_path)
+      dump_file_name = File.join(dump_path, "#{database}_schema.sql")
+      puts "Dumping schema to #{dump_file_name}"
+      system "mysqldump -u #{user} -p#{password} -h #{host} --no-data #{database} > #{dump_file_name}"
+    end
+    # Dumps a postgres database schema to a file.
+    def dump_postgresql_schema(user, password, database, host, path)
+      dump_path = File.join(path, "postgres", host)
+      FileUtils::mkdir_p(dump_path)
+      dump_file_name = File.join(dump_path, "#{database}_schema.sql")
+      puts "Dumping schema to #{dump_file_name}"
+      system "pg_dump -h #{host} --dbname=#{database} --username=#{user} --schema-only > #{dump_file_name}"
+    end
+  end
+end

data/lib/tasks/googledrivetool.rb ADDED

@@ -0,0 +1,252 @@
+require 'yaml'
+require 'thor'
+require 'find'
+require 'dbtools/rdf/rdf_reader'
+require 'dbtools/constants'
+require 'dbtools/converter/csv_importer'
+require 'dbtools/converter/google_drive2_rdf_converter'
+require 'dbtools/converter/excel2csv_converter'
+require 'dbtools/database/mysql_connection'
+require 'dbtools/database/postgresql_connection'
+require 'dbtools/google_drive/google_drive_api'
+require 'fileutils'
+require 'googleauth'
+require 'googleauth/stores/file_token_store'
+require 'google/apis/drive_v3'
+module Dbtools
+  class FormatNotSupportedError < StandardError; end
+  class Google_drive < Thor
+    def initialize(*args)
+      super
+      @gdrive = Google_Drive::Google_drive_api.new(auth=authorize)
+      @service = @gdrive.service
+    end
+    desc 'export [File_id, format, target_dest(optional)]', 'Exports a file stored on Google Drive. Like Google Spreadsheets etc. For regular files, use download. If no target directory is specified, it will print the contents. Supported export formats: https://developers.google.com/drive/v3/web/manage-downloads#downloading_a_file'
+    long_desc <<-LONGDESC
+      `export` will download a Google document stored on Google Drive, like Google Spreadsheets, Google slides etc.
+      You need to specify the export format. The supported conversion formats can be found here:
+      https://developers.google.com/drive/v3/web/manage-downloads#downloading_a_file
+      An optional target directory can be given. This will download the file into the directory,
+      using the same name as on Google Drive.
+      If no target directory is given, the file will be streamed to STDOUT.
+      Examples:
+      \x5$ dbtools google_drive export 0B67ew1eLtcXxeUVmTndialhTRTA 'text/plain' /tmp/target_dir/"
+      \x5$ dbtools google_drive export 0B67ew1eLtcXxeUVmTndialhTRTA 'application/pdf' > /tmp/test.pdf"
+    LONGDESC
+    def export(file_id, target_format, target_dest = nil)
+      if target_dest.nil?
+        @service.export_file(file_id, target_format, download_dest: STDOUT)
+      else
+        file = @service.get_file(file_id)
+        FileUtils.mkdir_p(target_dest)
+        extension_mapping = { 'text/html'                                                                 => '.html',
+                              'text/plain'                                                                => '.txt',
+                              'application/rtf'                                                           => '.rtf',
+                              'application/vnd.oasis.opendocument.text'                                   => '.odt',
+                              'application/pdf'                                                           => '.pdf',
+                              'application/vnd.openxmlformats-officedocument.wordprocessingml.document'   => '.docx',
+                              'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'         => '.xlsx',
+                              'application/x-vnd.oasis.opendocument.spreadsheet'                          => '.ods',
+                              'text/csv'                                                                  => '.csv',
+                              'image/jpeg'                                                                => '.jpg',
+                              'image/png'                                                                 => '.png',
+                              'image/svg+xml'                                                             => '.svg',
+                              'application/vnd.openxmlformats-officedocument.presentationml.presentation' => '.pptx',
+                              'application/vnd.google-apps.script+json'                                   => '.json' }
+        extension_mapping.default = ''
+        extension = extension_mapping[target_format]
+        destination = File.join(target_dest, file.name + extension)
+        @service.export_file(file_id, target_format, download_dest: destination)
+        return destination
+      end
+    end
+    desc 'download [File_id, target_dest(optional)]', 'Downloads a file stored on google drive. Regular files, no Google Spreadsheets etc. For Google documents, use export. If no target directory is specified, it will print the contents. '
+    long_desc <<-LONGDESC
+      `download` will download a file stored on Google Drive.
+      An optional target directory can be given. This will download the file into the directory,
+      using the same name as on Google Drive. If a Google Doc file is given, it will automatically
+      export it to the most commonly used format.
+      If no target directory is given, the file will be streamed to STDOUT.
+      Examples:
+      \x5$ dbtools google_drive download 0B67ew1eLtcXxeUVmTndialhTRTA /tmp/target_destination/"
+      \x5$ dbtools google_drive download 0B67ew1eLtcXxeUVmTndialhTRTA > /tmp/image.jpg"
+    LONGDESC
+    def download(file_id, target_dest = nil)
+      metadata = @service.get_file(file_id)
+      if metadata.mime_type.index('application/vnd.google-apps')
+        # Default conversion formats
+        googledoc_format_conversion = { 'application/vnd.google-apps.document' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+                                        'application/vnd.google-apps.presentation' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+                                        'application/vnd.google-apps.spreadsheet' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+                                        'application/vnd.google-apps.drawing' => 'image/png' }
+        target_format = googledoc_format_conversion[metadata.mime_type]
+        raise FormatNotSupportedError.new("Mimetype #{metadata.mime_type} is not supported. If you know the target conversion format, try using the `export` task.") if target_format.nil?
+        return export(file_id, target_format, target_dest)
+      end
+      if target_dest.nil?
+        @service.get_file(file_id, download_dest: STDOUT)
+      else
+        destination = target_dest
+        if target_dest.is_a?(String)
+          FileUtils.mkdir_p(target_dest)
+          destination = File.join(target_dest, metadata.name)
+        end
+        @service.get_file(file_id, download_dest: destination)
+        return destination
+      end
+    end
+    desc 'serialize_rdf [target_file]', 'Serializes the entire Google Drive to a RDF representation. '
+    long_desc <<-LONGDESC
+      `serialize_rdf` will serialize the entire Google Drive to a RDF representation. When no target file
+      is specified, it will output the results to STDOUT.
+      An optional `query` parameter can be given, to only serialize matching results.
+      See the `Search for files` guide from the Google Drive API for supported syntax.
+      Example:
+      \x5$ dbtools google_drive serialize_rdf output.nt --query="name contains 'Data Lake'"
+    LONGDESC
+    option :query, :option => '', :type => :string
+    option :verbose, :default => false, :type => :boolean
+    def serialize_rdf(target_file = nil)
+      f = if target_file.nil? then STDOUT else File.open(target_file, 'w') end
+      begin
+        print_progress = options[:verbose]
+        files = @gdrive.get_tree(optional_query=options[:query], verbose: print_progress)
+        googledrive2rdf_converter = Dbtools::Converter::GoogleDrive2RDFConverter.new
+        googledrive2rdf_converter.serialize_as_rdf(files=files, verbose: print_progress) do |statement|
+          f << statement
+        end
+      ensure
+        if f != STDOUT
+          f.flush
+          f.close
+        end
+      end
+    end
+    desc 'upload [file]', 'Upload a file to Google Drive'
+    long_desc <<-LONGDESC
+      `upload` will load a file stored on Google Drive. You can specify the target directory by
+      providing the --folder argument with the folder id. If no argument is given, the file will
+      be placed in the root directory.
+      Note: Google Drive allows the same file to be placed in multiple directories, similar to
+            symlinking a file. To place the uploaded file in multiple directories, pass an
+            array of folder ids to the --folder argument.
+      Examples:
+      \x5$ dbtools google_drive upload file.txt --folder=0B1ptxcLvq-tCNHlQMzU0ZFcyZjQ
+    LONGDESC
+    method_option :folder, :type => :array
+    method_option :filename, :type => :string
+    def upload(file)
+      mime_type = `file --mime-type -b #{file}`.chomp
+      file_metadata = {
+          name: File.basename(file),
+          mime_type: mime_type,
+      }
+      file_metadata.merge!({ parents: options[:folder] }) if options[:folder]
+      file_metadata.merge!({ name: options[:filename] }) if options[:filename]
+      result = @service.create_file(file_metadata, upload_source: file,
+                  content_type: mime_type, options: { retries: 3 } )
+      return result
+    end
+    desc 'changes_as_rdf [page_token, target_file]', 'Queries all changes to the Google Drive from a starting point defined by the page token.'
+    def changes_as_rdf(start_page_token, target_file=nil)
+      # Check if it's not empty.
+      if start_page_token.nil? || start_page_token.to_s.empty?
+        STDERR.puts "Start page token cannot be nil. "
+        return
+      end
+      f = if target_file.nil?
+            STDOUT
+          else
+            File.open(target_file, 'w')
+          end
+      begin
+        changed_files, removed_files, new_start_page_token = @gdrive.get_changes(page_token=start_page_token)
+        googledrive2rdf = Dbtools::Converter::GoogleDrive2RDFConverter.new
+        googledrive2rdf.serialize_as_rdf(files=changed_files) do |statement|
+          f << statement
+        end
+      ensure
+        if f != STDOUT
+          f.flush
+          f.close
+        end
+      end
+      STDERR.puts "New page token: #{new_start_page_token}"
+      return changed_files, removed_files, new_start_page_token
+    end
+    desc 'list [file_id]', 'Lists all files in the folder.'
+    def list(file_id)
+      folder = @service.get_file(file_id)
+      puts @gdrive.print_child_files(folder)
+    end
+    no_commands do # Thor DSL wrapper around the helper below
+      def get_file(file_id, download_dest: nil) # thin pass-through to the Drive service's get_file
+        @service.get_file(file_id, download_dest: download_dest)
+      end
+    end
+    private
+    OOB_URI = 'urn:ietf:wg:oauth:2.0:oob' # passed as base_url in the authorization-code flow of `authorize`
+    CLIENT_SECRETS_PATH = Dbtools::Google_Drive::Google_drive_api::CLIENT_SECRETS_PATH # client secret file that `authorize` requires
+    CREDENTIALS_PATH = Dbtools::Google_Drive::Google_drive_api::CREDENTIALS_PATH # backing file of the token store used by `authorize`
+    SCOPE = Google::Apis::DriveV3::AUTH_DRIVE # scope handed to the user authorizer
+    # Ensure valid credentials, either by restoring from the saved credentials
+    # files or intitiating an OAuth2 authorization. If authorization is required,
+    # the user's default browser will be launched to approve the request.
+    #
+    # @return [Google::Auth::UserRefreshCredentials] OAuth2 credentials
+    def authorize
+      unless File.exist?(CLIENT_SECRETS_PATH)
+        puts "#{CLIENT_SECRETS_PATH} not found."
+        puts "Create Google Drive API OAuth 2.0 credentials to allow access. "
+        exit(1)
+      end
+      FileUtils.mkdir_p(File.dirname(CREDENTIALS_PATH))
+      client_id = Google::Auth::ClientId.from_file(CLIENT_SECRETS_PATH)
+      token_store = Google::Auth::Stores::FileTokenStore.new(file: CREDENTIALS_PATH)
+      authorizer = Google::Auth::UserAuthorizer.new(
+          client_id, SCOPE, token_store)
+      user_id = 'default'
+      credentials = authorizer.get_credentials(user_id)
+      if credentials.nil?
+        url = authorizer.get_authorization_url(
+            base_url: OOB_URI)
+        puts url
+        code = ask("Open the following URL in the browser and enter the " +
+                       "resulting code after authorization: ")
+        credentials = authorizer.get_and_store_credentials_from_code(
+            user_id: user_id, code: code, base_url: OOB_URI)
+      end
+      credentials
+    end
+  end
+end