dbtools 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +7 -0
  2. data/README.md +333 -0
  3. data/Thorfile +1 -0
  4. data/bin/dbtools +5 -0
  5. data/config/client_secret_dbtools.json +1 -0
  6. data/config/config.yml +1 -0
  7. data/config/database_config.yml +12 -0
  8. data/config/databases.txt +5 -0
  9. data/config/schedule.rb +8 -0
  10. data/dbtools.gemspec +37 -0
  11. data/lib/dbtools.rb +47 -0
  12. data/lib/dbtools/constants.rb +847 -0
  13. data/lib/dbtools/converter/csv2rdf_converter.rb +68 -0
  14. data/lib/dbtools/converter/csv_importer.rb +107 -0
  15. data/lib/dbtools/converter/excel2csv_converter.rb +40 -0
  16. data/lib/dbtools/converter/google_drive2_rdf_converter.rb +97 -0
  17. data/lib/dbtools/database/database_data.rb +146 -0
  18. data/lib/dbtools/database/db_connection.rb +236 -0
  19. data/lib/dbtools/database/mysql_connection.rb +78 -0
  20. data/lib/dbtools/database/postgresql_connection.rb +132 -0
  21. data/lib/dbtools/database/violation.rb +45 -0
  22. data/lib/dbtools/google_drive/google_drive_api.rb +211 -0
  23. data/lib/dbtools/google_drive/google_drive_entity.rb +22 -0
  24. data/lib/dbtools/google_drive/google_drive_file.rb +10 -0
  25. data/lib/dbtools/google_drive/google_drive_folder.rb +9 -0
  26. data/lib/dbtools/plsql_functions/connect_server.sql +30 -0
  27. data/lib/dbtools/plsql_functions/link.sql +17 -0
  28. data/lib/dbtools/plsql_functions/unlink.sql +15 -0
  29. data/lib/dbtools/rdf/rdf_reader.rb +136 -0
  30. data/lib/dbtools/version.rb +3 -0
  31. data/lib/rdf/geophy.rb +27 -0
  32. data/lib/tasks/aws.rb +43 -0
  33. data/lib/tasks/backup.rb +107 -0
  34. data/lib/tasks/check.rb +220 -0
  35. data/lib/tasks/ckan.rb +151 -0
  36. data/lib/tasks/convert.rb +139 -0
  37. data/lib/tasks/dump.rb +110 -0
  38. data/lib/tasks/googledrivetool.rb +252 -0
  39. data/lib/tasks/import.rb +142 -0
  40. data/lib/tasks/postgres.rb +29 -0
  41. metadata +307 -0
data/lib/dbtools/google_drive/google_drive_api.rb
@@ -0,0 +1,211 @@
+ require 'google/apis/drive_v3'
+ require 'googleauth'
+ require 'googleauth/stores/file_token_store'
+ require 'google/apis/plus_v1'
+ require 'dbtools/constants'
+ require 'fileutils'
+ require 'thor'
+
+ module Dbtools::Google_Drive
+   class Google_drive_api
+     attr_reader :service
+
+     OOB_URI = 'urn:ietf:wg:oauth:2.0:oob'
+     APPLICATION_NAME = 'Dbtools'
+     CLIENT_SECRETS_PATH = File.join(Dbtools::Constants::ROOT_DIR, 'config', 'client_secret_dbtools.json')
+     CREDENTIALS_PATH = File.join(Dir.home, '.credentials', "dbtools_geophy.yaml")
+     SCOPE = Google::Apis::DriveV3::AUTH_DRIVE
+     QUERY_FIELDS = 'nextPageToken, files(id,name,mimeType,size,fileExtension,iconLink,createdTime,' +
+                    'modifiedTime,webContentLink,webViewLink,parents,description,properties,trashed)'
+
+     Google::Apis.logger.level = Logger::FATAL
+
+     # Constructor for this class. You should pass the authorization object if you want to use
+     # another way to authenticate the user. Check the googleauth documentation for more info.
+     def initialize(auth=authorize)
+       @service = Google::Apis::DriveV3::DriveService.new
+       @service.client_options.application_name = APPLICATION_NAME
+       @service.authorization = auth
+     end
+
+     # Ensure valid credentials, either by restoring from the saved credentials
+     # files or intitiating an OAuth2 authorization. If authorization is required,
+     # the user's default browser will be launched to approve the request.
+     #
+     # @return [Google::Auth::UserRefreshCredentials] OAuth2 credentials
+     def authorize
+       unless File.exist?(CLIENT_SECRETS_PATH)
+         puts "#{CLIENT_SECRETS_PATH} not found."
+         puts "Create Google Drive API OAuth 2.0 credentials to allow access. "
+         exit(1)
+       end
+
+       FileUtils.mkdir_p(File.dirname(CREDENTIALS_PATH))
+       client_id = Google::Auth::ClientId.from_file(CLIENT_SECRETS_PATH)
+       token_store = Google::Auth::Stores::FileTokenStore.new(file: CREDENTIALS_PATH)
+       authorizer = Google::Auth::UserAuthorizer.new(
+         client_id, SCOPE, token_store)
+       user_id = 'default'
+       credentials = authorizer.get_credentials(user_id)
+       if credentials.nil?
+         url = authorizer.get_authorization_url(
+           base_url: OOB_URI)
+         puts "Open the following URL in the browser and enter the " +
+              "resulting code after authorization: "
+         puts url
+         code = gets
+         credentials = authorizer.get_and_store_credentials_from_code(
+           user_id: user_id, code: code, base_url: OOB_URI)
+       end
+       credentials
+     end
+
+     # Retrieves the children of the parent folder.
+     # @param [Google::Apis::DriveV3::File] parent_folder
+     #   Parent folder to get the children from.
+     # @return [Google::Apis::DriveV3::FileList]
+     #   List of all files in the parent_folder.
+     def get_children_files(parent_folder, optional_query = '')
+       page_token = nil
+       children = Google::Apis::DriveV3::FileList.new
+       begin
+         query = "'#{parent_folder.id}' in parents and trashed = false"
+         query << " and " << optional_query if !optional_query.empty?
+         result = @service.list_files(q: query,
+                                      spaces: 'drive',
+                                      page_size: 1000,
+                                      page_token: page_token,
+                                      fields: QUERY_FIELDS)
+         page_token = result.next_page_token
+         result.files.concat(children.files) unless children.files.nil?
+         children = result
+       end while !page_token.nil?
+       return children
+     end
+
+     # Updates the metadata of a Google Drive File.
+     # @param file_id
+     #   Google Drive File ID for the file to be updated.
+     # @param attributes
+     #   Hash containing all the attributes to be updated.
+     # @return [Google::Apis::DriveV3::File]
+     #   Returns File object with the updated metadata.
+     def update_file(file_id, attributes={})
+       attributes.stringify_keys!
+       file_object = {}
+       file_object[:description] = attributes.delete('description') if attributes['description']
+       file_object[:properties] = attributes
+       fields = file_object.keys.join(',')
+       return @service.update_file(file_id, file_object, fields: fields)
+     end
+
+     # Queries all the files and returns an Hash with all files.
+     # All files have a reference to their parent and children if applicable.
+     def get_tree(optional_query = "", verbose: false)
+       files = []
+       count = 0
+       # Query for all files. Loops until no next_page_tokens are given.
+       # Returns 1000 files per query (Google Drive API limitation).
+       page_token = nil
+       begin
+         query = "trashed = false"
+         query << " and " << optional_query if !optional_query.nil? && !optional_query.empty?
+         result = @service.list_files(q: query,
+                                      spaces: 'drive',
+                                      page_size: 1000,
+                                      page_token: page_token,
+                                      fields: QUERY_FIELDS)
+
+         page_token = result.next_page_token
+         files.concat(result.files) unless result.files.nil?
+         count += result.files.size
+         STDERR.puts "Count: #{count} \t\t\r" if verbose
+       end while !page_token.nil?
+
+       # Convert array into key-value pairs, using ID as key.
+       # ID is unique for every file.
+       files = files.map! { |f| [f.id, f] } if not files.empty?
+       files = files.to_h
+
+       # Add two-way reference for all children-parents.
+       # files.each do |id, file|
+       #   file.parents.each do |parent|
+       #     next if files[parent].nil?
+       #     files[parent].children = [] if files[parent].children.nil?
+       #     files[parent].children << file.id
+       #   end unless file.parents.nil?
+       # end
+       return files
+     end
+
+     # Return the changes that happened from a point given by the page token.
+     # The page token for the current state can be retrieved by calling @service.get_changes_start_page_token.
+     # @param page_token
+     #   Last saved token
+     # @return changed_files, removed_files, saved_start_page_token
+     #   An array with three elements. The first element containing a hash with the file_id as the key,
+     #   and the file object containing the attributes as value, for all changed files. The second element
+     #   contains the file_ids for all files that were removed. The third element contains the new page token.
+     def get_changes(page_token)
+       changed_files = {}
+       removed_files = []
+       saved_start_page_token = page_token
+       while !page_token.nil?
+         response = @service.list_changes(page_token,
+                                          spaces: 'drive',
+                                          page_size: 1000,
+                                          restrict_to_my_drive: true,
+                                          fields: 'nextPageToken, newStartPageToken, changes')
+         response.changes.each do |change|
+           if change.file
+             changed_files[change.file_id] = change.file
+           else
+             removed_files << change.file_id
+           end
+         end
+         # Last page, save this token for the next polling interval
+         saved_start_page_token = response.new_start_page_token if response.new_start_page_token
+         page_token = response.next_page_token
+       end
+       return changed_files, removed_files, saved_start_page_token
+     end
+
+     # Prints the children of the parent folder.
+     # @param [Google::Apis::DriveV3::DriveService] drive_service
+     #   An authenticated Google Drive service.
+     # @param [Google::Apis::DriveV3::File] parent_folder
+     #   Parent folder to get the children from.
+     # @return [String]
+     #   String representing the folder structure.
+     def print_child_files(parent_folder, depth = 0, optional_query = '')
+       output = ''
+       children = get_children_files(parent_folder, optional_query)
+       children.files.each do |file|
+         prefix = ' ' * depth
+         if file.mime_type == 'application/vnd.google-apps.folder'
+           output << "#{prefix} - #{file.name} (#{file.id})" << "\n"
+           output << print_child_files(file, depth + 1)
+         else
+           output << "#{prefix} * #{file.name} (#{file.id})" << "\n"
+         end
+       end
+       return output
+     end
+
+     # Extracts the file_id from a google link using regex.
+     # Matches links like:
+     #   https://drive.google.com/uc?export=download&id=FILEID
+     #   https://drive.google.com/open?id=FILEID
+     #   https://drive.google.com/file/d/FILEID/edit?usp=sharing
+     def self.get_file_id(access_url)
+       match = access_url.match(/.*google.*(id=|d\/)\b(?<file_id>[a-zA-Z0-9_-]+)\b/)
+       match['file_id']
+     end
+
+     # Returns the url from the drive id
+     def self.get_url_from_id(drive_id)
+       return "https://drive.google.com/open?id=#{drive_id}"
+     end
+
+   end
+ end
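The class above wraps the Drive v3 endpoints used by the googledrivetool tasks: OAuth bootstrap, paginated listing, metadata updates, and change polling. A minimal usage sketch, not part of the gem, assuming config/client_secret_dbtools.json is in place and using the same FILEID placeholder as the comments above:

    require 'dbtools/google_drive/google_drive_api'

    # Class-level helpers that need no authentication.
    file_id = Dbtools::Google_Drive::Google_drive_api.get_file_id(
      'https://drive.google.com/open?id=FILEID')
    puts Dbtools::Google_Drive::Google_drive_api.get_url_from_id(file_id)

    # Instantiating the class runs the OAuth flow on first use and caches the
    # token under ~/.credentials/dbtools_geophy.yaml.
    api = Dbtools::Google_Drive::Google_drive_api.new

    # 'root' is the Drive alias for the top-level My Drive folder.
    root = api.service.get_file('root')
    puts api.print_child_files(root)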
data/lib/dbtools/google_drive/google_drive_entity.rb
@@ -0,0 +1,22 @@
+ require 'spira'
+
+ module Dbtools::Google_Drive
+   class GoogleDriveEntity < Spira::Base
+     configure :default_vocabulary => RDF::Geophy.vocab
+     type RDF::Geophy.GoogleDriveEntity
+
+     property :identifier, :type => RDF::XSD.string
+     property :name, :type => RDF::XSD.string
+     property :mime_type, :type => RDF::XSD.string
+     property :size, :type => RDF::XSD.integer
+     property :modified_time, :type => RDF::XSD.dateTime
+     property :created_time, :type => RDF::XSD.dateTime
+     property :icon_link, :type => RDF::URI
+     property :description, :predicate => RDF::RDFS.comment, :type => RDF::XSD.string
+     property :web_view_link, :type => RDF::URI
+     property :trashed, :type => RDF::XSD.boolean
+
+     # Has many relation, because files can be symlinked, resulting in multiple parents.
+     has_many :parents, :type => :GoogleDriveFolder
+   end
+ end
data/lib/dbtools/google_drive/google_drive_file.rb
@@ -0,0 +1,10 @@
+ require 'dbtools/google_drive/google_drive_entity'
+
+ module Dbtools::Google_Drive
+   class GoogleDriveFile < GoogleDriveEntity
+     type RDF::Geophy.GoogleDriveFile
+
+     property :file_extension, :type => RDF::XSD.string
+     property :web_content_link, :type => RDF::URI
+   end
+ end
data/lib/dbtools/google_drive/google_drive_folder.rb
@@ -0,0 +1,9 @@
+ require 'dbtools/google_drive/google_drive_entity'
+
+ module Dbtools::Google_Drive
+   class GoogleDriveFolder < GoogleDriveEntity
+     type RDF::Geophy.GoogleDriveFolder
+
+     has_many :children, :type => :GoogleDriveEntity
+   end
+ end
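These three Spira models (entity, file, folder) project Drive metadata onto the RDF::Geophy vocabulary defined in data/lib/rdf/geophy.rb. A rough sketch of building a file resource by hand, assuming the require order shown works with the lib directory on the load path and that an in-memory repository is enough for Spira; the URI and values are placeholders:

    require 'spira'
    require 'rdf/geophy'                                  # vocabulary used by the models (assumed load path)
    require 'dbtools/google_drive/google_drive_file'

    # Spira projections need a backing repository before resources can be saved.
    Spira.repository = RDF::Repository.new

    file = Dbtools::Google_Drive::GoogleDriveFile.for(
      RDF::URI('https://drive.google.com/open?id=FILEID'))
    file.name = 'example.csv'
    file.mime_type = 'text/csv'
    file.trashed = false
    file.save!

    puts Spira.repository.count   # number of triples written for the resource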
data/lib/dbtools/plsql_functions/connect_server.sql
@@ -0,0 +1,30 @@
+ CREATE OR REPLACE FUNCTION connect_server(
+     servername character varying,
+     host character varying,
+     port character varying,
+     dbname character varying,
+     server_user character varying,
+     server_password character varying)
+   RETURNS void AS
+ $BODY$
+ BEGIN
+   CREATE EXTENSION IF NOT EXISTS postgres_fdw;
+
+   EXECUTE '
+     CREATE EXTENSION IF NOT EXISTS postgres_fdw;
+
+     DROP SERVER IF EXISTS ' || serverName || ' CASCADE;
+
+     CREATE SERVER ' || serverName || '
+       FOREIGN DATA WRAPPER postgres_fdw
+       OPTIONS (host ' || quote_literal(host) || ', port ' || quote_literal(port) || ', dbname ' || quote_literal(dbname) || ');
+
+     CREATE USER MAPPING FOR current_user
+       SERVER ' || serverName || '
+       OPTIONS (user ' || quote_literal(server_user) || ', password ' || quote_literal(server_password) || ');
+   ';
+
+ END;
+
+ $BODY$
+ LANGUAGE plpgsql
data/lib/dbtools/plsql_functions/link.sql
@@ -0,0 +1,17 @@
+ CREATE OR REPLACE FUNCTION link(
+     f_server_name character varying,
+     f_schema_name character varying,
+     f_table_name character varying)
+   RETURNS void AS
+ $BODY$
+ BEGIN
+
+   EXECUTE '
+     IMPORT FOREIGN SCHEMA ' || quote_ident(f_schema_name) || ' LIMIT TO (' || quote_ident(f_table_name) || ')
+       FROM SERVER ' || f_server_name || ' INTO public;
+   ';
+
+ END;
+
+ $BODY$
+ LANGUAGE plpgsql
data/lib/dbtools/plsql_functions/unlink.sql
@@ -0,0 +1,15 @@
+ CREATE OR REPLACE FUNCTION unlink(
+     f_schema_name character varying,
+     f_table_name character varying)
+   RETURNS void AS
+ $BODY$
+ BEGIN
+
+   EXECUTE '
+     DROP FOREIGN TABLE ' || f_schema_name || '.' || f_table_name || ' CASCADE;
+   ';
+
+ END;
+
+ $BODY$
+ LANGUAGE plpgsql
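Taken together, the three PL/pgSQL functions wrap postgres_fdw: connect_server registers a foreign server and a user mapping, link imports a single remote table into the public schema, and unlink drops the foreign table again. A hedged sketch of calling them from Ruby with the pg gem, assuming the functions have already been created in the target database; the pg dependency and all connection details are placeholders, not part of this diff:

    require 'pg'   # assumed for illustration only

    conn = PG.connect(dbname: 'local_db')   # placeholder connection settings

    # Register the remote server and map the current user to remote credentials.
    conn.exec_params('SELECT connect_server($1, $2, $3, $4, $5, $6)',
                     ['remote_srv', 'remote.example.com', '5432', 'remote_db',
                      'remote_user', 'remote_password'])

    # Import one remote table as a foreign table in the public schema ...
    conn.exec_params('SELECT link($1, $2, $3)', ['remote_srv', 'public', 'customers'])

    # ... and drop it again when it is no longer needed.
    conn.exec_params('SELECT unlink($1, $2)', ['public', 'customers'])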
data/lib/dbtools/rdf/rdf_reader.rb
@@ -0,0 +1,136 @@
+ require 'rdf'
+ require 'thor'
+ require 'sparql'
+ require 'linkeddata'
+
+ module Dbtools
+   class Rdf_reader
+
+     def initialize
+       @graph = RDF::Graph.new
+     end
+
+     # Loads a file into the graph
+     def load_from_file(file)
+       @graph.load(file)
+       while has_next_page
+         next_page
+       end
+     end
+
+     # Executes a query that returns all csv/postgres/mysql datasets from the rdf graph.
+     def get_available_databases
+       query = SPARQL.parse(%(
+         PREFIX dcat: <http://www.w3.org/ns/dcat#>
+         PREFIX dct: <http://purl.org/dc/terms/>
+         PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+         PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+         PREFIX gp: <http://www.geophy.com/rdf/terms#>
+         SELECT ?dataset ?database_title ?dataset_title
+         WHERE {
+           ?dataset rdf:type dcat:Dataset ;
+                    dct:title ?dataset_title ;
+                    gp:database ?database_title .
+         }
+         ORDER BY ?database_title
+       ))
+       @graph.query(query).map.with_index do |result, index|
+         queryResult = Hash.new
+         queryResult['dataset'] = result.dataset.to_s
+         queryResult['dataset_title'] = result.dataset_title.to_s
+         queryResult['database_title'] = result.database_title.to_s
+         [index, queryResult]
+       end.to_h
+     end
+
+     def get_metadata(dataset_id)
+       query = SPARQL.parse(%(
+         PREFIX dcat: <http://www.w3.org/ns/dcat#>
+         PREFIX dct: <http://purl.org/dc/terms/>
+         PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+         PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+         PREFIX gp: <http://www.geophy.com/rdf/terms#>
+         SELECT ?database_title ?dataset ?distribution ?format ?description ?access_url ?resource_title
+         WHERE {
+           ?dataset rdf:type dcat:Dataset ;
+                    gp:database ?database_title ;
+                    dcat:distribution ?distribution .
+           OPTIONAL {
+             ?dataset dct:description ?description .
+           }
+           ?distribution dct:format ?format ;
+                         dct:title ?resource_title ;
+                         dcat:accessURL ?access_url .
+           FILTER( regex(?format, "postgres|mysql|csv|tsv|xlsx|xls|txt", 'i') &&
+                   regex(str(?dataset), "#{dataset_id}") )
+         }
+         ORDER BY ?title
+       ))
+       output = @graph.query(query).map.with_index do |result, index|
+         queryResult = Hash.new
+         queryResult['dataset'] = result.dataset
+         queryResult['resource'] = result.distribution
+         queryResult['database_title'] = result.database_title.to_s
+         queryResult['resource_title'] = result.resource_title.to_s
+         queryResult['format'] = result.format.to_s
+         queryResult['description'] = result.description.to_s if defined?(result.description)
+         queryResult['access_url'] = result.access_url.to_s
+         [index, queryResult]
+       end.to_h
+
+       return output
+     end
+
+     # Check if there's a hydra next page.
+     def has_next_page
+       question = SPARQL.parse(%[
+         PREFIX hydra: <http://www.w3.org/ns/hydra/core#>
+         PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+         PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+
+         ASK { ?catalog rdf:type hydra:PagedCollection ;
+                        hydra:nextPage ?next_catalog . }
+       ])
+       @graph.query(question).true?
+     end
+
+     # Adds the next page to the graph
+     def next_page
+       return nil if !has_next_page
+
+       # Get next page links
+       next_page_query = SPARQL.parse(%(
+         PREFIX hydra: <http://www.w3.org/ns/hydra/core#>
+         PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+         PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+
+         SELECT ?catalog
+         WHERE { ?current_catalog rdf:type hydra:PagedCollection ;
+                                  hydra:nextPage ?catalog . }))
+       next_page = @graph.query(next_page_query).first
+
+       # Delete existing hydra nodes
+       delete_query = SPARQL.parse(%(
+         PREFIX hydra: <http://www.w3.org/ns/hydra/core#>
+         PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+         PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+
+         SELECT ?catalog ?p ?o
+         WHERE { ?catalog rdf:type hydra:PagedCollection ;
+                          ?p ?o . }
+       ))
+       @graph.query(delete_query) do |res|
+         statement = RDF::Statement(res.catalog, res.p, res.o)
+         @graph.delete(statement)
+       end
+       # Load the next page.
+       @graph.load(next_page[:catalog])
+     end
+   end
+
+
+   # rdf = Rdf_reader.new
+   # rdf.load_from_file("/mnt/data/Development/geophy/db-maintenance-tool/lib/turtle_example.ttl")
+   # metadata = rdf.get_metadata("dvdrental")
+   # puts metadata.inspect
+ end
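The commented-out lines at the end of rdf_reader.rb hint at the intended workflow: load a DCAT/Hydra catalogue, then query it for datasets and their distributions. A slightly fuller sketch along the same lines, assuming a catalogue dump in Turtle; the file path and dataset name are placeholders:

    require 'dbtools/rdf/rdf_reader'

    rdf = Dbtools::Rdf_reader.new
    rdf.load_from_file('catalog.ttl')   # follows hydra:nextPage links until the last page

    # Returns { index => {'dataset' => ..., 'dataset_title' => ..., 'database_title' => ...} }
    rdf.get_available_databases.each do |index, entry|
      puts "#{index}: #{entry['database_title']} / #{entry['dataset_title']}"
    end

    puts rdf.get_metadata('dvdrental').inspect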