dbtools 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +333 -0
- data/Thorfile +1 -0
- data/bin/dbtools +5 -0
- data/config/client_secret_dbtools.json +1 -0
- data/config/config.yml +1 -0
- data/config/database_config.yml +12 -0
- data/config/databases.txt +5 -0
- data/config/schedule.rb +8 -0
- data/dbtools.gemspec +37 -0
- data/lib/dbtools.rb +47 -0
- data/lib/dbtools/constants.rb +847 -0
- data/lib/dbtools/converter/csv2rdf_converter.rb +68 -0
- data/lib/dbtools/converter/csv_importer.rb +107 -0
- data/lib/dbtools/converter/excel2csv_converter.rb +40 -0
- data/lib/dbtools/converter/google_drive2_rdf_converter.rb +97 -0
- data/lib/dbtools/database/database_data.rb +146 -0
- data/lib/dbtools/database/db_connection.rb +236 -0
- data/lib/dbtools/database/mysql_connection.rb +78 -0
- data/lib/dbtools/database/postgresql_connection.rb +132 -0
- data/lib/dbtools/database/violation.rb +45 -0
- data/lib/dbtools/google_drive/google_drive_api.rb +211 -0
- data/lib/dbtools/google_drive/google_drive_entity.rb +22 -0
- data/lib/dbtools/google_drive/google_drive_file.rb +10 -0
- data/lib/dbtools/google_drive/google_drive_folder.rb +9 -0
- data/lib/dbtools/plsql_functions/connect_server.sql +30 -0
- data/lib/dbtools/plsql_functions/link.sql +17 -0
- data/lib/dbtools/plsql_functions/unlink.sql +15 -0
- data/lib/dbtools/rdf/rdf_reader.rb +136 -0
- data/lib/dbtools/version.rb +3 -0
- data/lib/rdf/geophy.rb +27 -0
- data/lib/tasks/aws.rb +43 -0
- data/lib/tasks/backup.rb +107 -0
- data/lib/tasks/check.rb +220 -0
- data/lib/tasks/ckan.rb +151 -0
- data/lib/tasks/convert.rb +139 -0
- data/lib/tasks/dump.rb +110 -0
- data/lib/tasks/googledrivetool.rb +252 -0
- data/lib/tasks/import.rb +142 -0
- data/lib/tasks/postgres.rb +29 -0
- metadata +307 -0
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'rdf'
|
3
|
+
|
4
|
+
module Dbtools::Converter
|
5
|
+
class Csv2rdf_converter

  # Constructor for the csv2rdf converter.
  # @param filename
  #   Filename of the csv file that needs to be converted.
  # @param uri
  #   RDF URI for the subject. The row number is appended to it as a fragment.
  #   Example:
  #     uri = 'http://example.org/fileid'
  #     <http://example.org/fileid#123> <predicate> "value"
  # @param default_vocabulary
  #   Base vocabulary for the column names.
  #   Example:
  #     default_vocabulary = "http://geophy.io/"
  #     <subject> <http://geophy.io/column1> "value"
  # @param options
  #   Extra CSV options, merged over the defaults below. When :col_sep is
  #   given, the delimiter is not guessed from the file.
  def initialize(filename, uri, default_vocabulary: "http://geophy.io/", options: {})
    @uri = uri
    @default_vocabulary = default_vocabulary
    delimiter = options[:col_sep] || guess_delimiter(filename)
    opts = {
      headers: true,
      header_converters: :symbol,
      converters: :all,
      col_sep: delimiter,
      skip_blanks: true
    }.merge(options)
    # The options must be splatted: since Ruby 3 a positional hash would be
    # taken as the `mode` argument of CSV.open instead of as options.
    @csv = CSV.open(filename, **opts)
  end

  # Converts every remaining row to rdf triples and yields each triple as an
  # N-Triples string. The first triple of a row carries the row id, followed
  # by one triple per non-empty column.
  # Returns an Enumerator when no block is given.
  def each_triple
    return enum_for(:each_triple) unless block_given?

    @csv.each do |row|
      lineno = @csv.lineno
      # Emit a triple with the row id.
      rdf = RDF::Statement({ subject: RDF::URI.new("#{@uri}##{lineno}"),
                             predicate: RDF::URI.new("#{@default_vocabulary}rid"),
                             object: lineno })
      yield rdf.to_ntriples

      row.each do |colname, colvalue|
        next if colvalue.nil? || colvalue.to_s.empty?

        rdf = RDF::Statement({ subject: RDF::URI.new("#{@uri}##{lineno}"),
                               predicate: RDF::URI.new(File.join(@default_vocabulary, colname.to_s)),
                               object: colvalue })
        yield rdf.to_ntriples
      end
    end
  end

  # Closes the underlying CSV file handle. Safe to call more than once.
  def close
    @csv.close unless @csv.closed?
  end

  # Attempt to guess the delimiter based on how often each candidate occurs
  # in the first 10 lines of the file.
  # @param filename
  #   Path of the csv file to inspect.
  # @return [String] the candidate with the highest occurrence.
  # @raise [RuntimeError] if none of the candidates occurs at all.
  def guess_delimiter(filename)
    delimiters = [',', '|', "\t", ';']
    lines = File.foreach(filename.to_s).first(10).join
    # Pair of [delimiter, occurrence] with the highest occurrence.
    delimiter, occurrence = delimiters.map { |x| [x, lines.count(x)] }.max_by { |_, v| v }
    raise "No delimiter detected. " if occurrence.zero?

    delimiter
  end
end
|
68
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'time'
|
3
|
+
|
4
|
+
module Dbtools::Converter
|
5
|
+
class Csv_importer
  attr_reader :tablename, :delimiter

  # Loads a sample of a csv file so a SQL schema can be derived from it.
  # @param filename
  #   Path of the csv file.
  # @param delimiter
  #   Column separator. Guessed from the file when left empty.
  # @param tablename
  #   Name of the table in the generated schema. When left empty it is derived
  #   from the file name (non-alphanumeric characters become underscores).
  def initialize(filename, delimiter: '', tablename: '')
    # Keep an explicitly supplied delimiter; only guess when none was given.
    @delimiter = delimiter.empty? ? guess_delimiter(filename) : delimiter
    options = {
      headers: true,
      header_converters: :symbol,
      converters: :all,
      col_sep: @delimiter
    }
    # Block form closes the file again; only the first 10000 rows are sampled.
    # Options are splatted because Ruby 3 no longer accepts them as a
    # positional hash.
    sample = CSV.open(filename, **options) { |csv| csv.take(10000) }
    @data = CSV::Table.new(sample)
    @tablename = if tablename.empty?
                   File.basename(filename, '.csv').gsub(/[^0-9a-zA-Z_]/, '_').to_sym
                 else
                   tablename
                 end
    @types = {}
  end

  # Try to infer the type of the columns, and store them.
  # Blank cells are ignored so that a sparse numeric column keeps its type.
  def infer_type_of_columns
    @data.by_col!.each do |col_name, rows|
      # Count all the types.
      count = Hash.new(0)
      rows.each do |entry|
        next if entry.nil?

        count[infer_type(entry)] += 1
      end

      # Set the type to the most occurring type (String for an all-blank column).
      type = count.empty? ? String : count.max_by(&:last).first

      # Let float take precedence over integers if it occurred.
      type = Float if integer_class?(type) && count.key?(Float)
      # Let string take precedence over other types if it occurred.
      type = String if count.key?(String)
      @types[col_name] = type
    end
  end

  # Infer the type of a value. Non-string values (already converted by the CSV
  # converters) report their own class; strings report Time when Time.parse
  # accepts them and String otherwise.
  def infer_type(entry)
    return entry.class unless entry.is_a?(String)

    begin
      Time.parse(entry)
      Time
    rescue StandardError
      entry.class
    end
  end

  # Converts a ruby class to a string representing a SQL type.
  # Integer classes map to BIGINT, Float to FLOAT, everything else (including
  # Time, whose DATE mapping was left disabled) to VARCHAR(255).
  def class_to_sql_type(klass)
    if integer_class?(klass)
      'BIGINT'
    elsif klass == Float
      'FLOAT'
    else
      'VARCHAR(255)'
    end
  end

  # Returns a sql schema script of the csv file.
  # @return [String] a CREATE TABLE IF NOT EXISTS statement.
  def to_sql_schema_script
    infer_type_of_columns if @types.empty?

    columns = @data.by_col!.map.with_index do |(header, rows), index|
      # Look the type up under the original header, before any renaming.
      sql_type = class_to_sql_type(@types[header])
      # Use column position if no header is defined.
      col_name = header.nil? || header.empty? ? "col_#{index}" : header
      # A column is declared NOT NULL only when no sampled cell is blank.
      never_blank = rows.all? { |entry| !entry.to_s.gsub(/\s/, '').empty? }

      definition = "\t#{col_name.downcase} #{sql_type}"
      definition << "\tNOT NULL" if never_blank
      definition
    end.join(", \n")

    # Double any embedded quote so the table name stays a single identifier.
    quoted_table = @tablename.to_s.gsub('"', '""')
    "CREATE TABLE IF NOT EXISTS \"#{quoted_table}\" ( \n#{columns}\n);\n"
  end

  # Writes the script to a file.
  # @return the filename that was written.
  def output_schema_to_file(filename)
    # File.open instead of Kernel#open: the latter runs a command when the
    # name starts with a pipe character.
    File.open(filename, 'w') { |f| f << to_sql_schema_script }
    filename
  end

  # Attempt to guess the delimiter based on how often each candidate occurs
  # in the first 10 lines of the file.
  # @return [String] the candidate with the highest occurrence.
  # @raise [RuntimeError] if none of the candidates occurs at all.
  def guess_delimiter(filename)
    delimiters = [',', '|', "\t", ';']
    lines = File.foreach(filename.to_s).first(10).join
    # Pair of [delimiter, occurrence] with the highest occurrence.
    delimiter, occurrence = delimiters.map { |x| [x, lines.count(x)] }.max_by { |_, v| v }
    raise "No delimiter detected. " if occurrence.zero?

    delimiter
  end

  private

  # True for Integer and its subclasses (Fixnum/Bignum on old rubies).
  def integer_class?(klass)
    klass.is_a?(Class) && klass <= Integer ? true : false
  end
end
|
107
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'roo'
|
2
|
+
require 'roo-xls'
|
3
|
+
require 'csv'
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
module Dbtools::Converter
|
7
|
+
class Excel2csv_converter

  # Initialize the roo excel library.
  # @param filename
  #   Path of the spreadsheet to open. Its base name is used as prefix for the
  #   csv files written by #output.
  def initialize(filename)
    @excel = Roo::Spreadsheet.open(filename)
    @excel_filename = File.basename(filename)
  end

  # Output all sheets in the excel to csv. Every sheet is written to
  # "<workbook>_<sheetname>.csv" inside the folder, with characters other than
  # letters, digits, underscore and dash replaced by underscores.
  # The workbook is closed afterwards, also when writing a sheet fails.
  # @param folder
  #   Target directory; created when missing.
  # @return [Hash] sheet name => path of the csv file written for it.
  def output(folder)
    FileUtils.mkdir_p(folder)
    written = {}
    @excel.each_with_pagename do |sheetname, sheet|
      path = File.join(folder, "#{sanitize(@excel_filename)}_#{sanitize(sheetname)}.csv")
      File.open(path, "w") { |f| f.write(sheet.to_csv) }
      written[sheetname] = path
    end
    written
  ensure
    @excel.close
  end

  # Convert an excel sheet to csv, given the index.
  # An ArgumentError raised while selecting or converting the sheet is
  # reported with puts, in which case nil is returned.
  def sheet2csv(sheet_index)
    @excel.sheet(sheet_index).to_csv
  rescue ArgumentError => e
    puts e.message
  end

  private

  # Replaces every character that is not safe in a file name by an underscore.
  def sanitize(text)
    text.gsub(/[^0-9a-zA-Z_-]/, '_')
  end
end
|
40
|
+
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
require 'spira'
|
2
|
+
require 'dbtools/google_drive/google_drive_folder'
|
3
|
+
require 'dbtools/google_drive/google_drive_file'
|
4
|
+
|
5
|
+
module Dbtools::Converter
|
6
|
+
class GoogleDrive2RDFConverter
  # Starts every converter with a fresh in-memory Spira repository.
  def initialize
    Spira.repository = RDF::Repository.new
  end

  # Converts a google drive file instance to Spira resources.
  # @param [Google::Apis::DriveV3::File] google_drive_file
  #   Google drive file that will be converted.
  # @return [Array]
  #   One folder resource per parent id, followed by the resource of the file
  #   (or folder) itself.
  def drivefile2rdf(google_drive_file)
    source = google_drive_file
    resources = []
    Spira.repository ||= RDF::Repository.new

    resource =
      if source.mime_type == 'application/vnd.google-apps.folder'
        # Google Folder specific attributes
        folder_uri = RDF::URI.new("https://drive.google.com/drive/folders/#{source.id}")
        folder_uri.as(Dbtools::Google_Drive::GoogleDriveFolder)
      else
        # Google File specific attributes
        file_uri = RDF::URI.new(Dbtools::Google_Drive::Google_drive_api.get_url_from_id(source.id))
        entity = file_uri.as(Dbtools::Google_Drive::GoogleDriveFile)
        entity.file_extension = source.file_extension
        entity.web_content_link = source.web_content_link
        entity
      end

    # Shared attributes
    resource.name = source.name
    resource.identifier = source.id
    %i[created_time mime_type size modified_time icon_link description web_view_link trashed].each do |attribute|
      resource.public_send("#{attribute}=", source.public_send(attribute))
    end

    # Copying the key-value pairs of `properties` onto the resource was left
    # disabled in the original code because it was untested.

    # Add bi-directional relation for parents-children.
    if source.parents
      resource.parents = source.parents.map do |parent_id|
        parent_uri = RDF::URI.new("https://drive.google.com/drive/folders/#{parent_id}")
        parent_folder = parent_uri.as(Dbtools::Google_Drive::GoogleDriveFolder)
        parent_folder.children << resource
        resources << parent_folder
        parent_folder
      end
    end

    resources << resource
    resources
  end

  # Serializes a list of files to RDF statements.
  # Yields one string per file: the ntriples of all resources produced for
  # that file, joined by newlines.
  # @param files
  #   Files to be converted. May be a Hash of [id, file] pairs, in which case
  #   the values are converted.
  # @param verbose
  #   Prints progress to STDERR if true
  def serialize_as_rdf(files, verbose: true)
    total = nil
    count = nil
    if verbose
      total = files.size
      count = 0
    end

    files.each do |file|
      if verbose
        count += 1
        STDERR.puts("Converting file to rdf: #{count}/#{total}\t\r")
      end
      # get tree method returns a hash with [id, file].
      file = file[1] if files.is_a?(Hash)
      ntriples = drivefile2rdf(file).map(&:to_ntriples)
      yield ntriples.join("\n")
    end
  end

  private

  # Returns a hash of instance variable name (without the @) => value.
  def get_variables(google_drive_file)
    google_drive_file.instance_variables.each_with_object({}) do |var, attributes|
      attributes[var.to_s.delete('@')] = google_drive_file.instance_variable_get(var)
    end
  end

end
|
97
|
+
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
require 'dbtools/constants'
|
2
|
+
|
3
|
+
module Dbtools::Database
|
4
|
+
class DatabaseData
  attr_reader :name, :tables

  # @param name
  #   Name of the database.
  def initialize(name)
    @name = name
    # Maps "schema.table" => Table.
    @tables = {}
  end

  # Add table if it doesn't exist yet.
  # The lookup uses the same "schema.table" key the table is stored under, so
  # an existing table (and the columns already added to it) is never replaced.
  # @return the new or already registered table.
  def add_table(table_name, schema)
    @tables["#{schema}.#{table_name}"] ||= Table.new(table_name, schema)
  end

  # Database name followed by the string form of every table, one per line.
  def to_s
    output = "#{@name}: \n"
    @tables.each_value { |table| output << table.to_s << "\n" }
    output
  end
end
|
26
|
+
|
27
|
+
class Table
  # name and schema have hand-written readers below that add quoting.
  attr_reader :columns

  # @param name
  #   Unquoted table name.
  # @param schema
  #   Unquoted schema name.
  def initialize(name, schema)
    @name = name
    @schema = schema
    # Maps column name => Column.
    @columns = {}
  end

  # Put quotes around name to avoid casing problems.
  def name
    "\"#{@name}\""
  end

  # Put quotes around name to avoid casing problems.
  def schema
    "\"#{@schema}\""
  end

  # Add column if it doesn't exist yet.
  # @return the new or already registered column.
  def add_column(column_name, data_type)
    @columns[column_name] ||= Column.new(column_name, name, schema, data_type)
  end

  # Create a query that counts, per column, the rows matching the column's
  # Column#not_empty condition. Returns '' when the table has no columns.
  def query_empty_records
    select_from(columns.values.map do |col|
      "SUM(CASE WHEN #{col.not_empty} THEN 1 ELSE 0 END) AS #{col.name}"
    end)
  end

  # Create a query to count all the records.
  def query_total_records
    "SELECT COUNT(*) FROM #{schema}.#{name}"
  end

  # Create a query to count all distinct lowercased values per string column.
  # Returns '' when the table has no string columns.
  def query_distinct_lowercased_entries
    select_from(string_columns.map do |col|
      "count(distinct(lower(#{col.full_name}))) AS #{col.name}"
    end)
  end

  # Create a query to count all distinct values per string column.
  # Returns '' when the table has no string columns.
  def query_distinct_entries
    select_from(string_columns.map do |col|
      "count(distinct(#{col.full_name})) AS #{col.name}"
    end)
  end

  # Table name followed by the string form of every column, one per line.
  def to_s
    output = "+ #{@name}: \n"
    @columns.each_value { |column| output << column.to_s << "\n" }
    output
  end

  private

  # Wraps the select expressions in a SELECT over this table; '' when there
  # is nothing to select.
  def select_from(expressions)
    return '' if expressions.empty?

    "SELECT #{expressions.join(", \n")} FROM #{schema}.#{name}"
  end

  # Columns whose data type is listed in Dbtools::Constants::STRING_COLUMNS.
  def string_columns
    columns.values.select { |col| Dbtools::Constants::STRING_COLUMNS.include?(col.data_type) }
  end
end
|
112
|
+
|
113
|
+
class Column
  # name has a hand-written reader below that adds quoting.
  attr_reader :full_name, :data_type
  attr_accessor :total_entries, :missing_entries, :distinct_entries, :distinct_lower_entries

  # @param name
  #   Unquoted column name.
  # @param table_name
  #   Table name, used verbatim in the full name.
  # @param schema_name
  #   Schema name, used verbatim in the full name.
  # @param data_type
  #   SQL data type of the column as a string.
  def initialize(name, table_name, schema_name, data_type)
    @name = name
    @data_type = data_type
    # Uses the quoting reader for the column part.
    @full_name = "#{schema_name}.#{table_name}.#{self.name}"
    @total_entries = 0
    @missing_entries = 0
    @distinct_entries = 0
    @distinct_lower_entries = 0
  end

  # SQL condition used to count missing values. Despite the method name, the
  # condition matches rows where the column IS NULL; for varchar columns an
  # empty string matches as well.
  def not_empty
    case @data_type
    when 'character varying', 'varchar'
      "#{full_name} IS NULL OR #{full_name} = ''"
    else
      "#{full_name} IS NULL"
    end
  end

  # Put quotes around name to avoid casing problems.
  def name
    "\"#{@name}\""
  end

  # Column name padded to 40 characters, followed by the missing and total
  # entry counters.
  def to_s
    # to_s first: the name may be a Symbol, which has no ljust.
    " - #{@name.to_s.ljust(40)}: \t #{missing_entries}, #{total_entries}"
  end
end
|
146
|
+
end
|