dbtools 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +333 -0
- data/Thorfile +1 -0
- data/bin/dbtools +5 -0
- data/config/client_secret_dbtools.json +1 -0
- data/config/config.yml +1 -0
- data/config/database_config.yml +12 -0
- data/config/databases.txt +5 -0
- data/config/schedule.rb +8 -0
- data/dbtools.gemspec +37 -0
- data/lib/dbtools.rb +47 -0
- data/lib/dbtools/constants.rb +847 -0
- data/lib/dbtools/converter/csv2rdf_converter.rb +68 -0
- data/lib/dbtools/converter/csv_importer.rb +107 -0
- data/lib/dbtools/converter/excel2csv_converter.rb +40 -0
- data/lib/dbtools/converter/google_drive2_rdf_converter.rb +97 -0
- data/lib/dbtools/database/database_data.rb +146 -0
- data/lib/dbtools/database/db_connection.rb +236 -0
- data/lib/dbtools/database/mysql_connection.rb +78 -0
- data/lib/dbtools/database/postgresql_connection.rb +132 -0
- data/lib/dbtools/database/violation.rb +45 -0
- data/lib/dbtools/google_drive/google_drive_api.rb +211 -0
- data/lib/dbtools/google_drive/google_drive_entity.rb +22 -0
- data/lib/dbtools/google_drive/google_drive_file.rb +10 -0
- data/lib/dbtools/google_drive/google_drive_folder.rb +9 -0
- data/lib/dbtools/plsql_functions/connect_server.sql +30 -0
- data/lib/dbtools/plsql_functions/link.sql +17 -0
- data/lib/dbtools/plsql_functions/unlink.sql +15 -0
- data/lib/dbtools/rdf/rdf_reader.rb +136 -0
- data/lib/dbtools/version.rb +3 -0
- data/lib/rdf/geophy.rb +27 -0
- data/lib/tasks/aws.rb +43 -0
- data/lib/tasks/backup.rb +107 -0
- data/lib/tasks/check.rb +220 -0
- data/lib/tasks/ckan.rb +151 -0
- data/lib/tasks/convert.rb +139 -0
- data/lib/tasks/dump.rb +110 -0
- data/lib/tasks/googledrivetool.rb +252 -0
- data/lib/tasks/import.rb +142 -0
- data/lib/tasks/postgres.rb +29 -0
- metadata +307 -0
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'rdf'
|
3
|
+
|
4
|
+
module Dbtools::Converter
|
5
|
+
class Csv2rdf_converter

  # Constructor for the csv2rdf converter.
  # @param filename
  #   Filename of the csv file that needs to be converted.
  # @param uri
  #   RDF URI for the subject. The line number of the row is appended as a fragment.
  #   Example:
  #     uri = 'http://example.org/fileid'
  #     <http://example.org/fileid#123> <predicate> "value"
  # @param default_vocabulary
  #   Base vocabulary for the column names.
  #   Example:
  #     default_vocabulary = "http://geophy.io/"
  #     <subject> <http://geophy.io/column1> "value"
  # @param options
  #   Extra CSV options (symbol keys). They are merged over the defaults below,
  #   and a :col_sep given here disables delimiter guessing.
  def initialize(filename, uri, default_vocabulary: "http://geophy.io/", options: {})
    @uri = uri
    @default_vocabulary = default_vocabulary
    delimiter = options[:col_sep] || guess_delimiter(filename)
    opts = { :headers => true,
             :header_converters => :symbol,
             :converters => :all,
             :col_sep => delimiter,
             :skip_blanks => true
    }.merge(options)
    # The options must be splatted: since Ruby 3.0 a positional Hash is no
    # longer converted into keyword arguments, so CSV.open would not receive
    # them as CSV options.
    @csv = CSV.open(filename, **opts)
  end

  # Converts every remaining row of the csv file to rdf triples.
  # Yields one N-Triples string per statement: first a "rid" triple holding
  # the line number of the row, then one triple per non-empty column.
  def each_triple
    @csv.each do |row|
      lineno = @csv.lineno
      # Emit a triple with the row id.
      rdf = RDF::Statement({ subject: RDF::URI.new("#{@uri}##{lineno}"),
                             predicate: RDF::URI.new("#{@default_vocabulary}rid"),
                             object: lineno
                           })
      yield rdf.to_ntriples
      row.each do |colname, colvalue|
        # Empty cells produce no triple.
        next if colvalue.nil? || colvalue.to_s.empty?
        rdf = RDF::Statement({ subject: RDF::URI.new("#{@uri}##{lineno}"),
                               predicate: RDF::URI.new(File.join(@default_vocabulary, colname.to_s)),
                               object: colvalue
                             })
        yield rdf.to_ntriples
      end
    end
  end

  # Attempt to guess the delimiter based on its occurrence in the first 10 lines.
  # Returns the candidate that occurs most often; on a tie the earliest
  # candidate in the list wins.
  # Raises a RuntimeError when none of the candidates occurs at all.
  def guess_delimiter(filename)
    delimiters = [',', '|', "\t", ';']
    lines = File.foreach("#{filename}").first(10).join
    delimiters_count = delimiters.map { |x| [x, lines.count(x)] }.to_h
    # Key is the delimiter, value is the occurrence.
    most_likely_delimiter = delimiters_count.max_by { |_k, v| v }
    # Check if the occurrence is not zero.
    raise "No delimiter detected. " if most_likely_delimiter[1].zero?
    return most_likely_delimiter.first
  end
end
|
68
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'time'
|
3
|
+
|
4
|
+
module Dbtools::Converter
|
5
|
+
# Samples a csv file and derives a SQL "CREATE TABLE" script from it.
class Csv_importer
  attr_reader :tablename, :delimiter

  # @param filename
  #   Path of the csv file. At most the first 10000 rows are sampled.
  # @param delimiter
  #   Column separator. Guessed from the file when empty.
  # @param tablename
  #   Name of the table. Derived from the file name (as a Symbol, with every
  #   character outside [0-9a-zA-Z_] replaced by '_') when empty.
  def initialize(filename, delimiter: '', tablename: '')
    # An explicitly given delimiter must be kept; only guess when none was given.
    @delimiter = (delimiter.nil? || delimiter.empty?) ? guess_delimiter(filename) : delimiter
    options = { :headers => true,
                :header_converters => :symbol,
                :converters => :all,
                :col_sep => @delimiter
    }
    # Block form closes the file handle; the options are splatted because
    # Ruby 3 no longer converts a positional Hash into keyword arguments.
    rows = CSV.open(filename, **options) { |csv| csv.take(10000) }
    @data = CSV::Table.new(rows)
    @tablename = tablename
    @tablename = File.basename(filename, '.csv').gsub(/[^0-9a-zA-Z_]/, '_').to_sym if tablename.empty?
    @types = Hash.new
  end

  # Try to infer the type of the columns, and store them in @types,
  # keyed by the column header.
  def infer_type_of_columns
    @data.by_col!.each do |col_name, rows|
      # Count all the types.
      count = Hash.new(0)
      rows.each { |entry| count[infer_type(entry)] += 1 }
      # Set the type to the most occurring type.
      type = count.sort_by(&:last).last.first

      # Let float take precedence over integers if it occurred.
      # `<= Integer` also matches Fixnum/Bignum on Rubies older than 2.4.
      type = Float if type <= Integer && count.key?(Float)
      # Let string take precedence over other types if it occurred.
      type = String if count.key?(String)
      @types[col_name] = type
    end
  end

  # Infer the type of a value: Time when Time.parse accepts it, otherwise the
  # class the CSV converters already gave the entry.
  def infer_type(entry)
    Time.parse(entry).class
  rescue StandardError
    entry.class
  end

  # Converts a ruby class to a string representing a SQL type.
  # Anything that is not an integer or float class (including nil) maps to
  # VARCHAR(255).
  def class_to_sql_type(klass)
    if klass.is_a?(Class) && klass <= Integer
      'BIGINT'
    elsif klass == Float
      'FLOAT'
    #elsif klass == Time
      #'DATE'
    else
      'VARCHAR(255)'
    end
  end

  # Returns a sql schema script of the csv file.
  def to_sql_schema_script
    infer_type_of_columns if @types.empty?
    columns = @data.by_col!.map.with_index do |(header, rows), index|
      # Look the type up under the original header: @types is keyed by it,
      # also for columns that get a positional name below.
      sql_type = class_to_sql_type(@types[header])
      # Use column position if no header is defined.
      col_name = (header.nil? || header.empty?) ? "col_#{index}" : header
      # The column is NOT NULL only if no sampled entry is blank.
      not_null = rows.all? { |entry| !entry.to_s.gsub(/\s/, '').empty? }

      result = "\t#{col_name.downcase} #{sql_type}"
      result << "\tNOT NULL" if not_null
      result
    end.join(", \n")
    %{CREATE TABLE IF NOT EXISTS "#{@tablename}" ( \n} << columns << "\n);\n"
  end

  # Writes the script to a file and returns the file name.
  def output_schema_to_file(filename)
    # File.open instead of Kernel#open: the latter spawns a subprocess for
    # names that start with '|'.
    File.open(filename, 'w') { |f| f << to_sql_schema_script }
    return filename
  end

  # Attempt to guess the delimiter based on its occurrence in the first 10 lines.
  # Returns the candidate that occurs most often; on a tie the earliest
  # candidate in the list wins.
  # Raises a RuntimeError when none of the candidates occurs at all.
  def guess_delimiter(filename)
    delimiters = [',', '|', "\t", ';']
    lines = File.foreach("#{filename}").first(10).join
    delimiters_count = delimiters.map { |x| [x, lines.count(x)] }.to_h

    # Key is the delimiter, value is the occurrence.
    most_likely_delimiter = delimiters_count.max_by { |_k, v| v }
    # Check if the occurrence is not zero.
    raise "No delimiter detected. " if most_likely_delimiter[1].zero?
    return most_likely_delimiter.first
  end
end
|
107
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'roo'
|
2
|
+
require 'roo-xls'
|
3
|
+
require 'csv'
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
module Dbtools::Converter
|
7
|
+
# Dumps the sheets of a spreadsheet (opened through roo) as csv.
class Excel2csv_converter

  # Opens the spreadsheet with roo and remembers its base name, which is used
  # as a prefix for the generated csv files.
  def initialize(filename)
    @excel = Roo::Spreadsheet.open(filename)
    @excel_filename = File.basename(filename)
  end

  # Writes every sheet to "<folder>/<workbook>_<sheet>.csv", creating the
  # folder when needed, and closes the workbook afterwards.
  # Returns a hash that maps each sheet name to the path of its csv file.
  def output(folder)
    FileUtils.mkdir_p(folder)
    written = {}
    @excel.each_with_pagename do |sheetname, sheet|
      csv_name = "#{sanitize(@excel_filename)}_#{sanitize(sheetname)}.csv"
      target = File.join(folder, csv_name)
      File.write(target, sheet.to_csv)
      written[sheetname] = target
    end
    @excel.close
    written
  end

  # Convert an excel sheet to csv, given the index.
  # An ArgumentError is not propagated: its message is printed and nil is
  # returned.
  def sheet2csv(sheet_index)
    @excel.sheet(sheet_index).to_csv
  rescue ArgumentError => e
    puts e.message
  end

  private

  # Replaces every character outside [0-9a-zA-Z_-] with an underscore.
  def sanitize(text)
    text.gsub(/[^0-9a-zA-Z_-]/, '_')
  end
end
|
40
|
+
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
require 'spira'
|
2
|
+
require 'dbtools/google_drive/google_drive_folder'
|
3
|
+
require 'dbtools/google_drive/google_drive_file'
|
4
|
+
|
5
|
+
module Dbtools::Converter
|
6
|
+
# Turns Google Drive file objects into Spira resources / RDF ntriples.
class GoogleDrive2RDFConverter
  # Attributes copied for files and folders alike, in assignment order:
  # resource attribute => attribute read from the drive file.
  SHARED_ATTRIBUTES = {
    name: :name,
    identifier: :id,
    created_time: :created_time,
    mime_type: :mime_type,
    size: :size,
    modified_time: :modified_time,
    icon_link: :icon_link,
    description: :description,
    web_view_link: :web_view_link,
    trashed: :trashed
  }.freeze

  # Starts with a fresh repository for Spira.
  def initialize
    Spira.repository = RDF::Repository.new
  end

  # Converts a google drive file instance to Spira resources.
  # @param [Google::Apis::DriveV3::File] google_drive_file
  #   Google drive file that will be converted.
  # @return [Array]
  #   One folder resource per parent, followed by the resource of the file
  #   itself.
  def drivefile2rdf(google_drive_file)
    source = google_drive_file
    resources = []
    Spira.repository ||= RDF::Repository.new

    if source.mime_type == 'application/vnd.google-apps.folder'
      # Google Folder specific attributes
      resource = folder_resource(source.id)
    else
      # Google File specific attributes
      file_url = Dbtools::Google_Drive::Google_drive_api.get_url_from_id(source.id)
      resource = RDF::URI.new(file_url).as(Dbtools::Google_Drive::GoogleDriveFile)
      resource.file_extension = source.file_extension
      resource.web_content_link = source.web_content_link
    end

    # Shared attributes
    SHARED_ATTRIBUTES.each do |target, origin|
      resource.public_send("#{target}=", source.public_send(origin))
    end

    # The key-value pairs of the `properties` attribute are deliberately not
    # copied: that code path was never tested and was left disabled.

    # Add bi-directional relation for parents-children.
    if source.parents
      resource.parents = source.parents.map do |parent_id|
        parent = folder_resource(parent_id)
        parent.children << resource
        resources << parent
        parent
      end
    end

    resources << resource
  end

  # Serializes a list of files to RDF statements.
  # Yields the rdf ntriples of every file as one newline-joined string.
  # @param files
  #   List of files to be converted. For a Hash the values are converted
  #   (each entry is an [id, file] pair).
  # @param verbose
  #   Prints progress to STDERR if true
  def serialize_as_rdf(files, verbose: true)
    total = files.size if verbose
    processed = 0
    files.each do |entry|
      if verbose
        processed += 1
        STDERR.puts("Converting file to rdf: #{processed}/#{total}\t\r")
      end
      entry = entry[1] if files.is_a?(Hash)
      yield drivefile2rdf(entry).map(&:to_ntriples).join("\n")
    end
  end

  private

  # Builds the folder resource that belongs to a Google Drive folder id.
  def folder_resource(folder_id)
    folder_uri = RDF::URI.new("https://drive.google.com/drive/folders/#{folder_id}")
    folder_uri.as(Dbtools::Google_Drive::GoogleDriveFolder)
  end

  # Returns the instance variables of the given object as a hash whose keys
  # are the variable names without the '@'.
  def get_variables(google_drive_file)
    google_drive_file.instance_variables.each_with_object({}) do |ivar, attributes|
      attributes[ivar.to_s.delete('@')] = google_drive_file.instance_variable_get(ivar)
    end
  end

end
|
97
|
+
end
|
@@ -0,0 +1,146 @@
|
|
1
|
+
require 'dbtools/constants'
|
2
|
+
|
3
|
+
module Dbtools::Database
|
4
|
+
# Holds the tables that are known for one database.
class DatabaseData
  attr_reader :name, :tables

  # @param name
  #   Name of the database.
  def initialize(name)
    @name = name
    # Maps "schema.table" to the Table instance.
    @tables = Hash.new
  end

  # Add table if it doesn't exist yet.
  # The existence check uses the same "schema.table" key the table is stored
  # under, so a repeated call returns the table that was registered before
  # (with the columns it collected) instead of replacing it.
  # @return the Table registered for the given schema and table name.
  def add_table(table_name, schema)
    key = "#{schema}.#{table_name}"
    @tables[key] = Table.new(table_name, schema) unless @tables.include?(key)
    return @tables[key]
  end

  # Database name followed by the string form of every table, one per line.
  def to_s
    output = "#{@name}: \n"
    @tables.each_value do |table|
      output << table.to_s << "\n"
    end
    return output
  end
end
|
26
|
+
|
27
|
+
# A table of a database schema, together with its known columns.
# Builds the SQL statements that collect statistics per column.
class Table
  attr_reader :columns

  def initialize(name, schema)
    @name = name
    @schema = schema
    # Maps the column name to the Column instance.
    @columns = {}
  end

  # Table name wrapped in double quotes to avoid casing problems.
  def name
    %("#{@name}")
  end

  # Schema name wrapped in double quotes to avoid casing problems.
  def schema
    %("#{@schema}")
  end

  # Registers the column unless one with that name exists already.
  # Returns the column stored under the given name.
  def add_column(column_name, data_type)
    @columns.fetch(column_name) do
      @columns[column_name] = Column.new(column_name, name, schema, data_type)
    end
  end

  # Query with one SUM per column that counts the rows matching the
  # column's `not_empty` condition. Empty string when there are no columns.
  def query_empty_records
    expressions = columns.values.map do |col|
      %{SUM(CASE WHEN #{col.not_empty} THEN 1 ELSE 0 END) AS #{col.name}}
    end
    select_from(expressions)
  end

  # Query that counts all the records.
  def query_total_records
    %{SELECT COUNT(*) FROM #{schema}.#{name}}
  end

  # Query that counts the distinct lowercased values of every string column.
  # Empty string when the table has no string columns.
  def query_distinct_lowercased_entries
    expressions = string_columns.map do |col|
      %{count(distinct(lower(#{col.full_name}))) AS #{col.name}}
    end
    select_from(expressions)
  end

  # Query that counts the distinct values of every string column.
  # Empty string when the table has no string columns.
  def query_distinct_entries
    expressions = string_columns.map do |col|
      %{count(distinct(#{col.full_name})) AS #{col.name}}
    end
    select_from(expressions)
  end

  # Unquoted table name followed by the string form of every column.
  def to_s
    @columns.each_value.reduce("+ #{@name}: \n") do |text, column|
      text << column.to_s << "\n"
    end
  end

  private

  # Columns whose data type is listed in Dbtools::Constants::STRING_COLUMNS.
  def string_columns
    columns.values.select { |col| Dbtools::Constants::STRING_COLUMNS.include?(col.data_type) }
  end

  # Wraps the select expressions in a SELECT on this table; '' without expressions.
  def select_from(expressions)
    joined = expressions.join(", \n")
    return '' if joined.empty?
    %{SELECT #{joined} FROM #{schema}.#{name}}
  end
end
|
112
|
+
|
113
|
+
# A column of a table, together with the statistics collected for it.
class Column
  attr_reader :full_name, :data_type
  attr_accessor :total_entries, :missing_entries, :distinct_entries, :distinct_lower_entries

  # @param name
  #   Unquoted column name.
  # @param table_name
  #   Table part of the qualified name, used as given.
  # @param schema_name
  #   Schema part of the qualified name, used as given.
  # @param data_type
  #   Data type of the column as a string, e.g. 'varchar'.
  def initialize(name, table_name, schema_name, data_type)
    @name = name
    @data_type = data_type
    # Only the column part is quoted here (through #name).
    @full_name = [schema_name, table_name, self.name].join('.')
    @total_entries = @missing_entries = @distinct_entries = @distinct_lower_entries = 0
  end

  # SQL condition that holds when the column value is NULL or, for varchar
  # columns, an empty string. Despite its name this is the condition for a
  # value being empty.
  def not_empty
    return "#{full_name} IS NULL" unless ['character varying', 'varchar'].include?(@data_type)

    "#{full_name} IS NULL OR #{full_name} = ''"
  end

  # Column name wrapped in double quotes to avoid casing problems.
  def name
    %("#{@name}")
  end

  # Unquoted name padded to 40 characters, then the missing and total counts.
  def to_s
    [' - ', @name.ljust(40), ": \t ", missing_entries, ', ', total_entries].join
  end
end
|
146
|
+
end
|