dbtools 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +333 -0
  3. data/Thorfile +1 -0
  4. data/bin/dbtools +5 -0
  5. data/config/client_secret_dbtools.json +1 -0
  6. data/config/config.yml +1 -0
  7. data/config/database_config.yml +12 -0
  8. data/config/databases.txt +5 -0
  9. data/config/schedule.rb +8 -0
  10. data/dbtools.gemspec +37 -0
  11. data/lib/dbtools.rb +47 -0
  12. data/lib/dbtools/constants.rb +847 -0
  13. data/lib/dbtools/converter/csv2rdf_converter.rb +68 -0
  14. data/lib/dbtools/converter/csv_importer.rb +107 -0
  15. data/lib/dbtools/converter/excel2csv_converter.rb +40 -0
  16. data/lib/dbtools/converter/google_drive2_rdf_converter.rb +97 -0
  17. data/lib/dbtools/database/database_data.rb +146 -0
  18. data/lib/dbtools/database/db_connection.rb +236 -0
  19. data/lib/dbtools/database/mysql_connection.rb +78 -0
  20. data/lib/dbtools/database/postgresql_connection.rb +132 -0
  21. data/lib/dbtools/database/violation.rb +45 -0
  22. data/lib/dbtools/google_drive/google_drive_api.rb +211 -0
  23. data/lib/dbtools/google_drive/google_drive_entity.rb +22 -0
  24. data/lib/dbtools/google_drive/google_drive_file.rb +10 -0
  25. data/lib/dbtools/google_drive/google_drive_folder.rb +9 -0
  26. data/lib/dbtools/plsql_functions/connect_server.sql +30 -0
  27. data/lib/dbtools/plsql_functions/link.sql +17 -0
  28. data/lib/dbtools/plsql_functions/unlink.sql +15 -0
  29. data/lib/dbtools/rdf/rdf_reader.rb +136 -0
  30. data/lib/dbtools/version.rb +3 -0
  31. data/lib/rdf/geophy.rb +27 -0
  32. data/lib/tasks/aws.rb +43 -0
  33. data/lib/tasks/backup.rb +107 -0
  34. data/lib/tasks/check.rb +220 -0
  35. data/lib/tasks/ckan.rb +151 -0
  36. data/lib/tasks/convert.rb +139 -0
  37. data/lib/tasks/dump.rb +110 -0
  38. data/lib/tasks/googledrivetool.rb +252 -0
  39. data/lib/tasks/import.rb +142 -0
  40. data/lib/tasks/postgres.rb +29 -0
  41. metadata +307 -0
@@ -0,0 +1,68 @@
1
+ require 'csv'
2
+ require 'rdf'
3
+
4
+ module Dbtools::Converter
5
# Reads a CSV file row by row and emits every row as RDF N-Triples lines.
class Csv2rdf_converter
  # Column separators considered by #guess_delimiter, in tie-break order.
  DELIMITERS = [',', '|', "\t", ';'].freeze

  # Constructor for the csv2rdf converter.
  # @param filename
  #   Filename of the csv file that needs to be converted.
  # @param uri
  #   RDF URI for the subject. The line number reported by the CSV reader is
  #   appended to it as a fragment.
  #   Example:
  #     uri = 'http://example.org/fileid'
  #     <http://example.org/fileid#123> <predicate> "value"
  # @param default_vocabulary
  #   Base vocabulary for the column names.
  #   Example:
  #     default_vocabulary = "http://geophy.io/"
  #     <subject> <http://geophy.io/column1> "value"
  # @param options
  #   Extra options handed to CSV.open; they override the defaults below.
  #   When :col_sep is missing (or nil) the delimiter is guessed from the file.
  # @raise [RuntimeError] if no delimiter is given and none can be detected.
  def initialize(filename, uri, default_vocabulary: "http://geophy.io/", options: {})
    @uri = uri
    @default_vocabulary = default_vocabulary
    delimiter = options[:col_sep] || guess_delimiter(filename)
    csv_options = {
      headers: true,
      header_converters: :symbol,
      converters: :all,
      skip_blanks: true
    }.merge(options)
    # Set the separator last so an explicit `col_sep: nil` cannot undo the guess.
    csv_options[:col_sep] = delimiter
    # The options must be splatted: on Ruby 3+ a positional Hash would be
    # interpreted as the file mode argument of CSV.open.
    @csv = CSV.open(filename, **csv_options)
  end

  # Converts every remaining row to rdf triples.
  # Yields one N-Triples string per statement: first a `rid` triple holding
  # the line number, then one triple per non-empty cell of the row.
  # @return [Enumerator] when no block is given.
  def each_triple
    return enum_for(:each_triple) unless block_given?

    @csv.each do |row|
      lineno = @csv.lineno
      subject = "#{@uri}##{lineno}"
      # Emit a triple with the row id.
      rid = RDF::Statement({ subject: RDF::URI.new(subject),
                             predicate: RDF::URI.new("#{@default_vocabulary}rid"),
                             object: lineno })
      yield rid.to_ntriples
      row.each do |colname, colvalue|
        next if colvalue.nil? || colvalue.to_s.empty?

        cell = RDF::Statement({ subject: RDF::URI.new(subject),
                                predicate: RDF::URI.new(File.join(@default_vocabulary, colname.to_s)),
                                object: colvalue })
        yield cell.to_ntriples
      end
    end
  end

  # Closes the underlying CSV file handle. Safe to call more than once.
  def close
    @csv.close unless @csv.closed?
  end

  # Attempt to guess the delimiter based on how often each candidate occurs
  # in the first ten lines of the file.
  # @param filename
  #   Path of the file to inspect.
  # @return [String] the most frequent candidate delimiter.
  # @raise [RuntimeError] if none of the candidates occurs at all.
  def guess_delimiter(filename)
    sample = File.foreach(filename.to_s).first(10).join
    delimiter, occurrences = DELIMITERS.map { |candidate| [candidate, sample.count(candidate)] }
                                       .max_by { |_candidate, count| count }
    raise "No delimiter detected. " if occurrences.zero?

    delimiter
  end
end
68
+ end
@@ -0,0 +1,107 @@
1
+ require 'csv'
2
+ require 'time'
3
+
4
+ module Dbtools::Converter
5
# Samples a CSV file and derives a `CREATE TABLE` script from its contents.
class Csv_importer
  # Column separators considered by #guess_delimiter, in tie-break order.
  DELIMITERS = [',', '|', "\t", ';'].freeze
  # Maximum number of rows read from the file for type inference.
  SAMPLE_SIZE = 10_000

  attr_reader :tablename, :delimiter

  # @param filename
  #   Path of the csv file to sample.
  # @param delimiter
  #   Column separator. When empty it is guessed from the file.
  # @param tablename
  #   Name of the table in the generated script. When empty it is derived
  #   from the file name (non-alphanumeric characters become underscores).
  # @raise [RuntimeError] if the delimiter must be guessed and none is found.
  def initialize(filename, delimiter: '', tablename: '')
    # An explicitly supplied delimiter must be kept, not only a guessed one.
    @delimiter = delimiter.empty? ? guess_delimiter(filename) : delimiter
    options = { headers: true,
                header_converters: :symbol,
                converters: :all,
                col_sep: @delimiter }
    # Block form closes the file; the splat is required on Ruby 3+, where a
    # positional Hash would be taken as the file mode.
    rows = CSV.open(filename, **options) { |csv| csv.take(SAMPLE_SIZE) }
    @data = CSV::Table.new(rows)
    @tablename = if tablename.empty?
                   File.basename(filename, '.csv').gsub(/[^0-9a-zA-Z_]/, '_').to_sym
                 else
                   tablename
                 end
    @types = {}
  end

  # Try to infer the type of the columns, and store them.
  # The most frequent class of a column wins, except that Float beats an
  # integer class and String beats everything when it occurs at least once.
  def infer_type_of_columns
    @data.by_col!.each do |col_name, rows|
      counts = Hash.new(0)
      rows.each { |entry| counts[infer_type(entry)] += 1 }
      type, _occurrences = counts.max_by { |_klass, occurrences| occurrences }

      # Let float take precedence over integers if it occurred.
      type = Float if integer_class?(type) && counts.key?(Float)
      # Let string take precedence over other types if it occurred.
      type = String if counts.key?(String)
      @types[col_name] = type
    end
  end

  # Infer the type of a value: Time when Time.parse accepts it, otherwise
  # the class the CSV converters already produced.
  # @return [Class]
  def infer_type(entry)
    parsed = begin
      Time.parse(entry)
    rescue StandardError
      # Non-strings and strings without time information are not times.
      nil
    end
    (parsed || entry).class
  end

  # Converts a ruby class to a string representing a SQL type.
  # Time is deliberately not mapped to DATE; it falls through to VARCHAR.
  # @return [String]
  def class_to_sql_type(klass)
    if integer_class?(klass)
      'BIGINT'
    elsif klass == Float
      'FLOAT'
    else
      'VARCHAR(255)'
    end
  end

  # Returns a sql schema script of the csv file.
  # @return [String]
  def to_sql_schema_script
    infer_type_of_columns if @types.empty?
    columns = @data.by_col!.map.with_index do |(header, rows), index|
      # Types are stored under the real header, so look up before renaming.
      sql_type = class_to_sql_type(@types[header])
      # Use column position if no header is defined.
      col_name = (header.nil? || header.empty?) ? "col_#{index}" : header
      # A column is NOT NULL only when every sampled entry has visible content.
      not_null = rows.all? { |entry| entry.to_s =~ /\S/ }

      definition = "\t#{col_name.downcase} #{sql_type}"
      definition += "\tNOT NULL" if not_null
      definition
    end
    %{CREATE TABLE IF NOT EXISTS "#{@tablename}" ( \n#{columns.join(", \n")}\n);\n}
  end

  # Writes the script to a file.
  # @return the filename that was written.
  def output_schema_to_file(filename)
    # File.write instead of Kernel#open: the latter would treat a path
    # starting with "|" as a command to run.
    File.write(filename, to_sql_schema_script)
    filename
  end

  # Attempt to guess the delimiter based on how often each candidate occurs
  # in the first ten lines of the file.
  # @return [String] the most frequent candidate delimiter.
  # @raise [RuntimeError] if none of the candidates occurs at all.
  def guess_delimiter(filename)
    sample = File.foreach(filename.to_s).first(10).join
    delimiter, occurrences = DELIMITERS.map { |candidate| [candidate, sample.count(candidate)] }
                                       .max_by { |_candidate, count| count }
    raise "No delimiter detected. " if occurrences.zero?

    delimiter
  end

  private

  # True for Integer and its subclasses (Fixnum/Bignum on old rubies).
  def integer_class?(klass)
    klass.is_a?(Class) && klass <= Integer
  end
end
107
+ end
@@ -0,0 +1,40 @@
1
+ require 'roo'
2
+ require 'roo-xls'
3
+ require 'csv'
4
+ require 'fileutils'
5
+
6
+ module Dbtools::Converter
7
# Exports the sheets of an Excel workbook as CSV files.
class Excel2csv_converter
  # Opens the workbook with the roo excel library.
  # @param filename
  #   Path of the Excel file.
  def initialize(filename)
    @excel = Roo::Spreadsheet.open(filename)
    @excel_filename = File.basename(filename)
  end

  # Output all sheets in the excel to csv. Each file is named
  # "<workbook file name>_<sheet name>.csv", with every character outside
  # [0-9a-zA-Z_-] replaced by an underscore.
  # The workbook is closed afterwards, also when an export fails.
  # @param folder
  #   Target directory; created if it does not exist.
  # @return [Hash] sheet name => path of the written csv file.
  def output(folder)
    FileUtils.mkdir_p(folder)
    written = {}
    begin
      @excel.each_with_pagename do |sheetname, sheet|
        path = File.join(folder, "#{sanitize(@excel_filename)}_#{sanitize(sheetname)}.csv")
        # Render the csv before touching the file so a failing sheet does
        # not leave an empty file behind.
        File.write(path, sheet.to_csv)
        written[sheetname] = path
      end
    ensure
      @excel.close
    end
    written
  end

  # Convert an excel sheet to csv, given the index.
  # An ArgumentError raised by the lookup is printed and swallowed.
  # @param sheet_index
  #   Passed straight to the workbook's `sheet` method.
  # @return [String, nil] the csv text, or nil when an ArgumentError occurred.
  def sheet2csv(sheet_index)
    @excel.sheet(sheet_index).to_csv
  rescue ArgumentError => e
    puts e.message
  end

  private

  # Replaces every character that is unsafe in a file name by an underscore.
  def sanitize(name)
    name.gsub(/[^0-9a-zA-Z_-]/, '_')
  end
end
40
+ end
@@ -0,0 +1,97 @@
1
+ require 'spira'
2
+ require 'dbtools/google_drive/google_drive_folder'
3
+ require 'dbtools/google_drive/google_drive_file'
4
+
5
+ module Dbtools::Converter
6
# Maps Google Drive file metadata onto Spira resources and serializes them
# as N-Triples.
class GoogleDrive2RDFConverter
  # Installs a fresh in-memory repository as the Spira repository.
  def initialize
    Spira.repository = RDF::Repository.new
  end

  # Converts a google drive file instance to Spira resources.
  # @param google_drive_file
  #   Drive file object; read through its accessors (id, name, mime_type,
  #   parents, ...).
  # @return [Array]
  #   One folder resource per parent id, followed by the resource of the
  #   file itself.
  def drivefile2rdf(google_drive_file)
    Spira.repository ||= RDF::Repository.new
    source = google_drive_file
    resources = []

    resource =
      if source.mime_type == 'application/vnd.google-apps.folder'
        # Folders are identified by their Drive folder URL.
        folder_uri = RDF::URI.new("https://drive.google.com/drive/folders/#{source.id}")
        folder_uri.as(Dbtools::Google_Drive::GoogleDriveFolder)
      else
        # Plain files carry two extra attributes.
        file_uri = RDF::URI.new(Dbtools::Google_Drive::Google_drive_api.get_url_from_id(source.id))
        plain = file_uri.as(Dbtools::Google_Drive::GoogleDriveFile)
        plain.file_extension = source.file_extension
        plain.web_content_link = source.web_content_link
        plain
      end

    # Attributes shared by files and folders: resource writer => source reader.
    {
      name: :name,
      identifier: :id,
      created_time: :created_time,
      mime_type: :mime_type,
      size: :size,
      modified_time: :modified_time,
      icon_link: :icon_link,
      description: :description,
      web_view_link: :web_view_link,
      trashed: :trashed
    }.each do |writer, reader|
      resource.public_send("#{writer}=", source.public_send(reader))
    end

    # The key-value pairs of the `properties` attribute are intentionally not
    # copied onto the resource; that mapping was never tested.

    # Link the resource to its parents and register it as their child.
    if source.parents
      parent_folders = source.parents.map do |parent_id|
        parent_uri = RDF::URI.new("https://drive.google.com/drive/folders/#{parent_id}")
        folder = parent_uri.as(Dbtools::Google_Drive::GoogleDriveFolder)
        folder.children << resource
        resources << folder
        folder
      end
      resource.parents = parent_folders
    end

    resources << resource
    resources
  end

  # Serializes a list of files to RDF statements.
  # Yields, per file, the N-Triples of all its resources joined by newlines.
  # @param files
  #   Collection of drive files. For a Hash the values are converted.
  # @param verbose
  #   Prints progress to STDERR if true.
  def serialize_as_rdf(files, verbose: true)
    total = files.size if verbose
    files.each_with_index do |entry, index|
      STDERR.puts("Converting file to rdf: #{index + 1}/#{total}\t\r") if verbose
      # A hash is iterated as [id, file] pairs.
      drive_file = files.is_a?(Hash) ? entry[1] : entry
      yield drivefile2rdf(drive_file).map(&:to_ntriples).join("\n")
    end
  end

  private

  # Collects the instance variables of an object into a hash keyed by the
  # variable name without the "@".
  def get_variables(google_drive_file)
    google_drive_file.instance_variables.each_with_object({}) do |ivar, attributes|
      attributes[ivar.to_s.delete('@')] = google_drive_file.instance_variable_get(ivar)
    end
  end
end
97
+ end
@@ -0,0 +1,146 @@
1
+ require 'dbtools/constants'
2
+
3
+ module Dbtools::Database
4
# Holds the tables discovered in one database, keyed by "schema.table".
class DatabaseData
  attr_reader :name, :tables

  # @param name
  #   Name of the database.
  def initialize(name)
    @name = name
    @tables = {}
  end

  # Add table if it doesn't exist yet.
  # The lookup uses the same "schema.table" key the table is stored under,
  # so an existing table (and the columns already added to it) is kept.
  # @return the existing or newly created Table.
  def add_table(table_name, schema)
    key = "#{schema}.#{table_name}"
    @tables[key] ||= Table.new(table_name, schema)
  end

  # @return [String] the database name followed by one entry per table.
  def to_s
    "#{@name}: \n" + @tables.values.map { |table| "#{table}\n" }.join
  end
end
26
+
27
# A database table with its columns; builds the statistics queries for it.
class Table
  # name and schema are not plain readers: they are defined below and
  # return the quoted identifier.
  attr_reader :columns

  # @param name
  #   Unquoted table name.
  # @param schema
  #   Unquoted schema name.
  def initialize(name, schema)
    @name = name
    @schema = schema
    @columns = {}
  end

  # Put quotes around name to avoid casing problems.
  def name
    "\"#{@name}\""
  end

  # Put quotes around name to avoid casing problems.
  def schema
    "\"#{@schema}\""
  end

  # Add column if it doesn't exist yet.
  # @return the existing or newly created Column.
  def add_column(column_name, data_type)
    @columns[column_name] ||= Column.new(column_name, name, schema, data_type)
  end

  # Create a query that counts, per column, the records matching the
  # column's `not_empty` condition.
  # @return [String] the query, or '' when the table has no columns.
  def query_empty_records
    expressions = columns.values.map do |col|
      %{SUM(CASE WHEN #{col.not_empty} THEN 1 ELSE 0 END) AS #{col.name}}
    end
    select_from_table(expressions)
  end

  # Create a query to count all the records.
  def query_total_records
    %{SELECT COUNT(*) FROM #{schema}.#{name}}
  end

  # Create a query to count all distinct lowercased values per string column.
  # @return [String] the query, or '' when there is no string column.
  def query_distinct_lowercased_entries
    expressions = string_columns.map do |col|
      %{count(distinct(lower(#{col.full_name}))) AS #{col.name}}
    end
    select_from_table(expressions)
  end

  # Create a query to count all distinct values per string column.
  # @return [String] the query, or '' when there is no string column.
  def query_distinct_entries
    expressions = string_columns.map do |col|
      %{count(distinct(#{col.full_name})) AS #{col.name}}
    end
    select_from_table(expressions)
  end

  # @return [String] the unquoted table name followed by one line per column.
  def to_s
    "+ #{@name}: \n" + @columns.values.map { |column| "#{column}\n" }.join
  end

  private

  # Columns whose data type is listed in Dbtools::Constants::STRING_COLUMNS.
  def string_columns
    columns.values.select { |col| Dbtools::Constants::STRING_COLUMNS.include?(col.data_type) }
  end

  # Wraps the select expressions in a SELECT on this table; '' if there are none.
  def select_from_table(expressions)
    return '' if expressions.empty?

    %{SELECT #{expressions.join(", \n")} FROM #{schema}.#{name}}
  end
end
112
+
113
# A table column together with the statistics gathered for it.
class Column
  # name is not a plain reader: it is defined below and returns the quoted
  # identifier.
  attr_reader :full_name, :data_type
  attr_accessor :total_entries, :missing_entries, :distinct_entries, :distinct_lower_entries

  # @param name
  #   Unquoted column name.
  # @param table_name
  #   Table identifier, used verbatim in the full name.
  # @param schema_name
  #   Schema identifier, used verbatim in the full name.
  # @param data_type
  #   Data type name of the column.
  def initialize(name, table_name, schema_name, data_type)
    @name = name
    @data_type = data_type
    @full_name = "#{schema_name}.#{table_name}.#{self.name}"
    @total_entries = 0
    @missing_entries = 0
    @distinct_entries = 0
    @distinct_lower_entries = 0
  end

  # Returns the SQL condition that matches records in which this column is
  # EMPTY (despite the method name): NULL, or for varchar columns also the
  # empty string.
  # @return [String]
  def not_empty
    case @data_type
    when 'character varying', 'varchar'
      "#{full_name} IS NULL OR #{full_name} = ''"
    else
      "#{full_name} IS NULL"
    end
  end

  # Put quotes around name to avoid casing problems.
  def name
    "\"#{@name}\""
  end

  # @return [String] the padded unquoted name with the missing and total counts.
  def to_s
    # to_s first: the name may be a Symbol, which has no ljust.
    " - #{@name.to_s.ljust(40)}: \t #{missing_entries}, #{total_entries}"
  end
end
146
+ end