dbtools 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +7 -0
  2. data/README.md +333 -0
  3. data/Thorfile +1 -0
  4. data/bin/dbtools +5 -0
  5. data/config/client_secret_dbtools.json +1 -0
  6. data/config/config.yml +1 -0
  7. data/config/database_config.yml +12 -0
  8. data/config/databases.txt +5 -0
  9. data/config/schedule.rb +8 -0
  10. data/dbtools.gemspec +37 -0
  11. data/lib/dbtools.rb +47 -0
  12. data/lib/dbtools/constants.rb +847 -0
  13. data/lib/dbtools/converter/csv2rdf_converter.rb +68 -0
  14. data/lib/dbtools/converter/csv_importer.rb +107 -0
  15. data/lib/dbtools/converter/excel2csv_converter.rb +40 -0
  16. data/lib/dbtools/converter/google_drive2_rdf_converter.rb +97 -0
  17. data/lib/dbtools/database/database_data.rb +146 -0
  18. data/lib/dbtools/database/db_connection.rb +236 -0
  19. data/lib/dbtools/database/mysql_connection.rb +78 -0
  20. data/lib/dbtools/database/postgresql_connection.rb +132 -0
  21. data/lib/dbtools/database/violation.rb +45 -0
  22. data/lib/dbtools/google_drive/google_drive_api.rb +211 -0
  23. data/lib/dbtools/google_drive/google_drive_entity.rb +22 -0
  24. data/lib/dbtools/google_drive/google_drive_file.rb +10 -0
  25. data/lib/dbtools/google_drive/google_drive_folder.rb +9 -0
  26. data/lib/dbtools/plsql_functions/connect_server.sql +30 -0
  27. data/lib/dbtools/plsql_functions/link.sql +17 -0
  28. data/lib/dbtools/plsql_functions/unlink.sql +15 -0
  29. data/lib/dbtools/rdf/rdf_reader.rb +136 -0
  30. data/lib/dbtools/version.rb +3 -0
  31. data/lib/rdf/geophy.rb +27 -0
  32. data/lib/tasks/aws.rb +43 -0
  33. data/lib/tasks/backup.rb +107 -0
  34. data/lib/tasks/check.rb +220 -0
  35. data/lib/tasks/ckan.rb +151 -0
  36. data/lib/tasks/convert.rb +139 -0
  37. data/lib/tasks/dump.rb +110 -0
  38. data/lib/tasks/googledrivetool.rb +252 -0
  39. data/lib/tasks/import.rb +142 -0
  40. data/lib/tasks/postgres.rb +29 -0
  41. metadata +307 -0
@@ -0,0 +1,68 @@
1
+ require 'csv'
2
+ require 'rdf'
3
+
4
+ module Dbtools::Converter
5
# Reads a CSV file and emits its contents as RDF statements serialised
# through `to_ntriples`.
#
# For every data row one statement carrying the row id is produced, followed
# by one statement per non-empty cell.
class Csv2rdf_converter

  # Constructor for the csv2rdf converter.
  # @param filename
  #   Filename of the csv file that needs to be converted.
  # @param uri
  #   RDF URI for the subject. The row number is appended as a fragment.
  #   Example:
  #     uri = 'http://example.org/fileid'
  #     <http://example.org/fileid#123> <predicate> "value"
  # @param default_vocabulary
  #   Base vocabulary for the column names.
  #   Example:
  #     default_vocabulary = "http://geophy.io/"
  #     <subject> <http://geophy.io/column1> "value"
  # @param options
  #   Extra options for CSV.open; they override the defaults set here.
  #   When :col_sep is missing (or nil) the delimiter is guessed.
  # @raise [RuntimeError] when no delimiter can be detected in the file.
  def initialize(filename, uri, default_vocabulary: "http://geophy.io/", options: {})
    @uri = uri
    @default_vocabulary = default_vocabulary
    opts = { headers: true,
             header_converters: :symbol,
             converters: :all,
             skip_blanks: true }.merge(options)
    # Guess only when the caller gave no usable separator; doing this after
    # the merge keeps an explicit `col_sep: nil` from wiping out the guess.
    opts[:col_sep] ||= guess_delimiter(filename)
    # CSV.open takes its options as keywords. Passing the hash positionally
    # makes Ruby 3 treat it as the `mode` argument, so splat it.
    @csv = CSV.open(filename, **opts)
  end

  # Converts every remaining row to rdf triples.
  # Yields the `to_ntriples` form of each statement: first the row id
  # statement, then one statement per non-empty cell of that row.
  # Returns an Enumerator when called without a block.
  def each_triple
    return enum_for(:each_triple) unless block_given?

    @csv.each do |row|
      lineno = @csv.lineno
      # Statement that records the row id.
      rid = RDF::Statement({ subject: RDF::URI.new("#{@uri}##{lineno}"),
                             predicate: RDF::URI.new("#{@default_vocabulary}rid"),
                             object: lineno })
      yield rid.to_ntriples
      row.each do |colname, colvalue|
        # Empty cells produce no statement.
        next if colvalue.nil? || colvalue.to_s.empty?

        cell = RDF::Statement({ subject: RDF::URI.new("#{@uri}##{lineno}"),
                                predicate: RDF::URI.new(File.join(@default_vocabulary, colname.to_s)),
                                object: colvalue })
        yield cell.to_ntriples
      end
    end
  end

  # Closes the underlying CSV handle opened by the constructor.
  # Safe to call more than once.
  def close
    @csv.close unless @csv.closed?
  end

  # Attempt to guess the delimiter based on how often each candidate occurs
  # in the first ten lines of the file.
  # @raise [RuntimeError] when none of the candidates occurs at all.
  def guess_delimiter(filename)
    candidates = [',', '|', "\t", ';']
    sample = File.foreach(filename.to_s).first(10).join
    # Pair of [delimiter, occurrence count] with the highest count.
    delimiter, occurrences = candidates.map { |c| [c, sample.count(c)] }
                                       .max_by { |_, count| count }
    raise "No delimiter detected. " if occurrences.zero?

    delimiter
  end
end
68
+ end
@@ -0,0 +1,107 @@
1
+ require 'csv'
2
+ require 'time'
3
+
4
+ module Dbtools::Converter
5
# Samples a CSV file, infers a type per column and renders a
# CREATE TABLE script for it.
class Csv_importer
  attr_reader :tablename, :delimiter

  # @param filename
  #   Path of the csv file to inspect.
  # @param delimiter
  #   Column separator. Guessed from the file when empty.
  # @param tablename
  #   Name for the generated table. When empty it is derived from the file
  #   name (non-alphanumeric characters replaced by '_') as a Symbol.
  # @param sample_size
  #   Maximum number of rows read for type inference.
  # @raise [RuntimeError] when the delimiter has to be guessed and none is found.
  def initialize(filename, delimiter: '', tablename: '', sample_size: 10_000)
    # Honour an explicitly given delimiter; only guess when none was passed.
    @delimiter = delimiter.to_s.empty? ? guess_delimiter(filename) : delimiter
    options = { headers: true,
                header_converters: :symbol,
                converters: :all,
                col_sep: @delimiter }
    # Block form closes the file once the sample has been read.
    rows = CSV.open(filename, **options) { |csv| csv.first(sample_size) }
    @data = CSV::Table.new(rows)
    @tablename = if tablename.to_s.empty?
                   File.basename(filename, '.csv').gsub(/[^0-9a-zA-Z_]/, '_').to_sym
                 else
                   tablename
                 end
    @types = {}
  end

  # Try to infer the type of the columns, and store them.
  def infer_type_of_columns
    @data.by_col!.each do |col_name, rows|
      counts = Hash.new(0)
      rows.each do |entry|
        # A blank cell says nothing about the column type.
        next if entry.nil?

        counts[infer_type(entry)] += 1
      end
      @types[col_name] = dominant_type(counts)
    end
  end

  # Infer the type of a value. Strings that Time.parse accepts are reported
  # as Time; every other value is reported as its own class.
  def infer_type(entry)
    return entry.class unless entry.is_a?(String)

    Time.parse(entry).class
  rescue ArgumentError, RangeError
    entry.class
  end

  # Converts a ruby class to a string representing a SQL type.
  def class_to_sql_type(klass)
    # `<= Integer` also matches the legacy Fixnum/Bignum classes, while the
    # bare Fixnum constant no longer exists on current Ruby versions.
    if klass.is_a?(Class) && klass <= Integer
      'BIGINT'
    elsif klass == Float
      'FLOAT'
    else
      # Time values are deliberately not mapped to DATE (yet).
      'VARCHAR(255)'
    end
  end

  # Returns a sql schema script of the csv file.
  def to_sql_schema_script
    infer_type_of_columns if @types.empty?
    columns = @data.by_col!.each_with_index.map do |(header, rows), index|
      # Use column position if no header is defined.
      col_name = header.to_s.empty? ? "col_#{index}" : header.to_s
      # The column may be declared NOT NULL when every sampled cell has content.
      not_null = rows.all? { |entry| !entry.to_s.gsub(/\s/, '').empty? }
      # Types are stored under the original header, not the fallback name.
      definition = "\t#{col_name.downcase} #{class_to_sql_type(@types[header])}"
      definition += "\tNOT NULL" if not_null
      definition
    end.join(", \n")
    %{CREATE TABLE IF NOT EXISTS "#{@tablename}" ( \n} + columns + "\n);\n"
  end

  # Writes the script to a file and returns the file name.
  def output_schema_to_file(filename)
    # File.write instead of Kernel#open: the latter treats names starting
    # with '|' as a command to run.
    File.write(filename, to_sql_schema_script)
    filename
  end

  # Attempt to guess the delimiter based on how often each candidate occurs
  # in the first ten lines of the file.
  # @raise [RuntimeError] when none of the candidates occurs at all.
  def guess_delimiter(filename)
    candidates = [',', '|', "\t", ';']
    sample = File.foreach(filename.to_s).first(10).join
    # Pair of [delimiter, occurrence count] with the highest count.
    delimiter, occurrences = candidates.map { |c| [c, sample.count(c)] }
                                       .max_by { |_, count| count }
    raise "No delimiter detected. " if occurrences.zero?

    delimiter
  end

  private

  # Picks the column type from a {class => occurrences} tally.
  def dominant_type(counts)
    # String takes precedence whenever it occurred; with no samples at all
    # fall back to String as well.
    return String if counts.empty? || counts.key?(String)

    type = counts.max_by { |_, occurrences| occurrences }.first
    # Let float take precedence over integers if it occurred.
    type = Float if type <= Integer && counts.key?(Float)
    type
  end
end
107
+ end
@@ -0,0 +1,40 @@
1
+ require 'roo'
2
+ require 'roo-xls'
3
+ require 'csv'
4
+ require 'fileutils'
5
+
6
+ module Dbtools::Converter
7
# Exports the sheets of a spreadsheet opened through Roo as csv.
class Excel2csv_converter

  # Initialize the roo excel library.
  # @param filename
  #   Path of the spreadsheet; its base name is reused in the csv file names.
  def initialize(filename)
    @excel = Roo::Spreadsheet.open(filename)
    @excel_filename = File.basename(filename)
  end

  # Output all sheets in the excel to csv. Each file is named
  # "<spreadsheet file name>_<sheet name>.csv" with every character outside
  # [0-9a-zA-Z_-] replaced by '_'.
  # The spreadsheet is closed afterwards, also when writing fails.
  # @param folder
  #   Target directory; created when missing.
  # @return [Hash]
  #   Sheet name => path of the csv file written for it.
  def output(folder)
    FileUtils.mkdir_p(folder)
    written = {}
    @excel.each_with_pagename do |sheetname, sheet|
      path = File.join(folder, "#{sanitize(@excel_filename)}_#{sanitize(sheetname)}.csv")
      File.write(path, sheet.to_csv)
      written[sheetname] = path
    end
    written
  ensure
    @excel.close
  end

  # Convert an excel sheet to csv, given the index.
  # When the lookup raises ArgumentError the message is printed and nil is
  # returned.
  def sheet2csv(sheet_index)
    @excel.sheet(sheet_index).to_csv
  rescue ArgumentError => e
    puts e.message
  end

  private

  # Replaces every character that is not safe in a file name by '_'.
  def sanitize(name)
    name.to_s.gsub(/[^0-9a-zA-Z_-]/, '_')
  end
end
40
+ end
@@ -0,0 +1,97 @@
1
+ require 'spira'
2
+ require 'dbtools/google_drive/google_drive_folder'
3
+ require 'dbtools/google_drive/google_drive_file'
4
+
5
+ module Dbtools::Converter
6
# Maps Google Drive file objects onto Spira resources and serialises them.
class GoogleDrive2RDFConverter
  # Installs a fresh RDF repository as the Spira repository.
  def initialize
    Spira.repository = RDF::Repository.new
  end

  # Converts a google drive file instance to RDF resources.
  # @param [Google::Apis::DriveV3::File] google_drive_file
  #   Google drive file that will be converted.
  # @return [Array]
  #   One resource per parent folder, followed by the resource for the file
  #   itself.
  def drivefile2rdf(google_drive_file)
    source = google_drive_file
    resources = []
    Spira.repository ||= RDF::Repository.new

    if source.mime_type == 'application/vnd.google-apps.folder'
      # Google Folder specific attributes
      folder_uri = RDF::URI.new("https://drive.google.com/drive/folders/#{source.id}")
      drive_file = folder_uri.as(Dbtools::Google_Drive::GoogleDriveFolder)
    else
      # Google File specific attributes
      file_uri = RDF::URI.new(Dbtools::Google_Drive::Google_drive_api.get_url_from_id(source.id))
      drive_file = file_uri.as(Dbtools::Google_Drive::GoogleDriveFile)
      drive_file.file_extension = source.file_extension
      drive_file.web_content_link = source.web_content_link
    end

    # Shared attributes: resource attribute => attribute read from the
    # drive file, copied in this order.
    shared = {
      name: :name,
      identifier: :id,
      created_time: :created_time,
      mime_type: :mime_type,
      size: :size,
      modified_time: :modified_time,
      icon_link: :icon_link,
      description: :description,
      web_view_link: :web_view_link,
      trashed: :trashed
    }
    shared.each do |target, origin|
      drive_file.public_send("#{target}=", source.public_send(origin))
    end

    # Copying the key-value pairs of the `properties` attribute onto the
    # resource is not done here; that path was never tested.

    # Add bi-directional relation for parents-children.
    if source.parents
      drive_file.parents = source.parents.map do |parent_id|
        parent_uri = RDF::URI.new("https://drive.google.com/drive/folders/#{parent_id}")
        parent_folder = parent_uri.as(Dbtools::Google_Drive::GoogleDriveFolder)
        parent_folder.children << drive_file
        resources << parent_folder
        parent_folder
      end
    end

    resources << drive_file
    resources
  end

  # Serializes a list of files to RDF statements.
  # Yields the newline-joined ntriples of every file.
  # @param files
  #   Files to be converted; either a list of files or a hash of [id, file].
  # @param verbose
  #   Prints progress to STDERR if true.
  def serialize_as_rdf(files, verbose: true)
    total = files.size if verbose
    files.each_with_index do |entry, index|
      STDERR.puts("Converting file to rdf: #{index + 1}/#{total}\t\r") if verbose
      # A hash iterates as [id, file] pairs; keep only the file.
      entry = entry[1] if files.is_a?(Hash)
      yield drivefile2rdf(entry).map(&:to_ntriples).join("\n")
    end
  end

  private

  # Collects the instance variables of the given object into a hash keyed by
  # variable name without the '@'.
  def get_variables(google_drive_file)
    google_drive_file.instance_variables.each_with_object({}) do |ivar, attributes|
      attributes[ivar.to_s.delete('@')] = google_drive_file.instance_variable_get(ivar)
    end
  end
end
97
+ end
@@ -0,0 +1,146 @@
1
+ require 'dbtools/constants'
2
+
3
+ module Dbtools::Database
4
# Holds the tables discovered for one database, keyed by "schema.table".
class DatabaseData
  attr_reader :name, :tables

  # @param name
  #   Name of the database.
  def initialize(name)
    @name = name
    @tables = {}
  end

  # Add table if it doesn't exist yet.
  # @return the Table stored for this schema and table name; the existing
  #   instance is returned when it was added before.
  def add_table(table_name, schema)
    key = "#{schema}.#{table_name}"
    # Look up by the same key that is used for storing, so a table that is
    # already registered is kept instead of being replaced.
    @tables[key] ||= Table.new(table_name, schema)
  end

  # Database name followed by the string form of every table, one per line.
  def to_s
    "#{@name}: \n" + @tables.values.map { |table| "#{table}\n" }.join
  end
end
26
+
27
# A table of a database schema together with its known columns; builds the
# SQL used to gather per-column statistics.
class Table
  attr_reader :columns

  # @param name
  #   Unquoted table name.
  # @param schema
  #   Unquoted schema name.
  def initialize(name, schema)
    @name = name
    @schema = schema
    @columns = {}
  end

  # Table name with quotes around it to avoid casing problems.
  def name
    quote_identifier(@name)
  end

  # Schema name with quotes around it to avoid casing problems.
  def schema
    quote_identifier(@schema)
  end

  # Add column if it doesn't exist yet.
  # @return the Column stored under column_name.
  def add_column(column_name, data_type)
    @columns[column_name] ||= Column.new(column_name, name, schema, data_type)
  end

  # Create a query to count all records that are empty, per column.
  # Returns an empty string when the table has no columns.
  def query_empty_records
    select_from(columns.values.map do |col|
      %{SUM(CASE WHEN #{col.not_empty} THEN 1 ELSE 0 END) AS #{col.name}}
    end)
  end

  # Create a query to count all the records.
  def query_total_records
    %{SELECT COUNT(*) FROM #{schema}.#{name}}
  end

  # Create a query to count all distinct lowercased values per string column.
  # Returns an empty string when the table has no string columns.
  def query_distinct_lowercased_entries
    select_from(string_columns.map do |col|
      %{count(distinct(lower(#{col.full_name}))) AS #{col.name}}
    end)
  end

  # Create a query to count all distinct values per string column.
  # Returns an empty string when the table has no string columns.
  def query_distinct_entries
    select_from(string_columns.map do |col|
      %{count(distinct(#{col.full_name})) AS #{col.name}}
    end)
  end

  # Unquoted table name followed by the string form of every column.
  def to_s
    "+ #{@name}: \n" + @columns.values.map { |column| "#{column}\n" }.join
  end

  private

  # Columns whose data type is listed in Dbtools::Constants::STRING_COLUMNS.
  def string_columns
    columns.values.select { |col| Dbtools::Constants::STRING_COLUMNS.include?(col.data_type) }
  end

  # Wraps the select expressions in a SELECT over this table; '' when there
  # is nothing to select.
  def select_from(expressions)
    return '' if expressions.empty?

    %{SELECT #{expressions.join(", \n")} FROM #{schema}.#{name}}
  end

  # Double-quotes an identifier, doubling embedded quotes so the result
  # stays a single identifier.
  def quote_identifier(identifier)
    %{"#{identifier.to_s.gsub('"', '""')}"}
  end
end
112
+
113
# A single table column plus the statistics gathered for it.
class Column
  attr_reader :full_name, :data_type
  attr_accessor :total_entries, :missing_entries, :distinct_entries, :distinct_lower_entries

  # @param name
  #   Unquoted column name.
  # @param table_name
  #   Table name exactly as it should appear in SQL (not quoted here).
  # @param schema_name
  #   Schema name exactly as it should appear in SQL (not quoted here).
  # @param data_type
  #   Data type of the column as a string, e.g. 'varchar'.
  def initialize(name, table_name, schema_name, data_type)
    @name = name
    @data_type = data_type
    # Uses the quoted column name.
    @full_name = "#{schema_name}.#{table_name}.#{self.name}"
    @total_entries = 0
    @missing_entries = 0
    @distinct_entries = 0
    @distinct_lower_entries = 0
  end

  # SQL condition that is true for an EMPTY value of this column (the method
  # name is historical): NULL for every type, and additionally the empty
  # string for 'character varying' / 'varchar' columns.
  def not_empty
    case @data_type
    when 'character varying', 'varchar'
      "#{full_name} IS NULL OR #{full_name} = ''"
    else
      "#{full_name} IS NULL"
    end
  end

  # Column name with quotes around it to avoid casing problems. Embedded
  # double quotes are doubled so the result stays a single identifier.
  def name
    %{"#{@name.to_s.gsub('"', '""')}"}
  end

  # Unquoted name padded to 40 characters, then missing and total entries.
  def to_s
    " - #{@name.to_s.ljust(40)}: \t #{missing_entries}, #{total_entries}"
  end
end
146
+ end