dbtools 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +7 -0
  2. data/README.md +333 -0
  3. data/Thorfile +1 -0
  4. data/bin/dbtools +5 -0
  5. data/config/client_secret_dbtools.json +1 -0
  6. data/config/config.yml +1 -0
  7. data/config/database_config.yml +12 -0
  8. data/config/databases.txt +5 -0
  9. data/config/schedule.rb +8 -0
  10. data/dbtools.gemspec +37 -0
  11. data/lib/dbtools.rb +47 -0
  12. data/lib/dbtools/constants.rb +847 -0
  13. data/lib/dbtools/converter/csv2rdf_converter.rb +68 -0
  14. data/lib/dbtools/converter/csv_importer.rb +107 -0
  15. data/lib/dbtools/converter/excel2csv_converter.rb +40 -0
  16. data/lib/dbtools/converter/google_drive2_rdf_converter.rb +97 -0
  17. data/lib/dbtools/database/database_data.rb +146 -0
  18. data/lib/dbtools/database/db_connection.rb +236 -0
  19. data/lib/dbtools/database/mysql_connection.rb +78 -0
  20. data/lib/dbtools/database/postgresql_connection.rb +132 -0
  21. data/lib/dbtools/database/violation.rb +45 -0
  22. data/lib/dbtools/google_drive/google_drive_api.rb +211 -0
  23. data/lib/dbtools/google_drive/google_drive_entity.rb +22 -0
  24. data/lib/dbtools/google_drive/google_drive_file.rb +10 -0
  25. data/lib/dbtools/google_drive/google_drive_folder.rb +9 -0
  26. data/lib/dbtools/plsql_functions/connect_server.sql +30 -0
  27. data/lib/dbtools/plsql_functions/link.sql +17 -0
  28. data/lib/dbtools/plsql_functions/unlink.sql +15 -0
  29. data/lib/dbtools/rdf/rdf_reader.rb +136 -0
  30. data/lib/dbtools/version.rb +3 -0
  31. data/lib/rdf/geophy.rb +27 -0
  32. data/lib/tasks/aws.rb +43 -0
  33. data/lib/tasks/backup.rb +107 -0
  34. data/lib/tasks/check.rb +220 -0
  35. data/lib/tasks/ckan.rb +151 -0
  36. data/lib/tasks/convert.rb +139 -0
  37. data/lib/tasks/dump.rb +110 -0
  38. data/lib/tasks/googledrivetool.rb +252 -0
  39. data/lib/tasks/import.rb +142 -0
  40. data/lib/tasks/postgres.rb +29 -0
  41. metadata +307 -0
data/lib/dbtools/database/db_connection.rb
@@ -0,0 +1,236 @@
+ require 'active_record'
+ require 'ru_bee'
+ require 'dbtools/constants'
+ require 'dbtools/database/database_data'
+ require 'dbtools/database/violation'
+
+ module Dbtools::Database
+   class DbConnection
+     attr_accessor :connection
+
+     # Creates a connection to a database using the given URL.
+     def initialize(url)
+       @connection = ActiveRecord::Base.establish_connection(url).connection
+       @database = get_current_database
+     end
+
+     def close
+       @connection.close
+     end
+
+     def database_name
+       @connection.current_database
+     end
+
+     # Returns an object representing the current database structure.
+     def get_current_database
+       query = %{select c.table_catalog as "table_catalog", c.table_schema as "table_schema", c.table_name as "table_name", c.column_name as "column_name", c.data_type as "data_type"
+                 from information_schema.columns as c
+                 join information_schema.tables as t
+                   on c.table_catalog = t.table_catalog
+                   and c.table_schema = t.table_schema
+                   and c.table_name = t.table_name
+                 where c.data_type not like '%text%'
+                   and c.table_schema not in ('information_schema', 'performance_schema', 'mysql', 'sys', 'pg_catalog')}
+       database = DatabaseData.new(database_name)
+       execute_query(query).each do |h|
+         table_name = h['table_name']
+         table_schema = h['table_schema']
+         column_name = h['column_name']
+         data_type = h['data_type']
+         database.add_table(table_name, table_schema).add_column(column_name, data_type)
+       end
+       return database
+     end
+
+     def get_all_columns
+       query = %{select c.COLUMN_NAME
+                 from information_schema.COLUMNS as c
+                 where c.TABLE_SCHEMA not in ('information_schema', 'performance_schema', 'mysql', 'sys', 'pg_catalog')}
+       execute_query(query)
+     end
+
+     # Checks whether reserved keywords occur in schema, table or column names.
+     def check_reserved_keywords
+       reserved_keywords = "("
+       Dbtools::Constants::RESERVED_KEYWORDS.each do |keyword|
+         reserved_keywords << "'#{keyword}', "
+       end
+       reserved_keywords = reserved_keywords[0..-3]
+       reserved_keywords << ")"
+       sql = %{
+         select c.TABLE_CATALOG as "table_catalog", c.TABLE_SCHEMA as "table_schema", c.TABLE_NAME as "table_name", c.COLUMN_NAME as "column_name"
+         from information_schema.COLUMNS as c
+         where c.TABLE_SCHEMA not in ('information_schema', 'performance_schema', 'mysql', 'sys', 'pg_catalog')
+           and (upper(c.COLUMN_NAME) in #{reserved_keywords}
+             or upper(c.TABLE_CATALOG) in #{reserved_keywords}
+             or upper(c.TABLE_NAME) in #{reserved_keywords})
+       }
+       violations = []
+       execute_query(sql).each do |h|
+         table_schema = h['table_schema']
+         table_name = h['table_name']
+         column_name = h['column_name']
+         violations << Violation.new(database: database_name,
+                                     metric: "Reserved keywords",
+                                     offender: "#{table_schema}.#{table_name}.#{column_name}")
+       end
+       return violations
+     end
+
+     # Gets the completeness of the columns.
+     def get_completeness
+       database = @database
+
+       database.tables.values.each do |table|
+         # This query counts all null entries (or entries containing '' for string columns) in every column.
+         query = table.query_empty_records
+         execute_query(query).each do |res|
+           table.columns.each { |k, col| col.missing_entries = res[k] }
+         end
+         # This query counts the total number of records in the table.
+         # The query should only return a single hashmap/dictionary as result.
+         # Since the total records for all columns should be equal, taking just the first value
+         # should be fine.
+         query = table.query_total_records
+         execute_query(query).each do |res|
+           table.columns.each { |k, col| col.total_entries = res.values.first }
+         end
+       end
+
+       violations = []
+       database.tables.each do |table_name, table|
+         table.columns.each do |col_name, col|
+           next if col.missing_entries.to_i.zero?
+           violations << Violation.new(database: database_name,
+                                       metric: "Completeness",
+                                       offender: "#{col.full_name.delete('"')}",
+                                       violating_records: col.missing_entries.to_i,
+                                       total_records: col.total_entries.to_i)
+         end
+       end
+       return violations
+     end
+
+     # Gets the syntax compression of the entries, grouped by column: values that
+     # are distinct only because of their casing are reported as violations.
+     def get_syntax_compression
+       database = @database
+
+       database.tables.values.each do |table|
+         # This query counts all lowercased distinct values in every column.
+         query = table.query_distinct_lowercased_entries
+         execute_query(query).each do |res|
+           table.columns.each { |k, col| col.distinct_lower_entries = res[k] }
+         end
+         # This query counts all distinct values in every column.
+         query = table.query_distinct_entries
+         execute_query(query).each do |res|
+           table.columns.each { |k, col| col.distinct_entries = res[k] }
+         end
+       end
+
+       violations = []
+       database.tables.each do |table_name, table|
+         table.columns.each do |col_name, col|
+           next if col.distinct_lower_entries == col.distinct_entries
+           violations << Violation.new(database: database_name,
+                                       metric: "Syntax compression",
+                                       offender: "#{col.full_name.delete('"')}",
+                                       violating_records: col.distinct_entries.to_i - col.distinct_lower_entries.to_i,
+                                       total_records: col.distinct_entries.to_i)
+         end
+       end
+       return violations
+     end
+
+     # Calculates the inverse functional property of the database.
+     def get_inverse_functional_property
+       database = @database
+
+       # Run the query on every table.
+       database.tables.values.each do |table|
+         # This query counts all distinct values in every column.
+         # The query should only return a single hashmap/dictionary as result.
+         query = table.query_distinct_entries
+         execute_query(query).each do |res|
+           table.columns.each { |k, col| col.distinct_entries = res[k] }
+         end
+         # This query counts the total number of records in the table.
+         # The query should only return a single hashmap/dictionary as result.
+         # Since the total records for all columns should be equal, taking just the first value
+         # should be fine.
+         query = table.query_total_records
+         execute_query(query).each do |res|
+           table.columns.each { |k, col| col.total_entries = res.values.first }
+         end
+       end
+
+       violations = []
+       database.tables.each do |table_name, table|
+         # puts "Table: #{table_name}:"
+         table.columns.each do |col_name, col|
+           # puts "\t #{col_name}: #{col.distinct_entries}/#{col.total_entries}" unless col.distinct_entries.to_i.zero?
+           violating_records = col.distinct_entries.to_i
+           next if violating_records.zero?
+           violations << Violation.new(database: database_name,
+                                       metric: "Inverse functional property",
+                                       offender: "#{col.full_name.delete('"')}",
+                                       violating_records: violating_records,
+                                       total_records: col.total_entries.to_i)
+         end
+       end
+       return violations
+     end
+
+     # Gets the columns whose names are not fully lowercase.
+     def get_uppercase_columns
+       database = @database
+       violations = []
+
+       database.tables.each do |table_name, table|
+         table.columns.each do |col_name, col|
+           next if col.name.downcase.eql?(col.name)
+           violations << Violation.new(database: database_name,
+                                       metric: "Uppercase column names",
+                                       offender: "#{col.full_name.delete('"')}",
+                                       solution: "#{table.schema}.#{table.name}.#{col.name.downcase.delete('"')}")
+         end
+       end
+       return violations
+     end
+
+     # Checks the spelling of all column names.
+     def check_spelling
+       database = @database
+       violations = []
+
+       database.tables.each do |table_name, table|
+         table.columns.each do |col_name, col|
+           next if col.name.correct?
+           violations << Violation.new(database: database_name,
+                                       metric: "Spelling",
+                                       offender: "#{col.full_name.delete('"')}")
+         end
+       end
+       return violations
+     end
+
+     # Creates a new database.
+     def create_database(name)
+       @connection.create_database(name)
+     end
+
+     # Runs all SQL files in the specified directory.
+     def execute_files(directory)
+       Dir.foreach(directory) do |file|
+         # Skip anything that is not an .sql file.
+         next if file == '.' || file == '..' || File.extname(file) != '.sql'
+         file_path = File.join(directory, file)
+         content = File.read(file_path)
+         execute_query(content)
+       end
+     end
+
+   end
+ end
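DbConnection itself defines no execute_query; the MySQL and PostgreSQL subclasses below supply it. As a rough usage sketch (not taken from the gem's documentation; the connection URL and database name are invented), the quality checks could be driven like this:

  require 'dbtools/database/postgresql_connection'

  # Hypothetical connection URL; adjust adapter, credentials and database name.
  conn = Dbtools::Database::PostgresqlConnection.new('postgresql://user:secret@localhost:5432/warehouse')

  # Each check returns an array of Dbtools::Database::Violation objects.
  violations = conn.check_reserved_keywords + conn.get_completeness + conn.get_uppercase_columns
  violations.each { |v| puts v }  # Violation#to_s renders JSON

  conn.close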
data/lib/dbtools/database/mysql_connection.rb
@@ -0,0 +1,78 @@
+ require 'dbtools/database/db_connection'
+
+ module Dbtools::Database
+   class MysqlConnection < DbConnection
+
+     # Executes a SQL statement on the connected database.
+     def execute_query(query)
+       # Replace double quotes with backticks, so queries are compatible with MySQL.
+       query.gsub!("\"", "`")
+       # The MySQL ActiveRecord adapter throws an error when the query is empty.
+       result = begin
+         @connection.exec_query(query).to_hash
+       rescue
+         {}
+       end
+       return result
+     end
+
+     # Queries all columns ending in _id that are not covered by an index and
+     # outputs a query to create an index for each of them.
+     def check_indexes
+       sql = %{
+         select cols.table_schema as "table_schema", cols.table_name as "table_name", cols.column_name as "column_name"
+         from information_schema.columns as cols
+         left join information_schema.statistics as stats
+           on cols.table_schema = stats.table_schema
+           and cols.table_name = stats.table_name
+           and cols.column_name = stats.column_name
+         where cols.table_schema not in ('information_schema', 'pg_catalog', 'performance_schema', 'mysql', 'sys')
+           and cols.column_name like '%\_id'
+           and stats.column_name IS NULL}
+
+       violations = []
+       execute_query(sql).each do |h|
+         table_schema = h['table_schema']
+         table_name = h['table_name']
+         column_name = h['column_name']
+         violations << Violation.new(database: database_name,
+                                     metric: "Missing indexes",
+                                     offender: "#{table_schema}.#{table_name}.#{column_name}",
+                                     solution: "CREATE INDEX idx_#{table_schema}_#{table_name}_#{column_name} ON #{table_schema}.#{table_name} (#{column_name}); ")
+       end
+       return violations
+     end
+
+     # Adds a description to a table by adding a comment.
+     def set_description_table(comment, object_name)
+       query = %{ALTER TABLE `#{object_name}` COMMENT = '#{comment}'}
+       execute_query(query)
+     end
+
+     # Returns all databases on the system.
+     def get_all_databases
+       sql = %q{show databases}
+       execute_query(sql).map { |v| v.values }.flatten.to_set
+     end
+
+     # Queries for all tables that don't have comment metadata.
+     def get_tables_without_comments
+       query = %{select t.table_schema as "table_schema", t.table_name as "table_name"
+                 from information_schema.tables as t
+                 where t.TABLE_SCHEMA not in ('information_schema', 'performance_schema', 'mysql', 'sys', 'pg_catalog')
+                   and t.table_comment = ''}
+
+       violations = []
+       execute_query(query).each do |h|
+         table_schema = h['table_schema']
+         table_name = h['table_name']
+         violations << Violation.new(database: database_name,
+                                     metric: "Table without comments",
+                                     offender: "#{table_schema}.#{table_name}")
+       end
+       return violations
+     end
+
+   end
+ end
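For illustration of the MySQL adapter's missing-index report (the schema, table and column names below are made up, and the URL is a placeholder), the solution field of each violation carries a ready-to-run statement:

  require 'dbtools/database/mysql_connection'

  conn = Dbtools::Database::MysqlConnection.new('mysql2://user:secret@localhost/shop')  # placeholder URL
  conn.check_indexes.each do |violation|
    # violation.solution holds a statement such as:
    #   CREATE INDEX idx_shop_orders_customer_id ON shop.orders (customer_id);
    puts violation.solution
  end
  conn.close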
data/lib/dbtools/database/postgresql_connection.rb
@@ -0,0 +1,132 @@
+ require 'dbtools/database/db_connection'
+
+ module Dbtools::Database
+   class PostgresqlConnection < DbConnection
+
+     # Executes a SQL query on the connected database.
+     def execute_query(query)
+       result = begin
+         @connection.exec_query(query)
+       rescue
+         {}
+       end
+       return result
+     end
+
+     # Analyzes the tables and checks if there are more sequential scans
+     # than index scans. Suggests where indexes could be created.
+     def analyze_missing_indexes
+       sql = %{
+         SELECT relname, seq_scan-idx_scan AS too_much_seq, case when seq_scan-idx_scan>0 THEN 'Missing Index?' ELSE 'OK' END, pg_relation_size(relname::regclass) AS rel_size, seq_scan, idx_scan
+         FROM pg_stat_all_tables
+         WHERE schemaname ='public' AND pg_relation_size(relname::regclass)>80000 ORDER BY too_much_seq DESC;
+       }
+       execute_query(sql).each do |index|
+         puts index
+       end
+     end
+
+     # Queries all columns ending in _id that are not covered by an index and
+     # outputs a query to create an index for each of them.
+     def check_indexes
+       sql = %{
+         select cols.table_schema as "table_schema", cols.table_name as "table_name", cols.column_name as "column_name"
+         from
+           (select c.table_schema, c.table_name, c.column_name
+            from information_schema.columns as c
+            where c.column_name != 'ID' and c.column_name != 'id' and c.column_name like '%\\_id'
+              and c.table_schema not in ('information_schema', 'pg_catalog', 'performance_schema', 'mysql', 'sys')) as cols
+         left join
+           (SELECT UNNEST(ARRAY(
+              SELECT pg_get_indexdef(idx.indexrelid, k + 1, true)
+              FROM generate_subscripts(idx.indkey, 1) as k
+              ORDER BY k
+            )) as indkey_names
+            FROM pg_index as idx
+            JOIN pg_class as i ON i.oid = idx.indexrelid
+            JOIN pg_am as am ON i.relam = am.oid) as indexes
+           on cols.column_name = indexes.indkey_names
+         where indexes.indkey_names IS NULL}
+       violations = []
+       execute_query(sql).each do |h|
+         table_schema = h['table_schema']
+         table_name = h['table_name']
+         column_name = h['column_name']
+         violations << Violation.new(database: database_name,
+                                     metric: "Missing indexes",
+                                     offender: "#{table_schema}.#{table_name}.#{column_name}",
+                                     solution: "CREATE INDEX idx_#{table_schema}_#{table_name}_#{column_name} ON #{table_schema}.#{table_name} (#{column_name}); ")
+       end
+       return violations
+     end
+
+     # Adds a description to the current database by adding a comment.
+     def set_description_database(comment)
+       query = %{COMMENT ON DATABASE #{@connection.current_database} IS '#{comment}'}
+       execute_query(query)
+     end
+
+     # Adds a description to a table by adding a comment.
+     def set_description_table(comment, object_name)
+       query = %{COMMENT ON TABLE #{object_name} IS '#{comment}'}
+       execute_query(query)
+     end
+
+     # Queries for all tables that don't have comment metadata.
+     def get_tables_without_comments
+       query = %{select t.table_catalog, t.table_schema, t.table_name, d.description
+                 from information_schema.tables as t
+                 join pg_class as c on c.relname = t.table_name
+                 join pg_namespace as n on c.relnamespace = n.oid
+                 left join pg_description as d on c.oid = d.objoid
+                 where t.table_schema not in ('information_schema', 'pg_catalog', 'performance_schema', 'mysql', 'sys')
+                   and d.objoid is null}
+       violations = []
+       execute_query(query).each do |h|
+         table_catalog = h['table_catalog']
+         table_schema = h['table_schema']
+         table_name = h['table_name']
+         violations << Violation.new(database: database_name,
+                                     metric: "Table without comments",
+                                     offender: "#{table_catalog}.#{table_schema}.#{table_name}")
+       end
+       return violations
+     end
+
+     # Returns all databases on the PostgreSQL server.
+     def get_all_databases
+       sql = %q{SELECT datname FROM pg_database WHERE datistemplate = false;}
+       execute_query(sql).map { |v| v.values }.flatten.to_set
+     end
+
+     # Queries for all databases that don't have comment metadata.
+     def get_databases_without_comments
+       query = %{select db.datname
+                 from pg_database as db
+                 left join pg_shdescription as sd on sd.objoid = db.oid
+                 where sd.objoid is null}
+       violations = []
+       execute_query(query).each do |h|
+         datname = h['datname']
+         violations << Violation.new(database: database_name,
+                                     metric: "Database without comment",
+                                     offender: datname)
+       end
+       return violations
+     end
+
+     # Runs the COPY command, streaming the file through STDIN.
+     def copy_from_file(file, table_name, delimiter)
+       raw_connection = @connection.raw_connection
+
+       raw_connection.copy_data(%(COPY "#{table_name}" FROM STDIN DELIMITER '#{delimiter}' CSV HEADER;)) do
+         File.foreach(file) do |line|
+           raw_connection.put_copy_data(line)
+         end
+       end
+     end
+   end
+ end
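A brief sketch of driving copy_from_file above (the file path, table name and URL are placeholders); the CSV is expected to start with a header row, matching the HEADER option in the COPY statement:

  require 'dbtools/database/postgresql_connection'

  conn = Dbtools::Database::PostgresqlConnection.new('postgresql://user:secret@localhost/warehouse')  # placeholder URL
  # Streams data/products.csv into the "products" table via COPY ... FROM STDIN.
  conn.copy_from_file('data/products.csv', 'products', ',')
  conn.close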
data/lib/dbtools/database/violation.rb
@@ -0,0 +1,45 @@
+ require 'json'
+ require 'date'
+ require 'time' # for Time#iso8601
+
+ module Dbtools::Database
+   class Violation
+     attr_reader :metric, :database, :offender, :violating_records, :total_records, :solution, :schema, :table, :column
+
+     def initialize(metric:, database:, offender:,
+                    schema: nil, table: nil, column: nil,
+                    violating_records: nil, total_records: nil, solution: nil)
+       @metric = metric
+       @database = database
+       @offender = offender
+       @violating_records = violating_records
+       @total_records = total_records
+       @solution = solution
+       @timestamp = Time.now.utc.iso8601
+
+       # Save everything in a Hash to make JSON serialization straightforward.
+       @violation = Hash.new
+       @violation['metric'] = @metric
+       @violation['database'] = @database
+       @violation['offender'] = @offender
+
+       @schema, @table, @column = @offender.split(".")
+       @schema ||= schema
+       @table ||= table
+       @column ||= column
+
+       @violation['schema'] = @schema
+       @violation['table'] = @table
+       @violation['column'] = @column
+
+       @violation['violating_records'] = @violating_records unless violating_records.nil?
+       @violation['total_records'] = @total_records unless total_records.nil?
+       @violation['measure'] = @violating_records.to_f / total_records.to_f unless (violating_records.nil? || total_records.to_i.zero?)
+       @violation['solution'] = @solution unless solution.nil?
+       @violation['timestamp'] = @timestamp
+     end
+
+     def to_s
+       @violation.to_json
+     end
+   end
+ end
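To make the serialized shape concrete, a Violation built with invented values renders through to_s as follows (timestamp abbreviated):

  require 'dbtools/database/violation'

  v = Dbtools::Database::Violation.new(database: 'warehouse',
                                       metric: 'Completeness',
                                       offender: 'public.products.description',
                                       violating_records: 12,
                                       total_records: 480)
  puts v
  # => {"metric":"Completeness","database":"warehouse","offender":"public.products.description",
  #     "schema":"public","table":"products","column":"description",
  #     "violating_records":12,"total_records":480,"measure":0.025,"timestamp":"..."}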