dbtools 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +333 -0
  3. data/Thorfile +1 -0
  4. data/bin/dbtools +5 -0
  5. data/config/client_secret_dbtools.json +1 -0
  6. data/config/config.yml +1 -0
  7. data/config/database_config.yml +12 -0
  8. data/config/databases.txt +5 -0
  9. data/config/schedule.rb +8 -0
  10. data/dbtools.gemspec +37 -0
  11. data/lib/dbtools.rb +47 -0
  12. data/lib/dbtools/constants.rb +847 -0
  13. data/lib/dbtools/converter/csv2rdf_converter.rb +68 -0
  14. data/lib/dbtools/converter/csv_importer.rb +107 -0
  15. data/lib/dbtools/converter/excel2csv_converter.rb +40 -0
  16. data/lib/dbtools/converter/google_drive2_rdf_converter.rb +97 -0
  17. data/lib/dbtools/database/database_data.rb +146 -0
  18. data/lib/dbtools/database/db_connection.rb +236 -0
  19. data/lib/dbtools/database/mysql_connection.rb +78 -0
  20. data/lib/dbtools/database/postgresql_connection.rb +132 -0
  21. data/lib/dbtools/database/violation.rb +45 -0
  22. data/lib/dbtools/google_drive/google_drive_api.rb +211 -0
  23. data/lib/dbtools/google_drive/google_drive_entity.rb +22 -0
  24. data/lib/dbtools/google_drive/google_drive_file.rb +10 -0
  25. data/lib/dbtools/google_drive/google_drive_folder.rb +9 -0
  26. data/lib/dbtools/plsql_functions/connect_server.sql +30 -0
  27. data/lib/dbtools/plsql_functions/link.sql +17 -0
  28. data/lib/dbtools/plsql_functions/unlink.sql +15 -0
  29. data/lib/dbtools/rdf/rdf_reader.rb +136 -0
  30. data/lib/dbtools/version.rb +3 -0
  31. data/lib/rdf/geophy.rb +27 -0
  32. data/lib/tasks/aws.rb +43 -0
  33. data/lib/tasks/backup.rb +107 -0
  34. data/lib/tasks/check.rb +220 -0
  35. data/lib/tasks/ckan.rb +151 -0
  36. data/lib/tasks/convert.rb +139 -0
  37. data/lib/tasks/dump.rb +110 -0
  38. data/lib/tasks/googledrivetool.rb +252 -0
  39. data/lib/tasks/import.rb +142 -0
  40. data/lib/tasks/postgres.rb +29 -0
  41. metadata +307 -0
@@ -0,0 +1,236 @@
1
+ require 'active_record'
2
+ require 'ru_bee'
3
+ require 'dbtools/constants'
4
+ require 'dbtools/database/database_data'
5
+ require 'dbtools/database/violation'
6
+
7
+ module Dbtools::Database
8
+ class DbConnection
9
+ attr_accessor :connection
10
+
11
# Connects to the database identified by +url+ through ActiveRecord and
# immediately captures a snapshot of its structure.
#
# @param url the connection specification handed to ActiveRecord::Base.establish_connection
def initialize(url)
  pool = ActiveRecord::Base.establish_connection(url)
  @connection = pool.connection
  @database = get_current_database
end
16
+
17
# Closes the wrapped connection and returns whatever the adapter's #close returns.
def close
  @connection.close
end
20
+
21
# Name of the database the wrapped connection is currently attached to,
# as reported by the adapter's #current_database.
def database_name
  @connection.current_database
end
24
+
25
# Builds a DatabaseData snapshot of the connected database.
#
# Every row of information_schema.columns outside the system schemas is
# registered through add_table/add_column; columns whose data type contains
# "text" are filtered out by the query itself.
#
# @return [DatabaseData] the populated structure
def get_current_database
  sql = %{select c.table_catalog as "table_catalog", c.table_schema as "table_schema", c.table_name as "table_name", c.column_name as "column_name", c.data_type as "data_type"
          from information_schema.columns as c
          join information_schema.tables as t
          on c.table_catalog = t.table_catalog
          and c.table_schema = t.table_schema
          and c.table_name = t.table_name
          where c.data_type not like '%text%'
          and c.table_schema not in ('information_schema', 'performance_schema', 'mysql', 'sys', 'pg_catalog')}
  snapshot = DatabaseData.new(database_name)
  execute_query(sql).each do |row|
    owner = snapshot.add_table(row['table_name'], row['table_schema'])
    owner.add_column(row['column_name'], row['data_type'])
  end
  snapshot
end
45
+
46
# Lists the name of every column outside the system schemas.
#
# @return the rows produced by execute_query, one per column
def get_all_columns
  execute_query(%{select c.COLUMN_NAME
                  from information_schema.COLUMNS as c
                  where c.TABLE_SCHEMA not in ('information_schema', 'performance_schema', 'mysql', 'sys', 'pg_catalog')})
end
52
+
53
+
54
# Reports columns whose column, table or catalog name is a reserved SQL keyword.
#
# The keyword list comes from Dbtools::Constants::RESERVED_KEYWORDS and is
# compared against the upper-cased identifiers in information_schema.COLUMNS.
# NOTE(review): the query inspects TABLE_CATALOG, not TABLE_SCHEMA — confirm
# that this is intended, since the offender string is built from the schema.
#
# @return [Array<Violation>] one "Reserved keywords" violation per matching row
def check_reserved_keywords
  keywords = Dbtools::Constants::RESERVED_KEYWORDS
  # An empty IN () list is not valid SQL, and nothing could match it anyway.
  return [] if keywords.empty?

  keyword_list = "(#{keywords.map { |keyword| "'#{keyword}'" }.join(', ')})"
  sql = %{
    select c.TABLE_CATALOG as "table_catalog", c.TABLE_SCHEMA as "table_schema", c.TABLE_NAME as "table_name", c.COLUMN_NAME as "column_name"
    from information_schema.COLUMNS as c
    where c.TABLE_SCHEMA not in ('information_schema', 'performance_schema', 'mysql', 'sys', 'pg_catalog')
    and (upper(c.COLUMN_NAME) in #{keyword_list}
    or upper(c.TABLE_CATALOG) in #{keyword_list}
    or upper(c.TABLE_NAME) in #{keyword_list})
  }
  execute_query(sql).map do |row|
    Violation.new(database: database_name,
                  metric: "Reserved keywords",
                  offender: "#{row['table_schema']}.#{row['table_name']}.#{row['column_name']}")
  end
end
81
+
82
# Measures how many entries are missing per column.
#
# For every table in the cached snapshot, two queries supplied by the table
# object are run: one yielding the empty/null count per column (keyed by the
# same keys as table.columns) and one yielding the total record count, of which
# only the first value is used for all columns.
#
# @return [Array<Violation>] a "Completeness" violation for each column with missing entries
def get_completeness
  snapshot = @database

  snapshot.tables.each_value do |tbl|
    execute_query(tbl.query_empty_records).each do |row|
      tbl.columns.each { |key, column| column.missing_entries = row[key] }
    end
    execute_query(tbl.query_total_records).each do |row|
      tbl.columns.each { |_key, column| column.total_entries = row.values.first }
    end
  end

  snapshot.tables.each_with_object([]) do |(_table_name, tbl), found|
    tbl.columns.each do |_column_name, column|
      missing = column.missing_entries.to_i
      next if missing.zero?
      found << Violation.new(database: database_name,
                             metric: "Completeness",
                             offender: column.full_name.delete('"'),
                             violating_records: missing,
                             total_records: column.total_entries.to_i)
    end
  end
end
115
+
116
# Compares, per column, the number of distinct values with the number of
# distinct values after lower-casing (both counts come from queries supplied
# by the table object and are keyed like table.columns).
#
# @return [Array<Violation>] a "Syntax compression" violation for each column
#   where the two counts differ; violating_records is their difference
def get_syntax_compression
  snapshot = @database

  snapshot.tables.each_value do |tbl|
    execute_query(tbl.query_distinct_lowercased_entries).each do |row|
      tbl.columns.each { |key, column| column.distinct_lower_entries = row[key] }
    end
    execute_query(tbl.query_distinct_entries).each do |row|
      tbl.columns.each { |key, column| column.distinct_entries = row[key] }
    end
  end

  found = []
  snapshot.tables.each_value do |tbl|
    tbl.columns.each_value do |column|
      next if column.distinct_lower_entries == column.distinct_entries
      distinct = column.distinct_entries.to_i
      found << Violation.new(database: database_name,
                             metric: "Syntax compression",
                             offender: column.full_name.delete('"'),
                             violating_records: distinct - column.distinct_lower_entries.to_i,
                             total_records: distinct)
    end
  end
  found
end
146
+
147
# Reports the distinct-value count of every column against the table's total
# record count.
#
# Per table, one query fills in the distinct count for each column and a second
# query yields the total number of records (only its first value is used, for
# all columns). Columns with a distinct count of zero are skipped.
#
# @return [Array<Violation>] an "Inverse functional property" violation per remaining
#   column, with violating_records set to the distinct count
def get_inverse_functional_property
  snapshot = @database

  snapshot.tables.each_value do |tbl|
    execute_query(tbl.query_distinct_entries).each do |row|
      tbl.columns.each { |key, column| column.distinct_entries = row[key] }
    end
    execute_query(tbl.query_total_records).each do |row|
      tbl.columns.each { |_key, column| column.total_entries = row.values.first }
    end
  end

  snapshot.tables.each_with_object([]) do |(_table_name, tbl), found|
    tbl.columns.each_value do |column|
      distinct = column.distinct_entries.to_i
      next if distinct.zero?
      found << Violation.new(database: database_name,
                             metric: "Inverse functional property",
                             offender: column.full_name.delete('"'),
                             violating_records: distinct,
                             total_records: column.total_entries.to_i)
    end
  end
end
185
+
186
# Finds columns whose name is not entirely lower case.
#
# @return [Array<Violation>] an "Uppercase column names" violation per such column;
#   the solution field holds the schema.table.column path with the column name lower-cased
def get_uppercase_columns
  found = []
  @database.tables.each_value do |tbl|
    tbl.columns.each_value do |column|
      lowered = column.name.downcase
      next if lowered.eql?(column.name)
      found << Violation.new(database: database_name,
                             metric: "Uppercase column names",
                             offender: column.full_name.delete('"'),
                             solution: "#{tbl.schema}.#{tbl.name}.#{lowered.delete('"')}")
    end
  end
  found
end
202
+
203
# Spell-checks every column name in the cached snapshot by calling #correct?
# on the name.
#
# @return [Array<Violation>] a "Spelling" violation for each name that is not correct
def check_spelling
  found = []
  @database.tables.each_value do |tbl|
    tbl.columns.each_value do |column|
      unless column.name.correct?
        found << Violation.new(database: database_name,
                               metric: "Spelling",
                               offender: column.full_name.delete('"'))
      end
    end
  end
  found
end
218
+
219
# Asks the adapter to create a database called +name+.
#
# @param name name of the database to create
# @return whatever the adapter's #create_database returns
def create_database(name)
  @connection.create_database(name)
end
223
+
224
# Executes every *.sql file directly inside +directory+.
#
# Files are processed in sorted name order so that runs are reproducible
# (the order of raw directory listings is not guaranteed). Entries that merely
# look like SQL files but are not regular files, such as a sub-directory named
# "x.sql", are skipped.
#
# @param directory [String] path of the directory to scan (not recursive)
# @return [nil]
def execute_files(directory)
  Dir.entries(directory).sort.each do |entry|
    # '.' and '..' have no extension, so this also filters them out.
    next unless File.extname(entry) == '.sql'
    path = File.join(directory, entry)
    next unless File.file?(path)
    execute_query(File.read(path))
  end
  nil
end
234
+
235
+ end
236
+ end
@@ -0,0 +1,78 @@
1
+ require 'dbtools/database/db_connection'
2
+
3
+ module Dbtools::Database
4
+ class MysqlConnection < DbConnection
5
+
6
# Executes a SQL statement on the connected MySQL database.
#
# Double quotes are swapped for backticks so that queries written with
# standard identifier quoting run on MySQL. The translation is done on a copy:
# the caller's string is left untouched, so frozen strings are accepted too.
# Note that the swap is purely textual and also affects double quotes inside
# string literals.
#
# @param query [String] the SQL text
# @return the result rows from exec_query(...).to_hash, or an empty Hash when the
#   adapter raises (it does so, for instance, on an empty query)
def execute_query(query)
  mysql_query = query.tr('"', '`')
  begin
    @connection.exec_query(mysql_query).to_hash
  rescue
    {}
  end
end
18
+
19
# Finds columns whose name ends in "_id" and that are not part of any index
# (no matching row in information_schema.statistics), and proposes a
# CREATE INDEX statement for each.
#
# @return [Array<Violation>] one "Missing indexes" violation per unindexed column
def check_indexes
  # The backslash is doubled so that it survives Ruby's string escaping and
  # reaches MySQL as '%\_id'. Without it the underscore acts as a LIKE
  # wildcard and names such as "paid" or "valid" would match as well.
  sql = %{
    select cols.table_schema as "table_schema", cols.table_name as "table_name", cols.column_name as "column_name"
    from information_schema.columns as cols
    left join information_schema.statistics as stats
    on cols.table_schema = stats.table_schema
    and cols.table_name = stats.table_name
    and cols.column_name = stats.column_name
    where cols.table_schema not in ('information_schema', 'pg_catalog', 'performance_schema', 'mysql', 'sys')
    and cols.column_name like '%\\_id'
    and stats.column_name IS NULL}

  execute_query(sql).map do |row|
    table_schema = row['table_schema']
    table_name = row['table_name']
    column_name = row['column_name']
    Violation.new(database: database_name,
                  metric: "Missing indexes",
                  offender: "#{table_schema}.#{table_name}.#{column_name}",
                  solution: "CREATE INDEX idx_#{table_schema}_#{table_name}_#{column_name} ON #{table_schema}.#{table_name} (#{column_name}); ")
  end
end
45
+
46
# Adds a description to a table by setting its table comment.
#
# MySQL's table option syntax is "COMMENT = 'text'"; the comment text is passed
# through the adapter's #quote so that apostrophes in it cannot break the
# statement.
# NOTE(review): execute_query still turns double quotes into backticks, so
# double quotes inside +comment+ will be stored as backticks.
#
# @param comment [String] the description text
# @param object_name [String] name of the table (placed between backticks as-is)
# @return the result of execute_query
def set_description_table(comment, object_name)
  statement = "ALTER TABLE `#{object_name}` COMMENT = #{@connection.quote(comment)}"
  execute_query(statement)
end
51
+
52
# Lists every database visible on the MySQL server ("show databases").
#
# @return [Set] the database names
def get_all_databases
  rows = execute_query(%q{show databases})
  names = rows.map(&:values)
  names.flatten.to_set
end
57
+
58
+
59
# Finds tables outside the system schemas whose table comment is empty.
#
# @return [Array<Violation>] one "Table without comments" violation per table,
#   with the offender given as schema.table
def get_tables_without_comments
  sql = %{select t.table_schema as "table_schema", t.table_name as "table_name"
          from information_schema.tables as t
          where t.TABLE_SCHEMA not in ('information_schema', 'performance_schema', 'mysql', 'sys', 'pg_catalog')
          and t.table_comment = ''}

  execute_query(sql).map do |row|
    Violation.new(database: database_name,
                  metric: "Table without comments",
                  offender: "#{row['table_schema']}.#{row['table_name']}")
  end
end
76
+
77
+ end
78
+ end
@@ -0,0 +1,132 @@
1
+ require 'dbtools/database/db_connection'
2
+
3
+ module Dbtools::Database
4
+ class PostgresqlConnection < DbConnection
5
+
6
# Runs +query+ on the connected PostgreSQL database.
#
# @param query [String] the SQL text
# @return the adapter's exec_query result, or an empty Hash if the adapter raises
def execute_query(query)
  @connection.exec_query(query)
rescue
  {}
end
15
+
16
+
17
# Prints, for each table in the public schema larger than 80000 bytes, how its
# sequential-scan count compares to its index-scan count (from
# pg_stat_all_tables), flagging tables with more sequential scans as
# 'Missing Index?'. Each result row is written to standard output.
#
# @return the result of execute_query
def analyze_missing_indexes
  report_sql = %{
    SELECT relname, seq_scan-idx_scan AS too_much_seq, case when seq_scan-idx_scan>0 THEN 'Missing Index?' ELSE 'OK' END, pg_relation_size(relname::regclass) AS rel_size, seq_scan, idx_scan
    FROM pg_stat_all_tables
    WHERE schemaname ='public' AND pg_relation_size(relname::regclass)>80000 ORDER BY too_much_seq DESC;
  }
  execute_query(report_sql).each { |row| puts row }
end
29
+
30
# Finds columns named like "%_id" (other than 'id'/'ID') for which no index
# definition mentions a column of that name, and proposes a CREATE INDEX
# statement for each.
# NOTE(review): the join against the index definitions matches on column name
# only, so an index on a same-named column of another table appears to hide a
# missing one — confirm whether that is acceptable.
#
# @return [Array<Violation>] one "Missing indexes" violation per unindexed column
def check_indexes
  sql = %{
    select cols.table_schema as "table_schema", cols.table_name as "table_name", cols.column_name as "column_name"
    from
    (select c.table_schema, c.table_name, c.column_name
    from information_schema.columns as c
    where c.column_name != 'ID' and c.column_name != 'id' and c.column_name like '%\\_id'
    and c.table_schema not in ('information_schema', 'pg_catalog', 'performance_schema', 'mysql', 'sys')) as cols
    left join
    (SELECT UNNEST(ARRAY(
    SELECT pg_get_indexdef(idx.indexrelid, k + 1, true)
    FROM generate_subscripts(idx.indkey, 1) as k
    ORDER BY k
    )) as indkey_names
    FROM pg_index as idx
    JOIN pg_class as i ON i.oid = idx.indexrelid
    JOIN pg_am as am ON i.relam = am.oid) as indexes
    on cols.column_name = indexes.indkey_names
    where indexes.indkey_names IS NULL}
  execute_query(sql).map do |row|
    schema, table, column = row.values_at('table_schema', 'table_name', 'column_name')
    Violation.new(database: database_name,
                  metric: "Missing indexes",
                  offender: "#{schema}.#{table}.#{column}",
                  solution: "CREATE INDEX idx_#{schema}_#{table}_#{column} ON #{schema}.#{table} (#{column}); ")
  end
end
63
+
64
# Adds a description to the current database by setting its comment.
#
# The comment text goes through the adapter's #quote and the database name
# through #quote_column_name, so apostrophes in the text and mixed-case or
# otherwise unusual database names do not break the statement.
#
# @param comment [String] the description text
# @return the result of execute_query
def set_description_database(comment)
  database = @connection.quote_column_name(@connection.current_database)
  execute_query("COMMENT ON DATABASE #{database} IS #{@connection.quote(comment)}")
end
69
+
70
# Adds a description to a table by setting its comment.
#
# The comment text goes through the adapter's #quote so that apostrophes in it
# cannot terminate the SQL string early. +object_name+ is inserted unchanged,
# which lets callers pass schema-qualified or pre-quoted names.
#
# @param comment [String] the description text
# @param object_name [String] the table to describe
# @return the result of execute_query
def set_description_table(comment, object_name)
  execute_query("COMMENT ON TABLE #{object_name} IS #{@connection.quote(comment)}")
end
75
+
76
+
77
# Finds tables outside the system schemas that have no table comment.
#
# pg_class is matched on both relation name and schema, and only the
# table-level pg_description entry (classoid pg_class, objsubid 0) counts.
# A comment on one of the table's columns, or on a same-named table in another
# schema, therefore no longer makes a table look documented.
#
# The offender is reported as schema.table: Violation derives its schema and
# table fields from the first two dot-separated parts, and the database name is
# already carried by the database field.
#
# @return [Array<Violation>] one "Table without comments" violation per table
def get_tables_without_comments
  query = %{select t.table_catalog, t.table_schema, t.table_name, d.description
            from information_schema.tables as t
            join pg_class as c on c.relname = t.table_name
            join pg_namespace as n on c.relnamespace = n.oid and n.nspname = t.table_schema
            left join pg_description as d on c.oid = d.objoid
            and d.classoid = 'pg_class'::regclass
            and d.objsubid = 0
            where t.table_schema not in ('information_schema', 'pg_catalog', 'performance_schema', 'mysql', 'sys')
            and d.objoid is null}
  execute_query(query).map do |row|
    Violation.new(database: database_name,
                  metric: "Table without comments",
                  offender: "#{row['table_schema']}.#{row['table_name']}")
  end
end
98
+
99
# Lists every non-template database known to the PostgreSQL server.
#
# @return [Set] the database names
def get_all_databases
  rows = execute_query(%q{SELECT datname FROM pg_database WHERE datistemplate = false;})
  names = rows.map(&:values)
  names.flatten.to_set
end
104
+
105
# Finds databases that have no entry in pg_shdescription, i.e. no comment.
#
# @return [Array<Violation>] one "Database without comment" violation per database,
#   with the uncommented database's name as offender
def get_databases_without_comments
  sql = %{select db.datname
          from pg_database as db
          left join pg_shdescription as sd on sd.objoid = db.oid
          where sd.objoid is null}
  execute_query(sql).map do |row|
    Violation.new(database: database_name,
                  metric: "Database without comment",
                  offender: row['datname'])
  end
end
120
+
121
# Streams a CSV file with a header row into +table_name+ using
# COPY ... FROM STDIN on the raw driver connection.
#
# @param file [String] path of the file to load; it is sent line by line
# @param table_name [String] target table (wrapped in double quotes in the statement)
# @param delimiter [String] field delimiter, inserted verbatim between single quotes
# @return whatever the raw connection's #copy_data returns
def copy_from_file(file, table_name, delimiter)
  pg = @connection.raw_connection
  copy_statement = %(COPY "#{table_name}" FROM STDIN DELIMITER '#{delimiter}' CSV HEADER;)

  pg.copy_data(copy_statement) do
    File.foreach(file) { |row| pg.put_copy_data(row) }
  end
end
131
+ end
132
+ end
@@ -0,0 +1,45 @@
1
+ require 'json'
2
+ require 'date'
3
+
4
+ module Dbtools::Database
5
# A single data-quality finding that can be serialised to JSON.
#
# The schema, table and column fields are taken from the first three
# dot-separated parts of +offender+; the matching keyword argument is only used
# when the offender string has no part at that position.
class Violation
  attr_reader :metric, :database, :offender, :violating_records, :total_records, :solution, :schema, :table, :column

  # ISO 8601 layout for UTC timestamps. Formatting with strftime keeps the
  # class independent of the 'time' standard-library extension, which older
  # Rubies need for Time#iso8601.
  TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%SZ'.freeze

  # @param metric [String] name of the check that produced the finding
  # @param database [String] database the finding belongs to
  # @param offender [String] dot-separated path of the offending object
  # @param schema fallback when the offender has no first part
  # @param table fallback when the offender has no second part
  # @param column fallback when the offender has no third part
  # @param violating_records number of offending records, if known
  # @param total_records number of records examined, if known
  # @param solution [String, nil] suggested fix, if any
  def initialize(metric:, database:, offender:,
                 schema: nil, table: nil, column: nil,
                 violating_records: nil, total_records: nil, solution: nil)
    @metric = metric
    @database = database
    @offender = offender
    @violating_records = violating_records
    @total_records = total_records
    @solution = solution
    @timestamp = Time.now.utc.strftime(TIMESTAMP_FORMAT)

    parts = @offender.split('.')
    @schema = parts[0] || schema
    @table = parts[1] || table
    @column = parts[2] || column

    # Everything is kept in a Hash so that JSON serialisation is a single call.
    # Optional fields are only present when a value was supplied.
    @violation = {
      'metric' => @metric,
      'database' => @database,
      'offender' => @offender,
      'schema' => @schema,
      'table' => @table,
      'column' => @column
    }
    @violation['violating_records'] = @violating_records unless violating_records.nil?
    @violation['total_records'] = @total_records unless total_records.nil?
    # The ratio is skipped when it would be undefined (no count, or a zero total).
    unless violating_records.nil? || total_records.to_i.zero?
      @violation['measure'] = @violating_records.to_f / total_records.to_f
    end
    @violation['solution'] = @solution unless solution.nil?
    @violation['timestamp'] = @timestamp
  end

  # @return [String] the finding as a JSON object
  def to_s
    @violation.to_json
  end
end
45
+ end