dbtools 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +7 -0
  2. data/README.md +333 -0
  3. data/Thorfile +1 -0
  4. data/bin/dbtools +5 -0
  5. data/config/client_secret_dbtools.json +1 -0
  6. data/config/config.yml +1 -0
  7. data/config/database_config.yml +12 -0
  8. data/config/databases.txt +5 -0
  9. data/config/schedule.rb +8 -0
  10. data/dbtools.gemspec +37 -0
  11. data/lib/dbtools.rb +47 -0
  12. data/lib/dbtools/constants.rb +847 -0
  13. data/lib/dbtools/converter/csv2rdf_converter.rb +68 -0
  14. data/lib/dbtools/converter/csv_importer.rb +107 -0
  15. data/lib/dbtools/converter/excel2csv_converter.rb +40 -0
  16. data/lib/dbtools/converter/google_drive2_rdf_converter.rb +97 -0
  17. data/lib/dbtools/database/database_data.rb +146 -0
  18. data/lib/dbtools/database/db_connection.rb +236 -0
  19. data/lib/dbtools/database/mysql_connection.rb +78 -0
  20. data/lib/dbtools/database/postgresql_connection.rb +132 -0
  21. data/lib/dbtools/database/violation.rb +45 -0
  22. data/lib/dbtools/google_drive/google_drive_api.rb +211 -0
  23. data/lib/dbtools/google_drive/google_drive_entity.rb +22 -0
  24. data/lib/dbtools/google_drive/google_drive_file.rb +10 -0
  25. data/lib/dbtools/google_drive/google_drive_folder.rb +9 -0
  26. data/lib/dbtools/plsql_functions/connect_server.sql +30 -0
  27. data/lib/dbtools/plsql_functions/link.sql +17 -0
  28. data/lib/dbtools/plsql_functions/unlink.sql +15 -0
  29. data/lib/dbtools/rdf/rdf_reader.rb +136 -0
  30. data/lib/dbtools/version.rb +3 -0
  31. data/lib/rdf/geophy.rb +27 -0
  32. data/lib/tasks/aws.rb +43 -0
  33. data/lib/tasks/backup.rb +107 -0
  34. data/lib/tasks/check.rb +220 -0
  35. data/lib/tasks/ckan.rb +151 -0
  36. data/lib/tasks/convert.rb +139 -0
  37. data/lib/tasks/dump.rb +110 -0
  38. data/lib/tasks/googledrivetool.rb +252 -0
  39. data/lib/tasks/import.rb +142 -0
  40. data/lib/tasks/postgres.rb +29 -0
  41. metadata +307 -0
data/lib/dbtools/google_drive/google_drive_api.rb
@@ -0,0 +1,211 @@
+ require 'google/apis/drive_v3'
+ require 'googleauth'
+ require 'googleauth/stores/file_token_store'
+ require 'google/apis/plus_v1'
+ require 'dbtools/constants'
+ require 'fileutils'
+ require 'thor'
+
+ module Dbtools::Google_Drive
+   class Google_drive_api
+     attr_reader :service
+
+     OOB_URI = 'urn:ietf:wg:oauth:2.0:oob'
+     APPLICATION_NAME = 'Dbtools'
+     CLIENT_SECRETS_PATH = File.join(Dbtools::Constants::ROOT_DIR, 'config', 'client_secret_dbtools.json')
+     CREDENTIALS_PATH = File.join(Dir.home, '.credentials', "dbtools_geophy.yaml")
+     SCOPE = Google::Apis::DriveV3::AUTH_DRIVE
+     QUERY_FIELDS = 'nextPageToken, files(id,name,mimeType,size,fileExtension,iconLink,createdTime,' +
+                    'modifiedTime,webContentLink,webViewLink,parents,description,properties,trashed)'
+
+     Google::Apis.logger.level = Logger::FATAL
+
+     # Constructor for this class. You should pass the authorization object if you want to use
+     # another way to authenticate the user. Check the googleauth documentation for more info.
+     def initialize(auth=authorize)
+       @service = Google::Apis::DriveV3::DriveService.new
+       @service.client_options.application_name = APPLICATION_NAME
+       @service.authorization = auth
+     end
+
+     # Ensure valid credentials, either by restoring from the saved credentials
+     # files or intitiating an OAuth2 authorization. If authorization is required,
+     # the user's default browser will be launched to approve the request.
+     #
+     # @return [Google::Auth::UserRefreshCredentials] OAuth2 credentials
+     def authorize
+       unless File.exist?(CLIENT_SECRETS_PATH)
+         puts "#{CLIENT_SECRETS_PATH} not found."
+         puts "Create Google Drive API OAuth 2.0 credentials to allow access. "
+         exit(1)
+       end
+
+       FileUtils.mkdir_p(File.dirname(CREDENTIALS_PATH))
+       client_id = Google::Auth::ClientId.from_file(CLIENT_SECRETS_PATH)
+       token_store = Google::Auth::Stores::FileTokenStore.new(file: CREDENTIALS_PATH)
+       authorizer = Google::Auth::UserAuthorizer.new(
+         client_id, SCOPE, token_store)
+       user_id = 'default'
+       credentials = authorizer.get_credentials(user_id)
+       if credentials.nil?
+         url = authorizer.get_authorization_url(
+           base_url: OOB_URI)
+         puts "Open the following URL in the browser and enter the " +
+              "resulting code after authorization: "
+         puts url
+         code = gets
+         credentials = authorizer.get_and_store_credentials_from_code(
+           user_id: user_id, code: code, base_url: OOB_URI)
+       end
+       credentials
+     end
+
+     # Retrieves the children of the parent folder.
+     # @param [Google::Apis::DriveV3::File] parent_folder
+     #   Parent folder to get the children from.
+     # @return [Google::Apis::DriveV3::FileList]
+     #   List of all files in the parent_folder.
+     def get_children_files(parent_folder, optional_query = '')
+       page_token = nil
+       children = Google::Apis::DriveV3::FileList.new
+       begin
+         query = "'#{parent_folder.id}' in parents and trashed = false"
+         query << " and " << optional_query if !optional_query.empty?
+         result = @service.list_files(q: query,
+                                      spaces: 'drive',
+                                      page_size: 1000,
+                                      page_token: page_token,
+                                      fields: QUERY_FIELDS)
+         page_token = result.next_page_token
+         result.files.concat(children.files) unless children.files.nil?
+         children = result
+       end while !page_token.nil?
+       return children
+     end
+
+     # Updates the metadata of a Google Drive File.
+     # @param file_id
+     #   Google Drive File ID for the file to be updated.
+     # @param attributes
+     #   Hash containing all the attributes to be updated.
+     # @return [Google::Apis::DriveV3::File]
+     #   Returns File object with the updated metadata.
+     def update_file(file_id, attributes={})
+       attributes.stringify_keys!
+       file_object = {}
+       file_object[:description] = attributes.delete('description') if attributes['description']
+       file_object[:properties] = attributes
+       fields = file_object.keys.join(',')
+       return @service.update_file(file_id, file_object, fields: fields)
+     end
+
+     # Queries all the files and returns an Hash with all files.
+     # All files have a reference to their parent and children if applicable.
+     def get_tree(optional_query = "", verbose: false)
+       files = []
+       count = 0
+       # Query for all files. Loops until no next_page_tokens are given.
+       # Returns 1000 files per query (Google Drive API limitation).
+       page_token = nil
+       begin
+         query = "trashed = false"
+         query << " and " << optional_query if !optional_query.nil? && !optional_query.empty?
+         result = @service.list_files(q: query,
+                                      spaces: 'drive',
+                                      page_size: 1000,
+                                      page_token: page_token,
+                                      fields: QUERY_FIELDS)
+
+         page_token = result.next_page_token
+         files.concat(result.files) unless result.files.nil?
+         count += result.files.size
+         STDERR.puts "Count: #{count} \t\t\r" if verbose
+       end while !page_token.nil?
+
+       # Convert array into key-value pairs, using ID as key.
+       # ID is unique for every file.
+       files = files.map! { |f| [f.id, f] } if not files.empty?
+       files = files.to_h
+
+       # Add two-way reference for all children-parents.
+       # files.each do |id, file|
+       #   file.parents.each do |parent|
+       #     next if files[parent].nil?
+       #     files[parent].children = [] if files[parent].children.nil?
+       #     files[parent].children << file.id
+       #   end unless file.parents.nil?
+       # end
+       return files
+     end
+
+     # Return the changes that happened from a point given by the page token.
+     # The page token for the current state can be retrieved by calling @service.get_changes_start_page_token.
+     # @param page_token
+     #   Last saved token
+     # @return changed_files, removed_files, saved_start_page_token
+     #   An array with three elements. The first element containing a hash with the file_id as the key,
+     #   and the file object containing the attributes as value, for all changed files. The second element
+     #   contains the file_ids for all files that were removed. The third element contains the new page token.
+     def get_changes(page_token)
+       changed_files = {}
+       removed_files = []
+       saved_start_page_token = page_token
+       while !page_token.nil?
+         response = @service.list_changes(page_token,
+                                          spaces: 'drive',
+                                          page_size: 1000,
+                                          restrict_to_my_drive: true,
+                                          fields: 'nextPageToken, newStartPageToken, changes')
+         response.changes.each do |change|
+           if change.file
+             changed_files[change.file_id] = change.file
+           else
+             removed_files << change.file_id
+           end
+         end
+         # Last page, save this token for the next polling interval
+         saved_start_page_token = response.new_start_page_token if response.new_start_page_token
+         page_token = response.next_page_token
+       end
+       return changed_files, removed_files, saved_start_page_token
+     end
+
+     # Prints the children of the parent folder.
+     # @param [Google::Apis::DriveV3::DriveService] drive_service
+     #   An authenticated Google Drive service.
+     # @param [Google::Apis::DriveV3::File] parent_folder
+     #   Parent folder to get the children from.
+     # @return [String]
+     #   String representing the folder structure.
+     def print_child_files(parent_folder, depth = 0, optional_query = '')
+       output = ''
+       children = get_children_files(parent_folder, optional_query)
+       children.files.each do |file|
+         prefix = ' ' * depth
+         if file.mime_type == 'application/vnd.google-apps.folder'
+           output << "#{prefix} - #{file.name} (#{file.id})" << "\n"
+           output << print_child_files(file, depth + 1)
+         else
+           output << "#{prefix} * #{file.name} (#{file.id})" << "\n"
+         end
+       end
+       return output
+     end
+
+     # Extracts the file_id from a google link using regex.
+     # Matches links like:
+     #   https://drive.google.com/uc?export=download&id=FILEID
+     #   https://drive.google.com/open?id=FILEID
+     #   https://drive.google.com/file/d/FILEID/edit?usp=sharing
+     def self.get_file_id(access_url)
+       match = access_url.match(/.*google.*(id=|d\/)\b(?<file_id>[a-zA-Z0-9_-]+)\b/)
+       match['file_id']
+     end
+
+     # Returns the url from the drive id
+     def self.get_url_from_id(drive_id)
+       return "https://drive.google.com/open?id=#{drive_id}"
+     end
+
+   end
+ end
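The class above wraps the Drive v3 endpoints used by the googledrivetool tasks: OAuth bootstrap, paginated listing, metadata updates, and change polling. A minimal usage sketch, not part of the gem, assuming config/client_secret_dbtools.json is in place and using the same FILEID placeholder as the comments above:

    require 'dbtools/google_drive/google_drive_api'

    # Class-level helpers that need no authentication.
    file_id = Dbtools::Google_Drive::Google_drive_api.get_file_id(
      'https://drive.google.com/open?id=FILEID')
    puts Dbtools::Google_Drive::Google_drive_api.get_url_from_id(file_id)

    # Instantiating the class runs the OAuth flow on first use and caches the
    # token under ~/.credentials/dbtools_geophy.yaml.
    api = Dbtools::Google_Drive::Google_drive_api.new

    # 'root' is the Drive alias for the top-level My Drive folder.
    root = api.service.get_file('root')
    puts api.print_child_files(root)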
data/lib/dbtools/google_drive/google_drive_entity.rb
@@ -0,0 +1,22 @@
+ require 'spira'
+
+ module Dbtools::Google_Drive
+   class GoogleDriveEntity < Spira::Base
+     configure :default_vocabulary => RDF::Geophy.vocab
+     type RDF::Geophy.GoogleDriveEntity
+
+     property :identifier, :type => RDF::XSD.string
+     property :name, :type => RDF::XSD.string
+     property :mime_type, :type => RDF::XSD.string
+     property :size, :type => RDF::XSD.integer
+     property :modified_time, :type => RDF::XSD.dateTime
+     property :created_time, :type => RDF::XSD.dateTime
+     property :icon_link, :type => RDF::URI
+     property :description, :predicate => RDF::RDFS.comment, :type => RDF::XSD.string
+     property :web_view_link, :type => RDF::URI
+     property :trashed, :type => RDF::XSD.boolean
+
+     # Has many relation, because files can be symlinked, resulting in multiple parents.
+     has_many :parents, :type => :GoogleDriveFolder
+   end
+ end
data/lib/dbtools/google_drive/google_drive_file.rb
@@ -0,0 +1,10 @@
+ require 'dbtools/google_drive/google_drive_entity'
+
+ module Dbtools::Google_Drive
+   class GoogleDriveFile < GoogleDriveEntity
+     type RDF::Geophy.GoogleDriveFile
+
+     property :file_extension, :type => RDF::XSD.string
+     property :web_content_link, :type => RDF::URI
+   end
+ end
data/lib/dbtools/google_drive/google_drive_folder.rb
@@ -0,0 +1,9 @@
+ require 'dbtools/google_drive/google_drive_entity'
+
+ module Dbtools::Google_Drive
+   class GoogleDriveFolder < GoogleDriveEntity
+     type RDF::Geophy.GoogleDriveFolder
+
+     has_many :children, :type => :GoogleDriveEntity
+   end
+ end
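These three Spira models (entity, file, folder) project Drive metadata onto the RDF::Geophy vocabulary defined in data/lib/rdf/geophy.rb. A rough sketch of building a file resource by hand, assuming the require order shown works with the lib directory on the load path and that an in-memory repository is enough for Spira; the URI and values are placeholders:

    require 'spira'
    require 'rdf/geophy'                                  # vocabulary used by the models (assumed load path)
    require 'dbtools/google_drive/google_drive_file'

    # Spira projections need a backing repository before resources can be saved.
    Spira.repository = RDF::Repository.new

    file = Dbtools::Google_Drive::GoogleDriveFile.for(
      RDF::URI('https://drive.google.com/open?id=FILEID'))
    file.name = 'example.csv'
    file.mime_type = 'text/csv'
    file.trashed = false
    file.save!

    puts Spira.repository.count   # number of triples written for the resource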
data/lib/dbtools/plsql_functions/connect_server.sql
@@ -0,0 +1,30 @@
+ CREATE OR REPLACE FUNCTION connect_server(
+     servername character varying,
+     host character varying,
+     port character varying,
+     dbname character varying,
+     server_user character varying,
+     server_password character varying)
+   RETURNS void AS
+ $BODY$
+ BEGIN
+   CREATE EXTENSION IF NOT EXISTS postgres_fdw;
+
+   EXECUTE '
+     CREATE EXTENSION IF NOT EXISTS postgres_fdw;
+
+     DROP SERVER IF EXISTS ' || serverName || ' CASCADE;
+
+     CREATE SERVER ' || serverName || '
+       FOREIGN DATA WRAPPER postgres_fdw
+       OPTIONS (host ' || quote_literal(host) || ', port ' || quote_literal(port) || ', dbname ' || quote_literal(dbname) || ');
+
+     CREATE USER MAPPING FOR current_user
+       SERVER ' || serverName || '
+       OPTIONS (user ' || quote_literal(server_user) || ', password ' || quote_literal(server_password) || ');
+   ';
+
+ END;
+
+ $BODY$
+ LANGUAGE plpgsql
data/lib/dbtools/plsql_functions/link.sql
@@ -0,0 +1,17 @@
+ CREATE OR REPLACE FUNCTION link(
+     f_server_name character varying,
+     f_schema_name character varying,
+     f_table_name character varying)
+   RETURNS void AS
+ $BODY$
+ BEGIN
+
+   EXECUTE '
+     IMPORT FOREIGN SCHEMA ' || quote_ident(f_schema_name) || ' LIMIT TO (' || quote_ident(f_table_name) || ')
+       FROM SERVER ' || f_server_name || ' INTO public;
+   ';
+
+ END;
+
+ $BODY$
+ LANGUAGE plpgsql
data/lib/dbtools/plsql_functions/unlink.sql
@@ -0,0 +1,15 @@
+ CREATE OR REPLACE FUNCTION unlink(
+     f_schema_name character varying,
+     f_table_name character varying)
+   RETURNS void AS
+ $BODY$
+ BEGIN
+
+   EXECUTE '
+     DROP FOREIGN TABLE ' || f_schema_name || '.' || f_table_name || ' CASCADE;
+   ';
+
+ END;
+
+ $BODY$
+ LANGUAGE plpgsql
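Taken together, the three PL/pgSQL functions wrap postgres_fdw: connect_server registers a foreign server and a user mapping, link imports a single remote table into the public schema, and unlink drops the foreign table again. A hedged sketch of calling them from Ruby with the pg gem, assuming the functions have already been created in the target database; the pg dependency and all connection details are placeholders, not part of this diff:

    require 'pg'   # assumed for illustration only

    conn = PG.connect(dbname: 'local_db')   # placeholder connection settings

    # Register the remote server and map the current user to remote credentials.
    conn.exec_params('SELECT connect_server($1, $2, $3, $4, $5, $6)',
                     ['remote_srv', 'remote.example.com', '5432', 'remote_db',
                      'remote_user', 'remote_password'])

    # Import one remote table as a foreign table in the public schema ...
    conn.exec_params('SELECT link($1, $2, $3)', ['remote_srv', 'public', 'customers'])

    # ... and drop it again when it is no longer needed.
    conn.exec_params('SELECT unlink($1, $2)', ['public', 'customers'])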
data/lib/dbtools/rdf/rdf_reader.rb
@@ -0,0 +1,136 @@
+ require 'rdf'
+ require 'thor'
+ require 'sparql'
+ require 'linkeddata'
+
+ module Dbtools
+   class Rdf_reader
+
+     def initialize
+       @graph = RDF::Graph.new
+     end
+
+     # Loads a file into the graph
+     def load_from_file(file)
+       @graph.load(file)
+       while has_next_page
+         next_page
+       end
+     end
+
+     # Executes a query that returns all csv/postgres/mysql datasets from the rdf graph.
+     def get_available_databases
+       query = SPARQL.parse(%(
+         PREFIX dcat: <http://www.w3.org/ns/dcat#>
+         PREFIX dct: <http://purl.org/dc/terms/>
+         PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+         PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+         PREFIX gp: <http://www.geophy.com/rdf/terms#>
+         SELECT ?dataset ?database_title ?dataset_title
+         WHERE {
+           ?dataset rdf:type dcat:Dataset ;
+                    dct:title ?dataset_title ;
+                    gp:database ?database_title .
+         }
+         ORDER BY ?database_title
+       ))
+       @graph.query(query).map.with_index do |result, index|
+         queryResult = Hash.new
+         queryResult['dataset'] = result.dataset.to_s
+         queryResult['dataset_title'] = result.dataset_title.to_s
+         queryResult['database_title'] = result.database_title.to_s
+         [index, queryResult]
+       end.to_h
+     end
+
+     def get_metadata(dataset_id)
+       query = SPARQL.parse(%(
+         PREFIX dcat: <http://www.w3.org/ns/dcat#>
+         PREFIX dct: <http://purl.org/dc/terms/>
+         PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+         PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+         PREFIX gp: <http://www.geophy.com/rdf/terms#>
+         SELECT ?database_title ?dataset ?distribution ?format ?description ?access_url ?resource_title
+         WHERE {
+           ?dataset rdf:type dcat:Dataset ;
+                    gp:database ?database_title ;
+                    dcat:distribution ?distribution .
+           OPTIONAL {
+             ?dataset dct:description ?description .
+           }
+           ?distribution dct:format ?format ;
+                         dct:title ?resource_title ;
+                         dcat:accessURL ?access_url .
+           FILTER( regex(?format, "postgres|mysql|csv|tsv|xlsx|xls|txt", 'i') &&
+                   regex(str(?dataset), "#{dataset_id}") )
+         }
+         ORDER BY ?title
+       ))
+       output = @graph.query(query).map.with_index do |result, index|
+         queryResult = Hash.new
+         queryResult['dataset'] = result.dataset
+         queryResult['resource'] = result.distribution
+         queryResult['database_title'] = result.database_title.to_s
+         queryResult['resource_title'] = result.resource_title.to_s
+         queryResult['format'] = result.format.to_s
+         queryResult['description'] = result.description.to_s if defined?(result.description)
+         queryResult['access_url'] = result.access_url.to_s
+         [index, queryResult]
+       end.to_h
+
+       return output
+     end
+
+     # Check if there's a hydra next page.
+     def has_next_page
+       question = SPARQL.parse(%[
+         PREFIX hydra: <http://www.w3.org/ns/hydra/core#>
+         PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+         PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+
+         ASK { ?catalog rdf:type hydra:PagedCollection ;
+                        hydra:nextPage ?next_catalog . }
+       ])
+       @graph.query(question).true?
+     end
+
+     # Adds the next page to the graph
+     def next_page
+       return nil if !has_next_page
+
+       # Get next page links
+       next_page_query = SPARQL.parse(%(
+         PREFIX hydra: <http://www.w3.org/ns/hydra/core#>
+         PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+         PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+
+         SELECT ?catalog
+         WHERE { ?current_catalog rdf:type hydra:PagedCollection ;
+                                  hydra:nextPage ?catalog . }))
+       next_page = @graph.query(next_page_query).first
+
+       # Delete existing hydra nodes
+       delete_query = SPARQL.parse(%(
+         PREFIX hydra: <http://www.w3.org/ns/hydra/core#>
+         PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+         PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+
+         SELECT ?catalog ?p ?o
+         WHERE { ?catalog rdf:type hydra:PagedCollection ;
+                          ?p ?o . }
+       ))
+       @graph.query(delete_query) do |res|
+         statement = RDF::Statement(res.catalog, res.p, res.o)
+         @graph.delete(statement)
+       end
+       # Load the next page.
+       @graph.load(next_page[:catalog])
+     end
+   end
+
+
+   # rdf = Rdf_reader.new
+   # rdf.load_from_file("/mnt/data/Development/geophy/db-maintenance-tool/lib/turtle_example.ttl")
+   # metadata = rdf.get_metadata("dvdrental")
+   # puts metadata.inspect
+ end
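The commented-out lines at the end of rdf_reader.rb hint at the intended workflow: load a DCAT/Hydra catalogue, then query it for datasets and their distributions. A slightly fuller sketch along the same lines, assuming a catalogue dump in Turtle; the file path and dataset name are placeholders:

    require 'dbtools/rdf/rdf_reader'

    rdf = Dbtools::Rdf_reader.new
    rdf.load_from_file('catalog.ttl')   # follows hydra:nextPage links until the last page

    # Returns { index => {'dataset' => ..., 'dataset_title' => ..., 'database_title' => ...} }
    rdf.get_available_databases.each do |index, entry|
      puts "#{index}: #{entry['database_title']} / #{entry['dataset_title']}"
    end

    puts rdf.get_metadata('dvdrental').inspect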