dbtools 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +333 -0
  3. data/Thorfile +1 -0
  4. data/bin/dbtools +5 -0
  5. data/config/client_secret_dbtools.json +1 -0
  6. data/config/config.yml +1 -0
  7. data/config/database_config.yml +12 -0
  8. data/config/databases.txt +5 -0
  9. data/config/schedule.rb +8 -0
  10. data/dbtools.gemspec +37 -0
  11. data/lib/dbtools.rb +47 -0
  12. data/lib/dbtools/constants.rb +847 -0
  13. data/lib/dbtools/converter/csv2rdf_converter.rb +68 -0
  14. data/lib/dbtools/converter/csv_importer.rb +107 -0
  15. data/lib/dbtools/converter/excel2csv_converter.rb +40 -0
  16. data/lib/dbtools/converter/google_drive2_rdf_converter.rb +97 -0
  17. data/lib/dbtools/database/database_data.rb +146 -0
  18. data/lib/dbtools/database/db_connection.rb +236 -0
  19. data/lib/dbtools/database/mysql_connection.rb +78 -0
  20. data/lib/dbtools/database/postgresql_connection.rb +132 -0
  21. data/lib/dbtools/database/violation.rb +45 -0
  22. data/lib/dbtools/google_drive/google_drive_api.rb +211 -0
  23. data/lib/dbtools/google_drive/google_drive_entity.rb +22 -0
  24. data/lib/dbtools/google_drive/google_drive_file.rb +10 -0
  25. data/lib/dbtools/google_drive/google_drive_folder.rb +9 -0
  26. data/lib/dbtools/plsql_functions/connect_server.sql +30 -0
  27. data/lib/dbtools/plsql_functions/link.sql +17 -0
  28. data/lib/dbtools/plsql_functions/unlink.sql +15 -0
  29. data/lib/dbtools/rdf/rdf_reader.rb +136 -0
  30. data/lib/dbtools/version.rb +3 -0
  31. data/lib/rdf/geophy.rb +27 -0
  32. data/lib/tasks/aws.rb +43 -0
  33. data/lib/tasks/backup.rb +107 -0
  34. data/lib/tasks/check.rb +220 -0
  35. data/lib/tasks/ckan.rb +151 -0
  36. data/lib/tasks/convert.rb +139 -0
  37. data/lib/tasks/dump.rb +110 -0
  38. data/lib/tasks/googledrivetool.rb +252 -0
  39. data/lib/tasks/import.rb +142 -0
  40. data/lib/tasks/postgres.rb +29 -0
  41. metadata +307 -0
@@ -0,0 +1,211 @@
1
+ require 'google/apis/drive_v3'
2
+ require 'googleauth'
3
+ require 'googleauth/stores/file_token_store'
4
+ require 'google/apis/plus_v1'
5
+ require 'dbtools/constants'
6
+ require 'fileutils'
7
+ require 'thor'
8
+
9
+ module Dbtools::Google_Drive
10
+ class Google_drive_api
11
+ attr_reader :service
12
+
13
+ OOB_URI = 'urn:ietf:wg:oauth:2.0:oob'
14
+ APPLICATION_NAME = 'Dbtools'
15
+ CLIENT_SECRETS_PATH = File.join(Dbtools::Constants::ROOT_DIR, 'config', 'client_secret_dbtools.json')
16
+ CREDENTIALS_PATH = File.join(Dir.home, '.credentials', "dbtools_geophy.yaml")
17
+ SCOPE = Google::Apis::DriveV3::AUTH_DRIVE
18
+ QUERY_FIELDS = 'nextPageToken, files(id,name,mimeType,size,fileExtension,iconLink,createdTime,' +
19
+ 'modifiedTime,webContentLink,webViewLink,parents,description,properties,trashed)'
20
+
21
+ Google::Apis.logger.level = Logger::FATAL
22
+
23
# Builds the Drive service used by every other method of this class.
# Pass your own authorization object to authenticate the user in another way
# (see the googleauth documentation); when omitted, the result of #authorize
# is used.
def initialize(auth = authorize)
  drive = Google::Apis::DriveV3::DriveService.new
  drive.client_options.application_name = APPLICATION_NAME
  drive.authorization = auth
  @service = drive
end
30
+
31
# Ensures valid credentials, either by restoring them from the saved
# credentials file or by initiating an OAuth2 authorization. When authorization
# is required, the authorization URL is printed and the resulting code is read
# from standard input.
#
# Exits the process with status 1 when the client secrets file does not exist.
#
# @return [Google::Auth::UserRefreshCredentials] OAuth2 credentials
def authorize
  unless File.exist?(CLIENT_SECRETS_PATH)
    puts "#{CLIENT_SECRETS_PATH} not found."
    puts "Create Google Drive API OAuth 2.0 credentials to allow access. "
    exit(1)
  end

  FileUtils.mkdir_p(File.dirname(CREDENTIALS_PATH))
  client_id = Google::Auth::ClientId.from_file(CLIENT_SECRETS_PATH)
  token_store = Google::Auth::Stores::FileTokenStore.new(file: CREDENTIALS_PATH)
  authorizer = Google::Auth::UserAuthorizer.new(client_id, SCOPE, token_store)
  user_id = 'default'

  # Stored credentials are reused as-is; the interactive flow below only runs
  # when nothing has been saved for this user yet.
  credentials = authorizer.get_credentials(user_id)
  return credentials unless credentials.nil?

  url = authorizer.get_authorization_url(base_url: OOB_URI)
  puts "Open the following URL in the browser and enter the " +
       "resulting code after authorization: "
  puts url
  # Read from $stdin explicitly: a bare `gets` reads from ARGF, which treats
  # command-line arguments as file names. `to_s` covers EOF (nil) and `strip`
  # removes the trailing newline from the pasted code.
  code = $stdin.gets.to_s.strip
  authorizer.get_and_store_credentials_from_code(
    user_id: user_id, code: code, base_url: OOB_URI)
end
62
+
63
# Retrieves the non-trashed children of the parent folder.
# @param [Google::Apis::DriveV3::File] parent_folder
#   Parent folder to get the children from.
# @param [String, nil] optional_query
#   Extra Drive query clause that is and-ed onto the parent filter.
#   nil or an empty string means no extra filter.
# @return [Google::Apis::DriveV3::FileList]
#   The last result page, with `files` replaced by the children of every page
#   in the order the API returned them (an empty array when there are none).
def get_children_files(parent_folder, optional_query = '')
  query = "'#{parent_folder.id}' in parents and trashed = false"
  query = "#{query} and #{optional_query}" unless optional_query.nil? || optional_query.empty?

  collected = []
  page_token = nil
  page = nil
  loop do
    page = @service.list_files(q: query,
                               spaces: 'drive',
                               page_size: 1000,
                               page_token: page_token,
                               fields: QUERY_FIELDS)
    # A page may come back without a files array; skip it instead of crashing.
    collected.concat(page.files) unless page.files.nil?
    page_token = page.next_page_token
    break if page_token.nil?
  end

  page.files = collected
  page
end
85
+
86
# Updates the metadata of a Google Drive File.
# @param file_id
#   Google Drive File ID for the file to be updated.
# @param attributes
#   Hash with the attributes to update; keys may be Strings or Symbols.
#   A truthy 'description' entry is sent as the file description, every other
#   entry is sent as a custom property. The hash passed in is not modified.
# @return [Google::Apis::DriveV3::File]
#   Returns File object with the updated metadata.
def update_file(file_id, attributes = {})
  # Work on a stringified copy so the caller's hash is left untouched and no
  # ActiveSupport extension (stringify_keys!) is needed.
  properties = attributes.each_with_object({}) { |(key, value), copy| copy[key.to_s] = value }

  file_object = {}
  file_object[:description] = properties.delete('description') if properties['description']
  file_object[:properties] = properties

  # Only the attributes being sent are requested back from the API.
  @service.update_file(file_id, file_object, fields: file_object.keys.join(','))
end
101
+
102
# Queries all non-trashed files and returns them as a Hash keyed by file ID.
# The two-way parent/children linking is currently disabled (see the
# commented-out code at the end of the method).
# @param [String, nil] optional_query
#   Extra Drive query clause that is and-ed onto "trashed = false".
# @param [Boolean] verbose
#   When true, the running file count is written to STDERR after every page.
# @return [Hash]
#   File ID => file object, for every file returned by the query.
def get_tree(optional_query = "", verbose: false)
  query = "trashed = false"
  query = "#{query} and #{optional_query}" unless optional_query.nil? || optional_query.empty?

  # Query for all files. Loops until no next_page_token is given.
  # Returns 1000 files per query (Google Drive API limitation).
  files = []
  page_token = nil
  loop do
    result = @service.list_files(q: query,
                                 spaces: 'drive',
                                 page_size: 1000,
                                 page_token: page_token,
                                 fields: QUERY_FIELDS)
    # A page may come back without a files array; it simply adds nothing.
    files.concat(result.files) unless result.files.nil?
    STDERR.puts "Count: #{files.size} \t\t\r" if verbose
    page_token = result.next_page_token
    break if page_token.nil?
  end

  # Convert array into key-value pairs, using ID as key.
  # ID is unique for every file.
  files.map { |file| [file.id, file] }.to_h

  # Add two-way reference for all children-parents.
  # files.each do |id, file|
  #   file.parents.each do |parent|
  #     next if files[parent].nil?
  #     files[parent].children = [] if files[parent].children.nil?
  #     files[parent].children << file.id
  #   end unless file.parents.nil?
  # end
end
140
+
141
# Returns the changes that happened from a point given by the page token.
# The page token for the current state can be retrieved by calling @service.get_changes_start_page_token.
# @param page_token
#   Last saved token. When nil, no request is made.
# @return changed_files, removed_files, saved_start_page_token
#   An array with three elements. The first element is a hash with the file_id as the key
#   and the file object as value, for all changed files. The second element contains the
#   file_ids of all changes that carried no file object (removed files). The third element
#   is the new start page token, or the token passed in when the API returned none.
def get_changes(page_token)
  changed_files = {}
  removed_files = []
  saved_start_page_token = page_token

  until page_token.nil?
    response = @service.list_changes(page_token,
                                     spaces: 'drive',
                                     page_size: 1000,
                                     restrict_to_my_drive: true,
                                     fields: 'nextPageToken, newStartPageToken, changes')
    # A page without a changes array contributes nothing instead of raising.
    (response.changes || []).each do |change|
      if change.file
        changed_files[change.file_id] = change.file
      else
        removed_files << change.file_id
      end
    end
    # Last page, save this token for the next polling interval
    saved_start_page_token = response.new_start_page_token if response.new_start_page_token
    page_token = response.next_page_token
  end

  [changed_files, removed_files, saved_start_page_token]
end
172
+
173
# Builds a text outline of the folder structure below the parent folder.
# Nothing is printed; the outline is returned as a String.
# @param [Google::Apis::DriveV3::File] parent_folder
#   Parent folder to get the children from.
# @param [Integer] depth
#   Nesting level of parent_folder; each level indents its entries by one space.
# @param [String] optional_query
#   Extra Drive query clause, applied to this folder and to every subfolder.
# @return [String]
#   String representing the folder structure: folders are listed with " - ",
#   other files with " * ", each followed by its ID.
def print_child_files(parent_folder, depth = 0, optional_query = '')
  prefix = ' ' * depth
  children = get_children_files(parent_folder, optional_query)

  # A listing without files yields an empty outline instead of raising.
  (children.files || []).each_with_object(String.new) do |file, output|
    if file.mime_type == 'application/vnd.google-apps.folder'
      output << "#{prefix} - #{file.name} (#{file.id})" << "\n"
      # Pass the query on so subfolders are filtered like the top level.
      output << print_child_files(file, depth + 1, optional_query)
    else
      output << "#{prefix} * #{file.name} (#{file.id})" << "\n"
    end
  end
end
194
+
195
# Extracts the file_id from a google link using regex.
# Matches links like:
# https://drive.google.com/uc?export=download&id=FILEID
# https://drive.google.com/open?id=FILEID
# https://drive.google.com/file/d/FILEID/edit?usp=sharing
#
# The first "id=" query parameter or "/d/" path segment after "google" wins,
# so path parts that follow the ID (such as "/edit") are never mistaken for it.
# @param access_url
#   Link to extract the ID from.
# @return [String, nil]
#   The file ID, or nil when the link contains none.
def self.get_file_id(access_url)
  match = access_url.to_s.match(%r{google.*?(?:[?&]id=|/d/)(?<file_id>[a-zA-Z0-9_-]+)})
  match && match['file_id']
end
204
+
205
# Builds the Google Drive "open" link for the given drive id.
def self.get_url_from_id(drive_id)
  "https://drive.google.com/open?id=#{drive_id}"
end
209
+
210
+ end
211
+ end
@@ -0,0 +1,22 @@
1
+ require 'spira'
2
+
3
module Dbtools::Google_Drive
  # Spira resource that GoogleDriveFile and GoogleDriveFolder inherit from.
  # Typed as RDF::Geophy.GoogleDriveEntity, with the Geophy vocabulary as the
  # default vocabulary for its properties.
  class GoogleDriveEntity < Spira::Base
    configure default_vocabulary: RDF::Geophy.vocab
    type RDF::Geophy.GoogleDriveEntity

    property :identifier, type: RDF::XSD.string
    property :name, type: RDF::XSD.string
    property :mime_type, type: RDF::XSD.string
    property :size, type: RDF::XSD.integer
    property :modified_time, type: RDF::XSD.dateTime
    property :created_time, type: RDF::XSD.dateTime
    property :icon_link, type: RDF::URI
    # The description is stored under rdfs:comment instead of the default vocabulary.
    property :description, predicate: RDF::RDFS.comment, type: RDF::XSD.string
    property :web_view_link, type: RDF::URI
    property :trashed, type: RDF::XSD.boolean

    # Has many relation, because files can be symlinked, resulting in multiple parents.
    has_many :parents, type: :GoogleDriveFolder
  end
end
@@ -0,0 +1,10 @@
1
+ require 'dbtools/google_drive/google_drive_entity'
2
+
3
module Dbtools::Google_Drive
  # Spira resource for a Google Drive file: a GoogleDriveEntity typed as
  # RDF::Geophy.GoogleDriveFile with two file-only properties.
  class GoogleDriveFile < GoogleDriveEntity
    type RDF::Geophy.GoogleDriveFile

    property :file_extension, type: RDF::XSD.string
    property :web_content_link, type: RDF::URI
  end
end
@@ -0,0 +1,9 @@
1
+ require 'dbtools/google_drive/google_drive_entity'
2
+
3
module Dbtools::Google_Drive
  # Spira resource for a Google Drive folder: a GoogleDriveEntity typed as
  # RDF::Geophy.GoogleDriveFolder that additionally lists its children.
  class GoogleDriveFolder < GoogleDriveEntity
    type RDF::Geophy.GoogleDriveFolder

    has_many :children, type: :GoogleDriveEntity
  end
end
@@ -0,0 +1,30 @@
1
-- Registers a postgres_fdw foreign server plus a user mapping for the current
-- user. An existing server with the same name is dropped first (CASCADE).
--
--   servername       name of the foreign server to (re)create
--   host, port       address of the remote PostgreSQL server
--   dbname           remote database name
--   server_user      remote login, stored in the user mapping
--   server_password  remote password, stored in the user mapping
--
-- The server name is folded to lower case (as PostgreSQL does for an unquoted
-- identifier) and then quoted with quote_ident, so plain names resolve to the
-- same server as before while arbitrary input can no longer inject SQL.
CREATE OR REPLACE FUNCTION connect_server(
    servername character varying,
    host character varying,
    port character varying,
    dbname character varying,
    server_user character varying,
    server_password character varying)
  RETURNS void AS
$BODY$
DECLARE
  server_ident text := quote_ident(lower(servername));
BEGIN
  CREATE EXTENSION IF NOT EXISTS postgres_fdw;

  EXECUTE 'DROP SERVER IF EXISTS ' || server_ident || ' CASCADE';

  EXECUTE 'CREATE SERVER ' || server_ident || '
    FOREIGN DATA WRAPPER postgres_fdw
    OPTIONS (host ' || quote_literal(host) || ', port ' || quote_literal(port) || ', dbname ' || quote_literal(dbname) || ')';

  EXECUTE 'CREATE USER MAPPING FOR current_user
    SERVER ' || server_ident || '
    OPTIONS (user ' || quote_literal(server_user) || ', password ' || quote_literal(server_password) || ')';
END;
$BODY$
  LANGUAGE plpgsql
@@ -0,0 +1,17 @@
1
-- Imports one table of a remote schema as a foreign table into the local
-- "public" schema.
--
--   f_server_name  foreign server to import from
--   f_schema_name  remote schema that holds the table
--   f_table_name   table to import (the only one, via LIMIT TO)
--
-- Every identifier is quoted before it is spliced into the statement. The
-- server name is folded to lower case first, which is what PostgreSQL does
-- with an unquoted identifier, so names that worked before still resolve to
-- the same server.
CREATE OR REPLACE FUNCTION link(
    f_server_name character varying,
    f_schema_name character varying,
    f_table_name character varying)
  RETURNS void AS
$BODY$
BEGIN
  EXECUTE 'IMPORT FOREIGN SCHEMA ' || quote_ident(f_schema_name)
       || ' LIMIT TO (' || quote_ident(f_table_name) || ')'
       || ' FROM SERVER ' || quote_ident(lower(f_server_name))
       || ' INTO public';
END;
$BODY$
  LANGUAGE plpgsql
@@ -0,0 +1,15 @@
1
-- Drops a foreign table (CASCADE).
--
--   f_schema_name  local schema that holds the foreign table
--   f_table_name   foreign table to drop
--
-- Both identifiers are quoted with quote_ident, the same way link() quotes the
-- schema and table it imports, so a table name is matched exactly as given and
-- cannot inject SQL into the statement.
CREATE OR REPLACE FUNCTION unlink(
    f_schema_name character varying,
    f_table_name character varying)
  RETURNS void AS
$BODY$
BEGIN
  EXECUTE 'DROP FOREIGN TABLE ' || quote_ident(f_schema_name) || '.' || quote_ident(f_table_name) || ' CASCADE';
END;
$BODY$
  LANGUAGE plpgsql
@@ -0,0 +1,136 @@
1
+ require 'rdf'
2
+ require 'thor'
3
+ require 'sparql'
4
+ require 'linkeddata'
5
+
6
+ module Dbtools
7
+ class Rdf_reader
8
+
9
# Starts the reader with an empty RDF graph; data is added by #load_from_file.
def initialize
  @graph = RDF::Graph.new
end
12
+
13
# Loads a file into the graph, then keeps adding hydra "next page" documents
# for as long as the graph still announces one.
def load_from_file(file)
  @graph.load(file)
  next_page while has_next_page
end
20
+
21
# Runs a query for every dcat:Dataset in the graph that has a title and a
# gp:database, ordered by database title.
# @return [Hash]
#   Result position (0-based) => Hash with the String values 'dataset',
#   'dataset_title' and 'database_title'.
def get_available_databases
  sparql = SPARQL.parse(%(
    PREFIX dcat: <http://www.w3.org/ns/dcat#>
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX gp: <http://www.geophy.com/rdf/terms#>
    SELECT ?dataset ?database_title ?dataset_title
    WHERE {
      ?dataset rdf:type dcat:Dataset ;
               dct:title ?dataset_title ;
               gp:database ?database_title .
    }
    ORDER BY ?database_title
  ))

  databases = {}
  @graph.query(sparql).each_with_index do |solution, position|
    databases[position] = {
      'dataset' => solution.dataset.to_s,
      'dataset_title' => solution.dataset_title.to_s,
      'database_title' => solution.database_title.to_s
    }
  end
  databases
end
45
+
46
# Looks up the distributions of every dataset whose URI matches dataset_id and
# whose format is one of postgres, mysql, csv, tsv, xlsx, xls or txt
# (case-insensitive), ordered by resource title.
# @param dataset_id
#   Regular expression matched against the dataset URI. Double quotes and
#   backslashes are escaped before the value is placed in the query, so they
#   cannot terminate the SPARQL string literal.
# @return [Hash]
#   Result position (0-based) => Hash with the keys 'dataset', 'resource',
#   'database_title', 'resource_title', 'format', 'access_url' and, when the
#   dataset has one, 'description'.
def get_metadata(dataset_id)
  pattern = dataset_id.to_s.gsub(/[\\"]/) { |char| "\\#{char}" }
  query = SPARQL.parse(%(
    PREFIX dcat: <http://www.w3.org/ns/dcat#>
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX gp: <http://www.geophy.com/rdf/terms#>
    SELECT ?database_title ?dataset ?distribution ?format ?description ?access_url ?resource_title
    WHERE {
      ?dataset rdf:type dcat:Dataset ;
               gp:database ?database_title ;
               dcat:distribution ?distribution .
      OPTIONAL {
        ?dataset dct:description ?description .
      }
      ?distribution dct:format ?format ;
                    dct:title ?resource_title ;
                    dcat:accessURL ?access_url .
      FILTER( regex(?format, "postgres|mysql|csv|tsv|xlsx|xls|txt", 'i') &&
              regex(str(?dataset), "#{pattern}") )
    }
    ORDER BY ?resource_title
  ))

  @graph.query(query).each_with_index.map do |result, index|
    entry = {}
    entry['dataset'] = result.dataset
    entry['resource'] = result.distribution
    entry['database_title'] = result.database_title.to_s
    entry['resource_title'] = result.resource_title.to_s
    entry['format'] = result.format.to_s
    # The description comes from an OPTIONAL clause, so it may be absent.
    entry['description'] = result.description.to_s if defined?(result.description)
    entry['access_url'] = result.access_url.to_s
    [index, entry]
  end.to_h
end
83
+
84
# Asks the graph whether any hydra:PagedCollection still carries a
# hydra:nextPage link.
def has_next_page
  ask = SPARQL.parse(%[
    PREFIX hydra: <http://www.w3.org/ns/hydra/core#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

    ASK { ?catalog rdf:type hydra:PagedCollection ;
                   hydra:nextPage ?next_catalog . }
  ])
  answer = @graph.query(ask)
  answer.true?
end
96
+
97
# Adds the next hydra page to the graph.
# Returns nil without touching the graph when no next page is announced.
# Otherwise the first hydra:nextPage link is looked up, every statement whose
# subject is a hydra:PagedCollection is deleted, and the linked page is loaded.
def next_page
  return nil unless has_next_page

  # Get next page links
  following = @graph.query(SPARQL.parse(%(
    PREFIX hydra: <http://www.w3.org/ns/hydra/core#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

    SELECT ?catalog
    WHERE { ?current_catalog rdf:type hydra:PagedCollection ;
                             hydra:nextPage ?catalog . }))).first

  # Delete existing hydra nodes
  stale_nodes = SPARQL.parse(%(
    PREFIX hydra: <http://www.w3.org/ns/hydra/core#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

    SELECT ?catalog ?p ?o
    WHERE { ?catalog rdf:type hydra:PagedCollection ;
                     ?p ?o . }
  ))
  @graph.query(stale_nodes) do |row|
    @graph.delete(RDF::Statement(row.catalog, row.p, row.o))
  end

  # Load the next page.
  @graph.load(following[:catalog])
end
129
+ end
130
+
131
+
132
+ # rdf = Rdf_reader.new
133
+ # rdf.load_from_file("/mnt/data/Development/geophy/db-maintenance-tool/lib/turtle_example.ttl")
134
+ # metadata = rdf.get_metadata("dvdrental")
135
+ # puts metadata.inspect
136
+ end