datamarket 0.6.0__py3-none-any.whl → 0.10.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic.
- datamarket/__init__.py +0 -1
- datamarket/exceptions/__init__.py +1 -0
- datamarket/exceptions/main.py +118 -0
- datamarket/interfaces/alchemy.py +1934 -25
- datamarket/interfaces/aws.py +81 -14
- datamarket/interfaces/azure.py +127 -0
- datamarket/interfaces/drive.py +60 -10
- datamarket/interfaces/ftp.py +37 -14
- datamarket/interfaces/llm.py +1220 -0
- datamarket/interfaces/nominatim.py +314 -42
- datamarket/interfaces/peerdb.py +272 -104
- datamarket/interfaces/proxy.py +354 -50
- datamarket/interfaces/tinybird.py +7 -15
- datamarket/params/nominatim.py +439 -0
- datamarket/utils/__init__.py +1 -1
- datamarket/utils/airflow.py +10 -7
- datamarket/utils/alchemy.py +2 -1
- datamarket/utils/logs.py +88 -0
- datamarket/utils/main.py +138 -10
- datamarket/utils/nominatim.py +201 -0
- datamarket/utils/playwright/__init__.py +0 -0
- datamarket/utils/playwright/async_api.py +274 -0
- datamarket/utils/playwright/sync_api.py +281 -0
- datamarket/utils/requests.py +655 -0
- datamarket/utils/selenium.py +6 -12
- datamarket/utils/strings/__init__.py +1 -0
- datamarket/utils/strings/normalization.py +217 -0
- datamarket/utils/strings/obfuscation.py +153 -0
- datamarket/utils/strings/standardization.py +40 -0
- datamarket/utils/typer.py +2 -1
- datamarket/utils/types.py +1 -0
- datamarket-0.10.3.dist-info/METADATA +172 -0
- datamarket-0.10.3.dist-info/RECORD +38 -0
- {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info}/WHEEL +1 -2
- datamarket-0.6.0.dist-info/METADATA +0 -49
- datamarket-0.6.0.dist-info/RECORD +0 -24
- datamarket-0.6.0.dist-info/top_level.txt +0 -1
- {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info/licenses}/LICENSE +0 -0
datamarket/interfaces/peerdb.py
CHANGED
@@ -3,13 +3,27 @@

 import base64
 import logging
+import re
 import time

+import boto3
 import clickhouse_driver
 import requests
-from .
-from .aws import AWSInterface
+from requests.exceptions import HTTPError
 from sqlalchemy import text
+from tenacity import before_sleep_log, retry, stop_after_attempt, wait_exponential
+
+from .alchemy import AlchemyInterface
+
+########################################################################################################################
+# EXCEPTIONS
+
+
+class DatabaseNotConnectedError(Exception):
+    """Custom error for when database is not connected."""
+
+    pass
+

 ########################################################################################################################
 # CLASSES
@@ -19,45 +33,47 @@ logger = logging.getLogger(__name__)

 class PostgresPeer:
     def __init__(self, config):
-
-
-
+        if "db" in config:
+            self.config = config["db"]
+            self.alchemy_interface = AlchemyInterface(config)
+            self.engine = self.alchemy_interface.engine
+        else:
+            logger.warning("no db section in config")

-    def create_user(self):
-
-        password = self.config["password"]
+    def create_user(self, user, password):
+        database = self.config["database"]

         logger.info(f"Creating PostgreSQL user '{user}' for database: {self.config['database']}")

         with self.engine.connect() as conn:
             conn.execute(
                 text(f"""
-
-
-
-
-
-
-
-
-
-
+                    DO $$
+                    BEGIN
+                        IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{user}') THEN
+                            CREATE USER "{user}" WITH PASSWORD '{password}';
+                            ALTER USER "{user}" REPLICATION;
+                            GRANT CREATE ON DATABASE {database} TO "{user}";
+                        END IF;
+                    END
+                    $$;
+                """)
             )
+            conn.commit()
         logger.info(f"PostgreSQL user '{user}' created or already exists")

-    def grant_permissions(self, schema_name):
-        user = self.config["user"]
-
+    def grant_permissions(self, schema_name, user):
         logger.info(f"Granting permissions for schema '{schema_name}' to '{user}'")

         with self.engine.connect() as conn:
             conn.execute(
                 text(f"""
-
-
-
-
+                    GRANT USAGE ON SCHEMA "{schema_name}" TO "{user}";
+                    GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA "{schema_name}" TO "{user}";
+                    ALTER DEFAULT PRIVILEGES IN SCHEMA "{schema_name}" GRANT ALL PRIVILEGES ON TABLES TO "{user}";
+                """)
             )
+            conn.commit()
         logger.info(f"Permissions granted for schema '{schema_name}' to '{user}'")

     def create_publication(self, schema_name, table_names):
@@ -65,12 +81,40 @@ class PostgresPeer:
         with self.engine.connect() as conn:
             conn.execute(text(f"DROP PUBLICATION IF EXISTS {schema_name}_peerdb"))

-            table_list =
+            table_list = []
+            for table in table_names:
+                full_table_name = f'"{schema_name}"."{table}"'
+
+                # Check current replica identity
+                query = text("""
+                    SELECT CASE c.relreplident
+                        WHEN 'd' THEN 'DEFAULT'
+                        WHEN 'n' THEN 'NOTHING'
+                        WHEN 'f' THEN 'FULL'
+                        WHEN 'i' THEN 'INDEX'
+                    END AS replica_identity
+                    FROM pg_class c
+                    JOIN pg_namespace n ON c.relnamespace = n.oid
+                    WHERE c.relname = :table_name
+                    AND n.nspname = :schema_name;
+                """)
+                result = conn.execute(query, {"table_name": table, "schema_name": schema_name}).scalar_one_or_none()
+
+                if result != "FULL":
+                    logger.info(f"Setting REPLICA IDENTITY FULL for table: {full_table_name}")
+                    conn.execute(text(f"ALTER TABLE {full_table_name} REPLICA IDENTITY FULL;"))
+                else:
+                    logger.info(f"REPLICA IDENTITY for table {full_table_name} is already FULL. Skipping ALTER TABLE.")
+
+                table_list.append(full_table_name)
+
+            table_list_str = ", ".join(table_list)
             conn.execute(
                 text(f"""
-
-
+                    CREATE PUBLICATION {schema_name}_peerdb FOR TABLE {table_list_str};
+                """)
             )
+            conn.commit()
         logger.info(f"Publication '{schema_name}_peerdb' created successfully")

     def create_tables(self, schema_tables, drop=False):
@@ -90,39 +134,42 @@ class PostgresPeer:
                 """),
                 {"slot_name": slot_name},
             )
+            conn.commit()
         logger.info(f"Replication slot '{slot_name}' dropped if it existed")


 class ClickhousePeer:
     def __init__(self, config):
-
-
-
-            host=self.config["host"],
-            port=self.config["port"],
-            user=self.config["user"],
-            password=self.config["password"],
-            database=self.config["database"],
-        )
-
-    def ensure_database_exists(self):
-        logger.info(f"Checking if database '{self.config['database']}' exists in Clickhouse")
-        temp_client = clickhouse_driver.Client(
-            host=self.config["host"],
-            port=self.config["port"],
-            user=self.config["user"],
-            password=self.config["password"],
-        )
+        if "clickhouse" in config:
+            self.config = config["clickhouse"]
+            self.credentials = {key: self.config[key] for key in ["user", "password", "host", "port"]}

-        databases = temp_client.execute("SHOW DATABASES")
-        if (self.config["database"],) not in databases:
-            logger.info(f"Database '{self.config['database']}' does not exist. Creating it now.")
-            temp_client.execute(f"CREATE DATABASE IF NOT EXISTS {self.config['database']}")
-            logger.info(f"Database '{self.config['database']}' created successfully")
         else:
-            logger.
+            logger.warning("no clickhouse section in config")
+
+    def connect(self, database):
+        if not database:
+            return
+
+        self.ensure_database_exists(database)
+        self.config["database"] = self.credentials["database"] = database
+        self.client = clickhouse_driver.Client(**self.credentials)
+
+    def _check_connection(self):
+        if self.client is None:
+            raise DatabaseNotConnectedError("Database not connected. Call connect() method first.")
+
+    def ensure_database_exists(self, database):
+        logger.info(f"Checking if database '{database}' exists in Clickhouse")
+        temp_client = clickhouse_driver.Client(**self.credentials)
+        databases = temp_client.execute("SHOW DATABASES")
+        if database not in [db[0] for db in databases]:
+            logger.info(f"Creating database '{database}'")
+            temp_client.execute(f"CREATE DATABASE IF NOT EXISTS {database}")
+        temp_client.disconnect()

     def delete_existing_tables(self, table_names):
+        self._check_connection()
         logger.info(f"Deleting existing tables in Clickhouse for database: {self.config['database']}")

         all_tables = self.client.execute("SHOW TABLES")
@@ -148,6 +195,7 @@ class ClickhousePeer:
         logger.info("Finished deleting existing tables in Clickhouse")

     def create_row_policies(self, schema_name, table_names):
+        self._check_connection()
         logger.info(f"Creating row policies for schema: {schema_name}")
         for table_name in table_names:
             policy_name = "non_deleted"
@@ -158,34 +206,86 @@
             self.client.execute(query)
             logger.info(f"Created row policy '{policy_name}' for table '{table_name}'")

+    def execute_sql_file(self, file_path):
+        self._check_connection()
+        try:
+            with file_path.open("r") as sql_file:
+                sql_content = sql_file.read()
+                logger.info(f"Executing SQL from file: {file_path}")
+
+                sql_statements = [stmt.strip() for stmt in sql_content.split(";") if stmt.strip()]
+
+                for statement in sql_statements:
+                    self.client.execute(statement)
+                    logger.info(f"Successfully executed SQL statement: {statement}")
+
+        except Exception as e:
+            logger.error(f"Error executing SQL from file {file_path}: {str(e)}")
+
+    def teardown_from_sql_folder(self, sql_folder):
+        logger.info("Performing ClickHouse teardown")
+        self._process_sql_files(sql_folder, teardown=True)
+        logger.info("ClickHouse teardown completed")
+
+    def initialize_from_sql_folder(self, sql_folder):
+        logger.info(f"Initializing Clickhouse database from SQL files in folder: {sql_folder}")
+        self._process_sql_files(sql_folder)
+        logger.info("Finished initializing Clickhouse database from SQL files")
+
+    def _process_sql_files(self, sql_folder, teardown=False):
+        if not sql_folder.exists():
+            logger.error(f"SQL initialization folder does not exist: {sql_folder}")
+            return
+
+        all_dirs = [sql_folder] + [d for d in sql_folder.rglob("*") if d.is_dir()]
+        sorted_dirs = sorted(all_dirs)
+
+        for directory in sorted_dirs:
+            sql_files = self._filter_sql_files(directory, teardown)
+
+            for file_path in sql_files:
+                self.execute_sql_file(file_path)
+
+    def _filter_sql_files(self, directory, teardown):
+        all_sql_files = directory.glob("*.sql")
+        return sorted(f for f in all_sql_files if ("teardown" in f.name.lower()) == teardown)
+

 class TransientS3:
     def __init__(self, config):
-
-
+        if "peerdb-s3" in config:
+            self.config = config["peerdb-s3"]
+            self.bucket_name = self.config["bucket"]
+            self.session = boto3.Session(profile_name=self.config["profile"])
+            self.s3_client = self.session.client("s3")
+            self.credentials = self.session.get_credentials()
+            self.access_key = self.credentials.access_key
+            self.secret_key = self.credentials.secret_key
+            self.region_name = self.session.region_name
+            self.endpoint_url = self.s3_client.meta.endpoint_url
+        else:
+            logger.warning("no peerdb-s3 section in config")

     def delete_paths_with_schema(self, schema_name):
         logger.info(f"Deleting paths containing '{schema_name}' from S3")

-
-
-        paginator = self.aws_interface.s3_client.get_paginator("list_objects_v2")
-        pages = paginator.paginate(Bucket=bucket_name, Delimiter="/")
+        paginator = self.s3_client.get_paginator("list_objects_v2")
+        pages = paginator.paginate(Bucket=self.bucket_name, Delimiter="/")

         for page in pages:
             if "CommonPrefixes" in page:
                 for prefix in page["CommonPrefixes"]:
                     folder = prefix["Prefix"]
                     if schema_name in folder:
-                        self._delete_folder_contents(
+                        self._delete_folder_contents(folder)

         logger.info(f"Deleted paths containing '{schema_name}' from S3")

-    def _delete_folder_contents(self,
+    def _delete_folder_contents(self, folder):
         logger.info(f"Deleting contents of folder: {folder}")

-        paginator = self.
-        pages = paginator.paginate(Bucket=bucket_name, Prefix=folder)
+        paginator = self.s3_client.get_paginator("list_objects_v2")
+        pages = paginator.paginate(Bucket=self.bucket_name, Prefix=folder)

         delete_us = dict(Objects=[])
         for page in pages:
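Note: the new `_process_sql_files`/`_filter_sql_files` helpers walk the SQL folder tree in sorted order and pick files by name, running everything without "teardown" in the filename during initialization and only the "teardown" scripts during teardown. A minimal sketch of that selection logic, assuming a hypothetical layout such as sql/00_views/create_views.sql next to sql/00_views/teardown_views.sql (the paths are illustrative, not shipped with the package):

from pathlib import Path

def filter_sql_files(directory: Path, teardown: bool):
    # Same predicate as ClickhousePeer._filter_sql_files: a file is a teardown
    # script iff "teardown" appears in its name.
    return sorted(f for f in directory.glob("*.sql") if ("teardown" in f.name.lower()) == teardown)

sql_folder = Path("sql")  # hypothetical folder
for directory in sorted([sql_folder, *(d for d in sql_folder.rglob("*") if d.is_dir())]):
    print(directory, [f.name for f in filter_sql_files(directory, teardown=False)])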
@@ -195,22 +295,32 @@ class TransientS3:

             # AWS limits to deleting 1000 objects at a time
             if len(delete_us["Objects"]) >= 1000:
-                self.
+                self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
                 delete_us = dict(Objects=[])

         if len(delete_us["Objects"]):
-            self.
+            self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)

         logger.info(f"Deleted contents of folder: {folder}")


 class PeerDBInterface:
     def __init__(self, config):
-
+        if "peerdb" in config:
+            self.config = config["peerdb"]
+            self.docker_host_mapping = self.config.get("docker_host_mapping")
+        else:
+            logger.warning("no peerdb section in config")
+
         self.source = PostgresPeer(config)
-        self.destination = ClickhousePeer(config)
         self.transient_s3 = TransientS3(config)
+        self.destination = ClickhousePeer(config)

+    @retry(
+        stop=stop_after_attempt(5),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
     def _make_api_request(self, endpoint, payload):
         url = f"http://{self.config['host']}:{self.config['port']}/api/{endpoint}"
         password = self.config["password"]
@@ -219,17 +329,44 @@ class PeerDBInterface:

         headers = {"Authorization": f"Basic {encoded_credentials}", "Content-Type": "application/json"}

-        logger.
+        logger.debug(f"Making API request to PeerDB endpoint: {endpoint}")
         try:
             r = requests.post(url, headers=headers, json=payload, timeout=30)
+            response = r.json()
             r.raise_for_status()
-            logger.
-            return
-        except
+            logger.debug(f"API request to {endpoint} completed successfully")
+            return response
+        except HTTPError as e:
             logger.error(f"HTTP error occurred: {e}")
-            logger.error(f"Response JSON: {r.json()}")
+            logger.error(f"Response JSON: {r.json() if 'r' in locals() else 'N/A'}")
+
+            if "no rows in result set" in response.get("message", ""):
+                return {"currentFlowState": "STATUS_UNKNOWN"}
+
             raise

+    def _resolve_host_mapping(self, host):
+        """
+        Resolves host mapping for Docker environments.
+        If host is localhost/127.0.0.1 and docker_host_mapping is configured,
+        returns the mapped host, otherwise returns original host.
+        """
+        if not self.docker_host_mapping or not host:
+            return host
+
+        if host in ["localhost", "127.0.0.1"]:
+            logger.debug(f"Mapping host {host} to {self.docker_host_mapping} for Docker environment")
+            return self.docker_host_mapping
+
+        url_pattern = r"(localhost|127\.0\.0\.1)"
+        match = re.search(url_pattern, host)
+        if match:
+            original_host = match.group(1)
+            mapped_host = self._resolve_host_mapping(original_host)
+            return host.replace(original_host, mapped_host)
+
+        return host
+
     def create_postgres_peer(self):
         logger.info(f"Creating Postgres peer for database: {self.source.config['database']}")
         payload = {
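A quick sanity check of the new host mapping: only loopback hosts, bare or embedded in a URL, are rewritten; everything else passes through untouched. A sketch under the assumption that `docker_host_mapping` is set to "host.docker.internal" (an illustrative value this diff does not define) and that the 0.10.3 package and its dependencies are importable:

from datamarket.interfaces.peerdb import PeerDBInterface

# Bypass __init__ so no real config sections are needed for this check.
iface = PeerDBInterface.__new__(PeerDBInterface)
iface.docker_host_mapping = "host.docker.internal"  # assumed mapping value

print(iface._resolve_host_mapping("localhost"))              # host.docker.internal
print(iface._resolve_host_mapping("http://127.0.0.1:9000"))  # http://host.docker.internal:9000
print(iface._resolve_host_mapping("clickhouse.internal"))    # clickhouse.internal (unchanged)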
@@ -237,7 +374,7 @@ class PeerDBInterface:
                 "name": self.source.config["database"],
                 "type": 3,
                 "postgres_config": {
-                    "host": self.source.config["host"],
+                    "host": self._resolve_host_mapping(self.source.config["host"]),
                     "port": int(self.source.config["admin_port"]),
                     "user": self.config["user"],
                     "password": self.config["password"],
@@ -255,23 +392,22 @@

     def create_clickhouse_peer(self, schema_name):
         logger.info(f"Creating Clickhouse peer for schema: {schema_name}")
-
         payload = {
             "peer": {
                 "name": f"{schema_name}",
                 "type": 8,
                 "clickhouse_config": {
-                    "host": self.destination.config["host"],
+                    "host": self._resolve_host_mapping(self.destination.config["host"]),
                     "port": int(self.destination.config["port"]),
                     "user": self.destination.config["user"],
                     "password": self.destination.config["password"],
                     "database": schema_name,
                     "disable_tls": True,
-                    "s3_path": f"s3://{self.
-                    "access_key_id": self.
-                    "secret_access_key": self.
-                    "region":
-                    "endpoint":
+                    "s3_path": f"s3://{self.transient_s3.bucket_name}",
+                    "access_key_id": self.transient_s3.access_key,
+                    "secret_access_key": self.transient_s3.secret_key,
+                    "region": self.transient_s3.region_name,
+                    "endpoint": self._resolve_host_mapping(self.transient_s3.endpoint_url),
                 },
             },
             "allow_update": True,
@@ -283,6 +419,16 @@

         logger.info(f"Clickhouse peer for schema '{schema_name}' created successfully")

+    def check_mirror_status(self, schema_name):
+        current_state = "STATUS_UNKNOWN"
+        try:
+            payload = {"flowJobName": schema_name, "includeFlowInfo": False}
+            response = self._make_api_request("v1/mirrors/status", payload)
+            current_state = response.get("currentFlowState")
+        except Exception as e:
+            logger.debug(f"Error checking mirror status for schema '{schema_name}': {str(e)}")
+        return current_state
+
     def drop_mirror(self, schema_name):
         logger.info(f"Dropping mirror for schema: {schema_name}")

@@ -301,37 +447,42 @@

         logger.info(f"Mirror for schema '{schema_name}' dropped successfully")

-    def
-        logger.info(f"
-        max_attempts = 60
+    def wait_for_running_mirror(self, schema_name, max_attempts=360, sleep_interval=10):
+        logger.info(f"Waiting for mirror status to be 'STATUS_RUNNING' for schema: {schema_name}")
         attempt = 0
         while attempt < max_attempts:
-
-            response = self._make_api_request("v1/mirrors/status", payload)
-            current_state = response.get("currentFlowState")
+            current_state = self.check_mirror_status(schema_name)

-            if current_state
-                logger.info(f"Mirror status for schema '{schema_name}' is: {current_state}")
+            if current_state == "STATUS_RUNNING":
+                logger.info(f"Mirror status for schema '{schema_name}' is now: {current_state}")
                 return current_state

             attempt += 1
-
+            logger.info(f"Status is '{current_state}'. Waiting {sleep_interval} seconds before next check.")
+            time.sleep(sleep_interval)

         logger.warning(f"Mirror status check timed out for schema: {schema_name}")
         return None

-    def
+    def pre_init(self, schema_name, table_names, clickhouse_sql_path, resync, hard_resync):
+        logger.info("Running pre-init operations.")
         if resync:
-            logger.info(f"Resync requested. Performing {'hard' if hard_resync else 'simple'} resync operations.")
             self.drop_mirror(schema_name)
             self.transient_s3.delete_paths_with_schema(schema_name)
+            self.destination.teardown_from_sql_folder(clickhouse_sql_path)
+            self.source.drop_replication_slot(schema_name)
             if hard_resync:
                 self.destination.delete_existing_tables(table_names)
-
-
+        logger.info("Pre-init operations completed.")
+
+    def post_init(self, schema_name, table_names, clickhouse_sql_path, resync, hard_resync):
+        logger.info("Running post-init operations.")
+        self.destination.create_row_policies(schema_name, table_names)
+        if resync:
+            self.destination.initialize_from_sql_folder(clickhouse_sql_path)
+        logger.info("Post-init operations completed.")

     def create_mirror(self, schema_name, table_names, resync, hard_resync):
-        self.resync_operations(schema_name, table_names, resync, hard_resync)
         logger.info(f"Creating mirror for schema: {schema_name}")

         table_mappings = [
@@ -352,7 +503,7 @@
             "snapshot_num_rows_per_partition": 1000000,
             "snapshot_max_parallel_workers": 1,
             "snapshot_num_tables_in_parallel": 1,
-            "resync": not hard_resync,
+            "resync": resync and not hard_resync,
             "initial_snapshot_only": False,
             "soft_delete_col_name": "_peerdb_is_deleted",
             "synced_at_col_name": "_peerdb_synced_at",
@@ -365,25 +516,42 @@
                 f"Failed to create mirror for schema '{schema_name}': {response.get('errorMessage', response.get('message', 'Unknown error'))}"
             )

-        mirror_status = self.
+        mirror_status = self.wait_for_running_mirror(schema_name)
         if mirror_status:
-            logger.info(f"Mirror status for schema '{schema_name}' is: {mirror_status}")
             logger.info(f"Mirror creation for schema '{schema_name}' completed successfully")
         else:
             logger.warning(f"Failed to confirm mirror status change for schema: {schema_name}")

-    def run_automation(
+    def run_automation(
+        self,
+        schema_name,
+        schema_tables,
+        drop=False,
+        sync=False,
+        resync=False,
+        hard_resync=False,
+        clickhouse_sql_path=None,
+    ):
         logger.info(f"Starting automation for schema: {schema_name}")

-
+        base_tables = [table for table, _ in schema_tables]
+        mirror_tablenames = [table.__tablename__ for table, should_replicate in schema_tables if should_replicate]

-        self.source.
-
-
-
+        self.source.create_tables(base_tables, drop)
+        if not (sync or resync):
+            return
+
+        peerdb_user = self.config["user"]
+        peerdb_pwd = self.config["password"]
+
+        self.source.create_user(peerdb_user, peerdb_pwd)
+        self.source.grant_permissions(schema_name, peerdb_user)
+        self.source.create_publication(schema_name, mirror_tablenames)
+        self.destination.connect(schema_name)
         self.create_postgres_peer()
         self.create_clickhouse_peer(schema_name)
-        self.
-        self.
+        self.pre_init(schema_name, mirror_tablenames, clickhouse_sql_path, resync, hard_resync)
+        self.create_mirror(schema_name, mirror_tablenames, resync, hard_resync)
+        self.post_init(schema_name, mirror_tablenames, clickhouse_sql_path, resync, hard_resync)

         logger.info(f"Automation completed successfully for schema: {schema_name}")
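For context, the reworked `run_automation` entry point now takes `(model, should_replicate)` pairs and only provisions the PeerDB user, publication, peers and mirror when `sync` or `resync` is requested. A hedged end-to-end sketch: the config section names (`db`, `clickhouse`, `peerdb`, `peerdb-s3`) and the keys shown mirror what the constructors read in this diff, but every value, the model, and the SQL path are placeholders, and the `db` section ultimately feeds AlchemyInterface, whose full key set is not part of this file:

from pathlib import Path

from sqlalchemy import Column, Integer
from sqlalchemy.orm import declarative_base

from datamarket.interfaces.peerdb import PeerDBInterface

Base = declarative_base()

class Customer(Base):
    # Hypothetical model; only __tablename__ matters for the mirror mapping.
    __tablename__ = "customers"
    __table_args__ = {"schema": "my_schema"}
    id = Column(Integer, primary_key=True)

config = {
    "db": {"user": "app", "password": "***", "host": "localhost",
           "admin_port": 5432, "database": "appdb"},
    "clickhouse": {"user": "default", "password": "***", "host": "localhost", "port": 9000},
    "peerdb": {"user": "peerdb", "password": "***", "host": "localhost", "port": 3000,
               "docker_host_mapping": "host.docker.internal"},
    "peerdb-s3": {"bucket": "peerdb-transient", "profile": "default"},
}

interface = PeerDBInterface(config)
interface.run_automation(
    "my_schema",
    [(Customer, True)],          # (model, should_replicate) pairs
    drop=False,
    sync=True,                   # without sync/resync only the source tables are created
    clickhouse_sql_path=Path("sql/clickhouse"),
)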