datamarket 0.6.0__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datamarket might be problematic.

Files changed (38)
  1. datamarket/__init__.py +0 -1
  2. datamarket/exceptions/__init__.py +1 -0
  3. datamarket/exceptions/main.py +118 -0
  4. datamarket/interfaces/alchemy.py +1934 -25
  5. datamarket/interfaces/aws.py +81 -14
  6. datamarket/interfaces/azure.py +127 -0
  7. datamarket/interfaces/drive.py +60 -10
  8. datamarket/interfaces/ftp.py +37 -14
  9. datamarket/interfaces/llm.py +1220 -0
  10. datamarket/interfaces/nominatim.py +314 -42
  11. datamarket/interfaces/peerdb.py +272 -104
  12. datamarket/interfaces/proxy.py +354 -50
  13. datamarket/interfaces/tinybird.py +7 -15
  14. datamarket/params/nominatim.py +439 -0
  15. datamarket/utils/__init__.py +1 -1
  16. datamarket/utils/airflow.py +10 -7
  17. datamarket/utils/alchemy.py +2 -1
  18. datamarket/utils/logs.py +88 -0
  19. datamarket/utils/main.py +138 -10
  20. datamarket/utils/nominatim.py +201 -0
  21. datamarket/utils/playwright/__init__.py +0 -0
  22. datamarket/utils/playwright/async_api.py +274 -0
  23. datamarket/utils/playwright/sync_api.py +281 -0
  24. datamarket/utils/requests.py +655 -0
  25. datamarket/utils/selenium.py +6 -12
  26. datamarket/utils/strings/__init__.py +1 -0
  27. datamarket/utils/strings/normalization.py +217 -0
  28. datamarket/utils/strings/obfuscation.py +153 -0
  29. datamarket/utils/strings/standardization.py +40 -0
  30. datamarket/utils/typer.py +2 -1
  31. datamarket/utils/types.py +1 -0
  32. datamarket-0.10.3.dist-info/METADATA +172 -0
  33. datamarket-0.10.3.dist-info/RECORD +38 -0
  34. {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info}/WHEEL +1 -2
  35. datamarket-0.6.0.dist-info/METADATA +0 -49
  36. datamarket-0.6.0.dist-info/RECORD +0 -24
  37. datamarket-0.6.0.dist-info/top_level.txt +0 -1
  38. {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info/licenses}/LICENSE +0 -0
datamarket/interfaces/peerdb.py

@@ -3,13 +3,27 @@
 
 import base64
 import logging
+import re
 import time
 
+import boto3
 import clickhouse_driver
 import requests
-from .alchemy import AlchemyInterface
-from .aws import AWSInterface
+from requests.exceptions import HTTPError
 from sqlalchemy import text
+from tenacity import before_sleep_log, retry, stop_after_attempt, wait_exponential
+
+from .alchemy import AlchemyInterface
+
+########################################################################################################################
+# EXCEPTIONS
+
+
+class DatabaseNotConnectedError(Exception):
+    """Custom error for when database is not connected."""
+
+    pass
+
 
 ########################################################################################################################
 # CLASSES
@@ -19,45 +33,47 @@ logger = logging.getLogger(__name__)
 
 class PostgresPeer:
     def __init__(self, config):
-        self.config = config["db"]
-        self.alchemy_interface = AlchemyInterface(config)
-        self.engine = self.alchemy_interface.engine
+        if "db" in config:
+            self.config = config["db"]
+            self.alchemy_interface = AlchemyInterface(config)
+            self.engine = self.alchemy_interface.engine
+        else:
+            logger.warning("no db section in config")
 
-    def create_user(self):
-        user = self.config["user"]
-        password = self.config["password"]
+    def create_user(self, user, password):
+        database = self.config["database"]
 
         logger.info(f"Creating PostgreSQL user '{user}' for database: {self.config['database']}")
 
         with self.engine.connect() as conn:
             conn.execute(
                 text(f"""
-                DO $$
-                BEGIN
-                    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{user}') THEN
-                        CREATE USER {user} WITH PASSWORD '{password}';
-                        ALTER USER {user} REPLICATION;
-                        GRANT CREATE ON DATABASE datamarket TO {user};
-                    END IF;
-                END
-                $$;
-                """)
+                    DO $$
+                    BEGIN
+                        IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{user}') THEN
+                            CREATE USER "{user}" WITH PASSWORD '{password}';
+                            ALTER USER "{user}" REPLICATION;
+                            GRANT CREATE ON DATABASE {database} TO "{user}";
+                        END IF;
+                    END
+                    $$;
+                """)
             )
+            conn.commit()
         logger.info(f"PostgreSQL user '{user}' created or already exists")
 
-    def grant_permissions(self, schema_name):
-        user = self.config["user"]
-
+    def grant_permissions(self, schema_name, user):
         logger.info(f"Granting permissions for schema '{schema_name}' to '{user}'")
 
         with self.engine.connect() as conn:
             conn.execute(
                 text(f"""
-                GRANT USAGE ON SCHEMA "{schema_name}" TO {user};
-                GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA "{schema_name}" TO {user};
-                ALTER DEFAULT PRIVILEGES IN SCHEMA "{schema_name}" GRANT ALL PRIVILEGES ON TABLES TO {user};
-                """)
+                    GRANT USAGE ON SCHEMA "{schema_name}" TO "{user}";
+                    GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA "{schema_name}" TO "{user}";
+                    ALTER DEFAULT PRIVILEGES IN SCHEMA "{schema_name}" GRANT ALL PRIVILEGES ON TABLES TO "{user}";
+                """)
             )
+            conn.commit()
         logger.info(f"Permissions granted for schema '{schema_name}' to '{user}'")
 
     def create_publication(self, schema_name, table_names):
@@ -65,12 +81,40 @@ class PostgresPeer:
         with self.engine.connect() as conn:
             conn.execute(text(f"DROP PUBLICATION IF EXISTS {schema_name}_peerdb"))
 
-            table_list = ", ".join([f"{schema_name}.{table}" for table in table_names])
+            table_list = []
+            for table in table_names:
+                full_table_name = f'"{schema_name}"."{table}"'
+
+                # Check current replica identity
+                query = text("""
+                    SELECT CASE c.relreplident
+                        WHEN 'd' THEN 'DEFAULT'
+                        WHEN 'n' THEN 'NOTHING'
+                        WHEN 'f' THEN 'FULL'
+                        WHEN 'i' THEN 'INDEX'
+                    END AS replica_identity
+                    FROM pg_class c
+                    JOIN pg_namespace n ON c.relnamespace = n.oid
+                    WHERE c.relname = :table_name
+                    AND n.nspname = :schema_name;
+                """)
+                result = conn.execute(query, {"table_name": table, "schema_name": schema_name}).scalar_one_or_none()
+
+                if result != "FULL":
+                    logger.info(f"Setting REPLICA IDENTITY FULL for table: {full_table_name}")
+                    conn.execute(text(f"ALTER TABLE {full_table_name} REPLICA IDENTITY FULL;"))
+                else:
+                    logger.info(f"REPLICA IDENTITY for table {full_table_name} is already FULL. Skipping ALTER TABLE.")
+
+                table_list.append(full_table_name)
+
+            table_list_str = ", ".join(table_list)
             conn.execute(
                 text(f"""
-                CREATE PUBLICATION {schema_name}_peerdb FOR TABLE {table_list};
-                """)
+                    CREATE PUBLICATION {schema_name}_peerdb FOR TABLE {table_list_str};
+                """)
             )
+            conn.commit()
         logger.info(f"Publication '{schema_name}_peerdb' created successfully")
 
     def create_tables(self, schema_tables, drop=False):
@@ -90,39 +134,42 @@ class PostgresPeer:
                 """),
                 {"slot_name": slot_name},
             )
+            conn.commit()
         logger.info(f"Replication slot '{slot_name}' dropped if it existed")
 
 
 class ClickhousePeer:
     def __init__(self, config):
-        self.config = config["clickhouse"]
-        self.ensure_database_exists()
-        self.client = clickhouse_driver.Client(
-            host=self.config["host"],
-            port=self.config["port"],
-            user=self.config["user"],
-            password=self.config["password"],
-            database=self.config["database"],
-        )
-
-    def ensure_database_exists(self):
-        logger.info(f"Checking if database '{self.config['database']}' exists in Clickhouse")
-        temp_client = clickhouse_driver.Client(
-            host=self.config["host"],
-            port=self.config["port"],
-            user=self.config["user"],
-            password=self.config["password"],
-        )
+        if "clickhouse" in config:
+            self.config = config["clickhouse"]
+            self.credentials = {key: self.config[key] for key in ["user", "password", "host", "port"]}
 
-        databases = temp_client.execute("SHOW DATABASES")
-        if (self.config["database"],) not in databases:
-            logger.info(f"Database '{self.config['database']}' does not exist. Creating it now.")
-            temp_client.execute(f"CREATE DATABASE IF NOT EXISTS {self.config['database']}")
-            logger.info(f"Database '{self.config['database']}' created successfully")
         else:
-            logger.info(f"Database '{self.config['database']}' already exists")
+            logger.warning("no clickhouse section in config")
+
+    def connect(self, database):
+        if not database:
+            return
+
+        self.ensure_database_exists(database)
+        self.config["database"] = self.credentials["database"] = database
+        self.client = clickhouse_driver.Client(**self.credentials)
+
+    def _check_connection(self):
+        if self.client is None:
+            raise DatabaseNotConnectedError("Database not connected. Call connect() method first.")
+
+    def ensure_database_exists(self, database):
+        logger.info(f"Checking if database '{database}' exists in Clickhouse")
+        temp_client = clickhouse_driver.Client(**self.credentials)
+        databases = temp_client.execute("SHOW DATABASES")
+        if database not in [db[0] for db in databases]:
+            logger.info(f"Creating database '{database}'")
+            temp_client.execute(f"CREATE DATABASE IF NOT EXISTS {database}")
+        temp_client.disconnect()
 
     def delete_existing_tables(self, table_names):
+        self._check_connection()
         logger.info(f"Deleting existing tables in Clickhouse for database: {self.config['database']}")
 
         all_tables = self.client.execute("SHOW TABLES")
@@ -148,6 +195,7 @@ class ClickhousePeer:
         logger.info("Finished deleting existing tables in Clickhouse")
 
     def create_row_policies(self, schema_name, table_names):
+        self._check_connection()
        logger.info(f"Creating row policies for schema: {schema_name}")
         for table_name in table_names:
             policy_name = "non_deleted"
@@ -158,34 +206,86 @@ class ClickhousePeer:
             self.client.execute(query)
             logger.info(f"Created row policy '{policy_name}' for table '{table_name}'")
 
+    def execute_sql_file(self, file_path):
+        self._check_connection()
+        try:
+            with file_path.open("r") as sql_file:
+                sql_content = sql_file.read()
+                logger.info(f"Executing SQL from file: {file_path}")
+
+                sql_statements = [stmt.strip() for stmt in sql_content.split(";") if stmt.strip()]
+
+                for statement in sql_statements:
+                    self.client.execute(statement)
+                    logger.info(f"Successfully executed SQL statement: {statement}")
+
+        except Exception as e:
+            logger.error(f"Error executing SQL from file {file_path}: {str(e)}")
+
+    def teardown_from_sql_folder(self, sql_folder):
+        logger.info("Performing ClickHouse teardown")
+        self._process_sql_files(sql_folder, teardown=True)
+        logger.info("ClickHouse teardown completed")
+
+    def initialize_from_sql_folder(self, sql_folder):
+        logger.info(f"Initializing Clickhouse database from SQL files in folder: {sql_folder}")
+        self._process_sql_files(sql_folder)
+        logger.info("Finished initializing Clickhouse database from SQL files")
+
+    def _process_sql_files(self, sql_folder, teardown=False):
+        if not sql_folder.exists():
+            logger.error(f"SQL initialization folder does not exist: {sql_folder}")
+            return
+
+        all_dirs = [sql_folder] + [d for d in sql_folder.rglob("*") if d.is_dir()]
+        sorted_dirs = sorted(all_dirs)
+
+        for directory in sorted_dirs:
+            sql_files = self._filter_sql_files(directory, teardown)
+
+            for file_path in sql_files:
+                self.execute_sql_file(file_path)
+
+    def _filter_sql_files(self, directory, teardown):
+        all_sql_files = directory.glob("*.sql")
+        return sorted(f for f in all_sql_files if ("teardown" in f.name.lower()) == teardown)
+
 
 class TransientS3:
     def __init__(self, config):
-        self.aws_interface = AWSInterface(config)
-        self.aws_interface.switch_profile("datamarket-minio")
+        if "peerdb-s3" in config:
+            self.config = config["peerdb-s3"]
+            self.bucket_name = self.config["bucket"]
+            self.session = boto3.Session(profile_name=self.config["profile"])
+            self.s3_client = self.session.client("s3")
+            self.credentials = self.session.get_credentials()
+            self.access_key = self.credentials.access_key
+            self.secret_key = self.credentials.secret_key
+            self.region_name = self.session.region_name
+            self.endpoint_url = self.s3_client.meta.endpoint_url
+        else:
+            logger.warning("no peerdb-s3 section in config")
 
     def delete_paths_with_schema(self, schema_name):
         logger.info(f"Deleting paths containing '{schema_name}' from S3")
 
-        bucket_name = self.aws_interface.current_profile["bucket"]
-
-        paginator = self.aws_interface.s3_client.get_paginator("list_objects_v2")
-        pages = paginator.paginate(Bucket=bucket_name, Delimiter="/")
+        paginator = self.s3_client.get_paginator("list_objects_v2")
+        pages = paginator.paginate(Bucket=self.bucket_name, Delimiter="/")
 
         for page in pages:
             if "CommonPrefixes" in page:
                 for prefix in page["CommonPrefixes"]:
                     folder = prefix["Prefix"]
                     if schema_name in folder:
-                        self._delete_folder_contents(bucket_name, folder)
+                        self._delete_folder_contents(folder)
 
         logger.info(f"Deleted paths containing '{schema_name}' from S3")
 
-    def _delete_folder_contents(self, bucket_name, folder):
+    def _delete_folder_contents(self, folder):
         logger.info(f"Deleting contents of folder: {folder}")
 
-        paginator = self.aws_interface.s3_client.get_paginator("list_objects_v2")
-        pages = paginator.paginate(Bucket=bucket_name, Prefix=folder)
+        paginator = self.s3_client.get_paginator("list_objects_v2")
+        pages = paginator.paginate(Bucket=self.bucket_name, Prefix=folder)
 
         delete_us = dict(Objects=[])
         for page in pages:
@@ -195,22 +295,32 @@ class TransientS3:
 
                 # AWS limits to deleting 1000 objects at a time
                 if len(delete_us["Objects"]) >= 1000:
-                    self.aws_interface.s3_client.delete_objects(Bucket=bucket_name, Delete=delete_us)
+                    self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
                     delete_us = dict(Objects=[])
 
         if len(delete_us["Objects"]):
-            self.aws_interface.s3_client.delete_objects(Bucket=bucket_name, Delete=delete_us)
+            self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
 
         logger.info(f"Deleted contents of folder: {folder}")
 
 
 class PeerDBInterface:
     def __init__(self, config):
-        self.config = config["peerdb"]
+        if "peerdb" in config:
+            self.config = config["peerdb"]
+            self.docker_host_mapping = self.config.get("docker_host_mapping")
+        else:
+            logger.warning("no peerdb section in config")
+
         self.source = PostgresPeer(config)
-        self.destination = ClickhousePeer(config)
         self.transient_s3 = TransientS3(config)
+        self.destination = ClickhousePeer(config)
 
+    @retry(
+        stop=stop_after_attempt(5),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
     def _make_api_request(self, endpoint, payload):
         url = f"http://{self.config['host']}:{self.config['port']}/api/{endpoint}"
         password = self.config["password"]
@@ -219,17 +329,44 @@ class PeerDBInterface:
 
         headers = {"Authorization": f"Basic {encoded_credentials}", "Content-Type": "application/json"}
 
-        logger.info(f"Making API request to PeerDB endpoint: {endpoint}")
+        logger.debug(f"Making API request to PeerDB endpoint: {endpoint}")
         try:
             r = requests.post(url, headers=headers, json=payload, timeout=30)
+            response = r.json()
             r.raise_for_status()
-            logger.info(f"API request to {endpoint} completed successfully")
-            return r.json()
-        except requests.exceptions.HTTPError as e:
+            logger.debug(f"API request to {endpoint} completed successfully")
+            return response
+        except HTTPError as e:
             logger.error(f"HTTP error occurred: {e}")
-            logger.error(f"Response JSON: {r.json()}")
+            logger.error(f"Response JSON: {r.json() if 'r' in locals() else 'N/A'}")
+
+            if "no rows in result set" in response.get("message", ""):
+                return {"currentFlowState": "STATUS_UNKNOWN"}
+
             raise
 
+    def _resolve_host_mapping(self, host):
+        """
+        Resolves host mapping for Docker environments.
+        If host is localhost/127.0.0.1 and docker_host_mapping is configured,
+        returns the mapped host, otherwise returns original host.
+        """
+        if not self.docker_host_mapping or not host:
+            return host
+
+        if host in ["localhost", "127.0.0.1"]:
+            logger.debug(f"Mapping host {host} to {self.docker_host_mapping} for Docker environment")
+            return self.docker_host_mapping
+
+        url_pattern = r"(localhost|127\.0\.0\.1)"
+        match = re.search(url_pattern, host)
+        if match:
+            original_host = match.group(1)
+            mapped_host = self._resolve_host_mapping(original_host)
+            return host.replace(original_host, mapped_host)
+
+        return host
+
     def create_postgres_peer(self):
         logger.info(f"Creating Postgres peer for database: {self.source.config['database']}")
         payload = {
@@ -237,7 +374,7 @@ class PeerDBInterface:
                 "name": self.source.config["database"],
                 "type": 3,
                 "postgres_config": {
-                    "host": self.source.config["host"],
+                    "host": self._resolve_host_mapping(self.source.config["host"]),
                     "port": int(self.source.config["admin_port"]),
                     "user": self.config["user"],
                     "password": self.config["password"],
@@ -255,23 +392,22 @@ class PeerDBInterface:
 
     def create_clickhouse_peer(self, schema_name):
         logger.info(f"Creating Clickhouse peer for schema: {schema_name}")
-
         payload = {
             "peer": {
                 "name": f"{schema_name}",
                 "type": 8,
                 "clickhouse_config": {
-                    "host": self.destination.config["host"],
+                    "host": self._resolve_host_mapping(self.destination.config["host"]),
                     "port": int(self.destination.config["port"]),
                     "user": self.destination.config["user"],
                     "password": self.destination.config["password"],
                     "database": schema_name,
                     "disable_tls": True,
-                    "s3_path": f"s3://{self.destination.config['s3_bucket']}",
-                    "access_key_id": self.destination.config["access_key_id"],
-                    "secret_access_key": self.destination.config["secret_access_key"],
-                    "region": "local",
-                    "endpoint": f"http://{self.destination.config['s3_host']}:{self.destination.config['s3_port']}",
+                    "s3_path": f"s3://{self.transient_s3.bucket_name}",
+                    "access_key_id": self.transient_s3.access_key,
+                    "secret_access_key": self.transient_s3.secret_key,
+                    "region": self.transient_s3.region_name,
+                    "endpoint": self._resolve_host_mapping(self.transient_s3.endpoint_url),
                 },
             },
             "allow_update": True,
@@ -283,6 +419,16 @@ class PeerDBInterface:
 
         logger.info(f"Clickhouse peer for schema '{schema_name}' created successfully")
 
+    def check_mirror_status(self, schema_name):
+        current_state = "STATUS_UNKNOWN"
+        try:
+            payload = {"flowJobName": schema_name, "includeFlowInfo": False}
+            response = self._make_api_request("v1/mirrors/status", payload)
+            current_state = response.get("currentFlowState")
+        except Exception as e:
+            logger.debug(f"Error checking mirror status for schema '{schema_name}': {str(e)}")
+        return current_state
+
     def drop_mirror(self, schema_name):
         logger.info(f"Dropping mirror for schema: {schema_name}")
 
@@ -301,37 +447,42 @@ class PeerDBInterface:
 
         logger.info(f"Mirror for schema '{schema_name}' dropped successfully")
 
-    def check_mirror_status(self, schema_name):
-        logger.info(f"Checking mirror status for schema: {schema_name}")
-        max_attempts = 60
+    def wait_for_running_mirror(self, schema_name, max_attempts=360, sleep_interval=10):
+        logger.info(f"Waiting for mirror status to be 'STATUS_RUNNING' for schema: {schema_name}")
         attempt = 0
         while attempt < max_attempts:
-            payload = {"flowJobName": schema_name, "includeFlowInfo": False}
-            response = self._make_api_request("v1/mirrors/status", payload)
-            current_state = response.get("currentFlowState")
+            current_state = self.check_mirror_status(schema_name)
 
-            if current_state != "STATUS_SETUP":
-                logger.info(f"Mirror status for schema '{schema_name}' is: {current_state}")
+            if current_state == "STATUS_RUNNING":
+                logger.info(f"Mirror status for schema '{schema_name}' is now: {current_state}")
                 return current_state
 
             attempt += 1
-            time.sleep(10)
+            logger.info(f"Status is '{current_state}'. Waiting {sleep_interval} seconds before next check.")
+            time.sleep(sleep_interval)
 
         logger.warning(f"Mirror status check timed out for schema: {schema_name}")
         return None
 
-    def resync_operations(self, schema_name, table_names, resync, hard_resync):
+    def pre_init(self, schema_name, table_names, clickhouse_sql_path, resync, hard_resync):
+        logger.info("Running pre-init operations.")
         if resync:
-            logger.info(f"Resync requested. Performing {'hard' if hard_resync else 'simple'} resync operations.")
             self.drop_mirror(schema_name)
             self.transient_s3.delete_paths_with_schema(schema_name)
+            self.destination.teardown_from_sql_folder(clickhouse_sql_path)
+            self.source.drop_replication_slot(schema_name)
             if hard_resync:
                 self.destination.delete_existing_tables(table_names)
-            self.source.drop_replication_slot(schema_name)
-            logger.info("Resync operations completed.")
+        logger.info("Pre-init operations completed.")
+
+    def post_init(self, schema_name, table_names, clickhouse_sql_path, resync, hard_resync):
+        logger.info("Running post-init operations.")
+        self.destination.create_row_policies(schema_name, table_names)
+        if resync:
+            self.destination.initialize_from_sql_folder(clickhouse_sql_path)
+        logger.info("Post-init operations completed.")
 
     def create_mirror(self, schema_name, table_names, resync, hard_resync):
-        self.resync_operations(schema_name, table_names, resync, hard_resync)
         logger.info(f"Creating mirror for schema: {schema_name}")
 
         table_mappings = [
@@ -352,7 +503,7 @@ class PeerDBInterface:
             "snapshot_num_rows_per_partition": 1000000,
             "snapshot_max_parallel_workers": 1,
             "snapshot_num_tables_in_parallel": 1,
-            "resync": not hard_resync,
+            "resync": resync and not hard_resync,
             "initial_snapshot_only": False,
             "soft_delete_col_name": "_peerdb_is_deleted",
             "synced_at_col_name": "_peerdb_synced_at",
@@ -365,25 +516,42 @@ class PeerDBInterface:
                 f"Failed to create mirror for schema '{schema_name}': {response.get('errorMessage', response.get('message', 'Unknown error'))}"
             )
 
-        mirror_status = self.check_mirror_status(schema_name)
+        mirror_status = self.wait_for_running_mirror(schema_name)
         if mirror_status:
-            logger.info(f"Mirror status for schema '{schema_name}' is: {mirror_status}")
             logger.info(f"Mirror creation for schema '{schema_name}' completed successfully")
         else:
             logger.warning(f"Failed to confirm mirror status change for schema: {schema_name}")
 
-    def run_automation(self, schema_name, schema_tables, drop=False, resync=False, hard_resync=False):
+    def run_automation(
+        self,
+        schema_name,
+        schema_tables,
+        drop=False,
+        sync=False,
+        resync=False,
+        hard_resync=False,
+        clickhouse_sql_path=None,
+    ):
         logger.info(f"Starting automation for schema: {schema_name}")
 
-        table_names = [table.__tablename__ for table in schema_tables]
+        base_tables = [table for table, _ in schema_tables]
+        mirror_tablenames = [table.__tablename__ for table, should_replicate in schema_tables if should_replicate]
 
-        self.source.create_user()
-        self.source.create_tables(schema_tables, drop)
-        self.source.grant_permissions(schema_name)
-        self.source.create_publication(schema_name, table_names)
+        self.source.create_tables(base_tables, drop)
+        if not (sync or resync):
+            return
+
+        peerdb_user = self.config["user"]
+        peerdb_pwd = self.config["password"]
+
+        self.source.create_user(peerdb_user, peerdb_pwd)
+        self.source.grant_permissions(schema_name, peerdb_user)
+        self.source.create_publication(schema_name, mirror_tablenames)
+        self.destination.connect(schema_name)
         self.create_postgres_peer()
         self.create_clickhouse_peer(schema_name)
-        self.create_mirror(schema_name, table_names, resync, hard_resync)
-        self.destination.create_row_policies(schema_name, table_names)
+        self.pre_init(schema_name, mirror_tablenames, clickhouse_sql_path, resync, hard_resync)
+        self.create_mirror(schema_name, mirror_tablenames, resync, hard_resync)
+        self.post_init(schema_name, mirror_tablenames, clickhouse_sql_path, resync, hard_resync)
 
         logger.info(f"Automation completed successfully for schema: {schema_name}")