acryl-datahub: 0.15.0rc10 (py3-none-any.whl) → 0.15.0rc11 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

@@ -1,4 +1,4 @@
1
- datahub/__init__.py,sha256=vEQvZGPQ7hfoL7fm6XTBo6OY37wlMIITNNW1rpZuQCk,575
1
+ datahub/__init__.py,sha256=RtQFMiAfUcKAAt_1AITPz1UOKRx2gIW0yLbLYUEChqU,575
2
2
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
3
3
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
4
4
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -61,7 +61,7 @@ datahub/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
61
61
  datahub/cli/check_cli.py,sha256=9dXNyzZayHeoFjwFjLkMVyx6DiCZfeESyI-sYtGA6bE,12850
62
62
  datahub/cli/cli_utils.py,sha256=xMEK4tmoX2f_5BihxWrApOkayVLwhpPvMUU842x6FsI,13111
63
63
  datahub/cli/config_utils.py,sha256=yuXw7RzpRY5x_-MAoqWbv46qUkIeRNAJL4_OeJpYdBE,4879
64
- datahub/cli/delete_cli.py,sha256=YEbPDFcMtbIHDE0yNB9opfVI5r8f7kFOnfyc0rlyIqU,20906
64
+ datahub/cli/delete_cli.py,sha256=Z7iXNr4ZMmghCnldU8laK4SwTNrhQEEnnUH_TeaBKog,21838
65
65
  datahub/cli/docker_check.py,sha256=rED4wHXqxcQ_qNFyIgFEZ85BHT9ZTE5YC-oUKqbRqi0,9432
66
66
  datahub/cli/docker_cli.py,sha256=QGoWFp8ZZsXOSMbgu0Q4snMmMmtP3epWAN-fYglUNEc,36491
67
67
  datahub/cli/env_utils.py,sha256=RQzjg4JE29hjPt4v7p-RuqoOr99w8E3DBHWiN2Sm7T4,252
@@ -97,7 +97,7 @@ datahub/configuration/git.py,sha256=s55eUHxKqVZgtVsISaDyS-1F4iZBiybbjYsjbp5LU5o,
97
97
  datahub/configuration/import_resolver.py,sha256=b4Ie9L7knN1LALEVMxTcNFSklDD6CVE-4Ipy4ZYhNYA,369
98
98
  datahub/configuration/json_loader.py,sha256=vIDnjwXWi9yHDO8KW64EupOzOb_sspehGCD7xGHzg84,302
99
99
  datahub/configuration/kafka.py,sha256=MlIwpd5FFyOyjdDXW_X9JTLNk7f988sPMgevkcZYVgI,2579
100
- datahub/configuration/kafka_consumer_config.py,sha256=14UWK6kTAnKTgHM43aeWjg67fddACsSLzhO6wgp6cm4,1175
100
+ datahub/configuration/kafka_consumer_config.py,sha256=LivsObTt9yC3WoGnslJbF_x4ojfNdxMIMEhb8vvJfcA,2133
101
101
  datahub/configuration/pattern_utils.py,sha256=Q5IB9RfWOOo5FvRVBU7XkhiwHCxSQ1NTMfUlWtWI9qc,699
102
102
  datahub/configuration/pydantic_migration_helpers.py,sha256=4C_COAVZ5iJ8yxcWNgXZNWsY7ULogICNZ368oNF7zWg,1462
103
103
  datahub/configuration/source_common.py,sha256=68LZOuB23zSEcfgQJE1wZQnyYQHVVnEZK3Sniv_nEQs,2107
@@ -201,7 +201,7 @@ datahub/ingestion/source/nifi.py,sha256=ttsjZ9aRUvINmewvKFIQD8Rwa4jcl35WFG-F-jPG
201
201
  datahub/ingestion/source/openapi.py,sha256=3ea2ORz1cuq4e7L2hSjxG9Cw3__pVoJ5UNYTJS3EnKU,17386
202
202
  datahub/ingestion/source/openapi_parser.py,sha256=1_68wHWe_SzWYEyC1YVDw9vxoadKjW1yv8DecvyIhwY,13606
203
203
  datahub/ingestion/source/preset.py,sha256=eq7h1qKs8nfSBVot1ofN-YgZhw_rzq8DG4cKOGfDHko,3948
204
- datahub/ingestion/source/pulsar.py,sha256=bKtBzBTPuKwdTN4nSs7M0FYfA-q6FDc554W2NhnuWEY,19475
204
+ datahub/ingestion/source/pulsar.py,sha256=H8XJC7xIX8Kdkd7006PxllAGVO_Pjza8Xx9VUBOvpPc,19827
205
205
  datahub/ingestion/source/redash.py,sha256=E-a14X19zppPun7_-S-pZ2lRiw1-68QiT-jL7bDzG10,32057
206
206
  datahub/ingestion/source/salesforce.py,sha256=S6LSM6mzl8-zKbrJPoINhM1SCpYfM244Xb74pbEI-J0,31792
207
207
  datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
@@ -301,7 +301,7 @@ datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP
301
301
  datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
302
302
  datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
303
303
  datahub/ingestion/source/gc/datahub_gc.py,sha256=f6Erj3KfD0Hx3ydwL5MUVCZgFzS9c6U2Pkr54JLIUOA,12394
304
- datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=rGIwUKPlNu2XR0YT8DuJGg2pPGIr3MM-YDa5Slo2vNY,14470
304
+ datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=GOdLVYYfmmHV7Xtr-cwCTiQiRSVwq80Nu0EmARyoEeo,15323
305
305
  datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=cHJmxz4NmA7VjTX2iGEo3wZ_SDrjC_rCQcnRxKgfUVI,8713
306
306
  datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=_tms5AqNAJRDRzQmyN_VydzXbdME2lkvTwa5u1La5z8,7353
307
307
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -974,8 +974,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
974
974
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
975
975
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
976
976
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
977
- acryl_datahub-0.15.0rc10.dist-info/METADATA,sha256=oHQM6bvFxv1QboFDXeahyZEbfacayYoFOlOUcPacaeI,172487
978
- acryl_datahub-0.15.0rc10.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
979
- acryl_datahub-0.15.0rc10.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
980
- acryl_datahub-0.15.0rc10.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
981
- acryl_datahub-0.15.0rc10.dist-info/RECORD,,
977
+ acryl_datahub-0.15.0rc11.dist-info/METADATA,sha256=0GnLLvLM_Fvdw4XNusJLRrDZ0vemeu_B0ftz8XQXmbs,174408
978
+ acryl_datahub-0.15.0rc11.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
979
+ acryl_datahub-0.15.0rc11.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
980
+ acryl_datahub-0.15.0rc11.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
981
+ acryl_datahub-0.15.0rc11.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
3
3
 
4
4
  # Published at https://pypi.org/project/acryl-datahub/.
5
5
  __package_name__ = "acryl-datahub"
6
- __version__ = "0.15.0rc10"
6
+ __version__ = "0.15.0rc11"
7
7
 
8
8
 
9
9
  def is_dev_mode() -> bool:
datahub/cli/delete_cli.py CHANGED
@@ -214,14 +214,47 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
214
214
 
215
215
 
216
216
  @delete.command()
217
- @click.option("--urn", required=True, type=str, help="the urn of the entity")
218
- def undo_by_filter(urn: str) -> None:
217
+ @click.option("--urn", required=False, type=str, help="the urn of the entity")
218
+ @click.option(
219
+ "-p",
220
+ "--platform",
221
+ required=False,
222
+ type=str,
223
+ help="Platform filter (e.g. snowflake)",
224
+ )
225
+ @click.option(
226
+ "-b",
227
+ "--batch-size",
228
+ required=False,
229
+ default=3000,
230
+ type=int,
231
+ help="Batch size when querying for entities to un-soft delete."
232
+ "Maximum 10000. Large batch sizes may cause timeouts.",
233
+ )
234
+ def undo_by_filter(
235
+ urn: Optional[str], platform: Optional[str], batch_size: int
236
+ ) -> None:
219
237
  """
220
- Undo a soft deletion of an entity
238
+ Undo soft deletion by filters
221
239
  """
222
240
  graph = get_default_graph()
223
241
  logger.info(f"Using {graph}")
224
- graph.set_soft_delete_status(urn=urn, delete=False)
242
+ if urn:
243
+ graph.set_soft_delete_status(urn=urn, delete=False)
244
+ else:
245
+ urns = list(
246
+ graph.get_urns_by_filter(
247
+ platform=platform,
248
+ query="*",
249
+ status=RemovedStatusFilter.ONLY_SOFT_DELETED,
250
+ batch_size=batch_size,
251
+ )
252
+ )
253
+ logger.info(f"Going to un-soft delete {len(urns)} urns")
254
+ urns_iter = progressbar.progressbar(urns, redirect_stdout=True)
255
+ for urn in urns_iter:
256
+ assert urn
257
+ graph.set_soft_delete_status(urn=urn, delete=False)
225
258
 
226
259
 
227
260
  @delete.command(no_args_is_help=True)
datahub/configuration/kafka_consumer_config.py CHANGED
@@ -1,3 +1,4 @@
1
+ import inspect
1
2
  import logging
2
3
  from typing import Any, Dict, Optional
3
4
 
@@ -34,5 +35,34 @@ class CallableConsumerConfig:
34
35
  "oauth_cb must be a string representing python function reference "
35
36
  "in the format <python-module>:<function-name>."
36
37
  )
38
+
39
+ call_back_fn = import_path(call_back)
40
+ self._validate_call_back_fn_signature(call_back_fn)
41
+
37
42
  # Set the callback
38
- self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = import_path(call_back)
43
+ self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = call_back_fn
44
+
45
+ def _validate_call_back_fn_signature(self, call_back_fn: Any) -> None:
46
+ sig = inspect.signature(call_back_fn)
47
+
48
+ num_positional_args = len(
49
+ [
50
+ param
51
+ for param in sig.parameters.values()
52
+ if param.kind
53
+ in (
54
+ inspect.Parameter.POSITIONAL_ONLY,
55
+ inspect.Parameter.POSITIONAL_OR_KEYWORD,
56
+ )
57
+ and param.default == inspect.Parameter.empty
58
+ ]
59
+ )
60
+
61
+ has_variadic_args = any(
62
+ param.kind == inspect.Parameter.VAR_POSITIONAL
63
+ for param in sig.parameters.values()
64
+ )
65
+
66
+ assert num_positional_args == 1 or (
67
+ has_variadic_args and num_positional_args <= 1
68
+ ), "oauth_cb function must accept single positional argument."
datahub/ingestion/source/gc/dataprocess_cleanup.py CHANGED
@@ -208,22 +208,28 @@ class DataProcessCleanup:
208
208
  dpis = []
209
209
  start = 0
210
210
  while True:
211
- job_query_result = self.ctx.graph.execute_graphql(
212
- DATA_PROCESS_INSTANCES_QUERY,
213
- {"dataJobUrn": job_urn, "start": start, "count": batch_size},
214
- )
215
- job_data = job_query_result.get("dataJob")
216
- if not job_data:
217
- raise ValueError(f"Error getting job {job_urn}")
218
-
219
- runs_data = job_data.get("runs")
220
- if not runs_data:
221
- raise ValueError(f"Error getting runs for {job_urn}")
222
-
223
- runs = runs_data.get("runs")
224
- dpis.extend(runs)
225
- start += batch_size
226
- if len(runs) < batch_size:
211
+ try:
212
+ job_query_result = self.ctx.graph.execute_graphql(
213
+ DATA_PROCESS_INSTANCES_QUERY,
214
+ {"dataJobUrn": job_urn, "start": start, "count": batch_size},
215
+ )
216
+ job_data = job_query_result.get("dataJob")
217
+ if not job_data:
218
+ logger.error(f"Error getting job {job_urn}")
219
+ break
220
+
221
+ runs_data = job_data.get("runs")
222
+ if not runs_data:
223
+ logger.error(f"Error getting runs for {job_urn}")
224
+ break
225
+
226
+ runs = runs_data.get("runs")
227
+ dpis.extend(runs)
228
+ start += batch_size
229
+ if len(runs) < batch_size:
230
+ break
231
+ except Exception as e:
232
+ logger.error(f"Exception while fetching DPIs for job {job_urn}: {e}")
227
233
  break
228
234
  return dpis
229
235
 
@@ -243,8 +249,12 @@ class DataProcessCleanup:
243
249
  futures[future] = dpi
244
250
 
245
251
  for future in as_completed(futures):
246
- deleted_count_last_n += 1
247
- futures[future]["deleted"] = True
252
+ try:
253
+ future.result()
254
+ deleted_count_last_n += 1
255
+ futures[future]["deleted"] = True
256
+ except Exception as e:
257
+ logger.error(f"Exception while deleting DPI: {e}")
248
258
 
249
259
  if deleted_count_last_n % self.config.batch_size == 0:
250
260
  logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
@@ -279,7 +289,7 @@ class DataProcessCleanup:
279
289
  dpis = self.fetch_dpis(job.urn, self.config.batch_size)
280
290
  dpis.sort(
281
291
  key=lambda x: x["created"]["time"]
282
- if x["created"] and x["created"]["time"]
292
+ if "created" in x and "time" in x["created"]
283
293
  else 0,
284
294
  reverse=True,
285
295
  )
@@ -314,15 +324,23 @@ class DataProcessCleanup:
314
324
  if dpi.get("deleted"):
315
325
  continue
316
326
 
317
- if dpi["created"]["time"] < retention_time * 1000:
327
+ if (
328
+ "created" not in dpi
329
+ or "time" not in dpi["created"]
330
+ or dpi["created"]["time"] < retention_time * 1000
331
+ ):
318
332
  future = executor.submit(
319
333
  self.delete_entity, dpi["urn"], "dataprocessInstance"
320
334
  )
321
335
  futures[future] = dpi
322
336
 
323
337
  for future in as_completed(futures):
324
- deleted_count_retention += 1
325
- futures[future]["deleted"] = True
338
+ try:
339
+ future.result()
340
+ deleted_count_retention += 1
341
+ futures[future]["deleted"] = True
342
+ except Exception as e:
343
+ logger.error(f"Exception while deleting DPI: {e}")
326
344
 
327
345
  if deleted_count_retention % self.config.batch_size == 0:
328
346
  logger.info(
@@ -378,8 +396,11 @@ class DataProcessCleanup:
378
396
  dataFlows[flow.urn] = flow
379
397
 
380
398
  scroll_id: Optional[str] = None
399
+ previous_scroll_id: Optional[str] = None
400
+
381
401
  dataJobs: Dict[str, List[DataJobEntity]] = defaultdict(list)
382
402
  deleted_jobs: int = 0
403
+
383
404
  while True:
384
405
  result = self.ctx.graph.execute_graphql(
385
406
  DATAJOB_QUERY,
@@ -426,9 +447,11 @@ class DataProcessCleanup:
426
447
  else:
427
448
  dataJobs[datajob_entity.flow_urn].append(datajob_entity)
428
449
 
429
- if not scroll_id:
450
+ if not scroll_id or previous_scroll_id == scroll_id:
430
451
  break
431
452
 
453
+ previous_scroll_id = scroll_id
454
+
432
455
  logger.info(f"Deleted {deleted_jobs} DataJobs")
433
456
  # Delete empty dataflows if needed
434
457
  if self.config.delete_empty_data_flows:
@@ -443,4 +466,5 @@ class DataProcessCleanup:
443
466
  if deleted_jobs % self.config.batch_size == 0:
444
467
  logger.info(f"Deleted {deleted_data_flows} DataFlows")
445
468
  logger.info(f"Deleted {deleted_data_flows} DataFlows")
469
+
446
470
  return []
datahub/ingestion/source/pulsar.py CHANGED
@@ -78,7 +78,17 @@ class PulsarSchema:
78
78
  def __init__(self, schema):
79
79
  self.schema_version = schema.get("version")
80
80
 
81
- avro_schema = json.loads(schema.get("data"))
81
+ schema_data = schema.get("data")
82
+ if not schema_data:
83
+ logger.warning("Schema data is empty or None. Using default empty schema.")
84
+ schema_data = "{}"
85
+
86
+ try:
87
+ avro_schema = json.loads(schema_data)
88
+ except json.JSONDecodeError as e:
89
+ logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}")
90
+ avro_schema = {}
91
+
82
92
  self.schema_name = avro_schema.get("namespace") + "." + avro_schema.get("name")
83
93
  self.schema_description = avro_schema.get("doc")
84
94
  self.schema_type = schema.get("type")