acryl-datahub 1.0.0.4rc4__py3-none-any.whl → 1.0.0.4rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -4,22 +4,24 @@ import difflib
  import logging
  from typing import TYPE_CHECKING, List, Literal, Optional, Set, Union
 
+ from typing_extensions import assert_never
+
  import datahub.metadata.schema_classes as models
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
  from datahub.errors import SdkUsageError
- from datahub.metadata.schema_classes import SchemaMetadataClass
- from datahub.metadata.urns import DatasetUrn, QueryUrn
- from datahub.sdk._shared import DatasetUrnOrStr
+ from datahub.metadata.urns import DataJobUrn, DatasetUrn, QueryUrn
+ from datahub.sdk._shared import DatajobUrnOrStr, DatasetUrnOrStr
  from datahub.sdk._utils import DEFAULT_ACTOR_URN
  from datahub.sdk.dataset import ColumnLineageMapping, parse_cll_mapping
+ from datahub.specific.datajob import DataJobPatchBuilder
  from datahub.specific.dataset import DatasetPatchBuilder
  from datahub.sql_parsing.fingerprint_utils import generate_hash
  from datahub.utilities.ordered_set import OrderedSet
+ from datahub.utilities.urns.error import InvalidUrnError
 
  if TYPE_CHECKING:
      from datahub.sdk.main_client import DataHubClient
 
- logger = logging.getLogger(__name__)
 
  _empty_audit_stamp = models.AuditStampClass(
      time=0,
@@ -27,16 +29,19 @@ _empty_audit_stamp = models.AuditStampClass(
  )
 
 
+ logger = logging.getLogger(__name__)
+
+
  class LineageClient:
      def __init__(self, client: DataHubClient):
          self._client = client
 
      def _get_fields_from_dataset_urn(self, dataset_urn: DatasetUrn) -> Set[str]:
          schema_metadata = self._client._graph.get_aspect(
-             str(dataset_urn), SchemaMetadataClass
+             str(dataset_urn), models.SchemaMetadataClass
          )
          if schema_metadata is None:
-             return Set()
+             return set()
 
          return {field.fieldPath for field in schema_metadata.fields}
 
@@ -122,7 +127,7 @@ class LineageClient:
 
          if column_lineage is None:
              cll = None
-         elif column_lineage in ["auto_fuzzy", "auto_strict"]:
+         elif column_lineage == "auto_fuzzy" or column_lineage == "auto_strict":
              upstream_schema = self._get_fields_from_dataset_urn(upstream)
              downstream_schema = self._get_fields_from_dataset_urn(downstream)
              if column_lineage == "auto_fuzzy":
@@ -144,6 +149,8 @@ class LineageClient:
                  downstream=downstream,
                  cll_mapping=column_lineage,
              )
+         else:
+             assert_never(column_lineage)
 
          updater = DatasetPatchBuilder(str(downstream))
          updater.add_upstream_lineage(
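The membership test `column_lineage in ["auto_fuzzy", "auto_strict"]` is replaced with explicit `==` comparisons, and a new `else: assert_never(column_lineage)` branch closes the chain. Below is a minimal, self-contained sketch of that exhaustiveness pattern; the `ColumnLineage` alias and `resolve_cll` function are hypothetical stand-ins, not part of the package:

from typing import Dict, List, Literal, Optional, Union

from typing_extensions import assert_never

# Hypothetical alias that only mirrors the shape of the real column_lineage parameter.
ColumnLineage = Union[Literal["auto_fuzzy", "auto_strict"], Dict[str, List[str]]]

def resolve_cll(column_lineage: Optional[ColumnLineage]) -> str:
    if column_lineage is None:
        return "no column-level lineage"
    elif column_lineage == "auto_fuzzy" or column_lineage == "auto_strict":
        # Plain == checks (unlike `in [...]`) let the type checker narrow the Literal values.
        return f"auto mode: {column_lineage}"
    elif isinstance(column_lineage, dict):
        return f"explicit mapping for {len(column_lineage)} columns"
    else:
        # Fails type checking (and raises at runtime) if a new variant is added
        # to ColumnLineage without a corresponding branch above.
        assert_never(column_lineage)
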
@@ -227,9 +234,129 @@ class LineageClient:
              raise SdkUsageError(
                  f"Dataset {updater.urn} does not exist, and hence cannot be updated."
              )
+
          mcps: List[
              Union[MetadataChangeProposalWrapper, models.MetadataChangeProposalClass]
          ] = list(updater.build())
          if query_entity:
              mcps.extend(query_entity)
          self._client._graph.emit_mcps(mcps)
+
+     def add_dataset_lineage_from_sql(
+         self,
+         *,
+         query_text: str,
+         platform: str,
+         platform_instance: Optional[str] = None,
+         env: str = "PROD",
+         default_db: Optional[str] = None,
+         default_schema: Optional[str] = None,
+     ) -> None:
+         """Add lineage by parsing a SQL query."""
+         from datahub.sql_parsing.sqlglot_lineage import (
+             create_lineage_sql_parsed_result,
+         )
+
+         # Parse the SQL query to extract lineage information
+         parsed_result = create_lineage_sql_parsed_result(
+             query=query_text,
+             default_db=default_db,
+             default_schema=default_schema,
+             platform=platform,
+             platform_instance=platform_instance,
+             env=env,
+             graph=self._client._graph,
+         )
+
+         if parsed_result.debug_info.table_error:
+             raise SdkUsageError(
+                 f"Failed to parse SQL query: {parsed_result.debug_info.error}"
+             )
+         elif parsed_result.debug_info.column_error:
+             logger.warning(
+                 f"Failed to parse SQL query: {parsed_result.debug_info.error}",
+             )
+
+         if not parsed_result.out_tables:
+             raise SdkUsageError(
+                 "No output tables found in the query. Cannot establish lineage."
+             )
+
+         # Use the first output table as the downstream
+         downstream_urn = parsed_result.out_tables[0]
+
+         # Process all upstream tables found in the query
+         for upstream_table in parsed_result.in_tables:
+             # Skip self-lineage
+             if upstream_table == downstream_urn:
+                 continue
+
+             # Extract column-level lineage for this specific upstream table
+             column_mapping = {}
+             if parsed_result.column_lineage:
+                 for col_lineage in parsed_result.column_lineage:
+                     if not (col_lineage.downstream and col_lineage.downstream.column):
+                         continue
+
+                     # Filter upstreams to only include columns from current upstream table
+                     upstream_cols = [
+                         ref.column
+                         for ref in col_lineage.upstreams
+                         if ref.table == upstream_table and ref.column
+                     ]
+
+                     if upstream_cols:
+                         column_mapping[col_lineage.downstream.column] = upstream_cols
+
+             # Add lineage, including query text
+             self.add_dataset_transform_lineage(
+                 upstream=upstream_table,
+                 downstream=downstream_urn,
+                 column_lineage=column_mapping or None,
+                 query_text=query_text,
+             )
+
+     def add_datajob_lineage(
+         self,
+         *,
+         datajob: DatajobUrnOrStr,
+         upstreams: Optional[List[Union[DatasetUrnOrStr, DatajobUrnOrStr]]] = None,
+         downstreams: Optional[List[DatasetUrnOrStr]] = None,
+     ) -> None:
+         """
+         Add lineage between a datajob and datasets/datajobs.
+
+         Args:
+             datajob: The datajob URN to connect lineage with
+             upstreams: List of upstream datasets or datajobs that serve as inputs to the datajob
+             downstreams: List of downstream datasets that are outputs of the datajob
+         """
+
+         if not upstreams and not downstreams:
+             raise SdkUsageError("No upstreams or downstreams provided")
+
+         datajob_urn = DataJobUrn.from_string(datajob)
+
+         # Initialize the patch builder for the datajob
+         patch_builder = DataJobPatchBuilder(str(datajob_urn))
+
+         # Process upstream connections (inputs to the datajob)
+         if upstreams:
+             for upstream in upstreams:
+                 # try converting to dataset urn
+                 try:
+                     dataset_urn = DatasetUrn.from_string(upstream)
+                     patch_builder.add_input_dataset(dataset_urn)
+                 except InvalidUrnError:
+                     # try converting to datajob urn
+                     datajob_urn = DataJobUrn.from_string(upstream)
+                     patch_builder.add_input_datajob(datajob_urn)
+
+         # Process downstream connections (outputs from the datajob)
+         if downstreams:
+             for downstream in downstreams:
+                 downstream_urn = DatasetUrn.from_string(downstream)
+                 patch_builder.add_output_dataset(downstream_urn)
+
+         # Apply the changes to the entity
+         self._client.entities.update(patch_builder)
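The hunk above adds two public methods to LineageClient: add_dataset_lineage_from_sql, which derives table- and column-level lineage from a parsed SQL statement, and add_datajob_lineage, which attaches upstream and downstream entities to a data job via a patch. A hedged usage sketch follows; the server URL, token, URNs, and SQL text are illustrative placeholders, and the client construction assumes the standard SDK constructor rather than anything introduced in this release:

from datahub.sdk.main_client import DataHubClient

# Placeholder connection details; adjust to your DataHub instance.
client = DataHubClient(server="http://localhost:8080", token="<token>")

# Derive dataset-to-dataset lineage (with column-level lineage where the parser
# can infer it) from a SQL statement.
client.lineage.add_dataset_lineage_from_sql(
    query_text="INSERT INTO sales_summary SELECT region, SUM(amount) AS total FROM sales GROUP BY region",
    platform="snowflake",
    default_db="ANALYTICS",
    default_schema="PUBLIC",
)

# Attach a data job to one upstream dataset and one downstream dataset.
client.lineage.add_datajob_lineage(
    datajob="urn:li:dataJob:(urn:li:dataFlow:(airflow,daily_etl,PROD),summarize_sales)",
    upstreams=["urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.public.sales,PROD)"],
    downstreams=["urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.public.sales_summary,PROD)"],
)
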
@@ -12,10 +12,10 @@ from datahub.sdk.search_client import SearchClient
 
  try:
      from acryl_datahub_cloud._sdk_extras import (  # type: ignore[import-not-found]
-         AssertionClient,
+         AssertionsClient,
      )
  except ImportError:
-     AssertionClient = None
+     AssertionsClient = None
 
 
  class DataHubClient:
@@ -112,9 +112,9 @@ class DataHubClient:
          return LineageClient(self)
 
      @property
-     def assertion(self) -> AssertionClient:  # type: ignore[return-value] # Type is not available if assertion_client is not installed
-         if AssertionClient is None:
+     def assertions(self) -> AssertionsClient:  # type: ignore[return-value] # Type is not available if assertion_client is not installed
+         if AssertionsClient is None:
              raise SdkUsageError(
-                 "AssertionClient is not installed, please install it with `pip install acryl-datahub-cloud`"
+                 "AssertionsClient is not installed, please install it with `pip install acryl-datahub-cloud`"
              )
-         return AssertionClient(self)
+         return AssertionsClient(self)
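The two hunks above rename the cloud-only assertion entry point: the optional import and the DataHubClient property move from AssertionClient / client.assertion to AssertionsClient / client.assertions, so code using the old spelling will now fail with an AttributeError. A short sketch of the new access path, with placeholder connection details:

from datahub.errors import SdkUsageError
from datahub.sdk.main_client import DataHubClient

client = DataHubClient(server="http://localhost:8080", token="<token>")  # placeholders

try:
    assertions_client = client.assertions  # was client.assertion before this release
except SdkUsageError:
    # Raised when the optional acryl-datahub-cloud package is not installed.
    assertions_client = None
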
@@ -48,16 +48,22 @@ def wait_for_port(
          subprocess.run(f"docker logs {container_name}", shell=True, check=True)
 
 
+ DOCKER_DEFAULT_UNLIMITED_PARALLELISM = -1
+
+
  @pytest.fixture(scope="module")
  def docker_compose_runner(
      docker_compose_command, docker_compose_project_name, docker_setup, docker_cleanup
  ):
      @contextlib.contextmanager
      def run(
-         compose_file_path: Union[str, List[str]], key: str, cleanup: bool = True
+         compose_file_path: Union[str, List[str]],
+         key: str,
+         cleanup: bool = True,
+         parallel: int = DOCKER_DEFAULT_UNLIMITED_PARALLELISM,
      ) -> Iterator[pytest_docker.plugin.Services]:
          with pytest_docker.plugin.get_docker_services(
-             docker_compose_command=docker_compose_command,
+             docker_compose_command=f"{docker_compose_command} --parallel {parallel}",
              # We can remove the type ignore once this is merged:
              # https://github.com/avast/pytest-docker/pull/108
              docker_compose_file=compose_file_path, # type: ignore
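The final hunk adds a parallel knob to the docker_compose_runner test fixture: the value is appended to the compose command as --parallel N, with -1 (DOCKER_DEFAULT_UNLIMITED_PARALLELISM) keeping Docker Compose's default of unlimited parallel container startup. A hedged sketch of how a test might opt in; the compose file path, key, and readiness check are made up for illustration:

def test_my_source(docker_compose_runner, pytestconfig):
    compose_file = pytestconfig.rootpath / "tests/integration/my_source/docker-compose.yml"
    # parallel=1 forces compose to start containers one at a time.
    with docker_compose_runner(str(compose_file), "my_source", parallel=1) as docker_services:
        docker_services.wait_until_responsive(
            timeout=60,
            pause=1,
            check=lambda: True,  # replace with a real readiness probe
        )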