acryl-datahub 1.2.0.10rc4__py3-none-any.whl → 1.2.0.10rc5__py3-none-any.whl

This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.
@@ -0,0 +1,107 @@
+ from datetime import datetime
+ from typing import Iterable, Optional, Tuple
+
+ import requests
+
+ from datahub.ingestion.api.source import (
+     SourceReport,
+ )
+ from datahub.ingestion.source.snaplogic.snaplogic_config import SnaplogicConfig
+ from datahub.ingestion.source.state.redundant_run_skip_handler import (
+     RedundantLineageRunSkipHandler,
+ )
+
+
+ class SnaplogicLineageExtractor:
+     """
+     A class to interact with the SnapLogic API.
+     """
+
+     def __init__(
+         self,
+         config: SnaplogicConfig,
+         redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler],
+         report: SourceReport,
+     ):
+         self.config = config
+         self.report = report
+         self.redundant_run_skip_handler = redundant_run_skip_handler
+         self.start_time, self.end_time = self._get_time_window()
+
+     def get_lineages(self) -> Iterable[dict]:
+         """Generator function that yields lineage records one at a time as they are fetched."""
+         page = 0
+         has_more = True
+         records_processed = 0
+
+         try:
+             while has_more:
+                 params = {
+                     "format": "OPENLINEAGE",
+                     "start_ts": str(int(self.start_time.timestamp() * 1000)),
+                     "end_ts": str(int(self.end_time.timestamp() * 1000)),
+                     "page": str(page),
+                 }
+
+                 self.report.info(
+                     message=f"Fetching lineage data - page: {page}, start_ts: {self.start_time}, end_ts: {self.end_time}",
+                     title="Lineage Fetch",
+                 )
+                 headers = {"User-Agent": "datahub-connector/1.0"}
+                 response = requests.get(
+                     url=f"{self.config.base_url}/api/1/rest/public/catalog/{self.config.org_name}/lineage",
+                     params=params,
+                     headers=headers,
+                     auth=(
+                         self.config.username,
+                         self.config.password.get_secret_value(),
+                     ),
+                 )
+                 response.raise_for_status()
+
+                 data = response.json()
+                 content = data["content"]
+
+                 # Yield records one at a time
+                 for record in content:
+                     records_processed += 1
+                     yield record
+
+                 # Check if we need to fetch more pages
+                 has_more = (
+                     len(content) >= 20
+                 )  # If we got full page size, there might be more
+                 page += 1
+
+             self.report.info(
+                 message=f"Completed fetching lineage data. Total records processed: {records_processed}",
+                 title="Lineage Fetch Complete",
+             )
+
+         except Exception as e:
+             self.report.report_failure(
+                 message="Error fetching lineage data",
+                 exc=e,
+                 title="Lineage Fetch Error",
+             )
+             raise
+
+     def _get_time_window(self) -> Tuple[datetime, datetime]:
+         if self.redundant_run_skip_handler:
+             return self.redundant_run_skip_handler.suggest_run_time_window(
+                 self.config.start_time, self.config.end_time
+             )
+         else:
+             return self.config.start_time, self.config.end_time
+
+     def update_stats(self):
+         if self.redundant_run_skip_handler:
+             # Update the checkpoint state for this run.
+             self.redundant_run_skip_handler.update_state(
+                 self.config.start_time,
+                 self.config.end_time,
+             )
+
+     def report_status(self, step: str, status: bool) -> None:
+         if self.redundant_run_skip_handler:
+             self.redundant_run_skip_handler.report_current_run_status(step, status)
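For orientation, a minimal sketch of how this extractor might be driven; the config values and the process() consumer are hypothetical, not part of this diff:

    # Hypothetical driver -- SnaplogicConfig values are illustrative only.
    config = SnaplogicConfig(
        base_url="https://elastic.snaplogic.com",
        org_name="my-org",
        username="user@example.com",
        password="secret",
    )
    extractor = SnaplogicLineageExtractor(
        config, redundant_run_skip_handler=None, report=SourceReport()
    )
    for record in extractor.get_lineages():  # OpenLineage-format dicts, fetched page by page
        process(record)  # hypothetical consumer

Because get_lineages() is a generator, records stream through as each page of 20 is fetched rather than accumulating in memory.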
@@ -0,0 +1,168 @@
+ from dataclasses import dataclass, field
+ from typing import Dict, List, Optional
+
+
+ @dataclass
+ class Dataset:
+     name: str
+     display_name: str
+     fields: List[Dict] = field(default_factory=list)
+     platform: str = "snaplogic"
+     platform_instance: Optional[str] = None
+     type: Optional[str] = None  # INPUT or OUTPUT
+     env: str = "PROD"
+
+
+ @dataclass
+ class Pipeline:
+     name: str
+     id: str
+     namespace: str
+
+
+ @dataclass
+ class Task:
+     name: str
+     id: str
+     namespace: str
+
+
+ @dataclass
+ class ColumnMapping:
+     input_dataset: Dataset
+     output_dataset: Dataset
+     input_field: str
+     output_field: str
+
+
+ class SnapLogicParser:
+     def __init__(self, case_insensitive_namespaces: list[str], namespace_mapping: dict):
+         self.case_insensitive_namespaces = case_insensitive_namespaces
+         self.namespace_mapping = namespace_mapping
+         self.platform_mapping = {
+             "sqlserver": "mssql",
+         }
+
+     def _parse_platform(self, namespace: str) -> str:
+         type_part = namespace.split("://")[0] if "://" in namespace else namespace
+
+         return self.platform_mapping.get(type_part.lower(), type_part.lower())
+
+     def extract_task_from_lineage(self, lineage: dict) -> Task:
+         job = lineage.get("job")
+         if not job:
+             raise ValueError("Job information is missing in the lineage data.")
+         name = job.get("name")
+         namespace = job.get("namespace")
+
+         return Task(
+             id=name,
+             name=name.rsplit(":", 1)[0],
+             namespace=self._parse_platform(namespace),
+         )
+
+     def extract_pipeline_from_lineage(self, lineage: dict) -> Pipeline:
+         parent_run = lineage.get("run", {}).get("facets", {}).get("parent", {})
+         job = parent_run.get("job", {})
+         name = job.get("name")
+         namespace = job.get("namespace")
+         pipeline_snode_id = parent_run.get("_producer").split("#pipe_snode=")[1]
+         return Pipeline(
+             id=pipeline_snode_id, name=name, namespace=self._parse_platform(namespace)
+         )
+
+     def _get_case_sensitive_value(self, value: str, namespace: str) -> str:
+         """Transform value to lowercase if namespace is case-insensitive."""
+         return value.lower() if namespace in self.case_insensitive_namespaces else value
+
+     def _create_dataset_info(
+         self,
+         namespace: str,
+         name: str,
+         display_name: str,
+         type: str,
+         fields: Optional[List[Dict]] = None,
+     ) -> Dataset:
+         """Create a Dataset instance with proper case sensitivity."""
+         return Dataset(
+             platform=self._parse_platform(namespace),
+             name=self._get_case_sensitive_value(name, namespace),
+             display_name=display_name or name,
+             fields=fields or [],
+             env="PROD",
+             platform_instance=self.namespace_mapping.get(namespace, None),
+             type=type,
+         )
+
+     def extract_columns_mapping_from_lineage(
+         self, lineage: dict
+     ) -> List[ColumnMapping]:
+         outputs = lineage.get("outputs", [])
+         lineages = []
+
+         for output in outputs:
+             output_namespace = output.get("namespace")
+             output_name = output.get("name", "")
+             column_lineage = (
+                 output.get("facets", {}).get("columnLineage", {}).get("fields", {})
+             )
+
+             for field_name, field_dict in column_lineage.items():
+                 output_field = self._get_case_sensitive_value(
+                     field_name, output_namespace
+                 )
+
+                 for input_field in field_dict.get("inputFields", []):
+                     input_namespace = input_field.get("namespace")
+                     input_name = input_field.get("name", "")
+                     input_field_name = input_field.get("field", "")
+
+                     lineages.append(
+                         ColumnMapping(
+                             input_dataset=self._create_dataset_info(
+                                 input_namespace, input_name, input_name, "INPUT"
+                             ),
+                             output_dataset=self._create_dataset_info(
+                                 output_namespace, output_name, output_name, "OUTPUT"
+                             ),
+                             input_field=self._get_case_sensitive_value(
+                                 input_field_name, input_namespace
+                             ),
+                             output_field=output_field,
+                         )
+                     )
+
+         return lineages
+
+     def extract_datasets_from_lineage(self, lineage: dict) -> List[Dataset]:
+         inputs = lineage.get("inputs", {})
+         outputs = lineage.get("outputs", {})
+
+         datasets = []
+         for dataset, dataset_type in [
+             *[(input_dataset, "INPUT") for input_dataset in inputs],
+             *[(output_dataset, "OUTPUT") for output_dataset in outputs],
+         ]:
+             namespace = dataset.get("namespace")
+             name = dataset.get("name", "")
+             fields = dataset.get("facets", {}).get("schema", {}).get("fields", [])
+             display_name = name
+
+             # Transform names to lowercase if namespace is in case_insensitive_namespaces
+             if namespace in self.case_insensitive_namespaces:
+                 name = name.lower()
+                 fields = [
+                     {**field, "name": field.get("name", "").lower()} for field in fields
+                 ]
+
+             datasets.append(
+                 self._create_dataset_info(
+                     namespace=namespace,
+                     name=name,
+                     fields=fields,
+                     display_name=display_name,
+                     type=dataset_type,
+                 )
+             )
+
+         return datasets
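A short sketch of how the parser consumes an OpenLineage event; the namespace and field values below are illustrative only:

    # Hypothetical event -- values are illustrative, not from this diff.
    parser = SnapLogicParser(
        case_insensitive_namespaces=["snowflake://acme"], namespace_mapping={}
    )
    event = {
        "inputs": [
            {
                "namespace": "snowflake://acme",
                "name": "DB.PUBLIC.ORDERS",
                "facets": {"schema": {"fields": [{"name": "ID", "type": "number"}]}},
            }
        ],
        "outputs": [],
    }
    datasets = parser.extract_datasets_from_lineage(event)
    # The case-insensitive namespace lowercases the dataset name ("db.public.orders")
    # and its field names; _parse_platform maps "snowflake://acme" to platform "snowflake".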
@@ -0,0 +1,31 @@
+ from datahub.metadata.schema_classes import (
+     BooleanTypeClass,
+     NumberTypeClass,
+     SchemaFieldDataTypeClass,
+     StringTypeClass,
+ )
+
+
+ class SnaplogicUtils:
+     @staticmethod
+     def get_datahub_type(type_str: str) -> SchemaFieldDataTypeClass:
+         """
+         Maps a string-based type to a DataHub SchemaFieldDataTypeClass.
+
+         Args:
+             type_str (str): The input type (e.g., "string", "int", "boolean").
+
+         Returns:
+             SchemaFieldDataTypeClass: The mapped DataHub type.
+         """
+         normalized_type = type_str.lower()
+
+         if normalized_type in ["string", "varchar"]:
+             return SchemaFieldDataTypeClass(type=StringTypeClass())
+         elif normalized_type in ["number", "long", "float", "double", "int"]:
+             return SchemaFieldDataTypeClass(type=NumberTypeClass())
+         elif normalized_type == "boolean":
+             return SchemaFieldDataTypeClass(type=BooleanTypeClass())
+         else:
+             # Default fallback: String
+             return SchemaFieldDataTypeClass(type=StringTypeClass())
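For illustration, how the mapping behaves for a few inputs (a sketch following the branches above):

    SnaplogicUtils.get_datahub_type("VARCHAR")  # StringTypeClass (matching is case-insensitive)
    SnaplogicUtils.get_datahub_type("double")   # NumberTypeClass
    SnaplogicUtils.get_datahub_type("date")     # unmapped, falls back to StringTypeClass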
@@ -594,13 +594,13 @@ class TableauConfig(
      )
 
      extract_lineage_from_unsupported_custom_sql_queries: bool = Field(
-         default=False,
-         description="[Experimental] Whether to extract lineage from unsupported custom sql queries using SQL parsing",
+         default=True,
+         description="[Experimental] Extract lineage from Custom SQL queries using DataHub's SQL parser in cases where the Tableau Catalog API fails to return lineage for the query.",
      )
 
      force_extraction_of_lineage_from_custom_sql_queries: bool = Field(
          default=False,
-         description="[Experimental] Force extraction of lineage from custom sql queries using SQL parsing, ignoring Tableau metadata",
+         description="[Experimental] Force extraction of lineage from Custom SQL queries using DataHub's SQL parser, even when the Tableau Catalog API returns lineage already.",
      )
 
      sql_parsing_disable_schema_awareness: bool = Field(
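Note the behavior change above: extract_lineage_from_unsupported_custom_sql_queries now defaults to True, so SQL-parser-based lineage extraction runs automatically whenever the Tableau Catalog API returns no lineage for a Custom SQL query. A hedged sketch of a source config fragment that restores the previous behavior (shown as the dict of field values; the surrounding recipe is omitted):

    # Hypothetical config fragment for the tableau source.
    tableau_config = {
        "extract_lineage_from_unsupported_custom_sql_queries": False,  # restore the old default
        "force_extraction_of_lineage_from_custom_sql_queries": False,  # unchanged default
    }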
@@ -0,0 +1,112 @@
+ import re
+ from collections import defaultdict
+ from typing import Dict, List, Optional, cast
+
+ from datahub.configuration.common import (
+     TransformerSemanticsConfigModel,
+ )
+ from datahub.emitter.mce_builder import Aspect
+ from datahub.ingestion.api.common import PipelineContext
+ from datahub.ingestion.transformer.base_transformer import (
+     BaseTransformer,
+     SingleAspectTransformer,
+ )
+ from datahub.metadata.schema_classes import (
+     BrowsePathEntryClass,
+     BrowsePathsV2Class,
+ )
+ from datahub.utilities.urns.urn import guess_entity_type
+
+
+ class SetBrowsePathTransformerConfig(TransformerSemanticsConfigModel):
+     path: List[str]
+
+
+ class SetBrowsePathTransformer(BaseTransformer, SingleAspectTransformer):
+     ctx: PipelineContext
+     config: SetBrowsePathTransformerConfig
+
+     def __init__(self, config: SetBrowsePathTransformerConfig, ctx: PipelineContext):
+         super().__init__()
+         self.ctx = ctx
+         self.config = config
+
+     def aspect_name(self) -> str:
+         return "browsePathsV2"
+
+     def entity_types(self) -> List[str]:
+         # This is an arbitrary list and might be adjusted; it could reasonably be made configurable.
+         return ["dataset", "dataJob", "dataFlow", "chart", "dashboard", "container"]
+
+     @classmethod
+     def create(
+         cls, config_dict: dict, ctx: PipelineContext
+     ) -> "SetBrowsePathTransformer":
+         config = SetBrowsePathTransformerConfig.parse_obj(config_dict)
+         return cls(config, ctx)
+
+     @staticmethod
+     def _build_model(existing_browse_paths: BrowsePathsV2Class) -> Dict[str, List[str]]:
+         template_vars: Dict[str, List[str]] = {}
+         model: Dict[str, List[str]] = defaultdict(list)
+         for entry in existing_browse_paths.path or []:
+             if entry.urn:
+                 entity_type = guess_entity_type(entry.urn)
+                 model[entity_type].append(entry.urn)
+
+         for entity_type, urns in model.items():
+             template_vars[f"{entity_type}[*]"] = urns
+             for i, urn in enumerate(urns):
+                 template_vars[f"{entity_type}[{i}]"] = [urn]
+
+         return template_vars
+
+     @classmethod
+     def _expand_nodes(
+         cls, templates: List[str], template_vars: Dict[str, List[str]]
+     ) -> BrowsePathsV2Class:
+         expanded_nodes: List[str] = []
+         for node in templates:
+             resolved_nodes = cls._resolve_template_to_nodes(node, template_vars)
+             expanded_nodes.extend(resolved_nodes)
+
+         processed_entries: List[BrowsePathEntryClass] = []
+         for node in expanded_nodes:
+             if not node or node.isspace():
+                 continue
+             processed_entries.append(
+                 BrowsePathEntryClass(
+                     id=node, urn=node if node.startswith("urn:") else None
+                 )
+             )
+         return BrowsePathsV2Class(path=processed_entries)
+
+     def transform_aspect(
+         self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect]
+     ) -> Optional[Aspect]:
+         template_vars: Dict[str, List[str]] = {}
+         if aspect is not None:
+             assert isinstance(aspect, BrowsePathsV2Class)
+             template_vars = self._build_model(aspect)
+         new_browse_paths: BrowsePathsV2Class = self._expand_nodes(
+             self.config.path, template_vars
+         )
+         if aspect is not None and not self.config.replace_existing:
+             for node in aspect.path:
+                 new_browse_paths.path.append(node)
+
+         return cast(Aspect, new_browse_paths)
+
+     @staticmethod
+     def _resolve_template_to_nodes(
+         template_str: str, template_vars: Dict[str, List[str]]
+     ) -> List[str]:
+         # This mechanism could be made simpler (match against known variables only) or more
+         # sophisticated (e.g. by using a proper templating engine, like Jinja).
+         template_str = template_str.strip()
+         var_pattern = re.findall(r"^\$([a-zA-Z]+\[[0-9*]+]$)", template_str)
+
+         if not var_pattern:
+             return [template_str]
+
+         return template_vars.get(var_pattern[0], [])
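A sketch of how the template variables resolve; the URNs here are illustrative. If an entity's existing browsePathsV2 contains two container URNs, _build_model exposes them as $container[*], $container[0], and $container[1], while literal segments pass through unchanged:

    # Hypothetical usage -- URNs are illustrative only.
    config = SetBrowsePathTransformerConfig.parse_obj(
        {"path": ["prod", "$container[*]"], "replace_existing": True}
    )
    # With existing entries [urn:li:container:a, urn:li:container:b], the new browse
    # path becomes: "prod" (a literal id with no urn), then both container URNs in order.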
datahub/sdk/_shared.py CHANGED
@@ -1,6 +1,7 @@
  from __future__ import annotations
 
  import warnings
+ from abc import ABC, abstractmethod
  from datetime import datetime
  from typing import (
      TYPE_CHECKING,
@@ -61,6 +62,7 @@ DataPlatformInstanceUrnOrStr: TypeAlias = Union[str, DataPlatformInstanceUrn]
  DataPlatformUrnOrStr: TypeAlias = Union[str, DataPlatformUrn]
 
  ActorUrn: TypeAlias = Union[CorpUserUrn, CorpGroupUrn]
+ ActorUrnOrStr: TypeAlias = Union[str, ActorUrn]
  StructuredPropertyUrnOrStr: TypeAlias = Union[str, StructuredPropertyUrn]
  StructuredPropertyValueType: TypeAlias = Union[str, float, int]
  StructuredPropertyInputType: TypeAlias = Dict[
@@ -110,6 +112,130 @@ def parse_time_stamp(ts: Optional[models.TimeStampClass]) -> Optional[datetime]:
      return parse_ts_millis(ts.time)
 
 
+ class ChangeAuditStampsMixin(ABC):
+     """Mixin class for managing audit stamps on entities."""
+
+     __slots__ = ()
+
+     @abstractmethod
+     def _get_audit_stamps(self) -> models.ChangeAuditStampsClass:
+         """Get the audit stamps from the entity properties."""
+         pass
+
+     @abstractmethod
+     def _set_audit_stamps(self, audit_stamps: models.ChangeAuditStampsClass) -> None:
+         """Set the audit stamps on the entity properties."""
+         pass
+
+     @property
+     def last_modified(self) -> Optional[datetime]:
+         """Get the last modification timestamp from audit stamps."""
+         audit_stamps: models.ChangeAuditStampsClass = self._get_audit_stamps()
+         if audit_stamps.lastModified.time == 0:
+             return None
+         return datetime.fromtimestamp(
+             audit_stamps.lastModified.time / 1000
+         )  # supports only seconds precision
+
+     def set_last_modified(self, last_modified: datetime) -> None:
+         """Set the last modification timestamp in audit stamps."""
+         audit_stamps: models.ChangeAuditStampsClass = self._get_audit_stamps()
+         audit_stamps.lastModified.time = make_ts_millis(last_modified)
+         self._set_audit_stamps(audit_stamps)
+
+     @property
+     def last_modified_by(self) -> Optional[str]:
+         """Get the last modification actor from audit stamps."""
+         audit_stamps: models.ChangeAuditStampsClass = self._get_audit_stamps()
+         if audit_stamps.lastModified.actor == builder.UNKNOWN_USER:
+             return None
+         return audit_stamps.lastModified.actor
+
+     def set_last_modified_by(self, last_modified_by: ActorUrnOrStr) -> None:
+         """Set the last modification actor in audit stamps."""
+         if isinstance(last_modified_by, str):
+             last_modified_by = make_user_urn(last_modified_by)
+         audit_stamps: models.ChangeAuditStampsClass = self._get_audit_stamps()
+         audit_stamps.lastModified.actor = str(last_modified_by)
+         self._set_audit_stamps(audit_stamps)
+
+     @property
+     def created_at(self) -> Optional[datetime]:
+         """Get the creation timestamp from audit stamps."""
+         audit_stamps: models.ChangeAuditStampsClass = self._get_audit_stamps()
+         if audit_stamps.created.time == 0:
+             return None
+         return datetime.fromtimestamp(
+             audit_stamps.created.time / 1000
+         )  # supports only seconds precision
+
+     def set_created_at(self, created_at: datetime) -> None:
+         """Set the creation timestamp in audit stamps."""
+         audit_stamps: models.ChangeAuditStampsClass = self._get_audit_stamps()
+         audit_stamps.created.time = make_ts_millis(created_at)
+         self._set_audit_stamps(audit_stamps)
+
+     @property
+     def created_by(self) -> Optional[ActorUrnOrStr]:
+         """Get the creation actor from audit stamps."""
+         audit_stamps: models.ChangeAuditStampsClass = self._get_audit_stamps()
+         if audit_stamps.created.actor == builder.UNKNOWN_USER:
+             return None
+         return audit_stamps.created.actor
+
+     def set_created_by(self, created_by: ActorUrnOrStr) -> None:
+         """Set the creation actor in audit stamps."""
+         if isinstance(created_by, str):
+             created_by = make_user_urn(created_by)
+         audit_stamps: models.ChangeAuditStampsClass = self._get_audit_stamps()
+         audit_stamps.created.actor = str(created_by)
+         self._set_audit_stamps(audit_stamps)
+
+     @property
+     def deleted_on(self) -> Optional[datetime]:
+         """Get the deletion timestamp from audit stamps."""
+         audit_stamps: models.ChangeAuditStampsClass = self._get_audit_stamps()
+         if audit_stamps.deleted is None or audit_stamps.deleted.time == 0:
+             return None
+         return datetime.fromtimestamp(
+             audit_stamps.deleted.time / 1000
+         )  # supports only seconds precision
+
+     def set_deleted_on(self, deleted_on: datetime) -> None:
+         """Set the deletion timestamp in audit stamps."""
+         audit_stamps: models.ChangeAuditStampsClass = self._get_audit_stamps()
+         # Default constructor sets deleted to None
+         if audit_stamps.deleted is None:
+             audit_stamps.deleted = models.AuditStampClass(
+                 time=0, actor=builder.UNKNOWN_USER
+             )
+         audit_stamps.deleted.time = make_ts_millis(deleted_on)
+         self._set_audit_stamps(audit_stamps)
+
+     @property
+     def deleted_by(self) -> Optional[ActorUrnOrStr]:
+         """Get the deletion actor from audit stamps."""
+         audit_stamps: models.ChangeAuditStampsClass = self._get_audit_stamps()
+         if (
+             audit_stamps.deleted is None
+             or audit_stamps.deleted.actor == builder.UNKNOWN_USER
+         ):
+             return None
+         return audit_stamps.deleted.actor
+
+     def set_deleted_by(self, deleted_by: ActorUrnOrStr) -> None:
+         """Set the deletion actor in audit stamps."""
+         if isinstance(deleted_by, str):
+             deleted_by = make_user_urn(deleted_by)
+         audit_stamps: models.ChangeAuditStampsClass = self._get_audit_stamps()
+         if audit_stamps.deleted is None:
+             audit_stamps.deleted = models.AuditStampClass(
+                 time=0, actor=builder.UNKNOWN_USER
+             )
+         audit_stamps.deleted.actor = str(deleted_by)
+         self._set_audit_stamps(audit_stamps)
+
+
  class HasPlatformInstance(Entity):
      __slots__ = ()
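Finally, a sketch of what the mixin provides to an SDK entity; the Dashboard instance here is hypothetical and stands in for any entity class implementing the two abstract hooks:

    # Hypothetical entity -- assumes dashboard implements _get_audit_stamps/_set_audit_stamps.
    dashboard.set_last_modified(datetime(2024, 1, 1))
    dashboard.set_last_modified_by("jdoe")  # bare strings are coerced via make_user_urn
    assert dashboard.last_modified_by == "urn:li:corpuser:jdoe"
    assert dashboard.deleted_on is None     # unset stamps read back as None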