acryl-datahub 1.1.0.5rc3__py3-none-any.whl → 1.1.0.5rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (41)
  1. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/METADATA +2423 -2423
  2. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/RECORD +41 -34
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/api/report.py +123 -2
  5. datahub/ingestion/api/source.py +45 -44
  6. datahub/ingestion/autogenerated/lineage_helper.py +193 -0
  7. datahub/ingestion/run/pipeline.py +6 -0
  8. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  9. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  10. datahub/ingestion/source/bigquery_v2/queries.py +4 -4
  11. datahub/ingestion/source/common/subtypes.py +2 -0
  12. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  13. datahub/ingestion/source/hex/api.py +26 -1
  14. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  15. datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -15
  16. datahub/ingestion/source/slack/slack.py +2 -1
  17. datahub/ingestion/source/snowflake/snowflake_queries.py +1 -0
  18. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  19. datahub/ingestion/source/sql/vertica.py +2 -1
  20. datahub/ingestion/source/unity/source.py +36 -20
  21. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  22. datahub/metadata/_internal_schema_classes.py +601 -0
  23. datahub/metadata/_urns/urn_defs.py +112 -0
  24. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  25. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  26. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  27. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  28. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  29. datahub/metadata/schema.avsc +383 -0
  30. datahub/metadata/schemas/CorpUserSettings.avsc +25 -0
  31. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  32. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +202 -0
  33. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  34. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  35. datahub/metadata/schemas/GlobalSettingsInfo.avsc +25 -0
  36. datahub/sdk/datajob.py +39 -15
  37. datahub/specific/dataproduct.py +4 -0
  38. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/WHEEL +0 -0
  39. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/entry_points.txt +0 -0
  40. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/licenses/LICENSE +0 -0
  41. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc4.dist-info}/top_level.txt +0 -0

datahub/ingestion/autogenerated/lineage_helper.py (new file)

@@ -0,0 +1,193 @@
+ import json
+ import logging
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Set
+
+ from datahub.utilities.urns.urn import guess_entity_type
+
+ logger = logging.getLogger(__name__)
+
+ # Global cache for lineage data to avoid repeated file reads
+ _lineage_data: Optional[Dict] = None
+
+
+ def _load_lineage_data() -> Dict:
+     """
+     This is experimental internal API subject to breaking changes without prior notice.
+
+     Load lineage data from the autogenerated lineage.json file.
+
+     Returns:
+         Dict containing the lineage information
+
+     Raises:
+         FileNotFoundError: If lineage.json doesn't exist
+         json.JSONDecodeError: If lineage.json is malformed
+     """
+     global _lineage_data
+
+     if _lineage_data is not None:
+         return _lineage_data
+
+     # Get the path to lineage.json relative to this file
+     current_file = Path(__file__)
+     lineage_file = current_file.parent / "lineage.json"
+
+     if not lineage_file.exists():
+         raise FileNotFoundError(f"Lineage file not found: {lineage_file}")
+
+     try:
+         with open(lineage_file, "r") as f:
+             _lineage_data = json.load(f)
+             return _lineage_data
+     except json.JSONDecodeError as e:
+         raise json.JSONDecodeError(
+             f"Failed to parse lineage.json: {e}", e.doc, e.pos
+         ) from e
+
+
+ def get_lineage_fields(entity_type: str, aspect_name: str) -> List[Dict]:
+     """
+     This is experimental internal API subject to breaking changes without prior notice.
+
+     Get lineage fields for a specific entity type and aspect.
+
+     Args:
+         entity_type: The entity type (e.g., 'dataset', 'dataJob')
+         aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+
+     Returns:
+         List of lineage field dictionaries, each containing:
+         - name: field name
+         - path: dot-notation path to the field
+         - isLineage: boolean indicating if it's lineage
+         - relationship: relationship information
+
+     Raises:
+         FileNotFoundError: If lineage.json doesn't exist
+         json.JSONDecodeError: If lineage.json is malformed
+     """
+     lineage_data = _load_lineage_data()
+
+     entity_data = lineage_data.get("entities", {}).get(entity_type, {})
+     aspect_data = entity_data.get(aspect_name, {})
+
+     return aspect_data.get("fields", [])
+
+
+ def is_lineage_field(urn: str, aspect_name: str, field_path: str) -> bool:
+     """
+     This is experimental internal API subject to breaking changes without prior notice.
+
+     Check if a specific field path is lineage-related.
+
+     Args:
+         urn: The entity URN (e.g., 'urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)')
+         aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+         field_path: The dot-notation path to the field (e.g., 'upstreams.dataset')
+
+     Returns:
+         True if the field is lineage-related, False otherwise
+
+     Raises:
+         FileNotFoundError: If lineage.json doesn't exist
+         json.JSONDecodeError: If lineage.json is malformed
+         AssertionError: If URN doesn't start with 'urn:li:'
+     """
+     entity_type = guess_entity_type(urn)
+     lineage_fields = get_lineage_fields(entity_type, aspect_name)
+
+     for field in lineage_fields:
+         if field.get("path") == field_path:
+             return field.get("isLineage", False)
+
+     return False
+
+
+ def has_lineage(urn: str, aspect: Any) -> bool:
+     """
+     This is experimental internal API subject to breaking changes without prior notice.
+
+     Check if an aspect has any lineage fields.
+
+     Args:
+         urn: The entity URN (e.g., 'urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)')
+         aspect: The aspect object
+
+     Returns:
+         True if the aspect has lineage fields, False otherwise
+
+     Raises:
+         FileNotFoundError: If lineage.json doesn't exist
+         json.JSONDecodeError: If lineage.json is malformed
+         AssertionError: If URN doesn't start with 'urn:li:'
+     """
+     entity_type = guess_entity_type(urn)
+     aspect_class = getattr(aspect, "__class__", None)
+     aspect_name = (
+         aspect_class.__name__ if aspect_class is not None else str(type(aspect))
+     )
+
+     lineage_fields = get_lineage_fields(entity_type, aspect_name)
+     return len(lineage_fields) > 0
+
+
+ def has_lineage_aspect(entity_type: str, aspect_name: str) -> bool:
+     """
+     This is experimental internal API subject to breaking changes without prior notice.
+
+     Check if an aspect has any lineage fields.
+
+     Args:
+         entity_type: The entity type (e.g., 'dataset', 'dataJob')
+         aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+
+     Returns:
+         True if the aspect has lineage fields, False otherwise
+
+     Raises:
+         FileNotFoundError: If lineage.json doesn't exist
+         json.JSONDecodeError: If lineage.json is malformed
+     """
+     lineage_fields = get_lineage_fields(entity_type, aspect_name)
+     return len(lineage_fields) > 0
+
+
+ def get_all_lineage_aspects(entity_type: str) -> Set[str]:
+     """
+     This is experimental internal API subject to breaking changes without prior notice.
+
+     Get all aspects that have lineage fields for a given entity type.
+
+     Args:
+         entity_type: The entity type (e.g., 'dataset', 'dataJob')
+
+     Returns:
+         Set of aspect names that have lineage fields
+
+     Raises:
+         FileNotFoundError: If lineage.json doesn't exist
+         json.JSONDecodeError: If lineage.json is malformed
+     """
+     lineage_data = _load_lineage_data()
+
+     entity_data = lineage_data.get("entities", {}).get(entity_type, {})
+     lineage_aspects = set()
+
+     for aspect_name, aspect_data in entity_data.items():
+         if aspect_data.get("fields"):
+             lineage_aspects.add(aspect_name)
+
+     return lineage_aspects
+
+
+ def clear_cache() -> None:
+     """
+     This is experimental internal API subject to breaking changes without prior notice.
+
+     Clear the internal cache of lineage data.
+
+     This is useful for testing or when the lineage.json file has been updated.
+     """
+     global _lineage_data
+     _lineage_data = None
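
For orientation, a minimal usage sketch of the new helper module, based only on the signatures and docstring examples above; it assumes the autogenerated lineage.json ships next to the module and contains entries for the 'dataset' entity.

```python
# Minimal sketch, assuming lineage.json is present alongside the installed module.
from datahub.ingestion.autogenerated import lineage_helper

dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)"

# Aspects of the 'dataset' entity that carry lineage fields.
print(lineage_helper.get_all_lineage_aspects("dataset"))

# Whether a specific field path inside an aspect is lineage-related
# (aspect/field names here are the illustrative ones from the docstrings).
print(lineage_helper.is_lineage_field(dataset_urn, "upstreamLineage", "upstreams.dataset"))

# Reset the module-level cache, e.g. after regenerating lineage.json in tests.
lineage_helper.clear_cache()
```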

datahub/ingestion/run/pipeline.py

@@ -578,11 +578,17 @@ class Pipeline:
  sink_failures = len(self.sink.get_report().failures)
  sink_warnings = len(self.sink.get_report().warnings)
  global_warnings = len(get_global_warnings())
+ source_aspects = self.source.get_report().get_aspects_dict()
+ source_aspects_by_subtype = (
+     self.source.get_report().get_aspects_by_subtypes_dict()
+ )

  telemetry_instance.ping(
      "ingest_stats",
      {
          "source_type": self.source_type,
+         "source_aspects": source_aspects,
+         "source_aspects_by_subtype": source_aspects_by_subtype,
          "sink_type": self.sink_type,
          "transformer_types": [
              transformer.type for transformer in self.config.transformers or []

datahub/ingestion/source/bigquery_v2/bigquery_queries.py

@@ -94,3 +94,4 @@ class BigQueryQueriesSource(Source):
  def close(self) -> None:
      self.queries_extractor.close()
      self.connection.close()
+     super().close()

datahub/ingestion/source/bigquery_v2/profiler.py

@@ -189,6 +189,7 @@ WHERE

  if len(profile_requests) == 0:
      return
+
  yield from self.generate_profile_workunits(
      profile_requests,
      max_workers=self.config.profiling.max_workers,
@@ -226,10 +227,11 @@ WHERE
      db_name, schema_name, bq_table, self.config.profiling.partition_datetime
  )

- if partition is None and bq_table.partition_info:
+ # For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
+ if partition is None and bq_table.partition_info and bq_table.rows_count:
      self.report.report_warning(
          title="Profile skipped for partitioned table",
-         message="profile skipped as partitioned table is empty or partition id or type was invalid",
+         message="profile skipped as partition id or type was invalid",
          context=profile_request.pretty_name,
      )
      return None

datahub/ingestion/source/bigquery_v2/queries.py

@@ -45,12 +45,12 @@ SELECT
  tos.OPTION_VALUE as comment,
  t.is_insertable_into,
  t.ddl,
- ts.row_count,
- ts.size_bytes as bytes,
+ ts.row_count as row_count,
+ ts.size_bytes as size_bytes,
  p.num_partitions,
  p.max_partition_id,
- p.active_billable_bytes,
- p.long_term_billable_bytes,
+ p.active_billable_bytes as active_billable_bytes,
+ -- IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
  REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
  REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base


datahub/ingestion/source/common/subtypes.py

@@ -26,6 +26,8 @@ class DatasetSubTypes(StrEnum):
  NEO4J_RELATIONSHIP = "Neo4j Relationship"
  SNOWFLAKE_STREAM = "Snowflake Stream"
  API_ENDPOINT = "API Endpoint"
+ SLACK_CHANNEL = "Slack Channel"
+ PROJECTIONS = "Projections"

  # TODO: Create separate entity...
  NOTEBOOK = "Notebook"

datahub/ingestion/source/fivetran/fivetran.py

@@ -1,8 +1,8 @@
  import logging
- from typing import Dict, Iterable, List, Optional
+ from typing import Dict, Iterable, List, Optional, Union

  import datahub.emitter.mce_builder as builder
- from datahub.api.entities.datajob import DataFlow, DataJob
+ from datahub.api.entities.datajob import DataJob as DataJobV1
  from datahub.api.entities.dataprocess.dataprocess_instance import (
      DataProcessInstance,
      InstanceRunResult,
@@ -42,8 +42,10 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
      FineGrainedLineageDownstreamType,
      FineGrainedLineageUpstreamType,
  )
- from datahub.utilities.urns.data_flow_urn import DataFlowUrn
- from datahub.utilities.urns.dataset_urn import DatasetUrn
+ from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
+ from datahub.sdk.dataflow import DataFlow
+ from datahub.sdk.datajob import DataJob
+ from datahub.sdk.entity import Entity

  # Logger instance
  logger = logging.getLogger(__name__)
@@ -75,8 +77,8 @@ class FivetranSource(StatefulIngestionSourceBase):
      self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)

  def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
-     input_dataset_urn_list: List[DatasetUrn] = []
-     output_dataset_urn_list: List[DatasetUrn] = []
+     input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
+     output_dataset_urn_list: List[Union[str, DatasetUrn]] = []
      fine_grained_lineage: List[FineGrainedLineage] = []

      # TODO: Once Fivetran exposes the database via the API, we shouldn't ask for it via config.
@@ -178,9 +180,9 @@ class FivetranSource(StatefulIngestionSourceBase):
      )
  )

- datajob.inlets.extend(input_dataset_urn_list)
- datajob.outlets.extend(output_dataset_urn_list)
- datajob.fine_grained_lineages.extend(fine_grained_lineage)
+ datajob.set_inlets(input_dataset_urn_list)
+ datajob.set_outlets(output_dataset_urn_list)
+ datajob.set_fine_grained_lineages(fine_grained_lineage)

  return dict(
      **{
@@ -197,10 +199,10 @@ class FivetranSource(StatefulIngestionSourceBase):

  def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
      return DataFlow(
-         orchestrator=Constant.ORCHESTRATOR,
-         id=connector.connector_id,
+         platform=Constant.ORCHESTRATOR,
+         name=connector.connector_id,
          env=self.config.env,
-         name=connector.connector_name,
+         display_name=connector.connector_name,
          platform_instance=self.config.platform_instance,
      )

@@ -213,11 +215,11 @@ class FivetranSource(StatefulIngestionSourceBase):
  )
  owner_email = self.audit_log.get_user_email(connector.user_id)
  datajob = DataJob(
-     id=connector.connector_id,
+     name=connector.connector_id,
      flow_urn=dataflow_urn,
      platform_instance=self.config.platform_instance,
-     name=connector.connector_name,
-     owners={owner_email} if owner_email else set(),
+     display_name=connector.connector_name,
+     owners=[CorpUserUrn(owner_email)] if owner_email else None,
  )

  # Map connector source and destination table with dataset entity
@@ -232,16 +234,24 @@ class FivetranSource(StatefulIngestionSourceBase):
          "sync_frequency": str(connector.sync_frequency),
          "destination_id": connector.destination_id,
      }
-     datajob.properties = {
-         **connector_properties,
-         **lineage_properties,
-     }
+
+     datajob.set_custom_properties({**connector_properties, **lineage_properties})

      return datajob

  def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance:
+     # hack: convert to old instance for DataProcessInstance.from_datajob compatibility
+     datajob_v1 = DataJobV1(
+         id=datajob.name,
+         flow_urn=datajob.flow_urn,
+         platform_instance=self.config.platform_instance,
+         name=datajob.name,
+         inlets=datajob.inlets,
+         outlets=datajob.outlets,
+         fine_grained_lineages=datajob.fine_grained_lineages,
+     )
      return DataProcessInstance.from_datajob(
-         datajob=datajob,
+         datajob=datajob_v1,
          id=job.job_id,
          clone_inlets=True,
          clone_outlets=True,
@@ -278,17 +288,15 @@ class FivetranSource(StatefulIngestionSourceBase):

  def _get_connector_workunits(
      self, connector: Connector
- ) -> Iterable[MetadataWorkUnit]:
+ ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
      self.report.report_connectors_scanned()
      # Create dataflow entity with same name as connector name
      dataflow = self._generate_dataflow_from_connector(connector)
-     for mcp in dataflow.generate_mcp():
-         yield mcp.as_workunit()
+     yield dataflow

      # Map Fivetran's connector entity with Datahub's datajob entity
      datajob = self._generate_datajob_from_connector(connector)
-     for mcp in datajob.generate_mcp(materialize_iolets=False):
-         yield mcp.as_workunit()
+     yield datajob

      # Map Fivetran's job/sync history entity with Datahub's data process entity
      if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR:
@@ -310,7 +318,7 @@ class FivetranSource(StatefulIngestionSourceBase):
          ).workunit_processor,
      ]

- def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
      """
      Datahub Ingestion framework invoke this method
      """

datahub/ingestion/source/hex/api.py

@@ -5,7 +5,9 @@ from typing import Any, Dict, Generator, List, Optional, Union

  import requests
  from pydantic import BaseModel, Field, ValidationError, validator
+ from requests.adapters import HTTPAdapter
  from typing_extensions import assert_never
+ from urllib3.util.retry import Retry

  from datahub.ingestion.api.source import SourceReport
  from datahub.ingestion.source.hex.constants import (
@@ -220,6 +222,7 @@ class HexApi:
      self.base_url = base_url
      self.report = report
      self.page_size = page_size
+     self.session = self._create_retry_session()

  def _list_projects_url(self):
      return f"{self.base_url}/projects"
@@ -227,6 +230,28 @@ class HexApi:
  def _auth_header(self):
      return {"Authorization": f"Bearer {self.token}"}

+ def _create_retry_session(self) -> requests.Session:
+     """Create a requests session with retry logic for rate limiting.
+
+     Hex API rate limit: 60 requests per minute
+     https://learn.hex.tech/docs/api/api-overview#kernel-and-rate-limits
+     """
+     session = requests.Session()
+
+     # Configure retry strategy for 429 (Too Many Requests) with exponential backoff
+     retry_strategy = Retry(
+         total=5,  # Maximum number of retries
+         status_forcelist=[429],  # Only retry on 429 status code
+         backoff_factor=2,  # Exponential backoff: 2, 4, 8, 16, 32 seconds
+         raise_on_status=True,  # Raise exception after max retries
+     )
+
+     adapter = HTTPAdapter(max_retries=retry_strategy)
+     session.mount("http://", adapter)
+     session.mount("https://", adapter)
+
+     return session
+
  def fetch_projects(
      self,
      include_components: bool = True,
@@ -259,7 +284,7 @@ class HexApi:
  logger.debug(f"Fetching projects page with params: {params}")
  self.report.fetch_projects_page_calls += 1
  try:
-     response = requests.get(
+     response = self.session.get(
          url=self._list_projects_url(),
          headers=self._auth_header(),
          params=params,
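
As a usage note, the mounted adapter makes retries transparent to callers: a 429 is retried with exponential backoff up to five times and only then surfaces as a `requests.exceptions.RetryError`. A standalone sketch of the same pattern follows; the endpoint and token are placeholders, not the real Hex API URL.

```python
# Standalone illustration of the retry pattern added above; endpoint/token are placeholders.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(
    total=5,                 # give up after five retries
    status_forcelist=[429],  # retry only rate-limit responses
    backoff_factor=2,        # exponential backoff between attempts
    raise_on_status=True,    # raise instead of returning the final 429
)
session.mount("https://", HTTPAdapter(max_retries=retry))

try:
    resp = session.get(
        "https://example.invalid/api/v1/projects",
        headers={"Authorization": "Bearer <token>"},
    )
    resp.raise_for_status()
except requests.exceptions.RetryError:
    # All retries exhausted on 429 responses.
    raise
```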