acryl-datahub 1.1.0.5rc3__py3-none-any.whl → 1.1.0.5rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (52)
  1. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/METADATA +2575 -2575
  2. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/RECORD +52 -45
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +21 -4
  5. datahub/ingestion/api/decorators.py +14 -3
  6. datahub/ingestion/api/report.py +123 -2
  7. datahub/ingestion/api/source.py +45 -44
  8. datahub/ingestion/autogenerated/lineage_helper.py +193 -0
  9. datahub/ingestion/graph/client.py +71 -28
  10. datahub/ingestion/run/pipeline.py +6 -0
  11. datahub/ingestion/source/aws/glue.py +1 -1
  12. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  13. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  14. datahub/ingestion/source/bigquery_v2/queries.py +4 -4
  15. datahub/ingestion/source/common/subtypes.py +43 -0
  16. datahub/ingestion/source/dbt/dbt_common.py +1 -1
  17. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  18. datahub/ingestion/source/hex/api.py +26 -1
  19. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  20. datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -15
  21. datahub/ingestion/source/salesforce.py +6 -3
  22. datahub/ingestion/source/slack/slack.py +2 -1
  23. datahub/ingestion/source/snowflake/snowflake_queries.py +1 -0
  24. datahub/ingestion/source/sql/athena.py +15 -3
  25. datahub/ingestion/source/sql/mssql/source.py +9 -0
  26. datahub/ingestion/source/sql/sql_common.py +3 -0
  27. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  28. datahub/ingestion/source/sql/teradata.py +4 -1
  29. datahub/ingestion/source/sql/vertica.py +9 -1
  30. datahub/ingestion/source/tableau/tableau.py +6 -1
  31. datahub/ingestion/source/unity/source.py +36 -20
  32. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  33. datahub/metadata/_internal_schema_classes.py +601 -0
  34. datahub/metadata/_urns/urn_defs.py +112 -0
  35. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  36. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  37. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  38. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  39. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  40. datahub/metadata/schema.avsc +383 -0
  41. datahub/metadata/schemas/CorpUserSettings.avsc +25 -0
  42. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  43. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +202 -0
  44. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  45. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  46. datahub/metadata/schemas/GlobalSettingsInfo.avsc +25 -0
  47. datahub/sdk/datajob.py +39 -15
  48. datahub/specific/dataproduct.py +4 -0
  49. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/WHEEL +0 -0
  50. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/licenses/LICENSE +0 -0
  52. {acryl_datahub-1.1.0.5rc3.dist-info → acryl_datahub-1.1.0.5rc5.dist-info}/top_level.txt +0 -0

datahub/ingestion/api/source.py

@@ -2,7 +2,6 @@ import contextlib
 import datetime
 import logging
 from abc import ABCMeta, abstractmethod
-from collections import defaultdict
 from dataclasses import dataclass, field
 from enum import Enum
 from functools import partial
@@ -15,7 +14,6 @@ from typing import (
     List,
     Optional,
     Sequence,
-    Set,
     Type,
     TypeVar,
     Union,
@@ -28,7 +26,6 @@ from typing_extensions import LiteralString, Self
 from datahub.configuration.common import ConfigModel
 from datahub.configuration.source_common import PlatformInstanceConfigMixin
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
     auto_patch_last_modified,
 )
@@ -37,7 +34,7 @@ from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
 )
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
-from datahub.ingestion.api.report import Report
+from datahub.ingestion.api.report import ExamplesReport, Report
 from datahub.ingestion.api.source_helpers import (
     AutoSystemMetadata,
     auto_browse_path_v2,
@@ -50,9 +47,8 @@ from datahub.ingestion.api.source_helpers import (
     auto_workunit_reporter,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
-from datahub.metadata.schema_classes import UpstreamLineageClass
 from datahub.sdk.entity import Entity
+from datahub.telemetry import stats
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 from datahub.utilities.type_annotations import get_class_from_annotation

@@ -191,20 +187,11 @@ class StructuredLogs(Report):


 @dataclass
-class SourceReport(Report):
+class SourceReport(ExamplesReport):
     event_not_produced_warn: bool = True
     events_produced: int = 0
     events_produced_per_sec: int = 0

-    _urns_seen: Set[str] = field(default_factory=set)
-    entities: Dict[str, list] = field(default_factory=lambda: defaultdict(LossyList))
-    aspects: Dict[str, Dict[str, int]] = field(
-        default_factory=lambda: defaultdict(lambda: defaultdict(int))
-    )
-    aspect_urn_samples: Dict[str, Dict[str, LossyList[str]]] = field(
-        default_factory=lambda: defaultdict(lambda: defaultdict(LossyList))
-    )
-
     _structured_logs: StructuredLogs = field(default_factory=StructuredLogs)

     @property
@@ -221,34 +208,10 @@ class SourceReport(Report):

     def report_workunit(self, wu: WorkUnit) -> None:
         self.events_produced += 1
+        if not isinstance(wu, MetadataWorkUnit):
+            return

-        if isinstance(wu, MetadataWorkUnit):
-            urn = wu.get_urn()
-
-            # Specialized entity reporting.
-            if not isinstance(wu.metadata, MetadataChangeEvent):
-                mcps = [wu.metadata]
-            else:
-                mcps = list(mcps_from_mce(wu.metadata))
-
-            for mcp in mcps:
-                entityType = mcp.entityType
-                aspectName = mcp.aspectName
-
-                if urn not in self._urns_seen:
-                    self._urns_seen.add(urn)
-                    self.entities[entityType].append(urn)
-
-                if aspectName is not None:  # usually true
-                    self.aspects[entityType][aspectName] += 1
-                    self.aspect_urn_samples[entityType][aspectName].append(urn)
-                    if isinstance(mcp.aspect, UpstreamLineageClass):
-                        upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
-                        if upstream_lineage.fineGrainedLineages:
-                            self.aspect_urn_samples[entityType][
-                                "fineGrainedLineages"
-                            ].append(urn)
-                            self.aspects[entityType]["fineGrainedLineages"] += 1
+        super()._store_workunit_data(wu)

     def report_warning(
         self,
@@ -327,6 +290,7 @@ class SourceReport(Report):
         )

     def __post_init__(self) -> None:
+        super().__post_init__()
         self.start_time = datetime.datetime.now()
         self.running_time: datetime.timedelta = datetime.timedelta(seconds=0)

@@ -339,6 +303,43 @@ class SourceReport(Report):
             "infos": Report.to_pure_python_obj(self.infos),
         }

+    @staticmethod
+    def _discretize_dict_values(
+        nested_dict: Dict[str, Dict[str, int]],
+    ) -> Dict[str, Dict[str, int]]:
+        """Helper method to discretize values in a nested dictionary structure."""
+        result = {}
+        for outer_key, inner_dict in nested_dict.items():
+            discretized_dict: Dict[str, int] = {}
+            for inner_key, count in inner_dict.items():
+                discretized_dict[inner_key] = stats.discretize(count)
+            result[outer_key] = discretized_dict
+        return result
+
+    def get_aspects_dict(self) -> Dict[str, Dict[str, int]]:
+        """Convert the nested defaultdict aspects to a regular dict for serialization."""
+        return self._discretize_dict_values(self.aspects)
+
+    def get_aspects_by_subtypes_dict(self) -> Dict[str, Dict[str, Dict[str, int]]]:
+        """Get aspect counts grouped by entity type and subtype."""
+        return self._discretize_dict_values_nested(self.aspects_by_subtypes)
+
+    @staticmethod
+    def _discretize_dict_values_nested(
+        nested_dict: Dict[str, Dict[str, Dict[str, int]]],
+    ) -> Dict[str, Dict[str, Dict[str, int]]]:
+        """Helper method to discretize values in a nested dictionary structure with three levels."""
+        result = {}
+        for outer_key, middle_dict in nested_dict.items():
+            discretized_middle_dict: Dict[str, Dict[str, int]] = {}
+            for middle_key, inner_dict in middle_dict.items():
+                discretized_inner_dict: Dict[str, int] = {}
+                for inner_key, count in inner_dict.items():
+                    discretized_inner_dict[inner_key] = stats.discretize(count)
+                discretized_middle_dict[middle_key] = discretized_inner_dict
+            result[outer_key] = discretized_middle_dict
+        return result
+
     def compute_stats(self) -> None:
         super().compute_stats()

@@ -505,7 +506,7 @@ class Source(Closeable, metaclass=ABCMeta):
         pass

     def close(self) -> None:
-        pass
+        self.get_report().close()

     def _infer_platform(self) -> Optional[str]:
         config = self.get_config()
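
The discretization helpers added to SourceReport above only reshape nested count dictionaries; the actual bucketing is delegated to stats.discretize from datahub.telemetry. A minimal standalone sketch of the same reshaping idea, using a hypothetical bucket_count stand-in rather than the real discretize function:

    from typing import Dict

    def bucket_count(count: int) -> int:
        # Hypothetical stand-in for stats.discretize: round a raw count down to
        # its nearest power of ten so telemetry never carries exact figures.
        if count <= 0:
            return 0
        bucket = 1
        while bucket * 10 <= count:
            bucket *= 10
        return bucket

    def discretize_nested(counts: Dict[str, Dict[str, int]]) -> Dict[str, Dict[str, int]]:
        # Same shape-preserving walk as SourceReport._discretize_dict_values.
        return {
            entity: {aspect: bucket_count(n) for aspect, n in per_aspect.items()}
            for entity, per_aspect in counts.items()
        }

    aspects = {"dataset": {"schemaMetadata": 1342, "status": 7}}
    print(discretize_nested(aspects))  # {'dataset': {'schemaMetadata': 1000, 'status': 1}}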

datahub/ingestion/autogenerated/lineage_helper.py (new file)

@@ -0,0 +1,193 @@
+import json
+import logging
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set
+
+from datahub.utilities.urns.urn import guess_entity_type
+
+logger = logging.getLogger(__name__)
+
+# Global cache for lineage data to avoid repeated file reads
+_lineage_data: Optional[Dict] = None
+
+
+def _load_lineage_data() -> Dict:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Load lineage data from the autogenerated lineage.json file.
+
+    Returns:
+        Dict containing the lineage information
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    global _lineage_data
+
+    if _lineage_data is not None:
+        return _lineage_data
+
+    # Get the path to lineage.json relative to this file
+    current_file = Path(__file__)
+    lineage_file = current_file.parent / "lineage.json"
+
+    if not lineage_file.exists():
+        raise FileNotFoundError(f"Lineage file not found: {lineage_file}")
+
+    try:
+        with open(lineage_file, "r") as f:
+            _lineage_data = json.load(f)
+            return _lineage_data
+    except json.JSONDecodeError as e:
+        raise json.JSONDecodeError(
+            f"Failed to parse lineage.json: {e}", e.doc, e.pos
+        ) from e
+
+
+def get_lineage_fields(entity_type: str, aspect_name: str) -> List[Dict]:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Get lineage fields for a specific entity type and aspect.
+
+    Args:
+        entity_type: The entity type (e.g., 'dataset', 'dataJob')
+        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+
+    Returns:
+        List of lineage field dictionaries, each containing:
+        - name: field name
+        - path: dot-notation path to the field
+        - isLineage: boolean indicating if it's lineage
+        - relationship: relationship information
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    lineage_data = _load_lineage_data()
+
+    entity_data = lineage_data.get("entities", {}).get(entity_type, {})
+    aspect_data = entity_data.get(aspect_name, {})
+
+    return aspect_data.get("fields", [])
+
+
+def is_lineage_field(urn: str, aspect_name: str, field_path: str) -> bool:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Check if a specific field path is lineage-related.
+
+    Args:
+        urn: The entity URN (e.g., 'urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)')
+        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+        field_path: The dot-notation path to the field (e.g., 'upstreams.dataset')
+
+    Returns:
+        True if the field is lineage-related, False otherwise
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+        AssertionError: If URN doesn't start with 'urn:li:'
+    """
+    entity_type = guess_entity_type(urn)
+    lineage_fields = get_lineage_fields(entity_type, aspect_name)
+
+    for field in lineage_fields:
+        if field.get("path") == field_path:
+            return field.get("isLineage", False)
+
+    return False
+
+
+def has_lineage(urn: str, aspect: Any) -> bool:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Check if an aspect has any lineage fields.
+
+    Args:
+        urn: The entity URN (e.g., 'urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)')
+        aspect: The aspect object
+
+    Returns:
+        True if the aspect has lineage fields, False otherwise
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+        AssertionError: If URN doesn't start with 'urn:li:'
+    """
+    entity_type = guess_entity_type(urn)
+    aspect_class = getattr(aspect, "__class__", None)
+    aspect_name = (
+        aspect_class.__name__ if aspect_class is not None else str(type(aspect))
+    )
+
+    lineage_fields = get_lineage_fields(entity_type, aspect_name)
+    return len(lineage_fields) > 0
+
+
+def has_lineage_aspect(entity_type: str, aspect_name: str) -> bool:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Check if an aspect has any lineage fields.
+
+    Args:
+        entity_type: The entity type (e.g., 'dataset', 'dataJob')
+        aspect_name: The aspect name (e.g., 'upstreamLineage', 'dataJobInputOutput')
+
+    Returns:
+        True if the aspect has lineage fields, False otherwise
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    lineage_fields = get_lineage_fields(entity_type, aspect_name)
+    return len(lineage_fields) > 0
+
+
+def get_all_lineage_aspects(entity_type: str) -> Set[str]:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Get all aspects that have lineage fields for a given entity type.
+
+    Args:
+        entity_type: The entity type (e.g., 'dataset', 'dataJob')
+
+    Returns:
+        Set of aspect names that have lineage fields
+
+    Raises:
+        FileNotFoundError: If lineage.json doesn't exist
+        json.JSONDecodeError: If lineage.json is malformed
+    """
+    lineage_data = _load_lineage_data()
+
+    entity_data = lineage_data.get("entities", {}).get(entity_type, {})
+    lineage_aspects = set()
+
+    for aspect_name, aspect_data in entity_data.items():
+        if aspect_data.get("fields"):
+            lineage_aspects.add(aspect_name)
+
+    return lineage_aspects
+
+
+def clear_cache() -> None:
+    """
+    This is experimental internal API subject to breaking changes without prior notice.
+
+    Clear the internal cache of lineage data.
+
+    This is useful for testing or when the lineage.json file has been updated.
+    """
+    global _lineage_data
+    _lineage_data = None
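
The new lineage_helper module is driven entirely by the bundled lineage.json, so callers only need an entity type (or a URN to infer it from) and an aspect name. A hedged usage sketch; the example URN and field path are taken from the docstrings above, and the returned aspect names depend on what lineage.json actually contains:

    from datahub.ingestion.autogenerated import lineage_helper

    # Which aspects of a dataset carry lineage fields?
    print(lineage_helper.get_all_lineage_aspects("dataset"))

    # Is a specific field path lineage-bearing for this entity's aspect?
    urn = "urn:li:dataset:(urn:li:dataPlatform:mysql,test_db.test_table,PROD)"
    print(lineage_helper.is_lineage_field(urn, "upstreamLineage", "upstreams.dataset"))

    # Drop the module-level cache after regenerating lineage.json (useful in tests).
    lineage_helper.clear_cache()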

datahub/ingestion/graph/client.py

@@ -22,6 +22,7 @@ from typing import (
     Union,
 )

+import progressbar
 from avro.schema import RecordSchema
 from pydantic import BaseModel
 from requests.models import HTTPError
@@ -504,7 +505,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "limit": limit,
             "filter": filter,
         }
-        end_point = f"{self.config.server}/aspects?action=getTimeseriesAspectValues"
+        end_point = f"{self._gms_server}/aspects?action=getTimeseriesAspectValues"
         resp: Dict = self._post_generic(end_point, query_body)

         values: Optional[List] = resp.get("value", {}).get("values")
@@ -524,7 +525,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def get_entity_raw(
         self, entity_urn: str, aspects: Optional[List[str]] = None
     ) -> Dict:
-        endpoint: str = f"{self.config.server}/entitiesV2/{Urn.url_encode(entity_urn)}"
+        endpoint: str = f"{self._gms_server}/entitiesV2/{Urn.url_encode(entity_urn)}"
         if aspects is not None:
             assert aspects, "if provided, aspects must be a non-empty list"
             endpoint = f"{endpoint}?aspects=List(" + ",".join(aspects) + ")"
@@ -654,15 +655,15 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):

     @property
     def _search_endpoint(self):
-        return f"{self.config.server}/entities?action=search"
+        return f"{self._gms_server}/entities?action=search"

     @property
     def _relationships_endpoint(self):
-        return f"{self.config.server}/openapi/relationships/v1/"
+        return f"{self._gms_server}/openapi/relationships/v1/"

     @property
     def _aspect_count_endpoint(self):
-        return f"{self.config.server}/aspects?action=getCount"
+        return f"{self._gms_server}/aspects?action=getCount"

     def get_domain_urn_by_name(self, domain_name: str) -> Optional[str]:
         """Retrieve a domain urn based on its name. Returns None if there is no match found"""
@@ -1209,7 +1210,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         operation_name: Optional[str] = None,
         format_exception: bool = True,
     ) -> Dict:
-        url = f"{self.config.server}/api/graphql"
+        url = f"{self._gms_server}/api/graphql"

         body: Dict = {
             "query": query,
@@ -1434,40 +1435,82 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         related_aspects = response.get("relatedAspects", [])
         return reference_count, related_aspects

+    def get_kafka_consumer_offsets(
+        self,
+    ) -> dict:
+        """
+        Get Kafka consumer offsets from the DataHub API.
+
+        Args:
+            graph (DataHubGraph): The DataHub graph client
+
+        """
+        urls = {
+            "mcp": f"{self.config.server}/openapi/operations/kafka/mcp/consumer/offsets",
+            "mcl": f"{self.config.server}/openapi/operations/kafka/mcl/consumer/offsets",
+            "mcl-timeseries": f"{self.config.server}/openapi/operations/kafka/mcl-timeseries/consumer/offsets",
+        }
+
+        params = {"skipCache": "true", "detailed": "true"}
+        results = {}
+        for key, url in urls.items():
+            response = self._get_generic(url=url, params=params)
+            results[key] = response
+            if "errors" in response:
+                logger.error(f"Error: {response['errors']}")
+        return results
+
+    def _restore_index_call(self, payload_obj: dict) -> None:
+        result = self._post_generic(
+            f"{self._gms_server}/operations?action=restoreIndices", payload_obj
+        )
+        logger.debug(f"Restore indices result: {result}")
+
     def restore_indices(
         self,
-        urn_pattern: str,
+        urn_pattern: Optional[str] = None,
         aspect: Optional[str] = None,
         start: Optional[int] = None,
         batch_size: Optional[int] = None,
-    ) -> str:
+        file: Optional[str] = None,
+    ) -> None:
         """Restore the indices for a given urn or urn-like pattern.

         Args:
-            urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs.
+            urn_pattern: The exact URN or a pattern (with % for wildcard) to match URNs. If not provided, will restore indices from the file.
             aspect: Optional aspect string to restore indices for a specific aspect.
-            start: Optional integer to decide which row number of sql store to restore from. Default: 0.
-            batch_size: Optional integer to decide how many rows to restore. Default: 10.
+            start: Optional integer to decide which row number of sql store to restore from. Default: 0. Ignored in case file is provided.
+            batch_size: Optional integer to decide how many rows to restore. Default: 10. Ignored in case file is provided.
+            file: Optional file path to a file containing URNs to restore indices for.

         Returns:
             A string containing the result of the restore indices operation. This format is subject to change.
         """
-        if "%" in urn_pattern:
-            payload_obj: dict = {"urnLike": urn_pattern}
+        payload_obj = {}
+        if file is not None:
+            with open(file) as f:
+                for urn in progressbar.progressbar(f.readlines()):
+                    urn = urn.strip()
+                    if "%" in urn:
+                        payload_obj["urnLike"] = urn
+                    else:
+                        payload_obj["urn"] = urn
+                    if aspect is not None:
+                        payload_obj["aspect"] = aspect
+                    self._restore_index_call(payload_obj)
         else:
-            payload_obj = {"urn": urn_pattern}
-        if aspect is not None:
-            payload_obj["aspect"] = aspect
-        if start is not None:
-            payload_obj["start"] = start
-        if batch_size is not None:
-            payload_obj["batchSize"] = batch_size
-        raw_result = self._post_generic(
-            f"{self._gms_server}/operations?action=restoreIndices", payload_obj
-        )
-        result = raw_result["value"]
-        logger.debug(f"Restore indices result: {result}")
-        return result
+            if urn_pattern is not None:
+                if "%" in urn_pattern:
+                    payload_obj["urnLike"] = urn_pattern
+                else:
+                    payload_obj["urn"] = urn_pattern
+                if aspect is not None:
+                    payload_obj["aspect"] = aspect
+                if start is not None:
+                    payload_obj["start"] = start
+                if batch_size is not None:
+                    payload_obj["batchSize"] = batch_size
+                self._restore_index_call(payload_obj)

     @functools.lru_cache
     def _make_schema_resolver(
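
restore_indices now accepts either a single urn_pattern or a file of URNs (one per line, iterated with a progress bar), and get_kafka_consumer_offsets queries the three OpenAPI consumer-offset endpoints. A rough usage sketch against a DataHubGraph client; the server address, wildcard pattern, and file name are placeholders:

    from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

    graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

    # Restore indices for every URN matching a %-wildcard pattern.
    graph.restore_indices(
        urn_pattern="urn:li:dataset:(urn:li:dataPlatform:mysql,%", batch_size=100
    )

    # Or drive the restore from a file containing one URN (or pattern) per line.
    graph.restore_indices(file="urns_to_restore.txt")

    # Inspect MCP/MCL consumer offsets as reported by GMS.
    offsets = graph.get_kafka_consumer_offsets()
    print(offsets["mcp"])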

datahub/ingestion/graph/client.py (continued)

@@ -1774,7 +1817,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "Accept": "application/json",
             "Content-Type": "application/json",
         }
-        url = f"{self.config.server}/openapi/v2/entity/batch/{entity_name}"
+        url = f"{self._gms_server}/openapi/v2/entity/batch/{entity_name}"
         response = self._session.post(url, data=json.dumps(payload), headers=headers)
         response.raise_for_status()

@@ -1831,7 +1874,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
             "Content-Type": "application/json",
         }

-        url = f"{self.config.server}/openapi/v3/entity/{entity_name}/batchGet"
+        url = f"{self._gms_server}/openapi/v3/entity/{entity_name}/batchGet"
         if with_system_metadata:
             url += "?systemMetadata=true"


datahub/ingestion/run/pipeline.py

@@ -578,11 +578,17 @@ class Pipeline:
         sink_failures = len(self.sink.get_report().failures)
         sink_warnings = len(self.sink.get_report().warnings)
         global_warnings = len(get_global_warnings())
+        source_aspects = self.source.get_report().get_aspects_dict()
+        source_aspects_by_subtype = (
+            self.source.get_report().get_aspects_by_subtypes_dict()
+        )

         telemetry_instance.ping(
             "ingest_stats",
             {
                 "source_type": self.source_type,
+                "source_aspects": source_aspects,
+                "source_aspects_by_subtype": source_aspects_by_subtype,
                 "sink_type": self.sink_type,
                 "transformer_types": [
                     transformer.type for transformer in self.config.transformers or []

datahub/ingestion/source/aws/glue.py

@@ -269,7 +269,7 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Enabled by default when stateful ingestion is turned on.",
+    "Enabled by default via stateful ingestion.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(

datahub/ingestion/source/bigquery_v2/bigquery_queries.py

@@ -94,3 +94,4 @@ class BigQueryQueriesSource(Source):
     def close(self) -> None:
         self.queries_extractor.close()
         self.connection.close()
+        super().close()

datahub/ingestion/source/bigquery_v2/profiler.py

@@ -189,6 +189,7 @@ WHERE

         if len(profile_requests) == 0:
             return
+
         yield from self.generate_profile_workunits(
             profile_requests,
             max_workers=self.config.profiling.max_workers,
@@ -226,10 +227,11 @@ WHERE
             db_name, schema_name, bq_table, self.config.profiling.partition_datetime
         )

-        if partition is None and bq_table.partition_info:
+        # For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
+        if partition is None and bq_table.partition_info and bq_table.rows_count:
             self.report.report_warning(
                 title="Profile skipped for partitioned table",
-                message="profile skipped as partitioned table is empty or partition id or type was invalid",
+                message="profile skipped as partition id or type was invalid",
                 context=profile_request.pretty_name,
             )
             return None

datahub/ingestion/source/bigquery_v2/queries.py

@@ -45,12 +45,12 @@ SELECT
   tos.OPTION_VALUE as comment,
   t.is_insertable_into,
   t.ddl,
-  ts.row_count,
-  ts.size_bytes as bytes,
+  ts.row_count as row_count,
+  ts.size_bytes as size_bytes,
   p.num_partitions,
   p.max_partition_id,
-  p.active_billable_bytes,
-  p.long_term_billable_bytes,
+  p.active_billable_bytes as active_billable_bytes,
+  -- IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
   REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
   REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base


datahub/ingestion/source/common/subtypes.py

@@ -1,5 +1,10 @@
+import logging
+from typing import Any, Dict
+
 from datahub.utilities.str_enum import StrEnum

+logger = logging.getLogger(__name__)
+

 class DatasetSubTypes(StrEnum):
     # Generic SubTypes
@@ -26,6 +31,8 @@ class DatasetSubTypes(StrEnum):
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
     API_ENDPOINT = "API Endpoint"
+    SLACK_CHANNEL = "Slack Channel"
+    PROJECTIONS = "Projections"

     # TODO: Create separate entity...
     NOTEBOOK = "Notebook"
@@ -74,6 +81,9 @@ class JobContainerSubTypes(StrEnum):


 class BIAssetSubTypes(StrEnum):
+    DASHBOARD = "Dashboard"
+    CHART = "Chart"
+
     # Generic SubTypes
     REPORT = "Report"

@@ -116,3 +126,36 @@ class MLAssetSubTypes(StrEnum):
     VERTEX_PIPELINE = "Pipeline Job"
     VERTEX_PIPELINE_TASK = "Pipeline Task"
     VERTEX_PIPELINE_TASK_RUN = "Pipeline Task Run"
+
+
+def create_source_capability_modifier_enum():
+    all_values: Dict[str, Any] = {}
+    source_enums = [
+        DatasetSubTypes,
+        DatasetContainerSubTypes,
+        BIContainerSubTypes,
+        FlowContainerSubTypes,
+        JobContainerSubTypes,
+        BIAssetSubTypes,
+        MLAssetSubTypes,
+    ]
+
+    for enum_class in source_enums:
+        for member in enum_class:  # type: ignore[var-annotated]
+            if member.name in all_values:
+                logger.error(
+                    f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
+                )
+                continue
+            all_values[member.name] = member.value
+
+    enum_code = "class SourceCapabilityModifier(StrEnum):\n"
+    for name, value in all_values.items():
+        enum_code += f'    {name} = "{value}"\n'
+
+    exec(enum_code, globals())
+    return globals()["SourceCapabilityModifier"]
+
+
+# This will have all values from the enums above
+SourceCapabilityModifier = create_source_capability_modifier_enum()
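
SourceCapabilityModifier is built at import time by exec-ing a StrEnum whose members are the union of the subtype enums above, with duplicate member names logged and skipped. A brief, hedged usage sketch; SLACK_CHANNEL is grounded in the additions in this diff, and the exec-free alternative shown is just an illustration of the same merge using Enum's functional API:

    from enum import Enum

    from datahub.ingestion.source.common.subtypes import (
        DatasetSubTypes,
        SourceCapabilityModifier,
    )

    # Members mirror the source enums, so subtype values can be referenced uniformly.
    assert SourceCapabilityModifier.SLACK_CHANNEL.value == DatasetSubTypes.SLACK_CHANNEL.value

    # An exec-free way to merge enum members, via the functional Enum API:
    Merged = Enum(
        "Merged",
        {m.name: m.value for e in (DatasetSubTypes,) for m in e},
        type=str,
    )
    print(Merged.SLACK_CHANNEL.value)  # "Slack Channel"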

datahub/ingestion/source/dbt/dbt_common.py

@@ -355,7 +355,7 @@ class DBTCommonConfig(
     # override default value to True.
     incremental_lineage: bool = Field(
         default=True,
-        description="When enabled, emits incremental/patch lineage for non-dbt entities. When disabled, re-states lineage on each run.",
+        description="When enabled, emits incremental/patch lineage for non-dbt entities. When disabled, re-states lineage on each run. This would also require enabling 'incremental_lineage' in the counterpart warehouse ingestion (_e.g._ BigQuery, Redshift, etc).",
     )

     _remove_use_compiled_code = pydantic_removed_field("use_compiled_code")
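
The expanded description notes that dbt's patch-style lineage only composes cleanly when the counterpart warehouse source also emits incremental lineage. A hedged sketch of the paired recipe configs as Python dicts; the paths, project id, and the warehouse-side incremental_lineage flag are illustrative assumptions, not taken from this diff:

    # dbt recipe: emit patch (incremental) lineage for non-dbt entities.
    dbt_recipe = {
        "source": {
            "type": "dbt",
            "config": {
                "manifest_path": "./target/manifest.json",
                "catalog_path": "./target/catalog.json",
                "target_platform": "bigquery",
                "incremental_lineage": True,
            },
        },
        "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
    }

    # Counterpart warehouse recipe: assumed to expose the same flag, per the description above.
    bigquery_recipe = {
        "source": {
            "type": "bigquery",
            "config": {
                "project_ids": ["my-project"],
                "incremental_lineage": True,
            },
        },
        "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
    }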