acryl-datahub 0.15.0.3rc1__py3-none-any.whl → 0.15.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of acryl-datahub might be problematic.
Files changed (36)
  1. acryl_datahub-0.15.0.4.dist-info/LICENSE +202 -0
  2. {acryl_datahub-0.15.0.3rc1.dist-info → acryl_datahub-0.15.0.4.dist-info}/METADATA +2411 -2408
  3. {acryl_datahub-0.15.0.3rc1.dist-info → acryl_datahub-0.15.0.4.dist-info}/RECORD +36 -33
  4. datahub/__init__.py +1 -1
  5. datahub/cli/container_cli.py +108 -0
  6. datahub/emitter/enum_helpers.py +4 -2
  7. datahub/emitter/mce_builder.py +4 -0
  8. datahub/emitter/mcp_builder.py +19 -0
  9. datahub/entrypoints.py +2 -0
  10. datahub/ingestion/api/decorators.py +2 -0
  11. datahub/ingestion/api/registry.py +3 -1
  12. datahub/ingestion/api/sink.py +12 -0
  13. datahub/ingestion/api/source.py +5 -2
  14. datahub/ingestion/source/aws/glue.py +11 -5
  15. datahub/ingestion/source/aws/s3_util.py +1 -24
  16. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -2
  17. datahub/ingestion/source/dbt/dbt_common.py +2 -2
  18. datahub/ingestion/source/powerbi/powerbi.py +4 -4
  19. datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +6 -6
  20. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +24 -18
  21. datahub/ingestion/source/s3/source.py +6 -2
  22. datahub/ingestion/source/slack/slack.py +6 -0
  23. datahub/ingestion/source/sql/hive_metastore.py +3 -3
  24. datahub/ingestion/source/sql/mssql/job_models.py +2 -2
  25. datahub/ingestion/source/sql/mssql/source.py +26 -11
  26. datahub/ingestion/source/sql/teradata.py +2 -2
  27. datahub/ingestion/source/tableau/tableau.py +23 -10
  28. datahub/metadata/_schema_classes.py +401 -401
  29. datahub/metadata/_urns/urn_defs.py +1857 -1408
  30. datahub/metadata/schema.avsc +16624 -16266
  31. datahub/sql_parsing/sql_parsing_aggregator.py +3 -3
  32. datahub/utilities/groupby.py +17 -0
  33. datahub/utilities/urns/_urn_base.py +6 -2
  34. {acryl_datahub-0.15.0.3rc1.dist-info → acryl_datahub-0.15.0.4.dist-info}/WHEEL +0 -0
  35. {acryl_datahub-0.15.0.3rc1.dist-info → acryl_datahub-0.15.0.4.dist-info}/entry_points.txt +0 -0
  36. {acryl_datahub-0.15.0.3rc1.dist-info → acryl_datahub-0.15.0.4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py
@@ -71,13 +71,13 @@ class Workspace:
     id: str
     name: str
     type: str  # This is used as a subtype of the Container entity.
-    dashboards: List["Dashboard"]
-    reports: List["Report"]
-    datasets: Dict[str, "PowerBIDataset"]
-    report_endorsements: Dict[str, List[str]]
-    dashboard_endorsements: Dict[str, List[str]]
+    dashboards: Dict[str, "Dashboard"]  # key = dashboard id
+    reports: Dict[str, "Report"]  # key = report id
+    datasets: Dict[str, "PowerBIDataset"]  # key = dataset id
+    report_endorsements: Dict[str, List[str]]  # key = report id
+    dashboard_endorsements: Dict[str, List[str]]  # key = dashboard id
     scan_result: dict
-    independent_datasets: List["PowerBIDataset"]
+    independent_datasets: Dict[str, "PowerBIDataset"]  # key = dataset id
     app: Optional["App"]

     def get_urn_part(self, workspace_id_as_urn_part: Optional[bool] = False) -> str:
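
Note on the Workspace change above: keying dashboards, reports, and independent_datasets by id turns linear scans into direct lookups. A minimal, self-contained sketch of the access pattern this enables (the Report fields and the get_report helper below are illustrative only, not code from the release):

from dataclasses import dataclass, field
from typing import Dict, Optional


@dataclass
class Report:
    id: str
    name: str


@dataclass
class Workspace:
    id: str
    name: str
    # key = report id, mirroring the new Dict-based fields in the diff
    reports: Dict[str, Report] = field(default_factory=dict)

    def get_report(self, report_id: str) -> Optional[Report]:
        # O(1) lookup by id; with List["Report"] this would be a linear scan
        return self.reports.get(report_id)


ws = Workspace(id="w1", name="Finance")
ws.reports["r1"] = Report(id="r1", name="Quarterly revenue")
assert ws.get_report("r1") is not None
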
datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py
@@ -193,15 +193,18 @@ class PowerBiAPI:
     def get_report_users(self, workspace_id: str, report_id: str) -> List[User]:
         return self._get_entity_users(workspace_id, Constant.REPORTS, report_id)

-    def get_reports(self, workspace: Workspace) -> List[Report]:
+    def get_reports(self, workspace: Workspace) -> Dict[str, Report]:
         """
         Fetch the report from PowerBi for the given Workspace
         """
-        reports: List[Report] = []
+        reports: Dict[str, Report] = {}
         try:
-            reports = self._get_resolver().get_reports(workspace)
+            reports = {
+                report.id: report
+                for report in self._get_resolver().get_reports(workspace)
+            }
             # Fill Report dataset
-            for report in reports:
+            for report in reports.values():
                 if report.dataset_id:
                     report.dataset = self.dataset_registry.get(report.dataset_id)
                     if report.dataset is None:
@@ -222,7 +225,7 @@ class PowerBiAPI:
                )
                return

-            for report in reports:
+            for report in reports.values():
                report.users = self.get_report_users(
                    workspace_id=workspace.id, report_id=report.id
                )
@@ -234,7 +237,7 @@ class PowerBiAPI:
                )
                return

-            for report in reports:
+            for report in reports.values():
                report.tags = workspace.report_endorsements.get(report.id, [])

        fill_ownership()
@@ -270,12 +273,12 @@ class PowerBiAPI:
                name=workspace[Constant.NAME],
                type=workspace[Constant.TYPE],
                datasets={},
-                dashboards=[],
-                reports=[],
+                dashboards={},
+                reports={},
                report_endorsements={},
                dashboard_endorsements={},
                scan_result={},
-                independent_datasets=[],
+                independent_datasets={},
                app=None,  # It will be populated in _fill_metadata_from_scan_result method
            )
            for workspace in groups
@@ -561,12 +564,12 @@ class PowerBiAPI:
            name=workspace_metadata[Constant.NAME],
            type=workspace_metadata[Constant.TYPE],
            datasets={},
-            dashboards=[],
-            reports=[],
+            dashboards={},
+            reports={},
            report_endorsements={},
            dashboard_endorsements={},
            scan_result={},
-            independent_datasets=[],
+            independent_datasets={},
            app=None,  # It is getting set from scan-result
        )
        cur_workspace.scan_result = workspace_metadata
@@ -597,25 +600,28 @@ class PowerBiAPI:
     def _fill_independent_datasets(self, workspace: Workspace) -> None:
         reachable_datasets: List[str] = []
         # Find out reachable datasets
-        for dashboard in workspace.dashboards:
+        for dashboard in workspace.dashboards.values():
             for tile in dashboard.tiles:
                 if tile.dataset is not None:
                     reachable_datasets.append(tile.dataset.id)

-        for report in workspace.reports:
+        for report in workspace.reports.values():
             if report.dataset is not None:
                 reachable_datasets.append(report.dataset.id)

         # Set datasets not present in reachable_datasets
         for dataset in workspace.datasets.values():
             if dataset.id not in reachable_datasets:
-                workspace.independent_datasets.append(dataset)
+                workspace.independent_datasets[dataset.id] = dataset

     def _fill_regular_metadata_detail(self, workspace: Workspace) -> None:
         def fill_dashboards() -> None:
-            workspace.dashboards = self._get_resolver().get_dashboards(workspace)
+            workspace.dashboards = {
+                dashboard.id: dashboard
+                for dashboard in self._get_resolver().get_dashboards(workspace)
+            }
             # set tiles of Dashboard
-            for dashboard in workspace.dashboards:
+            for dashboard in workspace.dashboards.values():
                 dashboard.tiles = self._get_resolver().get_tiles(
                     workspace, dashboard=dashboard
                 )
@@ -644,7 +650,7 @@ class PowerBiAPI:
                    "Skipping tag retrieval for dashboard as extract_endorsements_to_tags is set to false"
                )
                return
-            for dashboard in workspace.dashboards:
+            for dashboard in workspace.dashboards.values():
                dashboard.tags = workspace.dashboard_endorsements.get(dashboard.id, [])

        if self.__config.extract_dashboards:
datahub/ingestion/source/s3/source.py
@@ -40,7 +40,6 @@ from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
     get_key_prefix,
-    group_s3_objects_by_dirname,
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
@@ -73,6 +72,7 @@ from datahub.metadata.schema_classes import (
     _Aspect,
 )
 from datahub.telemetry import stats, telemetry
+from datahub.utilities.groupby import groupby_unsorted
 from datahub.utilities.perf_timer import PerfTimer

 if TYPE_CHECKING:
@@ -868,7 +868,11 @@ class S3Source(StatefulIngestionSourceBase):
         """
         partitions: List[Folder] = []
         s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
-        for key, group in group_s3_objects_by_dirname(s3_objects).items():
+        grouped_s3_objects_by_dirname = groupby_unsorted(
+            s3_objects,
+            key=lambda obj: obj.key.rsplit("/", 1)[0],
+        )
+        for key, group in grouped_s3_objects_by_dirname:
             file_size = 0
             creation_time = None
             modification_time = None
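
The new datahub/utilities/groupby.py (+17) is not shown in this diff; the call above only tells us it takes an iterable and a key function and yields (key, group) pairs without requiring pre-sorted input. A rough sketch of what such a helper could look like, purely as an assumption about its behavior:

from collections import defaultdict
from typing import Callable, Dict, Iterable, List, Tuple, TypeVar

T = TypeVar("T")
K = TypeVar("K")


def groupby_unsorted(
    iterable: Iterable[T], key: Callable[[T], K]
) -> Iterable[Tuple[K, List[T]]]:
    """Group items by key without requiring the input to be sorted.

    Unlike itertools.groupby, items with the same key do not need to be
    adjacent; each key is yielded exactly once. (Sketch only; the real
    implementation in datahub/utilities/groupby.py may differ.)
    """
    groups: Dict[K, List[T]] = defaultdict(list)
    for item in iterable:
        groups[key(item)].append(item)
    yield from groups.items()


# Usage mirroring the S3 change above: group object keys by their "directory".
keys = ["a/x/1.csv", "a/y/2.csv", "a/x/3.csv"]
for dirname, objs in groupby_unsorted(keys, key=lambda k: k.rsplit("/", 1)[0]):
    print(dirname, objs)
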
datahub/ingestion/source/slack/slack.py
@@ -5,6 +5,8 @@ from typing import Iterable, List, Optional, Tuple

 from pydantic import Field, SecretStr
 from slack_sdk import WebClient
+from tenacity import retry, wait_exponential
+from tenacity.before_sleep import before_sleep_log

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import ConfigModel
@@ -294,6 +296,10 @@ class SlackSource(Source):
                return
            raise e

+    @retry(
+        wait=wait_exponential(multiplier=2, min=4, max=60),
+        before_sleep=before_sleep_log(logger, logging.ERROR, True),
+    )
    def get_user_to_be_updated(self) -> Iterable[CorpUser]:
        graphql_query = textwrap.dedent(
            """
datahub/ingestion/source/sql/hive_metastore.py
@@ -2,7 +2,6 @@ import base64
 import json
 import logging
 from collections import namedtuple
-from itertools import groupby
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 from pydantic.dataclasses import dataclass
@@ -58,6 +57,7 @@ from datahub.metadata.schema_classes import (
     SubTypesClass,
     ViewPropertiesClass,
 )
+from datahub.utilities.groupby import groupby_unsorted
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
 from datahub.utilities.str_enum import StrEnum

@@ -490,7 +490,7 @@ class HiveMetastoreSource(SQLAlchemySource):

         iter_res = self._alchemy_client.execute_query(statement)

-        for key, group in groupby(iter_res, self._get_table_key):
+        for key, group in groupby_unsorted(iter_res, self._get_table_key):
             schema_name = (
                 f"{db_name}.{key.schema}"
                 if self.config.include_catalog_name_in_ids
@@ -647,7 +647,7 @@ class HiveMetastoreSource(SQLAlchemySource):
         )

         iter_res = self._alchemy_client.execute_query(statement)
-        for key, group in groupby(iter_res, self._get_table_key):
+        for key, group in groupby_unsorted(iter_res, self._get_table_key):
             db_name = self.get_db_name(inspector)

             schema_name = (
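
For context on the groupby → groupby_unsorted swaps in this file (and in teradata.py below): itertools.groupby only merges adjacent items, so rows that are not pre-sorted by the grouping key come back as several fragments of the same group. A small standalone illustration of that failure mode, not code from the source:

from itertools import groupby

rows = [("s1", "t1"), ("s2", "t2"), ("s1", "t3")]  # not sorted by first field

# itertools.groupby splits "s1" into two separate groups
fragments = [(k, list(g)) for k, g in groupby(rows, key=lambda r: r[0])]
print(len(fragments))  # 3 groups: s1, s2, s1

# Sorting first (or using an order-insensitive helper) merges them properly
merged = [(k, list(g)) for k, g in groupby(sorted(rows), key=lambda r: r[0])]
print(len(merged))  # 2 groups: s1, s2
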
datahub/ingestion/source/sql/mssql/job_models.py
@@ -156,7 +156,7 @@ class MSSQLDataJob:
     entity: Union[StoredProcedure, JobStep]
     type: str = "dataJob"
     source: str = "mssql"
-    external_url: str = ""
+    external_url: Optional[str] = None
     description: Optional[str] = None
     status: Optional[str] = None
     incoming: List[str] = field(default_factory=list)
@@ -228,7 +228,7 @@ class MSSQLDataFlow:
     entity: Union[MSSQLJob, MSSQLProceduresContainer]
     type: str = "dataFlow"
     source: str = "mssql"
-    external_url: str = ""
+    external_url: Optional[str] = None
     flow_properties: Dict[str, str] = field(default_factory=dict)

     def add_property(
datahub/ingestion/source/sql/mssql/source.py
@@ -11,6 +11,7 @@ from sqlalchemy.engine.base import Connection
 from sqlalchemy.engine.reflection import Inspector
 from sqlalchemy.exc import ProgrammingError, ResourceClosedError

+import datahub.metadata.schema_classes as models
 from datahub.configuration.common import AllowDenyPattern
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -49,21 +50,15 @@ from datahub.ingestion.source.sql.sql_config import (
     make_sqlalchemy_uri,
 )
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub.metadata.schema_classes import (
-    BooleanTypeClass,
-    NumberTypeClass,
-    StringTypeClass,
-    UnionTypeClass,
-)
 from datahub.utilities.file_backed_collections import FileBackedList

 logger: logging.Logger = logging.getLogger(__name__)

-register_custom_type(sqlalchemy.dialects.mssql.BIT, BooleanTypeClass)
-register_custom_type(sqlalchemy.dialects.mssql.MONEY, NumberTypeClass)
-register_custom_type(sqlalchemy.dialects.mssql.SMALLMONEY, NumberTypeClass)
-register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, UnionTypeClass)
-register_custom_type(sqlalchemy.dialects.mssql.UNIQUEIDENTIFIER, StringTypeClass)
+register_custom_type(sqlalchemy.dialects.mssql.BIT, models.BooleanTypeClass)
+register_custom_type(sqlalchemy.dialects.mssql.MONEY, models.NumberTypeClass)
+register_custom_type(sqlalchemy.dialects.mssql.SMALLMONEY, models.NumberTypeClass)
+register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, models.UnionTypeClass)
+register_custom_type(sqlalchemy.dialects.mssql.UNIQUEIDENTIFIER, models.StringTypeClass)


 class SQLServerConfig(BasicSQLAlchemyConfig):
@@ -651,6 +646,26 @@ class SQLServerSource(SQLAlchemySource):
                entityUrn=data_job.urn,
                aspect=data_job.as_datajob_input_output_aspect,
            ).as_workunit()
+
+            if (
+                self.config.include_stored_procedures_code
+                and isinstance(data_job.entity, StoredProcedure)
+                and data_job.entity.code is not None
+            ):
+                yield MetadataChangeProposalWrapper(
+                    entityUrn=data_job.urn,
+                    aspect=models.DataTransformLogicClass(
+                        transforms=[
+                            models.DataTransformClass(
+                                queryStatement=models.QueryStatementClass(
+                                    value=data_job.entity.code,
+                                    language=models.QueryLanguageClass.SQL,
+                                ),
+                            )
+                        ]
+                    ),
+                ).as_workunit()
+
        # TODO: Add SubType when it appear

    def construct_flow_workunits(
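
The block added above attaches a stored procedure's SQL body to its dataJob entity via the dataTransformLogic aspect. A minimal standalone sketch of emitting the same aspect for a hypothetical procedure (the urn and SQL text below are placeholders, not taken from the diff):

import datahub.metadata.schema_classes as models
from datahub.emitter.mcp import MetadataChangeProposalWrapper

# Placeholder urn and SQL body, for illustration only.
job_urn = "urn:li:dataJob:(urn:li:dataFlow:(mssql,demo_flow,PROD),my_proc)"
procedure_code = "CREATE PROCEDURE my_proc AS SELECT 1;"

mcp = MetadataChangeProposalWrapper(
    entityUrn=job_urn,
    aspect=models.DataTransformLogicClass(
        transforms=[
            models.DataTransformClass(
                queryStatement=models.QueryStatementClass(
                    value=procedure_code,
                    language=models.QueryLanguageClass.SQL,
                ),
            )
        ]
    ),
)
# mcp can then be emitted with any DataHub emitter, e.g. DatahubRestEmitter.
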
datahub/ingestion/source/sql/teradata.py
@@ -3,7 +3,6 @@ from collections import defaultdict
 from dataclasses import dataclass
 from datetime import datetime
 from functools import lru_cache
-from itertools import groupby
 from typing import (
     Any,
     Dict,
@@ -59,6 +58,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 from datahub.metadata.schema_classes import SchemaMetadataClass
 from datahub.sql_parsing.schema_resolver import SchemaResolver
 from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
+from datahub.utilities.groupby import groupby_unsorted

 logger: logging.Logger = logging.getLogger(__name__)

@@ -286,7 +286,7 @@ def optimized_get_foreign_keys(self, connection, table_name, schema=None, **kw):

     # TODO: Check if there's a better way
     fk_dicts = list()
-    for constraint_info, constraint_cols in groupby(res, grouper):
+    for constraint_info, constraint_cols in groupby_unsorted(res, grouper):
         fk_dict = {
             "name": str(constraint_info["name"]),
             "constrained_columns": list(),
datahub/ingestion/source/tableau/tableau.py
@@ -1147,23 +1147,36 @@ class TableauSiteSource:
            )
            # Set parent project name
            for _project_id, project in all_project_map.items():
-                if (
-                    project.parent_id is not None
-                    and project.parent_id in all_project_map
-                ):
+                if project.parent_id is None:
+                    continue
+
+                if project.parent_id in all_project_map:
                    project.parent_name = all_project_map[project.parent_id].name
+                else:
+                    self.report.warning(
+                        title="Incomplete project hierarchy",
+                        message="Project details missing. Child projects will be ingested without reference to their parent project. We generally need Site Administrator Explorer permissions to extract the complete project hierarchy.",
+                        context=f"Missing {project.parent_id}, referenced by {project.id} {project.project_name}",
+                    )
+                    project.parent_id = None
+
+            # Post-condition
+            assert all(
+                [
+                    ((project.parent_id is None) == (project.parent_name is None))
+                    and (
+                        project.parent_id is None
+                        or project.parent_id in all_project_map
+                    )
+                    for project in all_project_map.values()
+                ]
+            ), "Parent project id and name should be consistent"

        def set_project_path():
            def form_path(project_id: str) -> List[str]:
                cur_proj = all_project_map[project_id]
                ancestors = [cur_proj.name]
                while cur_proj.parent_id is not None:
-                    if cur_proj.parent_id not in all_project_map:
-                        self.report.warning(
-                            "project-issue",
-                            f"Parent project {cur_proj.parent_id} not found. We need Site Administrator Explorer permissions.",
-                        )
-                        break
                    cur_proj = all_project_map[cur_proj.parent_id]
                    ancestors = [cur_proj.name, *ancestors]
                return ancestors