acryl-datahub 0.15.0.4rc2__py3-none-any.whl → 0.15.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (96)
  1. acryl_datahub-0.15.0.5.dist-info/LICENSE +202 -0
  2. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/METADATA +2444 -2404
  3. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/RECORD +96 -86
  4. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/entry_points.txt +1 -0
  5. datahub/__init__.py +1 -25
  6. datahub/_version.py +13 -0
  7. datahub/api/entities/dataprocess/dataprocess_instance.py +104 -11
  8. datahub/cli/check_cli.py +1 -1
  9. datahub/cli/cli_utils.py +3 -3
  10. datahub/cli/container_cli.py +1 -64
  11. datahub/cli/iceberg_cli.py +707 -0
  12. datahub/cli/ingest_cli.py +2 -2
  13. datahub/emitter/composite_emitter.py +36 -0
  14. datahub/emitter/rest_emitter.py +1 -1
  15. datahub/entrypoints.py +26 -5
  16. datahub/ingestion/api/incremental_lineage_helper.py +4 -0
  17. datahub/ingestion/api/registry.py +4 -2
  18. datahub/ingestion/glossary/classification_mixin.py +6 -0
  19. datahub/ingestion/glossary/classifier.py +3 -2
  20. datahub/ingestion/graph/client.py +2 -1
  21. datahub/ingestion/graph/entity_versioning.py +201 -0
  22. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
  23. datahub/ingestion/run/connection.py +1 -1
  24. datahub/ingestion/run/pipeline.py +3 -3
  25. datahub/ingestion/source/abs/report.py +2 -2
  26. datahub/ingestion/source/apply/__init__.py +0 -0
  27. datahub/ingestion/source/apply/datahub_apply.py +223 -0
  28. datahub/ingestion/source/aws/glue.py +15 -6
  29. datahub/ingestion/source/aws/sagemaker_processors/common.py +3 -2
  30. datahub/ingestion/source/bigquery_v2/bigquery_report.py +1 -1
  31. datahub/ingestion/source/dbt/dbt_core.py +1 -1
  32. datahub/ingestion/source/delta_lake/report.py +2 -2
  33. datahub/ingestion/source/dynamodb/dynamodb.py +2 -1
  34. datahub/ingestion/source/elastic_search.py +2 -1
  35. datahub/ingestion/source/ge_profiling_config.py +11 -7
  36. datahub/ingestion/source/iceberg/iceberg_common.py +3 -2
  37. datahub/ingestion/source/identity/azure_ad.py +6 -14
  38. datahub/ingestion/source/identity/okta.py +2 -1
  39. datahub/ingestion/source/kafka/kafka.py +2 -1
  40. datahub/ingestion/source/kafka_connect/common.py +2 -1
  41. datahub/ingestion/source/ldap.py +2 -1
  42. datahub/ingestion/source/looker/looker_config.py +3 -1
  43. datahub/ingestion/source/looker/looker_dataclasses.py +8 -0
  44. datahub/ingestion/source/looker/looker_file_loader.py +14 -3
  45. datahub/ingestion/source/looker/looker_template_language.py +104 -14
  46. datahub/ingestion/source/looker/lookml_config.py +29 -8
  47. datahub/ingestion/source/looker/lookml_source.py +110 -22
  48. datahub/ingestion/source/mode.py +2 -4
  49. datahub/ingestion/source/mongodb.py +2 -1
  50. datahub/ingestion/source/nifi.py +2 -1
  51. datahub/ingestion/source/powerbi/config.py +2 -2
  52. datahub/ingestion/source/powerbi_report_server/report_server.py +2 -1
  53. datahub/ingestion/source/redash.py +5 -5
  54. datahub/ingestion/source/salesforce.py +4 -1
  55. datahub/ingestion/source/slack/slack.py +6 -0
  56. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  57. datahub/ingestion/source/snowflake/snowflake_query.py +11 -0
  58. datahub/ingestion/source/snowflake/snowflake_report.py +3 -1
  59. datahub/ingestion/source/snowflake/snowflake_schema.py +17 -0
  60. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +35 -43
  61. datahub/ingestion/source/snowflake/snowflake_tag.py +57 -3
  62. datahub/ingestion/source/snowflake/snowflake_v2.py +42 -4
  63. datahub/ingestion/source/sql/clickhouse.py +5 -43
  64. datahub/ingestion/source/sql/mssql/job_models.py +37 -8
  65. datahub/ingestion/source/sql/mssql/source.py +17 -0
  66. datahub/ingestion/source/sql/sql_config.py +0 -10
  67. datahub/ingestion/source/tableau/tableau.py +16 -13
  68. datahub/ingestion/source/tableau/tableau_common.py +1 -1
  69. datahub/ingestion/source/unity/ge_profiler.py +55 -4
  70. datahub/ingestion/source/unity/proxy.py +2 -2
  71. datahub/ingestion/source/unity/report.py +1 -0
  72. datahub/ingestion/source_config/operation_config.py +9 -0
  73. datahub/ingestion/source_report/pulsar.py +5 -4
  74. datahub/metadata/_schema_classes.py +304 -6
  75. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +6 -0
  76. datahub/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
  77. datahub/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
  78. datahub/metadata/schema.avsc +211 -12
  79. datahub/metadata/schemas/AssertionInfo.avsc +2 -2
  80. datahub/metadata/schemas/CorpUserSettings.avsc +9 -0
  81. datahub/metadata/schemas/DashboardInfo.avsc +5 -5
  82. datahub/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
  83. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  84. datahub/metadata/schemas/Deprecation.avsc +12 -0
  85. datahub/metadata/schemas/DisplayProperties.avsc +62 -0
  86. datahub/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
  87. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +92 -0
  88. datahub/metadata/schemas/MetadataChangeEvent.avsc +17 -5
  89. datahub/metadata/schemas/PostInfo.avsc +28 -2
  90. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  91. datahub/specific/dashboard.py +43 -1
  92. datahub/telemetry/telemetry.py +4 -4
  93. datahub/testing/check_imports.py +28 -0
  94. datahub/upgrade/upgrade.py +17 -9
  95. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/WHEEL +0 -0
  96. {acryl_datahub-0.15.0.4rc2.dist-info → acryl_datahub-0.15.0.5.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/apply/datahub_apply.py (new file)
@@ -0,0 +1,223 @@
+import logging
+from functools import partial
+from typing import Any, Iterable, List, Optional, Union
+
+import progressbar
+from pydantic import Field
+
+from datahub.configuration.common import ConfigModel
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
+from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.metadata.schema_classes import (
+    DomainsClass,
+    GlossaryTermAssociationClass,
+    MetadataChangeProposalClass,
+    OwnerClass,
+    OwnershipTypeClass,
+    TagAssociationClass,
+)
+from datahub.specific.dataset import DatasetPatchBuilder
+
+logger = logging.getLogger(__name__)
+
+
+def apply_association_to_container(
+    container_urn: str,
+    association_urn: str,
+    association_type: str,
+    emit: bool = True,
+    graph: Optional[DataHubGraph] = None,
+) -> Optional[List[Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]]]:
+    """
+    Common function to add either tags, terms, domains, or owners to child datasets (for now).
+
+    Args:
+        container_urn: The URN of the container
+        association_urn: The URN of the tag, term, or user to apply
+        association_type: One of 'tag', 'term', 'domain' or 'owner'
+    """
+    urns: List[str] = [container_urn]
+    if not graph:
+        graph = get_default_graph()
+    logger.info(f"Using {graph}")
+    urns.extend(
+        graph.get_urns_by_filter(
+            container=container_urn,
+            batch_size=1000,
+            entity_types=["dataset", "container"],
+        )
+    )
+
+    all_patches: List[Any] = []
+    for urn in urns:
+        builder = DatasetPatchBuilder(urn)
+        patches: List[Any] = []
+        if association_type == "tag":
+            patches = builder.add_tag(TagAssociationClass(association_urn)).build()
+        elif association_type == "term":
+            patches = builder.add_term(
+                GlossaryTermAssociationClass(association_urn)
+            ).build()
+        elif association_type == "owner":
+            patches = builder.add_owner(
+                OwnerClass(
+                    owner=association_urn,
+                    type=OwnershipTypeClass.TECHNICAL_OWNER,
+                )
+            ).build()
+        elif association_type == "domain":
+            patches = [
+                MetadataChangeProposalWrapper(
+                    entityUrn=urn,
+                    aspect=DomainsClass(domains=[association_urn]),
+                )
+            ]
+        all_patches.extend(patches)
+    if emit:
+        mcps_iter = progressbar.progressbar(all_patches, redirect_stdout=True)
+        for mcp in mcps_iter:
+            graph.emit(mcp)
+        return None
+    else:
+        return all_patches
+
+
+class DomainApplyConfig(ConfigModel):
+    assets: List[str] = Field(
+        default_factory=list,
+        description="List of assets to apply domain hierarchichaly. Currently only containers and datasets are supported",
+    )
+    domain_urn: str = Field(default="")
+
+
+class TagApplyConfig(ConfigModel):
+    assets: List[str] = Field(
+        default_factory=list,
+        description="List of assets to apply tag hierarchichaly. Currently only containers and datasets are supported",
+    )
+    tag_urn: str = Field(default="")
+
+
+class TermApplyConfig(ConfigModel):
+    assets: List[str] = Field(
+        default_factory=list,
+        description="List of assets to apply term hierarchichaly. Currently only containers and datasets are supported",
+    )
+    term_urn: str = Field(default="")
+
+
+class OwnerApplyConfig(ConfigModel):
+    assets: List[str] = Field(
+        default_factory=list,
+        description="List of assets to apply owner hierarchichaly. Currently only containers and datasets are supported",
+    )
+    owner_urn: str = Field(default="")
+
+
+class DataHubApplyConfig(ConfigModel):
+    domain_apply: Optional[List[DomainApplyConfig]] = Field(
+        default=None,
+        description="List to apply domains to assets",
+    )
+    tag_apply: Optional[List[TagApplyConfig]] = Field(
+        default=None,
+        description="List to apply tags to assets",
+    )
+    term_apply: Optional[List[TermApplyConfig]] = Field(
+        default=None,
+        description="List to apply terms to assets",
+    )
+    owner_apply: Optional[List[OwnerApplyConfig]] = Field(
+        default=None,
+        description="List to apply owners to assets",
+    )
+
+
+@platform_name("DataHubApply")
+@config_class(DataHubApplyConfig)
+@support_status(SupportStatus.TESTING)
+class DataHubApplySource(Source):
+    """
+    This source is a helper over CLI
+    so people can use the helper to apply various metadata changes to DataHub
+    via Managed Ingestion
+    """
+
+    def __init__(self, ctx: PipelineContext, config: DataHubApplyConfig):
+        self.ctx = ctx
+        self.config = config
+        self.report = SourceReport()
+        self.graph = ctx.require_graph()
+
+    def _yield_workunits(
+        self,
+        proposals: List[
+            Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]
+        ],
+    ) -> Iterable[MetadataWorkUnit]:
+        for proposal in proposals:
+            if isinstance(proposal, MetadataChangeProposalWrapper):
+                yield proposal.as_workunit()
+            else:
+                yield MetadataWorkUnit(
+                    id=MetadataWorkUnit.generate_workunit_id(proposal),
+                    mcp_raw=proposal,
+                )
+
+    def _handle_assets(
+        self, assets: List[str], apply_urn: str, apply_type: str
+    ) -> Iterable[MetadataWorkUnit]:
+        for asset in assets:
+            change_proposals = apply_association_to_container(
+                asset, apply_urn, apply_type, emit=False, graph=self.graph
+            )
+            assert change_proposals is not None
+            yield from self._yield_workunits(change_proposals)
+
+    def _yield_domain(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.domain_apply:
+            return
+        for apply in self.config.domain_apply:
+            yield from self._handle_assets(apply.assets, apply.domain_urn, "domain")
+
+    def _yield_tag(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.tag_apply:
+            return
+        for apply in self.config.tag_apply:
+            yield from self._handle_assets(apply.assets, apply.tag_urn, "tag")
+
+    def _yield_term(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.term_apply:
+            return
+        for apply in self.config.term_apply:
+            yield from self._handle_assets(apply.assets, apply.term_urn, "term")
+
+    def _yield_owner(self) -> Iterable[MetadataWorkUnit]:
+        if not self.config.owner_apply:
+            return
+        for apply in self.config.owner_apply:
+            yield from self._handle_assets(apply.assets, apply.owner_urn, "owner")
+
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[MetadataWorkUnit]:
+        yield from self._yield_domain()
+        yield from self._yield_tag()
+        yield from self._yield_term()
+        yield from self._yield_owner()
+
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [partial(auto_workunit_reporter, self.get_report())]
+
+    def get_report(self) -> SourceReport:
+        return self.report
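
Note on the new module above: DataHubApplySource is a thin wrapper around apply_association_to_container so the same bulk-apply logic can run through Managed Ingestion. The helper can also be called directly; a minimal sketch (not part of the diff, URNs are placeholders) might look like this:

# Illustrative sketch only, not shipped code. Assumes a DataHub graph
# connection is already configured (e.g. via `datahub init`).
from datahub.ingestion.graph.client import get_default_graph
from datahub.ingestion.source.apply.datahub_apply import apply_association_to_container

graph = get_default_graph()
apply_association_to_container(
    container_urn="urn:li:container:PLACEHOLDER",  # placeholder URN
    association_urn="urn:li:tag:PLACEHOLDER",      # placeholder URN
    association_type="tag",  # one of "tag", "term", "domain", "owner"
    emit=True,               # emit the generated patches via the graph client
    graph=graph,
)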

datahub/ingestion/source/aws/glue.py
@@ -113,6 +113,7 @@ from datahub.metadata.schema_classes import (
 )
 from datahub.utilities.delta import delta_type_to_hive_type
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -218,8 +219,9 @@ class GlueSourceConfig(

 @dataclass
 class GlueSourceReport(StaleEntityRemovalSourceReport):
+    catalog_id: Optional[str] = None
     tables_scanned = 0
-    filtered: List[str] = dataclass_field(default_factory=list)
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
     databases: EntityFilterReport = EntityFilterReport.field(type="database")

     num_job_script_location_missing: int = 0
@@ -315,6 +317,7 @@ class GlueSource(StatefulIngestionSourceBase):
         self.extract_owners = config.extract_owners
         self.source_config = config
         self.report = GlueSourceReport()
+        self.report.catalog_id = self.source_config.catalog_id
         self.glue_client = config.glue_client
         self.s3_client = config.s3_client
         self.extract_transforms = config.extract_transforms
@@ -738,11 +741,17 @@ class GlueSource(StatefulIngestionSourceBase):
         self,
     ) -> Tuple[List[Mapping[str, Any]], List[Dict]]:
         all_databases = [*self.get_all_databases()]
-        all_tables = [
-            tables
-            for database in all_databases
-            for tables in self.get_tables_from_database(database)
-        ]
+        all_tables = []
+        for database in all_databases:
+            try:
+                for tables in self.get_tables_from_database(database):
+                    all_tables.append(tables)
+            except Exception as e:
+                self.report.warning(
+                    message="Failed to get tables from database",
+                    context=database["Name"],
+                    exc=e,
+                )
         return all_databases, all_tables

     def get_lineage_if_enabled(
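
A change that repeats across many of the source reports in this release (Glue above, then SageMaker, BigQuery, Delta Lake, DynamoDB, Elasticsearch, Iceberg, Azure AD, Okta, Kafka, Kafka Connect, and LDAP below) is swapping plain List[str] report fields for LossyList[str] from datahub.utilities.lossy_collections, so reports that collect filtered entity names stay bounded in size. As a rough illustration of the idea only (the real LossyList implementation differs in its details), a lossy list accepts every append but retains only a sample:

# Conceptual sketch of a "lossy" list, for illustration only; the actual
# class is datahub.utilities.lossy_collections.LossyList.
from typing import Generic, List, TypeVar

T = TypeVar("T")


class BoundedSampleList(Generic[T]):  # hypothetical name
    def __init__(self, max_elements: int = 10) -> None:
        self.max_elements = max_elements
        self.sampled: List[T] = []
        self.total_count = 0

    def append(self, item: T) -> None:
        # Count every item, but retain at most max_elements of them.
        self.total_count += 1
        if len(self.sampled) < self.max_elements:
            self.sampled.append(item)

    def __repr__(self) -> str:
        hidden = self.total_count - len(self.sampled)
        return f"{self.sampled!r} (+{hidden} more)" if hidden else repr(self.sampled)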

datahub/ingestion/source/aws/sagemaker_processors/common.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass, field
-from typing import Dict, List, Optional, Union
+from typing import Dict, Optional, Union

 from pydantic.fields import Field

@@ -9,6 +9,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulIngestionConfigBase,
     StatefulStaleMetadataRemovalConfig,
 )
+from datahub.utilities.lossy_collections import LossyList


 class SagemakerSourceConfig(
@@ -42,7 +43,7 @@ class SagemakerSourceReport(StaleEntityRemovalSourceReport):
     jobs_scanned = 0
     jobs_processed = 0
     datasets_scanned = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
     model_endpoint_lineage = 0
     model_group_lineage = 0


datahub/ingestion/source/bigquery_v2/bigquery_report.py
@@ -141,7 +141,7 @@ class BigQueryV2Report(
     profiling_skipped_invalid_partition_type: Dict[str, str] = field(
         default_factory=TopKDict
     )
-    profiling_skipped_partition_profiling_disabled: List[str] = field(
+    profiling_skipped_partition_profiling_disabled: LossyList[str] = field(
         default_factory=LossyList
     )
     allow_pattern: Optional[str] = None

datahub/ingestion/source/dbt/dbt_core.py
@@ -488,7 +488,7 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
    ) -> Dict:
        if re.match("^https?://", uri):
            return json.loads(requests.get(uri).text)
-        elif re.match("^s3://", uri):
+        elif is_s3_uri(uri):
            u = urlparse(uri)
            assert aws_connection
            response = aws_connection.get_s3_client().get_object(

datahub/ingestion/source/delta_lake/report.py
@@ -1,14 +1,14 @@
 import dataclasses
 from dataclasses import field as dataclass_field
-from typing import List

 from datahub.ingestion.api.source import SourceReport
+from datahub.utilities.lossy_collections import LossyList


 @dataclasses.dataclass
 class DeltaLakeSourceReport(SourceReport):
     files_scanned = 0
-    filtered: List[str] = dataclass_field(default_factory=list)
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)

     def report_file_scanned(self) -> None:
         self.files_scanned += 1

datahub/ingestion/source/dynamodb/dynamodb.py
@@ -68,6 +68,7 @@ from datahub.metadata.schema_classes import (
     StringTypeClass,
     UnionTypeClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry

 MAX_ITEMS_TO_RETRIEVE = 100
@@ -120,7 +121,7 @@ class DynamoDBConfig(

 @dataclass
 class DynamoDBSourceReport(StaleEntityRemovalSourceReport, ClassificationReportMixin):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_dropped(self, name: str) -> None:
         self.filtered.append(name)

datahub/ingestion/source/elastic_search.py
@@ -62,6 +62,7 @@ from datahub.metadata.schema_classes import (
     SubTypesClass,
 )
 from datahub.utilities.config_clean import remove_protocol
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.urns.dataset_urn import DatasetUrn

 logger = logging.getLogger(__name__)
@@ -189,7 +190,7 @@ class ElasticToSchemaFieldConverter:
 @dataclass
 class ElasticsearchSourceReport(SourceReport):
     index_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_index_scanned(self, index: str) -> None:
         self.index_scanned += 1

datahub/ingestion/source/ge_profiling_config.py
@@ -115,26 +115,30 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     )
     max_number_of_fields_to_profile: Optional[pydantic.PositiveInt] = Field(
         default=None,
-        description="A positive integer that specifies the maximum number of columns to profile for any table. `None` implies all columns. The cost of profiling goes up significantly as the number of columns to profile goes up.",
+        description="A positive integer that specifies the maximum number of columns to profile for "
+        "any table. `None` implies all columns. The cost of profiling goes up significantly as the "
+        "number of columns to profile goes up.",
     )

     profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
         default=None,
-        description="Profile table only if it has been updated since these many number of days. If set to `null`, no constraint of last modified time for tables to profile. Supported only in `snowflake` and `BigQuery`.",
+        description="Profile table only if it has been updated since these many number of days. "
+        "If set to `null`, no constraint of last modified time for tables to profile. "
+        "Supported only in `snowflake` and `BigQuery`.",
     )

     profile_table_size_limit: Optional[int] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
-        "no limit on the size of tables to profile. Supported only in `snowflake` and `BigQuery`"
-        "Supported for `oracle` based on calculated size from gathered stats.",
+        "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
+        "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
     )

     profile_table_row_limit: Optional[int] = Field(
         default=5000000,
-        description="Profile tables only if their row count is less than specified count. If set to `null`, "
-        "no limit on the row count of tables to profile. Supported only in `snowflake` and `BigQuery`"
-        "Supported for `oracle` based on gathered stats.",
+        description="Profile tables only if their row count is less than specified count. "
+        "If set to `null`, no limit on the row count of tables to profile. Supported only in "
+        "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
     )

     profile_table_row_count_estimate_only: bool = Field(

datahub/ingestion/source/iceberg/iceberg_common.py
@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Optional

 from humanfriendly import format_timespan
 from pydantic import Field, validator
@@ -20,6 +20,7 @@ from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict, int_top_k_dict

 logger = logging.getLogger(__name__)
@@ -198,7 +199,7 @@ class TimingClass:
 class IcebergSourceReport(StaleEntityRemovalSourceReport):
     tables_scanned: int = 0
     entities_profiled: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)
     load_table_timings: TimingClass = field(default_factory=TimingClass)
     processing_table_timings: TimingClass = field(default_factory=TimingClass)
     profiling_table_timings: TimingClass = field(default_factory=TimingClass)

datahub/ingestion/source/identity/azure_ad.py
@@ -13,6 +13,7 @@ from requests.adapters import HTTPAdapter, Retry

 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.emitter.mce_builder import make_group_urn, make_user_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -51,6 +52,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -132,11 +134,7 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
         description="regex patterns for groups to include in ingestion.",
     )

-    # If enabled, report will contain names of filtered users and groups.
-    filtered_tracking: bool = Field(
-        default=True,
-        description="If enabled, report will contain names of filtered users and groups.",
-    )
+    _remove_filtered_tracking = pydantic_removed_field("filtered_tracking")

     # Optional: Whether to mask sensitive information from workunit ID's. On by default.
     mask_group_id: bool = Field(
@@ -156,14 +154,10 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):

 @dataclass
 class AzureADSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
-    filtered_tracking: bool = field(default=True, repr=False)
-    filtered_count: int = field(default=0)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_filtered(self, name: str) -> None:
-        self.filtered_count += 1
-        if self.filtered_tracking:
-            self.filtered.append(name)
+        self.filtered.append(name)


 # Source that extracts Azure AD users, groups and group memberships using Microsoft Graph REST API
@@ -266,9 +260,7 @@ class AzureADSource(StatefulIngestionSourceBase):
     def __init__(self, config: AzureADConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.config = config
-        self.report = AzureADSourceReport(
-            filtered_tracking=self.config.filtered_tracking
-        )
+        self.report = AzureADSourceReport()
         session = requests.Session()
         retries = Retry(
             total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]
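
In the Azure AD changes above, the filtered_tracking option is removed from AzureADConfig and replaced by a pydantic_removed_field("filtered_tracking") shim, so existing recipes that still set the flag keep validating instead of erroring out. The sketch below illustrates the general removed-field pattern only (hypothetical ExampleConfig model, pydantic v1-style validator); it is not the datahub helper itself:

# Illustration of the removed-field pattern, not datahub's implementation.
import logging

from pydantic import BaseModel, root_validator

logger = logging.getLogger(__name__)


class ExampleConfig(BaseModel):  # hypothetical model for demonstration
    groups_pattern: str = ".*"

    @root_validator(pre=True)
    def _drop_removed_fields(cls, values: dict) -> dict:
        # Accept but ignore the removed option so old recipes keep working.
        if "filtered_tracking" in values:
            logger.warning("filtered_tracking was removed and is now ignored")
            values.pop("filtered_tracking")
        return values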

datahub/ingestion/source/identity/okta.py
@@ -50,6 +50,7 @@ from datahub.metadata.schema_classes import (
     OriginTypeClass,
     StatusClass,
 )
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)
 nest_asyncio.apply()
@@ -173,7 +174,7 @@ class OktaConfig(StatefulIngestionConfigBase, ConfigModel):

 @dataclass
 class OktaSourceReport(StaleEntityRemovalSourceReport):
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_filtered(self, name: str) -> None:
         self.filtered.append(name)

datahub/ingestion/source/kafka/kafka.py
@@ -73,6 +73,7 @@ from datahub.metadata.schema_classes import (
     OwnershipSourceTypeClass,
     SubTypesClass,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.mapping import Constants, OperationProcessor
 from datahub.utilities.registries.domain_registry import DomainRegistry
 from datahub.utilities.str_enum import StrEnum
@@ -190,7 +191,7 @@ def get_kafka_admin_client(
 @dataclass
 class KafkaSourceReport(StaleEntityRemovalSourceReport):
     topics_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_topic_scanned(self, topic: str) -> None:
         self.topics_scanned += 1

datahub/ingestion/source/kafka_connect/common.py
@@ -16,6 +16,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.lossy_collections import LossyList

 logger = logging.getLogger(__name__)

@@ -83,7 +84,7 @@ class KafkaConnectSourceConfig(
 @dataclass
 class KafkaConnectSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
-    filtered: List[str] = field(default_factory=list)
+    filtered: LossyList[str] = field(default_factory=LossyList)

     def report_connector_scanned(self, connector: str) -> None:
         self.connectors_scanned += 1

datahub/ingestion/source/ldap.py
@@ -37,6 +37,7 @@ from datahub.metadata.schema_classes import (
     CorpUserSnapshotClass,
     GroupMembershipClass,
 )
+from datahub.utilities.lossy_collections import LossyList

 # default mapping for attrs
 user_attrs_map: Dict[str, Any] = {}
@@ -160,7 +161,7 @@ class LDAPSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):

 @dataclasses.dataclass
 class LDAPSourceReport(StaleEntityRemovalSourceReport):
-    dropped_dns: List[str] = dataclasses.field(default_factory=list)
+    dropped_dns: LossyList[str] = dataclasses.field(default_factory=LossyList)

     def report_dropped(self, dn: str) -> None:
         self.dropped_dns.append(dn)

datahub/ingestion/source/looker/looker_config.py
@@ -177,7 +177,9 @@ def _get_generic_definition(
 class LookerConnectionDefinition(ConfigModel):
     platform: str
     default_db: str
-    default_schema: Optional[str]  # Optional since some sources are two-level only
+    default_schema: Optional[str] = (
+        None  # Optional since some sources are two-level only
+    )
     platform_instance: Optional[str] = None
     platform_env: Optional[str] = Field(
         default=None,
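
For context on the default_schema change just above: recent pydantic versions no longer treat a bare Optional[...] annotation as implicitly defaulting to None, so the field now carries an explicit None default. A minimal, hypothetical illustration:

# Hypothetical model, illustrating the Optional-default behavior only.
from typing import Optional

from pydantic import BaseModel


class ConnectionExample(BaseModel):
    default_db: str
    # Without "= None", newer pydantic treats this field as required even
    # though its type is Optional; the explicit default keeps it optional.
    default_schema: Optional[str] = None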

datahub/ingestion/source/looker/looker_dataclasses.py
@@ -32,6 +32,12 @@ class LookerField:
     sql: Optional[str]


+@dataclass
+class LookerConstant:
+    name: str
+    value: str
+
+
 @dataclass
 class LookerModel:
     connection: str
@@ -75,6 +81,7 @@ class LookerModel:
             try:
                 parsed = load_and_preprocess_file(
                     path=included_file,
+                    reporter=reporter,
                     source_config=source_config,
                 )
                 included_explores = parsed.get("explores", [])
@@ -217,6 +224,7 @@ class LookerModel:
             try:
                 parsed = load_and_preprocess_file(
                     path=included_file,
+                    reporter=reporter,
                     source_config=source_config,
                 )
                 seen_so_far.add(included_file)

datahub/ingestion/source/looker/looker_file_loader.py
@@ -4,7 +4,10 @@ from dataclasses import replace
 from typing import Dict, Optional

 from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
-from datahub.ingestion.source.looker.looker_dataclasses import LookerViewFile
+from datahub.ingestion.source.looker.looker_dataclasses import (
+    LookerConstant,
+    LookerViewFile,
+)
 from datahub.ingestion.source.looker.looker_template_language import (
     load_and_preprocess_file,
 )
@@ -30,12 +33,14 @@ class LookerViewFileLoader:
         base_projects_folder: Dict[str, pathlib.Path],
         reporter: LookMLSourceReport,
         source_config: LookMLSourceConfig,
+        manifest_constants: Dict[str, LookerConstant] = {},
     ) -> None:
         self.viewfile_cache: Dict[str, Optional[LookerViewFile]] = {}
         self._root_project_name = root_project_name
         self._base_projects_folder = base_projects_folder
         self.reporter = reporter
         self.source_config = source_config
+        self.manifest_constants = manifest_constants

     def _load_viewfile(
         self, project_name: str, path: str, reporter: LookMLSourceReport
@@ -60,7 +65,7 @@ class LookerViewFileLoader:
             with open(path) as file:
                 raw_file_content = file.read()
         except Exception as e:
-            self.reporter.failure(
+            self.reporter.report_warning(
                 title="LKML File Loading Error",
                 message="A lookml file is not present on local storage or GitHub",
                 context=f"file path: {path}",
@@ -71,9 +76,15 @@ class LookerViewFileLoader:
         try:
             logger.debug(f"Loading viewfile {path}")

+            # load_and preprocess_file is called multiple times for loading view file from multiple flows.
+            # Flag resolve_constants is a hack to avoid passing around manifest_constants from all of the flows.
+            # This is fine as rest of flows do not need resolution of constants.
             parsed = load_and_preprocess_file(
                 path=path,
+                reporter=self.reporter,
                 source_config=self.source_config,
+                resolve_constants=True,
+                manifest_constants=self.manifest_constants,
             )

             looker_viewfile = LookerViewFile.from_looker_dict(
@@ -90,7 +101,7 @@ class LookerViewFileLoader:
             self.viewfile_cache[path] = looker_viewfile
             return looker_viewfile
         except Exception as e:
-            self.reporter.failure(
+            self.reporter.report_warning(
                 title="LKML File Parsing Error",
                 message="The input file is not lookml file",
                 context=f"file path: {path}",