acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2236 -2240
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/common.py +2 -5
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/mce_builder.py +20 -4
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +37 -13
- datahub/emitter/rest_emitter.py +25 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/datahub/config.py +22 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +21 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +63 -2
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +31 -4
- datahub/ingestion/source/looker/looker_usage.py +23 -17
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/mode.py +40 -27
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
- datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
- datahub/ingestion/source/sql/hive.py +621 -8
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +15 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +122 -45
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/ingestion/source_report/ingestion_stage.py +3 -0
- datahub/metadata/_schema_classes.py +256 -3
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- datahub/metadata/schema.avsc +252 -33
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
- datahub/metadata/schemas/MLModelProperties.avsc +62 -2
- datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
- datahub/sql_parsing/sqlglot_lineage.py +15 -5
- datahub/sql_parsing/tool_meta_extractor.py +119 -5
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/looker/looker_dataclasses.py CHANGED

@@ -186,16 +186,16 @@ class LookerModel:
                        f"traversal_path={traversal_path}, included_files = {included_files}, seen_so_far: {seen_so_far}"
                    )
                    if "*" not in inc and not included_files:
-                        reporter.
+                        reporter.warning(
                            title="Error Resolving Include",
-                            message=
-                            context=f"
+                            message="Cannot resolve included file",
+                            context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}",
                        )
                    elif not included_files:
-                        reporter.
+                        reporter.warning(
                            title="Error Resolving Include",
-                            message=
-                            context=f"
+                            message="Did not find anything matching the wildcard include",
+                            context=f"Include: {inc}, path: {path}, traversal_path: {traversal_path}",
                        )
                    # only load files that we haven't seen so far
                    included_files = [x for x in included_files if x not in seen_so_far]
@@ -231,9 +231,7 @@ class LookerModel:
                            source_config,
                            reporter,
                            seen_so_far,
-                            traversal_path=traversal_path
-                            + "."
-                            + pathlib.Path(included_file).stem,
+                            traversal_path=f"{traversal_path} -> {pathlib.Path(included_file).stem}",
                        )
                    )
                except Exception as e:
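For readers tracing the change above: the new formatting turns the dotted include trail into an arrow-separated breadcrumb. A standalone sketch, with hypothetical values, of the before/after behavior:

```python
import pathlib

# Hypothetical values for illustration only.
traversal_path = "model_a -> view_b"
included_file = "imports/view_c.lkml"

old_style = traversal_path + "." + pathlib.Path(included_file).stem
new_style = f"{traversal_path} -> {pathlib.Path(included_file).stem}"

print(old_style)  # model_a -> view_b.view_c
print(new_style)  # model_a -> view_b -> view_c
```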
datahub/ingestion/source/looker/looker_lib_wrapper.py CHANGED

@@ -68,6 +68,7 @@ class LookerAPIStats(BaseModel):
     get_look_calls: int = 0
     search_looks_calls: int = 0
     search_dashboards_calls: int = 0
+    all_user_calls: int = 0


 class LookerAPI:
@@ -135,7 +136,7 @@ class LookerAPI:

         return permissions

-    @lru_cache(maxsize=
+    @lru_cache(maxsize=5000)
     def get_user(self, id_: str, user_fields: str) -> Optional[User]:
         self.client_stats.user_calls += 1
         try:
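The decorator change above bounds the per-user memoization. A reminder of how `functools.lru_cache` behaves when applied to a method: the cache key includes `self` along with the arguments, and the cache holds a reference to `self` for the life of the process. A minimal sketch (a toy stand-in, not the real LookerAPI class):

```python
from functools import lru_cache


class UserClientSketch:
    """Toy stand-in showing the bounded method-cache pattern."""

    @lru_cache(maxsize=5000)  # at most 5000 distinct (self, id_, user_fields) entries
    def get_user(self, id_: str, user_fields: str) -> dict:
        print(f"cache miss for user {id_}")
        return {"id": id_, "fields": user_fields}


client = UserClientSketch()
client.get_user("1", "id,email")  # miss: prints and computes
client.get_user("1", "id,email")  # hit: served from the cache
print(client.get_user.cache_info())  # CacheInfo(hits=1, misses=1, ...)
```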
@@ -154,6 +155,17 @@ class LookerAPI:
             # User not found
             return None

+    def all_users(self, user_fields: str) -> Sequence[User]:
+        self.client_stats.all_user_calls += 1
+        try:
+            return self.client.all_users(
+                fields=cast(str, user_fields),
+                transport_options=self.transport_options,
+            )
+        except SDKError as e:
+            logger.warning(f"Failure was {e}")
+            return []
+
     def execute_query(self, write_query: WriteQuery) -> List[Dict]:
         logger.debug(f"Executing query {write_query}")
         self.client_stats.query_calls += 1
datahub/ingestion/source/looker/looker_source.py CHANGED

@@ -68,6 +68,7 @@ from datahub.ingestion.source.looker.looker_common import (
     ViewField,
     ViewFieldType,
     gen_model_key,
+    get_urn_looker_element_id,
 )
 from datahub.ingestion.source.looker.looker_config import LookerDashboardSourceConfig
 from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI
@@ -145,7 +146,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         self.source_config: LookerDashboardSourceConfig = config
         self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport()
         self.looker_api: LookerAPI = LookerAPI(self.source_config)
-        self.user_registry: LookerUserRegistry = LookerUserRegistry(
+        self.user_registry: LookerUserRegistry = LookerUserRegistry(
+            self.looker_api, self.reporter
+        )
         self.explore_registry: LookerExploreRegistry = LookerExploreRegistry(
             self.looker_api, self.reporter, self.source_config
         )
@@ -163,6 +166,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         # Required, as we do not ingest all folders but only those that have dashboards/looks
         self.processed_folders: List[str] = []

+        # Keep track of ingested chart urns, to omit usage for non-ingested entities
+        self.chart_urns: Set[str] = set()
+
     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
         test_report = TestConnectionReport()
@@ -640,6 +646,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         chart_urn = self._make_chart_urn(
             element_id=dashboard_element.get_urn_element_id()
         )
+        self.chart_urns.add(chart_urn)
         chart_snapshot = ChartSnapshot(
             urn=chart_urn,
             aspects=[Status(removed=False)],
@@ -1378,7 +1385,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         yield from self._emit_folder_as_container(folder)

     def extract_usage_stat(
-        self,
+        self,
+        looker_dashboards: List[looker_usage.LookerDashboardForUsage],
+        ingested_chart_urns: Set[str],
     ) -> List[MetadataChangeProposalWrapper]:
         looks: List[looker_usage.LookerChartForUsage] = []
         # filter out look from all dashboard
@@ -1389,6 +1398,15 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):

         # dedup looks
         looks = list({str(look.id): look for look in looks}.values())
+        filtered_looks = []
+        for look in looks:
+            if not look.id:
+                continue
+            chart_urn = self._make_chart_urn(get_urn_looker_element_id(look.id))
+            if chart_urn in ingested_chart_urns:
+                filtered_looks.append(look)
+            else:
+                self.reporter.charts_skipped_for_usage.add(look.id)

         # Keep stat generators to generate entity stat aspect later
         stat_generator_config: looker_usage.StatGeneratorConfig = (
@@ -1412,7 +1430,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
             stat_generator_config,
             self.reporter,
             self._make_chart_urn,
-
+            filtered_looks,
         )

         mcps: List[MetadataChangeProposalWrapper] = []
@@ -1667,11 +1685,20 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
         if self.source_config.extract_usage_history:
             self.reporter.report_stage_start("usage_extraction")
             usage_mcps: List[MetadataChangeProposalWrapper] = self.extract_usage_stat(
-                looker_dashboards_for_usage
+                looker_dashboards_for_usage, self.chart_urns
             )
             for usage_mcp in usage_mcps:
                 yield usage_mcp.as_workunit()
             self.reporter.report_stage_end("usage_extraction")

+        # Dump looker user resource mappings.
+        logger.info("Ingesting looker user resource mapping workunits")
+        self.reporter.report_stage_start("user_resource_extraction")
+        yield from auto_workunit(
+            self.user_registry.to_platform_resource(
+                self.source_config.platform_instance
+            )
+        )
+
     def get_report(self) -> SourceReport:
         return self.reporter
datahub/ingestion/source/looker/looker_usage.py CHANGED

@@ -42,6 +42,7 @@ from datahub.metadata.schema_classes import (
     TimeWindowSizeClass,
     _Aspect as AspectAbstract,
 )
+from datahub.utilities.lossy_collections import LossySet

 logger = logging.getLogger(__name__)

@@ -170,7 +171,7 @@ class BaseStatGenerator(ABC):
         self.config = config
         self.looker_models = looker_models
         # Later it will help to find out for what are the looker entities from query result
-        self.
+        self.id_to_model: Dict[str, ModelForUsage] = {
             self.get_id(looker_object): looker_object for looker_object in looker_models
         }
         self.post_filter = len(self.looker_models) > 100
@@ -225,6 +226,10 @@ class BaseStatGenerator(ABC):
     def get_id_from_row(self, row: dict) -> str:
         pass

+    @abstractmethod
+    def report_skip_set(self) -> LossySet[str]:
+        pass
+
     def create_mcp(
         self, model: ModelForUsage, aspect: Aspect
     ) -> MetadataChangeProposalWrapper:
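`report_skip_set()` hands back a `LossySet`, DataHub's bounded set that keeps a sample of entries instead of growing without limit, so skipped-id reporting stays cheap on large instances. A minimal usage sketch, assuming the `acryl-datahub` package is installed:

```python
from datahub.utilities.lossy_collections import LossySet

skipped: LossySet[str] = LossySet()
for i in range(10_000):
    skipped.add(f"chart-{i}")  # retains a bounded sample, not all 10k ids

# The repr shows the sampled elements and notes that the set is sampled.
print(skipped)
```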
@@ -258,20 +263,11 @@ class BaseStatGenerator(ABC):

         return entity_stat_aspect

-    def _process_absolute_aspect(self) -> List[Tuple[ModelForUsage, AspectAbstract]]:
-        aspects: List[Tuple[ModelForUsage, AspectAbstract]] = []
-        for looker_object in self.looker_models:
-            aspects.append(
-                (looker_object, self.to_entity_absolute_stat_aspect(looker_object))
-            )
-
-        return aspects
-
     def _fill_user_stat_aspect(
         self,
         entity_usage_stat: Dict[Tuple[str, str], Aspect],
         user_wise_rows: List[Dict],
-    ) -> Iterable[Tuple[
+    ) -> Iterable[Tuple[str, Aspect]]:
         logger.debug("Entering fill user stat aspect")

         # We first resolve all the users using a threadpool to warm up the cache
@@ -300,7 +296,7 @@ class BaseStatGenerator(ABC):

         for row in user_wise_rows:
             # Confirm looker object was given for stat generation
-            looker_object = self.
+            looker_object = self.id_to_model.get(self.get_id_from_row(row))
             if looker_object is None:
                 logger.warning(
                     "Looker object with id({}) was not register with stat generator".format(
@@ -338,7 +334,7 @@ class BaseStatGenerator(ABC):
         logger.debug("Starting to yield answers for user-wise counts")

         for (id, _), aspect in entity_usage_stat.items():
-            yield
+            yield id, aspect

     def _execute_query(self, query: LookerQuery, query_name: str) -> List[Dict]:
         rows = []
@@ -357,7 +353,7 @@ class BaseStatGenerator(ABC):
             )
             if self.post_filter:
                 logger.debug("post filtering")
-                rows = [r for r in rows if self.get_id_from_row(r) in self.
+                rows = [r for r in rows if self.get_id_from_row(r) in self.id_to_model]
                 logger.debug("Filtered down to %d rows", len(rows))
         except Exception as e:
             logger.warning(f"Failed to execute {query_name} query: {e}")
@@ -378,7 +374,8 @@ class BaseStatGenerator(ABC):
             return

         # yield absolute stat for looker entities
-        for looker_object
+        for looker_object in self.looker_models:
+            aspect = self.to_entity_absolute_stat_aspect(looker_object)
             yield self.create_mcp(looker_object, aspect)

         # Execute query and process the raw json which contains stat information
@@ -399,10 +396,13 @@ class BaseStatGenerator(ABC):
         )
         user_wise_rows = self._execute_query(user_wise_query_with_filters, "user_query")
         # yield absolute stat for entity
-        for
+        for object_id, aspect in self._fill_user_stat_aspect(
             entity_usage_stat, user_wise_rows
         ):
-
+            if object_id in self.id_to_model:
+                yield self.create_mcp(self.id_to_model[object_id], aspect)
+            else:
+                self.report_skip_set().add(object_id)


 class DashboardStatGenerator(BaseStatGenerator):
@@ -425,6 +425,9 @@ class DashboardStatGenerator(BaseStatGenerator):
     def get_stats_generator_name(self) -> str:
         return "DashboardStats"

+    def report_skip_set(self) -> LossySet[str]:
+        return self.report.dashboards_skipped_for_usage
+
     def get_filter(self) -> Dict[ViewField, str]:
         return {
             HistoryViewField.HISTORY_DASHBOARD_ID: ",".join(
@@ -541,6 +544,9 @@ class LookStatGenerator(BaseStatGenerator):
     def get_stats_generator_name(self) -> str:
         return "ChartStats"

+    def report_skip_set(self) -> LossySet[str]:
+        return self.report.charts_skipped_for_usage
+
     def get_filter(self) -> Dict[ViewField, str]:
         return {
             LookViewField.LOOK_ID: ",".join(
datahub/ingestion/source/mlflow.py CHANGED

@@ -38,16 +38,30 @@ T = TypeVar("T")
 class MLflowConfig(EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
-        description=
+        description=(
+            "Tracking server URI. If not set, an MLflow default tracking_uri is used"
+            " (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)"
+        ),
     )
     registry_uri: Optional[str] = Field(
         default=None,
-        description=
+        description=(
+            "Registry server URI. If not set, an MLflow default registry_uri is used"
+            " (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)"
+        ),
     )
     model_name_separator: str = Field(
         default="_",
         description="A string which separates model name from its version (e.g. model_1 or model-1)",
     )
+    base_external_url: Optional[str] = Field(
+        default=None,
+        description=(
+            "Base URL to use when constructing external URLs to MLflow."
+            " If not set, tracking_uri is used if it's an HTTP URL."
+            " If neither is set, external URLs are not generated."
+        ),
+    )


 @dataclass
@@ -279,12 +293,23 @@ class MLflowSource(Source):
         )
         return urn

-    def
+    def _get_base_external_url_from_tracking_uri(self) -> Optional[str]:
+        if isinstance(
+            self.client.tracking_uri, str
+        ) and self.client.tracking_uri.startswith("http"):
+            return self.client.tracking_uri
+        else:
+            return None
+
+    def _make_external_url(self, model_version: ModelVersion) -> Optional[str]:
         """
         Generate URL for a Model Version to MLflow UI.
         """
-        base_uri =
-
+        base_uri = (
+            self.config.base_external_url
+            or self._get_base_external_url_from_tracking_uri()
+        )
+        if base_uri:
             return f"{base_uri.rstrip('/')}/#/models/{model_version.name}/versions/{model_version.version}"
         else:
             return None
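The two methods above establish a fallback order for external links: the explicit `base_external_url` config wins, then the tracking URI when it is an HTTP URL, else no link is emitted. A standalone sketch of the same logic with hypothetical inputs:

```python
from typing import Optional


def resolve_base_url(base_external_url: Optional[str], tracking_uri: Optional[str]) -> Optional[str]:
    # Explicit config wins; otherwise fall back to an HTTP tracking URI.
    if base_external_url:
        return base_external_url
    if isinstance(tracking_uri, str) and tracking_uri.startswith("http"):
        return tracking_uri
    return None


def model_version_url(base: Optional[str], name: str, version: str) -> Optional[str]:
    if base is None:
        return None  # nothing configured or derivable: emit no external link
    return f"{base.rstrip('/')}/#/models/{name}/versions/{version}"


base = resolve_base_url(None, "http://mlflow.internal:5000")  # hypothetical host
print(model_version_url(base, "my_model", "3"))
# http://mlflow.internal:5000/#/models/my_model/versions/3
```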
datahub/ingestion/source/mode.py CHANGED
@@ -5,6 +5,7 @@ import time
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
+from json import JSONDecodeError
 from typing import Dict, Iterable, List, Optional, Set, Tuple, Union

 import dateutil.parser as dp
@@ -98,6 +99,7 @@ from datahub.metadata.schema_classes import (
     TagPropertiesClass,
     UpstreamClass,
     UpstreamLineageClass,
+    ViewPropertiesClass,
 )
 from datahub.metadata.urns import QueryUrn
 from datahub.sql_parsing.sqlglot_lineage import (
@@ -192,6 +194,9 @@ class HTTPError429(HTTPError):
     pass


+ModeRequestError = (HTTPError, JSONDecodeError)
+
+
 @dataclass
 class ModeSourceReport(StaleEntityRemovalSourceReport):
     filtered_spaces: LossyList[str] = dataclasses.field(default_factory=LossyList)
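`ModeRequestError` is a plain tuple of exception classes, which Python's `except` clause accepts directly; that is what lets the handlers below catch both transport failures and bodies that fail to parse as JSON with one name. A minimal sketch (using requests' `HTTPError`, as this module does):

```python
import json
from requests.exceptions import HTTPError

ModeRequestError = (HTTPError, json.JSONDecodeError)

try:
    json.loads("")  # an empty response body raises JSONDecodeError
except ModeRequestError as e:
    print(f"request failed: {e!r}")
```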
@@ -327,11 +332,11 @@ class ModeSource(StatefulIngestionSourceBase):
         # Test the connection
         try:
             self._get_request_json(f"{self.config.connect_uri}/api/verify")
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Connect",
                 message="Unable to verify connection to mode.",
-                context=f"Error: {str(
+                context=f"Error: {str(e)}",
             )

         self.workspace_uri = f"{self.config.connect_uri}/api/{self.config.workspace}"
@@ -520,11 +525,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 if self.config.owner_username_instead_of_email
                 else user_json.get("email")
             )
-        except
+        except ModeRequestError as e:
             self.report.report_warning(
                 title="Failed to retrieve Mode creator",
                 message=f"Unable to retrieve user for {href}",
-                context=f"Reason: {str(
+                context=f"Reason: {str(e)}",
             )
         return user

@@ -570,11 +575,11 @@ class ModeSource(StatefulIngestionSourceBase):
                     logging.debug(f"Skipping space {space_name} due to space pattern")
                     continue
                 space_info[s.get("token", "")] = s.get("name", "")
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Spaces",
                 message="Unable to retrieve spaces / collections for workspace.",
-                context=f"Workspace: {self.workspace_uri}, Error: {str(
+                context=f"Workspace: {self.workspace_uri}, Error: {str(e)}",
             )

         return space_info
@@ -720,11 +725,11 @@ class ModeSource(StatefulIngestionSourceBase):
         try:
             ds_json = self._get_request_json(f"{self.workspace_uri}/data_sources")
             data_sources = ds_json.get("_embedded", {}).get("data_sources", [])
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to retrieve Data Sources",
                 message="Unable to retrieve data sources from Mode.",
-                context=f"Error: {str(
+                context=f"Error: {str(e)}",
             )

         return data_sources
@@ -811,11 +816,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 if definition.get("name", "") == definition_name:
                     return definition.get("source", "")

-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Definition",
                 message="Unable to retrieve definition from Mode.",
-                context=f"Definition Name: {definition_name}, Error: {str(
+                context=f"Definition Name: {definition_name}, Error: {str(e)}",
             )
         return None

@@ -930,16 +935,13 @@ class ModeSource(StatefulIngestionSourceBase):

         dataset_props = DatasetPropertiesClass(
             name=report_info.get("name") if is_mode_dataset else query_data.get("name"),
-            description=
-            ``` sql
-            {query_data.get("raw_query")}
-            ```
-            """,
+            description=None,
             externalUrl=externalUrl,
             customProperties=self.get_custom_props_from_dict(
                 query_data,
                 [
-                    "id"
+                    "id",
+                    "created_at",
                     "updated_at",
                     "last_run_id",
                     "data_source_id",
@@ -949,7 +951,6 @@ class ModeSource(StatefulIngestionSourceBase):
                 ],
             ),
         )
-
         yield (
             MetadataChangeProposalWrapper(
                 entityUrn=query_urn,
@@ -957,6 +958,16 @@ class ModeSource(StatefulIngestionSourceBase):
             ).as_workunit()
         )

+        if raw_query := query_data.get("raw_query"):
+            yield MetadataChangeProposalWrapper(
+                entityUrn=query_urn,
+                aspect=ViewPropertiesClass(
+                    viewLogic=raw_query,
+                    viewLanguage=QueryLanguageClass.SQL,
+                    materialized=False,
+                ),
+            ).as_workunit()
+
         if is_mode_dataset:
             space_container_key = self.gen_space_key(space_token)
             yield from add_dataset_to_container(
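With this change the raw SQL moves out of the dataset description and into a `ViewProperties` aspect. A sketch of constructing the same aspect in isolation; the URN and query text are hypothetical:

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import QueryLanguageClass, ViewPropertiesClass

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:mode,example.query.1,PROD)",  # hypothetical
    aspect=ViewPropertiesClass(
        viewLogic="SELECT id, total FROM orders",  # hypothetical raw query
        viewLanguage=QueryLanguageClass.SQL,
        materialized=False,
    ),
)
print(mcp.aspectName)  # viewProperties (inferred from the aspect type)
```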
@@ -1375,11 +1386,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 f"{self.workspace_uri}/spaces/{space_token}/reports"
             )
             reports = reports_json.get("_embedded", {}).get("reports", {})
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Reports for Space",
                 message="Unable to retrieve reports for space token.",
-                context=f"Space Token: {space_token}, Error: {str(
+                context=f"Space Token: {space_token}, Error: {str(e)}",
             )
         return reports

@@ -1393,11 +1404,11 @@ class ModeSource(StatefulIngestionSourceBase):
             url = f"{self.workspace_uri}/spaces/{space_token}/datasets"
             datasets_json = self._get_request_json(url)
             datasets = datasets_json.get("_embedded", {}).get("reports", [])
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Datasets for Space",
                 message=f"Unable to retrieve datasets for space token {space_token}.",
-                context=f"Error: {str(
+                context=f"Error: {str(e)}",
             )
         return datasets

@@ -1409,11 +1420,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 f"{self.workspace_uri}/reports/{report_token}/queries"
             )
             queries = queries_json.get("_embedded", {}).get("queries", {})
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Queries",
                 message="Unable to retrieve queries for report token.",
-                context=f"Report Token: {report_token}, Error: {str(
+                context=f"Report Token: {report_token}, Error: {str(e)}",
             )
         return queries

@@ -1426,11 +1437,11 @@ class ModeSource(StatefulIngestionSourceBase):
                 f"{self.workspace_uri}/reports/{report_token}/runs/{report_run_id}/query_runs{query_run_id}"
             )
             queries = queries_json.get("_embedded", {}).get("queries", {})
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Queries for Report",
                 message="Unable to retrieve queries for report token.",
-                context=f"Report Token:{report_token}, Error: {str(
+                context=f"Report Token:{report_token}, Error: {str(e)}",
             )
             return {}
         return queries
@@ -1444,13 +1455,13 @@ class ModeSource(StatefulIngestionSourceBase):
                 f"/queries/{query_token}/charts"
             )
             charts = charts_json.get("_embedded", {}).get("charts", {})
-        except
+        except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Retrieve Charts",
                 message="Unable to retrieve charts from Mode.",
                 context=f"Report Token: {report_token}, "
                 f"Query token: {query_token}, "
-                f"Error: {str(
+                f"Error: {str(e)}",
             )
         return charts

@@ -1470,6 +1481,8 @@ class ModeSource(StatefulIngestionSourceBase):
             response = self.session.get(
                 url, timeout=self.config.api_options.timeout
             )
+            if response.status_code == 204:  # No content, don't parse json
+                return {}
             return response.json()
         except HTTPError as http_error:
             error_response = http_error.response
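The two-line guard above exists because requests' `Response.json()` raises a `JSONDecodeError` on the empty body of a 204 No Content response. A sketch of the pattern on its own, without the retry and error handling the real `_get_request_json` carries:

```python
import requests


def get_request_json(session: requests.Session, url: str, timeout: int = 40) -> dict:
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    if response.status_code == 204:  # No Content: empty body, json() would raise
        return {}
    return response.json()
```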
datahub/ingestion/source/powerbi/config.py CHANGED

@@ -9,7 +9,7 @@ from pydantic.class_validators import root_validator

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -232,19 +232,6 @@ def default_for_dataset_type_mapping() -> Dict[str, str]:
     return dict_


-class PlatformDetail(ConfigModel):
-    platform_instance: Optional[str] = pydantic.Field(
-        default=None,
-        description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match "
-        "with platform instance name used in ingestion "
-        "recipe of other datahub sources.",
-    )
-    env: str = pydantic.Field(
-        default=builder.DEFAULT_ENV,
-        description="The environment that all assets produced by DataHub platform ingestion source belong to",
-    )
-
-
 class DataBricksPlatformDetail(PlatformDetail):
     """
     metastore is an additional field used in Databricks connector to generate the dataset urn
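`PlatformDetail` now lives in `datahub.configuration.source_common` rather than the PowerBI config module, so code that constructed it directly needs its import updated; the fields themselves (`platform_instance`, `env`) are unchanged. A sketch with example values:

```python
from datahub.configuration.source_common import PlatformDetail

detail = PlatformDetail(platform_instance="warehouse_prod", env="PROD")  # example values
print(detail.platform_instance, detail.env)
```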
datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py CHANGED

@@ -2,8 +2,8 @@ import logging
 from abc import ABC, abstractmethod
 from typing import Union

+from datahub.configuration.source_common import PlatformDetail
 from datahub.ingestion.source.powerbi.config import (
-    PlatformDetail,
     PowerBiDashboardSourceConfig,
     PowerBIPlatformDetail,
 )
datahub/ingestion/source/powerbi/m_query/pattern_handler.py CHANGED

@@ -5,13 +5,13 @@ from typing import Dict, List, Optional, Tuple, Type, cast

 from lark import Tree

+from datahub.configuration.source_common import PlatformDetail
 from datahub.emitter import mce_builder as builder
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.powerbi.config import (
     Constant,
     DataBricksPlatformDetail,
     DataPlatformPair,
-    PlatformDetail,
     PowerBiDashboardSourceConfig,
     PowerBiDashboardSourceReport,
     PowerBIPlatformDetail,
datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule CHANGED

@@ -21,6 +21,11 @@
 // | empty_string
 // | empty_string "," argument_list
 // - Added sql_string in any_literal
+// - Added WS_INLINE? in field expression
+// Added to ignore any comments
+// %ignore WS // Ignore whitespace
+// %ignore CPP_COMMENT // Ignore single-line comments
+// %ignore C_COMMENT // Ignore multi-line comments

 lexical_unit: lexical_elements?

@@ -245,6 +250,8 @@ operator_or_punctuator: ","
     | "=>"
     | ".."
     | "..."
+    | "{{"
+    | "}}"

 document: section_document
     | expression_document
@@ -275,6 +282,7 @@ expression: logical_or_expression
     | if_expression
     | error_raising_expression
     | error_handling_expression
+    | outer_expression


 logical_or_expression: logical_and_expression
@@ -376,6 +384,8 @@ sql_content: /(?:[^\"\\]|\\[\"]|\"\"|\#\(lf\))+/

 sql_string: "\"" sql_content "\""

+outer_expression: "{{" expression "}}"
+
 argument_list: WS_INLINE? expression
     | WS_INLINE? expression WS_INLINE? "," WS_INLINE? argument_list
     | WS_INLINE? sql_string
@@ -409,7 +419,7 @@ record_expression: "[" field_list? "]"
 field_list: field
     | field "," field_list

-field: field_name WS_INLINE? "=" WS_INLINE? expression
+field: WS_INLINE? field_name WS_INLINE? "=" WS_INLINE? expression

 field_name: generalized_identifier
     | quoted_identifier
@@ -621,4 +631,8 @@ any_literal: record_literal
 %import common.DIGIT
 %import common.LF
 %import common.CR
-%import common.ESCAPED_STRING
+%import common.ESCAPED_STRING
+
+%ignore WS // Ignore whitespace
+%ignore CPP_COMMENT // Ignore single-line comments
+%ignore C_COMMENT // Ignore multi-line comments
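The grammar edits above add `%ignore` directives (so whitespace and comments are skipped everywhere) and an `outer_expression` rule for `{{ ... }}` templating tokens. A toy Lark grammar, not the actual powerbi-lexical-grammar.rule, showing how both mechanisms behave:

```python
from lark import Lark

# Toy grammar: %ignore skips whitespace and comments globally, and a
# dedicated rule wraps an inner expression in "{{" ... "}}".
grammar = r"""
    start: outer_expression | NAME
    outer_expression: "{{" NAME "}}"

    NAME: /[A-Za-z_]\w*/

    %import common.WS
    %import common.CPP_COMMENT
    %import common.C_COMMENT
    %ignore WS
    %ignore CPP_COMMENT
    %ignore C_COMMENT
"""

parser = Lark(grammar)
print(parser.parse("{{ my_field }}  // trailing comment").pretty())
```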