acryl-datahub 1.3.0.1rc5__py3-none-any.whl → 1.3.0.1rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/METADATA +2332 -2333
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/RECORD +47 -42
- datahub/_version.py +1 -1
- datahub/cli/docker_check.py +1 -1
- datahub/emitter/mce_builder.py +6 -0
- datahub/ingestion/autogenerated/capability_summary.json +12 -12
- datahub/ingestion/source/bigquery_v2/bigquery.py +17 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +16 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +2 -0
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +41 -4
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/dremio/dremio_source.py +15 -15
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/fivetran/config.py +33 -0
- datahub/ingestion/source/fivetran/fivetran.py +184 -13
- datahub/ingestion/source/fivetran/fivetran_log_api.py +20 -5
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/hex/hex.py +1 -1
- datahub/ingestion/source/iceberg/iceberg.py +1 -1
- datahub/ingestion/source/metabase.py +23 -4
- datahub/ingestion/source/mlflow.py +1 -1
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/salesforce.py +1 -1
- datahub/ingestion/source/slack/slack.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +16 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +49 -6
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_v2.py +14 -1
- datahub/ingestion/source/sql_queries.py +1 -1
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stateful_ingestion_base.py +30 -2
- datahub/ingestion/source/unity/source.py +1 -1
- datahub/ingestion/source/vertexai/vertexai.py +1 -1
- datahub/metadata/_internal_schema_classes.py +223 -0
- datahub/metadata/_urns/urn_defs.py +56 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +208 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/sdk/mlmodel.py +19 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +18 -4
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc5.dist-info → acryl_datahub-1.3.0.1rc7.dist-info}/top_level.txt +0 -0
|
@@ -52,6 +52,7 @@ from datahub.metadata.schema_classes import (
|
|
|
52
52
|
ChartQueryTypeClass,
|
|
53
53
|
ChartTypeClass,
|
|
54
54
|
DashboardInfoClass,
|
|
55
|
+
EdgeClass,
|
|
55
56
|
OwnerClass,
|
|
56
57
|
OwnershipClass,
|
|
57
58
|
OwnershipTypeClass,
|
|
@@ -338,19 +339,25 @@ class MetabaseSource(StatefulIngestionSourceBase):
|
|
|
338
339
|
lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
|
|
339
340
|
)
|
|
340
341
|
|
|
341
|
-
|
|
342
|
+
# Convert chart URNs to chart edges (instead of deprecated charts field)
|
|
343
|
+
chart_edges = []
|
|
342
344
|
cards_data = dashboard_details.get("dashcards", {})
|
|
343
345
|
for card_info in cards_data:
|
|
344
346
|
card_id = card_info.get("card").get("id", "")
|
|
345
347
|
if not card_id:
|
|
346
348
|
continue # most likely a virtual card without an id (text or heading), not relevant.
|
|
347
349
|
chart_urn = builder.make_chart_urn(self.platform, str(card_id))
|
|
348
|
-
|
|
350
|
+
chart_edges.append(
|
|
351
|
+
EdgeClass(
|
|
352
|
+
destinationUrn=chart_urn,
|
|
353
|
+
lastModified=last_modified.lastModified,
|
|
354
|
+
)
|
|
355
|
+
)
|
|
349
356
|
|
|
350
357
|
dashboard_info_class = DashboardInfoClass(
|
|
351
358
|
description=description,
|
|
352
359
|
title=title,
|
|
353
|
-
|
|
360
|
+
chartEdges=chart_edges,
|
|
354
361
|
lastModified=last_modified,
|
|
355
362
|
dashboardUrl=f"{self.config.display_uri}/dashboard/{dashboard_id}",
|
|
356
363
|
customProperties={},
|
|
@@ -488,13 +495,25 @@ class MetabaseSource(StatefulIngestionSourceBase):
|
|
|
488
495
|
datasource_urn = self.get_datasource_urn(card_details)
|
|
489
496
|
custom_properties = self.construct_card_custom_properties(card_details)
|
|
490
497
|
|
|
498
|
+
input_edges = (
|
|
499
|
+
[
|
|
500
|
+
EdgeClass(
|
|
501
|
+
destinationUrn=urn,
|
|
502
|
+
lastModified=last_modified.lastModified,
|
|
503
|
+
)
|
|
504
|
+
for urn in datasource_urn
|
|
505
|
+
]
|
|
506
|
+
if datasource_urn
|
|
507
|
+
else None
|
|
508
|
+
)
|
|
509
|
+
|
|
491
510
|
chart_info = ChartInfoClass(
|
|
492
511
|
type=chart_type,
|
|
493
512
|
description=description,
|
|
494
513
|
title=title,
|
|
495
514
|
lastModified=last_modified,
|
|
496
515
|
chartUrl=f"{self.config.display_uri}/card/{card_id}",
|
|
497
|
-
|
|
516
|
+
inputEdges=input_edges,
|
|
498
517
|
customProperties=custom_properties,
|
|
499
518
|
)
|
|
500
519
|
chart_snapshot.aspects.append(chart_info)
|
|
@@ -136,7 +136,7 @@ class MLflowRegisteredModelStageInfo:
|
|
|
136
136
|
|
|
137
137
|
@platform_name("MLflow")
|
|
138
138
|
@config_class(MLflowConfig)
|
|
139
|
-
@support_status(SupportStatus.
|
|
139
|
+
@support_status(SupportStatus.INCUBATING)
|
|
140
140
|
@capability(
|
|
141
141
|
SourceCapability.DESCRIPTIONS,
|
|
142
142
|
"Extract descriptions for MLflow Registered Models and Model Versions",
|
|
@@ -188,7 +188,7 @@ class TableData:
|
|
|
188
188
|
|
|
189
189
|
@platform_name("S3 / Local Files", id="s3")
|
|
190
190
|
@config_class(DataLakeSourceConfig)
|
|
191
|
-
@support_status(SupportStatus.
|
|
191
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
192
192
|
@capability(
|
|
193
193
|
SourceCapability.CONTAINERS,
|
|
194
194
|
"Enabled by default",
|
|
@@ -527,7 +527,7 @@ class SalesforceApi:
|
|
|
527
527
|
|
|
528
528
|
@platform_name("Salesforce")
|
|
529
529
|
@config_class(SalesforceConfig)
|
|
530
|
-
@support_status(SupportStatus.
|
|
530
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
531
531
|
@capability(
|
|
532
532
|
capability_name=SourceCapability.PLATFORM_INSTANCE,
|
|
533
533
|
description="Can be equivalent to Salesforce organization",
|
|
@@ -245,7 +245,7 @@ DATA_PLATFORM_SLACK_URN: str = builder.make_data_platform_urn(PLATFORM_NAME)
|
|
|
245
245
|
|
|
246
246
|
@platform_name("Slack")
|
|
247
247
|
@config_class(SlackSourceConfig)
|
|
248
|
-
@support_status(SupportStatus.
|
|
248
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
249
249
|
class SlackSource(StatefulIngestionSourceBase):
|
|
250
250
|
def __init__(self, ctx: PipelineContext, config: SlackSourceConfig):
|
|
251
251
|
super().__init__(config, ctx)
|
|
@@ -31,6 +31,7 @@ from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterCo
|
|
|
31
31
|
from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
32
32
|
StatefulLineageConfigMixin,
|
|
33
33
|
StatefulProfilingConfigMixin,
|
|
34
|
+
StatefulTimeWindowConfigMixin,
|
|
34
35
|
StatefulUsageConfigMixin,
|
|
35
36
|
)
|
|
36
37
|
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
|
|
@@ -199,6 +200,7 @@ class SnowflakeV2Config(
|
|
|
199
200
|
SnowflakeUsageConfig,
|
|
200
201
|
StatefulLineageConfigMixin,
|
|
201
202
|
StatefulUsageConfigMixin,
|
|
203
|
+
StatefulTimeWindowConfigMixin,
|
|
202
204
|
StatefulProfilingConfigMixin,
|
|
203
205
|
ClassificationSourceConfigMixin,
|
|
204
206
|
IncrementalPropertiesConfigMixin,
|
|
@@ -477,6 +479,20 @@ class SnowflakeV2Config(
|
|
|
477
479
|
|
|
478
480
|
return shares
|
|
479
481
|
|
|
482
|
+
@root_validator(pre=False, skip_on_failure=True)
|
|
483
|
+
def validate_queries_v2_stateful_ingestion(cls, values: Dict) -> Dict:
|
|
484
|
+
if values.get("use_queries_v2"):
|
|
485
|
+
if values.get("enable_stateful_lineage_ingestion") or values.get(
|
|
486
|
+
"enable_stateful_usage_ingestion"
|
|
487
|
+
):
|
|
488
|
+
logger.warning(
|
|
489
|
+
"enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
|
|
490
|
+
"when using use_queries_v2=True. These configs only work with the legacy (non-queries v2) extraction path. "
|
|
491
|
+
"For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
|
|
492
|
+
"for the unified time window extraction (lineage + usage + operations + queries)."
|
|
493
|
+
)
|
|
494
|
+
return values
|
|
495
|
+
|
|
480
496
|
def outbounds(self) -> Dict[str, Set[DatabaseId]]:
|
|
481
497
|
"""
|
|
482
498
|
Returns mapping of
|
|
@@ -17,9 +17,11 @@ from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFr
|
|
|
17
17
|
from datahub.configuration.time_window_config import (
|
|
18
18
|
BaseTimeWindowConfig,
|
|
19
19
|
BucketDuration,
|
|
20
|
+
get_time_bucket,
|
|
20
21
|
)
|
|
21
22
|
from datahub.ingestion.api.closeable import Closeable
|
|
22
23
|
from datahub.ingestion.api.common import PipelineContext
|
|
24
|
+
from datahub.ingestion.api.decorators import SupportStatus, config_class, support_status
|
|
23
25
|
from datahub.ingestion.api.report import Report
|
|
24
26
|
from datahub.ingestion.api.source import Source, SourceReport
|
|
25
27
|
from datahub.ingestion.api.source_helpers import auto_workunit
|
|
@@ -50,6 +52,9 @@ from datahub.ingestion.source.snowflake.stored_proc_lineage import (
|
|
|
50
52
|
StoredProcLineageReport,
|
|
51
53
|
StoredProcLineageTracker,
|
|
52
54
|
)
|
|
55
|
+
from datahub.ingestion.source.state.redundant_run_skip_handler import (
|
|
56
|
+
RedundantQueriesRunSkipHandler,
|
|
57
|
+
)
|
|
53
58
|
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
|
|
54
59
|
from datahub.metadata.urns import CorpUserUrn
|
|
55
60
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
@@ -180,6 +185,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
180
185
|
structured_report: SourceReport,
|
|
181
186
|
filters: SnowflakeFilter,
|
|
182
187
|
identifiers: SnowflakeIdentifierBuilder,
|
|
188
|
+
redundant_run_skip_handler: Optional[RedundantQueriesRunSkipHandler] = None,
|
|
183
189
|
graph: Optional[DataHubGraph] = None,
|
|
184
190
|
schema_resolver: Optional[SchemaResolver] = None,
|
|
185
191
|
discovered_tables: Optional[List[str]] = None,
|
|
@@ -191,9 +197,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
191
197
|
self.filters = filters
|
|
192
198
|
self.identifiers = identifiers
|
|
193
199
|
self.discovered_tables = set(discovered_tables) if discovered_tables else None
|
|
200
|
+
self.redundant_run_skip_handler = redundant_run_skip_handler
|
|
194
201
|
|
|
195
202
|
self._structured_report = structured_report
|
|
196
203
|
|
|
204
|
+
# Adjust time window based on stateful ingestion state
|
|
205
|
+
self.start_time, self.end_time = self._get_time_window()
|
|
206
|
+
|
|
197
207
|
# The exit stack helps ensure that we close all the resources we open.
|
|
198
208
|
self._exit_stack = contextlib.ExitStack()
|
|
199
209
|
|
|
@@ -211,8 +221,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
211
221
|
generate_query_usage_statistics=self.config.include_query_usage_statistics,
|
|
212
222
|
usage_config=BaseUsageConfig(
|
|
213
223
|
bucket_duration=self.config.window.bucket_duration,
|
|
214
|
-
start_time=self.config.window.start_time,
|
|
215
|
-
end_time=self.config.window.end_time,
|
|
224
|
+
start_time=self.start_time,
|
|
225
|
+
end_time=self.end_time,
|
|
216
226
|
user_email_pattern=self.config.user_email_pattern,
|
|
217
227
|
# TODO make the rest of the fields configurable
|
|
218
228
|
),
|
|
@@ -228,6 +238,34 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
228
238
|
def structured_reporter(self) -> SourceReport:
|
|
229
239
|
return self._structured_report
|
|
230
240
|
|
|
241
|
+
def _get_time_window(self) -> tuple[datetime, datetime]:
|
|
242
|
+
if self.redundant_run_skip_handler:
|
|
243
|
+
start_time, end_time = (
|
|
244
|
+
self.redundant_run_skip_handler.suggest_run_time_window(
|
|
245
|
+
self.config.window.start_time,
|
|
246
|
+
self.config.window.end_time,
|
|
247
|
+
)
|
|
248
|
+
)
|
|
249
|
+
else:
|
|
250
|
+
start_time = self.config.window.start_time
|
|
251
|
+
end_time = self.config.window.end_time
|
|
252
|
+
|
|
253
|
+
# Usage statistics are aggregated per bucket (typically per day).
|
|
254
|
+
# To ensure accurate aggregated metrics, we need to align the start_time
|
|
255
|
+
# to the beginning of a bucket so that we include complete bucket periods.
|
|
256
|
+
if self.config.include_usage_statistics:
|
|
257
|
+
start_time = get_time_bucket(start_time, self.config.window.bucket_duration)
|
|
258
|
+
|
|
259
|
+
return start_time, end_time
|
|
260
|
+
|
|
261
|
+
def _update_state(self) -> None:
|
|
262
|
+
if self.redundant_run_skip_handler:
|
|
263
|
+
self.redundant_run_skip_handler.update_state(
|
|
264
|
+
self.config.window.start_time,
|
|
265
|
+
self.config.window.end_time,
|
|
266
|
+
self.config.window.bucket_duration,
|
|
267
|
+
)
|
|
268
|
+
|
|
231
269
|
@functools.cached_property
|
|
232
270
|
def local_temp_path(self) -> pathlib.Path:
|
|
233
271
|
if self.config.local_temp_path:
|
|
@@ -355,6 +393,9 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
355
393
|
with self.report.aggregator_generate_timer:
|
|
356
394
|
yield from auto_workunit(self.aggregator.gen_metadata())
|
|
357
395
|
|
|
396
|
+
# Update the stateful ingestion state after successful extraction
|
|
397
|
+
self._update_state()
|
|
398
|
+
|
|
358
399
|
def fetch_users(self) -> UsersMapping:
|
|
359
400
|
users: UsersMapping = dict()
|
|
360
401
|
with self.structured_reporter.report_exc("Error fetching users from Snowflake"):
|
|
@@ -378,8 +419,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
378
419
|
# Derived from _populate_external_lineage_from_copy_history.
|
|
379
420
|
|
|
380
421
|
query: str = SnowflakeQuery.copy_lineage_history(
|
|
381
|
-
start_time_millis=int(self.config.window.start_time.timestamp() * 1000),
|
|
382
|
-
end_time_millis=int(self.config.window.end_time.timestamp() * 1000),
|
|
422
|
+
start_time_millis=int(self.start_time.timestamp() * 1000),
|
|
423
|
+
end_time_millis=int(self.end_time.timestamp() * 1000),
|
|
383
424
|
downstreams_deny_pattern=self.config.temporary_tables_pattern,
|
|
384
425
|
)
|
|
385
426
|
|
|
@@ -414,8 +455,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
414
455
|
Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery, StoredProcCall]
|
|
415
456
|
]:
|
|
416
457
|
query_log_query = QueryLogQueryBuilder(
|
|
417
|
-
start_time=self.config.window.start_time,
|
|
418
|
-
end_time=self.config.window.end_time,
|
|
458
|
+
start_time=self.start_time,
|
|
459
|
+
end_time=self.end_time,
|
|
419
460
|
bucket_duration=self.config.window.bucket_duration,
|
|
420
461
|
deny_usernames=self.config.pushdown_deny_usernames,
|
|
421
462
|
allow_usernames=self.config.pushdown_allow_usernames,
|
|
@@ -710,6 +751,8 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
|
|
|
710
751
|
self._exit_stack.close()
|
|
711
752
|
|
|
712
753
|
|
|
754
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
755
|
+
@config_class(SnowflakeQueriesSourceConfig)
|
|
713
756
|
class SnowflakeQueriesSource(Source):
|
|
714
757
|
def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesSourceConfig):
|
|
715
758
|
self.ctx = ctx
|
|
@@ -59,7 +59,7 @@ class SnowflakeSummaryReport(SourceReport, BaseTimeWindowReport):
|
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
@config_class(SnowflakeSummaryConfig)
|
|
62
|
-
@support_status(SupportStatus.
|
|
62
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
63
63
|
class SnowflakeSummarySource(Source):
|
|
64
64
|
def __init__(self, ctx: PipelineContext, config: SnowflakeSummaryConfig):
|
|
65
65
|
super().__init__(ctx)
|
|
@@ -73,6 +73,7 @@ from datahub.ingestion.source.snowflake.snowflake_utils import (
|
|
|
73
73
|
from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
|
|
74
74
|
from datahub.ingestion.source.state.redundant_run_skip_handler import (
|
|
75
75
|
RedundantLineageRunSkipHandler,
|
|
76
|
+
RedundantQueriesRunSkipHandler,
|
|
76
77
|
RedundantUsageRunSkipHandler,
|
|
77
78
|
)
|
|
78
79
|
from datahub.ingestion.source.state.stale_entity_removal_handler import (
|
|
@@ -207,7 +208,7 @@ class SnowflakeV2Source(
|
|
|
207
208
|
)
|
|
208
209
|
self.report.sql_aggregator = self.aggregator.report
|
|
209
210
|
|
|
210
|
-
if self.config.include_table_lineage:
|
|
211
|
+
if self.config.include_table_lineage and not self.config.use_queries_v2:
|
|
211
212
|
redundant_lineage_run_skip_handler: Optional[
|
|
212
213
|
RedundantLineageRunSkipHandler
|
|
213
214
|
] = None
|
|
@@ -589,6 +590,17 @@ class SnowflakeV2Source(
|
|
|
589
590
|
with self.report.new_stage(f"*: {QUERIES_EXTRACTION}"):
|
|
590
591
|
schema_resolver = self.aggregator._schema_resolver
|
|
591
592
|
|
|
593
|
+
redundant_queries_run_skip_handler: Optional[
|
|
594
|
+
RedundantQueriesRunSkipHandler
|
|
595
|
+
] = None
|
|
596
|
+
if self.config.enable_stateful_time_window:
|
|
597
|
+
redundant_queries_run_skip_handler = RedundantQueriesRunSkipHandler(
|
|
598
|
+
source=self,
|
|
599
|
+
config=self.config,
|
|
600
|
+
pipeline_name=self.ctx.pipeline_name,
|
|
601
|
+
run_id=self.ctx.run_id,
|
|
602
|
+
)
|
|
603
|
+
|
|
592
604
|
queries_extractor = SnowflakeQueriesExtractor(
|
|
593
605
|
connection=self.connection,
|
|
594
606
|
# TODO: this should be its own section in main recipe
|
|
@@ -614,6 +626,7 @@ class SnowflakeV2Source(
|
|
|
614
626
|
structured_report=self.report,
|
|
615
627
|
filters=self.filters,
|
|
616
628
|
identifiers=self.identifiers,
|
|
629
|
+
redundant_run_skip_handler=redundant_queries_run_skip_handler,
|
|
617
630
|
schema_resolver=schema_resolver,
|
|
618
631
|
discovered_tables=self.discovered_datasets,
|
|
619
632
|
graph=self.ctx.graph,
|
|
@@ -93,7 +93,7 @@ class SqlQueriesSourceReport(SourceReport):
|
|
|
93
93
|
sql_aggregator: Optional[SqlAggregatorReport] = None
|
|
94
94
|
|
|
95
95
|
|
|
96
|
-
@platform_name("SQL Queries")
|
|
96
|
+
@platform_name("SQL Queries", id="sql-queries")
|
|
97
97
|
@config_class(SqlQueriesSourceConfig)
|
|
98
98
|
@support_status(SupportStatus.INCUBATING)
|
|
99
99
|
@capability(SourceCapability.LINEAGE_COARSE, "Parsed from SQL queries")
|
|
@@ -244,3 +244,24 @@ class RedundantUsageRunSkipHandler(RedundantRunSkipHandler):
|
|
|
244
244
|
cur_state.begin_timestamp_millis = datetime_to_ts_millis(start_time)
|
|
245
245
|
cur_state.end_timestamp_millis = datetime_to_ts_millis(end_time)
|
|
246
246
|
cur_state.bucket_duration = bucket_duration
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
class RedundantQueriesRunSkipHandler(RedundantRunSkipHandler):
|
|
250
|
+
"""
|
|
251
|
+
Handler for stateful ingestion of queries v2 extraction.
|
|
252
|
+
Manages the time window for audit log extraction that combines
|
|
253
|
+
lineage, usage, operations, and queries.
|
|
254
|
+
"""
|
|
255
|
+
|
|
256
|
+
def get_job_name_suffix(self):
|
|
257
|
+
return "_audit_window"
|
|
258
|
+
|
|
259
|
+
def update_state(
|
|
260
|
+
self, start_time: datetime, end_time: datetime, bucket_duration: BucketDuration
|
|
261
|
+
) -> None:
|
|
262
|
+
cur_checkpoint = self.get_current_checkpoint()
|
|
263
|
+
if cur_checkpoint:
|
|
264
|
+
cur_state = cast(BaseTimeWindowCheckpointState, cur_checkpoint.state)
|
|
265
|
+
cur_state.begin_timestamp_millis = datetime_to_ts_millis(start_time)
|
|
266
|
+
cur_state.end_timestamp_millis = datetime_to_ts_millis(end_time)
|
|
267
|
+
cur_state.bucket_duration = bucket_duration
|
|
@@ -101,7 +101,9 @@ class StatefulLineageConfigMixin(ConfigModel):
|
|
|
101
101
|
default=True,
|
|
102
102
|
description="Enable stateful lineage ingestion."
|
|
103
103
|
" This will store lineage window timestamps after successful lineage ingestion. "
|
|
104
|
-
"and will not run lineage ingestion for same timestamps in subsequent run. ",
|
|
104
|
+
"and will not run lineage ingestion for same timestamps in subsequent run. "
|
|
105
|
+
"NOTE: This only works with use_queries_v2=False (legacy extraction path). "
|
|
106
|
+
"For queries v2, use enable_stateful_time_window instead.",
|
|
105
107
|
)
|
|
106
108
|
|
|
107
109
|
_store_last_lineage_extraction_timestamp = pydantic_renamed_field(
|
|
@@ -150,7 +152,9 @@ class StatefulUsageConfigMixin(BaseTimeWindowConfig):
|
|
|
150
152
|
default=True,
|
|
151
153
|
description="Enable stateful lineage ingestion."
|
|
152
154
|
" This will store usage window timestamps after successful usage ingestion. "
|
|
153
|
-
"and will not run usage ingestion for same timestamps in subsequent run. ",
|
|
155
|
+
"and will not run usage ingestion for same timestamps in subsequent run. "
|
|
156
|
+
"NOTE: This only works with use_queries_v2=False (legacy extraction path). "
|
|
157
|
+
"For queries v2, use enable_stateful_time_window instead.",
|
|
154
158
|
)
|
|
155
159
|
|
|
156
160
|
_store_last_usage_extraction_timestamp = pydantic_renamed_field(
|
|
@@ -169,6 +173,30 @@ class StatefulUsageConfigMixin(BaseTimeWindowConfig):
|
|
|
169
173
|
return values
|
|
170
174
|
|
|
171
175
|
|
|
176
|
+
class StatefulTimeWindowConfigMixin(BaseTimeWindowConfig):
|
|
177
|
+
enable_stateful_time_window: bool = Field(
|
|
178
|
+
default=False,
|
|
179
|
+
description="Enable stateful time window tracking."
|
|
180
|
+
" This will store the time window after successful extraction "
|
|
181
|
+
"and adjust the time window in subsequent runs to avoid reprocessing. "
|
|
182
|
+
"NOTE: This is ONLY applicable when using queries v2 (use_queries_v2=True). "
|
|
183
|
+
"This replaces enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion "
|
|
184
|
+
"for the queries v2 extraction path, since queries v2 extracts lineage, usage, operations, "
|
|
185
|
+
"and queries together from a single audit log and uses a unified time window.",
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
@root_validator(skip_on_failure=True)
|
|
189
|
+
def time_window_stateful_option_validator(cls, values: Dict) -> Dict:
|
|
190
|
+
sti = values.get("stateful_ingestion")
|
|
191
|
+
if not sti or not sti.enabled:
|
|
192
|
+
if values.get("enable_stateful_time_window"):
|
|
193
|
+
logger.warning(
|
|
194
|
+
"Stateful ingestion is disabled, disabling enable_stateful_time_window config option as well"
|
|
195
|
+
)
|
|
196
|
+
values["enable_stateful_time_window"] = False
|
|
197
|
+
return values
|
|
198
|
+
|
|
199
|
+
|
|
172
200
|
@dataclass
|
|
173
201
|
class StatefulIngestionReport(SourceReport):
|
|
174
202
|
pass
|
|
@@ -176,7 +176,7 @@ logger: logging.Logger = logging.getLogger(__name__)
|
|
|
176
176
|
supported=True,
|
|
177
177
|
)
|
|
178
178
|
@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
|
|
179
|
-
@support_status(SupportStatus.
|
|
179
|
+
@support_status(SupportStatus.CERTIFIED)
|
|
180
180
|
class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
181
181
|
"""
|
|
182
182
|
This plugin extracts the following metadata from Databricks Unity Catalog:
|
|
@@ -145,7 +145,7 @@ class PipelineMetadata:
|
|
|
145
145
|
|
|
146
146
|
@platform_name("Vertex AI", id="vertexai")
|
|
147
147
|
@config_class(VertexAIConfig)
|
|
148
|
-
@support_status(SupportStatus.
|
|
148
|
+
@support_status(SupportStatus.INCUBATING)
|
|
149
149
|
@capability(
|
|
150
150
|
SourceCapability.DESCRIPTIONS,
|
|
151
151
|
"Extract descriptions for Vertex AI Registered Models and Model Versions",
|