acryl-datahub 1.3.0.1rc9__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.dist-info}/METADATA +2468 -2461
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.dist-info}/RECORD +74 -74
- datahub/_version.py +1 -1
- datahub/ingestion/graph/client.py +5 -1
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/dremio/dremio_api.py +212 -78
- datahub/ingestion/source/dremio/dremio_entities.py +55 -39
- datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
- datahub/ingestion/source/dremio/dremio_source.py +9 -11
- datahub/ingestion/source/elastic_search.py +106 -29
- datahub/ingestion/source/snowflake/snowflake_queries.py +27 -3
- datahub/ingestion/source/sql_queries.py +164 -15
- datahub/metadata/_internal_schema_classes.py +62 -2
- datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
- datahub/metadata/schema.avsc +264 -89
- datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
- datahub/metadata/schemas/AssertionInfo.avsc +48 -5
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
- datahub/metadata/schemas/ChartInfo.avsc +12 -5
- datahub/metadata/schemas/ContainerProperties.avsc +12 -5
- datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
- datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
- datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
- datahub/metadata/schemas/DashboardInfo.avsc +16 -4
- datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
- datahub/metadata/schemas/DataJobInfo.avsc +9 -4
- datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
- datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- datahub/metadata/schemas/DataProductProperties.avsc +5 -2
- datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/DatasetProperties.avsc +12 -5
- datahub/metadata/schemas/DomainProperties.avsc +7 -3
- datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
- datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalTags.avsc +3 -2
- datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- datahub/metadata/schemas/InputFields.avsc +3 -2
- datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLModelProperties.avsc +4 -2
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
- datahub/metadata/schemas/NotebookInfo.avsc +5 -2
- datahub/metadata/schemas/Ownership.avsc +3 -2
- datahub/metadata/schemas/RoleProperties.avsc +3 -1
- datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +9 -3
- datahub/metadata/schemas/TagProperties.avsc +3 -1
- datahub/metadata/schemas/TestInfo.avsc +2 -1
- datahub/sql_parsing/schema_resolver.py +29 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/elastic_search.py

```diff
@@ -407,12 +407,78 @@ class ElasticsearchSource(StatefulIngestionSourceBase):
         for mcp in self._get_data_stream_index_count_mcps():
             yield mcp.as_workunit()
         if self.source_config.ingest_index_templates:
-
-
+            # Fetch legacy index templates
+            legacy_templates = self.client.indices.get_template()
+            for template in legacy_templates:
                 if self.source_config.index_template_pattern.allowed(template):
                     for mcp in self._extract_mcps(template, is_index=False):
                         yield mcp.as_workunit()

+            # Fetch composable index templates (ES 7.8+ / OpenSearch)
+            try:
+                composable_templates = self.client.indices.get_index_template()
+                for template_info in composable_templates.get("index_templates", []):
+                    template = template_info.get("name")
+                    if template and self.source_config.index_template_pattern.allowed(
+                        template
+                    ):
+                        for mcp in self._extract_mcps(
+                            template, is_index=False, is_composable_template=True
+                        ):
+                            yield mcp.as_workunit()
+            except Exception as e:
+                logger.warning(f"Unable to fetch composable index templates: {e}")
+
+    def _get_template_metadata(
+        self, template_name: str, is_composable: bool
+    ) -> Dict[str, Any]:
+        """Fetch template metadata from Elasticsearch/OpenSearch."""
+        if is_composable:
+            # For composable templates (ES 7.8+ / OpenSearch)
+            raw_response = self.client.indices.get_index_template(name=template_name)
+            template_data = raw_response.get("index_templates", [{}])[0]
+            return template_data.get("index_template", {})
+        else:
+            # For legacy templates
+            raw_response = self.client.indices.get_template(name=template_name)
+            return raw_response[template_name]
+
+    def _extract_template_custom_properties(
+        self, raw_metadata: Dict[str, Any], is_composable: bool
+    ) -> Dict[str, str]:
+        """Extract custom properties from template metadata."""
+        custom_properties: Dict[str, str] = {}
+
+        # Extract aliases
+        if is_composable:
+            aliases_dict = raw_metadata.get("template", {}).get("aliases", {})
+        else:
+            aliases_dict = raw_metadata.get("aliases", {})
+        index_aliases: List[str] = list(aliases_dict.keys()) if aliases_dict else []
+        if index_aliases:
+            custom_properties["aliases"] = ",".join(index_aliases)
+
+        # Extract index_patterns
+        index_patterns: List[str] = raw_metadata.get("index_patterns", [])
+        if index_patterns:
+            custom_properties["index_patterns"] = ",".join(index_patterns)
+
+        # Extract settings
+        if is_composable:
+            index_settings: Dict[str, Any] = (
+                raw_metadata.get("template", {}).get("settings", {}).get("index", {})
+            )
+        else:
+            index_settings = raw_metadata.get("settings", {}).get("index", {})
+        num_shards: str = index_settings.get("number_of_shards", "")
+        if num_shards:
+            custom_properties["num_shards"] = num_shards
+        num_replicas: str = index_settings.get("number_of_replicas", "")
+        if num_replicas:
+            custom_properties["num_replicas"] = num_replicas
+
+        return custom_properties
+
     def _get_data_stream_index_count_mcps(
         self,
     ) -> Iterable[MetadataChangeProposalWrapper]:
@@ -434,9 +500,11 @@ class ElasticsearchSource(StatefulIngestionSourceBase):
         )

     def _extract_mcps(
-        self, index: str, is_index: bool = True
+        self, index: str, is_index: bool = True, is_composable_template: bool = False
     ) -> Iterable[MetadataChangeProposalWrapper]:
-        logger.debug(
+        logger.debug(
+            f"index='{index}', is_index={is_index}, is_composable_template={is_composable_template}"
+        )

         if is_index:
             raw_index = self.client.indices.get(index=index)
@@ -451,15 +519,20 @@ class ElasticsearchSource(StatefulIngestionSourceBase):
                 # This is a duplicate, skip processing it further.
                 return
         else:
-
-
+            raw_index_metadata = self._get_template_metadata(
+                index, is_composable_template
+            )
         collapsed_index_name = collapse_name(
             name=index, collapse_urns=self.source_config.collapse_urns
         )

         # 1. Construct and emit the schemaMetadata aspect
         # 1.1 Generate the schema fields from ES mappings.
-
+        # For composable templates, mappings are under 'template.mappings'
+        if is_composable_template:
+            index_mappings = raw_index_metadata.get("template", {}).get("mappings", {})
+        else:
+            index_mappings = raw_index_metadata.get("mappings", {})
         index_mappings_json_str: str = json.dumps(index_mappings)
         md5_hash = md5(index_mappings_json_str.encode()).hexdigest()
         schema_fields = list(
@@ -517,28 +590,32 @@ class ElasticsearchSource(StatefulIngestionSourceBase):
             ),
         )

-        # 4. Construct and emit properties
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # 4. Construct and emit properties
+        if is_index:
+            custom_properties: Dict[str, str] = {}
+            # Extract properties for indices
+            index_aliases: List[str] = list(
+                raw_index_metadata.get("aliases", {}).keys()
+            )
+            if index_aliases:
+                custom_properties["aliases"] = ",".join(index_aliases)
+            index_patterns: List[str] = raw_index_metadata.get("index_patterns", [])
+            if index_patterns:
+                custom_properties["index_patterns"] = ",".join(index_patterns)
+            index_settings: Dict[str, Any] = raw_index_metadata.get("settings", {}).get(
+                "index", {}
+            )
+            num_shards: str = index_settings.get("number_of_shards", "")
+            if num_shards:
+                custom_properties["num_shards"] = num_shards
+            num_replicas: str = index_settings.get("number_of_replicas", "")
+            if num_replicas:
+                custom_properties["num_replicas"] = num_replicas
+        else:
+            # Extract properties for templates
+            custom_properties = self._extract_template_custom_properties(
+                raw_index_metadata, is_composable_template
+            )

         yield MetadataChangeProposalWrapper(
             entityUrn=dataset_urn,
```
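The legacy and composable template APIs return differently shaped documents: legacy templates keep `aliases`, `settings`, and `mappings` at the top level, while composable templates nest them under a `template` key, with `index_patterns` staying top-level in both. The standalone sketch below uses hypothetical sample payloads and a simplified helper (not the source's actual methods) to mirror the property-extraction branching added above.

```python
from typing import Any, Dict

# Hypothetical payloads shaped like the two template responses handled above.
LEGACY = {  # indices.get_template(name=...)[name]
    "index_patterns": ["logs-*"],
    "aliases": {"logs": {}},
    "settings": {"index": {"number_of_shards": "3", "number_of_replicas": "1"}},
}
COMPOSABLE = {  # indices.get_index_template(...)["index_templates"][0]["index_template"]
    "index_patterns": ["metrics-*"],
    "template": {"aliases": {"metrics": {}}, "settings": {"index": {"number_of_shards": "1"}}},
}


def template_properties(raw: Dict[str, Any], is_composable: bool) -> Dict[str, str]:
    """Flatten aliases, index_patterns, and shard/replica settings into string properties."""
    # Composable templates nest aliases/settings under "template"; legacy ones do not.
    body = raw.get("template", {}) if is_composable else raw
    props: Dict[str, str] = {}
    aliases = list(body.get("aliases", {}))
    if aliases:
        props["aliases"] = ",".join(aliases)
    patterns = raw.get("index_patterns", [])  # top-level in both shapes
    if patterns:
        props["index_patterns"] = ",".join(patterns)
    settings = body.get("settings", {}).get("index", {})
    if settings.get("number_of_shards"):
        props["num_shards"] = settings["number_of_shards"]
    if settings.get("number_of_replicas"):
        props["num_replicas"] = settings["number_of_replicas"]
    return props


print(template_properties(LEGACY, is_composable=False))
# {'aliases': 'logs', 'index_patterns': 'logs-*', 'num_shards': '3', 'num_replicas': '1'}
print(template_properties(COMPOSABLE, is_composable=True))
# {'aliases': 'metrics', 'index_patterns': 'metrics-*', 'num_shards': '1'}
```

`_extract_template_custom_properties` above applies the same branching to the live client responses before the properties are emitted.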
datahub/ingestion/source/snowflake/snowflake_queries.py

```diff
@@ -78,6 +78,7 @@ from datahub.utilities.file_backed_collections import (
     ConnectionWrapper,
     FileBackedList,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer

 logger = logging.getLogger(__name__)
@@ -169,6 +170,10 @@ class SnowflakeQueriesExtractorReport(Report):
     num_stream_queries_observed: int = 0
     num_create_temp_view_queries_observed: int = 0
     num_users: int = 0
+    num_queries_with_empty_column_name: int = 0
+    queries_with_empty_column_name: LossyList[str] = dataclasses.field(
+        default_factory=LossyList
+    )


 @dataclass
@@ -626,9 +631,28 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):

             columns = set()
             for modified_column in obj["columns"]:
-
-
-
+                column_name = modified_column["columnName"]
+                # An empty column name in the audit log would cause an error when creating column URNs.
+                # To avoid this and still extract lineage, the raw query text is parsed as a fallback.
+                if not column_name or not column_name.strip():
+                    query_id = res["query_id"]
+                    self.report.num_queries_with_empty_column_name += 1
+                    self.report.queries_with_empty_column_name.append(query_id)
+                    logger.info(f"Query {query_id} has empty column name in audit log.")
+
+                    return ObservedQuery(
+                        query=query_text,
+                        session_id=res["session_id"],
+                        timestamp=timestamp,
+                        user=user,
+                        default_db=res["default_db"],
+                        default_schema=res["default_schema"],
+                        query_hash=get_query_fingerprint(
+                            query_text, self.identifiers.platform, fast=True
+                        ),
+                        extra_info=extra_info,
+                    )
+                columns.add(self.identifiers.snowflake_identifier(column_name))

             upstreams.append(dataset)
             column_usage[dataset] = columns
```
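The guard above keeps a blank `columnName` in the Snowflake audit log from breaking column URN construction: the extractor abandons audit-log column lineage for that query and returns an `ObservedQuery`, so lineage is still recovered by parsing the raw SQL. A minimal standalone sketch of that fallback decision, using a hypothetical helper rather than DataHub's API:

```python
from typing import Dict, List, Optional, Set


def audit_log_columns(modified_columns: List[Dict[str, str]]) -> Optional[Set[str]]:
    """Return the set of modified column names, or None to signal that the
    audit-log entry is unusable and the raw query text should be parsed instead."""
    columns: Set[str] = set()
    for col in modified_columns:
        name = col.get("columnName", "")
        if not name or not name.strip():
            return None  # empty column name -> fall back to SQL parsing
        columns.add(name.lower())
    return columns


print(audit_log_columns([{"columnName": "ID"}, {"columnName": "NAME"}]))  # {'id', 'name'} (order may vary)
print(audit_log_columns([{"columnName": "ID"}, {"columnName": ""}]))      # None
```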
datahub/ingestion/source/sql_queries.py

```diff
@@ -1,11 +1,13 @@
 import json
 import logging
 import os
-
+import re
+from dataclasses import dataclass, field
 from datetime import datetime
 from functools import partial
-from typing import ClassVar, Iterable, List, Optional, Union
+from typing import ClassVar, Iterable, List, Optional, Union, cast

+import smart_open
 from pydantic import BaseModel, Field, validator

 from datahub.configuration.common import HiddenFromDocs
@@ -36,12 +38,13 @@ from datahub.ingestion.api.source import (
     SourceCapability,
     SourceReport,
 )
-from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+from datahub.ingestion.api.source_helpers import auto_workunit, auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.metadata.urns import CorpUserUrn, DatasetUrn
-from datahub.sql_parsing.schema_resolver import SchemaResolver
+from datahub.sql_parsing.schema_resolver import SchemaResolver, SchemaResolverReport
 from datahub.sql_parsing.sql_parsing_aggregator import (
     KnownQueryLineageInfo,
     ObservedQuery,
@@ -82,6 +85,24 @@ class SqlQueriesSourceConfig(
         None,
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
     )
+    temp_table_patterns: List[str] = Field(
+        description="Regex patterns for temporary tables to filter in lineage ingestion. "
+        "Specify regex to match the entire table name. This is useful for platforms like Athena "
+        "that don't have native temp tables but use naming patterns for fake temp tables.",
+        default=[],
+    )
+
+    enable_lazy_schema_loading: bool = Field(
+        default=True,
+        description="Enable lazy schema loading for better performance. When enabled, schemas are fetched on-demand "
+        "instead of bulk loading all schemas upfront, reducing startup time and memory usage.",
+    )
+
+    # AWS/S3 configuration
+    aws_config: Optional[AwsConnectionConfig] = Field(
+        default=None,
+        description="AWS configuration for S3 access. Required when query_file is an S3 URI (s3://).",
+    )


 @dataclass
@@ -89,8 +110,13 @@ class SqlQueriesSourceReport(SourceReport):
     num_entries_processed: int = 0
     num_entries_failed: int = 0
     num_queries_aggregator_failures: int = 0
+    num_queries_processed_sequential: int = 0
+    num_temp_tables_detected: int = 0
+    temp_table_patterns_used: List[str] = field(default_factory=list)
+    peak_memory_usage_mb: float = 0.0

     sql_aggregator: Optional[SqlAggregatorReport] = None
+    schema_resolver_report: Optional[SchemaResolverReport] = None


 @platform_name("SQL Queries", id="sql-queries")
@@ -115,6 +141,18 @@ class SqlQueriesSource(Source):
     - upstream_tables (optional): string[] - Fallback list of tables the query reads from,
       used if the query can't be parsed.

+    **Lazy Schema Loading**:
+    - Fetches schemas on-demand during query parsing instead of bulk loading all schemas upfront
+    - Caches fetched schemas for future lookups to avoid repeated network requests
+    - Reduces initial startup time and memory usage significantly
+    - Automatically handles large platforms efficiently without memory issues
+
+    **Query Processing**:
+    - Loads the entire query file into memory at once
+    - Processes all queries sequentially before generating metadata work units
+    - Preserves temp table mappings and lineage relationships to ensure consistent lineage tracking
+    - Query deduplication is handled automatically by the SQL parsing aggregator
+
     ### Incremental Lineage
     When `incremental_lineage` is enabled, this source will emit lineage as patches rather than full overwrites.
     This allows you to add lineage edges without removing existing ones, which is useful for:
@@ -124,6 +162,12 @@ class SqlQueriesSource(Source):

     Note: Incremental lineage only applies to UpstreamLineage aspects. Other aspects like queries and usage
     statistics will still be emitted normally.
+
+    ### Temporary Table Support
+    For platforms like Athena that don't have native temporary tables, you can use the `temp_table_patterns`
+    configuration to specify regex patterns that identify fake temporary tables. This allows the source to
+    process these tables like other sources that support native temp tables, enabling proper lineage tracking
+    across temporary table operations.
     """

     schema_resolver: Optional[SchemaResolver]
@@ -141,13 +185,19 @@ class SqlQueriesSource(Source):
         self.report = SqlQueriesSourceReport()

         if self.config.use_schema_resolver:
-            #
-
-
-
+            # Create schema resolver report for tracking
+            self.report.schema_resolver_report = SchemaResolverReport()
+
+            # Use lazy loading - schemas will be fetched on-demand and cached
+            logger.info(
+                "Using lazy schema loading - schemas will be fetched on-demand and cached"
+            )
+            self.schema_resolver = SchemaResolver(
                 platform=self.config.platform,
                 platform_instance=self.config.platform_instance,
                 env=self.config.env,
+                graph=self.graph,
+                report=self.report.schema_resolver_report,
             )
         else:
             self.schema_resolver = None
@@ -156,7 +206,9 @@ class SqlQueriesSource(Source):
             platform=self.config.platform,
             platform_instance=self.config.platform_instance,
             env=self.config.env,
-            schema_resolver=self.schema_resolver
+            schema_resolver=cast(SchemaResolver, self.schema_resolver)
+            if self.schema_resolver
+            else None,
             eager_graph_load=False,
             generate_lineage=True,  # TODO: make this configurable
             generate_queries=True,  # TODO: make this configurable
@@ -165,7 +217,9 @@ class SqlQueriesSource(Source):
             generate_usage_statistics=True,
             generate_operations=True,  # TODO: make this configurable
             usage_config=self.config.usage,
-            is_temp_table=
+            is_temp_table=self.is_temp_table
+            if self.config.temp_table_patterns
+            else None,
             is_allowed_table=None,
             format_queries=False,
         )
@@ -193,20 +247,73 @@ class SqlQueriesSource(Source):
     ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
         logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}")

+        logger.info("Processing all queries in batch mode")
+        yield from self._process_queries_batch()
+
+    def _process_queries_batch(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
+        """Process all queries in memory (original behavior)."""
         with self.report.new_stage("Collecting queries from file"):
             queries = list(self._parse_query_file())
             logger.info(f"Collected {len(queries)} queries for processing")

         with self.report.new_stage("Processing queries through SQL parsing aggregator"):
-
-
+            logger.info("Using sequential processing")
+            self._process_queries_sequential(queries)

         with self.report.new_stage("Generating metadata work units"):
             logger.info("Generating workunits from SQL parsing aggregator")
-            yield from self.aggregator.gen_metadata()
+            yield from auto_workunit(self.aggregator.gen_metadata())

-    def
-        """
+    def _is_s3_uri(self, path: str) -> bool:
+        """Check if the path is an S3 URI."""
+        return path.startswith("s3://")
+
+    def _parse_s3_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse query file from S3 using smart_open."""
+        if not self.config.aws_config:
+            raise ValueError("AWS configuration required for S3 file access")
+
+        logger.info(f"Reading query file from S3: {self.config.query_file}")
+
+        try:
+            # Use smart_open for efficient S3 streaming, similar to S3FileSystem
+            s3_client = self.config.aws_config.get_s3_client()
+
+            with smart_open.open(
+                self.config.query_file, mode="r", transport_params={"client": s3_client}
+            ) as file_stream:
+                for line in file_stream:
+                    if line.strip():
+                        try:
+                            query_dict = json.loads(line, strict=False)
+                            entry = QueryEntry.create(query_dict, config=self.config)
+                            self.report.num_entries_processed += 1
+                            if self.report.num_entries_processed % 1000 == 0:
+                                logger.info(
+                                    f"Processed {self.report.num_entries_processed} query entries from S3"
+                                )
+                            yield entry
+                        except Exception as e:
+                            self.report.num_entries_failed += 1
+                            self.report.warning(
+                                title="Error processing query from S3",
+                                message="Query skipped due to parsing error",
+                                context=line.strip(),
+                                exc=e,
+                            )
+        except Exception as e:
+            self.report.warning(
+                title="Error reading S3 file",
+                message="Failed to read S3 file",
+                context=self.config.query_file,
+                exc=e,
+            )
+            raise
+
+    def _parse_local_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse local query file (existing logic)."""
         with open(self.config.query_file) as f:
             for line in f:
                 try:
@@ -227,6 +334,30 @@ class SqlQueriesSource(Source):
                         exc=e,
                     )

+    def _parse_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse the query file and yield QueryEntry objects."""
+        if self._is_s3_uri(self.config.query_file):
+            yield from self._parse_s3_query_file()
+        else:
+            yield from self._parse_local_query_file()
+
+    def _process_queries_sequential(self, queries: List["QueryEntry"]) -> None:
+        """Process queries sequentially."""
+        total_queries = len(queries)
+        logger.info(f"Processing {total_queries} queries sequentially")
+
+        # Process each query sequentially
+        for i, query_entry in enumerate(queries):
+            self._add_query_to_aggregator(query_entry)
+            self.report.num_queries_processed_sequential += 1
+
+            # Simple progress reporting every 1000 queries
+            if (i + 1) % 1000 == 0:
+                progress_pct = ((i + 1) / total_queries) * 100
+                logger.info(
+                    f"Processed {i + 1}/{total_queries} queries ({progress_pct:.1f}%)"
+                )
+
     def _add_query_to_aggregator(self, query_entry: "QueryEntry") -> None:
         """Add a query to the SQL parsing aggregator."""
         try:
@@ -285,6 +416,24 @@ class SqlQueriesSource(Source):
                 exc=e,
             )

+    def is_temp_table(self, name: str) -> bool:
+        """Check if a table name matches any of the configured temp table patterns."""
+        if not self.config.temp_table_patterns:
+            return False
+
+        try:
+            for pattern in self.config.temp_table_patterns:
+                if re.match(pattern, name, flags=re.IGNORECASE):
+                    logger.debug(
+                        f"Table '{name}' matched temp table pattern: {pattern}"
+                    )
+                    self.report.num_temp_tables_detected += 1
+                    return True
+        except re.error as e:
+            logger.warning(f"Invalid regex pattern '{pattern}': {e}")
+
+        return False
+

 class QueryEntry(BaseModel):
     query: str
```
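Of the new `sql-queries` options above, `temp_table_patterns` is the one whose semantics are easiest to get wrong: patterns are applied with `re.match` and `re.IGNORECASE`, so they are anchored at the start of the table name. A small standalone sketch with hypothetical patterns and table names (the source ships no defaults):

```python
import re
from typing import List

# Hypothetical Athena-style "fake temp table" patterns; not defaults shipped by the source.
temp_table_patterns: List[str] = [r".*\.tmp_.*", r".*_staging_\d{8}$"]


def is_temp_table(name: str, patterns: List[str]) -> bool:
    """True if the table name matches any configured temp-table regex (case-insensitive)."""
    return any(re.match(pattern, name, flags=re.IGNORECASE) for pattern in patterns)


print(is_temp_table("analytics.tmp_orders_20240101", temp_table_patterns))      # True
print(is_temp_table("analytics.orders_staging_20240101", temp_table_patterns))  # True
print(is_temp_table("analytics.orders", temp_table_patterns))                   # False
```

When `query_file` points at an `s3://` URI and `aws_config` is set, the source streams the same newline-delimited JSON entries via `smart_open` instead of opening a local file.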
datahub/metadata/_internal_schema_classes.py

```diff
@@ -511,6 +511,7 @@ class AssertionInfoClass(_Aspect):
         source: Union[None, "AssertionSourceClass"]=None,
         lastUpdated: Union[None, "AuditStampClass"]=None,
         description: Union[None, str]=None,
+        note: Union[None, "AssertionNoteClass"]=None,
     ):
         super().__init__()

@@ -531,6 +532,7 @@ class AssertionInfoClass(_Aspect):
         self.source = source
         self.lastUpdated = lastUpdated
         self.description = description
+        self.note = note

     def _restore_defaults(self) -> None:
         self.customProperties = dict()
@@ -546,6 +548,7 @@ class AssertionInfoClass(_Aspect):
         self.source = self.RECORD_SCHEMA.fields_dict["source"].default
         self.lastUpdated = self.RECORD_SCHEMA.fields_dict["lastUpdated"].default
         self.description = self.RECORD_SCHEMA.fields_dict["description"].default
+        self.note = self.RECORD_SCHEMA.fields_dict["note"].default


     @property
@@ -570,7 +573,7 @@ class AssertionInfoClass(_Aspect):

     @property
     def type(self) -> Union[str, "AssertionTypeClass"]:
-        """Type of assertion.
+        """Type of assertion."""
         return self._inner_dict.get('type')  # type: ignore

     @type.setter
@@ -682,6 +685,55 @@ class AssertionInfoClass(_Aspect):
         self._inner_dict['description'] = value


+    @property
+    def note(self) -> Union[None, "AssertionNoteClass"]:
+        """An optional note to give technical owners more context about the assertion, and how to troubleshoot it.
+        The UI will render this in markdown format."""
+        return self._inner_dict.get('note')  # type: ignore
+
+    @note.setter
+    def note(self, value: Union[None, "AssertionNoteClass"]) -> None:
+        self._inner_dict['note'] = value
+
+
+class AssertionNoteClass(DictWrapper):
+    # No docs available.
+
+    RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.assertion.AssertionNote")
+    def __init__(self,
+        content: str,
+        lastModified: "AuditStampClass",
+    ):
+        super().__init__()
+
+        self.content = content
+        self.lastModified = lastModified
+
+    def _restore_defaults(self) -> None:
+        self.content = str()
+        self.lastModified = AuditStampClass._construct_with_defaults()
+
+
+    @property
+    def content(self) -> str:
+        """The note to give technical owners more context about the assertion, and how to troubleshoot it."""
+        return self._inner_dict.get('content')  # type: ignore
+
+    @content.setter
+    def content(self, value: str) -> None:
+        self._inner_dict['content'] = value
+
+
+    @property
+    def lastModified(self) -> "AuditStampClass":
+        """The time at which the note was last modified."""
+        return self._inner_dict.get('lastModified')  # type: ignore
+
+    @lastModified.setter
+    def lastModified(self, value: "AuditStampClass") -> None:
+        self._inner_dict['lastModified'] = value
+
+
 class AssertionResultClass(DictWrapper):
     """The result of running an assertion"""

@@ -1337,7 +1389,7 @@ class AssertionStdParametersClass(DictWrapper):


 class AssertionTypeClass(object):
-
+    """Type of assertion. Assertion types can evolve to span Datasets, Flows (Pipelines), Models, Features etc."""

     DATASET = "DATASET"
     """A single-dataset assertion.
@@ -12623,6 +12675,9 @@ class NotificationSinkTypeClass(object):
     EMAIL = "EMAIL"
     """Email target type."""

+    TEAMS = "TEAMS"
+    """Microsoft Teams target type."""
+


 class EmailNotificationSettingsClass(DictWrapper):
@@ -20439,6 +20494,9 @@ class DataHubPageModuleTypeClass(object):
     PLATFORMS = "PLATFORMS"
     """Module displaying the platforms in an instance"""

+    UNKNOWN = "UNKNOWN"
+    """Unknown module type - this can occur with corrupted data or rolling back to versions without new modules"""
+


 class DataHubPageModuleVisibilityClass(DictWrapper):
@@ -27742,6 +27800,7 @@ __SCHEMA_TYPES = {
     'com.linkedin.pegasus2avro.assertion.AssertionActionType': AssertionActionTypeClass,
     'com.linkedin.pegasus2avro.assertion.AssertionActions': AssertionActionsClass,
     'com.linkedin.pegasus2avro.assertion.AssertionInfo': AssertionInfoClass,
+    'com.linkedin.pegasus2avro.assertion.AssertionNote': AssertionNoteClass,
     'com.linkedin.pegasus2avro.assertion.AssertionResult': AssertionResultClass,
     'com.linkedin.pegasus2avro.assertion.AssertionResultError': AssertionResultErrorClass,
     'com.linkedin.pegasus2avro.assertion.AssertionResultErrorType': AssertionResultErrorTypeClass,
@@ -28268,6 +28327,7 @@ __SCHEMA_TYPES = {
     'AssertionActionType': AssertionActionTypeClass,
     'AssertionActions': AssertionActionsClass,
     'AssertionInfo': AssertionInfoClass,
+    'AssertionNote': AssertionNoteClass,
     'AssertionResult': AssertionResultClass,
     'AssertionResultError': AssertionResultErrorClass,
     'AssertionResultErrorType': AssertionResultErrorTypeClass,
```
datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py

```diff
@@ -11,6 +11,7 @@ from .....schema_classes import AssertionActionClass
 from .....schema_classes import AssertionActionTypeClass
 from .....schema_classes import AssertionActionsClass
 from .....schema_classes import AssertionInfoClass
+from .....schema_classes import AssertionNoteClass
 from .....schema_classes import AssertionResultClass
 from .....schema_classes import AssertionResultErrorClass
 from .....schema_classes import AssertionResultErrorTypeClass
@@ -64,6 +65,7 @@ AssertionAction = AssertionActionClass
 AssertionActionType = AssertionActionTypeClass
 AssertionActions = AssertionActionsClass
 AssertionInfo = AssertionInfoClass
+AssertionNote = AssertionNoteClass
 AssertionResult = AssertionResultClass
 AssertionResultError = AssertionResultErrorClass
 AssertionResultErrorType = AssertionResultErrorTypeClass
```