acryl-datahub 1.3.0.1rc8__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Files changed (74)
  1. {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/METADATA +2654 -2647
  2. {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/RECORD +74 -74
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/graph/client.py +5 -1
  5. datahub/ingestion/run/pipeline.py +1 -0
  6. datahub/ingestion/source/dremio/dremio_api.py +212 -78
  7. datahub/ingestion/source/dremio/dremio_entities.py +55 -39
  8. datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
  9. datahub/ingestion/source/dremio/dremio_source.py +9 -11
  10. datahub/ingestion/source/elastic_search.py +106 -29
  11. datahub/ingestion/source/snowflake/snowflake_queries.py +27 -3
  12. datahub/ingestion/source/sql_queries.py +164 -15
  13. datahub/metadata/_internal_schema_classes.py +62 -2
  14. datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
  15. datahub/metadata/schema.avsc +264 -89
  16. datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
  17. datahub/metadata/schemas/AssertionInfo.avsc +48 -5
  18. datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
  19. datahub/metadata/schemas/ChartInfo.avsc +12 -5
  20. datahub/metadata/schemas/ContainerProperties.avsc +12 -5
  21. datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
  22. datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
  23. datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
  24. datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
  25. datahub/metadata/schemas/DashboardInfo.avsc +16 -4
  26. datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
  27. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
  28. datahub/metadata/schemas/DataJobInfo.avsc +9 -4
  29. datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
  30. datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
  31. datahub/metadata/schemas/DataProductProperties.avsc +5 -2
  32. datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
  33. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  34. datahub/metadata/schemas/DatasetProperties.avsc +12 -5
  35. datahub/metadata/schemas/DomainProperties.avsc +7 -3
  36. datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
  37. datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
  38. datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
  39. datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
  40. datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
  41. datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
  42. datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
  43. datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
  44. datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
  45. datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
  46. datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
  47. datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
  48. datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
  49. datahub/metadata/schemas/GlobalTags.avsc +3 -2
  50. datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
  51. datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
  52. datahub/metadata/schemas/InputFields.avsc +3 -2
  53. datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
  54. datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
  55. datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
  56. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  57. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  58. datahub/metadata/schemas/MLModelProperties.avsc +4 -2
  59. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
  60. datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
  61. datahub/metadata/schemas/NotebookInfo.avsc +5 -2
  62. datahub/metadata/schemas/Ownership.avsc +3 -2
  63. datahub/metadata/schemas/RoleProperties.avsc +3 -1
  64. datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
  65. datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
  66. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +9 -3
  67. datahub/metadata/schemas/TagProperties.avsc +3 -1
  68. datahub/metadata/schemas/TestInfo.avsc +2 -1
  69. datahub/sql_parsing/schema_resolver.py +29 -0
  70. datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
  71. {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/WHEEL +0 -0
  72. {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/entry_points.txt +0 -0
  73. {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/licenses/LICENSE +0 -0
  74. {acryl_datahub-1.3.0.1rc8.dist-info → acryl_datahub-1.3.1.dist-info}/top_level.txt +0 -0
@@ -407,12 +407,78 @@ class ElasticsearchSource(StatefulIngestionSourceBase):
         for mcp in self._get_data_stream_index_count_mcps():
             yield mcp.as_workunit()
         if self.source_config.ingest_index_templates:
-            templates = self.client.indices.get_template()
-            for template in templates:
+            # Fetch legacy index templates
+            legacy_templates = self.client.indices.get_template()
+            for template in legacy_templates:
                 if self.source_config.index_template_pattern.allowed(template):
                     for mcp in self._extract_mcps(template, is_index=False):
                         yield mcp.as_workunit()

+            # Fetch composable index templates (ES 7.8+ / OpenSearch)
+            try:
+                composable_templates = self.client.indices.get_index_template()
+                for template_info in composable_templates.get("index_templates", []):
+                    template = template_info.get("name")
+                    if template and self.source_config.index_template_pattern.allowed(
+                        template
+                    ):
+                        for mcp in self._extract_mcps(
+                            template, is_index=False, is_composable_template=True
+                        ):
+                            yield mcp.as_workunit()
+            except Exception as e:
+                logger.warning(f"Unable to fetch composable index templates: {e}")
+
+    def _get_template_metadata(
+        self, template_name: str, is_composable: bool
+    ) -> Dict[str, Any]:
+        """Fetch template metadata from Elasticsearch/OpenSearch."""
+        if is_composable:
+            # For composable templates (ES 7.8+ / OpenSearch)
+            raw_response = self.client.indices.get_index_template(name=template_name)
+            template_data = raw_response.get("index_templates", [{}])[0]
+            return template_data.get("index_template", {})
+        else:
+            # For legacy templates
+            raw_response = self.client.indices.get_template(name=template_name)
+            return raw_response[template_name]
+
+    def _extract_template_custom_properties(
+        self, raw_metadata: Dict[str, Any], is_composable: bool
+    ) -> Dict[str, str]:
+        """Extract custom properties from template metadata."""
+        custom_properties: Dict[str, str] = {}
+
+        # Extract aliases
+        if is_composable:
+            aliases_dict = raw_metadata.get("template", {}).get("aliases", {})
+        else:
+            aliases_dict = raw_metadata.get("aliases", {})
+        index_aliases: List[str] = list(aliases_dict.keys()) if aliases_dict else []
+        if index_aliases:
+            custom_properties["aliases"] = ",".join(index_aliases)
+
+        # Extract index_patterns
+        index_patterns: List[str] = raw_metadata.get("index_patterns", [])
+        if index_patterns:
+            custom_properties["index_patterns"] = ",".join(index_patterns)
+
+        # Extract settings
+        if is_composable:
+            index_settings: Dict[str, Any] = (
+                raw_metadata.get("template", {}).get("settings", {}).get("index", {})
+            )
+        else:
+            index_settings = raw_metadata.get("settings", {}).get("index", {})
+        num_shards: str = index_settings.get("number_of_shards", "")
+        if num_shards:
+            custom_properties["num_shards"] = num_shards
+        num_replicas: str = index_settings.get("number_of_replicas", "")
+        if num_replicas:
+            custom_properties["num_replicas"] = num_replicas
+
+        return custom_properties
+
     def _get_data_stream_index_count_mcps(
         self,
     ) -> Iterable[MetadataChangeProposalWrapper]:
@@ -434,9 +500,11 @@ class ElasticsearchSource(StatefulIngestionSourceBase):
         )

     def _extract_mcps(
-        self, index: str, is_index: bool = True
+        self, index: str, is_index: bool = True, is_composable_template: bool = False
     ) -> Iterable[MetadataChangeProposalWrapper]:
-        logger.debug(f"index='{index}', is_index={is_index}")
+        logger.debug(
+            f"index='{index}', is_index={is_index}, is_composable_template={is_composable_template}"
+        )

         if is_index:
             raw_index = self.client.indices.get(index=index)
@@ -451,15 +519,20 @@ class ElasticsearchSource(StatefulIngestionSourceBase):
                 # This is a duplicate, skip processing it further.
                 return
         else:
-            raw_index = self.client.indices.get_template(name=index)
-            raw_index_metadata = raw_index[index]
+            raw_index_metadata = self._get_template_metadata(
+                index, is_composable_template
+            )
         collapsed_index_name = collapse_name(
             name=index, collapse_urns=self.source_config.collapse_urns
         )

         # 1. Construct and emit the schemaMetadata aspect
         # 1.1 Generate the schema fields from ES mappings.
-        index_mappings = raw_index_metadata["mappings"]
+        # For composable templates, mappings are under 'template.mappings'
+        if is_composable_template:
+            index_mappings = raw_index_metadata.get("template", {}).get("mappings", {})
+        else:
+            index_mappings = raw_index_metadata.get("mappings", {})
         index_mappings_json_str: str = json.dumps(index_mappings)
         md5_hash = md5(index_mappings_json_str.encode()).hexdigest()
         schema_fields = list(
@@ -517,28 +590,32 @@ class ElasticsearchSource(StatefulIngestionSourceBase):
             ),
         )

-        # 4. Construct and emit properties if needed. Will attempt to get the following properties
-        custom_properties: Dict[str, str] = {}
-        # 4.1 aliases
-        index_aliases: List[str] = raw_index_metadata.get("aliases", {}).keys()
-        if index_aliases:
-            custom_properties["aliases"] = ",".join(index_aliases)
-        # 4.2 index_patterns
-        index_patterns: List[str] = raw_index_metadata.get("index_patterns", [])
-        if index_patterns:
-            custom_properties["index_patterns"] = ",".join(index_patterns)
-
-        # 4.3 number_of_shards
-        index_settings: Dict[str, Any] = raw_index_metadata.get("settings", {}).get(
-            "index", {}
-        )
-        num_shards: str = index_settings.get("number_of_shards", "")
-        if num_shards:
-            custom_properties["num_shards"] = num_shards
-        # 4.4 number_of_replicas
-        num_replicas: str = index_settings.get("number_of_replicas", "")
-        if num_replicas:
-            custom_properties["num_replicas"] = num_replicas
+        # 4. Construct and emit properties
+        if is_index:
+            custom_properties: Dict[str, str] = {}
+            # Extract properties for indices
+            index_aliases: List[str] = list(
+                raw_index_metadata.get("aliases", {}).keys()
+            )
+            if index_aliases:
+                custom_properties["aliases"] = ",".join(index_aliases)
+            index_patterns: List[str] = raw_index_metadata.get("index_patterns", [])
+            if index_patterns:
+                custom_properties["index_patterns"] = ",".join(index_patterns)
+            index_settings: Dict[str, Any] = raw_index_metadata.get("settings", {}).get(
+                "index", {}
+            )
+            num_shards: str = index_settings.get("number_of_shards", "")
+            if num_shards:
+                custom_properties["num_shards"] = num_shards
+            num_replicas: str = index_settings.get("number_of_replicas", "")
+            if num_replicas:
+                custom_properties["num_replicas"] = num_replicas
+        else:
+            # Extract properties for templates
+            custom_properties = self._extract_template_custom_properties(
+                raw_index_metadata, is_composable_template
+            )

         yield MetadataChangeProposalWrapper(
             entityUrn=dataset_urn,
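
Note on the template handling above: the legacy and composable template APIs return differently shaped payloads, which is what `_get_template_metadata` and `_extract_template_custom_properties` normalize. Below is a minimal, standalone sketch of that normalization. The sample payloads are hypothetical and only mirror the access paths used in the diff; they were not captured from a live cluster.

```python
from typing import Any, Dict

# Hypothetical payload shaped like GET _template/<name> (legacy templates).
legacy_response: Dict[str, Any] = {
    "logs-template": {
        "index_patterns": ["logs-*"],
        "aliases": {"logs": {}},
        "settings": {"index": {"number_of_shards": "3", "number_of_replicas": "1"}},
        "mappings": {"properties": {"message": {"type": "text"}}},
    }
}

# Hypothetical payload shaped like GET _index_template/<name> (composable templates).
composable_response: Dict[str, Any] = {
    "index_templates": [
        {
            "name": "logs-template",
            "index_template": {
                "index_patterns": ["logs-*"],
                "template": {
                    "aliases": {"logs": {}},
                    "settings": {"index": {"number_of_shards": "3"}},
                    "mappings": {"properties": {"message": {"type": "text"}}},
                },
            },
        }
    ]
}


def template_mappings(raw: Dict[str, Any], is_composable: bool) -> Dict[str, Any]:
    # Mirrors the branching added to _extract_mcps: composable templates nest
    # mappings under "template.mappings", legacy templates keep them top-level.
    if is_composable:
        return raw.get("template", {}).get("mappings", {})
    return raw.get("mappings", {})


legacy_meta = legacy_response["logs-template"]
composable_meta = composable_response["index_templates"][0]["index_template"]
assert template_mappings(legacy_meta, False) == template_mappings(composable_meta, True)
```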
@@ -78,6 +78,7 @@ from datahub.utilities.file_backed_collections import (
     ConnectionWrapper,
     FileBackedList,
 )
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer

 logger = logging.getLogger(__name__)
@@ -169,6 +170,10 @@ class SnowflakeQueriesExtractorReport(Report):
     num_stream_queries_observed: int = 0
     num_create_temp_view_queries_observed: int = 0
     num_users: int = 0
+    num_queries_with_empty_column_name: int = 0
+    queries_with_empty_column_name: LossyList[str] = dataclasses.field(
+        default_factory=LossyList
+    )


 @dataclass
@@ -626,9 +631,28 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):

            columns = set()
            for modified_column in obj["columns"]:
-                columns.add(
-                    self.identifiers.snowflake_identifier(modified_column["columnName"])
-                )
+                column_name = modified_column["columnName"]
+                # An empty column name in the audit log would cause an error when creating column URNs.
+                # To avoid this and still extract lineage, the raw query text is parsed as a fallback.
+                if not column_name or not column_name.strip():
+                    query_id = res["query_id"]
+                    self.report.num_queries_with_empty_column_name += 1
+                    self.report.queries_with_empty_column_name.append(query_id)
+                    logger.info(f"Query {query_id} has empty column name in audit log.")
+
+                    return ObservedQuery(
+                        query=query_text,
+                        session_id=res["session_id"],
+                        timestamp=timestamp,
+                        user=user,
+                        default_db=res["default_db"],
+                        default_schema=res["default_schema"],
+                        query_hash=get_query_fingerprint(
+                            query_text, self.identifiers.platform, fast=True
+                        ),
+                        extra_info=extra_info,
+                    )
+                columns.add(self.identifiers.snowflake_identifier(column_name))

            upstreams.append(dataset)
            column_usage[dataset] = columns
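
The guard above treats a missing, empty, or whitespace-only `columnName` the same way and falls back to letting the SQL parser derive lineage from the raw query text. A tiny illustration of just that predicate (the sample values are made up, not taken from a real ACCESS_HISTORY row):

```python
def needs_query_text_fallback(column_name: str) -> bool:
    # Same predicate as the new audit-log handling: empty or whitespace-only
    # column names cannot be turned into valid column URNs.
    return not column_name or not column_name.strip()


assert needs_query_text_fallback("")
assert needs_query_text_fallback("   ")
assert not needs_query_text_fallback("ORDER_ID")
```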
@@ -1,11 +1,13 @@
 import json
 import logging
 import os
-from dataclasses import dataclass
+import re
+from dataclasses import dataclass, field
 from datetime import datetime
 from functools import partial
-from typing import ClassVar, Iterable, List, Optional, Union
+from typing import ClassVar, Iterable, List, Optional, Union, cast

+import smart_open
 from pydantic import BaseModel, Field, validator

 from datahub.configuration.common import HiddenFromDocs
@@ -36,12 +38,13 @@ from datahub.ingestion.api.source import (
     SourceCapability,
     SourceReport,
 )
-from datahub.ingestion.api.source_helpers import auto_workunit_reporter
+from datahub.ingestion.api.source_helpers import auto_workunit, auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph
+from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.metadata.urns import CorpUserUrn, DatasetUrn
-from datahub.sql_parsing.schema_resolver import SchemaResolver
+from datahub.sql_parsing.schema_resolver import SchemaResolver, SchemaResolverReport
 from datahub.sql_parsing.sql_parsing_aggregator import (
     KnownQueryLineageInfo,
     ObservedQuery,
@@ -82,6 +85,24 @@ class SqlQueriesSourceConfig(
        None,
        description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
    )
+    temp_table_patterns: List[str] = Field(
+        description="Regex patterns for temporary tables to filter in lineage ingestion. "
+        "Specify regex to match the entire table name. This is useful for platforms like Athena "
+        "that don't have native temp tables but use naming patterns for fake temp tables.",
+        default=[],
+    )
+
+    enable_lazy_schema_loading: bool = Field(
+        default=True,
+        description="Enable lazy schema loading for better performance. When enabled, schemas are fetched on-demand "
+        "instead of bulk loading all schemas upfront, reducing startup time and memory usage.",
+    )
+
+    # AWS/S3 configuration
+    aws_config: Optional[AwsConnectionConfig] = Field(
+        default=None,
+        description="AWS configuration for S3 access. Required when query_file is an S3 URI (s3://).",
+    )


 @dataclass
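
For reference, a recipe exercising the new config fields might look roughly like the sketch below, driven through the programmatic pipeline API. This is a hedged sketch, not taken from the package docs: the `sql-queries` source id comes from the `@platform_name` decorator later in this diff, while the sink, server address, file path, and the `aws_region` field of `AwsConnectionConfig` are assumptions.

```python
from datahub.ingestion.run.pipeline import Pipeline

# Sketch of a recipe using the new knobs; paths, URLs, and credentials are placeholders.
pipeline = Pipeline.create(
    {
        "datahub_api": {"server": "http://localhost:8080"},  # used for schema resolution
        "source": {
            "type": "sql-queries",
            "config": {
                "platform": "athena",
                "query_file": "s3://my-bucket/queries.json",  # hypothetical S3 object
                "aws_config": {"aws_region": "us-east-1"},  # needed for s3:// query files
                "temp_table_patterns": [r".*\.tmp_.*"],  # example Athena-style temp tables
                "enable_lazy_schema_loading": True,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
```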
@@ -89,8 +110,13 @@ class SqlQueriesSourceReport(SourceReport):
     num_entries_processed: int = 0
     num_entries_failed: int = 0
     num_queries_aggregator_failures: int = 0
+    num_queries_processed_sequential: int = 0
+    num_temp_tables_detected: int = 0
+    temp_table_patterns_used: List[str] = field(default_factory=list)
+    peak_memory_usage_mb: float = 0.0

     sql_aggregator: Optional[SqlAggregatorReport] = None
+    schema_resolver_report: Optional[SchemaResolverReport] = None


 @platform_name("SQL Queries", id="sql-queries")
@@ -115,6 +141,18 @@ class SqlQueriesSource(Source):
    - upstream_tables (optional): string[] - Fallback list of tables the query reads from,
      used if the query can't be parsed.

+    **Lazy Schema Loading**:
+    - Fetches schemas on-demand during query parsing instead of bulk loading all schemas upfront
+    - Caches fetched schemas for future lookups to avoid repeated network requests
+    - Reduces initial startup time and memory usage significantly
+    - Automatically handles large platforms efficiently without memory issues
+
+    **Query Processing**:
+    - Loads the entire query file into memory at once
+    - Processes all queries sequentially before generating metadata work units
+    - Preserves temp table mappings and lineage relationships to ensure consistent lineage tracking
+    - Query deduplication is handled automatically by the SQL parsing aggregator
+
    ### Incremental Lineage
    When `incremental_lineage` is enabled, this source will emit lineage as patches rather than full overwrites.
    This allows you to add lineage edges without removing existing ones, which is useful for:
@@ -124,6 +162,12 @@ class SqlQueriesSource(Source):

    Note: Incremental lineage only applies to UpstreamLineage aspects. Other aspects like queries and usage
    statistics will still be emitted normally.
+
+    ### Temporary Table Support
+    For platforms like Athena that don't have native temporary tables, you can use the `temp_table_patterns`
+    configuration to specify regex patterns that identify fake temporary tables. This allows the source to
+    process these tables like other sources that support native temp tables, enabling proper lineage tracking
+    across temporary table operations.
    """

    schema_resolver: Optional[SchemaResolver]
@@ -141,13 +185,19 @@ class SqlQueriesSource(Source):
         self.report = SqlQueriesSourceReport()

         if self.config.use_schema_resolver:
-            # TODO: `initialize_schema_resolver_from_datahub` does a bulk initialization by fetching all schemas
-            # for the given platform, platform instance, and env. Instead this should be configurable:
-            # bulk initialization vs lazy on-demand schema fetching.
-            self.schema_resolver = self.graph.initialize_schema_resolver_from_datahub(
+            # Create schema resolver report for tracking
+            self.report.schema_resolver_report = SchemaResolverReport()
+
+            # Use lazy loading - schemas will be fetched on-demand and cached
+            logger.info(
+                "Using lazy schema loading - schemas will be fetched on-demand and cached"
+            )
+            self.schema_resolver = SchemaResolver(
                 platform=self.config.platform,
                 platform_instance=self.config.platform_instance,
                 env=self.config.env,
+                graph=self.graph,
+                report=self.report.schema_resolver_report,
             )
         else:
             self.schema_resolver = None
@@ -156,7 +206,9 @@ class SqlQueriesSource(Source):
             platform=self.config.platform,
             platform_instance=self.config.platform_instance,
             env=self.config.env,
-            schema_resolver=self.schema_resolver,
+            schema_resolver=cast(SchemaResolver, self.schema_resolver)
+            if self.schema_resolver
+            else None,
             eager_graph_load=False,
             generate_lineage=True,  # TODO: make this configurable
             generate_queries=True,  # TODO: make this configurable
@@ -165,7 +217,9 @@ class SqlQueriesSource(Source):
             generate_usage_statistics=True,
             generate_operations=True,  # TODO: make this configurable
             usage_config=self.config.usage,
-            is_temp_table=None,
+            is_temp_table=self.is_temp_table
+            if self.config.temp_table_patterns
+            else None,
             is_allowed_table=None,
             format_queries=False,
         )
@@ -193,20 +247,73 @@ class SqlQueriesSource(Source):
     ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
         logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}")

+        logger.info("Processing all queries in batch mode")
+        yield from self._process_queries_batch()
+
+    def _process_queries_batch(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
+        """Process all queries in memory (original behavior)."""
         with self.report.new_stage("Collecting queries from file"):
             queries = list(self._parse_query_file())
             logger.info(f"Collected {len(queries)} queries for processing")

         with self.report.new_stage("Processing queries through SQL parsing aggregator"):
-            for query_entry in queries:
-                self._add_query_to_aggregator(query_entry)
+            logger.info("Using sequential processing")
+            self._process_queries_sequential(queries)

         with self.report.new_stage("Generating metadata work units"):
             logger.info("Generating workunits from SQL parsing aggregator")
-            yield from self.aggregator.gen_metadata()
+            yield from auto_workunit(self.aggregator.gen_metadata())

-    def _parse_query_file(self) -> Iterable["QueryEntry"]:
-        """Parse the query file and yield QueryEntry objects."""
+    def _is_s3_uri(self, path: str) -> bool:
+        """Check if the path is an S3 URI."""
+        return path.startswith("s3://")
+
+    def _parse_s3_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse query file from S3 using smart_open."""
+        if not self.config.aws_config:
+            raise ValueError("AWS configuration required for S3 file access")
+
+        logger.info(f"Reading query file from S3: {self.config.query_file}")
+
+        try:
+            # Use smart_open for efficient S3 streaming, similar to S3FileSystem
+            s3_client = self.config.aws_config.get_s3_client()
+
+            with smart_open.open(
+                self.config.query_file, mode="r", transport_params={"client": s3_client}
+            ) as file_stream:
+                for line in file_stream:
+                    if line.strip():
+                        try:
+                            query_dict = json.loads(line, strict=False)
+                            entry = QueryEntry.create(query_dict, config=self.config)
+                            self.report.num_entries_processed += 1
+                            if self.report.num_entries_processed % 1000 == 0:
+                                logger.info(
+                                    f"Processed {self.report.num_entries_processed} query entries from S3"
+                                )
+                            yield entry
+                        except Exception as e:
+                            self.report.num_entries_failed += 1
+                            self.report.warning(
+                                title="Error processing query from S3",
+                                message="Query skipped due to parsing error",
+                                context=line.strip(),
+                                exc=e,
+                            )
+        except Exception as e:
+            self.report.warning(
+                title="Error reading S3 file",
+                message="Failed to read S3 file",
+                context=self.config.query_file,
+                exc=e,
+            )
+            raise
+
+    def _parse_local_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse local query file (existing logic)."""
         with open(self.config.query_file) as f:
             for line in f:
                 try:
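
Whether read locally or streamed from S3, the query file is newline-delimited JSON, one entry per line, following the entry-format list in the class docstring above (`query` plus the optional fields such as `upstream_tables`). A small sketch that writes such a file; the field names beyond `query` and `upstream_tables` and the sample values are illustrative assumptions, not copied from the package docs.

```python
import json
import tempfile

# Illustrative query entries; only "query" is required per the docstring above.
entries = [
    {
        "query": "INSERT INTO db.sales_agg SELECT region, sum(amount) FROM db.sales GROUP BY region",
        "timestamp": 1718000000,
        "user": "urn:li:corpuser:etl_service",
        "downstream_tables": ["db.sales_agg"],
        "upstream_tables": ["db.sales"],
    },
    {"query": "SELECT count(*) FROM db.sales"},
]

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")  # one JSON object per line
    print(f"query_file written to {f.name}")
```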
@@ -227,6 +334,30 @@ class SqlQueriesSource(Source):
                         exc=e,
                     )

+    def _parse_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse the query file and yield QueryEntry objects."""
+        if self._is_s3_uri(self.config.query_file):
+            yield from self._parse_s3_query_file()
+        else:
+            yield from self._parse_local_query_file()
+
+    def _process_queries_sequential(self, queries: List["QueryEntry"]) -> None:
+        """Process queries sequentially."""
+        total_queries = len(queries)
+        logger.info(f"Processing {total_queries} queries sequentially")
+
+        # Process each query sequentially
+        for i, query_entry in enumerate(queries):
+            self._add_query_to_aggregator(query_entry)
+            self.report.num_queries_processed_sequential += 1
+
+            # Simple progress reporting every 1000 queries
+            if (i + 1) % 1000 == 0:
+                progress_pct = ((i + 1) / total_queries) * 100
+                logger.info(
+                    f"Processed {i + 1}/{total_queries} queries ({progress_pct:.1f}%)"
+                )
+
     def _add_query_to_aggregator(self, query_entry: "QueryEntry") -> None:
         """Add a query to the SQL parsing aggregator."""
         try:
@@ -285,6 +416,24 @@ class SqlQueriesSource(Source):
                 exc=e,
             )

+    def is_temp_table(self, name: str) -> bool:
+        """Check if a table name matches any of the configured temp table patterns."""
+        if not self.config.temp_table_patterns:
+            return False
+
+        try:
+            for pattern in self.config.temp_table_patterns:
+                if re.match(pattern, name, flags=re.IGNORECASE):
+                    logger.debug(
+                        f"Table '{name}' matched temp table pattern: {pattern}"
+                    )
+                    self.report.num_temp_tables_detected += 1
+                    return True
+        except re.error as e:
+            logger.warning(f"Invalid regex pattern '{pattern}': {e}")
+
+        return False
+

 class QueryEntry(BaseModel):
     query: str
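
`is_temp_table` applies each pattern with `re.match` and `re.IGNORECASE`, so a pattern must cover the table name from its first character. A quick check with a hypothetical Athena-style staging-table pattern (the pattern and names are examples, not defaults shipped by the source):

```python
import re

# Hypothetical pattern for Athena "fake" temp tables, e.g. analytics.tmp_orders_20240101
temp_table_patterns = [r".*\.tmp_.*"]


def is_temp_table(name: str) -> bool:
    # Mirrors the matching logic of the new SqlQueriesSource.is_temp_table method.
    return any(
        re.match(pattern, name, flags=re.IGNORECASE) for pattern in temp_table_patterns
    )


assert is_temp_table("analytics.TMP_orders_20240101")
assert not is_temp_table("analytics.orders")
```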
@@ -511,6 +511,7 @@ class AssertionInfoClass(_Aspect):
        source: Union[None, "AssertionSourceClass"]=None,
        lastUpdated: Union[None, "AuditStampClass"]=None,
        description: Union[None, str]=None,
+        note: Union[None, "AssertionNoteClass"]=None,
    ):
        super().__init__()

@@ -531,6 +532,7 @@ class AssertionInfoClass(_Aspect):
         self.source = source
         self.lastUpdated = lastUpdated
         self.description = description
+        self.note = note

     def _restore_defaults(self) -> None:
         self.customProperties = dict()
@@ -546,6 +548,7 @@ class AssertionInfoClass(_Aspect):
         self.source = self.RECORD_SCHEMA.fields_dict["source"].default
         self.lastUpdated = self.RECORD_SCHEMA.fields_dict["lastUpdated"].default
         self.description = self.RECORD_SCHEMA.fields_dict["description"].default
+        self.note = self.RECORD_SCHEMA.fields_dict["note"].default


     @property
@@ -570,7 +573,7 @@ class AssertionInfoClass(_Aspect):

     @property
     def type(self) -> Union[str, "AssertionTypeClass"]:
-        """Type of assertion. Assertion types can evolve to span Datasets, Flows (Pipelines), Models, Features etc."""
+        """Type of assertion."""
         return self._inner_dict.get('type')  # type: ignore

     @type.setter
@@ -682,6 +685,55 @@ class AssertionInfoClass(_Aspect):
         self._inner_dict['description'] = value


+    @property
+    def note(self) -> Union[None, "AssertionNoteClass"]:
+        """An optional note to give technical owners more context about the assertion, and how to troubleshoot it.
+        The UI will render this in markdown format."""
+        return self._inner_dict.get('note')  # type: ignore
+
+    @note.setter
+    def note(self, value: Union[None, "AssertionNoteClass"]) -> None:
+        self._inner_dict['note'] = value
+
+
+class AssertionNoteClass(DictWrapper):
+    # No docs available.
+
+    RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.assertion.AssertionNote")
+    def __init__(self,
+        content: str,
+        lastModified: "AuditStampClass",
+    ):
+        super().__init__()
+
+        self.content = content
+        self.lastModified = lastModified
+
+    def _restore_defaults(self) -> None:
+        self.content = str()
+        self.lastModified = AuditStampClass._construct_with_defaults()
+
+
+    @property
+    def content(self) -> str:
+        """The note to give technical owners more context about the assertion, and how to troubleshoot it."""
+        return self._inner_dict.get('content')  # type: ignore
+
+    @content.setter
+    def content(self, value: str) -> None:
+        self._inner_dict['content'] = value
+
+
+    @property
+    def lastModified(self) -> "AuditStampClass":
+        """The time at which the note was last modified."""
+        return self._inner_dict.get('lastModified')  # type: ignore
+
+    @lastModified.setter
+    def lastModified(self, value: "AuditStampClass") -> None:
+        self._inner_dict['lastModified'] = value
+
+
 class AssertionResultClass(DictWrapper):
     """The result of running an assertion"""

@@ -1337,7 +1389,7 @@ class AssertionStdParametersClass(DictWrapper):


 class AssertionTypeClass(object):
-    # No docs available.
+    """Type of assertion. Assertion types can evolve to span Datasets, Flows (Pipelines), Models, Features etc."""

     DATASET = "DATASET"
     """A single-dataset assertion.
@@ -12623,6 +12675,9 @@ class NotificationSinkTypeClass(object):
     EMAIL = "EMAIL"
     """Email target type."""

+    TEAMS = "TEAMS"
+    """Microsoft Teams target type."""
+


 class EmailNotificationSettingsClass(DictWrapper):
@@ -20439,6 +20494,9 @@ class DataHubPageModuleTypeClass(object):
     PLATFORMS = "PLATFORMS"
     """Module displaying the platforms in an instance"""

+    UNKNOWN = "UNKNOWN"
+    """Unknown module type - this can occur with corrupted data or rolling back to versions without new modules"""
+


 class DataHubPageModuleVisibilityClass(DictWrapper):
@@ -27742,6 +27800,7 @@ __SCHEMA_TYPES = {
    'com.linkedin.pegasus2avro.assertion.AssertionActionType': AssertionActionTypeClass,
    'com.linkedin.pegasus2avro.assertion.AssertionActions': AssertionActionsClass,
    'com.linkedin.pegasus2avro.assertion.AssertionInfo': AssertionInfoClass,
+    'com.linkedin.pegasus2avro.assertion.AssertionNote': AssertionNoteClass,
    'com.linkedin.pegasus2avro.assertion.AssertionResult': AssertionResultClass,
    'com.linkedin.pegasus2avro.assertion.AssertionResultError': AssertionResultErrorClass,
    'com.linkedin.pegasus2avro.assertion.AssertionResultErrorType': AssertionResultErrorTypeClass,
@@ -28268,6 +28327,7 @@ __SCHEMA_TYPES = {
    'AssertionActionType': AssertionActionTypeClass,
    'AssertionActions': AssertionActionsClass,
    'AssertionInfo': AssertionInfoClass,
+    'AssertionNote': AssertionNoteClass,
    'AssertionResult': AssertionResultClass,
    'AssertionResultError': AssertionResultErrorClass,
    'AssertionResultErrorType': AssertionResultErrorTypeClass,
@@ -11,6 +11,7 @@ from .....schema_classes import AssertionActionClass
 from .....schema_classes import AssertionActionTypeClass
 from .....schema_classes import AssertionActionsClass
 from .....schema_classes import AssertionInfoClass
+from .....schema_classes import AssertionNoteClass
 from .....schema_classes import AssertionResultClass
 from .....schema_classes import AssertionResultErrorClass
 from .....schema_classes import AssertionResultErrorTypeClass
@@ -64,6 +65,7 @@ AssertionAction = AssertionActionClass
 AssertionActionType = AssertionActionTypeClass
 AssertionActions = AssertionActionsClass
 AssertionInfo = AssertionInfoClass
+AssertionNote = AssertionNoteClass
 AssertionResult = AssertionResultClass
 AssertionResultError = AssertionResultErrorClass
 AssertionResultErrorType = AssertionResultErrorTypeClass
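
With the new `AssertionNoteClass` exported alongside the other assertion classes, a note can be attached to an assertion's info aspect. A hedged sketch follows: the assertion URN, actor, timestamp, and descriptions are placeholders, and the `AssertionTypeClass`/`AuditStampClass` usage is assumed from the pre-existing generated classes rather than from new code in this diff.

```python
import time

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    AssertionInfoClass,
    AssertionNoteClass,
    AssertionTypeClass,
    AuditStampClass,
)

now_ms = int(time.time() * 1000)

# Note content is rendered as markdown in the UI, per the new field docs above.
note = AssertionNoteClass(
    content="If this assertion fails, re-run the upstream load job first.",
    lastModified=AuditStampClass(time=now_ms, actor="urn:li:corpuser:datahub"),
)

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:assertion:example-assertion-id",  # placeholder URN
    aspect=AssertionInfoClass(
        type=AssertionTypeClass.DATASET,
        description="Row count should stay above zero.",
        note=note,
    ),
)
# mcp can then be sent with any DataHub emitter, e.g. DatahubRestEmitter.emit(mcp).
```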