acryl-datahub 1.2.0.1__py3-none-any.whl → 1.2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

The registry has flagged this release as potentially problematic.

Files changed (54)
  1. {acryl_datahub-1.2.0.1.dist-info → acryl_datahub-1.2.0.2.dist-info}/METADATA +2574 -2572
  2. {acryl_datahub-1.2.0.1.dist-info → acryl_datahub-1.2.0.2.dist-info}/RECORD +54 -46
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +13 -1
  5. datahub/emitter/rest_emitter.py +3 -1
  6. datahub/ingestion/autogenerated/capability_summary.json +97 -6
  7. datahub/ingestion/source/abs/source.py +5 -29
  8. datahub/ingestion/source/aws/glue.py +8 -0
  9. datahub/ingestion/source/cassandra/cassandra.py +5 -7
  10. datahub/ingestion/source/common/subtypes.py +2 -0
  11. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  12. datahub/ingestion/source/datahub/datahub_source.py +3 -0
  13. datahub/ingestion/source/dbt/dbt_common.py +69 -2
  14. datahub/ingestion/source/delta_lake/source.py +1 -0
  15. datahub/ingestion/source/ge_data_profiler.py +9 -1
  16. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  17. datahub/ingestion/source/grafana/field_utils.py +307 -0
  18. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  19. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  20. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  21. datahub/ingestion/source/grafana/lineage.py +202 -0
  22. datahub/ingestion/source/grafana/models.py +120 -0
  23. datahub/ingestion/source/grafana/report.py +91 -0
  24. datahub/ingestion/source/grafana/types.py +16 -0
  25. datahub/ingestion/source/hex/hex.py +8 -0
  26. datahub/ingestion/source/looker/looker_common.py +40 -4
  27. datahub/ingestion/source/looker/looker_source.py +9 -0
  28. datahub/ingestion/source/looker/lookml_source.py +8 -0
  29. datahub/ingestion/source/mongodb.py +11 -1
  30. datahub/ingestion/source/redshift/redshift.py +8 -1
  31. datahub/ingestion/source/s3/source.py +14 -34
  32. datahub/ingestion/source/sql/athena.py +8 -2
  33. datahub/ingestion/source/sql/clickhouse.py +9 -0
  34. datahub/ingestion/source/sql/postgres.py +190 -1
  35. datahub/ingestion/source/sql_queries.py +111 -76
  36. datahub/ingestion/source/unity/proxy.py +8 -8
  37. datahub/metadata/_internal_schema_classes.py +96 -0
  38. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +2 -0
  39. datahub/metadata/schema.avsc +69 -0
  40. datahub/metadata/schemas/CorpUserSettings.avsc +10 -1
  41. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +42 -0
  42. datahub/metadata/schemas/MetadataChangeEvent.avsc +18 -0
  43. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  44. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  45. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  46. datahub/sdk/dataset.py +44 -0
  47. datahub/sdk/search_filters.py +84 -15
  48. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  49. datahub/telemetry/telemetry.py +4 -1
  50. datahub/upgrade/upgrade.py +5 -3
  51. {acryl_datahub-1.2.0.1.dist-info → acryl_datahub-1.2.0.2.dist-info}/WHEEL +0 -0
  52. {acryl_datahub-1.2.0.1.dist-info → acryl_datahub-1.2.0.2.dist-info}/entry_points.txt +0 -0
  53. {acryl_datahub-1.2.0.1.dist-info → acryl_datahub-1.2.0.2.dist-info}/licenses/LICENSE +0 -0
  54. {acryl_datahub-1.2.0.1.dist-info → acryl_datahub-1.2.0.2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/s3/source.py

@@ -41,7 +41,11 @@ from datahub.ingestion.source.aws.s3_util import (
     get_key_prefix,
     strip_s3_prefix,
 )
-from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
+from datahub.ingestion.source.data_lake_common.data_lake_utils import (
+    ContainerWUCreator,
+    add_partition_columns_to_schema,
+)
 from datahub.ingestion.source.data_lake_common.object_store import (
     create_object_store_adapter,
 )
@@ -58,9 +62,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import TimeStamp
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    SchemaField,
     SchemaMetadata,
-    StringTypeClass,
 )
 from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
@@ -70,7 +72,6 @@ from datahub.metadata.schema_classes import (
     OtherSchemaClass,
     PartitionsSummaryClass,
     PartitionSummaryClass,
-    SchemaFieldDataTypeClass,
     _Aspect,
 )
 from datahub.telemetry import stats, telemetry
@@ -196,7 +197,14 @@ class TableData:
 @platform_name("S3 / Local Files", id="s3")
 @config_class(DataLakeSourceConfig)
 @support_status(SupportStatus.INCUBATING)
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.FOLDER,
+        SourceCapabilityModifier.S3_BUCKET,
+    ],
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(
     SourceCapability.SCHEMA_METADATA, "Can infer schema from supported file types"
@@ -474,7 +482,7 @@ class S3Source(StatefulIngestionSourceBase):
         fields = sorted(fields, key=lambda f: f.fieldPath)

         if self.source_config.add_partition_columns_to_schema and table_data.partitions:
-            self.add_partition_columns_to_schema(
+            add_partition_columns_to_schema(
                 fields=fields, path_spec=path_spec, full_path=table_data.full_path
             )

@@ -510,34 +518,6 @@ class S3Source(StatefulIngestionSourceBase):
         else:
             return None

-    def add_partition_columns_to_schema(
-        self, path_spec: PathSpec, full_path: str, fields: List[SchemaField]
-    ) -> None:
-        is_fieldpath_v2 = False
-        for field in fields:
-            if field.fieldPath.startswith("[version=2.0]"):
-                is_fieldpath_v2 = True
-                break
-        partition_keys = path_spec.get_partition_from_path(full_path)
-        if not partition_keys:
-            return None
-
-        for partition_key in partition_keys:
-            fields.append(
-                SchemaField(
-                    fieldPath=(
-                        f"{partition_key[0]}"
-                        if not is_fieldpath_v2
-                        else f"[version=2.0].[type=string].{partition_key[0]}"
-                    ),
-                    nativeDataType="string",
-                    type=SchemaFieldDataTypeClass(StringTypeClass()),
-                    isPartitioningKey=True,
-                    nullable=True,
-                    recursive=False,
-                )
-            )
-
     def get_table_profile(
         self, table_data: TableData, dataset_urn: str
     ) -> Iterable[MetadataWorkUnit]:
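
Note on the S3 change above: the add_partition_columns_to_schema logic removed from S3Source now lives in data_lake_common/data_lake_utils.py (file 11 in the list). Its essential behavior is choosing a v1 or v2 fieldPath for each partition key before appending a string-typed partitioning SchemaField. A minimal, self-contained sketch of just that path-selection rule, using a hypothetical helper name and no DataHub imports:

    # Simplified illustration of the fieldPath rule from the removed method above.
    # `partition_field_path` is a hypothetical name; the real helper also builds the
    # full SchemaField (string type, isPartitioningKey=True, nullable=True).
    def partition_field_path(partition_key: str, is_fieldpath_v2: bool) -> str:
        if is_fieldpath_v2:
            return f"[version=2.0].[type=string].{partition_key}"
        return partition_key

    assert partition_field_path("year", is_fieldpath_v2=False) == "year"
    assert partition_field_path("year", is_fieldpath_v2=True) == "[version=2.0].[type=string].year"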
datahub/ingestion/source/sql/athena.py

@@ -347,12 +347,18 @@ class Partitionitem:
 @capability(
     SourceCapability.LINEAGE_COARSE,
     "Supported for S3 tables",
-    subtype_modifier=[SourceCapabilityModifier.TABLE],
+    subtype_modifier=[
+        SourceCapabilityModifier.VIEW,
+        SourceCapabilityModifier.TABLE,
+    ],
 )
 @capability(
     SourceCapability.LINEAGE_FINE,
     "Supported for S3 tables",
-    subtype_modifier=[SourceCapabilityModifier.TABLE],
+    subtype_modifier=[
+        SourceCapabilityModifier.VIEW,
+        SourceCapabilityModifier.TABLE,
+    ],
 )
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 class AthenaSource(SQLAlchemySource):
datahub/ingestion/source/sql/clickhouse.py

@@ -32,6 +32,7 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.sql.sql_common import (
     SqlWorkUnit,
     logger,
@@ -383,6 +384,14 @@ clickhouse_datetime_format = "%Y-%m-%d %H:%M:%S"
     SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Enabled by default to get lineage for views via `include_view_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.VIEW,
+        SourceCapabilityModifier.TABLE,
+    ],
+)
 class ClickHouseSource(TwoTierSQLAlchemySource):
     """
     This plugin extracts the following:
datahub/ingestion/source/sql/postgres.py

@@ -36,6 +36,15 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import BasicSQLAlchemyConfig
+from datahub.ingestion.source.sql.sql_utils import (
+    gen_database_key,
+    gen_schema_key,
+)
+from datahub.ingestion.source.sql.stored_procedures.base import (
+    BaseProcedure,
+    generate_procedure_container_workunits,
+    generate_procedure_workunits,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayTypeClass,
     BytesTypeClass,
@@ -123,6 +132,15 @@ class PostgresConfig(BasePostgresConfig):
             "Note: this is not used if `database` or `sqlalchemy_uri` are provided."
         ),
     )
+    include_stored_procedures: bool = Field(
+        default=True,
+        description="Include ingest of stored procedures.",
+    )
+    procedure_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for stored procedures to filter in ingestion."
+        "Specify regex to match the entire procedure name in database.schema.procedure_name format. e.g. to match all procedures starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+    )


 @platform_name("Postgres")
@@ -135,7 +153,7 @@ class PostgresSource(SQLAlchemySource):
     """
     This plugin extracts the following:

-    - Metadata for databases, schemas, views, and tables
+    - Metadata for databases, schemas, views, tables, and stored procedures
     - Column types associated with each table
     - Also supports PostGIS extensions
     - Table, row, and column statistics via optional SQL profiling
@@ -291,3 +309,174 @@ class PostgresSource(SQLAlchemySource):
                     ] = row.table_size
         except Exception as e:
             logger.error(f"failed to fetch profile metadata: {e}")
+
+    def get_schema_level_workunits(
+        self,
+        inspector: Inspector,
+        schema: str,
+        database: str,
+    ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
+        yield from super().get_schema_level_workunits(
+            inspector=inspector,
+            schema=schema,
+            database=database,
+        )
+
+        if self.config.include_stored_procedures:
+            try:
+                yield from self.loop_stored_procedures(inspector, schema, self.config)
+            except Exception as e:
+                self.report.failure(
+                    title="Failed to list stored procedures for schema",
+                    message="An error occurred while listing procedures for the schema.",
+                    context=f"{database}.{schema}",
+                    exc=e,
+                )
+
+    def loop_stored_procedures(
+        self,
+        inspector: Inspector,
+        schema: str,
+        config: PostgresConfig,
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Loop schema data for get stored procedures as dataJob-s.
+        """
+        db_name = self.get_db_name(inspector)
+
+        procedures = self.fetch_procedures_for_schema(inspector, schema, db_name)
+        if procedures:
+            yield from self._process_procedures(procedures, db_name, schema)
+
+    def fetch_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        try:
+            raw_procedures: List[BaseProcedure] = self.get_procedures_for_schema(
+                inspector, schema, db_name
+            )
+            procedures: List[BaseProcedure] = []
+            for procedure in raw_procedures:
+                procedure_qualified_name = self.get_identifier(
+                    schema=schema,
+                    entity=procedure.name,
+                    inspector=inspector,
+                )
+
+                if not self.config.procedure_pattern.allowed(procedure_qualified_name):
+                    self.report.report_dropped(procedure_qualified_name)
+                else:
+                    procedures.append(procedure)
+            return procedures
+        except Exception as e:
+            self.report.warning(
+                title="Failed to get procedures for schema",
+                message="An error occurred while fetching procedures for the schema.",
+                context=f"{db_name}.{schema}",
+                exc=e,
+            )
+            return []
+
+    def get_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        """
+        Get stored procedures for a specific schema.
+        """
+        base_procedures = []
+        with inspector.engine.connect() as conn:
+            procedures = conn.execute(
+                """
+                SELECT
+                    p.proname AS name,
+                    l.lanname AS language,
+                    pg_get_function_arguments(p.oid) AS arguments,
+                    pg_get_functiondef(p.oid) AS definition,
+                    obj_description(p.oid, 'pg_proc') AS comment
+                FROM
+                    pg_proc p
+                JOIN
+                    pg_namespace n ON n.oid = p.pronamespace
+                JOIN
+                    pg_language l ON l.oid = p.prolang
+                WHERE
+                    p.prokind = 'p'
+                    AND n.nspname = '"""
+                + schema
+                + """';
+                """
+            )
+
+            procedure_rows = list(procedures)
+            for row in procedure_rows:
+                base_procedures.append(
+                    BaseProcedure(
+                        name=row.name,
+                        language=row.language,
+                        argument_signature=row.arguments,
+                        return_type=None,
+                        procedure_definition=row.definition,
+                        created=None,
+                        last_altered=None,
+                        comment=row.comment,
+                        extra_properties=None,
+                    )
+                )
+        return base_procedures
+
+    def _process_procedures(
+        self,
+        procedures: List[BaseProcedure],
+        db_name: str,
+        schema: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        if procedures:
+            yield from generate_procedure_container_workunits(
+                database_key=gen_database_key(
+                    database=db_name,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_key=gen_schema_key(
+                    db_name=db_name,
+                    schema=schema,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+            )
+        for procedure in procedures:
+            yield from self._process_procedure(procedure, schema, db_name)
+
+    def _process_procedure(
+        self,
+        procedure: BaseProcedure,
+        schema: str,
+        db_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        try:
+            yield from generate_procedure_workunits(
+                procedure=procedure,
+                database_key=gen_database_key(
+                    database=db_name,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_key=gen_schema_key(
+                    db_name=db_name,
+                    schema=schema,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_resolver=self.get_schema_resolver(),
+            )
+        except Exception as e:
+            self.report.warning(
+                title="Failed to emit stored procedure",
+                message="An error occurred while emitting stored procedure",
+                context=procedure.name,
+                exc=e,
+            )
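
The two new Postgres options are ordinary source-config fields, so they can be driven from a regular ingestion recipe. A minimal sketch, assuming a locally reachable Postgres instance and DataHub GMS; the host, credentials, and the Customer database are placeholders, and the pattern follows the database.schema.procedure_name convention from the procedure_pattern description above:

    from datahub.ingestion.run.pipeline import Pipeline

    pipeline = Pipeline.create(
        {
            "source": {
                "type": "postgres",
                "config": {
                    "host_port": "localhost:5432",  # placeholder connection details
                    "database": "Customer",
                    "username": "datahub",
                    "password": "datahub",
                    "include_stored_procedures": True,  # new in this release, default True
                    # Only ingest procedures under Customer.public whose names start with "customer".
                    "procedure_pattern": {"allow": ["Customer\\.public\\.customer.*"]},
                },
            },
            "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
        }
    )
    pipeline.run()
    pipeline.raise_from_status()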
datahub/ingestion/source/sql_queries.py

@@ -4,7 +4,7 @@ import os
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import partial
-from typing import Iterable, List, Optional, Set
+from typing import Iterable, List, Optional, Union

 from pydantic import Field

@@ -14,9 +14,8 @@ from datahub.configuration.source_common import (
 )
 from datahub.emitter.mce_builder import (
     make_dataset_urn_with_platform_instance,
-    make_user_urn,
 )
-from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -35,8 +34,15 @@ from datahub.ingestion.api.source_helpers import auto_workunit_reporter
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
-from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    KnownQueryLineageInfo,
+    ObservedQuery,
+    SqlAggregatorReport,
+    SqlParsingAggregator,
+)

 logger = logging.getLogger(__name__)

@@ -72,23 +78,13 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
     )


-class SqlQueriesSourceReport(SourceReport):
-    num_queries_parsed: int = 0
-    num_table_parse_failures: int = 0
-    num_column_parse_failures: int = 0
+@dataclass
+class SqlQueriesSourceReport(SourceReport, IngestionStageReport):
+    num_entries_processed: int = 0
+    num_entries_failed: int = 0
+    num_queries_aggregator_failures: int = 0

-    def compute_stats(self) -> None:
-        super().compute_stats()
-        self.table_failure_rate = (
-            f"{self.num_table_parse_failures / self.num_queries_parsed:.4f}"
-            if self.num_queries_parsed
-            else "0"
-        )
-        self.column_failure_rate = (
-            f"{self.num_column_parse_failures / self.num_queries_parsed:.4f}"
-            if self.num_queries_parsed
-            else "0"
-        )
+    sql_aggregator: Optional[SqlAggregatorReport] = None


 @platform_name("SQL Queries")
@@ -107,15 +103,15 @@ class SqlQueriesSource(Source):
     - user (optional): string - The user who ran the query.
       This user value will be directly converted into a DataHub user urn.
     - operation_type (optional): string - Platform-specific operation type, used if the operation type can't be parsed.
+    - session_id (optional): string - Session identifier for temporary table resolution across queries.
     - downstream_tables (optional): string[] - Fallback list of tables that the query writes to,
       used if the query can't be parsed.
     - upstream_tables (optional): string[] - Fallback list of tables the query reads from,
      used if the query can't be parsed.
     """

-    urns: Optional[Set[str]]
-    schema_resolver: SchemaResolver
-    builder: SqlParsingBuilder
+    schema_resolver: Optional[SchemaResolver]
+    aggregator: SqlParsingAggregator

     def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig):
         if not ctx.graph:
@@ -128,22 +124,36 @@ class SqlQueriesSource(Source):
         self.config = config
         self.report = SqlQueriesSourceReport()

-        self.builder = SqlParsingBuilder(usage_config=self.config.usage)
-
         if self.config.use_schema_resolver:
+            # TODO: `initialize_schema_resolver_from_datahub` does a bulk initialization by fetching all schemas
+            # for the given platform, platform instance, and env. Instead this should be configurable:
+            # bulk initialization vs lazy on-demand schema fetching.
             self.schema_resolver = self.graph.initialize_schema_resolver_from_datahub(
                 platform=self.config.platform,
                 platform_instance=self.config.platform_instance,
                 env=self.config.env,
             )
-            self.urns = self.schema_resolver.get_urns()
         else:
-            self.schema_resolver = self.graph._make_schema_resolver(
-                platform=self.config.platform,
-                platform_instance=self.config.platform_instance,
-                env=self.config.env,
-            )
-            self.urns = None
+            self.schema_resolver = None
+
+        self.aggregator = SqlParsingAggregator(
+            platform=self.config.platform,
+            platform_instance=self.config.platform_instance,
+            env=self.config.env,
+            schema_resolver=self.schema_resolver,
+            eager_graph_load=False,
+            generate_lineage=True,  # TODO: make this configurable
+            generate_queries=True,  # TODO: make this configurable
+            generate_query_subject_fields=True,  # TODO: make this configurable
+            generate_query_usage_statistics=True,  # This enables publishing SELECT query entities, otherwise only mutation queries are published
+            generate_usage_statistics=True,
+            generate_operations=True,  # TODO: make this configurable
+            usage_config=self.config.usage,
+            is_temp_table=None,
+            is_allowed_table=None,
+            format_queries=False,
+        )
+        self.report.sql_aggregator = self.aggregator.report

     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "SqlQueriesSource":
@@ -156,68 +166,92 @@ class SqlQueriesSource(Source):
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [partial(auto_workunit_reporter, self.get_report())]

-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
         logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}")
+
+        with self.report.new_stage("Collecting queries from file"):
+            queries = list(self._parse_query_file())
+            logger.info(f"Collected {len(queries)} queries for processing")
+
+        with self.report.new_stage("Processing queries through SQL parsing aggregator"):
+            for query_entry in queries:
+                self._add_query_to_aggregator(query_entry)
+
+        with self.report.new_stage("Generating metadata work units"):
+            logger.info("Generating workunits from SQL parsing aggregator")
+            yield from self.aggregator.gen_metadata()
+
+    def _parse_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse the query file and yield QueryEntry objects."""
         with open(self.config.query_file) as f:
             for line in f:
                 try:
                     query_dict = json.loads(line, strict=False)
                     entry = QueryEntry.create(query_dict, config=self.config)
-                    yield from self._process_query(entry)
+                    self.report.num_entries_processed += 1
+                    if self.report.num_entries_processed % 1000 == 0:
+                        logger.info(
+                            f"Processed {self.report.num_entries_processed} query entries"
+                        )
+                    yield entry
                 except Exception as e:
-                    logger.warning("Error processing query", exc_info=True)
-                    self.report.report_warning("process-query", str(e))
-
-        logger.info("Generating workunits")
-        yield from self.builder.gen_workunits()
+                    self.report.num_entries_failed += 1
+                    self.report.warning(
+                        title="Error processing query",
+                        message="Query skipped due to parsing error",
+                        context=line.strip(),
+                        exc=e,
+                    )

-    def _process_query(self, entry: "QueryEntry") -> Iterable[MetadataWorkUnit]:
-        self.report.num_queries_parsed += 1
-        if self.report.num_queries_parsed % 1000 == 0:
-            logger.info(f"Parsed {self.report.num_queries_parsed} queries")
-
-        result = sqlglot_lineage(
-            sql=entry.query,
-            schema_resolver=self.schema_resolver,
-            default_db=self.config.default_db,
-            default_schema=self.config.default_schema,
-            override_dialect=self.config.override_dialect,
-        )
-        if result.debug_info.table_error:
-            logger.info(f"Error parsing table lineage, {result.debug_info.table_error}")
-            self.report.num_table_parse_failures += 1
-            for downstream_urn in set(entry.downstream_tables):
-                self.builder.add_lineage(
-                    downstream_urn=downstream_urn,
-                    upstream_urns=entry.upstream_tables,
-                    timestamp=entry.timestamp,
-                    user=entry.user,
+    def _add_query_to_aggregator(self, query_entry: "QueryEntry") -> None:
+        """Add a query to the SQL parsing aggregator."""
+        try:
+            # If we have explicit lineage, use it directly
+            if query_entry.upstream_tables or query_entry.downstream_tables:
+                logger.debug("Using explicit lineage from query file")
+                for downstream_table in query_entry.downstream_tables:
+                    known_lineage = KnownQueryLineageInfo(
+                        query_text=query_entry.query,
+                        downstream=downstream_table,
+                        upstreams=query_entry.upstream_tables,
+                        timestamp=query_entry.timestamp,
+                        session_id=query_entry.session_id,
+                    )
+                    self.aggregator.add_known_query_lineage(known_lineage)
+            else:
+                # No explicit lineage, rely on parsing
+                observed_query = ObservedQuery(
+                    query=query_entry.query,
+                    timestamp=query_entry.timestamp,
+                    user=query_entry.user,
+                    session_id=query_entry.session_id,
+                    default_db=self.config.default_db,
+                    default_schema=self.config.default_schema,
+                    override_dialect=self.config.override_dialect,
                 )
-            return
-        elif result.debug_info.column_error:
-            logger.debug(
-                f"Error parsing column lineage, {result.debug_info.column_error}"
+                self.aggregator.add_observed_query(observed_query)
+
+        except Exception as e:
+            self.report.num_queries_aggregator_failures += 1
+            self.report.warning(
+                title="Error adding query to aggregator",
+                message="Query skipped due to failure when adding query to SQL parsing aggregator",
+                context=query_entry.query,
+                exc=e,
            )
-            self.report.num_column_parse_failures += 1
-
-        yield from self.builder.process_sql_parsing_result(
-            result,
-            query=entry.query,
-            query_timestamp=entry.timestamp,
-            user=entry.user,
-            custom_operation_type=entry.operation_type,
-            include_urns=self.urns,
-        )


 @dataclass
 class QueryEntry:
     query: str
     timestamp: Optional[datetime]
-    user: Optional[str]
+    user: Optional[CorpUserUrn]
     operation_type: Optional[str]
     downstream_tables: List[str]
     upstream_tables: List[str]
+    session_id: Optional[str] = None

     @classmethod
     def create(
@@ -230,7 +264,7 @@ class QueryEntry:
                 if "timestamp" in entry_dict
                 else None
            ),
-            user=make_user_urn(entry_dict["user"]) if "user" in entry_dict else None,
+            user=CorpUserUrn(entry_dict["user"]) if "user" in entry_dict else None,
             operation_type=entry_dict.get("operation_type"),
             downstream_tables=[
                 make_dataset_urn_with_platform_instance(
@@ -250,4 +284,5 @@ class QueryEntry:
                 )
                 for table in entry_dict.get("upstream_tables", [])
             ],
+            session_id=entry_dict.get("session_id"),
         )
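
The sql-queries source still consumes one JSON object per line from query_file; this release adds the optional session_id field alongside the fields listed in the class docstring above. A small sketch of producing such a file, where the epoch-seconds timestamp, table names, and user value are illustrative assumptions:

    import json
    import time

    entry = {
        "query": "INSERT INTO analytics.daily_orders SELECT * FROM raw.orders",
        "timestamp": time.time(),  # assumed epoch seconds
        "user": "etl_service",  # converted to a CorpUserUrn by QueryEntry.create
        "operation_type": "INSERT",
        "session_id": "session-1234",  # new: enables temp-table resolution across queries
        "downstream_tables": ["analytics.daily_orders"],  # fallback if parsing fails
        "upstream_tables": ["raw.orders"],  # fallback if parsing fails
    }

    with open("queries.jsonl", "w") as f:
        f.write(json.dumps(entry) + "\n")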
datahub/ingestion/source/unity/proxy.py

@@ -521,9 +521,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     @cached(cachetools.FIFOCache(maxsize=100))
     def get_schema_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
         """Optimized version using databricks-sql"""
-        logger.info(f"Fetching schema tags for catalog: {catalog}")
+        logger.info(f"Fetching schema tags for catalog: `{catalog}`")

-        query = f"SELECT * FROM {catalog}.information_schema.schema_tags"
+        query = f"SELECT * FROM `{catalog}`.information_schema.schema_tags"
         rows = self._execute_sql_query(query)

         result_dict: Dict[str, List[UnityCatalogTag]] = {}
@@ -544,9 +544,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     @cached(cachetools.FIFOCache(maxsize=100))
     def get_catalog_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
         """Optimized version using databricks-sql"""
-        logger.info(f"Fetching table tags for catalog: {catalog}")
+        logger.info(f"Fetching table tags for catalog: `{catalog}`")

-        query = f"SELECT * FROM {catalog}.information_schema.catalog_tags"
+        query = f"SELECT * FROM `{catalog}`.information_schema.catalog_tags"
         rows = self._execute_sql_query(query)

         result_dict: Dict[str, List[UnityCatalogTag]] = {}
@@ -566,9 +566,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     @cached(cachetools.FIFOCache(maxsize=100))
     def get_table_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
         """Optimized version using databricks-sql"""
-        logger.info(f"Fetching table tags for catalog: {catalog}")
+        logger.info(f"Fetching table tags for catalog: `{catalog}`")

-        query = f"SELECT * FROM {catalog}.information_schema.table_tags"
+        query = f"SELECT * FROM `{catalog}`.information_schema.table_tags"
         rows = self._execute_sql_query(query)

         result_dict: Dict[str, List[UnityCatalogTag]] = {}
@@ -589,9 +589,9 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     @cached(cachetools.FIFOCache(maxsize=100))
     def get_column_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
         """Optimized version using databricks-sql"""
-        logger.info(f"Fetching column tags for catalog: {catalog}")
+        logger.info(f"Fetching column tags for catalog: `{catalog}`")

-        query = f"SELECT * FROM {catalog}.information_schema.column_tags"
+        query = f"SELECT * FROM `{catalog}`.information_schema.column_tags"
         rows = self._execute_sql_query(query)

         result_dict: Dict[str, List[UnityCatalogTag]] = {}
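
The only change in proxy.py is identifier quoting: the catalog name is now wrapped in backticks before being interpolated into the information_schema queries. A small sketch of why that matters for a catalog whose name is not a bare identifier (the catalog name below is hypothetical):

    catalog = "dev-sandbox"  # hyphenated name would previously break the generated SQL

    old_query = f"SELECT * FROM {catalog}.information_schema.schema_tags"
    new_query = f"SELECT * FROM `{catalog}`.information_schema.schema_tags"

    print(old_query)  # SELECT * FROM dev-sandbox.information_schema.schema_tags  (invalid identifier)
    print(new_query)  # SELECT * FROM `dev-sandbox`.information_schema.schema_tags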