acryl-datahub 1.2.0.1__py3-none-any.whl → 1.2.0.2rc1__py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {acryl_datahub-1.2.0.1.dist-info → acryl_datahub-1.2.0.2rc1.dist-info}/METADATA +2605 -2605
- {acryl_datahub-1.2.0.1.dist-info → acryl_datahub-1.2.0.2rc1.dist-info}/RECORD +27 -27
- datahub/_version.py +1 -1
- datahub/emitter/rest_emitter.py +3 -1
- datahub/ingestion/source/abs/source.py +5 -29
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
- datahub/ingestion/source/dbt/dbt_common.py +69 -2
- datahub/ingestion/source/looker/looker_common.py +40 -4
- datahub/ingestion/source/s3/source.py +5 -33
- datahub/ingestion/source/sql/postgres.py +190 -1
- datahub/ingestion/source/sql_queries.py +112 -77
- datahub/metadata/_internal_schema_classes.py +81 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +2 -0
- datahub/metadata/schema.avsc +60 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +10 -1
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +33 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +18 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
- datahub/metadata/schemas/SystemMetadata.avsc +61 -0
- datahub/sdk/search_filters.py +51 -2
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -0
- datahub/upgrade/upgrade.py +5 -3
- {acryl_datahub-1.2.0.1.dist-info → acryl_datahub-1.2.0.2rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.1.dist-info → acryl_datahub-1.2.0.2rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.1.dist-info → acryl_datahub-1.2.0.2rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.1.dist-info → acryl_datahub-1.2.0.2rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/postgres.py

@@ -36,6 +36,15 @@ from datahub.ingestion.source.sql.sql_common import (
     register_custom_type,
 )
 from datahub.ingestion.source.sql.sql_config import BasicSQLAlchemyConfig
+from datahub.ingestion.source.sql.sql_utils import (
+    gen_database_key,
+    gen_schema_key,
+)
+from datahub.ingestion.source.sql.stored_procedures.base import (
+    BaseProcedure,
+    generate_procedure_container_workunits,
+    generate_procedure_workunits,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     ArrayTypeClass,
     BytesTypeClass,

@@ -123,6 +132,15 @@ class PostgresConfig(BasePostgresConfig):
             "Note: this is not used if `database` or `sqlalchemy_uri` are provided."
         ),
     )
+    include_stored_procedures: bool = Field(
+        default=True,
+        description="Include ingestion of stored procedures.",
+    )
+    procedure_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for stored procedures to filter in ingestion. "
+        "Specify a regex that matches the full procedure name in database.schema.procedure_name format. E.g. to match all procedures starting with 'customer' in the Customer database and public schema, use the regex 'Customer.public.customer.*'.",
+    )


 @platform_name("Postgres")

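A minimal sketch of the two new fields in use through the pydantic config model (connection values are hypothetical):

import json

from datahub.ingestion.source.sql.postgres import PostgresConfig

config = PostgresConfig.parse_obj(
    {
        "host_port": "localhost:5432",  # hypothetical connection details
        "database": "Customer",
        "username": "datahub",
        "password": "datahub",
        "include_stored_procedures": True,
        "procedure_pattern": {
            # Patterns match the full database.schema.procedure_name string.
            "allow": ["Customer\\.public\\.customer.*"],
        },
    }
)
assert config.procedure_pattern.allowed("Customer.public.customer_refresh")
assert not config.procedure_pattern.allowed("Customer.public.orders_cleanup")
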
@@ -135,7 +153,7 @@ class PostgresSource(SQLAlchemySource):
     """
     This plugin extracts the following:

-    - Metadata for databases, schemas, views, and tables
+    - Metadata for databases, schemas, views, tables, and stored procedures
     - Column types associated with each table
     - Also supports PostGIS extensions
     - Table, row, and column statistics via optional SQL profiling

@@ -291,3 +309,174 @@ class PostgresSource(SQLAlchemySource):
                     ] = row.table_size
         except Exception as e:
             logger.error(f"failed to fetch profile metadata: {e}")
+
+    def get_schema_level_workunits(
+        self,
+        inspector: Inspector,
+        schema: str,
+        database: str,
+    ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
+        yield from super().get_schema_level_workunits(
+            inspector=inspector,
+            schema=schema,
+            database=database,
+        )
+
+        if self.config.include_stored_procedures:
+            try:
+                yield from self.loop_stored_procedures(inspector, schema, self.config)
+            except Exception as e:
+                self.report.failure(
+                    title="Failed to list stored procedures for schema",
+                    message="An error occurred while listing procedures for the schema.",
+                    context=f"{database}.{schema}",
+                    exc=e,
+                )
+
+    def loop_stored_procedures(
+        self,
+        inspector: Inspector,
+        schema: str,
+        config: PostgresConfig,
+    ) -> Iterable[MetadataWorkUnit]:
+        """
+        Loop over the schema's data to emit stored procedures as DataJobs.
+        """
+        db_name = self.get_db_name(inspector)
+
+        procedures = self.fetch_procedures_for_schema(inspector, schema, db_name)
+        if procedures:
+            yield from self._process_procedures(procedures, db_name, schema)
+
+    def fetch_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        try:
+            raw_procedures: List[BaseProcedure] = self.get_procedures_for_schema(
+                inspector, schema, db_name
+            )
+            procedures: List[BaseProcedure] = []
+            for procedure in raw_procedures:
+                procedure_qualified_name = self.get_identifier(
+                    schema=schema,
+                    entity=procedure.name,
+                    inspector=inspector,
+                )
+
+                if not self.config.procedure_pattern.allowed(procedure_qualified_name):
+                    self.report.report_dropped(procedure_qualified_name)
+                else:
+                    procedures.append(procedure)
+            return procedures
+        except Exception as e:
+            self.report.warning(
+                title="Failed to get procedures for schema",
+                message="An error occurred while fetching procedures for the schema.",
+                context=f"{db_name}.{schema}",
+                exc=e,
+            )
+            return []
+
+    def get_procedures_for_schema(
+        self, inspector: Inspector, schema: str, db_name: str
+    ) -> List[BaseProcedure]:
+        """
+        Get stored procedures for a specific schema.
+        """
+        base_procedures = []
+        with inspector.engine.connect() as conn:
+            procedures = conn.execute(
+                """
+                SELECT
+                    p.proname AS name,
+                    l.lanname AS language,
+                    pg_get_function_arguments(p.oid) AS arguments,
+                    pg_get_functiondef(p.oid) AS definition,
+                    obj_description(p.oid, 'pg_proc') AS comment
+                FROM
+                    pg_proc p
+                JOIN
+                    pg_namespace n ON n.oid = p.pronamespace
+                JOIN
+                    pg_language l ON l.oid = p.prolang
+                WHERE
+                    p.prokind = 'p'
+                    AND n.nspname = '"""
+                + schema
+                + """';
+                """
+            )
+
+            procedure_rows = list(procedures)
+            for row in procedure_rows:
+                base_procedures.append(
+                    BaseProcedure(
+                        name=row.name,
+                        language=row.language,
+                        argument_signature=row.arguments,
+                        return_type=None,
+                        procedure_definition=row.definition,
+                        created=None,
+                        last_altered=None,
+                        comment=row.comment,
+                        extra_properties=None,
+                    )
+                )
+            return base_procedures
+
+    def _process_procedures(
+        self,
+        procedures: List[BaseProcedure],
+        db_name: str,
+        schema: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        if procedures:
+            yield from generate_procedure_container_workunits(
+                database_key=gen_database_key(
+                    database=db_name,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_key=gen_schema_key(
+                    db_name=db_name,
+                    schema=schema,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+            )
+        for procedure in procedures:
+            yield from self._process_procedure(procedure, schema, db_name)
+
+    def _process_procedure(
+        self,
+        procedure: BaseProcedure,
+        schema: str,
+        db_name: str,
+    ) -> Iterable[MetadataWorkUnit]:
+        try:
+            yield from generate_procedure_workunits(
+                procedure=procedure,
+                database_key=gen_database_key(
+                    database=db_name,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_key=gen_schema_key(
+                    db_name=db_name,
+                    schema=schema,
+                    platform=self.platform,
+                    platform_instance=self.config.platform_instance,
+                    env=self.config.env,
+                ),
+                schema_resolver=self.get_schema_resolver(),
+            )
+        except Exception as e:
+            self.report.warning(
+                title="Failed to emit stored procedure",
+                message="An error occurred while emitting stored procedure",
+                context=procedure.name,
+                exc=e,
+            )

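The pg_catalog query above drives the extraction and can be exercised standalone to see which procedures the source will pick up. A sketch with psycopg2 (the connection string is hypothetical), binding the schema as a query parameter:

import psycopg2

SQL = """
SELECT p.proname AS name,
       l.lanname AS language,
       pg_get_function_arguments(p.oid) AS arguments,
       pg_get_functiondef(p.oid) AS definition,
       obj_description(p.oid, 'pg_proc') AS comment
FROM pg_proc p
JOIN pg_namespace n ON n.oid = p.pronamespace
JOIN pg_language l ON l.oid = p.prolang
WHERE p.prokind = 'p'  -- 'p' = procedures (PostgreSQL 11+); plain functions are 'f'
  AND n.nspname = %s;
"""

with psycopg2.connect("dbname=Customer user=datahub") as conn:  # hypothetical DSN
    with conn.cursor() as cur:
        cur.execute(SQL, ("public",))
        for name, language, arguments, definition, comment in cur.fetchall():
            print(name, language, arguments)
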
datahub/ingestion/source/sql_queries.py

@@ -4,7 +4,7 @@ import os
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import partial
-from typing import Iterable, List, Optional,
+from typing import Iterable, List, Optional, Union

 from pydantic import Field

@@ -14,9 +14,8 @@ from datahub.configuration.source_common import (
 )
 from datahub.emitter.mce_builder import (
     make_dataset_urn_with_platform_instance,
-    make_user_urn,
 )
-from datahub.emitter.
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,

|
|
|
35
34
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
36
35
|
from datahub.ingestion.graph.client import DataHubGraph
|
|
37
36
|
from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
|
|
37
|
+
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
|
|
38
|
+
from datahub.metadata.urns import CorpUserUrn
|
|
38
39
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
39
|
-
from datahub.sql_parsing.
|
|
40
|
+
from datahub.sql_parsing.sql_parsing_aggregator import (
|
|
41
|
+
KnownQueryLineageInfo,
|
|
42
|
+
ObservedQuery,
|
|
43
|
+
SqlAggregatorReport,
|
|
44
|
+
SqlParsingAggregator,
|
|
45
|
+
)
|
|
40
46
|
|
|
41
47
|
logger = logging.getLogger(__name__)
|
|
42
48
|
|
|
@@ -67,28 +73,19 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
         default=None,
     )
     override_dialect: Optional[str] = Field(
-        description="
+        description="DEPRECATED: This field is ignored. SQL dialect detection is now handled automatically by the SQL parsing aggregator based on the platform.",
         default=None,
+        hidden_from_docs=True,
     )


-class SqlQueriesSourceReport(SourceReport):
-    num_queries_parsed: int = 0
-    num_table_parse_failures: int = 0
-    num_column_parse_failures: int = 0
+@dataclass
+class SqlQueriesSourceReport(SourceReport, IngestionStageReport):
+    num_entries_processed: int = 0
+    num_entries_failed: int = 0
+    num_queries_aggregator_failures: int = 0

-    def compute_stats(self) -> None:
-        super().compute_stats()
-        self.table_failure_rate = (
-            f"{self.num_table_parse_failures / self.num_queries_parsed:.4f}"
-            if self.num_queries_parsed
-            else "0"
-        )
-        self.column_failure_rate = (
-            f"{self.num_column_parse_failures / self.num_queries_parsed:.4f}"
-            if self.num_queries_parsed
-            else "0"
-        )
+    sql_aggregator: Optional[SqlAggregatorReport] = None


 @platform_name("SQL Queries")

@@ -107,15 +104,15 @@ class SqlQueriesSource(Source):
     - user (optional): string - The user who ran the query.
       This user value will be directly converted into a DataHub user urn.
     - operation_type (optional): string - Platform-specific operation type, used if the operation type can't be parsed.
+    - session_id (optional): string - Session identifier for temporary table resolution across queries.
     - downstream_tables (optional): string[] - Fallback list of tables that the query writes to,
       used if the query can't be parsed.
     - upstream_tables (optional): string[] - Fallback list of tables the query reads from,
       used if the query can't be parsed.
     """

-
-
-    builder: SqlParsingBuilder
+    schema_resolver: Optional[SchemaResolver]
+    aggregator: SqlParsingAggregator

     def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig):
         if not ctx.graph:

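For reference, one line of the JSONL query file under the fields described in the docstring above, including the new optional session_id; all values are illustrative:

import json

# One illustrative line of the query file; session_id is the new optional
# field that lets the aggregator resolve temp tables across related queries.
entry = {
    "query": "CREATE TEMP TABLE tmp_orders AS SELECT * FROM orders",
    "timestamp": 1718000000,  # epoch seconds
    "user": "analyst_1",
    "session_id": "session-42",
    "upstream_tables": [],
    "downstream_tables": [],
}
print(json.dumps(entry))
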
@@ -128,22 +125,36 @@ class SqlQueriesSource(Source):
         self.config = config
         self.report = SqlQueriesSourceReport()

-        self.builder = SqlParsingBuilder(usage_config=self.config.usage)
-
         if self.config.use_schema_resolver:
+            # TODO: `initialize_schema_resolver_from_datahub` does a bulk initialization by fetching all schemas
+            # for the given platform, platform instance, and env. Instead this should be configurable:
+            # bulk initialization vs lazy on-demand schema fetching.
             self.schema_resolver = self.graph.initialize_schema_resolver_from_datahub(
                 platform=self.config.platform,
                 platform_instance=self.config.platform_instance,
                 env=self.config.env,
             )
-            self.urns = self.schema_resolver.get_urns()
         else:
-            self.schema_resolver =
-
-
-
-
-            self.
+            self.schema_resolver = None
+
+        self.aggregator = SqlParsingAggregator(
+            platform=self.config.platform,
+            platform_instance=self.config.platform_instance,
+            env=self.config.env,
+            schema_resolver=self.schema_resolver,
+            eager_graph_load=False,
+            generate_lineage=True,  # TODO: make this configurable
+            generate_queries=True,  # TODO: make this configurable
+            generate_query_subject_fields=True,  # TODO: make this configurable
+            generate_query_usage_statistics=True,  # This enables publishing SELECT query entities, otherwise only mutation queries are published
+            generate_usage_statistics=True,
+            generate_operations=True,  # TODO: make this configurable
+            usage_config=self.config.usage,
+            is_temp_table=None,
+            is_allowed_table=None,
+            format_queries=False,
+        )
+        self.report.sql_aggregator = self.aggregator.report

     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "SqlQueriesSource":

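A minimal sketch of the aggregator flow the source now delegates to, outside any pipeline; the platform and flag choices here are illustrative, not the source's exact wiring:

from datahub.sql_parsing.sql_parsing_aggregator import (
    ObservedQuery,
    SqlParsingAggregator,
)

# No DataHub connection: the aggregator parses queries locally and emits MCPs.
aggregator = SqlParsingAggregator(
    platform="postgres",  # example platform
    generate_lineage=True,
    generate_queries=True,
    generate_usage_statistics=False,
    generate_operations=False,
)
aggregator.add_observed_query(
    ObservedQuery(query="INSERT INTO sales.daily SELECT * FROM sales.raw")
)
for mcp in aggregator.gen_metadata():
    print(mcp.entityUrn)
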
@@ -156,68 +167,91 @@ class SqlQueriesSource(Source):
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [partial(auto_workunit_reporter, self.get_report())]

-    def get_workunits_internal(
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
         logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}")
+
+        with self.report.new_stage("Collecting queries from file"):
+            queries = list(self._parse_query_file())
+            logger.info(f"Collected {len(queries)} queries for processing")
+
+        with self.report.new_stage("Processing queries through SQL parsing aggregator"):
+            for query_entry in queries:
+                self._add_query_to_aggregator(query_entry)
+
+        with self.report.new_stage("Generating metadata work units"):
+            logger.info("Generating workunits from SQL parsing aggregator")
+            yield from self.aggregator.gen_metadata()
+
+    def _parse_query_file(self) -> Iterable["QueryEntry"]:
+        """Parse the query file and yield QueryEntry objects."""
         with open(self.config.query_file) as f:
             for line in f:
                 try:
                     query_dict = json.loads(line, strict=False)
                     entry = QueryEntry.create(query_dict, config=self.config)
-
+                    self.report.num_entries_processed += 1
+                    if self.report.num_entries_processed % 1000 == 0:
+                        logger.info(
+                            f"Processed {self.report.num_entries_processed} query entries"
+                        )
+                    yield entry
                 except Exception as e:
-
-                    self.report.
-
-
-
+                    self.report.num_entries_failed += 1
+                    self.report.warning(
+                        title="Error processing query",
+                        message="Query skipped due to parsing error",
+                        context=line.strip(),
+                        exc=e,
+                    )

-    def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        timestamp=
-                        user=
+    def _add_query_to_aggregator(self, query_entry: "QueryEntry") -> None:
+        """Add a query to the SQL parsing aggregator."""
+        try:
+            # If we have explicit lineage, use it directly
+            if query_entry.upstream_tables or query_entry.downstream_tables:
+                logger.debug("Using explicit lineage from query file")
+                for downstream_table in query_entry.downstream_tables:
+                    known_lineage = KnownQueryLineageInfo(
+                        query_text=query_entry.query,
+                        downstream=downstream_table,
+                        upstreams=query_entry.upstream_tables,
+                        timestamp=query_entry.timestamp,
+                        session_id=query_entry.session_id,
+                    )
+                    self.aggregator.add_known_query_lineage(known_lineage)
+            else:
+                # No explicit lineage, rely on parsing
+                observed_query = ObservedQuery(
+                    query=query_entry.query,
+                    timestamp=query_entry.timestamp,
+                    user=query_entry.user,
+                    session_id=query_entry.session_id,
+                    default_db=self.config.default_db,
+                    default_schema=self.config.default_schema,
                 )
-
-
-
-
+                self.aggregator.add_observed_query(observed_query)
+
+        except Exception as e:
+            self.report.num_queries_aggregator_failures += 1
+            self.report.warning(
+                title="Error adding query to aggregator",
+                message="Query skipped due to failure when adding query to SQL parsing aggregator",
+                context=query_entry.query,
+                exc=e,
             )
-            self.report.num_column_parse_failures += 1
-
-        yield from self.builder.process_sql_parsing_result(
-            result,
-            query=entry.query,
-            query_timestamp=entry.timestamp,
-            user=entry.user,
-            custom_operation_type=entry.operation_type,
-            include_urns=self.urns,
-        )


 @dataclass
 class QueryEntry:
     query: str
     timestamp: Optional[datetime]
-    user: Optional[
+    user: Optional[CorpUserUrn]
     operation_type: Optional[str]
     downstream_tables: List[str]
     upstream_tables: List[str]
+    session_id: Optional[str] = None

     @classmethod
     def create(

@@ -230,7 +264,7 @@ class QueryEntry:
                 if "timestamp" in entry_dict
                 else None
             ),
-            user=
+            user=CorpUserUrn(entry_dict["user"]) if "user" in entry_dict else None,
             operation_type=entry_dict.get("operation_type"),
             downstream_tables=[
                 make_dataset_urn_with_platform_instance(

@@ -250,4 +284,5 @@ class QueryEntry:
                 )
                 for table in entry_dict.get("upstream_tables", [])
             ],
+            session_id=entry_dict.get("session_id"),
         )

datahub/metadata/_internal_schema_classes.py

@@ -19964,17 +19964,20 @@ class DataHubPageModuleParamsClass(DictWrapper):
         linkParams: Union[None, "LinkModuleParamsClass"]=None,
         richTextParams: Union[None, "RichTextModuleParamsClass"]=None,
         assetCollectionParams: Union[None, "AssetCollectionModuleParamsClass"]=None,
+        hierarchyViewParams: Union[None, "HierarchyModuleParamsClass"]=None,
     ):
         super().__init__()

         self.linkParams = linkParams
         self.richTextParams = richTextParams
         self.assetCollectionParams = assetCollectionParams
+        self.hierarchyViewParams = hierarchyViewParams

     def _restore_defaults(self) -> None:
         self.linkParams = self.RECORD_SCHEMA.fields_dict["linkParams"].default
         self.richTextParams = self.RECORD_SCHEMA.fields_dict["richTextParams"].default
         self.assetCollectionParams = self.RECORD_SCHEMA.fields_dict["assetCollectionParams"].default
+        self.hierarchyViewParams = self.RECORD_SCHEMA.fields_dict["hierarchyViewParams"].default


     @property

@@ -20007,6 +20010,16 @@ class DataHubPageModuleParamsClass(DictWrapper):
         self._inner_dict['assetCollectionParams'] = value


+    @property
+    def hierarchyViewParams(self) -> Union[None, "HierarchyModuleParamsClass"]:
+        """The params required if the module is type HIERARCHY_VIEW"""
+        return self._inner_dict.get('hierarchyViewParams')  # type: ignore
+
+    @hierarchyViewParams.setter
+    def hierarchyViewParams(self, value: Union[None, "HierarchyModuleParamsClass"]) -> None:
+        self._inner_dict['hierarchyViewParams'] = value
+
+
 class DataHubPageModulePropertiesClass(_Aspect):
     """The main properties of a DataHub page module"""

@@ -20149,6 +20162,46 @@ class DataHubPageModuleVisibilityClass(DictWrapper):
         self._inner_dict['scope'] = value


+class HierarchyModuleParamsClass(DictWrapper):
+    """The params required if the module is type HIERARCHY_VIEW
+    TODO: add filters
+    relatedEntitiesFilter: optional Filter"""
+
+    RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.module.HierarchyModuleParams")
+    def __init__(self,
+        showRelatedEntities: bool,
+        assetUrns: Union[None, List[str]]=None,
+    ):
+        super().__init__()
+
+        self.assetUrns = assetUrns
+        self.showRelatedEntities = showRelatedEntities
+
+    def _restore_defaults(self) -> None:
+        self.assetUrns = self.RECORD_SCHEMA.fields_dict["assetUrns"].default
+        self.showRelatedEntities = bool()
+
+
+    @property
+    def assetUrns(self) -> Union[None, List[str]]:
+        # No docs available.
+        return self._inner_dict.get('assetUrns')  # type: ignore
+
+    @assetUrns.setter
+    def assetUrns(self, value: Union[None, List[str]]) -> None:
+        self._inner_dict['assetUrns'] = value
+
+
+    @property
+    def showRelatedEntities(self) -> bool:
+        # No docs available.
+        return self._inner_dict.get('showRelatedEntities')  # type: ignore
+
+    @showRelatedEntities.setter
+    def showRelatedEntities(self, value: bool) -> None:
+        self._inner_dict['showRelatedEntities'] = value
+
+
 class LinkModuleParamsClass(DictWrapper):
     # No docs available.

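A small sketch of the generated class in use (the urn is a made-up example):

from datahub.metadata.schema_classes import HierarchyModuleParamsClass

params = HierarchyModuleParamsClass(
    showRelatedEntities=True,
    assetUrns=["urn:li:container:example"],  # hypothetical urn
)
assert params.showRelatedEntities is True
print(params.to_obj())  # DictWrapper serializes to a plain avro-style dict
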
@@ -20772,6 +20825,8 @@ class SystemMetadataClass(_Aspect):
         registryVersion: Union[None, str]=None,
         properties: Union[None, Dict[str, str]]=None,
         version: Union[None, str]=None,
+        aspectCreated: Union[None, "AuditStampClass"]=None,
+        aspectModified: Union[None, "AuditStampClass"]=None,
     ):
         super().__init__()

@@ -20795,6 +20850,8 @@ class SystemMetadataClass(_Aspect):
         self.registryVersion = registryVersion
         self.properties = properties
         self.version = version
+        self.aspectCreated = aspectCreated
+        self.aspectModified = aspectModified

     def _restore_defaults(self) -> None:
         self.lastObserved = self.RECORD_SCHEMA.fields_dict["lastObserved"].default

@@ -20805,6 +20862,8 @@ class SystemMetadataClass(_Aspect):
         self.registryVersion = self.RECORD_SCHEMA.fields_dict["registryVersion"].default
         self.properties = self.RECORD_SCHEMA.fields_dict["properties"].default
         self.version = self.RECORD_SCHEMA.fields_dict["version"].default
+        self.aspectCreated = self.RECORD_SCHEMA.fields_dict["aspectCreated"].default
+        self.aspectModified = self.RECORD_SCHEMA.fields_dict["aspectModified"].default


     @property

@@ -20889,6 +20948,26 @@ class SystemMetadataClass(_Aspect):
         self._inner_dict['version'] = value


+    @property
+    def aspectCreated(self) -> Union[None, "AuditStampClass"]:
+        """When the aspect was initially created and who created it, detected by version 0 -> 1 change"""
+        return self._inner_dict.get('aspectCreated')  # type: ignore
+
+    @aspectCreated.setter
+    def aspectCreated(self, value: Union[None, "AuditStampClass"]) -> None:
+        self._inner_dict['aspectCreated'] = value
+
+
+    @property
+    def aspectModified(self) -> Union[None, "AuditStampClass"]:
+        """When the aspect was last modified and the actor that performed the modification"""
+        return self._inner_dict.get('aspectModified')  # type: ignore
+
+    @aspectModified.setter
+    def aspectModified(self, value: Union[None, "AuditStampClass"]) -> None:
+        self._inner_dict['aspectModified'] = value
+
+
 class ChartCellClass(DictWrapper):
     """Chart cell in a notebook, which will present content in chart format"""

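A sketch of populating the new audit fields (the actor and runId values are made up):

import time

from datahub.metadata.schema_classes import AuditStampClass, SystemMetadataClass

now_ms = int(time.time() * 1000)
sys_meta = SystemMetadataClass(
    lastObserved=now_ms,
    runId="manual-run",  # hypothetical run id
    aspectCreated=AuditStampClass(time=now_ms, actor="urn:li:corpuser:datahub"),
    aspectModified=AuditStampClass(time=now_ms, actor="urn:li:corpuser:datahub"),
)
print(sys_meta.aspectModified.actor)
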
@@ -27171,6 +27250,7 @@ __SCHEMA_TYPES = {
     'com.linkedin.pegasus2avro.module.DataHubPageModuleProperties': DataHubPageModulePropertiesClass,
     'com.linkedin.pegasus2avro.module.DataHubPageModuleType': DataHubPageModuleTypeClass,
     'com.linkedin.pegasus2avro.module.DataHubPageModuleVisibility': DataHubPageModuleVisibilityClass,
+    'com.linkedin.pegasus2avro.module.HierarchyModuleParams': HierarchyModuleParamsClass,
     'com.linkedin.pegasus2avro.module.LinkModuleParams': LinkModuleParamsClass,
     'com.linkedin.pegasus2avro.module.PageModuleScope': PageModuleScopeClass,
     'com.linkedin.pegasus2avro.module.RichTextModuleParams': RichTextModuleParamsClass,

@@ -27680,6 +27760,7 @@ __SCHEMA_TYPES = {
     'DataHubPageModuleProperties': DataHubPageModulePropertiesClass,
     'DataHubPageModuleType': DataHubPageModuleTypeClass,
     'DataHubPageModuleVisibility': DataHubPageModuleVisibilityClass,
+    'HierarchyModuleParams': HierarchyModuleParamsClass,
     'LinkModuleParams': LinkModuleParamsClass,
     'PageModuleScope': PageModuleScopeClass,
     'RichTextModuleParams': RichTextModuleParamsClass,

datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py

@@ -12,6 +12,7 @@ from .....schema_classes import DataHubPageModuleParamsClass
 from .....schema_classes import DataHubPageModulePropertiesClass
 from .....schema_classes import DataHubPageModuleTypeClass
 from .....schema_classes import DataHubPageModuleVisibilityClass
+from .....schema_classes import HierarchyModuleParamsClass
 from .....schema_classes import LinkModuleParamsClass
 from .....schema_classes import PageModuleScopeClass
 from .....schema_classes import RichTextModuleParamsClass

@@ -22,6 +23,7 @@ DataHubPageModuleParams = DataHubPageModuleParamsClass
 DataHubPageModuleProperties = DataHubPageModulePropertiesClass
 DataHubPageModuleType = DataHubPageModuleTypeClass
 DataHubPageModuleVisibility = DataHubPageModuleVisibilityClass
+HierarchyModuleParams = HierarchyModuleParamsClass
 LinkModuleParams = LinkModuleParamsClass
 PageModuleScope = PageModuleScopeClass
 RichTextModuleParams = RichTextModuleParamsClass