acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
- datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +46 -9
- datahub/ingestion/source/ge_profiling_config.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/sigma/data_classes.py +1 -0
- datahub/ingestion/source/sigma/sigma.py +101 -43
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +18 -6
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4

@@ -1,3 +1,4 @@
+import logging
 from collections import defaultdict
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, DefaultDict, Dict, List, Set
@@ -6,6 +7,8 @@ from datahub.ingestion.source.aws.sagemaker_processors.common import (
     SagemakerSourceReport,
 )
 
+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
     from mypy_boto3_sagemaker import SageMakerClient
     from mypy_boto3_sagemaker.type_defs import (
@@ -88,7 +91,6 @@ class LineageProcessor:
         paginator = self.sagemaker_client.get_paginator("list_contexts")
         for page in paginator.paginate():
             contexts += page["ContextSummaries"]
-
         return contexts
 
     def get_incoming_edges(self, node_arn: str) -> List["AssociationSummaryTypeDef"]:
@@ -225,27 +227,32 @@
         """
         Get the lineage of all artifacts in SageMaker.
        """
-
+        logger.info("Getting lineage for SageMaker artifacts...")
+        logger.info("Getting all actions")
         for action in self.get_all_actions():
             self.nodes[action["ActionArn"]] = {**action, "node_type": "action"}
+        logger.info("Getting all artifacts")
         for artifact in self.get_all_artifacts():
             self.nodes[artifact["ArtifactArn"]] = {**artifact, "node_type": "artifact"}
+        logger.info("Getting all contexts")
         for context in self.get_all_contexts():
             self.nodes[context["ContextArn"]] = {**context, "node_type": "context"}
 
+        logger.info("Getting lineage for model deployments and model groups")
         for node_arn, node in self.nodes.items():
+            logger.debug(f"Getting lineage for node {node_arn}")
             # get model-endpoint lineage
             if (
                 node["node_type"] == "action"
                 and node.get("ActionType") == "ModelDeployment"
             ):
                 self.get_model_deployment_lineage(node_arn)
-
+                self.report.model_endpoint_lineage += 1
             # get model-group lineage
             if (
                 node["node_type"] == "context"
                 and node.get("ContextType") == "ModelGroup"
             ):
                 self.get_model_group_lineage(node_arn, node)
-
+                self.report.model_group_lineage += 1
         return self.lineage_info
datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1

@@ -1,3 +1,4 @@
+import logging
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
@@ -65,6 +66,8 @@ ENDPOINT_STATUS_MAP: Dict[str, str] = {
     "Unknown": DeploymentStatusClass.UNKNOWN,
 }
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class ModelProcessor:
@@ -385,6 +388,26 @@ class ModelProcessor:
             model_metrics,
         )
 
+    @staticmethod
+    def get_group_name_from_arn(arn: str) -> str:
+        """
+        Extract model package group name from a SageMaker ARN.
+
+        Args:
+            arn (str): Full ARN of the model package group
+
+        Returns:
+            str: Name of the model package group
+
+        Example:
+            >>> ModelProcessor.get_group_name_from_arn("arn:aws:sagemaker:eu-west-1:123456789:model-package-group/my-model-group")
+            'my-model-group'
+        """
+        logger.debug(
+            f"Extracting group name from ARN: {arn} because group was not seen before"
+        )
+        return arn.split("/")[-1]
+
     def get_model_wu(
         self,
         model_details: "DescribeModelOutputTypeDef",
@@ -425,8 +448,14 @@
         model_group_arns = model_uri_groups | model_image_groups
 
         model_group_names = sorted(
-            [
+            [
+                self.group_arn_to_name[x]
+                if x in self.group_arn_to_name
+                else self.get_group_name_from_arn(x)
+                for x in model_group_arns
+            ]
         )
+
         model_group_urns = [
             builder.make_ml_model_group_urn("sagemaker", x, self.env)
             for x in model_group_names
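The reworked comprehension above falls back to deriving the group name from the ARN itself whenever a group ARN was not captured in group_arn_to_name while listing groups. A minimal standalone sketch of that fallback, using made-up ARNs and a plain function in place of the class method:

def group_name_from_arn(arn: str) -> str:
    # A model package group ARN ends with ".../model-package-group/<name>".
    return arn.split("/")[-1]

# Illustrative data only.
group_arn_to_name = {
    "arn:aws:sagemaker:eu-west-1:123456789:model-package-group/seen-group": "seen-group",
}
model_group_arns = {
    "arn:aws:sagemaker:eu-west-1:123456789:model-package-group/seen-group",
    "arn:aws:sagemaker:eu-west-1:123456789:model-package-group/unseen-group",
}

model_group_names = sorted(
    group_arn_to_name[x] if x in group_arn_to_name else group_name_from_arn(x)
    for x in model_group_arns
)
print(model_group_names)  # ['seen-group', 'unseen-group']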
datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1

@@ -190,7 +190,7 @@ class BigQueryTableRef:
     @classmethod
     def from_urn(cls, urn: str) -> "BigQueryTableRef":
         """Raises: ValueError if urn is not a valid BigQuery table URN."""
-        dataset_urn = DatasetUrn.create_from_string(urn)
+        dataset_urn = DatasetUrn.from_string(urn)
         split = dataset_urn.name.rsplit(".", 3)
         if len(split) == 3:
             project, dataset, table = split
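The change above swaps a deprecated URN constructor for DatasetUrn.from_string; the parsed URN's name is then split into project, dataset, and table. A minimal sketch of that parsing, assuming the usual datahub.metadata.urns import path and an illustrative URN:

from datahub.metadata.urns import DatasetUrn

# Illustrative URN only.
urn = "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-project.my_dataset.my_table,PROD)"

dataset_urn = DatasetUrn.from_string(urn)
split = dataset_urn.name.rsplit(".", 3)
if len(split) == 3:
    project, dataset, table = split
    print(project, dataset, table)  # my-project my_dataset my_table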
datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1

@@ -118,7 +118,6 @@ class BigqueryTable(BaseTable):
     active_billable_bytes: Optional[int] = None
     long_term_billable_bytes: Optional[int] = None
     partition_info: Optional[PartitionInfo] = None
-    columns_ignore_from_profiling: List[str] = field(default_factory=list)
     external: bool = False
     constraints: List[BigqueryTableConstraint] = field(default_factory=list)
     table_type: Optional[str] = None
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21

@@ -598,18 +598,6 @@ class BigQuerySchemaGenerator:
             dataset_name=dataset_name,
         )
 
-    # This method is used to generate the ignore list for datatypes the profiler doesn't support we have to do it here
-    # because the profiler doesn't have access to columns
-    def generate_profile_ignore_list(self, columns: List[BigqueryColumn]) -> List[str]:
-        ignore_list: List[str] = []
-        for column in columns:
-            if not column.data_type or any(
-                word in column.data_type.lower()
-                for word in ["array", "struct", "geography", "json"]
-            ):
-                ignore_list.append(column.field_path)
-        return ignore_list
-
     def _process_table(
         self,
         table: BigqueryTable,
@@ -631,15 +619,6 @@ class BigQuerySchemaGenerator:
         )
         table.column_count = len(columns)
 
-        # We only collect profile ignore list if profiling is enabled and profile_table_level_only is false
-        if (
-            self.config.is_profiling_enabled()
-            and not self.config.profiling.profile_table_level_only
-        ):
-            table.columns_ignore_from_profiling = self.generate_profile_ignore_list(
-                columns
-            )
-
         if not table.column_count:
             logger.warning(
                 f"Table doesn't have any column or unable to get columns for table: {table_identifier}"
datahub/ingestion/source/bigquery_v2/profiler.py +0 -6

@@ -166,12 +166,6 @@ WHERE
         normalized_table_name = BigqueryTableIdentifier(
             project_id=project_id, dataset=dataset, table=table.name
         ).get_table_name()
-        for column in table.columns_ignore_from_profiling:
-            # Profiler has issues with complex types (array, struct, geography, json), so we deny those types from profiling
-            # We also filter columns without data type as it means that column is part of a complex type.
-            self.config.profile_pattern.deny.append(
-                f"^{normalized_table_name}.{column}$"
-            )
 
         if table.external and not self.config.profiling.profile_external_tables:
             self.report.profiling_skipped_other[f"{project_id}.{dataset}"] += 1
datahub/ingestion/source/common/subtypes.py +2 -0

@@ -22,6 +22,8 @@ class DatasetSubTypes(StrEnum):
     SAC_MODEL = "Model"
     SAC_IMPORT_DATA_MODEL = "Import Data Model"
     SAC_LIVE_DATA_MODEL = "Live Data Model"
+    NEO4J_NODE = "Neo4j Node"
+    NEO4J_RELATIONSHIP = "Neo4j Relationship"
 
     # TODO: Create separate entity...
     NOTEBOOK = "Notebook"
datahub/ingestion/source/csv_enricher.py +1 -1

@@ -653,7 +653,7 @@ class CSVEnricherSource(Source):
 
         is_resource_row: bool = not row["subresource"]
         entity_urn = row["resource"]
-        entity_type = Urn.create_from_string(row["resource"]).get_type()
+        entity_type = Urn.from_string(row["resource"]).get_type()
 
         term_associations: List[
             GlossaryTermAssociationClass
datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21

@@ -147,6 +147,47 @@ class DataHubDatabaseReader:
            version
        """
 
+    def execute_server_cursor(
+        self, query: str, params: Dict[str, Any]
+    ) -> Iterable[Dict[str, Any]]:
+        with self.engine.connect() as conn:
+            if self.engine.dialect.name == "postgresql":
+                with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                    conn = conn.execution_options(
+                        stream_results=True,
+                        yield_per=self.config.database_query_batch_size,
+                    )
+                    result = conn.execute(query, params)
+                    for row in result:
+                        yield dict(row)
+            elif self.engine.dialect.name == "mysql":  # MySQL
+                import MySQLdb
+
+                with contextlib.closing(
+                    conn.connection.cursor(MySQLdb.cursors.SSCursor)
+                ) as cursor:
+                    logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
+                    cursor.execute(query, params)
+
+                    columns = [desc[0] for desc in cursor.description]
+                    while True:
+                        rows = cursor.fetchmany(self.config.database_query_batch_size)
+                        if not rows:
+                            break  # Use break instead of return in generator
+                        for row in rows:
+                            yield dict(zip(columns, row))
+            else:
+                raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
+
+    def _get_rows(
+        self, from_createdon: datetime, stop_time: datetime
+    ) -> Iterable[Dict[str, Any]]:
+        params = {
+            "exclude_aspects": list(self.config.exclude_aspects),
+            "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
+        }
+        yield from self.execute_server_cursor(self.query, params)
+
     def get_aspects(
         self, from_createdon: datetime, stop_time: datetime
     ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
@@ -159,27 +200,6 @@ class DataHubDatabaseReader:
             if mcp:
                 yield mcp, row["createdon"]
 
-    def _get_rows(
-        self, from_createdon: datetime, stop_time: datetime
-    ) -> Iterable[Dict[str, Any]]:
-        with self.engine.connect() as conn:
-            with contextlib.closing(conn.connection.cursor()) as cursor:
-                cursor.execute(
-                    self.query,
-                    {
-                        "exclude_aspects": list(self.config.exclude_aspects),
-                        "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
-                    },
-                )
-
-                columns = [desc[0] for desc in cursor.description]
-                while True:
-                    rows = cursor.fetchmany(self.config.database_query_batch_size)
-                    if not rows:
-                        return
-                    for row in rows:
-                        yield dict(zip(columns, row))
-
     def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
         """
         Fetches all soft-deleted entities from the database.
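For the PostgreSQL branch, execute_server_cursor leans on SQLAlchemy's stream_results execution option inside an open transaction, so rows come back through a server-side cursor instead of being loaded all at once. A rough standalone sketch of that pattern (SQLAlchemy 1.4-style API; the URL and query are placeholders):

from typing import Any, Dict, Iterable

from sqlalchemy import create_engine, text


def stream_rows(url: str, query: str, params: Dict[str, Any]) -> Iterable[Dict[str, Any]]:
    engine = create_engine(url)
    with engine.connect() as conn:
        with conn.begin():  # PostgreSQL server-side cursors need an open transaction
            streaming_conn = conn.execution_options(stream_results=True)
            result = streaming_conn.execute(text(query), params)
            for row in result:
                yield dict(row._mapping)  # Row -> dict in SQLAlchemy 1.4+


# Usage with placeholder values:
# for row in stream_rows("postgresql://user:pass@host/db",
#                        "SELECT urn, aspect FROM metadata_aspect_v2 WHERE createdon > :since",
#                        {"since": "2024-01-01"}):
#     ...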
datahub/ingestion/source/datahub/datahub_source.py +8 -1

@@ -1,5 +1,5 @@
 import logging
-from datetime import datetime, timezone
+from datetime import datetime, timedelta, timezone
 from functools import partial
 from typing import Dict, Iterable, List, Optional
 
@@ -26,6 +26,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.schema_classes import ChangeTypeClass
+from datahub.utilities.progress_timer import ProgressTimer
 
 logger = logging.getLogger(__name__)
 
@@ -105,11 +106,17 @@ class DataHubSource(StatefulIngestionSourceBase):
         self, from_createdon: datetime, reader: DataHubDatabaseReader
     ) -> Iterable[MetadataWorkUnit]:
         logger.info(f"Fetching database aspects starting from {from_createdon}")
+        progress = ProgressTimer(report_every=timedelta(seconds=60))
         mcps = reader.get_aspects(from_createdon, self.report.stop_time)
         for i, (mcp, createdon) in enumerate(mcps):
             if not self.urn_pattern.allowed(str(mcp.entityUrn)):
                 continue
 
+            if progress.should_report():
+                logger.info(
+                    f"Ingested {i} database aspects so far, currently at {createdon}"
+                )
+
             yield mcp.as_workunit()
             self.report.num_database_aspects_ingested += 1
 
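ProgressTimer comes from datahub.utilities.progress_timer and is not shown in this diff; judging by its use here it simply rate-limits progress logging. A rough sketch of a helper with that behavior, offered as an assumption rather than the actual implementation:

from datetime import datetime, timedelta, timezone


class SimpleProgressTimer:
    """Returns True from should_report() at most once per report_every interval."""

    def __init__(self, report_every: timedelta) -> None:
        self._report_every = report_every
        self._last_report = datetime.now(timezone.utc)

    def should_report(self) -> bool:
        now = datetime.now(timezone.utc)
        if now - self._last_report >= self._report_every:
            self._last_report = now
            return True
        return False


# Usage mirroring the diff:
# progress = SimpleProgressTimer(report_every=timedelta(seconds=60))
# if progress.should_report():
#     logger.info("Ingested N aspects so far...")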
datahub/ingestion/source/dbt/dbt_common.py +7 -61

@@ -53,19 +53,7 @@ from datahub.ingestion.source.dbt.dbt_tests import (
     make_assertion_from_test,
     make_assertion_result_from_test,
 )
-from datahub.ingestion.source.sql.sql_types import (
-    ATHENA_SQL_TYPES_MAP,
-    BIGQUERY_TYPES_MAP,
-    POSTGRES_TYPES_MAP,
-    SNOWFLAKE_TYPES_MAP,
-    SPARK_SQL_TYPES_MAP,
-    TRINO_SQL_TYPES_MAP,
-    VERTICA_SQL_TYPES_MAP,
-    resolve_athena_modified_type,
-    resolve_postgres_modified_type,
-    resolve_trino_modified_type,
-    resolve_vertica_modified_type,
-)
+from datahub.ingestion.source.sql.sql_types import resolve_sql_type
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -89,17 +77,11 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    BooleanTypeClass,
-    DateTypeClass,
     MySqlDDL,
     NullTypeClass,
-    NumberTypeClass,
-    RecordType,
     SchemaField,
     SchemaFieldDataType,
     SchemaMetadata,
-    StringTypeClass,
-    TimeTypeClass,
 )
 from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
@@ -804,28 +786,6 @@ def make_mapping_upstream_lineage(
     )
 
 
-# See https://github.com/fishtown-analytics/dbt/blob/master/core/dbt/adapters/sql/impl.py
-_field_type_mapping = {
-    "boolean": BooleanTypeClass,
-    "date": DateTypeClass,
-    "time": TimeTypeClass,
-    "numeric": NumberTypeClass,
-    "text": StringTypeClass,
-    "timestamp with time zone": DateTypeClass,
-    "timestamp without time zone": DateTypeClass,
-    "integer": NumberTypeClass,
-    "float8": NumberTypeClass,
-    "struct": RecordType,
-    **POSTGRES_TYPES_MAP,
-    **SNOWFLAKE_TYPES_MAP,
-    **BIGQUERY_TYPES_MAP,
-    **SPARK_SQL_TYPES_MAP,
-    **TRINO_SQL_TYPES_MAP,
-    **ATHENA_SQL_TYPES_MAP,
-    **VERTICA_SQL_TYPES_MAP,
-}
-
-
 def get_column_type(
     report: DBTSourceReport,
     dataset_name: str,
@@ -835,24 +795,10 @@ def get_column_type(
     """
     Maps known DBT types to datahub types
     """
-
-
-
-
-    if dbt_adapter == "trino":
-        TypeClass = resolve_trino_modified_type(column_type)
-    elif dbt_adapter == "athena":
-        TypeClass = resolve_athena_modified_type(column_type)
-    elif dbt_adapter == "postgres" or dbt_adapter == "redshift":
-        # Redshift uses a variant of Postgres, so we can use the same logic.
-        TypeClass = resolve_postgres_modified_type(column_type)
-    elif dbt_adapter == "vertica":
-        TypeClass = resolve_vertica_modified_type(column_type)
-    elif dbt_adapter == "snowflake":
-        # Snowflake types are uppercase, so we check that.
-        TypeClass = _field_type_mapping.get(column_type.upper())
-
-    # if still not found, report the warning
+
+    TypeClass = resolve_sql_type(column_type, dbt_adapter)
+
+    # if still not found, report a warning
     if TypeClass is None:
         if column_type:
             report.info(
@@ -861,9 +807,9 @@
                 context=f"{dataset_name} - {column_type}",
                 log=False,
             )
-        TypeClass = NullTypeClass
+        TypeClass = NullTypeClass()
 
-    return SchemaFieldDataType(type=TypeClass())
+    return SchemaFieldDataType(type=TypeClass)
 
 
 @platform_name("dbt")
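Taken together, these hunks replace the per-adapter type resolution with a single resolve_sql_type call that, per this diff, accepts the raw column type plus the adapter/platform name and returns a DataHub type-class instance or None. A short usage sketch built only from what the diff shows:

from datahub.ingestion.source.sql.sql_types import resolve_sql_type
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    NullTypeClass,
    SchemaFieldDataType,
)


def to_schema_field_type(column_type: str, adapter: str) -> SchemaFieldDataType:
    # resolve_sql_type now owns the per-platform mappings that used to live in dbt_common.
    type_class = resolve_sql_type(column_type, adapter)
    if type_class is None:
        type_class = NullTypeClass()
    return SchemaFieldDataType(type=type_class)


# e.g. to_schema_field_type("varchar(16)", "trino")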