acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/METADATA +2324 -2324
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/RECORD +25 -25
- datahub/__init__.py +1 -1
- datahub/cli/cli_utils.py +12 -1
- datahub/emitter/rest_emitter.py +140 -92
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +14 -11
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/source/aws/glue.py +52 -35
- datahub/ingestion/source/bigquery_v2/bigquery.py +2 -0
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +8 -0
- datahub/ingestion/source/datahub/config.py +10 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +11 -7
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_v2.py +2 -0
- datahub/ingestion/source/unity/source.py +0 -4
- datahub/sql_parsing/sql_parsing_aggregator.py +8 -5
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/aws/glue.py
@@ -1054,49 +1054,66 @@ class GlueSource(StatefulIngestionSourceBase):
             yield from self.gen_database_containers(database)

         for table in tables:
-            database_name = table["DatabaseName"]
             table_name = table["Name"]
-            (7 removed lines not captured in this diff view)
+            try:
+                yield from self._gen_table_wu(table=table)
+            except KeyError as e:
+                self.report.report_failure(
+                    message="Failed to extract workunit for table",
+                    context=f"Table: {table_name}",
+                    exc=e,
+                )
+        if self.extract_transforms:
+            yield from self._transform_extraction()

-            (6 removed lines not captured in this diff view)
+    def _gen_table_wu(self, table: Dict) -> Iterable[MetadataWorkUnit]:
+        database_name = table["DatabaseName"]
+        table_name = table["Name"]
+        full_table_name = f"{database_name}.{table_name}"
+        self.report.report_table_scanned()
+        if not self.source_config.database_pattern.allowed(
+            database_name
+        ) or not self.source_config.table_pattern.allowed(full_table_name):
+            self.report.report_table_dropped(full_table_name)
+            return
+
+        dataset_urn = make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=full_table_name,
+            env=self.env,
+            platform_instance=self.source_config.platform_instance,
+        )

-            (2 removed lines not captured in this diff view)
+        mce = self._extract_record(dataset_urn, table, full_table_name)
+        yield MetadataWorkUnit(full_table_name, mce=mce)

-            (6 removed lines not captured in this diff view)
+        # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
+        # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]),
+        ).as_workunit()

-            (7 removed lines not captured in this diff view)
+        yield from self._get_domain_wu(
+            dataset_name=full_table_name,
+            entity_urn=dataset_urn,
+        )
+        yield from self.add_table_to_database_container(
+            dataset_urn=dataset_urn, db_name=database_name
+        )

-            (3 removed lines not captured in this diff view)
+        wu = self.get_lineage_if_enabled(mce)
+        if wu:
+            yield wu

+        try:
             yield from self.get_profile_if_enabled(mce, database_name, table_name)
-            (3 removed lines not captured in this diff view)
+        except KeyError as e:
+            self.report.report_failure(
+                message="Failed to extract profile for table",
+                context=f"Table: {dataset_urn}",
+                exc=e,
+            )

     def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
         dags: Dict[str, Optional[Dict[str, Any]]] = {}
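
The net effect of this hunk is that per-table work-unit generation now lives in a dedicated `_gen_table_wu` helper, and each table is wrapped in a `try`/`except KeyError` so one malformed Glue table is reported as a failure instead of aborting the whole run. A minimal, self-contained sketch of that error-isolation pattern (the helper names below are illustrative, not the real GlueSource API):

    from typing import Any, Dict, Iterable, List


    def process_tables(tables: List[Dict[str, Any]]) -> Iterable[str]:
        """Yield one item per table, isolating failures to the offending table."""
        for table in tables:
            table_name = table.get("Name", "<unknown>")
            try:
                yield from gen_table_item(table)
            except KeyError as e:
                # The real source routes this to self.report.report_failure(...).
                print(f"Failed to extract workunit for table {table_name}: missing {e}")


    def gen_table_item(table: Dict[str, Any]) -> Iterable[str]:
        # Raises KeyError if the Glue API response lacks an expected field.
        yield f'{table["DatabaseName"]}.{table["Name"]}'


    if __name__ == "__main__":
        rows = [{"DatabaseName": "db1", "Name": "t1"}, {"Name": "broken"}]
        print(list(process_tables(rows)))

Running the sketch reports the failure for the second table but still emits `db1.t1`.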

datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -281,6 +281,8 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
                 include_lineage=self.config.include_table_lineage,
                 include_usage_statistics=self.config.include_usage_statistics,
                 include_operations=self.config.usage.include_operational_stats,
+                include_queries=self.config.include_queries,
+                include_query_usage_statistics=self.config.include_query_usage_statistics,
                 top_n_queries=self.config.usage.top_n_queries,
                 region_qualifiers=self.config.region_qualifiers,
             ),

datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -447,6 +447,14 @@ class BigQueryV2Config(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from bigquery.",
     )
+    include_queries: bool = Field(
+        default=True,
+        description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+    )
+    include_query_usage_statistics: bool = Field(
+        default=True,
+        description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+    )

     @property
     def have_table_data_read_permission(self) -> bool:
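
Both new flags default to `True` and, per their descriptions, only take effect when `use_queries_v2` is enabled. A small self-contained sketch of how such flags typically gate the extra outputs (this is an illustrative pydantic model, not the real `BigQueryV2Config`):

    from typing import List

    from pydantic import BaseModel, Field


    class QueryExtractionFlags(BaseModel):
        """Illustrative stand-in for the new config fields."""

        use_queries_v2: bool = False
        include_queries: bool = Field(default=True)
        include_query_usage_statistics: bool = Field(default=True)


    def planned_outputs(flags: QueryExtractionFlags) -> List[str]:
        outputs = ["lineage"]
        if flags.use_queries_v2:
            # The new flags are documented as only applicable with use_queries_v2.
            if flags.include_queries:
                outputs.append("query entities")
            if flags.include_query_usage_statistics:
                outputs.append("query usage statistics")
        return outputs


    print(planned_outputs(QueryExtractionFlags(use_queries_v2=True,
                                               include_query_usage_statistics=False)))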

datahub/ingestion/source/datahub/config.py
@@ -1,6 +1,7 @@
 import os
 from typing import Optional, Set

+import pydantic
 from pydantic import Field, root_validator

 from datahub.configuration.common import AllowDenyPattern
@@ -119,3 +120,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
                 " Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
             )
         return values
+
+    @pydantic.validator("database_connection")
+    def validate_mysql_scheme(
+        cls, v: SQLAlchemyConnectionConfig
+    ) -> SQLAlchemyConnectionConfig:
+        if "mysql" in v.scheme:
+            if v.scheme != "mysql+pymysql":
+                raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
+        return v
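
The new validator rejects any MySQL-flavoured connection whose SQLAlchemy scheme is not exactly `mysql+pymysql`. A standalone sketch of the same rule, using a minimal pydantic v1-style model in place of `SQLAlchemyConnectionConfig`:

    import pydantic


    class ConnectionConfig(pydantic.BaseModel):
        """Minimal stand-in for SQLAlchemyConnectionConfig (illustrative only)."""

        scheme: str = "mysql+pymysql"

        @pydantic.validator("scheme")
        def validate_mysql_scheme(cls, v: str) -> str:
            # Same rule as the validator added above.
            if "mysql" in v and v != "mysql+pymysql":
                raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
            return v


    ConnectionConfig(scheme="mysql+pymysql")  # accepted
    try:
        ConnectionConfig(scheme="mysql+mysqldb")  # rejected
    except pydantic.ValidationError as e:
        print(e)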

datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -151,8 +151,10 @@ class DataHubDatabaseReader:
         self, query: str, params: Dict[str, Any]
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
-            if self.engine.dialect.name …
+            if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
                 with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                    # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
+                    # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
                         stream_results=True,
                         yield_per=self.config.database_query_batch_size,
@@ -160,22 +162,6 @@ class DataHubDatabaseReader:
                     result = conn.execute(query, params)
                     for row in result:
                         yield dict(row)
-            elif self.engine.dialect.name == "mysql":  # MySQL
-                import MySQLdb
-
-                with contextlib.closing(
-                    conn.connection.cursor(MySQLdb.cursors.SSCursor)
-                ) as cursor:
-                    logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
-                    cursor.execute(query, params)
-
-                    columns = [desc[0] for desc in cursor.description]
-                    while True:
-                        rows = cursor.fetchmany(self.config.database_query_batch_size)
-                        if not rows:
-                            break  # Use break instead of return in generator
-                        for row in rows:
-                            yield dict(zip(columns, row))
             else:
                 raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")

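
With the hand-rolled MySQLdb server-side cursor gone, the reader now leans on SQLAlchemy's `stream_results`/`yield_per` execution options for PostgreSQL, MySQL, and MariaDB alike. A minimal sketch of that pattern against a generic SQLAlchemy (1.4+) engine; the connection URL, query, and batch size below are placeholders:

    from typing import Any, Dict, Iterable

    from sqlalchemy import create_engine, text

    # Placeholder URL; the real reader builds its engine from the source config.
    engine = create_engine("postgresql+psycopg2://user:pass@localhost/datahub")


    def stream_rows(query: str, batch_size: int = 2000) -> Iterable[Dict[str, Any]]:
        with engine.connect() as conn:
            # A transaction is required for PostgreSQL server-side cursors;
            # stream_results/yield_per keep roughly batch_size rows in memory.
            with conn.begin():
                streaming = conn.execution_options(
                    stream_results=True,
                    yield_per=batch_size,
                )
                result = streaming.execute(text(query))
                for row in result:
                    yield dict(row._mapping)


    for row in stream_rows("SELECT urn, version FROM metadata_aspect_v2 LIMIT 5"):
        print(row)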

datahub/ingestion/source/datahub/datahub_source.py
@@ -130,7 +130,7 @@ class DataHubSource(StatefulIngestionSourceBase):
             self._commit_progress(i)

     def _get_kafka_workunits(
-        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
+        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.kafka_connection is None:
             return

datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
@@ -19,8 +19,8 @@ from datahub.utilities.urns._urn_base import Urn

 logger = logging.getLogger(__name__)

-(1 removed line not captured in this diff view)
-query listQueries($input: ScrollAcrossEntitiesInput!) {
+QUERY_ENTITIES = """
+query listEntities($input: ScrollAcrossEntitiesInput!) {
   scrollAcrossEntities(input: $input) {
     nextScrollId
     count
@@ -29,6 +29,9 @@ query listQueries($input: ScrollAcrossEntitiesInput!) {
         ... on QueryEntity {
           urn
         }
+        ... on DataProcessInstance {
+          urn
+        }
       }
     }
   }
@@ -225,16 +228,16 @@ class SoftDeletedEntitiesCleanup:
             time.sleep(self.config.delay)
         return futures

-    def …
+    def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
         assert self.ctx.graph
         scroll_id: Optional[str] = None
         while True:
             try:
                 result = self.ctx.graph.execute_graphql(
-                    (1 removed line not captured in this diff view)
+                    graphql_query,
                     {
                         "input": {
-                            "types": [ …
+                            "types": [entity_type],
                             "query": "*",
                             "scrollId": scroll_id if scroll_id else None,
                             "count": self.config.batch_size,
@@ -254,7 +257,7 @@ class SoftDeletedEntitiesCleanup:
                 )
             except Exception as e:
                 self.report.failure(
-                    f"While trying to get …
+                    f"While trying to get {entity_type} with {scroll_id}", exc=e
                 )
                 break
             scroll_across_entities = result.get("scrollAcrossEntities")
@@ -275,7 +278,8 @@ class SoftDeletedEntitiesCleanup:
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
-        yield from self. …
+        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
+        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")

     def _times_up(self) -> bool:
         if (
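
The cleanup helper is now parameterized by GraphQL query and entity type and pages through `scrollAcrossEntities` until `nextScrollId` runs out; it is then invoked once for `QUERY` and once for `DATA_PROCESS_INSTANCE`. A hedged sketch of that paging loop against any `execute_graphql(query, variables)` callable; the simplified query shape and the function names here are illustrative, not the real class:

    from typing import Any, Callable, Dict, Iterable, Optional

    SCROLL_QUERY = """
    query listEntities($input: ScrollAcrossEntitiesInput!) {
      scrollAcrossEntities(input: $input) {
        nextScrollId
        searchResults { entity { urn } }
      }
    }
    """


    def scroll_soft_deleted_urns(
        execute_graphql: Callable[[str, Dict[str, Any]], Dict[str, Any]],
        entity_type: str,
        batch_size: int = 500,
    ) -> Iterable[str]:
        """Page through one entity type until the scroll is exhausted."""
        scroll_id: Optional[str] = None
        while True:
            result = execute_graphql(
                SCROLL_QUERY,
                {
                    "input": {
                        "types": [entity_type],
                        "query": "*",
                        "scrollId": scroll_id,
                        "count": batch_size,
                        # The real cleanup also restricts this to soft-deleted entities.
                    }
                },
            )
            scroll = result.get("scrollAcrossEntities") or {}
            for hit in scroll.get("searchResults", []):
                yield hit["entity"]["urn"]
            scroll_id = scroll.get("nextScrollId")
            if not scroll_id:
                break

As in the hunk above, the same loop would be driven twice: once with "QUERY" and once with "DATA_PROCESS_INSTANCE".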

datahub/ingestion/source/snowflake/snowflake_config.py
@@ -221,6 +221,14 @@ class SnowflakeV2Config(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from snowflake.",
     )
+    include_queries: bool = Field(
+        default=True,
+        description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+    )
+    include_query_usage_statistics: bool = Field(
+        default=True,
+        description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+    )

     lazy_schema_resolver: bool = Field(
         default=True,

datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
@@ -40,6 +40,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     ColumnRef,
     DownstreamColumnRef,
 )
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.time import ts_millis_to_datetime

@@ -239,6 +240,9 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         downstream_table_urn = self.identifiers.gen_dataset_urn(dataset_name)

         known_lineage = KnownQueryLineageInfo(
+            query_id=get_query_fingerprint(
+                query.query_text, self.identifiers.platform, fast=True
+            ),
             query_text=query.query_text,
             downstream=downstream_table_urn,
             upstreams=self.map_query_result_upstreams(
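
By precomputing the fingerprint with `fast=True` and handing it over as `query_id`, the Snowflake lineage path lets the aggregator skip its own fingerprinting pass for these queries. A minimal usage sketch of the helper as it is called above; the SQL text is a placeholder, and the exact normalization behind `fast=True` is an assumption on our part:

    from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint

    query_text = "SELECT id, amount FROM sales.orders WHERE amount > 100"

    # Mirrors the call added above: platform passed positionally, fast fingerprinting path.
    fingerprint = get_query_fingerprint(query_text, "snowflake", fast=True)
    print(fingerprint)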

datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -528,6 +528,8 @@ class SnowflakeV2Source(
                 include_lineage=self.config.include_table_lineage,
                 include_usage_statistics=self.config.include_usage_stats,
                 include_operations=self.config.include_operational_stats,
+                include_queries=self.config.include_queries,
+                include_query_usage_statistics=self.config.include_query_usage_statistics,
                 user_email_pattern=self.config.user_email_pattern,
             ),
             structured_report=self.report,

datahub/ingestion/source/unity/source.py
@@ -26,9 +26,6 @@ from datahub.emitter.mcp_builder import (
     gen_containers,
 )
 from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
-from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
-    EnsureAspectSizeProcessor,
-)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -263,7 +260,6 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
-            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
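
Dropping `EnsureAspectSizeProcessor` from Unity Catalog's explicit processor list, together with the small changes to `ingestion/api/source.py` and `auto_ensure_aspect_size.py` in this release, suggests the aspect-size check is now applied as a default work-unit processor rather than opted into per source; that reading is an inference from the file list, not something shown in this diff. For orientation, a toy sketch of how a list of work-unit processors composes over the stream (names and types are illustrative, not the real Source API):

    from typing import Callable, Iterable, List, Optional

    WorkUnit = str  # stand-in for MetadataWorkUnit in this sketch
    Processor = Callable[[Iterable[WorkUnit]], Iterable[WorkUnit]]


    def apply_processors(
        stream: Iterable[WorkUnit], processors: List[Optional[Processor]]
    ) -> Iterable[WorkUnit]:
        """Each processor wraps the stream produced by the one before it."""
        for processor in processors:
            if processor is not None:
                stream = processor(stream)
        return stream


    def drop_empty(stream: Iterable[WorkUnit]) -> Iterable[WorkUnit]:
        return (wu for wu in stream if wu)


    def truncate_oversized(stream: Iterable[WorkUnit]) -> Iterable[WorkUnit]:
        # Toy analogue of an aspect-size guard: cap each unit at 10 characters.
        return (wu[:10] for wu in stream)


    print(list(apply_processors(["a", "", "b" * 20], [drop_empty, truncate_oversized])))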

datahub/sql_parsing/sql_parsing_aggregator.py
@@ -165,6 +165,7 @@ class KnownQueryLineageInfo:
     timestamp: Optional[datetime] = None
     session_id: Optional[str] = None
     query_type: QueryType = QueryType.UNKNOWN
+    query_id: Optional[str] = None


 @dataclasses.dataclass
@@ -618,11 +619,13 @@ class SqlParsingAggregator(Closeable):
         self.report.num_known_query_lineage += 1

         # Generate a fingerprint for the query.
-        (5 removed lines not captured in this diff view)
+        query_fingerprint = known_query_lineage.query_id
+        if not query_fingerprint:
+            with self.report.sql_fingerprinting_timer:
+                query_fingerprint = get_query_fingerprint(
+                    known_query_lineage.query_text,
+                    platform=self.platform.platform_name,
+                )
         formatted_query = self._maybe_format_query(known_query_lineage.query_text)

         # Register the query.
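
With `query_id` now optional on `KnownQueryLineageInfo`, the aggregator only spends time fingerprinting when the caller did not supply an id. A self-contained sketch of that fallback, with a hypothetical hash helper standing in for `get_query_fingerprint`:

    import hashlib
    from dataclasses import dataclass
    from typing import Optional


    @dataclass
    class KnownLineage:
        """Illustrative stand-in for KnownQueryLineageInfo."""

        query_text: str
        query_id: Optional[str] = None  # mirrors the new optional field


    def fingerprint(query_text: str) -> str:
        # Hypothetical stand-in for get_query_fingerprint(); the real helper
        # normalizes the SQL per platform before hashing.
        return hashlib.sha256(query_text.encode()).hexdigest()[:16]


    def resolve_query_id(known: KnownLineage) -> str:
        # Prefer the caller-supplied id (e.g. precomputed by the Snowflake
        # lineage extractor), otherwise compute it here.
        return known.query_id or fingerprint(known.query_text)


    print(resolve_query_id(KnownLineage("SELECT 1", query_id="abc123")))  # abc123
    print(resolve_query_id(KnownLineage("SELECT 1")))                     # computed hash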