acryl-datahub 1.3.0.1rc1__py3-none-any.whl → 1.3.0.1rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/METADATA +2539 -2537
- {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/RECORD +47 -45
- datahub/_version.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +26 -0
- datahub/cli/config_utils.py +18 -10
- datahub/cli/docker_check.py +2 -1
- datahub/cli/docker_cli.py +4 -2
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/quickstart_versioning.py +2 -2
- datahub/cli/specific/dataproduct_cli.py +2 -4
- datahub/cli/specific/user_cli.py +172 -1
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/kafka.py +6 -4
- datahub/emitter/mce_builder.py +2 -4
- datahub/emitter/rest_emitter.py +15 -15
- datahub/entrypoints.py +2 -0
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/source.py +5 -0
- datahub/ingestion/graph/client.py +197 -0
- datahub/ingestion/graph/config.py +2 -2
- datahub/ingestion/sink/datahub_rest.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +20 -13
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -4
- datahub/ingestion/source/delta_lake/config.py +8 -4
- datahub/ingestion/source/grafana/models.py +5 -0
- datahub/ingestion/source/iceberg/iceberg.py +39 -19
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -1
- datahub/ingestion/source/mode.py +13 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +2 -2
- datahub/ingestion/source/sql/mssql/source.py +7 -1
- datahub/ingestion/source/sql/teradata.py +80 -65
- datahub/ingestion/source/unity/config.py +31 -0
- datahub/ingestion/source/unity/proxy.py +73 -0
- datahub/ingestion/source/unity/source.py +27 -70
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/sql_parsing/sql_parsing_aggregator.py +14 -5
- datahub/sql_parsing/sqlglot_lineage.py +7 -0
- datahub/telemetry/telemetry.py +8 -3
- datahub/utilities/file_backed_collections.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +22 -6
- datahub/utilities/sample_data.py +5 -4
- datahub/emitter/sql_parsing_builder.py +0 -306
- {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc1.dist-info → acryl_datahub-1.3.0.1rc3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/unity/source.py
CHANGED
@@ -30,7 +30,6 @@ from datahub.emitter.mcp_builder import (
     add_entity_to_container,
     gen_containers,
 )
-from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -142,11 +141,7 @@ from datahub.metadata.schema_classes import (
 from datahub.metadata.urns import MlModelGroupUrn, MlModelUrn, TagUrn
 from datahub.sdk import MLModel, MLModelGroup
 from datahub.sql_parsing.schema_resolver import SchemaResolver
-from datahub.sql_parsing.
-    SqlParsingResult,
-    sqlglot_lineage,
-    view_definition_lineage_helper,
-)
+from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator
 from datahub.utilities.file_backed_collections import FileBackedDict
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -199,6 +194,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
     platform_resource_repository: Optional[UnityCatalogPlatformResourceRepository] = (
         None
     )
+    sql_parsing_aggregator: Optional[SqlParsingAggregator] = None

     def get_report(self) -> UnityCatalogReport:
         return self.report
@@ -218,6 +214,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             report=self.report,
             hive_metastore_proxy=self.hive_metastore_proxy,
             lineage_data_source=config.lineage_data_source,
+            usage_data_source=config.usage_data_source,
             databricks_api_page_size=config.databricks_api_page_size,
         )

@@ -244,9 +241,6 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
         self.table_refs: Set[TableReference] = set()
         self.view_refs: Set[TableReference] = set()
         self.notebooks: FileBackedDict[Notebook] = FileBackedDict()
-        self.view_definitions: FileBackedDict[Tuple[TableReference, str]] = (
-            FileBackedDict()
-        )

         # Global map of tables, for profiling
         self.tables: FileBackedDict[Table] = FileBackedDict()
@@ -290,6 +284,17 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                 platform_instance=self.config.platform_instance,
                 env=self.config.env,
             )
+            self.sql_parsing_aggregator = SqlParsingAggregator(
+                platform=self.platform,
+                platform_instance=self.config.platform_instance,
+                env=self.config.env,
+                schema_resolver=self.sql_parser_schema_resolver,
+                generate_lineage=True,
+                generate_queries=False,
+                generate_usage_statistics=False,
+                generate_operations=False,
+            )
+            self.report.sql_aggregator = self.sql_parsing_aggregator.report
         except Exception as e:
             logger.debug("Exception", exc_info=True)
             self.warn(
@@ -629,8 +634,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             self.sql_parser_schema_resolver.add_schema_metadata(
                 dataset_urn, schema_metadata
             )
-        if table.view_definition:
-            self.
+        if table.view_definition and self.sql_parsing_aggregator:
+            self.sql_parsing_aggregator.add_view_definition(
+                view_urn=dataset_urn,
+                view_definition=table.view_definition,
+                default_db=table.ref.catalog,
+                default_schema=table.ref.schema,
+            )

         if (
             table_props.customProperties.get("table_type")
@@ -1334,75 +1344,22 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             )
         ]

-    def _run_sql_parser(
-        self, view_ref: TableReference, query: str, schema_resolver: SchemaResolver
-    ) -> Optional[SqlParsingResult]:
-        raw_lineage = sqlglot_lineage(
-            query,
-            schema_resolver=schema_resolver,
-            default_db=view_ref.catalog,
-            default_schema=view_ref.schema,
-        )
-        view_urn = self.gen_dataset_urn(view_ref)
-
-        if raw_lineage.debug_info.table_error:
-            logger.debug(
-                f"Failed to parse lineage for view {view_ref}: "
-                f"{raw_lineage.debug_info.table_error}"
-            )
-            self.report.num_view_definitions_failed_parsing += 1
-            self.report.view_definitions_parsing_failures.append(
-                f"Table-level sql parsing error for view {view_ref}: {raw_lineage.debug_info.table_error}"
-            )
-            return None
-
-        elif raw_lineage.debug_info.column_error:
-            self.report.num_view_definitions_failed_column_parsing += 1
-            self.report.view_definitions_parsing_failures.append(
-                f"Column-level sql parsing error for view {view_ref}: {raw_lineage.debug_info.column_error}"
-            )
-        else:
-            self.report.num_view_definitions_parsed += 1
-            if raw_lineage.out_tables != [view_urn]:
-                self.report.num_view_definitions_view_urn_mismatch += 1
-        return view_definition_lineage_helper(raw_lineage, view_urn)
-
     def get_view_lineage(self) -> Iterable[MetadataWorkUnit]:
         if not (
             self.config.include_hive_metastore
             and self.config.include_table_lineage
-            and self.
+            and self.sql_parsing_aggregator
         ):
             return
-
-
-
-            generate_usage_statistics=False,
-            generate_operations=False,
-        )
-        for dataset_name in self.view_definitions:
-            view_ref, view_definition = self.view_definitions[dataset_name]
-            result = self._run_sql_parser(
-                view_ref,
-                view_definition,
-                self.sql_parser_schema_resolver,
-            )
-            if result and result.out_tables:
-                # This does not yield any workunits but we use
-                # yield here to execute this method
-                yield from builder.process_sql_parsing_result(
-                    result=result,
-                    query=view_definition,
-                    is_view_ddl=True,
-                    include_column_lineage=self.config.include_view_column_lineage,
-                )
-        yield from builder.gen_workunits()
+
+        for mcp in self.sql_parsing_aggregator.gen_metadata():
+            yield mcp.as_workunit()

     def close(self):
         if self.hive_metastore_proxy:
             self.hive_metastore_proxy.close()
-        if self.
-            self.
+        if self.sql_parsing_aggregator:
+            self.sql_parsing_aggregator.close()
         if self.sql_parser_schema_resolver:
             self.sql_parser_schema_resolver.close()

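The hunks above swap the Unity source's hand-rolled view-definition parsing (SqlParsingBuilder plus a local _run_sql_parser) for the shared SqlParsingAggregator. A minimal sketch of that lifecycle outside the source class; the URN, SQL, and platform value are placeholders, and the constructor flags mirror the ones shown in the diff:

```python
from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator

# Placeholder values; the real source also passes platform_instance, env, and
# its pre-populated schema resolver, as shown in the hunk above.
aggregator = SqlParsingAggregator(
    platform="databricks",
    generate_lineage=True,
    generate_queries=False,
    generate_usage_statistics=False,
    generate_operations=False,
)

# Each view definition is registered up front; parsing and lineage resolution
# happen when metadata is generated.
aggregator.add_view_definition(
    view_urn="urn:li:dataset:(urn:li:dataPlatform:databricks,main.sales.v_orders,PROD)",
    view_definition="CREATE VIEW v_orders AS SELECT id, amount FROM main.sales.orders",
    default_db="main",
    default_schema="sales",
)

# get_view_lineage() now just drains the aggregator and wraps each MCP as a workunit.
for mcp in aggregator.gen_metadata():
    workunit = mcp.as_workunit()

aggregator.close()
```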
datahub/ingestion/source/unity/usage.py
CHANGED
@@ -11,7 +11,10 @@ from databricks.sdk.service.sql import QueryStatementType
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.unity.config import
+from datahub.ingestion.source.unity.config import (
+    UnityCatalogSourceConfig,
+    UsageDataSource,
+)
 from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
 from datahub.ingestion.source.unity.proxy_types import (
     OPERATION_STATEMENT_TYPES,
@@ -164,11 +167,50 @@ class UnityCatalogUsageExtractor:
             aspect=operation_aspect,
         ).as_workunit()

+    def _validate_usage_data_source_config(self) -> None:
+        """Validate usage data source configuration before execution."""
+        usage_data_source = self.config.usage_data_source
+
+        if (
+            usage_data_source == UsageDataSource.SYSTEM_TABLES
+            and not self.proxy.warehouse_id
+        ):
+            raise ValueError(
+                "usage_data_source is set to SYSTEM_TABLES but warehouse_id is not configured. "
+                "Either set warehouse_id or use AUTO/API mode."
+            )
+
     def _get_queries(self) -> Iterable[Query]:
         try:
-
-
-
+            self._validate_usage_data_source_config()
+            usage_data_source = self.config.usage_data_source
+
+            if usage_data_source == UsageDataSource.AUTO:
+                if self.proxy.warehouse_id:
+                    logger.info(
+                        "Using system tables for usage query history (AUTO mode)"
+                    )
+                    yield from self.proxy.get_query_history_via_system_tables(
+                        self.config.start_time, self.config.end_time
+                    )
+                else:
+                    logger.info(
+                        "Using API for usage query history (AUTO mode, no warehouse)"
+                    )
+                    yield from self.proxy.query_history(
+                        self.config.start_time, self.config.end_time
+                    )
+            elif usage_data_source == UsageDataSource.SYSTEM_TABLES:
+                logger.info("Using system tables for usage query history (forced)")
+                yield from self.proxy.get_query_history_via_system_tables(
+                    self.config.start_time, self.config.end_time
+                )
+            elif usage_data_source == UsageDataSource.API:
+                logger.info("Using API for usage query history (forced)")
+                yield from self.proxy.query_history(
+                    self.config.start_time, self.config.end_time
+                )
+
         except Exception as e:
             logger.warning("Error getting queries", exc_info=True)
             self.report.report_warning("get-queries", str(e))
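The new _get_queries() branches on a usage_data_source switch defined in unity/config.py (changed in this release but not shown here). The decision rule, distilled into a standalone sketch; the helper name and enum values are illustrative, only the member names AUTO, SYSTEM_TABLES, and API come from the hunk above:

```python
from enum import Enum
from typing import Optional


class UsageDataSource(Enum):
    # Member names mirror the hunk above; the values are assumptions.
    AUTO = "auto"
    SYSTEM_TABLES = "system_tables"
    API = "api"


def pick_query_history_backend(
    usage_data_source: UsageDataSource, warehouse_id: Optional[str]
) -> str:
    """Hypothetical helper mirroring the selection logic in _get_queries()."""
    if usage_data_source is UsageDataSource.SYSTEM_TABLES and not warehouse_id:
        # Same failure mode that _validate_usage_data_source_config() rejects.
        raise ValueError("SYSTEM_TABLES requires a configured warehouse_id")
    if usage_data_source is UsageDataSource.API:
        return "api"
    if usage_data_source is UsageDataSource.SYSTEM_TABLES:
        return "system_tables"
    # AUTO prefers system tables when a SQL warehouse is available.
    return "system_tables" if warehouse_id else "api"


assert pick_query_history_backend(UsageDataSource.AUTO, None) == "api"
assert pick_query_history_backend(UsageDataSource.AUTO, "warehouse-123") == "system_tables"
```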
datahub/sql_parsing/sql_parsing_aggregator.py
CHANGED
@@ -4,7 +4,6 @@ import enum
 import functools
 import json
 import logging
-import os
 import pathlib
 import tempfile
 import uuid
@@ -14,10 +13,10 @@ from typing import Callable, Dict, Iterable, List, Optional, Set, Union, cast

 import datahub.emitter.mce_builder as builder
 import datahub.metadata.schema_classes as models
+from datahub.configuration.env_vars import get_sql_agg_query_log
 from datahub.configuration.time_window_config import get_time_bucket
 from datahub.emitter.mce_builder import get_sys_time, make_ts_millis
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.sql_parsing_builder import compute_upstream_fields
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -84,7 +83,7 @@ class QueryLogSetting(enum.Enum):
 _DEFAULT_USER_URN = CorpUserUrn("_ingestion")
 _MISSING_SESSION_ID = "__MISSING_SESSION_ID"
 _DEFAULT_QUERY_LOG_SETTING = QueryLogSetting[
-
+    get_sql_agg_query_log() or QueryLogSetting.DISABLED.name
 ]
 MAX_UPSTREAM_TABLES_COUNT = 300
 MAX_FINEGRAINEDLINEAGE_COUNT = 2000
@@ -868,7 +867,7 @@ class SqlParsingAggregator(Closeable):
             downstream=parsed.out_tables[0] if parsed.out_tables else None,
             column_lineage=parsed.column_lineage,
             # TODO: We need a full list of columns referenced, not just the out tables.
-            column_usage=
+            column_usage=self._compute_upstream_fields(parsed),
             inferred_schema=infer_output_schema(parsed),
             confidence_score=parsed.debug_info.confidence,
             extra_info=observed.extra_info,
@@ -1157,7 +1156,7 @@ class SqlParsingAggregator(Closeable):
                 actor=None,
                 upstreams=parsed.in_tables,
                 column_lineage=parsed.column_lineage or [],
-                column_usage=
+                column_usage=self._compute_upstream_fields(parsed),
                 confidence_score=parsed.debug_info.confidence,
             )
         )
@@ -1741,6 +1740,16 @@ class SqlParsingAggregator(Closeable):

         return resolved_query

+    @staticmethod
+    def _compute_upstream_fields(
+        result: SqlParsingResult,
+    ) -> Dict[UrnStr, Set[UrnStr]]:
+        upstream_fields: Dict[UrnStr, Set[UrnStr]] = defaultdict(set)
+        for cl in result.column_lineage or []:
+            for upstream in cl.upstreams:
+                upstream_fields[upstream.table].add(upstream.column)
+        return upstream_fields
+
     def _gen_usage_statistics_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
         if not self._usage_aggregator:
             return
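The new _compute_upstream_fields() inlines what the deleted sql_parsing_builder.compute_upstream_fields() used to provide: a map from each upstream table URN to the set of its columns referenced by the parsed statement. A rough standalone equivalent, assuming SchemaResolver and sqlglot_lineage behave as in the code removed from the Unity source above; the SQL and platform are placeholders:

```python
from collections import defaultdict

from datahub.sql_parsing.schema_resolver import SchemaResolver
from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage

# No schemas registered, so this is a best-effort parse of an explicit column list.
resolver = SchemaResolver(platform="databricks")
parsed = sqlglot_lineage(
    "CREATE VIEW v_orders AS SELECT id, amount FROM main.sales.orders",
    schema_resolver=resolver,
    default_db="main",
    default_schema="sales",
)

# Same shape as _compute_upstream_fields(parsed): {upstream table urn: {columns}}.
upstream_fields = defaultdict(set)
for cl in parsed.column_lineage or []:
    for upstream in cl.upstreams:
        upstream_fields[upstream.table].add(upstream.column)

print(dict(upstream_fields))  # expect the orders table mapped to {"id", "amount"}
```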
datahub/sql_parsing/sqlglot_lineage.py
CHANGED
@@ -691,6 +691,13 @@ def _column_level_lineage(
         select_statement=select_statement,
     )

+    # Handle VALUES expressions separately - they have no upstream tables and no column lineage
+    if isinstance(select_statement, sqlglot.exp.Values):
+        return _ColumnLineageWithDebugInfo(
+            column_lineage=[],
+            select_statement=select_statement,
+        )
+
     assert isinstance(select_statement, _SupportedColumnLineageTypesTuple)
     try:
         root_scope = sqlglot.optimizer.build_scope(select_statement)
datahub/telemetry/telemetry.py
CHANGED
@@ -16,6 +16,11 @@ from datahub._version import __version__, nice_version_name
 from datahub.cli.config_utils import DATAHUB_ROOT_FOLDER
 from datahub.cli.env_utils import get_boolean_env_variable
 from datahub.configuration.common import ExceptionWithProps
+from datahub.configuration.env_vars import (
+    get_sentry_dsn,
+    get_sentry_environment,
+    get_telemetry_timeout,
+)
 from datahub.metadata.schema_classes import _custom_package_path
 from datahub.utilities.perf_timer import PerfTimer

@@ -97,11 +102,11 @@ if any(var in os.environ for var in CI_ENV_VARS):
 if _custom_package_path:
     ENV_ENABLED = False

-TIMEOUT = int(
+TIMEOUT = int(get_telemetry_timeout())
 MIXPANEL_ENDPOINT = "track.datahubproject.io/mp"
 MIXPANEL_TOKEN = "5ee83d940754d63cacbf7d34daa6f44a"
-SENTRY_DSN: Optional[str] =
-SENTRY_ENVIRONMENT: str =
+SENTRY_DSN: Optional[str] = get_sentry_dsn()
+SENTRY_ENVIRONMENT: str = get_sentry_environment()


 def _default_global_properties() -> Dict[str, Any]:
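Several of the hunks in this release (telemetry, file_backed_collections, logging_manager, sample_data, sql_parsing_aggregator) replace scattered os.environ reads with accessors from the new datahub/configuration/env_vars.py module (+331 lines, not shown in this diff). A plausible shape for such accessors, offered purely as an assumption about the pattern: the environment variable names and default below are made up, only the function names come from the imports above.

```python
import os
from typing import Optional


def get_telemetry_timeout() -> str:
    # Hypothetical variable name and default; the caller casts the result,
    # e.g. TIMEOUT = int(get_telemetry_timeout()) in telemetry.py above.
    return os.environ.get("DATAHUB_TELEMETRY_TIMEOUT", "10")


def get_sentry_dsn() -> Optional[str]:
    # Hypothetical variable name; returns None when unset so the Optional[str]
    # annotation on SENTRY_DSN still holds.
    return os.environ.get("SENTRY_DSN") or None
```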
datahub/utilities/file_backed_collections.py
CHANGED
@@ -1,7 +1,6 @@
 import collections
 import gzip
 import logging
-import os
 import pathlib
 import pickle
 import shutil
@@ -28,6 +27,7 @@ from typing import (
     Union,
 )

+from datahub.configuration.env_vars import get_override_sqlite_version_req
 from datahub.ingestion.api.closeable import Closeable
 from datahub.utilities.sentinels import Unset, unset

@@ -36,7 +36,7 @@ logger: logging.Logger = logging.getLogger(__name__)

 def _get_sqlite_version_override() -> bool:
     """Check if SQLite version requirement should be overridden at runtime."""
-    override_str =
+    override_str = get_override_sqlite_version_req()
     return bool(override_str and override_str.lower() != "false")


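The semantics of _get_sqlite_version_override() are unchanged by the refactor: any non-empty value other than "false" (case-insensitive) enables the override. A quick check of that predicate in isolation:

```python
def interpret(override_str):
    # Same expression as the return statement in _get_sqlite_version_override().
    return bool(override_str and override_str.lower() != "false")


assert interpret(None) is False
assert interpret("") is False
assert interpret("false") is False
assert interpret("False") is False
assert interpret("1") is True
assert interpret("true") is True
```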
datahub/utilities/logging_manager.py
CHANGED
@@ -15,13 +15,13 @@ import collections
 import contextlib
 import itertools
 import logging
-import os
 import pathlib
 import sys
 from typing import Deque, Iterator, Optional

 import click

+from datahub.configuration.env_vars import get_no_color, get_suppress_logging_manager
 from datahub.utilities.tee_io import TeeIO

 BASE_LOGGING_FORMAT = (
@@ -38,7 +38,7 @@ IN_MEMORY_LOG_BUFFER_SIZE = 2000  # lines
 IN_MEMORY_LOG_BUFFER_MAX_LINE_LENGTH = 2000  # characters


-NO_COLOR =
+NO_COLOR = get_no_color()


 def extract_name_from_filename(filename: str, fallback_name: str) -> str:
@@ -179,6 +179,18 @@ class _LogBuffer:
         return text


+class _ResilientStreamHandler(logging.StreamHandler):
+    """StreamHandler that gracefully handles closed streams."""
+
+    def emit(self, record: logging.LogRecord) -> None:
+        try:
+            super().emit(record)
+        except (ValueError, OSError):
+            # Stream was closed (e.g., during pytest teardown)
+            # Silently ignore to prevent test failures
+            pass
+
+
 class _BufferLogHandler(logging.Handler):
     def __init__(self, storage: _LogBuffer) -> None:
         super().__init__()
@@ -201,7 +213,11 @@
 def _remove_all_handlers(logger: logging.Logger) -> None:
     for handler in logger.handlers[:]:
         logger.removeHandler(handler)
-
+        try:
+            handler.close()
+        except (ValueError, OSError):
+            # Handler stream may already be closed (e.g., during pytest teardown)
+            pass


 _log_buffer = _LogBuffer(maxlen=IN_MEMORY_LOG_BUFFER_SIZE)
@@ -219,14 +235,14 @@ _default_formatter = logging.Formatter(BASE_LOGGING_FORMAT)
 def configure_logging(debug: bool, log_file: Optional[str] = None) -> Iterator[None]:
     _log_buffer.clear()

-    if
+    if get_suppress_logging_manager() == "1":
         # If we're running in pytest, we don't want to configure logging.
         yield
         return

     with contextlib.ExitStack() as stack:
         # Create stdout handler.
-        stream_handler =
+        stream_handler = _ResilientStreamHandler()
         stream_handler.addFilter(_DatahubLogFilter(debug=debug))
         stream_handler.setFormatter(_stream_formatter)

@@ -237,7 +253,7 @@ def configure_logging(debug: bool, log_file: Optional[str] = None) -> Iterator[None]:
             tee = TeeIO(sys.stdout, file)
             stack.enter_context(contextlib.redirect_stdout(tee))  # type: ignore

-            file_handler =
+            file_handler = _ResilientStreamHandler(file)
             file_handler.addFilter(_DatahubLogFilter(debug=True))
             file_handler.setFormatter(_default_formatter)
         else:
datahub/utilities/sample_data.py
CHANGED
@@ -1,12 +1,13 @@
-import os
 import pathlib
 import tempfile

 import requests

-
-
-
+from datahub.configuration.env_vars import get_docker_compose_base
+
+DOCKER_COMPOSE_BASE = (
+    get_docker_compose_base()
+    or "https://raw.githubusercontent.com/datahub-project/datahub/master"
 )
 BOOTSTRAP_MCES_FILE = "metadata-ingestion/examples/mce_files/bootstrap_mce.json"
 BOOTSTRAP_MCES_URL = f"{DOCKER_COMPOSE_BASE}/{BOOTSTRAP_MCES_FILE}"