acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/METADATA +2617 -2590
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/RECORD +223 -189
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/pydantic_migration_helpers.py +7 -5
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1522 -569
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17758 -17097
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/sdk/search_filters.py +95 -27
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +56 -14
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/debug/datahub_debug.py (new file):

@@ -0,0 +1,300 @@
import logging
import socket
import time
from typing import Iterable, Optional
from urllib.parse import urlparse

import dns.exception
import dns.resolver
import requests

from datahub.configuration.common import ConfigModel
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.decorators import (
    SupportStatus,
    config_class,
    platform_name,
    support_status,
)
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit

logger = logging.getLogger(__name__)


class DataHubDebugSourceConfig(ConfigModel):
    dns_probe_url: Optional[str] = None


@platform_name("DataHubDebug")
@config_class(DataHubDebugSourceConfig)
@support_status(SupportStatus.TESTING)
class DataHubDebugSource(Source):
    """
    DataHubDebugSource is helper to debug things in executor where ingestion is running.

    This source can perform the following tasks:
    1. Network probe of a URL. Different from test connection in sources as that is after source starts.

    """

    def __init__(self, ctx: PipelineContext, config: DataHubDebugSourceConfig):
        self.ctx = ctx
        self.config = config
        self.report = SourceReport()
        self.report.event_not_produced_warn = False

    @classmethod
    def create(cls, config_dict, ctx):
        config = DataHubDebugSourceConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def perform_dns_probe(self, url: str) -> None:
        """
        Perform comprehensive DNS probe and network connectivity tests.
        Logs detailed information to help diagnose network issues.
        """
        logger.info(f"Starting DNS probe for URL: {url}")
        logger.info("=" * 60)
        logger.info(f"DNS PROBE REPORT FOR: {url}")
        logger.info("=" * 60)

        try:
            # Parse the URL to extract hostname
            parsed_url = urlparse(
                url if url.startswith(("http://", "https://")) else f"http://{url}"
            )
            hostname = parsed_url.hostname or parsed_url.netloc
            port = parsed_url.port or (443 if parsed_url.scheme == "https" else 80)

            logger.info(f"Parsed hostname: {hostname}")
            logger.info(f"Target port: {port}")
            logger.info(f"URL scheme: {parsed_url.scheme}")
            logger.info("-" * 60)

            # Test 1: Enhanced DNS resolution with dnspython if available
            logger.info("1. DNS RESOLUTION TEST")
            self._dns_probe_with_dnspython(hostname)

            logger.info("-" * 60)

            # Test 2: HTTP/HTTPS connectivity test with requests if available
            logger.info("2. HTTP CONNECTIVITY TEST")
            self._http_probe_with_requests(url)

            logger.info("-" * 60)

            # Test 3: System network information
            logger.info("3. SYSTEM NETWORK INFORMATION")
            self._log_system_network_info()

        except Exception as e:
            logger.error(f"DNS probe failed with unexpected error: {e}", exc_info=True)

        logger.info("=" * 60)
        logger.info("DNS PROBE COMPLETED")
        logger.info("=" * 60)

    def _dns_probe_with_dnspython(self, hostname: str) -> None:
        """Enhanced DNS probing using dnspython library"""
        try:
            # Test different record types
            record_types = ["A", "AAAA", "CNAME", "MX"]

            for record_type in record_types:
                try:
                    start_time = time.time()
                    answers = dns.resolver.resolve(hostname, record_type)
                    dns_time = time.time() - start_time

                    logger.info(
                        f"✓ {record_type} record resolution successful ({dns_time:.3f}s)"
                    )
                    for answer in answers:
                        logger.info(f"  - {record_type}: {answer}")

                except dns.resolver.NXDOMAIN:
                    logger.info(f"✗ {record_type} record: Domain does not exist")
                except dns.resolver.NoAnswer:
                    logger.info(
                        f"- {record_type} record: No answer (record type not available)"
                    )
                except dns.exception.Timeout:
                    logger.error(f"✗ {record_type} record: DNS query timed out")
                except Exception as e:
                    logger.error(f"✗ {record_type} record query failed: {e}")

            # Test different DNS servers
            logger.info("Testing with different DNS servers:")
            dns_servers = ["8.8.8.8", "1.1.1.1", "208.67.222.222"]

            for dns_server in dns_servers:
                try:
                    resolver = dns.resolver.Resolver()
                    resolver.nameservers = [dns_server]
                    resolver.timeout = 5

                    start_time = time.time()
                    answers = resolver.resolve(hostname, "A")
                    dns_time = time.time() - start_time

                    logger.info(
                        f"✓ DNS server {dns_server} responded ({dns_time:.3f}s)"
                    )
                    for answer in answers:
                        logger.info(f"  - A: {answer}")

                except Exception as e:
                    logger.error(f"✗ DNS server {dns_server} failed: {e}")

        except Exception as e:
            logger.error(f"Enhanced DNS probe failed: {e}", exc_info=True)

    def _http_probe_with_requests(self, url: str) -> None:
        """HTTP connectivity test using requests library"""
        try:
            # Test with different timeouts and methods
            timeout = 10
            allow_redirects_head = True
            allow_redirects_get = False

            # Test HEAD request
            try:
                logger.info(f"Testing HEAD request with timeout {timeout}s")
                start_time = time.time()

                response = requests.head(
                    url, timeout=timeout, allow_redirects=allow_redirects_head
                )

                request_time = time.time() - start_time

                logger.info(f"✓ HEAD request successful ({request_time:.3f}s)")
                logger.info(f"  Status code: {response.status_code}")
                logger.info(
                    f"  Response headers: {dict(list(response.headers.items())[:5])}"
                )

                if hasattr(response, "url") and response.url != url:
                    logger.info(f"  Final URL after redirects: {response.url}")

            except requests.exceptions.Timeout:
                logger.error(f"✗ HEAD request timed out after {timeout}s")
            except requests.exceptions.ConnectionError as e:
                logger.error(f"✗ HEAD connection error: {e}")
            except requests.exceptions.RequestException as e:
                logger.error(f"✗ HEAD request failed: {e}")
            except Exception as e:
                logger.error(f"✗ HEAD unexpected error: {e}")

            # Test GET request
            try:
                logger.info(f"Testing GET request with timeout {timeout}s")
                start_time = time.time()

                response = requests.get(
                    url, timeout=timeout, allow_redirects=allow_redirects_get
                )

                request_time = time.time() - start_time

                logger.info(f"✓ GET request successful ({request_time:.3f}s)")
                logger.info(f"  Status code: {response.status_code}")
                logger.info(
                    f"  Response headers: {dict(list(response.headers.items())[:5])}"
                )

                if hasattr(response, "url") and response.url != url:
                    logger.info(f"  Final URL after redirects: {response.url}")

            except requests.exceptions.Timeout:
                logger.error(f"✗ GET request timed out after {timeout}s")
            except requests.exceptions.ConnectionError as e:
                logger.error(f"✗ GET connection error: {e}")
            except requests.exceptions.RequestException as e:
                logger.error(f"✗ GET request failed: {e}")
            except Exception as e:
                logger.error(f"✗ GET unexpected error: {e}")

        except Exception as e:
            logger.error(f"HTTP probe failed: {e}", exc_info=True)

    def _log_dns_troubleshooting(self) -> None:
        """Log DNS troubleshooting information"""
        logger.info("DNS TROUBLESHOOTING SUGGESTIONS:")
        logger.info("- Check if the hostname is correct")
        logger.info("- Verify DNS server configuration")
        logger.info("- Check network connectivity")
        logger.info("- Try using a different DNS server (8.8.8.8, 1.1.1.1)")
        logger.info("- Check if there are firewall restrictions")

    def _log_system_network_info(self) -> None:
        """Log system network configuration information"""
        try:
            local_hostname = socket.gethostname()
            logger.info(f"Local hostname: {local_hostname}")

            try:
                local_ips = socket.getaddrinfo(local_hostname, None)
                logger.info("Local IP addresses:")
                for addr_info in local_ips:
                    if addr_info[0] in [socket.AF_INET, socket.AF_INET6]:
                        family = "IPv4" if addr_info[0] == socket.AF_INET else "IPv6"
                        logger.info(f"  - {addr_info[4][0]} ({family})")
            except Exception as e:
                logger.warning(f"Could not retrieve local IP addresses: {e}")

            logger.info("DNS Server Connectivity:")
            dns_servers = ["8.8.8.8", "1.1.1.1", "208.67.222.222"]
            for dns_server in dns_servers:
                try:
                    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                    sock.settimeout(5)
                    result = sock.connect_ex((dns_server, 53))
                    if result == 0:
                        logger.info(f"  ✓ Can reach {dns_server}:53")
                    else:
                        logger.error(f"  ✗ Cannot reach {dns_server}:53")
                    sock.close()
                except Exception as e:
                    logger.error(f"  ✗ Error testing {dns_server}:53 - {e}")

        except Exception as e:
            logger.warning(f"Could not gather system network info: {e}")

    def _test_alternative_dns(self, hostname: str) -> None:
        """Test hostname resolution using alternative methods"""
        try:
            families = [(socket.AF_INET, "IPv4"), (socket.AF_INET6, "IPv6")]

            for family, family_name in families:
                try:
                    result = socket.getaddrinfo(hostname, None, family)
                    if result:
                        logger.info(f"✓ {family_name} resolution successful:")
                        for addr_info in result[:3]:
                            logger.info(f"  - {addr_info[4][0]}")
                    else:
                        logger.warning(
                            f"✗ {family_name} resolution returned no results"
                        )
                except socket.gaierror:
                    logger.error(f"✗ {family_name} resolution failed")
                except Exception as e:
                    logger.error(f"✗ {family_name} resolution error: {e}")

        except Exception as e:
            logger.error(f"Alternative DNS test failed: {e}")

    def get_workunits_internal(
        self,
    ) -> Iterable[MetadataWorkUnit]:
        if self.config.dns_probe_url is not None:
            # Perform DNS probe
            logger.info(f"Performing DNS probe for: {self.config.dns_probe_url}")
            self.perform_dns_probe(self.config.dns_probe_url)

        yield from []

    def get_report(self) -> SourceReport:
        return self.report
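The new debug source above yields no metadata work units; it only runs the probes and writes its findings to the log. Below is a minimal sketch of driving it directly; the run_id and config values are illustrative and not taken from the diff, and in practice the source would be wired up through an ingestion recipe rather than instantiated by hand.

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.debug.datahub_debug import DataHubDebugSource

# Hypothetical standalone invocation of the debug source.
ctx = PipelineContext(run_id="debug-probe")
source = DataHubDebugSource.create({"dns_probe_url": "https://example.com"}, ctx)

# get_workunits_internal() performs the DNS/HTTP probes as a side effect and
# yields nothing; the results land in the logger output.
for _ in source.get_workunits_internal():
    pass

print(source.get_report().as_string())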
datahub/ingestion/source/dremio/dremio_api.py:

@@ -7,7 +7,7 @@ from collections import defaultdict
 from enum import Enum
 from itertools import product
 from time import sleep, time
-from typing import Any, Deque, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Deque, Dict, List, Optional, Union
 from urllib.parse import quote
 
 import requests
@@ -15,12 +15,17 @@ from requests.adapters import HTTPAdapter
 from urllib3 import Retry
 from urllib3.exceptions import InsecureRequestWarning
 
+from datahub.emitter.request_helper import make_curl_command
 from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
 from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
     DremioToDataHubSourceTypeMapping,
 )
 from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
 from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries
+from datahub.utilities.perf_timer import PerfTimer
+
+if TYPE_CHECKING:
+    from datahub.ingestion.source.dremio.dremio_entities import DremioContainer
 
 logger = logging.getLogger(__name__)
 
@@ -54,6 +59,8 @@ class DremioAPIOperations:
         self.deny_schema_pattern: List[str] = connection_args.schema_pattern.deny
         self._max_workers: int = connection_args.max_workers
         self.is_dremio_cloud = connection_args.is_dremio_cloud
+        self.start_time = connection_args.start_time
+        self.end_time = connection_args.end_time
         self.report = report
         self.session = requests.Session()
         if connection_args.is_dremio_cloud:
@@ -178,6 +185,7 @@
             self.session.headers.update(
                 {"Authorization": f"Bearer {connection_args.password}"}
             )
+            logger.debug("Configured Dremio cloud API session to use PAT")
             return
 
         # On-prem Dremio authentication (PAT or Basic Auth)
@@ -189,6 +197,7 @@
                     "Authorization": f"Bearer {connection_args.password}",
                 }
             )
+            logger.debug("Configured Dremio API session to use PAT")
             return
         else:
             assert connection_args.username and connection_args.password, (
@@ -212,10 +221,10 @@
             response.raise_for_status()
             token = response.json().get("token")
             if token:
+                logger.debug("Exchanged username and password for Dremio token")
                 self.session.headers.update(
                     {"Authorization": f"_dremio{token}"}
                 )
-
                 return
             else:
                 self.report.failure("Failed to authenticate", login_url)
@@ -231,49 +240,76 @@
            "Credentials cannot be refreshed. Please check your username and password."
         )
 
+    def _request(self, method: str, url: str, data: Union[str, None] = None) -> Dict:
+        """Send a request to the Dremio API."""
+
+        logger.debug(f"{method} request to {self.base_url + url}")
+        self.report.api_calls_total += 1
+        self.report.api_calls_by_method_and_path[f"{method} {url}"] += 1
+
+        with PerfTimer() as timer:
+            response = self.session.request(
+                method=method,
+                url=(self.base_url + url),
+                data=data,
+                verify=self._verify,
+                timeout=self._timeout,
+            )
+        self.report.api_call_secs_by_method_and_path[f"{method} {url}"] += (
+            timer.elapsed_seconds()
+        )
+        # response.raise_for_status()  # Enabling this line, makes integration tests to fail
+        try:
+            return response.json()
+        except requests.exceptions.JSONDecodeError as e:
+            logger.info(
+                f"On {method} request to {url}, failed to parse JSON from response (status {response.status_code}): {response.text}"
+            )
+            logger.debug(
+                f"Request curl equivalent: {make_curl_command(self.session, method, url, data)}"
+            )
+            raise DremioAPIException(
+                f"Failed to parse JSON from response (status {response.status_code}): {response.text}"
+            ) from e
+
     def get(self, url: str) -> Dict:
-        """
-
-            url=(self.base_url + url),
-            verify=self._verify,
-            timeout=self._timeout,
-        )
-        return response.json()
+        """Send a GET request to the Dremio API."""
+        return self._request("GET", url)
 
     def post(self, url: str, data: str) -> Dict:
-        """
-
-            url=(self.base_url + url),
-            data=data,
-            verify=self._verify,
-            timeout=self._timeout,
-        )
-        return response.json()
+        """Send a POST request to the Dremio API."""
+        return self._request("POST", url, data=data)
 
     def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]:
         """Execute SQL query with timeout and error handling"""
         try:
-
+            with PerfTimer() as timer:
+                logger.info(f"Executing query: {query}")
+                response = self.post(url="/sql", data=json.dumps({"sql": query}))
 
-
-
-
-
-
+            if "errorMessage" in response:
+                self.report.failure(
+                    message="SQL Error", context=f"{response['errorMessage']}"
+                )
+                raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
 
-
+            job_id = response["id"]
 
-
-
-
-
-
-
-
-
-
-
-
+            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                future = executor.submit(self.fetch_results, job_id)
+                try:
+                    result = future.result(timeout=timeout)
+                    logger.info(
+                        f"Query executed in {timer.elapsed_seconds()} seconds with {len(result)} results"
+                    )
+                    return result
+                except concurrent.futures.TimeoutError:
+                    self.cancel_query(job_id)
+                    raise DremioAPIException(
+                        f"Query execution timed out after {timeout} seconds"
+                    ) from None
+                except RuntimeError as e:
+                    raise DremioAPIException() from e
 
         except requests.RequestException as e:
             raise DremioAPIException("Error executing query") from e
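The new _request helper above funnels every Dremio REST call through one place and records per-endpoint call counts and latencies on the report (the TopKDict fields are added in the dremio_reporting.py hunk further down). The following is a small sketch of that bookkeeping pattern in isolation; the endpoint key and the sleep stand-in are illustrative only.

import time

from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.stats_collections import float_top_k_dict, int_top_k_dict

api_calls_by_method_and_path = int_top_k_dict()
api_call_secs_by_method_and_path = float_top_k_dict()

endpoint = "GET /api/v3/catalog"  # real keys are built as f"{method} {url}"
api_calls_by_method_and_path[endpoint] += 1

with PerfTimer() as timer:
    time.sleep(0.05)  # stand-in for self.session.request(...)
api_call_secs_by_method_and_path[endpoint] += timer.elapsed_seconds()

print(api_calls_by_method_and_path, api_call_secs_by_method_and_path)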
datahub/ingestion/source/dremio/dremio_api.py (continued):

@@ -462,7 +498,9 @@
         pattern_str = "|".join(f"({p})" for p in patterns)
         return f"AND {operator}({field}, '{pattern_str}')"
 
-    def get_all_tables_and_columns(
+    def get_all_tables_and_columns(
+        self, containers: Deque["DremioContainer"]
+    ) -> List[Dict]:
         if self.edition == DremioEdition.ENTERPRISE:
             query_template = DremioSQLQueries.QUERY_DATASETS_EE
         elif self.edition == DremioEdition.CLOUD:
@@ -603,10 +641,25 @@
         return parents_list
 
     def extract_all_queries(self) -> List[Dict[str, Any]]:
+        # Convert datetime objects to string format for SQL queries
+        start_timestamp_str = None
+        end_timestamp_str = None
+
+        if self.start_time:
+            start_timestamp_str = self.start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+        if self.end_time:
+            end_timestamp_str = self.end_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+
         if self.edition == DremioEdition.CLOUD:
-            jobs_query = DremioSQLQueries.
+            jobs_query = DremioSQLQueries.get_query_all_jobs_cloud(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )
         else:
-            jobs_query = DremioSQLQueries.
+            jobs_query = DremioSQLQueries.get_query_all_jobs(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )
 
         return self.execute_query(query=jobs_query)
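The start/end bounds above are rendered as millisecond-precision timestamp strings by slicing off the last three digits of strftime's microsecond field. A quick illustration of the [:-3] trick in plain Python:

from datetime import datetime

ts = datetime(2024, 5, 1, 12, 30, 45, 123456)
# %f yields six microsecond digits; dropping the last three leaves milliseconds,
# which is the string format handed to the Dremio jobs queries above.
print(ts.strftime("%Y-%m-%d %H:%M:%S.%f"))       # 2024-05-01 12:30:45.123456
print(ts.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3])  # 2024-05-01 12:30:45.123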
datahub/ingestion/source/dremio/dremio_api.py (continued):

@@ -685,6 +738,27 @@
 
         return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)
 
+    def _could_match_pattern(self, pattern: str, path_components: List[str]) -> bool:
+        """
+        Check if a container path could potentially match a schema pattern.
+        This handles hierarchical path matching for container filtering.
+        """
+        if pattern == ".*":
+            return True
+
+        current_path = ".".join(path_components)
+
+        # Handle simple .* patterns (like "a.b.c.*")
+        if pattern.endswith(".*") and not any(c in pattern for c in "^$[](){}+?\\"):
+            # Simple dotstar pattern - check prefix matching
+            pattern_prefix = pattern[:-2]  # Remove ".*"
+            return current_path.lower().startswith(
+                pattern_prefix.lower()
+            ) or pattern_prefix.lower().startswith(current_path.lower())
+        else:
+            # Complex regex pattern - use existing regex matching logic
+            return self._check_pattern_match(pattern, [current_path], allow_prefix=True)
+
     def should_include_container(self, path: List[str], name: str) -> bool:
         """
         Helper method to check if a container should be included based on schema patterns.
@@ -711,41 +785,8 @@
 
         # Check allow patterns
         for pattern in self.allow_schema_pattern:
-            #
-            if
-                pattern_parts = pattern.split(".")
-                path_parts = path_components
-
-                # If pattern has exact same number of parts, check each component
-                if len(pattern_parts) == len(path_parts):
-                    matches = True
-                    for p_part, c_part in zip(pattern_parts, path_parts):
-                        if p_part != "*" and p_part.lower() != c_part.lower():
-                            matches = False
-                            break
-                    if matches:
-                        self.report.report_container_scanned(full_path)
-                        return True
-                # Otherwise check if current path is prefix match
-                else:
-                    # Remove the trailing wildcard if present
-                    if pattern_parts[-1] == "*":
-                        pattern_parts = pattern_parts[:-1]
-
-                    for i in range(len(path_parts)):
-                        current_path = ".".join(path_parts[: i + 1])
-                        pattern_prefix = ".".join(pattern_parts[: i + 1])
-
-                        if pattern_prefix.startswith(current_path):
-                            self.report.report_container_scanned(full_path)
-                            return True
-
-            # Direct pattern matching
-            if self._check_pattern_match(
-                pattern=pattern,
-                paths=[full_path],
-                allow_prefix=True,
-            ):
+            # Check if current path could potentially match this pattern
+            if self._could_match_pattern(pattern, path_components):
                 self.report.report_container_scanned(full_path)
                 return True
 
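The replacement logic above delegates to _could_match_pattern, whose simple-dotstar branch matches in both directions so that parent containers of an allowed schema are still traversed. Below is a standalone re-implementation of just that branch, for illustration only (not the DremioAPIOperations method itself):

from typing import List

def could_match_simple(pattern: str, path_components: List[str]) -> bool:
    # Mirrors the simple ".*" branch shown above: compare the container path and
    # the pattern prefix in both directions, case-insensitively.
    current_path = ".".join(path_components)
    pattern_prefix = pattern[:-2]  # drop the trailing ".*"
    return current_path.lower().startswith(
        pattern_prefix.lower()
    ) or pattern_prefix.lower().startswith(current_path.lower())

# "space.folder.*" admits the parent container "space" (so traversal can continue)
# as well as anything underneath "space.folder", but rejects unrelated paths.
print(could_match_simple("space.folder.*", ["space"]))                 # True
print(could_match_simple("space.folder.*", ["space", "folder", "x"]))  # True
print(could_match_simple("space.folder.*", ["other"]))                 # False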
datahub/ingestion/source/dremio/dremio_config.py:

@@ -9,6 +9,7 @@ from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
+from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.ingestion.source.ge_profiling_config import GEProfilingBaseConfig
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
@@ -118,6 +119,7 @@ class DremioSourceMapping(EnvConfigMixin, PlatformInstanceConfigMixin, ConfigMod
 class DremioSourceConfig(
     DremioConnectionConfig,
     StatefulIngestionConfigBase,
+    BaseTimeWindowConfig,
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 ):
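Mixing BaseTimeWindowConfig into DremioSourceConfig is what supplies the start_time / end_time values read by DremioAPIOperations near the top of dremio_api.py. A minimal sketch of the mixin on its own follows; the field names come from BaseTimeWindowConfig, while the concrete dates are made up, and defaults or relative-time parsing are assumed to behave as in other DataHub ConfigModel classes.

from datetime import datetime, timezone

from datahub.configuration.time_window_config import BaseTimeWindowConfig

# Assumption: only start_time and end_time are shown; other fields keep their defaults.
window = BaseTimeWindowConfig.parse_obj(
    {
        "start_time": datetime(2024, 1, 1, tzinfo=timezone.utc),
        "end_time": datetime(2024, 1, 8, tzinfo=timezone.utc),
    }
)
print(window.start_time, window.end_time)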
datahub/ingestion/source/dremio/dremio_reporting.py:

@@ -1,22 +1,43 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import datetime
+from typing import Optional
 
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
+from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+from datahub.utilities.stats_collections import (
+    TopKDict,
+    float_top_k_dict,
+    int_top_k_dict,
+)
 
 
 @dataclass
 class DremioSourceReport(
-    SQLSourceReport,
+    SQLSourceReport,
+    StaleEntityRemovalSourceReport,
+    IngestionStageReport,
+    BaseTimeWindowReport,
 ):
     num_containers_failed: int = 0
     num_datasets_failed: int = 0
     containers_scanned: int = 0
     containers_filtered: int = 0
 
+    api_calls_total: int = 0
+    api_calls_by_method_and_path: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+    api_call_secs_by_method_and_path: TopKDict[str, float] = field(
+        default_factory=float_top_k_dict
+    )
+
+    sql_aggregator: Optional[SqlAggregatorReport] = None
+
     def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
         # recording total combined latency is not very useful, keeping this method as a placeholder
         # for future implementation of min / max / percentiles etc.