acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/debug/datahub_debug.py (new file)
@@ -0,0 +1,300 @@
+ import logging
+ import socket
+ import time
+ from typing import Iterable, Optional
+ from urllib.parse import urlparse
+
+ import dns.exception
+ import dns.resolver
+ import requests
+
+ from datahub.configuration.common import ConfigModel
+ from datahub.ingestion.api.common import PipelineContext
+ from datahub.ingestion.api.decorators import (
+     SupportStatus,
+     config_class,
+     platform_name,
+     support_status,
+ )
+ from datahub.ingestion.api.source import Source, SourceReport
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
+
+ logger = logging.getLogger(__name__)
+
+
+ class DataHubDebugSourceConfig(ConfigModel):
+     dns_probe_url: Optional[str] = None
+
+
+ @platform_name("DataHubDebug")
+ @config_class(DataHubDebugSourceConfig)
+ @support_status(SupportStatus.TESTING)
+ class DataHubDebugSource(Source):
+     """
+     DataHubDebugSource is helper to debug things in executor where ingestion is running.
+
+     This source can perform the following tasks:
+     1. Network probe of a URL. Different from test connection in sources as that is after source starts.
+
+     """
+
+     def __init__(self, ctx: PipelineContext, config: DataHubDebugSourceConfig):
+         self.ctx = ctx
+         self.config = config
+         self.report = SourceReport()
+         self.report.event_not_produced_warn = False
+
+     @classmethod
+     def create(cls, config_dict, ctx):
+         config = DataHubDebugSourceConfig.parse_obj(config_dict)
+         return cls(ctx, config)
+
+     def perform_dns_probe(self, url: str) -> None:
+         """
+         Perform comprehensive DNS probe and network connectivity tests.
+         Logs detailed information to help diagnose network issues.
+         """
+         logger.info(f"Starting DNS probe for URL: {url}")
+         logger.info("=" * 60)
+         logger.info(f"DNS PROBE REPORT FOR: {url}")
+         logger.info("=" * 60)
+
+         try:
+             # Parse the URL to extract hostname
+             parsed_url = urlparse(
+                 url if url.startswith(("http://", "https://")) else f"http://{url}"
+             )
+             hostname = parsed_url.hostname or parsed_url.netloc
+             port = parsed_url.port or (443 if parsed_url.scheme == "https" else 80)
+
+             logger.info(f"Parsed hostname: {hostname}")
+             logger.info(f"Target port: {port}")
+             logger.info(f"URL scheme: {parsed_url.scheme}")
+             logger.info("-" * 60)
+
+             # Test 1: Enhanced DNS resolution with dnspython if available
+             logger.info("1. DNS RESOLUTION TEST")
+             self._dns_probe_with_dnspython(hostname)
+
+             logger.info("-" * 60)
+
+             # Test 2: HTTP/HTTPS connectivity test with requests if available
+             logger.info("2. HTTP CONNECTIVITY TEST")
+             self._http_probe_with_requests(url)
+
+             logger.info("-" * 60)
+
+             # Test 3: System network information
+             logger.info("3. SYSTEM NETWORK INFORMATION")
+             self._log_system_network_info()
+
+         except Exception as e:
+             logger.error(f"DNS probe failed with unexpected error: {e}", exc_info=True)
+
+         logger.info("=" * 60)
+         logger.info("DNS PROBE COMPLETED")
+         logger.info("=" * 60)
+
+     def _dns_probe_with_dnspython(self, hostname: str) -> None:
+         """Enhanced DNS probing using dnspython library"""
+         try:
+             # Test different record types
+             record_types = ["A", "AAAA", "CNAME", "MX"]
+
+             for record_type in record_types:
+                 try:
+                     start_time = time.time()
+                     answers = dns.resolver.resolve(hostname, record_type)
+                     dns_time = time.time() - start_time
+
+                     logger.info(
+                         f"✓ {record_type} record resolution successful ({dns_time:.3f}s)"
+                     )
+                     for answer in answers:
+                         logger.info(f"  - {record_type}: {answer}")
+
+                 except dns.resolver.NXDOMAIN:
+                     logger.info(f"✗ {record_type} record: Domain does not exist")
+                 except dns.resolver.NoAnswer:
+                     logger.info(
+                         f"- {record_type} record: No answer (record type not available)"
+                     )
+                 except dns.exception.Timeout:
+                     logger.error(f"✗ {record_type} record: DNS query timed out")
+                 except Exception as e:
+                     logger.error(f"✗ {record_type} record query failed: {e}")
+
+             # Test different DNS servers
+             logger.info("Testing with different DNS servers:")
+             dns_servers = ["8.8.8.8", "1.1.1.1", "208.67.222.222"]
+
+             for dns_server in dns_servers:
+                 try:
+                     resolver = dns.resolver.Resolver()
+                     resolver.nameservers = [dns_server]
+                     resolver.timeout = 5
+
+                     start_time = time.time()
+                     answers = resolver.resolve(hostname, "A")
+                     dns_time = time.time() - start_time
+
+                     logger.info(
+                         f"✓ DNS server {dns_server} responded ({dns_time:.3f}s)"
+                     )
+                     for answer in answers:
+                         logger.info(f"  - A: {answer}")
+
+                 except Exception as e:
+                     logger.error(f"✗ DNS server {dns_server} failed: {e}")
+
+         except Exception as e:
+             logger.error(f"Enhanced DNS probe failed: {e}", exc_info=True)
+
+     def _http_probe_with_requests(self, url: str) -> None:
+         """HTTP connectivity test using requests library"""
+         try:
+             # Test with different timeouts and methods
+             timeout = 10
+             allow_redirects_head = True
+             allow_redirects_get = False
+
+             # Test HEAD request
+             try:
+                 logger.info(f"Testing HEAD request with timeout {timeout}s")
+                 start_time = time.time()
+
+                 response = requests.head(
+                     url, timeout=timeout, allow_redirects=allow_redirects_head
+                 )
+
+                 request_time = time.time() - start_time
+
+                 logger.info(f"✓ HEAD request successful ({request_time:.3f}s)")
+                 logger.info(f"  Status code: {response.status_code}")
+                 logger.info(
+                     f"  Response headers: {dict(list(response.headers.items())[:5])}"
+                 )
+
+                 if hasattr(response, "url") and response.url != url:
+                     logger.info(f"  Final URL after redirects: {response.url}")
+
+             except requests.exceptions.Timeout:
+                 logger.error(f"✗ HEAD request timed out after {timeout}s")
+             except requests.exceptions.ConnectionError as e:
+                 logger.error(f"✗ HEAD connection error: {e}")
+             except requests.exceptions.RequestException as e:
+                 logger.error(f"✗ HEAD request failed: {e}")
+             except Exception as e:
+                 logger.error(f"✗ HEAD unexpected error: {e}")
+
+             # Test GET request
+             try:
+                 logger.info(f"Testing GET request with timeout {timeout}s")
+                 start_time = time.time()
+
+                 response = requests.get(
+                     url, timeout=timeout, allow_redirects=allow_redirects_get
+                 )
+
+                 request_time = time.time() - start_time
+
+                 logger.info(f"✓ GET request successful ({request_time:.3f}s)")
+                 logger.info(f"  Status code: {response.status_code}")
+                 logger.info(
+                     f"  Response headers: {dict(list(response.headers.items())[:5])}"
+                 )
+
+                 if hasattr(response, "url") and response.url != url:
+                     logger.info(f"  Final URL after redirects: {response.url}")
+
+             except requests.exceptions.Timeout:
+                 logger.error(f"✗ GET request timed out after {timeout}s")
+             except requests.exceptions.ConnectionError as e:
+                 logger.error(f"✗ GET connection error: {e}")
+             except requests.exceptions.RequestException as e:
+                 logger.error(f"✗ GET request failed: {e}")
+             except Exception as e:
+                 logger.error(f"✗ GET unexpected error: {e}")
+
+         except Exception as e:
+             logger.error(f"HTTP probe failed: {e}", exc_info=True)
+
+     def _log_dns_troubleshooting(self) -> None:
+         """Log DNS troubleshooting information"""
+         logger.info("DNS TROUBLESHOOTING SUGGESTIONS:")
+         logger.info("- Check if the hostname is correct")
+         logger.info("- Verify DNS server configuration")
+         logger.info("- Check network connectivity")
+         logger.info("- Try using a different DNS server (8.8.8.8, 1.1.1.1)")
+         logger.info("- Check if there are firewall restrictions")
+
+     def _log_system_network_info(self) -> None:
+         """Log system network configuration information"""
+         try:
+             local_hostname = socket.gethostname()
+             logger.info(f"Local hostname: {local_hostname}")
+
+             try:
+                 local_ips = socket.getaddrinfo(local_hostname, None)
+                 logger.info("Local IP addresses:")
+                 for addr_info in local_ips:
+                     if addr_info[0] in [socket.AF_INET, socket.AF_INET6]:
+                         family = "IPv4" if addr_info[0] == socket.AF_INET else "IPv6"
+                         logger.info(f"  - {addr_info[4][0]} ({family})")
+             except Exception as e:
+                 logger.warning(f"Could not retrieve local IP addresses: {e}")
+
+             logger.info("DNS Server Connectivity:")
+             dns_servers = ["8.8.8.8", "1.1.1.1", "208.67.222.222"]
+             for dns_server in dns_servers:
+                 try:
+                     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+                     sock.settimeout(5)
+                     result = sock.connect_ex((dns_server, 53))
+                     if result == 0:
+                         logger.info(f"  ✓ Can reach {dns_server}:53")
+                     else:
+                         logger.error(f"  ✗ Cannot reach {dns_server}:53")
+                     sock.close()
+                 except Exception as e:
+                     logger.error(f"  ✗ Error testing {dns_server}:53 - {e}")
+
+         except Exception as e:
+             logger.warning(f"Could not gather system network info: {e}")
+
+     def _test_alternative_dns(self, hostname: str) -> None:
+         """Test hostname resolution using alternative methods"""
+         try:
+             families = [(socket.AF_INET, "IPv4"), (socket.AF_INET6, "IPv6")]
+
+             for family, family_name in families:
+                 try:
+                     result = socket.getaddrinfo(hostname, None, family)
+                     if result:
+                         logger.info(f"✓ {family_name} resolution successful:")
+                         for addr_info in result[:3]:
+                             logger.info(f"  - {addr_info[4][0]}")
+                     else:
+                         logger.warning(
+                             f"✗ {family_name} resolution returned no results"
+                         )
+                 except socket.gaierror:
+                     logger.error(f"✗ {family_name} resolution failed")
+                 except Exception as e:
+                     logger.error(f"✗ {family_name} resolution error: {e}")
+
+         except Exception as e:
+             logger.error(f"Alternative DNS test failed: {e}")
+
+     def get_workunits_internal(
+         self,
+     ) -> Iterable[MetadataWorkUnit]:
+         if self.config.dns_probe_url is not None:
+             # Perform DNS probe
+             logger.info(f"Performing DNS probe for: {self.config.dns_probe_url}")
+             self.perform_dns_probe(self.config.dns_probe_url)
+
+         yield from []
+
+     def get_report(self) -> SourceReport:
+         return self.report
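
The new source emits no metadata; it only runs the DNS and HTTP probes and writes the results to the ingestion log. A minimal sketch of driving it directly, using only the entry points shown above (create, get_workunits_internal, get_report) and assuming a bare PipelineContext is sufficient:

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.debug.datahub_debug import DataHubDebugSource

# Build the source from a plain config dict, the same shape a recipe would supply.
ctx = PipelineContext(run_id="debug-probe")  # assumption: no graph connection is needed here
source = DataHubDebugSource.create({"dns_probe_url": "https://example.com"}, ctx)

# Iterating the (empty) work unit stream triggers the probes; results go to the log.
for _ in source.get_workunits_internal():
    pass

print(source.get_report())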
datahub/ingestion/source/dremio/dremio_api.py
@@ -7,7 +7,7 @@ from collections import defaultdict
  from enum import Enum
  from itertools import product
  from time import sleep, time
- from typing import Any, Deque, Dict, List, Optional, Union
+ from typing import TYPE_CHECKING, Any, Deque, Dict, List, Optional, Union
  from urllib.parse import quote
 
  import requests
@@ -15,12 +15,17 @@ from requests.adapters import HTTPAdapter
  from urllib3 import Retry
  from urllib3.exceptions import InsecureRequestWarning
 
+ from datahub.emitter.request_helper import make_curl_command
  from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
  from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
      DremioToDataHubSourceTypeMapping,
  )
  from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
  from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries
+ from datahub.utilities.perf_timer import PerfTimer
+
+ if TYPE_CHECKING:
+     from datahub.ingestion.source.dremio.dremio_entities import DremioContainer
 
  logger = logging.getLogger(__name__)
 
@@ -54,6 +59,8 @@ class DremioAPIOperations:
          self.deny_schema_pattern: List[str] = connection_args.schema_pattern.deny
          self._max_workers: int = connection_args.max_workers
          self.is_dremio_cloud = connection_args.is_dremio_cloud
+         self.start_time = connection_args.start_time
+         self.end_time = connection_args.end_time
          self.report = report
          self.session = requests.Session()
          if connection_args.is_dremio_cloud:
@@ -178,6 +185,7 @@ class DremioAPIOperations:
              self.session.headers.update(
                  {"Authorization": f"Bearer {connection_args.password}"}
              )
+             logger.debug("Configured Dremio cloud API session to use PAT")
              return
 
          # On-prem Dremio authentication (PAT or Basic Auth)
@@ -189,6 +197,7 @@
                      "Authorization": f"Bearer {connection_args.password}",
                  }
              )
+             logger.debug("Configured Dremio API session to use PAT")
              return
          else:
              assert connection_args.username and connection_args.password, (
@@ -212,10 +221,10 @@
              response.raise_for_status()
              token = response.json().get("token")
              if token:
+                 logger.debug("Exchanged username and password for Dremio token")
                  self.session.headers.update(
                      {"Authorization": f"_dremio{token}"}
                  )
-
                  return
              else:
                  self.report.failure("Failed to authenticate", login_url)
@@ -231,49 +240,76 @@ class DremioAPIOperations:
                  "Credentials cannot be refreshed. Please check your username and password."
              )
 
+     def _request(self, method: str, url: str, data: Union[str, None] = None) -> Dict:
+         """Send a request to the Dremio API."""
+
+         logger.debug(f"{method} request to {self.base_url + url}")
+         self.report.api_calls_total += 1
+         self.report.api_calls_by_method_and_path[f"{method} {url}"] += 1
+
+         with PerfTimer() as timer:
+             response = self.session.request(
+                 method=method,
+                 url=(self.base_url + url),
+                 data=data,
+                 verify=self._verify,
+                 timeout=self._timeout,
+             )
+         self.report.api_call_secs_by_method_and_path[f"{method} {url}"] += (
+             timer.elapsed_seconds()
+         )
+         # response.raise_for_status()  # Enabling this line, makes integration tests to fail
+         try:
+             return response.json()
+         except requests.exceptions.JSONDecodeError as e:
+             logger.info(
+                 f"On {method} request to {url}, failed to parse JSON from response (status {response.status_code}): {response.text}"
+             )
+             logger.debug(
+                 f"Request curl equivalent: {make_curl_command(self.session, method, url, data)}"
+             )
+             raise DremioAPIException(
+                 f"Failed to parse JSON from response (status {response.status_code}): {response.text}"
+             ) from e
+
      def get(self, url: str) -> Dict:
-         """execute a get request on dremio"""
-         response = self.session.get(
-             url=(self.base_url + url),
-             verify=self._verify,
-             timeout=self._timeout,
-         )
-         return response.json()
+         """Send a GET request to the Dremio API."""
+         return self._request("GET", url)
 
      def post(self, url: str, data: str) -> Dict:
-         """execute a get request on dremio"""
-         response = self.session.post(
-             url=(self.base_url + url),
-             data=data,
-             verify=self._verify,
-             timeout=self._timeout,
-         )
-         return response.json()
+         """Send a POST request to the Dremio API."""
+         return self._request("POST", url, data=data)
 
      def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]:
          """Execute SQL query with timeout and error handling"""
          try:
-             response = self.post(url="/sql", data=json.dumps({"sql": query}))
+             with PerfTimer() as timer:
+                 logger.info(f"Executing query: {query}")
+                 response = self.post(url="/sql", data=json.dumps({"sql": query}))
 
-             if "errorMessage" in response:
-                 self.report.failure(
-                     message="SQL Error", context=f"{response['errorMessage']}"
-                 )
-                 raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
+                 if "errorMessage" in response:
+                     self.report.failure(
+                         message="SQL Error", context=f"{response['errorMessage']}"
+                     )
+                     raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
 
-             job_id = response["id"]
+                 job_id = response["id"]
 
-             with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
-                 future = executor.submit(self.fetch_results, job_id)
-                 try:
-                     return future.result(timeout=timeout)
-                 except concurrent.futures.TimeoutError:
-                     self.cancel_query(job_id)
-                     raise DremioAPIException(
-                         f"Query execution timed out after {timeout} seconds"
-                     ) from None
-                 except RuntimeError as e:
-                     raise DremioAPIException() from e
+                 with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                     future = executor.submit(self.fetch_results, job_id)
+                     try:
+                         result = future.result(timeout=timeout)
+                         logger.info(
+                             f"Query executed in {timer.elapsed_seconds()} seconds with {len(result)} results"
+                         )
+                         return result
+                     except concurrent.futures.TimeoutError:
+                         self.cancel_query(job_id)
+                         raise DremioAPIException(
+                             f"Query execution timed out after {timeout} seconds"
+                         ) from None
+                     except RuntimeError as e:
+                         raise DremioAPIException() from e
 
          except requests.RequestException as e:
              raise DremioAPIException("Error executing query") from e
@@ -462,7 +498,9 @@
          pattern_str = "|".join(f"({p})" for p in patterns)
          return f"AND {operator}({field}, '{pattern_str}')"
 
-     def get_all_tables_and_columns(self, containers: Deque) -> List[Dict]:
+     def get_all_tables_and_columns(
+         self, containers: Deque["DremioContainer"]
+     ) -> List[Dict]:
          if self.edition == DremioEdition.ENTERPRISE:
              query_template = DremioSQLQueries.QUERY_DATASETS_EE
          elif self.edition == DremioEdition.CLOUD:
@@ -603,10 +641,25 @@
          return parents_list
 
      def extract_all_queries(self) -> List[Dict[str, Any]]:
+         # Convert datetime objects to string format for SQL queries
+         start_timestamp_str = None
+         end_timestamp_str = None
+
+         if self.start_time:
+             start_timestamp_str = self.start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+         if self.end_time:
+             end_timestamp_str = self.end_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+
          if self.edition == DremioEdition.CLOUD:
-             jobs_query = DremioSQLQueries.QUERY_ALL_JOBS_CLOUD
+             jobs_query = DremioSQLQueries.get_query_all_jobs_cloud(
+                 start_timestamp_millis=start_timestamp_str,
+                 end_timestamp_millis=end_timestamp_str,
+             )
          else:
-             jobs_query = DremioSQLQueries.QUERY_ALL_JOBS
+             jobs_query = DremioSQLQueries.get_query_all_jobs(
+                 start_timestamp_millis=start_timestamp_str,
+                 end_timestamp_millis=end_timestamp_str,
+             )
 
          return self.execute_query(query=jobs_query)
 
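extract_all_queries now derives the job-history window from the configured start_time/end_time, formatting each as a millisecond-precision timestamp string before building the jobs query. A small stdlib-only sketch of that formatting:

from datetime import datetime, timezone

# Same truncation as above: %f yields microseconds, [:-3] keeps milliseconds.
start_time = datetime(2024, 6, 1, 12, 30, 0, 123456, tzinfo=timezone.utc)
start_timestamp_str = start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
print(start_timestamp_str)  # 2024-06-01 12:30:00.123

The remaining dremio_api.py hunks follow.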
datahub/ingestion/source/dremio/dremio_api.py (continued)
@@ -685,6 +738,27 @@
 
          return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)
 
+     def _could_match_pattern(self, pattern: str, path_components: List[str]) -> bool:
+         """
+         Check if a container path could potentially match a schema pattern.
+         This handles hierarchical path matching for container filtering.
+         """
+         if pattern == ".*":
+             return True
+
+         current_path = ".".join(path_components)
+
+         # Handle simple .* patterns (like "a.b.c.*")
+         if pattern.endswith(".*") and not any(c in pattern for c in "^$[](){}+?\\"):
+             # Simple dotstar pattern - check prefix matching
+             pattern_prefix = pattern[:-2]  # Remove ".*"
+             return current_path.lower().startswith(
+                 pattern_prefix.lower()
+             ) or pattern_prefix.lower().startswith(current_path.lower())
+         else:
+             # Complex regex pattern - use existing regex matching logic
+             return self._check_pattern_match(pattern, [current_path], allow_prefix=True)
+
      def should_include_container(self, path: List[str], name: str) -> bool:
          """
          Helper method to check if a container should be included based on schema patterns.
@@ -711,41 +785,8 @@
 
          # Check allow patterns
          for pattern in self.allow_schema_pattern:
-             # For patterns with wildcards, check if this path is a parent of the pattern
-             if "*" in pattern:
-                 pattern_parts = pattern.split(".")
-                 path_parts = path_components
-
-                 # If pattern has exact same number of parts, check each component
-                 if len(pattern_parts) == len(path_parts):
-                     matches = True
-                     for p_part, c_part in zip(pattern_parts, path_parts):
-                         if p_part != "*" and p_part.lower() != c_part.lower():
-                             matches = False
-                             break
-                     if matches:
-                         self.report.report_container_scanned(full_path)
-                         return True
-                 # Otherwise check if current path is prefix match
-                 else:
-                     # Remove the trailing wildcard if present
-                     if pattern_parts[-1] == "*":
-                         pattern_parts = pattern_parts[:-1]
-
-                     for i in range(len(path_parts)):
-                         current_path = ".".join(path_parts[: i + 1])
-                         pattern_prefix = ".".join(pattern_parts[: i + 1])
-
-                         if pattern_prefix.startswith(current_path):
-                             self.report.report_container_scanned(full_path)
-                             return True
-
-             # Direct pattern matching
-             if self._check_pattern_match(
-                 pattern=pattern,
-                 paths=[full_path],
-                 allow_prefix=True,
-             ):
+             # Check if current path could potentially match this pattern
+             if self._could_match_pattern(pattern, path_components):
                  self.report.report_container_scanned(full_path)
                  return True
 
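The rewritten allow-pattern check funnels simple "a.b.c.*" style patterns through a two-way prefix comparison, so a parent container such as "a.b" is still scanned even though it is shorter than the pattern. A standalone sketch of that comparison, using hypothetical paths:

# Two-way prefix check as in _could_match_pattern: keep descending into "a.b"
# because it is a prefix of "a.b.c", and keep anything already under "a.b.c".
pattern = "a.b.c.*"
pattern_prefix = pattern[:-2]  # "a.b.c"
for current_path in ("a.b", "a.b.c.orders", "x.y"):
    could_match = current_path.lower().startswith(
        pattern_prefix.lower()
    ) or pattern_prefix.lower().startswith(current_path.lower())
    print(current_path, could_match)  # a.b True, a.b.c.orders True, x.y False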
datahub/ingestion/source/dremio/dremio_config.py
@@ -9,6 +9,7 @@ from datahub.configuration.source_common import (
      EnvConfigMixin,
      PlatformInstanceConfigMixin,
  )
+ from datahub.configuration.time_window_config import BaseTimeWindowConfig
  from datahub.ingestion.source.ge_profiling_config import GEProfilingBaseConfig
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
      StatefulStaleMetadataRemovalConfig,
@@ -118,6 +119,7 @@ class DremioSourceMapping(EnvConfigMixin, PlatformInstanceConfigMixin, ConfigMod
  class DremioSourceConfig(
      DremioConnectionConfig,
      StatefulIngestionConfigBase,
+     BaseTimeWindowConfig,
      EnvConfigMixin,
      PlatformInstanceConfigMixin,
  ):
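
With BaseTimeWindowConfig mixed into DremioSourceConfig, the recipe-level start_time and end_time values are what dremio_api.py reads off connection_args above. A minimal sketch of the mixin on its own, assuming its pydantic fields accept ISO-8601 strings:

from datahub.configuration.time_window_config import BaseTimeWindowConfig

# The same keys can appear in a Dremio recipe; parsed standalone here for brevity.
window = BaseTimeWindowConfig.parse_obj(
    {"start_time": "2024-06-01T00:00:00Z", "end_time": "2024-06-02T00:00:00Z"}
)
print(window.start_time, window.end_time)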
datahub/ingestion/source/dremio/dremio_reporting.py
@@ -1,22 +1,43 @@
- from dataclasses import dataclass
+ from dataclasses import dataclass, field
  from datetime import datetime
+ from typing import Optional
 
  from datahub.ingestion.source.sql.sql_report import SQLSourceReport
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
      StaleEntityRemovalSourceReport,
  )
  from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+ from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
+ from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+ from datahub.utilities.stats_collections import (
+     TopKDict,
+     float_top_k_dict,
+     int_top_k_dict,
+ )
 
 
  @dataclass
  class DremioSourceReport(
-     SQLSourceReport, StaleEntityRemovalSourceReport, IngestionStageReport
+     SQLSourceReport,
+     StaleEntityRemovalSourceReport,
+     IngestionStageReport,
+     BaseTimeWindowReport,
  ):
      num_containers_failed: int = 0
      num_datasets_failed: int = 0
      containers_scanned: int = 0
      containers_filtered: int = 0
 
+     api_calls_total: int = 0
+     api_calls_by_method_and_path: TopKDict[str, int] = field(
+         default_factory=int_top_k_dict
+     )
+     api_call_secs_by_method_and_path: TopKDict[str, float] = field(
+         default_factory=float_top_k_dict
+     )
+
+     sql_aggregator: Optional[SqlAggregatorReport] = None
+
      def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
          # recording total combined latency is not very useful, keeping this method as a placeholder
          # for future implementation of min / max / percentiles etc.
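
The new report fields are keyed by "METHOD path" and built from the TopKDict factories in datahub.utilities.stats_collections, which keeps per-endpoint call counts and cumulative latencies bounded when the report is serialized. A minimal sketch of the accounting pattern used by _request, assuming the factories behave like defaultdicts (which is how the += updates above rely on them):

from datahub.utilities.stats_collections import float_top_k_dict, int_top_k_dict

# One counter of calls and one accumulator of elapsed seconds per "METHOD path" key.
api_calls_by_method_and_path = int_top_k_dict()
api_call_secs_by_method_and_path = float_top_k_dict()

key = "GET /catalog"  # hypothetical endpoint, for illustration only
api_calls_by_method_and_path[key] += 1
api_call_secs_by_method_and_path[key] += 0.042  # e.g. PerfTimer.elapsed_seconds()

print(dict(api_calls_by_method_and_path), dict(api_call_secs_by_method_and_path))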