acryl-datahub 1.1.0.4rc2__py3-none-any.whl → 1.1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.

Files changed (156)
  1. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2528 -2530
  2. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +156 -138
  3. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/cli/check_cli.py +65 -11
  7. datahub/cli/cli_utils.py +63 -0
  8. datahub/cli/container_cli.py +5 -0
  9. datahub/cli/delete_cli.py +3 -4
  10. datahub/cli/docker_check.py +107 -12
  11. datahub/cli/docker_cli.py +149 -227
  12. datahub/cli/exists_cli.py +0 -2
  13. datahub/cli/get_cli.py +0 -2
  14. datahub/cli/iceberg_cli.py +5 -0
  15. datahub/cli/ingest_cli.py +3 -15
  16. datahub/cli/migrate.py +2 -0
  17. datahub/cli/put_cli.py +1 -4
  18. datahub/cli/quickstart_versioning.py +50 -7
  19. datahub/cli/specific/assertions_cli.py +0 -4
  20. datahub/cli/specific/datacontract_cli.py +0 -3
  21. datahub/cli/specific/dataproduct_cli.py +0 -11
  22. datahub/cli/specific/dataset_cli.py +1 -8
  23. datahub/cli/specific/forms_cli.py +0 -4
  24. datahub/cli/specific/group_cli.py +0 -2
  25. datahub/cli/specific/structuredproperties_cli.py +1 -4
  26. datahub/cli/specific/user_cli.py +0 -2
  27. datahub/cli/state_cli.py +0 -2
  28. datahub/cli/timeline_cli.py +0 -2
  29. datahub/emitter/rest_emitter.py +41 -8
  30. datahub/entrypoints.py +4 -3
  31. datahub/ingestion/api/decorators.py +15 -3
  32. datahub/ingestion/api/report.py +332 -3
  33. datahub/ingestion/api/sink.py +3 -0
  34. datahub/ingestion/api/source.py +47 -45
  35. datahub/ingestion/autogenerated/__init__.py +0 -0
  36. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  37. datahub/ingestion/autogenerated/lineage.json +401 -0
  38. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  39. datahub/ingestion/extractor/schema_util.py +13 -4
  40. datahub/ingestion/graph/client.py +73 -30
  41. datahub/ingestion/run/pipeline.py +54 -2
  42. datahub/ingestion/sink/datahub_rest.py +12 -0
  43. datahub/ingestion/source/abs/source.py +1 -1
  44. datahub/ingestion/source/aws/glue.py +1 -1
  45. datahub/ingestion/source/azure/azure_common.py +2 -2
  46. datahub/ingestion/source/bigquery_v2/bigquery.py +49 -23
  47. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  48. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  49. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  50. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  51. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  52. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  53. datahub/ingestion/source/common/subtypes.py +45 -0
  54. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  55. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  56. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  57. datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
  58. datahub/ingestion/source/dbt/dbt_common.py +3 -1
  59. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  60. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  61. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  62. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  63. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  64. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  65. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  66. datahub/ingestion/source/ge_data_profiler.py +76 -28
  67. datahub/ingestion/source/hex/api.py +26 -1
  68. datahub/ingestion/source/identity/azure_ad.py +1 -1
  69. datahub/ingestion/source/identity/okta.py +1 -14
  70. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  71. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  72. datahub/ingestion/source/mlflow.py +11 -1
  73. datahub/ingestion/source/mock_data/__init__.py +0 -0
  74. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  75. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  76. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  77. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  78. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  79. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  80. datahub/ingestion/source/preset.py +2 -2
  81. datahub/ingestion/source/redshift/redshift.py +17 -0
  82. datahub/ingestion/source/redshift/usage.py +4 -3
  83. datahub/ingestion/source/s3/report.py +4 -2
  84. datahub/ingestion/source/s3/source.py +367 -115
  85. datahub/ingestion/source/salesforce.py +6 -3
  86. datahub/ingestion/source/sigma/sigma.py +6 -1
  87. datahub/ingestion/source/slack/slack.py +2 -1
  88. datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
  89. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  90. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  91. datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
  92. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  93. datahub/ingestion/source/sql/athena.py +119 -12
  94. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  95. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  96. datahub/ingestion/source/sql/mssql/source.py +24 -15
  97. datahub/ingestion/source/sql/oracle.py +1 -1
  98. datahub/ingestion/source/sql/sql_common.py +11 -0
  99. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  100. datahub/ingestion/source/sql/teradata.py +997 -235
  101. datahub/ingestion/source/sql/vertica.py +10 -6
  102. datahub/ingestion/source/sql_queries.py +2 -2
  103. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  104. datahub/ingestion/source/superset.py +57 -2
  105. datahub/ingestion/source/tableau/tableau.py +57 -37
  106. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  107. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  108. datahub/ingestion/source/unity/proxy.py +4 -3
  109. datahub/ingestion/source/unity/source.py +56 -30
  110. datahub/ingestion/source/usage/clickhouse_usage.py +1 -0
  111. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  112. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  113. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  114. datahub/metadata/_internal_schema_classes.py +1253 -536
  115. datahub/metadata/_urns/urn_defs.py +1797 -1685
  116. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  117. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  118. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  119. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  120. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  121. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  122. datahub/metadata/schema.avsc +16614 -16538
  123. datahub/metadata/schemas/ContainerProperties.avsc +2 -0
  124. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  125. datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
  126. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  127. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  128. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  129. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  130. datahub/metadata/schemas/DataJobInfo.avsc +2 -0
  131. datahub/metadata/schemas/DataProcessKey.avsc +2 -0
  132. datahub/metadata/schemas/DatasetKey.avsc +4 -1
  133. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  134. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
  135. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  136. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
  137. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
  138. datahub/metadata/schemas/MLModelKey.avsc +2 -0
  139. datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
  140. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  141. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  142. datahub/sdk/datajob.py +39 -15
  143. datahub/sdk/lineage_client.py +2 -0
  144. datahub/sdk/main_client.py +14 -2
  145. datahub/sdk/search_client.py +4 -3
  146. datahub/specific/dataproduct.py +4 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  148. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  149. datahub/telemetry/telemetry.py +17 -11
  150. datahub/upgrade/upgrade.py +46 -13
  151. datahub/utilities/server_config_util.py +8 -0
  152. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  153. datahub/utilities/stats_collections.py +4 -0
  154. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
  155. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
  156. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_api.py
@@ -7,7 +7,7 @@ from collections import defaultdict
 from enum import Enum
 from itertools import product
 from time import sleep, time
-from typing import Any, Deque, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Deque, Dict, List, Optional, Union
 from urllib.parse import quote
 
 import requests
@@ -15,12 +15,17 @@ from requests.adapters import HTTPAdapter
 from urllib3 import Retry
 from urllib3.exceptions import InsecureRequestWarning
 
+from datahub.emitter.request_helper import make_curl_command
 from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
 from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
     DremioToDataHubSourceTypeMapping,
 )
 from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
 from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries
+from datahub.utilities.perf_timer import PerfTimer
+
+if TYPE_CHECKING:
+    from datahub.ingestion.source.dremio.dremio_entities import DremioContainer
 
 logger = logging.getLogger(__name__)
 
@@ -54,6 +59,8 @@ class DremioAPIOperations:
         self.deny_schema_pattern: List[str] = connection_args.schema_pattern.deny
         self._max_workers: int = connection_args.max_workers
         self.is_dremio_cloud = connection_args.is_dremio_cloud
+        self.start_time = connection_args.start_time
+        self.end_time = connection_args.end_time
         self.report = report
         self.session = requests.Session()
         if connection_args.is_dremio_cloud:
@@ -178,6 +185,7 @@ class DremioAPIOperations:
             self.session.headers.update(
                 {"Authorization": f"Bearer {connection_args.password}"}
             )
+            logger.debug("Configured Dremio cloud API session to use PAT")
             return
 
         # On-prem Dremio authentication (PAT or Basic Auth)
@@ -189,6 +197,7 @@ class DremioAPIOperations:
                     "Authorization": f"Bearer {connection_args.password}",
                 }
             )
+            logger.debug("Configured Dremio API session to use PAT")
             return
         else:
             assert connection_args.username and connection_args.password, (
@@ -212,10 +221,10 @@ class DremioAPIOperations:
             response.raise_for_status()
             token = response.json().get("token")
             if token:
+                logger.debug("Exchanged username and password for Dremio token")
                 self.session.headers.update(
                     {"Authorization": f"_dremio{token}"}
                 )
-
                 return
             else:
                 self.report.failure("Failed to authenticate", login_url)
@@ -231,49 +240,76 @@ class DremioAPIOperations:
                 "Credentials cannot be refreshed. Please check your username and password."
             )
 
+    def _request(self, method: str, url: str, data: Union[str, None] = None) -> Dict:
+        """Send a request to the Dremio API."""
+
+        logger.debug(f"{method} request to {self.base_url + url}")
+        self.report.api_calls_total += 1
+        self.report.api_calls_by_method_and_path[f"{method} {url}"] += 1
+
+        with PerfTimer() as timer:
+            response = self.session.request(
+                method=method,
+                url=(self.base_url + url),
+                data=data,
+                verify=self._verify,
+                timeout=self._timeout,
+            )
+        self.report.api_call_secs_by_method_and_path[f"{method} {url}"] += (
+            timer.elapsed_seconds()
+        )
+        # response.raise_for_status()  # Enabling this line, makes integration tests to fail
+        try:
+            return response.json()
+        except requests.exceptions.JSONDecodeError as e:
+            logger.info(
+                f"On {method} request to {url}, failed to parse JSON from response (status {response.status_code}): {response.text}"
+            )
+            logger.debug(
+                f"Request curl equivalent: {make_curl_command(self.session, method, url, data)}"
+            )
+            raise DremioAPIException(
+                f"Failed to parse JSON from response (status {response.status_code}): {response.text}"
+            ) from e
+
     def get(self, url: str) -> Dict:
-        """execute a get request on dremio"""
-        response = self.session.get(
-            url=(self.base_url + url),
-            verify=self._verify,
-            timeout=self._timeout,
-        )
-        return response.json()
+        """Send a GET request to the Dremio API."""
+        return self._request("GET", url)
 
     def post(self, url: str, data: str) -> Dict:
-        """execute a get request on dremio"""
-        response = self.session.post(
-            url=(self.base_url + url),
-            data=data,
-            verify=self._verify,
-            timeout=self._timeout,
-        )
-        return response.json()
+        """Send a POST request to the Dremio API."""
+        return self._request("POST", url, data=data)
 
     def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]:
         """Execute SQL query with timeout and error handling"""
         try:
-            response = self.post(url="/sql", data=json.dumps({"sql": query}))
+            with PerfTimer() as timer:
+                logger.info(f"Executing query: {query}")
+                response = self.post(url="/sql", data=json.dumps({"sql": query}))
 
-            if "errorMessage" in response:
-                self.report.failure(
-                    message="SQL Error", context=f"{response['errorMessage']}"
-                )
-                raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
+                if "errorMessage" in response:
+                    self.report.failure(
+                        message="SQL Error", context=f"{response['errorMessage']}"
+                    )
+                    raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
 
-            job_id = response["id"]
+                job_id = response["id"]
 
-            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
-                future = executor.submit(self.fetch_results, job_id)
-                try:
-                    return future.result(timeout=timeout)
-                except concurrent.futures.TimeoutError:
-                    self.cancel_query(job_id)
-                    raise DremioAPIException(
-                        f"Query execution timed out after {timeout} seconds"
-                    ) from None
-                except RuntimeError as e:
-                    raise DremioAPIException() from e
+                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                    future = executor.submit(self.fetch_results, job_id)
+                    try:
+                        result = future.result(timeout=timeout)
+                        logger.info(
+                            f"Query executed in {timer.elapsed_seconds()} seconds with {len(result)} results"
+                        )
+                        return result
+                    except concurrent.futures.TimeoutError:
+                        self.cancel_query(job_id)
+                        raise DremioAPIException(
+                            f"Query execution timed out after {timeout} seconds"
+                        ) from None
+                    except RuntimeError as e:
+                        raise DremioAPIException() from e
 
         except requests.RequestException as e:
             raise DremioAPIException("Error executing query") from e
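The new `_request` helper above times every call and tallies it per endpoint. A minimal standalone sketch of that accounting pattern, reusing the `PerfTimer` and top-K counter helpers imported in this diff (the `timed_call` function and the `/catalog` path are illustrative only, not part of the source):

```python
import time

from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.stats_collections import float_top_k_dict, int_top_k_dict

# Per-endpoint counters, shaped like the fields added to DremioSourceReport later in this diff.
api_calls_by_method_and_path = int_top_k_dict()
api_call_secs_by_method_and_path = float_top_k_dict()


def timed_call(method: str, path: str) -> None:
    # Count the call, then time it; the sleep stands in for the real HTTP round trip.
    api_calls_by_method_and_path[f"{method} {path}"] += 1
    with PerfTimer() as timer:
        time.sleep(0.01)
    api_call_secs_by_method_and_path[f"{method} {path}"] += timer.elapsed_seconds()


timed_call("GET", "/catalog")
```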
@@ -462,7 +498,9 @@ class DremioAPIOperations:
         pattern_str = "|".join(f"({p})" for p in patterns)
         return f"AND {operator}({field}, '{pattern_str}')"
 
-    def get_all_tables_and_columns(self, containers: Deque) -> List[Dict]:
+    def get_all_tables_and_columns(
+        self, containers: Deque["DremioContainer"]
+    ) -> List[Dict]:
         if self.edition == DremioEdition.ENTERPRISE:
             query_template = DremioSQLQueries.QUERY_DATASETS_EE
         elif self.edition == DremioEdition.CLOUD:
@@ -603,10 +641,25 @@ class DremioAPIOperations:
         return parents_list
 
     def extract_all_queries(self) -> List[Dict[str, Any]]:
+        # Convert datetime objects to string format for SQL queries
+        start_timestamp_str = None
+        end_timestamp_str = None
+
+        if self.start_time:
+            start_timestamp_str = self.start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+        if self.end_time:
+            end_timestamp_str = self.end_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+
         if self.edition == DremioEdition.CLOUD:
-            jobs_query = DremioSQLQueries.QUERY_ALL_JOBS_CLOUD
+            jobs_query = DremioSQLQueries.get_query_all_jobs_cloud(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )
         else:
-            jobs_query = DremioSQLQueries.QUERY_ALL_JOBS
+            jobs_query = DremioSQLQueries.get_query_all_jobs(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )
 
         return self.execute_query(query=jobs_query)
 
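The `[:-3]` slice above trims `strftime`'s six microsecond digits down to milliseconds before the value is substituted into the jobs query. A small stdlib-only illustration (the datetime value is made up):

```python
from datetime import datetime

ts = datetime(2024, 6, 1, 12, 30, 45, 123456)
# "%f" renders six microsecond digits; dropping the last three leaves milliseconds.
print(ts.strftime("%Y-%m-%d %H:%M:%S.%f"))       # 2024-06-01 12:30:45.123456
print(ts.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3])  # 2024-06-01 12:30:45.123
```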
@@ -685,6 +738,27 @@ class DremioAPIOperations:
 
         return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)
 
+    def _could_match_pattern(self, pattern: str, path_components: List[str]) -> bool:
+        """
+        Check if a container path could potentially match a schema pattern.
+        This handles hierarchical path matching for container filtering.
+        """
+        if pattern == ".*":
+            return True
+
+        current_path = ".".join(path_components)
+
+        # Handle simple .* patterns (like "a.b.c.*")
+        if pattern.endswith(".*") and not any(c in pattern for c in "^$[](){}+?\\"):
+            # Simple dotstar pattern - check prefix matching
+            pattern_prefix = pattern[:-2]  # Remove ".*"
+            return current_path.lower().startswith(
+                pattern_prefix.lower()
+            ) or pattern_prefix.lower().startswith(current_path.lower())
+        else:
+            # Complex regex pattern - use existing regex matching logic
+            return self._check_pattern_match(pattern, [current_path], allow_prefix=True)
+
     def should_include_container(self, path: List[str], name: str) -> bool:
         """
         Helper method to check if a container should be included based on schema patterns.
@@ -711,41 +785,8 @@ class DremioAPIOperations:
 
         # Check allow patterns
         for pattern in self.allow_schema_pattern:
-            # For patterns with wildcards, check if this path is a parent of the pattern
-            if "*" in pattern:
-                pattern_parts = pattern.split(".")
-                path_parts = path_components
-
-                # If pattern has exact same number of parts, check each component
-                if len(pattern_parts) == len(path_parts):
-                    matches = True
-                    for p_part, c_part in zip(pattern_parts, path_parts):
-                        if p_part != "*" and p_part.lower() != c_part.lower():
-                            matches = False
-                            break
-                    if matches:
-                        self.report.report_container_scanned(full_path)
-                        return True
-                # Otherwise check if current path is prefix match
-                else:
-                    # Remove the trailing wildcard if present
-                    if pattern_parts[-1] == "*":
-                        pattern_parts = pattern_parts[:-1]
-
-                    for i in range(len(path_parts)):
-                        current_path = ".".join(path_parts[: i + 1])
-                        pattern_prefix = ".".join(pattern_parts[: i + 1])
-
-                        if pattern_prefix.startswith(current_path):
-                            self.report.report_container_scanned(full_path)
-                            return True
-
-            # Direct pattern matching
-            if self._check_pattern_match(
-                pattern=pattern,
-                paths=[full_path],
-                allow_prefix=True,
-            ):
+            # Check if current path could potentially match this pattern
+            if self._could_match_pattern(pattern, path_components):
                 self.report.report_container_scanned(full_path)
                 return True
 
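For simple `a.b.*`-style patterns, the new `_could_match_pattern` accepts a container when it is either already under the pattern or an ancestor that could still lead to a match deeper in the hierarchy. A standalone sketch of that bidirectional prefix check (the function name and sample paths are illustrative, not part of the source, and only the simple-dotstar branch is mirrored):

```python
from typing import List


def could_match(pattern: str, path_components: List[str]) -> bool:
    """Mirror of the simple-dotstar branch above, outside the class."""
    if pattern == ".*":
        return True
    current_path = ".".join(path_components).lower()
    prefix = pattern[:-2].lower()  # strip the trailing ".*"
    # Container lies under the pattern, or is an ancestor of it.
    return current_path.startswith(prefix) or prefix.startswith(current_path)


assert could_match("finance.reports.*", ["finance"])                    # ancestor: keep scanning
assert could_match("finance.reports.*", ["finance", "reports", "q1"])   # already inside
assert not could_match("finance.reports.*", ["marketing"])
```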
datahub/ingestion/source/dremio/dremio_config.py
@@ -9,6 +9,7 @@ from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
+from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.ingestion.source.ge_profiling_config import GEProfilingBaseConfig
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
@@ -118,6 +119,7 @@ class DremioSourceMapping(EnvConfigMixin, PlatformInstanceConfigMixin, ConfigModel):
 class DremioSourceConfig(
     DremioConnectionConfig,
     StatefulIngestionConfigBase,
+    BaseTimeWindowConfig,
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 ):
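With `BaseTimeWindowConfig` mixed in, the Dremio recipe can bound query-lineage extraction to a time window. A hypothetical recipe fragment expressed as a Python dict (the field names follow the mixin as used in this diff; the dates and overall shape are placeholders, not a documented recipe):

```python
# Hypothetical ingestion recipe fragment for the Dremio source.
dremio_recipe_source = {
    "type": "dremio",
    "config": {
        "include_query_lineage": True,
        # Provided by BaseTimeWindowConfig; defaults apply when omitted.
        "start_time": "2024-06-01T00:00:00Z",
        "end_time": "2024-06-02T00:00:00Z",
    },
}
```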
datahub/ingestion/source/dremio/dremio_reporting.py
@@ -1,22 +1,43 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import datetime
+from typing import Optional
 
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
+from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+from datahub.utilities.stats_collections import (
+    TopKDict,
+    float_top_k_dict,
+    int_top_k_dict,
+)
 
 
 @dataclass
 class DremioSourceReport(
-    SQLSourceReport, StaleEntityRemovalSourceReport, IngestionStageReport
+    SQLSourceReport,
+    StaleEntityRemovalSourceReport,
+    IngestionStageReport,
+    BaseTimeWindowReport,
 ):
     num_containers_failed: int = 0
     num_datasets_failed: int = 0
     containers_scanned: int = 0
     containers_filtered: int = 0
 
+    api_calls_total: int = 0
+    api_calls_by_method_and_path: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+    api_call_secs_by_method_and_path: TopKDict[str, float] = field(
+        default_factory=float_top_k_dict
+    )
+
+    sql_aggregator: Optional[SqlAggregatorReport] = None
+
     def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
         # recording total combined latency is not very useful, keeping this method as a placeholder
         # for future implementation of min / max / percentiles etc.
datahub/ingestion/source/dremio/dremio_source.py
@@ -51,13 +51,17 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.ingestion.source_report.ingestion_stage import PROFILING
+from datahub.ingestion.source_report.ingestion_stage import (
+    LINEAGE_EXTRACTION,
+    METADATA_EXTRACTION,
+    PROFILING,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageTypeClass,
     UpstreamClass,
     UpstreamLineage,
 )
-from datahub.metadata.schema_classes import ChangeTypeClass, SchemaMetadataClass
+from datahub.metadata.schema_classes import SchemaMetadataClass
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.sql_parsing_aggregator import (
     KnownQueryLineageInfo,
@@ -89,6 +93,7 @@ class DremioSourceMapEntry:
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
+@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
 class DremioSource(StatefulIngestionSourceBase):
     """
     This plugin integrates with Dremio to extract and ingest metadata into DataHub.
@@ -126,6 +131,13 @@ class DremioSource(StatefulIngestionSourceBase):
         self.default_db = "dremio"
         self.config = config
         self.report = DremioSourceReport()
+
+        # Set time window for query lineage extraction
+        self.report.window_start_time, self.report.window_end_time = (
+            self.config.start_time,
+            self.config.end_time,
+        )
+
         self.source_map: Dict[str, DremioSourceMapEntry] = dict()
 
         # Initialize API operations
@@ -154,6 +166,7 @@ class DremioSource(StatefulIngestionSourceBase):
             generate_operations=True,
             usage_config=self.config.usage,
         )
+        self.report.sql_aggregator = self.sql_parsing_aggregator.report
 
         # For profiling
         self.profiler = DremioProfiler(config, self.report, dremio_api)
@@ -190,84 +203,88 @@ class DremioSource(StatefulIngestionSourceBase):
 
         self.source_map = self._build_source_map()
 
-        # Process Containers
-        containers = self.dremio_catalog.get_containers()
-        for container in containers:
-            try:
-                yield from self.process_container(container)
-                logger.info(
-                    f"Dremio container {container.container_name} emitted successfully"
-                )
-            except Exception as exc:
-                self.report.num_containers_failed += 1  # Increment failed containers
-                self.report.report_failure(
-                    message="Failed to process Dremio container",
-                    context=f"{'.'.join(container.path)}.{container.container_name}",
-                    exc=exc,
-                )
+        with self.report.new_stage(METADATA_EXTRACTION):
+            # Process Containers
+            containers = self.dremio_catalog.get_containers()
+            for container in containers:
+                try:
+                    yield from self.process_container(container)
+                    logger.info(
+                        f"Dremio container {container.container_name} emitted successfully"
+                    )
+                except Exception as exc:
+                    self.report.num_containers_failed += 1
+                    self.report.report_failure(
+                        message="Failed to process Dremio container",
+                        context=f"{'.'.join(container.path)}.{container.container_name}",
+                        exc=exc,
+                    )
 
-        # Process Datasets
-        datasets = self.dremio_catalog.get_datasets()
+            # Process Datasets
+            datasets = self.dremio_catalog.get_datasets()
 
-        for dataset_info in datasets:
-            try:
-                yield from self.process_dataset(dataset_info)
-                logger.info(
-                    f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
-                )
-            except Exception as exc:
-                self.report.num_datasets_failed += 1  # Increment failed datasets
-                self.report.report_failure(
-                    message="Failed to process Dremio dataset",
-                    context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
-                    exc=exc,
-                )
+            for dataset_info in datasets:
+                try:
+                    yield from self.process_dataset(dataset_info)
+                    logger.info(
+                        f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
+                    )
+                except Exception as exc:
+                    self.report.num_datasets_failed += 1  # Increment failed datasets
+                    self.report.report_failure(
+                        message="Failed to process Dremio dataset",
+                        context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
+                        exc=exc,
+                    )
 
-        # Optionally Process Query Lineage
-        if self.config.include_query_lineage:
-            self.get_query_lineage_workunits()
-
-        # Process Glossary Terms
-        glossary_terms = self.dremio_catalog.get_glossary_terms()
-
-        for glossary_term in glossary_terms:
-            try:
-                yield from self.process_glossary_term(glossary_term)
-            except Exception as exc:
-                self.report.report_failure(
-                    message="Failed to process Glossary terms",
-                    context=f"{glossary_term.glossary_term}",
-                    exc=exc,
-                )
+            # Process Glossary Terms
+            glossary_terms = self.dremio_catalog.get_glossary_terms()
 
-        # Generate workunit for aggregated SQL parsing results
-        for mcp in self.sql_parsing_aggregator.gen_metadata():
-            self.report.report_workunit(mcp.as_workunit())
-            yield mcp.as_workunit()
-
-        # Profiling
-        if self.config.is_profiling_enabled():
-            with ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
-                future_to_dataset = {
-                    executor.submit(self.generate_profiles, dataset): dataset
-                    for dataset in datasets
-                }
-
-                for future in as_completed(future_to_dataset):
-                    dataset_info = future_to_dataset[future]
-                    try:
-                        yield from future.result()
-                    except Exception as exc:
-                        self.report.profiling_skipped_other[
-                            dataset_info.resource_name
-                        ] += 1
-                        self.report.report_failure(
-                            message="Failed to profile dataset",
-                            context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
-                            exc=exc,
-                        )
+            for glossary_term in glossary_terms:
+                try:
+                    yield from self.process_glossary_term(glossary_term)
+                except Exception as exc:
+                    self.report.report_failure(
+                        message="Failed to process Glossary terms",
+                        context=f"{glossary_term.glossary_term}",
+                        exc=exc,
+                    )
+
+        # Optionally Process Query Lineage
+        if self.config.include_query_lineage:
+            with self.report.new_stage(LINEAGE_EXTRACTION):
+                self.get_query_lineage_workunits()
+
+        # Generate workunit for aggregated SQL parsing results
+        for mcp in self.sql_parsing_aggregator.gen_metadata():
+            yield mcp.as_workunit()
+
+        # Profiling
+        if self.config.is_profiling_enabled():
+            with (
+                self.report.new_stage(PROFILING),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
+                future_to_dataset = {
+                    executor.submit(self.generate_profiles, dataset): dataset
+                    for dataset in datasets
+                }
+
+                for future in as_completed(future_to_dataset):
+                    dataset_info = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[
+                            dataset_info.resource_name
+                        ] += 1
+                        self.report.report_failure(
+                            message="Failed to profile dataset",
+                            context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
+                            exc=exc,
+                        )
 
     def process_container(
         self, container_info: DremioContainer
@@ -388,8 +405,7 @@ class DremioSource(StatefulIngestionSourceBase):
             env=self.config.env,
             platform_instance=self.config.platform_instance,
         )
-        with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
-            yield from self.profiler.get_workunits(dataset_info, dataset_urn)
+        yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
@@ -417,11 +433,8 @@ class DremioSource(StatefulIngestionSourceBase):
             ]
         )
         mcp = MetadataChangeProposalWrapper(
-            entityType="dataset",
             entityUrn=dataset_urn,
-            aspectName=lineage.ASPECT_NAME,
             aspect=lineage,
-            changeType=ChangeTypeClass.UPSERT,
         )
 
         for upstream_urn in upstream_urns:
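The last hunk drops the explicit `entityType`, `aspectName`, and `changeType` arguments: `MetadataChangeProposalWrapper` infers them from the URN and the aspect. A minimal sketch of the slimmed-down construction (the URNs are placeholders):

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineage,
)

# Build a simple view-lineage aspect pointing at one upstream table.
upstream = UpstreamClass(
    dataset="urn:li:dataset:(urn:li:dataPlatform:dremio,space.example_table,PROD)",
    type=DatasetLineageTypeClass.VIEW,
)
lineage = UpstreamLineage(upstreams=[upstream])

# Entity type, aspect name, and change type (UPSERT) are derived automatically.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:dremio,space.example_view,PROD)",
    aspect=lineage,
)
```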