acryl-datahub 1.1.0.4rc1__py3-none-any.whl → 1.1.0.4rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (58)
  1. {acryl_datahub-1.1.0.4rc1.dist-info → acryl_datahub-1.1.0.4rc3.dist-info}/METADATA +2581 -2581
  2. {acryl_datahub-1.1.0.4rc1.dist-info → acryl_datahub-1.1.0.4rc3.dist-info}/RECORD +58 -58
  3. datahub/_version.py +1 -1
  4. datahub/emitter/rest_emitter.py +18 -1
  5. datahub/ingestion/api/source.py +2 -0
  6. datahub/ingestion/source/bigquery_v2/bigquery.py +18 -0
  7. datahub/ingestion/source/dbt/dbt_cloud.py +3 -0
  8. datahub/ingestion/source/dbt/dbt_common.py +3 -1
  9. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  10. datahub/ingestion/source/dremio/dremio_api.py +98 -68
  11. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  12. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  13. datahub/ingestion/source/dremio/dremio_source.py +90 -77
  14. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  15. datahub/ingestion/source/file.py +3 -0
  16. datahub/ingestion/source/ge_data_profiler.py +48 -8
  17. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  18. datahub/ingestion/source/kafka/kafka.py +1 -0
  19. datahub/ingestion/source/looker/looker_source.py +1 -0
  20. datahub/ingestion/source/powerbi/powerbi.py +1 -0
  21. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  22. datahub/ingestion/source/redshift/redshift.py +21 -1
  23. datahub/ingestion/source/sac/sac.py +3 -1
  24. datahub/ingestion/source/sigma/sigma.py +1 -0
  25. datahub/ingestion/source/snowflake/snowflake_config.py +3 -6
  26. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  27. datahub/ingestion/source/snowflake/snowflake_v2.py +2 -0
  28. datahub/ingestion/source/sql/clickhouse.py +3 -1
  29. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  30. datahub/ingestion/source/sql/hana.py +3 -1
  31. datahub/ingestion/source/sql/hive_metastore.py +3 -1
  32. datahub/ingestion/source/sql/mariadb.py +0 -1
  33. datahub/ingestion/source/sql/mssql/source.py +8 -1
  34. datahub/ingestion/source/sql/mysql.py +0 -9
  35. datahub/ingestion/source/sql/postgres.py +0 -1
  36. datahub/ingestion/source/sql/sql_common.py +12 -0
  37. datahub/ingestion/source/tableau/tableau.py +1 -0
  38. datahub/ingestion/source/unity/source.py +1 -0
  39. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  40. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  41. datahub/metadata/_internal_schema_classes.py +25 -0
  42. datahub/metadata/schema.avsc +18 -1
  43. datahub/metadata/schemas/ContainerProperties.avsc +6 -0
  44. datahub/metadata/schemas/DataFlowInfo.avsc +6 -0
  45. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  46. datahub/metadata/schemas/DataJobInfo.avsc +6 -0
  47. datahub/metadata/schemas/DataProcessKey.avsc +6 -0
  48. datahub/metadata/schemas/DatasetKey.avsc +6 -0
  49. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +6 -0
  50. datahub/metadata/schemas/MLModelDeploymentKey.avsc +6 -0
  51. datahub/metadata/schemas/MLModelGroupKey.avsc +6 -0
  52. datahub/metadata/schemas/MLModelKey.avsc +6 -0
  53. datahub/metadata/schemas/MetadataChangeEvent.avsc +18 -1
  54. datahub/utilities/stats_collections.py +4 -0
  55. {acryl_datahub-1.1.0.4rc1.dist-info → acryl_datahub-1.1.0.4rc3.dist-info}/WHEEL +0 -0
  56. {acryl_datahub-1.1.0.4rc1.dist-info → acryl_datahub-1.1.0.4rc3.dist-info}/entry_points.txt +0 -0
  57. {acryl_datahub-1.1.0.4rc1.dist-info → acryl_datahub-1.1.0.4rc3.dist-info}/licenses/LICENSE +0 -0
  58. {acryl_datahub-1.1.0.4rc1.dist-info → acryl_datahub-1.1.0.4rc3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_api.py

@@ -21,6 +21,7 @@ from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
 )
 from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
 from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries
+from datahub.utilities.perf_timer import PerfTimer
 
 logger = logging.getLogger(__name__)
 
@@ -54,6 +55,8 @@ class DremioAPIOperations:
         self.deny_schema_pattern: List[str] = connection_args.schema_pattern.deny
         self._max_workers: int = connection_args.max_workers
         self.is_dremio_cloud = connection_args.is_dremio_cloud
+        self.start_time = connection_args.start_time
+        self.end_time = connection_args.end_time
         self.report = report
         self.session = requests.Session()
         if connection_args.is_dremio_cloud:
@@ -233,47 +236,71 @@ class DremioAPIOperations:
 
     def get(self, url: str) -> Dict:
         """execute a get request on dremio"""
-        response = self.session.get(
-            url=(self.base_url + url),
-            verify=self._verify,
-            timeout=self._timeout,
-        )
-        return response.json()
+        logger.debug(f"GET request to {self.base_url + url}")
+        self.report.api_calls_total += 1
+        self.report.api_calls_by_method_and_path["GET " + url] += 1
+
+        with PerfTimer() as timer:
+            response = self.session.get(
+                url=(self.base_url + url),
+                verify=self._verify,
+                timeout=self._timeout,
+            )
+        self.report.api_call_secs_by_method_and_path["GET " + url] += (
+            timer.elapsed_seconds()
+        )
+        # response.raise_for_status() # Enabling this line, makes integration tests to fail
+        return response.json()
 
     def post(self, url: str, data: str) -> Dict:
         """execute a get request on dremio"""
-        response = self.session.post(
-            url=(self.base_url + url),
-            data=data,
-            verify=self._verify,
-            timeout=self._timeout,
-        )
-        return response.json()
+        logger.debug(f"POST request to {self.base_url + url}")
+        self.report.api_calls_total += 1
+        self.report.api_calls_by_method_and_path["POST " + url] += 1
+
+        with PerfTimer() as timer:
+            response = self.session.post(
+                url=(self.base_url + url),
+                data=data,
+                verify=self._verify,
+                timeout=self._timeout,
+            )
+        self.report.api_call_secs_by_method_and_path["POST " + url] += (
+            timer.elapsed_seconds()
+        )
+        # response.raise_for_status() # Enabling this line, makes integration tests to fail
+        return response.json()
 
     def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]:
         """Execute SQL query with timeout and error handling"""
         try:
-            response = self.post(url="/sql", data=json.dumps({"sql": query}))
+            with PerfTimer() as timer:
+                logger.info(f"Executing query: {query}")
+                response = self.post(url="/sql", data=json.dumps({"sql": query}))
 
-            if "errorMessage" in response:
-                self.report.failure(
-                    message="SQL Error", context=f"{response['errorMessage']}"
-                )
-                raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
+                if "errorMessage" in response:
+                    self.report.failure(
+                        message="SQL Error", context=f"{response['errorMessage']}"
+                    )
+                    raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
 
-            job_id = response["id"]
+                job_id = response["id"]
 
-            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
-                future = executor.submit(self.fetch_results, job_id)
-                try:
-                    return future.result(timeout=timeout)
-                except concurrent.futures.TimeoutError:
-                    self.cancel_query(job_id)
-                    raise DremioAPIException(
-                        f"Query execution timed out after {timeout} seconds"
-                    ) from None
-                except RuntimeError as e:
-                    raise DremioAPIException() from e
+                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                    future = executor.submit(self.fetch_results, job_id)
+                    try:
+                        result = future.result(timeout=timeout)
+                        logger.info(
+                            f"Query executed in {timer.elapsed_seconds()} seconds with {len(result)} results"
+                        )
+                        return result
+                    except concurrent.futures.TimeoutError:
+                        self.cancel_query(job_id)
+                        raise DremioAPIException(
+                            f"Query execution timed out after {timeout} seconds"
+                        ) from None
+                    except RuntimeError as e:
+                        raise DremioAPIException() from e
 
         except requests.RequestException as e:
             raise DremioAPIException("Error executing query") from e
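
The GET/POST helpers above now count and time every REST call. Below is a minimal, self-contained sketch of that timer-plus-counters pattern; the SimpleTimer class and timed_get helper are illustrative stand-ins, not the DataHub PerfTimer implementation.

```python
import time
from collections import defaultdict
from typing import DefaultDict

class SimpleTimer:
    """Illustrative stand-in for a PerfTimer-style context manager."""

    def __enter__(self) -> "SimpleTimer":
        self._start = time.perf_counter()
        return self

    def __exit__(self, *exc) -> None:
        self._end = time.perf_counter()

    def elapsed_seconds(self) -> float:
        return getattr(self, "_end", time.perf_counter()) - self._start

# Hypothetical per-endpoint accounting mirroring the new report counters.
api_calls_by_method_and_path: DefaultDict[str, int] = defaultdict(int)
api_call_secs_by_method_and_path: DefaultDict[str, float] = defaultdict(float)

def timed_get(session, base_url: str, path: str) -> dict:
    api_calls_by_method_and_path["GET " + path] += 1
    with SimpleTimer() as timer:
        response = session.get(base_url + path, timeout=30)
    api_call_secs_by_method_and_path["GET " + path] += timer.elapsed_seconds()
    return response.json()
```
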
@@ -603,10 +630,25 @@ class DremioAPIOperations:
         return parents_list
 
     def extract_all_queries(self) -> List[Dict[str, Any]]:
+        # Convert datetime objects to string format for SQL queries
+        start_timestamp_str = None
+        end_timestamp_str = None
+
+        if self.start_time:
+            start_timestamp_str = self.start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+        if self.end_time:
+            end_timestamp_str = self.end_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+
         if self.edition == DremioEdition.CLOUD:
-            jobs_query = DremioSQLQueries.QUERY_ALL_JOBS_CLOUD
+            jobs_query = DremioSQLQueries.get_query_all_jobs_cloud(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )
         else:
-            jobs_query = DremioSQLQueries.QUERY_ALL_JOBS
+            jobs_query = DremioSQLQueries.get_query_all_jobs(
+                start_timestamp_millis=start_timestamp_str,
+                end_timestamp_millis=end_timestamp_str,
+            )
 
         return self.execute_query(query=jobs_query)
 
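
For reference, the 'YYYY-MM-DD HH:MM:SS.mmm' strings interpolated above come from formatting a datetime with microseconds and dropping the last three digits. A quick illustration with arbitrary values:

```python
from datetime import datetime, timezone

# strftime("%f") emits microseconds (6 digits); slicing off the last three
# characters leaves millisecond precision, matching the timestamp format the
# generated Dremio SQL uses in its TIMESTAMP literals.
dt = datetime(2024, 5, 1, 12, 30, 45, 123456, tzinfo=timezone.utc)
print(dt.strftime("%Y-%m-%d %H:%M:%S.%f"))       # 2024-05-01 12:30:45.123456
print(dt.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3])  # 2024-05-01 12:30:45.123
```
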
@@ -685,6 +727,27 @@ class DremioAPIOperations:
 
         return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)
 
+    def _could_match_pattern(self, pattern: str, path_components: List[str]) -> bool:
+        """
+        Check if a container path could potentially match a schema pattern.
+        This handles hierarchical path matching for container filtering.
+        """
+        if pattern == ".*":
+            return True
+
+        current_path = ".".join(path_components)
+
+        # Handle simple .* patterns (like "a.b.c.*")
+        if pattern.endswith(".*") and not any(c in pattern for c in "^$[](){}+?\\"):
+            # Simple dotstar pattern - check prefix matching
+            pattern_prefix = pattern[:-2]  # Remove ".*"
+            return current_path.lower().startswith(
+                pattern_prefix.lower()
+            ) or pattern_prefix.lower().startswith(current_path.lower())
+        else:
+            # Complex regex pattern - use existing regex matching logic
+            return self._check_pattern_match(pattern, [current_path], allow_prefix=True)
+
     def should_include_container(self, path: List[str], name: str) -> bool:
         """
         Helper method to check if a container should be included based on schema patterns.
@@ -711,41 +774,8 @@ class DremioAPIOperations:
 
         # Check allow patterns
         for pattern in self.allow_schema_pattern:
-            # For patterns with wildcards, check if this path is a parent of the pattern
-            if "*" in pattern:
-                pattern_parts = pattern.split(".")
-                path_parts = path_components
-
-                # If pattern has exact same number of parts, check each component
-                if len(pattern_parts) == len(path_parts):
-                    matches = True
-                    for p_part, c_part in zip(pattern_parts, path_parts):
-                        if p_part != "*" and p_part.lower() != c_part.lower():
-                            matches = False
-                            break
-                    if matches:
-                        self.report.report_container_scanned(full_path)
-                        return True
-                # Otherwise check if current path is prefix match
-                else:
-                    # Remove the trailing wildcard if present
-                    if pattern_parts[-1] == "*":
-                        pattern_parts = pattern_parts[:-1]
-
-                    for i in range(len(path_parts)):
-                        current_path = ".".join(path_parts[: i + 1])
-                        pattern_prefix = ".".join(pattern_parts[: i + 1])
-
-                        if pattern_prefix.startswith(current_path):
-                            self.report.report_container_scanned(full_path)
-                            return True
-
-            # Direct pattern matching
-            if self._check_pattern_match(
-                pattern=pattern,
-                paths=[full_path],
-                allow_prefix=True,
-            ):
+            # Check if current path could potentially match this pattern
+            if self._could_match_pattern(pattern, path_components):
                 self.report.report_container_scanned(full_path)
                 return True
 
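
The new _could_match_pattern helper treats a plain trailing-`.*` pattern as a case-insensitive prefix check in both directions, so an ancestor container of an allowed schema is still traversed, and it falls back to regex matching when the pattern contains other metacharacters. A standalone restatement of that rule; the could_match function below is illustrative and inferred only from this diff:

```python
import re
from typing import List

def could_match(pattern: str, path_components: List[str]) -> bool:
    """Illustrative restatement of the prefix rule shown in the diff."""
    if pattern == ".*":
        return True
    current_path = ".".join(path_components)
    if pattern.endswith(".*") and not any(c in pattern for c in "^$[](){}+?\\"):
        prefix = pattern[:-2].lower()
        current = current_path.lower()
        # Keep the container if it sits under the allowed prefix, or if it is an
        # ancestor of that prefix (so traversal can still reach deeper schemas).
        return current.startswith(prefix) or prefix.startswith(current)
    return bool(re.match(pattern, current_path, re.IGNORECASE))

print(could_match("sales.marts.*", ["sales"]))           # True: ancestor of the prefix
print(could_match("sales.marts.*", ["sales", "marts"]))  # True: matches the prefix
print(could_match("sales.marts.*", ["finance"]))         # False
```
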
datahub/ingestion/source/dremio/dremio_config.py

@@ -9,6 +9,7 @@ from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
+from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.ingestion.source.ge_profiling_config import GEProfilingBaseConfig
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StatefulStaleMetadataRemovalConfig,
@@ -118,6 +119,7 @@ class DremioSourceMapping(EnvConfigMixin, PlatformInstanceConfigMixin, ConfigMod
 class DremioSourceConfig(
     DremioConnectionConfig,
     StatefulIngestionConfigBase,
+    BaseTimeWindowConfig,
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 ):
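
Mixing BaseTimeWindowConfig into DremioSourceConfig gives the source the standard start_time/end_time window fields. The sketch below only illustrates the mixin idea with a hypothetical pydantic model; it is not the actual DataHub config class, and include_query_lineage is the only field name taken from this diff.

```python
from datetime import datetime, timedelta, timezone
from typing import Optional

from pydantic import BaseModel

class TimeWindowMixinSketch(BaseModel):
    """Hypothetical stand-in for BaseTimeWindowConfig: an optional ingestion window."""
    start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None

class DremioConfigSketch(TimeWindowMixinSketch):
    include_query_lineage: bool = True  # field referenced elsewhere in this diff

cfg = DremioConfigSketch(
    start_time=datetime.now(timezone.utc) - timedelta(days=1),
    end_time=datetime.now(timezone.utc),
)
print(cfg.start_time, cfg.end_time)
```
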
datahub/ingestion/source/dremio/dremio_reporting.py

@@ -1,22 +1,43 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import datetime
+from typing import Optional
 
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
 from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
+from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
+from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
+from datahub.utilities.stats_collections import (
+    TopKDict,
+    float_top_k_dict,
+    int_top_k_dict,
+)
 
 
 @dataclass
 class DremioSourceReport(
-    SQLSourceReport, StaleEntityRemovalSourceReport, IngestionStageReport
+    SQLSourceReport,
+    StaleEntityRemovalSourceReport,
+    IngestionStageReport,
+    BaseTimeWindowReport,
 ):
     num_containers_failed: int = 0
     num_datasets_failed: int = 0
     containers_scanned: int = 0
     containers_filtered: int = 0
 
+    api_calls_total: int = 0
+    api_calls_by_method_and_path: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+    api_call_secs_by_method_and_path: TopKDict[str, float] = field(
+        default_factory=float_top_k_dict
+    )
+
+    sql_aggregator: Optional[SqlAggregatorReport] = None
+
     def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
         # recording total combined latency is not very useful, keeping this method as a placeholder
         # for future implementation of min / max / percentiles etc.
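
The report now accumulates per-endpoint call counts and latencies in TopKDict fields built from the int_top_k_dict/float_top_k_dict factories (the small stats_collections.py change in this release). Since the TopKDict internals are not shown here, the sketch below uses plain defaultdicts to illustrate the same accounting; the ApiCallReportSketch class is hypothetical.

```python
from collections import defaultdict
from dataclasses import dataclass, field
from typing import DefaultDict

@dataclass
class ApiCallReportSketch:
    """Hypothetical report fragment mirroring the new Dremio counters."""

    api_calls_total: int = 0
    api_calls_by_method_and_path: DefaultDict[str, int] = field(
        default_factory=lambda: defaultdict(int)
    )
    api_call_secs_by_method_and_path: DefaultDict[str, float] = field(
        default_factory=lambda: defaultdict(float)
    )

    def record(self, method: str, path: str, seconds: float) -> None:
        key = f"{method} {path}"
        self.api_calls_total += 1
        self.api_calls_by_method_and_path[key] += 1
        self.api_call_secs_by_method_and_path[key] += seconds

report = ApiCallReportSketch()
report.record("GET", "/catalog", 0.12)
report.record("GET", "/catalog", 0.08)
print(report.api_calls_total, dict(report.api_call_secs_by_method_and_path))
```
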
datahub/ingestion/source/dremio/dremio_source.py

@@ -51,7 +51,11 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.ingestion.source_report.ingestion_stage import PROFILING
+from datahub.ingestion.source_report.ingestion_stage import (
+    LINEAGE_EXTRACTION,
+    METADATA_EXTRACTION,
+    PROFILING,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageTypeClass,
     UpstreamClass,
@@ -89,6 +93,7 @@ class DremioSourceMapEntry:
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
+@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
 class DremioSource(StatefulIngestionSourceBase):
     """
     This plugin integrates with Dremio to extract and ingest metadata into DataHub.
@@ -126,6 +131,13 @@ class DremioSource(StatefulIngestionSourceBase):
         self.default_db = "dremio"
         self.config = config
         self.report = DremioSourceReport()
+
+        # Set time window for query lineage extraction
+        self.report.window_start_time, self.report.window_end_time = (
+            self.config.start_time,
+            self.config.end_time,
+        )
+
         self.source_map: Dict[str, DremioSourceMapEntry] = dict()
 
         # Initialize API operations
@@ -154,6 +166,7 @@ class DremioSource(StatefulIngestionSourceBase):
             generate_operations=True,
             usage_config=self.config.usage,
         )
+        self.report.sql_aggregator = self.sql_parsing_aggregator.report
 
         # For profiling
         self.profiler = DremioProfiler(config, self.report, dremio_api)
@@ -190,84 +203,85 @@ class DremioSource(StatefulIngestionSourceBase):
 
         self.source_map = self._build_source_map()
 
-        # Process Containers
-        containers = self.dremio_catalog.get_containers()
-        for container in containers:
-            try:
-                yield from self.process_container(container)
-                logger.info(
-                    f"Dremio container {container.container_name} emitted successfully"
-                )
-            except Exception as exc:
-                self.report.num_containers_failed += 1  # Increment failed containers
-                self.report.report_failure(
-                    message="Failed to process Dremio container",
-                    context=f"{'.'.join(container.path)}.{container.container_name}",
-                    exc=exc,
-                )
+        with self.report.new_stage(METADATA_EXTRACTION):
+            # Process Containers
+            containers = self.dremio_catalog.get_containers()
+            for container in containers:
+                try:
+                    yield from self.process_container(container)
+                    logger.info(
+                        f"Dremio container {container.container_name} emitted successfully"
+                    )
+                except Exception as exc:
+                    self.report.num_containers_failed += 1
+                    self.report.report_failure(
+                        message="Failed to process Dremio container",
+                        context=f"{'.'.join(container.path)}.{container.container_name}",
+                        exc=exc,
+                    )
 
-        # Process Datasets
-        datasets = self.dremio_catalog.get_datasets()
+            # Process Datasets
+            datasets = self.dremio_catalog.get_datasets()
 
-        for dataset_info in datasets:
-            try:
-                yield from self.process_dataset(dataset_info)
-                logger.info(
-                    f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
-                )
-            except Exception as exc:
-                self.report.num_datasets_failed += 1  # Increment failed datasets
-                self.report.report_failure(
-                    message="Failed to process Dremio dataset",
-                    context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
-                    exc=exc,
-                )
+            for dataset_info in datasets:
+                try:
+                    yield from self.process_dataset(dataset_info)
+                    logger.info(
+                        f"Dremio dataset {'.'.join(dataset_info.path)}.{dataset_info.resource_name} emitted successfully"
+                    )
+                except Exception as exc:
+                    self.report.num_datasets_failed += 1  # Increment failed datasets
+                    self.report.report_failure(
+                        message="Failed to process Dremio dataset",
+                        context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
+                        exc=exc,
+                    )
 
-        # Optionally Process Query Lineage
-        if self.config.include_query_lineage:
-            self.get_query_lineage_workunits()
-
-        # Process Glossary Terms
-        glossary_terms = self.dremio_catalog.get_glossary_terms()
-
-        for glossary_term in glossary_terms:
-            try:
-                yield from self.process_glossary_term(glossary_term)
-            except Exception as exc:
-                self.report.report_failure(
-                    message="Failed to process Glossary terms",
-                    context=f"{glossary_term.glossary_term}",
-                    exc=exc,
-                )
+            # Process Glossary Terms
+            glossary_terms = self.dremio_catalog.get_glossary_terms()
 
-        # Generate workunit for aggregated SQL parsing results
-        for mcp in self.sql_parsing_aggregator.gen_metadata():
-            self.report.report_workunit(mcp.as_workunit())
-            yield mcp.as_workunit()
-
-        # Profiling
-        if self.config.is_profiling_enabled():
-            with ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
-                future_to_dataset = {
-                    executor.submit(self.generate_profiles, dataset): dataset
-                    for dataset in datasets
-                }
-
-                for future in as_completed(future_to_dataset):
-                    dataset_info = future_to_dataset[future]
-                    try:
-                        yield from future.result()
-                    except Exception as exc:
-                        self.report.profiling_skipped_other[
-                            dataset_info.resource_name
-                        ] += 1
-                        self.report.report_failure(
-                            message="Failed to profile dataset",
-                            context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
-                            exc=exc,
-                        )
+            for glossary_term in glossary_terms:
+                try:
+                    yield from self.process_glossary_term(glossary_term)
+                except Exception as exc:
+                    self.report.report_failure(
+                        message="Failed to process Glossary terms",
+                        context=f"{glossary_term.glossary_term}",
+                        exc=exc,
+                    )
+
+        # Optionally Process Query Lineage
+        if self.config.include_query_lineage:
+            with self.report.new_stage(LINEAGE_EXTRACTION):
+                self.get_query_lineage_workunits()
+
+        # Generate workunit for aggregated SQL parsing results
+        for mcp in self.sql_parsing_aggregator.gen_metadata():
+            yield mcp.as_workunit()
+
+        # Profiling
+        if self.config.is_profiling_enabled():
+            with self.report.new_stage(PROFILING), ThreadPoolExecutor(
+                max_workers=self.config.profiling.max_workers
+            ) as executor:
+                future_to_dataset = {
+                    executor.submit(self.generate_profiles, dataset): dataset
+                    for dataset in datasets
+                }
+
+                for future in as_completed(future_to_dataset):
+                    dataset_info = future_to_dataset[future]
+                    try:
+                        yield from future.result()
+                    except Exception as exc:
+                        self.report.profiling_skipped_other[
+                            dataset_info.resource_name
+                        ] += 1
+                        self.report.report_failure(
+                            message="Failed to profile dataset",
+                            context=f"{'.'.join(dataset_info.path)}.{dataset_info.resource_name}",
+                            exc=exc,
+                        )
 
     def process_container(
         self, container_info: DremioContainer
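
The work-unit generator above is now wrapped in new_stage(...) blocks (metadata extraction, lineage extraction, profiling), so each phase gets its own timing in the report. Below is a minimal sketch of a stage timer with that shape; it only assumes new_stage behaves like a timing context manager, and the StageReportSketch class is illustrative rather than the DataHub API.

```python
import time
from contextlib import contextmanager
from typing import Dict, Iterator

class StageReportSketch:
    """Hypothetical stage timer mimicking the new_stage(...) usage in the diff."""

    def __init__(self) -> None:
        self.stage_secs: Dict[str, float] = {}

    @contextmanager
    def new_stage(self, name: str) -> Iterator[None]:
        start = time.perf_counter()
        try:
            yield
        finally:
            self.stage_secs[name] = self.stage_secs.get(name, 0.0) + (
                time.perf_counter() - start
            )

report = StageReportSketch()
with report.new_stage("Metadata Extraction"):
    time.sleep(0.01)  # stand-in for container/dataset/glossary processing
with report.new_stage("Lineage Extraction"):
    time.sleep(0.01)  # stand-in for query lineage extraction
print(report.stage_secs)
```
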
@@ -388,8 +402,7 @@ class DremioSource(StatefulIngestionSourceBase):
             env=self.config.env,
             platform_instance=self.config.platform_instance,
         )
-        with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
-            yield from self.profiler.get_workunits(dataset_info, dataset_urn)
+        yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
datahub/ingestion/source/dremio/dremio_sql_queries.py

@@ -1,3 +1,7 @@
+from datetime import datetime, timedelta
+from typing import Optional
+
+
 class DremioSQLQueries:
     QUERY_DATASETS_CE = """
         SELECT* FROM
@@ -235,28 +239,83 @@ class DremioSQLQueries:
             TABLE_NAME ASC
     """
 
-    # Dremio Documentation: https://docs.dremio.com/current/reference/sql/system-tables/jobs_recent/
-    # queried_datasets incorrectly documented as [varchar]. Observed as varchar.
-    # LENGTH used as opposed to ARRAY_SIZE
-    QUERY_ALL_JOBS = """
-        SELECT
-            job_id,
-            user_name,
-            submitted_ts,
-            query,
-            queried_datasets
-        FROM
-            SYS.JOBS_RECENT
-        WHERE
-            STATUS = 'COMPLETED'
-            AND LENGTH(queried_datasets)>0
-            AND user_name != '$dremio$'
-            AND query_type not like '%INTERNAL%'
-    """
+    @staticmethod
+    def _get_default_start_timestamp_millis() -> str:
+        """Get default start timestamp (1 day ago) in milliseconds precision format"""
+        one_day_ago = datetime.now() - timedelta(days=1)
+        return one_day_ago.strftime("%Y-%m-%d %H:%M:%S.%f")[
+            :-3
+        ]  # Truncate to milliseconds
+
+    @staticmethod
+    def _get_default_end_timestamp_millis() -> str:
+        """Get default end timestamp (now) in milliseconds precision format"""
+        now = datetime.now()
+        return now.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]  # Truncate to milliseconds
+
+    @staticmethod
+    def get_query_all_jobs(
+        start_timestamp_millis: Optional[str] = None,
+        end_timestamp_millis: Optional[str] = None,
+    ) -> str:
+        """
+        Get query for all jobs with optional time filtering.
+
+        Args:
+            start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 1 day ago)
+            end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
+
+        Returns:
+            SQL query string with time filtering applied
+        """
+        if start_timestamp_millis is None:
+            start_timestamp_millis = (
+                DremioSQLQueries._get_default_start_timestamp_millis()
+            )
+        if end_timestamp_millis is None:
+            end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()
+
+        return f"""
+        SELECT
+            job_id,
+            user_name,
+            submitted_ts,
+            query,
+            queried_datasets
+        FROM
+            SYS.JOBS_RECENT
+        WHERE
+            STATUS = 'COMPLETED'
+            AND LENGTH(queried_datasets)>0
+            AND user_name != '$dremio$'
+            AND query_type not like '%INTERNAL%'
+            AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
+            AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
+        """
+
+    @staticmethod
+    def get_query_all_jobs_cloud(
+        start_timestamp_millis: Optional[str] = None,
+        end_timestamp_millis: Optional[str] = None,
+    ) -> str:
+        """
+        Get query for all jobs in Dremio Cloud with optional time filtering.
+
+        Args:
+            start_timestamp_millis: Start timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to 7 days ago)
+            end_timestamp_millis: End timestamp in format 'YYYY-MM-DD HH:MM:SS.mmm' (defaults to now)
+
+        Returns:
+            SQL query string with time filtering applied
+        """
+        if start_timestamp_millis is None:
+            start_timestamp_millis = (
+                DremioSQLQueries._get_default_start_timestamp_millis()
+            )
+        if end_timestamp_millis is None:
+            end_timestamp_millis = DremioSQLQueries._get_default_end_timestamp_millis()
 
-    # Dremio Documentation: https://docs.dremio.com/cloud/reference/sql/system-tables/jobs-historical
-    # queried_datasets correctly documented as [varchar]
-    QUERY_ALL_JOBS_CLOUD = """
+        return f"""
         SELECT
             job_id,
             user_name,
@@ -270,6 +329,8 @@ class DremioSQLQueries:
             AND ARRAY_SIZE(queried_datasets)>0
             AND user_name != '$dremio$'
            AND query_type not like '%INTERNAL%'
+            AND submitted_ts >= TIMESTAMP '{start_timestamp_millis}'
+            AND submitted_ts <= TIMESTAMP '{end_timestamp_millis}'
     """
 
     QUERY_TYPES = [
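
The static QUERY_ALL_JOBS* strings are replaced by builders that always apply a submitted_ts window, falling back to a one-day lookback when no bounds are given. A usage sketch based only on the signatures shown in this diff:

```python
from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries

# Explicit millisecond-precision bounds in the 'YYYY-MM-DD HH:MM:SS.mmm' format
# that the builders interpolate into the TIMESTAMP literals.
sql = DremioSQLQueries.get_query_all_jobs(
    start_timestamp_millis="2024-05-01 00:00:00.000",
    end_timestamp_millis="2024-05-02 00:00:00.000",
)
print(sql)

# With no arguments, the builders default to a lookback window ending now.
print(DremioSQLQueries.get_query_all_jobs_cloud())
```
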
datahub/ingestion/source/file.py

@@ -18,7 +18,9 @@ from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -187,6 +189,7 @@ class FileSourceReport(StaleEntityRemovalSourceReport):
 @platform_name("Metadata File")
 @config_class(FileSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin pulls metadata from a previously generated file.