acryl-datahub 0.15.0.2rc3__py3-none-any.whl → 0.15.0.2rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.

Files changed (58)
  1. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/METADATA +2460 -2460
  2. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/RECORD +58 -54
  3. datahub/__init__.py +1 -1
  4. datahub/cli/delete_cli.py +3 -3
  5. datahub/cli/migrate.py +2 -2
  6. datahub/emitter/mcp_builder.py +27 -0
  7. datahub/emitter/rest_emitter.py +1 -1
  8. datahub/ingestion/api/source.py +2 -2
  9. datahub/ingestion/graph/client.py +4 -2
  10. datahub/ingestion/source/aws/glue.py +14 -1
  11. datahub/ingestion/source/aws/s3_util.py +24 -1
  12. datahub/ingestion/source/delta_lake/source.py +0 -5
  13. datahub/ingestion/source/demo_data.py +1 -1
  14. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  15. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  16. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +14 -1
  17. datahub/ingestion/source/iceberg/iceberg.py +10 -3
  18. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  19. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  20. datahub/ingestion/source/kafka_connect/kafka_connect.py +1 -6
  21. datahub/ingestion/source/metabase.py +1 -6
  22. datahub/ingestion/source/mlflow.py +0 -5
  23. datahub/ingestion/source/nifi.py +0 -5
  24. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  25. datahub/ingestion/source/redash.py +0 -5
  26. datahub/ingestion/source/redshift/redshift.py +1 -0
  27. datahub/ingestion/source/s3/source.py +10 -14
  28. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  29. datahub/ingestion/source/snowflake/snowflake_schema.py +5 -2
  30. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -20
  31. datahub/ingestion/source/snowflake/snowflake_tag.py +14 -4
  32. datahub/ingestion/source/snowflake/snowflake_v2.py +0 -6
  33. datahub/ingestion/source/sql/sql_types.py +1 -1
  34. datahub/ingestion/source/sql/sql_utils.py +5 -0
  35. datahub/ingestion/source/superset.py +1 -6
  36. datahub/ingestion/source/tableau/tableau.py +0 -6
  37. datahub/metadata/_schema_classes.py +316 -43
  38. datahub/metadata/_urns/urn_defs.py +69 -15
  39. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  40. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  41. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  42. datahub/metadata/schema.avsc +296 -87
  43. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  44. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  45. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  46. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  47. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  48. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  49. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  50. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  51. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  52. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  53. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  54. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  55. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  56. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/WHEEL +0 -0
  57. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/entry_points.txt +0 -0
  58. {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/fivetran/fivetran.py
@@ -16,7 +16,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.fivetran.config import (
     KNOWN_DATA_PLATFORM_MAPPING,
@@ -291,11 +291,6 @@ class FivetranSource(StatefulIngestionSourceBase):
         dpi = self._generate_dpi_from_job(job, datajob)
         yield from self._get_dpi_workunits(job, dpi)
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = FivetranSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),

datahub/ingestion/source/gc/execution_request_cleanup.py
@@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
     )
 
     keep_history_max_days: int = Field(
-        30,
+        90,
         description="Maximum number of days to keep execution requests for, per ingestion source",
     )
 
@@ -48,6 +48,10 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
         description="Maximum runtime in seconds for the cleanup task",
     )
 
+    limit_entities_delete: Optional[int] = Field(
+        10000, description="Max number of execution requests to hard delete."
+    )
+
     max_read_errors: int = Field(
         default=10,
         description="Maximum number of read errors before aborting",
@@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport):
     ergc_delete_errors: int = 0
     ergc_start_time: Optional[datetime.datetime] = None
     ergc_end_time: Optional[datetime.datetime] = None
+    ergc_delete_limit_reached: bool = False
+    ergc_runtime_limit_reached: bool = False
 
 
 class CleanupRecord(BaseModel):
@@ -85,12 +91,20 @@ class DatahubExecutionRequestCleanup:
         self.graph = graph
         self.report = report
         self.instance_id = int(time.time())
+        self.last_print_time = 0.0
 
         if config is not None:
             self.config = config
         else:
             self.config = DatahubExecutionRequestCleanupConfig()
 
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
     def _to_cleanup_record(self, entry: Dict) -> CleanupRecord:
         input_aspect = (
             entry.get("aspects", {})
@@ -175,6 +189,7 @@ class DatahubExecutionRequestCleanup:
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
+            self._print_report()
             self.report.ergc_records_read += 1
             key = entry.ingestion_source
 
@@ -225,15 +240,12 @@ class DatahubExecutionRequestCleanup:
                     f"record timestamp: {entry.requested_at}."
                 )
             )
-            self.report.ergc_records_deleted += 1
             yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
         try:
-            logger.info(
-                f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}"
-            )
             self.graph.delete_entity(entry.urn, True)
+            self.report.ergc_records_deleted += 1
         except Exception as e:
             self.report.ergc_delete_errors += 1
             self.report.failure(
@@ -252,10 +264,23 @@ class DatahubExecutionRequestCleanup:
                 >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
             )
         ):
+            self.report.ergc_runtime_limit_reached = True
            logger.info(f"ergc({self.instance_id}): max runtime reached.")
            return True
        return False
 
+    def _reached_delete_limit(self) -> bool:
+        if (
+            self.config.limit_entities_delete
+            and self.report.ergc_records_deleted >= self.config.limit_entities_delete
+        ):
+            logger.info(
+                f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}."
+            )
+            self.report.ergc_delete_limit_reached = True
+            return True
+        return False
+
     def run(self) -> None:
         if not self.config.enabled:
             logger.info(
@@ -274,7 +299,7 @@ class DatahubExecutionRequestCleanup:
             )
 
         for entry in self._scroll_garbage_records():
-            if self._reached_runtime_limit():
+            if self._reached_runtime_limit() or self._reached_delete_limit():
                 break
             self._delete_entry(entry)
 
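
The cleanup loop now stops when either the runtime limit or the new delete cap is hit. A minimal sketch (not part of this diff) of how the new cap might be configured, using only field names visible in the hunks above; all other fields are assumed to keep their defaults:

    # Hypothetical configuration sketch for the GC execution-request cleanup.
    # Field names come from the diff above; values are illustrative.
    from datahub.ingestion.source.gc.execution_request_cleanup import (
        DatahubExecutionRequestCleanupConfig,
    )

    config = DatahubExecutionRequestCleanupConfig(
        keep_history_max_days=90,    # new default in this release
        limit_entities_delete=5000,  # stop after hard-deleting 5000 execution requests
    )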

datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
@@ -231,6 +231,15 @@ class SoftDeletedEntitiesCleanup:
     def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
         assert self.ctx.graph
         scroll_id: Optional[str] = None
+
+        batch_size = self.config.batch_size
+        if entity_type == "DATA_PROCESS_INSTANCE":
+            # Due to a bug in Data process instance querying this is a temp workaround
+            # to avoid a giant stacktrace by having a smaller batch size in first call
+            # This will be remove in future version after server with fix has been
+            # around for a while
+            batch_size = 10
+
         while True:
             try:
                 result = self.ctx.graph.execute_graphql(
@@ -240,7 +249,7 @@ class SoftDeletedEntitiesCleanup:
                     "types": [entity_type],
                     "query": "*",
                     "scrollId": scroll_id if scroll_id else None,
-                    "count": self.config.batch_size,
+                    "count": batch_size,
                     "orFilters": [
                         {
                             "and": [
@@ -263,6 +272,10 @@ class SoftDeletedEntitiesCleanup:
             scroll_across_entities = result.get("scrollAcrossEntities")
             if not scroll_across_entities or not scroll_across_entities.get("count"):
                 break
+            if entity_type == "DATA_PROCESS_INSTANCE":
+                # Temp workaround. See note in beginning of the function
+                # We make the batch size = config after call has succeeded once
+                batch_size = self.config.batch_size
             scroll_id = scroll_across_entities.get("nextScrollId")
             self.report.num_queries_found += scroll_across_entities.get("count")
             for query in scroll_across_entities.get("searchResults"):

datahub/ingestion/source/iceberg/iceberg.py
@@ -203,7 +203,9 @@ class IcebergSource(StatefulIngestionSourceBase):
             with PerfTimer() as timer:
                 table = thread_local.local_catalog.load_table(dataset_path)
                 time_taken = timer.elapsed_seconds()
-                self.report.report_table_load_time(time_taken)
+                self.report.report_table_load_time(
+                    time_taken, dataset_name, table.metadata_location
+                )
             LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
             yield from self._create_iceberg_workunit(dataset_name, table)
         except NoSuchPropertyException as e:
@@ -247,7 +249,10 @@ class IcebergSource(StatefulIngestionSourceBase):
                 f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
             )
         except Exception as e:
-            self.report.report_failure("general", f"Failed to create workunit: {e}")
+            self.report.report_failure(
+                "general",
+                f"Failed to create workunit for dataset {dataset_name}: {e}",
+            )
             LOGGER.exception(
                 f"Exception while processing table {dataset_path}, skipping it.",
             )
@@ -312,7 +317,9 @@ class IcebergSource(StatefulIngestionSourceBase):
         dataset_snapshot.aspects.append(schema_metadata)
 
         mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        self.report.report_table_processing_time(timer.elapsed_seconds())
+        self.report.report_table_processing_time(
+            timer.elapsed_seconds(), dataset_name, table.metadata_location
+        )
         yield MetadataWorkUnit(id=dataset_name, mce=mce)
 
         dpi_aspect = self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)

datahub/ingestion/source/iceberg/iceberg_common.py
@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional
 from humanfriendly import format_timespan
 from pydantic import Field, validator
 from pyiceberg.catalog import Catalog, load_catalog
+from sortedcontainers import SortedList
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetSourceConfigMixin
@@ -146,19 +147,40 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
         return load_catalog(name=catalog_name, **catalog_config)
 
 
+class TopTableTimings:
+    _VALUE_FIELD: str = "timing"
+    top_entites: SortedList
+    _size: int
+
+    def __init__(self, size: int = 10):
+        self._size = size
+        self.top_entites = SortedList(key=lambda x: -x.get(self._VALUE_FIELD, 0))
+
+    def add(self, entity: Dict[str, Any]) -> None:
+        if self._VALUE_FIELD not in entity:
+            return
+        self.top_entites.add(entity)
+        if len(self.top_entites) > self._size:
+            self.top_entites.pop()
+
+    def __str__(self) -> str:
+        if len(self.top_entites) == 0:
+            return "no timings reported"
+        return str(list(self.top_entites))
+
+
 class TimingClass:
-    times: List[int]
+    times: SortedList
 
     def __init__(self):
-        self.times = []
+        self.times = SortedList()
 
-    def add_timing(self, t):
-        self.times.append(t)
+    def add_timing(self, t: float) -> None:
+        self.times.add(t)
 
-    def __str__(self):
+    def __str__(self) -> str:
         if len(self.times) == 0:
             return "no timings reported"
-        self.times.sort()
         total = sum(self.times)
         avg = total / len(self.times)
         return str(
@@ -180,6 +202,9 @@ class IcebergSourceReport(StaleEntityRemovalSourceReport):
     load_table_timings: TimingClass = field(default_factory=TimingClass)
     processing_table_timings: TimingClass = field(default_factory=TimingClass)
     profiling_table_timings: TimingClass = field(default_factory=TimingClass)
+    tables_load_timings: TopTableTimings = field(default_factory=TopTableTimings)
+    tables_profile_timings: TopTableTimings = field(default_factory=TopTableTimings)
+    tables_process_timings: TopTableTimings = field(default_factory=TopTableTimings)
     listed_namespaces: int = 0
     total_listed_tables: int = 0
     tables_listed_per_namespace: TopKDict[str, int] = field(
@@ -201,11 +226,26 @@ class IcebergSourceReport(StaleEntityRemovalSourceReport):
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
 
-    def report_table_load_time(self, t: float) -> None:
+    def report_table_load_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
         self.load_table_timings.add_timing(t)
+        self.tables_load_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )
 
-    def report_table_processing_time(self, t: float) -> None:
+    def report_table_processing_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
         self.processing_table_timings.add_timing(t)
+        self.tables_process_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )
 
-    def report_table_profiling_time(self, t: float) -> None:
+    def report_table_profiling_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
        self.profiling_table_timings.add_timing(t)
+        self.tables_profile_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )
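
The new TopTableTimings keeps only the slowest entries: the SortedList is ordered by descending "timing", and once more than `size` records have been added, the entry with the smallest timing is popped. A small illustration (table names and paths are made up):

    # Illustrative use of the TopTableTimings class added above.
    timings = TopTableTimings(size=2)
    timings.add({"table": "db.sch.slow", "timing": 12.4, "metadata_file": "s3://bkt/slow/metadata.json"})
    timings.add({"table": "db.sch.fast", "timing": 0.3, "metadata_file": "s3://bkt/fast/metadata.json"})
    timings.add({"table": "db.sch.mid", "timing": 4.1, "metadata_file": "s3://bkt/mid/metadata.json"})
    print(timings)  # only the two slowest tables remain in the report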

datahub/ingestion/source/iceberg/iceberg_profiler.py
@@ -204,7 +204,9 @@ class IcebergProfiler:
                 )
             dataset_profile.fieldProfiles.append(column_profile)
         time_taken = timer.elapsed_seconds()
-        self.report.report_table_profiling_time(time_taken)
+        self.report.report_table_profiling_time(
+            time_taken, dataset_name, table.metadata_location
+        )
         LOGGER.debug(
             f"Finished profiling of dataset: {dataset_name} in {time_taken}"
         )

datahub/ingestion/source/kafka_connect/kafka_connect.py
@@ -17,7 +17,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.kafka_connect.common import (
     CONNECTOR_CLASS,
@@ -94,11 +94,6 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
         if not jpype.isJVMStarted():
             jpype.startJVM()
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = KafkaConnectSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_connectors_manifest(self) -> Iterable[ConnectorManifest]:
         """Get Kafka Connect connectors manifest using REST API.
         Enrich with lineages metadata.

datahub/ingestion/source/metabase.py
@@ -23,7 +23,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -789,11 +789,6 @@ class MetabaseSource(StatefulIngestionSourceBase):
 
         return platform, dbname, schema, platform_instance
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = MetabaseConfig.parse_obj(config_dict)
-        return cls(ctx, config)
-
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),

datahub/ingestion/source/mlflow.py
@@ -333,8 +333,3 @@ class MLflowSource(Source):
             aspect=global_tags,
         )
         return wu
-
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = MLflowConfig.parse_obj(config_dict)
-        return cls(ctx, config)

datahub/ingestion/source/nifi.py
@@ -484,11 +484,6 @@ class NifiSource(Source):
     def rest_api_base_url(self):
         return self.config.site_url[: -len("nifi/")] + "nifi-api/"
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
-        config = NifiSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_report(self) -> SourceReport:
         return self.report
 

datahub/ingestion/source/powerbi_report_server/report_server.py
@@ -485,7 +485,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
         self.filtered_reports.append(view)
 
 
-@platform_name("PowerBI")
+@platform_name("PowerBI Report Server")
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")

datahub/ingestion/source/redash.py
@@ -369,11 +369,6 @@ class RedashSource(Source):
         else:
             raise ValueError(f"Failed to connect to {self.config.connect_uri}/api")
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = RedashConfig.parse_obj(config_dict)
-        return cls(ctx, config)
-
     def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict:
         url = f"/api/data_sources/{data_source_id}"
         resp = self.client._get(url).json()

datahub/ingestion/source/redshift/redshift.py
@@ -276,6 +276,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         "HLLSKETCH": NullType,
         "TIMETZ": TimeType,
         "VARBYTE": StringType,
+        "SUPER": NullType,
     }
 
     def get_platform_instance_id(self) -> str:

datahub/ingestion/source/s3/source.py
@@ -6,9 +6,8 @@ import pathlib
 import re
 import time
 from datetime import datetime
-from itertools import groupby
 from pathlib import PurePath
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
 from urllib.parse import urlparse
 
 import smart_open.compression as so_compression
@@ -41,6 +40,7 @@ from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
     get_key_prefix,
+    group_s3_objects_by_dirname,
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
@@ -75,6 +75,9 @@ from datahub.metadata.schema_classes import (
 from datahub.telemetry import stats, telemetry
 from datahub.utilities.perf_timer import PerfTimer
 
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import Bucket
+
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
@@ -842,7 +845,7 @@ class S3Source(StatefulIngestionSourceBase):
     def get_folder_info(
         self,
         path_spec: PathSpec,
-        bucket: Any,  # Todo: proper type
+        bucket: "Bucket",
         prefix: str,
     ) -> List[Folder]:
         """
@@ -857,22 +860,15 @@ class S3Source(StatefulIngestionSourceBase):
 
         Parameters:
         path_spec (PathSpec): The path specification used to determine partitioning.
-        bucket (Any): The S3 bucket object.
+        bucket (Bucket): The S3 bucket object.
         prefix (str): The prefix path in the S3 bucket to list objects from.
 
         Returns:
         List[Folder]: A list of Folder objects representing the partitions found.
         """
-
-        prefix_to_list = prefix
-        files = list(
-            bucket.objects.filter(Prefix=f"{prefix_to_list}").page_size(PAGE_SIZE)
-        )
-        files = sorted(files, key=lambda a: a.last_modified)
-        grouped_files = groupby(files, lambda x: x.key.rsplit("/", 1)[0])
-
         partitions: List[Folder] = []
-        for key, group in grouped_files:
+        s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
+        for key, group in group_s3_objects_by_dirname(s3_objects).items():
             file_size = 0
             creation_time = None
             modification_time = None
@@ -904,7 +900,7 @@ class S3Source(StatefulIngestionSourceBase):
                 Folder(
                     partition_id=id,
                     is_partition=bool(id),
-                    creation_time=creation_time if creation_time else None,
+                    creation_time=creation_time if creation_time else None,  # type: ignore[arg-type]
                     modification_time=modification_time,
                     sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
                     size=file_size,
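
get_folder_info now delegates grouping to group_s3_objects_by_dirname from datahub/ingestion/source/aws/s3_util.py (+24 -1, not expanded here). Based on the inline groupby logic it replaces, a hypothetical sketch of what such a helper could look like:

    # Hypothetical sketch only; the real helper lives in s3_util.py and may differ.
    from collections import defaultdict
    from typing import Any, Dict, Iterable, List

    def group_s3_objects_by_dirname(s3_objects: Iterable[Any]) -> Dict[str, List[Any]]:
        """Group listed S3 objects by the directory portion of their key."""
        grouped: Dict[str, List[Any]] = defaultdict(list)
        for obj in s3_objects:
            dirname = obj.key.rsplit("/", 1)[0]  # same grouping key as the removed code
            grouped[dirname].append(obj)
        return grouped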

datahub/ingestion/source/snowflake/snowflake_config.py
@@ -244,6 +244,11 @@ class SnowflakeV2Config(
         description="""Optional. Allowed values are `without_lineage`, `with_lineage`, and `skip` (default). `without_lineage` only extracts tags that have been applied directly to the given entity. `with_lineage` extracts both directly applied and propagated tags, but will be significantly slower. See the [Snowflake documentation](https://docs.snowflake.com/en/user-guide/object-tagging.html#tag-lineage) for information about tag lineage/propagation. """,
     )
 
+    extract_tags_as_structured_properties: bool = Field(
+        default=False,
+        description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
+    )
+
     include_external_url: bool = Field(
         default=True,
         description="Whether to populate Snowsight url for Snowflake Objects",
@@ -263,6 +268,14 @@ class SnowflakeV2Config(
         description="List of regex patterns for tags to include in ingestion. Only used if `extract_tags` is enabled.",
     )
 
+    structured_property_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description=(
+            "List of regex patterns for structured properties to include in ingestion."
+            " Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
+        ),
+    )
+
     # This is required since access_history table does not capture whether the table was temporary table.
     temporary_tables_pattern: List[str] = Field(
         default=DEFAULT_TEMP_TABLES_PATTERNS,
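
The two new options only take effect together with `extract_tags`. A recipe-style sketch of the relevant source-config fragment, written as a Python dict (field names come from the diff above; connection settings and other required Snowflake options are omitted):

    # Illustrative config fragment; the allow pattern is a made-up example.
    snowflake_config_fragment = {
        "extract_tags": "without_lineage",
        "extract_tags_as_structured_properties": True,
        "structured_property_pattern": {
            "allow": ["governance\\..*"],
        },
    }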

datahub/ingestion/source/snowflake/snowflake_tag.py
@@ -45,15 +45,18 @@ class SnowflakeTag:
     name: str
     value: str
 
-    def display_name(self) -> str:
+    def tag_display_name(self) -> str:
         return f"{self.name}: {self.value}"
 
-    def identifier(self) -> str:
+    def tag_identifier(self) -> str:
         return f"{self._id_prefix_as_str()}:{self.value}"
 
     def _id_prefix_as_str(self) -> str:
         return f"{self.database}.{self.schema}.{self.name}"
 
+    def structured_property_identifier(self) -> str:
+        return f"snowflake.{self.database}.{self.schema}.{self.name}"
+
 
 @dataclass
 class SnowflakeColumn(BaseColumn):
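
With the renames and the new method, the same Snowflake tag can now yield either a DataHub tag identifier or a structured-property identifier. A small illustration, assuming SnowflakeTag is constructed with these keyword fields (values are made up):

    # Identifiers produced by the methods shown above.
    tag = SnowflakeTag(database="ANALYTICS", schema="PUBLIC", name="PII", value="email")
    tag.tag_display_name()                # "PII: email"
    tag.tag_identifier()                  # "ANALYTICS.PUBLIC.PII:email"
    tag.structured_property_identifier()  # "snowflake.ANALYTICS.PUBLIC.PII"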