acryl-datahub 0.15.0.2rc3__py3-none-any.whl → 0.15.0.2rc5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/METADATA +2460 -2460
- {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/RECORD +58 -54
- datahub/__init__.py +1 -1
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/migrate.py +2 -2
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/rest_emitter.py +1 -1
- datahub/ingestion/api/source.py +2 -2
- datahub/ingestion/graph/client.py +4 -2
- datahub/ingestion/source/aws/glue.py +14 -1
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +14 -1
- datahub/ingestion/source/iceberg/iceberg.py +10 -3
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/kafka_connect/kafka_connect.py +1 -6
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +0 -5
- datahub/ingestion/source/nifi.py +0 -5
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/redshift.py +1 -0
- datahub/ingestion/source/s3/source.py +10 -14
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +5 -2
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -20
- datahub/ingestion/source/snowflake/snowflake_tag.py +14 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +0 -6
- datahub/ingestion/source/sql/sql_types.py +1 -1
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +0 -6
- datahub/metadata/_schema_classes.py +316 -43
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +296 -87
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc5.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/fivetran/fivetran.py
CHANGED

@@ -16,7 +16,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor,
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.fivetran.config import (
     KNOWN_DATA_PLATFORM_MAPPING,

@@ -291,11 +291,6 @@ class FivetranSource(StatefulIngestionSourceBase):
             dpi = self._generate_dpi_from_job(job, datajob)
             yield from self._get_dpi_workunits(job, dpi)
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = FivetranSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
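
Several sources in this release (Fivetran, Kafka Connect, Metabase, NiFi, Redash) drop their per-source `create` classmethods; the base source class is presumably expected to provide that behaviour now. As a rough sketch only, the removed Fivetran method was equivalent to parsing the config and calling the constructor directly. The import paths and the placeholder recipe dict below are assumptions for illustration, not taken from this diff.

```python
# Hypothetical equivalent of the removed FivetranSource.create(); import paths assumed.
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.fivetran.config import FivetranSourceConfig
from datahub.ingestion.source.fivetran.fivetran import FivetranSource

# Fill with your recipe's `source.config` block; parse_obj will reject an
# incomplete dict, since the connector's required settings must be present.
recipe_source_config: dict = {}

ctx = PipelineContext(run_id="fivetran-local-run")
config = FivetranSourceConfig.parse_obj(recipe_source_config)
source = FivetranSource(config, ctx)
```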
datahub/ingestion/source/gc/execution_request_cleanup.py
CHANGED

@@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
     )
 
     keep_history_max_days: int = Field(
-
+        90,
         description="Maximum number of days to keep execution requests for, per ingestion source",
     )
 

@@ -48,6 +48,10 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
         description="Maximum runtime in seconds for the cleanup task",
     )
 
+    limit_entities_delete: Optional[int] = Field(
+        10000, description="Max number of execution requests to hard delete."
+    )
+
     max_read_errors: int = Field(
         default=10,
         description="Maximum number of read errors before aborting",

@@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport):
     ergc_delete_errors: int = 0
     ergc_start_time: Optional[datetime.datetime] = None
     ergc_end_time: Optional[datetime.datetime] = None
+    ergc_delete_limit_reached: bool = False
+    ergc_runtime_limit_reached: bool = False
 
 
 class CleanupRecord(BaseModel):

@@ -85,12 +91,20 @@ class DatahubExecutionRequestCleanup:
         self.graph = graph
         self.report = report
         self.instance_id = int(time.time())
+        self.last_print_time = 0.0
 
         if config is not None:
             self.config = config
         else:
             self.config = DatahubExecutionRequestCleanupConfig()
 
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
     def _to_cleanup_record(self, entry: Dict) -> CleanupRecord:
         input_aspect = (
             entry.get("aspects", {})

@@ -175,6 +189,7 @@ class DatahubExecutionRequestCleanup:
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
+            self._print_report()
             self.report.ergc_records_read += 1
             key = entry.ingestion_source
 

@@ -225,15 +240,12 @@ class DatahubExecutionRequestCleanup:
                     f"record timestamp: {entry.requested_at}."
                 )
             )
-            self.report.ergc_records_deleted += 1
             yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
         try:
-            logger.info(
-                f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}"
-            )
             self.graph.delete_entity(entry.urn, True)
+            self.report.ergc_records_deleted += 1
         except Exception as e:
             self.report.ergc_delete_errors += 1
             self.report.failure(

@@ -252,10 +264,23 @@ class DatahubExecutionRequestCleanup:
                 >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
             )
         ):
+            self.report.ergc_runtime_limit_reached = True
             logger.info(f"ergc({self.instance_id}): max runtime reached.")
             return True
         return False
 
+    def _reached_delete_limit(self) -> bool:
+        if (
+            self.config.limit_entities_delete
+            and self.report.ergc_records_deleted >= self.config.limit_entities_delete
+        ):
+            logger.info(
+                f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}."
+            )
+            self.report.ergc_delete_limit_reached = True
+            return True
+        return False
+
     def run(self) -> None:
         if not self.config.enabled:
             logger.info(

@@ -274,7 +299,7 @@ class DatahubExecutionRequestCleanup:
             )
 
         for entry in self._scroll_garbage_records():
-            if self._reached_runtime_limit():
+            if self._reached_runtime_limit() or self._reached_delete_limit():
                 break
             self._delete_entry(entry)
 
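
The cleanup task now stops on either a runtime cap or a hard-delete cap, and both conditions are surfaced on the report. An illustrative sketch only, combining the config knobs touched in the hunks above (field names come from the diff; the values are just examples):

```python
# Sketch: configuring the execution-request cleanup limits shown in this diff.
from datahub.ingestion.source.gc.execution_request_cleanup import (
    DatahubExecutionRequestCleanupConfig,
)

cleanup_config = DatahubExecutionRequestCleanupConfig(
    keep_history_max_days=90,      # new default per the diff
    limit_entities_delete=10_000,  # stop once this many requests have been hard deleted
    runtime_limit_seconds=3600,    # existing runtime cap, also checked in run()
)
```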
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
CHANGED

@@ -231,6 +231,15 @@ class SoftDeletedEntitiesCleanup:
     def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
         assert self.ctx.graph
         scroll_id: Optional[str] = None
+
+        batch_size = self.config.batch_size
+        if entity_type == "DATA_PROCESS_INSTANCE":
+            # Due to a bug in Data process instance querying this is a temp workaround
+            # to avoid a giant stacktrace by having a smaller batch size in first call
+            # This will be remove in future version after server with fix has been
+            # around for a while
+            batch_size = 10
+
         while True:
             try:
                 result = self.ctx.graph.execute_graphql(

@@ -240,7 +249,7 @@ class SoftDeletedEntitiesCleanup:
                         "types": [entity_type],
                         "query": "*",
                         "scrollId": scroll_id if scroll_id else None,
-                        "count":
+                        "count": batch_size,
                         "orFilters": [
                             {
                                 "and": [

@@ -263,6 +272,10 @@ class SoftDeletedEntitiesCleanup:
             scroll_across_entities = result.get("scrollAcrossEntities")
             if not scroll_across_entities or not scroll_across_entities.get("count"):
                 break
+            if entity_type == "DATA_PROCESS_INSTANCE":
+                # Temp workaround. See note in beginning of the function
+                # We make the batch size = config after call has succeeded once
+                batch_size = self.config.batch_size
             scroll_id = scroll_across_entities.get("nextScrollId")
             self.report.num_queries_found += scroll_across_entities.get("count")
             for query in scroll_across_entities.get("searchResults"):
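
The DATA_PROCESS_INSTANCE workaround above boils down to a "small first page, normal pages afterwards" scroll pattern. A minimal, self-contained sketch of that pattern, with a stubbed `fetch_page` callable standing in for the `scrollAcrossEntities` GraphQL call (everything here is illustrative, not DataHub API):

```python
from typing import Callable, Iterator, List, Optional, Tuple

def scroll_soft_deleted_urns(
    fetch_page: Callable[[int, Optional[str]], Tuple[List[str], Optional[str]]],
    configured_batch_size: int,
    first_page_size: int = 10,
) -> Iterator[str]:
    """Scroll results page by page, deliberately using a small first page."""
    batch_size = first_page_size
    scroll_id: Optional[str] = None
    while True:
        urns, scroll_id = fetch_page(batch_size, scroll_id)
        if not urns:
            break
        yield from urns
        # After the first successful call, fall back to the configured page size.
        batch_size = configured_batch_size
        if scroll_id is None:
            break
```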
datahub/ingestion/source/iceberg/iceberg.py
CHANGED

@@ -203,7 +203,9 @@ class IcebergSource(StatefulIngestionSourceBase):
                 with PerfTimer() as timer:
                     table = thread_local.local_catalog.load_table(dataset_path)
                     time_taken = timer.elapsed_seconds()
-                    self.report.report_table_load_time(
+                    self.report.report_table_load_time(
+                        time_taken, dataset_name, table.metadata_location
+                    )
                 LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
                 yield from self._create_iceberg_workunit(dataset_name, table)
             except NoSuchPropertyException as e:

@@ -247,7 +249,10 @@ class IcebergSource(StatefulIngestionSourceBase):
                     f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
                 )
             except Exception as e:
-                self.report.report_failure(
+                self.report.report_failure(
+                    "general",
+                    f"Failed to create workunit for dataset {dataset_name}: {e}",
+                )
                 LOGGER.exception(
                     f"Exception while processing table {dataset_path}, skipping it.",
                 )

@@ -312,7 +317,9 @@ class IcebergSource(StatefulIngestionSourceBase):
         dataset_snapshot.aspects.append(schema_metadata)
 
         mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        self.report.report_table_processing_time(
+        self.report.report_table_processing_time(
+            timer.elapsed_seconds(), dataset_name, table.metadata_location
+        )
         yield MetadataWorkUnit(id=dataset_name, mce=mce)
 
         dpi_aspect = self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
datahub/ingestion/source/iceberg/iceberg_common.py
CHANGED

@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional
 from humanfriendly import format_timespan
 from pydantic import Field, validator
 from pyiceberg.catalog import Catalog, load_catalog
+from sortedcontainers import SortedList
 
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetSourceConfigMixin

@@ -146,19 +147,40 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
         return load_catalog(name=catalog_name, **catalog_config)
 
 
+class TopTableTimings:
+    _VALUE_FIELD: str = "timing"
+    top_entites: SortedList
+    _size: int
+
+    def __init__(self, size: int = 10):
+        self._size = size
+        self.top_entites = SortedList(key=lambda x: -x.get(self._VALUE_FIELD, 0))
+
+    def add(self, entity: Dict[str, Any]) -> None:
+        if self._VALUE_FIELD not in entity:
+            return
+        self.top_entites.add(entity)
+        if len(self.top_entites) > self._size:
+            self.top_entites.pop()
+
+    def __str__(self) -> str:
+        if len(self.top_entites) == 0:
+            return "no timings reported"
+        return str(list(self.top_entites))
+
+
 class TimingClass:
-    times:
+    times: SortedList
 
     def __init__(self):
-        self.times =
+        self.times = SortedList()
 
-    def add_timing(self, t):
-        self.times.
+    def add_timing(self, t: float) -> None:
+        self.times.add(t)
 
-    def __str__(self):
+    def __str__(self) -> str:
         if len(self.times) == 0:
             return "no timings reported"
-        self.times.sort()
         total = sum(self.times)
         avg = total / len(self.times)
         return str(

@@ -180,6 +202,9 @@ class IcebergSourceReport(StaleEntityRemovalSourceReport):
     load_table_timings: TimingClass = field(default_factory=TimingClass)
     processing_table_timings: TimingClass = field(default_factory=TimingClass)
     profiling_table_timings: TimingClass = field(default_factory=TimingClass)
+    tables_load_timings: TopTableTimings = field(default_factory=TopTableTimings)
+    tables_profile_timings: TopTableTimings = field(default_factory=TopTableTimings)
+    tables_process_timings: TopTableTimings = field(default_factory=TopTableTimings)
     listed_namespaces: int = 0
     total_listed_tables: int = 0
     tables_listed_per_namespace: TopKDict[str, int] = field(

@@ -201,11 +226,26 @@ class IcebergSourceReport(StaleEntityRemovalSourceReport):
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
 
-    def report_table_load_time(
+    def report_table_load_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
         self.load_table_timings.add_timing(t)
+        self.tables_load_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )
 
-    def report_table_processing_time(
+    def report_table_processing_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
         self.processing_table_timings.add_timing(t)
+        self.tables_process_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )
 
-    def report_table_profiling_time(
+    def report_table_profiling_time(
+        self, t: float, table_name: str, table_metadata_location: str
+    ) -> None:
         self.profiling_table_timings.add_timing(t)
+        self.tables_profile_timings.add(
+            {"table": table_name, "timing": t, "metadata_file": table_metadata_location}
+        )
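
`TopTableTimings`, as added above, keeps only the N slowest entries: the `SortedList` is keyed on the negated timing, so the slowest tables sort first and `pop()` discards the fastest one once the list exceeds `_size`. A quick usage sketch, assuming the class is importable from `datahub.ingestion.source.iceberg.iceberg_common` and using made-up table names and metadata paths:

```python
from datahub.ingestion.source.iceberg.iceberg_common import TopTableTimings

top = TopTableTimings(size=2)
top.add({"table": "db.a", "timing": 1.2, "metadata_file": "s3://bucket/a/metadata.json"})
top.add({"table": "db.b", "timing": 7.5, "metadata_file": "s3://bucket/b/metadata.json"})
top.add({"table": "db.c", "timing": 0.4, "metadata_file": "s3://bucket/c/metadata.json"})

# Only the two slowest tables (db.b and db.a) are retained; db.c was popped.
print(top)
```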
datahub/ingestion/source/iceberg/iceberg_profiler.py
CHANGED

@@ -204,7 +204,9 @@ class IcebergProfiler:
             )
             dataset_profile.fieldProfiles.append(column_profile)
         time_taken = timer.elapsed_seconds()
-        self.report.report_table_profiling_time(
+        self.report.report_table_profiling_time(
+            time_taken, dataset_name, table.metadata_location
+        )
         LOGGER.debug(
             f"Finished profiling of dataset: {dataset_name} in {time_taken}"
         )
datahub/ingestion/source/kafka_connect/kafka_connect.py
CHANGED

@@ -17,7 +17,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.kafka_connect.common import (
     CONNECTOR_CLASS,

@@ -94,11 +94,6 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
         if not jpype.isJVMStarted():
             jpype.startJVM()
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = KafkaConnectSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_connectors_manifest(self) -> Iterable[ConnectorManifest]:
         """Get Kafka Connect connectors manifest using REST API.
         Enrich with lineages metadata.
datahub/ingestion/source/metabase.py
CHANGED

@@ -23,7 +23,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor,
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,

@@ -789,11 +789,6 @@ class MetabaseSource(StatefulIngestionSourceBase):
 
         return platform, dbname, schema, platform_instance
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = MetabaseConfig.parse_obj(config_dict)
-        return cls(ctx, config)
-
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
             *super().get_workunit_processors(),
datahub/ingestion/source/nifi.py
CHANGED

@@ -484,11 +484,6 @@ class NifiSource(Source):
     def rest_api_base_url(self):
         return self.config.site_url[: -len("nifi/")] + "nifi-api/"
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
-        config = NifiSourceConfig.parse_obj(config_dict)
-        return cls(config, ctx)
-
     def get_report(self) -> SourceReport:
         return self.report
 
datahub/ingestion/source/powerbi_report_server/report_server.py
CHANGED

@@ -485,7 +485,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
         self.filtered_reports.append(view)
 
 
-@platform_name("PowerBI")
+@platform_name("PowerBI Report Server")
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
datahub/ingestion/source/redash.py
CHANGED

@@ -369,11 +369,6 @@ class RedashSource(Source):
         else:
             raise ValueError(f"Failed to connect to {self.config.connect_uri}/api")
 
-    @classmethod
-    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
-        config = RedashConfig.parse_obj(config_dict)
-        return cls(ctx, config)
-
     def _get_chart_data_source(self, data_source_id: Optional[int] = None) -> Dict:
         url = f"/api/data_sources/{data_source_id}"
         resp = self.client._get(url).json()
datahub/ingestion/source/s3/source.py
CHANGED

@@ -6,9 +6,8 @@ import pathlib
 import re
 import time
 from datetime import datetime
-from itertools import groupby
 from pathlib import PurePath
-from typing import
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
 from urllib.parse import urlparse
 
 import smart_open.compression as so_compression

@@ -41,6 +40,7 @@ from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
     get_key_prefix,
+    group_s3_objects_by_dirname,
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator

@@ -75,6 +75,9 @@ from datahub.metadata.schema_classes import (
 from datahub.telemetry import stats, telemetry
 from datahub.utilities.perf_timer import PerfTimer
 
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import Bucket
+
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)

@@ -842,7 +845,7 @@ class S3Source(StatefulIngestionSourceBase):
     def get_folder_info(
         self,
         path_spec: PathSpec,
-        bucket:
+        bucket: "Bucket",
         prefix: str,
     ) -> List[Folder]:
         """

@@ -857,22 +860,15 @@ class S3Source(StatefulIngestionSourceBase):
 
         Parameters:
         path_spec (PathSpec): The path specification used to determine partitioning.
-        bucket (
+        bucket (Bucket): The S3 bucket object.
         prefix (str): The prefix path in the S3 bucket to list objects from.
 
         Returns:
         List[Folder]: A list of Folder objects representing the partitions found.
         """
-
-        prefix_to_list = prefix
-        files = list(
-            bucket.objects.filter(Prefix=f"{prefix_to_list}").page_size(PAGE_SIZE)
-        )
-        files = sorted(files, key=lambda a: a.last_modified)
-        grouped_files = groupby(files, lambda x: x.key.rsplit("/", 1)[0])
-
         partitions: List[Folder] = []
-
+        s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
+        for key, group in group_s3_objects_by_dirname(s3_objects).items():
             file_size = 0
             creation_time = None
             modification_time = None

@@ -904,7 +900,7 @@ class S3Source(StatefulIngestionSourceBase):
                 Folder(
                     partition_id=id,
                     is_partition=bool(id),
-                    creation_time=creation_time if creation_time else None,
+                    creation_time=creation_time if creation_time else None,  # type: ignore[arg-type]
                     modification_time=modification_time,
                     sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
                     size=file_size,
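
The new `group_s3_objects_by_dirname` helper (added in `datahub/ingestion/source/aws/s3_util.py`, per the file list) replaces the inline `sorted` + `groupby` logic removed above. The helper's real implementation is not shown in this diff; the following is only a hedged sketch of the grouping the removed code performed, keyed on the directory part of each object key:

```python
# Assumed behaviour only, modelled on the removed rsplit("/", 1) grouping.
from collections import defaultdict
from typing import Any, Dict, Iterable, List

def group_s3_objects_by_dirname(s3_objects: Iterable[Any]) -> Dict[str, List[Any]]:
    """Group S3 object summaries by the directory portion of their key."""
    grouped: Dict[str, List[Any]] = defaultdict(list)
    for obj in s3_objects:
        dirname = obj.key.rsplit("/", 1)[0]
        grouped[dirname].append(obj)
    return grouped
```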
datahub/ingestion/source/snowflake/snowflake_config.py
CHANGED

@@ -244,6 +244,11 @@ class SnowflakeV2Config(
         description="""Optional. Allowed values are `without_lineage`, `with_lineage`, and `skip` (default). `without_lineage` only extracts tags that have been applied directly to the given entity. `with_lineage` extracts both directly applied and propagated tags, but will be significantly slower. See the [Snowflake documentation](https://docs.snowflake.com/en/user-guide/object-tagging.html#tag-lineage) for information about tag lineage/propagation. """,
     )
 
+    extract_tags_as_structured_properties: bool = Field(
+        default=False,
+        description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
+    )
+
     include_external_url: bool = Field(
         default=True,
         description="Whether to populate Snowsight url for Snowflake Objects",

@@ -263,6 +268,14 @@ class SnowflakeV2Config(
         description="List of regex patterns for tags to include in ingestion. Only used if `extract_tags` is enabled.",
     )
 
+    structured_property_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description=(
+            "List of regex patterns for structured properties to include in ingestion."
+            " Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
+        ),
+    )
+
     # This is required since access_history table does not capture whether the table was temporary table.
     temporary_tables_pattern: List[str] = Field(
         default=DEFAULT_TEMP_TABLES_PATTERNS,
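
Per the descriptions above, the new behaviour requires both `extract_tags` and `extract_tags_as_structured_properties`, optionally scoped with `structured_property_pattern`. A hypothetical `source.config` fragment for the Snowflake source, expressed here as a Python dict with connection settings omitted and example values only:

```python
# Hypothetical Snowflake source config fragment; only the tag-related keys
# come from this diff, and the allow pattern is just an example filter.
snowflake_source_config = {
    "extract_tags": "without_lineage",              # or "with_lineage"
    "extract_tags_as_structured_properties": True,  # emit structured properties instead of tags
    "structured_property_pattern": {
        "allow": ["prod_db\\..*"],
    },
}
```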
datahub/ingestion/source/snowflake/snowflake_schema.py
CHANGED

@@ -45,15 +45,18 @@ class SnowflakeTag:
     name: str
     value: str
 
-    def
+    def tag_display_name(self) -> str:
         return f"{self.name}: {self.value}"
 
-    def
+    def tag_identifier(self) -> str:
         return f"{self._id_prefix_as_str()}:{self.value}"
 
     def _id_prefix_as_str(self) -> str:
         return f"{self.database}.{self.schema}.{self.name}"
 
+    def structured_property_identifier(self) -> str:
+        return f"snowflake.{self.database}.{self.schema}.{self.name}"
+
 
 @dataclass
 class SnowflakeColumn(BaseColumn):
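
The new `structured_property_identifier` yields the fully qualified, `snowflake.`-prefixed name used when tags are emitted as structured properties, alongside the existing display-name and identifier helpers. A small sketch, assuming `SnowflakeTag` keeps its `database`/`schema`/`name`/`value` fields and remains importable from this module (the tag values below are made up):

```python
from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeTag

tag = SnowflakeTag(database="ANALYTICS", schema="PUBLIC", name="pii_level", value="high")

print(tag.tag_display_name())                # pii_level: high
print(tag.tag_identifier())                  # ANALYTICS.PUBLIC.pii_level:high
print(tag.structured_property_identifier())  # snowflake.ANALYTICS.PUBLIC.pii_level
```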