acryl-datahub 1.0.0rc7__py3-none-any.whl → 1.0.0rc8__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/METADATA +2405 -2405
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/RECORD +52 -52
- datahub/_version.py +1 -1
- datahub/configuration/git.py +1 -3
- datahub/ingestion/glossary/classification_mixin.py +1 -1
- datahub/ingestion/graph/client.py +1 -1
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/dbt/dbt_common.py +1 -1
- datahub/ingestion/source/file.py +5 -2
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/ge_data_profiler.py +11 -14
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
- datahub/ingestion/source/identity/okta.py +1 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +2 -1
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +6 -3
- datahub/ingestion/source/openapi_parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +2 -1
- datahub/ingestion/source/s3/config.py +2 -4
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +1 -1
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +2 -2
- datahub/ingestion/source/sql/sql_common.py +2 -2
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +4 -2
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/superset.py +65 -37
- datahub/ingestion/source/tableau/tableau.py +1 -5
- datahub/lite/duckdb_lite.py +3 -9
- datahub/sdk/dataset.py +3 -3
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc8.dist-info}/top_level.txt +0 -0
@@ -2,8 +2,9 @@ import json
 import logging
 import threading
 import uuid
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
+from dateutil import parser as dateutil_parser
 from pyiceberg.catalog import Catalog
 from pyiceberg.exceptions import (
     NoSuchIcebergTableError,
@@ -81,6 +82,7 @@ from datahub.metadata.schema_classes import (
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
+    TimeStampClass,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -183,16 +185,9 @@ class IcebergSource(StatefulIngestionSourceBase):
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()
 
-        def
-
-
-            if not self.config.table_pattern.allowed(dataset_name):
-                # Dataset name is rejected by pattern, report as dropped.
-                self.report.report_dropped(dataset_name)
-                LOGGER.debug(
-                    f"Skipping table {dataset_name} due to not being allowed by the config pattern"
-                )
-                return
+        def _try_processing_dataset(
+            dataset_path: Tuple[str, ...], dataset_name: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
                     LOGGER.debug(
@@ -248,10 +243,31 @@ class IcebergSource(StatefulIngestionSourceBase):
                 LOGGER.warning(
                     f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
                 )
+            except ValueError as e:
+                if "Could not initialize FileIO" not in str(e):
+                    raise
+                self.report.warning(
+                    "Could not initialize FileIO",
+                    f"Could not initialize FileIO for {dataset_path} due to: {e}",
+                )
+
+        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
+            try:
+                LOGGER.debug(f"Processing dataset for path {dataset_path}")
+                dataset_name = ".".join(dataset_path)
+                if not self.config.table_pattern.allowed(dataset_name):
+                    # Dataset name is rejected by pattern, report as dropped.
+                    self.report.report_dropped(dataset_name)
+                    LOGGER.debug(
+                        f"Skipping table {dataset_name} due to not being allowed by the config pattern"
+                    )
+                    return
+
+                yield from _try_processing_dataset(dataset_path, dataset_name)
             except Exception as e:
                 self.report.report_failure(
                     "general",
-                    f"Failed to create workunit for dataset {
+                    f"Failed to create workunit for dataset {dataset_path}: {e}",
                 )
                 LOGGER.exception(
                     f"Exception while processing table {dataset_path}, skipping it.",
@@ -288,6 +304,7 @@ class IcebergSource(StatefulIngestionSourceBase):
         )
 
         # Dataset properties aspect.
+        additional_properties = {}
         custom_properties = table.metadata.properties.copy()
         custom_properties["location"] = table.metadata.location
         custom_properties["format-version"] = str(table.metadata.format_version)
@@ -299,10 +316,27 @@ class IcebergSource(StatefulIngestionSourceBase):
             custom_properties["manifest-list"] = (
                 table.current_snapshot().manifest_list
             )
+            additional_properties["lastModified"] = TimeStampClass(
+                int(table.current_snapshot().timestamp_ms)
+            )
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                additional_properties["created"] = TimeStampClass(
+                    int(dt.timestamp() * 1000)
+                )
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
         dataset_properties = DatasetPropertiesClass(
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
             customProperties=custom_properties,
+            lastModified=additional_properties.get("lastModified"),
+            created=additional_properties.get("created"),
+            qualifiedName=dataset_name,
         )
         dataset_snapshot.aspects.append(dataset_properties)
         # Dataset ownership aspect.
@@ -1,4 +1,5 @@
 import logging
+import threading
 from dataclasses import dataclass, field
 from typing import Any, Dict, Optional
 
@@ -156,18 +157,21 @@ class TopTableTimings:
     def __init__(self, size: int = 10):
         self._size = size
         self.top_entites = SortedList(key=lambda x: -x.get(self._VALUE_FIELD, 0))
+        self._lock = threading.Lock()
 
     def add(self, entity: Dict[str, Any]) -> None:
         if self._VALUE_FIELD not in entity:
             return
-        self.
-
-        self.top_entites.
+        with self._lock:
+            self.top_entites.add(entity)
+            if len(self.top_entites) > self._size:
+                self.top_entites.pop()
 
     def __str__(self) -> str:
-
-
-
+        with self._lock:
+            if len(self.top_entites) == 0:
+                return "no timings reported"
+            return str(list(self.top_entites))
 
 
 class TimingClass:
@@ -175,24 +179,31 @@ class TimingClass:
 
     def __init__(self):
         self.times = SortedList()
+        self._lock = threading.Lock()
 
     def add_timing(self, t: float) -> None:
-        self.
+        with self._lock:
+            self.times.add(t)
 
     def __str__(self) -> str:
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self._lock:
+            if len(self.times) == 0:
+                return "no timings reported"
+            total = sum(self.times)
+            avg = total / len(self.times)
+            return str(
+                {
+                    "average_time": format_timespan(avg, detailed=True, max_units=3),
+                    "min_time": format_timespan(
+                        self.times[0], detailed=True, max_units=3
+                    ),
+                    "max_time": format_timespan(
+                        self.times[-1], detailed=True, max_units=3
+                    ),
+                    # total_time does not provide correct information in case we run in more than 1 thread
+                    "total_time": format_timespan(total, detailed=True, max_units=3),
+                }
+            )
 
 
 @dataclass
@@ -568,9 +568,7 @@ class OktaSource(StatefulIngestionSourceBase):
         if (
            self.config.include_deprovisioned_users is False
            and okta_user.status == UserStatus.DEPROVISIONED
-        )
-            return False
-        elif (
+        ) or (
            self.config.include_suspended_users is False
            and okta_user.status == UserStatus.SUSPENDED
        ):
@@ -447,13 +447,10 @@ class DebeziumSourceConnector(BaseConnector):
     ) -> DebeziumParser:
         connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "")
 
-        if
-
-
-
-            database_name=None,
-        )
-        elif connector_class == "MySqlConnector":
+        if (
+            connector_class == "io.debezium.connector.mysql.MySqlConnector"
+            or connector_class == "MySqlConnector"
+        ):
             parser = self.DebeziumParser(
                 source_platform="mysql",
                 server_name=self.get_server_name(connector_manifest),
@@ -205,8 +205,9 @@ class LookerAPI:
     def folder_ancestors(
         self,
         folder_id: str,
-        fields: Union[str, List[str]] =
+        fields: Optional[Union[str, List[str]]] = None,
     ) -> Sequence[Folder]:
+        fields = fields or ["id", "name", "parent_id"]
         self.client_stats.folder_calls += 1
         try:
             return self.client.folder_ancestors(
@@ -464,9 +464,10 @@ def process_lookml_template_language(
     source_config: LookMLSourceConfig,
     view_lkml_file_dict: dict,
     reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] =
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
     resolve_constants: bool = False,
 ) -> None:
+    manifest_constants = manifest_constants or {}
     if "views" not in view_lkml_file_dict:
         return
 
@@ -507,9 +508,10 @@ def load_and_preprocess_file(
     path: Union[str, pathlib.Path],
     source_config: LookMLSourceConfig,
     reporter: LookMLSourceReport,
-    manifest_constants: Dict[str, "LookerConstant"] =
+    manifest_constants: Optional[Dict[str, "LookerConstant"]] = None,
     resolve_constants: bool = False,
 ) -> dict:
+    manifest_constants = manifest_constants or {}
     parsed = load_lkml(path)
 
     process_lookml_template_language(
@@ -1006,8 +1006,9 @@ class LookMLSource(StatefulIngestionSourceBase):
     def report_skipped_unreachable_views(
         self,
         viewfile_loader: LookerViewFileLoader,
-        processed_view_map: Dict[str, Set[str]] =
+        processed_view_map: Optional[Dict[str, Set[str]]] = None,
     ) -> None:
+        processed_view_map = processed_view_map or {}
         view_files: Dict[str, List[pathlib.Path]] = {}
         for project, folder_path in self.base_projects_folder.items():
             folder = pathlib.Path(folder_path)
@@ -104,8 +104,8 @@ class FineGrainedLineageConfig(ConfigModel):
 
 class EntityNodeConfig(ConfigModel):
     entity: EntityConfig
-    upstream: Optional[List["EntityNodeConfig"]]
-    fineGrainedLineages: Optional[List[FineGrainedLineageConfig]]
+    upstream: Optional[List["EntityNodeConfig"]] = None
+    fineGrainedLineages: Optional[List[FineGrainedLineageConfig]] = None
 
 
 # https://pydantic-docs.helpmanual.io/usage/postponed_annotations/ required for when you reference a model within itself
@@ -292,7 +292,7 @@ class Neo4jSource(StatefulIngestionSourceBase):
         return record["properties"]
 
     def get_relationships(self, record: dict) -> dict:
-        return record.get("relationships",
+        return record.get("relationships", {})
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
datahub/ingestion/source/nifi.py CHANGED
@@ -1234,11 +1234,14 @@ class NifiSource(StatefulIngestionSourceBase):
         job_type: str,
         description: Optional[str],
         job_properties: Optional[Dict[str, str]] = None,
-        inlets: List[str] =
-        outlets: List[str] =
-        inputJobs: List[str] =
+        inlets: Optional[List[str]] = None,
+        outlets: Optional[List[str]] = None,
+        inputJobs: Optional[List[str]] = None,
         status: Optional[str] = None,
     ) -> Iterable[MetadataWorkUnit]:
+        inlets = inlets or []
+        outlets = outlets or []
+        inputJobs = inputJobs or []
         logger.debug(f"Begining construction of job workunit for {job_urn}")
         if job_properties:
             job_properties = {k: v for k, v in job_properties.items() if v is not None}
@@ -167,7 +167,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
     Try to determine if example data is defined for the endpoint, and return it
     """
     data = {}
-    if "content" in base_res
+    if "content" in base_res:
         res_cont = base_res["content"]
         if "application/json" in res_cont.keys():
             ex_field = None
@@ -188,7 +188,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
             )
         elif "text/csv" in res_cont.keys():
             data = res_cont["text/csv"]["schema"]
-    elif "examples" in base_res
+    elif "examples" in base_res:
         data = base_res["examples"]["application/json"]
 
     return data
@@ -2,7 +2,7 @@ import functools
 import importlib.resources as pkg_resource
 import logging
 import os
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 import lark
 from lark import Lark, Tree
@@ -65,8 +65,9 @@ def get_upstream_tables(
     platform_instance_resolver: AbstractDataPlatformInstanceResolver,
     ctx: PipelineContext,
     config: PowerBiDashboardSourceConfig,
-    parameters: Dict[str, str] =
+    parameters: Optional[Dict[str, str]] = None,
 ) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]:
+    parameters = parameters or {}
     if table.expression is None:
         logger.debug(f"There is no M-Query expression in table {table.full_name}")
         return []
@@ -70,13 +70,14 @@ def get_first_rule(tree: Tree, rule: str) -> Optional[Tree]:
     return expression_tree
 
 
-def token_values(tree: Tree, parameters: Dict[str, str] =
+def token_values(tree: Tree, parameters: Optional[Dict[str, str]] = None) -> List[str]:
     """
     :param tree: Tree to traverse
     :param parameters: If parameters is not an empty dict, it will try to resolve identifier variable references
                        using the values in 'parameters'.
     :return: List of leaf token data
     """
+    parameters = parameters or {}
     values: List[str] = []
 
     def internal(node: Union[Tree, Token]) -> None:
@@ -890,9 +890,7 @@ class Mapper:
                     set(user_rights) & set(self.__config.ownership.owner_criteria)
                 )
                 > 0
-            ):
-                user_mcps.extend(self.to_datahub_user(user))
-            elif self.__config.ownership.owner_criteria is None:
+            ) or self.__config.ownership.owner_criteria is None:
                 user_mcps.extend(self.to_datahub_user(user))
             else:
                 continue
@@ -380,8 +380,9 @@ class DataResolverBase(ABC):
     def itr_pages(
         self,
         endpoint: str,
-        parameter_override: Dict =
+        parameter_override: Optional[Dict] = None,
     ) -> Iterator[List[Dict]]:
+        parameter_override = parameter_override or {}
         params: dict = {
             "$skip": 0,
             "$top": self.TOP,
@@ -196,7 +196,7 @@ class PowerBiReportServerAPI:
         }
 
         reports: List[Any] = []
-        for report_type in report_types_mapping
+        for report_type in report_types_mapping:
             report_get_endpoint: str = API_ENDPOINTS[report_type]
             # Replace place holders
             report_get_endpoint_http = report_get_endpoint.format(
@@ -17,8 +17,9 @@ class WebsocketConnection:
         self.handle = [-1]
 
     def _build_websocket_request_dict(
-        self, method: str, params: Union[Dict, List] =
+        self, method: str, params: Optional[Union[Dict, List]] = None
     ) -> Dict:
+        params = params or {}
         return {
             "jsonrpc": "2.0",
             "id": self.request_id,
@@ -37,11 +38,12 @@ class WebsocketConnection:
         return {}
 
     def websocket_send_request(
-        self, method: str, params: Union[Dict, List] =
+        self, method: str, params: Optional[Union[Dict, List]] = None
     ) -> Dict:
         """
         Method to send request to websocket
         """
+        params = params or {}
         self.request_id += 1
         request = self._build_websocket_request_dict(method, params)
         response = self._send_request(request=request)
@@ -421,8 +421,9 @@ class RedashSource(StatefulIngestionSourceBase):
         return database_name
 
     def _get_datasource_urns(
-        self, data_source: Dict, sql_query_data: Dict =
+        self, data_source: Dict, sql_query_data: Optional[Dict] = None
     ) -> Optional[List[str]]:
+        sql_query_data = sql_query_data or {}
         platform = self._get_platform_based_on_datasource(data_source)
         database_name = self._get_database_name_based_on_datasource(data_source)
         data_source_syntax = data_source.get("syntax")
@@ -154,10 +154,8 @@ class DataLakeSourceConfig(
         return path_specs
 
     @pydantic.validator("platform", always=True)
-    def platform_valid(cls, platform:
-        inferred_platform = values.get(
-            "platform", None
-        )  # we may have inferred it above
+    def platform_valid(cls, platform: Any, values: dict) -> str:
+        inferred_platform = values.get("platform")  # we may have inferred it above
         platform = platform or inferred_platform
         if not platform:
             raise ValueError("platform must not be empty")
@@ -834,7 +834,7 @@ class S3Source(StatefulIngestionSourceBase):
                 min=min,
             )
             folders.extend(folders_list)
-            if
+            if path_spec.traversal_method != FolderTraversalMethod.ALL:
                 return folders
         if folders:
             return folders
@@ -847,7 +847,7 @@ class S3Source(StatefulIngestionSourceBase):
         path_spec: PathSpec,
         bucket: "Bucket",
         prefix: str,
-    ) ->
+    ) -> Iterable[Folder]:
         """
         Retrieves all the folders in a path by listing all the files in the prefix.
         If the prefix is a full path then only that folder will be extracted.
@@ -877,51 +877,30 @@ class S3Source(StatefulIngestionSourceBase):
         s3_objects = (
             obj
             for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
-            if _is_allowed_path(
+            if _is_allowed_path(
+                path_spec, self.create_s3_path(obj.bucket_name, obj.key)
+            )
         )
-
-        partitions: List[Folder] = []
         grouped_s3_objects_by_dirname = groupby_unsorted(
             s3_objects,
             key=lambda obj: obj.key.rsplit("/", 1)[0],
         )
-        for
-
-
-
-
-
-
-
-
-
-
-
-
-
-                logger.warning(
-                    f"Unable to find any files in the folder {key}. Skipping..."
-                )
-                continue
-
-            id = path_spec.get_partition_from_path(
-                self.create_s3_path(max_file.bucket_name, max_file.key)
+        for _, group in grouped_s3_objects_by_dirname:
+            max_file = max(group, key=lambda x: x.last_modified)
+            max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key)
+
+            # If partition_id is None, it means the folder is not a partition
+            partition_id = path_spec.get_partition_from_path(max_file_s3_path)
+
+            yield Folder(
+                partition_id=partition_id,
+                is_partition=bool(partition_id),
+                creation_time=min(obj.last_modified for obj in group),
+                modification_time=max_file.last_modified,
+                sample_file=max_file_s3_path,
+                size=sum(obj.size for obj in group),
             )
 
-            # If id is None, it means the folder is not a partition
-            partitions.append(
-                Folder(
-                    partition_id=id,
-                    is_partition=bool(id),
-                    creation_time=creation_time if creation_time else None,  # type: ignore[arg-type]
-                    modification_time=modification_time,
-                    sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
-                    size=file_size,
-                )
-            )
-
-        return partitions
-
     def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]:
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
@@ -1000,7 +979,7 @@ class S3Source(StatefulIngestionSourceBase):
                     min=True,
                 )
                 dirs_to_process.append(dirs_to_process_min[0])
-            folders = []
+            folders: List[Folder] = []
             for dir in dirs_to_process:
                 logger.info(f"Getting files from folder: {dir}")
                 prefix_to_process = urlparse(dir).path.lstrip("/")
@@ -615,7 +615,7 @@ class SalesforceSource(StatefulIngestionSourceBase):
             prefix = "\\" if text.startswith("#") else ""
             desc += f"\n\n{prefix}{text}"
 
-        text = field.get("InlineHelpText"
+        text = field.get("InlineHelpText")
         if text:
             prefix = "\\" if text.startswith("#") else ""
             desc += f"\n\n{prefix}{text}"
@@ -149,7 +149,7 @@ def construct_schema(
 
     extended_schema: Dict[Tuple[str, ...], SchemaDescription] = {}
 
-    for field_path in schema
+    for field_path in schema:
         field_types = schema[field_path]["types"]
         field_type: Union[str, type] = "mixed"
 
@@ -125,7 +125,7 @@ class SnowflakeConnectionConfig(ConfigModel):
 
     @pydantic.validator("authentication_type", always=True)
     def authenticator_type_is_valid(cls, v, values):
-        if v not in _VALID_AUTH_TYPES
+        if v not in _VALID_AUTH_TYPES:
             raise ValueError(
                 f"unsupported authenticator type '{v}' was provided,"
                 f" use one of {list(_VALID_AUTH_TYPES.keys())}"
@@ -396,7 +396,7 @@ class AthenaSource(SQLAlchemySource):
             metadata.table_type if metadata.table_type else ""
         )
 
-        location: Optional[str] = custom_properties.get("location"
+        location: Optional[str] = custom_properties.get("location")
         if location is not None:
             if location.startswith("s3://"):
                 location = make_s3_urn(location, self.config.env)
@@ -538,7 +538,7 @@ class AthenaSource(SQLAlchemySource):
                 column_name=column["name"],
                 column_type=column["type"],
                 inspector=inspector,
-                description=column.get("comment"
+                description=column.get("comment"),
                 nullable=column.get("nullable", True),
                 is_part_of_key=(
                     True
@@ -204,7 +204,7 @@ def get_column_type(
     """
 
     TypeClass: Optional[Type] = None
-    for sql_type in _field_type_mapping
+    for sql_type in _field_type_mapping:
         if isinstance(column_type, sql_type):
             TypeClass = _field_type_mapping[sql_type]
             break
@@ -973,7 +973,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                     inspector=inspector,
                 )
             ),
-            description=column.get("comment"
+            description=column.get("comment"),
             nullable=column["nullable"],
             recursive=False,
             globalTags=gtc,
@@ -317,10 +317,10 @@ def resolve_snowflake_modified_type(type_string: str) -> Any:
     match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
     if match:
         modified_type_base = match.group(1)  # Extract the base type
-        return SNOWFLAKE_TYPES_MAP.get(modified_type_base
+        return SNOWFLAKE_TYPES_MAP.get(modified_type_base)
 
     # Fallback for types without precision/scale
-    return SNOWFLAKE_TYPES_MAP.get(type_string
+    return SNOWFLAKE_TYPES_MAP.get(type_string)
 
 
 # see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32