acryl-datahub 1.2.0.6__py3-none-any.whl → 1.2.0.7__py3-none-any.whl
This diff shows the changes between package versions as published to their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/METADATA +2629 -2543
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/RECORD +83 -75
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/graphql/operation.py +1 -1
- datahub/ingestion/autogenerated/capability_summary.json +46 -6
- datahub/ingestion/autogenerated/lineage.json +3 -2
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
- datahub/ingestion/source/dbt/dbt_common.py +74 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_source.py +4 -0
- datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +33 -0
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +5 -0
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +9 -6
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/redshift.py +19 -106
- datahub/ingestion/source/s3/source.py +65 -59
- datahub/ingestion/source/snowflake/constants.py +2 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
- datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -7
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -1
- datahub/ingestion/source/sql/hive_metastore.py +1 -0
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +62 -3
- datahub/ingestion/source/sql_queries.py +24 -2
- datahub/ingestion/source/state/checkpoint.py +3 -28
- datahub/ingestion/source/unity/config.py +74 -9
- datahub/ingestion/source/unity/proxy.py +167 -5
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +24 -0
- datahub/ingestion/source/unity/report.py +5 -0
- datahub/ingestion/source/unity/source.py +111 -1
- datahub/ingestion/source/usage/usage_common.py +1 -0
- datahub/metadata/_internal_schema_classes.py +573 -517
- datahub/metadata/_urns/urn_defs.py +1748 -1748
- datahub/metadata/schema.avsc +18564 -18484
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
- datahub/metadata/schemas/LogicalParent.avsc +104 -100
- datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/chart.py +36 -22
- datahub/sdk/dashboard.py +38 -62
- datahub/sdk/lineage_client.py +6 -26
- datahub/sdk/main_client.py +7 -3
- datahub/sdk/search_filters.py +16 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/dataset.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/upgrade/upgrade.py +14 -2
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/excel/source.py (new file)

@@ -0,0 +1,662 @@
+import glob
+import io
+import logging
+import os
+import re
+from datetime import datetime, timezone
+from enum import Enum, auto
+from io import BytesIO
+from pathlib import PurePath
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
+from urllib.parse import urlparse
+
+from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    capability,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.aws.s3_boto_utils import get_s3_tags
+from datahub.ingestion.source.aws.s3_util import (
+    get_bucket_name,
+    get_bucket_relative_path,
+    strip_s3_prefix,
+)
+from datahub.ingestion.source.azure.abs_folder_utils import (
+    get_abs_tags,
+)
+from datahub.ingestion.source.azure.abs_utils import (
+    get_container_relative_path,
+    strip_abs_prefix,
+)
+from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
+from datahub.ingestion.source.excel.config import ExcelSourceConfig
+from datahub.ingestion.source.excel.excel_file import ExcelFile, ExcelTable
+from datahub.ingestion.source.excel.profiling import ExcelProfiler
+from datahub.ingestion.source.excel.report import ExcelSourceReport
+from datahub.ingestion.source.excel.util import gen_dataset_name
+from datahub.ingestion.source.s3.source import BrowsePath
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionSourceBase,
+)
+from datahub.metadata.com.linkedin.pegasus2avro.common import TimeStamp
+from datahub.metadata.schema_classes import (
+    BooleanTypeClass,
+    ChangeTypeClass,
+    DatasetPropertiesClass,
+    DateTypeClass,
+    GlobalTagsClass,
+    NullTypeClass,
+    NumberTypeClass,
+    OtherSchemaClass,
+    RecordTypeClass,
+    SchemaFieldClass as SchemaField,
+    SchemaFieldDataTypeClass as SchemaFieldDataType,
+    SchemaMetadataClass as SchemaMetadata,
+    StringTypeClass,
+)
+from datahub.utilities.perf_timer import PerfTimer
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+field_type_mapping: Dict[str, Type] = {
+    "int8": NumberTypeClass,
+    "int16": NumberTypeClass,
+    "int32": NumberTypeClass,
+    "int64": NumberTypeClass,
+    "uint8": NumberTypeClass,
+    "uint16": NumberTypeClass,
+    "uint32": NumberTypeClass,
+    "uint64": NumberTypeClass,
+    "Int8": NumberTypeClass,
+    "Int16": NumberTypeClass,
+    "Int32": NumberTypeClass,
+    "Int64": NumberTypeClass,
+    "UInt8": NumberTypeClass,
+    "UInt16": NumberTypeClass,
+    "UInt32": NumberTypeClass,
+    "UInt64": NumberTypeClass,
+    "intp": NumberTypeClass,
+    "uintp": NumberTypeClass,
+    "float16": NumberTypeClass,
+    "float32": NumberTypeClass,
+    "float64": NumberTypeClass,
+    "float128": NumberTypeClass,
+    "Float32": NumberTypeClass,
+    "Float64": NumberTypeClass,
+    "complex64": NumberTypeClass,
+    "complex128": NumberTypeClass,
+    "complex256": NumberTypeClass,
+    "bool": BooleanTypeClass,
+    "boolean": BooleanTypeClass,
+    "object": StringTypeClass,
+    "string": StringTypeClass,
+    "datetime64": DateTypeClass,
+    "datetime64[ns]": DateTypeClass,
+    "datetime64[ns, tz]": DateTypeClass,
+    "timedelta64": DateTypeClass,
+    "timedelta64[ns]": DateTypeClass,
+    "period": DateTypeClass,
+    "period[D]": DateTypeClass,
+    "period[M]": DateTypeClass,
+    "period[Y]": DateTypeClass,
+    "category": RecordTypeClass,
+    "interval": RecordTypeClass,
+    "sparse": RecordTypeClass,
+    "NA": NullTypeClass,
+}
+
+
+ALLOWED_EXTENSIONS = [".xlsx", ".xlsm", ".xltx", ".xltm"]
+
+
+class UriType(Enum):
+    HTTP = auto()
+    HTTPS = auto()
+    LOCAL_FILE = auto()
+    ABSOLUTE_PATH = auto()
+    RELATIVE_PATH = auto()
+    S3 = auto()
+    S3A = auto()
+    ABS = auto()
+    UNKNOWN = auto()
+
+
+@platform_name("Excel")
+@config_class(ExcelSourceConfig)
+@support_status(SupportStatus.INCUBATING)
+@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
+@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
+@capability(
+    SourceCapability.DELETION_DETECTION,
+    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    supported=True,
+)
+class ExcelSource(StatefulIngestionSourceBase):
+    config: ExcelSourceConfig
+    report: ExcelSourceReport
+    container_WU_creator: ContainerWUCreator
+    platform: str = "excel"
+
+    def __init__(self, ctx: PipelineContext, config: ExcelSourceConfig):
+        super().__init__(config, ctx)
+        self.ctx = ctx
+        self.config = config
+        self.report: ExcelSourceReport = ExcelSourceReport()
+
+    @classmethod
+    def create(cls, config_dict: dict, ctx: PipelineContext) -> "ExcelSource":
+        config = ExcelSourceConfig.parse_obj(config_dict)
+        return cls(ctx, config)
+
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
+
+    @staticmethod
+    def uri_type(uri: str) -> Tuple[UriType, str]:
+        if not uri or not isinstance(uri, str):
+            return UriType.UNKNOWN, ""
+
+        uri = uri.strip()
+        parsed = urlparse(uri)
+        scheme = parsed.scheme.lower()
+
+        if scheme == "http":
+            return UriType.HTTP, uri[7:]
+        elif scheme == "https":
+            if parsed.netloc and ".blob.core.windows.net" in parsed.netloc:
+                return UriType.ABS, uri[8:]
+            else:
+                return UriType.HTTPS, uri[8:]
+        elif scheme == "file":
+            if uri.startswith("file:///"):
+                return UriType.LOCAL_FILE, uri[7:]
+
+        if scheme == "s3":
+            return UriType.S3, uri[5:]
+        elif scheme == "s3a":
+            return UriType.S3A, uri[6:]
+
+        if scheme:
+            return UriType.UNKNOWN, uri[len(scheme) + 3 :]
+
+        if os.path.isabs(uri):
+            return UriType.ABSOLUTE_PATH, uri
+        else:
+            return UriType.RELATIVE_PATH, uri
+
+    @staticmethod
+    def is_excel_file(path: str) -> bool:
+        _, ext = os.path.splitext(path)
+        return ext.lower() in ALLOWED_EXTENSIONS
+
+    @staticmethod
+    def local_browser(path_spec: str) -> Iterable[BrowsePath]:
+        matching_paths = glob.glob(path_spec, recursive=True)
+        matching_files = [path for path in matching_paths if os.path.isfile(path)]
+
+        for file in sorted(matching_files):
+            full_path = PurePath(os.path.normpath(file)).as_posix()
+            yield BrowsePath(
+                file=full_path,
+                timestamp=datetime.fromtimestamp(
+                    os.path.getmtime(full_path), timezone.utc
+                ),
+                size=os.path.getsize(full_path),
+                partitions=[],
+            )
+
+    def get_local_file(self, file_path: str) -> Union[BytesIO, None]:
+        try:
+            with open(file_path, "rb") as f:
+                bytes_io = io.BytesIO(f.read())
+                bytes_io.seek(0)
+                return bytes_io
+        except Exception as e:
+            self.report.report_file_dropped(file_path)
+            self.report.warning(
+                message="Error reading local Excel file",
+                context=f"Path={file_path}",
+                exc=e,
+            )
+            return None
+
+    @staticmethod
+    def get_prefix(relative_path: str) -> str:
+        index = re.search(r"[*|{]", relative_path)
+        if index:
+            return relative_path[: index.start()]
+        else:
+            return relative_path
+
+    @staticmethod
+    def create_s3_path(bucket_name: str, key: str) -> str:
+        return f"s3://{bucket_name}/{key}"
+
+    def create_abs_path(self, key: str) -> str:
+        if self.config.azure_config:
+            account_name = self.config.azure_config.account_name
+            container_name = self.config.azure_config.container_name
+            return (
+                f"https://{account_name}.blob.core.windows.net/{container_name}/{key}"
+            )
+        return ""
+
+    @staticmethod
+    def strip_file_prefix(path: str) -> str:
+        if path.startswith("/"):
+            return path[1:]
+        else:
+            return path
+
+    def s3_browser(self, path_spec: str) -> Iterable[BrowsePath]:
+        if self.config.aws_config is None:
+            raise ValueError("aws_config not set. Cannot browse s3")
+        s3 = self.config.aws_config.get_s3_resource(self.config.verify_ssl)
+        bucket_name = get_bucket_name(path_spec)
+        logger.debug(f"Scanning bucket: {bucket_name}")
+        bucket = s3.Bucket(bucket_name)
+        prefix = self.get_prefix(get_bucket_relative_path(path_spec))
+        logger.debug(f"Scanning objects with prefix:{prefix}")
+
+        for obj in bucket.objects.filter(Prefix=prefix).page_size(1000):
+            s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+            logger.debug(f"Path: {s3_path}")
+
+            yield BrowsePath(
+                file=s3_path,
+                timestamp=obj.last_modified,
+                size=obj.size,
+                partitions=[],
+                content_type=None,
+            )
+
+    def get_s3_file(self, path_spec: str) -> Union[BytesIO, None]:
+        if self.config.aws_config is None:
+            raise ValueError("aws_config not set. Cannot browse s3")
+        s3 = self.config.aws_config.get_s3_resource(self.config.verify_ssl)
+        bucket_name = get_bucket_name(path_spec)
+        key = get_bucket_relative_path(path_spec)
+        logger.debug(f"Getting file: {key} from bucket: {bucket_name}")
+        try:
+            obj = s3.Object(bucket_name, key)
+            file_content = obj.get()["Body"].read()
+            binary_stream = io.BytesIO(file_content)
+            binary_stream.seek(0)
+            return binary_stream
+        except Exception as e:
+            self.report.report_file_dropped(path_spec)
+            self.report.warning(
+                message="Error reading Excel file from S3",
+                context=f"Path={path_spec}",
+                exc=e,
+            )
+            return None
+
+    def process_s3_tags(
+        self, path_spec: str, dataset_urn: str
+    ) -> Iterable[MetadataWorkUnit]:
+        bucket_name = get_bucket_name(path_spec)
+        key = get_bucket_relative_path(path_spec)
+
+        s3_tags = get_s3_tags(
+            bucket_name,
+            key,
+            dataset_urn,
+            self.config.aws_config,
+            self.ctx,
+            self.config.use_s3_bucket_tags,
+            self.config.use_s3_object_tags,
+            self.config.verify_ssl,
+        )
+
+        if s3_tags:
+            yield from self.process_global_tags(s3_tags, dataset_urn)
+
+    def abs_browser(self, path_spec: str) -> Iterable[BrowsePath]:
+        if self.config.azure_config is None:
+            raise ValueError("azure_config not set. Cannot browse Azure Blob Storage")
+        abs_blob_service_client = self.config.azure_config.get_blob_service_client()
+        container_client = abs_blob_service_client.get_container_client(
+            self.config.azure_config.container_name
+        )
+
+        container_name = self.config.azure_config.container_name
+        logger.debug(f"Scanning container: {container_name}")
+
+        prefix = self.get_prefix(get_container_relative_path(path_spec))
+        logger.debug(f"Scanning objects with prefix: {prefix}")
+
+        for obj in container_client.list_blobs(
+            name_starts_with=f"{prefix}", results_per_page=1000
+        ):
+            abs_path = self.create_abs_path(obj.name)
+            logger.debug(f"Path: {abs_path}")
+
+            yield BrowsePath(
+                file=abs_path,
+                timestamp=obj.last_modified,
+                size=obj.size,
+                partitions=[],
+                content_type=None,
+            )
+
+    def get_abs_file(self, path_spec: str) -> Union[BytesIO, None]:
+        if self.config.azure_config is None:
+            raise ValueError("azure_config not set. Cannot browse Azure Blob Storage")
+        abs_blob_service_client = self.config.azure_config.get_blob_service_client()
+        container_client = abs_blob_service_client.get_container_client(
+            self.config.azure_config.container_name
+        )
+
+        container_name = self.config.azure_config.container_name
+        blob_path = get_container_relative_path(path_spec)
+        logger.debug(f"Getting file: {blob_path} from container: {container_name}")
+
+        try:
+            blob_client = container_client.get_blob_client(blob_path)
+            download_stream = blob_client.download_blob()
+            file_content = download_stream.readall()
+            binary_stream = io.BytesIO(file_content)
+            binary_stream.seek(0)
+            return binary_stream
+        except Exception as e:
+            self.report.report_file_dropped(path_spec)
+            self.report.warning(
+                message="Error reading Excel file from Azure Blob Storage",
+                context=f"Path={path_spec}",
+                exc=e,
+            )
+            return None
+
+    def process_abs_tags(
+        self, path_spec: str, dataset_urn: str
+    ) -> Iterable[MetadataWorkUnit]:
+        if (
+            self.config.azure_config
+            and self.config.azure_config.container_name is not None
+        ):
+            container_name = self.config.azure_config.container_name
+            blob_path = get_container_relative_path(path_spec)
+
+            abs_tags = get_abs_tags(
+                container_name,
+                blob_path,
+                dataset_urn,
+                self.config.azure_config,
+                self.ctx,
+                self.config.use_abs_blob_tags,
+            )
+
+            if abs_tags:
+                yield from self.process_global_tags(abs_tags, dataset_urn)
+
+    @staticmethod
+    def get_field_type(field_type: str) -> SchemaFieldDataType:
+        type_class = field_type_mapping.get(field_type, NullTypeClass)
+        return SchemaFieldDataType(type=type_class())
+
+    def construct_schema_field(self, f_name: str, f_type: str) -> SchemaField:
+        logger.debug(f"Field: {f_name} Type: {f_type}")
+        return SchemaField(
+            fieldPath=f_name,
+            nativeDataType=f_type,
+            type=self.get_field_type(f_type),
+            description=None,
+            nullable=False,
+            recursive=False,
+        )
+
+    def construct_schema_metadata(
+        self,
+        name: str,
+        dataset: ExcelTable,
+    ) -> SchemaMetadata:
+        canonical_schema: List[SchemaField] = []
+
+        # Get data types for each column
+        data_types = dataset.df.dtypes.to_dict()
+
+        # Convert numpy types to string representation for better readability
+        data_types = {col: str(dtype) for col, dtype in data_types.items()}
+
+        for f_name, f_type in data_types.items():
+            canonical_schema.append(self.construct_schema_field(f_name, f_type))
+
+        return SchemaMetadata(
+            schemaName=name,
+            platform=f"urn:li:dataPlatform:{self.platform}",
+            version=0,
+            hash="",
+            platformSchema=OtherSchemaClass(rawSchema=""),
+            fields=canonical_schema,
+        )
+
+    @staticmethod
+    def get_dataset_attributes(metadata: Dict[str, Any]) -> dict:
+        result = {}
+        for key, value in metadata.items():
+            result[key] = str(value)
+        return result
+
+    @staticmethod
+    def process_global_tags(
+        global_tags: GlobalTagsClass, dataset_urn: str
+    ) -> Iterable[MetadataWorkUnit]:
+        yield MetadataChangeProposalWrapper(
+            entityType="dataset",
+            entityUrn=dataset_urn,
+            aspect=global_tags,
+            changeType=ChangeTypeClass.UPSERT,
+        ).as_workunit()
+
+    def process_dataset(
+        self,
+        relative_path: str,
+        full_path: str,
+        filename: str,
+        table: ExcelTable,
+        source_type: UriType,
+    ) -> Iterable[MetadataWorkUnit]:
+        self.report.report_worksheet_processed()
+        dataset_name = gen_dataset_name(
+            relative_path, table.sheet_name, self.config.convert_urns_to_lowercase
+        )
+        dataset_urn = make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=dataset_name,
+            platform_instance=self.config.platform_instance,
+            env=self.config.env,
+        )
+
+        attributes = self.get_dataset_attributes(table.metadata)
+        created: Optional[datetime] = table.metadata.get("created")
+        modified: Optional[datetime] = table.metadata.get("modified")
+        dataset_properties = DatasetPropertiesClass(
+            tags=[],
+            customProperties=attributes,
+            created=(
+                TimeStamp(time=int(created.timestamp() * 1000)) if created else None
+            ),
+            lastModified=(
+                TimeStamp(time=int(modified.timestamp() * 1000)) if modified else None
+            ),
+        )
+
+        schema_metadata = self.construct_schema_metadata(
+            name=dataset_name,
+            dataset=table,
+        )
+
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=schema_metadata,
+        ).as_workunit()
+
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=dataset_properties,
+        ).as_workunit()
+
+        yield from self.container_WU_creator.create_container_hierarchy(
+            relative_path, dataset_urn
+        )
+
+        if source_type == UriType.S3 and (
+            self.config.use_s3_bucket_tags or self.config.use_s3_object_tags
+        ):
+            yield from self.process_s3_tags(full_path, dataset_urn)
+        elif source_type == UriType.ABS and self.config.use_abs_blob_tags:
+            yield from self.process_abs_tags(full_path, dataset_urn)
+
+        if self.config.is_profiling_enabled():
+            profiler = ExcelProfiler(
+                self.config,
+                self.report,
+                table.df,
+                filename,
+                table.sheet_name,
+                dataset_urn,
+                relative_path,
+            )
+            yield from profiler.get_workunits()
+
+    def process_file(
+        self,
+        file_content: BytesIO,
+        relative_path: str,
+        full_path: str,
+        filename: str,
+        source_type: UriType,
+    ) -> Iterable[MetadataWorkUnit]:
+        self.report.report_file_processed()
+        xls = ExcelFile(filename, file_content, self.report)
+        result = xls.load_workbook()
+
+        if result:
+            for table in xls.get_tables(active_only=self.config.active_sheet_only):
+                self.report.report_worksheet_scanned()
+                dataset_name = gen_dataset_name(
+                    relative_path,
+                    table.sheet_name,
+                    self.config.convert_urns_to_lowercase,
+                )
+                if not self.config.worksheet_pattern.allowed(dataset_name):
+                    self.report.report_dropped(dataset_name)
+                    continue
+                yield from self.process_dataset(
+                    relative_path, full_path, filename, table, source_type
+                )
+
+    def check_file_is_valid(self, filename: str) -> bool:
+        self.report.report_file_scanned()
+        if not self.config.path_pattern.allowed(filename):
+            self.report.report_dropped(filename)
+            return False
+        elif not self.is_excel_file(filename):
+            logger.debug(f"File is not an Excel workbook: {filename}")
+            return False
+        return True
+
+    def retrieve_file_data(
+        self, uri_type: UriType, path: str, path_spec: str
+    ) -> Iterator[Tuple[BytesIO, str, str, str]]:
+        if (
+            uri_type == UriType.LOCAL_FILE
+            or uri_type == UriType.ABSOLUTE_PATH
+            or uri_type == UriType.RELATIVE_PATH
+        ):
+            logger.debug(f"Searching local path: {path}")
+            for browse_path in self.local_browser(path):
+                if self.check_file_is_valid(browse_path.file):
+                    basename = os.path.basename(browse_path.file)
+                    file_path = self.strip_file_prefix(browse_path.file)
+                    filename = os.path.splitext(basename)[0]
+
+                    logger.debug(f"Processing {filename}")
+                    with self.report.local_file_get_timer:
+                        file_data = self.get_local_file(browse_path.file)
+
+                    if file_data is not None:
+                        yield file_data, file_path, browse_path.file, filename
+
+        elif uri_type == UriType.S3:
+            logger.debug(f"Searching S3 path: {path}")
+            for browse_path in self.s3_browser(path_spec):
+                if self.check_file_is_valid(browse_path.file):
+                    uri_path = strip_s3_prefix(browse_path.file)
+                    basename = os.path.basename(uri_path)
+                    filename = os.path.splitext(basename)[0]
+
+                    logger.debug(f"Processing {browse_path.file}")
+                    with self.report.s3_file_get_timer:
+                        file_data = self.get_s3_file(browse_path.file)
+
+                    if file_data is not None:
+                        yield file_data, uri_path, browse_path.file, filename
+
+        elif uri_type == UriType.ABS:
+            logger.debug(f"Searching Azure Blob Storage path: {path}")
+            for browse_path in self.abs_browser(path_spec):
+                if self.check_file_is_valid(browse_path.file):
+                    uri_path = strip_abs_prefix(browse_path.file)
+                    basename = os.path.basename(uri_path)
+                    filename = os.path.splitext(basename)[0]
+
+                    logger.debug(f"Processing {browse_path.file}")
+                    with self.report.abs_file_get_timer:
+                        file_data = self.get_abs_file(browse_path.file)
+
+                    if file_data is not None:
+                        yield file_data, uri_path, browse_path.file, filename
+
+        else:
+            self.report.report_file_dropped(path_spec)
+            self.report.warning(
+                message="Unsupported URI Type",
+                context=f"Type={uri_type.name},URI={path_spec}",
+            )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        self.container_WU_creator = ContainerWUCreator(
+            self.platform,
+            self.config.platform_instance,
+            self.config.env,
+        )
+
+        with PerfTimer() as timer:
+            for path_spec in self.config.path_list:
+                logger.debug(f"Processing path: {path_spec}")
+                uri_type, path = self.uri_type(path_spec)
+                logger.debug(f"URI Type: {uri_type} Path: {path}")
+
+                for (
+                    file_data,
+                    relative_path,
+                    full_path,
+                    filename,
+                ) in self.retrieve_file_data(uri_type, path, path_spec):
+                    yield from self.process_file(
+                        file_data, relative_path, full_path, filename, uri_type
+                    )
+
+            time_taken = timer.elapsed_seconds()
+            logger.info(f"Finished ingestion in {time_taken:.3f} seconds")
+
+    def get_report(self):
+        return self.report
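
The uri_type helper above decides how each configured path is browsed (local filesystem, S3, or Azure Blob Storage) and returns the path with its scheme prefix stripped. A minimal usage sketch with hypothetical paths, following the branch logic shown in the diff:

    from datahub.ingestion.source.excel.source import ExcelSource, UriType

    # Hypothetical paths; the expected results follow the uri_type branches above.
    assert ExcelSource.uri_type("s3://bucket/reports/*.xlsx") == (
        UriType.S3, "bucket/reports/*.xlsx"
    )
    assert ExcelSource.uri_type("https://myacct.blob.core.windows.net/data/q1.xlsx") == (
        UriType.ABS, "myacct.blob.core.windows.net/data/q1.xlsx"
    )
    assert ExcelSource.uri_type("/srv/excel/**/*.xlsx") == (
        UriType.ABSOLUTE_PATH, "/srv/excel/**/*.xlsx"
    )
    assert ExcelSource.uri_type("reports/q1.xlsx") == (
        UriType.RELATIVE_PATH, "reports/q1.xlsx"
    )
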
datahub/ingestion/source/excel/util.py (new file)

@@ -0,0 +1,18 @@
+import os
+
+
+def gen_dataset_name(path: str, sheet_name: str, lower_case: bool) -> str:
+    sheet_name = sheet_name.strip()
+    directory, filename = os.path.split(path)
+
+    if not directory:
+        excel_path = f"[{filename}]"
+    else:
+        excel_path = os.path.join(directory, f"[{filename}]")
+
+    name = f"{excel_path}{sheet_name}"
+
+    if lower_case:
+        name = name.lower()
+
+    return name
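
gen_dataset_name above builds the dataset name by wrapping the workbook file name in square brackets and appending the (stripped) sheet name; with lower_case=True the whole name is lowercased. A brief sketch with hypothetical inputs (on a POSIX host, where os.path.join uses "/"):

    from datahub.ingestion.source.excel.util import gen_dataset_name

    assert gen_dataset_name("q1.xlsx", "Sheet1", lower_case=False) == "[q1.xlsx]Sheet1"
    assert (
        gen_dataset_name("finance/reports/q1.xlsx", " Summary ", lower_case=True)
        == "finance/reports/[q1.xlsx]summary"
    )
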
datahub/ingestion/source/fivetran/fivetran_query.py

@@ -18,7 +18,14 @@ class FivetranLogQuery:
         return f"use database {db_name}"
 
     def set_schema(self, schema_name: str) -> None:
-        self.schema_clause = f"{schema_name}."
+        """
+        Using Snowflake quoted identifiers convention
+
+        Add double quotes around an identifier
+        Use two quotes to use the double quote character inside a quoted identifier
+        """
+        schema_name = schema_name.replace('"', '""')
+        self.schema_clause = f'"{schema_name}".'
 
     def get_connectors_query(self) -> str:
         return f"""\
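
The new set_schema body applies Snowflake's quoted-identifier convention: the schema name is wrapped in double quotes and any embedded double quote is doubled, so mixed-case or special-character schema names survive quoting. A small sketch of the resulting schema clause, assuming FivetranLogQuery can still be constructed with no arguments as in the current source:

    from datahub.ingestion.source.fivetran.fivetran_query import FivetranLogQuery

    q = FivetranLogQuery()
    q.set_schema("fivetran_log")
    assert q.schema_clause == '"fivetran_log".'

    # An embedded double quote is escaped by doubling it.
    q.set_schema('my"schema')
    assert q.schema_clause == '"my""schema".'
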
datahub/ingestion/source/openapi.py

@@ -333,7 +333,7 @@ class APISource(Source, ABC):
                     ),
                 )
                 yield wu
-            elif endpoint_dets["method"] != "
+            elif endpoint_dets["method"] != "GET":
                 self.report.report_warning(
                     title="Failed to Extract Endpoint Metadata",
                     message=f"No example provided for {endpoint_dets['method']}",