acryl-datahub 1.2.0.6rc1__py3-none-any.whl → 1.2.0.7__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

This version of acryl-datahub might be problematic.

Files changed (84)
  1. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/METADATA +2562 -2476
  2. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/RECORD +83 -75
  3. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/graphql/operation.py +1 -1
  6. datahub/ingestion/autogenerated/capability_summary.json +46 -6
  7. datahub/ingestion/autogenerated/lineage.json +3 -2
  8. datahub/ingestion/run/pipeline.py +1 -0
  9. datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
  10. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  11. datahub/ingestion/source/common/subtypes.py +3 -0
  12. datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
  13. datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
  14. datahub/ingestion/source/dbt/dbt_common.py +74 -0
  15. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  16. datahub/ingestion/source/dremio/dremio_source.py +4 -0
  17. datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
  18. datahub/ingestion/source/excel/__init__.py +0 -0
  19. datahub/ingestion/source/excel/config.py +92 -0
  20. datahub/ingestion/source/excel/excel_file.py +539 -0
  21. datahub/ingestion/source/excel/profiling.py +308 -0
  22. datahub/ingestion/source/excel/report.py +49 -0
  23. datahub/ingestion/source/excel/source.py +662 -0
  24. datahub/ingestion/source/excel/util.py +18 -0
  25. datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
  26. datahub/ingestion/source/openapi.py +1 -1
  27. datahub/ingestion/source/powerbi/config.py +33 -0
  28. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  29. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  30. datahub/ingestion/source/powerbi/powerbi.py +5 -0
  31. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
  32. datahub/ingestion/source/redshift/config.py +9 -6
  33. datahub/ingestion/source/redshift/lineage.py +386 -687
  34. datahub/ingestion/source/redshift/redshift.py +19 -106
  35. datahub/ingestion/source/s3/source.py +65 -59
  36. datahub/ingestion/source/snowflake/constants.py +2 -0
  37. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  38. datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
  39. datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
  40. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  41. datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
  42. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -7
  43. datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
  44. datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
  45. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -1
  46. datahub/ingestion/source/sql/hive_metastore.py +1 -0
  47. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  48. datahub/ingestion/source/sql/mssql/source.py +62 -3
  49. datahub/ingestion/source/sql_queries.py +24 -2
  50. datahub/ingestion/source/state/checkpoint.py +3 -28
  51. datahub/ingestion/source/unity/config.py +74 -9
  52. datahub/ingestion/source/unity/proxy.py +167 -5
  53. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  54. datahub/ingestion/source/unity/proxy_types.py +24 -0
  55. datahub/ingestion/source/unity/report.py +5 -0
  56. datahub/ingestion/source/unity/source.py +111 -1
  57. datahub/ingestion/source/usage/usage_common.py +1 -0
  58. datahub/metadata/_internal_schema_classes.py +573 -517
  59. datahub/metadata/_urns/urn_defs.py +1748 -1748
  60. datahub/metadata/schema.avsc +18564 -18484
  61. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  62. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
  63. datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
  64. datahub/metadata/schemas/LogicalParent.avsc +104 -100
  65. datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
  66. datahub/metadata/schemas/Ownership.avsc +69 -0
  67. datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
  68. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  69. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
  70. datahub/metadata/schemas/__init__.py +3 -3
  71. datahub/sdk/chart.py +36 -22
  72. datahub/sdk/dashboard.py +38 -62
  73. datahub/sdk/lineage_client.py +6 -26
  74. datahub/sdk/main_client.py +7 -3
  75. datahub/sdk/search_filters.py +16 -0
  76. datahub/specific/aspect_helpers/siblings.py +73 -0
  77. datahub/specific/dataset.py +2 -0
  78. datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
  79. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  80. datahub/upgrade/upgrade.py +14 -2
  81. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  82. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL +0 -0
  83. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE +0 -0
  84. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/excel/source.py
@@ -0,0 +1,662 @@
+ import glob
+ import io
+ import logging
+ import os
+ import re
+ from datetime import datetime, timezone
+ from enum import Enum, auto
+ from io import BytesIO
+ from pathlib import PurePath
+ from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
+ from urllib.parse import urlparse
+
+ from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.ingestion.api.common import PipelineContext
+ from datahub.ingestion.api.decorators import (
+     SupportStatus,
+     capability,
+     config_class,
+     platform_name,
+     support_status,
+ )
+ from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
+ from datahub.ingestion.source.aws.s3_boto_utils import get_s3_tags
+ from datahub.ingestion.source.aws.s3_util import (
+     get_bucket_name,
+     get_bucket_relative_path,
+     strip_s3_prefix,
+ )
+ from datahub.ingestion.source.azure.abs_folder_utils import (
+     get_abs_tags,
+ )
+ from datahub.ingestion.source.azure.abs_utils import (
+     get_container_relative_path,
+     strip_abs_prefix,
+ )
+ from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
+ from datahub.ingestion.source.excel.config import ExcelSourceConfig
+ from datahub.ingestion.source.excel.excel_file import ExcelFile, ExcelTable
+ from datahub.ingestion.source.excel.profiling import ExcelProfiler
+ from datahub.ingestion.source.excel.report import ExcelSourceReport
+ from datahub.ingestion.source.excel.util import gen_dataset_name
+ from datahub.ingestion.source.s3.source import BrowsePath
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
+     StaleEntityRemovalHandler,
+ )
+ from datahub.ingestion.source.state.stateful_ingestion_base import (
+     StatefulIngestionSourceBase,
+ )
+ from datahub.metadata.com.linkedin.pegasus2avro.common import TimeStamp
+ from datahub.metadata.schema_classes import (
+     BooleanTypeClass,
+     ChangeTypeClass,
+     DatasetPropertiesClass,
+     DateTypeClass,
+     GlobalTagsClass,
+     NullTypeClass,
+     NumberTypeClass,
+     OtherSchemaClass,
+     RecordTypeClass,
+     SchemaFieldClass as SchemaField,
+     SchemaFieldDataTypeClass as SchemaFieldDataType,
+     SchemaMetadataClass as SchemaMetadata,
+     StringTypeClass,
+ )
+ from datahub.utilities.perf_timer import PerfTimer
+
+ logger: logging.Logger = logging.getLogger(__name__)
+
+ field_type_mapping: Dict[str, Type] = {
+     "int8": NumberTypeClass,
+     "int16": NumberTypeClass,
+     "int32": NumberTypeClass,
+     "int64": NumberTypeClass,
+     "uint8": NumberTypeClass,
+     "uint16": NumberTypeClass,
+     "uint32": NumberTypeClass,
+     "uint64": NumberTypeClass,
+     "Int8": NumberTypeClass,
+     "Int16": NumberTypeClass,
+     "Int32": NumberTypeClass,
+     "Int64": NumberTypeClass,
+     "UInt8": NumberTypeClass,
+     "UInt16": NumberTypeClass,
+     "UInt32": NumberTypeClass,
+     "UInt64": NumberTypeClass,
+     "intp": NumberTypeClass,
+     "uintp": NumberTypeClass,
+     "float16": NumberTypeClass,
+     "float32": NumberTypeClass,
+     "float64": NumberTypeClass,
+     "float128": NumberTypeClass,
+     "Float32": NumberTypeClass,
+     "Float64": NumberTypeClass,
+     "complex64": NumberTypeClass,
+     "complex128": NumberTypeClass,
+     "complex256": NumberTypeClass,
+     "bool": BooleanTypeClass,
+     "boolean": BooleanTypeClass,
+     "object": StringTypeClass,
+     "string": StringTypeClass,
+     "datetime64": DateTypeClass,
+     "datetime64[ns]": DateTypeClass,
+     "datetime64[ns, tz]": DateTypeClass,
+     "timedelta64": DateTypeClass,
+     "timedelta64[ns]": DateTypeClass,
+     "period": DateTypeClass,
+     "period[D]": DateTypeClass,
+     "period[M]": DateTypeClass,
+     "period[Y]": DateTypeClass,
+     "category": RecordTypeClass,
+     "interval": RecordTypeClass,
+     "sparse": RecordTypeClass,
+     "NA": NullTypeClass,
+ }
+
+
+ ALLOWED_EXTENSIONS = [".xlsx", ".xlsm", ".xltx", ".xltm"]
+
+
+ class UriType(Enum):
+     HTTP = auto()
+     HTTPS = auto()
+     LOCAL_FILE = auto()
+     ABSOLUTE_PATH = auto()
+     RELATIVE_PATH = auto()
+     S3 = auto()
+     S3A = auto()
+     ABS = auto()
+     UNKNOWN = auto()
+
+
+ @platform_name("Excel")
+ @config_class(ExcelSourceConfig)
+ @support_status(SupportStatus.INCUBATING)
+ @capability(SourceCapability.CONTAINERS, "Enabled by default")
+ @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
+ @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
+ @capability(
+     SourceCapability.DELETION_DETECTION,
+     "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+     supported=True,
+ )
+ class ExcelSource(StatefulIngestionSourceBase):
+     config: ExcelSourceConfig
+     report: ExcelSourceReport
+     container_WU_creator: ContainerWUCreator
+     platform: str = "excel"
+
+     def __init__(self, ctx: PipelineContext, config: ExcelSourceConfig):
+         super().__init__(config, ctx)
+         self.ctx = ctx
+         self.config = config
+         self.report: ExcelSourceReport = ExcelSourceReport()
+
+     @classmethod
+     def create(cls, config_dict: dict, ctx: PipelineContext) -> "ExcelSource":
+         config = ExcelSourceConfig.parse_obj(config_dict)
+         return cls(ctx, config)
+
+     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+         return [
+             *super().get_workunit_processors(),
+             StaleEntityRemovalHandler.create(
+                 self, self.config, self.ctx
+             ).workunit_processor,
+         ]
+
+     @staticmethod
+     def uri_type(uri: str) -> Tuple[UriType, str]:
+         if not uri or not isinstance(uri, str):
+             return UriType.UNKNOWN, ""
+
+         uri = uri.strip()
+         parsed = urlparse(uri)
+         scheme = parsed.scheme.lower()
+
+         if scheme == "http":
+             return UriType.HTTP, uri[7:]
+         elif scheme == "https":
+             if parsed.netloc and ".blob.core.windows.net" in parsed.netloc:
+                 return UriType.ABS, uri[8:]
+             else:
+                 return UriType.HTTPS, uri[8:]
+         elif scheme == "file":
+             if uri.startswith("file:///"):
+                 return UriType.LOCAL_FILE, uri[7:]
+
+         if scheme == "s3":
+             return UriType.S3, uri[5:]
+         elif scheme == "s3a":
+             return UriType.S3A, uri[6:]
+
+         if scheme:
+             return UriType.UNKNOWN, uri[len(scheme) + 3 :]
+
+         if os.path.isabs(uri):
+             return UriType.ABSOLUTE_PATH, uri
+         else:
+             return UriType.RELATIVE_PATH, uri
+
+     @staticmethod
+     def is_excel_file(path: str) -> bool:
+         _, ext = os.path.splitext(path)
+         return ext.lower() in ALLOWED_EXTENSIONS
+
+     @staticmethod
+     def local_browser(path_spec: str) -> Iterable[BrowsePath]:
+         matching_paths = glob.glob(path_spec, recursive=True)
+         matching_files = [path for path in matching_paths if os.path.isfile(path)]
+
+         for file in sorted(matching_files):
+             full_path = PurePath(os.path.normpath(file)).as_posix()
+             yield BrowsePath(
+                 file=full_path,
+                 timestamp=datetime.fromtimestamp(
+                     os.path.getmtime(full_path), timezone.utc
+                 ),
+                 size=os.path.getsize(full_path),
+                 partitions=[],
+             )
+
+     def get_local_file(self, file_path: str) -> Union[BytesIO, None]:
+         try:
+             with open(file_path, "rb") as f:
+                 bytes_io = io.BytesIO(f.read())
+                 bytes_io.seek(0)
+                 return bytes_io
+         except Exception as e:
+             self.report.report_file_dropped(file_path)
+             self.report.warning(
+                 message="Error reading local Excel file",
+                 context=f"Path={file_path}",
+                 exc=e,
+             )
+             return None
+
+     @staticmethod
+     def get_prefix(relative_path: str) -> str:
+         index = re.search(r"[*|{]", relative_path)
+         if index:
+             return relative_path[: index.start()]
+         else:
+             return relative_path
+
+     @staticmethod
+     def create_s3_path(bucket_name: str, key: str) -> str:
+         return f"s3://{bucket_name}/{key}"
+
+     def create_abs_path(self, key: str) -> str:
+         if self.config.azure_config:
+             account_name = self.config.azure_config.account_name
+             container_name = self.config.azure_config.container_name
+             return (
+                 f"https://{account_name}.blob.core.windows.net/{container_name}/{key}"
+             )
+         return ""
+
+     @staticmethod
+     def strip_file_prefix(path: str) -> str:
+         if path.startswith("/"):
+             return path[1:]
+         else:
+             return path
+
+     def s3_browser(self, path_spec: str) -> Iterable[BrowsePath]:
+         if self.config.aws_config is None:
+             raise ValueError("aws_config not set. Cannot browse s3")
+         s3 = self.config.aws_config.get_s3_resource(self.config.verify_ssl)
+         bucket_name = get_bucket_name(path_spec)
+         logger.debug(f"Scanning bucket: {bucket_name}")
+         bucket = s3.Bucket(bucket_name)
+         prefix = self.get_prefix(get_bucket_relative_path(path_spec))
+         logger.debug(f"Scanning objects with prefix:{prefix}")
+
+         for obj in bucket.objects.filter(Prefix=prefix).page_size(1000):
+             s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+             logger.debug(f"Path: {s3_path}")
+
+             yield BrowsePath(
+                 file=s3_path,
+                 timestamp=obj.last_modified,
+                 size=obj.size,
+                 partitions=[],
+                 content_type=None,
+             )
+
+     def get_s3_file(self, path_spec: str) -> Union[BytesIO, None]:
+         if self.config.aws_config is None:
+             raise ValueError("aws_config not set. Cannot browse s3")
+         s3 = self.config.aws_config.get_s3_resource(self.config.verify_ssl)
+         bucket_name = get_bucket_name(path_spec)
+         key = get_bucket_relative_path(path_spec)
+         logger.debug(f"Getting file: {key} from bucket: {bucket_name}")
+         try:
+             obj = s3.Object(bucket_name, key)
+             file_content = obj.get()["Body"].read()
+             binary_stream = io.BytesIO(file_content)
+             binary_stream.seek(0)
+             return binary_stream
+         except Exception as e:
+             self.report.report_file_dropped(path_spec)
+             self.report.warning(
+                 message="Error reading Excel file from S3",
+                 context=f"Path={path_spec}",
+                 exc=e,
+             )
+             return None
+
+     def process_s3_tags(
+         self, path_spec: str, dataset_urn: str
+     ) -> Iterable[MetadataWorkUnit]:
+         bucket_name = get_bucket_name(path_spec)
+         key = get_bucket_relative_path(path_spec)
+
+         s3_tags = get_s3_tags(
+             bucket_name,
+             key,
+             dataset_urn,
+             self.config.aws_config,
+             self.ctx,
+             self.config.use_s3_bucket_tags,
+             self.config.use_s3_object_tags,
+             self.config.verify_ssl,
+         )
+
+         if s3_tags:
+             yield from self.process_global_tags(s3_tags, dataset_urn)
+
+     def abs_browser(self, path_spec: str) -> Iterable[BrowsePath]:
+         if self.config.azure_config is None:
+             raise ValueError("azure_config not set. Cannot browse Azure Blob Storage")
+         abs_blob_service_client = self.config.azure_config.get_blob_service_client()
+         container_client = abs_blob_service_client.get_container_client(
+             self.config.azure_config.container_name
+         )
+
+         container_name = self.config.azure_config.container_name
+         logger.debug(f"Scanning container: {container_name}")
+
+         prefix = self.get_prefix(get_container_relative_path(path_spec))
+         logger.debug(f"Scanning objects with prefix: {prefix}")
+
+         for obj in container_client.list_blobs(
+             name_starts_with=f"{prefix}", results_per_page=1000
+         ):
+             abs_path = self.create_abs_path(obj.name)
+             logger.debug(f"Path: {abs_path}")
+
+             yield BrowsePath(
+                 file=abs_path,
+                 timestamp=obj.last_modified,
+                 size=obj.size,
+                 partitions=[],
+                 content_type=None,
+             )
+
+     def get_abs_file(self, path_spec: str) -> Union[BytesIO, None]:
+         if self.config.azure_config is None:
+             raise ValueError("azure_config not set. Cannot browse Azure Blob Storage")
+         abs_blob_service_client = self.config.azure_config.get_blob_service_client()
+         container_client = abs_blob_service_client.get_container_client(
+             self.config.azure_config.container_name
+         )
+
+         container_name = self.config.azure_config.container_name
+         blob_path = get_container_relative_path(path_spec)
+         logger.debug(f"Getting file: {blob_path} from container: {container_name}")
+
+         try:
+             blob_client = container_client.get_blob_client(blob_path)
+             download_stream = blob_client.download_blob()
+             file_content = download_stream.readall()
+             binary_stream = io.BytesIO(file_content)
+             binary_stream.seek(0)
+             return binary_stream
+         except Exception as e:
+             self.report.report_file_dropped(path_spec)
+             self.report.warning(
+                 message="Error reading Excel file from Azure Blob Storage",
+                 context=f"Path={path_spec}",
+                 exc=e,
+             )
+             return None
+
+     def process_abs_tags(
+         self, path_spec: str, dataset_urn: str
+     ) -> Iterable[MetadataWorkUnit]:
+         if (
+             self.config.azure_config
+             and self.config.azure_config.container_name is not None
+         ):
+             container_name = self.config.azure_config.container_name
+             blob_path = get_container_relative_path(path_spec)
+
+             abs_tags = get_abs_tags(
+                 container_name,
+                 blob_path,
+                 dataset_urn,
+                 self.config.azure_config,
+                 self.ctx,
+                 self.config.use_abs_blob_tags,
+             )
+
+             if abs_tags:
+                 yield from self.process_global_tags(abs_tags, dataset_urn)
+
+     @staticmethod
+     def get_field_type(field_type: str) -> SchemaFieldDataType:
+         type_class = field_type_mapping.get(field_type, NullTypeClass)
+         return SchemaFieldDataType(type=type_class())
+
+     def construct_schema_field(self, f_name: str, f_type: str) -> SchemaField:
+         logger.debug(f"Field: {f_name} Type: {f_type}")
+         return SchemaField(
+             fieldPath=f_name,
+             nativeDataType=f_type,
+             type=self.get_field_type(f_type),
+             description=None,
+             nullable=False,
+             recursive=False,
+         )
+
+     def construct_schema_metadata(
+         self,
+         name: str,
+         dataset: ExcelTable,
+     ) -> SchemaMetadata:
+         canonical_schema: List[SchemaField] = []
+
+         # Get data types for each column
+         data_types = dataset.df.dtypes.to_dict()
+
+         # Convert numpy types to string representation for better readability
+         data_types = {col: str(dtype) for col, dtype in data_types.items()}
+
+         for f_name, f_type in data_types.items():
+             canonical_schema.append(self.construct_schema_field(f_name, f_type))
+
+         return SchemaMetadata(
+             schemaName=name,
+             platform=f"urn:li:dataPlatform:{self.platform}",
+             version=0,
+             hash="",
+             platformSchema=OtherSchemaClass(rawSchema=""),
+             fields=canonical_schema,
+         )
+
+     @staticmethod
+     def get_dataset_attributes(metadata: Dict[str, Any]) -> dict:
+         result = {}
+         for key, value in metadata.items():
+             result[key] = str(value)
+         return result
+
+     @staticmethod
+     def process_global_tags(
+         global_tags: GlobalTagsClass, dataset_urn: str
+     ) -> Iterable[MetadataWorkUnit]:
+         yield MetadataChangeProposalWrapper(
+             entityType="dataset",
+             entityUrn=dataset_urn,
+             aspect=global_tags,
+             changeType=ChangeTypeClass.UPSERT,
+         ).as_workunit()
+
+     def process_dataset(
+         self,
+         relative_path: str,
+         full_path: str,
+         filename: str,
+         table: ExcelTable,
+         source_type: UriType,
+     ) -> Iterable[MetadataWorkUnit]:
+         self.report.report_worksheet_processed()
+         dataset_name = gen_dataset_name(
+             relative_path, table.sheet_name, self.config.convert_urns_to_lowercase
+         )
+         dataset_urn = make_dataset_urn_with_platform_instance(
+             platform=self.platform,
+             name=dataset_name,
+             platform_instance=self.config.platform_instance,
+             env=self.config.env,
+         )
+
+         attributes = self.get_dataset_attributes(table.metadata)
+         created: Optional[datetime] = table.metadata.get("created")
+         modified: Optional[datetime] = table.metadata.get("modified")
+         dataset_properties = DatasetPropertiesClass(
+             tags=[],
+             customProperties=attributes,
+             created=(
+                 TimeStamp(time=int(created.timestamp() * 1000)) if created else None
+             ),
+             lastModified=(
+                 TimeStamp(time=int(modified.timestamp() * 1000)) if modified else None
+             ),
+         )
+
+         schema_metadata = self.construct_schema_metadata(
+             name=dataset_name,
+             dataset=table,
+         )
+
+         yield MetadataChangeProposalWrapper(
+             entityUrn=dataset_urn,
+             aspect=schema_metadata,
+         ).as_workunit()
+
+         yield MetadataChangeProposalWrapper(
+             entityUrn=dataset_urn,
+             aspect=dataset_properties,
+         ).as_workunit()
+
+         yield from self.container_WU_creator.create_container_hierarchy(
+             relative_path, dataset_urn
+         )
+
+         if source_type == UriType.S3 and (
+             self.config.use_s3_bucket_tags or self.config.use_s3_object_tags
+         ):
+             yield from self.process_s3_tags(full_path, dataset_urn)
+         elif source_type == UriType.ABS and self.config.use_abs_blob_tags:
+             yield from self.process_abs_tags(full_path, dataset_urn)
+
+         if self.config.is_profiling_enabled():
+             profiler = ExcelProfiler(
+                 self.config,
+                 self.report,
+                 table.df,
+                 filename,
+                 table.sheet_name,
+                 dataset_urn,
+                 relative_path,
+             )
+             yield from profiler.get_workunits()
+
+     def process_file(
+         self,
+         file_content: BytesIO,
+         relative_path: str,
+         full_path: str,
+         filename: str,
+         source_type: UriType,
+     ) -> Iterable[MetadataWorkUnit]:
+         self.report.report_file_processed()
+         xls = ExcelFile(filename, file_content, self.report)
+         result = xls.load_workbook()
+
+         if result:
+             for table in xls.get_tables(active_only=self.config.active_sheet_only):
+                 self.report.report_worksheet_scanned()
+                 dataset_name = gen_dataset_name(
+                     relative_path,
+                     table.sheet_name,
+                     self.config.convert_urns_to_lowercase,
+                 )
+                 if not self.config.worksheet_pattern.allowed(dataset_name):
+                     self.report.report_dropped(dataset_name)
+                     continue
+                 yield from self.process_dataset(
+                     relative_path, full_path, filename, table, source_type
+                 )
+
+     def check_file_is_valid(self, filename: str) -> bool:
+         self.report.report_file_scanned()
+         if not self.config.path_pattern.allowed(filename):
+             self.report.report_dropped(filename)
+             return False
+         elif not self.is_excel_file(filename):
+             logger.debug(f"File is not an Excel workbook: {filename}")
+             return False
+         return True
+
+     def retrieve_file_data(
+         self, uri_type: UriType, path: str, path_spec: str
+     ) -> Iterator[Tuple[BytesIO, str, str, str]]:
+         if (
+             uri_type == UriType.LOCAL_FILE
+             or uri_type == UriType.ABSOLUTE_PATH
+             or uri_type == UriType.RELATIVE_PATH
+         ):
+             logger.debug(f"Searching local path: {path}")
+             for browse_path in self.local_browser(path):
+                 if self.check_file_is_valid(browse_path.file):
+                     basename = os.path.basename(browse_path.file)
+                     file_path = self.strip_file_prefix(browse_path.file)
+                     filename = os.path.splitext(basename)[0]
+
+                     logger.debug(f"Processing {filename}")
+                     with self.report.local_file_get_timer:
+                         file_data = self.get_local_file(browse_path.file)
+
+                     if file_data is not None:
+                         yield file_data, file_path, browse_path.file, filename
+
+         elif uri_type == UriType.S3:
+             logger.debug(f"Searching S3 path: {path}")
+             for browse_path in self.s3_browser(path_spec):
+                 if self.check_file_is_valid(browse_path.file):
+                     uri_path = strip_s3_prefix(browse_path.file)
+                     basename = os.path.basename(uri_path)
+                     filename = os.path.splitext(basename)[0]
+
+                     logger.debug(f"Processing {browse_path.file}")
+                     with self.report.s3_file_get_timer:
+                         file_data = self.get_s3_file(browse_path.file)
+
+                     if file_data is not None:
+                         yield file_data, uri_path, browse_path.file, filename
+
+         elif uri_type == UriType.ABS:
+             logger.debug(f"Searching Azure Blob Storage path: {path}")
+             for browse_path in self.abs_browser(path_spec):
+                 if self.check_file_is_valid(browse_path.file):
+                     uri_path = strip_abs_prefix(browse_path.file)
+                     basename = os.path.basename(uri_path)
+                     filename = os.path.splitext(basename)[0]
+
+                     logger.debug(f"Processing {browse_path.file}")
+                     with self.report.abs_file_get_timer:
+                         file_data = self.get_abs_file(browse_path.file)
+
+                     if file_data is not None:
+                         yield file_data, uri_path, browse_path.file, filename
+
+         else:
+             self.report.report_file_dropped(path_spec)
+             self.report.warning(
+                 message="Unsupported URI Type",
+                 context=f"Type={uri_type.name},URI={path_spec}",
+             )
+
+     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+         self.container_WU_creator = ContainerWUCreator(
+             self.platform,
+             self.config.platform_instance,
+             self.config.env,
+         )
+
+         with PerfTimer() as timer:
+             for path_spec in self.config.path_list:
+                 logger.debug(f"Processing path: {path_spec}")
+                 uri_type, path = self.uri_type(path_spec)
+                 logger.debug(f"URI Type: {uri_type} Path: {path}")
+
+                 for (
+                     file_data,
+                     relative_path,
+                     full_path,
+                     filename,
+                 ) in self.retrieve_file_data(uri_type, path, path_spec):
+                     yield from self.process_file(
+                         file_data, relative_path, full_path, filename, uri_type
+                     )
+
+         time_taken = timer.elapsed_seconds()
+         logger.info(f"Finished ingestion in {time_taken:.3f} seconds")
+
+     def get_report(self):
+         return self.report
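
Taken together, the new module maps each allowed worksheet to a DataHub dataset, inferring the schema from pandas dtypes via field_type_mapping and optionally attaching S3/ABS tags and profiles. A minimal recipe sketch for driving it through the standard ingestion pipeline follows; the "excel" source type name follows the platform_name above and the new entry point registered in this release, while the path_list and profiling keys are assumptions about ExcelSourceConfig (defined in config.py, which is not reproduced in this diff):

# Hypothetical recipe sketch; config keys other than "type" are assumptions.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "excel",
            "config": {
                # Local globs, s3:// URIs, and https://<account>.blob.core.windows.net/
                # paths are all dispatched through ExcelSource.uri_type() above.
                "path_list": ["s3://my-bucket/finance/*.xlsx"],
                "profiling": {"enabled": False},
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()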
datahub/ingestion/source/excel/util.py
@@ -0,0 +1,18 @@
+ import os
+
+
+ def gen_dataset_name(path: str, sheet_name: str, lower_case: bool) -> str:
+     sheet_name = sheet_name.strip()
+     directory, filename = os.path.split(path)
+
+     if not directory:
+         excel_path = f"[{filename}]"
+     else:
+         excel_path = os.path.join(directory, f"[{filename}]")
+
+     name = f"{excel_path}{sheet_name}"
+
+     if lower_case:
+         name = name.lower()
+
+     return name
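
Worked through by hand, the naming helper brackets the file name and appends the sheet name, so (illustrative paths and sheet names, assuming POSIX separators):

# Illustrative calls; the inputs are made up.
gen_dataset_name("reports/q1.xlsx", "Summary", lower_case=False)
# -> "reports/[q1.xlsx]Summary"
gen_dataset_name("q1.xlsx", "Summary", lower_case=True)
# -> "[q1.xlsx]summary"   (no directory component; lower-cased when convert_urns_to_lowercase is set)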
datahub/ingestion/source/fivetran/fivetran_query.py
@@ -18,7 +18,14 @@ class FivetranLogQuery:
          return f"use database {db_name}"

      def set_schema(self, schema_name: str) -> None:
-         self.schema_clause = f"{schema_name}."
+         """
+         Using Snowflake quoted identifiers convention
+
+         Add double quotes around an identifier
+         Use two quotes to use the double quote character inside a quoted identifier
+         """
+         schema_name = schema_name.replace('"', '""')
+         self.schema_clause = f'"{schema_name}".'

      def get_connectors_query(self) -> str:
          return f"""\
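
The effect of the new quoting logic is easy to see in isolation; a standalone sketch of the behaviour (not the class itself, with made-up schema names):

# Standalone re-implementation of the new behaviour, for illustration only.
def quoted_schema_clause(schema_name: str) -> str:
    schema_name = schema_name.replace('"', '""')  # escape embedded double quotes
    return f'"{schema_name}".'

quoted_schema_clause("fivetran_log")   # -> '"fivetran_log".'
quoted_schema_clause('my"schema')      # -> '"my""schema".'
# The previous code emitted the clause unquoted (e.g. 'fivetran_log.'), which only
# works for schema names that need no quoting in Snowflake.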
datahub/ingestion/source/openapi.py
@@ -333,7 +333,7 @@ class APISource(Source, ABC):
                      ),
                  )
                  yield wu
-             elif endpoint_dets["method"] != "get":
+             elif endpoint_dets["method"] != "GET":
                  self.report.report_warning(
                      title="Failed to Extract Endpoint Metadata",
                      message=f"No example provided for {endpoint_dets['method']}",