acryl-datahub 1.2.0.6-py3-none-any.whl → 1.2.0.7rc1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub may be problematic.

Files changed (63)
  1. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/METADATA +2693 -2630
  2. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/RECORD +63 -55
  3. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/graphql/operation.py +1 -1
  6. datahub/ingestion/autogenerated/capability_summary.json +45 -5
  7. datahub/ingestion/autogenerated/lineage.json +3 -2
  8. datahub/ingestion/run/pipeline.py +1 -0
  9. datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
  10. datahub/ingestion/source/common/subtypes.py +3 -0
  11. datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
  12. datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
  13. datahub/ingestion/source/dbt/dbt_common.py +74 -0
  14. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  15. datahub/ingestion/source/dremio/dremio_source.py +4 -0
  16. datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
  17. datahub/ingestion/source/excel/__init__.py +0 -0
  18. datahub/ingestion/source/excel/config.py +92 -0
  19. datahub/ingestion/source/excel/excel_file.py +539 -0
  20. datahub/ingestion/source/excel/profiling.py +308 -0
  21. datahub/ingestion/source/excel/report.py +49 -0
  22. datahub/ingestion/source/excel/source.py +662 -0
  23. datahub/ingestion/source/excel/util.py +18 -0
  24. datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
  25. datahub/ingestion/source/openapi.py +1 -1
  26. datahub/ingestion/source/powerbi/config.py +33 -0
  27. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  28. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  29. datahub/ingestion/source/powerbi/powerbi.py +5 -0
  30. datahub/ingestion/source/s3/source.py +65 -59
  31. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  32. datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
  33. datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
  34. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  35. datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
  36. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +21 -6
  37. datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
  38. datahub/ingestion/source/snowflake/snowflake_v2.py +4 -1
  39. datahub/ingestion/source/sql/hive_metastore.py +1 -0
  40. datahub/ingestion/source/sql_queries.py +24 -2
  41. datahub/ingestion/source/state/checkpoint.py +3 -28
  42. datahub/metadata/_internal_schema_classes.py +568 -512
  43. datahub/metadata/_urns/urn_defs.py +1748 -1748
  44. datahub/metadata/schema.avsc +18242 -18168
  45. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  46. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
  47. datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
  48. datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
  49. datahub/metadata/schemas/Ownership.avsc +69 -0
  50. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  51. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
  52. datahub/metadata/schemas/__init__.py +3 -3
  53. datahub/sdk/lineage_client.py +6 -26
  54. datahub/sdk/main_client.py +7 -3
  55. datahub/sdk/search_filters.py +16 -0
  56. datahub/specific/aspect_helpers/siblings.py +73 -0
  57. datahub/specific/dataset.py +2 -0
  58. datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
  59. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  60. datahub/upgrade/upgrade.py +14 -2
  61. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/WHEEL +0 -0
  62. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/licenses/LICENSE +0 -0
  63. {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/excel/profiling.py (new file)
@@ -0,0 +1,308 @@
+ import logging
+ import time
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, Iterable, List, Optional, Union
+
+ import numpy as np
+ import pandas as pd
+
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
+ from datahub.ingestion.source.excel.config import ExcelSourceConfig
+ from datahub.ingestion.source.excel.report import ExcelSourceReport
+ from datahub.ingestion.source.excel.util import gen_dataset_name
+ from datahub.metadata.schema_classes import (
+     DatasetFieldProfileClass,
+     DatasetProfileClass,
+     QuantileClass,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class ColumnMetric:
+     col_type: Union[str, None] = None
+     values: List[Any] = field(default_factory=list)
+     null_count: int = 0
+     total_count: int = 0
+     distinct_count: Optional[int] = None
+     min: Optional[Any] = None
+     max: Optional[Any] = None
+     mean: Optional[float] = None
+     stdev: Optional[float] = None
+     median: Optional[float] = None
+     quantiles: Optional[List[float]] = None
+     sample_values: Optional[Any] = None
+
+
+ @dataclass
+ class ProfileData:
+     row_count: Optional[int] = 0
+     column_count: Optional[int] = 0
+     column_metrics: Dict[str, ColumnMetric] = field(default_factory=dict)
+
+
+ class ExcelProfiler:
+     config: ExcelSourceConfig
+     report: ExcelSourceReport
+     df: pd.DataFrame
+     filename: str
+     sheet_name: str
+     dataset_urn: str
+     path: str
+
+     def __init__(
+         self,
+         config: ExcelSourceConfig,
+         report: ExcelSourceReport,
+         df: pd.DataFrame,
+         filename: str,
+         sheet_name: str,
+         dataset_urn: str,
+         path: str,
+     ) -> None:
+         self.config = config
+         self.report = report
+         self.df = df
+         self.filename = filename
+         self.sheet_name = sheet_name
+         self.dataset_urn = dataset_urn
+         self.path = path
+         self.sheet_path = f"[{self.filename}]{self.sheet_name}"
+
+         if self.config.profiling.use_sampling:
+             self.sample_size = self.config.profiling.sample_size
+         else:
+             self.sample_size = 0
+
+         self.field_sample_count = self.config.profiling.field_sample_values_limit
+
+         if self.config.profiling.max_number_of_fields_to_profile:
+             self.sample_fields = self.config.profiling.max_number_of_fields_to_profile
+         else:
+             self.sample_fields = 0
+
+     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+         logger.info(f"Profiling worksheet {self.sheet_path}")
+
+         try:
+             yield from self.generate_profile()
+         except Exception as exc:
+             self.report.profiling_skipped_other[self.filename] += 1
+             self.report.failure(
+                 message="Failed to profile Excel worksheet",
+                 context=f"Worksheet={self.sheet_path}",
+                 exc=exc,
+             )
+
+     def generate_profile(self) -> Iterable[MetadataWorkUnit]:
+         if (
+             not self.config.profile_pattern.allowed(
+                 gen_dataset_name(
+                     self.path,
+                     self.sheet_name,
+                     self.config.convert_urns_to_lowercase,
+                 )
+             )
+             and self.config.profiling.report_dropped_profiles
+         ):
+             self.report.profiling_skipped_table_profile_pattern[self.filename] += 1
+             logger.info(f"Profiling not allowed for worksheet {self.sheet_path}")
+             return
+
+         try:
+             profile_data = self.profile_workbook()
+         except Exception as exc:
+             self.report.warning(
+                 message="Failed to profile Excel worksheet",
+                 context=f"Worksheet={self.sheet_path}",
+                 exc=exc,
+             )
+             return
+
+         profile_aspect = self.populate_profile_aspect(profile_data)
+
+         if profile_aspect:
+             self.report.report_entity_profiled()
+             mcp = MetadataChangeProposalWrapper(
+                 entityUrn=self.dataset_urn, aspect=profile_aspect
+             )
+             yield mcp.as_workunit()
+
+     def populate_profile_aspect(self, profile_data: ProfileData) -> DatasetProfileClass:
+         field_profiles = [
+             self._create_field_profile(column_name, column_metrics)
+             for column_name, column_metrics in profile_data.column_metrics.items()
+         ]
+         return DatasetProfileClass(
+             timestampMillis=round(time.time() * 1000),
+             rowCount=profile_data.row_count,
+             columnCount=profile_data.column_count,
+             fieldProfiles=field_profiles,
+         )
+
+     @staticmethod
+     def _create_field_profile(
+         field_name: str, field_stats: ColumnMetric
+     ) -> DatasetFieldProfileClass:
+         quantiles = field_stats.quantiles
+         return DatasetFieldProfileClass(
+             fieldPath=field_name,
+             uniqueCount=field_stats.distinct_count,
+             nullCount=field_stats.null_count,
+             min=str(field_stats.min) if field_stats.min else None,
+             max=str(field_stats.max) if field_stats.max else None,
+             mean=str(field_stats.mean) if field_stats.mean else None,
+             median=str(field_stats.median) if field_stats.median else None,
+             stdev=str(field_stats.stdev) if field_stats.stdev else None,
+             quantiles=[
+                 QuantileClass(quantile=str(0.25), value=str(quantiles[0])),
+                 QuantileClass(quantile=str(0.75), value=str(quantiles[1])),
+             ]
+             if quantiles
+             else None,
+             sampleValues=field_stats.sample_values
+             if field_stats.sample_values
+             else None,
+         )
+
+     def profile_workbook(self) -> ProfileData:
+         profile_data = ProfileData()
+
+         if not self.config.profiling.profile_table_level_only:
+             return self.collect_column_data(profile_data)
+         else:
+             return self.collect_dataset_data(profile_data)
+
+     def collect_dataset_data(self, profile_data: ProfileData) -> ProfileData:
+         profile_data.row_count = self.df.shape[0]
+         profile_data.column_count = self.df.shape[1]
+
+         return profile_data
+
+     def collect_column_data(self, profile_data: ProfileData) -> ProfileData:
+         dropped_fields = set()
+         dataset_name = gen_dataset_name(
+             self.path, self.sheet_name, self.config.convert_urns_to_lowercase
+         )
+
+         logger.info(f"Attempting to profile dataset {dataset_name}")
+
+         # Get data types for each column
+         data_types = self.df.dtypes.to_dict()
+
+         # Convert numpy types to string representation for better readability
+         data_types = {col: str(dtype) for col, dtype in data_types.items()}
+
+         for n, (f_name, f_type) in enumerate(data_types.items()):
+             if 0 < self.sample_fields <= n:
+                 dropped_fields.add(f_name)
+                 continue
+             values = self.df[f_name].tolist()
+             profile_data.column_metrics[f_name] = ColumnMetric()
+             profile_data.column_metrics[f_name].values.extend(values)
+             profile_data.column_metrics[f_name].col_type = f_type
+
+         if len(dropped_fields) > 0:
+             if self.config.profiling.report_dropped_profiles:
+                 self.report.report_dropped(
+                     f"The max_number_of_fields_to_profile={self.sample_fields} reached. "
+                     f"Dropped fields for {dataset_name} ({', '.join(sorted(dropped_fields))})"
+                 )
+
+         profile_data.row_count = self.df.shape[0]
+         profile_data.column_count = self.df.shape[1]
+
+         return self.add_field_statistics(profile_data)
+
+     def add_field_statistics(self, profile_data: ProfileData) -> ProfileData:
+         for field_name, column_metrics in profile_data.column_metrics.items():
+             if column_metrics.values:
+                 try:
+                     self.compute_field_statistics(column_metrics)
+                 except Exception as exc:
+                     self.report.warning(
+                         message="Profiling Failed For Column Statistics",
+                         context=field_name,
+                         exc=exc,
+                     )
+                     raise exc
+
+         return profile_data
+
+     def compute_field_statistics(self, column_metrics: ColumnMetric) -> None:
+         values = column_metrics.values
+         if not values:
+             return
+
+         logger.debug(
+             f"Computing statistics for column of type {column_metrics.col_type}"
+         )
+
+         column_metrics.total_count = len(values)
+
+         # Null count is included by default
+         if not self.config.profiling.include_field_null_count:
+             column_metrics.null_count = 0
+
+         if self.config.profiling.include_field_distinct_count:
+             column_metrics.distinct_count = len(set(values))
+
+         if values and self._is_numeric_type(column_metrics.col_type):
+             if self.config.profiling.include_field_min_value:
+                 column_metrics.min = min(values)
+             if self.config.profiling.include_field_max_value:
+                 column_metrics.max = max(values)
+             if self.config.profiling.include_field_mean_value:
+                 column_metrics.mean = round(float(np.mean(values)), 2)
+             if self.config.profiling.include_field_stddev_value:
+                 column_metrics.stdev = round(float(np.std(values)), 2)
+             if self.config.profiling.include_field_median_value:
+                 column_metrics.median = round(float(np.median(values)), 2)
+             if self.config.profiling.include_field_quantiles:
+                 column_metrics.quantiles = [
+                     float(np.percentile(values, 25)),
+                     float(np.percentile(values, 75)),
+                 ]
+
+         if values and self.config.profiling.include_field_sample_values:
+             column_metrics.sample_values = [
+                 str(v) for v in values[: self.field_sample_count]
+             ]
+
+     @staticmethod
+     def _is_numeric_type(data_type: Union[str, None]) -> bool:
+         if not data_type:
+             return False
+         else:
+             return data_type.lower() in [
+                 "int8",
+                 "int16",
+                 "int32",
+                 "int64",
+                 "uint8",
+                 "uint16",
+                 "uint32",
+                 "uint64",
+                 "intp",
+                 "uintp",
+                 "float16",
+                 "float32",
+                 "float64",
+                 "float128",
+                 "complex64",
+                 "complex128",
+                 "complex256",
+             ]
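
For orientation, here is a minimal, hypothetical sketch of how the profiler above would be driven. The ExcelProfiler constructor signature and get_workunits() come from the diff; the config, report, and DataFrame objects plus the example file name, sheet name, URN, and path are placeholders (the real call sites live in source.py, which is not excerpted here).

# Hypothetical wiring only; mirrors the ExcelProfiler API shown in the diff above.
import pandas as pd

from datahub.ingestion.source.excel.config import ExcelSourceConfig
from datahub.ingestion.source.excel.profiling import ExcelProfiler
from datahub.ingestion.source.excel.report import ExcelSourceReport


def profile_sheet(config: ExcelSourceConfig, report: ExcelSourceReport, df: pd.DataFrame):
    # File/sheet names, path, and URN below are illustrative placeholders.
    profiler = ExcelProfiler(
        config=config,
        report=report,
        df=df,
        filename="sales.xlsx",
        sheet_name="Q1",
        dataset_urn="urn:li:dataset:(urn:li:dataPlatform:excel,sales.xlsx.Q1,PROD)",
        path="s3://example-bucket/sales.xlsx",
    )
    # Each emitted work unit wraps a DatasetProfileClass aspect for the worksheet.
    yield from profiler.get_workunits()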
datahub/ingestion/source/excel/report.py (new file)
@@ -0,0 +1,49 @@
+ from dataclasses import dataclass, field
+ from typing import List
+
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
+     StaleEntityRemovalSourceReport,
+ )
+ from datahub.utilities.perf_timer import PerfTimer
+ from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
+
+
+ @dataclass
+ class ExcelSourceReport(StaleEntityRemovalSourceReport):
+     files_scanned: int = 0
+     files_processed: int = 0
+     worksheets_scanned: int = 0
+     worksheets_processed: int = 0
+     datasets_profiled: int = 0
+     local_file_get_timer: PerfTimer = field(default_factory=PerfTimer)
+     s3_file_get_timer: PerfTimer = field(default_factory=PerfTimer)
+     abs_file_get_timer: PerfTimer = field(default_factory=PerfTimer)
+     filtered: List[str] = field(default_factory=list)
+     profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
+     profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
+         default_factory=int_top_k_dict
+     )
+
+     def report_dropped(self, name: str) -> None:
+         self.filtered.append(name)
+
+     def report_entity_profiled(self) -> None:
+         self.datasets_profiled += 1
+
+     def report_file_scanned(self) -> None:
+         self.files_scanned += 1
+
+     def report_file_processed(self) -> None:
+         self.files_processed += 1
+
+     def report_worksheet_scanned(self) -> None:
+         self.worksheets_scanned += 1
+
+     def report_worksheet_processed(self) -> None:
+         self.worksheets_processed += 1
+
+     def report_file_dropped(self, file: str) -> None:
+         self.filtered.append(file)
+
+     def report_worksheet_dropped(self, worksheet: str) -> None:
+         self.filtered.append(worksheet)
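
A similarly hedged sketch of how the report's counters and timers might be exercised while reading a workbook. Only the ExcelSourceReport methods and fields come from the diff; the workbook-reading step is elided, and the context-manager use of PerfTimer is an assumption based on how timers are used elsewhere in DataHub.

# Hypothetical usage of ExcelSourceReport; not part of the diff above.
from datahub.ingestion.source.excel.report import ExcelSourceReport

report = ExcelSourceReport()

with report.local_file_get_timer:  # assumes PerfTimer supports the context-manager protocol
    ...  # read the workbook from the local filesystem here

report.report_file_scanned()
report.report_worksheet_scanned()
report.report_worksheet_processed()
report.report_file_processed()

# The counters and timers surface in the ingestion run summary alongside the
# stale-entity-removal state inherited from StaleEntityRemovalSourceReport.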