acryl-datahub 1.2.0.6rc1__py3-none-any.whl → 1.2.0.7rc2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub has been flagged as possibly problematic.
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/METADATA +2659 -2578
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/RECORD +65 -57
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/graphql/operation.py +1 -1
- datahub/ingestion/autogenerated/capability_summary.json +45 -5
- datahub/ingestion/autogenerated/lineage.json +3 -2
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
- datahub/ingestion/source/dbt/dbt_common.py +74 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_source.py +4 -0
- datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +33 -0
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +5 -0
- datahub/ingestion/source/s3/source.py +65 -59
- datahub/ingestion/source/snowflake/constants.py +2 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
- datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +21 -6
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
- datahub/ingestion/source/snowflake/snowflake_v2.py +5 -1
- datahub/ingestion/source/sql/hive_metastore.py +1 -0
- datahub/ingestion/source/sql_queries.py +24 -2
- datahub/ingestion/source/state/checkpoint.py +3 -28
- datahub/metadata/_internal_schema_classes.py +568 -512
- datahub/metadata/_urns/urn_defs.py +1748 -1748
- datahub/metadata/schema.avsc +18242 -18168
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/lineage_client.py +6 -26
- datahub/sdk/main_client.py +7 -3
- datahub/sdk/search_filters.py +16 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/dataset.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/upgrade/upgrade.py +14 -2
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/top_level.txt +0 -0
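The headline change in this release is a new Excel ingestion source (the datahub/ingestion/source/excel/* files above, plus one new entry in entry_points.txt). A rough sketch of how such a source would be run via the Pipeline API (datahub/ingestion/run/pipeline.py, also touched in this release) follows; the type name "excel" and the path_list field are assumptions inferred from the new entry point, since only convert_urns_to_lowercase, profile_pattern, and the profiling block are visible in the code below.

from datahub.ingestion.run.pipeline import Pipeline

# Hedged sketch: "excel" and `path_list` are assumed, not confirmed by this
# diff; the `profiling` fields mirror those referenced in profiling.py below.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "excel",  # assumed plugin name from the new entry point
            "config": {
                "path_list": ["s3://reports-bucket/finance/*.xlsx"],  # hypothetical
                "convert_urns_to_lowercase": True,
                "profiling": {"enabled": True, "use_sampling": True},
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()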
datahub/ingestion/source/excel/profiling.py
@@ -0,0 +1,308 @@
+import logging
+import time
+from dataclasses import dataclass, field
+from typing import Any, Dict, Iterable, List, Optional, Union
+
+import numpy as np
+import pandas as pd
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.excel.config import ExcelSourceConfig
+from datahub.ingestion.source.excel.report import ExcelSourceReport
+from datahub.ingestion.source.excel.util import gen_dataset_name
+from datahub.metadata.schema_classes import (
+    DatasetFieldProfileClass,
+    DatasetProfileClass,
+    QuantileClass,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ColumnMetric:
+    col_type: Union[str, None] = None
+    values: List[Any] = field(default_factory=list)
+    null_count: int = 0
+    total_count: int = 0
+    distinct_count: Optional[int] = None
+    min: Optional[Any] = None
+    max: Optional[Any] = None
+    mean: Optional[float] = None
+    stdev: Optional[float] = None
+    median: Optional[float] = None
+    quantiles: Optional[List[float]] = None
+    sample_values: Optional[Any] = None
+
+
+@dataclass
+class ProfileData:
+    row_count: Optional[int] = 0
+    column_count: Optional[int] = 0
+    column_metrics: Dict[str, ColumnMetric] = field(default_factory=dict)
+
+
+class ExcelProfiler:
+    config: ExcelSourceConfig
+    report: ExcelSourceReport
+    df: pd.DataFrame
+    filename: str
+    sheet_name: str
+    dataset_urn: str
+    path: str
+
+    def __init__(
+        self,
+        config: ExcelSourceConfig,
+        report: ExcelSourceReport,
+        df: pd.DataFrame,
+        filename: str,
+        sheet_name: str,
+        dataset_urn: str,
+        path: str,
+    ) -> None:
+        self.config = config
+        self.report = report
+        self.df = df
+        self.filename = filename
+        self.sheet_name = sheet_name
+        self.dataset_urn = dataset_urn
+        self.path = path
+        self.sheet_path = f"[{self.filename}]{self.sheet_name}"
+
+        if self.config.profiling.use_sampling:
+            self.sample_size = self.config.profiling.sample_size
+        else:
+            self.sample_size = 0
+
+        self.field_sample_count = self.config.profiling.field_sample_values_limit
+
+        if self.config.profiling.max_number_of_fields_to_profile:
+            self.sample_fields = self.config.profiling.max_number_of_fields_to_profile
+        else:
+            self.sample_fields = 0
+
+    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        logger.info(f"Profiling worksheet {self.sheet_path}")
+
+        try:
+            yield from self.generate_profile()
+        except Exception as exc:
+            self.report.profiling_skipped_other[self.filename] += 1
+            self.report.failure(
+                message="Failed to profile Excel worksheet",
+                context=f"Worksheet={self.sheet_path}",
+                exc=exc,
+            )
+
+    def generate_profile(self) -> Iterable[MetadataWorkUnit]:
+        if (
+            not self.config.profile_pattern.allowed(
+                gen_dataset_name(
+                    self.path,
+                    self.sheet_name,
+                    self.config.convert_urns_to_lowercase,
+                )
+            )
+            and self.config.profiling.report_dropped_profiles
+        ):
+            self.report.profiling_skipped_table_profile_pattern[self.filename] += 1
+            logger.info(f"Profiling not allowed for worksheet {self.sheet_path}")
+            return
+
+        try:
+            profile_data = self.profile_workbook()
+        except Exception as exc:
+            self.report.warning(
+                message="Failed to profile Excel worksheet",
+                context=f"Worksheet={self.sheet_path}",
+                exc=exc,
+            )
+            return
+
+        profile_aspect = self.populate_profile_aspect(profile_data)
+
+        if profile_aspect:
+            self.report.report_entity_profiled()
+            mcp = MetadataChangeProposalWrapper(
+                entityUrn=self.dataset_urn, aspect=profile_aspect
+            )
+            yield mcp.as_workunit()
+
+    def populate_profile_aspect(self, profile_data: ProfileData) -> DatasetProfileClass:
+        field_profiles = [
+            self._create_field_profile(column_name, column_metrics)
+            for column_name, column_metrics in profile_data.column_metrics.items()
+        ]
+        return DatasetProfileClass(
+            timestampMillis=round(time.time() * 1000),
+            rowCount=profile_data.row_count,
+            columnCount=profile_data.column_count,
+            fieldProfiles=field_profiles,
+        )
+
+    @staticmethod
+    def _create_field_profile(
+        field_name: str, field_stats: ColumnMetric
+    ) -> DatasetFieldProfileClass:
+        quantiles = field_stats.quantiles
+        return DatasetFieldProfileClass(
+            fieldPath=field_name,
+            uniqueCount=field_stats.distinct_count,
+            nullCount=field_stats.null_count,
+            min=str(field_stats.min) if field_stats.min else None,
+            max=str(field_stats.max) if field_stats.max else None,
+            mean=str(field_stats.mean) if field_stats.mean else None,
+            median=str(field_stats.median) if field_stats.median else None,
+            stdev=str(field_stats.stdev) if field_stats.stdev else None,
+            quantiles=[
+                QuantileClass(quantile=str(0.25), value=str(quantiles[0])),
+                QuantileClass(quantile=str(0.75), value=str(quantiles[1])),
+            ]
+            if quantiles
+            else None,
+            sampleValues=field_stats.sample_values
+            if field_stats.sample_values
+            else None,
+        )
+
+    def profile_workbook(self) -> ProfileData:
+        profile_data = ProfileData()
+
+        if not self.config.profiling.profile_table_level_only:
+            return self.collect_column_data(profile_data)
+        else:
+            return self.collect_dataset_data(profile_data)
+
+    def collect_dataset_data(self, profile_data: ProfileData) -> ProfileData:
+        profile_data.row_count = self.df.shape[0]
+        profile_data.column_count = self.df.shape[1]
+
+        return profile_data
+
+    def collect_column_data(self, profile_data: ProfileData) -> ProfileData:
+        dropped_fields = set()
+        dataset_name = gen_dataset_name(
+            self.path, self.sheet_name, self.config.convert_urns_to_lowercase
+        )
+
+        logger.info(f"Attempting to profile dataset {dataset_name}")
+
+        # Get data types for each column
+        data_types = self.df.dtypes.to_dict()
+
+        # Convert numpy types to string representation for better readability
+        data_types = {col: str(dtype) for col, dtype in data_types.items()}
+
+        for n, (f_name, f_type) in enumerate(data_types.items()):
+            if 0 < self.sample_fields <= n:
+                dropped_fields.add(f_name)
+                continue
+            values = self.df[f_name].tolist()
+            profile_data.column_metrics[f_name] = ColumnMetric()
+            profile_data.column_metrics[f_name].values.extend(values)
+            profile_data.column_metrics[f_name].col_type = f_type
+
+        if len(dropped_fields) > 0:
+            if self.config.profiling.report_dropped_profiles:
+                self.report.report_dropped(
+                    f"The max_number_of_fields_to_profile={self.sample_fields} reached. "
+                    f"Dropped fields for {dataset_name} ({', '.join(sorted(dropped_fields))})"
+                )
+
+        profile_data.row_count = self.df.shape[0]
+        profile_data.column_count = self.df.shape[1]
+
+        return self.add_field_statistics(profile_data)
+
+    def add_field_statistics(self, profile_data: ProfileData) -> ProfileData:
+        for field_name, column_metrics in profile_data.column_metrics.items():
+            if column_metrics.values:
+                try:
+                    self.compute_field_statistics(column_metrics)
+                except Exception as exc:
+                    self.report.warning(
+                        message="Profiling Failed For Column Statistics",
+                        context=field_name,
+                        exc=exc,
+                    )
+                    raise exc
+
+        return profile_data
+
+    def compute_field_statistics(self, column_metrics: ColumnMetric) -> None:
+        values = column_metrics.values
+        if not values:
+            return
+
+        logger.debug(
+            f"Computing statistics for column of type {column_metrics.col_type}"
+        )
+
+        column_metrics.total_count = len(values)
+
+        # Null count is included by default
+        if not self.config.profiling.include_field_null_count:
+            column_metrics.null_count = 0
+
+        if self.config.profiling.include_field_distinct_count:
+            column_metrics.distinct_count = len(set(values))
+
+        if values and self._is_numeric_type(column_metrics.col_type):
+            if self.config.profiling.include_field_min_value:
+                column_metrics.min = min(values)
+            if self.config.profiling.include_field_max_value:
+                column_metrics.max = max(values)
+            if self.config.profiling.include_field_mean_value:
+                column_metrics.mean = round(float(np.mean(values)), 2)
+            if self.config.profiling.include_field_stddev_value:
+                column_metrics.stdev = round(float(np.std(values)), 2)
+            if self.config.profiling.include_field_median_value:
+                column_metrics.median = round(float(np.median(values)), 2)
+            if self.config.profiling.include_field_quantiles:
+                column_metrics.quantiles = [
+                    float(np.percentile(values, 25)),
+                    float(np.percentile(values, 75)),
+                ]
+
+        if values and self.config.profiling.include_field_sample_values:
+            column_metrics.sample_values = [
+                str(v) for v in values[: self.field_sample_count]
+            ]
+
+    @staticmethod
+    def _is_numeric_type(data_type: Union[str, None]) -> bool:
+        if not data_type:
+            return False
+        else:
+            return data_type.lower() in [
+                "int8",
+                "int16",
+                "int32",
+                "int64",
+                "uint8",
+                "uint16",
+                "uint32",
+                "uint64",
+                "intp",
+                "uintp",
+                "float16",
+                "float32",
+                "float64",
+                "float128",
+                "complex64",
+                "complex128",
+                "complex256",
+            ]
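compute_field_statistics above derives each numeric column's metrics with plain numpy calls: min/max, mean, standard deviation, and median rounded to two places, plus the 25th and 75th percentiles as quantiles. A self-contained illustration of the same arithmetic (not part of the package; the sample column is invented):

import numpy as np
import pandas as pd

# Mirror the per-column metrics computed in compute_field_statistics.
series = pd.Series([12.0, 7.5, 9.0, 30.0, 9.0], name="amount")
values = series.tolist()

print("distinct:", len(set(values)))
print("min/max:", min(values), max(values))
print("mean:", round(float(np.mean(values)), 2))
print("stdev:", round(float(np.std(values)), 2))  # population stddev, as in the source
print("median:", round(float(np.median(values)), 2))
print("quantiles:", [float(np.percentile(values, 25)), float(np.percentile(values, 75))])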
datahub/ingestion/source/excel/report.py
@@ -0,0 +1,49 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+)
+from datahub.utilities.perf_timer import PerfTimer
+from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
+
+
+@dataclass
+class ExcelSourceReport(StaleEntityRemovalSourceReport):
+    files_scanned = 0
+    files_processed = 0
+    worksheets_scanned = 0
+    worksheets_processed = 0
+    datasets_profiled = 0
+    local_file_get_timer: PerfTimer = field(default_factory=PerfTimer)
+    s3_file_get_timer: PerfTimer = field(default_factory=PerfTimer)
+    abs_file_get_timer: PerfTimer = field(default_factory=PerfTimer)
+    filtered: List[str] = field(default_factory=list)
+    profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
+    profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    def report_dropped(self, name: str) -> None:
+        self.filtered.append(name)
+
+    def report_entity_profiled(self) -> None:
+        self.datasets_profiled += 1
+
+    def report_file_scanned(self) -> None:
+        self.files_scanned += 1
+
+    def report_file_processed(self) -> None:
+        self.files_processed += 1
+
+    def report_worksheet_scanned(self) -> None:
+        self.worksheets_scanned += 1
+
+    def report_worksheet_processed(self) -> None:
+        self.worksheets_processed += 1
+
+    def report_file_dropped(self, file: str) -> None:
+        self.filtered.append(file)
+
+    def report_worksheet_dropped(self, worksheet: str) -> None:
+        self.filtered.append(worksheet)