acryl-datahub 0.14.1.13rc7__py3-none-any.whl → 0.14.1.13rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc7.dist-info → acryl_datahub-0.14.1.13rc9.dist-info}/METADATA +2355 -2355
- {acryl_datahub-0.14.1.13rc7.dist-info → acryl_datahub-0.14.1.13rc9.dist-info}/RECORD +15 -15
- datahub/__init__.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
- datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
- datahub/ingestion/source/ge_data_profiler.py +23 -8
- datahub/ingestion/source/ge_profiling_config.py +5 -0
- datahub/ingestion/source/kafka/kafka.py +2 -0
- datahub/ingestion/source/sigma/data_classes.py +1 -0
- datahub/ingestion/source/sigma/sigma.py +101 -43
- datahub/ingestion/source/sql/mssql/source.py +18 -4
- {acryl_datahub-0.14.1.13rc7.dist-info → acryl_datahub-0.14.1.13rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc7.dist-info → acryl_datahub-0.14.1.13rc9.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.14.1.13rc7.dist-info → acryl_datahub-0.14.1.13rc9.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.14.1.13rc7.dist-info → acryl_datahub-0.14.1.13rc9.dist-info}/RECORD RENAMED

@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=
+datahub/__init__.py,sha256=Qii_Ygk-5SdmcNm5lZWzqq41uil0D4isccAfgzqcyu8,577
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -189,8 +189,8 @@ datahub/ingestion/source/demo_data.py,sha256=yzA_R-wfSX2WPz0i5ukYlscpmpb0Pt8D7Ek
 datahub/ingestion/source/elastic_search.py,sha256=qFUVNzynTVJTabASTjGMu8Qhf9UpNbEtSBFjaPQjBJE,22641
 datahub/ingestion/source/feast.py,sha256=NYaAjzLVRhmMKDawBwN0OL8AMyKDLsxOwEj3YFX0wIA,14244
 datahub/ingestion/source/file.py,sha256=pH-Qkjh5FQ2XvyYPE7Z8XEY4vUk_SUHxm8p8IxG12tU,15879
-datahub/ingestion/source/ge_data_profiler.py,sha256=
-datahub/ingestion/source/ge_profiling_config.py,sha256=
+datahub/ingestion/source/ge_data_profiler.py,sha256=JqTonv8y7Re4Rfn2YKOEaLufiiAOWKfK1XQvJfV5dvs,64126
+datahub/ingestion/source/ge_profiling_config.py,sha256=P-9pd20koFvpxeEL_pqFvKWWz-qnpZ6XkELUyBKr7is,10807
 datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suabri1yV5geaAPjzg2eORDc,2559
 datahub/ingestion/source/ldap.py,sha256=Vnzg8tpwBYeyM-KBVVsUJvGZGBMJiCJ_i_FhxaFRQ9A,18627
 datahub/ingestion/source/metabase.py,sha256=oemiMdzjfr82Hx6rdwTNBzFM8962LDkosYh7SD_I5cY,31717
@@ -240,12 +240,12 @@ datahub/ingestion/source/bigquery_v2/bigquery_helper.py,sha256=QER3gY8e_k1_eNVj7
 datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py,sha256=8nuQ8hMuJEswWDZtV2RjbK8RvDJUzT_S74dnyPpGFdQ,4857
 datahub/ingestion/source/bigquery_v2/bigquery_queries.py,sha256=EoHo9twb0_QdX7Nvd1HJC1Yn0rqtrfR52EVk7Hu3XOQ,3296
 datahub/ingestion/source/bigquery_v2/bigquery_report.py,sha256=WxiLPFc7LwZXNDYfV9oySUD43kc2GcOf_pUokp3vFNM,8098
-datahub/ingestion/source/bigquery_v2/bigquery_schema.py,sha256=
-datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py,sha256=
+datahub/ingestion/source/bigquery_v2/bigquery_schema.py,sha256=tyKPfxg88ERX6z7Q5lCmJS-H7cgsWRUwM2epqe62bI0,32277
+datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py,sha256=Sv6BrK62nu3xpgjYGE-x1xdSTouvvnKDJtazPobhiKQ,50813
 datahub/ingestion/source/bigquery_v2/bigquery_test_connection.py,sha256=cATxwi5IPzj3BldRRAVcLqzSFmmYEPvqa7U0RFJbaAc,7645
 datahub/ingestion/source/bigquery_v2/common.py,sha256=Cxjf1a8ibkL_YRQeS0BqsjlyMgFJpaZ3iq_d7e8T8MQ,4030
 datahub/ingestion/source/bigquery_v2/lineage.py,sha256=Jg_pwnaj7l_KEcgq0enJXwrKh5jyUfBl4YB05YpkIVg,45415
-datahub/ingestion/source/bigquery_v2/profiler.py,sha256=
+datahub/ingestion/source/bigquery_v2/profiler.py,sha256=8-yAoq8sX0E6VIwr75YbM8wITRNhGfxgte9BCeGNkMM,10681
 datahub/ingestion/source/bigquery_v2/queries.py,sha256=B2vJLZYfwM1J5JAckijKJTxLhDYA0yw3kfzj5oRQB5c,20151
 datahub/ingestion/source/bigquery_v2/queries_extractor.py,sha256=xLf-vCUAnNuDdTHghxJvPOyGeA_XLCW3r-xj-8cfn3Q,19528
 datahub/ingestion/source/bigquery_v2/usage.py,sha256=xHb-gsugshoyzwScOQ5DHxZhoA-K0e4EbfSeVTLs428,40543
@@ -319,7 +319,7 @@ datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
 datahub/ingestion/source/identity/azure_ad.py,sha256=GdmJFD4UMsb5353Z7phXRf-YsXR2woGLRJwBXUkgXq0,28809
 datahub/ingestion/source/identity/okta.py,sha256=PnRokWLG8wSoNZlXJiRZiW6APTEHO09q4n2j_l6m3V0,30756
 datahub/ingestion/source/kafka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/kafka/kafka.py,sha256=
+datahub/ingestion/source/kafka/kafka.py,sha256=D54QXYoEQJYVb7oWyvLVCGbJunS8ZTGYlMMdaBqpmMs,25475
 datahub/ingestion/source/kafka/kafka_connect.py,sha256=5KUlhn3876c41Z3kx5l4oJhbu0ekXZQRdxmu52vb_v8,55167
 datahub/ingestion/source/kafka/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
 datahub/ingestion/source/looker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -410,8 +410,8 @@ datahub/ingestion/source/schema_inference/object.py,sha256=Aibf4dY4dXb4P9zfcNGnI
 datahub/ingestion/source/schema_inference/parquet.py,sha256=CdqsNuiabLLCulWbuPMssijeFmKLv3M5MKFIhlatpWA,3456
 datahub/ingestion/source/sigma/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/sigma/config.py,sha256=zGh0ZU2Ty5NHfNXAVwFxVkK4NlsNSxtAyfCgMJJvzdc,3795
-datahub/ingestion/source/sigma/data_classes.py,sha256=
-datahub/ingestion/source/sigma/sigma.py,sha256=
+datahub/ingestion/source/sigma/data_classes.py,sha256=YZkkzwftV34mq5c_4jlC2PCSiRKt4hvHjmqikLQhl1I,2012
+datahub/ingestion/source/sigma/sigma.py,sha256=DZCxMgIx-uJp6poFG85KFygRCyFDkkM4KXtBbUsgkHk,24094
 datahub/ingestion/source/sigma/sigma_api.py,sha256=jUkKbtqX3eRdfriruJE5KFT9aM7DCbvhmwfPzveYIiM,17814
 datahub/ingestion/source/slack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/slack/slack.py,sha256=C_3iXUS72h7HALhBW_AIyi3nNOqzyh7Ogflr-qI5ZEE,12946
@@ -464,7 +464,7 @@ datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=YDrGBb5WKVls6qv17QU5f
 datahub/ingestion/source/sql/vertica.py,sha256=_9OgSgIgqBml0av063rb8nACiT3SAmzpw0ouyF91wv8,33382
 datahub/ingestion/source/sql/mssql/__init__.py,sha256=1agpl8S_uDW40olkhCX_W19dbr5GO9qgjS3R7pLRZSk,87
 datahub/ingestion/source/sql/mssql/job_models.py,sha256=eMyR0Efl5kvi7QNgNXzd5_6PdDKYly_552Y8OGSj9PY,6012
-datahub/ingestion/source/sql/mssql/source.py,sha256=
+datahub/ingestion/source/sql/mssql/source.py,sha256=lOfvNUGfxLmgCAops2jwm1OTBvJxgBivHcEj-9DEaAE,30390
 datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py,sha256=RpnvKPalAAaOD_eUg8bZ4VkGTSeLFWuy0mefwc4s3x8,2837
 datahub/ingestion/source/state/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/state/checkpoint.py,sha256=x9Xww-MIFXSKjeg1tOZXE72LehCm5OfKy3HfucgIRWM,8833
@@ -971,8 +971,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.14.1.
-acryl_datahub-0.14.1.
-acryl_datahub-0.14.1.
-acryl_datahub-0.14.1.
-acryl_datahub-0.14.1.
+acryl_datahub-0.14.1.13rc9.dist-info/METADATA,sha256=CvAHjZi0HfaRrZKhjpu6Rp_n-3c2oYSPb8A54uw9zNQ,171138
+acryl_datahub-0.14.1.13rc9.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.14.1.13rc9.dist-info/entry_points.txt,sha256=VcQx0dnqaYLyeY_L5OaX7bLmmE-Il7TAXkxCKvEn2bA,9432
+acryl_datahub-0.14.1.13rc9.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.14.1.13rc9.dist-info/RECORD,,
datahub/__init__.py CHANGED
(+1 -1: the package version string, bumped from 0.14.1.13rc7 to 0.14.1.13rc9)
datahub/ingestion/source/bigquery_v2/bigquery_schema.py CHANGED

@@ -118,7 +118,6 @@ class BigqueryTable(BaseTable):
     active_billable_bytes: Optional[int] = None
     long_term_billable_bytes: Optional[int] = None
     partition_info: Optional[PartitionInfo] = None
-    columns_ignore_from_profiling: List[str] = field(default_factory=list)
     external: bool = False
     constraints: List[BigqueryTableConstraint] = field(default_factory=list)
     table_type: Optional[str] = None
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py CHANGED

@@ -598,18 +598,6 @@ class BigQuerySchemaGenerator:
             dataset_name=dataset_name,
         )

-    # This method is used to generate the ignore list for datatypes the profiler doesn't support we have to do it here
-    # because the profiler doesn't have access to columns
-    def generate_profile_ignore_list(self, columns: List[BigqueryColumn]) -> List[str]:
-        ignore_list: List[str] = []
-        for column in columns:
-            if not column.data_type or any(
-                word in column.data_type.lower()
-                for word in ["array", "struct", "geography", "json"]
-            ):
-                ignore_list.append(column.field_path)
-        return ignore_list
-
     def _process_table(
         self,
         table: BigqueryTable,
@@ -631,15 +619,6 @@ class BigQuerySchemaGenerator:
         )
         table.column_count = len(columns)

-        # We only collect profile ignore list if profiling is enabled and profile_table_level_only is false
-        if (
-            self.config.is_profiling_enabled()
-            and not self.config.profiling.profile_table_level_only
-        ):
-            table.columns_ignore_from_profiling = self.generate_profile_ignore_list(
-                columns
-            )
-
         if not table.column_count:
             logger.warning(
                 f"Table doesn't have any column or unable to get columns for table: {table_identifier}"
datahub/ingestion/source/bigquery_v2/profiler.py CHANGED

@@ -166,12 +166,6 @@ WHERE
         normalized_table_name = BigqueryTableIdentifier(
             project_id=project_id, dataset=dataset, table=table.name
         ).get_table_name()
-        for column in table.columns_ignore_from_profiling:
-            # Profiler has issues with complex types (array, struct, geography, json), so we deny those types from profiling
-            # We also filter columns without data type as it means that column is part of a complex type.
-            self.config.profile_pattern.deny.append(
-                f"^{normalized_table_name}.{column}$"
-            )

         if table.external and not self.config.profiling.profile_external_tables:
             self.report.profiling_skipped_other[f"{project_id}.{dataset}"] += 1
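Taken together, the three BigQuery hunks above retire the per-table ignore-list plumbing: complex-type columns are no longer denied one regex at a time in the profiler, and the filtering moves into ge_data_profiler.py (next section) where it keys off the SQL type name. A standalone sketch of the removed mechanism, with made-up table and column names:

# Illustrative before: the removed code denied complex-type columns per table
# by appending one regex per column to profile_pattern.deny.
normalized_table_name = "project.dataset.table"  # illustrative
column = "location_geo"                          # illustrative complex-type column
deny_regex = f"^{normalized_table_name}.{column}$"
print(deny_regex)  # ^project.dataset.table.location_geo$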
datahub/ingestion/source/ge_data_profiler.py CHANGED

@@ -7,6 +7,7 @@ import dataclasses
 import functools
 import json
 import logging
+import re
 import threading
 import traceback
 import unittest.mock
@@ -123,6 +124,8 @@ ProfilerTypeMapping.BINARY_TYPE_NAMES.append("LargeBinary")

 _datasource_connection_injection_lock = threading.Lock()

+NORMALIZE_TYPE_PATTERN = re.compile(r"^(.*?)(?:[\[<(].*)?$")
+

 @contextlib.contextmanager
 def _inject_connection_into_datasource(conn: Connection) -> Iterator[None]:
@@ -165,11 +168,9 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
         return convert_to_json_serializable(element_values.fetchone()[0])
     elif self.engine.dialect.name.lower() == BIGQUERY:
         element_values = self.engine.execute(
-            sa.select(
-
-
-            ]
-            ).select_from(self._table)
+            sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
+                self._table
+            )
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
     elif self.engine.dialect.name.lower() == SNOWFLAKE:
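A minimal sketch of the SQLAlchemy expression the rewritten BigQuery branch builds for unique counts; the table and column names are illustrative and the rendered SQL is approximate:

import sqlalchemy as sa

table = sa.table("my_table")  # illustrative stand-in for self._table
query = sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column("col"))).select_from(table)
# Renders roughly as: SELECT APPROX_COUNT_DISTINCT(col) AS ... FROM my_table
print(query)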
@@ -378,6 +379,9 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
                 f"{self.dataset_name}.{col}"
             ):
                 ignored_columns_by_pattern.append(col)
+            # We try to ignore nested columns as well
+            elif not self.config.profile_nested_fields and "." in col:
+                ignored_columns_by_pattern.append(col)
             elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
                 ignored_columns_by_type.append(col)
             else:
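The new elif gates nested fields: unless profile_nested_fields is enabled, any column whose name contains a dot (i.e. a nested field path) is skipped by pattern. A self-contained illustration with made-up column names:

profile_nested_fields = False  # default of the new GEProfilingConfig option

columns = ["id", "address.city", "payload.items.sku"]
ignored = [col for col in columns if not profile_nested_fields and "." in col]
print(ignored)  # ['address.city', 'payload.items.sku']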
@@ -407,9 +411,18 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         return columns_to_profile

     def _should_ignore_column(self, sqlalchemy_type: sa.types.TypeEngine) -> bool:
-
-
-
+        # We don't profiles columns with None types
+        if str(sqlalchemy_type) == "NULL":
+            return True
+
+        sql_type = str(sqlalchemy_type)
+
+        match = re.match(NORMALIZE_TYPE_PATTERN, sql_type)
+
+        if match:
+            sql_type = match.group(1)
+
+        return sql_type in _get_column_types_to_ignore(self.dataset.engine.dialect.name)

     @_run_with_query_combiner
     def _get_column_type(self, column_spec: _SingleColumnSpec, column: str) -> None:
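_should_ignore_column now normalizes the SQLAlchemy type string before the ignore-list lookup, so parameterized types match their base name. A standalone sketch of NORMALIZE_TYPE_PATTERN on sample type strings (the samples are illustrative, not from the diff):

import re

NORMALIZE_TYPE_PATTERN = re.compile(r"^(.*?)(?:[\[<(].*)?$")

for raw in ["ARRAY<STRING>", "STRUCT<a INT64, b STRING>", "NUMERIC(10, 2)", "JSON"]:
    match = re.match(NORMALIZE_TYPE_PATTERN, raw)
    base = match.group(1) if match else raw
    print(f"{raw} -> {base}")
# ARRAY<STRING> -> ARRAY
# STRUCT<a INT64, b STRING> -> STRUCT
# NUMERIC(10, 2) -> NUMERIC
# JSON -> JSON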
@@ -1397,6 +1410,8 @@ class DatahubGEProfiler:
 def _get_column_types_to_ignore(dialect_name: str) -> List[str]:
     if dialect_name.lower() == POSTGRESQL:
         return ["JSON"]
+    elif dialect_name.lower() == BIGQUERY:
+        return ["ARRAY", "STRUCT", "GEOGRAPHY", "JSON"]

     return []
datahub/ingestion/source/ge_profiling_config.py CHANGED

@@ -188,6 +188,11 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         ),
     )

+    profile_nested_fields: bool = Field(
+        default=False,
+        description="Whether to profile complex types like structs, arrays and maps. ",
+    )
+
     @pydantic.root_validator(pre=True)
     def deprecate_bigquery_temp_table_schema(cls, values):
         # TODO: Update docs to remove mention of this field.
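A hedged usage sketch for the new flag, assuming the standard pydantic construction of GEProfilingConfig (only the profile_nested_fields field itself comes from the diff; the rest is illustrative):

from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig

config = GEProfilingConfig.parse_obj(
    {
        "enabled": True,
        "profile_nested_fields": True,  # new in this release; defaults to False
    }
)
print(config.profile_nested_fields)  # True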
datahub/ingestion/source/kafka/kafka.py CHANGED

@@ -157,7 +157,9 @@ def get_kafka_consumer(
     if CallableConsumerConfig.is_callable_config(connection.consumer_config):
         # As per documentation, we need to explicitly call the poll method to make sure OAuth callback gets executed
         # https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration
+        logger.debug("Initiating polling for kafka consumer")
         consumer.poll(timeout=30)
+        logger.debug("Initiated polling for kafka consumer")

     return consumer
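For context on why that poll() exists at all: with confluent-kafka, an OAUTHBEARER token callback (oauth_cb) is only invoked once the client polls. A minimal, illustrative sketch; the broker address, group id, and token values are placeholders:

from confluent_kafka import Consumer

def _oauth_cb(oauth_config):
    # Must return (token, expiry_epoch_seconds); these values are fake.
    return "fake-token", 9999999999.0

consumer = Consumer(
    {
        "bootstrap.servers": "localhost:9092",
        "group.id": "demo",
        "security.protocol": "SASL_PLAINTEXT",
        "sasl.mechanism": "OAUTHBEARER",
        "oauth_cb": _oauth_cb,
    }
)
consumer.poll(timeout=30)  # the first poll triggers _oauth_cb before any fetch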
datahub/ingestion/source/sigma/sigma.py CHANGED

@@ -4,7 +4,12 @@ from typing import Dict, Iterable, List, Optional
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import ConfigurationError
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import
+from datahub.emitter.mcp_builder import (
+    add_entity_to_container,
+    add_owner_to_entity_wu,
+    add_tags_to_entity_wu,
+    gen_containers,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -59,12 +64,14 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     UpstreamLineage,
 )
 from datahub.metadata.schema_classes import (
+    AuditStampClass,
     BrowsePathEntryClass,
     BrowsePathsV2Class,
     ChangeAuditStampsClass,
     ChartInfoClass,
     DashboardInfoClass,
     DataPlatformInstanceClass,
+    EdgeClass,
     GlobalTagsClass,
     InputFieldClass,
     InputFieldsClass,
@@ -74,6 +81,7 @@ from datahub.metadata.schema_classes import (
     SchemaFieldClass,
     SchemaFieldDataTypeClass,
     StringTypeClass,
+    SubTypesClass,
     TagAssociationClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
@@ -257,11 +265,6 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         entries = [
             BrowsePathEntryClass(id=parent_entity_urn, urn=parent_entity_urn)
         ] + [BrowsePathEntryClass(id=path) for path in paths]
-        if self.config.platform_instance:
-            urn = builder.make_dataplatform_instance_urn(
-                self.platform, self.config.platform_instance
-            )
-            entries = [BrowsePathEntryClass(id=urn, urn=urn)] + entries
         return MetadataChangeProposalWrapper(
             entityUrn=entity_urn,
             aspect=BrowsePathsV2Class(entries),
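After this removal the aspect is built from the parent container URN plus the plain path segments only. A minimal sketch mirroring the surviving code, with illustrative URN and path values:

from datahub.metadata.schema_classes import BrowsePathEntryClass, BrowsePathsV2Class

parent_entity_urn = "urn:li:container:workspace-key"  # illustrative
paths = ["My Workspace", "My Workbook"]               # illustrative
entries = [
    BrowsePathEntryClass(id=parent_entity_urn, urn=parent_entity_urn)
] + [BrowsePathEntryClass(id=path) for path in paths]
aspect = BrowsePathsV2Class(entries)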
@@ -424,11 +427,11 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         elements: List[Element],
         workbook: Workbook,
         all_input_fields: List[InputFieldClass],
+        paths: List[str],
     ) -> Iterable[MetadataWorkUnit]:
         """
         Map Sigma page element to Datahub Chart
         """
-
         for element in elements:
             chart_urn = builder.make_chart_urn(
                 platform=self.platform,
@@ -459,11 +462,14 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             ),
         ).as_workunit()

-
-
-
-
-
+            if workbook.workspaceId:
+                yield self._gen_entity_browsepath_aspect(
+                    entity_urn=chart_urn,
+                    parent_entity_urn=builder.make_container_urn(
+                        self._gen_workspace_key(workbook.workspaceId)
+                    ),
+                    paths=paths + [workbook.name],
+                )

         # Add sigma dataset's upstream dataset urn mapping
         for dataset_urn, upstream_dataset_urns in inputs.items():
@@ -494,7 +500,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):

         all_input_fields.extend(element_input_fields)

-    def _gen_pages_workunit(
+    def _gen_pages_workunit(
+        self, workbook: Workbook, paths: List[str]
+    ) -> Iterable[MetadataWorkUnit]:
         """
         Map Sigma workbook page to Datahub dashboard
         """
@@ -505,20 +513,23 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):

             yield self._gen_dashboard_info_workunit(page)

-            yield from add_entity_to_container(
-                container_key=self._gen_workbook_key(workbook.workbookId),
-                entity_type="dashboard",
-                entity_urn=dashboard_urn,
-            )
-
             dpi_aspect = self._gen_dataplatform_instance_aspect(dashboard_urn)
             if dpi_aspect:
                 yield dpi_aspect

             all_input_fields: List[InputFieldClass] = []

+            if workbook.workspaceId:
+                yield self._gen_entity_browsepath_aspect(
+                    entity_urn=dashboard_urn,
+                    parent_entity_urn=builder.make_container_urn(
+                        self._gen_workspace_key(workbook.workspaceId)
+                    ),
+                    paths=paths + [workbook.name],
+                )
+
             yield from self._gen_elements_workunit(
-                page.elements, workbook, all_input_fields
+                page.elements, workbook, all_input_fields, paths
             )

             yield MetadataChangeProposalWrapper(
@@ -531,42 +542,89 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         Map Sigma Workbook to Datahub container
         """
         owner_username = self.sigma_api.get_user_name(workbook.createdBy)
-
-
-
-
-
-
-
-
-
+
+        dashboard_urn = self._gen_dashboard_urn(workbook.workbookId)
+
+        yield self._gen_entity_status_aspect(dashboard_urn)
+
+        lastModified = AuditStampClass(
+            time=int(workbook.updatedAt.timestamp() * 1000),
+            actor="urn:li:corpuser:datahub",
+        )
+        created = AuditStampClass(
+            time=int(workbook.createdAt.timestamp() * 1000),
+            actor="urn:li:corpuser:datahub",
+        )
+
+        dashboard_info_cls = DashboardInfoClass(
+            title=workbook.name,
+            description=workbook.description if workbook.description else "",
+            dashboards=[
+                EdgeClass(
+                    destinationUrn=self._gen_dashboard_urn(page.get_urn_part()),
+                    sourceUrn=dashboard_urn,
+                )
+                for page in workbook.pages
+            ],
+            externalUrl=workbook.url,
+            lastModified=ChangeAuditStampsClass(
+                created=created, lastModified=lastModified
             ),
-
+            customProperties={
                 "path": workbook.path,
                 "latestVersion": str(workbook.latestVersion),
             },
-            owner_urn=(
-                builder.make_user_urn(owner_username)
-                if self.config.ingest_owner and owner_username
-                else None
-            ),
-            external_url=workbook.url,
-            tags=[workbook.badge] if workbook.badge else None,
-            created=int(workbook.createdAt.timestamp() * 1000),
-            last_modified=int(workbook.updatedAt.timestamp() * 1000),
         )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dashboard_urn, aspect=dashboard_info_cls
+        ).as_workunit()
+
+        # Set subtype
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dashboard_urn,
+            aspect=SubTypesClass(typeNames=[BIContainerSubTypes.SIGMA_WORKBOOK]),
+        ).as_workunit()
+
+        # Ownership
+        owner_urn = (
+            builder.make_user_urn(owner_username)
+            if self.config.ingest_owner and owner_username
+            else None
+        )
+        if owner_urn:
+            yield from add_owner_to_entity_wu(
+                entity_type="dashboard",
+                entity_urn=dashboard_urn,
+                owner_urn=owner_urn,
+            )
+
+        # Tags
+        tags = [workbook.badge] if workbook.badge else None
+        if tags:
+            yield from add_tags_to_entity_wu(
+                entity_type="dashboard",
+                entity_urn=dashboard_urn,
+                tags=sorted(tags),
+            )

         paths = workbook.path.split("/")[1:]
-        if
+        if workbook.workspaceId:
             yield self._gen_entity_browsepath_aspect(
-                entity_urn=
+                entity_urn=dashboard_urn,
                 parent_entity_urn=builder.make_container_urn(
                     self._gen_workspace_key(workbook.workspaceId)
                 ),
-                paths=paths,
+                paths=paths + [workbook.name],
             )

-
+        if len(paths) == 0:
+            yield from add_entity_to_container(
+                container_key=self._gen_workspace_key(workbook.workspaceId),
+                entity_type="dashboard",
+                entity_urn=dashboard_urn,
+            )
+
+        yield from self._gen_pages_workunit(workbook, paths)

     def _gen_sigma_dataset_upstream_lineage_workunit(
         self,
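The net effect of this hunk: a Sigma workbook is now emitted as a dashboard entity whose DashboardInfo links to its page-level dashboards via edges, rather than as a plain container. A small sketch of the edge construction with illustrative URNs:

from datahub.metadata.schema_classes import EdgeClass

workbook_urn = "urn:li:dashboard:(sigma,workbook-1)"  # illustrative
page_urns = [
    "urn:li:dashboard:(sigma,page-1)",
    "urn:li:dashboard:(sigma,page-2)",
]  # illustrative
edges = [EdgeClass(destinationUrn=urn, sourceUrn=workbook_urn) for urn in page_urns]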
datahub/ingestion/source/sql/mssql/source.py CHANGED

@@ -50,6 +50,7 @@ from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
     make_sqlalchemy_uri,
 )
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.metadata.schema_classes import (
     BooleanTypeClass,
     NumberTypeClass,
@@ -78,6 +79,11 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
     include_stored_procedures_code: bool = Field(
         default=True, description="Include information about object code."
     )
+    procedure_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for stored procedures to filter in ingestion."
+        "Specify regex to match the entire procedure name in database.schema.procedure_name format. e.g. to match all procedures starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*'",
+    )
     include_jobs: bool = Field(
         default=True,
         description="Include ingest of MSSQL Jobs. Requires access to the 'msdb' and 'sys' schema.",
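AllowDenyPattern is DataHub's standard allow/deny regex config, so the new procedure_pattern filters the same way as patterns elsewhere in the codebase. A minimal sketch with made-up procedure names:

from datahub.configuration.common import AllowDenyPattern

pattern = AllowDenyPattern(allow=[r"Customer\.public\.customer.*"])

for name in ["Customer.public.customer_load", "Sales.public.refresh_totals"]:
    print(name, pattern.allowed(name))
# Customer.public.customer_load True
# Sales.public.refresh_totals False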
@@ -164,6 +170,8 @@ class SQLServerSource(SQLAlchemySource):
     If you do use pyodbc, make sure to change the source type from `mssql` to `mssql-odbc` so that we pull in the right set of dependencies. This will be needed in most cases where encryption is required, such as managed SQL Server services in Azure.
     """

+    report: SQLSourceReport
+
     def __init__(self, config: SQLServerConfig, ctx: PipelineContext):
         super().__init__(config, ctx, "mssql")
         # Cache the table and column descriptions
@@ -416,10 +424,16 @@ class SQLServerSource(SQLAlchemySource):
         data_flow = MSSQLDataFlow(entity=mssql_default_job)
         with inspector.engine.connect() as conn:
             procedures_data_list = self._get_stored_procedures(conn, db_name, schema)
-            procedures = [
-
-
-
+            procedures: List[StoredProcedure] = []
+            for procedure_data in procedures_data_list:
+                procedure_full_name = f"{db_name}.{schema}.{procedure_data['name']}"
+                if not self.config.procedure_pattern.allowed(procedure_full_name):
+                    self.report.report_dropped(procedure_full_name)
+                    continue
+                procedures.append(
+                    StoredProcedure(flow=mssql_default_job, **procedure_data)
+                )
+
         if procedures:
             yield from self.construct_flow_workunits(data_flow=data_flow)
             for procedure in procedures:
{acryl_datahub-0.14.1.13rc7.dist-info → acryl_datahub-0.14.1.13rc9.dist-info}/WHEEL RENAMED
File without changes

{acryl_datahub-0.14.1.13rc7.dist-info → acryl_datahub-0.14.1.13rc9.dist-info}/entry_points.txt RENAMED
File without changes

{acryl_datahub-0.14.1.13rc7.dist-info → acryl_datahub-0.14.1.13rc9.dist-info}/top_level.txt RENAMED
File without changes