acryl-datahub 0.14.1.13rc7__py3-none-any.whl → 0.14.1.13rc9__py3-none-any.whl


@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=iJWtbGzvLO2Ab_1AQTeXndrbJHFOcpOmE_RHQzAvOEo,577
+datahub/__init__.py,sha256=Qii_Ygk-5SdmcNm5lZWzqq41uil0D4isccAfgzqcyu8,577
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -189,8 +189,8 @@ datahub/ingestion/source/demo_data.py,sha256=yzA_R-wfSX2WPz0i5ukYlscpmpb0Pt8D7Ek
 datahub/ingestion/source/elastic_search.py,sha256=qFUVNzynTVJTabASTjGMu8Qhf9UpNbEtSBFjaPQjBJE,22641
 datahub/ingestion/source/feast.py,sha256=NYaAjzLVRhmMKDawBwN0OL8AMyKDLsxOwEj3YFX0wIA,14244
 datahub/ingestion/source/file.py,sha256=pH-Qkjh5FQ2XvyYPE7Z8XEY4vUk_SUHxm8p8IxG12tU,15879
-datahub/ingestion/source/ge_data_profiler.py,sha256=jORUlsmN2XtHm3QyltENvhEyt-CwbF2O548mFxtisxY,63587
-datahub/ingestion/source/ge_profiling_config.py,sha256=E65adlsUrs17mQB7WQnoe3QCjvbGaGoNNPMf8szNK6s,10648
+datahub/ingestion/source/ge_data_profiler.py,sha256=JqTonv8y7Re4Rfn2YKOEaLufiiAOWKfK1XQvJfV5dvs,64126
+datahub/ingestion/source/ge_profiling_config.py,sha256=P-9pd20koFvpxeEL_pqFvKWWz-qnpZ6XkELUyBKr7is,10807
 datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suabri1yV5geaAPjzg2eORDc,2559
 datahub/ingestion/source/ldap.py,sha256=Vnzg8tpwBYeyM-KBVVsUJvGZGBMJiCJ_i_FhxaFRQ9A,18627
 datahub/ingestion/source/metabase.py,sha256=oemiMdzjfr82Hx6rdwTNBzFM8962LDkosYh7SD_I5cY,31717
@@ -240,12 +240,12 @@ datahub/ingestion/source/bigquery_v2/bigquery_helper.py,sha256=QER3gY8e_k1_eNVj7
 datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py,sha256=8nuQ8hMuJEswWDZtV2RjbK8RvDJUzT_S74dnyPpGFdQ,4857
 datahub/ingestion/source/bigquery_v2/bigquery_queries.py,sha256=EoHo9twb0_QdX7Nvd1HJC1Yn0rqtrfR52EVk7Hu3XOQ,3296
 datahub/ingestion/source/bigquery_v2/bigquery_report.py,sha256=WxiLPFc7LwZXNDYfV9oySUD43kc2GcOf_pUokp3vFNM,8098
-datahub/ingestion/source/bigquery_v2/bigquery_schema.py,sha256=QQk2xnyLCpywwRA3a2Pm95zJd0LgJUGbe5ht-5yadmQ,32352
-datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py,sha256=JkQqG8GrMnsp6efUCSv1Efc0ZUmdC6q5_M6wWIyg_dQ,51774
+datahub/ingestion/source/bigquery_v2/bigquery_schema.py,sha256=tyKPfxg88ERX6z7Q5lCmJS-H7cgsWRUwM2epqe62bI0,32277
+datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py,sha256=Sv6BrK62nu3xpgjYGE-x1xdSTouvvnKDJtazPobhiKQ,50813
 datahub/ingestion/source/bigquery_v2/bigquery_test_connection.py,sha256=cATxwi5IPzj3BldRRAVcLqzSFmmYEPvqa7U0RFJbaAc,7645
 datahub/ingestion/source/bigquery_v2/common.py,sha256=Cxjf1a8ibkL_YRQeS0BqsjlyMgFJpaZ3iq_d7e8T8MQ,4030
 datahub/ingestion/source/bigquery_v2/lineage.py,sha256=Jg_pwnaj7l_KEcgq0enJXwrKh5jyUfBl4YB05YpkIVg,45415
-datahub/ingestion/source/bigquery_v2/profiler.py,sha256=ddNnPsc9NaraWYy4HctL0y83QtsyiiHx7zE9DgY8hTQ,11140
+datahub/ingestion/source/bigquery_v2/profiler.py,sha256=8-yAoq8sX0E6VIwr75YbM8wITRNhGfxgte9BCeGNkMM,10681
 datahub/ingestion/source/bigquery_v2/queries.py,sha256=B2vJLZYfwM1J5JAckijKJTxLhDYA0yw3kfzj5oRQB5c,20151
 datahub/ingestion/source/bigquery_v2/queries_extractor.py,sha256=xLf-vCUAnNuDdTHghxJvPOyGeA_XLCW3r-xj-8cfn3Q,19528
 datahub/ingestion/source/bigquery_v2/usage.py,sha256=xHb-gsugshoyzwScOQ5DHxZhoA-K0e4EbfSeVTLs428,40543
@@ -319,7 +319,7 @@ datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
 datahub/ingestion/source/identity/azure_ad.py,sha256=GdmJFD4UMsb5353Z7phXRf-YsXR2woGLRJwBXUkgXq0,28809
 datahub/ingestion/source/identity/okta.py,sha256=PnRokWLG8wSoNZlXJiRZiW6APTEHO09q4n2j_l6m3V0,30756
 datahub/ingestion/source/kafka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/kafka/kafka.py,sha256=wC5tZ31fvXWLjse9qH3YONPhMxXIZ9QV9vbBcWDAP_w,25352
+datahub/ingestion/source/kafka/kafka.py,sha256=D54QXYoEQJYVb7oWyvLVCGbJunS8ZTGYlMMdaBqpmMs,25475
 datahub/ingestion/source/kafka/kafka_connect.py,sha256=5KUlhn3876c41Z3kx5l4oJhbu0ekXZQRdxmu52vb_v8,55167
 datahub/ingestion/source/kafka/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
 datahub/ingestion/source/looker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -410,8 +410,8 @@ datahub/ingestion/source/schema_inference/object.py,sha256=Aibf4dY4dXb4P9zfcNGnI
 datahub/ingestion/source/schema_inference/parquet.py,sha256=CdqsNuiabLLCulWbuPMssijeFmKLv3M5MKFIhlatpWA,3456
 datahub/ingestion/source/sigma/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/sigma/config.py,sha256=zGh0ZU2Ty5NHfNXAVwFxVkK4NlsNSxtAyfCgMJJvzdc,3795
-datahub/ingestion/source/sigma/data_classes.py,sha256=kaSuWs0TaDCCsyRbHaKq8LknE5I42VOJ4hQXt5SKFg0,1974
-datahub/ingestion/source/sigma/sigma.py,sha256=PgFoSj0O309iL8kMT4jGnN9sTTtpM6kJoCLa84vckPg,22412
+datahub/ingestion/source/sigma/data_classes.py,sha256=YZkkzwftV34mq5c_4jlC2PCSiRKt4hvHjmqikLQhl1I,2012
+datahub/ingestion/source/sigma/sigma.py,sha256=DZCxMgIx-uJp6poFG85KFygRCyFDkkM4KXtBbUsgkHk,24094
 datahub/ingestion/source/sigma/sigma_api.py,sha256=jUkKbtqX3eRdfriruJE5KFT9aM7DCbvhmwfPzveYIiM,17814
 datahub/ingestion/source/slack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/slack/slack.py,sha256=C_3iXUS72h7HALhBW_AIyi3nNOqzyh7Ogflr-qI5ZEE,12946
@@ -464,7 +464,7 @@ datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=YDrGBb5WKVls6qv17QU5f
 datahub/ingestion/source/sql/vertica.py,sha256=_9OgSgIgqBml0av063rb8nACiT3SAmzpw0ouyF91wv8,33382
 datahub/ingestion/source/sql/mssql/__init__.py,sha256=1agpl8S_uDW40olkhCX_W19dbr5GO9qgjS3R7pLRZSk,87
 datahub/ingestion/source/sql/mssql/job_models.py,sha256=eMyR0Efl5kvi7QNgNXzd5_6PdDKYly_552Y8OGSj9PY,6012
-datahub/ingestion/source/sql/mssql/source.py,sha256=fzpWjwexGvJgpd6Z4DCsK6Ld2vQCfPkD2M1xE4pU9Ec,29542
+datahub/ingestion/source/sql/mssql/source.py,sha256=lOfvNUGfxLmgCAops2jwm1OTBvJxgBivHcEj-9DEaAE,30390
 datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py,sha256=RpnvKPalAAaOD_eUg8bZ4VkGTSeLFWuy0mefwc4s3x8,2837
 datahub/ingestion/source/state/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/state/checkpoint.py,sha256=x9Xww-MIFXSKjeg1tOZXE72LehCm5OfKy3HfucgIRWM,8833
@@ -971,8 +971,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.14.1.13rc7.dist-info/METADATA,sha256=GLfCmMbHHo7KoYo_AKEO6D8Cty0VP2obvuJ5sSSclMo,171138
-acryl_datahub-0.14.1.13rc7.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-acryl_datahub-0.14.1.13rc7.dist-info/entry_points.txt,sha256=VcQx0dnqaYLyeY_L5OaX7bLmmE-Il7TAXkxCKvEn2bA,9432
-acryl_datahub-0.14.1.13rc7.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
-acryl_datahub-0.14.1.13rc7.dist-info/RECORD,,
+acryl_datahub-0.14.1.13rc9.dist-info/METADATA,sha256=CvAHjZi0HfaRrZKhjpu6Rp_n-3c2oYSPb8A54uw9zNQ,171138
+acryl_datahub-0.14.1.13rc9.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.14.1.13rc9.dist-info/entry_points.txt,sha256=VcQx0dnqaYLyeY_L5OaX7bLmmE-Il7TAXkxCKvEn2bA,9432
+acryl_datahub-0.14.1.13rc9.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.14.1.13rc9.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
 # Published at https://pypi.org/project/acryl-datahub/.
 __package_name__ = "acryl-datahub"
-__version__ = "0.14.1.13rc7"
+__version__ = "0.14.1.13rc9"
 
 
 def is_dev_mode() -> bool:
datahub/ingestion/source/bigquery_v2/bigquery_schema.py CHANGED
@@ -118,7 +118,6 @@ class BigqueryTable(BaseTable):
     active_billable_bytes: Optional[int] = None
     long_term_billable_bytes: Optional[int] = None
     partition_info: Optional[PartitionInfo] = None
-    columns_ignore_from_profiling: List[str] = field(default_factory=list)
     external: bool = False
     constraints: List[BigqueryTableConstraint] = field(default_factory=list)
     table_type: Optional[str] = None
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py CHANGED
@@ -598,18 +598,6 @@ class BigQuerySchemaGenerator:
             dataset_name=dataset_name,
         )
 
-    # This method generates the ignore list for datatypes the profiler doesn't support; we have to do it here
-    # because the profiler doesn't have access to columns
-    def generate_profile_ignore_list(self, columns: List[BigqueryColumn]) -> List[str]:
-        ignore_list: List[str] = []
-        for column in columns:
-            if not column.data_type or any(
-                word in column.data_type.lower()
-                for word in ["array", "struct", "geography", "json"]
-            ):
-                ignore_list.append(column.field_path)
-        return ignore_list
-
     def _process_table(
         self,
         table: BigqueryTable,
@@ -631,15 +619,6 @@ class BigQuerySchemaGenerator:
         )
         table.column_count = len(columns)
 
-        # We only collect the profile ignore list if profiling is enabled and profile_table_level_only is false
-        if (
-            self.config.is_profiling_enabled()
-            and not self.config.profiling.profile_table_level_only
-        ):
-            table.columns_ignore_from_profiling = self.generate_profile_ignore_list(
-                columns
-            )
-
         if not table.column_count:
             logger.warning(
                 f"Table doesn't have any column or unable to get columns for table: {table_identifier}"
datahub/ingestion/source/bigquery_v2/profiler.py CHANGED
@@ -166,12 +166,6 @@ WHERE
         normalized_table_name = BigqueryTableIdentifier(
             project_id=project_id, dataset=dataset, table=table.name
         ).get_table_name()
-        for column in table.columns_ignore_from_profiling:
-            # The profiler has issues with complex types (array, struct, geography, json), so we deny those types from profiling
-            # We also filter columns without a data type, as that means the column is part of a complex type.
-            self.config.profile_pattern.deny.append(
-                f"^{normalized_table_name}.{column}$"
-            )
 
         if table.external and not self.config.profiling.profile_external_tables:
             self.report.profiling_skipped_other[f"{project_id}.{dataset}"] += 1
datahub/ingestion/source/ge_data_profiler.py CHANGED
@@ -7,6 +7,7 @@ import dataclasses
 import functools
 import json
 import logging
+import re
 import threading
 import traceback
 import unittest.mock
@@ -123,6 +124,8 @@ ProfilerTypeMapping.BINARY_TYPE_NAMES.append("LargeBinary")
 
 _datasource_connection_injection_lock = threading.Lock()
 
+NORMALIZE_TYPE_PATTERN = re.compile(r"^(.*?)(?:[\[<(].*)?$")
+
 
 @contextlib.contextmanager
 def _inject_connection_into_datasource(conn: Connection) -> Iterator[None]:
@@ -165,11 +168,9 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
         return convert_to_json_serializable(element_values.fetchone()[0])
     elif self.engine.dialect.name.lower() == BIGQUERY:
         element_values = self.engine.execute(
-            sa.select(
-                [
-                    sa.func.coalesce(sa.text(f"APPROX_COUNT_DISTINCT(`{column}`)")),
-                ]
-            ).select_from(self._table)
+            sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
+                self._table
+            )
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
     elif self.engine.dialect.name.lower() == SNOWFLAKE:
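The rewritten BigQuery branch above swaps a hand-built SQL text fragment for SQLAlchemy's expression API, so identifier quoting is left to the dialect instead of manual backticks. A minimal standalone sketch of what the new expression produces (table and column names are illustrative; assumes the SQLAlchemy 1.4+ `select()` call style used in the diff):

```python
import sqlalchemy as sa

# sa.func generates the APPROX_COUNT_DISTINCT call generically, and
# sa.column() lets the dialect handle identifier quoting.
stmt = sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column("user_id"))).select_from(
    sa.table("events")
)

# Renders roughly as: SELECT APPROX_COUNT_DISTINCT(user_id) AS ... FROM events
print(stmt)
```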
@@ -378,6 +379,9 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
                 f"{self.dataset_name}.{col}"
             ):
                 ignored_columns_by_pattern.append(col)
+            # We try to ignore nested columns as well
+            elif not self.config.profile_nested_fields and "." in col:
+                ignored_columns_by_pattern.append(col)
             elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
                 ignored_columns_by_type.append(col)
             else:
@@ -407,9 +411,18 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         return columns_to_profile
 
     def _should_ignore_column(self, sqlalchemy_type: sa.types.TypeEngine) -> bool:
-        return str(sqlalchemy_type) in _get_column_types_to_ignore(
-            self.dataset.engine.dialect.name
-        )
+        # We don't profile columns with None types
+        if str(sqlalchemy_type) == "NULL":
+            return True
+
+        sql_type = str(sqlalchemy_type)
+
+        match = re.match(NORMALIZE_TYPE_PATTERN, sql_type)
+
+        if match:
+            sql_type = match.group(1)
+
+        return sql_type in _get_column_types_to_ignore(self.dataset.engine.dialect.name)
 
     @_run_with_query_combiner
     def _get_column_type(self, column_spec: _SingleColumnSpec, column: str) -> None:
@@ -1397,6 +1410,8 @@ class DatahubGEProfiler:
 def _get_column_types_to_ignore(dialect_name: str) -> List[str]:
     if dialect_name.lower() == POSTGRESQL:
         return ["JSON"]
+    elif dialect_name.lower() == BIGQUERY:
+        return ["ARRAY", "STRUCT", "GEOGRAPHY", "JSON"]
 
     return []
 
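Taken together, `NORMALIZE_TYPE_PATTERN` and the new BigQuery branch of `_get_column_types_to_ignore` replace the per-table deny-list plumbing removed from `bigquery_schema_gen.py` and `profiler.py` above: a column's type string is reduced to its base name before the ignore-list lookup, so parametrized complex types are caught directly in the profiler. A standalone sketch of the normalization (inputs are illustrative):

```python
import re

# Same pattern as the new module-level constant: capture the base type name
# and drop any "[...]", "<...>", or "(...)" parametrization that follows it.
NORMALIZE_TYPE_PATTERN = re.compile(r"^(.*?)(?:[\[<(].*)?$")

for sql_type in ["ARRAY<STRING>", "STRUCT<a INT64>", "NUMERIC(10, 2)", "JSON"]:
    match = NORMALIZE_TYPE_PATTERN.match(sql_type)
    base = match.group(1) if match else sql_type
    print(f"{sql_type!r} -> {base!r}")
# 'ARRAY<STRING>' -> 'ARRAY'
# 'STRUCT<a INT64>' -> 'STRUCT'
# 'NUMERIC(10, 2)' -> 'NUMERIC'
# 'JSON' -> 'JSON'
```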
datahub/ingestion/source/ge_profiling_config.py CHANGED
@@ -188,6 +188,11 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         ),
     )
 
+    profile_nested_fields: bool = Field(
+        default=False,
+        description="Whether to profile complex types like structs, arrays and maps.",
+    )
+
     @pydantic.root_validator(pre=True)
     def deprecate_bigquery_temp_table_schema(cls, values):
         # TODO: Update docs to remove mention of this field.
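The new flag pairs with the nested-column branch added to `ge_data_profiler.py` above: while it stays at its default of `False`, any column whose name contains a dot (a nested field) is skipped. A hedged sketch of toggling it, assuming the pydantic v1 `parse_obj` API these configs use:

```python
from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig

# Setting profile_nested_fields=True lets nested struct fields through to
# the profiler instead of landing in ignored_columns_by_pattern.
config = GEProfilingConfig.parse_obj({"profile_nested_fields": True})
assert config.profile_nested_fields is True
```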
datahub/ingestion/source/kafka/kafka.py CHANGED
@@ -157,7 +157,9 @@ def get_kafka_consumer(
    if CallableConsumerConfig.is_callable_config(connection.consumer_config):
        # As per documentation, we need to explicitly call the poll method to make sure OAuth callback gets executed
        # https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration
+       logger.debug("Initiating polling for kafka consumer")
        consumer.poll(timeout=30)
+       logger.debug("Initiated polling for kafka consumer")
 
    return consumer
 
datahub/ingestion/source/sigma/data_classes.py CHANGED
@@ -80,6 +80,7 @@ class Workbook(BaseModel):
     path: str
     latestVersion: int
     workspaceId: Optional[str] = None
+    description: Optional[str] = None
     pages: List[Page] = []
     badge: Optional[str] = None
 
datahub/ingestion/source/sigma/sigma.py CHANGED
@@ -4,7 +4,12 @@ from typing import Dict, Iterable, List, Optional
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import ConfigurationError
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import add_entity_to_container, gen_containers
+from datahub.emitter.mcp_builder import (
+    add_entity_to_container,
+    add_owner_to_entity_wu,
+    add_tags_to_entity_wu,
+    gen_containers,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -59,12 +64,14 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     UpstreamLineage,
 )
 from datahub.metadata.schema_classes import (
+    AuditStampClass,
     BrowsePathEntryClass,
     BrowsePathsV2Class,
     ChangeAuditStampsClass,
     ChartInfoClass,
     DashboardInfoClass,
     DataPlatformInstanceClass,
+    EdgeClass,
     GlobalTagsClass,
     InputFieldClass,
     InputFieldsClass,
@@ -74,6 +81,7 @@ from datahub.metadata.schema_classes import (
     SchemaFieldClass,
     SchemaFieldDataTypeClass,
     StringTypeClass,
+    SubTypesClass,
     TagAssociationClass,
 )
 from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result
@@ -257,11 +265,6 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         entries = [
             BrowsePathEntryClass(id=parent_entity_urn, urn=parent_entity_urn)
         ] + [BrowsePathEntryClass(id=path) for path in paths]
-        if self.config.platform_instance:
-            urn = builder.make_dataplatform_instance_urn(
-                self.platform, self.config.platform_instance
-            )
-            entries = [BrowsePathEntryClass(id=urn, urn=urn)] + entries
         return MetadataChangeProposalWrapper(
             entityUrn=entity_urn,
             aspect=BrowsePathsV2Class(entries),
@@ -424,11 +427,11 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         elements: List[Element],
         workbook: Workbook,
         all_input_fields: List[InputFieldClass],
+        paths: List[str],
     ) -> Iterable[MetadataWorkUnit]:
         """
         Map Sigma page element to Datahub Chart
         """
-
         for element in elements:
             chart_urn = builder.make_chart_urn(
                 platform=self.platform,
@@ -459,11 +462,14 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
                 ),
             ).as_workunit()
 
-            yield from add_entity_to_container(
-                container_key=self._gen_workbook_key(workbook.workbookId),
-                entity_type="chart",
-                entity_urn=chart_urn,
-            )
+            if workbook.workspaceId:
+                yield self._gen_entity_browsepath_aspect(
+                    entity_urn=chart_urn,
+                    parent_entity_urn=builder.make_container_urn(
+                        self._gen_workspace_key(workbook.workspaceId)
+                    ),
+                    paths=paths + [workbook.name],
+                )
 
         # Add sigma dataset's upstream dataset urn mapping
         for dataset_urn, upstream_dataset_urns in inputs.items():
@@ -494,7 +500,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
 
                 all_input_fields.extend(element_input_fields)
 
-    def _gen_pages_workunit(self, workbook: Workbook) -> Iterable[MetadataWorkUnit]:
+    def _gen_pages_workunit(
+        self, workbook: Workbook, paths: List[str]
+    ) -> Iterable[MetadataWorkUnit]:
         """
         Map Sigma workbook page to Datahub dashboard
         """
@@ -505,20 +513,23 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
 
             yield self._gen_dashboard_info_workunit(page)
 
-            yield from add_entity_to_container(
-                container_key=self._gen_workbook_key(workbook.workbookId),
-                entity_type="dashboard",
-                entity_urn=dashboard_urn,
-            )
-
             dpi_aspect = self._gen_dataplatform_instance_aspect(dashboard_urn)
             if dpi_aspect:
                 yield dpi_aspect
 
             all_input_fields: List[InputFieldClass] = []
 
+            if workbook.workspaceId:
+                yield self._gen_entity_browsepath_aspect(
+                    entity_urn=dashboard_urn,
+                    parent_entity_urn=builder.make_container_urn(
+                        self._gen_workspace_key(workbook.workspaceId)
+                    ),
+                    paths=paths + [workbook.name],
+                )
+
             yield from self._gen_elements_workunit(
-                page.elements, workbook, all_input_fields
+                page.elements, workbook, all_input_fields, paths
             )
 
             yield MetadataChangeProposalWrapper(
@@ -531,42 +542,89 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         Map Sigma Workbook to Datahub container
         """
         owner_username = self.sigma_api.get_user_name(workbook.createdBy)
-        workbook_key = self._gen_workbook_key(workbook.workbookId)
-        yield from gen_containers(
-            container_key=workbook_key,
-            name=workbook.name,
-            sub_types=[BIContainerSubTypes.SIGMA_WORKBOOK],
-            parent_container_key=(
-                self._gen_workspace_key(workbook.workspaceId)
-                if workbook.workspaceId
-                else None
+
+        dashboard_urn = self._gen_dashboard_urn(workbook.workbookId)
+
+        yield self._gen_entity_status_aspect(dashboard_urn)
+
+        lastModified = AuditStampClass(
+            time=int(workbook.updatedAt.timestamp() * 1000),
+            actor="urn:li:corpuser:datahub",
+        )
+        created = AuditStampClass(
+            time=int(workbook.createdAt.timestamp() * 1000),
+            actor="urn:li:corpuser:datahub",
+        )
+
+        dashboard_info_cls = DashboardInfoClass(
+            title=workbook.name,
+            description=workbook.description if workbook.description else "",
+            dashboards=[
+                EdgeClass(
+                    destinationUrn=self._gen_dashboard_urn(page.get_urn_part()),
+                    sourceUrn=dashboard_urn,
+                )
+                for page in workbook.pages
+            ],
+            externalUrl=workbook.url,
+            lastModified=ChangeAuditStampsClass(
+                created=created, lastModified=lastModified
             ),
-            extra_properties={
+            customProperties={
                 "path": workbook.path,
                 "latestVersion": str(workbook.latestVersion),
             },
-            owner_urn=(
-                builder.make_user_urn(owner_username)
-                if self.config.ingest_owner and owner_username
-                else None
-            ),
-            external_url=workbook.url,
-            tags=[workbook.badge] if workbook.badge else None,
-            created=int(workbook.createdAt.timestamp() * 1000),
-            last_modified=int(workbook.updatedAt.timestamp() * 1000),
         )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dashboard_urn, aspect=dashboard_info_cls
+        ).as_workunit()
+
+        # Set subtype
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dashboard_urn,
+            aspect=SubTypesClass(typeNames=[BIContainerSubTypes.SIGMA_WORKBOOK]),
+        ).as_workunit()
+
+        # Ownership
+        owner_urn = (
+            builder.make_user_urn(owner_username)
+            if self.config.ingest_owner and owner_username
+            else None
+        )
+        if owner_urn:
+            yield from add_owner_to_entity_wu(
+                entity_type="dashboard",
+                entity_urn=dashboard_urn,
+                owner_urn=owner_urn,
+            )
+
+        # Tags
+        tags = [workbook.badge] if workbook.badge else None
+        if tags:
+            yield from add_tags_to_entity_wu(
+                entity_type="dashboard",
+                entity_urn=dashboard_urn,
+                tags=sorted(tags),
+            )
 
         paths = workbook.path.split("/")[1:]
-        if len(paths) > 0 and workbook.workspaceId:
+        if workbook.workspaceId:
             yield self._gen_entity_browsepath_aspect(
-                entity_urn=builder.make_container_urn(workbook_key),
+                entity_urn=dashboard_urn,
                 parent_entity_urn=builder.make_container_urn(
                     self._gen_workspace_key(workbook.workspaceId)
                 ),
-                paths=paths,
+                paths=paths + [workbook.name],
             )
 
-        yield from self._gen_pages_workunit(workbook)
+            if len(paths) == 0:
+                yield from add_entity_to_container(
+                    container_key=self._gen_workspace_key(workbook.workspaceId),
+                    entity_type="dashboard",
+                    entity_urn=dashboard_urn,
+                )
+
+        yield from self._gen_pages_workunit(workbook, paths)
 
     def _gen_sigma_dataset_upstream_lineage_workunit(
         self,
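The net effect of the hunks above: a Sigma workbook is now emitted as a DataHub dashboard whose `dashboards` field carries edges to its page-level dashboards, with subtype, ownership, and tag aspects yielded separately, rather than as a container from `gen_containers`. A hedged, standalone sketch of the new aspect shape; every URN and value below is illustrative:

```python
from datahub.metadata.schema_classes import (
    AuditStampClass,
    ChangeAuditStampsClass,
    DashboardInfoClass,
    EdgeClass,
)

stamp = AuditStampClass(time=1700000000000, actor="urn:li:corpuser:datahub")
workbook_urn = "urn:li:dashboard:(sigma,workbook-1)"  # illustrative URN

# The workbook-level dashboard points at its pages via Edge objects.
info = DashboardInfoClass(
    title="Sales Workbook",
    description="",
    lastModified=ChangeAuditStampsClass(created=stamp, lastModified=stamp),
    dashboards=[
        EdgeClass(
            destinationUrn="urn:li:dashboard:(sigma,page-1)",
            sourceUrn=workbook_urn,
        )
    ],
    customProperties={"path": "My Documents/Sales", "latestVersion": "3"},
    externalUrl="https://app.sigmacomputing.com/workbook/workbook-1",
)
```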
datahub/ingestion/source/sql/mssql/source.py CHANGED
@@ -50,6 +50,7 @@ from datahub.ingestion.source.sql.sql_config import (
     BasicSQLAlchemyConfig,
     make_sqlalchemy_uri,
 )
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.metadata.schema_classes import (
     BooleanTypeClass,
     NumberTypeClass,
@@ -78,6 +79,11 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
     include_stored_procedures_code: bool = Field(
         default=True, description="Include information about object code."
     )
+    procedure_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for stored procedures to filter in ingestion. "
+        "Specify a regex that matches the entire procedure name in database.schema.procedure_name format, e.g. to match all procedures starting with 'customer' in the Customer database and public schema, use the regex 'Customer.public.customer.*'.",
+    )
     include_jobs: bool = Field(
         default=True,
         description="Include ingest of MSSQL Jobs. Requires access to the 'msdb' and 'sys' schema.",
@@ -164,6 +170,8 @@ class SQLServerSource(SQLAlchemySource):
     If you do use pyodbc, make sure to change the source type from `mssql` to `mssql-odbc` so that we pull in the right set of dependencies. This will be needed in most cases where encryption is required, such as managed SQL Server services in Azure.
     """
 
+    report: SQLSourceReport
+
     def __init__(self, config: SQLServerConfig, ctx: PipelineContext):
         super().__init__(config, ctx, "mssql")
         # Cache the table and column descriptions
@@ -416,10 +424,16 @@
         data_flow = MSSQLDataFlow(entity=mssql_default_job)
         with inspector.engine.connect() as conn:
             procedures_data_list = self._get_stored_procedures(conn, db_name, schema)
-            procedures = [
-                StoredProcedure(flow=mssql_default_job, **procedure_data)
-                for procedure_data in procedures_data_list
-            ]
+            procedures: List[StoredProcedure] = []
+            for procedure_data in procedures_data_list:
+                procedure_full_name = f"{db_name}.{schema}.{procedure_data['name']}"
+                if not self.config.procedure_pattern.allowed(procedure_full_name):
+                    self.report.report_dropped(procedure_full_name)
+                    continue
+                procedures.append(
+                    StoredProcedure(flow=mssql_default_job, **procedure_data)
+                )
+
             if procedures:
                 yield from self.construct_flow_workunits(data_flow=data_flow)
             for procedure in procedures: