acryl-datahub 1.2.0.5rc1__py3-none-any.whl → 1.2.0.6rc1__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

@@ -1,7 +1,7 @@
- acryl_datahub-1.2.0.5rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.2.0.6rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=KSlHYPz315hvUomSrL-xCymHj7Nx74tIb2-JrBfK6_s,323
+ datahub/_version.py,sha256=UpTxFRBPZj7BDFXx0obVG8eaUPk2TF8H9bqlPMVnt2A,323
  datahub/entrypoints.py,sha256=9Qf-37rNnTzbGlx8S75OCDazIclFp6zWNcCEL1zCZto,9015
  datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -148,7 +148,7 @@ datahub/ingestion/api/incremental_properties_helper.py,sha256=KzdxdrQtaMV2XMHfPs
  datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py,sha256=3lLdkkxVqE9MVc26cdXImPeWy16az5BwgcorWxeBV50,1759
  datahub/ingestion/api/pipeline_run_listener.py,sha256=5uBP__LbMQxJ2utlf07cIzQINqPbUOKiZyOJta6a0og,713
  datahub/ingestion/api/registry.py,sha256=LbdZr89465Lj7ptQRVB4vI1JR1igWABvQFj9-WX63bI,7454
- datahub/ingestion/api/report.py,sha256=jdAfqkAn1tlwBJBmC_t0-6KzKhKBxO8N3y_NziyBXSU,17889
+ datahub/ingestion/api/report.py,sha256=-xduHhIRUgf5G51mUb3uTi6GBxVli6ZK25AS5ikXuII,18312
  datahub/ingestion/api/report_helpers.py,sha256=WbUC1kQeaKqIagGV3XzfPmPs7slAT1mfNY4og2BH2A8,994
  datahub/ingestion/api/sink.py,sha256=GZt48PV56FAhNoma-V5EwwRZvezhb40YH_zprm8_Yo0,4961
  datahub/ingestion/api/source.py,sha256=uf0fNbiOy0bS_aKFOcNv6NvuZe0LSDIDdNza9hraP7s,21857
@@ -400,7 +400,7 @@ datahub/ingestion/source/metadata/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
  datahub/ingestion/source/metadata/business_glossary.py,sha256=T_RJHst6iQRghJNmLLPeSBMEDsbEKf3yBldOAgMcGuo,19666
  datahub/ingestion/source/metadata/lineage.py,sha256=PA4JwSeQ-30XFMN4O5tPwIu-hZF1e-xMZ_CnEUE2c-Q,9595
  datahub/ingestion/source/mock_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/mock_data/datahub_mock_data.py,sha256=jVJjqOzBN9JteLxcFRklg2eG9RmVdEozCxFYPArzjX8,19361
+ datahub/ingestion/source/mock_data/datahub_mock_data.py,sha256=3i3SdBp267cZRszhmD_JWJLTGIot2FI8REFpjJQ4jD8,19822
  datahub/ingestion/source/mock_data/datahub_mock_data_report.py,sha256=sV_H7JgcuVbrpIBqtGse_BBigMdqP32ZXuanpeXmwVI,331
  datahub/ingestion/source/mock_data/table_naming_helper.py,sha256=zJtEBSJGDvVr-kiKjK7LbHAifK3sfE786M3yO--Bn2o,3493
  datahub/ingestion/source/neo4j/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -481,12 +481,12 @@ datahub/ingestion/source/snowflake/constants.py,sha256=prPRPVgDajTeEbPhXB6a9gVW5
  datahub/ingestion/source/snowflake/oauth_config.py,sha256=ol9D3RmruGStJAeL8PYSQguSqcD2HfkjPkMF2AB_eZs,1277
  datahub/ingestion/source/snowflake/oauth_generator.py,sha256=fu2VnREGuJXeTqIV2jx4TwieVnznf83HQkrE0h2DGGM,3423
  datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81xxdeizJn9nJCZ_nMIXgk9N6pEk5o,4803
- datahub/ingestion/source/snowflake/snowflake_config.py,sha256=6uR7eZDmEfLpfs_QYNJGTTZI6N2VOvjqXKmG_7c6-9Q,22291
+ datahub/ingestion/source/snowflake/snowflake_config.py,sha256=3oakxSKMoZRw6J-15mIIXzutypvdCnwPZG3wvKEobh0,22825
  datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=3-nP3HHCblUnUHYo_fvFp5VOAteCtR4GNjaUEvyNTNQ,18175
  datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
  datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=a2vDWZNthV3AqD3Y_Rd4lHSZbaiyuEc--WXWcJqcV0k,21711
  datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=PmQi-qDlRhdJ-PsJ7x-EScIiswWRAxDDOKHydvN3mTY,7404
- datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=deJwFf9Bg3bphMi6WNfPon59hFSLYJZ4ByQA4pUkTkU,42926
+ datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=Oie4qRiKfqyn80nwr2c0unjdOnOV-GSefgR9gV0B1tI,44865
  datahub/ingestion/source/snowflake/snowflake_query.py,sha256=iMIbT5iQzy6mxPLNkjbup_C6sLa2482K-QI7vaVZ8Ac,39677
  datahub/ingestion/source/snowflake/snowflake_report.py,sha256=O-465aBA8uaYZ6WepP7i6cgK6Q1jXJPjDA1j9C8klus,6762
  datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=x1gN8dmZZlNOANju0RnGQWqtBpm5UaNJDlGHBFL2LsE,33963
@@ -496,7 +496,7 @@ datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=WJfsP8w3HceUkM6GK
  datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=eA9xh-G1Ydr1OwUUtrbXUWp26hE1jF0zvyKNky_i_nQ,8887
  datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=mM0v9b4PHRJAT-SdRids3wdzc5O96gWCCww3e42itV8,24982
  datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=kL9W_PrJuP8QPgCe9dSjWkMvDzW2EcJciLQRSvrkGiI,14346
- datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=lUSAAZ3FzJEgLpLYfoLzCigMegbS5RClTg4Dhd4WxRI,35334
+ datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=bIjbw6oI3zyGSBxhLmtpylE1rc2nyzE02xgvME2g0qQ,35421
  datahub/ingestion/source/snowflake/stored_proc_lineage.py,sha256=rOb78iHiWiK8v8WdVs1xDwVut4Y0OHmszej6IopQfCo,5341
  datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/sql/athena.py,sha256=TPKwL9oRiZlVnqIsOSBWUEwyvoW-1ssXvY4PfjxOR6g,28175
@@ -1104,8 +1104,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.2.0.5rc1.dist-info/METADATA,sha256=vAftVMr-wWAhe89fUtDvsCm9BjFdT92jn4Ea4-cSPV4,182056
- acryl_datahub-1.2.0.5rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- acryl_datahub-1.2.0.5rc1.dist-info/entry_points.txt,sha256=bnGf6eX9UhiW8yVHtt6MJCVcmLErvrVQxTJAayA-PKc,9885
- acryl_datahub-1.2.0.5rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.2.0.5rc1.dist-info/RECORD,,
+ acryl_datahub-1.2.0.6rc1.dist-info/METADATA,sha256=d_JiiaPjg1a5n4u__cmZ1gBZ7FpY1qf6flPFgzZM2D8,182056
+ acryl_datahub-1.2.0.6rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ acryl_datahub-1.2.0.6rc1.dist-info/entry_points.txt,sha256=bnGf6eX9UhiW8yVHtt6MJCVcmLErvrVQxTJAayA-PKc,9885
+ acryl_datahub-1.2.0.6rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.2.0.6rc1.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.2.0.5rc1"
+ __version__ = "1.2.0.6rc1"


  def is_dev_mode() -> bool:
datahub/ingestion/api/report.py CHANGED
@@ -23,6 +23,7 @@ from datahub.ingestion.autogenerated.lineage_helper import is_lineage_aspect
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
  from datahub.metadata.schema_classes import (
      MetadataChangeProposalClass,
+     StatusClass,
      SubTypesClass,
      UpstreamLineageClass,
  )
@@ -161,6 +162,7 @@ class SourceReportSubtypes:
      entity_type: str
      subType: str = field(default="unknown")
      aspects: Dict[str, int] = field(default_factory=dict)
+     soft_deleted: bool = field(default=False)


  class ReportAttribute(BaseModel):
@@ -217,6 +219,7 @@ class ExamplesReport(Report, Closeable):
                "entityType": lambda val: val.entity_type,
                "subTypes": lambda val: val.subType,
                "aspects": lambda val: json.dumps(val.aspects),
+               "soft_deleted": lambda val: val.soft_deleted,
            },
        )

@@ -376,6 +379,13 @@ class ExamplesReport(Report, Closeable):
                subType=sub_type,
                aspects=aspects_dict,
            )
+         if (
+             isinstance(mcp.aspect, StatusClass)
+             and mcp is not None
+             and mcp.aspect is not None
+         ):
+             self._file_based_dict[urn].soft_deleted = mcp.aspect.removed
+             self._file_based_dict.mark_dirty(urn)

      def _store_workunit_data(self, wu: MetadataWorkUnit) -> None:
          urn = wu.get_urn()
@@ -401,7 +411,8 @@ class ExamplesReport(Report, Closeable):
          query = """
          SELECT entityType, subTypes, aspects, count(*) as count
          FROM urn_aspects
-         group by entityType, subTypes, aspects
+         WHERE soft_deleted = 0
+         GROUP BY entityType, subTypes, aspects
          """

          entity_subtype_aspect_counts: Dict[str, Dict[str, Dict[str, int]]] = (
@@ -410,6 +421,7 @@ class ExamplesReport(Report, Closeable):
          entity_subtype_aspect_counts_exist: Dict[str, Dict[str, Dict[str, int]]] = (
              defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
          )
+
          for row in self._file_based_dict.sql_query(query):
              entity_type = row["entityType"]
              sub_type = row["subTypes"]
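The aggregation hunk above now filters on the new soft_deleted column before grouping, so soft-deleted entities no longer contribute to the example counts. A toy sqlite3 sketch of that effect (the in-memory database and sample rows are illustrative; in the package the urn_aspects table lives inside a FileBackedDict):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE urn_aspects (entityType TEXT, subTypes TEXT, aspects TEXT, soft_deleted INTEGER)"
)
conn.executemany(
    "INSERT INTO urn_aspects VALUES (?, ?, ?, ?)",
    [
        ("dataset", "Table", '{"status": 1}', 0),
        ("dataset", "Table", '{"status": 1}', 1),  # soft-deleted entity
    ],
)
rows = conn.execute(
    """
    SELECT entityType, subTypes, aspects, count(*) as count
    FROM urn_aspects
    WHERE soft_deleted = 0
    GROUP BY entityType, subTypes, aspects
    """
).fetchall()
print(rows)  # [('dataset', 'Table', '{"status": 1}', 1)] -- the soft-deleted row is excluded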
datahub/ingestion/source/mock_data/datahub_mock_data.py CHANGED
@@ -1,5 +1,5 @@
  import logging
- from typing import Dict, Iterable, List, Optional, Tuple
+ from typing import Dict, Iterable, List, Optional, Tuple, Union

  from pydantic import Field

@@ -14,6 +14,7 @@ from datahub.ingestion.api.decorators import (
      support_status,
  )
  from datahub.ingestion.api.source import Source, SourceReport, StructuredLogCategory
+ from datahub.ingestion.api.source_helpers import AutoSystemMetadata, auto_workunit
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.common.subtypes import DatasetSubTypes
  from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
@@ -31,6 +32,7 @@ from datahub.metadata.schema_classes import (
      UpstreamClass,
      UpstreamLineageClass,
  )
+ from datahub.sdk.entity import Entity
  from datahub.utilities.str_enum import StrEnum

  logger = logging.getLogger(__name__)
@@ -165,6 +167,14 @@ class DataHubMockDataSource(Source):
          self.report = DataHubMockDataReport()

      def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+         workunit_processors = [AutoSystemMetadata(self.ctx).stamp]
+         return self._apply_workunit_processors(
+             workunit_processors, auto_workunit(self.get_workunits_internal())
+         )
+
+     def get_workunits_internal(
+         self,
+     ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]]:
          # We don't want any implicit aspects to be produced
          # so we are not using get_workunits_internal

datahub/ingestion/source/snowflake/snowflake_config.py CHANGED
@@ -356,11 +356,18 @@ class SnowflakeV2Config(

      pushdown_deny_usernames: List[str] = Field(
          default=[],
-         description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
+         description="List of snowflake usernames (SQL LIKE patterns, e.g., 'SERVICE_%', '%_PROD', 'TEST_USER') which will NOT be considered for lineage/usage/queries extraction. "
          "This is primarily useful for improving performance by filtering out users with extremely high query volumes. "
          "Only applicable if `use_queries_v2` is enabled.",
      )

+     pushdown_allow_usernames: List[str] = Field(
+         default=[],
+         description="List of snowflake usernames (SQL LIKE patterns, e.g., 'ANALYST_%', '%_USER', 'MAIN_ACCOUNT') which WILL be considered for lineage/usage/queries extraction. "
+         "This is primarily useful for improving performance by filtering in only specific users. "
+         "Only applicable if `use_queries_v2` is enabled. If not specified, all users not in deny list are included.",
+     )
+
      push_down_database_pattern_access_history: bool = Field(
          default=False,
          description="If enabled, pushes down database pattern filtering to the access_history table for improved performance. "
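For illustration, a minimal sketch of how the new option might sit alongside the existing deny list in a Snowflake source configuration. It is shown as a Python dict rather than a YAML recipe; the account value and patterns are placeholders, and use_queries_v2 is included only because the field descriptions above require it.

# Hypothetical configuration fragment: field names follow the diff above, values are illustrative.
snowflake_source_config = {
    "account_id": "example_account",  # placeholder
    "use_queries_v2": True,  # both pushdown options apply only with queries v2
    "pushdown_deny_usernames": ["SERVICE_%", "%_ETL"],  # exclude high-volume service users
    "pushdown_allow_usernames": ["ANALYST_%", "MAIN_ACCOUNT"],  # only consider these users
}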
datahub/ingestion/source/snowflake/snowflake_queries.py CHANGED
@@ -89,10 +89,17 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):

      pushdown_deny_usernames: List[str] = pydantic.Field(
          default=[],
-         description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
+         description="List of snowflake usernames (SQL LIKE patterns, e.g., 'SERVICE_%', '%_PROD', 'TEST_USER') which will NOT be considered for lineage/usage/queries extraction. "
          "This is primarily useful for improving performance by filtering out users with extremely high query volumes.",
      )

+     pushdown_allow_usernames: List[str] = pydantic.Field(
+         default=[],
+         description="List of snowflake usernames (SQL LIKE patterns, e.g., 'ANALYST_%', '%_USER', 'MAIN_ACCOUNT') which WILL be considered for lineage/usage/queries extraction. "
+         "This is primarily useful for improving performance by filtering in only specific users. "
+         "If not specified, all users not in deny list are included.",
+     )
+
      user_email_pattern: AllowDenyPattern = pydantic.Field(
          default=AllowDenyPattern.allow_all(),
          description="Regex patterns for user emails to filter in usage.",
@@ -396,6 +403,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
              end_time=self.config.window.end_time,
              bucket_duration=self.config.window.bucket_duration,
              deny_usernames=self.config.pushdown_deny_usernames,
+             allow_usernames=self.config.pushdown_allow_usernames,
              dedup_strategy=self.config.query_dedup_strategy,
              database_pattern=self.filters.filter_config.database_pattern
              if self.config.push_down_database_pattern_access_history
@@ -740,7 +748,8 @@ class QueryLogQueryBuilder:
          start_time: datetime,
          end_time: datetime,
          bucket_duration: BucketDuration,
-         deny_usernames: Optional[List[str]],
+         deny_usernames: Optional[List[str]] = None,
+         allow_usernames: Optional[List[str]] = None,
          max_tables_per_query: int = 20,
          dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD,
          database_pattern: Optional[AllowDenyPattern] = None,
@@ -753,10 +762,7 @@ class QueryLogQueryBuilder:
          self.max_tables_per_query = max_tables_per_query
          self.dedup_strategy = dedup_strategy

-         self.users_filter = "TRUE"
-         if deny_usernames:
-             user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
-             self.users_filter = f"user_name NOT IN ({user_not_in})"
+         self.users_filter = self._build_user_filter(deny_usernames, allow_usernames)

          self.access_history_database_filter = (
              self._build_access_history_database_filter_condition(
@@ -767,6 +773,43 @@ class QueryLogQueryBuilder:
          self.time_bucket_size = bucket_duration.value
          assert self.time_bucket_size in ("HOUR", "DAY", "MONTH")

+     def _build_user_filter(
+         self,
+         deny_usernames: Optional[List[str]] = None,
+         allow_usernames: Optional[List[str]] = None,
+     ) -> str:
+         """
+         Build user filter SQL condition based on deny and allow username patterns.
+
+         Args:
+             deny_usernames: List of username patterns to exclude (SQL LIKE patterns)
+             allow_usernames: List of username patterns to include (SQL LIKE patterns)
+
+         Returns:
+             SQL WHERE condition string for filtering users
+         """
+         user_filters = []
+
+         if deny_usernames:
+             deny_conditions = []
+             for pattern in deny_usernames:
+                 # Escape single quotes for SQL safety
+                 escaped_pattern = pattern.replace("'", "''")
+                 deny_conditions.append(f"user_name NOT ILIKE '{escaped_pattern}'")
+             if deny_conditions:
+                 user_filters.append(f"({' AND '.join(deny_conditions)})")
+
+         if allow_usernames:
+             allow_conditions = []
+             for pattern in allow_usernames:
+                 # Escape single quotes for SQL safety
+                 escaped_pattern = pattern.replace("'", "''")
+                 allow_conditions.append(f"user_name ILIKE '{escaped_pattern}'")
+             if allow_conditions:
+                 user_filters.append(f"({' OR '.join(allow_conditions)})")
+
+         return " AND ".join(user_filters) if user_filters else "TRUE"
+
      def _build_access_history_database_filter_condition(
          self,
          database_pattern: Optional[AllowDenyPattern],
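To make the combined behavior concrete, here is a minimal standalone sketch of the same deny/allow logic shown in the hunk above. The free function build_user_filter and the sample patterns are illustrative, not part of the package; the escaping and ILIKE handling mirror _build_user_filter.

from typing import List, Optional


def build_user_filter(
    deny_usernames: Optional[List[str]] = None,
    allow_usernames: Optional[List[str]] = None,
) -> str:
    """Standalone mirror of the diff's _build_user_filter, for illustration only."""
    user_filters = []
    if deny_usernames:
        # Every deny pattern must fail to match: AND of NOT ILIKE conditions.
        conditions = []
        for pattern in deny_usernames:
            escaped = pattern.replace("'", "''")  # escape single quotes for SQL safety
            conditions.append(f"user_name NOT ILIKE '{escaped}'")
        user_filters.append(f"({' AND '.join(conditions)})")
    if allow_usernames:
        # At least one allow pattern must match: OR of ILIKE conditions.
        conditions = []
        for pattern in allow_usernames:
            escaped = pattern.replace("'", "''")
            conditions.append(f"user_name ILIKE '{escaped}'")
        user_filters.append(f"({' OR '.join(conditions)})")
    return " AND ".join(user_filters) if user_filters else "TRUE"


print(build_user_filter(["SERVICE_%"], ["ANALYST_%", "MAIN_ACCOUNT"]))
# (user_name NOT ILIKE 'SERVICE_%') AND (user_name ILIKE 'ANALYST_%' OR user_name ILIKE 'MAIN_ACCOUNT')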
datahub/ingestion/source/snowflake/snowflake_v2.py CHANGED
@@ -602,6 +602,7 @@ class SnowflakeV2Source(
              include_query_usage_statistics=self.config.include_query_usage_statistics,
              user_email_pattern=self.config.user_email_pattern,
              pushdown_deny_usernames=self.config.pushdown_deny_usernames,
+             pushdown_allow_usernames=self.config.pushdown_allow_usernames,
              query_dedup_strategy=self.config.query_dedup_strategy,
              push_down_database_pattern_access_history=self.config.push_down_database_pattern_access_history,
              additional_database_names_allowlist=self.config.additional_database_names_allowlist,
  additional_database_names_allowlist=self.config.additional_database_names_allowlist,