acryl-datahub 1.2.0.4rc4__py3-none-any.whl → 1.2.0.5rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (41):
  1. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/METADATA +2631 -2631
  2. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/RECORD +41 -39
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/common/serialized_value.py +1 -1
  5. datahub/api/entities/external/external_entities.py +500 -15
  6. datahub/ingestion/source/aws/glue.py +18 -14
  7. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  8. datahub/ingestion/source/aws/tag_entities.py +82 -104
  9. datahub/ingestion/source/common/subtypes.py +1 -0
  10. datahub/ingestion/source/hex/api.py +2 -0
  11. datahub/ingestion/source/hex/mapper.py +16 -2
  12. datahub/ingestion/source/hex/model.py +2 -0
  13. datahub/ingestion/source/looker/looker_common.py +26 -0
  14. datahub/ingestion/source/mock_data/datahub_mock_data.py +11 -1
  15. datahub/ingestion/source/snowflake/constants.py +1 -0
  16. datahub/ingestion/source/snowflake/snowflake_config.py +8 -1
  17. datahub/ingestion/source/snowflake/snowflake_queries.py +49 -6
  18. datahub/ingestion/source/snowflake/snowflake_query.py +50 -5
  19. datahub/ingestion/source/snowflake/snowflake_schema.py +173 -9
  20. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -2
  21. datahub/ingestion/source/snowflake/snowflake_utils.py +16 -3
  22. datahub/ingestion/source/snowflake/snowflake_v2.py +4 -1
  23. datahub/ingestion/source/sql/mssql/source.py +2 -25
  24. datahub/ingestion/source/sql/mysql.py +54 -0
  25. datahub/ingestion/source/sql/postgres.py +5 -134
  26. datahub/ingestion/source/sql/sql_common.py +137 -0
  27. datahub/ingestion/source/superset.py +140 -56
  28. datahub/ingestion/source/unity/config.py +11 -0
  29. datahub/ingestion/source/unity/connection_test.py +1 -0
  30. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  31. datahub/ingestion/source/unity/proxy.py +20 -6
  32. datahub/ingestion/source/unity/report.py +9 -1
  33. datahub/ingestion/source/unity/source.py +51 -16
  34. datahub/ingestion/source/unity/tag_entities.py +49 -147
  35. datahub/metadata/_internal_schema_classes.py +1 -1
  36. datahub/metadata/schema.avsc +4 -2
  37. datahub/metadata/schemas/Operation.avsc +4 -2
  38. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/WHEEL +0 -0
  39. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/entry_points.txt +0 -0
  40. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/licenses/LICENSE +0 -0
  41. {acryl_datahub-1.2.0.4rc4.dist-info → acryl_datahub-1.2.0.5rc2.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/aws/tag_entities.py
@@ -1,5 +1,10 @@
 import logging
-from typing import List, Optional
+from typing import TYPE_CHECKING, List, Optional
+
+if TYPE_CHECKING:
+    from datahub.ingestion.source.aws.platform_resource_repository import (
+        GluePlatformResourceRepository,
+    )
 
 from pydantic import BaseModel
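Worth noting on the hunk above: the repository import moves under TYPE_CHECKING, so it is only evaluated by static type checkers and cannot create a circular import at runtime; the later hunks therefore refer to the class through the quoted annotation "GluePlatformResourceRepository". A minimal, self-contained sketch of the same pattern (module and class names here are illustrative, not from this package):

    # a.py -- illustrative only
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Only imported while type checking; nothing is imported at runtime,
        # so "a imports b, b imports a" cycles are avoided.
        from b import Repository

    def describe(repo: "Repository") -> str:
        # The quoted annotation is resolved lazily, so this works even though
        # Repository is never imported when the module actually runs.
        return type(repo).__name__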
 
@@ -7,7 +12,6 @@ from datahub.api.entities.external.external_entities import (
     ExternalEntity,
     ExternalEntityId,
     LinkedResourceSet,
-    PlatformResourceRepository,
 )
 from datahub.api.entities.external.lake_formation_external_entites import (
     LakeFormationTag,
@@ -15,10 +19,8 @@ from datahub.api.entities.external.lake_formation_external_entites import (
 from datahub.api.entities.platformresource.platform_resource import (
     PlatformResource,
     PlatformResourceKey,
-    PlatformResourceSearchFields,
 )
 from datahub.metadata.urns import TagUrn
-from datahub.utilities.search_utils import ElasticDocumentQuery
 from datahub.utilities.urns.urn import Urn
 
 logger = logging.getLogger(__name__)
@@ -29,8 +31,12 @@ class LakeFormationTagSyncContext(BaseModel):
     platform_instance: Optional[str] = None
     catalog: Optional[str] = None
 
+    # Making it compatible with SyncContext interface
+    def get_platform_instance(self) -> Optional[str]:
+        return self.platform_instance
+
 
-class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
+class LakeFormationTagPlatformResourceId(ExternalEntityId):
     """
     A LakeFormationTag is a unique identifier for a Lakeformation tag.
     """
@@ -42,9 +48,6 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
     exists_in_lake_formation: bool = False
     persisted: bool = False
 
-    def __hash__(self) -> int:
-        return hash(self.to_platform_resource_key().id)
-
     # this is a hack to make sure the property is a string and not private pydantic field
     @staticmethod
     def _RESOURCE_TYPE() -> str:
@@ -61,24 +64,26 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
         )
 
     @classmethod
-    def from_tag(
+    def get_or_create_from_tag(
         cls,
         tag: LakeFormationTag,
-        platform_instance: Optional[str],
-        platform_resource_repository: PlatformResourceRepository,
-        catalog: Optional[str] = None,
+        platform_resource_repository: "GluePlatformResourceRepository",
         exists_in_lake_formation: bool = False,
+        catalog_id: Optional[str] = None,
     ) -> "LakeFormationTagPlatformResourceId":
         """
        Creates a LakeFormationTagPlatformResourceId from a LakeFormationTag.
        """
 
+        # Use catalog_id if provided, otherwise fall back to repository catalog
+        effective_catalog = catalog_id or platform_resource_repository.catalog
+
         existing_platform_resource = cls.search_by_urn(
             tag.to_datahub_tag_urn().urn(),
             platform_resource_repository=platform_resource_repository,
             tag_sync_context=LakeFormationTagSyncContext(
-                platform_instance=platform_instance,
-                catalog=catalog,
+                platform_instance=platform_resource_repository.platform_instance,
+                catalog=effective_catalog,
             ),
         )
         if existing_platform_resource:
@@ -90,9 +95,9 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
         return LakeFormationTagPlatformResourceId(
             tag_key=str(tag.key),
             tag_value=str(tag.value) if tag.value is not None else None,
-            platform_instance=platform_instance,
+            platform_instance=platform_resource_repository.platform_instance,
+            catalog=effective_catalog,
             exists_in_lake_formation=exists_in_lake_formation,
-            catalog=catalog,
             persisted=False,
         )
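The rename from from_tag to get_or_create_from_tag also changes the call shape: platform instance and catalog now come from the repository, with catalog_id as an optional override. A hedged before/after sketch of a call site (the tag and repo objects are assumed; this is not copied from the package's actual callers):

    # 1.2.0.4rc4: caller supplied platform_instance and catalog explicitly.
    tag_id = LakeFormationTagPlatformResourceId.from_tag(
        tag=tag,
        platform_instance="prod-glue",      # illustrative value
        platform_resource_repository=repo,
        catalog="123456789012",             # illustrative value
    )

    # 1.2.0.5rc2: the repository carries platform_instance and catalog;
    # catalog_id is only an optional override of repo.catalog.
    tag_id = LakeFormationTagPlatformResourceId.get_or_create_from_tag(
        tag=tag,
        platform_resource_repository=repo,
    )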
 
@@ -100,64 +105,48 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
     def search_by_urn(
         cls,
         urn: str,
-        platform_resource_repository: PlatformResourceRepository,
+        platform_resource_repository: "GluePlatformResourceRepository",
         tag_sync_context: LakeFormationTagSyncContext,
     ) -> Optional["LakeFormationTagPlatformResourceId"]:
-        mapped_tags = [
-            t
-            for t in platform_resource_repository.search_by_filter(
-                ElasticDocumentQuery.create_from(
-                    (
-                        PlatformResourceSearchFields.RESOURCE_TYPE,
-                        str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
-                    ),
-                    (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
+        """
+        Search for existing Lake Formation tag entity by URN using repository caching.
+
+        This method now delegates to the repository's search_entity_by_urn method to ensure
+        consistent caching behavior across all platform implementations.
+        """
+        # Use repository's cached search method instead of duplicating search logic
+        existing_entity_id = platform_resource_repository.search_entity_by_urn(urn)
+
+        if existing_entity_id:
+            # Verify platform instance and catalog match
+            if (
+                existing_entity_id.platform_instance
+                == tag_sync_context.platform_instance
+                and existing_entity_id.catalog == tag_sync_context.catalog
+            ):
+                logger.info(
+                    f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_entity_id}"
                 )
-            )
-        ]
+                # Create a new ID with the correct state instead of mutating
+                return LakeFormationTagPlatformResourceId(
+                    tag_key=existing_entity_id.tag_key,
+                    tag_value=existing_entity_id.tag_value,
+                    platform_instance=existing_entity_id.platform_instance,
+                    catalog=existing_entity_id.catalog,
+                    exists_in_lake_formation=True,  # This tag exists in Lake Formation
+                    persisted=True,  # And it's persisted in DataHub
+                )
+
         logger.info(
-            f"Found {len(mapped_tags)} mapped tags for URN {urn}. {mapped_tags}"
+            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
         )
-        if len(mapped_tags) > 0:
-            for platform_resource in mapped_tags:
-                if (
-                    platform_resource.resource_info
-                    and platform_resource.resource_info.value
-                ):
-                    lake_formation_tag_platform_resource = (
-                        LakeFormationTagPlatformResource(
-                            **platform_resource.resource_info.value.as_pydantic_object(
-                                LakeFormationTagPlatformResource
-                            ).dict()
-                        )
-                    )
-                    if (
-                        lake_formation_tag_platform_resource.id.platform_instance
-                        == tag_sync_context.platform_instance
-                        and lake_formation_tag_platform_resource.id.catalog
-                        == tag_sync_context.catalog
-                    ):
-                        lake_formation_tag_id = lake_formation_tag_platform_resource.id
-                        lake_formation_tag_id.exists_in_lake_formation = True
-                        lake_formation_tag_id.persisted = True
-                        return lake_formation_tag_id
-                else:
-                    logger.warning(
-                        f"Platform resource {platform_resource} does not have a resource_info value"
-                    )
-                    continue
-
-        # If we reach here, it means we did not find a mapped tag for the URN
-        logger.info(
-            f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
-        )
         return None
 
     @classmethod
     def from_datahub_urn(
         cls,
         urn: str,
-        platform_resource_repository: PlatformResourceRepository,
+        platform_resource_repository: "GluePlatformResourceRepository",
         tag_sync_context: LakeFormationTagSyncContext,
     ) -> "LakeFormationTagPlatformResourceId":
         """
@@ -188,11 +177,17 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
            logger.info(
                f"Tag {new_tag_id} already exists in platform resource repository with {resource_key}"
            )
-            new_tag_id.exists_in_lake_formation = (
-                True  # TODO: Check if this is a safe assumption
+            # Create a new ID with the correct state instead of mutating
+            return LakeFormationTagPlatformResourceId(
+                tag_key=new_tag_id.tag_key,
+                tag_value=new_tag_id.tag_value,
+                platform_instance=new_tag_id.platform_instance,
+                catalog=new_tag_id.catalog,
+                exists_in_lake_formation=True,  # This tag exists in Lake Formation
+                persisted=new_tag_id.persisted,
            )
            return new_tag_id
-        raise ValueError(f"Unable to create SnowflakeTagId from DataHub URN: {urn}")
+        raise ValueError(f"Unable to create LakeFormationTagId from DataHub URN: {urn}")
 
     @classmethod
     def generate_tag_id(
@@ -223,7 +218,7 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
         )
 
 
-class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
+class LakeFormationTagPlatformResource(ExternalEntity):
     datahub_urns: LinkedResourceSet
     managed_by_datahub: bool
     id: LakeFormationTagPlatformResourceId
@@ -246,46 +241,29 @@ class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
         )
 
     @classmethod
-    def get_from_datahub(
+    def create_default(
         cls,
-        lake_formation_tag_id: LakeFormationTagPlatformResourceId,
-        platform_resource_repository: PlatformResourceRepository,
-        managed_by_datahub: bool = False,
+        entity_id: ExternalEntityId,
+        managed_by_datahub: bool,
     ) -> "LakeFormationTagPlatformResource":
-        # Search for linked DataHub URNs
-        platform_resources = [
-            r
-            for r in platform_resource_repository.search_by_filter(
-                ElasticDocumentQuery.create_from(
-                    (
-                        PlatformResourceSearchFields.RESOURCE_TYPE,
-                        str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
-                    ),
-                    (
-                        PlatformResourceSearchFields.PRIMARY_KEY,
-                        f"{lake_formation_tag_id.tag_key}/{lake_formation_tag_id.tag_value}",
-                    ),
-                )
-            )
-        ]
-        for platform_resource in platform_resources:
-            if (
-                platform_resource.resource_info
-                and platform_resource.resource_info.value
-            ):
-                lf_tag = LakeFormationTagPlatformResource(
-                    **platform_resource.resource_info.value.as_pydantic_object(
-                        LakeFormationTagPlatformResource
-                    ).dict()
-                )
-                if (
-                    lf_tag.id.platform_instance
-                    == lake_formation_tag_id.platform_instance
-                    and lf_tag.id.catalog == lake_formation_tag_id.catalog
-                ):
-                    return lf_tag
+        """Create a default Lake Formation tag entity when none found in DataHub."""
+        # Type narrowing: we know this will be a LakeFormationTagPlatformResourceId
+        assert isinstance(entity_id, LakeFormationTagPlatformResourceId), (
+            f"Expected LakeFormationTagPlatformResourceId, got {type(entity_id)}"
+        )
+
+        # Create a new entity ID with correct default state instead of mutating
+        default_entity_id = LakeFormationTagPlatformResourceId(
+            tag_key=entity_id.tag_key,
+            tag_value=entity_id.tag_value,
+            platform_instance=entity_id.platform_instance,
+            catalog=entity_id.catalog,
+            exists_in_lake_formation=False,  # New entities don't exist in Lake Formation yet
+            persisted=False,  # New entities are not persisted yet
+        )
+
         return cls(
-            id=lake_formation_tag_id,
+            id=default_entity_id,
             datahub_urns=LinkedResourceSet(urns=[]),
             managed_by_datahub=managed_by_datahub,
             allowed_values=None,
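get_from_datahub's DataHub search moves into the repository layer, and what remains here is a create_default factory for the case where nothing was found. A short illustrative call, assuming a tag_id obtained earlier (for example from get_or_create_from_tag):

    entity = LakeFormationTagPlatformResource.create_default(
        entity_id=tag_id,
        managed_by_datahub=False,
    )
    # Per the hunk above: no linked DataHub URNs yet, and the id is rebuilt with
    # exists_in_lake_formation=False and persisted=False rather than mutated.
    assert entity.datahub_urns.urns == []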

datahub/ingestion/source/common/subtypes.py
@@ -30,6 +30,7 @@ class DatasetSubTypes(StrEnum):
     NEO4J_NODE = "Neo4j Node"
     NEO4J_RELATIONSHIP = "Neo4j Relationship"
     SNOWFLAKE_STREAM = "Snowflake Stream"
+    DYNAMIC_TABLE = "Dynamic Table"
     API_ENDPOINT = "API Endpoint"
     SLACK_CHANNEL = "Slack Channel"
     PROJECTIONS = "Projections"

datahub/ingestion/source/hex/api.py
@@ -375,6 +375,7 @@ class HexApi:
             description=hex_item.description,
             created_at=hex_item.created_at,
             last_edited_at=hex_item.last_edited_at,
+            last_published_at=hex_item.last_published_at,
             status=status,
             categories=categories,
             collections=collections,
@@ -389,6 +390,7 @@ class HexApi:
             description=hex_item.description,
             created_at=hex_item.created_at,
             last_edited_at=hex_item.last_edited_at,
+            last_published_at=hex_item.last_published_at,
             status=status,
             categories=categories,
             collections=collections,

datahub/ingestion/source/hex/mapper.py
@@ -122,7 +122,7 @@ class Mapper:
             lastModified=self._change_audit_stamps(
                 created_at=project.created_at, last_edited_at=project.last_edited_at
             ),
-            externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{project.id}",
+            externalUrl=self._get_project_or_component_external_url(project),
             customProperties=dict(id=project.id),
             datasetEdges=self._dataset_edges(project.upstream_datasets),
             # TODO: support schema field upstream, maybe InputFields?
@@ -173,7 +173,7 @@ class Mapper:
             lastModified=self._change_audit_stamps(
                 created_at=component.created_at, last_edited_at=component.last_edited_at
             ),
-            externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{component.id}",
+            externalUrl=self._get_project_or_component_external_url(component),
             customProperties=dict(id=component.id),
         )
 
@@ -242,6 +242,20 @@ class Mapper:
         assert isinstance(dashboard_urn, DashboardUrn)
         return dashboard_urn
 
+    def _get_project_or_component_external_url(
+        self,
+        project_or_component: Union[Project, Component],
+    ) -> Optional[str]:
+        if project_or_component.last_published_at is None:
+            return (
+                f"{self._base_url}/{self._workspace_name}/hex/{project_or_component.id}"
+            )
+        else:
+            # published Projects/Components have a different URL that everybody, not just editors, can access
+            return (
+                f"{self._base_url}/{self._workspace_name}/app/{project_or_component.id}"
+            )
+
     def _change_audit_stamps(
         self, created_at: Optional[datetime], last_edited_at: Optional[datetime]
     ) -> ChangeAuditStampsClass:
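The new _get_project_or_component_external_url keeps the editor-only /hex/ URL for drafts and switches to the generally accessible /app/ URL once last_published_at is set. A standalone sketch of the same branching (base URL and workspace name are placeholders):

    from datetime import datetime
    from typing import Optional

    def external_url(base_url: str, workspace: str, item_id: str,
                     last_published_at: Optional[datetime]) -> str:
        # Mirrors the mapper change: drafts keep the editor URL, published
        # projects/components get the URL that non-editors can open too.
        if last_published_at is None:
            return f"{base_url}/{workspace}/hex/{item_id}"
        return f"{base_url}/{workspace}/app/{item_id}"

    print(external_url("https://app.hex.tech", "acme", "abc123", None))
    # https://app.hex.tech/acme/hex/abc123
    print(external_url("https://app.hex.tech", "acme", "abc123", datetime(2024, 1, 1)))
    # https://app.hex.tech/acme/app/abc123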

datahub/ingestion/source/hex/model.py
@@ -46,6 +46,7 @@ class Project:
     title: str
     description: Optional[str]
     last_edited_at: Optional[datetime] = None
+    last_published_at: Optional[datetime] = None
     created_at: Optional[datetime] = None
     status: Optional[Status] = None
     categories: Optional[List[Category]] = None  # TODO: emit category description!
@@ -67,6 +68,7 @@ class Component:
     title: str
     description: Optional[str]
     last_edited_at: Optional[datetime] = None
+    last_published_at: Optional[datetime] = None
     created_at: Optional[datetime] = None
     status: Optional[Status] = None
     categories: Optional[List[Category]] = None

datahub/ingestion/source/looker/looker_common.py
@@ -379,6 +379,14 @@ class ExploreUpstreamViewField:
                : -(len(self.field.field_group_variant.lower()) + 1)
            ]
 
+        # Validate that field_name is not empty to prevent invalid schema field URNs
+        if not field_name or not field_name.strip():
+            logger.warning(
+                f"Empty field name detected for field '{self.field.name}' in explore '{self.explore.name}'. "
+                f"Skipping field to prevent invalid schema field URN generation."
+            )
+            return None
+
         assert view_name  # for lint false positive
 
         project_include: ProjectInclude = ProjectInclude(
@@ -1351,7 +1359,25 @@ class LookerExplore:
         fine_grained_lineages = []
         if config.extract_column_level_lineage:
             for field in self.fields or []:
+                # Skip creating fine-grained lineage for empty field names to prevent invalid schema field URNs
+                if not field.name or not field.name.strip():
+                    logger.warning(
+                        f"Skipping fine-grained lineage for field with empty name in explore '{self.name}'"
+                    )
+                    continue
+
                 for upstream_column_ref in field.upstream_fields:
+                    # Skip creating fine-grained lineage for empty column names to prevent invalid schema field URNs
+                    if (
+                        not upstream_column_ref.column
+                        or not upstream_column_ref.column.strip()
+                    ):
+                        logger.warning(
+                            f"Skipping some fine-grained lineage for field '{field.name}' in explore '{self.name}' "
+                            f"due to empty upstream column name in table '{upstream_column_ref.table}'"
+                        )
+                        continue
+
                     fine_grained_lineages.append(
                         FineGrainedLineageClass(
                             upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,

datahub/ingestion/source/mock_data/datahub_mock_data.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple, Union
 
 from pydantic import Field
 
@@ -14,6 +14,7 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import Source, SourceReport, StructuredLogCategory
+from datahub.ingestion.api.source_helpers import AutoSystemMetadata, auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
@@ -31,6 +32,7 @@ from datahub.metadata.schema_classes import (
     UpstreamClass,
     UpstreamLineageClass,
 )
+from datahub.sdk.entity import Entity
 from datahub.utilities.str_enum import StrEnum
 
 logger = logging.getLogger(__name__)
@@ -165,6 +167,14 @@ class DataHubMockDataSource(Source):
         self.report = DataHubMockDataReport()
 
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        workunit_processors = [AutoSystemMetadata(self.ctx).stamp]
+        return self._apply_workunit_processors(
+            workunit_processors, auto_workunit(self.get_workunits_internal())
+        )
+
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]]:
         # We don't want any implicit aspects to be produced
         # so we are not using get_workunits_internal
 

datahub/ingestion/source/snowflake/constants.py
@@ -55,6 +55,7 @@ class SnowflakeObjectDomain(StrEnum):
     ICEBERG_TABLE = "iceberg table"
     STREAM = "stream"
     PROCEDURE = "procedure"
+    DYNAMIC_TABLE = "dynamic table"
 
 
 GENERIC_PERMISSION_ERROR_KEY = "permission-error"

datahub/ingestion/source/snowflake/snowflake_config.py
@@ -356,11 +356,18 @@ class SnowflakeV2Config(
 
     pushdown_deny_usernames: List[str] = Field(
         default=[],
-        description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
+        description="List of snowflake usernames (SQL LIKE patterns, e.g., 'SERVICE_%', '%_PROD', 'TEST_USER') which will NOT be considered for lineage/usage/queries extraction. "
         "This is primarily useful for improving performance by filtering out users with extremely high query volumes. "
         "Only applicable if `use_queries_v2` is enabled.",
     )
 
+    pushdown_allow_usernames: List[str] = Field(
+        default=[],
+        description="List of snowflake usernames (SQL LIKE patterns, e.g., 'ANALYST_%', '%_USER', 'MAIN_ACCOUNT') which WILL be considered for lineage/usage/queries extraction. "
+        "This is primarily useful for improving performance by filtering in only specific users. "
+        "Only applicable if `use_queries_v2` is enabled. If not specified, all users not in deny list are included.",
+    )
+
     push_down_database_pattern_access_history: bool = Field(
         default=False,
         description="If enabled, pushes down database pattern filtering to the access_history table for improved performance. "
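For context, the two pushdown_* fields are meant to be combined: deny patterns exclude users, allow patterns restrict extraction to matching users, and both only apply when use_queries_v2 is on. A hedged sketch of the relevant recipe fragment, written as a Python dict; only use_queries_v2 and the two pushdown_* keys are taken from this diff, the rest of the source config is omitted:

    snowflake_source_config = {
        "use_queries_v2": True,
        # Skip high-volume service accounts entirely (SQL LIKE patterns).
        "pushdown_deny_usernames": ["SERVICE_%", "%_ETL"],
        # Only consider analyst users plus one named account.
        "pushdown_allow_usernames": ["ANALYST_%", "MAIN_ACCOUNT"],
    }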

datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -89,10 +89,17 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
 
     pushdown_deny_usernames: List[str] = pydantic.Field(
         default=[],
-        description="List of snowflake usernames which will not be considered for lineage/usage/queries extraction. "
+        description="List of snowflake usernames (SQL LIKE patterns, e.g., 'SERVICE_%', '%_PROD', 'TEST_USER') which will NOT be considered for lineage/usage/queries extraction. "
         "This is primarily useful for improving performance by filtering out users with extremely high query volumes.",
     )
 
+    pushdown_allow_usernames: List[str] = pydantic.Field(
+        default=[],
+        description="List of snowflake usernames (SQL LIKE patterns, e.g., 'ANALYST_%', '%_USER', 'MAIN_ACCOUNT') which WILL be considered for lineage/usage/queries extraction. "
+        "This is primarily useful for improving performance by filtering in only specific users. "
+        "If not specified, all users not in deny list are included.",
+    )
+
     user_email_pattern: AllowDenyPattern = pydantic.Field(
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for user emails to filter in usage.",
@@ -396,6 +403,7 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
             end_time=self.config.window.end_time,
             bucket_duration=self.config.window.bucket_duration,
             deny_usernames=self.config.pushdown_deny_usernames,
+            allow_usernames=self.config.pushdown_allow_usernames,
             dedup_strategy=self.config.query_dedup_strategy,
             database_pattern=self.filters.filter_config.database_pattern
             if self.config.push_down_database_pattern_access_history
@@ -740,7 +748,8 @@ class QueryLogQueryBuilder:
         start_time: datetime,
         end_time: datetime,
         bucket_duration: BucketDuration,
-        deny_usernames: Optional[List[str]],
+        deny_usernames: Optional[List[str]] = None,
+        allow_usernames: Optional[List[str]] = None,
         max_tables_per_query: int = 20,
         dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD,
         database_pattern: Optional[AllowDenyPattern] = None,
@@ -753,10 +762,7 @@ class QueryLogQueryBuilder:
         self.max_tables_per_query = max_tables_per_query
         self.dedup_strategy = dedup_strategy
 
-        self.users_filter = "TRUE"
-        if deny_usernames:
-            user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
-            self.users_filter = f"user_name NOT IN ({user_not_in})"
+        self.users_filter = self._build_user_filter(deny_usernames, allow_usernames)
 
         self.access_history_database_filter = (
             self._build_access_history_database_filter_condition(
@@ -767,6 +773,43 @@ class QueryLogQueryBuilder:
         self.time_bucket_size = bucket_duration.value
         assert self.time_bucket_size in ("HOUR", "DAY", "MONTH")
 
+    def _build_user_filter(
+        self,
+        deny_usernames: Optional[List[str]] = None,
+        allow_usernames: Optional[List[str]] = None,
+    ) -> str:
+        """
+        Build user filter SQL condition based on deny and allow username patterns.
+
+        Args:
+            deny_usernames: List of username patterns to exclude (SQL LIKE patterns)
+            allow_usernames: List of username patterns to include (SQL LIKE patterns)
+
+        Returns:
+            SQL WHERE condition string for filtering users
+        """
+        user_filters = []
+
+        if deny_usernames:
+            deny_conditions = []
+            for pattern in deny_usernames:
+                # Escape single quotes for SQL safety
+                escaped_pattern = pattern.replace("'", "''")
+                deny_conditions.append(f"user_name NOT ILIKE '{escaped_pattern}'")
+            if deny_conditions:
+                user_filters.append(f"({' AND '.join(deny_conditions)})")
+
+        if allow_usernames:
+            allow_conditions = []
+            for pattern in allow_usernames:
+                # Escape single quotes for SQL safety
+                escaped_pattern = pattern.replace("'", "''")
+                allow_conditions.append(f"user_name ILIKE '{escaped_pattern}'")
+            if allow_conditions:
+                user_filters.append(f"({' OR '.join(allow_conditions)})")
+
+        return " AND ".join(user_filters) if user_filters else "TRUE"
+
     def _build_access_history_database_filter_condition(
         self,
         database_pattern: Optional[AllowDenyPattern],
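Net effect of _build_user_filter: deny patterns become an AND of NOT ILIKE conditions, allow patterns an OR of ILIKE conditions, and the two groups are joined with AND (falling back to TRUE when both lists are empty). A small standalone sketch that reproduces the construction from the hunk above and shows the resulting SQL:

    from typing import List, Optional

    def _escape(pattern: str) -> str:
        # Escape single quotes for SQL safety, as in the hunk above.
        return pattern.replace("'", "''")

    def build_user_filter(deny: Optional[List[str]] = None,
                          allow: Optional[List[str]] = None) -> str:
        parts = []
        if deny:
            parts.append("(" + " AND ".join(
                f"user_name NOT ILIKE '{_escape(p)}'" for p in deny) + ")")
        if allow:
            parts.append("(" + " OR ".join(
                f"user_name ILIKE '{_escape(p)}'" for p in allow) + ")")
        return " AND ".join(parts) if parts else "TRUE"

    print(build_user_filter(deny=["SERVICE_%"], allow=["ANALYST_%", "MAIN_ACCOUNT"]))
    # (user_name NOT ILIKE 'SERVICE_%') AND (user_name ILIKE 'ANALYST_%' OR user_name ILIKE 'MAIN_ACCOUNT')
    print(build_user_filter())
    # TRUE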