pycarlo 0.12.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pycarlo might be problematic. Click here for more details.
- pycarlo/__init__.py +0 -0
- pycarlo/common/__init__.py +31 -0
- pycarlo/common/errors.py +31 -0
- pycarlo/common/files.py +78 -0
- pycarlo/common/http.py +36 -0
- pycarlo/common/mcon.py +26 -0
- pycarlo/common/retries.py +129 -0
- pycarlo/common/settings.py +89 -0
- pycarlo/common/utils.py +51 -0
- pycarlo/core/__init__.py +10 -0
- pycarlo/core/client.py +267 -0
- pycarlo/core/endpoint.py +289 -0
- pycarlo/core/operations.py +25 -0
- pycarlo/core/session.py +127 -0
- pycarlo/features/__init__.py +10 -0
- pycarlo/features/circuit_breakers/__init__.py +3 -0
- pycarlo/features/circuit_breakers/exceptions.py +10 -0
- pycarlo/features/circuit_breakers/service.py +346 -0
- pycarlo/features/dbt/__init__.py +3 -0
- pycarlo/features/dbt/dbt_importer.py +208 -0
- pycarlo/features/dbt/queries.py +31 -0
- pycarlo/features/exceptions.py +18 -0
- pycarlo/features/metadata/__init__.py +32 -0
- pycarlo/features/metadata/asset_allow_block_list.py +22 -0
- pycarlo/features/metadata/asset_filters_container.py +79 -0
- pycarlo/features/metadata/base_allow_block_list.py +137 -0
- pycarlo/features/metadata/metadata_allow_block_list.py +94 -0
- pycarlo/features/metadata/metadata_filters_container.py +262 -0
- pycarlo/features/pii/__init__.py +5 -0
- pycarlo/features/pii/constants.py +3 -0
- pycarlo/features/pii/pii_filterer.py +179 -0
- pycarlo/features/pii/queries.py +20 -0
- pycarlo/features/pii/service.py +56 -0
- pycarlo/features/user/__init__.py +4 -0
- pycarlo/features/user/exceptions.py +10 -0
- pycarlo/features/user/models.py +9 -0
- pycarlo/features/user/queries.py +13 -0
- pycarlo/features/user/service.py +71 -0
- pycarlo/lib/README.md +35 -0
- pycarlo/lib/__init__.py +0 -0
- pycarlo/lib/schema.json +210020 -0
- pycarlo/lib/schema.py +82620 -0
- pycarlo/lib/types.py +68 -0
- pycarlo-0.12.24.dist-info/LICENSE +201 -0
- pycarlo-0.12.24.dist-info/METADATA +249 -0
- pycarlo-0.12.24.dist-info/RECORD +48 -0
- pycarlo-0.12.24.dist-info/WHEEL +5 -0
- pycarlo-0.12.24.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Callable, Dict, Optional, Union, cast
|
|
4
|
+
from uuid import UUID
|
|
5
|
+
|
|
6
|
+
from dataclasses_json import LetterCase, dataclass_json
|
|
7
|
+
|
|
8
|
+
from pycarlo.common import get_logger, http
|
|
9
|
+
from pycarlo.common.files import BytesFileReader, JsonFileReader, to_path
|
|
10
|
+
from pycarlo.common.settings import (
|
|
11
|
+
HEADER_MCD_TELEMETRY_REASON,
|
|
12
|
+
HEADER_MCD_TELEMETRY_SERVICE,
|
|
13
|
+
RequestReason,
|
|
14
|
+
)
|
|
15
|
+
from pycarlo.core import Client, Query
|
|
16
|
+
from pycarlo.features.dbt.queries import (
|
|
17
|
+
GET_DBT_UPLOAD_URL,
|
|
18
|
+
SEND_DBT_ARTIFACTS_EVENT,
|
|
19
|
+
)
|
|
20
|
+
from pycarlo.features.pii import PiiFilterer, PiiService
|
|
21
|
+
from pycarlo.features.user import UserService
|
|
22
|
+
|
|
23
|
+
logger = get_logger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class InvalidArtifactsException(Exception):
    """Raised when dbt artifact files are inconsistent (e.g. mismatched or missing invocation ids)."""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class InvalidFileFormatException(Exception):
    """Raised when a provided file does not have the expected format."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass_json(letter_case=LetterCase.CAMEL)  # type: ignore
@dataclass
class DbtArtifacts:
    """Names of the uploaded dbt artifact files for one invocation.

    Each field holds the file name returned by DbtImporter._upload_artifact;
    serialized with camelCase keys for the GraphQL API.
    """

    # manifest.json file name
    manifest: str
    # run_results.json file name
    run_results: str
    # Log file name; None when no logs were uploaded.
    logs: Optional[str]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class DbtImporter:
    """
    Import dbt run artifacts to Monte Carlo.

    Reads local manifest/run-results/log files, applies PII filtering, uploads
    them via pre-signed URLs, and publishes an event so Monte Carlo processes
    the run.
    """

    DEFAULT_PROJECT_NAME = "default-project"
    DEFAULT_JOB_NAME = "default-job"

    def __init__(
        self,
        mc_client: Optional[Client] = None,
        user_service: Optional[UserService] = None,
        pii_service: Optional[PiiService] = None,
        print_func: Callable = logger.info,
    ):
        """
        :param mc_client: Monte Carlo API client; a default-configured Client is
                          created when omitted.
        :param user_service: used to resolve the target warehouse/lake resource;
                             defaults to one built on the same client.
        :param pii_service: used to fetch the PII filtering configuration;
                            defaults to one built on the same client.
        :param print_func: callable used for progress messages.
        """
        self._mc_client = mc_client or Client()
        self._user_service = user_service or UserService(mc_client=self._mc_client)
        self._pii_service = pii_service or PiiService(mc_client=self._mc_client)
        self._print_func = print_func
        # Built once at construction; the same filterer is reused for every upload.
        self._pii_filterer = self._init_pii_filterer()

    def import_run(
        self,
        manifest_path: Union[Path, str],
        run_results_path: Union[Path, str],
        logs_path: Optional[Union[Path, str]] = None,
        project_name: str = DEFAULT_PROJECT_NAME,
        job_name: str = DEFAULT_JOB_NAME,
        resource_id: Optional[Union[str, UUID]] = None,
    ):
        """
        Import artifacts from a single dbt command execution.

        :param manifest_path: local path to the dbt manifest file (manifest.json)
        :param run_results_path: local path to the dbt run results file (run_results.json)
        :param logs_path: local path to a file containing dbt run logs
        :param project_name: Project name (perhaps a logical group of dbt models, analogous to a
                             project in dbt Cloud)
        :param job_name: Job name (perhaps a logical sequence of dbt commands, analogous to a
                         job in dbt Cloud)
        :param resource_id: identifier of a Monte Carlo resource (warehouse or lake) to use to
                            resolve dbt models to tables, this will be required if you have more
                            than one
        :raises InvalidArtifactsException: when the manifest and run results
            carry different invocation ids, or either lacks one.
        """
        # get resource
        resource = self._user_service.get_resource(resource_id)

        # read local artifacts
        manifest = JsonFileReader(manifest_path).read()
        run_results = JsonFileReader(run_results_path).read()
        logs = BytesFileReader(logs_path).read() if logs_path else None

        # extract dbt invocation id (and verify it is the same for each artifact)
        invocation_id = self._get_invocation_id(
            manifest_path=manifest_path,
            manifest=manifest,
            run_results_path=run_results_path,
            run_results=run_results,
        )

        # upload artifacts to S3 (using pre-signed URLs); DbtArtifacts records
        # the uploaded file names, not the content
        artifacts = DbtArtifacts(
            manifest=self._upload_artifact(
                project_name=project_name,
                invocation_id=invocation_id,
                file_path=to_path(manifest_path),
                content=manifest,
            ),
            run_results=self._upload_artifact(
                project_name=project_name,
                invocation_id=invocation_id,
                file_path=to_path(run_results_path),
                content=run_results,
            ),
            logs=self._upload_artifact(
                project_name=project_name,
                invocation_id=invocation_id,
                file_path=to_path(logs_path),  # type: ignore
                content=logs,
            )
            if logs
            else None,
        )

        # publish event indicating run artifacts are ready for processing
        self._mc_client(
            query=SEND_DBT_ARTIFACTS_EVENT,
            variables=dict(
                projectName=project_name,
                jobName=job_name,
                invocationId=invocation_id,
                artifacts=artifacts.to_dict(),  # type: ignore
                resourceId=str(resource.id),
            ),
            additional_headers={
                HEADER_MCD_TELEMETRY_REASON: RequestReason.SERVICE.value,
                HEADER_MCD_TELEMETRY_SERVICE: "dbt_importer",
            },
        )

        self._print_func("Finished sending run artifacts to Monte Carlo")

    def _get_invocation_id(
        self,
        manifest_path: Union[Path, str],
        manifest: Dict,
        run_results_path: Union[Path, str],
        run_results: Dict,
    ) -> str:
        """
        Return the dbt invocation id shared by both artifacts.

        :raises InvalidArtifactsException: if the ids differ between the files.
        """
        manifest_invocation_id = self._extract_invocation_id(path=manifest_path, data=manifest)
        run_results_invocation_id = self._extract_invocation_id(
            path=run_results_path, data=run_results
        )

        if manifest_invocation_id != run_results_invocation_id:
            raise InvalidArtifactsException(
                "dbt invocation ids do not match between manifest and run results files"
            )

        return manifest_invocation_id

    @staticmethod
    def _extract_invocation_id(path: Union[Path, str], data: Dict) -> str:
        """
        Read data["metadata"]["invocation_id"] from a parsed artifact.

        :raises InvalidArtifactsException: if either key is missing.
        """
        try:
            return data["metadata"]["invocation_id"]
        except KeyError:
            raise InvalidArtifactsException(
                f"Unable to get dbt invocation id from '{path}'. Unexpected file format"
            )

    def _upload_artifact(
        self,
        project_name: str,
        invocation_id: str,
        file_path: Path,
        content: Union[bytes, str, Dict],
    ) -> str:
        """
        PII-filter the content and PUT it to a pre-signed URL; return the
        uploaded file's name (used as the artifact reference).
        """
        self._print_func(f"Uploading {file_path.name}...")
        http.upload(
            method="put",
            url=self._get_presigned_url(
                project_name=project_name, invocation_id=invocation_id, file_name=file_path.name
            ),
            content=self._pii_filterer.filter_content(content),
        )
        return file_path.name

    def _get_presigned_url(self, project_name: str, invocation_id: str, file_name: str) -> str:
        """Request a pre-signed upload URL for one artifact file from the API."""
        response = cast(
            Query,
            self._mc_client(
                query=GET_DBT_UPLOAD_URL,
                variables=dict(
                    projectName=project_name, invocationId=invocation_id, fileName=file_name
                ),
                additional_headers={
                    HEADER_MCD_TELEMETRY_REASON: RequestReason.SERVICE.value,
                    HEADER_MCD_TELEMETRY_SERVICE: "dbt_importer",
                },
            ),
        )

        return cast(str, response.get_dbt_upload_url)

    def _init_pii_filterer(self):
        """Build a PiiFilterer from the account's current PII filter config."""
        pii_filters = self._pii_service.get_pii_filters_config()
        return PiiFilterer(filters_config=pii_filters)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# GraphQL query used by DbtImporter to obtain a pre-signed URL for uploading a
# single dbt artifact file.
GET_DBT_UPLOAD_URL = """
query getDbtUploadUrl(
  $projectName: String!,
  $invocationId: String!,
  $fileName: String!) {
  getDbtUploadUrl(
    projectName: $projectName,
    invocationId: $invocationId,
    fileName: $fileName
  )
}
"""

# GraphQL mutation signalling that all artifacts for a dbt invocation have been
# uploaded and are ready for processing.
SEND_DBT_ARTIFACTS_EVENT = """
mutation sendDbtArtifactsEvent(
  $projectName: String!,
  $jobName: String!,
  $invocationId: UUID!,
  $artifacts: DbtArtifactsInput!,
  $resourceId: UUID) {
  sendDbtArtifactsEvent(
    projectName: $projectName,
    jobName: $jobName,
    invocationId: $invocationId,
    artifacts: $artifacts,
    resourceId: $resourceId
  ) {
    ok
  }
}
"""
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from pycarlo.features.circuit_breakers.exceptions import (
|
|
2
|
+
CircuitBreakerPipelineException,
|
|
3
|
+
CircuitBreakerPollException,
|
|
4
|
+
)
|
|
5
|
+
from pycarlo.features.user.exceptions import (
|
|
6
|
+
MultipleResourcesFoundException,
|
|
7
|
+
ResourceNotFoundException,
|
|
8
|
+
UserServiceException,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
# Public re-exports: every feature-level exception type.
# Fix: "CircuitBreakerPipelineException" was listed twice in the original.
__all__ = [
    "CircuitBreakerPipelineException",
    "CircuitBreakerPollException",
    "MultipleResourcesFoundException",
    "ResourceNotFoundException",
    "UserServiceException",
]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from pycarlo.features.metadata.asset_allow_block_list import AssetAllowBlockList
|
|
2
|
+
from pycarlo.features.metadata.asset_filters_container import AssetFiltersContainer
|
|
3
|
+
from pycarlo.features.metadata.base_allow_block_list import (
|
|
4
|
+
BaseAllowBlockList,
|
|
5
|
+
ComparisonType,
|
|
6
|
+
FilterEffectType,
|
|
7
|
+
FilterRule,
|
|
8
|
+
FilterType,
|
|
9
|
+
RuleEffect,
|
|
10
|
+
)
|
|
11
|
+
from pycarlo.features.metadata.metadata_allow_block_list import (
|
|
12
|
+
MetadataAllowBlockList,
|
|
13
|
+
MetadataFilter,
|
|
14
|
+
)
|
|
15
|
+
from pycarlo.features.metadata.metadata_filters_container import MetadataFiltersContainer
|
|
16
|
+
|
|
17
|
+
# Public API of pycarlo.features.metadata, grouped by the module each name is
# re-exported from.
__all__ = [
    # Base classes
    "FilterRule",
    "BaseAllowBlockList",
    "FilterEffectType",
    "RuleEffect",
    "FilterType",
    "ComparisonType",
    # Metadata filtering classes
    "MetadataFilter",
    "MetadataAllowBlockList",
    "MetadataFiltersContainer",
    # Asset filtering classes
    "AssetAllowBlockList",
    "AssetFiltersContainer",
]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
|
|
4
|
+
from dataclasses_json import DataClassJsonMixin
|
|
5
|
+
|
|
6
|
+
from pycarlo.common import get_logger
|
|
7
|
+
from pycarlo.features.metadata.base_allow_block_list import BaseAllowBlockList, FilterRule
|
|
8
|
+
|
|
9
|
+
logger = get_logger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class AssetAllowBlockList(BaseAllowBlockList[FilterRule], DataClassJsonMixin):
    """Allow/block rule list scoped to a single asset type (asset_type is mandatory)."""

    # JSON deserialization fails without this ugly override
    # (re-declares the parent's field with a concrete FilterRule element type).
    rules: Optional[List[FilterRule]] = field(default_factory=list)

    # Required despite the None default; validated in __post_init__.
    asset_type: Optional[str] = None

    def __post_init__(self):
        # We can't remove the default value because of properties with defaults in the parent class.
        if not self.asset_type:
            raise ValueError("asset_type is required")
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Dict, List
|
|
3
|
+
|
|
4
|
+
from dataclasses_json import DataClassJsonMixin
|
|
5
|
+
|
|
6
|
+
from .asset_allow_block_list import AssetAllowBlockList
|
|
7
|
+
from .base_allow_block_list import FilterEffectType
|
|
8
|
+
|
|
9
|
+
# Mapping of resource types to their supported asset types for collection preferences.
# This is used for validating asset collection preferences.
# When support for filtering an asset type is implemented in the DC, it should be added here.
# The reason it is here instead of in Monolith, is so that it can be referenced by the CLI.
# The pycarlo version in CLI and monolith should be updated after updating this and releasing a
# new version.
# Structure: resource type -> asset type -> list of filterable attribute names.
ASSET_TYPE_ATTRIBUTES: Dict[str, Dict[str, List[str]]] = {
    "tableau": {"project": ["name"], "workbook": ["name", "luid"]}
}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class AssetFiltersContainer(DataClassJsonMixin):
    """
    Simple container for asset filtering that focuses on in-memory filtering for REST APIs.

    This class provides basic asset filtering functionality without SQL generation complexity.
    It's designed for the initial phase where assets are collected via REST APIs rather than
    SQL queries.

    Example usage:
        # Block all external assets
        filters = AssetAllowBlockList(
            filters=[AssetFilter(asset_type="external", effect=FilterEffectType.BLOCK)]
        )
        container = AssetFiltersContainer(asset_filters=filters)

        # Check if an asset is blocked
        is_blocked = container.is_asset_blocked("external", "my_table")  # True
        is_blocked = container.is_asset_blocked("table", "users")  # False
    """

    asset_filters: List[AssetAllowBlockList] = field(default_factory=list)

    def is_asset_type_filtered(self, asset_type: str) -> bool:
        """Returns True if any filters are configured for the given asset type."""
        return len(self._get_asset_filters(asset_type)) > 0

    def is_asset_blocked(self, asset_type: str, attributes: Dict[str, str]) -> bool:
        """
        Returns True if the specified asset is blocked by the current filters.

        When several lists are configured for the same asset type, each list is
        evaluated in order and the last one decides the outcome.

        Args:
            asset_type: The type of asset (e.g., 'tableau_workbook_v2', 'jobs', 'power_bi_workspace')
            attributes: A dictionary representing the attributes of the asset

        Returns:
            True if the asset is blocked, False if it's allowed
        """
        blocked = False

        for allow_block_list in self._get_asset_filters(asset_type):
            def rule_matches(rule):
                return rule.matches(force_regexp=False, **attributes)

            if allow_block_list.get_default_effect_rules(rule_matches):
                # A rule carrying the default effect matched.
                effect = allow_block_list.default_effect
            elif allow_block_list.get_other_effect_rules(rule_matches):
                # A rule carrying the opposite effect matched.
                effect = allow_block_list.other_effect
            else:
                # No matches at all: fall back to the list's default effect.
                effect = allow_block_list.default_effect

            blocked = effect == FilterEffectType.BLOCK

        return blocked

    def _get_asset_filters(self, asset_type: str) -> List[AssetAllowBlockList]:
        # Only lists registered for exactly this asset type apply.
        return [fl for fl in self.asset_filters if fl.asset_type == asset_type]
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import enum
|
|
2
|
+
import re
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any, Callable, Generic, List, Optional, TypeVar
|
|
5
|
+
|
|
6
|
+
from dataclasses_json import DataClassJsonMixin
|
|
7
|
+
|
|
8
|
+
from pycarlo.common import get_logger
|
|
9
|
+
|
|
10
|
+
logger = get_logger(__name__)
|
|
11
|
+
|
|
12
|
+
# For documentation and samples check the link below:
|
|
13
|
+
# https://www.notion.so/montecarlodata/Catalog-Schema-Filtering-59edd6eff7f74c94ab6bfca75d2e3ff1
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _exclude_none_values(value: Any) -> bool:
|
|
17
|
+
return value is None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class FilterEffectType(enum.Enum):
    # Effect applied when a rule (or list) matches an asset/metadata entry.
    BLOCK = "block"
    ALLOW = "allow"


# Backwards-compatible alias: rule-level code refers to effects by this name.
RuleEffect = FilterEffectType
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class FilterType(enum.Enum):
    # How a condition's value is compared against the asset attribute
    # (comparisons are case-insensitive, see FilterRule._match).
    EXACT_MATCH = "exact_match"
    PREFIX = "prefix"
    SUFFIX = "suffix"
    SUBSTRING = "substring"
    REGEXP = "regexp"


# Backwards-compatible alias for FilterType.
ComparisonType = FilterType

# Type variable for the filter class
FilterRuleT = TypeVar("FilterRuleT", bound="FilterRule")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class RuleCondition(DataClassJsonMixin):
    """One attribute comparison belonging to a FilterRule."""

    # Name of the attribute (matches() kwarg) this condition applies to.
    attribute_name: str
    # Expected value; compared case-insensitively (see FilterRule._match).
    value: str
    # Comparison strategy; defaults to exact match.
    comparison_type: ComparisonType = ComparisonType.EXACT_MATCH
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class FilterRule(DataClassJsonMixin):
    """
    Base class for all filter types. Provides common filtering logic that can be
    shared between different filter implementations (e.g., metadata filters, asset filters).
    """

    # All conditions must hold for the rule to match (AND semantics).
    conditions: Optional[List[RuleCondition]] = field(default_factory=list)
    effect: RuleEffect = RuleEffect.BLOCK

    def matches(self, force_regexp: bool = False, **kwargs: Any) -> bool:
        """
        Returns True if all properties specified in kwargs match the conditions specified in
        properties of the same name in this object.

        Conditions whose attribute_name is absent from kwargs are treated as
        matched; an empty conditions list therefore matches everything.

        :param force_regexp: treat every condition value as a regexp regardless
            of its configured comparison type.
        :raises ValueError: when no kwargs are supplied.
        """
        if not kwargs:
            raise ValueError("At least one field needs to be specified for matching")

        # kwargs must match the field names in this class, if any of them do not,
        # invalidate the filter (an AttributeError makes the rule non-matching).
        try:
            return all(
                condition.attribute_name not in kwargs
                or self._match(
                    condition=condition,
                    value=kwargs.get(condition.attribute_name),
                    force_regexp=force_regexp,
                )
                for condition in self.conditions or []
            )
        except AttributeError:
            return False

    @classmethod
    def _match(cls, condition: RuleCondition, value: Optional[str], force_regexp: bool) -> bool:
        """Compare one attribute value against a single condition (case-insensitive)."""
        # The attribute was supplied but its value is None: treat as a non-match.
        # NOTE(review): the original comment here claimed a None value "matches
        # everything", which contradicts the `return False` below — confirm intent.
        if value is None:
            return False

        # The comparison is performed case-insensitive (NOTE(review): the original
        # comment referenced BaseFilter._safe_match, which is not visible here).
        # We can use LOWER here since it is part of standard SQL (like AND/OR/NOT), so including it
        # here is a way to make sure that all comparisons are case-insensitive in the SQL sentences
        # for all engines. Added option to not always LOWER since customers do have lower/upper case
        # databases logged in MC
        filter_value = condition.value.lower()
        value = value.lower()

        if force_regexp or condition.comparison_type == FilterType.REGEXP:
            # Anchor the pattern so it must match the full value.
            regexp = f"^{filter_value}$"
            return re.match(regexp, value) is not None
        elif condition.comparison_type == FilterType.PREFIX:
            return value.startswith(filter_value)
        elif condition.comparison_type == FilterType.SUFFIX:
            return value.endswith(filter_value)
        elif condition.comparison_type == FilterType.SUBSTRING:
            return filter_value in value
        else:  # filter_type == FilterType.EXACT_MATCH
            return filter_value == value
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@dataclass
class BaseAllowBlockList(Generic[FilterRuleT], DataClassJsonMixin):
    """Generic list of allow/block rules with a configurable default effect."""

    rules: Optional[List[FilterRuleT]] = field(default_factory=list)
    default_effect: RuleEffect = RuleEffect.ALLOW

    @property
    def other_effect(self) -> RuleEffect:
        """The complement of default_effect (exactly one of ALLOW/BLOCK)."""
        if self.default_effect == RuleEffect.BLOCK:
            return RuleEffect.ALLOW
        return RuleEffect.BLOCK

    def get_default_effect_rules(
        self, condition: Optional[Callable[[FilterRuleT], bool]] = None
    ) -> List[FilterRuleT]:
        """Rules whose effect equals default_effect, optionally narrowed by *condition*."""
        return [
            rule
            for rule in (self.rules or [])
            if rule.effect == self.default_effect and (condition is None or condition(rule))
        ]

    def get_other_effect_rules(
        self, condition: Optional[Callable[[FilterRuleT], bool]] = None
    ) -> List[FilterRuleT]:
        """Rules whose effect differs from default_effect, optionally narrowed by *condition*."""
        return [
            rule
            for rule in (self.rules or [])
            if rule.effect != self.default_effect and (condition is None or condition(rule))
        ]
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
|
|
4
|
+
from dataclasses_json import config, dataclass_json
|
|
5
|
+
|
|
6
|
+
from pycarlo.common import get_logger
|
|
7
|
+
from pycarlo.features.metadata.base_allow_block_list import (
|
|
8
|
+
BaseAllowBlockList,
|
|
9
|
+
ComparisonType,
|
|
10
|
+
FilterRule,
|
|
11
|
+
FilterType,
|
|
12
|
+
RuleCondition,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
logger = get_logger(__name__)
|
|
16
|
+
|
|
17
|
+
# For documentation and samples check the link below:
|
|
18
|
+
# https://www.notion.so/montecarlodata/Catalog-Schema-Filtering-59edd6eff7f74c94ab6bfca75d2e3ff1
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass_json
@dataclass
class MetadataFilter(FilterRule):
    """
    Metadata (project/dataset/table) filter rule.

    Backwards-compatible wrapper around FilterRule: the legacy project /
    dataset / table_type / table_name fields are converted into RuleCondition
    entries in __post_init__.
    """

    type: FilterType = FilterType.EXACT_MATCH

    # we're using exclude=_exclude_none_values to prevent these properties to be serialized to json
    # when None, to keep the json doc simpler
    project: Optional[str] = field(metadata=config(exclude=lambda x: x is None), default=None)
    dataset: Optional[str] = field(metadata=config(exclude=lambda x: x is None), default=None)
    table_type: Optional[str] = field(metadata=config(exclude=lambda x: x is None), default=None)
    table_name: Optional[str] = field(metadata=config(exclude=lambda x: x is None), default=None)

    def __post_init__(self):
        # Translate the legacy per-field values into generic rule conditions.
        # Only the target field (see filter_type_target_field) uses the
        # configured comparison type; all other fields use exact match.
        self.conditions = self.conditions or []
        target_field = self.filter_type_target_field()

        # Append order (table_name, dataset, project, then table_type) mirrors
        # the original implementation.
        for attribute in ("table_name", "dataset", "project"):
            attribute_value = getattr(self, attribute)
            if attribute_value is None:
                continue
            comparison = self.type if attribute == target_field else ComparisonType.EXACT_MATCH
            self.conditions.append(
                RuleCondition(
                    comparison_type=comparison,
                    attribute_name=attribute,
                    value=attribute_value,
                )
            )

        # table_type is always compared with exact match.
        if self.table_type is not None:
            self.conditions.append(
                RuleCondition(
                    comparison_type=ComparisonType.EXACT_MATCH,
                    attribute_name="table_type",
                    value=self.table_type,
                )
            )

    def filter_type_target_field(self) -> str:
        """
        The field that is evaluated using filter type. Other fields should be
        compared using exact match.
        """
        for candidate in ("table_name", "dataset", "project"):
            if getattr(self, candidate) is not None:
                return candidate

        logger.exception("Invalid filter, missing target values")
        return ""
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass_json
@dataclass
class MetadataAllowBlockList(BaseAllowBlockList[MetadataFilter]):
    # Legacy field name kept for JSON compatibility; mirrored into the parent's
    # `rules` in __post_init__.
    filters: List[MetadataFilter] = field(default_factory=list)

    def __post_init__(self):
        # Alias, not a copy: parent logic reads `rules`, so both names point at
        # the same list object.
        self.rules = self.filters
|