vh-pytoolbox 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vh_pytoolbox-0.1.0/LICENSE.txt +21 -0
- vh_pytoolbox-0.1.0/PKG-INFO +71 -0
- vh_pytoolbox-0.1.0/README.md +22 -0
- vh_pytoolbox-0.1.0/pyproject.toml +45 -0
- vh_pytoolbox-0.1.0/setup.cfg +4 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/__init__.py +1 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/api_handler.py +333 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/cli.py +66 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/data_transformations/__init__.py +1 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/data_transformations/anonymization.py +185 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/iohandler/__init__.py +0 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/iohandler/file_io.py +102 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/iohandler/path_io.py +27 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/utils/__init__.py +17 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/utils/config_loader.py +22 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/utils/csv_handler.py +49 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/utils/date_functions.py +128 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/utils/logging_setup.py +31 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/utils/string_functions.py +57 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox/utils/text_processing.py +91 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox.egg-info/PKG-INFO +71 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox.egg-info/SOURCES.txt +27 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox.egg-info/dependency_links.txt +1 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox.egg-info/requires.txt +12 -0
- vh_pytoolbox-0.1.0/src/vh_pytoolbox.egg-info/top_level.txt +1 -0
- vh_pytoolbox-0.1.0/tests/test_anonymization.py +44 -0
- vh_pytoolbox-0.1.0/tests/test_api_handler.py +101 -0
- vh_pytoolbox-0.1.0/tests/test_config_loader.py +19 -0
- vh_pytoolbox-0.1.0/tests/test_date_functions.py +89 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Vinayak Hegde
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vh-pytoolbox
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Reusable Python code for different projects
|
|
5
|
+
Author-email: Vinayak Hegde <vinayakvhegde@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2024 Vinayak Hegde
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
Classifier: Development Status :: 4 - Beta
|
|
28
|
+
Classifier: Intended Audience :: Developers
|
|
29
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
30
|
+
Classifier: Programming Language :: Python :: 3
|
|
31
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
32
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
33
|
+
Classifier: Topic :: Utilities
|
|
34
|
+
Requires-Python: >=3.13
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
License-File: LICENSE.txt
|
|
37
|
+
Requires-Dist: pandas>=2.2.3
|
|
38
|
+
Requires-Dist: pyyaml>=6.0
|
|
39
|
+
Requires-Dist: requests>=2.31.0
|
|
40
|
+
Requires-Dist: msal>=1.25.0
|
|
41
|
+
Requires-Dist: azure-identity>=1.15.0
|
|
42
|
+
Provides-Extra: dev
|
|
43
|
+
Requires-Dist: build>=1.2.2.post1; extra == "dev"
|
|
44
|
+
Requires-Dist: ruff>=0.8.4; extra == "dev"
|
|
45
|
+
Requires-Dist: setuptools>=75.6.0; extra == "dev"
|
|
46
|
+
Requires-Dist: wheel>=0.45.1; extra == "dev"
|
|
47
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
48
|
+
Dynamic: license-file
|
|
49
|
+
|
|
50
|
+
# vh-pytoolbox
|
|
51
|
+
|
|
52
|
+
My own re-usable Python packages.
|
|
53
|
+
|
|
54
|
+
## Description
|
|
55
|
+
|
|
56
|
+
This package provides various utilities for handling file I/O, string processing, CSV manipulation, and logging setup. It is designed to streamline common tasks in data processing and analysis.
|
|
57
|
+
|
|
58
|
+
## Features
|
|
59
|
+
|
|
60
|
+
- **Path Initialization**: Functions to initialize and manage project paths.
|
|
61
|
+
- **File I/O**: Utilities for saving and loading JSON metadata.
|
|
62
|
+
- **String Processing**: Functions to convert strings to snake_case and translate text based on mappings.
|
|
63
|
+
- **CSV Handling**: Functions to remove columns from CSV files and transform text columns.
|
|
64
|
+
- **Logging Setup**: Easy setup for logging configuration.
|
|
65
|
+
|
|
66
|
+
## Installation
|
|
67
|
+
|
|
68
|
+
You can install the package using pip:
|
|
69
|
+
|
|
70
|
+
```sh
|
|
71
|
+
pip install git+https://github.com/vinay/vh-repo-python-package.git
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# vh-pytoolbox
|
|
2
|
+
|
|
3
|
+
My own re-usable Python packages.
|
|
4
|
+
|
|
5
|
+
## Description
|
|
6
|
+
|
|
7
|
+
This package provides various utilities for handling file I/O, string processing, CSV manipulation, and logging setup. It is designed to streamline common tasks in data processing and analysis.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **Path Initialization**: Functions to initialize and manage project paths.
|
|
12
|
+
- **File I/O**: Utilities for saving and loading JSON metadata.
|
|
13
|
+
- **String Processing**: Functions to convert strings to snake_case and translate text based on mappings.
|
|
14
|
+
- **CSV Handling**: Functions to remove columns from CSV files and transform text columns.
|
|
15
|
+
- **Logging Setup**: Easy setup for logging configuration.
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
You can install the package using pip:
|
|
20
|
+
|
|
21
|
+
```sh
|
|
22
|
+
pip install git+https://github.com/vinay/vh-repo-python-package.git
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "vh-pytoolbox"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Reusable Python code for different projects"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = {file = "LICENSE.txt"}
|
|
7
|
+
classifiers = [
|
|
8
|
+
"Development Status :: 4 - Beta",
|
|
9
|
+
"Intended Audience :: Developers",
|
|
10
|
+
"License :: OSI Approved :: MIT License",
|
|
11
|
+
"Programming Language :: Python :: 3",
|
|
12
|
+
"Programming Language :: Python :: 3.13",
|
|
13
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
14
|
+
"Topic :: Utilities",
|
|
15
|
+
]
|
|
16
|
+
requires-python = ">=3.13"
|
|
17
|
+
authors = [
|
|
18
|
+
{name = "Vinayak Hegde", email = "vinayakvhegde@gmail.com"},
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"pandas>=2.2.3",
|
|
22
|
+
"pyyaml>=6.0",
|
|
23
|
+
"requests>=2.31.0",
|
|
24
|
+
"msal>=1.25.0",
|
|
25
|
+
"azure-identity>=1.15.0",
|
|
26
|
+
]
|
|
27
|
+
optional-dependencies = { dev = ["build>=1.2.2.post1", "ruff>=0.8.4", "setuptools>=75.6.0", "wheel>=0.45.1", "pytest>=8.0.0"] }
|
|
28
|
+
|
|
29
|
+
[build-system]
|
|
30
|
+
requires = ["setuptools>=75.6.0", "wheel>=0.45.1"]
|
|
31
|
+
build-backend = "setuptools.build_meta"
|
|
32
|
+
|
|
33
|
+
[tool.setuptools.packages.find]
|
|
34
|
+
where = ["src"]
|
|
35
|
+
|
|
36
|
+
[tool.ruff]
|
|
37
|
+
line-length = 88
|
|
38
|
+
target-version = "py313"
|
|
39
|
+
|
|
40
|
+
[tool.ruff.lint]
|
|
41
|
+
select = ["E", "F", "W", "C90", "I", "N", "UP", "YTT", "S", "BLE", "FBT", "B", "A", "COM", "C4", "DTZ", "T10", "DJ", "EM", "EXE", "FA", "ISC", "ICN", "G", "INP", "PIE", "T20", "PYI", "PT", "Q", "RSE", "RET", "SLF", "SLOT", "SIM", "TID", "TCH", "INT", "ARG", "PTH", "ERA", "PD", "PGH", "PL", "TRY", "FLY", "NPY", "AIR", "PERF", "FURB", "LOG", "RUF"]
|
|
42
|
+
ignore = ["S101", "COM812", "ISC001"] # Allow assert, trailing comma, etc.
|
|
43
|
+
|
|
44
|
+
[tool.ruff.lint.per-file-ignores]
|
|
45
|
+
"tests/*" = ["S101"] # Allow assert in tests
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from . import api_handler, cli, data_transformations, iohandler, utils
|
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
"""
|
|
2
|
+
API handling utilities for Microsoft Graph and Power BI Admin APIs.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from typing import Any, Dict, Optional
|
|
8
|
+
|
|
9
|
+
import msal
|
|
10
|
+
import requests
|
|
11
|
+
from azure.identity import ClientSecretCredential
|
|
12
|
+
|
|
13
|
+
from .utils.logging_setup import setup_logger
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MSGraphApi:
    """
    A helper class to handle Microsoft Graph API
    using client credential authentication and automatic pagination.
    """

    SCOPE = ["https://graph.microsoft.com/.default"]
    AUTH_URL = "https://login.microsoftonline.com"

    def __init__(
        self,
        tenant_id: str,
        client_id: str,
        client_secret: str,
        logger: Optional[Any] = None,
    ) -> None:
        """
        Initialize the Graph API helper.

        :param tenant_id: Azure AD tenant ID
        :param client_id: App registration (service principal) client ID
        :param client_secret: App registration client secret
        :param logger: Optional logger instance; a default is created if omitted
        """
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret

        self.app = msal.ConfidentialClientApplication(
            client_id=self.client_id,
            client_credential=self.client_secret,
            authority=f"{self.AUTH_URL}/{self.tenant_id}",
        )

        self.logger = logger or setup_logger()
        if self.logger:
            self.logger.info("MS GraphApi initialized.")

    def _build_headers(self) -> Dict[str, str]:
        """
        Build HTTP request headers with a freshly acquired OAuth2 bearer token.

        :return: Headers dict with Authorization, Content-Type and
                 ConsistencyLevel (required by some Graph advanced queries).
        """
        token = self.app.acquire_token_for_client(scopes=self.SCOPE)["access_token"]
        return {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
            "ConsistencyLevel": "eventual",
        }

    def get_all_pages(self, url: str) -> Dict[str, Any]:
        """
        GET all pages of a Graph collection using @odata.nextLink pagination.

        Transient errors (429, 5xx) are retried with exponential backoff,
        honoring the Retry-After header when present (max 5 retries).

        :param url: Initial request URL
        :return: {"value": <all aggregated items>,
                  "delta_link": <last @odata.deltaLink seen, or None>}
        :raises Exception: after too many retries or on a non-retryable error
        """
        headers = self._build_headers()
        session = requests.Session()
        session.headers.update(headers)

        aggregated_items: list = []
        retryable = {429, 500, 502, 503, 504}
        next_url = url
        attempts = 0
        # Fix: bind delta_link before the loop. It was previously assigned
        # only inside the loop body, so a falsy `url` (e.g. "") made the
        # final return raise NameError instead of returning an empty result.
        delta_link = None

        while next_url:
            response = session.get(next_url)

            if response.status_code in retryable:
                retry_after = response.headers.get("Retry-After")
                delay = (
                    int(retry_after)
                    if retry_after and retry_after.isdigit()
                    else min(2**attempts, 60)  # exponential backoff, capped at 60s
                )
                if self.logger:
                    self.logger.warning(
                        f"{response.status_code} retrying in {delay}s ..."
                    )
                time.sleep(delay)
                attempts += 1
                if attempts > 5:
                    raise Exception(f"Too many retries for URL: {next_url}")
                continue

            if response.status_code != 200:
                msg = f"GET failed ({response.status_code}): {response.text}"
                if self.logger:
                    self.logger.error(msg)
                raise Exception(msg)

            # Successful page: reset the backoff counter.
            attempts = 0
            data = response.json()

            # Append only 'value' items if present and is a list
            page_items = data.get("value", [])
            if isinstance(page_items, list) and page_items:
                aggregated_items.extend(page_items)

            # Track last @odata.deltaLink (for delta queries), follow nextLink
            delta_link = data.get("@odata.deltaLink", None)
            next_url = data.get("@odata.nextLink")

        if self.logger:
            self.logger.info(
                f"All pages retrieved successfully. Total items: "
                f"{len(aggregated_items)}"
            )

        return {"value": aggregated_items, "delta_link": delta_link}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class PowerBIAdminApi:
    """
    A helper class for interacting with the Microsoft Power BI Admin APIs
    using a client credential (service principal) authentication flow.
    """

    # Class-level constants
    SCOPE: str = "https://analysis.windows.net/powerbi/api/.default"
    BASE_URL: str = "https://api.powerbi.com/v1.0/myorg"

    def __init__(
        self,
        tenant_id: str,
        client_id: str,
        client_secret: str,
        logger: Optional[Any] = None,
    ) -> None:
        """
        Initialize PowerBI admin helper class.

        :param tenant_id: Azure AD tenant ID
        :param client_id: Service principal client ID
        :param client_secret: Service principal client secret
        :param logger: Optional logger instance
        """
        self.credential = ClientSecretCredential(
            tenant_id=tenant_id, client_id=client_id, client_secret=client_secret
        )

        self.logger = logger or setup_logger()
        self.logger.info("PowerBIAdminApi initialized.")

    # ------------------------------------------------------
    # SINGLE: BUILD HEADERS (GETS TOKEN + FORMS HEADERS)
    # ------------------------------------------------------
    def _build_headers(self) -> Dict[str, str]:
        """
        Build HTTP request headers with a fresh OAuth2 bearer token.

        :return: A dictionary with Authorization and Content-Type headers.
        """
        token = self.credential.get_token(self.SCOPE).token
        return {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

    # ------------------------------------------------------
    # GET REQUEST
    # ------------------------------------------------------
    def get_response(self, url: str) -> Dict[str, Any]:
        """
        Send a single GET request (no pagination, no retries).

        :param url: Full request URL
        :return: JSON response as dict
        :raises Exception: when the response status is not 200
        """
        headers = self._build_headers()
        resp = requests.get(url, headers=headers)

        if resp.status_code != 200:
            self.logger.error(f"GET failed ({resp.status_code}): {resp.text}")
            raise Exception(resp.text)

        data = resp.json()
        return data

    # ------------------------------------------------------
    # GET REQUEST - Paginated
    # ------------------------------------------------------
    def get_paginated_response(self, url: str) -> Dict[str, Any]:
        """
        GET request with full pagination + retry handling.
        Supports only Power BI REST API pagination via continuationUri.

        List-valued keys from each page are concatenated; non-list keys keep
        the value from the last page seen.

        :param url: Initial request URL
        :return: Merged JSON response across all pages
        :raises Exception: after too many retries or on a hard failure
        """

        headers = self._build_headers()
        session = requests.Session()
        session.headers.update(headers)

        next_url = url
        attempt = 0

        aggregated = defaultdict(list)
        retryable = {429, 500, 502, 503, 504}

        while next_url:
            response = session.get(next_url)

            # Retry for transient errors (429, 5xx)
            if response.status_code in retryable:
                retry_after = response.headers.get("Retry-After")
                # Fix: the backoff referenced `attempts`, but the counter in
                # this method is `attempt` — any retryable status raised
                # NameError instead of backing off.
                delay = (
                    int(retry_after)
                    if retry_after and retry_after.isdigit()
                    else min(2**attempt, 60)
                )

                self.logger.warning(
                    f"{response.status_code} received. Retrying in {delay}s… "
                    f"({attempt+1}/5)"
                )
                time.sleep(delay)

                attempt += 1
                if attempt > 5:
                    raise Exception(f"Too many retries for URL: {next_url}")
                continue

            # Hard failure
            if response.status_code != 200:
                self.logger.error(
                    f"GET failed ({response.status_code}): {response.text}"
                )
                raise Exception(response.text)

            # Success
            attempt = 0
            data = response.json()

            # Merge keys (list keys extended, non-list keys stored last)
            for key, value in data.items():
                if isinstance(value, list):
                    aggregated[key].extend(value)
                else:
                    aggregated[key] = value

            # Pagination handling
            if data.get("continuationUri", None):
                next_url = data.get("continuationUri")
            else:
                next_url = None  # No more pages

        self.logger.info("✅ All pages collected successfully.")
        return dict(aggregated)

    # ------------------------------------------------------
    # POST REQUEST – START WORKSPACE SCAN
    # ------------------------------------------------------
    def post_request(self, url: str, body: Dict[str, Any]) -> str:
        """
        Send a POST request and extract scanId from response.

        :param url: POST URL
        :param body: JSON request body
        :return: scanId value
        :raises Exception: on non-200/202 status or when scanId is missing
        """
        headers = self._build_headers()
        resp = requests.post(url, json=body, headers=headers)

        # 202 Accepted is normal for the asynchronous workspace scan API.
        if resp.status_code not in (200, 202):
            self.logger.error(f"POST failed ({resp.status_code}): {resp.text}")
            raise Exception(resp.text)

        data = resp.json()
        scan_id = data.get("id") or data.get("scanId")

        if not scan_id:
            self.logger.error("scanId missing from response.")
            raise Exception("scanId missing")

        self.logger.info(f"Scan started. scanId={scan_id}")
        return scan_id

    # ------------------------------------------------------
    # POLL STATUS
    # ------------------------------------------------------
    def poll_scan_status(self, url: str, scan_id: str) -> Dict[str, Any]:
        """
        Poll the scan status until completion.

        Polls every 10 seconds, up to 60 attempts (~10 minutes).

        :param url: Scan status URL (scan_id is appended as a path segment)
        :param scan_id: Scan identifier
        :return: JSON response with final status
        :raises TimeoutError: when the scan does not finish within max_attempts
        :raises Exception: when a poll request fails
        """
        poll_interval = 10
        max_attempts = 60
        headers = self._build_headers()
        self.logger.info(f"Polling status for scanId={scan_id}")

        url = f"{url}/{scan_id}"
        for attempt in range(1, max_attempts + 1):
            resp = requests.get(url, headers=headers)

            if resp.status_code != 200:
                self.logger.error(f"Polling failed ({resp.status_code}): {resp.text}")
                raise Exception(resp.text)

            data = resp.json()
            status = data.get("status")
            self.logger.info(f"Attempt {attempt}: status={status}")

            # Terminal states per the scanStatus API.
            if status in ("Succeeded", "Failed", "Unknown"):
                return data

            time.sleep(poll_interval)

        raise TimeoutError("Scan polling timed out.")

    # ------------------------------------------------------
    # GET SCAN RESULT
    # ------------------------------------------------------
    def get_scan_result(self, url: str, scan_id: str) -> Dict[str, Any]:
        """
        Retrieve the final scan result.

        :param url: Scan result URL (scan_id is appended as a path segment)
        :param scan_id: Scan identifier
        :return: JSON scan result
        :raises Exception: when the result fetch fails
        """
        headers = self._build_headers()
        self.logger.info(f"Fetching scanResult for scanId={scan_id}")
        url = f"{url}/{scan_id}"
        resp = requests.get(url, headers=headers)

        if resp.status_code != 200:
            self.logger.error(f"Result fetch failed ({resp.status_code}): {resp.text}")
            raise Exception(resp.text)

        data = resp.json()
        self.logger.info("Scan result retrieved successfully.")
        return data
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Simple command-line entry point for dataset anonymization."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from .data_transformations import anonymization
|
|
7
|
+
from .iohandler.file_io import save_metadata_to_json
|
|
8
|
+
from .utils.config_loader import load_yaml_config
|
|
9
|
+
from .utils.logging_setup import setup_logger
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def main() -> None:
    """
    CLI entry point: anonymize CSV datasets per a YAML configuration.

    Parses command-line arguments, loads the YAML config, runs the
    anonymization over the raw data folder, and writes the resulting file
    and column metadata JSON files into the output folder.
    """
    parser = argparse.ArgumentParser(
        description="Anonymize CSV datasets according to a YAML configuration."
    )
    parser.add_argument(
        "--config", "-c", required=True, help="Path to the YAML config file."
    )
    parser.add_argument(
        "--raw", "-r", required=True, help="Path to raw data folder to process."
    )
    parser.add_argument(
        "--out", "-o", default=None, help="Path to output (anonymized) folder."
    )
    parser.add_argument("--log", help="Path to log file (default: console).")

    args = parser.parse_args()

    logger = setup_logger(log_file=args.log)

    try:
        config = load_yaml_config(args.config)
    except Exception:
        # logger.exception records the full traceback, which the previous
        # logger.error(f"...: {e}") call discarded.
        logger.exception(f"Failed to load config from {args.config}")
        return

    raw_folder = Path(args.raw)
    # Default output folder: sibling "anonymized" next to the raw folder.
    anonymized_folder = (
        Path(args.out) if args.out else raw_folder.parent / "anonymized"
    )
    anonymized_folder.mkdir(parents=True, exist_ok=True)

    logger.info(f"Starting anonymization. raw={raw_folder} out={anonymized_folder}")
    try:
        file_meta, col_meta = anonymization.anonymize_source_files(
            raw_folder=str(raw_folder),
            anonymized_folder=str(anonymized_folder),
            config=config,
            logger=logger,
        )

        # Save metadata
        file_meta_path = anonymized_folder / "file_metadata.json"
        col_meta_path = anonymized_folder / "column_metadata.json"
        save_metadata_to_json(
            file_meta, col_meta, str(file_meta_path), str(col_meta_path)
        )
        logger.info("Finished anonymization.")
    except Exception:
        # Top-level CLI boundary: log with traceback and exit gracefully.
        logger.exception("Anonymization failed")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from . import anonymization
|