vh-pytoolbox 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. vh_pytoolbox-0.1.0/LICENSE.txt +21 -0
  2. vh_pytoolbox-0.1.0/PKG-INFO +71 -0
  3. vh_pytoolbox-0.1.0/README.md +22 -0
  4. vh_pytoolbox-0.1.0/pyproject.toml +45 -0
  5. vh_pytoolbox-0.1.0/setup.cfg +4 -0
  6. vh_pytoolbox-0.1.0/src/vh_pytoolbox/__init__.py +1 -0
  7. vh_pytoolbox-0.1.0/src/vh_pytoolbox/api_handler.py +333 -0
  8. vh_pytoolbox-0.1.0/src/vh_pytoolbox/cli.py +66 -0
  9. vh_pytoolbox-0.1.0/src/vh_pytoolbox/data_transformations/__init__.py +1 -0
  10. vh_pytoolbox-0.1.0/src/vh_pytoolbox/data_transformations/anonymization.py +185 -0
  11. vh_pytoolbox-0.1.0/src/vh_pytoolbox/iohandler/__init__.py +0 -0
  12. vh_pytoolbox-0.1.0/src/vh_pytoolbox/iohandler/file_io.py +102 -0
  13. vh_pytoolbox-0.1.0/src/vh_pytoolbox/iohandler/path_io.py +27 -0
  14. vh_pytoolbox-0.1.0/src/vh_pytoolbox/utils/__init__.py +17 -0
  15. vh_pytoolbox-0.1.0/src/vh_pytoolbox/utils/config_loader.py +22 -0
  16. vh_pytoolbox-0.1.0/src/vh_pytoolbox/utils/csv_handler.py +49 -0
  17. vh_pytoolbox-0.1.0/src/vh_pytoolbox/utils/date_functions.py +128 -0
  18. vh_pytoolbox-0.1.0/src/vh_pytoolbox/utils/logging_setup.py +31 -0
  19. vh_pytoolbox-0.1.0/src/vh_pytoolbox/utils/string_functions.py +57 -0
  20. vh_pytoolbox-0.1.0/src/vh_pytoolbox/utils/text_processing.py +91 -0
  21. vh_pytoolbox-0.1.0/src/vh_pytoolbox.egg-info/PKG-INFO +71 -0
  22. vh_pytoolbox-0.1.0/src/vh_pytoolbox.egg-info/SOURCES.txt +27 -0
  23. vh_pytoolbox-0.1.0/src/vh_pytoolbox.egg-info/dependency_links.txt +1 -0
  24. vh_pytoolbox-0.1.0/src/vh_pytoolbox.egg-info/requires.txt +12 -0
  25. vh_pytoolbox-0.1.0/src/vh_pytoolbox.egg-info/top_level.txt +1 -0
  26. vh_pytoolbox-0.1.0/tests/test_anonymization.py +44 -0
  27. vh_pytoolbox-0.1.0/tests/test_api_handler.py +101 -0
  28. vh_pytoolbox-0.1.0/tests/test_config_loader.py +19 -0
  29. vh_pytoolbox-0.1.0/tests/test_date_functions.py +89 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Vinayak Hegde
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,71 @@
1
+ Metadata-Version: 2.4
2
+ Name: vh-pytoolbox
3
+ Version: 0.1.0
4
+ Summary: Reusable Python code for different projects
5
+ Author-email: Vinayak Hegde <vinayakvhegde@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2024 Vinayak Hegde
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+ Classifier: Development Status :: 4 - Beta
28
+ Classifier: Intended Audience :: Developers
29
+ Classifier: License :: OSI Approved :: MIT License
30
+ Classifier: Programming Language :: Python :: 3
31
+ Classifier: Programming Language :: Python :: 3.13
32
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
33
+ Classifier: Topic :: Utilities
34
+ Requires-Python: >=3.13
35
+ Description-Content-Type: text/markdown
36
+ License-File: LICENSE.txt
37
+ Requires-Dist: pandas>=2.2.3
38
+ Requires-Dist: pyyaml>=6.0
39
+ Requires-Dist: requests>=2.31.0
40
+ Requires-Dist: msal>=1.25.0
41
+ Requires-Dist: azure-identity>=1.15.0
42
+ Provides-Extra: dev
43
+ Requires-Dist: build>=1.2.2.post1; extra == "dev"
44
+ Requires-Dist: ruff>=0.8.4; extra == "dev"
45
+ Requires-Dist: setuptools>=75.6.0; extra == "dev"
46
+ Requires-Dist: wheel>=0.45.1; extra == "dev"
47
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
48
+ Dynamic: license-file
49
+
50
+ # vh-pytoolbox
51
+
52
+ My own reusable Python packages.
53
+
54
+ ## Description
55
+
56
+ This package provides various utilities for handling file I/O, string processing, CSV manipulation, and logging setup. It is designed to streamline common tasks in data processing and analysis.
57
+
58
+ ## Features
59
+
60
+ - **Path Initialization**: Functions to initialize and manage project paths.
61
+ - **File I/O**: Utilities for saving and loading JSON metadata.
62
+ - **String Processing**: Functions to convert strings to snake_case and translate text based on mappings.
63
+ - **CSV Handling**: Functions to remove columns from CSV files and transform text columns.
64
+ - **Logging Setup**: Easy setup for logging configuration.
65
+
66
+ ## Installation
67
+
68
+ You can install the package using pip:
69
+
70
+ ```sh
71
+ pip install git+https://github.com/vinay/vh-repo-python-package.git
@@ -0,0 +1,22 @@
1
+ # vh-pytoolbox
2
+
3
+ My own reusable Python packages.
4
+
5
+ ## Description
6
+
7
+ This package provides various utilities for handling file I/O, string processing, CSV manipulation, and logging setup. It is designed to streamline common tasks in data processing and analysis.
8
+
9
+ ## Features
10
+
11
+ - **Path Initialization**: Functions to initialize and manage project paths.
12
+ - **File I/O**: Utilities for saving and loading JSON metadata.
13
+ - **String Processing**: Functions to convert strings to snake_case and translate text based on mappings.
14
+ - **CSV Handling**: Functions to remove columns from CSV files and transform text columns.
15
+ - **Logging Setup**: Easy setup for logging configuration.
16
+
17
+ ## Installation
18
+
19
+ You can install the package using pip:
20
+
21
+ ```sh
22
+ pip install git+https://github.com/vinay/vh-repo-python-package.git
@@ -0,0 +1,45 @@
1
+ [project]
2
+ name = "vh-pytoolbox"
3
+ version = "0.1.0"
4
+ description = "Reusable Python code for different projects"
5
+ readme = "README.md"
6
+ license = {file = "LICENSE.txt"}
7
+ classifiers = [
8
+ "Development Status :: 4 - Beta",
9
+ "Intended Audience :: Developers",
10
+ "License :: OSI Approved :: MIT License",
11
+ "Programming Language :: Python :: 3",
12
+ "Programming Language :: Python :: 3.13",
13
+ "Topic :: Software Development :: Libraries :: Python Modules",
14
+ "Topic :: Utilities",
15
+ ]
16
+ requires-python = ">=3.13"
17
+ authors = [
18
+ {name = "Vinayak Hegde", email = "vinayakvhegde@gmail.com"},
19
+ ]
20
+ dependencies = [
21
+ "pandas>=2.2.3",
22
+ "pyyaml>=6.0",
23
+ "requests>=2.31.0",
24
+ "msal>=1.25.0",
25
+ "azure-identity>=1.15.0",
26
+ ]
27
+ optional-dependencies = { dev = ["build>=1.2.2.post1", "ruff>=0.8.4", "setuptools>=75.6.0", "wheel>=0.45.1", "pytest>=8.0.0"] }
28
+
29
+ [build-system]
30
+ requires = ["setuptools>=75.6.0", "wheel>=0.45.1"]
31
+ build-backend = "setuptools.build_meta"
32
+
33
+ [tool.setuptools.packages.find]
34
+ where = ["src"]
35
+
36
+ [tool.ruff]
37
+ line-length = 88
38
+ target-version = "py313"
39
+
40
+ [tool.ruff.lint]
41
+ select = ["E", "F", "W", "C90", "I", "N", "UP", "YTT", "S", "BLE", "FBT", "B", "A", "COM", "C4", "DTZ", "T10", "DJ", "EM", "EXE", "FA", "ISC", "ICN", "G", "INP", "PIE", "T20", "PYI", "PT", "Q", "RSE", "RET", "SLF", "SLOT", "SIM", "TID", "TCH", "INT", "ARG", "PTH", "ERA", "PD", "PGH", "PL", "TRY", "FLY", "NPY", "AIR", "PERF", "FURB", "LOG", "RUF"]
42
+ ignore = ["S101", "COM812", "ISC001"] # Allow assert, trailing comma, etc.
43
+
44
+ [tool.ruff.lint.per-file-ignores]
45
+ "tests/*" = ["S101"] # Allow assert in tests
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1 @@
1
+ from . import api_handler, cli, data_transformations, iohandler, utils
@@ -0,0 +1,333 @@
1
+ """
2
+ API handling utilities for Microsoft Graph and Power BI Admin APIs.
3
+ """
4
+
5
+ import time
6
+ from collections import defaultdict
7
+ from typing import Any, Dict, Optional
8
+
9
+ import msal
10
+ import requests
11
+ from azure.identity import ClientSecretCredential
12
+
13
+ from .utils.logging_setup import setup_logger
14
+
15
+
16
class MSGraphApi:
    """
    Helper for calling the Microsoft Graph API using client-credential
    authentication with automatic @odata.nextLink pagination.
    """

    # OAuth2 scope and authority endpoint for the client-credentials flow.
    SCOPE = ["https://graph.microsoft.com/.default"]
    AUTH_URL = "https://login.microsoftonline.com"

    # Give up once a single request has been retried this many times.
    MAX_RETRIES = 5
    # Seconds before an individual HTTP request is abandoned; without a
    # timeout, requests can block forever on a stalled connection.
    REQUEST_TIMEOUT = 60

    def __init__(
        self,
        tenant_id: str,
        client_id: str,
        client_secret: str,
        logger: Optional[Any] = None,
    ) -> None:
        """
        :param tenant_id: Azure AD tenant ID
        :param client_id: App registration (client) ID
        :param client_secret: App registration client secret
        :param logger: Optional logger instance; a default is created if omitted
        """
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret

        self.app = msal.ConfidentialClientApplication(
            client_id=self.client_id,
            client_credential=self.client_secret,
            authority=f"{self.AUTH_URL}/{self.tenant_id}",
        )

        self.logger = logger or setup_logger()
        if self.logger:
            self.logger.info("MS GraphApi initialized.")

    def _build_headers(self) -> Dict[str, str]:
        """
        Build request headers with a freshly acquired bearer token.

        :raises RuntimeError: if MSAL returns an error payload instead of
            a token (the result dict has no "access_token" key on failure).
        """
        result = self.app.acquire_token_for_client(scopes=self.SCOPE)
        if "access_token" not in result:
            # Fail loudly with the MSAL error description rather than an
            # opaque KeyError from direct subscripting.
            msg = (
                "Token acquisition failed: "
                f"{result.get('error_description', result)}"
            )
            if self.logger:
                self.logger.error(msg)
            raise RuntimeError(msg)
        return {
            "Authorization": f"Bearer {result['access_token']}",
            "Content-Type": "application/json",
            "ConsistencyLevel": "eventual",
        }

    def get_all_pages(self, url: str) -> Dict[str, Any]:
        """
        GET *url* and follow @odata.nextLink pagination until exhausted.

        Transient errors (429 / 5xx) are retried with Retry-After or
        exponential backoff, up to MAX_RETRIES per page.

        :param url: Initial Graph API request URL.
        :return: {"value": <items from every page>,
                  "delta_link": <last @odata.deltaLink seen, or None>}
        :raises RuntimeError: on a non-retryable HTTP error or once the
            retry budget is exhausted.
        """
        session = requests.Session()
        session.headers.update(self._build_headers())

        aggregated_items: list = []
        retryable = {429, 500, 502, 503, 504}
        delta_link = None  # initialized so the return is safe even if url is falsy
        next_url = url
        attempts = 0

        while next_url:
            response = session.get(next_url, timeout=self.REQUEST_TIMEOUT)

            if response.status_code in retryable:
                attempts += 1
                # Check the budget *before* sleeping so we do not pause for
                # up to a minute only to raise immediately afterwards.
                if attempts > self.MAX_RETRIES:
                    raise RuntimeError(f"Too many retries for URL: {next_url}")
                retry_after = response.headers.get("Retry-After")
                delay = (
                    int(retry_after)
                    if retry_after and retry_after.isdigit()
                    else min(2**attempts, 60)
                )
                if self.logger:
                    self.logger.warning(
                        "%s retrying in %ss ...", response.status_code, delay
                    )
                time.sleep(delay)
                continue

            if response.status_code != 200:
                msg = f"GET failed ({response.status_code}): {response.text}"
                if self.logger:
                    self.logger.error(msg)
                raise RuntimeError(msg)

            attempts = 0  # reset the retry budget after any successful page
            data = response.json()

            # Append only 'value' items if present and is a list.
            page_items = data.get("value", [])
            if isinstance(page_items, list) and page_items:
                aggregated_items.extend(page_items)

            # A delta link normally appears only on the final page; keep the
            # latest one seen.
            delta_link = data.get("@odata.deltaLink", delta_link)
            next_url = data.get("@odata.nextLink")

        if self.logger:
            self.logger.info(
                "All pages retrieved successfully. Total items: %s",
                len(aggregated_items),
            )

        return {"value": aggregated_items, "delta_link": delta_link}
112
+
113
+
114
class PowerBIAdminApi:
    """
    A helper class for interacting with the Microsoft Power BI Admin APIs
    using a client credential (service principal) authentication flow.
    """

    # Class-level constants
    SCOPE: str = "https://analysis.windows.net/powerbi/api/.default"
    BASE_URL: str = "https://api.powerbi.com/v1.0/myorg"

    # Retry budget for transient errors and per-request timeout (seconds);
    # requests without a timeout can block forever on a stalled connection.
    MAX_RETRIES = 5
    REQUEST_TIMEOUT = 60

    def __init__(
        self,
        tenant_id: str,
        client_id: str,
        client_secret: str,
        logger: Optional[Any] = None,
    ) -> None:
        """
        Initialize PowerBI admin helper class.

        :param tenant_id: Azure AD tenant ID
        :param client_id: Service principal client ID
        :param client_secret: Service principal client secret
        :param logger: Optional logger instance
        """
        self.credential = ClientSecretCredential(
            tenant_id=tenant_id, client_id=client_id, client_secret=client_secret
        )

        self.logger = logger or setup_logger()
        self.logger.info("PowerBIAdminApi initialized.")

    # ------------------------------------------------------
    # SINGLE: BUILD HEADERS (GETS TOKEN + FORMS HEADERS)
    # ------------------------------------------------------
    def _build_headers(self) -> Dict[str, str]:
        """
        Build HTTP request headers with a fresh OAuth2 bearer token.

        :return: A dictionary with Authorization and Content-Type headers.
        """
        token = self.credential.get_token(self.SCOPE).token
        return {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

    # ------------------------------------------------------
    # GET REQUEST
    # ------------------------------------------------------
    def get_response(self, url: str) -> Dict[str, Any]:
        """
        Send a single (non-paginated) GET request.

        :param url: Full request URL
        :return: JSON response as dict
        :raises RuntimeError: if the response status is not 200
        """
        headers = self._build_headers()
        resp = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)

        if resp.status_code != 200:
            self.logger.error("GET failed (%s): %s", resp.status_code, resp.text)
            raise RuntimeError(resp.text)

        return resp.json()

    # ------------------------------------------------------
    # GET REQUEST - Paginated
    # ------------------------------------------------------
    def get_paginated_response(self, url: str) -> Dict[str, Any]:
        """
        GET request with full pagination + retry handling.
        Supports only Power BI REST API pagination via continuationUri.

        :param url: Initial request URL
        :return: Merged JSON across all pages (list values extended,
                 scalar values keep the last page's value)
        :raises RuntimeError: on a non-retryable HTTP error or once the
            retry budget is exhausted
        """
        session = requests.Session()
        session.headers.update(self._build_headers())

        next_url = url
        attempt = 0

        aggregated = defaultdict(list)
        retryable = {429, 500, 502, 503, 504}

        while next_url:
            response = session.get(next_url, timeout=self.REQUEST_TIMEOUT)

            # Retry for transient errors (429, 5xx)
            if response.status_code in retryable:
                attempt += 1
                # Check the budget before sleeping so an exhausted budget
                # fails immediately instead of after a final backoff.
                if attempt > self.MAX_RETRIES:
                    raise RuntimeError(f"Too many retries for URL: {next_url}")

                retry_after = response.headers.get("Retry-After")
                # BUG FIX: the fallback branch previously referenced the
                # undefined name `attempts` (the local is `attempt`), which
                # raised NameError whenever Retry-After was absent.
                delay = (
                    int(retry_after)
                    if retry_after and retry_after.isdigit()
                    else min(2**attempt, 60)
                )

                self.logger.warning(
                    "%s received. Retrying in %ss… (%s/%s)",
                    response.status_code,
                    delay,
                    attempt,
                    self.MAX_RETRIES,
                )
                time.sleep(delay)
                continue

            # Hard failure
            if response.status_code != 200:
                self.logger.error(
                    "GET failed (%s): %s", response.status_code, response.text
                )
                raise RuntimeError(response.text)

            # Success: reset the retry budget for the next page.
            attempt = 0
            data = response.json()

            # Merge keys (list keys extended, non-list keys stored last)
            for key, value in data.items():
                if isinstance(value, list):
                    aggregated[key].extend(value)
                else:
                    aggregated[key] = value

            # Pagination handling: absent/empty continuationUri ends the loop.
            next_url = data.get("continuationUri") or None

        self.logger.info("✅ All pages collected successfully.")
        return dict(aggregated)

    # ------------------------------------------------------
    # POST REQUEST – START WORKSPACE SCAN
    # ------------------------------------------------------
    def post_request(self, url: str, body: Dict[str, Any]) -> str:
        """
        Send a POST request and extract scanId from response.

        :param url: POST URL
        :param body: JSON request body
        :return: scanId value
        :raises RuntimeError: on HTTP failure or a missing scanId
        """
        headers = self._build_headers()
        resp = requests.post(
            url, json=body, headers=headers, timeout=self.REQUEST_TIMEOUT
        )

        # The scanner endpoint answers 202 Accepted; tolerate 200 as well.
        if resp.status_code not in (200, 202):
            self.logger.error("POST failed (%s): %s", resp.status_code, resp.text)
            raise RuntimeError(resp.text)

        data = resp.json()
        scan_id = data.get("id") or data.get("scanId")

        if not scan_id:
            self.logger.error("scanId missing from response.")
            raise RuntimeError("scanId missing")

        self.logger.info("Scan started. scanId=%s", scan_id)
        return scan_id

    # ------------------------------------------------------
    # POLL STATUS
    # ------------------------------------------------------
    def poll_scan_status(self, url: str, scan_id: str) -> Dict[str, Any]:
        """
        Poll the scan status until completion.

        :param url: Scan status URL (scan_id is appended as a path segment)
        :param scan_id: Scan identifier
        :return: JSON response with final status
        :raises RuntimeError: if a status request fails
        :raises TimeoutError: if the scan does not finish in time
        """
        poll_interval = 10
        max_attempts = 60
        headers = self._build_headers()
        self.logger.info("Polling status for scanId=%s", scan_id)

        url = f"{url}/{scan_id}"
        for attempt in range(1, max_attempts + 1):
            resp = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)

            if resp.status_code != 200:
                self.logger.error(
                    "Polling failed (%s): %s", resp.status_code, resp.text
                )
                raise RuntimeError(resp.text)

            data = resp.json()
            status = data.get("status")
            self.logger.info("Attempt %s: status=%s", attempt, status)

            # Terminal states per the scanner API; anything else keeps polling.
            if status in ("Succeeded", "Failed", "Unknown"):
                return data

            time.sleep(poll_interval)

        raise TimeoutError("Scan polling timed out.")

    # ------------------------------------------------------
    # GET SCAN RESULT
    # ------------------------------------------------------
    def get_scan_result(self, url: str, scan_id: str) -> Dict[str, Any]:
        """
        Retrieve the final scan result.

        :param url: Scan result URL (scan_id is appended as a path segment)
        :param scan_id: Scan identifier
        :return: JSON scan result
        :raises RuntimeError: if the result request fails
        """
        headers = self._build_headers()
        self.logger.info("Fetching scanResult for scanId=%s", scan_id)
        url = f"{url}/{scan_id}"
        resp = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)

        if resp.status_code != 200:
            self.logger.error(
                "Result fetch failed (%s): %s", resp.status_code, resp.text
            )
            raise RuntimeError(resp.text)

        data = resp.json()
        self.logger.info("Scan result retrieved successfully.")
        return data
@@ -0,0 +1,66 @@
1
+ """Simple command-line entry point for dataset anonymization."""
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+
6
+ from .data_transformations import anonymization
7
+ from .iohandler.file_io import save_metadata_to_json
8
+ from .utils.config_loader import load_yaml_config
9
+ from .utils.logging_setup import setup_logger
10
+
11
+
12
def main() -> None:
    """
    CLI entry point: anonymize CSV datasets per a YAML configuration.

    Parses command-line arguments, loads the config, runs the
    anonymization pipeline, and writes file/column metadata JSON next to
    the anonymized output. Errors are logged (with traceback) rather than
    propagated, so the process exits cleanly either way.
    """
    parser = argparse.ArgumentParser(
        description="Anonymize CSV datasets according to a YAML configuration."
    )
    parser.add_argument(
        "--config", "-c", required=True, help="Path to the YAML config file."
    )
    parser.add_argument(
        "--raw", "-r", required=True, help="Path to raw data folder to process."
    )
    parser.add_argument(
        "--out", "-o", default=None, help="Path to output (anonymized) folder."
    )
    parser.add_argument("--log", help="Path to log file (default: console).")

    args = parser.parse_args()

    logger = setup_logger(log_file=args.log)

    try:
        config = load_yaml_config(args.config)
    except Exception:
        # logger.exception records the full traceback, which a plain
        # logger.error(f"... {e}") would discard.
        logger.exception("Failed to load config from %s", args.config)
        return

    raw_folder = Path(args.raw)
    # Default the output folder to a sibling "anonymized" directory.
    anonymized_folder = Path(args.out) if args.out else raw_folder.parent / "anonymized"

    anonymized_folder.mkdir(parents=True, exist_ok=True)

    logger.info("Starting anonymization. raw=%s out=%s", raw_folder, anonymized_folder)
    try:
        file_meta, col_meta = anonymization.anonymize_source_files(
            raw_folder=str(raw_folder),
            anonymized_folder=str(anonymized_folder),
            config=config,
            logger=logger,
        )

        # Persist run metadata alongside the anonymized output.
        file_meta_path = anonymized_folder / "file_metadata.json"
        col_meta_path = anonymized_folder / "column_metadata.json"
        save_metadata_to_json(
            file_meta, col_meta, str(file_meta_path), str(col_meta_path)
        )
        logger.info("Finished anonymization.")
    except Exception:
        logger.exception("Anonymization failed")


if __name__ == "__main__":
    main()
@@ -0,0 +1 @@
1
+ from . import anonymization