winipedia-utils 0.1.63__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of winipedia-utils might be problematic. Click here for more details.
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/PKG-INFO +5 -7
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/pyproject.toml +3 -7
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/concurrent/concurrent.py +6 -3
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/concurrent/multiprocessing.py +16 -1
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/consts.py +0 -2
- winipedia_utils-0.2.1/winipedia_utils/data/dataframe/cleaning.py +378 -0
- winipedia_utils-0.2.1/winipedia_utils/data/structures/dicts.py +16 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/modules/function.py +0 -2
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/oop/mixins/meta.py +3 -1
- winipedia_utils-0.1.63/winipedia_utils/data/dataframe.py +0 -7
- winipedia_utils-0.1.63/winipedia_utils/django/__init__.py +0 -24
- winipedia_utils-0.1.63/winipedia_utils/django/bulk.py +0 -538
- winipedia_utils-0.1.63/winipedia_utils/django/command.py +0 -334
- winipedia_utils-0.1.63/winipedia_utils/django/database.py +0 -289
- winipedia_utils-0.1.63/winipedia_utils/pyside/__init__.py +0 -1
- winipedia_utils-0.1.63/winipedia_utils/pyside/core/__init__.py +0 -1
- winipedia_utils-0.1.63/winipedia_utils/pyside/core/py_qiodevice.py +0 -476
- winipedia_utils-0.1.63/winipedia_utils/pyside/ui/__init__.py +0 -1
- winipedia_utils-0.1.63/winipedia_utils/pyside/ui/base/base.py +0 -180
- winipedia_utils-0.1.63/winipedia_utils/pyside/ui/pages/base/base.py +0 -92
- winipedia_utils-0.1.63/winipedia_utils/pyside/ui/pages/browser.py +0 -26
- winipedia_utils-0.1.63/winipedia_utils/pyside/ui/pages/player.py +0 -85
- winipedia_utils-0.1.63/winipedia_utils/pyside/ui/widgets/browser.py +0 -243
- winipedia_utils-0.1.63/winipedia_utils/pyside/ui/widgets/clickable_widget.py +0 -57
- winipedia_utils-0.1.63/winipedia_utils/pyside/ui/widgets/media_player.py +0 -430
- winipedia_utils-0.1.63/winipedia_utils/pyside/ui/widgets/notification.py +0 -78
- winipedia_utils-0.1.63/winipedia_utils/pyside/ui/windows/__init__.py +0 -1
- winipedia_utils-0.1.63/winipedia_utils/pyside/ui/windows/base/__init__.py +0 -1
- winipedia_utils-0.1.63/winipedia_utils/pyside/ui/windows/base/base.py +0 -49
- winipedia_utils-0.1.63/winipedia_utils/resources/__init__.py +0 -1
- winipedia_utils-0.1.63/winipedia_utils/resources/svgs/__init__.py +0 -1
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/LICENSE +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/README.md +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/concurrent/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/concurrent/multithreading.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/data/__init__.py +0 -0
- {winipedia_utils-0.1.63/winipedia_utils/pyside/ui/base → winipedia_utils-0.2.1/winipedia_utils/data/dataframe}/__init__.py +0 -0
- {winipedia_utils-0.1.63/winipedia_utils/pyside/ui/pages → winipedia_utils-0.2.1/winipedia_utils/data/structures}/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/git/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/git/gitignore/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/git/gitignore/gitignore.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/git/pre_commit/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/git/pre_commit/config.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/git/pre_commit/hooks.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/git/pre_commit/run_hooks.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/iterating/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/iterating/iterate.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/logging/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/logging/ansi.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/logging/config.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/logging/logger.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/modules/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/modules/class_.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/modules/module.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/modules/package.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/oop/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/oop/mixins/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/oop/mixins/mixin.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/os/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/os/os.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/projects/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/projects/poetry/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/projects/poetry/config.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/projects/poetry/poetry.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/projects/project.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/py.typed +0 -0
- {winipedia_utils-0.1.63/winipedia_utils/pyside/ui/pages/base → winipedia_utils-0.2.1/winipedia_utils/resources}/__init__.py +0 -0
- {winipedia_utils-0.1.63/winipedia_utils/pyside/ui/widgets → winipedia_utils-0.2.1/winipedia_utils/resources/svgs}/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/delete_garbage_can.svg +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/download_arrow.svg +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/exit_fullscreen_icon.svg +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/fullscreen_icon.svg +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/menu_icon.svg +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/pause_icon.svg +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/play_icon.svg +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/plus_icon.svg +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/svg.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/security/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/security/cryptography.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/security/keyring.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/setup.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/assertions.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/convention.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/create_tests.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/fixtures.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/fixture.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/scopes/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/scopes/class_.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/scopes/function.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/scopes/module.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/scopes/package.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/scopes/session.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/utils/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/utils/utils.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/conftest.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/text/__init__.py +0 -0
- {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/text/string.py +0 -0
|
@@ -1,22 +1,20 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: winipedia-utils
|
|
3
|
-
Version: 0.1.63
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: A package with many utility functions
|
|
5
|
-
License: MIT
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
6
7
|
Author: Winipedia
|
|
7
8
|
Author-email: win.steveker@gmx.de
|
|
8
9
|
Requires-Python: >=3.12,<3.14
|
|
9
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
10
10
|
Classifier: Programming Language :: Python :: 3
|
|
11
11
|
Classifier: Programming Language :: Python :: 3.12
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.13
|
|
13
13
|
Requires-Dist: cryptography (>=45.0.5,<46.0.0)
|
|
14
14
|
Requires-Dist: defusedxml (>=0.7.1,<0.8.0)
|
|
15
|
-
Requires-Dist: django (>=5.2.1,<6.0.0)
|
|
16
15
|
Requires-Dist: keyring (>=25.6.0,<26.0.0)
|
|
17
16
|
Requires-Dist: pathspec (>=0.12.1,<0.13.0)
|
|
18
|
-
Requires-Dist:
|
|
19
|
-
Requires-Dist: pyside6 (>=6.9.1,<7.0.0)
|
|
17
|
+
Requires-Dist: polars (>=1.34.0,<2.0.0)
|
|
20
18
|
Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
|
|
21
19
|
Requires-Dist: setuptools (>=80.3.1,<81.0.0)
|
|
22
20
|
Requires-Dist: tomlkit (>=0.13.2,<0.14.0)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# Project section
|
|
2
2
|
[project]
|
|
3
3
|
name = "winipedia-utils"
|
|
4
|
-
version = "0.1.63"
|
|
4
|
+
version = "0.2.1"
|
|
5
5
|
description = "A package with many utility functions"
|
|
6
6
|
readme = "README.md"
|
|
7
7
|
requires-python = ">=3.12,<3.14"
|
|
@@ -9,7 +9,7 @@ dynamic = [ "dependencies",]
|
|
|
9
9
|
authors = [
|
|
10
10
|
{ name = "Winipedia", email = "win.steveker@gmx.de"}
|
|
11
11
|
]
|
|
12
|
-
license =
|
|
12
|
+
license = "MIT"
|
|
13
13
|
|
|
14
14
|
# Build system section
|
|
15
15
|
[build-system]
|
|
@@ -26,12 +26,10 @@ setuptools = "^80.3.1"
|
|
|
26
26
|
defusedxml = "^0.7.1"
|
|
27
27
|
tomlkit = "^0.13.2"
|
|
28
28
|
pathspec = "^0.12.1"
|
|
29
|
-
django = "^5.2.1"
|
|
30
29
|
pyyaml = "^6.0.2"
|
|
31
30
|
keyring = "^25.6.0"
|
|
32
31
|
cryptography = "^45.0.5"
|
|
33
|
-
|
|
34
|
-
pyqt-toast-notification = "^1.3.3"
|
|
32
|
+
polars = "^1.34.0"
|
|
35
33
|
|
|
36
34
|
[tool.poetry.group.dev.dependencies]
|
|
37
35
|
ruff = "^0.11.7"
|
|
@@ -44,8 +42,6 @@ types-tqdm = "^4.67.0.20250417"
|
|
|
44
42
|
types-defusedxml = "^0.7.0.20240218"
|
|
45
43
|
types-pyyaml = "^6.0.12.20250516"
|
|
46
44
|
pytest-mock = "^3.14.0"
|
|
47
|
-
django-stubs = "^5.2.0"
|
|
48
|
-
pytest-qt = "^4.5.0"
|
|
49
45
|
|
|
50
46
|
[tool.ruff]
|
|
51
47
|
exclude = [ ".*", "**/migrations/*.py",]
|
|
@@ -16,15 +16,18 @@ from collections.abc import Callable, Generator, Iterable
|
|
|
16
16
|
from concurrent.futures import ThreadPoolExecutor
|
|
17
17
|
from copy import deepcopy
|
|
18
18
|
from functools import partial
|
|
19
|
-
from
|
|
20
|
-
from typing import Any, cast
|
|
19
|
+
from typing import TYPE_CHECKING, Any, cast
|
|
21
20
|
|
|
22
21
|
from tqdm import tqdm
|
|
23
22
|
|
|
23
|
+
from winipedia_utils.concurrent.multiprocessing import get_spwan_pool
|
|
24
24
|
from winipedia_utils.concurrent.multithreading import imap_unordered
|
|
25
25
|
from winipedia_utils.iterating.iterate import get_len_with_default
|
|
26
26
|
from winipedia_utils.logging.logger import get_logger
|
|
27
27
|
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from multiprocessing.pool import Pool
|
|
30
|
+
|
|
28
31
|
logger = get_logger(__name__)
|
|
29
32
|
|
|
30
33
|
|
|
@@ -218,7 +221,7 @@ def concurrent_loop( # noqa: PLR0913
|
|
|
218
221
|
pool_executor = (
|
|
219
222
|
ThreadPoolExecutor(max_workers=max_workers)
|
|
220
223
|
if threading
|
|
221
|
-
else
|
|
224
|
+
else get_spwan_pool(processes=max_workers)
|
|
222
225
|
)
|
|
223
226
|
with pool_executor as pool:
|
|
224
227
|
map_func: Callable[[Callable[..., Any], Iterable[Any]], Any]
|
{winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/concurrent/multiprocessing.py
RENAMED
|
@@ -20,6 +20,20 @@ from winipedia_utils.logging.logger import get_logger
|
|
|
20
20
|
logger = get_logger(__name__)
|
|
21
21
|
|
|
22
22
|
|
|
23
|
+
def get_spwan_pool(*args: Any, **kwargs: Any) -> Pool:
    """Create a process pool whose workers use the "spawn" start method.

    The function name keeps its historical spelling because other modules
    import it under this name.

    Args:
        *args: Positional arguments forwarded unchanged to the Pool constructor
        **kwargs: Keyword arguments forwarded unchanged to the Pool constructor

    Returns:
        A new pool created from the "spawn" multiprocessing context

    """
    spawn_context = multiprocessing.get_context("spawn")
    return spawn_context.Pool(*args, **kwargs)
|
|
35
|
+
|
|
36
|
+
|
|
23
37
|
def cancel_on_timeout(seconds: float, message: str) -> Callable[..., Any]:
|
|
24
38
|
"""Cancel a function execution if it exceeds a specified timeout.
|
|
25
39
|
|
|
@@ -48,7 +62,8 @@ def cancel_on_timeout(seconds: float, message: str) -> Callable[..., Any]:
|
|
|
48
62
|
def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
|
|
49
63
|
@wraps(func)
|
|
50
64
|
def wrapper(*args: object, **kwargs: object) -> object:
|
|
51
|
-
|
|
65
|
+
spawn_pool = get_spwan_pool(processes=1)
|
|
66
|
+
with spawn_pool as pool:
|
|
52
67
|
async_result = pool.apply_async(func, args, kwargs)
|
|
53
68
|
try:
|
|
54
69
|
return async_result.get(timeout=seconds)
|
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
"""A Cleaning DF class that streamlines common cleaning operations on dataframes.
|
|
2
|
+
|
|
3
|
+
This is useful for building pipelines, and when extending the class you can add your own
|
|
4
|
+
cleaning operations.
|
|
5
|
+
This module uses polars for dataframe operations and assumes some standards on the data
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from abc import abstractmethod
|
|
9
|
+
from collections.abc import Callable
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import polars as pl
|
|
13
|
+
from polars.datatypes.classes import FloatType
|
|
14
|
+
|
|
15
|
+
from winipedia_utils.data.structures.dicts import reverse_dict
|
|
16
|
+
from winipedia_utils.oop.mixins.mixin import ABCLoggingMixin
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class CleaningDF(ABCLoggingMixin):
    """Wrap a polars DataFrame and run a fixed cleaning pipeline on it.

    The dataframe is stored in ``self.df``; this class does not inherit from
    ``polars.DataFrame``. It is meant as a base class for importing all kinds
    of data, e.g. into a database: the child class declares the column names,
    dtypes, fill values, converters, etc. through the abstract ``get_*``
    classmethods and the pipeline in :meth:`clean` does the rest.

    It is good practice to define col names as str constants in the child class.
    E.g.
    COL_NAME_1 = "col_name_1" so they can be reused and are easy to change.

    The dataframe is always created with nan_to_null=True for simplicity.

    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Build the dataframe from the given arguments and clean it.

        Args:
            *args: Positional arguments passed to ``pl.DataFrame``
            **kwargs: Keyword arguments passed to ``pl.DataFrame``;
                ``nan_to_null`` is already set to True and must not be passed

        """
        self.df = pl.DataFrame(*args, nan_to_null=True, **kwargs)
        self.clean()

    @classmethod
    @abstractmethod
    def get_rename_map(cls) -> dict[str, str]:
        """Rename the columns.

        This method must be implemented in the child class.
        This will be done before any other cleaning operations.
        CleaningDF convention is that every map is keyed by the final
        (cleaned) column name, so this map is reversed before it is
        handed to ``DataFrame.rename``.

        Returns:
            dict[str, str]: Dictionary mapping new column names to the old
                column names found in the raw data
                Format: {new_name: old_name, ...}

        """

    @classmethod
    @abstractmethod
    def get_col_dtype_map(cls) -> dict[str, type[pl.DataType]]:
        """Map the column names to the correct data type.

        This method must be implemented in the child class.
        The keys of this map also define which columns are kept.

        Returns:
            dict[str, type[pl.DataType]]: Dictionary mapping column names to their types

        """

    @classmethod
    @abstractmethod
    def get_drop_null_subsets(cls) -> tuple[tuple[str, ...], ...]:
        """Define the column subsets used to drop fully-null rows.

        This method must be implemented in the child class.
        A row is dropped if all columns in a subset are null.
        You can define several subsets to check.
        Each returned tuple is one subset.

        Returns:
            tuple[tuple[str, ...], ...]: Tuple of tuples of column names

        """

    @classmethod
    @abstractmethod
    def get_fill_null_map(cls) -> dict[str, Any]:
        """Fill null values with the specified value.

        This method must be implemented in the child class.

        Returns:
            dict[str, Any]: Dictionary mapping column names to their fill value

        """

    @classmethod
    @abstractmethod
    def get_sort_cols(cls) -> tuple[tuple[str, bool], ...]:
        """Sort the dataframe by the specified columns.

        This method must be implemented in the child class.

        Returns:
            tuple[tuple[str, bool], ...]: Tuple of (column name, descending)
                pairs; True sorts descending, False sorts ascending

        """

    @classmethod
    @abstractmethod
    def get_unique_subsets(cls) -> tuple[tuple[str, ...], ...]:
        """Drop duplicates based on the specified subsets.

        This method must be implemented in the child class.
        E.g.
        (
            ("col1", "col2"),  # subset 1
            ("col3", "col4"),  # subset 2
        )

        Returns:
            tuple[tuple[str, ...], ...]: Tuple of tuples of column names

        """

    @classmethod
    @abstractmethod
    def get_no_null_cols(cls) -> tuple[str, ...]:
        """Disallow null values in the specified columns.

        This method must be implemented in the child class.

        Returns:
            tuple[str, ...]: Tuple of column names

        """

    @classmethod
    @abstractmethod
    def get_col_converter_map(
        cls,
    ) -> dict[str, Callable[[pl.Series], pl.Series]]:
        """Convert the column with the specified function.

        This method must be implemented in the child class.
        Each converter takes a polars series and returns a polars series.
        Can be used to e.g. parse dates, or do a specific operation on a column.
        Use ``skip_col_converter`` for columns that need no conversion.

        Returns:
            dict[str, Callable[[pl.Series], pl.Series]]: Dictionary mapping column names
                to their conversion function

        """

    @classmethod
    @abstractmethod
    def get_add_on_duplicate_cols(cls) -> tuple[str, ...]:
        """Add the values of these cols together when duplicate rows are found.

        This method must be implemented in the child class.
        Duplicates are determined by the get_unique_subsets method.

        Returns:
            tuple[str, ...]: Tuple of column names

        """

    @classmethod
    @abstractmethod
    def get_col_precision_map(cls) -> dict[str, int]:
        """Round the column to the specified precision.

        This method must be implemented in the child class.

        Returns:
            dict[str, int]: Dictionary mapping column names to their precision

        """

    @classmethod
    def get_col_names(cls) -> tuple[str, ...]:
        """Get the column names of the dataframe (keys of the dtype map)."""
        return tuple(cls.get_col_dtype_map().keys())

    def clean(self) -> None:
        """Run the cleaning pipeline; later steps rely on earlier ones."""
        self.rename_cols()
        self.drop_cols()
        self.fill_nulls()
        self.convert_cols()
        self.drop_null_subsets()
        self.handle_duplicates()
        self.sort_cols()
        self.check()

    @classmethod
    def raise_on_missing_cols(
        cls,
        map_func: Callable[..., dict[str, Any]],
        col_names: tuple[str, ...] | None = None,
    ) -> None:
        """Raise a KeyError if a column has no entry in the given map.

        Args:
            map_func: Zero-argument callable returning the map to check
            col_names: Columns that must be keys of the map,
                defaults to ``get_col_names()``

        Raises:
            KeyError: If at least one column is not a key of the map

        """
        if col_names is None:
            col_names = cls.get_col_names()
        missing_cols = set(col_names) - set(map_func().keys())
        if missing_cols:
            msg = f"Missing columns in {map_func.__name__}: {missing_cols}"
            raise KeyError(msg)

    def rename_cols(self) -> None:
        """Rename the columns according to the rename map."""
        self.raise_on_missing_cols(self.get_rename_map)
        # the map is {new: old} but polars expects {old: new}
        self.df = self.df.rename(reverse_dict(self.get_rename_map()))

    def drop_cols(self) -> None:
        """Drop columns that are not in the col_dtype_map."""
        self.df = self.df.select(self.get_col_names())

    def fill_nulls(self) -> None:
        """Fill null values with the specified values from the fill null map."""
        self.raise_on_missing_cols(self.get_fill_null_map)
        self.df = self.df.with_columns(
            [
                pl.col(col_name).fill_null(fill_value)
                for col_name, fill_value in self.get_fill_null_map().items()
            ]
        )

    def convert_cols(self) -> None:
        """Apply the standard and then the custom conversions to the columns."""
        self.raise_on_missing_cols(self.get_col_converter_map)
        self.standard_convert_cols()
        self.custom_convert_cols()

    def standard_convert_cols(self) -> None:
        """Assumes some Data standards and converts cols accordingly.

        Utf8 columns are stripped and Float64 columns are rounded,
        all other dtypes are left untouched.
        """
        for col_name, dtype in self.get_col_dtype_map().items():
            if dtype == pl.Utf8:
                converter = self.strip_col
            elif dtype == pl.Float64:
                converter = self.round_col
            else:
                continue
            self.df = self.df.with_columns(
                pl.col(col_name).map_batches(converter, return_dtype=dtype)
            )

    def custom_convert_cols(self) -> None:
        """Apply the converters from get_col_converter_map to their columns.

        Columns mapped to ``skip_col_converter`` are left as they are.
        """
        # computed once instead of once per column
        col_dtype_map = self.get_col_dtype_map()
        skip_name = self.skip_col_converter.__name__
        self.df = self.df.with_columns(
            [
                pl.col(col_name).map_batches(
                    converter, return_dtype=col_dtype_map[col_name]
                )
                for col_name, converter in self.get_col_converter_map().items()
                if converter.__name__ != skip_name
            ]
        )

    @classmethod
    def strip_col(cls, col: pl.Series) -> pl.Series:
        """Strip the column of leading and trailing whitespace."""
        return col.str.strip_chars()

    @classmethod
    def lower_col(cls, col: pl.Series) -> pl.Series:
        """Convert the column to lowercase."""
        return col.str.to_lowercase()

    @classmethod
    def round_col(
        cls,
        col: pl.Series,
        precision: int | None = None,
        *,
        compensate: bool = True,
    ) -> pl.Series:
        """Round the column to the specified precision.

        Args:
            col: The series to round
            precision: Number of decimals, defaults to the entry for
                ``col.name`` in get_col_precision_map
            compensate: If True, the rounding error of each value is carried
                over to the next value so the errors do not accumulate

        Returns:
            pl.Series: Rounded series with the same name and dtype

        """
        if precision is None:
            precision = cls.get_col_precision_map()[str(col.name)]
        if not compensate:
            return col.round(precision)

        # carry the rounding error forward (kahan-style compensation)
        error = 0.0
        values: list[float | None] = []
        for value in col.to_list():  # Ensure iteration over Python floats
            if value is None:
                # nulls cannot be rounded; keep them and leave the error as is
                values.append(None)
                continue
            corrected = value + error
            rounded = round(corrected, precision)
            error = corrected - rounded
            values.append(rounded)

        return pl.Series(name=col.name, values=values, dtype=col.dtype)

    @classmethod
    def skip_col_converter(cls, _col: pl.Series) -> pl.Series:
        """Conversion is not needed for this column and will be skipped.

        This is only a marker for get_col_converter_map; custom_convert_cols
        filters it out by name, so it should never actually be invoked.

        Raises:
            NotImplementedError: Always, if it is called anyway

        """
        msg = (
            "skip_col_converter is just a flag to skip conversion for a column "
            "and should not be actually called."
        )
        raise NotImplementedError(msg)

    def drop_null_subsets(self) -> None:
        """Drop rows where the subset of columns are all null.

        If no subsets are defined, drop all rows where all columns are null.
        ``DataFrame.drop_nulls`` is not used because it drops a row as soon as
        any of the columns is null.
        """
        subsets = self.get_drop_null_subsets()
        if not subsets:
            if self.df.width:
                self.df = self.df.filter(~pl.all_horizontal(pl.all().is_null()))
            return
        for subset in subsets:
            if not subset:
                # an empty subset names no columns, nothing to test
                continue
            self.df = self.df.filter(
                ~pl.all_horizontal(pl.col(list(subset)).is_null())
            )

    def handle_duplicates(self) -> None:
        """Drop duplicates based on the specified subsets.

        If add_on_duplicate_cols are defined, add the values of the cols together.
        This func adds up the vals of the duplicates and keeps the first row.
        E.g. if you have a df with two rows with the same subset
        and value 1 and 2 in col1 the result will be 3 in col1 for the first row.
        """
        for subset in self.get_unique_subsets():
            # write the group sum into every row before dropping the duplicates
            for col in self.get_add_on_duplicate_cols():
                self.df = self.df.with_columns(pl.col(col).sum().over(subset))
            self.df = self.df.unique(subset=subset, keep="first")

    def sort_cols(self) -> None:
        """Sort the dataframe by the specified columns.

        Does nothing if get_sort_cols returns no columns.
        """
        sort_spec = self.get_sort_cols()
        if not sort_spec:
            # must be checked before unpacking, zip(*()) yields nothing
            return
        cols, desc = zip(*sort_spec, strict=True)
        self.df = self.df.sort(list(cols), descending=list(desc))

    def check(self) -> None:
        """Check the data and some conditions.

        This method is called at the end of the clean method.
        checks e.g. non null values in no_null_cols
        """
        self.check_correct_dtypes()
        self.check_no_null_cols()
        self.check_no_nan()

    def check_correct_dtypes(self) -> None:
        """Check that all columns have the correct dtype.

        Raises:
            TypeError: If a column dtype differs from the col_dtype_map

        """
        schema = self.df.schema
        col_dtype_map = self.get_col_dtype_map()
        for col, dtype in col_dtype_map.items():
            schema_dtype = schema[col]
            if schema_dtype != dtype:
                msg = f"Expected dtype {dtype} for column {col}, got {schema_dtype}"
                raise TypeError(msg)

    def check_no_null_cols(self) -> None:
        """Check that there are no null values in the no null columns.

        Raises:
            ValueError: If one of the no null columns contains a null

        """
        no_null_cols = self.get_no_null_cols()
        # Use a single select to check all columns at once
        null_flags = self.df.select(
            [pl.col(col).is_null().any() for col in no_null_cols]
        )
        # Iterate over columns and check if any have nulls
        for col in no_null_cols:
            if null_flags[col].item():
                msg = f"Null values found in column: {col}"
                raise ValueError(msg)

    def check_no_nan(self) -> None:
        """Check that there are no nan values in the float columns of the df.

        Raises:
            ValueError: If a float column contains a NaN

        """
        float_cols = [
            col
            for col, dtype in self.get_col_dtype_map().items()
            if issubclass(dtype, FloatType)
        ]
        if not float_cols:
            # only float columns can hold NaN, nothing to check
            return
        has_nan = self.df.select(
            pl.any_horizontal(pl.col(float_cols).is_nan().any())
        ).item()
        if has_nan:
            msg = "NaN values found in the dataframe"
            raise ValueError(msg)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Common dict utilities."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def reverse_dict(d: dict[Any, Any]) -> dict[Any, Any]:
    """Swap the keys and values of a dictionary.

    Args:
        d: Dictionary to reverse, its values must be hashable

    Returns:
        New dictionary mapping each value of ``d`` to its key; if a value
        occurs several times, the key seen last wins

    """
    return dict(zip(d.values(), d.keys()))
|
|
@@ -73,7 +73,8 @@ class ABCLoggingMeta(ABCMeta):
|
|
|
73
73
|
"""Determine if a method should have logging applied.
|
|
74
74
|
|
|
75
75
|
Args:
|
|
76
|
-
method: The method to check
|
|
76
|
+
method: The method to check, properties are not logged
|
|
77
|
+
as they are not callable and it turns out to be tricky with them
|
|
77
78
|
|
|
78
79
|
Returns:
|
|
79
80
|
True if the method should be wrapped with logging, False otherwise
|
|
@@ -81,6 +82,7 @@ class ABCLoggingMeta(ABCMeta):
|
|
|
81
82
|
"""
|
|
82
83
|
return (
|
|
83
84
|
is_func(method) # must be a method-like attribute
|
|
85
|
+
and hasattr(method, "__name__") # must have a name
|
|
84
86
|
and not method.__name__.startswith("__") # must not be a magic method
|
|
85
87
|
)
|
|
86
88
|
|
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
"""Dataframe utilities for data manipulation and analysis.
|
|
2
|
-
|
|
3
|
-
This module provides utility functions for working with pandas DataFrames,
|
|
4
|
-
including data cleaning, transformation, and aggregation operations.
|
|
5
|
-
These utilities help with data preprocessing and analysis tasks.
|
|
6
|
-
|
|
7
|
-
"""
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
"""__init__ module for winipedia_utils.django."""
|
|
2
|
-
|
|
3
|
-
import django
|
|
4
|
-
import django_stubs_ext
|
|
5
|
-
from django.conf import settings
|
|
6
|
-
|
|
7
|
-
from winipedia_utils.logging.logger import get_logger
|
|
8
|
-
|
|
9
|
-
logger = get_logger(__name__)
|
|
10
|
-
|
|
11
|
-
django_stubs_ext.monkeypatch()
|
|
12
|
-
logger.info("Monkeypatched django-stubs")
|
|
13
|
-
|
|
14
|
-
if not settings.configured:
|
|
15
|
-
logger.info("Configuring minimal django settings")
|
|
16
|
-
settings.configure(
|
|
17
|
-
DATABASES={
|
|
18
|
-
"default": {
|
|
19
|
-
"ENGINE": "django.db.backends.sqlite3",
|
|
20
|
-
"NAME": ":memory:",
|
|
21
|
-
}
|
|
22
|
-
},
|
|
23
|
-
)
|
|
24
|
-
django.setup()
|