winipedia-utils 0.1.63__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of winipedia-utils might be problematic. Click here for more details.

Files changed (102) hide show
  1. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/PKG-INFO +5 -7
  2. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/pyproject.toml +3 -7
  3. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/concurrent/concurrent.py +6 -3
  4. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/concurrent/multiprocessing.py +16 -1
  5. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/consts.py +0 -2
  6. winipedia_utils-0.2.1/winipedia_utils/data/dataframe/cleaning.py +378 -0
  7. winipedia_utils-0.2.1/winipedia_utils/data/structures/dicts.py +16 -0
  8. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/modules/function.py +0 -2
  9. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/oop/mixins/meta.py +3 -1
  10. winipedia_utils-0.1.63/winipedia_utils/data/dataframe.py +0 -7
  11. winipedia_utils-0.1.63/winipedia_utils/django/__init__.py +0 -24
  12. winipedia_utils-0.1.63/winipedia_utils/django/bulk.py +0 -538
  13. winipedia_utils-0.1.63/winipedia_utils/django/command.py +0 -334
  14. winipedia_utils-0.1.63/winipedia_utils/django/database.py +0 -289
  15. winipedia_utils-0.1.63/winipedia_utils/pyside/__init__.py +0 -1
  16. winipedia_utils-0.1.63/winipedia_utils/pyside/core/__init__.py +0 -1
  17. winipedia_utils-0.1.63/winipedia_utils/pyside/core/py_qiodevice.py +0 -476
  18. winipedia_utils-0.1.63/winipedia_utils/pyside/ui/__init__.py +0 -1
  19. winipedia_utils-0.1.63/winipedia_utils/pyside/ui/base/base.py +0 -180
  20. winipedia_utils-0.1.63/winipedia_utils/pyside/ui/pages/base/base.py +0 -92
  21. winipedia_utils-0.1.63/winipedia_utils/pyside/ui/pages/browser.py +0 -26
  22. winipedia_utils-0.1.63/winipedia_utils/pyside/ui/pages/player.py +0 -85
  23. winipedia_utils-0.1.63/winipedia_utils/pyside/ui/widgets/browser.py +0 -243
  24. winipedia_utils-0.1.63/winipedia_utils/pyside/ui/widgets/clickable_widget.py +0 -57
  25. winipedia_utils-0.1.63/winipedia_utils/pyside/ui/widgets/media_player.py +0 -430
  26. winipedia_utils-0.1.63/winipedia_utils/pyside/ui/widgets/notification.py +0 -78
  27. winipedia_utils-0.1.63/winipedia_utils/pyside/ui/windows/__init__.py +0 -1
  28. winipedia_utils-0.1.63/winipedia_utils/pyside/ui/windows/base/__init__.py +0 -1
  29. winipedia_utils-0.1.63/winipedia_utils/pyside/ui/windows/base/base.py +0 -49
  30. winipedia_utils-0.1.63/winipedia_utils/resources/__init__.py +0 -1
  31. winipedia_utils-0.1.63/winipedia_utils/resources/svgs/__init__.py +0 -1
  32. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/LICENSE +0 -0
  33. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/README.md +0 -0
  34. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/__init__.py +0 -0
  35. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/concurrent/__init__.py +0 -0
  36. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/concurrent/multithreading.py +0 -0
  37. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/data/__init__.py +0 -0
  38. {winipedia_utils-0.1.63/winipedia_utils/pyside/ui/base → winipedia_utils-0.2.1/winipedia_utils/data/dataframe}/__init__.py +0 -0
  39. {winipedia_utils-0.1.63/winipedia_utils/pyside/ui/pages → winipedia_utils-0.2.1/winipedia_utils/data/structures}/__init__.py +0 -0
  40. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/git/__init__.py +0 -0
  41. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/git/gitignore/__init__.py +0 -0
  42. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/git/gitignore/gitignore.py +0 -0
  43. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/git/pre_commit/__init__.py +0 -0
  44. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/git/pre_commit/config.py +0 -0
  45. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/git/pre_commit/hooks.py +0 -0
  46. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/git/pre_commit/run_hooks.py +0 -0
  47. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/iterating/__init__.py +0 -0
  48. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/iterating/iterate.py +0 -0
  49. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/logging/__init__.py +0 -0
  50. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/logging/ansi.py +0 -0
  51. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/logging/config.py +0 -0
  52. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/logging/logger.py +0 -0
  53. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/modules/__init__.py +0 -0
  54. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/modules/class_.py +0 -0
  55. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/modules/module.py +0 -0
  56. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/modules/package.py +0 -0
  57. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/oop/__init__.py +0 -0
  58. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/oop/mixins/__init__.py +0 -0
  59. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/oop/mixins/mixin.py +0 -0
  60. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/os/__init__.py +0 -0
  61. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/os/os.py +0 -0
  62. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/projects/__init__.py +0 -0
  63. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/projects/poetry/__init__.py +0 -0
  64. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/projects/poetry/config.py +0 -0
  65. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/projects/poetry/poetry.py +0 -0
  66. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/projects/project.py +0 -0
  67. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/py.typed +0 -0
  68. {winipedia_utils-0.1.63/winipedia_utils/pyside/ui/pages/base → winipedia_utils-0.2.1/winipedia_utils/resources}/__init__.py +0 -0
  69. {winipedia_utils-0.1.63/winipedia_utils/pyside/ui/widgets → winipedia_utils-0.2.1/winipedia_utils/resources/svgs}/__init__.py +0 -0
  70. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/delete_garbage_can.svg +0 -0
  71. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/download_arrow.svg +0 -0
  72. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/exit_fullscreen_icon.svg +0 -0
  73. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/fullscreen_icon.svg +0 -0
  74. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/menu_icon.svg +0 -0
  75. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/pause_icon.svg +0 -0
  76. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/play_icon.svg +0 -0
  77. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/plus_icon.svg +0 -0
  78. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/resources/svgs/svg.py +0 -0
  79. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/security/__init__.py +0 -0
  80. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/security/cryptography.py +0 -0
  81. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/security/keyring.py +0 -0
  82. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/setup.py +0 -0
  83. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/__init__.py +0 -0
  84. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/assertions.py +0 -0
  85. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/convention.py +0 -0
  86. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/create_tests.py +0 -0
  87. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/fixtures.py +0 -0
  88. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/__init__.py +0 -0
  89. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/__init__.py +0 -0
  90. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/__init__.py +0 -0
  91. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/fixture.py +0 -0
  92. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/scopes/__init__.py +0 -0
  93. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/scopes/class_.py +0 -0
  94. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/scopes/function.py +0 -0
  95. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/scopes/module.py +0 -0
  96. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/scopes/package.py +0 -0
  97. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/fixtures/scopes/session.py +0 -0
  98. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/utils/__init__.py +0 -0
  99. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/base/utils/utils.py +0 -0
  100. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/testing/tests/conftest.py +0 -0
  101. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/text/__init__.py +0 -0
  102. {winipedia_utils-0.1.63 → winipedia_utils-0.2.1}/winipedia_utils/text/string.py +0 -0
@@ -1,22 +1,20 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: winipedia-utils
3
- Version: 0.1.63
3
+ Version: 0.2.1
4
4
  Summary: A package with many utility functions
5
- License: MIT
5
+ License-Expression: MIT
6
+ License-File: LICENSE
6
7
  Author: Winipedia
7
8
  Author-email: win.steveker@gmx.de
8
9
  Requires-Python: >=3.12,<3.14
9
- Classifier: License :: OSI Approved :: MIT License
10
10
  Classifier: Programming Language :: Python :: 3
11
11
  Classifier: Programming Language :: Python :: 3.12
12
12
  Classifier: Programming Language :: Python :: 3.13
13
13
  Requires-Dist: cryptography (>=45.0.5,<46.0.0)
14
14
  Requires-Dist: defusedxml (>=0.7.1,<0.8.0)
15
- Requires-Dist: django (>=5.2.1,<6.0.0)
16
15
  Requires-Dist: keyring (>=25.6.0,<26.0.0)
17
16
  Requires-Dist: pathspec (>=0.12.1,<0.13.0)
18
- Requires-Dist: pyqt-toast-notification (>=1.3.3,<2.0.0)
19
- Requires-Dist: pyside6 (>=6.9.1,<7.0.0)
17
+ Requires-Dist: polars (>=1.34.0,<2.0.0)
20
18
  Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
21
19
  Requires-Dist: setuptools (>=80.3.1,<81.0.0)
22
20
  Requires-Dist: tomlkit (>=0.13.2,<0.14.0)
@@ -1,7 +1,7 @@
1
1
  # Project section
2
2
  [project]
3
3
  name = "winipedia-utils"
4
- version = "0.1.63"
4
+ version = "0.2.1"
5
5
  description = "A package with many utility functions"
6
6
  readme = "README.md"
7
7
  requires-python = ">=3.12,<3.14"
@@ -9,7 +9,7 @@ dynamic = [ "dependencies",]
9
9
  authors = [
10
10
  { name = "Winipedia", email = "win.steveker@gmx.de"}
11
11
  ]
12
- license = { text = "MIT" }
12
+ license = "MIT"
13
13
 
14
14
  # Build system section
15
15
  [build-system]
@@ -26,12 +26,10 @@ setuptools = "^80.3.1"
26
26
  defusedxml = "^0.7.1"
27
27
  tomlkit = "^0.13.2"
28
28
  pathspec = "^0.12.1"
29
- django = "^5.2.1"
30
29
  pyyaml = "^6.0.2"
31
30
  keyring = "^25.6.0"
32
31
  cryptography = "^45.0.5"
33
- pyside6 = "^6.9.1"
34
- pyqt-toast-notification = "^1.3.3"
32
+ polars = "^1.34.0"
35
33
 
36
34
  [tool.poetry.group.dev.dependencies]
37
35
  ruff = "^0.11.7"
@@ -44,8 +42,6 @@ types-tqdm = "^4.67.0.20250417"
44
42
  types-defusedxml = "^0.7.0.20240218"
45
43
  types-pyyaml = "^6.0.12.20250516"
46
44
  pytest-mock = "^3.14.0"
47
- django-stubs = "^5.2.0"
48
- pytest-qt = "^4.5.0"
49
45
 
50
46
  [tool.ruff]
51
47
  exclude = [ ".*", "**/migrations/*.py",]
@@ -16,15 +16,18 @@ from collections.abc import Callable, Generator, Iterable
16
16
  from concurrent.futures import ThreadPoolExecutor
17
17
  from copy import deepcopy
18
18
  from functools import partial
19
- from multiprocessing.pool import Pool
20
- from typing import Any, cast
19
+ from typing import TYPE_CHECKING, Any, cast
21
20
 
22
21
  from tqdm import tqdm
23
22
 
23
+ from winipedia_utils.concurrent.multiprocessing import get_spwan_pool
24
24
  from winipedia_utils.concurrent.multithreading import imap_unordered
25
25
  from winipedia_utils.iterating.iterate import get_len_with_default
26
26
  from winipedia_utils.logging.logger import get_logger
27
27
 
28
+ if TYPE_CHECKING:
29
+ from multiprocessing.pool import Pool
30
+
28
31
  logger = get_logger(__name__)
29
32
 
30
33
 
@@ -218,7 +221,7 @@ def concurrent_loop( # noqa: PLR0913
218
221
  pool_executor = (
219
222
  ThreadPoolExecutor(max_workers=max_workers)
220
223
  if threading
221
- else Pool(processes=max_workers)
224
+ else get_spwan_pool(processes=max_workers)
222
225
  )
223
226
  with pool_executor as pool:
224
227
  map_func: Callable[[Callable[..., Any], Iterable[Any]], Any]
@@ -20,6 +20,20 @@ from winipedia_utils.logging.logger import get_logger
20
20
  logger = get_logger(__name__)
21
21
 
22
22
 
23
+ def get_spwan_pool(*args: Any, **kwargs: Any) -> Pool:
24
+ """Get a multiprocessing pool with the spawn context.
25
+
26
+ Args:
27
+ *args: Positional arguments to pass to the Pool constructor
28
+ **kwargs: Keyword arguments to pass to the Pool constructor
29
+
30
+ Returns:
31
+ A multiprocessing pool with the spawn context
32
+
33
+ """
34
+ return multiprocessing.get_context("spawn").Pool(*args, **kwargs)
35
+
36
+
23
37
  def cancel_on_timeout(seconds: float, message: str) -> Callable[..., Any]:
24
38
  """Cancel a function execution if it exceeds a specified timeout.
25
39
 
@@ -48,7 +62,8 @@ def cancel_on_timeout(seconds: float, message: str) -> Callable[..., Any]:
48
62
  def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
49
63
  @wraps(func)
50
64
  def wrapper(*args: object, **kwargs: object) -> object:
51
- with Pool(processes=1) as pool:
65
+ spawn_pool = get_spwan_pool(processes=1)
66
+ with spawn_pool as pool:
52
67
  async_result = pool.apply_async(func, args, kwargs)
53
68
  try:
54
69
  return async_result.get(timeout=seconds)
@@ -18,6 +18,4 @@ _DEV_DEPENDENCIES = [
18
18
  "types-defusedxml",
19
19
  "types-pyyaml",
20
20
  "pytest-mock",
21
- "pytest-qt",
22
- "django-stubs",
23
21
  ]
@@ -0,0 +1,378 @@
1
+ """A Cleaning DF class that streamlines common cleaning operations on dataframes.
2
+
3
+ This is useful to build Pipelines and when extending the class you can add your own
4
+ cleaning operations.
5
+ This module uses polars for dataframe operations and assumes some standards on the data
6
+ """
7
+
8
+ from abc import abstractmethod
9
+ from collections.abc import Callable
10
+ from typing import Any
11
+
12
+ import polars as pl
13
+ from polars.datatypes.classes import FloatType
14
+
15
+ from winipedia_utils.data.structures.dicts import reverse_dict
16
+ from winipedia_utils.oop.mixins.mixin import ABCLoggingMixin
17
+
18
+
19
+ class CleaningDF(ABCLoggingMixin):
20
+ """Wraps a polars.DataFrame (stored in self.df) and inherits from ABCLoggingMixin.
21
+
22
+ This will be a base class for importing all kinds of Data to e.g. a database.
23
+ It will be used to import data from different sources and clean it.
24
+ Bring the data into the correct format and name the columns correctly.
25
+ And the df takes over and does the rest, like cleaning the data, filling NAs, etc.
26
+
27
+ It is good practice to define col names as str constants in the child class.
28
+ E.g.
29
+ COL_NAME_1 = "col_name_1" so they can be reused and are easy to change.
30
+
31
+ This class defaults to nan_to_null=True when creating the dataframe for simplicity.
32
+
33
+ """
34
+
35
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
36
+ """Initialize the CleaningDF."""
37
+ self.df = pl.DataFrame(*args, nan_to_null=True, **kwargs)
38
+ self.clean()
39
+
40
+ @classmethod
41
+ @abstractmethod
42
+ def get_rename_map(cls) -> dict[str, str]:
43
+ """Rename the columns.
44
+
45
+ This method must be implemented in the child class.
46
+ This will be done before any other cleaning operations.
47
+ Format: {new_name: old_name, ...}
48
+ CleaningDF convention is to map the real col names to something in all maps
49
+
50
+ Returns:
51
+ dict[str, str]: Dictionary mapping new column names to old column names
52
+ Format: {new_name: old_name, ...}
53
+ """
54
+
55
+ @classmethod
56
+ @abstractmethod
57
+ def get_col_dtype_map(cls) -> dict[str, type[pl.DataType]]:
58
+ """Map the column names to the correct data type.
59
+
60
+ This method must be implemented in the child class.
61
+
62
+ Returns:
63
+ dict[str, type[pl.DataType]]: Dictionary mapping column names to their types
64
+ """
65
+
66
+ @classmethod
67
+ @abstractmethod
68
+ def get_drop_null_subsets(cls) -> tuple[tuple[str, ...], ...]:
69
+ """Drops rows where the subset of columns are all null.
70
+
71
+ Drops a row if all columns in the subset are null.
72
+ You can define several subsets to check.
73
+ Each returned tuple is one subset.
74
+
75
+ Returns:
76
+ tuple[tuple[str, ...], ...]: Tuple of tuples of column names
77
+ """
78
+
79
+ @classmethod
80
+ @abstractmethod
81
+ def get_fill_null_map(cls) -> dict[str, Any]:
82
+ """Fill null values with the specified value.
83
+
84
+ This method must be implemented in the child class.
85
+
86
+ Returns:
87
+ dict[str, Any]: Dictionary mapping column names to their fill value
88
+ """
89
+
90
+ @classmethod
91
+ @abstractmethod
92
+ def get_sort_cols(cls) -> tuple[tuple[str, bool], ...]:
93
+ """Sort the dataframe by the specified columns.
94
+
95
+ This method must be implemented in the child class.
96
+
97
+ Returns:
98
+ tuple[tuple[str, bool], ...]: Tuple of tuples of column names and
99
+ how to sort, True for descending, False for ascending in polars
100
+ """
101
+
102
+ @classmethod
103
+ @abstractmethod
104
+ def get_unique_subsets(cls) -> tuple[tuple[str, ...], ...]:
105
+ """Drop duplicates based on the specified subsets.
106
+
107
+ This method must be implemented in the child class.
108
+ E.g.
109
+ (
110
+ ("col1", "col2"), # subset 1
111
+ ("col3", "col4"), # subset 2
112
+ )
113
+
114
+ Returns:
115
+ tuple[tuple[str, ...], ...]: Tuple of tuples of column names
116
+ """
117
+
118
+ @classmethod
119
+ @abstractmethod
120
+ def get_no_null_cols(cls) -> tuple[str, ...]:
121
+ """Disallow null values in the specified columns.
122
+
123
+ This method must be implemented in the child class.
124
+
125
+ Returns:
126
+ tuple[str, ...]: Tuple of column names
127
+ """
128
+
129
+ @classmethod
130
+ @abstractmethod
131
+ def get_col_converter_map(
132
+ cls,
133
+ ) -> dict[str, Callable[[pl.Series], pl.Series]]:
134
+ """Convert the column to the specified type.
135
+
136
+ This method must be implemented in the child class.
137
+ It takes a polars series and returns a polars series.
138
+ Can be used to e.g. parse dates, or do a specific operation on a column.
139
+
140
+ Returns:
141
+ dict[str, Callable[[pl.Series], pl.Series]]: Dictionary mapping column names
142
+ to their conversion function
143
+ """
144
+
145
+ @classmethod
146
+ @abstractmethod
147
+ def get_add_on_duplicate_cols(cls) -> tuple[str, ...]:
148
+ """Adds the values of cols together when duplicates of two rows are found.
149
+
150
+ This method must be implemented in the child class.
151
+ duplicates are determined by the get_unique_subsets method.
152
+
153
+ Returns:
154
+ tuple[str, ...]: Tuple of column names
155
+ """
156
+
157
+ @classmethod
158
+ @abstractmethod
159
+ def get_col_precision_map(cls) -> dict[str, int]:
160
+ """Round the column to the specified precision.
161
+
162
+ This method must be implemented in the child class.
163
+
164
+ Returns:
165
+ dict[str, int]: Dictionary mapping column names to their precision
166
+ """
167
+
168
+ @classmethod
169
+ def get_col_names(cls) -> tuple[str, ...]:
170
+ """Get the column names of the dataframe."""
171
+ return tuple(cls.get_col_dtype_map().keys())
172
+
173
+ def clean(self) -> None:
174
+ """Clean the dataframe."""
175
+ self.rename_cols()
176
+ self.drop_cols()
177
+ self.fill_nulls()
178
+ self.convert_cols()
179
+ self.drop_null_subsets()
180
+ self.handle_duplicates()
181
+ self.sort_cols()
182
+ self.check()
183
+
184
+ @classmethod
185
+ def raise_on_missing_cols(
186
+ cls,
187
+ map_func: Callable[..., dict[str, Any]],
188
+ col_names: tuple[str, ...] | None = None,
189
+ ) -> None:
190
+ """Raise a KeyError if any expected column is missing from the map's keys."""
191
+ if col_names is None:
192
+ col_names = cls.get_col_names()
193
+ missing_cols = set(col_names) - set(map_func().keys())
194
+ if missing_cols:
195
+ msg = f"Missing columns in {map_func.__name__}: {missing_cols}"
196
+ raise KeyError(msg)
197
+
198
+ def rename_cols(self) -> None:
199
+ """Rename the columns according to the rename map."""
200
+ self.raise_on_missing_cols(self.get_rename_map)
201
+ self.df = self.df.rename(reverse_dict(self.get_rename_map()))
202
+
203
+ def drop_cols(self) -> None:
204
+ """Drop columns that are not in the col_dtype_map."""
205
+ self.df = self.df.select(self.get_col_names())
206
+
207
+ def fill_nulls(self) -> None:
208
+ """Fill null values with the specified values from the fill null map."""
209
+ self.raise_on_missing_cols(self.get_fill_null_map)
210
+ self.df = self.df.with_columns(
211
+ [
212
+ pl.col(col_name).fill_null(fill_value)
213
+ for col_name, fill_value in self.get_fill_null_map().items()
214
+ ]
215
+ )
216
+
217
+ def convert_cols(self) -> None:
218
+ """Apply the conversion functions to the columns."""
219
+ self.raise_on_missing_cols(self.get_col_converter_map)
220
+ self.standard_convert_cols()
221
+ self.custom_convert_cols()
222
+
223
+ def standard_convert_cols(self) -> None:
224
+ """Assumes some Data standards and converts cols accordingly.
225
+
226
+ E.g. strips strings, rounds floats
227
+ """
228
+ for col_name, dtype in self.get_col_dtype_map().items():
229
+ if dtype == pl.Utf8:
230
+ converter = self.strip_col
231
+ elif dtype == pl.Float64:
232
+ converter = self.round_col
233
+ else:
234
+ continue
235
+ self.df = self.df.with_columns(
236
+ pl.col(col_name).map_batches(converter, return_dtype=dtype)
237
+ )
238
+
239
+ def custom_convert_cols(self) -> None:
240
+ """Apply the conversion functions to the columns."""
241
+ self.df = self.df.with_columns(
242
+ [
243
+ pl.col(col_name).map_batches(
244
+ converter, return_dtype=self.get_col_dtype_map()[col_name]
245
+ )
246
+ for col_name, converter in self.get_col_converter_map().items()
247
+ if converter.__name__ != self.skip_col_converter.__name__
248
+ ]
249
+ )
250
+
251
+ @classmethod
252
+ def strip_col(cls, col: pl.Series) -> pl.Series:
253
+ """Strip the column of leading and trailing whitespace."""
254
+ return col.str.strip_chars()
255
+
256
+ @classmethod
257
+ def lower_col(cls, col: pl.Series) -> pl.Series:
258
+ """Convert the column to lowercase."""
259
+ return col.str.to_lowercase()
260
+
261
+ @classmethod
262
+ def round_col(
263
+ cls,
264
+ col: pl.Series,
265
+ precision: int | None = None,
266
+ *,
267
+ compensate: bool = True,
268
+ ) -> pl.Series:
269
+ """Round the column to the specified precision.
270
+
271
+ The precision is defined in the get_col_precision_map method.
272
+ """
273
+ if precision is None:
274
+ precision = cls.get_col_precision_map()[str(col.name)]
275
+ if not compensate:
276
+ return col.round(precision)
277
+
278
+ # compensate for rounding errors with kahan sum
279
+ error = 0.0
280
+ values = []
281
+ for value in col.to_list(): # Ensure iteration over Python floats
282
+ corrected = value + error
283
+ rounded = round(corrected, precision)
284
+ error = corrected - rounded
285
+ values.append(rounded)
286
+
287
+ return pl.Series(name=col.name, values=values, dtype=col.dtype)
288
+
289
+ @classmethod
290
+ def skip_col_converter(cls, _col: pl.Series) -> pl.Series:
291
+ """Conversion is not needed for this column and will be skipped.
292
+
293
+ Function should not be invoked if col_name is in get_col_converter_map.
294
+ """
295
+ msg = (
296
+ "skip_col_converter is just a flag to skip conversion for a column "
297
+ "and should not be actually called."
298
+ )
299
+ raise NotImplementedError(msg)
300
+
301
+ def drop_null_subsets(self) -> None:
302
+ """Drop rows where the subset of columns are all null.
303
+
304
+ If no subsets are defined, drop all rows where all columns are null.
305
+ """
306
+ subsets = self.get_drop_null_subsets()
307
+ if not subsets:
308
+ self.df = self.df.drop_nulls()
309
+ return
310
+ for subset in subsets:
311
+ self.df = self.df.drop_nulls(subset=subset)
312
+
313
+ def handle_duplicates(self) -> None:
314
+ """Drop duplicates based on the specified subsets.
315
+
316
+ If add_on_duplicate_cols are defined, add the values of the cols together.
317
+ This func adds up the vals of the duplicates and keeps the first row.
318
+ E.g. if you have a df with two rows with the same subset
319
+ and value 1 and 2 in col1 the result will be 3 in col1 for the first row.
320
+ """
321
+ for subset in self.get_unique_subsets():
322
+ for col in self.get_add_on_duplicate_cols():
323
+ self.df = self.df.with_columns(pl.col(col).sum().over(subset))
324
+ self.df = self.df.unique(subset=subset, keep="first")
325
+
326
+ def sort_cols(self) -> None:
327
+ """Sort the dataframe by the specified columns."""
328
+ cols, desc = zip(*self.get_sort_cols(), strict=True)
329
+ if not cols:
330
+ return
331
+ self.df = self.df.sort(cols, descending=desc)
332
+
333
+ def check(self) -> None:
334
+ """Check the data and some conditions.
335
+
336
+ This method is called at the end of the clean method.
337
+ checks e.g. non null values in no_null_cols
338
+ """
339
+ self.check_correct_dtypes()
340
+ self.check_no_null_cols()
341
+ self.check_no_nan()
342
+
343
+ def check_correct_dtypes(self) -> None:
344
+ """Check that all columns have the correct dtype."""
345
+ schema = self.df.schema
346
+ col_dtype_map = self.get_col_dtype_map()
347
+ for col, dtype in col_dtype_map.items():
348
+ schema_dtype = schema[col]
349
+ if schema_dtype != dtype:
350
+ msg = f"Expected dtype {dtype} for column {col}, got {schema_dtype}"
351
+ raise TypeError(msg)
352
+
353
+ def check_no_null_cols(self) -> None:
354
+ """Check that there are no null values in the no null columns."""
355
+ no_null_cols = self.get_no_null_cols()
356
+ # Use a single select to check all columns at once
357
+ null_flags = self.df.select(
358
+ [pl.col(col).is_null().any() for col in no_null_cols]
359
+ )
360
+ # Iterate over columns and check if any have nulls
361
+ for col in no_null_cols:
362
+ if null_flags[col].item():
363
+ msg = f"Null values found in column: {col}"
364
+ raise ValueError(msg)
365
+
366
+ def check_no_nan(self) -> None:
367
+ """Check that there are no nan values in the df."""
368
+ float_cols = [
369
+ col
370
+ for col, dtype in self.get_col_dtype_map().items()
371
+ if issubclass(dtype, FloatType)
372
+ ]
373
+ has_nan = self.df.select(
374
+ pl.any_horizontal(pl.col(float_cols).is_nan().any())
375
+ ).item()
376
+ if has_nan:
377
+ msg = "NaN values found in the dataframe"
378
+ raise ValueError(msg)
@@ -0,0 +1,16 @@
1
+ """Common dict utilities."""
2
+
3
+ from typing import Any
4
+
5
+
6
+ def reverse_dict(d: dict[Any, Any]) -> dict[Any, Any]:
7
+ """Reverse a dictionary.
8
+
9
+ Args:
10
+ d: Dictionary to reverse
11
+
12
+ Returns:
13
+ Reversed dictionary
14
+
15
+ """
16
+ return {v: k for k, v in d.items()}
@@ -52,8 +52,6 @@ def is_func(obj: Any) -> bool:
52
52
  if isinstance(obj, (staticmethod, classmethod, property)):
53
53
  return True
54
54
 
55
- # unwrap any wrappers (@functools.wraps) and retest
56
-
57
55
  unwrapped = inspect.unwrap(obj)
58
56
 
59
57
  return is_func_or_method(unwrapped)
@@ -73,7 +73,8 @@ class ABCLoggingMeta(ABCMeta):
73
73
  """Determine if a method should have logging applied.
74
74
 
75
75
  Args:
76
- method: The method to check
76
+ method: The method to check, properties are not logged
77
+ as they are not callable and it turns out to be tricky with them
77
78
 
78
79
  Returns:
79
80
  True if the method should be wrapped with logging, False otherwise
@@ -81,6 +82,7 @@ class ABCLoggingMeta(ABCMeta):
81
82
  """
82
83
  return (
83
84
  is_func(method) # must be a method-like attribute
85
+ and hasattr(method, "__name__") # must have a name
84
86
  and not method.__name__.startswith("__") # must not be a magic method
85
87
  )
86
88
 
@@ -1,7 +0,0 @@
1
- """Dataframe utilities for data manipulation and analysis.
2
-
3
- This module provides utility functions for working with pandas DataFrames,
4
- including data cleaning, transformation, and aggregation operations.
5
- These utilities help with data preprocessing and analysis tasks.
6
-
7
- """
@@ -1,24 +0,0 @@
1
- """__init__ module for winipedia_utils.django."""
2
-
3
- import django
4
- import django_stubs_ext
5
- from django.conf import settings
6
-
7
- from winipedia_utils.logging.logger import get_logger
8
-
9
- logger = get_logger(__name__)
10
-
11
- django_stubs_ext.monkeypatch()
12
- logger.info("Monkeypatched django-stubs")
13
-
14
- if not settings.configured:
15
- logger.info("Configuring minimal django settings")
16
- settings.configure(
17
- DATABASES={
18
- "default": {
19
- "ENGINE": "django.db.backends.sqlite3",
20
- "NAME": ":memory:",
21
- }
22
- },
23
- )
24
- django.setup()