absfuyu 5.0.0__py3-none-any.whl → 6.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of absfuyu might be problematic.
- absfuyu/__init__.py +5 -3
- absfuyu/__main__.py +3 -3
- absfuyu/cli/__init__.py +13 -2
- absfuyu/cli/audio_group.py +98 -0
- absfuyu/cli/color.py +30 -14
- absfuyu/cli/config_group.py +9 -2
- absfuyu/cli/do_group.py +23 -6
- absfuyu/cli/game_group.py +27 -2
- absfuyu/cli/tool_group.py +81 -11
- absfuyu/config/__init__.py +3 -3
- absfuyu/core/__init__.py +12 -8
- absfuyu/core/baseclass.py +929 -96
- absfuyu/core/baseclass2.py +44 -3
- absfuyu/core/decorator.py +70 -4
- absfuyu/core/docstring.py +64 -41
- absfuyu/core/dummy_cli.py +3 -3
- absfuyu/core/dummy_func.py +19 -6
- absfuyu/dxt/__init__.py +2 -2
- absfuyu/dxt/base_type.py +93 -0
- absfuyu/dxt/dictext.py +204 -16
- absfuyu/dxt/dxt_support.py +2 -2
- absfuyu/dxt/intext.py +151 -34
- absfuyu/dxt/listext.py +969 -127
- absfuyu/dxt/strext.py +77 -17
- absfuyu/extra/__init__.py +2 -2
- absfuyu/extra/audio/__init__.py +8 -0
- absfuyu/extra/audio/_util.py +57 -0
- absfuyu/extra/audio/convert.py +192 -0
- absfuyu/extra/audio/lossless.py +281 -0
- absfuyu/extra/beautiful.py +3 -2
- absfuyu/extra/da/__init__.py +72 -0
- absfuyu/extra/da/dadf.py +1600 -0
- absfuyu/extra/da/dadf_base.py +186 -0
- absfuyu/extra/da/df_func.py +181 -0
- absfuyu/extra/da/mplt.py +219 -0
- absfuyu/extra/ggapi/__init__.py +8 -0
- absfuyu/extra/ggapi/gdrive.py +223 -0
- absfuyu/extra/ggapi/glicense.py +148 -0
- absfuyu/extra/ggapi/glicense_df.py +186 -0
- absfuyu/extra/ggapi/gsheet.py +88 -0
- absfuyu/extra/img/__init__.py +30 -0
- absfuyu/extra/img/converter.py +402 -0
- absfuyu/extra/img/dup_check.py +291 -0
- absfuyu/extra/pdf.py +87 -0
- absfuyu/extra/rclone.py +253 -0
- absfuyu/extra/xml.py +90 -0
- absfuyu/fun/__init__.py +7 -20
- absfuyu/fun/rubik.py +442 -0
- absfuyu/fun/tarot.py +2 -2
- absfuyu/game/__init__.py +2 -2
- absfuyu/game/game_stat.py +2 -2
- absfuyu/game/schulte.py +78 -0
- absfuyu/game/sudoku.py +2 -2
- absfuyu/game/tictactoe.py +2 -3
- absfuyu/game/wordle.py +6 -4
- absfuyu/general/__init__.py +4 -4
- absfuyu/general/content.py +4 -4
- absfuyu/general/human.py +2 -2
- absfuyu/general/resrel.py +213 -0
- absfuyu/general/shape.py +3 -8
- absfuyu/general/tax.py +344 -0
- absfuyu/logger.py +806 -59
- absfuyu/numbers/__init__.py +13 -0
- absfuyu/numbers/number_to_word.py +321 -0
- absfuyu/numbers/shorten_number.py +303 -0
- absfuyu/numbers/time_duration.py +217 -0
- absfuyu/pkg_data/__init__.py +2 -2
- absfuyu/pkg_data/deprecated.py +2 -2
- absfuyu/pkg_data/logo.py +1462 -0
- absfuyu/sort.py +4 -4
- absfuyu/tools/__init__.py +28 -2
- absfuyu/tools/checksum.py +144 -9
- absfuyu/tools/converter.py +120 -34
- absfuyu/tools/generator.py +461 -0
- absfuyu/tools/inspector.py +752 -0
- absfuyu/tools/keygen.py +2 -2
- absfuyu/tools/obfuscator.py +47 -9
- absfuyu/tools/passwordlib.py +89 -25
- absfuyu/tools/shutdownizer.py +3 -8
- absfuyu/tools/sw.py +718 -0
- absfuyu/tools/web.py +10 -13
- absfuyu/typings.py +138 -0
- absfuyu/util/__init__.py +114 -6
- absfuyu/util/api.py +41 -18
- absfuyu/util/cli.py +119 -0
- absfuyu/util/gui.py +91 -0
- absfuyu/util/json_method.py +43 -14
- absfuyu/util/lunar.py +2 -2
- absfuyu/util/package.py +124 -0
- absfuyu/util/path.py +702 -82
- absfuyu/util/performance.py +122 -7
- absfuyu/util/shorten_number.py +244 -21
- absfuyu/util/text_table.py +481 -0
- absfuyu/util/zipped.py +8 -7
- absfuyu/version.py +79 -59
- {absfuyu-5.0.0.dist-info → absfuyu-6.1.2.dist-info}/METADATA +52 -11
- absfuyu-6.1.2.dist-info/RECORD +105 -0
- {absfuyu-5.0.0.dist-info → absfuyu-6.1.2.dist-info}/WHEEL +1 -1
- absfuyu/extra/data_analysis.py +0 -1078
- absfuyu/general/generator.py +0 -303
- absfuyu-5.0.0.dist-info/RECORD +0 -68
- {absfuyu-5.0.0.dist-info → absfuyu-6.1.2.dist-info}/entry_points.txt +0 -0
- {absfuyu-5.0.0.dist-info → absfuyu-6.1.2.dist-info}/licenses/LICENSE +0 -0
absfuyu/extra/da/dadf.py
ADDED
@@ -0,0 +1,1600 @@
"""
Absfuyu: Data Analysis
----------------------
Data Analyst DataFrame

Version: 6.1.1
Date updated: 30/12/2025 (dd/mm/yyyy)
"""

# Module level
# ---------------------------------------------------------------------------
__all__ = [
    "DADF",
    "DataAnalystDataFrameColumnMethodMixin",
    "DataAnalystDataFrameRowMethodMixin",
    "DataAnalystDataFrameInfoMixin",
    "DataAnalystDataFrameNAMixin",
    "DataAnalystDataFrameOtherMixin",
    "DataAnalystDataFrameDateMixin",
    "DataAnalystDataFrameExportMixin",
    "DataAnalystDataFrameCityMixin",
]


# Library
# ---------------------------------------------------------------------------
import random
import string
from collections.abc import Callable, Iterable, Mapping, Sequence
from datetime import datetime, timedelta
from typing import Any, Literal, Self, cast, override

import numpy as np
import pandas as pd
from xlsxwriter import Workbook
from xlsxwriter.worksheet import Worksheet

from absfuyu.core.baseclass import GetClassMembersMixin
from absfuyu.core.docstring import deprecated, versionadded, versionchanged
from absfuyu.core.dummy_func import unidecode
from absfuyu.extra.da.dadf_base import CityData
from absfuyu.extra.da.dadf_base import DataAnalystDataFrameBase as DFBase
from absfuyu.extra.da.dadf_base import SplittedDF
from absfuyu.typings import R as _R
from absfuyu.typings import T as _T
from absfuyu.util import set_min_max


# Column method
# ---------------------------------------------------------------------------
class DataAnalystDataFrameColumnMethodMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Column method

    - Rearrange rightmost column
    - Drop columns
    - Drop rightmost column
    - Add blank column
    - Split str column
    - Get column name unidecoded
    - Get column unidecoded
    """

    def rearrange_rightmost_column(
        self, insert_to_col: str, num_of_cols: int = 1
    ) -> Self:
        """
        Move right-most columns to selected position

        Parameters
        ----------
        insert_to_col : str
            Name of the column that the right-most column will be moved next to

        num_of_cols : int
            Number of columns moved, by default ``1``

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0 -1.583590         756           700           NaN   eqklyckc 2023-05-20
        1  0.203968         167           100           NaN   wzrsxinb 2011-02-27
        >>> df.rearrange_rightmost_column("number")
             number       date  number_big  number_range  missing_value      text
        0 -1.583590 2023-05-20         756           700           NaN   eqklyckc
        1  0.203968 2011-02-27         167           100           NaN   wzrsxinb
        """
        cols: list[str] = self.columns.to_list()  # List of columns
        num_of_cols = int(set_min_max(num_of_cols, min_value=1, max_value=len(cols)))
        col_index: int = cols.index(insert_to_col)
        new_cols: list[str] = (
            cols[: col_index + 1]
            + cols[-num_of_cols:]
            + cols[col_index + 1 : len(cols) - num_of_cols]
        )
        self = self.__class__(self[new_cols])
        return self

    def drop_columns(self, columns: Sequence[str]) -> Self:
        """
        Drop columns in DataFrame

        Parameters
        ----------
        columns : Sequence[str]
            List of columns to drop

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0 -0.283019         666           600           NaN   ztoeeblx 2022-11-13
        1  1.194725         939           900           NaN   fxardqvh 2005-08-04
        >>> df.drop_columns(["date", "text"])
             number  number_big  number_range  missing_value
        0 -0.283019         666           600           NaN
        1  1.194725         939           900           NaN
        """
        for column in columns:
            try:
                self.drop(columns=[column], inplace=True)
            except KeyError:
                # logger.debug(f"{column} column does not exist")
                pass
        return self

    def drop_rightmost(self, num_of_cols: int = 1) -> Self:
        """
        Drop ``num_of_cols`` right-most columns

        Parameters
        ----------
        num_of_cols : int
            Number of columns to drop

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0  0.851953         572           500              5  ncpbnzef 2020-08-15
        1  0.381643         595           500             53  iojogbgj 2011-12-04
        >>> df.drop_rightmost(5)
             number
        0  0.851953
        1  0.381643
        """
        # Restrain
        # if num_of_cols < 1:
        #     num_of_cols = 1
        # if num_of_cols > self.shape[1]:
        #     num_of_cols = self.shape[1]
        num_of_cols = int(
            set_min_max(num_of_cols, min_value=1, max_value=self.shape[1])
        )

        # Logic
        for _ in range(num_of_cols):
            self.drop(self.columns[len(self.columns) - 1], axis=1, inplace=True)
        return self

    @deprecated("5.1.0", reason="Use pd.DataFrame.assign(...) method instead")
    def add_blank_column(self, column_name: str, fill: Any = np.nan, /) -> Self:
        """
        [DEPRECATED] Add a blank column.

        E.g: Use `pd.DataFrame.assign(new_col=lambda x: x['old_col'])` instead

        Parameters
        ----------
        column_name : str
            Name of the column to add

        fill : Any
            Fill the column with data

        Returns
        -------
        Self
            Modified DataFrame
        """
        self[column_name] = [fill] * self.shape[0]
        return self

    @versionadded("5.2.0")  # No test cases
    def split_str_column(
        self,
        col: str,
        pattern: str = " ",
        *,
        n: int | None = None,
        regex: bool = False,
    ) -> Self:
        """
        Split column with dtype[str] into other columns.

        Parameters
        ----------
        col : str
            Column name

        pattern : str, optional
            Split pattern, by default ``" "``

        n : int | None, optional
            Split by how many times, by default ``None``

        regex : bool, optional
            Regex mode, by default ``False``

        Returns
        -------
        Self
            DataFrame


        Example:
        --------
        >>> df = DADF(DADF.sample_df(5)[["text"]])
        >>> df.split_str_column("text", "s")
               text    text_0 text_1
        0  uwfzbsgj     uwfzb     gj
        1  lxlskayx       lxl   kayx
        2  fzgpzjtp  fzgpzjtp   None
        3  lxnytktz  lxnytktz   None
        4  onryaxtt  onryaxtt   None
        """
        if n is None:
            pass
        splited_data: pd.DataFrame = self[col].str.split(pat=pattern, n=n, expand=True, regex=regex)  # type: ignore
        num_of_splitted_cols = splited_data.shape[1]
        new_col_names = [f"{col}_{x}" for x in range(num_of_splitted_cols)]
        self[new_col_names] = splited_data
        return self

    @versionadded("5.12.0")  # No test cases
    def get_column_name_unidecoded(self, col_name: str, /, *, mode: Literal["start", "end", "in"] = "start") -> str:
        """
        Get column name from lowercase unidecode'd version of the name

        Parameters
        ----------
        col_name : str
            Column name to find

        mode : Literal["start", "end", "in"], optional
            Which mode to find, by default "start"
            - "start": str.startswith()
            - "end": str.endswith()
            - "in": if x in y

        Returns
        -------
        str
            Column name

        Raises
        ------
        ValueError
            Column not found
        """
        for x in self.columns.to_list():
            col_name_mod = cast(str, unidecode(x.strip().lower()))
            if mode == "start":
                if col_name_mod.startswith(col_name):
                    return x
            elif mode == "end":
                if col_name_mod.endswith(col_name):
                    return x
            elif mode == "in":
                if col_name_mod in col_name:
                    return x

        raise ValueError(f"Column not found: {col_name}")

    @versionadded("5.12.0")  # No test cases
    def get_column_unidecoded(self, col_name: str, /, *, mode: Literal["start", "end", "in"] = "start") -> pd.Series:
        """
        Get column from lowercase unidecode'd version of the column name

        Parameters
        ----------
        col_name : str
            Column name to find

        mode : Literal["start", "end", "in"], optional
            Which mode to find, by default "start"
            - "start": str.startswith()
            - "end": str.endswith()
            - "in": if x in y

        Returns
        -------
        Series
            Column data
        """
        return self[self.get_column_name_unidecoded(col_name, mode=mode)]


# Row method
# ---------------------------------------------------------------------------
class DataAnalystDataFrameRowMethodMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Row method

    - Get different rows
    - Add blank row
    """

    @versionadded("4.0.0")
    def get_different_rows(self, other: Self | pd.DataFrame) -> Self:
        """
        Subtract DataFrame to find the different rows

        Parameters
        ----------
        other : Self | pd.DataFrame
            DataFrame to subtract

        Returns
        -------
        Self
            Different row DataFrame


        Example:
        --------
        >>> df1 = DADF({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]})
        >>> df2 = DADF({"A": [1, 2, 3, 4], "B": [7, 6, 6, 8]})
        >>> df1.get_different_rows(df2)
           A  B
        0  1  7
        2  3  6
        """
        df = self.copy()
        out = (
            df.merge(other, indicator=True, how="right")
            .query("_merge=='right_only'")
            .drop("_merge", axis=1)
        )
        return self.__class__(out)

    @versionchanged("6.0.0", reason="Improved logic")
    @versionadded("5.7.0")
    def add_blank_row(self, fill: Any = np.nan, /) -> Self:
        """
        Add a new row to the end of a DataFrame.

        Parameters
        ----------
        fill : Any, default np.nan
            Value to fill in the new row (e.g., np.nan, None, "", 0).

        Returns
        -------
        Self
            DataFrame with the new row appended.
        """
        # Create a dict with all columns filled with fill
        new_row = {col: fill for col in self.columns}
        safe_types = self._safe_dtypes(self.dtypes)
        blank_row_df = pd.DataFrame([new_row], columns=self.columns).astype(safe_types)

        # self.loc[len(self)] = new_row  # type: ignore
        # return self
        out = cast(pd.DataFrame, pd.concat([self, blank_row_df], ignore_index=True))
        return self.__class__(out)

    @versionadded("6.0.0")  # Support
    def _safe_dtypes(self, dtypes: pd.Series) -> dict[str, Any]:
        """
        Convert DataFrame dtypes into a safe mapping for operations involving
        missing values (NA), especially during row insertion or concatenation.

        This function is primarily used to prevent pandas errors when inserting
        rows containing missing values (``NaN``) into columns with non-nullable
        integer dtypes (e.g. ``int64``). Since standard NumPy integer dtypes do not
        support missing values, they are converted to pandas' nullable integer
        dtype (``Int64``).

        All non-integer dtypes are preserved without modification.

        - Pandas nullable integer dtypes (``Int64``, ``Int32``, etc.) allow missing
          values via ``pd.NA``, unlike NumPy integer dtypes.
        - This function is commonly used before calling ``DataFrame.astype`` to
          avoid ``IntCastingNaNError`` when NA values are present.
        - The function does **not** modify floating-point, boolean, datetime,
          categorical, or object dtypes.

        Parameters
        ----------
        dtypes : Series
            A Series mapping column names to their pandas dtypes, typically obtained
            from ``DataFrame.dtypes``.

        Returns
        -------
        dict
            A dictionary mapping column names to safe dtypes. Integer dtypes are
            converted to pandas nullable integer dtype (``"Int64"``), while all
            other dtypes remain unchanged.


        Example:
        --------
        Basic usage with a DataFrame::

            >>> df.dtypes
            id          int64
            name       object
            amount    float64
            dtype: object

            >>> _safe_dtypes(df.dtypes)
            {
                "id": "Int64",
                "name": dtype("O"),
                "amount": dtype("float64"),
            }

        Typical integration with ``astype``::

            >>> safe_types = _safe_dtypes(df.dtypes)
            >>> new_df = df.astype(safe_types)

        This is especially useful when inserting rows with missing values::

            >>> sep_row = {"id": pd.NA, "name": "---", "amount": pd.NA}
            >>> sep_df = pd.DataFrame([sep_row]).astype(_safe_dtypes(df.dtypes))
        """
        out = {}
        for col, dt in dtypes.items():
            if pd.api.types.is_integer_dtype(dt):
                out[col] = "Int64"  # nullable integer
            else:
                out[col] = dt
        return out

    @versionadded("6.0.0")  # Better version of add_blank_row()
    def add_separator_row(
        self,
        group_cols: str | Iterable[str],
        *,
        separator: Mapping[str, object] | None = None,
        drop_last: bool = True,
    ) -> Self:
        """
        Insert a separator row after each group in a DataFrame.

        Parameters
        ----------
        group_cols : str | Iterable[str]
            Column(s) used to define grouping boundaries.
            (The DataFrame must be pre-sorted by ``group_cols``.)

        separator : Mapping[str, object] | None, optional
            Custom separator row values (e.g. {"col": "---"}).
            Columns not provided will be filled with NaN.
            If None, a fully blank row is inserted.

        drop_last : bool, optional
            If True, do not insert a separator after the last group.

        Returns
        -------
        Self
            DataFrame with separator rows inserted.
        """
        df = self.copy()

        if isinstance(group_cols, str):
            group_cols = [group_cols]

        # Validate columns
        missing = set(group_cols) - set(df.columns)
        if missing:
            raise KeyError(f"Missing columns: {missing}")

        # Build separator row template
        if separator is None:
            sep_row = {c: np.nan for c in df.columns}
        else:
            sep_row = {c: separator.get(c, np.nan) for c in df.columns}

        rows = []

        safe_types = self._safe_dtypes(df.dtypes)

        # Group while preserving order
        for _, g in df.groupby(group_cols, sort=False):
            rows.append(g)

            sep_df = pd.DataFrame([sep_row], columns=df.columns).astype(safe_types)
            rows.append(sep_df)

        out = cast(pd.DataFrame, pd.concat(rows, ignore_index=True))

        if drop_last:
            out = out.iloc[:-1].reset_index(drop=True)

        return self.__class__(out)


# Info
# ---------------------------------------------------------------------------
class DataAnalystDataFrameInfoMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Info

    - Quick info
    - Quick describe
    - Show distribution
    - Threshold filter
    """

    # Quick info
    @versionadded("3.2.0")
    def qinfo(self) -> str:
        """
        Show quick information about DataFrame

        Example:
        --------
        >>> DADF.sample_df().qinfo()
        Dataset Information:
        - Number of Rows: 100
        - Number of Columns: 6
        - Total observation: 600
        - Missing value: 13 (2.17%)

        Column names:
        ['number', 'number_big', 'number_range', 'missing_value', 'text', 'date']
        """
        missing_values = self.isnull().sum().sum()
        total_observation = self.shape[0] * self.shape[1]
        mv_rate = missing_values / total_observation * 100
        info = (
            f"Dataset Information:\n"
            f"- Number of Rows: {self.shape[0]:,}\n"
            f"- Number of Columns: {self.shape[1]:,}\n"
            f"- Total observation: {total_observation:,}\n"
            f"- Missing value: {missing_values:,} ({mv_rate:.2f}%)\n\n"
            f"Column names:\n{self.columns.to_list()}"
        )
        return info

    @override
    def describe(self, percentiles=None, include=None, exclude=None) -> Self:  # type: ignore
        """pd.DataFrame.describe() override"""
        return self.__class__(super().describe(percentiles, include, exclude))  # type: ignore [no-any-return]

    # Quick describe
    @versionadded("3.2.0")
    def qdescribe(self) -> Self:
        """
        Quick ``describe()`` that excludes ``object`` and ``datetime`` dtype

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> DADF.sample_df().qdescribe()
                   number  number_big  missing_value
        count  100.000000  100.000000      48.000000
        mean    -0.052935  586.750000      22.916667
        std      0.954170  237.248596      11.987286
        min     -2.392952  105.000000       3.000000
        25%     -0.738311  407.500000      13.000000
        50%     -0.068014  607.000000      23.500000
        75%      0.614025  790.250000      36.000000
        max      2.512533  988.000000      42.000000
        """
        return self.__class__(  # type: ignore [no-any-return]
            self[self.select_dtypes(exclude=["object", "datetime"]).columns].describe()
        )

    @versionadded("3.2.0")
    def show_distribution(
        self,
        column_name: str,
        dropna: bool = True,
        *,
        show_percentage: bool = True,
        percentage_round_up: int = 2,
    ) -> Self:
        """
        Show distribution of a column

        Parameters
        ----------
        column_name : str
            Column to show distribution

        dropna : bool
            Count N/A when ``False``
            (Default: ``True``)

        show_percentage : bool
            Show proportion in range 0% - 100% instead of [0, 1]
            (Default: ``True``)

        percentage_round_up : int
            Round up to which decimals
            (Default: ``2``)

        Returns
        -------
        Self
            Distribution DataFrame


        Example:
        --------
        >>> DADF.sample_df().show_distribution("number_range")
          number_range  count  percentage
        0          900     16        16.0
        1          700     15        15.0
        2          300     12        12.0
        3          200     12        12.0
        4          400     11        11.0
        5          600     11        11.0
        6          800     10        10.0
        7          100      9         9.0
        8          500      4         4.0
        """
        out = self[column_name].value_counts(dropna=dropna).to_frame().reset_index()
        if show_percentage:
            out["percentage"] = (out["count"] / self.shape[0] * 100).round(
                percentage_round_up
            )
        else:
            out["percentage"] = (out["count"] / self.shape[0]).round(
                percentage_round_up
            )
        return self.__class__(out)

    @deprecated("5.1.0", reason="Rework THIS")
    def threshold_filter(
        self,
        destination_column: str,
        threshold: int | float = 10,
        *,
        top: int | None = None,
        replace_with: Any = "Other",
    ) -> Self:
        """
        Filter out the percentage of data that is smaller than the ``threshold``,
        and replace all of the smaller data with ``replace_with``.
        As a result, a pie chart is less messy.

        Parameters
        ----------
        destination_column : str
            Column to be filtered

        threshold : int | float
            Which percentage to cut off
            (Default: 10%)

        top : int
            Only show top ``x`` categories in pie chart
            (replaces threshold mode)
            (Default: ``None``)

        replace_with : Any
            Replace all of the smaller data with specified value

        Returns
        -------
        Self
            Modified DataFrame
        """
        # Clean
        try:
            self[destination_column] = self[
                destination_column
            ].str.strip()  # Remove trailing space
        except Exception:
            pass

        # Logic
        col_df = self.show_distribution(destination_column)

        # Rename
        if top is not None:
            list_of_keep: list = (
                col_df[destination_column]
                .head(set_min_max(top - 1, min_value=1, max_value=col_df.shape[0]))  # type: ignore
                .to_list()
            )
            # logger.debug(list_of_keep)
        else:
            list_of_keep = col_df[col_df["percentage"] >= threshold][
                destination_column
            ].to_list()  # values that will not be renamed
        self[f"{destination_column}_filtered"] = self[destination_column].apply(
            lambda x: replace_with if x not in list_of_keep else x
        )

        # Return
        return self


# Missing value
# ---------------------------------------------------------------------------
class DataAnalystDataFrameNAMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Missing value

    - Fill missing values
    - Get missing values
    - Split N/A
    - Apply not null
    - Apply not null row
    """

    def fill_missing_values(
        self, column_name: str, fill: Any = np.nan, *, fill_when_not_exist: Any = np.nan
    ) -> Self:
        """
        Fill missing values in specified column

        Parameters
        ----------
        column_name : str
            Column name

        fill : Any
            Fill the missing values with, by default ``np.nan``

        fill_when_not_exist : Any
            When ``column_name`` does not exist,
            create a new column and fill with
            ``fill_when_not_exist``, by default ``np.nan``

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0  0.174303         926           900           NaN   tenkiakh 2006-09-08
        1  0.305137         140           100           NaN   jzuddamf 2012-04-04
        >>> df.fill_missing_values("missing_value", 0)
             number  number_big  number_range  missing_value      text       date
        0  0.174303         926           900            0.0  tenkiakh 2006-09-08
        1  0.305137         140           100            0.0  jzuddamf 2012-04-04
        >>> df.fill_missing_values("missing_column", 0, fill_when_not_exist=0)
             number  number_big  number_range  missing_value      text       date  missing_column
        0  0.174303         926           900            0.0  tenkiakh 2006-09-08               0
        1  0.305137         140           100            0.0  jzuddamf 2012-04-04               0
        """
        try:
            self[column_name] = self[column_name].fillna(fill)
        except KeyError:
            if getattr(self, "add_blank_column", None) is not None:
                # Compatible with DataAnalystDataFrameColumnMethodMixin
                self.add_blank_column(column_name, fill_when_not_exist)  # type: ignore
        return self

    def get_missing_values(
        self, hightlight: bool = True, *, percentage_round_up: int = 2
    ) -> Self:
        """
        Get a DataFrame that contains the count of missing values for each column

        Parameters
        ----------
        hightlight : bool
            Shows only columns with missing values when ``True``, by default ``True``

        percentage_round_up : int
            Round up to which decimals, by default ``2``

        Returns
        -------
        Self
            Missing value DataFrame


        Example:
        --------
        >>> DADF.sample_df(152).get_missing_values()
                       Num of N/A  Percentage
        missing_value          42       27.63
        """
        # Check for missing value
        df_na = self.isnull().sum().sort_values(ascending=False)
        if hightlight:
            out = df_na[df_na != 0].to_frame()
        else:
            out = df_na.to_frame()
        out.rename(columns={0: "Num of N/A"}, inplace=True)
        out["Percentage"] = (out["Num of N/A"] / self.shape[0] * 100).round(
            percentage_round_up
        )

        # logger.debug(
        #     f"Percentage of N/A over entire DF: "
        #     f"{(self.isnull().sum().sum() / (self.shape[0] * self.shape[1]) * 100).round(percentage_round_up)}%"
        # )
        return self.__class__(out)

    @versionadded("3.1.0")
    def split_na(self, by_column: str) -> SplittedDF:
        """
        Split DataFrame into 2 parts:
        - Without missing value in specified column
        - With missing value in specified column

        Parameters
        ----------
        by_column : str
            Split by column

        Returns
        -------
        SplittedDF
            Splitted DataFrame


        Example:
        --------
        >>> DADF.sample_df(10).split_na("missing_value")
        SplittedDF(
            df=      number  number_big  number_range  missing_value      text       date
            0   0.643254         690           600            3.0  cinvofwj 2018-08-15
            2   0.499345         255           200           13.0  jasifzez 2005-06-01
            3  -1.727036         804           800           38.0  esxjmger 2009-07-24
            4   0.873058         690           600           32.0  htewfpld 2022-07-22
            5  -2.389884         442           400           30.0  hbcnfogu 2006-02-25
            8   0.264584         432           400            2.0  ejbvbmwn 2013-05-11
            9   0.813655         137           100           20.0  oecttada 2024-11-22,
            df_na=      number  number_big  number_range  missing_value      text       date
            1  -0.411354         363           300            NaN  juzecani 2014-12-02
            6  -0.833857         531           500            NaN  ybnntryh 2023-11-03
            7   1.355589         472           400            NaN  zjltghjr 2024-10-09
        )
        """
        out = SplittedDF(
            # df=self[~self[by_column].isna()],  # DF
            df=self[self[by_column].notna()],  # DF
            df_na=self[self[by_column].isna()],  # DF w/o NA
        )
        return out

    @versionadded("5.1.0")
    def apply_notnull(self, col: str, callable: Callable[[Any], _R]) -> Self:
        """
        Only apply callable to non-NaN values in column

        Parameters
        ----------
        col : str
            Column to apply

        callable : Callable[[Any], _R]
            Callable

        Returns
        -------
        Self
            Applied DataFrame


        Example:
        --------
        >>> DADF.sample_df(5).apply_notnull("missing_value", lambda _: "REPLACED")
             number  number_big  number_range missing_value      text       date
        0  0.852218         157           100      REPLACED  dqzxaxxs 2006-03-08
        1  1.522428         616           600           NaN  mivkaooe 2018-12-27
        2  0.108506         745           700      REPLACED  qanwwjet 2005-07-14
        3 -1.435079         400           400      REPLACED  ywahcasi 2024-05-20
        4  0.118993         861           800      REPLACED  saoupuby 2019-04-28
        """
        self[col] = self[col].apply(lambda x: callable(x) if pd.notnull(x) else x)  # type: ignore
        return self

    @versionadded("5.1.0")  # type: ignore
    def apply_notnull_row(
        self,
        apply_when_null: Callable[[Any], _R] | _T | None = None,
        apply_when_not_null: Callable[[Any], _R] | _T | None = None,
        col_name: str | None = None,
    ) -> Self:
        """
        Apply to DataFrame's rows depending on missing values.

        Parameters
        ----------
        apply_when_null : Callable[[Any], R] | T | None, optional
            Callable or Any, by default ``None``: applied when the row contains null values

        apply_when_not_null : Callable[[Any], R] | T | None, optional
            Callable or Any, by default ``None``: applied when the entire row is not null

        col_name : str | None, optional
            Output column name, by default ``None`` (uses a default name)

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF({"A": [None, 2, 3, 4], "B": [1, None, 3, 4], "C": [None, 2, None, 4]})
        >>> df.apply_notnull_row()
             A    B    C  applied_row_null
        0  NaN  1.0  NaN             False
        1  2.0  NaN  2.0             False
        2  3.0  3.0  NaN             False
        3  4.0  4.0  4.0              True
        >>> df.apply_notnull_row(0, 1)
             A    B    C  applied_row_null
        0  NaN  1.0  NaN                 0
        1  2.0  NaN  2.0                 0
        2  3.0  3.0  NaN                 0
        3  4.0  4.0  4.0                 1
        >>> df.apply_notnull_row(lambda _: "n", lambda _: "y", col_name="mod")
             A    B    C mod
        0  NaN  1.0  NaN   n
        1  2.0  NaN  2.0   n
        2  3.0  3.0  NaN   n
        3  4.0  4.0  4.0   y
        """

        def apply_func(row: pd.Series):
            # Both None
            if apply_when_null is None and apply_when_not_null is None:
                return row.notnull().all()

            # When all values in row are not null
            if row.notnull().all():
                if callable(apply_when_not_null):
                    return apply_when_not_null(row)
                return apply_when_not_null

            # When any value in row is null
            if callable(apply_when_null):
                return apply_when_null(row)
            return apply_when_null

        # Column name
        cname = "applied_row_null" if col_name is None else col_name
        self[cname] = self.apply(apply_func, axis=1)  # type: ignore

        return self


# Other
# ---------------------------------------------------------------------------
class DataAnalystDataFrameOtherMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Other method/Stuff

    - Merge left
    """

    @versionadded("4.0.0")
    def merge_left(
        self,
        other: Self | pd.DataFrame,
        on: str,
        columns: list[str] | None = None,
    ) -> Self:
        """
        Merge left of 2 DataFrames

        Parameters
        ----------
        other : Self | pd.DataFrame
            DataFrame to merge

        on : str
            Merge on which column

        columns : list[str] | None, optional
            Columns to take from the other DataFrame, by default ``None``
            (take all columns)

        Returns
        -------
        Self
            Merged DataFrame


        Example:
        --------
        >>> df1 = DADF({
        ...     "id": [1, 2, 5],
        ...     "name": ["Alice", "Bob", "Rich"],
        ...     "age": [20, 20, 20],
        ... })
        >>> df2 = DADF({
        ...     "id": [1, 2, 3],
        ...     "age": [25, 30, 45],
        ...     "department": ["HR", "IT", "PM"],
        ...     "salary": [50000, 60000, 55000],
        ... })
        >>> df1.merge_left(df2, on="id")
           id   name  age_x  age_y department   salary
        0   1  Alice     20   25.0         HR  50000.0
        1   2    Bob     20   30.0         IT  60000.0
        2   5   Rich     20    NaN        NaN      NaN
        >>> df1.merge_left(df2, on="id", columns=["salary"])
           id   name   age department   salary
        0   1  Alice  25.0         HR  50000.0
        1   2    Bob  30.0         IT  60000.0
        2   5   Rich   NaN        NaN      NaN
        """

        if columns is not None:
            current_col = [on]
            current_col.extend(columns)
            col = other.columns.to_list()
            cols = list(set(col) - set(current_col))

            if getattr(self, "drop_columns", None) is not None:
                # Compatible with DataAnalystDataFrameColumnMethodMixin
                self.drop_columns(cols)  # type: ignore

        out = self.merge(other, how="left", on=on)
        return self.__class__(out)


# Date
# ---------------------------------------------------------------------------
class DataAnalystDataFrameDateMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Date

    - Add date column from month column
    - Add detail date
    - Delta date (how many days in between)
    """

    def add_date_from_month(self, month_column: str, *, col_name: str = "date") -> Self:
        """
        Add dummy ``date`` column from ``month`` column

        Parameters
        ----------
        month_column : str
            Month column

        col_name : str
            New date column name, by default ``"date"``

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = (
        ...     DADF.sample_df(2)
        ...     .add_detail_date("date", mode="m")
        ...     .drop_columns(["date", "number", "number_range"])
        ... )
        >>> df
           number_big  missing_value      text  month
        0         755            NaN  lincgqzl      4
        1         907            NaN  gxltrjku     10
        >>> df.add_date_from_month("month")
           number_big  missing_value      text  month       date
        0         755            NaN  lincgqzl      4 2025-04-01
        1         907            NaN  gxltrjku     10 2025-10-01
        """
        _this_year = datetime.now().year
        self[col_name] = pd.to_datetime(
            f"{_this_year}-" + self[month_column].astype(int).astype(str) + "-1",
            format="%Y-%m-%d",
        )

        # Rearrange
        if getattr(self, "rearrange_rightmost_column", None) is not None:
            # Compatible with DataAnalystDataFrameColumnMethodMixin
            return self.rearrange_rightmost_column(month_column)  # type: ignore [no-any-return]
        return self

    def add_detail_date(self, date_column: str, mode: str = "dwmy") -> Self:
        """
        Add these columns from ``date_column``:
        - ``date`` (won't add if ``date_column`` value is ``"date"``)
        - ``day`` (overwrite if already exist)
        - ``week`` (overwrite if already exist)
        - ``month`` (overwrite if already exist)
        - ``year`` (overwrite if already exist)

        Parameters
        ----------
        date_column : str
            Date column

        mode : str
            | Detailed column to add
            | ``d``: day
            | ``w``: week number
            | ``m``: month
            | ``y``: year
            | (Default: ``"dwmy"``)

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0  0.331195         902           900             20  fgyanxik 2021-10-18
        1 -0.877727         378           300             13  dqvaggjo 2007-03-06
        >>> df.add_detail_date("date")
             number  number_big  number_range  missing_value      text       date  day  week  month  year
        0  0.331195         902           900             20  fgyanxik 2021-10-18   18    42     10  2021
        1 -0.877727         378           300             13  dqvaggjo 2007-03-06    6    10      3  2007
        """
        # Convert to datetime
        self["date"] = pd.to_datetime(self[date_column])

        # Logic
        col_counter = 0
        # self["weekday"] = self["day"].dt.isocalendar().day  # Weekday
        if mode.find("d") != -1:
            # logger.debug("Mode: 'day'")
            self["day"] = self["date"].dt.day
            col_counter += 1
        if mode.find("w") != -1:
            # logger.debug("Mode: 'weekday'")
            self["week"] = self["date"].dt.isocalendar().week
            col_counter += 1
        if mode.find("m") != -1:
            # logger.debug("Mode: 'month'")
            self["month"] = self["date"].dt.month
            col_counter += 1
        if mode.find("y") != -1:
            # logger.debug("Mode: 'year'")
            self["year"] = self["date"].dt.year
            col_counter += 1

        # Return
        if getattr(self, "rearrange_rightmost_column", None) is not None:
            # Compatible with DataAnalystDataFrameColumnMethodMixin
            return self.rearrange_rightmost_column(date_column, col_counter)  # type: ignore [no-any-return]
        return self

    def delta_date(
        self,
        date_column: str,
        mode: Literal["now", "between_row"] = "now",
        *,
        col_name: str = "delta_date",
    ) -> Self:
        """
        Calculate date interval

        Parameters
        ----------
        date_column : str
            Date column

        mode : str
            | Mode to calculate
            | ``"between_row"``: Calculate date interval between each row
            | ``"now"``: Calculate date interval to current date
            | (Default: ``"now"``)

        col_name : str
            | New delta date column name
            | (Default: ``"delta_date"``)

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0 -0.729988         435           400             21  xkrqqouf 2014-08-01
        1 -0.846031         210           200              5  rbkmiqxt 2024-07-10
        >>> df.delta_date("date")
             number  number_big  number_range  missing_value      text       date  delta_date
        0 -0.729988         435           400             21  xkrqqouf 2014-08-01        3873
        1 -0.846031         210           200              5  rbkmiqxt 2024-07-10         242
        """
        if mode.lower().startswith("between_row"):
            dated = self[date_column].to_list()
            cal: list[timedelta] = []
            for i in range(len(dated)):
                if i == 0:
                    cal.append(dated[i] - dated[i])
                    # cal.append(relativedelta(dated[i], dated[i]))
                else:
                    cal.append(dated[i] - dated[i - 1])
                    # cal.append(relativedelta(dated[i], dated[i - 1]))
            self[col_name] = [x.days for x in cal]
        else:  # mode="now"
            self[col_name] = self[date_column].apply(
                lambda x: (datetime.now() - x).days
            )
        return self

    @versionadded("6.0.0")
    def normalize_datetime_column(
        self,
        col: str,
        *,
        inplace: bool = False,
    ) -> Self:
        """
        Normalize a datetime column by removing the time component.

        This function converts the specified column to pandas datetime (``datetime64[ns]``)
        (if not already), then normalizes all values so that the time
        component is set to ``00:00:00``. The date component is preserved.

        The function safely handles missing or invalid values by coercing
        them to ``NaT``.

        Parameters
        ----------
        col : str
            Name of the column to normalize. The column may contain
            datetime-like values, strings, or mixed types.

        inplace : bool, default False
            | If ``True``, modify the input DataFrame in place.
            | If ``False``, operate on a copy and return the modified DataFrame.

        Returns
        -------
        Self
            DataFrame with the normalized datetime column.


        Example:
        --------
        Basic usage::

            >>> df = DADF({
            ...     "created_at": ["2024-01-01 10:15:30", "2024-01-02 23:59:59"]
            ... })
            >>> df.normalize_datetime_column("created_at")
                       created_at
            0 2024-01-01 00:00:00
            1 2024-01-02 00:00:00

        In-place modification::

            >>> df.normalize_datetime_column("created_at", inplace=True)

        Handling invalid values::

            >>> df = DADF({"dt": ["2024-01-01 10:00", "invalid"]})
            >>> df.normalize_datetime_column("dt")
                       dt
            0  2024-01-01
            1         NaT
        """
        if not inplace:
            df = self.copy()
        else:
            df = self

        # Using ``df.loc[:, col]`` avoids ``SettingWithCopyWarning`` when the input DataFrame is a slice.
        df.loc[:, col] = pd.to_datetime(df[col], errors="coerce").dt.normalize()
        return df


# Export
# ---------------------------------------------------------------------------
class DataAnalystDataFrameExportMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Export method

    - da_export
    """

    @versionchanged("5.8.0", "New parameter")
    def da_export(
        self,
        path: str,
        sheet_name: str = "Sheet1",
        *,
        auto_width: bool = True,
        cols_contain_centered_text: list[str] | None = None,
        cols_contain_number: list[str] | None = None,
        cols_contain_percentage: list[str] | None = None,
    ) -> None:
        """
        Export DataFrame with `xlsxwriter` engine

        Parameters
        ----------
        path : str
            Path to export

        sheet_name : str, optional
            Sheet name, by default "Sheet1"

        auto_width : bool, optional
            Auto resize column width, by default ``True``

        cols_contain_centered_text : list[str] | None, optional
            Columns that contain centered text (align center), by default None

        cols_contain_number : list[str] | None, optional
            Columns that contain number values (to format as number - int), by default None

        cols_contain_percentage : list[str] | None, optional
            Columns that contain percentage values (to format as percentage), by default None
        """

        # Using xlsxwriter engine
        with pd.ExcelWriter(path, engine="xlsxwriter") as writer:
            self.to_excel(writer, sheet_name=sheet_name, index=False, float_format="%.2f", na_rep="")

            # Format style
            workbook: Workbook = writer.book  # type: ignore
            header_fmt = workbook.add_format(
                {
                    "bold": True,
                    "text_wrap": True,
                    "border": 1,
                    "align": "center",
                    "valign": "vcenter",
                    # "bg_color": "#A0BEFD",
                }
            )
            number_fmt = workbook.add_format(
                {"num_format": "#,##0", "align": "center", "valign": "vcenter"}
            )  # 1,000,000
            percent_fmt = workbook.add_format({"num_format": "0.00%", "align": "center", "valign": "vcenter"})  # 1.00%
            text_fmt = workbook.add_format({"valign": "vcenter"})
            text_center_fmt = workbook.add_format({"align": "center", "valign": "vcenter"})

            # Format sheet
            worksheet: Worksheet = writer.sheets[sheet_name]

            # Format header - First row
            for col_num, value in enumerate(self.columns.values):
                worksheet.write(0, col_num, value, header_fmt)

            rules = [
                (cols_contain_number, number_fmt),
                (cols_contain_percentage, percent_fmt),
                (cols_contain_centered_text, text_center_fmt),
            ]

            # Auto width + col format
            for i, col in enumerate(self.columns):
                # Max str len of each column (skipped when auto_width is disabled)
                max_len = max(self[col].astype(str).map(len).max(), len(col)) + 2 if auto_width else None
                worksheet.set_column(i, i, max_len)  # Set width

                # Format style
                fmt = text_fmt  # default
                for cols, f in rules:
                    if cols is not None and col in cols:
                        fmt = f
                        break
                worksheet.set_column(i, i, max_len, fmt)

            # if cols_contain_number is not None:
            #     for x in cols_contain_number:
            #         self[x] = pd.to_numeric(self[x], errors="coerce")


# City
# ---------------------------------------------------------------------------
class DataAnalystDataFrameCityMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - City

    - Convert city
    """

    def convert_city(
        self,
        city_column: str,
        city_list: list[CityData],
        *,
        mode: str = "ra",
    ) -> Self:
        """
        Get ``region`` and ``area`` of a city

        Parameters
        ----------
        city_column : str
            Column that contains city data

        city_list : list[CityData]
            List of cities in correct format

        mode : str
            | Detailed column to add
            | ``r``: region
            | ``a``: area
            | (Default: ``"ra"``)

        Returns
        -------
        Self
            Modified DataFrame
        """

        # Support function
        def _convert_city_support(value: str) -> CityData:
            for x in city_list:
                if x.city.lower().startswith(value.lower()):
                    return x
            return CityData(city=value, region=np.nan, area=np.nan)  # type: ignore

        # Convert
        col_counter = 0
        if mode.find("r") != -1:
            # logger.debug("Mode: 'region'")
            self["region"] = self[city_column].apply(
                lambda x: _convert_city_support(x).region
            )
            col_counter += 1
        if mode.find("a") != -1:
            # logger.debug("Mode: 'area'")
            self["area"] = self[city_column].apply(
                lambda x: _convert_city_support(x).area
            )
            col_counter += 1

        # Rearrange
        if getattr(self, "rearrange_rightmost_column", None) is not None:
            return self.rearrange_rightmost_column(city_column, col_counter)  # type: ignore [no-any-return]
        return self


# Main
# ---------------------------------------------------------------------------
class DADF(
    GetClassMembersMixin,
    DataAnalystDataFrameCityMixin,
    DataAnalystDataFrameExportMixin,
    DataAnalystDataFrameDateMixin,
    DataAnalystDataFrameOtherMixin,
    DataAnalystDataFrameNAMixin,
    DataAnalystDataFrameInfoMixin,
    DataAnalystDataFrameRowMethodMixin,
    DataAnalystDataFrameColumnMethodMixin,
):
    """
    Data Analyst ``pd.DataFrame``

    For a list of extra methods:
    >>> print(DADF.DADF_METHODS)
    """

    @classmethod
    @deprecated("5.1.0")
    @versionadded("3.2.0")
    def dadf_help(cls) -> list[str]:
        """
        Show all available methods of DataAnalystDataFrame
        """
        list_of_method = list(set(dir(cls)) - set(dir(pd.DataFrame)))
        return sorted(list_of_method)

    @classmethod
    def sample_df(cls, size: int = 100) -> Self:
        """
        Create sample DataFrame

        Parameters
        ----------
        size : int
            Number of observations, by default ``100``

        Returns
        -------
        Self
            DataFrame with these columns:
            [number, number_big, number_range, missing_value, text, date]


        Example:
        --------
        >>> DADF.sample_df()
              number  number_big number_range  missing_value      text       date
        0  -2.089770         785          700            NaN  vwnlqoql 2013-11-20
        1  -0.526689         182          100           24.0  prjjcvqc 2007-04-13
        2  -1.596514         909          900            8.0  cbcpzlac 2023-05-24
        3   2.982191         989          900           21.0  ivwqwuvd 2022-04-28
        4   1.687803         878          800            NaN  aajtncum 2005-10-05
        ..       ...         ...          ...            ...       ...        ...
        95 -1.295145         968          900           16.0  mgqunkhi 2016-04-12
        96  1.296795         255          200            NaN  lwvytego 2014-05-10
        97  1.440746         297          200            5.0  lqsoykun 2010-04-03
        98  0.327702         845          800            NaN  leadkvsy 2005-08-05
        99  0.556720         981          900           36.0  bozmxixy 2004-02-22
        [100 rows x 6 columns]
        """
        # Restrain
        size = max(size, 1)

        # Number col
        df = cls(np.random.randn(size, 1), columns=["number"])
        df["number_big"] = [
            random.choice(range(100, 999)) for _ in range(size)
        ]  # Big number in range 100-999
        df["number_range"] = df["number_big"].apply(lambda x: str(x)[0] + "00")

        # Missing value col
        na_rate = random.randint(1, 99)
        d = [random.randint(1, 99) for _ in range(size)]
        df["missing_value"] = list(map(lambda x: x if x < na_rate else np.nan, d))
        # df["missing_value"] = [random.choice([random.randint(1, 99), np.nan]) for _ in range(observations)]

        # Text col
        df["text"] = [
            "".join([random.choice(string.ascii_lowercase) for _ in range(8)])
            for _ in range(size)
        ]

        # Random date col
        df["date"] = [
            datetime(
                year=random.randint(datetime.now().year - 20, datetime.now().year),
                month=random.randint(1, 12),
                day=random.randint(1, 28),
            )
            for _ in range(size)
        ]

        # Return
        return df


class DADF_WIP(DADF):
    """
    W.I.P - No test cases written
    """

    pass


if __name__ == "__main__":
    from pathlib import Path

    # t = DADF.sample_df().show_distribution("number_range", show_percentage=False)
    # t.da_export(
    #     Path(__file__).parent.joinpath("a.xlsx").resolve().__str__(),
    #     cols_contain_number=["number_range"],
    #     cols_contain_percentage=["percentage"],
    # )
    # print(t)

    df = DADF.sample_df(10)
    print(df.add_blank_row())