absfuyu 5.0.1__py3-none-any.whl → 5.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of absfuyu might be problematic. Click here for more details.
- absfuyu/__init__.py +1 -1
- absfuyu/__main__.py +3 -3
- absfuyu/cli/__init__.py +2 -2
- absfuyu/cli/color.py +30 -14
- absfuyu/cli/config_group.py +9 -2
- absfuyu/cli/do_group.py +13 -6
- absfuyu/cli/game_group.py +9 -2
- absfuyu/cli/tool_group.py +15 -9
- absfuyu/config/__init__.py +2 -2
- absfuyu/core/__init__.py +2 -2
- absfuyu/core/baseclass.py +448 -79
- absfuyu/core/baseclass2.py +2 -2
- absfuyu/core/decorator.py +70 -4
- absfuyu/core/docstring.py +43 -25
- absfuyu/core/dummy_cli.py +2 -2
- absfuyu/core/dummy_func.py +15 -4
- absfuyu/dxt/__init__.py +2 -2
- absfuyu/dxt/dictext.py +5 -2
- absfuyu/dxt/dxt_support.py +2 -2
- absfuyu/dxt/intext.py +34 -3
- absfuyu/dxt/listext.py +300 -113
- absfuyu/dxt/strext.py +75 -15
- absfuyu/extra/__init__.py +2 -2
- absfuyu/extra/beautiful.py +2 -2
- absfuyu/extra/da/__init__.py +36 -0
- absfuyu/extra/da/dadf.py +1177 -0
- absfuyu/extra/da/dadf_base.py +186 -0
- absfuyu/extra/da/df_func.py +97 -0
- absfuyu/extra/da/mplt.py +219 -0
- absfuyu/extra/data_analysis.py +10 -1067
- absfuyu/fun/__init__.py +2 -2
- absfuyu/fun/tarot.py +2 -2
- absfuyu/game/__init__.py +2 -2
- absfuyu/game/game_stat.py +2 -2
- absfuyu/game/sudoku.py +2 -2
- absfuyu/game/tictactoe.py +2 -3
- absfuyu/game/wordle.py +2 -2
- absfuyu/general/__init__.py +2 -2
- absfuyu/general/content.py +2 -2
- absfuyu/general/human.py +2 -2
- absfuyu/general/shape.py +2 -2
- absfuyu/logger.py +2 -2
- absfuyu/pkg_data/__init__.py +2 -2
- absfuyu/pkg_data/deprecated.py +2 -2
- absfuyu/sort.py +2 -2
- absfuyu/tools/__init__.py +28 -2
- absfuyu/tools/checksum.py +27 -7
- absfuyu/tools/converter.py +120 -34
- absfuyu/tools/generator.py +251 -110
- absfuyu/tools/inspector.py +463 -0
- absfuyu/tools/keygen.py +2 -2
- absfuyu/tools/obfuscator.py +45 -7
- absfuyu/tools/passwordlib.py +88 -24
- absfuyu/tools/shutdownizer.py +2 -2
- absfuyu/tools/web.py +2 -2
- absfuyu/typings.py +136 -0
- absfuyu/util/__init__.py +18 -4
- absfuyu/util/api.py +36 -16
- absfuyu/util/json_method.py +43 -14
- absfuyu/util/lunar.py +2 -2
- absfuyu/util/path.py +190 -82
- absfuyu/util/performance.py +122 -7
- absfuyu/util/shorten_number.py +40 -10
- absfuyu/util/text_table.py +306 -0
- absfuyu/util/zipped.py +8 -7
- absfuyu/version.py +2 -2
- {absfuyu-5.0.1.dist-info → absfuyu-5.2.0.dist-info}/METADATA +9 -2
- absfuyu-5.2.0.dist-info/RECORD +76 -0
- absfuyu-5.0.1.dist-info/RECORD +0 -68
- {absfuyu-5.0.1.dist-info → absfuyu-5.2.0.dist-info}/WHEEL +0 -0
- {absfuyu-5.0.1.dist-info → absfuyu-5.2.0.dist-info}/entry_points.txt +0 -0
- {absfuyu-5.0.1.dist-info → absfuyu-5.2.0.dist-info}/licenses/LICENSE +0 -0
absfuyu/extra/da/dadf.py
ADDED
|
@@ -0,0 +1,1177 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Absfuyu: Data Analysis
|
|
3
|
+
----------------------
|
|
4
|
+
Data Analyst DataFrame
|
|
5
|
+
|
|
6
|
+
Version: 5.2.0
|
|
7
|
+
Date updated: 15/03/2025 (dd/mm/yyyy)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
# Module level
|
|
11
|
+
# ---------------------------------------------------------------------------
|
|
12
|
+
__all__ = [
|
|
13
|
+
"DADF",
|
|
14
|
+
"DataAnalystDataFrameColumnMethodMixin",
|
|
15
|
+
"DataAnalystDataFrameRowMethodMixin",
|
|
16
|
+
"DataAnalystDataFrameInfoMixin",
|
|
17
|
+
"DataAnalystDataFrameNAMixin",
|
|
18
|
+
"DataAnalystDataFrameOtherMixin",
|
|
19
|
+
"DataAnalystDataFrameDateMixin",
|
|
20
|
+
"DataAnalystDataFrameCityMixin",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Library
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
import random
|
|
27
|
+
import string
|
|
28
|
+
from collections.abc import Callable, Sequence
|
|
29
|
+
from datetime import datetime, timedelta
|
|
30
|
+
from typing import Any, Literal, Self
|
|
31
|
+
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
from typing import override # type: ignore
|
|
37
|
+
except ImportError:
|
|
38
|
+
from absfuyu.core.decorator import dummy_decorator as override
|
|
39
|
+
|
|
40
|
+
from absfuyu.core.baseclass import ShowAllMethodsMixin
|
|
41
|
+
from absfuyu.core.docstring import deprecated, versionadded
|
|
42
|
+
from absfuyu.extra.da.dadf_base import CityData
|
|
43
|
+
from absfuyu.extra.da.dadf_base import DataAnalystDataFrameBase as DFBase
|
|
44
|
+
from absfuyu.extra.da.dadf_base import SplittedDF
|
|
45
|
+
from absfuyu.logger import logger
|
|
46
|
+
from absfuyu.typings import R as _R
|
|
47
|
+
from absfuyu.typings import T as _T
|
|
48
|
+
from absfuyu.util import set_min_max
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Column method
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
class DataAnalystDataFrameColumnMethodMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Column method

    - Rearrange rightmost column
    - Drop columns
    - Drop rightmost column
    - Add blank column
    """

    def rearrange_rightmost_column(
        self, insert_to_col: str, num_of_cols: int = 1
    ) -> Self:
        """
        Move right-most columns to selected position

        Parameters
        ----------
        insert_to_col : str
            Name of the column that the right-most column will be moved next to

        num_of_cols : int
            Number of columns moved, by default ``1``

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0 -1.583590         756           700            NaN  eqklyckc 2023-05-20
        1  0.203968         167           100            NaN  wzrsxinb 2011-02-27
        >>> df.rearrange_rightmost_column("number")
             number       date  number_big  number_range  missing_value      text
        0 -1.583590 2023-05-20         756           700            NaN  eqklyckc
        1  0.203968 2011-02-27         167           100            NaN  wzrsxinb
        """
        cols: list[str] = self.columns.to_list()  # List of columns
        # Clamp to [1, len(cols)] so the slices below stay in range
        num_of_cols = int(set_min_max(num_of_cols, min_value=1, max_value=len(cols)))
        # Position of the anchor column (list.index raises ValueError if missing)
        col_index: int = cols.index(insert_to_col)
        # New order: everything up to and including the anchor, then the
        # right-most ``num_of_cols`` columns, then the remaining middle part
        new_cols: list[str] = (
            cols[: col_index + 1]
            + cols[-num_of_cols:]
            + cols[col_index + 1 : len(cols) - num_of_cols]
        )
        # Re-wrap in the same subclass so the fluent (Self-returning) API holds
        self = self.__class__(self[new_cols])
        return self

    def drop_columns(self, columns: Sequence[str]) -> Self:
        """
        Drop columns in DataFrame

        Parameters
        ----------
        columns : Iterable[str]
            List of columns need to drop

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0 -0.283019         666           600            NaN  ztoeeblx 2022-11-13
        1  1.194725         939           900            NaN  fxardqvh 2005-08-04
        >>> df.drop_columns(["date", "text"])
             number  number_big  number_range  missing_value
        0 -0.283019         666           600            NaN
        1  1.194725         939           900            NaN
        """
        # Drop one at a time so a missing column does not abort the rest
        for column in columns:
            try:
                self.drop(columns=[column], inplace=True)
            except KeyError:
                # Best-effort: missing columns are logged and skipped
                logger.debug(f"{column} column does not exist")
        return self

    def drop_rightmost(self, num_of_cols: int = 1) -> Self:
        """
        Drop ``num_of_cols`` right-most columns

        Parameters
        ----------
        num_of_cols : int
            Number of columns to drop

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0  0.851953         572           500              5  ncpbnzef 2020-08-15
        1  0.381643         595           500             53  iojogbgj 2011-12-04
        >>> df.drop_rightmost(5)
             number
        0  0.851953
        1  0.381643
        """
        # Restrain: clamp to [1, number of columns] before dropping
        num_of_cols = int(
            set_min_max(num_of_cols, min_value=1, max_value=self.shape[1])
        )

        # Logic: repeatedly drop the (current) last column, in place
        for _ in range(num_of_cols):
            self.drop(self.columns[len(self.columns) - 1], axis=1, inplace=True)
        return self

    @deprecated("5.1.0", reason="Use pd.DataFrame.assign(...) method instead")
    def add_blank_column(self, column_name: str, fill: Any = np.nan, /) -> Self:
        """
        Add a blank column

        Parameters
        ----------
        column_name : str
            Name of the column to add

        fill : Any
            Fill the column with data

        Returns
        -------
        Self
            Modified DataFrame
        """
        # Same ``fill`` object repeated per row; note a mutable ``fill`` is shared
        self[column_name] = [fill] * self.shape[0]
        return self
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
# Row method
|
|
205
|
+
# ---------------------------------------------------------------------------
|
|
206
|
+
class DataAnalystDataFrameRowMethodMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Row method

    - Get different rows
    """

    @versionadded("4.0.0")
    def get_different_rows(self, other: Self | pd.DataFrame) -> Self:
        """
        Subtract DataFrame to find the different rows

        Parameters
        ----------
        other : Self | pd.DataFrame
            DataFrame to subtract

        Returns
        -------
        Self
            Different row DataFrame


        Example:
        --------
        >>> df1 = DADF({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]})
        >>> df2 = DADF({"A": [1, 2, 3, 4], "B": [7, 6, 6, 8]})
        >>> df1.get_different_rows(df2)
           A  B
        0  1  7
        2  3  6
        """
        # Right-merge with an indicator column: rows present only in ``other``
        # are tagged "right_only" -- exactly the rows that differ.
        merged = self.copy().merge(other, indicator=True, how="right")
        right_only = merged[merged["_merge"] == "right_only"]
        # Strip the helper column and re-wrap in the calling subclass
        return self.__class__(right_only.drop(columns="_merge"))
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
# Info
|
|
248
|
+
# ---------------------------------------------------------------------------
|
|
249
|
+
class DataAnalystDataFrameInfoMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Info

    - Quick info
    - Quick describe
    - Show distribution
    - Threshold filter
    """

    # Quick info
    @versionadded("3.2.0")
    def qinfo(self) -> str:
        """
        Show quick information about DataFrame

        Example:
        --------
        >>> DADF.sample_df().qinfo()
        Dataset Information:
        - Number of Rows: 100
        - Number of Columns: 6
        - Total observation: 600
        - Missing value: 13 (2.17%)

        Column names:
        ['number', 'number_big', 'number_range', 'missing_value', 'text', 'date']
        """
        missing_values = self.isnull().sum().sum()  # total N/A over all columns
        total_observation = self.shape[0] * self.shape[1]  # rows * columns
        # NOTE(review): raises ZeroDivisionError on an empty DataFrame -- confirm intended
        mv_rate = missing_values / total_observation * 100
        info = (
            f"Dataset Information:\n"
            f"- Number of Rows: {self.shape[0]:,}\n"
            f"- Number of Columns: {self.shape[1]:,}\n"
            f"- Total observation: {total_observation:,}\n"
            f"- Missing value: {missing_values:,} ({mv_rate:.2f}%)\n\n"
            f"Column names:\n{self.columns.to_list()}"
        )
        return info

    @override
    def describe(self, percentiles=None, include=None, exclude=None) -> Self:
        """pd.DataFrame.describe() override (re-wraps result in the subclass)"""
        return self.__class__(super().describe(percentiles, include, exclude))  # type: ignore [no-any-return]

    # Quick describe
    @versionadded("3.2.0")
    def qdescribe(self) -> Self:
        """
        Quick ``describe()`` that exclude ``object`` and ``datetime`` dtype

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> DADF.sample_df().qdescribe()
                   number  number_big  missing_value
        count  100.000000  100.000000      48.000000
        mean    -0.052935  586.750000      22.916667
        std      0.954170  237.248596      11.987286
        min     -2.392952  105.000000       3.000000
        25%     -0.738311  407.500000      13.000000
        50%     -0.068014  607.000000      23.500000
        75%      0.614025  790.250000      36.000000
        max      2.512533  988.000000      42.000000
        """
        return self.__class__(  # type: ignore [no-any-return]
            self[self.select_dtypes(exclude=["object", "datetime"]).columns].describe()
        )

    @versionadded("3.2.0")
    def show_distribution(
        self,
        column_name: str,
        dropna: bool = True,
        *,
        show_percentage: bool = True,
        percentage_round_up: int = 2,
    ) -> Self:
        """
        Show distribution of a column

        Parameters
        ----------
        column_name : str
            Column to show distribution

        dropna : bool
            Count N/A when ``False``
            (Default: ``True``)

        show_percentage : bool
            Show proportion in range 0% - 100% instead of [0, 1]
            (Default: ``True``)

        percentage_round_up : int
            Round up to which decimals
            (Default: ``2``)

        Returns
        -------
        Self
            Distribution DataFrame


        Example:
        --------
        >>> DADF.sample_df().show_distribution("number_range")
           number_range  count  percentage
        0           900     16        16.0
        1           700     15        15.0
        2           300     12        12.0
        3           200     12        12.0
        4           400     11        11.0
        5           600     11        11.0
        6           800     10        10.0
        7           100      9         9.0
        8           500      4         4.0
        """
        # value_counts gives one row per distinct value, sorted by count desc
        out = self[column_name].value_counts(dropna=dropna).to_frame().reset_index()
        # Proportion is computed over ALL rows of self (not just counted ones)
        if show_percentage:
            out["percentage"] = (out["count"] / self.shape[0] * 100).round(
                percentage_round_up
            )
        else:
            out["percentage"] = (out["count"] / self.shape[0]).round(
                percentage_round_up
            )
        return self.__class__(out)

    @deprecated("5.1.0", reason="Rework THIS")
    def threshold_filter(
        self,
        destination_column: str,
        threshold: int | float = 10,
        *,
        top: int | None = None,
        replace_with: Any = "Other",
    ) -> Self:
        """
        Filter out percentage of data that smaller than the ``threshold``,
        replace all of the smaller data to ``replace_with``.
        As a result, pie chart is less messy.

        Parameters
        ----------
        destination_column : str
            Column to be filtered

        threshold : int | float
            Which percentage to cut-off
            (Default: 10%)

        top : int
            Only show top ``x`` categories in pie chart
            (replace threshold mode)
            (Default: ``None``)

        replace_with : Any
            Replace all of the smaller data with specified value

        Returns
        -------
        Self
            Modified DataFrame
        """
        # Clean: strip whitespace if the column holds strings; best-effort,
        # non-string dtypes simply skip this step
        try:
            self[destination_column] = self[
                destination_column
            ].str.strip()  # Remove trailing space
        except Exception:
            pass

        # Logic: distribution table (count + percentage) for the column
        col_df = self.show_distribution(destination_column)

        # Build the keep-list; everything else gets renamed to ``replace_with``
        if top is not None:
            # Keep the ``top - 1`` most frequent values (the replacement value
            # itself becomes the remaining category) -- clamped to table size
            list_of_keep: list = (
                col_df[destination_column]
                .head(set_min_max(top - 1, min_value=1, max_value=col_df.shape[0]))
                .to_list()
            )
        else:
            # Keep values whose share is at least ``threshold`` percent
            list_of_keep = col_df[col_df["percentage"] >= threshold][
                destination_column
            ].to_list()  # values that will not be renamed
        self[f"{destination_column}_filtered"] = self[destination_column].apply(
            lambda x: replace_with if x not in list_of_keep else x
        )

        # Return
        return self
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
# Missing value
|
|
452
|
+
# ---------------------------------------------------------------------------
|
|
453
|
+
class DataAnalystDataFrameNAMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Missing value

    - Fill missing values
    - Get missing values
    - Split N/A
    - Apply not null
    - Apply not null row
    """

    def fill_missing_values(
        self, column_name: str, fill: Any = np.nan, *, fill_when_not_exist: Any = np.nan
    ) -> Self:
        """
        Fill missing values in specified column

        Parameters
        ----------
        column_name : str
            Column name

        fill : Any
            Fill the missing values with, by default ``np.nan``

        fill_when_not_exist : Any
            When ``column_name`` does not exist,
            create a new column and fill with
            ``fill_when_not_exist``, by default ``np.nan``

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0  0.174303         926           900            NaN  tenkiakh 2006-09-08
        1  0.305137         140           100            NaN  jzuddamf 2012-04-04
        >>> df.fill_missing_values("missing_value", 0)
             number  number_big  number_range  missing_value      text       date
        0  0.174303         926           900            0.0  tenkiakh 2006-09-08
        1  0.305137         140           100            0.0  jzuddamf 2012-04-04
        >>> df.fill_missing_values("missing_column", 0, fill_when_not_exist=0)
             number  number_big  number_range  missing_value      text       date  missing_column
        0  0.174303         926           900            0.0  tenkiakh 2006-09-08               0
        1  0.305137         140           100            0.0  jzuddamf 2012-04-04               0
        """
        # EAFP: fill if the column exists, otherwise create it
        try:
            self[column_name] = self[column_name].fillna(fill)
        except KeyError:
            if getattr(self, "add_blank_column", None) is not None:
                # Compatible with DataAnalystDataFrameColumnMethodMixin;
                # silently does nothing when that mixin is absent
                self.add_blank_column(column_name, fill_when_not_exist)
        return self

    def get_missing_values(
        self, hightlight: bool = True, *, percentage_round_up: int = 2
    ) -> Self:
        """
        Get a DataFrame contains count of missing values for each column

        Parameters
        ----------
        hightlight : bool
            Shows only columns with missing values when ``True``, by default ``True``

        percentage_round_up : int
            Round up to which decimals, by default ``2``

        Returns
        -------
        Self
            Missing value DataFrame


        Example:
        --------
        >>> DADF.sample_df(152).get_missing_values()
                       Num of N/A  Percentage
        missing_value          42       27.63
        """
        # Check for missing value: per-column N/A count, most-missing first
        df_na = self.isnull().sum().sort_values(ascending=False)
        if hightlight:
            # Keep only columns that actually have missing values
            out = df_na[df_na != 0].to_frame()
        else:
            out = df_na.to_frame()
        # The unnamed Series becomes column ``0`` after to_frame()
        out.rename(columns={0: "Num of N/A"}, inplace=True)
        out["Percentage"] = (out["Num of N/A"] / self.shape[0] * 100).round(
            percentage_round_up
        )
        return self.__class__(out)

    @versionadded("3.1.0")
    def split_na(self, by_column: str) -> SplittedDF:
        """
        Split DataFrame into 2 parts:
        - Without missing value in specified column
        - With missing value in specified column

        Parameters
        ----------
        by_column : str
            Split by column

        Returns
        -------
        SplittedDF
            Splitted DataFrame


        Example:
        --------
        >>> DADF.sample_df(10).split_na("missing_value")
        SplittedDF(
        df=      number  number_big  number_range  missing_value      text       date
        0   0.643254         690           600            3.0  cinvofwj 2018-08-15
        2   0.499345         255           200           13.0  jasifzez 2005-06-01
        3  -1.727036         804           800           38.0  esxjmger 2009-07-24
        4   0.873058         690           600           32.0  htewfpld 2022-07-22
        5  -2.389884         442           400           30.0  hbcnfogu 2006-02-25
        8   0.264584         432           400            2.0  ejbvbmwn 2013-05-11
        9   0.813655         137           100           20.0  oecttada 2024-11-22,
        df_na=      number  number_big  number_range  missing_value      text       date
        1  -0.411354         363           300            NaN  juzecani 2014-12-02
        6  -0.833857         531           500            NaN  ybnntryh 2023-11-03
        7   1.355589         472           400            NaN  zjltghjr 2024-10-09
        )
        """
        out = SplittedDF(
            df=self[self[by_column].notna()],  # rows WITHOUT N/A in by_column
            df_na=self[self[by_column].isna()],  # rows WITH N/A in by_column
        )
        return out

    @versionadded("5.1.0")
    def apply_notnull(self, col: str, callable: Callable[[Any], _R]) -> Self:
        """
        Only apply callable to not NaN value in column

        Parameters
        ----------
        col : str
            Column to apply

        callable : Callable[[Any], _R]
            Callable

        Returns
        -------
        Self
            Applied DataFrame


        Example:
        --------
        >>> DADF.sample_df(5).apply_notnull("missing_value", lambda _: "REPLACED")
             number  number_big  number_range missing_value      text       date
        0  0.852218         157           100      REPLACED  dqzxaxxs 2006-03-08
        1  1.522428         616           600           NaN  mivkaooe 2018-12-27
        2  0.108506         745           700      REPLACED  qanwwjet 2005-07-14
        3 -1.435079         400           400      REPLACED  ywahcasi 2024-05-20
        4  0.118993         861           800      REPLACED  saoupuby 2019-04-28
        """
        # NOTE: the ``callable`` parameter shadows the builtin of the same
        # name inside this method body
        self[col] = self[col].apply(lambda x: callable(x) if pd.notnull(x) else x)
        return self

    @versionadded("5.1.0")  # type: ignore
    def apply_notnull_row(
        self,
        apply_when_null: Callable[[Any], _R] | _T | None = None,
        apply_when_not_null: Callable[[Any], _R] | _T | None = None,
        col_name: str | None = None,
    ) -> Self:
        """
        Apply to DataFrame's row with missing value.

        Parameters
        ----------
        apply_when_null : Callable[[Any], R] | T | None, optional
            Callable or Any, by default ``None``: used when the row
            contains at least one null value

        apply_when_not_null : Callable[[Any], R] | T | None, optional
            Callable or Any, by default ``None``: used when the entire
            row is not null

        col_name : str | None, optional
            Output column name, by default ``None`` (uses custom name)

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF({"A": [None, 2, 3, 4], "B": [1, None, 3, 4], "C": [None, 2, None, 4]})
        >>> df.apply_notnull_row()
             A    B    C  applied_row_null
        0  NaN  1.0  NaN             False
        1  2.0  NaN  2.0             False
        2  3.0  3.0  NaN             False
        3  4.0  4.0  4.0              True
        >>> df.apply_notnull_row(0, 1)
             A    B    C  applied_row_null
        0  NaN  1.0  NaN                 0
        1  2.0  NaN  2.0                 0
        2  3.0  3.0  NaN                 0
        3  4.0  4.0  4.0                 1
        >>> df.apply_notnull_row(lambda _: "n", lambda _: "y", col_name="mod")
             A    B    C mod
        0  NaN  1.0  NaN   n
        1  2.0  NaN  2.0   n
        2  3.0  3.0  NaN   n
        3  4.0  4.0  4.0   y
        """

        def apply_func(row: pd.Series):
            # Both None: default to a boolean "row is complete" flag
            if apply_when_null is None and apply_when_not_null is None:
                return row.notnull().all()

            # When all values in row are not null
            if row.notnull().all():
                if callable(apply_when_not_null):
                    return apply_when_not_null(row)
                return apply_when_not_null  # plain value, not a callable

            # When any value in row is null
            if callable(apply_when_null):
                return apply_when_null(row)
            return apply_when_null  # plain value, not a callable

        # Column name: falls back to a fixed default when not provided
        cname = "applied_row_null" if col_name is None else col_name
        self[cname] = self.apply(apply_func, axis=1)

        return self
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
# Other
|
|
705
|
+
# ---------------------------------------------------------------------------
|
|
706
|
+
class DataAnalystDataFrameOtherMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Other method/Stuff

    - Merge left
    """

    @versionadded("4.0.0")
    def merge_left(
        self,
        other: Self | pd.DataFrame,
        on: str,
        columns: list[str] | None = None,
    ) -> Self:
        """
        Merge left of 2 DataFrame

        Parameters
        ----------
        other : Self | pd.DataFrame
            DataFrame to merge

        on : str
            Merge on which column

        columns : list[str] | None, optional
            Columns to take from other DataFrame, by default ``None``
            (Take all columns)

        Returns
        -------
        Self
            Merged DataFrame


        Example:
        --------
        >>> df1 = DADF({
        ...     "id": [1, 2, 5],
        ...     "name": ["Alice", "Bob", "Rich"],
        ...     "age": [20, 20, 20],
        ... })
        >>> df2 = DADF({
        ...     "id": [1, 2, 3],
        ...     "age": [25, 30, 45],
        ...     "department": ["HR", "IT", "PM"],
        ...     "salary": [50000, 60000, 55000],
        ... })
        >>> df1.merge_left(df2, on="id")
           id   name  age_x  age_y department   salary
        0   1  Alice     20   25.0         HR  50000.0
        1   2    Bob     20   30.0         IT  60000.0
        2   5   Rich     20    NaN        NaN      NaN
        >>> df1.merge_left(df2, on="id", columns=["salary"])
           id   name   age department   salary
        0   1  Alice  25.0         HR  50000.0
        1   2    Bob  30.0         IT  60000.0
        2   5   Rich   NaN        NaN      NaN
        """

        if columns is not None:
            # Keep-list: the join key plus explicitly requested columns
            current_col = [on]
            current_col.extend(columns)
            col = other.columns.to_list()
            # Columns of ``other`` NOT in the keep-list
            cols = list(set(col) - set(current_col))

            if getattr(self, "drop_columns", None) is not None:
                # Compatible with DataAnalystDataFrameColumnMethodMixin.
                # NOTE(review): this drops those names from SELF (in place),
                # not from ``other`` -- per the example this avoids _x/_y
                # suffixes, so ``other``'s version of shared columns wins;
                # confirm this in-place mutation of self is intended.
                self.drop_columns(cols)

        out = self.merge(other, how="left", on=on)
        return self.__class__(out)
|
|
778
|
+
|
|
779
|
+
|
|
780
|
+
# Date
|
|
781
|
+
# ---------------------------------------------------------------------------
|
|
782
|
+
class DataAnalystDataFrameDateMixin(DFBase):
|
|
783
|
+
"""
|
|
784
|
+
Data Analyst ``pd.DataFrame`` - Date
|
|
785
|
+
|
|
786
|
+
- Add date column from month column
|
|
787
|
+
- Add detail date
|
|
788
|
+
- Delta date (How many days inbetween)
|
|
789
|
+
"""
|
|
790
|
+
|
|
791
|
+
    def add_date_from_month(self, month_column: str, *, col_name: str = "date") -> Self:
        """
        Add dummy ``date`` column from ``month`` column

        Parameters
        ----------
        month_column : str
            Month column

        col_name : str
            New date column name, by default: ``"date"``

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = (
        ...     DADF.sample_df(2)
        ...     .add_detail_date("date", mode="m")
        ...     .drop_columns(["date", "number", "number_range"])
        ... )
        >>> df
           number_big  missing_value      text  month
        0         755            NaN  lincgqzl      4
        1         907            NaN  gxltrjku     10
        >>> df.add_date_from_month("month")
           number_big  missing_value      text  month       date
        0         755            NaN  lincgqzl      4 2025-04-01
        1         907            NaN  gxltrjku     10 2025-10-01
        """
        # Dummy dates use the CURRENT year and day 1 of each month
        _this_year = datetime.now().year
        # Build "YYYY-M-1" strings per row; month is coerced to int first,
        # so a non-numeric month column raises here
        self[col_name] = pd.to_datetime(
            f"{_this_year}-" + self[month_column].astype(int).astype(str) + "-1",
            format="%Y-%m-%d",
        )

        # Rearrange: move the new right-most column next to the month column
        if getattr(self, "rearrange_rightmost_column", None) is not None:
            # Compatible with DataAnalystDataFrameColumnMethodMixin
            return self.rearrange_rightmost_column(month_column)  # type: ignore [no-any-return]
        return self
|
|
836
|
+
|
|
837
|
+
def add_detail_date(self, date_column: str, mode: str = "dwmy") -> Self:
|
|
838
|
+
"""
|
|
839
|
+
Add these columns from ``date_column``:
|
|
840
|
+
- ``date`` (won't add if ``date_column`` value is ``"date"``)
|
|
841
|
+
- ``day`` (overwrite if already exist)
|
|
842
|
+
- ``week`` (overwrite if already exist)
|
|
843
|
+
- ``month`` (overwrite if already exist)
|
|
844
|
+
- ``year`` (overwrite if already exist)
|
|
845
|
+
|
|
846
|
+
Parameters
|
|
847
|
+
----------
|
|
848
|
+
date_column : str
|
|
849
|
+
Date column
|
|
850
|
+
|
|
851
|
+
mode : str
|
|
852
|
+
| Detailed column to add
|
|
853
|
+
| ``d``: day
|
|
854
|
+
| ``w``: week number
|
|
855
|
+
| ``m``: month
|
|
856
|
+
| ``y``: year
|
|
857
|
+
| (Default: ``"dwmy"``)
|
|
858
|
+
|
|
859
|
+
Returns
|
|
860
|
+
-------
|
|
861
|
+
Self
|
|
862
|
+
Modified DataFrame
|
|
863
|
+
|
|
864
|
+
|
|
865
|
+
Example:
|
|
866
|
+
--------
|
|
867
|
+
>>> df = DADF.sample_df(2)
|
|
868
|
+
>>> df
|
|
869
|
+
number number_big number_range missing_value text date
|
|
870
|
+
0 0.331195 902 900 20 fgyanxik 2021-10-18
|
|
871
|
+
1 -0.877727 378 300 13 dqvaggjo 2007-03-06
|
|
872
|
+
>>> df.add_detail_date("date")
|
|
873
|
+
number number_big number_range missing_value text date day week month year
|
|
874
|
+
0 0.331195 902 900 20 fgyanxik 2021-10-18 18 42 10 2021
|
|
875
|
+
1 -0.877727 378 300 13 dqvaggjo 2007-03-06 6 10 3 2007
|
|
876
|
+
"""
|
|
877
|
+
# Convert to datetime
|
|
878
|
+
self["date"] = pd.to_datetime(self[date_column])
|
|
879
|
+
|
|
880
|
+
# Logic
|
|
881
|
+
col_counter = 0
|
|
882
|
+
# self["weekday"] = self["day"].dt.isocalendar().day # Weekday
|
|
883
|
+
if mode.find("d") != -1:
|
|
884
|
+
logger.debug("Mode: 'day'")
|
|
885
|
+
self["day"] = self["date"].dt.day
|
|
886
|
+
col_counter += 1
|
|
887
|
+
if mode.find("w") != -1:
|
|
888
|
+
logger.debug("Mode: 'weekday'")
|
|
889
|
+
self["week"] = self["date"].dt.isocalendar().week
|
|
890
|
+
col_counter += 1
|
|
891
|
+
if mode.find("m") != -1:
|
|
892
|
+
logger.debug("Mode: 'month'")
|
|
893
|
+
self["month"] = self["date"].dt.month
|
|
894
|
+
col_counter += 1
|
|
895
|
+
if mode.find("y") != -1:
|
|
896
|
+
logger.debug("Mode: 'year'")
|
|
897
|
+
self["year"] = self["date"].dt.year
|
|
898
|
+
col_counter += 1
|
|
899
|
+
|
|
900
|
+
# Return
|
|
901
|
+
if getattr(self, "rearrange_rightmost_column", None) is not None:
|
|
902
|
+
# Compatible with DataAnalystDataFrameColumnMethodMixin
|
|
903
|
+
return self.rearrange_rightmost_column(date_column, col_counter) # type: ignore [no-any-return]
|
|
904
|
+
return self
|
|
905
|
+
|
|
906
|
+
def delta_date(
|
|
907
|
+
self,
|
|
908
|
+
date_column: str,
|
|
909
|
+
mode: Literal["now", "between_row"] = "now",
|
|
910
|
+
*,
|
|
911
|
+
col_name: str = "delta_date",
|
|
912
|
+
) -> Self:
|
|
913
|
+
"""
|
|
914
|
+
Calculate date interval
|
|
915
|
+
|
|
916
|
+
Parameters
|
|
917
|
+
----------
|
|
918
|
+
date_column : str
|
|
919
|
+
Date column
|
|
920
|
+
|
|
921
|
+
mode : str
|
|
922
|
+
| Mode to calculate
|
|
923
|
+
| ``"between_row"``: Calculate date interval between each row
|
|
924
|
+
| ``"now"``: Calculate date interval to current date
|
|
925
|
+
| (Default: ``"now"``)
|
|
926
|
+
|
|
927
|
+
col_name : str
|
|
928
|
+
| New delta date column name
|
|
929
|
+
| (Default: ``"delta_date"``)
|
|
930
|
+
|
|
931
|
+
Returns
|
|
932
|
+
-------
|
|
933
|
+
Self
|
|
934
|
+
Modified DataFrame
|
|
935
|
+
|
|
936
|
+
|
|
937
|
+
Example:
|
|
938
|
+
--------
|
|
939
|
+
>>> df = DADF.sample_df(2)
|
|
940
|
+
>>> df
|
|
941
|
+
number number_big number_range missing_value text date
|
|
942
|
+
0 -0.729988 435 400 21 xkrqqouf 2014-08-01
|
|
943
|
+
1 -0.846031 210 200 5 rbkmiqxt 2024-07-10
|
|
944
|
+
>>> df.delta_date("date")
|
|
945
|
+
number number_big number_range missing_value text date delta_date
|
|
946
|
+
0 -0.729988 435 400 21 xkrqqouf 2014-08-01 3873
|
|
947
|
+
1 -0.846031 210 200 5 rbkmiqxt 2024-07-10 242
|
|
948
|
+
"""
|
|
949
|
+
if mode.lower().startswith("between_row"):
|
|
950
|
+
dated = self[date_column].to_list()
|
|
951
|
+
cal: list[timedelta] = []
|
|
952
|
+
for i in range(len(dated)):
|
|
953
|
+
if i == 0:
|
|
954
|
+
cal.append(dated[i] - dated[i])
|
|
955
|
+
# cal.append(relativedelta(dated[i], dated[i]))
|
|
956
|
+
else:
|
|
957
|
+
cal.append(dated[i] - dated[i - 1])
|
|
958
|
+
# cal.append(relativedelta(dated[i], dated[i - 1]))
|
|
959
|
+
self[col_name] = [x.days for x in cal]
|
|
960
|
+
else: # mode="now"
|
|
961
|
+
self[col_name] = self[date_column].apply(
|
|
962
|
+
lambda x: (datetime.now() - x).days
|
|
963
|
+
)
|
|
964
|
+
return self
|
|
965
|
+
|
|
966
|
+
|
|
967
|
+
# City
|
|
968
|
+
# ---------------------------------------------------------------------------
|
|
969
|
+
class DataAnalystDataFrameCityMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - City

    - Convert city
    """

    def convert_city(
        self,
        city_column: str,
        city_list: list[CityData],
        *,
        mode: str = "ra",
    ) -> Self:
        """
        Get ``region`` and ``area`` of a city.

        Parameters
        ----------
        city_column : str
            Column contains city data

        city_list : list[CityData]
            List of city in correct format
            (Default: ``None``)

        mode : str
            | Detailed column to add
            | ``r``: region
            | ``a``: area
            | (Default: ``"ra"``)

        Returns
        -------
        DataAnalystDataFrame
            Modified DataFrame
        """

        # Case-insensitive prefix lookup; falls back to a NaN-filled entry
        def _lookup(name: str) -> CityData:
            lowered = name.lower()
            for candidate in city_list:
                if candidate.city.lower().startswith(lowered):
                    return candidate
            return CityData(city=name, region=np.nan, area=np.nan)  # type: ignore

        # Convert — one pass per requested attribute
        added = 0
        for flag, attr in (("r", "region"), ("a", "area")):
            if flag in mode:
                logger.debug(f"Mode: '{attr}'")
                # Bind ``attr`` as a default to avoid late-binding surprises
                self[attr] = self[city_column].apply(
                    lambda value, a=attr: getattr(_lookup(value), a)
                )
                added += 1

        # Rearrange
        rearrange = getattr(self, "rearrange_rightmost_column", None)
        if rearrange is not None:
            return rearrange(city_column, added)  # type: ignore [no-any-return]
        return self
|
|
1033
|
+
|
|
1034
|
+
|
|
1035
|
+
# Main
|
|
1036
|
+
# ---------------------------------------------------------------------------
|
|
1037
|
+
class DADF(
    ShowAllMethodsMixin,
    DataAnalystDataFrameCityMixin,
    DataAnalystDataFrameDateMixin,
    DataAnalystDataFrameOtherMixin,
    DataAnalystDataFrameNAMixin,
    DataAnalystDataFrameInfoMixin,
    DataAnalystDataFrameRowMethodMixin,
    DataAnalystDataFrameColumnMethodMixin,
):
    """
    Data Analyst ``pd.DataFrame``

    For a list of extra methods:
    >>> print(DADF.DADF_METHODS)
    """

    @classmethod
    @deprecated("5.1.0")
    @versionadded("3.2.0")
    def dadf_help(cls) -> list[str]:
        """
        Show all available method of DataAnalystDataFrame
        """
        # Everything defined on this class but not on a plain DataFrame
        return sorted(set(dir(cls)).difference(dir(pd.DataFrame)))

    @classmethod
    def sample_df(cls, size: int = 100) -> Self:
        """
        Create sample DataFrame.

        Parameters
        ----------
        size : int
            Number of observations, by default ``100``

        Returns
        -------
        Self
            DataFrame with these columns:
            [number, number_big, number_range, missing_value, text, date]


        Example:
        --------
        >>> DADF.sample_df(3)
             number  number_big number_range  missing_value      text       date
        0 -2.089770         785          700            NaN  vwnlqoql 2013-11-20
        1 -0.526689         182          100           24.0  prjjcvqc 2007-04-13
        2 -1.596514         909          900            8.0  cbcpzlac 2023-05-24
        """
        # At least one observation
        size = max(size, 1)

        # number: standard normal draws; number_big: ints in [100, 998]
        df = cls(np.random.randn(size, 1), columns=["number"])
        df["number_big"] = [random.randrange(100, 999) for _ in range(size)]
        # number_range: hundreds bucket of number_big (e.g. 785 -> "700")
        df["number_range"] = df["number_big"].apply(lambda v: str(v)[0] + "00")

        # missing_value: values >= a random threshold become NaN
        threshold = random.randint(1, 99)
        draws = [random.randint(1, 99) for _ in range(size)]
        df["missing_value"] = [v if v < threshold else np.nan for v in draws]

        # text: 8 random lowercase letters per row
        df["text"] = [
            "".join(random.choice(string.ascii_lowercase) for _ in range(8))
            for _ in range(size)
        ]

        # date: random day (1-28 to stay valid in every month) within the
        # last 20 years
        this_year = datetime.now().year
        df["date"] = [
            datetime(
                year=random.randint(this_year - 20, this_year),
                month=random.randint(1, 12),
                day=random.randint(1, 28),
            )
            for _ in range(size)
        ]

        return df
|
|
1132
|
+
|
|
1133
|
+
|
|
1134
|
+
class DADF_WIP(DADF):
    """
    W.I.P - No test cases written
    """

    def split_str_column(
        self,
        col: str,
        pattern: str = " ",
        *,
        n: int | None = None,
        regex: bool = False,
    ) -> Self:
        """
        Split column with dtype[str] into other columns.

        New columns are named ``{col}_0``, ``{col}_1``, ... and appended to
        the DataFrame; the original column is kept.

        Parameters
        ----------
        col : str
            Column name

        pattern : str, optional
            Split pattern, by default ``" "``

        n : int | None, optional
            Split by how many times (``None`` splits on every occurrence),
            by default ``None``

        regex : bool, optional
            Regex mode, by default ``False``

        Returns
        -------
        Self
            DataFrame
        """
        # ``pd.Series.str.split`` interprets None/0/-1 as "all splits";
        # normalize explicitly (replaces a dead `if n is None: pass` branch).
        max_splits = -1 if n is None else n
        split_data: pd.DataFrame = self[col].str.split(
            pat=pattern, n=max_splits, expand=True, regex=regex
        )
        new_col_names = [f"{col}_{i}" for i in range(split_data.shape[1])]
        self[new_col_names] = split_data
        return self
|