absfuyu 5.0.0__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of absfuyu might be problematic. Click here for more details.

Files changed (72) hide show
  1. absfuyu/__init__.py +1 -1
  2. absfuyu/__main__.py +2 -2
  3. absfuyu/cli/__init__.py +2 -2
  4. absfuyu/cli/color.py +30 -14
  5. absfuyu/cli/config_group.py +9 -2
  6. absfuyu/cli/do_group.py +13 -6
  7. absfuyu/cli/game_group.py +9 -2
  8. absfuyu/cli/tool_group.py +16 -9
  9. absfuyu/config/__init__.py +2 -2
  10. absfuyu/core/__init__.py +2 -2
  11. absfuyu/core/baseclass.py +449 -80
  12. absfuyu/core/baseclass2.py +2 -2
  13. absfuyu/core/decorator.py +69 -3
  14. absfuyu/core/docstring.py +25 -22
  15. absfuyu/core/dummy_cli.py +2 -2
  16. absfuyu/core/dummy_func.py +19 -6
  17. absfuyu/core/typings.py +40 -0
  18. absfuyu/dxt/__init__.py +2 -2
  19. absfuyu/dxt/dictext.py +2 -2
  20. absfuyu/dxt/dxt_support.py +2 -2
  21. absfuyu/dxt/intext.py +31 -3
  22. absfuyu/dxt/listext.py +28 -3
  23. absfuyu/dxt/strext.py +3 -3
  24. absfuyu/extra/__init__.py +2 -2
  25. absfuyu/extra/beautiful.py +3 -2
  26. absfuyu/extra/da/__init__.py +36 -0
  27. absfuyu/extra/da/dadf.py +1138 -0
  28. absfuyu/extra/da/dadf_base.py +186 -0
  29. absfuyu/extra/da/df_func.py +97 -0
  30. absfuyu/extra/da/mplt.py +219 -0
  31. absfuyu/extra/data_analysis.py +10 -1067
  32. absfuyu/fun/__init__.py +2 -2
  33. absfuyu/fun/tarot.py +2 -2
  34. absfuyu/game/__init__.py +2 -2
  35. absfuyu/game/game_stat.py +2 -2
  36. absfuyu/game/sudoku.py +2 -2
  37. absfuyu/game/tictactoe.py +2 -2
  38. absfuyu/game/wordle.py +2 -2
  39. absfuyu/general/__init__.py +4 -4
  40. absfuyu/general/content.py +2 -2
  41. absfuyu/general/human.py +2 -2
  42. absfuyu/general/shape.py +2 -2
  43. absfuyu/logger.py +2 -2
  44. absfuyu/pkg_data/__init__.py +2 -2
  45. absfuyu/pkg_data/deprecated.py +2 -2
  46. absfuyu/sort.py +2 -2
  47. absfuyu/tools/__init__.py +25 -2
  48. absfuyu/tools/checksum.py +27 -7
  49. absfuyu/tools/converter.py +93 -28
  50. absfuyu/{general → tools}/generator.py +2 -2
  51. absfuyu/tools/inspector.py +433 -0
  52. absfuyu/tools/keygen.py +2 -2
  53. absfuyu/tools/obfuscator.py +46 -8
  54. absfuyu/tools/passwordlib.py +88 -23
  55. absfuyu/tools/shutdownizer.py +2 -2
  56. absfuyu/tools/web.py +2 -2
  57. absfuyu/util/__init__.py +2 -2
  58. absfuyu/util/api.py +2 -2
  59. absfuyu/util/json_method.py +2 -2
  60. absfuyu/util/lunar.py +2 -2
  61. absfuyu/util/path.py +190 -82
  62. absfuyu/util/performance.py +4 -4
  63. absfuyu/util/shorten_number.py +40 -10
  64. absfuyu/util/text_table.py +272 -0
  65. absfuyu/util/zipped.py +6 -6
  66. absfuyu/version.py +59 -42
  67. {absfuyu-5.0.0.dist-info → absfuyu-5.1.0.dist-info}/METADATA +10 -3
  68. absfuyu-5.1.0.dist-info/RECORD +76 -0
  69. absfuyu-5.0.0.dist-info/RECORD +0 -68
  70. {absfuyu-5.0.0.dist-info → absfuyu-5.1.0.dist-info}/WHEEL +0 -0
  71. {absfuyu-5.0.0.dist-info → absfuyu-5.1.0.dist-info}/entry_points.txt +0 -0
  72. {absfuyu-5.0.0.dist-info → absfuyu-5.1.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1138 @@
1
+ """
2
+ Absfuyu: Data Analysis
3
+ ----------------------
4
+ Data Analyst DataFrame
5
+
6
+ Version: 5.1.0
7
+ Date updated: 10/03/2025 (dd/mm/yyyy)
8
+ """
9
+
10
+ # Module level
11
+ # ---------------------------------------------------------------------------
12
+ __all__ = [
13
+ "DADF",
14
+ "DataAnalystDataFrameColumnMethodMixin",
15
+ "DataAnalystDataFrameRowMethodMixin",
16
+ "DataAnalystDataFrameInfoMixin",
17
+ "DataAnalystDataFrameNAMixin",
18
+ "DataAnalystDataFrameOtherMixin",
19
+ "DataAnalystDataFrameDateMixin",
20
+ "DataAnalystDataFrameCityMixin",
21
+ ]
22
+
23
+
24
+ # Library
25
+ # ---------------------------------------------------------------------------
26
+ import random
27
+ import string
28
+ from collections.abc import Callable, Sequence
29
+ from datetime import datetime, timedelta
30
+ from typing import Any, Literal, Self
31
+
32
+ import numpy as np
33
+ import pandas as pd
34
+
35
+ try:
36
+ from typing import override # type: ignore
37
+ except ImportError:
38
+ from absfuyu.core.decorator import dummy_decorator as override
39
+
40
+ from absfuyu.core.baseclass import ShowAllMethodsMixin
41
+ from absfuyu.core.docstring import deprecated, versionadded
42
+ from absfuyu.core.typings import _R, _T
43
+ from absfuyu.extra.da.dadf_base import CityData
44
+ from absfuyu.extra.da.dadf_base import DataAnalystDataFrameBase as DFBase
45
+ from absfuyu.extra.da.dadf_base import SplittedDF
46
+ from absfuyu.logger import logger
47
+ from absfuyu.util import set_min_max
48
+
49
+
50
+ # Column method
51
+ # ---------------------------------------------------------------------------
52
class DataAnalystDataFrameColumnMethodMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Column method

    - Rearrange rightmost column
    - Drop columns
    - Drop rightmost column
    - Add blank column
    """

    def rearrange_rightmost_column(
        self, insert_to_col: str, num_of_cols: int = 1
    ) -> Self:
        """
        Move right-most columns to the position right after ``insert_to_col``

        Parameters
        ----------
        insert_to_col : str
            Name of the column that the right-most column(s) will be moved next to

        num_of_cols : int
            Number of right-most columns to move, by default ``1``
            (clamped to ``[1, number of columns]``)

        Returns
        -------
        Self
            Modified DataFrame
        """
        cols: list[str] = self.columns.to_list()  # List of columns
        # Clamp so we never slice more columns than exist
        num_of_cols = int(set_min_max(num_of_cols, min_value=1, max_value=len(cols)))
        anchor: int = cols.index(insert_to_col)
        new_order: list[str] = (
            cols[: anchor + 1]
            + cols[-num_of_cols:]
            + cols[anchor + 1 : len(cols) - num_of_cols]
        )
        return self.__class__(self[new_order])

    def drop_columns(self, columns: Sequence[str]) -> Self:
        """
        Drop columns in DataFrame, silently skipping columns that do not exist

        Parameters
        ----------
        columns : Sequence[str]
            Columns to drop

        Returns
        -------
        Self
            Modified DataFrame
        """
        for column in columns:
            try:
                self.drop(columns=[column], inplace=True)
            except KeyError:
                # A missing column is not an error here: log and continue
                logger.debug(f"{column} column does not exist")
        return self

    def drop_rightmost(self, num_of_cols: int = 1) -> Self:
        """
        Drop ``num_of_cols`` right-most columns

        Parameters
        ----------
        num_of_cols : int
            Number of columns to drop
            (clamped to ``[1, number of columns]``)

        Returns
        -------
        Self
            Modified DataFrame
        """
        # Clamp to a valid range
        num_of_cols = int(
            set_min_max(num_of_cols, min_value=1, max_value=self.shape[1])
        )

        # Drop the (current) last column, num_of_cols times
        for _ in range(num_of_cols):
            self.drop(self.columns[len(self.columns) - 1], axis=1, inplace=True)
        return self

    @deprecated("5.1.0", reason="Use pd.DataFrame.assign(...) method instead")
    def add_blank_column(self, column_name: str, fill: Any = np.nan, /) -> Self:
        """
        Add a blank column

        Parameters
        ----------
        column_name : str
            Name of the column to add

        fill : Any
            Value used to fill every row of the new column

        Returns
        -------
        Self
            Modified DataFrame
        """
        self[column_name] = [fill] * self.shape[0]
        return self
201
+
202
+
203
+ # Row method
204
+ # ---------------------------------------------------------------------------
205
class DataAnalystDataFrameRowMethodMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Row method

    - Get different rows
    """

    @versionadded("4.0.0")
    def get_different_rows(self, other: Self | pd.DataFrame) -> Self:
        """
        Subtract DataFrame to find the rows of ``other`` that differ

        Parameters
        ----------
        other : Self | pd.DataFrame
            DataFrame to subtract

        Returns
        -------
        Self
            DataFrame containing only the differing rows
        """
        # A right-merge with an indicator tags each row by origin; keep only
        # rows that exist solely in ``other``. merge() does not mutate self,
        # so no defensive copy is needed.
        out = (
            self.merge(other, indicator=True, how="right")
            .query("_merge=='right_only'")
            .drop("_merge", axis=1)
        )
        return self.__class__(out)
244
+
245
+
246
+ # Info
247
+ # ---------------------------------------------------------------------------
248
class DataAnalystDataFrameInfoMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Info

    - Quick info
    - Quick describe
    - Show distribution
    - Threshold filter
    """

    @versionadded("3.2.0")
    def qinfo(self) -> str:
        """
        Show quick information about DataFrame

        Returns
        -------
        str
            Row/column counts, total observations, missing-value summary,
            and the list of column names
        """
        missing_values = self.isnull().sum().sum()
        total_observation = self.shape[0] * self.shape[1]
        mv_rate = missing_values / total_observation * 100
        return (
            f"Dataset Information:\n"
            f"- Number of Rows: {self.shape[0]:,}\n"
            f"- Number of Columns: {self.shape[1]:,}\n"
            f"- Total observation: {total_observation:,}\n"
            f"- Missing value: {missing_values:,} ({mv_rate:.2f}%)\n\n"
            f"Column names:\n{self.columns.to_list()}"
        )

    @override
    def describe(self, percentiles=None, include=None, exclude=None) -> Self:
        """``pd.DataFrame.describe()`` override that keeps the subclass type."""
        return self.__class__(super().describe(percentiles, include, exclude))  # type: ignore [no-any-return]

    @versionadded("3.2.0")
    def qdescribe(self) -> Self:
        """
        Quick ``describe()`` that excludes ``object`` and ``datetime`` dtypes

        Returns
        -------
        Self
            Descriptive statistics for the remaining (numeric-like) columns
        """
        kept_columns = self.select_dtypes(exclude=["object", "datetime"]).columns
        return self.__class__(self[kept_columns].describe())  # type: ignore [no-any-return]

    @versionadded("3.2.0")
    def show_distribution(
        self,
        column_name: str,
        dropna: bool = True,
        *,
        show_percentage: bool = True,
        percentage_round_up: int = 2,
    ) -> Self:
        """
        Show distribution of a column

        Parameters
        ----------
        column_name : str
            Column to show the distribution of

        dropna : bool
            Also count N/A when ``False``, by default ``True``

        show_percentage : bool
            Show proportion in range 0% - 100% instead of [0, 1],
            by default ``True``

        percentage_round_up : int
            Number of decimals to round to, by default ``2``

        Returns
        -------
        Self
            Distribution DataFrame with ``count`` and ``percentage`` columns
        """
        dist = self[column_name].value_counts(dropna=dropna).to_frame().reset_index()
        # Same computation for both branches; only the scale differs
        scale = 100 if show_percentage else 1
        dist["percentage"] = (dist["count"] / self.shape[0] * scale).round(
            percentage_round_up
        )
        return self.__class__(dist)

    @deprecated("5.1.0", reason="Rework THIS")
    def threshold_filter(
        self,
        destination_column: str,
        threshold: int | float = 10,
        *,
        top: int | None = None,
        replace_with: Any = "Other",
    ) -> Self:
        """
        Replace every value in ``destination_column`` whose share of the data
        is below ``threshold`` percent with ``replace_with``, writing the
        result into a new ``<destination_column>_filtered`` column.
        As a result, a pie chart is less messy.

        Parameters
        ----------
        destination_column : str
            Column to be filtered

        threshold : int | float
            Cut-off percentage, by default ``10``

        top : int | None
            Keep only the top ``x`` categories instead of using the
            threshold, by default ``None``

        replace_with : Any
            Replacement value for the filtered-out data

        Returns
        -------
        Self
            Modified DataFrame
        """
        # Best-effort whitespace cleanup; non-string columns stay untouched
        try:
            self[destination_column] = self[destination_column].str.strip()
        except Exception:
            pass

        distribution = self.show_distribution(destination_column)

        if top is not None:
            keep: list = (
                distribution[destination_column]
                .head(
                    set_min_max(top - 1, min_value=1, max_value=distribution.shape[0])
                )
                .to_list()
            )
        else:
            # Values at/above the threshold survive untouched
            keep = distribution[distribution["percentage"] >= threshold][
                destination_column
            ].to_list()
        self[f"{destination_column}_filtered"] = self[destination_column].apply(
            lambda value: replace_with if value not in keep else value
        )

        return self
448
+
449
+
450
+ # Missing value
451
+ # ---------------------------------------------------------------------------
452
class DataAnalystDataFrameNAMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Missing value

    - Fill missing values
    - Get missing values
    - Split N/A
    - Apply not null
    - Apply not null row
    """

    def fill_missing_values(
        self, column_name: str, fill: Any = np.nan, *, fill_when_not_exist: Any = np.nan
    ) -> Self:
        """
        Fill missing values in specified column

        Parameters
        ----------
        column_name : str
            Column name

        fill : Any
            Value to fill the missing cells with, by default ``np.nan``

        fill_when_not_exist : Any
            When ``column_name`` does not exist, create a new column filled
            entirely with ``fill_when_not_exist``, by default ``np.nan``

        Returns
        -------
        Self
            Modified DataFrame
        """
        try:
            self[column_name] = self[column_name].fillna(fill)
        except KeyError:
            # Compatible with DataAnalystDataFrameColumnMethodMixin
            if getattr(self, "add_blank_column", None) is not None:
                self.add_blank_column(column_name, fill_when_not_exist)
        return self

    def get_missing_values(
        self, hightlight: bool = True, *, percentage_round_up: int = 2
    ) -> Self:
        """
        Get a DataFrame containing the missing-value count for each column

        Parameters
        ----------
        hightlight : bool
            Show only columns with missing values when ``True``,
            by default ``True``.
            (NOTE: parameter name is a historical misspelling of
            "highlight", kept for backward compatibility)

        percentage_round_up : int
            Number of decimals to round to, by default ``2``

        Returns
        -------
        Self
            Missing value DataFrame with ``Num of N/A`` and ``Percentage``
        """
        na_per_column = self.isnull().sum().sort_values(ascending=False)
        if hightlight:
            out = na_per_column[na_per_column != 0].to_frame()
        else:
            out = na_per_column.to_frame()
        out.rename(columns={0: "Num of N/A"}, inplace=True)
        out["Percentage"] = (out["Num of N/A"] / self.shape[0] * 100).round(
            percentage_round_up
        )
        return self.__class__(out)

    @versionadded("3.1.0")
    def split_na(self, by_column: str) -> SplittedDF:
        """
        Split DataFrame into 2 parts:
        - Rows without missing value in specified column
        - Rows with missing value in specified column

        Parameters
        ----------
        by_column : str
            Split by column

        Returns
        -------
        SplittedDF
            Splitted DataFrame (``df``: without N/A; ``df_na``: N/A only)
        """
        return SplittedDF(
            df=self[self[by_column].notna()],  # DF
            df_na=self[self[by_column].isna()],  # DF w/o NA
        )

    @versionadded("5.1.0")
    def apply_notnull(self, col: str, callable: Callable[[Any], _R]) -> Self:
        """
        Apply ``callable`` only to the non-NaN values in a column

        Parameters
        ----------
        col : str
            Column to apply to

        callable : Callable[[Any], _R]
            Function applied to each non-null value.
            (NOTE: parameter name shadows the ``callable`` builtin;
            kept for backward compatibility)

        Returns
        -------
        Self
            Applied DataFrame
        """
        self[col] = self[col].apply(lambda x: callable(x) if pd.notnull(x) else x)
        return self

    @versionadded("5.1.0")  # type: ignore
    def apply_notnull_row(
        self,
        apply_when_null: Callable[[Any], _R] | _T | None = None,
        apply_when_not_null: Callable[[Any], _R] | _T | None = None,
        col_name: str | None = None,
    ) -> Self:
        """
        Add a per-row marker column classifying each row by whether it
        contains any missing value.

        Parameters
        ----------
        apply_when_null : Callable[[Any], _R] | _T | None, optional
            Callable applied to (or plain value stored for) rows that
            contain at least one null, by default ``None``

        apply_when_not_null : Callable[[Any], _R] | _T | None, optional
            Callable applied to (or plain value stored for) rows with no
            null at all, by default ``None``

        col_name : str | None, optional
            Output column name, by default ``None``
            (uses ``"applied_row_null"``)

        Returns
        -------
        Self
            Modified DataFrame

        Notes
        -----
        When both ``apply_when_null`` and ``apply_when_not_null`` are
        ``None``, the output column is a plain boolean: ``True`` when the
        entire row is non-null.
        """

        def classify(row: pd.Series):
            # Default: boolean "is this row fully populated?"
            if apply_when_null is None and apply_when_not_null is None:
                return row.notnull().all()

            # All values in the row are non-null
            if row.notnull().all():
                if callable(apply_when_not_null):
                    return apply_when_not_null(row)
                return apply_when_not_null

            # At least one value in the row is null
            if callable(apply_when_null):
                return apply_when_null(row)
            return apply_when_null

        cname = "applied_row_null" if col_name is None else col_name
        self[cname] = self.apply(classify, axis=1)

        return self
701
+
702
+
703
+ # Other
704
+ # ---------------------------------------------------------------------------
705
class DataAnalystDataFrameOtherMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Other method/Stuff

    - Merge left
    """

    @versionadded("4.0.0")
    def merge_left(
        self,
        other: Self | pd.DataFrame,
        on: str,
        columns: list[str] | None = None,
    ) -> Self:
        """
        Merge left of 2 DataFrame

        Parameters
        ----------
        other : Self | pd.DataFrame
            DataFrame to merge

        on : str
            Merge on which column

        columns : list[str] | None, optional
            Columns to take from other DataFrame, by default ``None``
            (take all columns). When given, any column of ``self`` whose
            name appears in ``other`` but is not listed here (nor ``on``)
            is dropped from ``self`` first, so the merged result keeps
            ``other``'s version of it without ``_x``/``_y`` suffixes.

        Returns
        -------
        Self
            Merged DataFrame
        """
        if columns is not None:
            wanted = [on, *columns]
            # Columns of ``other`` that were not requested
            unwanted = list(set(other.columns.to_list()) - set(wanted))

            # Compatible with DataAnalystDataFrameColumnMethodMixin
            if getattr(self, "drop_columns", None) is not None:
                self.drop_columns(unwanted)

        merged = self.merge(other, how="left", on=on)
        return self.__class__(merged)
777
+
778
+
779
+ # Date
780
+ # ---------------------------------------------------------------------------
781
class DataAnalystDataFrameDateMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Date

    - Add date column from month column
    - Add detail date
    - Delta date (How many days inbetween)
    """

    def add_date_from_month(self, month_column: str, *, col_name: str = "date") -> Self:
        """
        Add dummy ``date`` column from ``month`` column
        (first day of that month in the current year)

        Parameters
        ----------
        month_column : str
            Month column

        col_name : str
            New date column name, by default ``"date"``

        Returns
        -------
        Self
            Modified DataFrame
        """
        this_year = datetime.now().year
        self[col_name] = pd.to_datetime(
            f"{this_year}-" + self[month_column].astype(int).astype(str) + "-1",
            format="%Y-%m-%d",
        )

        # Compatible with DataAnalystDataFrameColumnMethodMixin
        if getattr(self, "rearrange_rightmost_column", None) is not None:
            return self.rearrange_rightmost_column(month_column)  # type: ignore [no-any-return]
        return self

    def add_detail_date(self, date_column: str, mode: str = "dwmy") -> Self:
        """
        Add these columns derived from ``date_column``:
        - ``date`` (won't add if ``date_column`` value is ``"date"``)
        - ``day`` (overwrite if already exist)
        - ``week`` (overwrite if already exist)
        - ``month`` (overwrite if already exist)
        - ``year`` (overwrite if already exist)

        Parameters
        ----------
        date_column : str
            Date column

        mode : str
            | Detailed column to add
            | ``d``: day
            | ``w``: week number
            | ``m``: month
            | ``y``: year
            | (Default: ``"dwmy"``)

        Returns
        -------
        Self
            Modified DataFrame
        """
        # Normalize to datetime dtype first
        self["date"] = pd.to_datetime(self[date_column])

        added = 0
        if "d" in mode:
            logger.debug("Mode: 'day'")
            self["day"] = self["date"].dt.day
            added += 1
        if "w" in mode:
            logger.debug("Mode: 'weekday'")
            self["week"] = self["date"].dt.isocalendar().week
            added += 1
        if "m" in mode:
            logger.debug("Mode: 'month'")
            self["month"] = self["date"].dt.month
            added += 1
        if "y" in mode:
            logger.debug("Mode: 'year'")
            self["year"] = self["date"].dt.year
            added += 1

        # Compatible with DataAnalystDataFrameColumnMethodMixin
        if getattr(self, "rearrange_rightmost_column", None) is not None:
            return self.rearrange_rightmost_column(date_column, added)  # type: ignore [no-any-return]
        return self

    def delta_date(
        self,
        date_column: str,
        mode: Literal["now", "between_row"] = "now",
        *,
        col_name: str = "delta_date",
    ) -> Self:
        """
        Calculate date interval in days

        Parameters
        ----------
        date_column : str
            Date column

        mode : str
            | Mode to calculate
            | ``"between_row"``: Date interval between each row and the previous row
            | ``"now"``: Date interval between each row and the current date
            | (Default: ``"now"``)

        col_name : str
            New delta date column name, by default ``"delta_date"``

        Returns
        -------
        Self
            Modified DataFrame
        """
        if mode.lower().startswith("between_row"):
            dates = self[date_column].to_list()
            # First row has no predecessor -> subtract it from itself (zero gap)
            gaps: list[timedelta] = [
                dates[i] - dates[i] if i == 0 else dates[i] - dates[i - 1]
                for i in range(len(dates))
            ]
            self[col_name] = [gap.days for gap in gaps]
        else:  # mode="now"
            self[col_name] = self[date_column].apply(
                lambda d: (datetime.now() - d).days
            )
        return self
964
+
965
+
966
+ # City
967
+ # ---------------------------------------------------------------------------
968
class DataAnalystDataFrameCityMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - City

    - Convert city
    """

    def convert_city(
        self,
        city_column: str,
        city_list: list[CityData],
        *,
        mode: str = "ra",
    ) -> Self:
        """
        Get ``region`` and ``area`` of a city

        Parameters
        ----------
        city_column : str
            Column contains city data

        city_list : list[CityData]
            List of city in correct format

        mode : str
            | Detailed column to add
            | ``r``: region
            | ``a``: area
            | (Default: ``"ra"``)

        Returns
        -------
        Self
            Modified DataFrame
        """

        def lookup(value: str) -> CityData:
            # First entry whose city name starts with ``value`` (case-insensitive)
            for candidate in city_list:
                if candidate.city.lower().startswith(value.lower()):
                    return candidate
            # Unknown city: region/area fall back to N/A
            return CityData(city=value, region=np.nan, area=np.nan)  # type: ignore

        added = 0
        if "r" in mode:
            logger.debug("Mode: 'region'")
            self["region"] = self[city_column].apply(lambda x: lookup(x).region)
            added += 1
        if "a" in mode:
            logger.debug("Mode: 'area'")
            self["area"] = self[city_column].apply(lambda x: lookup(x).area)
            added += 1

        # Compatible with DataAnalystDataFrameColumnMethodMixin
        if getattr(self, "rearrange_rightmost_column", None) is not None:
            return self.rearrange_rightmost_column(city_column, added)  # type: ignore [no-any-return]
        return self
1032
+
1033
+
1034
+ # Main
1035
+ # ---------------------------------------------------------------------------
1036
class DADF(
    ShowAllMethodsMixin,
    DataAnalystDataFrameCityMixin,
    DataAnalystDataFrameDateMixin,
    DataAnalystDataFrameOtherMixin,
    DataAnalystDataFrameNAMixin,
    DataAnalystDataFrameInfoMixin,
    DataAnalystDataFrameRowMethodMixin,
    DataAnalystDataFrameColumnMethodMixin,
):
    """
    Data Analyst ``pd.DataFrame``

    For a list of extra methods:
    >>> print(DADF.DADF_METHODS)
    """

    @classmethod
    @deprecated("5.1.0")
    @versionadded("3.2.0")
    def dadf_help(cls) -> list[str]:
        """
        Show all available method of DataAnalystDataFrame

        Returns
        -------
        list[str]
            Sorted names present on this class but not on plain
            ``pd.DataFrame``.
        """
        # Everything this class adds on top of a vanilla DataFrame.
        list_of_method = list(set(dir(cls)) - set(dir(pd.DataFrame)))
        return sorted(list_of_method)

    @classmethod
    def sample_df(cls, size: int = 100) -> Self:
        """
        Create sample DataFrame

        Parameters
        ----------
        size : int
            Number of observations, by default ``100``
            (values below 1 are clamped to 1)

        Returns
        -------
        Self
            DataFrame with these columns:
            [number, number_big, number_range, missing_value, text, date]


        Example:
        --------
        >>> DADF.sample_df()
            number  number_big number_range  missing_value      text       date
        0  -2.089770         785          700            NaN  vwnlqoql 2013-11-20
        1  -0.526689         182          100           24.0  prjjcvqc 2007-04-13
        2  -1.596514         909          900            8.0  cbcpzlac 2023-05-24
        3   2.982191         989          900           21.0  ivwqwuvd 2022-04-28
        4   1.687803         878          800            NaN  aajtncum 2005-10-05
        ..       ...         ...          ...            ...       ...        ...
        95 -1.295145         968          900           16.0  mgqunkhi 2016-04-12
        96  1.296795         255          200            NaN  lwvytego 2014-05-10
        97  1.440746         297          200            5.0  lqsoykun 2010-04-03
        98  0.327702         845          800            NaN  leadkvsy 2005-08-05
        99  0.556720         981          900           36.0  bozmxixy 2004-02-22
        [100 rows x 6 columns]
        """
        # Restrain: guarantee at least one observation
        size = max(size, 1)

        # Number col: standard-normal floats
        df = cls(np.random.randn(size, 1), columns=["number"])
        # Big number in range 100-999 inclusive. random.choice(range(100, 999))
        # could never produce 999, contradicting the documented range.
        df["number_big"] = [random.randint(100, 999) for _ in range(size)]
        # Bucket by the hundreds digit, e.g. 435 -> "400"
        df["number_range"] = df["number_big"].apply(lambda x: str(x)[0] + "00")

        # Missing value col: values >= a random threshold become NaN,
        # so the NA rate itself varies between generated frames.
        na_rate = random.randint(1, 99)
        d = [random.randint(1, 99) for _ in range(size)]
        df["missing_value"] = list(map(lambda x: x if x < na_rate else np.nan, d))

        # Text col: 8 random lowercase ASCII letters per row
        df["text"] = [
            "".join([random.choice(string.ascii_lowercase) for _ in range(8)])
            for _ in range(size)
        ]

        # Random date col: any day in the last 20 years
        # (day capped at 28 so every month is valid)
        df["date"] = [
            datetime(
                year=random.randint(datetime.now().year - 20, datetime.now().year),
                month=random.randint(1, 12),
                day=random.randint(1, 28),
            )
            for _ in range(size)
        ]

        # Return
        return df
1131
+
1132
+
1133
class DADF_WIP(DADF):
    """
    W.I.P - No test cases written
    """