absfuyu 5.0.1__py3-none-any.whl → 5.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of absfuyu might be problematic. Click here for more details.

Files changed (72) hide show
  1. absfuyu/__init__.py +1 -1
  2. absfuyu/__main__.py +3 -3
  3. absfuyu/cli/__init__.py +2 -2
  4. absfuyu/cli/color.py +30 -14
  5. absfuyu/cli/config_group.py +9 -2
  6. absfuyu/cli/do_group.py +13 -6
  7. absfuyu/cli/game_group.py +9 -2
  8. absfuyu/cli/tool_group.py +15 -9
  9. absfuyu/config/__init__.py +2 -2
  10. absfuyu/core/__init__.py +2 -2
  11. absfuyu/core/baseclass.py +448 -79
  12. absfuyu/core/baseclass2.py +2 -2
  13. absfuyu/core/decorator.py +70 -4
  14. absfuyu/core/docstring.py +43 -25
  15. absfuyu/core/dummy_cli.py +2 -2
  16. absfuyu/core/dummy_func.py +15 -4
  17. absfuyu/dxt/__init__.py +2 -2
  18. absfuyu/dxt/dictext.py +5 -2
  19. absfuyu/dxt/dxt_support.py +2 -2
  20. absfuyu/dxt/intext.py +34 -3
  21. absfuyu/dxt/listext.py +300 -113
  22. absfuyu/dxt/strext.py +75 -15
  23. absfuyu/extra/__init__.py +2 -2
  24. absfuyu/extra/beautiful.py +2 -2
  25. absfuyu/extra/da/__init__.py +36 -0
  26. absfuyu/extra/da/dadf.py +1177 -0
  27. absfuyu/extra/da/dadf_base.py +186 -0
  28. absfuyu/extra/da/df_func.py +97 -0
  29. absfuyu/extra/da/mplt.py +219 -0
  30. absfuyu/extra/data_analysis.py +10 -1067
  31. absfuyu/fun/__init__.py +2 -2
  32. absfuyu/fun/tarot.py +2 -2
  33. absfuyu/game/__init__.py +2 -2
  34. absfuyu/game/game_stat.py +2 -2
  35. absfuyu/game/sudoku.py +2 -2
  36. absfuyu/game/tictactoe.py +2 -3
  37. absfuyu/game/wordle.py +2 -2
  38. absfuyu/general/__init__.py +2 -2
  39. absfuyu/general/content.py +2 -2
  40. absfuyu/general/human.py +2 -2
  41. absfuyu/general/shape.py +2 -2
  42. absfuyu/logger.py +2 -2
  43. absfuyu/pkg_data/__init__.py +2 -2
  44. absfuyu/pkg_data/deprecated.py +2 -2
  45. absfuyu/sort.py +2 -2
  46. absfuyu/tools/__init__.py +28 -2
  47. absfuyu/tools/checksum.py +27 -7
  48. absfuyu/tools/converter.py +120 -34
  49. absfuyu/tools/generator.py +251 -110
  50. absfuyu/tools/inspector.py +463 -0
  51. absfuyu/tools/keygen.py +2 -2
  52. absfuyu/tools/obfuscator.py +45 -7
  53. absfuyu/tools/passwordlib.py +88 -24
  54. absfuyu/tools/shutdownizer.py +2 -2
  55. absfuyu/tools/web.py +2 -2
  56. absfuyu/typings.py +136 -0
  57. absfuyu/util/__init__.py +18 -4
  58. absfuyu/util/api.py +36 -16
  59. absfuyu/util/json_method.py +43 -14
  60. absfuyu/util/lunar.py +2 -2
  61. absfuyu/util/path.py +190 -82
  62. absfuyu/util/performance.py +122 -7
  63. absfuyu/util/shorten_number.py +40 -10
  64. absfuyu/util/text_table.py +306 -0
  65. absfuyu/util/zipped.py +8 -7
  66. absfuyu/version.py +2 -2
  67. {absfuyu-5.0.1.dist-info → absfuyu-5.2.0.dist-info}/METADATA +9 -2
  68. absfuyu-5.2.0.dist-info/RECORD +76 -0
  69. absfuyu-5.0.1.dist-info/RECORD +0 -68
  70. {absfuyu-5.0.1.dist-info → absfuyu-5.2.0.dist-info}/WHEEL +0 -0
  71. {absfuyu-5.0.1.dist-info → absfuyu-5.2.0.dist-info}/entry_points.txt +0 -0
  72. {absfuyu-5.0.1.dist-info → absfuyu-5.2.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1177 @@
1
+ """
2
+ Absfuyu: Data Analysis
3
+ ----------------------
4
+ Data Analyst DataFrame
5
+
6
+ Version: 5.2.0
7
+ Date updated: 15/03/2025 (dd/mm/yyyy)
8
+ """
9
+
10
+ # Module level
11
+ # ---------------------------------------------------------------------------
12
+ __all__ = [
13
+ "DADF",
14
+ "DataAnalystDataFrameColumnMethodMixin",
15
+ "DataAnalystDataFrameRowMethodMixin",
16
+ "DataAnalystDataFrameInfoMixin",
17
+ "DataAnalystDataFrameNAMixin",
18
+ "DataAnalystDataFrameOtherMixin",
19
+ "DataAnalystDataFrameDateMixin",
20
+ "DataAnalystDataFrameCityMixin",
21
+ ]
22
+
23
+
24
+ # Library
25
+ # ---------------------------------------------------------------------------
26
+ import random
27
+ import string
28
+ from collections.abc import Callable, Sequence
29
+ from datetime import datetime, timedelta
30
+ from typing import Any, Literal, Self
31
+
32
+ import numpy as np
33
+ import pandas as pd
34
+
35
+ try:
36
+ from typing import override # type: ignore
37
+ except ImportError:
38
+ from absfuyu.core.decorator import dummy_decorator as override
39
+
40
+ from absfuyu.core.baseclass import ShowAllMethodsMixin
41
+ from absfuyu.core.docstring import deprecated, versionadded
42
+ from absfuyu.extra.da.dadf_base import CityData
43
+ from absfuyu.extra.da.dadf_base import DataAnalystDataFrameBase as DFBase
44
+ from absfuyu.extra.da.dadf_base import SplittedDF
45
+ from absfuyu.logger import logger
46
+ from absfuyu.typings import R as _R
47
+ from absfuyu.typings import T as _T
48
+ from absfuyu.util import set_min_max
49
+
50
+
51
+ # Column method
52
+ # ---------------------------------------------------------------------------
53
class DataAnalystDataFrameColumnMethodMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Column method

    - Rearrange rightmost column
    - Drop columns
    - Drop rightmost column
    - Add blank column
    """

    def rearrange_rightmost_column(
        self, insert_to_col: str, num_of_cols: int = 1
    ) -> Self:
        """
        Move right-most columns to selected position

        Parameters
        ----------
        insert_to_col : str
            Name of the column that the right-most column will be moved next to

        num_of_cols : int
            Number of columns moved, by default ``1``

        Returns
        -------
        Self
            Modified DataFrame

        Raises
        ------
        ValueError
            When ``insert_to_col`` is not a column of the DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0 -1.583590         756           700            NaN  eqklyckc 2023-05-20
        1  0.203968         167           100            NaN  wzrsxinb 2011-02-27
        >>> df.rearrange_rightmost_column("number")
             number       date  number_big  number_range  missing_value      text
        0 -1.583590 2023-05-20         756           700            NaN  eqklyckc
        1  0.203968 2011-02-27         167           100            NaN  wzrsxinb
        """
        cols: list[str] = self.columns.to_list()  # List of columns
        # Clamp to [1, len(cols)] so the slices below stay well-defined
        num_of_cols = int(set_min_max(num_of_cols, min_value=1, max_value=len(cols)))
        col_index: int = cols.index(insert_to_col)  # raises ValueError when absent
        # <head incl. anchor> + <moved rightmost cols> + <remainder>
        new_cols: list[str] = (
            cols[: col_index + 1]
            + cols[-num_of_cols:]
            + cols[col_index + 1 : len(cols) - num_of_cols]
        )
        self = self.__class__(self[new_cols])
        return self

    def drop_columns(self, columns: Sequence[str]) -> Self:
        """
        Drop columns in DataFrame (missing columns are skipped silently)

        Parameters
        ----------
        columns : Sequence[str]
            List of columns need to drop

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0 -0.283019         666           600            NaN  ztoeeblx 2022-11-13
        1  1.194725         939           900            NaN  fxardqvh 2005-08-04
        >>> df.drop_columns(["date", "text"])
             number  number_big  number_range  missing_value
        0 -0.283019         666           600            NaN
        1  1.194725         939           900            NaN
        """
        for column in columns:
            try:
                self.drop(columns=[column], inplace=True)
            except KeyError:
                # Best-effort: a non-existing column is logged, not raised
                logger.debug(f"{column} column does not exist")
        return self

    def drop_rightmost(self, num_of_cols: int = 1) -> Self:
        """
        Drop ``num_of_cols`` right-most columns

        Parameters
        ----------
        num_of_cols : int
            Number of columns to drop
            (clamped to ``[1, number of columns]``)

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0  0.851953         572           500              5  ncpbnzef 2020-08-15
        1  0.381643         595           500             53  iojogbgj 2011-12-04
        >>> df.drop_rightmost(5)
             number
        0  0.851953
        1  0.381643
        """
        # Restrain to a valid count
        num_of_cols = int(
            set_min_max(num_of_cols, min_value=1, max_value=self.shape[1])
        )

        # Drop all rightmost columns in one pass instead of one full-frame
        # ``drop`` call per column (same result, fewer passes over the frame)
        self.drop(columns=self.columns[-num_of_cols:], inplace=True)
        return self

    @deprecated("5.1.0", reason="Use pd.DataFrame.assign(...) method instead")
    def add_blank_column(self, column_name: str, fill: Any = np.nan, /) -> Self:
        """
        Add a blank column

        Parameters
        ----------
        column_name : str
            Name of the column to add

        fill : Any
            Fill the column with data

        Returns
        -------
        Self
            Modified DataFrame
        """
        self[column_name] = [fill] * self.shape[0]
        return self
202
+
203
+
204
+ # Row method
205
+ # ---------------------------------------------------------------------------
206
class DataAnalystDataFrameRowMethodMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Row method

    - Get different rows
    """

    @versionadded("4.0.0")
    def get_different_rows(self, other: Self | pd.DataFrame) -> Self:
        """
        Subtract DataFrame to find the different rows

        Parameters
        ----------
        other : Self | pd.DataFrame
            DataFrame to subtract

        Returns
        -------
        Self
            Different row DataFrame


        Example:
        --------
        >>> df1 = DADF({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]})
        >>> df2 = DADF({"A": [1, 2, 3, 4], "B": [7, 6, 6, 8]})
        >>> df1.get_different_rows(df2)
           A  B
        0  1  7
        2  3  6
        """
        # Right-merge with an indicator column, then keep only the rows
        # that exist solely in ``other``
        merged = self.copy().merge(other, indicator=True, how="right")
        right_only = merged["_merge"] == "right_only"
        diff = merged[right_only].drop(columns="_merge")
        return self.__class__(diff)
245
+
246
+
247
+ # Info
248
+ # ---------------------------------------------------------------------------
249
class DataAnalystDataFrameInfoMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Info

    - Quick info
    - Quick describe
    - Show distribution
    - Threshold filter
    """

    # Quick info
    @versionadded("3.2.0")
    def qinfo(self) -> str:
        """
        Show quick information about DataFrame

        Returns
        -------
        str
            Summary: row/column counts, total observations,
            missing-value count and rate, column names

        Example:
        --------
        >>> DADF.sample_df().qinfo()
        Dataset Information:
        - Number of Rows: 100
        - Number of Columns: 6
        - Total observation: 600
        - Missing value: 13 (2.17%)

        Column names:
        ['number', 'number_big', 'number_range', 'missing_value', 'text', 'date']
        """
        missing_values = self.isnull().sum().sum()
        total_observation = self.shape[0] * self.shape[1]
        # Guard: an empty DataFrame (0 rows or 0 columns) would otherwise
        # raise ZeroDivisionError here
        mv_rate = (
            missing_values / total_observation * 100 if total_observation else 0.0
        )
        info = (
            f"Dataset Information:\n"
            f"- Number of Rows: {self.shape[0]:,}\n"
            f"- Number of Columns: {self.shape[1]:,}\n"
            f"- Total observation: {total_observation:,}\n"
            f"- Missing value: {missing_values:,} ({mv_rate:.2f}%)\n\n"
            f"Column names:\n{self.columns.to_list()}"
        )
        return info

    @override
    def describe(self, percentiles=None, include=None, exclude=None) -> Self:
        """pd.DataFrame.describe() override (keeps the subclass type)"""
        return self.__class__(super().describe(percentiles, include, exclude))  # type: ignore [no-any-return]

    # Quick describe
    @versionadded("3.2.0")
    def qdescribe(self) -> Self:
        """
        Quick ``describe()`` that exclude ``object`` and ``datetime`` dtype

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> DADF.sample_df().qdescribe()
                   number  number_big  missing_value
        count  100.000000  100.000000      48.000000
        mean    -0.052935  586.750000      22.916667
        std      0.954170  237.248596      11.987286
        min     -2.392952  105.000000       3.000000
        25%     -0.738311  407.500000      13.000000
        50%     -0.068014  607.000000      23.500000
        75%      0.614025  790.250000      36.000000
        max      2.512533  988.000000      42.000000
        """
        return self.__class__(  # type: ignore [no-any-return]
            self[self.select_dtypes(exclude=["object", "datetime"]).columns].describe()
        )

    @versionadded("3.2.0")
    def show_distribution(
        self,
        column_name: str,
        dropna: bool = True,
        *,
        show_percentage: bool = True,
        percentage_round_up: int = 2,
    ) -> Self:
        """
        Show distribution of a column

        Parameters
        ----------
        column_name : str
            Column to show distribution

        dropna : bool
            Count N/A when ``False``
            (Default: ``True``)

        show_percentage : bool
            Show proportion in range 0% - 100% instead of [0, 1]
            (Default: ``True``)

        percentage_round_up : int
            Round up to which decimals
            (Default: ``2``)

        Returns
        -------
        Self
            Distribution DataFrame


        Example:
        --------
        >>> DADF.sample_df().show_distribution("number_range")
           number_range  count  percentage
        0           900     16        16.0
        1           700     15        15.0
        2           300     12        12.0
        3           200     12        12.0
        4           400     11        11.0
        5           600     11        11.0
        6           800     10        10.0
        7           100      9         9.0
        8           500      4         4.0
        """
        out = self[column_name].value_counts(dropna=dropna).to_frame().reset_index()
        # Proportion is relative to the total row count of the source frame
        if show_percentage:
            out["percentage"] = (out["count"] / self.shape[0] * 100).round(
                percentage_round_up
            )
        else:
            out["percentage"] = (out["count"] / self.shape[0]).round(
                percentage_round_up
            )
        return self.__class__(out)

    @deprecated("5.1.0", reason="Rework THIS")
    def threshold_filter(
        self,
        destination_column: str,
        threshold: int | float = 10,
        *,
        top: int | None = None,
        replace_with: Any = "Other",
    ) -> Self:
        """
        Filter out percentage of data that smaller than the ``threshold``,
        replace all of the smaller data to ``replace_with``.
        As a result, pie chart is less messy.

        Parameters
        ----------
        destination_column : str
            Column to be filtered

        threshold : int | float
            Which percentage to cut-off
            (Default: 10%)

        top : int
            Only show top ``x`` categories in pie chart
            (replace threshold mode)
            (Default: ``None``)

        replace_with : Any
            Replace all of the smaller data with specified value

        Returns
        -------
        Self
            Modified DataFrame (adds a ``<destination_column>_filtered`` column)
        """
        # Clean: strip trailing spaces; best-effort only — non-string
        # columns have no ``.str`` accessor and are left untouched
        try:
            self[destination_column] = self[destination_column].str.strip()
        except Exception:
            pass

        # Frequency table of the column (uses show_distribution above)
        col_df = self.show_distribution(destination_column)

        # Decide which values keep their own name
        if top is not None:
            list_of_keep: list = (
                col_df[destination_column]
                .head(set_min_max(top - 1, min_value=1, max_value=col_df.shape[0]))
                .to_list()
            )
        else:
            list_of_keep = col_df[col_df["percentage"] >= threshold][
                destination_column
            ].to_list()  # values that will not be renamed
        self[f"{destination_column}_filtered"] = self[destination_column].apply(
            lambda x: replace_with if x not in list_of_keep else x
        )

        # Return
        return self
449
+
450
+
451
+ # Missing value
452
+ # ---------------------------------------------------------------------------
453
class DataAnalystDataFrameNAMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Missing value

    - Fill missing values
    - Get missing values
    - Split N/A
    - Apply not null
    - Apply not null row
    """

    def fill_missing_values(
        self, column_name: str, fill: Any = np.nan, *, fill_when_not_exist: Any = np.nan
    ) -> Self:
        """
        Fill missing values in specified column

        Parameters
        ----------
        column_name : str
            Column name

        fill : Any
            Fill the missing values with, by default ``np.nan``

        fill_when_not_exist : Any
            When ``column_name`` does not exist,
            create a new column and fill with
            ``fill_when_not_exist``, by default ``np.nan``

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0  0.174303         926           900            NaN  tenkiakh 2006-09-08
        1  0.305137         140           100            NaN  jzuddamf 2012-04-04
        >>> df.fill_missing_values("missing_value", 0)
             number  number_big  number_range  missing_value      text       date
        0  0.174303         926           900            0.0  tenkiakh 2006-09-08
        1  0.305137         140           100            0.0  jzuddamf 2012-04-04
        >>> df.fill_missing_values("missing_column", 0, fill_when_not_exist=0)
             number  number_big  number_range  missing_value      text       date  missing_column
        0  0.174303         926           900            0.0  tenkiakh 2006-09-08               0
        1  0.305137         140           100            0.0  jzuddamf 2012-04-04               0
        """
        try:
            self[column_name] = self[column_name].fillna(fill)
        except KeyError:
            # Column missing: fall back to creating it — but only when the
            # ColumnMethod mixin is also present (duck-typed check so this
            # mixin stays usable stand-alone)
            if getattr(self, "add_blank_column", None) is not None:
                # Compatible with DataAnalystDataFrameColumnMethodMixin
                self.add_blank_column(column_name, fill_when_not_exist)
        return self

    def get_missing_values(
        self, hightlight: bool = True, *, percentage_round_up: int = 2
    ) -> Self:
        """
        Get a DataFrame contains count of missing values for each column

        Parameters
        ----------
        hightlight : bool
            Shows only columns with missing values when ``True``, by default ``True``
            (NOTE(review): parameter name is a typo of "highlight" but is part
            of the public interface — renaming would break keyword callers)

        percentage_round_up : int
            Round up to which decimals, by default ``2``

        Returns
        -------
        Self
            Missing value DataFrame


        Example:
        --------
        >>> DADF.sample_df(152).get_missing_values()
                       Num of N/A  Percentage
        missing_value          42       27.63
        """
        # Per-column N/A counts, most-missing first
        df_na = self.isnull().sum().sort_values(ascending=False)
        if hightlight:
            out = df_na[df_na != 0].to_frame()
        else:
            out = df_na.to_frame()
        # The unnamed Series becomes column ``0`` after to_frame()
        out.rename(columns={0: "Num of N/A"}, inplace=True)
        out["Percentage"] = (out["Num of N/A"] / self.shape[0] * 100).round(
            percentage_round_up
        )

        return self.__class__(out)

    @versionadded("3.1.0")
    def split_na(self, by_column: str) -> SplittedDF:
        """
        Split DataFrame into 2 parts:
        - Without missing value in specified column
        - With missing value in specified column

        Parameters
        ----------
        by_column : str
            Split by column

        Returns
        -------
        SplittedDF
            Splitted DataFrame


        Example:
        --------
        >>> DADF.sample_df(10).split_na("missing_value")
        SplittedDF(
            df=     number  number_big  number_range  missing_value      text       date
        0  0.643254         690           600            3.0  cinvofwj 2018-08-15
        2  0.499345         255           200           13.0  jasifzez 2005-06-01
        3 -1.727036         804           800           38.0  esxjmger 2009-07-24
        4  0.873058         690           600           32.0  htewfpld 2022-07-22
        5 -2.389884         442           400           30.0  hbcnfogu 2006-02-25
        8  0.264584         432           400            2.0  ejbvbmwn 2013-05-11
        9  0.813655         137           100           20.0  oecttada 2024-11-22,
            df_na=     number  number_big  number_range  missing_value      text       date
        1 -0.411354         363           300            NaN  juzecani 2014-12-02
        6 -0.833857         531           500            NaN  ybnntryh 2023-11-03
        7  1.355589         472           400            NaN  zjltghjr 2024-10-09
        )
        """
        out = SplittedDF(
            df=self[self[by_column].notna()],  # rows without N/A in by_column
            df_na=self[self[by_column].isna()],  # rows with N/A in by_column
        )
        return out

    @versionadded("5.1.0")
    def apply_notnull(self, col: str, callable: Callable[[Any], _R]) -> Self:
        """
        Only apply callable to not NaN value in column

        Parameters
        ----------
        col : str
            Column to apply

        callable : Callable[[Any], _R]
            Callable
            (NOTE: parameter shadows the ``callable`` builtin inside this
            method; kept for interface compatibility)

        Returns
        -------
        Self
            Applied DataFrame


        Example:
        --------
        >>> DADF.sample_df(5).apply_notnull("missing_value", lambda _: "REPLACED")
             number  number_big  number_range missing_value      text       date
        0  0.852218         157           100      REPLACED  dqzxaxxs 2006-03-08
        1  1.522428         616           600           NaN  mivkaooe 2018-12-27
        2  0.108506         745           700      REPLACED  qanwwjet 2005-07-14
        3 -1.435079         400           400      REPLACED  ywahcasi 2024-05-20
        4  0.118993         861           800      REPLACED  saoupuby 2019-04-28
        """
        # NaN values pass through untouched
        self[col] = self[col].apply(lambda x: callable(x) if pd.notnull(x) else x)
        return self

    @versionadded("5.1.0")  # type: ignore
    def apply_notnull_row(
        self,
        apply_when_null: Callable[[Any], _R] | _T | None = None,
        apply_when_not_null: Callable[[Any], _R] | _T | None = None,
        col_name: str | None = None,
    ) -> Self:
        """
        Apply to DataFrame's row with missing value.

        Parameters
        ----------
        apply_when_null : Callable[[Any], R] | T | None, optional
            Callable (applied to the row) or plain value used when the row
            contains any null; by default ``None``

        apply_when_not_null : Callable[[Any], R] | T | None, optional
            Callable (applied to the row) or plain value used when the row
            is fully non-null; by default ``None``

        col_name : str | None, optional
            Output column name, by default ``None`` (uses ``"applied_row_null"``)

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF({"A": [None, 2, 3, 4], "B": [1, None, 3, 4], "C": [None, 2, None, 4]})
        >>> df.apply_notnull_row()
             A    B    C  applied_row_null
        0  NaN  1.0  NaN             False
        1  2.0  NaN  2.0             False
        2  3.0  3.0  NaN             False
        3  4.0  4.0  4.0              True
        >>> df.apply_notnull_row(0, 1)
             A    B    C  applied_row_null
        0  NaN  1.0  NaN                 0
        1  2.0  NaN  2.0                 0
        2  3.0  3.0  NaN                 0
        3  4.0  4.0  4.0                 1
        >>> df.apply_notnull_row(lambda _: "n", lambda _: "y", col_name="mod")
             A    B    C mod
        0  NaN  1.0  NaN   n
        1  2.0  NaN  2.0   n
        2  3.0  3.0  NaN   n
        3  4.0  4.0  4.0   y
        """

        # Per-row dispatcher: default mode (both args None) emits a boolean
        # null-free flag; otherwise callables are applied to the row and
        # plain values are used verbatim
        def apply_func(row: pd.Series):
            # Both None
            if apply_when_null is None and apply_when_not_null is None:
                return row.notnull().all()

            # When all values in row are not null
            if row.notnull().all():
                if callable(apply_when_not_null):
                    return apply_when_not_null(row)
                return apply_when_not_null

            # When any value in row is null
            if callable(apply_when_null):
                return apply_when_null(row)
            return apply_when_null

        # Column name
        cname = "applied_row_null" if col_name is None else col_name
        self[cname] = self.apply(apply_func, axis=1)

        return self
702
+
703
+
704
+ # Other
705
+ # ---------------------------------------------------------------------------
706
class DataAnalystDataFrameOtherMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Other method/Stuff

    - Merge left
    """

    @versionadded("4.0.0")
    def merge_left(
        self,
        other: Self | pd.DataFrame,
        on: str,
        columns: list[str] | None = None,
    ) -> Self:
        """
        Merge left of 2 DataFrame

        Parameters
        ----------
        other : Self | pd.DataFrame
            DataFrame to merge

        on : str
            Merge on which column

        columns : list[str] | None, optional
            Columns to take from other DataFrame, by default ``None``
            (Take all columns)

        Returns
        -------
        Self
            Merged DataFrame


        Example:
        --------
        >>> df1 = DADF({
        ...     "id": [1, 2, 5],
        ...     "name": ["Alice", "Bob", "Rich"],
        ...     "age": [20, 20, 20],
        ... })
        >>> df2 = DADF({
        ...     "id": [1, 2, 3],
        ...     "age": [25, 30, 45],
        ...     "department": ["HR", "IT", "PM"],
        ...     "salary": [50000, 60000, 55000],
        ... })
        >>> df1.merge_left(df2, on="id")
           id   name  age_x  age_y department   salary
        0   1  Alice     20   25.0         HR  50000.0
        1   2    Bob     20   30.0         IT  60000.0
        2   5   Rich     20    NaN        NaN      NaN
        >>> df1.merge_left(df2, on="id", columns=["salary"])
           id   name   age department   salary
        0   1  Alice  25.0         HR  50000.0
        1   2    Bob  30.0         IT  60000.0
        2   5   Rich   NaN        NaN      NaN
        """

        # NOTE(review): when ``columns`` is given, the columns computed below
        # (other's columns minus ``on``/``columns``) are dropped from *self*,
        # not from ``other`` — so ``other``'s extra columns still appear in
        # the result (see the second example, where ``department`` survives).
        # This matches the documented examples; confirm it is intended.
        if columns is not None:
            current_col = [on]
            current_col.extend(columns)
            col = other.columns.to_list()
            cols = list(set(col) - set(current_col))

            # Duck-typed check so this mixin works without the ColumnMethod mixin
            if getattr(self, "drop_columns", None) is not None:
                # Compatible with DataAnalystDataFrameColumnMethodMixin
                self.drop_columns(cols)

        out = self.merge(other, how="left", on=on)
        return self.__class__(out)
778
+
779
+
780
+ # Date
781
+ # ---------------------------------------------------------------------------
782
class DataAnalystDataFrameDateMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Date

    - Add date column from month column
    - Add detail date
    - Delta date (How many days inbetween)
    """

    def add_date_from_month(self, month_column: str, *, col_name: str = "date") -> Self:
        """
        Add dummy ``date`` column from ``month`` column

        Parameters
        ----------
        month_column : str
            Month column (values must be convertible to int, 1-12)

        col_name : str
            New date column name, by default: ``"date"``

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = (
        ...     DADF.sample_df(2)
        ...     .add_detail_date("date", mode="m")
        ...     .drop_columns(["date", "number", "number_range"])
        ... )
        >>> df
           number_big  missing_value      text  month
        0         755            NaN  lincgqzl      4
        1         907            NaN  gxltrjku     10
        >>> df.add_date_from_month("month")
           number_big  missing_value      text  month       date
        0         755            NaN  lincgqzl      4 2025-04-01
        1         907            NaN  gxltrjku     10 2025-10-01
        """
        # Synthesize "<current year>-<month>-1" strings and parse them
        _this_year = datetime.now().year
        self[col_name] = pd.to_datetime(
            f"{_this_year}-" + self[month_column].astype(int).astype(str) + "-1",
            format="%Y-%m-%d",
        )

        # Rearrange: move the new column next to the month column
        # (duck-typed so this mixin works without the ColumnMethod mixin)
        if getattr(self, "rearrange_rightmost_column", None) is not None:
            # Compatible with DataAnalystDataFrameColumnMethodMixin
            return self.rearrange_rightmost_column(month_column)  # type: ignore [no-any-return]
        return self

    def add_detail_date(self, date_column: str, mode: str = "dwmy") -> Self:
        """
        Add these columns from ``date_column``:
        - ``date`` (won't add if ``date_column`` value is ``"date"``)
        - ``day`` (overwrite if already exist)
        - ``week`` (overwrite if already exist)
        - ``month`` (overwrite if already exist)
        - ``year`` (overwrite if already exist)

        Parameters
        ----------
        date_column : str
            Date column

        mode : str
            | Detailed column to add
            | ``d``: day
            | ``w``: week number
            | ``m``: month
            | ``y``: year
            | (Default: ``"dwmy"``)

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0  0.331195         902           900             20  fgyanxik 2021-10-18
        1 -0.877727         378           300             13  dqvaggjo 2007-03-06
        >>> df.add_detail_date("date")
             number  number_big  number_range  missing_value      text       date  day  week  month  year
        0  0.331195         902           900             20  fgyanxik 2021-10-18   18    42     10  2021
        1 -0.877727         378           300             13  dqvaggjo 2007-03-06    6    10      3  2007
        """
        # Convert to datetime — NOTE: this always writes the ``date`` column
        # (it overwrites an existing ``date`` when date_column == "date")
        self["date"] = pd.to_datetime(self[date_column])

        # Each requested component adds one column; the counter tells the
        # rearrange step how many rightmost columns to relocate
        col_counter = 0
        if mode.find("d") != -1:
            logger.debug("Mode: 'day'")
            self["day"] = self["date"].dt.day
            col_counter += 1
        if mode.find("w") != -1:
            logger.debug("Mode: 'weekday'")
            # ISO calendar week number
            self["week"] = self["date"].dt.isocalendar().week
            col_counter += 1
        if mode.find("m") != -1:
            logger.debug("Mode: 'month'")
            self["month"] = self["date"].dt.month
            col_counter += 1
        if mode.find("y") != -1:
            logger.debug("Mode: 'year'")
            self["year"] = self["date"].dt.year
            col_counter += 1

        # Rearrange the new columns next to the source column
        # (duck-typed so this mixin works without the ColumnMethod mixin)
        if getattr(self, "rearrange_rightmost_column", None) is not None:
            # Compatible with DataAnalystDataFrameColumnMethodMixin
            return self.rearrange_rightmost_column(date_column, col_counter)  # type: ignore [no-any-return]
        return self

    def delta_date(
        self,
        date_column: str,
        mode: Literal["now", "between_row"] = "now",
        *,
        col_name: str = "delta_date",
    ) -> Self:
        """
        Calculate date interval

        Parameters
        ----------
        date_column : str
            Date column (must hold datetime-like values)

        mode : str
            | Mode to calculate
            | ``"between_row"``: Calculate date interval between each row
            | ``"now"``: Calculate date interval to current date
            | (Default: ``"now"``)

        col_name : str
            | New delta date column name
            | (Default: ``"delta_date"``)

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big  number_range  missing_value      text       date
        0 -0.729988         435           400             21  xkrqqouf 2014-08-01
        1 -0.846031         210           200              5  rbkmiqxt 2024-07-10
        >>> df.delta_date("date")
             number  number_big  number_range  missing_value      text       date  delta_date
        0 -0.729988         435           400             21  xkrqqouf 2014-08-01        3873
        1 -0.846031         210           200              5  rbkmiqxt 2024-07-10         242
        """
        if mode.lower().startswith("between_row"):
            dated = self[date_column].to_list()
            cal: list[timedelta] = []
            for i in range(len(dated)):
                if i == 0:
                    # First row has no predecessor: zero-length interval
                    cal.append(dated[i] - dated[i])
                else:
                    cal.append(dated[i] - dated[i - 1])
            self[col_name] = [x.days for x in cal]
        else:  # mode="now"
            # Days elapsed from each row's date to now (negative for future dates)
            self[col_name] = self[date_column].apply(
                lambda x: (datetime.now() - x).days
            )
        return self
965
+
966
+
967
+ # City
968
+ # ---------------------------------------------------------------------------
969
class DataAnalystDataFrameCityMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - City

    - Convert city
    """

    def convert_city(
        self,
        city_column: str,
        city_list: list[CityData],
        *,
        mode: str = "ra",
    ) -> Self:
        """
        Get ``region`` and ``area`` of a city

        Parameters
        ----------
        city_column : str
            Column contains city data

        city_list : list[CityData]
            List of city in correct format
            (Default: ``None``)

        mode : str
            | Detailed column to add
            | ``r``: region
            | ``a``: area
            | (Default: ``"ra"``)

        Returns
        -------
        DataAnalystDataFrame
            Modified DataFrame
        """

        # Case-insensitive prefix lookup; falls back to an entry with
        # NaN region/area when no city matches
        def _lookup(value: str) -> CityData:
            hits = (
                entry
                for entry in city_list
                if entry.city.lower().startswith(value.lower())
            )
            return next(hits, CityData(city=value, region=np.nan, area=np.nan))  # type: ignore

        # Add one column per requested component, counting how many were added
        added = 0
        if "r" in mode:
            logger.debug("Mode: 'region'")
            self["region"] = self[city_column].apply(lambda v: _lookup(v).region)
            added += 1
        if "a" in mode:
            logger.debug("Mode: 'area'")
            self["area"] = self[city_column].apply(lambda v: _lookup(v).area)
            added += 1

        # Move the new columns next to the city column when the
        # ColumnMethod mixin is present (duck-typed check)
        if getattr(self, "rearrange_rightmost_column", None) is not None:
            return self.rearrange_rightmost_column(city_column, added)  # type: ignore [no-any-return]
        return self
1033
+
1034
+
1035
+ # Main
1036
+ # ---------------------------------------------------------------------------
1037
class DADF(
    ShowAllMethodsMixin,
    DataAnalystDataFrameCityMixin,
    DataAnalystDataFrameDateMixin,
    DataAnalystDataFrameOtherMixin,
    DataAnalystDataFrameNAMixin,
    DataAnalystDataFrameInfoMixin,
    DataAnalystDataFrameRowMethodMixin,
    DataAnalystDataFrameColumnMethodMixin,
):
    """
    Data Analyst ``pd.DataFrame``

    For a list of extra methods:
    >>> print(DADF.DADF_METHODS)
    """

    @classmethod
    @deprecated("5.1.0")
    @versionadded("3.2.0")
    def dadf_help(cls) -> list[str]:
        """
        Show all available methods of DataAnalystDataFrame.

        Returns
        -------
        list[str]
            Sorted names of methods/attributes this class adds on top
            of a plain ``pd.DataFrame``.
        """
        extra_methods = set(dir(cls)) - set(dir(pd.DataFrame))
        return sorted(extra_methods)

    @classmethod
    def sample_df(cls, size: int = 100) -> Self:
        """
        Create sample DataFrame

        Parameters
        ----------
        size : int
            Number of observations (clamped to at least 1), by default ``100``

        Returns
        -------
        Self
            DataFrame with these columns:
            [number, number_big, number_range, missing_value, text, date]


        Example:
        --------
        >>> DADF.sample_df()
            number  number_big number_range  missing_value      text       date
        0  -2.089770         785          700            NaN  vwnlqoql 2013-11-20
        1  -0.526689         182          100           24.0  prjjcvqc 2007-04-13
        2  -1.596514         909          900            8.0  cbcpzlac 2023-05-24
        3   2.982191         989          900           21.0  ivwqwuvd 2022-04-28
        4   1.687803         878          800            NaN  aajtncum 2005-10-05
        ..       ...         ...          ...            ...       ...        ...
        95 -1.295145         968          900           16.0  mgqunkhi 2016-04-12
        96  1.296795         255          200            NaN  lwvytego 2014-05-10
        97  1.440746         297          200            5.0  lqsoykun 2010-04-03
        98  0.327702         845          800            NaN  leadkvsy 2005-08-05
        99  0.556720         981          900           36.0  bozmxixy 2004-02-22
        [100 rows x 6 columns]
        """
        # Restrain: guarantee at least one observation
        size = max(size, 1)

        # Number col: standard-normal floats
        df = cls(np.random.randn(size, 1), columns=["number"])
        df["number_big"] = [
            random.randint(100, 998) for _ in range(size)
        ]  # Big number in range 100-998
        # Hundreds bucket as a string, e.g. 435 -> "400"
        df["number_range"] = df["number_big"].apply(lambda x: str(x)[0] + "00")

        # Missing value col: values >= a random threshold become NaN
        na_rate = random.randint(1, 99)
        d = [random.randint(1, 99) for _ in range(size)]
        df["missing_value"] = [x if x < na_rate else np.nan for x in d]

        # Text col: 8 random lowercase letters per row
        df["text"] = [
            "".join(random.choices(string.ascii_lowercase, k=8))
            for _ in range(size)
        ]

        # Random date col: any day in the last 20 years
        # (day capped at 28 so every month is valid)
        df["date"] = [
            datetime(
                year=random.randint(datetime.now().year - 20, datetime.now().year),
                month=random.randint(1, 12),
                day=random.randint(1, 28),
            )
            for _ in range(size)
        ]

        # Return
        return df
1132
+
1133
+
1134
+ class DADF_WIP(DADF):
1135
+ """
1136
+ W.I.P - No test cases written
1137
+ """
1138
+
1139
+ def split_str_column(
1140
+ self,
1141
+ col: str,
1142
+ pattern: str = " ",
1143
+ *,
1144
+ n: int | None = None,
1145
+ regex: bool = False,
1146
+ ) -> Self:
1147
+ """
1148
+ Split column with dtype[str] into other columns.
1149
+
1150
+ Parameters
1151
+ ----------
1152
+ col : str
1153
+ Column name
1154
+
1155
+ pattern : str, optional
1156
+ Split pattern, by default ``" "``
1157
+
1158
+ n : int | None, optional
1159
+ Split by how many times, by default ``None``
1160
+
1161
+ regex : bool, optional
1162
+ Regex mode, by default ``False``
1163
+
1164
+ Returns
1165
+ -------
1166
+ Self
1167
+ DataFrame
1168
+ """
1169
+ if n is None:
1170
+ pass
1171
+ splited_data: pd.DataFrame = self[col].str.split(
1172
+ pat=pattern, n=n, expand=True, regex=regex
1173
+ )
1174
+ num_of_splitted_cols = splited_data.shape[1]
1175
+ new_col_names = [f"{col}_{x}" for x in range(num_of_splitted_cols)]
1176
+ self[new_col_names] = splited_data
1177
+ return self