absfuyu 5.0.0__py3-none-any.whl → 6.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of absfuyu might be problematic. Click here for more details.

Files changed (103) hide show
  1. absfuyu/__init__.py +5 -3
  2. absfuyu/__main__.py +3 -3
  3. absfuyu/cli/__init__.py +13 -2
  4. absfuyu/cli/audio_group.py +98 -0
  5. absfuyu/cli/color.py +30 -14
  6. absfuyu/cli/config_group.py +9 -2
  7. absfuyu/cli/do_group.py +23 -6
  8. absfuyu/cli/game_group.py +27 -2
  9. absfuyu/cli/tool_group.py +81 -11
  10. absfuyu/config/__init__.py +3 -3
  11. absfuyu/core/__init__.py +12 -8
  12. absfuyu/core/baseclass.py +929 -96
  13. absfuyu/core/baseclass2.py +44 -3
  14. absfuyu/core/decorator.py +70 -4
  15. absfuyu/core/docstring.py +64 -41
  16. absfuyu/core/dummy_cli.py +3 -3
  17. absfuyu/core/dummy_func.py +19 -6
  18. absfuyu/dxt/__init__.py +2 -2
  19. absfuyu/dxt/base_type.py +93 -0
  20. absfuyu/dxt/dictext.py +204 -16
  21. absfuyu/dxt/dxt_support.py +2 -2
  22. absfuyu/dxt/intext.py +151 -34
  23. absfuyu/dxt/listext.py +969 -127
  24. absfuyu/dxt/strext.py +77 -17
  25. absfuyu/extra/__init__.py +2 -2
  26. absfuyu/extra/audio/__init__.py +8 -0
  27. absfuyu/extra/audio/_util.py +57 -0
  28. absfuyu/extra/audio/convert.py +192 -0
  29. absfuyu/extra/audio/lossless.py +281 -0
  30. absfuyu/extra/beautiful.py +3 -2
  31. absfuyu/extra/da/__init__.py +72 -0
  32. absfuyu/extra/da/dadf.py +1600 -0
  33. absfuyu/extra/da/dadf_base.py +186 -0
  34. absfuyu/extra/da/df_func.py +181 -0
  35. absfuyu/extra/da/mplt.py +219 -0
  36. absfuyu/extra/ggapi/__init__.py +8 -0
  37. absfuyu/extra/ggapi/gdrive.py +223 -0
  38. absfuyu/extra/ggapi/glicense.py +148 -0
  39. absfuyu/extra/ggapi/glicense_df.py +186 -0
  40. absfuyu/extra/ggapi/gsheet.py +88 -0
  41. absfuyu/extra/img/__init__.py +30 -0
  42. absfuyu/extra/img/converter.py +402 -0
  43. absfuyu/extra/img/dup_check.py +291 -0
  44. absfuyu/extra/pdf.py +87 -0
  45. absfuyu/extra/rclone.py +253 -0
  46. absfuyu/extra/xml.py +90 -0
  47. absfuyu/fun/__init__.py +7 -20
  48. absfuyu/fun/rubik.py +442 -0
  49. absfuyu/fun/tarot.py +2 -2
  50. absfuyu/game/__init__.py +2 -2
  51. absfuyu/game/game_stat.py +2 -2
  52. absfuyu/game/schulte.py +78 -0
  53. absfuyu/game/sudoku.py +2 -2
  54. absfuyu/game/tictactoe.py +2 -3
  55. absfuyu/game/wordle.py +6 -4
  56. absfuyu/general/__init__.py +4 -4
  57. absfuyu/general/content.py +4 -4
  58. absfuyu/general/human.py +2 -2
  59. absfuyu/general/resrel.py +213 -0
  60. absfuyu/general/shape.py +3 -8
  61. absfuyu/general/tax.py +344 -0
  62. absfuyu/logger.py +806 -59
  63. absfuyu/numbers/__init__.py +13 -0
  64. absfuyu/numbers/number_to_word.py +321 -0
  65. absfuyu/numbers/shorten_number.py +303 -0
  66. absfuyu/numbers/time_duration.py +217 -0
  67. absfuyu/pkg_data/__init__.py +2 -2
  68. absfuyu/pkg_data/deprecated.py +2 -2
  69. absfuyu/pkg_data/logo.py +1462 -0
  70. absfuyu/sort.py +4 -4
  71. absfuyu/tools/__init__.py +28 -2
  72. absfuyu/tools/checksum.py +144 -9
  73. absfuyu/tools/converter.py +120 -34
  74. absfuyu/tools/generator.py +461 -0
  75. absfuyu/tools/inspector.py +752 -0
  76. absfuyu/tools/keygen.py +2 -2
  77. absfuyu/tools/obfuscator.py +47 -9
  78. absfuyu/tools/passwordlib.py +89 -25
  79. absfuyu/tools/shutdownizer.py +3 -8
  80. absfuyu/tools/sw.py +718 -0
  81. absfuyu/tools/web.py +10 -13
  82. absfuyu/typings.py +138 -0
  83. absfuyu/util/__init__.py +114 -6
  84. absfuyu/util/api.py +41 -18
  85. absfuyu/util/cli.py +119 -0
  86. absfuyu/util/gui.py +91 -0
  87. absfuyu/util/json_method.py +43 -14
  88. absfuyu/util/lunar.py +2 -2
  89. absfuyu/util/package.py +124 -0
  90. absfuyu/util/path.py +702 -82
  91. absfuyu/util/performance.py +122 -7
  92. absfuyu/util/shorten_number.py +244 -21
  93. absfuyu/util/text_table.py +481 -0
  94. absfuyu/util/zipped.py +8 -7
  95. absfuyu/version.py +79 -59
  96. {absfuyu-5.0.0.dist-info → absfuyu-6.1.2.dist-info}/METADATA +52 -11
  97. absfuyu-6.1.2.dist-info/RECORD +105 -0
  98. {absfuyu-5.0.0.dist-info → absfuyu-6.1.2.dist-info}/WHEEL +1 -1
  99. absfuyu/extra/data_analysis.py +0 -1078
  100. absfuyu/general/generator.py +0 -303
  101. absfuyu-5.0.0.dist-info/RECORD +0 -68
  102. {absfuyu-5.0.0.dist-info → absfuyu-6.1.2.dist-info}/entry_points.txt +0 -0
  103. {absfuyu-5.0.0.dist-info → absfuyu-6.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1600 @@
1
+ """
2
+ Absfuyu: Data Analysis
3
+ ----------------------
4
+ Data Analyst DataFrame
5
+
6
+ Version: 6.1.1
7
+ Date updated: 30/12/2025 (dd/mm/yyyy)
8
+ """
9
+
10
+ # Module level
11
+ # ---------------------------------------------------------------------------
12
+ __all__ = [
13
+ "DADF",
14
+ "DataAnalystDataFrameColumnMethodMixin",
15
+ "DataAnalystDataFrameRowMethodMixin",
16
+ "DataAnalystDataFrameInfoMixin",
17
+ "DataAnalystDataFrameNAMixin",
18
+ "DataAnalystDataFrameOtherMixin",
19
+ "DataAnalystDataFrameDateMixin",
20
+ "DataAnalystDataFrameExportMixin",
21
+ "DataAnalystDataFrameCityMixin",
22
+ ]
23
+
24
+
25
+ # Library
26
+ # ---------------------------------------------------------------------------
27
+ import random
28
+ import string
29
+ from collections.abc import Callable, Iterable, Mapping, Sequence
30
+ from datetime import datetime, timedelta
31
+ from typing import Any, Literal, Self, cast, override
32
+
33
+ import numpy as np
34
+ import pandas as pd
35
+ from xlsxwriter import Workbook
36
+ from xlsxwriter.worksheet import Worksheet
37
+
38
+ from absfuyu.core.baseclass import GetClassMembersMixin
39
+ from absfuyu.core.docstring import deprecated, versionadded, versionchanged
40
+ from absfuyu.core.dummy_func import unidecode
41
+ from absfuyu.extra.da.dadf_base import CityData
42
+ from absfuyu.extra.da.dadf_base import DataAnalystDataFrameBase as DFBase
43
+ from absfuyu.extra.da.dadf_base import SplittedDF
44
+ from absfuyu.typings import R as _R
45
+ from absfuyu.typings import T as _T
46
+ from absfuyu.util import set_min_max
47
+
48
+
49
+ # Column method
50
+ # ---------------------------------------------------------------------------
51
class DataAnalystDataFrameColumnMethodMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Column method

    - Rearrange rightmost column
    - Drop columns
    - Drop rightmost column
    - Add blank column
    - Split str column
    - Get column name unidecoded
    - Get column unidecoded
    """

    def rearrange_rightmost_column(
        self, insert_to_col: str, num_of_cols: int = 1
    ) -> Self:
        """
        Move right-most columns to selected position

        Parameters
        ----------
        insert_to_col : str
            Name of the column that the right-most column will be moved next to

        num_of_cols : int
            Number of columns moved (clamped into ``[1, len(columns)]``),
            by default ``1``

        Returns
        -------
        Self
            Modified DataFrame

        Raises
        ------
        ValueError
            If ``insert_to_col`` is not a column of this DataFrame
        """
        cols: list[str] = self.columns.to_list()  # List of columns
        # Clamp so the slices below never exceed the available columns
        num_of_cols = int(set_min_max(num_of_cols, min_value=1, max_value=len(cols)))
        col_index: int = cols.index(insert_to_col)  # ValueError when missing
        new_cols: list[str] = (
            cols[: col_index + 1]
            + cols[-num_of_cols:]
            + cols[col_index + 1 : len(cols) - num_of_cols]
        )
        return self.__class__(self[new_cols])

    def drop_columns(self, columns: Sequence[str]) -> Self:
        """
        Drop columns in DataFrame. Column names that do not exist are
        silently skipped.

        Parameters
        ----------
        columns : Sequence[str]
            List of columns need to drop

        Returns
        -------
        Self
            Modified DataFrame
        """
        # errors="ignore" skips missing labels in a single call, replacing
        # the previous per-column try/except KeyError loop.
        self.drop(columns=list(columns), inplace=True, errors="ignore")
        return self

    def drop_rightmost(self, num_of_cols: int = 1) -> Self:
        """
        Drop ``num_of_cols`` right-most columns

        Parameters
        ----------
        num_of_cols : int
            Number of columns to drop (clamped into ``[1, len(columns)]``)

        Returns
        -------
        Self
            Modified DataFrame
        """
        # Restrain the requested amount to the frame's width
        num_of_cols = int(
            set_min_max(num_of_cols, min_value=1, max_value=self.shape[1])
        )

        # Drop one label at a time from the right edge
        for _ in range(num_of_cols):
            self.drop(self.columns[-1], axis=1, inplace=True)
        return self

    @deprecated("5.1.0", reason="Use pd.DataFrame.assign(...) method instead")
    def add_blank_column(self, column_name: str, fill: Any = np.nan, /) -> Self:
        """
        [DEPRECATED] Add a blank column.

        E.g: Use `pd.DataFrame.assign(new_col=lambda x: x['old_col'])` instead

        Parameters
        ----------
        column_name : str
            Name of the column to add

        fill : Any
            Fill the column with data

        Returns
        -------
        Self
            Modified DataFrame
        """
        self[column_name] = [fill] * self.shape[0]
        return self

    @versionadded("5.2.0")  # No test cases
    def split_str_column(
        self,
        col: str,
        pattern: str = " ",
        *,
        n: int | None = None,
        regex: bool = False,
    ) -> Self:
        """
        Split column with dtype[str] into other columns.

        Each split part becomes a new ``<col>_<i>`` column; rows with fewer
        parts are padded with ``None`` by pandas.

        Parameters
        ----------
        col : str
            Column name

        pattern : str, optional
            Split pattern, by default ``" "``

        n : int | None, optional
            Split by how many times, by default ``None``
            (pandas interprets ``None`` as "all splits")

        regex : bool, optional
            Regex mode, by default ``False``

        Returns
        -------
        Self
            DataFrame
        """
        # Dead `if n is None: pass` removed; pandas already treats n=None
        # as "return all splits".
        split_data: pd.DataFrame = self[col].str.split(
            pat=pattern, n=n, expand=True, regex=regex
        )  # type: ignore
        new_col_names = [f"{col}_{x}" for x in range(split_data.shape[1])]
        self[new_col_names] = split_data
        return self

    @versionadded("5.12.0")  # No test cases
    def get_column_name_unidecoded(
        self, col_name: str, /, *, mode: Literal["start", "end", "in"] = "start"
    ) -> str:
        """
        Get column name from lowercase unidecode'd version name

        Parameters
        ----------
        col_name : str
            Column name to find

        mode : Literal["start", "end", "in"], optional
            Which mode to find, by default "start"
            - "start": str.startswith()
            - "end": str.endswith()
            - "in": substring containment

        Returns
        -------
        str
            Column name

        Raises
        ------
        ValueError
            Column not found
        """
        for x in self.columns.to_list():
            col_name_mod = cast(str, unidecode(x.strip().lower()))
            if mode == "start" and col_name_mod.startswith(col_name):
                return x
            if mode == "end" and col_name_mod.endswith(col_name):
                return x
            # FIX: the original tested `col_name_mod in col_name`, i.e. the
            # column name as a substring of the query — inverted relative to
            # the "start"/"end" modes. Test the query inside the column name.
            if mode == "in" and col_name in col_name_mod:
                return x

        raise ValueError(f"Column not found: {col_name}")

    @versionadded("5.12.0")  # No test cases
    def get_column_unidecoded(
        self, col_name: str, /, *, mode: Literal["start", "end", "in"] = "start"
    ) -> pd.Series:
        """
        Get column from lowercase unidecode'd version column name

        Parameters
        ----------
        col_name : str
            Column name to find

        mode : Literal["start", "end", "in"], optional
            Which mode to find, by default "start"
            - "start": str.startswith()
            - "end": str.endswith()
            - "in": substring containment

        Returns
        -------
        Series
            Column data

        Raises
        ------
        ValueError
            Column not found
        """
        return self[self.get_column_name_unidecoded(col_name, mode=mode)]
319
+
320
+
321
+ # Row method
322
+ # ---------------------------------------------------------------------------
323
class DataAnalystDataFrameRowMethodMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Row method

    - Get different rows
    - Add blank row
    - Add separator row
    """

    @versionadded("4.0.0")
    def get_different_rows(self, other: Self | pd.DataFrame) -> Self:
        """
        Subtract DataFrame to find the different rows

        Parameters
        ----------
        other : Self | pd.DataFrame
            DataFrame to subtract

        Returns
        -------
        Self
            Different row DataFrame
        """
        df = self.copy()
        # Right merge + indicator column, then keep rows present only in
        # `other` ("right_only") and drop the helper column.
        out = (
            df.merge(other, indicator=True, how="right")
            .query("_merge=='right_only'")
            .drop("_merge", axis=1)
        )
        return self.__class__(out)

    @versionchanged("6.0.0", reason="Improved logic")
    @versionadded("5.7.0")
    def add_blank_row(self, fill: Any = np.nan, /) -> Self:
        """
        Add a new row to the end of a DataFrame.

        Parameters
        ----------
        fill : Any, default np.nan
            Value to fill in the new row (e.g., np.nan, None, "", 0).

        Returns
        -------
        Self
            DataFrame with the new row appended.
        """
        # One-row frame matching this frame's columns, cast through
        # NA-safe dtypes so integer columns tolerate the fill value.
        new_row = {col: fill for col in self.columns}
        safe_types = self._safe_dtypes(self.dtypes)
        blank_row_df = pd.DataFrame([new_row], columns=self.columns).astype(safe_types)

        out = cast(pd.DataFrame, pd.concat([self, blank_row_df], ignore_index=True))
        return self.__class__(out)

    @versionadded("6.0.0")  # Support
    def _safe_dtypes(self, dtypes: pd.Series) -> dict[str, Any]:
        """
        Convert DataFrame dtypes into a safe mapping for operations involving
        missing values (NA), especially during row insertion or concatenation.

        NumPy integer dtypes cannot hold missing values, so they are mapped to
        pandas' nullable ``Int64``; every other dtype is kept unchanged. This
        is typically used before ``DataFrame.astype`` to avoid
        ``IntCastingNaNError`` when NA values are present.

        Parameters
        ----------
        dtypes : Series
            A Series mapping column names to their pandas dtypes, typically
            obtained from ``DataFrame.dtypes``.

        Returns
        -------
        dict
            A dictionary mapping column names to safe dtypes. Integer dtypes
            are converted to pandas nullable integer dtype (``"Int64"``),
            while all other dtypes remain unchanged.
        """
        return {
            col: "Int64" if pd.api.types.is_integer_dtype(dt) else dt
            for col, dt in dtypes.items()
        }

    @versionadded("6.0.0")  # Better version of add_blank_row()
    def add_separator_row(
        self,
        group_cols: str | Iterable[str],
        *,
        separator: Mapping[str, object] | None = None,
        drop_last: bool = True,
    ) -> Self:
        """
        Insert a separator row after each group in a DataFrame.

        The DataFrame should be pre-sorted by ``group_cols``; groups are
        taken in their current order.

        Parameters
        ----------
        group_cols : str | Iterable[str]
            Column(s) used to define grouping boundaries.

        separator : Mapping[str, object] | None, optional
            Custom separator row values (e.g. {"col": "---"}).
            Columns not provided will be filled with NaN.
            If None, a fully blank row is inserted.

        drop_last : bool, optional
            If True, do not insert a separator after the last group.

        Returns
        -------
        Self
            DataFrame with separator rows inserted.

        Raises
        ------
        KeyError
            If any of ``group_cols`` is not a column of this DataFrame.
        """
        df = self.copy()

        if isinstance(group_cols, str):
            group_cols = [group_cols]

        # Validate columns
        missing = set(group_cols) - set(df.columns)
        if missing:
            raise KeyError(f"Missing columns: {missing}")

        # Build separator row template
        if separator is None:
            sep_row = {c: np.nan for c in df.columns}
        else:
            sep_row = {c: separator.get(c, np.nan) for c in df.columns}

        rows = []

        safe_types = self._safe_dtypes(df.dtypes)

        # Group while preserving order; append a separator after each group
        for _, g in df.groupby(group_cols, sort=False):
            rows.append(g)

            sep_df = pd.DataFrame([sep_row], columns=df.columns).astype(safe_types)
            rows.append(sep_df)

        # Guard: an empty frame yields no groups; pd.concat([]) would raise
        if not rows:
            return self.__class__(df)

        out = cast(pd.DataFrame, pd.concat(rows, ignore_index=True))

        if drop_last:
            out = out.iloc[:-1].reset_index(drop=True)

        return self.__class__(out)
525
+
526
+
527
+ # Info
528
+ # ---------------------------------------------------------------------------
529
class DataAnalystDataFrameInfoMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Info

    - Quick info
    - Quick describe
    - Show distribution
    - Threshold filter
    """

    # Quick info
    @versionadded("3.2.0")
    def qinfo(self) -> str:
        """
        Show quick information about DataFrame: shape, total observations,
        missing-value count/rate, and column names.

        Returns
        -------
        str
            Multi-line summary text
        """
        missing_values = self.isnull().sum().sum()
        total_observation = self.shape[0] * self.shape[1]
        mv_rate = missing_values / total_observation * 100
        info = (
            f"Dataset Information:\n"
            f"- Number of Rows: {self.shape[0]:,}\n"
            f"- Number of Columns: {self.shape[1]:,}\n"
            f"- Total observation: {total_observation:,}\n"
            f"- Missing value: {missing_values:,} ({mv_rate:.2f}%)\n\n"
            f"Column names:\n{self.columns.to_list()}"
        )
        return info

    @override
    def describe(self, percentiles=None, include=None, exclude=None) -> Self:  # type: ignore
        """pd.DataFrame.describe() override that keeps the subclass type."""
        return self.__class__(super().describe(percentiles, include, exclude))  # type: ignore [no-any-return]

    # Quick describe
    @versionadded("3.2.0")
    def qdescribe(self) -> Self:
        """
        Quick ``describe()`` that exclude ``object`` and ``datetime`` dtype

        Returns
        -------
        Self
            Modified DataFrame
        """
        return self.__class__(  # type: ignore [no-any-return]
            self[self.select_dtypes(exclude=["object", "datetime"]).columns].describe()
        )

    @versionadded("3.2.0")
    def show_distribution(
        self,
        column_name: str,
        dropna: bool = True,
        *,
        show_percentage: bool = True,
        percentage_round_up: int = 2,
    ) -> Self:
        """
        Show distribution of a column

        Parameters
        ----------
        column_name : str
            Column to show distribution

        dropna : bool
            Count N/A when ``False``
            (Default: ``True``)

        show_percentage : bool
            Show proportion in range 0% - 100% instead of [0, 1]
            (Default: ``True``)

        percentage_round_up : int
            Round up to which decimals
            (Default: ``2``)

        Returns
        -------
        Self
            Distribution DataFrame with ``count`` and ``percentage`` columns
        """
        out = self[column_name].value_counts(dropna=dropna).to_frame().reset_index()
        # Same computation either way; only the scale differs
        scale = 100 if show_percentage else 1
        out["percentage"] = (out["count"] / self.shape[0] * scale).round(
            percentage_round_up
        )
        return self.__class__(out)

    @deprecated("5.1.0", reason="Rework THIS")
    def threshold_filter(
        self,
        destination_column: str,
        threshold: int | float = 10,
        *,
        top: int | None = None,
        replace_with: Any = "Other",
    ) -> Self:
        """
        Filter out percentage of data that smaller than the ``threshold``,
        replace all of the smaller data to ``replace_with``.
        As a result, pie chart is less messy.

        Parameters
        ----------
        destination_column : str
            Column to be filtered

        threshold : int | float
            Which percentage to cut-off
            (Default: 10%)

        top : int
            Only show top ``x`` categories in pie chart
            (replace threshold mode)
            (Default: ``None``)

        replace_with : Any
            Replace all of the smaller data with specified value

        Returns
        -------
        Self
            Modified DataFrame with a new ``<destination_column>_filtered``
            column
        """
        # Clean: strip whitespace when the column is str-typed; best-effort,
        # non-str columns are left untouched.
        try:
            self[destination_column] = self[destination_column].str.strip()
        except Exception:
            pass

        # Distribution of the column (count + percentage)
        col_df = self.show_distribution(destination_column)

        # Values that will NOT be renamed
        if top is not None:
            list_of_keep: list = (
                col_df[destination_column]
                .head(set_min_max(top - 1, min_value=1, max_value=col_df.shape[0]))  # type: ignore
                .to_list()
            )
        else:
            list_of_keep = col_df[col_df["percentage"] >= threshold][
                destination_column
            ].to_list()
        self[f"{destination_column}_filtered"] = self[destination_column].apply(
            lambda x: replace_with if x not in list_of_keep else x
        )

        return self
729
+
730
+
731
+ # Missing value
732
+ # ---------------------------------------------------------------------------
733
class DataAnalystDataFrameNAMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Missing value

    - Fill missing values
    - Get missing values
    - Split N/A
    - Apply not null
    - Apply not null row
    """

    def fill_missing_values(
        self, column_name: str, fill: Any = np.nan, *, fill_when_not_exist: Any = np.nan
    ) -> Self:
        """
        Fill missing values in specified column

        Parameters
        ----------
        column_name : str
            Column name

        fill : Any
            Fill the missing values with, by default ``np.nan``

        fill_when_not_exist : Any
            When ``column_name`` does not exist,
            create a new column and fill with
            ``fill_when_not_exist``, by default ``np.nan``

        Returns
        -------
        Self
            Modified DataFrame
        """
        try:
            self[column_name] = self[column_name].fillna(fill)
        except KeyError:
            # Column missing: create it when the column mixin is available
            # (compatible with DataAnalystDataFrameColumnMethodMixin);
            # otherwise silently do nothing.
            if getattr(self, "add_blank_column", None) is not None:
                self.add_blank_column(column_name, fill_when_not_exist)  # type: ignore
        return self

    def get_missing_values(
        self, hightlight: bool = True, *, percentage_round_up: int = 2
    ) -> Self:
        """
        Get a DataFrame contains count of missing values for each column

        Parameters
        ----------
        hightlight : bool
            Shows only columns with missing values when ``True``, by default ``True``
            (NOTE: misspelled parameter name kept for backward compatibility)

        percentage_round_up : int
            Round up to which decimals, by default ``2``

        Returns
        -------
        Self
            Missing value DataFrame with "Num of N/A" and "Percentage" columns
        """
        # Per-column NA counts, largest first
        df_na = self.isnull().sum().sort_values(ascending=False)
        if hightlight:
            out = df_na[df_na != 0].to_frame()
        else:
            out = df_na.to_frame()
        # to_frame() of the unnamed Series yields column label 0
        out.rename(columns={0: "Num of N/A"}, inplace=True)
        out["Percentage"] = (out["Num of N/A"] / self.shape[0] * 100).round(
            percentage_round_up
        )
        return self.__class__(out)

    @versionadded("3.1.0")
    def split_na(self, by_column: str) -> SplittedDF:
        """
        Split DataFrame into 2 parts:
        - Without missing value in specified column
        - With missing value in specified column

        Parameters
        ----------
        by_column : str
            Split by column

        Returns
        -------
        SplittedDF
            Splitted DataFrame
        """
        out = SplittedDF(
            df=self[self[by_column].notna()],  # rows without NA
            df_na=self[self[by_column].isna()],  # rows with NA
        )
        return out

    @versionadded("5.1.0")
    def apply_notnull(self, col: str, callable: Callable[[Any], _R]) -> Self:
        """
        Only apply callable to not NaN value in column

        Parameters
        ----------
        col : str
            Column to apply

        callable : Callable[[Any], _R]
            Callable
            (NOTE: parameter name shadows the ``callable`` builtin; kept
            for backward compatibility with keyword callers)

        Returns
        -------
        Self
            Applied DataFrame
        """
        self[col] = self[col].apply(lambda x: callable(x) if pd.notnull(x) else x)  # type: ignore
        return self

    @versionadded("5.1.0")  # type: ignore
    def apply_notnull_row(
        self,
        apply_when_null: Callable[[Any], _R] | _T | None = None,
        apply_when_not_null: Callable[[Any], _R] | _T | None = None,
        col_name: str | None = None,
    ) -> Self:
        """
        Apply to DataFrame's row with missing value.

        Adds a new column whose value per row depends on whether the whole
        row is free of missing values. When both apply parameters are
        ``None``, the new column is a plain bool flag.

        Parameters
        ----------
        apply_when_null : Callable[[Any], R] | T | None, optional
            Callable (applied to the row) or constant value used when the
            row contains at least one null, by default ``None``

        apply_when_not_null : Callable[[Any], R] | T | None, optional
            Callable (applied to the row) or constant value used when the
            row has no nulls, by default ``None``

        col_name : str | None, optional
            Output column name, by default ``None`` (uses a default name)

        Returns
        -------
        Self
            Modified DataFrame
        """

        def apply_func(row: pd.Series):
            # Both None: flag whether the row is fully non-null
            if apply_when_null is None and apply_when_not_null is None:
                return row.notnull().all()

            # When all values in row are not null
            if row.notnull().all():
                if callable(apply_when_not_null):
                    return apply_when_not_null(row)
                return apply_when_not_null

            # When any value in row is null
            if callable(apply_when_null):
                return apply_when_null(row)
            return apply_when_null

        # Column name
        cname = "applied_row_null" if col_name is None else col_name
        self[cname] = self.apply(apply_func, axis=1)  # type: ignore

        return self
982
+
983
+
984
# Other
# ---------------------------------------------------------------------------
class DataAnalystDataFrameOtherMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Other method/Stuff

    - Merge left
    """

    @versionadded("4.0.0")
    def merge_left(
        self,
        other: Self | pd.DataFrame,
        on: str,
        columns: list[str] | None = None,
    ) -> Self:
        """
        Merge left of 2 DataFrame

        Parameters
        ----------
        other : Self | pd.DataFrame
            DataFrame to merge

        on : str
            Merge on which column

        columns : list[str] | None, optional
            Columns to take from other DataFrame, by default ``None``
            (Take all columns)

        Returns
        -------
        Self
            Merged DataFrame


        Example:
        --------
        >>> df1 = DADF({
        ...     "id": [1, 2, 5],
        ...     "name": ["Alice", "Bob", "Rich"],
        ...     "age": [20, 20, 20],
        ... })
        >>> df2 = DADF({
        ...     "id": [1, 2, 3],
        ...     "age": [25, 30, 45],
        ...     "department": ["HR", "IT", "PM"],
        ...     "salary": [50000, 60000, 55000],
        ... })
        >>> df1.merge_left(df2, on="id")
           id   name  age_x  age_y department   salary
        0   1  Alice     20   25.0         HR  50000.0
        1   2    Bob     20   30.0         IT  60000.0
        2   5   Rich     20    NaN        NaN      NaN
        >>> df1.merge_left(df2, on="id", columns=["salary"])
           id   name   age department   salary
        0   1  Alice  25.0         HR  50000.0
        1   2    Bob  30.0         IT  60000.0
        2   5   Rich   NaN        NaN      NaN
        """
        if columns is not None:
            # Columns of ``other`` that are neither the join key nor
            # explicitly requested; these are dropped from *this* frame so
            # the merge result is not cluttered with ``_x``/``_y`` pairs.
            wanted = {on, *columns}
            leftovers = list(set(other.columns.to_list()) - wanted)

            if getattr(self, "drop_columns", None) is not None:
                # Compatible with DataAnalystDataFrameColumnMethodMixin
                self.drop_columns(leftovers)  # type: ignore

        merged = self.merge(other, how="left", on=on)
        return self.__class__(merged)
1058
+
1059
+
1060
# Date
# ---------------------------------------------------------------------------
class DataAnalystDataFrameDateMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Date

    - Add date column from month column
    - Add detail date
    - Delta date (How many days inbetween)
    """

    def add_date_from_month(self, month_column: str, *, col_name: str = "date") -> Self:
        """
        Add dummy ``date`` column from ``month`` column

        The new column holds the first day of each month in the *current*
        year.

        Parameters
        ----------
        month_column : str
            Month column

        col_name : str
            New date column name, by default: ``"date"``

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = (
        ...     DADF.sample_df(2)
        ...     .add_detail_date("date", mode="m")
        ...     .drop_columns(["date", "number", "number_range"])
        ... )
        >>> df
           number_big missing_value      text  month
        0         755           NaN  lincgqzl      4
        1         907           NaN  gxltrjku     10
        >>> df.add_date_from_month("month")
           number_big missing_value      text  month       date
        0         755           NaN  lincgqzl      4 2025-04-01
        1         907           NaN  gxltrjku     10 2025-10-01
        """
        _this_year = datetime.now().year
        # Build "<year>-<month>-1" strings and parse them in one pass
        self[col_name] = pd.to_datetime(
            f"{_this_year}-" + self[month_column].astype(int).astype(str) + "-1",
            format="%Y-%m-%d",
        )

        # Rearrange
        if getattr(self, "rearrange_rightmost_column", None) is not None:
            # Compatible with DataAnalystDataFrameColumnMethodMixin
            return self.rearrange_rightmost_column(month_column)  # type: ignore [no-any-return]
        return self

    def add_detail_date(self, date_column: str, mode: str = "dwmy") -> Self:
        """
        Add these columns from ``date_column``:
        - ``date`` (won't add if ``date_column`` value is ``"date"``)
        - ``day`` (overwrite if already exist)
        - ``week`` (overwrite if already exist)
        - ``month`` (overwrite if already exist)
        - ``year`` (overwrite if already exist)

        Parameters
        ----------
        date_column : str
            Date column

        mode : str
            | Detailed column to add
            | ``d``: day
            | ``w``: week number
            | ``m``: month
            | ``y``: year
            | (Default: ``"dwmy"``)

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big number_range missing_value      text       date
        0  0.331195         902          900            20  fgyanxik 2021-10-18
        1 -0.877727         378          300            13  dqvaggjo 2007-03-06
        >>> df.add_detail_date("date")
             number  number_big number_range missing_value      text       date  day  week  month  year
        0  0.331195         902          900            20  fgyanxik 2021-10-18   18    42     10  2021
        1 -0.877727         378          300            13  dqvaggjo 2007-03-06    6    10      3  2007
        """
        # Convert to datetime (always materialized in a "date" column)
        self["date"] = pd.to_datetime(self[date_column])

        # Track how many columns were appended so the rearrange step below
        # knows how far the originals were pushed right
        col_counter = 0
        if mode.find("d") != -1:
            self["day"] = self["date"].dt.day
            col_counter += 1
        if mode.find("w") != -1:
            # ISO week number
            self["week"] = self["date"].dt.isocalendar().week
            col_counter += 1
        if mode.find("m") != -1:
            self["month"] = self["date"].dt.month
            col_counter += 1
        if mode.find("y") != -1:
            self["year"] = self["date"].dt.year
            col_counter += 1

        # Return
        if getattr(self, "rearrange_rightmost_column", None) is not None:
            # Compatible with DataAnalystDataFrameColumnMethodMixin
            return self.rearrange_rightmost_column(date_column, col_counter)  # type: ignore [no-any-return]
        return self

    def delta_date(
        self,
        date_column: str,
        mode: Literal["now", "between_row"] = "now",
        *,
        col_name: str = "delta_date",
    ) -> Self:
        """
        Calculate date interval

        Parameters
        ----------
        date_column : str
            Date column

        mode : str
            | Mode to calculate
            | ``"between_row"``: Calculate date interval between each row
            | ``"now"``: Calculate date interval to current date
            | (Default: ``"now"``)

        col_name : str
            | New delta date column name
            | (Default: ``"delta_date"``)

        Returns
        -------
        Self
            Modified DataFrame


        Example:
        --------
        >>> df = DADF.sample_df(2)
        >>> df
             number  number_big number_range missing_value      text       date
        0 -0.729988         435          400            21  xkrqqouf 2014-08-01
        1 -0.846031         210          200             5  rbkmiqxt 2024-07-10
        >>> df.delta_date("date")
             number  number_big number_range missing_value      text       date  delta_date
        0 -0.729988         435          400            21  xkrqqouf 2014-08-01        3873
        1 -0.846031         210          200             5  rbkmiqxt 2024-07-10         242
        """
        if mode.lower().startswith("between_row"):
            dated = self[date_column].to_list()
            cal: list[timedelta] = []
            for i in range(len(dated)):
                if i == 0:
                    # First row has no predecessor -> zero-length interval
                    cal.append(dated[i] - dated[i])
                else:
                    cal.append(dated[i] - dated[i - 1])
            self[col_name] = [x.days for x in cal]
        else:  # mode="now"
            # Hoisted: one reference instant for the whole column (the old
            # code called datetime.now() once per row, so long frames could
            # compare rows against slightly different "now" values)
            now = datetime.now()
            self[col_name] = self[date_column].apply(lambda x: (now - x).days)
        return self

    @versionadded("6.0.0")
    def normalize_datetime_column(
        self,
        col: str,
        *,
        inplace: bool = False,
    ) -> Self:
        """
        Normalize a datetime column by removing the time component.

        This function converts the specified column to pandas datetime (``datetime64[ns]``)
        (if not already), then normalizes all values so that the time
        component is set to ``00:00:00``. The date component is preserved.

        The function safely handles missing or invalid values by coercing
        them to ``NaT``.

        Parameters
        ----------
        col : str
            Name of the column to normalize. The column may contain
            datetime-like values, strings, or mixed types.

        inplace : bool, default False
            | If ``True``, modify the input DataFrame in place.
            | If ``False``, operate on a copy and return the modified DataFrame.

        Returns
        -------
        Self
            DataFrame with the normalized datetime column.


        Example:
        --------
        Basic usage::

            >>> df = DADF({
            ...     "created_at": ["2024-01-01 10:15:30", "2024-01-02 23:59:59"]
            ... })
            >>> df.normalize_datetime_column("created_at")
              created_at
            0 2024-01-01 00:00:00
            1 2024-01-02 00:00:00

        In-place modification::

            >>> df.normalize_datetime_column("created_at", inplace=True)

        Handling invalid values::

            >>> df = DADF({"dt": ["2024-01-01 10:00", "invalid"]})
            >>> df.normalize_datetime_column("dt")
              dt
            0 2024-01-01 00:00:00
            1 NaT

        """
        if not inplace:
            df = self.copy()
        else:
            df = self

        # Using ``df.loc[:, col]`` avoids ``SettingWithCopyWarning`` when the input DataFrame is a slice.
        df.loc[:, col] = pd.to_datetime(df[col], errors="coerce").dt.normalize()
        return df
1312
+
1313
+
1314
# Export
# ---------------------------------------------------------------------------
class DataAnalystDataFrameExportMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - Export method

    - da_export
    """

    @versionchanged("5.8.0", "New parameter")
    def da_export(
        self,
        path: str,
        sheet_name: str = "Sheet1",
        *,
        auto_width: bool = True,
        cols_contain_centered_text: list[str] | None = None,
        cols_contain_number: list[str] | None = None,
        cols_contain_percentage: list[str] | None = None,
    ) -> None:
        """
        Export DataFrame with `xlsxwriter` engine

        Parameters
        ----------
        path : Path | str
            Path to export

        sheet_name : str, optional
            Sheet name, by default "Sheet1"

        auto_width : bool, optional
            Auto resize column width, by default ``True``
            (``False`` keeps the default Excel column width)

        cols_contain_centered_text : list[str] | None, optional
            Columns that contain centered text (Align center), by default None

        cols_contain_number : list[str] | None, optional
            Columns that contain number value (to format as number - int), by default None

        cols_contain_percentage : list[str] | None, optional
            Columns that contain percentage value (to format as percentage), by default None
        """

        # Using xlsxwriter engine
        with pd.ExcelWriter(path, engine="xlsxwriter") as writer:
            self.to_excel(
                writer, sheet_name=sheet_name, index=False, float_format="%.2f", na_rep=""
            )

            # Format style
            workbook: Workbook = writer.book  # type: ignore
            header_fmt = workbook.add_format(
                {
                    "bold": True,
                    "text_wrap": True,
                    "border": 1,
                    "align": "center",
                    "valign": "vcenter",
                    # "bg_color": "#A0BEFD",
                }
            )
            number_fmt = workbook.add_format(
                {"num_format": "#,##0", "align": "center", "valign": "vcenter"}
            )  # 1,000,000
            percent_fmt = workbook.add_format(
                {"num_format": "0.00%", "align": "center", "valign": "vcenter"}
            )  # 1.00%
            text_fmt = workbook.add_format({"valign": "vcenter"})
            text_center_fmt = workbook.add_format({"align": "center", "valign": "vcenter"})

            # Format sheet
            worksheet: Worksheet = writer.sheets[sheet_name]

            # Format header - First row
            for col_num, value in enumerate(self.columns.values):
                worksheet.write(0, col_num, value, header_fmt)

            # First matching rule wins for each column
            rules = [
                (cols_contain_number, number_fmt),
                (cols_contain_percentage, percent_fmt),
                (cols_contain_centered_text, text_center_fmt),
            ]

            # Column width + column format
            for i, col in enumerate(self.columns):
                # BUGFIX: honor ``auto_width=False``. The old code compared
                # ``auto_width is None``, but the parameter is a bool that is
                # never None, so the width was always computed.
                if auto_width:
                    # Longest rendered string in the column (incl. header) + padding
                    max_len = max(self[col].astype(str).map(len).max(), len(col)) + 2
                else:
                    max_len = None  # xlsxwriter keeps the default width

                # Format style (default: plain text)
                fmt = text_fmt
                for cols, f in rules:
                    if cols is not None and col in cols:
                        fmt = f
                        break
                # Single set_column call (the old code called it twice per
                # column; the first, format-less call was redundant)
                worksheet.set_column(i, i, max_len, fmt)

            # if cols_contain_number is not None:
            #     for x in cols_contain_number:
            #         self[x] = pd.to_numeric(self[x], errors="coerce")
1411
+
1412
+
1413
# City
# ---------------------------------------------------------------------------
class DataAnalystDataFrameCityMixin(DFBase):
    """
    Data Analyst ``pd.DataFrame`` - City

    - Convert city
    """

    def convert_city(
        self,
        city_column: str,
        city_list: list[CityData],
        *,
        mode: str = "ra",
    ) -> Self:
        """
        Get ``region`` and ``area`` of a city

        Parameters
        ----------
        city_column : str
            Column contains city data

        city_list : list[CityData]
            List of city in correct format
            (Default: ``None``)

        mode : str
            | Detailed column to add
            | ``r``: region
            | ``a``: area
            | (Default: ``"ra"``)

        Returns
        -------
        DataAnalystDataFrame
            Modified DataFrame
        """

        # Case-insensitive prefix lookup against the known city list
        def _lookup(value: str) -> CityData:
            lowered = value.lower()
            for candidate in city_list:
                if candidate.city.lower().startswith(lowered):
                    return candidate
            # Unknown city -> NaN region/area placeholder
            return CityData(city=value, region=np.nan, area=np.nan)  # type: ignore

        # Count appended columns so the rearrange step knows the offset
        col_counter = 0
        if "r" in mode:
            self["region"] = self[city_column].apply(lambda x: _lookup(x).region)
            col_counter += 1
        if "a" in mode:
            self["area"] = self[city_column].apply(lambda x: _lookup(x).area)
            col_counter += 1

        # Rearrange (compatible with DataAnalystDataFrameColumnMethodMixin)
        if getattr(self, "rearrange_rightmost_column", None) is not None:
            return self.rearrange_rightmost_column(city_column, col_counter)  # type: ignore [no-any-return]
        return self
1479
+
1480
+
1481
# Main
# ---------------------------------------------------------------------------
class DADF(
    GetClassMembersMixin,
    DataAnalystDataFrameCityMixin,
    DataAnalystDataFrameExportMixin,
    DataAnalystDataFrameDateMixin,
    DataAnalystDataFrameOtherMixin,
    DataAnalystDataFrameNAMixin,
    DataAnalystDataFrameInfoMixin,
    DataAnalystDataFrameRowMethodMixin,
    DataAnalystDataFrameColumnMethodMixin,
):
    """
    Data Analyst ``pd.DataFrame``

    For a list of extra methods:
    >>> print(DADF.DADF_METHODS)
    """

    @classmethod
    @deprecated("5.1.0")
    @versionadded("3.2.0")
    def dadf_help(cls) -> list[str]:
        """
        Show all available method of DataAnalystDataFrame
        """
        # Everything defined here that a plain pd.DataFrame lacks
        extra_methods = set(dir(cls)).difference(dir(pd.DataFrame))
        return sorted(extra_methods)

    @classmethod
    def sample_df(cls, size: int = 100) -> Self:
        """
        Create sample DataFrame

        Parameters
        ----------
        size : int
            Number of observations, by default ``100``

        Returns
        -------
        Self
            DataFrame with these columns:
            [number, number_big, number_range, missing_value, text, date]


        Example:
        --------
        >>> DataAnalystDataFrame.sample_df()
              number  number_big number_range  missing_value      text       date
        0  -2.089770         785          700            NaN  vwnlqoql 2013-11-20
        1  -0.526689         182          100           24.0  prjjcvqc 2007-04-13
        2  -1.596514         909          900            8.0  cbcpzlac 2023-05-24
        3   2.982191         989          900           21.0  ivwqwuvd 2022-04-28
        4   1.687803         878          800            NaN  aajtncum 2005-10-05
        ..       ...         ...          ...            ...       ...        ...
        95 -1.295145         968          900           16.0  mgqunkhi 2016-04-12
        96  1.296795         255          200            NaN  lwvytego 2014-05-10
        97  1.440746         297          200            5.0  lqsoykun 2010-04-03
        98  0.327702         845          800            NaN  leadkvsy 2005-08-05
        99  0.556720         981          900           36.0  bozmxixy 2004-02-22
        [100 rows x 6 columns]
        """
        # Restrain: at least one observation
        size = max(size, 1)

        # Number col: standard-normal floats
        df = cls(np.random.randn(size, 1), columns=["number"])
        # Big number in range 100-999
        df["number_big"] = [random.choice(range(100, 999)) for _ in range(size)]
        # Bucket of the big number: leading digit + "00" (e.g. 742 -> "700")
        df["number_range"] = df["number_big"].apply(lambda v: str(v)[0] + "00")

        # Missing value col: values at/above a random threshold become NaN
        na_rate = random.randint(1, 99)
        raw_values = [random.randint(1, 99) for _ in range(size)]
        df["missing_value"] = [v if v < na_rate else np.nan for v in raw_values]

        # Text col: random 8-letter lowercase words
        df["text"] = [
            "".join(random.choice(string.ascii_lowercase) for _ in range(8))
            for _ in range(size)
        ]

        # Random date col: within the last 20 years, day capped at 28
        df["date"] = [
            datetime(
                year=random.randint(datetime.now().year - 20, datetime.now().year),
                month=random.randint(1, 12),
                day=random.randint(1, 28),
            )
            for _ in range(size)
        ]

        # Return
        return df
1579
+
1580
+
1581
class DADF_WIP(DADF):
    """
    W.I.P - No test cases written
    """

    # Placeholder subclass: inherits everything from DADF; new experimental
    # methods land here until they get test coverage.
    pass
1587
+
1588
+ if __name__ == "__main__":
1589
+ from pathlib import Path
1590
+
1591
+ # t = DADF.sample_df().show_distribution("number_range", show_percentage=False)
1592
+ # t.da_export(
1593
+ # Path(__file__).parent.joinpath("a.xlsx").resolve().__str__(),
1594
+ # cols_contain_number=["number_range"],
1595
+ # cols_contain_percentage=["percentage"],
1596
+ # )
1597
+ # print(t)
1598
+
1599
+ df = DADF.sample_df(10)
1600
+ print(df.add_blank_row())