absfuyu 5.0.0__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of absfuyu might be problematic. Click here for more details.

Files changed (72) hide show
  1. absfuyu/__init__.py +1 -1
  2. absfuyu/__main__.py +2 -2
  3. absfuyu/cli/__init__.py +2 -2
  4. absfuyu/cli/color.py +30 -14
  5. absfuyu/cli/config_group.py +9 -2
  6. absfuyu/cli/do_group.py +13 -6
  7. absfuyu/cli/game_group.py +9 -2
  8. absfuyu/cli/tool_group.py +16 -9
  9. absfuyu/config/__init__.py +2 -2
  10. absfuyu/core/__init__.py +2 -2
  11. absfuyu/core/baseclass.py +449 -80
  12. absfuyu/core/baseclass2.py +2 -2
  13. absfuyu/core/decorator.py +69 -3
  14. absfuyu/core/docstring.py +25 -22
  15. absfuyu/core/dummy_cli.py +2 -2
  16. absfuyu/core/dummy_func.py +19 -6
  17. absfuyu/core/typings.py +40 -0
  18. absfuyu/dxt/__init__.py +2 -2
  19. absfuyu/dxt/dictext.py +2 -2
  20. absfuyu/dxt/dxt_support.py +2 -2
  21. absfuyu/dxt/intext.py +31 -3
  22. absfuyu/dxt/listext.py +28 -3
  23. absfuyu/dxt/strext.py +3 -3
  24. absfuyu/extra/__init__.py +2 -2
  25. absfuyu/extra/beautiful.py +3 -2
  26. absfuyu/extra/da/__init__.py +36 -0
  27. absfuyu/extra/da/dadf.py +1138 -0
  28. absfuyu/extra/da/dadf_base.py +186 -0
  29. absfuyu/extra/da/df_func.py +97 -0
  30. absfuyu/extra/da/mplt.py +219 -0
  31. absfuyu/extra/data_analysis.py +10 -1067
  32. absfuyu/fun/__init__.py +2 -2
  33. absfuyu/fun/tarot.py +2 -2
  34. absfuyu/game/__init__.py +2 -2
  35. absfuyu/game/game_stat.py +2 -2
  36. absfuyu/game/sudoku.py +2 -2
  37. absfuyu/game/tictactoe.py +2 -2
  38. absfuyu/game/wordle.py +2 -2
  39. absfuyu/general/__init__.py +4 -4
  40. absfuyu/general/content.py +2 -2
  41. absfuyu/general/human.py +2 -2
  42. absfuyu/general/shape.py +2 -2
  43. absfuyu/logger.py +2 -2
  44. absfuyu/pkg_data/__init__.py +2 -2
  45. absfuyu/pkg_data/deprecated.py +2 -2
  46. absfuyu/sort.py +2 -2
  47. absfuyu/tools/__init__.py +25 -2
  48. absfuyu/tools/checksum.py +27 -7
  49. absfuyu/tools/converter.py +93 -28
  50. absfuyu/{general → tools}/generator.py +2 -2
  51. absfuyu/tools/inspector.py +433 -0
  52. absfuyu/tools/keygen.py +2 -2
  53. absfuyu/tools/obfuscator.py +46 -8
  54. absfuyu/tools/passwordlib.py +88 -23
  55. absfuyu/tools/shutdownizer.py +2 -2
  56. absfuyu/tools/web.py +2 -2
  57. absfuyu/util/__init__.py +2 -2
  58. absfuyu/util/api.py +2 -2
  59. absfuyu/util/json_method.py +2 -2
  60. absfuyu/util/lunar.py +2 -2
  61. absfuyu/util/path.py +190 -82
  62. absfuyu/util/performance.py +4 -4
  63. absfuyu/util/shorten_number.py +40 -10
  64. absfuyu/util/text_table.py +272 -0
  65. absfuyu/util/zipped.py +6 -6
  66. absfuyu/version.py +59 -42
  67. {absfuyu-5.0.0.dist-info → absfuyu-5.1.0.dist-info}/METADATA +10 -3
  68. absfuyu-5.1.0.dist-info/RECORD +76 -0
  69. absfuyu-5.0.0.dist-info/RECORD +0 -68
  70. {absfuyu-5.0.0.dist-info → absfuyu-5.1.0.dist-info}/WHEEL +0 -0
  71. {absfuyu-5.0.0.dist-info → absfuyu-5.1.0.dist-info}/entry_points.txt +0 -0
  72. {absfuyu-5.0.0.dist-info → absfuyu-5.1.0.dist-info}/licenses/LICENSE +0 -0
@@ -2,1077 +2,20 @@
2
2
  Absfuyu: Data Analysis [W.I.P]
3
3
  ------------------------------
4
4
  Extension for ``pd.DataFrame``
5
+ (deprecated)
5
6
 
6
- Version: 5.0.0
7
- Date updated: 13/02/2025 (dd/mm/yyyy)
7
+ Version: 5.1.0
8
+ Date updated: 10/03/2025 (dd/mm/yyyy)
8
9
  """
9
10
 
10
- # Module level
11
- # ---------------------------------------------------------------------------
12
- __all__ = [
13
- # Function
14
- "compare_2_list",
15
- # Support
16
- "CityData",
17
- "SplittedDF",
18
- "PLTFormatString",
19
- # Main
20
- "MatplotlibFormatString",
21
- "DataAnalystDataFrame",
22
- "DADF",
23
- ]
24
-
25
-
26
11
  # Library
27
12
  # ---------------------------------------------------------------------------
28
- import random
29
- import string
30
- from collections import deque
31
- from datetime import datetime
32
- from itertools import chain, product
33
- from typing import Any, ClassVar, Literal, NamedTuple, Self
34
-
35
- DA_MODE = False
36
-
37
- try:
38
- import numpy as np
39
- import pandas as pd
40
- except ImportError:
41
- from subprocess import run
42
-
43
- from absfuyu.config import ABSFUYU_CONFIG
44
-
45
- if ABSFUYU_CONFIG._get_setting("auto-install-extra").value:
46
- cmd = "python -m pip install -U absfuyu[full]".split()
47
- run(cmd)
48
- else:
49
- raise SystemExit("This feature is in absfuyu[full] package") # noqa: B904
50
- else:
51
- DA_MODE = True
52
-
53
-
54
- from absfuyu.core import ShowAllMethodsMixin, versionadded # noqa: E402
55
- from absfuyu.logger import logger # noqa: E402
56
- from absfuyu.util import set_min, set_min_max # noqa: E402
57
-
58
-
59
- # Function
60
- # ---------------------------------------------------------------------------
61
- def equalize_df(data: dict[str, list], fillna=np.nan) -> dict[str, list]:
62
- """
63
- Make all list in dict have equal length to make pd.DataFrame
64
-
65
- :param data: `dict` data that ready for `pd.DataFrame`
66
- :param fillna: Fill N/A value (Default: `np.nan`)
67
- """
68
- max_len = max(map(len, data.values()))
69
- for _, v in data.items():
70
- if len(v) < max_len:
71
- missings = max_len - len(v)
72
- for _ in range(missings):
73
- v.append(fillna)
74
- return data
75
-
76
-
77
- def compare_2_list(*arr) -> pd.DataFrame:
78
- """
79
- Compare 2 lists then create DataFrame
80
- to see which items are missing
81
-
82
- Parameters
83
- ----------
84
- arr : list
85
- List
86
-
87
- Returns
88
- -------
89
- DataFrame
90
- Compare result
91
- """
92
- # Setup
93
- col_name = "list"
94
- arr = [sorted(x) for x in arr] # type: ignore # map(sorted, arr)
95
-
96
- # Total array
97
- tarr = sorted(list(set(chain.from_iterable(arr))))
98
- # max_len = len(tarr)
99
-
100
- # Temp dataset
101
- temp_dict = {"base": tarr}
102
- for idx, x in enumerate(arr):
103
- name = f"{col_name}{idx}"
104
-
105
- # convert list
106
- temp = [item if item in x else np.nan for item in tarr]
107
-
108
- temp_dict.setdefault(name, temp)
109
-
110
- df = pd.DataFrame(temp_dict)
111
- df["Compare"] = np.where(
112
- df[f"{col_name}0"].apply(lambda x: str(x).lower())
113
- == df[f"{col_name}1"].apply(lambda x: str(x).lower()),
114
- df[f"{col_name}0"], # Value when True
115
- np.nan, # Value when False
116
- )
117
- return df
118
-
119
-
120
- def rename_with_dict(df: pd.DataFrame, col: str, rename_dict: dict) -> pd.DataFrame:
121
- """
122
- Version: 2.0.0
123
- :param df: DataFrame
124
- :param col: Column name
125
- :param rename_dict: Rename dictionary
126
- """
127
-
128
- name = f"{col}_filtered"
129
- df[name] = df[col]
130
- rename_val = list(rename_dict.keys())
131
- df[name] = df[name].apply(lambda x: "Other" if x in rename_val else x)
132
- return df
133
-
134
-
135
- # Class
136
- # ---------------------------------------------------------------------------
137
- class CityData(NamedTuple):
138
- """
139
- Parameters
140
- ----------
141
- city : str
142
- City name
143
-
144
- region : str
145
- Region of the city
146
-
147
- area : str
148
- Area of the region
149
- """
150
-
151
- city: str
152
- region: str
153
- area: str
154
-
155
- @staticmethod
156
- def _sample_city_data(size: int = 100) -> list:
157
- """
158
- Generate sample city data (testing purpose)
159
- """
160
- sample_range = 10 ** len(str(size))
161
-
162
- # Serial list
163
- serials: list[str] = []
164
- while len(serials) != size: # Unique serial
165
- serial = random.randint(0, sample_range - 1)
166
- serial = str(serial).rjust(len(str(size)), "0") # type: ignore
167
- if serial not in serials: # type: ignore
168
- serials.append(serial) # type: ignore
169
-
170
- ss2 = deque(serials[: int(len(serials) / 2)]) # Cut half for region
171
- ss2.rotate(random.randrange(1, 5))
172
- [ss2.extend(ss2) for _ in range(2)] # type: ignore # Extend back
173
-
174
- ss3 = deque(serials[: int(len(serials) / 4)]) # Cut forth for area
175
- ss3.rotate(random.randrange(1, 5))
176
- [ss3.extend(ss3) for _ in range(4)] # type: ignore # Extend back
177
-
178
- serials = ["city_" + x for x in serials]
179
- ss2 = ["region_" + x for x in ss2] # type: ignore
180
- ss3 = ["area_" + x for x in ss3] # type: ignore
181
-
182
- ss = list(zip(serials, ss2, ss3)) # Zip back
183
- out = list(map(CityData._make, ss))
184
-
185
- return out
186
-
187
-
188
- class SplittedDF(NamedTuple):
189
- """
190
- DataFrame splitted into contains
191
- missing values only and vice versa
192
-
193
- Parameters
194
- ----------
195
- df : DataFrame
196
- DataFrame without missing values
197
-
198
- df_na : DataFrame
199
- DataFrame with missing values only
200
- """
201
-
202
- df: pd.DataFrame
203
- df_na: pd.DataFrame
204
-
205
- @staticmethod
206
- def concat_df(
207
- df_list: list[pd.DataFrame], join: Literal["inner", "outer"] = "inner"
208
- ) -> pd.DataFrame:
209
- """
210
- Concat the list of DataFrame (static method)
211
-
212
- Parameters
213
- ----------
214
- df_list : list[DataFrame]
215
- A sequence of DataFrame
216
-
217
- join : str
218
- Join type
219
- (Default: ``"inner"``)
220
-
221
- Returns
222
- -------
223
- DataFrame
224
- Joined DataFrame
225
- """
226
- df: pd.DataFrame = pd.concat(df_list, axis=0, join=join).reset_index()
227
- df.drop(columns=["index"], inplace=True)
228
- return df
229
-
230
- def concat(self, join: Literal["inner", "outer"] = "inner") -> pd.DataFrame:
231
- """
232
- Concat the splitted DataFrame
233
-
234
- Parameters
235
- ----------
236
- join : str
237
- Join type
238
- (Default: ``"inner"``)
239
-
240
- Returns
241
- -------
242
- DataFrame
243
- Joined DataFrame
244
- """
245
- return self.concat_df(self, join=join) # type: ignore
246
-
247
- @staticmethod
248
- def divide_dataframe(df: pd.DataFrame, by_column: str) -> list[pd.DataFrame]:
249
- """
250
- Divide DataFrame into a list of DataFrame
251
-
252
- Parameters
253
- ----------
254
- df : DataFrame
255
- DataFrame
256
-
257
- by_column : str
258
- By which column
259
-
260
- Returns
261
- -------
262
- list[DataFrame]
263
- Splitted DataFrame
264
- """
265
- divided = [x for _, x in df.groupby(by_column)]
266
- return divided
267
-
268
-
269
- ##
270
- class PLTFormatString(NamedTuple):
271
- """Matplotlib format string"""
272
-
273
- marker: str
274
- line_style: str
275
- color: str
276
-
277
-
278
- class _DictToAtrr:
279
- """Convert `keys` or `values` of `dict` into attribute"""
280
-
281
- def __init__(
282
- self,
283
- dict_data: dict,
284
- *,
285
- key_as_atrribute: bool = True,
286
- remove_char: str = r"( ) [ ] { }",
287
- ) -> None:
288
- """
289
- dict_data: Dictionary to convert
290
- key_as_atrribute: Use `dict.keys()` as atrribute when True, else use `dict.values()`
291
- remove_char: Characters that excluded from attribute name
292
- """
293
- self._data = dict_data
294
-
295
- if key_as_atrribute:
296
- # temp = list(map(self._remove_space, self._data.keys()))
297
- temp = [self._remove_space(x, remove_char) for x in self._data.keys()]
298
- [self.__setattr__(k, v) for k, v in zip(temp, self._data.values())] # type: ignore
299
- else:
300
- temp = [self._remove_space(x, remove_char) for x in self._data.values()]
301
- [self.__setattr__(k, v) for k, v in zip(temp, self._data.keys())] # type: ignore
302
- self._keys = temp
303
-
304
- def __str__(self) -> str:
305
- return f"{self.__class__.__name__}({self._keys})"
306
-
307
- def __repr__(self) -> str:
308
- return self.__str__()
309
-
310
- @staticmethod
311
- def _remove_space(value: str, remove_char: str) -> str:
312
- """
313
- Remove special characters and replace space with underscore
314
- """
315
- remove_char = remove_char.split(" ") # type: ignore
316
- logger.debug(remove_char)
317
- for x in remove_char:
318
- value = value.replace(x, "")
319
- value = value.replace(" ", "_")
320
- return value
321
-
322
-
323
- class MatplotlibFormatString:
324
- """
325
- Format string format: `[marker][line][color]` or `[color][marker][line]`
326
- """
327
-
328
- MARKER_LIST: ClassVar[dict[str, str]] = {
329
- ".": "point marker",
330
- ",": "pixel marker",
331
- "o": "circle marker",
332
- "v": "triangle_down marker",
333
- "^": "triangle_up marker",
334
- "<": "triangle_left marker",
335
- ">": "triangle_right marker",
336
- "1": "tri_down marker",
337
- "2": "tri_up marker",
338
- "3": "tri_left marker",
339
- "4": "tri_right marker",
340
- "8": "octagon marker",
341
- "s": "square marker",
342
- "p": "pentagon marker",
343
- "P": "plus (filled) marker",
344
- "*": "star marker",
345
- "h": "hexagon1 marker",
346
- "H": "hexagon2 marker",
347
- "+": "plus marker",
348
- "x": "x marker",
349
- "X": "x (filled) marker",
350
- "D": "diamond marker",
351
- "d": "thin_diamond marker",
352
- "|": "vline marker",
353
- "_": "hline marker",
354
- }
355
- LINE_STYLE_LIST: ClassVar[dict[str, str]] = {
356
- "-": "solid line style",
357
- "--": "dashed line style",
358
- "-.": "dash-dot line style",
359
- ":": "dotted line style",
360
- }
361
- COLOR_LIST: ClassVar[dict[str, str]] = {
362
- "b": "blue",
363
- "g": "green",
364
- "r": "red",
365
- "c": "cyan",
366
- "m": "magenta",
367
- "y": "yellow",
368
- "k": "black",
369
- "w": "white",
370
- }
371
- Marker = _DictToAtrr(MARKER_LIST, key_as_atrribute=False)
372
- LineStyle = _DictToAtrr(LINE_STYLE_LIST, key_as_atrribute=False)
373
- Color = _DictToAtrr(COLOR_LIST, key_as_atrribute=False)
374
-
375
- @classmethod
376
- def all_format_string(cls) -> list[PLTFormatString]:
377
- fmt_str = [
378
- cls.MARKER_LIST,
379
- cls.LINE_STYLE_LIST,
380
- cls.COLOR_LIST,
381
- ]
382
- return [PLTFormatString._make(x) for x in list(product(*fmt_str))]
383
-
384
- @staticmethod
385
- def get_random(alt: bool = False) -> str:
386
- temp = random.choice(__class__.all_format_string()) # type: ignore
387
- if alt:
388
- return f"{temp.marker}{temp.line_style}{temp.color}"
389
- else:
390
- return f"{temp.color}{temp.marker}{temp.line_style}"
391
-
13
+ from absfuyu.extra.da.dadf import DADF # noqa
14
+ from absfuyu.extra.da.df_func import ( # noqa
15
+ compare_2_list,
16
+ equalize_df,
17
+ rename_with_dict,
18
+ )
392
19
 
393
20
  # Class - DA
394
- # ---------------------------------------------------------------------------
395
- class DataAnalystDataFrame(ShowAllMethodsMixin, pd.DataFrame):
396
- """
397
- Data Analyst ``pd.DataFrame``
398
- """
399
-
400
- # Support
401
- # ================================================================
402
- # Rearrange column
403
- def rearrange_column(self, insert_to_col: str, num_of_cols: int = 1) -> Self:
404
- """
405
- Move right-most columns to selected position
406
-
407
- Parameters
408
- ----------
409
- insert_to_col : str
410
- Name of the column that the right-most column will be moved next to
411
-
412
- num_of_cols : int
413
- Number of columns moved
414
-
415
- Returns
416
- -------
417
- DataAnalystDataFrame
418
- Modified DataFrame
419
- """
420
- cols = self.columns.to_list() # List of columns
421
- num_of_cols = int(set_min_max(num_of_cols, min_value=1, max_value=len(cols)))
422
- col_index = cols.index(insert_to_col)
423
- cols = (
424
- cols[: col_index + 1]
425
- + cols[-num_of_cols:]
426
- + cols[col_index + 1 : len(cols) - num_of_cols]
427
- )
428
- self = self.__class__(self[cols])
429
- return self
430
-
431
- # Drop a list of column
432
- def drop_columns(self, columns: list[str]) -> Self:
433
- """
434
- Drop columns in DataFrame
435
-
436
- Parameters
437
- ----------
438
- columns : list[str]
439
- List of columns need to drop
440
-
441
- Returns
442
- -------
443
- DataAnalystDataFrame
444
- Modified DataFrame
445
- """
446
- for column in columns:
447
- try:
448
- self.drop(columns=[column], inplace=True)
449
- except Exception:
450
- logger.debug(f"{column} column does not exist")
451
- # pass
452
- return self
453
-
454
- # Drop right-most columns
455
- def drop_rightmost(self, num_of_cols: int = 1) -> Self:
456
- """
457
- Drop ``num_of_cols`` right-most columns
458
-
459
- Parameters
460
- ----------
461
- num_of_cols : int
462
- Number of columns to drop
463
-
464
- Returns
465
- -------
466
- DataAnalystDataFrame
467
- Modified DataFrame
468
- """
469
- # Restrain
470
- # if num_of_cols < 1:
471
- # num_of_cols = 1
472
- # if num_of_cols > self.shape[1]:
473
- # num_of_cols = self.shape[1]
474
- num_of_cols = int(
475
- set_min_max(num_of_cols, min_value=1, max_value=self.shape[1])
476
- )
477
-
478
- # Logic
479
- for _ in range(num_of_cols):
480
- self.drop(self.columns[len(self.columns) - 1], axis=1, inplace=True)
481
- return self
482
-
483
- # Add blank column
484
- def add_blank_column(self, column_name: str, fill: Any) -> Self:
485
- """
486
- Add a blank column
487
-
488
- Parameters
489
- ----------
490
- column_name : str
491
- Name of the column to add
492
-
493
- fill : Any
494
- Fill the column with data
495
-
496
- Returns
497
- -------
498
- DataAnalystDataFrame
499
- Modified DataFrame
500
- """
501
- self[column_name] = [fill] * self.shape[0]
502
- return self
503
-
504
- # Modify
505
- # ================================================================
506
- # Convert city
507
- def convert_city(
508
- self,
509
- city_column: str,
510
- city_list: list[CityData],
511
- *,
512
- mode: str = "ra",
513
- ) -> Self:
514
- """
515
- Get ``region`` and ``area`` of a city
516
-
517
- Parameters
518
- ----------
519
- city_column : str
520
- Column contains city data
521
-
522
- city_list : list[CityData]
523
- List of city in correct format
524
- (Default: ``None``)
525
-
526
- mode : str
527
- | Detailed column to add
528
- | ``r``: region
529
- | ``a``: area
530
- | (Default: ``"ra"``)
531
-
532
- Returns
533
- -------
534
- DataAnalystDataFrame
535
- Modified DataFrame
536
- """
537
-
538
- # Support function
539
- def _convert_city_support(value: str) -> CityData:
540
- for x in city_list:
541
- if x.city.lower().startswith(value.lower()):
542
- return x
543
- return CityData(city=value, region=np.nan, area=np.nan) # type: ignore
544
-
545
- # Convert
546
- col_counter = 0
547
- if mode.find("r") != -1:
548
- logger.debug("Mode: 'region'")
549
- self["region"] = self[city_column].apply(
550
- lambda x: _convert_city_support(x).region
551
- )
552
- col_counter += 1
553
- if mode.find("a") != -1:
554
- logger.debug("Mode: 'area'")
555
- self["area"] = self[city_column].apply(
556
- lambda x: _convert_city_support(x).area
557
- )
558
- col_counter += 1
559
-
560
- # Rearrange
561
- return self.rearrange_column(city_column, col_counter)
562
-
563
- # Date related
564
- def add_date_from_month(self, month_column: str, *, col_name: str = "date") -> Self:
565
- """
566
- Add dummy ``date`` column from ``month`` column
567
-
568
- Parameters
569
- ----------
570
- month_column : str
571
- Month column
572
-
573
- col_name : str
574
- New date column name
575
- (Default: ``"date"``)
576
-
577
- Returns
578
- -------
579
- DataAnalystDataFrame
580
- Modified DataFrame
581
- """
582
- _this_year = datetime.now().year
583
- self[col_name] = pd.to_datetime(
584
- f"{_this_year}-" + self[month_column].astype(int).astype(str) + "-1",
585
- format="%Y-%m-%d",
586
- )
587
- # Rearrange
588
- return self.rearrange_column(month_column)
589
-
590
- def add_detail_date(self, date_column: str, mode: str = "dwmy") -> Self:
591
- """
592
- Add these columns from ``date_column``:
593
- - ``date`` (won't add if ``date_column`` value is ``"date"``)
594
- - ``day`` (overwrite if already exist)
595
- - ``week`` (overwrite if already exist)
596
- - ``month`` (overwrite if already exist)
597
- - ``year`` (overwrite if already exist)
598
-
599
- Parameters
600
- ----------
601
- date_column : str
602
- Date column
603
-
604
- mode : str
605
- | Detailed column to add
606
- | ``d``: day
607
- | ``w``: week number
608
- | ``m``: month
609
- | ``y``: year
610
- | (Default: ``"dwmy"``)
611
-
612
- Returns
613
- -------
614
- DataAnalystDataFrame
615
- Modified DataFrame
616
- """
617
- # Convert to datetime
618
- self["date"] = pd.to_datetime(self[date_column])
619
-
620
- # Logic
621
- col_counter = 0
622
- # self["weekday"] = self["day"].dt.isocalendar().day # Weekday
623
- if mode.find("d") != -1:
624
- logger.debug("Mode: 'day'")
625
- self["day"] = self["date"].dt.day
626
- col_counter += 1
627
- if mode.find("w") != -1:
628
- logger.debug("Mode: 'weekday'")
629
- self["week"] = self["date"].dt.isocalendar().week
630
- col_counter += 1
631
- if mode.find("m") != -1:
632
- logger.debug("Mode: 'month'")
633
- self["month"] = self["date"].dt.month
634
- col_counter += 1
635
- if mode.find("y") != -1:
636
- logger.debug("Mode: 'year'")
637
- self["year"] = self["date"].dt.year
638
- col_counter += 1
639
-
640
- # Return
641
- return self.rearrange_column(date_column, col_counter)
642
-
643
- def delta_date(
644
- self,
645
- date_column: str,
646
- mode: Literal["now", "between_row"] = "now",
647
- *,
648
- col_name: str = "delta_date",
649
- ) -> Self:
650
- """
651
- Calculate date interval
652
-
653
- Parameters
654
- ----------
655
- date_column : str
656
- Date column
657
-
658
- mode : str
659
- | Mode to calculate
660
- | ``"between_row"``: Calculate date interval between each row
661
- | ``"now"``: Calculate date interval to current date
662
- | (Default: ``"now"``)
663
-
664
- col_name : str
665
- | New delta date column name
666
- | (Default: ``"delta_date"``)
667
-
668
- Returns
669
- -------
670
- DataAnalystDataFrame
671
- Modified DataFrame
672
- """
673
- if mode.lower().startswith("between_row"):
674
- dated = self[date_column].to_list()
675
- cal = []
676
- for i in range(len(dated)):
677
- if i == 0:
678
- cal.append(dated[i] - dated[i])
679
- # cal.append(relativedelta(dated[i], dated[i]))
680
- else:
681
- cal.append(dated[i] - dated[i - 1])
682
- # cal.append(relativedelta(dated[i], dated[i - 1]))
683
- self[col_name] = [x.days for x in cal]
684
- return self
685
- else: # mode="now"
686
- self[col_name] = self[date_column].apply(
687
- lambda x: (datetime.now() - x).days
688
- )
689
- return self
690
-
691
- # Fill missing value
692
- def fill_missing_values(
693
- self, column_name: str, fill: Any = np.nan, *, fill_when_not_exist: Any = np.nan
694
- ) -> Self:
695
- """
696
- Fill missing values in specified column
697
-
698
- Parameters
699
- ----------
700
- column_name : str
701
- Column name
702
-
703
- fill : Any
704
- Fill the missing values with
705
- (Default: ``np.nan``)
706
-
707
- fill_when_not_exist : Any
708
- When ``column_name`` does not exist,
709
- create a new column and fill with ``fill_when_not_exist``
710
- (Default: ``np.nan``)
711
-
712
- Returns
713
- -------
714
- DataAnalystDataFrame
715
- Modified DataFrame
716
- """
717
- try:
718
- self[column_name] = self[column_name].fillna(fill)
719
- except Exception:
720
- self.add_blank_column(column_name, fill_when_not_exist)
721
- return self
722
-
723
- # Split DataFrame
724
- def split_na(self, by_column: str) -> SplittedDF:
725
- """
726
- Split DataFrame into 2 parts:
727
- - Without missing value in specified column
728
- - With missing value in specified column
729
-
730
- Parameters
731
- ----------
732
- by_column : str
733
- Split by column
734
-
735
- Returns
736
- -------
737
- SplittedDF
738
- Splitted DataFrame
739
- """
740
- out = SplittedDF(
741
- df=self[~self[by_column].isna()], # DF
742
- df_na=self[self[by_column].isna()], # DF w/o NA
743
- )
744
- return out
745
-
746
- # Threshold filter
747
- # @versionchanged(version="3.2.0", reason="Optimized the code")
748
- def threshold_filter(
749
- self,
750
- destination_column: str,
751
- threshold: int | float = 10,
752
- *,
753
- top: int | None = None,
754
- replace_with: Any = "Other",
755
- ) -> Self:
756
- """
757
- Filter out percentage of data that smaller than the ``threshold``,
758
- replace all of the smaller data to ``replace_with``.
759
- As a result, pie chart is less messy.
760
-
761
- Parameters
762
- ----------
763
- destination_column : str
764
- Column to be filtered
765
-
766
- threshold : int | float
767
- Which percentage to cut-off
768
- (Default: 10%)
769
-
770
- top : int
771
- Only show top ``x`` categories in pie chart
772
- (replace threshold mode)
773
- (Default: ``None``)
774
-
775
- replace_with : Any
776
- Replace all of the smaller data with specified value
777
-
778
- Returns
779
- -------
780
- DataAnalystDataFrame
781
- Modified DataFrame
782
- """
783
- # Clean
784
- try:
785
- self[destination_column] = self[
786
- destination_column
787
- ].str.strip() # Remove trailing space
788
- except Exception:
789
- pass
790
-
791
- # Logic
792
- col_df = self.show_distribution(destination_column)
793
-
794
- # Rename
795
- if top is not None:
796
- list_of_keep: list = (
797
- col_df[destination_column]
798
- .head(set_min_max(top - 1, min_value=1, max_value=col_df.shape[0]))
799
- .to_list()
800
- )
801
- # logger.debug(list_of_keep)
802
- else:
803
- list_of_keep = col_df[col_df["percentage"] >= threshold][
804
- destination_column
805
- ].to_list() # values that will not be renamed
806
- self[f"{destination_column}_filtered"] = self[destination_column].apply(
807
- lambda x: replace_with if x not in list_of_keep else x
808
- )
809
-
810
- # Return
811
- return self
812
-
813
- # Info
814
- # ================================================================
815
- # Total observation
816
- @property
817
- @versionadded("3.2.0")
818
- def total_observation(self) -> int:
819
- """
820
- Returns total observation of the DataFrame
821
- """
822
- return self.shape[0] * self.shape[1] # type: ignore
823
-
824
- # Quick info
825
- @versionadded("3.2.0")
826
- def qinfo(self) -> str:
827
- """
828
- Show quick infomation about DataFrame
829
- """
830
- mv = self.isnull().sum().sum() # missing values
831
- to = self.total_observation
832
- info = (
833
- f"Dataset Information:\n"
834
- f"- Number of Rows: {self.shape[0]:,}\n"
835
- f"- Number of Columns: {self.shape[1]:,}\n"
836
- f"- Total observation: {to:,}\n"
837
- f"- Missing value: {mv:,} ({(mv / to * 100):.2f}%)\n\n"
838
- f"Column names:\n{self.columns.to_list()}"
839
- )
840
- return info
841
-
842
- # Quick describe
843
- @versionadded("3.2.0")
844
- def qdescribe(self) -> pd.DataFrame:
845
- """
846
- Quick ``describe()`` that exclude ``object`` and ``datetime`` dtype
847
- """
848
- return self[
849
- self.select_dtypes(exclude=["object", "datetime"]).columns
850
- ].describe()
851
-
852
- # Missing values analyze
853
- def get_missing_values(
854
- self, hightlight: bool = True, *, percentage_round_up: int = 2
855
- ) -> pd.DataFrame:
856
- """
857
- Get a DataFrame contains count of missing values for each column
858
-
859
- Parameters
860
- ----------
861
- hightlight : bool
862
- Shows only columns with missing values when ``True``
863
- (Default: ``True``)
864
-
865
- percentage_round_up : int
866
- Round up to which decimals
867
- (Default: ``2``)
868
-
869
- Returns
870
- -------
871
- DataFrame
872
- Missing value DataFrame
873
- """
874
- # Check for missing value
875
- df_na = self.isnull().sum().sort_values(ascending=False)
876
- if hightlight:
877
- out = df_na[df_na != 0].to_frame()
878
- else:
879
- out = df_na.to_frame()
880
- out.rename(columns={0: "Num of N/A"}, inplace=True)
881
- out["Percentage"] = (out["Num of N/A"] / self.shape[0] * 100).round(
882
- percentage_round_up
883
- )
884
-
885
- # logger.debug(
886
- # f"Percentage of N/A over entire DF: "
887
- # f"{(self.isnull().sum().sum() / (self.shape[0] * self.shape[1]) * 100).round(percentage_round_up)}%"
888
- # )
889
- return out
890
-
891
- # Show distribution
892
- @versionadded("3.2.0")
893
- def show_distribution(
894
- self,
895
- column_name: str,
896
- dropna: bool = True,
897
- *,
898
- show_percentage: bool = True,
899
- percentage_round_up: int = 2,
900
- ) -> pd.DataFrame:
901
- """
902
- Show distribution of a column
903
-
904
- Parameters
905
- ----------
906
- column_name : str
907
- Column to show distribution
908
-
909
- dropna : bool
910
- Count N/A when ``False``
911
- (Default: ``True``)
912
-
913
- show_percentage : bool
914
- Show proportion in range 0% - 100% instead of [0, 1]
915
- (Default: ``True``)
916
-
917
- percentage_round_up : int
918
- Round up to which decimals
919
- (Default: ``2``)
920
-
921
- Returns
922
- -------
923
- DataFrame
924
- Distribution DataFrame
925
-
926
-
927
- Example:
928
- --------
929
- >>> DataAnalystDataFrame.sample_df().show_distribution("number_range")
930
- number_range count percentage
931
- 0 900 16 16.0
932
- 1 700 15 15.0
933
- 2 300 12 12.0
934
- 3 200 12 12.0
935
- 4 400 11 11.0
936
- 5 600 11 11.0
937
- 6 800 10 10.0
938
- 7 100 9 9.0
939
- 8 500 4 4.0
940
-
941
-
942
- """
943
- out = self[column_name].value_counts(dropna=dropna).to_frame().reset_index()
944
- if show_percentage:
945
- out["percentage"] = (out["count"] / self.shape[0] * 100).round(
946
- percentage_round_up
947
- )
948
- else:
949
- out["percentage"] = (out["count"] / self.shape[0]).round(
950
- percentage_round_up
951
- )
952
- return out
953
-
954
- # Help
955
- @classmethod
956
- def dadf_help(cls) -> list[str]:
957
- """
958
- Show all available method of DataAnalystDataFrame
959
- """
960
- list_of_method = list(set(dir(cls)) - set(dir(pd.DataFrame)))
961
- return sorted(list_of_method)
962
-
963
- # Sample DataFrame
964
- @classmethod
965
- def sample_df(cls, size: int = 100) -> Self:
966
- """
967
- Create sample DataFrame
968
-
969
- Parameters
970
- ----------
971
- size : int
972
- Number of observations
973
- (Default: ``100``)
974
-
975
- Returns
976
- -------
977
- DataAnalystDataFrame
978
- DataFrame with these columns:
979
- [number, number_big, number_range, missing_value, text, date]
980
-
981
-
982
- Example:
983
- --------
984
- >>> DataAnalystDataFrame.sample_df()
985
- number number_big number_range missing_value text date
986
- 0 -2.089770 785 700 NaN vwnlqoql 2013-11-20
987
- 1 -0.526689 182 100 24.0 prjjcvqc 2007-04-13
988
- 2 -1.596514 909 900 8.0 cbcpzlac 2023-05-24
989
- 3 2.982191 989 900 21.0 ivwqwuvd 2022-04-28
990
- 4 1.687803 878 800 NaN aajtncum 2005-10-05
991
- .. ... ... ... ... ... ...
992
- 95 -1.295145 968 900 16.0 mgqunkhi 2016-04-12
993
- 96 1.296795 255 200 NaN lwvytego 2014-05-10
994
- 97 1.440746 297 200 5.0 lqsoykun 2010-04-03
995
- 98 0.327702 845 800 NaN leadkvsy 2005-08-05
996
- 99 0.556720 981 900 36.0 bozmxixy 2004-02-22
997
- [100 rows x 6 columns]
998
- """
999
- # Restrain
1000
- size = int(set_min(size, min_value=1))
1001
-
1002
- # Number col
1003
- df = pd.DataFrame(np.random.randn(size, 1), columns=["number"])
1004
- df["number_big"] = [
1005
- random.choice(range(100, 999)) for _ in range(size)
1006
- ] # Big number in range 100-999
1007
- df["number_range"] = df["number_big"].apply(lambda x: str(x)[0] + "00")
1008
-
1009
- # Missing value col
1010
- na_rate = random.randint(1, 99)
1011
- d = [random.randint(1, 99) for _ in range(size)]
1012
- df["missing_value"] = list(map(lambda x: x if x < na_rate else np.nan, d))
1013
- # df["missing_value"] = [random.choice([random.randint(1, 99), np.nan]) for _ in range(observations)]
1014
-
1015
- # Text col
1016
- df["text"] = [
1017
- "".join([random.choice(string.ascii_lowercase) for _ in range(8)])
1018
- for _ in range(size)
1019
- ]
1020
-
1021
- # Random date col
1022
- df["date"] = [
1023
- datetime(
1024
- year=random.randint(datetime.now().year - 20, datetime.now().year),
1025
- month=random.randint(1, 12),
1026
- day=random.randint(1, 28),
1027
- )
1028
- for _ in range(size)
1029
- ]
1030
-
1031
- # Return
1032
- return cls(df)
1033
-
1034
-
1035
- class DADF(DataAnalystDataFrame):
1036
- """Short name for ``DataAnalystDataFrame``"""
1037
-
1038
- pass
1039
-
1040
-
1041
- class DADF_WIP(DADF):
1042
- """W.I.P"""
1043
-
1044
- @versionadded("4.0.0")
1045
- def subtract_df(self, other: Self | pd.DataFrame) -> Self:
1046
- """
1047
- Subtract DF to find the different rows
1048
- """
1049
- temp = self.copy()
1050
- out = (
1051
- temp.merge(other, indicator=True, how="right")
1052
- .query("_merge=='right_only'")
1053
- .drop("_merge", axis=1)
1054
- )
1055
- return self.__class__(out)
1056
-
1057
- @versionadded("4.0.0")
1058
- def merge_left(
1059
- self,
1060
- other: Self | pd.DataFrame,
1061
- on: str,
1062
- columns: list[str] | None = None,
1063
- ) -> Self:
1064
- """
1065
- Merge left of 2 dfs
1066
-
1067
- :param columns: Columns to take from df2
1068
- """
1069
-
1070
- if columns is not None:
1071
- current_col = [on]
1072
- current_col.extend(columns)
1073
- col = other.columns.to_list()
1074
- cols = list(set(col) - set(current_col))
1075
- self.drop_columns(cols)
1076
-
1077
- out = self.merge(other, how="left", on=on)
1078
- return self.__class__(out)
21
+ # TODO: split column df[['A','B']]=df['AB'].str.split(' ',n=1,expand=True) | drop dups | Combine: row with data, row NaN