absfuyu-2.8.1-py3-none-any.whl → absfuyu-3.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of absfuyu might be problematic. Click here for more details.

Files changed (42)
  1. absfuyu/__init__.py +13 -10
  2. absfuyu/__main__.py +55 -38
  3. absfuyu/config/config.json +3 -3
  4. absfuyu/core.py +39 -25
  5. absfuyu/everything.py +4 -5
  6. absfuyu/extensions/__init__.py +3 -2
  7. absfuyu/extensions/dev/__init__.py +162 -19
  8. absfuyu/extensions/dev/password_hash.py +11 -10
  9. absfuyu/extensions/dev/passwordlib.py +256 -0
  10. absfuyu/extensions/dev/pkglib.py +53 -57
  11. absfuyu/extensions/dev/project_starter.py +58 -0
  12. absfuyu/extensions/dev/shutdownizer.py +8 -0
  13. absfuyu/extensions/extra/data_analysis.py +687 -119
  14. absfuyu/fun/__init__.py +88 -118
  15. absfuyu/fun/tarot.py +32 -34
  16. absfuyu/game/tictactoe2.py +90 -78
  17. absfuyu/{collections → general}/__init__.py +14 -12
  18. absfuyu/{collections → general}/content.py +105 -87
  19. absfuyu/{collections → general}/data_extension.py +652 -172
  20. absfuyu/{collections → general}/generator.py +65 -4
  21. absfuyu/{collections → general}/human.py +28 -3
  22. absfuyu/pkg_data/__init__.py +14 -36
  23. absfuyu/pkg_data/chemistry.pkl +0 -0
  24. absfuyu/pkg_data/tarot.pkl +0 -0
  25. absfuyu/tools/converter.py +58 -31
  26. absfuyu/tools/obfuscator.py +4 -4
  27. absfuyu/tools/stats.py +4 -4
  28. absfuyu/tools/web.py +2 -2
  29. absfuyu/util/lunar.py +144 -123
  30. absfuyu/util/path.py +22 -3
  31. absfuyu/util/performance.py +101 -14
  32. absfuyu/version.py +93 -84
  33. {absfuyu-2.8.1.dist-info → absfuyu-3.1.0.dist-info}/METADATA +63 -33
  34. absfuyu-3.1.0.dist-info/RECORD +55 -0
  35. {absfuyu-2.8.1.dist-info → absfuyu-3.1.0.dist-info}/WHEEL +1 -1
  36. absfuyu-3.1.0.dist-info/entry_points.txt +2 -0
  37. absfuyu/pkg_data/chemistry.json +0 -6268
  38. absfuyu/pkg_data/tarot.json +0 -2593
  39. absfuyu-2.8.1.dist-info/RECORD +0 -52
  40. absfuyu-2.8.1.dist-info/entry_points.txt +0 -2
  41. {absfuyu-2.8.1.dist-info → absfuyu-3.1.0.dist-info}/LICENSE +0 -0
  42. {absfuyu-2.8.1.dist-info → absfuyu-3.1.0.dist-info}/top_level.txt +0 -0
@@ -3,35 +3,55 @@ Absfuyu: Data Analysis [W.I.P]
3
3
  ------------------------------
4
4
  Extension for ``pd.DataFrame``
5
5
 
6
- Version: 2.0.0.dev3
7
- Date updated: 24/11/2023 (dd/mm/yyyy)
6
+ Version: 2.0.0.dev10
7
+ Date updated: 06/03/2024 (dd/mm/yyyy)
8
8
  """
9
9
 
10
10
 
11
+ # Module level
12
+ ###########################################################################
13
+ __all__ = [
14
+ # Function
15
+ "compare_2_list",
16
+ # Support
17
+ "CityData",
18
+ "SplittedDF",
19
+ "PLTFormatString",
20
+ # Main
21
+ "MatplotlibFormatString",
22
+ "DataAnalystDataFrame",
23
+ "DADF",
24
+ ]
25
+
26
+
11
27
  # Library
12
28
  ###########################################################################
13
- from collections import namedtuple
29
+ from datetime import datetime
30
+ from functools import partial
14
31
  import random
15
32
  from itertools import chain, product
16
- from typing import Dict, List, Union
33
+ import string
34
+ from typing import Any, Dict, List, NamedTuple, Optional, Union
17
35
 
18
36
  # import matplotlib.pyplot as plt
37
+ # from scipy import stats
38
+ # from dateutil.relativedelta import relativedelta
19
39
  import numpy as np
20
40
  import pandas as pd
21
- # from scipy import stats
22
41
 
23
42
  from absfuyu.logger import logger
43
+ from absfuyu.util import set_min_max, set_min
24
44
 
25
45
 
26
46
  # Function
27
47
  ###########################################################################
28
- def summary(data: Union[list, np.ndarray]):
48
+ def summary(data: Union[list, np.ndarray]): # del this
29
49
  """
30
50
  Quick summary of data
31
-
32
- data : np.ndarray | list
51
+
52
+ :param data: np.ndarray | list
33
53
  """
34
-
54
+
35
55
  if not isinstance(data, np.ndarray):
36
56
  data = np.array(data)
37
57
 
@@ -54,42 +74,7 @@ def summary(data: Union[list, np.ndarray]):
54
74
  return output
55
75
 
56
76
 
57
- def divide_dataframe(df: pd.DataFrame, by: str) -> list:
58
- """
59
- Divide df into a list of df
60
- """
61
- divided = [y for _, y in df.groupby(by)]
62
- # divided[0] # this is the first separated df
63
- # divided[len(divided)-1] # this is the last separated df
64
- return divided
65
-
66
-
67
- def delta_date(df: pd.DataFrame, date_field: str, col_name: str="delta_date"):
68
- """
69
- Calculate date interval between row
70
- """
71
- dated = df[date_field].to_list()
72
- cal = []
73
- for i in range(len(dated)):
74
- if i==0:
75
- cal.append(dated[i]-dated[i])
76
- else:
77
- cal.append(dated[i]-dated[i-1])
78
- df[col_name] = [x.days for x in cal]
79
- return df
80
-
81
-
82
- def modify_date(df: pd.DataFrame, date_col: str):
83
- """
84
- Add date, week, and year column for date_col
85
- """
86
- df["Date"] = pd.to_datetime(df[date_col])
87
- df["Week"] = df["Date"].dt.isocalendar().week
88
- df["Year"] = df["Date"].dt.isocalendar().year
89
- return df
90
-
91
-
92
- def equalize_df(data: Dict[str, list], fillna = np.nan) -> Dict[str, list]:
77
+ def equalize_df(data: Dict[str, list], fillna=np.nan) -> Dict[str, list]:
93
78
  """
94
79
  Make all list in dict have equal length to make pd.DataFrame
95
80
 
@@ -99,23 +84,33 @@ def equalize_df(data: Dict[str, list], fillna = np.nan) -> Dict[str, list]:
99
84
  max_len = max(map(len, data.values()))
100
85
  for _, v in data.items():
101
86
  if len(v) < max_len:
102
- missings = max_len-len(v)
87
+ missings = max_len - len(v)
103
88
  for _ in range(missings):
104
89
  v.append(fillna)
105
90
  return data
106
91
 
92
+
107
93
  ## Update 05/10
108
94
 
95
+
109
96
  def compare_2_list(*arr: list) -> pd.DataFrame:
110
97
  """
111
- Compare lists then create DataFrame
98
+ Compare 2 lists then create DataFrame
112
99
  to see which items are missing
113
100
 
114
- :param arr: list
101
+ Parameters
102
+ ----------
103
+ arr : list
104
+ List
105
+
106
+ Returns
107
+ -------
108
+ DataFrame
109
+ Compare result
115
110
  """
116
111
  # Setup
117
112
  col_name = "list"
118
- arr = [sorted(x) for x in arr] # map(sorted, arr)
113
+ arr = [sorted(x) for x in arr] # map(sorted, arr)
119
114
 
120
115
  # Total array
121
116
  tarr = sorted(list(set(chain.from_iterable(arr))))
@@ -133,9 +128,10 @@ def compare_2_list(*arr: list) -> pd.DataFrame:
133
128
 
134
129
  df = pd.DataFrame(temp_dict)
135
130
  df["Compare"] = np.where(
136
- df[f"{col_name}0"].apply(lambda x: str(x).lower()) == df[f"{col_name}1"].apply(lambda x: str(x).lower()),
137
- df[f"{col_name}0"], # Value when True
138
- np.nan # Value when False
131
+ df[f"{col_name}0"].apply(lambda x: str(x).lower())
132
+ == df[f"{col_name}1"].apply(lambda x: str(x).lower()),
133
+ df[f"{col_name}0"], # Value when True
134
+ np.nan, # Value when False
139
135
  )
140
136
  return df
141
137
 
@@ -160,50 +156,126 @@ def rename_with_dict(df: pd.DataFrame, col: str, rename_dict: dict) -> pd.DataFr
160
156
  df[name] = df[name].apply(lambda x: "Other" if x in rename_val else x)
161
157
  return df
162
158
 
163
- def threshold_filter(
164
- df: pd.DataFrame,
165
- col: str,
166
- col2: str,
167
- threshold: int = 10
168
- ) -> pd.DataFrame:
159
+
160
+ # Class
161
+ ###########################################################################
162
+ class CityData(NamedTuple):
169
163
  """
170
- Filter out percentage of data that smaller than threshold
171
- Version: 1.0.0
164
+ Parameters
165
+ ----------
166
+ city : str
167
+ City name
172
168
 
173
- :param df: DataFrame
174
- :param col: Column name
175
- :param col2: Secondary filter column
176
- :param threshold: which percentage to cut-off
169
+ region : str
170
+ Region of the city
171
+
172
+ area : str
173
+ Area of the region
177
174
  """
178
- # Clean
179
- df[col] = df[col].str.strip() # Remove trailing space
180
-
181
- col_df = df.groupby(col)[col2].count().sort_values(ascending=False)/df.shape[0]*100 # percentage of col
182
- name_of_type: list = col_df[col_df.values>=threshold].keys().to_list() # get all the `col` that has larger than threshold
183
175
 
184
- rename_list = list(set(df[col].unique().tolist()) - set(name_of_type))
185
- rename_dict = dict(zip(rename_list, ["Other"]*len(rename_list)))
176
+ city: str
177
+ region: str
178
+ area: str
186
179
 
187
- df = rename_with_dict(df, col, rename_dict)
188
180
 
189
- return df
181
+ class SplittedDF(NamedTuple):
182
+ """
183
+ DataFrame splitted into contains
184
+ missing values only and vice versa
190
185
 
186
+ Parameters
187
+ ----------
188
+ df : DataFrame
189
+ DataFrame without missing values
191
190
 
191
+ df_na : DataFrame
192
+ DataFrame with missing values only
193
+ """
192
194
 
195
+ df: pd.DataFrame
196
+ df_na: pd.DataFrame
197
+
198
+ @staticmethod
199
+ def concat_df(df_list: List[pd.DataFrame], join: str = "inner"):
200
+ """
201
+ Concat the list of DataFrame (static method)
202
+
203
+ Parameters
204
+ ----------
205
+ df_list : list[DataFrame]
206
+ A sequence of DataFrame
207
+
208
+ join : str
209
+ Join type
210
+ (Default: ``"inner"``)
211
+
212
+ Returns
213
+ -------
214
+ DataFrame
215
+ Joined DataFrame
216
+ """
217
+ df: pd.DataFrame = pd.concat(df_list, axis=0, join=join).reset_index()
218
+ df.drop(columns=["index"], inplace=True)
219
+ return df
220
+
221
+ def concat(self, join: str = "inner"):
222
+ """
223
+ Concat the splitted DataFrame
224
+
225
+ Parameters
226
+ ----------
227
+ join : str
228
+ Join type
229
+ (Default: ``"inner"``)
230
+
231
+ Returns
232
+ -------
233
+ DataFrame
234
+ Joined DataFrame
235
+ """
236
+ return self.concat_df(self, join=join)
237
+
238
+ @staticmethod
239
+ def divide_dataframe(df: pd.DataFrame, by_column: str) -> List[pd.DataFrame]:
240
+ """
241
+ Divide DataFrame into a list of DataFrame
242
+
243
+ Parameters
244
+ ----------
245
+ df : DataFrame
246
+ DataFrame
247
+
248
+ by_column : str
249
+ By which column
250
+
251
+ Returns
252
+ -------
253
+ list[DataFrame]
254
+ Splitted DataFrame
255
+ """
256
+ divided = [x for _, x in df.groupby(by_column)]
257
+ return divided
258
+
259
+
260
+ ##
261
+ class PLTFormatString(NamedTuple):
262
+ """Matplotlib format string"""
263
+
264
+ marker: str
265
+ line_style: str
266
+ color: str
193
267
 
194
- # Class
195
- ###########################################################################
196
- PLTFormatString = namedtuple("PLTFormatString", ["marker", "line_style", "color"])
197
268
 
198
269
  class _DictToAtrr:
199
270
  """Convert `keys` or `values` of `dict` into attribute"""
271
+
200
272
  def __init__(
201
- self,
202
- dict_data: dict,
203
- *,
204
- key_as_atrribute: bool = True,
205
- remove_char: str = r"( ) [ ] { }"
206
- ) -> None:
273
+ self,
274
+ dict_data: dict,
275
+ *,
276
+ key_as_atrribute: bool = True,
277
+ remove_char: str = r"( ) [ ] { }",
278
+ ) -> None:
207
279
  """
208
280
  dict_data: Dictionary to convert
209
281
  key_as_atrribute: Use `dict.keys()` as atrribute when True, else use `dict.values()`
@@ -222,9 +294,10 @@ class _DictToAtrr:
222
294
 
223
295
  def __str__(self) -> str:
224
296
  return f"{self.__class__.__name__}({self._keys})"
297
+
225
298
  def __repr__(self) -> str:
226
299
  return self.__str__()
227
-
300
+
228
301
  @staticmethod
229
302
  def _remove_space(value: str, remove_char: str) -> str:
230
303
  """
@@ -242,6 +315,7 @@ class MatplotlibFormatString:
242
315
  """
243
316
  Format string format: `[marker][line][color]` or `[color][marker][line]`
244
317
  """
318
+
245
319
  MARKER_LIST = {
246
320
  ".": "point marker",
247
321
  ",": "pixel marker",
@@ -267,13 +341,13 @@ class MatplotlibFormatString:
267
341
  "D": "diamond marker",
268
342
  "d": "thin_diamond marker",
269
343
  "|": "vline marker",
270
- "_": "hline marker"
344
+ "_": "hline marker",
271
345
  }
272
346
  LINE_STYLE_LIST = {
273
347
  "-": "solid line style",
274
348
  "--": "dashed line style",
275
349
  "-.": "dash-dot line style",
276
- ":": "dotted line style"
350
+ ":": "dotted line style",
277
351
  }
278
352
  COLOR_LIST = {
279
353
  "b": "blue",
@@ -283,7 +357,7 @@ class MatplotlibFormatString:
283
357
  "m": "magenta",
284
358
  "y": "yellow",
285
359
  "k": "black",
286
- "w": "white"
360
+ "w": "white",
287
361
  }
288
362
  Marker = _DictToAtrr(MARKER_LIST, key_as_atrribute=False)
289
363
  LineStyle = _DictToAtrr(LINE_STYLE_LIST, key_as_atrribute=False)
@@ -291,7 +365,11 @@ class MatplotlibFormatString:
291
365
 
292
366
  @staticmethod
293
367
  def all_format_string() -> List[PLTFormatString]:
294
- fmt_str = [__class__.MARKER_LIST, __class__.LINE_STYLE_LIST, __class__.COLOR_LIST]
368
+ fmt_str = [
369
+ __class__.MARKER_LIST,
370
+ __class__.LINE_STYLE_LIST,
371
+ __class__.COLOR_LIST,
372
+ ]
295
373
  return [PLTFormatString._make(x) for x in list(product(*fmt_str))]
296
374
 
297
375
  @staticmethod
@@ -301,50 +379,540 @@ class MatplotlibFormatString:
301
379
  return f"{temp.marker}{temp.line_style}{temp.color}"
302
380
  else:
303
381
  return f"{temp.color}{temp.marker}{temp.line_style}"
304
-
305
382
 
306
383
 
307
- class DataFrameKai(pd.DataFrame):
308
- def get_unique(self, col: str):
384
+ # Class - DA
385
+ ###########################################################################
386
+ class DataAnalystDataFrame(pd.DataFrame):
387
+ """Data Analyst ``pd.DataFrame``"""
388
+
389
+ _DADF_Version = (1, 1, 0)
390
+
391
+ # Support
392
+ # ================================================================
393
+ # Rearrange column
394
+ def rearrange_column(self, insert_to_col: str, num_of_cols: int = 1):
395
+ """
396
+ Move right-most columns to selected position
397
+
398
+ Parameters
399
+ ----------
400
+ insert_to_col : str
401
+ Name of the column that the right-most column will be moved next to
402
+
403
+ num_of_cols : int
404
+ Number of columns moved
405
+
406
+ Returns
407
+ -------
408
+ DataAnalystDataFrame
409
+ Modified DataFrame
410
+ """
411
+ cols = self.columns.to_list() # List of columns
412
+ num_of_cols = set_min_max(num_of_cols, min_value=1, max_value=len(cols))
413
+ col_index = cols.index(insert_to_col)
414
+ cols = (
415
+ cols[: col_index + 1]
416
+ + cols[-num_of_cols:]
417
+ + cols[col_index + 1 : len(cols) - num_of_cols]
418
+ )
419
+ self = __class__(self[cols])
420
+ return self
421
+
422
+ # Drop a list of column
423
+ def drop_columns(self, columns: List[str]):
424
+ """
425
+ Drop columns in DataFrame
426
+
427
+ Parameters
428
+ ----------
429
+ columns : list[str]
430
+ List of columns need to drop
431
+
432
+ Returns
433
+ -------
434
+ DataAnalystDataFrame
435
+ Modified DataFrame
436
+ """
437
+ for column in columns:
438
+ try:
439
+ self.drop(columns=[column], inplace=True)
440
+ except:
441
+ logger.debug(f"{column} column does not exist")
442
+ # pass
443
+ return self
444
+
445
+ # Drop right-most columns
446
+ def drop_rightmost(self, num_of_cols: int = 1):
447
+ """
448
+ Drop ``num_of_cols`` right-most columns
449
+
450
+ Parameters
451
+ ----------
452
+ num_of_cols : int
453
+ Number of columns to drop
454
+
455
+ Returns
456
+ -------
457
+ DataAnalystDataFrame
458
+ Modified DataFrame
459
+ """
460
+ # Restrain
461
+ # if num_of_cols < 1:
462
+ # num_of_cols = 1
463
+ # if num_of_cols > self.shape[1]:
464
+ # num_of_cols = self.shape[1]
465
+ num_of_cols = set_min_max(num_of_cols, min_value=1, max_value=self.shape[1])
466
+
467
+ # Logic
468
+ for _ in range(num_of_cols):
469
+ self.drop(self.columns[len(self.columns) - 1], axis=1, inplace=True)
470
+ return self
471
+
472
+ # Add blank column
473
+ def add_blank_column(self, column_name: str, fill: Any):
474
+ """
475
+ Add a blank column
476
+
477
+ Parameters
478
+ ----------
479
+ column_name : str
480
+ Name of the column to add
481
+
482
+ fill : Any
483
+ Fill the column with data
484
+
485
+ Returns
486
+ -------
487
+ DataAnalystDataFrame
488
+ Modified DataFrame
309
489
  """
310
- Return a list of unique values in a column
490
+ self[column_name] = [fill] * self.shape[0]
491
+ return self
492
+
493
+ # Modify
494
+ # ================================================================
495
+ # Convert city
496
+ def convert_city(self, city_column: str, city_list: List[CityData] = None):
311
497
  """
312
- return list(self[col].unique())
313
-
314
- def convert_to_SeriesKai(self):
315
- pass
498
+ Get region and area of a city
499
+
500
+ Parameters
501
+ ----------
502
+ city_column : str
503
+ Column contains city data
504
+
505
+ city_list : list[CityData]
506
+ List of city in correct format
507
+ (Default: ``None``)
316
508
 
317
- def summary(self, col: str):
509
+ Returns
510
+ -------
511
+ DataAnalystDataFrame
512
+ Modified DataFrame
318
513
  """
319
- Quick summary of data
514
+
515
+ # Support function
516
+ def _convert_city_prep(
517
+ value: str,
518
+ rtype: str = "region",
519
+ ) -> str:
520
+ """
521
+ :param value: Value
522
+ :param rtype: "region" or "area"
523
+ :param city_list: list of cities with city, region, area
524
+ """
525
+ for x in city_list:
526
+ if x.city.lower().startswith(value.lower()):
527
+ if rtype.lower().strip().startswith("region"):
528
+ return x.region
529
+ if rtype.lower().strip().startswith("area"):
530
+ return x.area
531
+ return value
532
+
533
+ _convert_city_prep2 = partial(_convert_city_prep, rtype="area")
534
+
535
+ # Convert
536
+ self["region"] = self[city_column].apply(_convert_city_prep)
537
+ self["area"] = self[city_column].apply(_convert_city_prep2)
538
+
539
+ # Rearrange
540
+ return self.rearrange_column(city_column, 2)
541
+
542
+ # Date related
543
+ def add_date_from_month(self, month_column: str, *, col_name: str = "date"):
320
544
  """
321
- data = self[col]
322
-
323
- if not isinstance(data, np.ndarray):
324
- data = np.array(data)
545
+ Add dummy ``date`` column from ``month`` column
546
+
547
+ Parameters
548
+ ----------
549
+ month_column : str
550
+ Month column
551
+
552
+ col_name : str
553
+ New date column name
554
+ (Default: ``"date"``)
325
555
 
326
- output = {
327
- "Observations": len(data),
328
- "Mean": np.mean(data),
329
- "Median": np.median(data),
330
- # "Mode": stats.mode(data)[0][0],
331
- "Standard deviation": np.std(data),
332
- "Variance": np.var(data),
333
- "Max": max(data),
334
- "Min": min(data),
335
- "Percentiles": {
336
- "1st Quartile": np.quantile(data, 0.25),
337
- "2nd Quartile": np.quantile(data, 0.50),
338
- "3rd Quartile": np.quantile(data, 0.75),
339
- # "IQR": stats.iqr(data),
340
- },
341
- }
342
- return output
556
+ Returns
557
+ -------
558
+ DataAnalystDataFrame
559
+ Modified DataFrame
560
+ """
561
+ _this_year = datetime.now().year
562
+ self[col_name] = pd.to_datetime(
563
+ f"{_this_year}-" + self[month_column].astype(int).astype(str) + "-1",
564
+ format="%Y-%m-%d",
565
+ )
566
+ # Rearrange
567
+ return self.rearrange_column(month_column)
568
+
569
+ def add_detail_date(self, date_column: str, mode: str = "dwmy"):
570
+ """
571
+ Add these columns from ``date_column``:
572
+ - ``date`` (won't add if ``date_column`` value is ``"date"``)
573
+ - ``day`` (overwrite if already exist)
574
+ - ``week`` (overwrite if already exist)
575
+ - ``month`` (overwrite if already exist)
576
+ - ``year`` (overwrite if already exist)
577
+
578
+ Parameters
579
+ ----------
580
+ date_column : str
581
+ Date column
582
+
583
+ mode : str
584
+ | Detailed column to add
585
+ | ``d``: day
586
+ | ``w``: week number
587
+ | ``m``: month
588
+ | ``y``: year
589
+ | (Default: ``"dwmy"``)
590
+
591
+ Returns
592
+ -------
593
+ DataAnalystDataFrame
594
+ Modified DataFrame
595
+ """
596
+ # Convert to datetime
597
+ self["date"] = pd.to_datetime(self[date_column])
598
+
599
+ # Logic
600
+ col_counter = 0
601
+ # self["weekday"] = self["day"].dt.isocalendar().day # Weekday
602
+ if mode.find("d") != -1:
603
+ logger.debug("Mode: 'day'")
604
+ self["day"] = self["date"].dt.day
605
+ col_counter += 1
606
+ if mode.find("w") != -1:
607
+ logger.debug("Mode: 'weekday'")
608
+ self["week"] = self["date"].dt.isocalendar().week
609
+ col_counter += 1
610
+ if mode.find("m") != -1:
611
+ logger.debug("Mode: 'month'")
612
+ self["month"] = self["date"].dt.month
613
+ col_counter += 1
614
+ if mode.find("y") != -1:
615
+ logger.debug("Mode: 'year'")
616
+ self["year"] = self["date"].dt.year
617
+ col_counter += 1
618
+ return self.rearrange_column(date_column, col_counter)
619
+
620
+ def delta_date(
621
+ self, date_column: str, mode: str = "now", *, col_name: str = "delta_date"
622
+ ):
623
+ """
624
+ Calculate date interval
625
+
626
+ Parameters
627
+ ----------
628
+ date_column : str
629
+ Date column
630
+
631
+ mode : str
632
+ | Mode to calculate
633
+ | ``"between_row"``: Calculate date interval between each row
634
+ | ``"now"``: Calculate date interval to current date
635
+ | (Default: ``"between_row"``)
636
+
637
+ col_name : str
638
+ | New delta date column name
639
+ | (Default: ``"delta_date"``)
640
+
641
+ Returns
642
+ -------
643
+ DataAnalystDataFrame
644
+ Modified DataFrame
645
+ """
646
+ if mode.lower().startswith("between_row"):
647
+ dated = self[date_column].to_list()
648
+ cal = []
649
+ for i in range(len(dated)):
650
+ if i == 0:
651
+ cal.append(dated[i] - dated[i])
652
+ # cal.append(relativedelta(dated[i], dated[i]))
653
+ else:
654
+ cal.append(dated[i] - dated[i - 1])
655
+ # cal.append(relativedelta(dated[i], dated[i - 1]))
656
+ self[col_name] = [x.days for x in cal]
657
+ return self
658
+ else: # mode="now"
659
+ self[col_name] = self[date_column].apply(lambda x: (datetime.now() - x).days)
660
+ return self
661
+
662
+ # Fill missing value
663
+ def fill_missing_values(
664
+ self, column_name: str, fill: Any = np.nan, *, fill_when_not_exist: Any = np.nan
665
+ ):
666
+ """
667
+ Fill missing values in specified column
668
+
669
+ Parameters
670
+ ----------
671
+ column_name : str
672
+ Column name
673
+
674
+ fill : Any
675
+ Fill the missing values with
676
+ (Default: ``np.nan``)
677
+
678
+ fill_when_not_exist : Any
679
+ When ``column_name`` does not exist,
680
+ create a new column and fill with ``fill_when_not_exist``
681
+ (Default: ``np.nan``)
682
+
683
+ Returns
684
+ -------
685
+ DataAnalystDataFrame
686
+ Modified DataFrame
687
+ """
688
+ try:
689
+ self[column_name] = self[column_name].fillna(fill)
690
+ except:
691
+ self.add_blank_column(column_name, fill_when_not_exist)
692
+ return self
693
+
694
+ # Split DataFrame
695
+ def split_na(self, by_column: str) -> SplittedDF:
696
+ """
697
+ Split DataFrame into 2 parts:
698
+ - Without missing value in specified column
699
+ - With missing value in specified column
700
+
701
+ Parameters
702
+ ----------
703
+ by_column : str
704
+ Split by column
705
+
706
+ Returns
707
+ -------
708
+ SplittedDF
709
+ Splitted DataFrame
710
+ """
711
+ out = SplittedDF(
712
+ df=self[~self[by_column].isna()], # DF
713
+ df_na=self[self[by_column].isna()], # DF w/o NA
714
+ )
715
+ return out
716
+
717
+ # Threshold filter
718
+ def threshold_filter(
719
+ self,
720
+ destination_column: str,
721
+ threshold: Union[int, float] = 10,
722
+ *,
723
+ top: Optional[int] = None,
724
+ group_by_column: Optional[str] = None,
725
+ replace_with: Any = "Other",
726
+ ):
727
+ """
728
+ Filter out percentage of data that smaller than the ``threshold``,
729
+ replace all of the smaller data to ``replace_with``.
730
+ As a result, pie chart is less messy.
731
+ Version: 1.1.0
732
+
733
+ Parameters
734
+ ----------
735
+ destination_column : str
736
+ Column to be filtered
737
+
738
+ threshold : int | float
739
+ Which percentage to cut-off
740
+ (Default: 10%)
741
+
742
+ top : int
743
+ Only show top ``x`` categories in pie chart
744
+ (replace threshold mode)
745
+ (Default: ``None``)
746
+
747
+ group_by_column : str
748
+ Calculate threshold for each category in selected column [W.I.P]
749
+ (Default: ``None``)
750
+
751
+ replace_with : Any
752
+ Replace all of the smaller data with specified value
753
+
754
+ Returns
755
+ -------
756
+ DataAnalystDataFrame
757
+ Modified DataFrame
758
+ """
759
+ # Clean
760
+ try:
761
+ self[destination_column] = self[
762
+ destination_column
763
+ ].str.strip() # Remove trailing space
764
+ except:
765
+ pass
766
+
767
+ # Logic
768
+ if group_by_column is None:
769
+ # Get a column with no missing values
770
+ col_with_no_na = ""
771
+ for col_name in self.columns:
772
+ if col_name == destination_column:
773
+ continue
774
+ if self[col_name].isna().sum() == 0:
775
+ col_with_no_na = col_name
776
+ break
777
+ if col_with_no_na == "":
778
+ # CASE: every col has NA else where
779
+ for col_name in self.columns:
780
+ if col_name == destination_column:
781
+ continue
782
+ else:
783
+ col_with_no_na = col_name
784
+ break
785
+ self[col_with_no_na].fillna("N/A")
786
+
787
+ # Calculate threshold
788
+ col_df = (
789
+ self.groupby(destination_column)
790
+ .count()[col_with_no_na]
791
+ .sort_values(ascending=False)
792
+ .to_frame()
793
+ .reset_index()
794
+ )
795
+ col_df.rename(columns={col_with_no_na: "total_count"}, inplace=True)
796
+ col_df["percentage"] = col_df["total_count"] / self.shape[0] * 100
797
+ # logger.debug(col_df) # Show calculation result
798
+ else:
799
+ # Real logic: manually select a column to perform percentage calculation
800
+ # Calculate threshold for each category in selected column may be will be added in the future
801
+ col_df = (
802
+ self.groupby(destination_column)[group_by_column]
803
+ .count()
804
+ .sort_values(ascending=False)
805
+ / self.shape[0]
806
+ * 100
807
+ ) # percentage of destination_column
808
+ col_df = col_df.reset_index()
809
+ col_df.rename(columns={group_by_column: "percentage"}, inplace=True)
810
+
811
+ # Rename
812
+ if top is not None:
813
+ list_of_keep: list = (
814
+ col_df[destination_column]
815
+ .head(set_min_max(top - 1, min_value=1, max_value=col_df.shape[0]))
816
+ .to_list()
817
+ )
818
+ # logger.debug(list_of_keep)
819
+ else:
820
+ list_of_keep: list = col_df[col_df["percentage"] >= threshold][
821
+ destination_column
822
+ ].to_list() # values that will not be renamed
823
+ self[f"{destination_column}_filtered"] = self[destination_column].apply(
824
+ lambda x: replace_with if x not in list_of_keep else x
825
+ )
826
+
827
+ # Return
828
+ return self
829
+
830
+ # Info
831
+ # ================================================================
832
+ def get_missing_values(self, hightlight: bool = True) -> pd.DataFrame:
833
+ """
834
+ Get a DataFrame contains count of missing values for each column
835
+
836
+ Parameters
837
+ ----------
838
+ hightlight : bool
839
+ Shows only columns with missing values when ``True``
840
+ (Default: ``True``)
841
+
842
+ Returns
843
+ -------
844
+ DataFrame
845
+ Missing value DataFrame
846
+ """
847
+ # Check for missing value
848
+ df_na = self.isnull().sum().sort_values(ascending=False)
849
+ if hightlight:
850
+ out = df_na[df_na != 0].to_frame()
851
+ else:
852
+ out = df_na.to_frame()
853
+ out.rename(columns={0: "Num of N/A"}, inplace=True)
854
+ return out
343
855
 
856
+ # Sample DataFrame
857
+ @classmethod
858
+ def sample_df(cls, size: int = 100):
859
+ """
860
+ Create sample DataFrame
861
+
862
+ Parameters
863
+ ----------
864
+ size : int
865
+ Number of observations
866
+ (Default: ``100``)
867
+
868
+ Returns
869
+ -------
870
+ DataAnalystDataFrame
871
+ DataFrame with these columns:
872
+ [number, number_big, number_range, missing_value, text, date]
873
+ """
874
+ # Restrain
875
+ size = set_min(size, min_value=1)
876
+
877
+ # Number col
878
+ df = pd.DataFrame(np.random.randn(size, 1), columns=["number"])
879
+ df["number_big"] = [
880
+ random.choice(range(100, 999)) for _ in range(size)
881
+ ] # Big number in range 100-999
882
+ df["number_range"] = df["number_big"].apply(lambda x: str(x)[0] + "00")
883
+
884
+ # Missing value col
885
+ na_rate = random.randint(1, 99)
886
+ d = [random.randint(1, 99) for _ in range(size)]
887
+ df["missing_value"] = list(map(lambda x: x if x < na_rate else np.nan, d))
888
+ # df["missing_value"] = [random.choice([random.randint(1, 99), np.nan]) for _ in range(observations)]
889
+
890
+ # Text col
891
+ df["text"] = [
892
+ "".join([random.choice(string.ascii_lowercase) for _ in range(8)])
893
+ for _ in range(size)
894
+ ]
895
+
896
+ # Random date col
897
+ df["date"] = [
898
+ datetime(
899
+ year=random.randint(datetime.now().year - 20, datetime.now().year),
900
+ month=random.randint(1, 12),
901
+ day=random.randint(1, 28),
902
+ )
903
+ for _ in range(size)
904
+ ]
905
+
906
+ # Return
907
+ return cls(df)
908
+
909
+
910
+ class DADF(DataAnalystDataFrame):
911
+ """Short name for ``DataAnalystDataFrame``"""
344
912
 
345
- class SeriesKai(pd.Series):
346
913
  pass
347
914
 
915
+
348
916
  # Run
349
917
  ###########################################################################
350
918
  if __name__ == "__main__":