absfuyu 2.8.1__py3-none-any.whl → 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of absfuyu might be problematic.
- absfuyu/__init__.py +13 -10
- absfuyu/__main__.py +55 -38
- absfuyu/config/config.json +3 -3
- absfuyu/core.py +39 -25
- absfuyu/everything.py +4 -5
- absfuyu/extensions/__init__.py +3 -2
- absfuyu/extensions/dev/__init__.py +162 -19
- absfuyu/extensions/dev/password_hash.py +11 -10
- absfuyu/extensions/dev/passwordlib.py +256 -0
- absfuyu/extensions/dev/pkglib.py +53 -57
- absfuyu/extensions/dev/project_starter.py +58 -0
- absfuyu/extensions/dev/shutdownizer.py +8 -0
- absfuyu/extensions/extra/data_analysis.py +687 -119
- absfuyu/fun/__init__.py +88 -118
- absfuyu/fun/tarot.py +32 -34
- absfuyu/game/tictactoe2.py +90 -78
- absfuyu/{collections → general}/__init__.py +14 -12
- absfuyu/{collections → general}/content.py +105 -87
- absfuyu/{collections → general}/data_extension.py +652 -172
- absfuyu/{collections → general}/generator.py +65 -4
- absfuyu/{collections → general}/human.py +28 -3
- absfuyu/pkg_data/__init__.py +14 -36
- absfuyu/pkg_data/chemistry.pkl +0 -0
- absfuyu/pkg_data/tarot.pkl +0 -0
- absfuyu/tools/converter.py +58 -31
- absfuyu/tools/obfuscator.py +4 -4
- absfuyu/tools/stats.py +4 -4
- absfuyu/tools/web.py +2 -2
- absfuyu/util/lunar.py +144 -123
- absfuyu/util/path.py +22 -3
- absfuyu/util/performance.py +101 -14
- absfuyu/version.py +93 -84
- {absfuyu-2.8.1.dist-info → absfuyu-3.1.0.dist-info}/METADATA +63 -33
- absfuyu-3.1.0.dist-info/RECORD +55 -0
- {absfuyu-2.8.1.dist-info → absfuyu-3.1.0.dist-info}/WHEEL +1 -1
- absfuyu-3.1.0.dist-info/entry_points.txt +2 -0
- absfuyu/pkg_data/chemistry.json +0 -6268
- absfuyu/pkg_data/tarot.json +0 -2593
- absfuyu-2.8.1.dist-info/RECORD +0 -52
- absfuyu-2.8.1.dist-info/entry_points.txt +0 -2
- {absfuyu-2.8.1.dist-info → absfuyu-3.1.0.dist-info}/LICENSE +0 -0
- {absfuyu-2.8.1.dist-info → absfuyu-3.1.0.dist-info}/top_level.txt +0 -0
absfuyu/extensions/extra/data_analysis.py
@@ -3,35 +3,55 @@ Absfuyu: Data Analysis [W.I.P]
 ------------------------------
 Extension for ``pd.DataFrame``

-Version: 2.0.0.
-Date updated:
+Version: 2.0.0.dev10
+Date updated: 06/03/2024 (dd/mm/yyyy)
 """


+# Module level
+###########################################################################
+__all__ = [
+    # Function
+    "compare_2_list",
+    # Support
+    "CityData",
+    "SplittedDF",
+    "PLTFormatString",
+    # Main
+    "MatplotlibFormatString",
+    "DataAnalystDataFrame",
+    "DADF",
+]
+
+
 # Library
 ###########################################################################
-from
+from datetime import datetime
+from functools import partial
 import random
 from itertools import chain, product
-
+import string
+from typing import Any, Dict, List, NamedTuple, Optional, Union

 # import matplotlib.pyplot as plt
+# from scipy import stats
+# from dateutil.relativedelta import relativedelta
 import numpy as np
 import pandas as pd
-# from scipy import stats

 from absfuyu.logger import logger
+from absfuyu.util import set_min_max, set_min


 # Function
 ###########################################################################
-def summary(data: Union[list, np.ndarray]):
+def summary(data: Union[list, np.ndarray]):  # del this
     """
     Quick summary of data
-
-    data
+
+    :param data: np.ndarray | list
     """
-
+
     if not isinstance(data, np.ndarray):
         data = np.array(data)

@@ -54,42 +74,7 @@ def summary(data: Union[list, np.ndarray]):
     return output


-def
-    """
-    Divide df into a list of df
-    """
-    divided = [y for _, y in df.groupby(by)]
-    # divided[0] # this is the first separated df
-    # divided[len(divided)-1] # this is the last separated df
-    return divided
-
-
-def delta_date(df: pd.DataFrame, date_field: str, col_name: str="delta_date"):
-    """
-    Calculate date interval between row
-    """
-    dated = df[date_field].to_list()
-    cal = []
-    for i in range(len(dated)):
-        if i==0:
-            cal.append(dated[i]-dated[i])
-        else:
-            cal.append(dated[i]-dated[i-1])
-    df[col_name] = [x.days for x in cal]
-    return df
-
-
-def modify_date(df: pd.DataFrame, date_col: str):
-    """
-    Add date, week, and year column for date_col
-    """
-    df["Date"] = pd.to_datetime(df[date_col])
-    df["Week"] = df["Date"].dt.isocalendar().week
-    df["Year"] = df["Date"].dt.isocalendar().year
-    return df
-
-
-def equalize_df(data: Dict[str, list], fillna = np.nan) -> Dict[str, list]:
+def equalize_df(data: Dict[str, list], fillna=np.nan) -> Dict[str, list]:
     """
     Make all list in dict have equal length to make pd.DataFrame

@@ -99,23 +84,33 @@ def equalize_df(data: Dict[str, list], fillna = np.nan) -> Dict[str, list]:
     max_len = max(map(len, data.values()))
     for _, v in data.items():
         if len(v) < max_len:
-            missings = max_len-len(v)
+            missings = max_len - len(v)
             for _ in range(missings):
                 v.append(fillna)
     return data

+
 ## Update 05/10

+
 def compare_2_list(*arr: list) -> pd.DataFrame:
     """
-    Compare lists then create DataFrame
+    Compare 2 lists then create DataFrame
     to see which items are missing

-
+    Parameters
+    ----------
+    arr : list
+        List
+
+    Returns
+    -------
+    DataFrame
+        Compare result
     """
     # Setup
     col_name = "list"
-    arr = [sorted(x) for x in arr]
+    arr = [sorted(x) for x in arr]  # map(sorted, arr)

     # Total array
     tarr = sorted(list(set(chain.from_iterable(arr))))
@@ -133,9 +128,10 @@ def compare_2_list(*arr: list) -> pd.DataFrame:

     df = pd.DataFrame(temp_dict)
     df["Compare"] = np.where(
-        df[f"{col_name}0"].apply(lambda x: str(x).lower())
-        df[f"{col_name}
-
+        df[f"{col_name}0"].apply(lambda x: str(x).lower())
+        == df[f"{col_name}1"].apply(lambda x: str(x).lower()),
+        df[f"{col_name}0"],  # Value when True
+        np.nan,  # Value when False
     )
     return df

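
Both helpers above are module-level functions. A minimal usage sketch of the 3.1.0 versions (illustrative only; the import path is assumed from the file location in the listing at the top):

    from absfuyu.extensions.extra.data_analysis import compare_2_list, equalize_df

    # Pad the shorter lists with NaN so the dict becomes a valid DataFrame payload
    equalized = equalize_df({"a": [1, 2, 3], "b": [4]})

    # Sort and align the lists; the "Compare" column keeps matches, NaN otherwise
    result = compare_2_list(["x", "y", "z"], ["y", "z", "k"])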
@@ -160,50 +156,126 @@ def rename_with_dict(df: pd.DataFrame, col: str, rename_dict: dict) -> pd.DataFr
     df[name] = df[name].apply(lambda x: "Other" if x in rename_val else x)
     return df

-
-
-
-
-    threshold: int = 10
-) -> pd.DataFrame:
+
+# Class
+###########################################################################
+class CityData(NamedTuple):
     """
-
-
+    Parameters
+    ----------
+    city : str
+        City name

-
-
-
-
+    region : str
+        Region of the city
+
+    area : str
+        Area of the region
     """
-    # Clean
-    df[col] = df[col].str.strip() # Remove trailing space
-
-    col_df = df.groupby(col)[col2].count().sort_values(ascending=False)/df.shape[0]*100 # percentage of col
-    name_of_type: list = col_df[col_df.values>=threshold].keys().to_list() # get all the `col` that has larger than threshold

-
-
+    city: str
+    region: str
+    area: str

-    df = rename_with_dict(df, col, rename_dict)

-
+class SplittedDF(NamedTuple):
+    """
+    DataFrame splitted into contains
+    missing values only and vice versa

+    Parameters
+    ----------
+    df : DataFrame
+        DataFrame without missing values

+    df_na : DataFrame
+        DataFrame with missing values only
+    """

+    df: pd.DataFrame
+    df_na: pd.DataFrame
+
+    @staticmethod
+    def concat_df(df_list: List[pd.DataFrame], join: str = "inner"):
+        """
+        Concat the list of DataFrame (static method)
+
+        Parameters
+        ----------
+        df_list : list[DataFrame]
+            A sequence of DataFrame
+
+        join : str
+            Join type
+            (Default: ``"inner"``)
+
+        Returns
+        -------
+        DataFrame
+            Joined DataFrame
+        """
+        df: pd.DataFrame = pd.concat(df_list, axis=0, join=join).reset_index()
+        df.drop(columns=["index"], inplace=True)
+        return df
+
+    def concat(self, join: str = "inner"):
+        """
+        Concat the splitted DataFrame
+
+        Parameters
+        ----------
+        join : str
+            Join type
+            (Default: ``"inner"``)
+
+        Returns
+        -------
+        DataFrame
+            Joined DataFrame
+        """
+        return self.concat_df(self, join=join)
+
+    @staticmethod
+    def divide_dataframe(df: pd.DataFrame, by_column: str) -> List[pd.DataFrame]:
+        """
+        Divide DataFrame into a list of DataFrame
+
+        Parameters
+        ----------
+        df : DataFrame
+            DataFrame
+
+        by_column : str
+            By which column
+
+        Returns
+        -------
+        list[DataFrame]
+            Splitted DataFrame
+        """
+        divided = [x for _, x in df.groupby(by_column)]
+        return divided
+
+
+##
+class PLTFormatString(NamedTuple):
+    """Matplotlib format string"""
+
+    marker: str
+    line_style: str
+    color: str

-# Class
-###########################################################################
-PLTFormatString = namedtuple("PLTFormatString", ["marker", "line_style", "color"])

 class _DictToAtrr:
     """Convert `keys` or `values` of `dict` into attribute"""
+
     def __init__(
-
-
-
-
-
-
+        self,
+        dict_data: dict,
+        *,
+        key_as_atrribute: bool = True,
+        remove_char: str = r"( ) [ ] { }",
+    ) -> None:
         """
         dict_data: Dictionary to convert
         key_as_atrribute: Use `dict.keys()` as atrribute when True, else use `dict.values()`
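
Of the NamedTuple helpers added here, SplittedDF also carries behaviour. A short sketch of its entry points (signatures as shown in the hunk; import path assumed from the file listing):

    import pandas as pd
    from absfuyu.extensions.extra.data_analysis import SplittedDF

    raw = pd.DataFrame({"k": [1, 1, 2], "v": ["a", None, "c"]})

    # Static helper: one sub-DataFrame per distinct value of "k"
    parts = SplittedDF.divide_dataframe(raw, by_column="k")

    # The tuple pairs the complete rows with the rows holding missing values
    split = SplittedDF(df=raw[~raw["v"].isna()], df_na=raw[raw["v"].isna()])
    rejoined = split.concat(join="inner")  # back to a single DataFrame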
@@ -222,9 +294,10 @@ class _DictToAtrr:

     def __str__(self) -> str:
         return f"{self.__class__.__name__}({self._keys})"
+
     def __repr__(self) -> str:
         return self.__str__()
-
+
     @staticmethod
     def _remove_space(value: str, remove_char: str) -> str:
         """
@@ -242,6 +315,7 @@ class MatplotlibFormatString:
     """
     Format string format: `[marker][line][color]` or `[color][marker][line]`
     """
+
     MARKER_LIST = {
         ".": "point marker",
         ",": "pixel marker",
@@ -267,13 +341,13 @@ class MatplotlibFormatString:
         "D": "diamond marker",
         "d": "thin_diamond marker",
         "|": "vline marker",
-        "_": "hline marker"
+        "_": "hline marker",
     }
     LINE_STYLE_LIST = {
         "-": "solid line style",
         "--": "dashed line style",
         "-.": "dash-dot line style",
-        ":": "dotted line style"
+        ":": "dotted line style",
     }
     COLOR_LIST = {
         "b": "blue",
@@ -283,7 +357,7 @@ class MatplotlibFormatString:
         "m": "magenta",
         "y": "yellow",
         "k": "black",
-        "w": "white"
+        "w": "white",
     }
     Marker = _DictToAtrr(MARKER_LIST, key_as_atrribute=False)
     LineStyle = _DictToAtrr(LINE_STYLE_LIST, key_as_atrribute=False)
@@ -291,7 +365,11 @@ class MatplotlibFormatString:

     @staticmethod
     def all_format_string() -> List[PLTFormatString]:
-        fmt_str = [
+        fmt_str = [
+            __class__.MARKER_LIST,
+            __class__.LINE_STYLE_LIST,
+            __class__.COLOR_LIST,
+        ]
         return [PLTFormatString._make(x) for x in list(product(*fmt_str))]

     @staticmethod
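
The rewritten all_format_string() simply takes the Cartesian product of the three lookup tables. The same computation stands alone as follows (a sketch over a trimmed set of keys):

    from itertools import product

    markers = [".", "o", "D"]       # subset of MARKER_LIST keys
    line_styles = ["-", "--", ":"]  # subset of LINE_STYLE_LIST keys
    colors = ["b", "r", "k"]        # subset of COLOR_LIST keys

    # Every "[marker][line][color]" combination, e.g. ".-b", "o--r", "D:k"
    fmt_strings = [f"{m}{ls}{c}" for m, ls, c in product(markers, line_styles, colors)]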
@@ -301,50 +379,540 @@ class MatplotlibFormatString:
             return f"{temp.marker}{temp.line_style}{temp.color}"
         else:
             return f"{temp.color}{temp.marker}{temp.line_style}"
-


-
-
+# Class - DA
+###########################################################################
+class DataAnalystDataFrame(pd.DataFrame):
+    """Data Analyst ``pd.DataFrame``"""
+
+    _DADF_Version = (1, 1, 0)
+
+    # Support
+    # ================================================================
+    # Rearrange column
+    def rearrange_column(self, insert_to_col: str, num_of_cols: int = 1):
+        """
+        Move right-most columns to selected position
+
+        Parameters
+        ----------
+        insert_to_col : str
+            Name of the column that the right-most column will be moved next to
+
+        num_of_cols : int
+            Number of columns moved
+
+        Returns
+        -------
+        DataAnalystDataFrame
+            Modified DataFrame
+        """
+        cols = self.columns.to_list()  # List of columns
+        num_of_cols = set_min_max(num_of_cols, min_value=1, max_value=len(cols))
+        col_index = cols.index(insert_to_col)
+        cols = (
+            cols[: col_index + 1]
+            + cols[-num_of_cols:]
+            + cols[col_index + 1 : len(cols) - num_of_cols]
+        )
+        self = __class__(self[cols])
+        return self
+
+    # Drop a list of column
+    def drop_columns(self, columns: List[str]):
+        """
+        Drop columns in DataFrame
+
+        Parameters
+        ----------
+        columns : list[str]
+            List of columns need to drop
+
+        Returns
+        -------
+        DataAnalystDataFrame
+            Modified DataFrame
+        """
+        for column in columns:
+            try:
+                self.drop(columns=[column], inplace=True)
+            except:
+                logger.debug(f"{column} column does not exist")
+                # pass
+        return self
+
+    # Drop right-most columns
+    def drop_rightmost(self, num_of_cols: int = 1):
+        """
+        Drop ``num_of_cols`` right-most columns
+
+        Parameters
+        ----------
+        num_of_cols : int
+            Number of columns to drop
+
+        Returns
+        -------
+        DataAnalystDataFrame
+            Modified DataFrame
+        """
+        # Restrain
+        # if num_of_cols < 1:
+        #     num_of_cols = 1
+        # if num_of_cols > self.shape[1]:
+        #     num_of_cols = self.shape[1]
+        num_of_cols = set_min_max(num_of_cols, min_value=1, max_value=self.shape[1])
+
+        # Logic
+        for _ in range(num_of_cols):
+            self.drop(self.columns[len(self.columns) - 1], axis=1, inplace=True)
+        return self
+
+    # Add blank column
+    def add_blank_column(self, column_name: str, fill: Any):
+        """
+        Add a blank column
+
+        Parameters
+        ----------
+        column_name : str
+            Name of the column to add
+
+        fill : Any
+            Fill the column with data
+
+        Returns
+        -------
+        DataAnalystDataFrame
+            Modified DataFrame
         """
-
+        self[column_name] = [fill] * self.shape[0]
+        return self
+
+    # Modify
+    # ================================================================
+    # Convert city
+    def convert_city(self, city_column: str, city_list: List[CityData] = None):
         """
-
-
-
-
+        Get region and area of a city
+
+        Parameters
+        ----------
+        city_column : str
+            Column contains city data
+
+        city_list : list[CityData]
+            List of city in correct format
+            (Default: ``None``)

-
+        Returns
+        -------
+        DataAnalystDataFrame
+            Modified DataFrame
         """
-
+
+        # Support function
+        def _convert_city_prep(
+            value: str,
+            rtype: str = "region",
+        ) -> str:
+            """
+            :param value: Value
+            :param rtype: "region" or "area"
+            :param city_list: list of cities with city, region, area
+            """
+            for x in city_list:
+                if x.city.lower().startswith(value.lower()):
+                    if rtype.lower().strip().startswith("region"):
+                        return x.region
+                    if rtype.lower().strip().startswith("area"):
+                        return x.area
+            return value
+
+        _convert_city_prep2 = partial(_convert_city_prep, rtype="area")
+
+        # Convert
+        self["region"] = self[city_column].apply(_convert_city_prep)
+        self["area"] = self[city_column].apply(_convert_city_prep2)
+
+        # Rearrange
+        return self.rearrange_column(city_column, 2)
+
+    # Date related
+    def add_date_from_month(self, month_column: str, *, col_name: str = "date"):
         """
-
-
-
-
+        Add dummy ``date`` column from ``month`` column
+
+        Parameters
+        ----------
+        month_column : str
+            Month column
+
+        col_name : str
+            New date column name
+            (Default: ``"date"``)

-
-
-
-
-
-
-
-"
-"
-
-
-
-
-
-
-
-
+        Returns
+        -------
+        DataAnalystDataFrame
+            Modified DataFrame
+        """
+        _this_year = datetime.now().year
+        self[col_name] = pd.to_datetime(
+            f"{_this_year}-" + self[month_column].astype(int).astype(str) + "-1",
+            format="%Y-%m-%d",
+        )
+        # Rearrange
+        return self.rearrange_column(month_column)
+
+    def add_detail_date(self, date_column: str, mode: str = "dwmy"):
+        """
+        Add these columns from ``date_column``:
+        - ``date`` (won't add if ``date_column`` value is ``"date"``)
+        - ``day`` (overwrite if already exist)
+        - ``week`` (overwrite if already exist)
+        - ``month`` (overwrite if already exist)
+        - ``year`` (overwrite if already exist)
+
+        Parameters
+        ----------
+        date_column : str
+            Date column
+
+        mode : str
+            | Detailed column to add
+            | ``d``: day
+            | ``w``: week number
+            | ``m``: month
+            | ``y``: year
+            | (Default: ``"dwmy"``)
+
+        Returns
+        -------
+        DataAnalystDataFrame
+            Modified DataFrame
+        """
+        # Convert to datetime
+        self["date"] = pd.to_datetime(self[date_column])
+
+        # Logic
+        col_counter = 0
+        # self["weekday"] = self["day"].dt.isocalendar().day  # Weekday
+        if mode.find("d") != -1:
+            logger.debug("Mode: 'day'")
+            self["day"] = self["date"].dt.day
+            col_counter += 1
+        if mode.find("w") != -1:
+            logger.debug("Mode: 'weekday'")
+            self["week"] = self["date"].dt.isocalendar().week
+            col_counter += 1
+        if mode.find("m") != -1:
+            logger.debug("Mode: 'month'")
+            self["month"] = self["date"].dt.month
+            col_counter += 1
+        if mode.find("y") != -1:
+            logger.debug("Mode: 'year'")
+            self["year"] = self["date"].dt.year
+            col_counter += 1
+        return self.rearrange_column(date_column, col_counter)
+
+    def delta_date(
+        self, date_column: str, mode: str = "now", *, col_name: str = "delta_date"
+    ):
+        """
+        Calculate date interval
+
+        Parameters
+        ----------
+        date_column : str
+            Date column
+
+        mode : str
+            | Mode to calculate
+            | ``"between_row"``: Calculate date interval between each row
+            | ``"now"``: Calculate date interval to current date
+            | (Default: ``"between_row"``)
+
+        col_name : str
+            | New delta date column name
+            | (Default: ``"delta_date"``)
+
+        Returns
+        -------
+        DataAnalystDataFrame
+            Modified DataFrame
+        """
+        if mode.lower().startswith("between_row"):
+            dated = self[date_column].to_list()
+            cal = []
+            for i in range(len(dated)):
+                if i == 0:
+                    cal.append(dated[i] - dated[i])
+                    # cal.append(relativedelta(dated[i], dated[i]))
+                else:
+                    cal.append(dated[i] - dated[i - 1])
+                    # cal.append(relativedelta(dated[i], dated[i - 1]))
+            self[col_name] = [x.days for x in cal]
+            return self
+        else:  # mode="now"
+            self[col_name] = self[date_column].apply(lambda x: (datetime.now() - x).days)
+            return self
+
+    # Fill missing value
+    def fill_missing_values(
+        self, column_name: str, fill: Any = np.nan, *, fill_when_not_exist: Any = np.nan
+    ):
+        """
+        Fill missing values in specified column
+
+        Parameters
+        ----------
+        column_name : str
+            Column name
+
+        fill : Any
+            Fill the missing values with
+            (Default: ``np.nan``)
+
+        fill_when_not_exist : Any
+            When ``column_name`` does not exist,
+            create a new column and fill with ``fill_when_not_exist``
+            (Default: ``np.nan``)
+
+        Returns
+        -------
+        DataAnalystDataFrame
+            Modified DataFrame
+        """
+        try:
+            self[column_name] = self[column_name].fillna(fill)
+        except:
+            self.add_blank_column(column_name, fill_when_not_exist)
+        return self
+
+    # Split DataFrame
+    def split_na(self, by_column: str) -> SplittedDF:
+        """
+        Split DataFrame into 2 parts:
+        - Without missing value in specified column
+        - With missing value in specified column
+
+        Parameters
+        ----------
+        by_column : str
+            Split by column
+
+        Returns
+        -------
+        SplittedDF
+            Splitted DataFrame
+        """
+        out = SplittedDF(
+            df=self[~self[by_column].isna()],  # DF
+            df_na=self[self[by_column].isna()],  # DF w/o NA
+        )
+        return out
+
+    # Threshold filter
+    def threshold_filter(
+        self,
+        destination_column: str,
+        threshold: Union[int, float] = 10,
+        *,
+        top: Optional[int] = None,
+        group_by_column: Optional[str] = None,
+        replace_with: Any = "Other",
+    ):
+        """
+        Filter out percentage of data that smaller than the ``threshold``,
+        replace all of the smaller data to ``replace_with``.
+        As a result, pie chart is less messy.
+        Version: 1.1.0
+
+        Parameters
+        ----------
+        destination_column : str
+            Column to be filtered
+
+        threshold : int | float
+            Which percentage to cut-off
+            (Default: 10%)
+
+        top : int
+            Only show top ``x`` categories in pie chart
+            (replace threshold mode)
+            (Default: ``None``)
+
+        group_by_column : str
+            Calculate threshold for each category in selected column [W.I.P]
+            (Default: ``None``)
+
+        replace_with : Any
+            Replace all of the smaller data with specified value
+
+        Returns
+        -------
+        DataAnalystDataFrame
+            Modified DataFrame
+        """
+        # Clean
+        try:
+            self[destination_column] = self[
+                destination_column
+            ].str.strip()  # Remove trailing space
+        except:
+            pass
+
+        # Logic
+        if group_by_column is None:
+            # Get a column with no missing values
+            col_with_no_na = ""
+            for col_name in self.columns:
+                if col_name == destination_column:
+                    continue
+                if self[col_name].isna().sum() == 0:
+                    col_with_no_na = col_name
+                    break
+            if col_with_no_na == "":
+                # CASE: every col has NA else where
+                for col_name in self.columns:
+                    if col_name == destination_column:
+                        continue
+                    else:
+                        col_with_no_na = col_name
+                        break
+                self[col_with_no_na].fillna("N/A")
+
+            # Calculate threshold
+            col_df = (
+                self.groupby(destination_column)
+                .count()[col_with_no_na]
+                .sort_values(ascending=False)
+                .to_frame()
+                .reset_index()
+            )
+            col_df.rename(columns={col_with_no_na: "total_count"}, inplace=True)
+            col_df["percentage"] = col_df["total_count"] / self.shape[0] * 100
+            # logger.debug(col_df)  # Show calculation result
+        else:
+            # Real logic: manually select a column to perform percentage calculation
+            # Calculate threshold for each category in selected column may be will be added in the future
+            col_df = (
+                self.groupby(destination_column)[group_by_column]
+                .count()
+                .sort_values(ascending=False)
+                / self.shape[0]
+                * 100
+            )  # percentage of destination_column
+            col_df = col_df.reset_index()
+            col_df.rename(columns={group_by_column: "percentage"}, inplace=True)
+
+        # Rename
+        if top is not None:
+            list_of_keep: list = (
+                col_df[destination_column]
+                .head(set_min_max(top - 1, min_value=1, max_value=col_df.shape[0]))
+                .to_list()
+            )
+            # logger.debug(list_of_keep)
+        else:
+            list_of_keep: list = col_df[col_df["percentage"] >= threshold][
+                destination_column
+            ].to_list()  # values that will not be renamed
+        self[f"{destination_column}_filtered"] = self[destination_column].apply(
+            lambda x: replace_with if x not in list_of_keep else x
+        )
+
+        # Return
+        return self
+
+    # Info
+    # ================================================================
+    def get_missing_values(self, hightlight: bool = True) -> pd.DataFrame:
+        """
+        Get a DataFrame contains count of missing values for each column
+
+        Parameters
+        ----------
+        hightlight : bool
+            Shows only columns with missing values when ``True``
+            (Default: ``True``)
+
+        Returns
+        -------
+        DataFrame
+            Missing value DataFrame
+        """
+        # Check for missing value
+        df_na = self.isnull().sum().sort_values(ascending=False)
+        if hightlight:
+            out = df_na[df_na != 0].to_frame()
+        else:
+            out = df_na.to_frame()
+        out.rename(columns={0: "Num of N/A"}, inplace=True)
+        return out

+    # Sample DataFrame
+    @classmethod
+    def sample_df(cls, size: int = 100):
+        """
+        Create sample DataFrame
+
+        Parameters
+        ----------
+        size : int
+            Number of observations
+            (Default: ``100``)
+
+        Returns
+        -------
+        DataAnalystDataFrame
+            DataFrame with these columns:
+            [number, number_big, number_range, missing_value, text, date]
+        """
+        # Restrain
+        size = set_min(size, min_value=1)
+
+        # Number col
+        df = pd.DataFrame(np.random.randn(size, 1), columns=["number"])
+        df["number_big"] = [
+            random.choice(range(100, 999)) for _ in range(size)
+        ]  # Big number in range 100-999
+        df["number_range"] = df["number_big"].apply(lambda x: str(x)[0] + "00")
+
+        # Missing value col
+        na_rate = random.randint(1, 99)
+        d = [random.randint(1, 99) for _ in range(size)]
+        df["missing_value"] = list(map(lambda x: x if x < na_rate else np.nan, d))
+        # df["missing_value"] = [random.choice([random.randint(1, 99), np.nan]) for _ in range(observations)]
+
+        # Text col
+        df["text"] = [
+            "".join([random.choice(string.ascii_lowercase) for _ in range(8)])
+            for _ in range(size)
+        ]
+
+        # Random date col
+        df["date"] = [
+            datetime(
+                year=random.randint(datetime.now().year - 20, datetime.now().year),
+                month=random.randint(1, 12),
+                day=random.randint(1, 28),
+            )
+            for _ in range(size)
+        ]
+
+        # Return
+        return cls(df)
+
+
+class DADF(DataAnalystDataFrame):
+    """Short name for ``DataAnalystDataFrame``"""

-class SeriesKai(pd.Series):
     pass

+
 # Run
 ###########################################################################
 if __name__ == "__main__":