absfuyu 5.0.1__py3-none-any.whl → 5.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of absfuyu might be problematic.
- absfuyu/__init__.py +1 -1
- absfuyu/__main__.py +2 -2
- absfuyu/cli/__init__.py +2 -2
- absfuyu/cli/color.py +30 -14
- absfuyu/cli/config_group.py +9 -2
- absfuyu/cli/do_group.py +13 -6
- absfuyu/cli/game_group.py +9 -2
- absfuyu/cli/tool_group.py +16 -9
- absfuyu/config/__init__.py +2 -2
- absfuyu/core/__init__.py +2 -2
- absfuyu/core/baseclass.py +448 -79
- absfuyu/core/baseclass2.py +2 -2
- absfuyu/core/decorator.py +69 -3
- absfuyu/core/docstring.py +2 -2
- absfuyu/core/dummy_cli.py +2 -2
- absfuyu/core/dummy_func.py +13 -2
- absfuyu/core/typings.py +40 -0
- absfuyu/dxt/__init__.py +2 -2
- absfuyu/dxt/dictext.py +2 -2
- absfuyu/dxt/dxt_support.py +2 -2
- absfuyu/dxt/intext.py +31 -3
- absfuyu/dxt/listext.py +28 -3
- absfuyu/dxt/strext.py +2 -2
- absfuyu/extra/__init__.py +2 -2
- absfuyu/extra/beautiful.py +2 -2
- absfuyu/extra/da/__init__.py +36 -0
- absfuyu/extra/da/dadf.py +1138 -0
- absfuyu/extra/da/dadf_base.py +186 -0
- absfuyu/extra/da/df_func.py +97 -0
- absfuyu/extra/da/mplt.py +219 -0
- absfuyu/extra/data_analysis.py +10 -1067
- absfuyu/fun/__init__.py +2 -2
- absfuyu/fun/tarot.py +2 -2
- absfuyu/game/__init__.py +2 -2
- absfuyu/game/game_stat.py +2 -2
- absfuyu/game/sudoku.py +2 -2
- absfuyu/game/tictactoe.py +2 -2
- absfuyu/game/wordle.py +2 -2
- absfuyu/general/__init__.py +2 -2
- absfuyu/general/content.py +2 -2
- absfuyu/general/human.py +2 -2
- absfuyu/general/shape.py +2 -2
- absfuyu/logger.py +2 -2
- absfuyu/pkg_data/__init__.py +2 -2
- absfuyu/pkg_data/deprecated.py +2 -2
- absfuyu/sort.py +2 -2
- absfuyu/tools/__init__.py +25 -2
- absfuyu/tools/checksum.py +27 -7
- absfuyu/tools/converter.py +93 -28
- absfuyu/tools/generator.py +2 -2
- absfuyu/tools/inspector.py +433 -0
- absfuyu/tools/keygen.py +2 -2
- absfuyu/tools/obfuscator.py +45 -7
- absfuyu/tools/passwordlib.py +87 -22
- absfuyu/tools/shutdownizer.py +2 -2
- absfuyu/tools/web.py +2 -2
- absfuyu/util/__init__.py +2 -2
- absfuyu/util/api.py +2 -2
- absfuyu/util/json_method.py +2 -2
- absfuyu/util/lunar.py +2 -2
- absfuyu/util/path.py +190 -82
- absfuyu/util/performance.py +4 -4
- absfuyu/util/shorten_number.py +40 -10
- absfuyu/util/text_table.py +272 -0
- absfuyu/util/zipped.py +6 -6
- absfuyu/version.py +2 -2
- {absfuyu-5.0.1.dist-info → absfuyu-5.1.0.dist-info}/METADATA +9 -2
- absfuyu-5.1.0.dist-info/RECORD +76 -0
- absfuyu-5.0.1.dist-info/RECORD +0 -68
- {absfuyu-5.0.1.dist-info → absfuyu-5.1.0.dist-info}/WHEEL +0 -0
- {absfuyu-5.0.1.dist-info → absfuyu-5.1.0.dist-info}/entry_points.txt +0 -0
- {absfuyu-5.0.1.dist-info → absfuyu-5.1.0.dist-info}/licenses/LICENSE +0 -0
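The headline change is that `absfuyu/extra/data_analysis.py` (shrunk from roughly 1,080 lines to 20) has been split into the new `absfuyu/extra/da/` subpackage (`dadf.py`, `dadf_base.py`, `df_func.py`, `mplt.py`), with the old module kept as a deprecated re-export shim. A minimal sketch of the new import surface, inferred from the re-exports in the diff below (the contents of the new modules themselves are not expanded here):

```python
# Data-analysis features still sit behind the optional extra:
#   python -m pip install -U "absfuyu[full]"

# Canonical imports after 5.1.0 (paths taken from the re-export shim below):
from absfuyu.extra.da.dadf import DADF
from absfuyu.extra.da.df_func import compare_2_list, equalize_df, rename_with_dict
```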
absfuyu/extra/data_analysis.py CHANGED

```diff
@@ -2,1077 +2,20 @@
 Absfuyu: Data Analysis [W.I.P]
 ------------------------------
 Extension for ``pd.DataFrame``
+(deprecated)
 
-Version: 5.
-Date updated:
+Version: 5.1.0
+Date updated: 10/03/2025 (dd/mm/yyyy)
 """
 
-# Module level
-# ---------------------------------------------------------------------------
-__all__ = [
-    # Function
-    "compare_2_list",
-    # Support
-    "CityData",
-    "SplittedDF",
-    "PLTFormatString",
-    # Main
-    "MatplotlibFormatString",
-    "DataAnalystDataFrame",
-    "DADF",
-]
-
-
 # Library
 # ---------------------------------------------------------------------------
-import
-import
-
-
-
-
-
-DA_MODE = False
-
-try:
-    import numpy as np
-    import pandas as pd
-except ImportError:
-    from subprocess import run
-
-    from absfuyu.config import ABSFUYU_CONFIG
-
-    if ABSFUYU_CONFIG._get_setting("auto-install-extra").value:
-        cmd = "python -m pip install -U absfuyu[full]".split()
-        run(cmd)
-    else:
-        raise SystemExit("This feature is in absfuyu[full] package")  # noqa: B904
-else:
-    DA_MODE = True
-
-
-from absfuyu.core import ShowAllMethodsMixin, versionadded  # noqa: E402
-from absfuyu.logger import logger  # noqa: E402
-from absfuyu.util import set_min, set_min_max  # noqa: E402
-
-
-# Function
-# ---------------------------------------------------------------------------
-def equalize_df(data: dict[str, list], fillna=np.nan) -> dict[str, list]:
-    """
-    Make all list in dict have equal length to make pd.DataFrame
-
-    :param data: `dict` data that ready for `pd.DataFrame`
-    :param fillna: Fill N/A value (Default: `np.nan`)
-    """
-    max_len = max(map(len, data.values()))
-    for _, v in data.items():
-        if len(v) < max_len:
-            missings = max_len - len(v)
-            for _ in range(missings):
-                v.append(fillna)
-    return data
-
-
```
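`equalize_df` pads every list in the dict up to the longest one, so `pd.DataFrame(data)` stops raising a length-mismatch `ValueError`. A small usage sketch, assuming the function moved unchanged to `absfuyu.extra.da.df_func`:

```python
import pandas as pd
from absfuyu.extra.da.df_func import equalize_df

data = {"a": [1, 2, 3], "b": [4]}
padded = equalize_df(data)  # mutates in place and returns: "b" -> [4, nan, nan]
df = pd.DataFrame(padded)   # now a valid 3x2 frame
```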
```diff
-def compare_2_list(*arr) -> pd.DataFrame:
-    """
-    Compare 2 lists then create DataFrame
-    to see which items are missing
-
-    Parameters
-    ----------
-    arr : list
-        List
-
-    Returns
-    -------
-    DataFrame
-        Compare result
-    """
-    # Setup
-    col_name = "list"
-    arr = [sorted(x) for x in arr]  # type: ignore # map(sorted, arr)
-
-    # Total array
-    tarr = sorted(list(set(chain.from_iterable(arr))))
-    # max_len = len(tarr)
-
-    # Temp dataset
-    temp_dict = {"base": tarr}
-    for idx, x in enumerate(arr):
-        name = f"{col_name}{idx}"
-
-        # convert list
-        temp = [item if item in x else np.nan for item in tarr]
-
-        temp_dict.setdefault(name, temp)
-
-    df = pd.DataFrame(temp_dict)
-    df["Compare"] = np.where(
-        df[f"{col_name}0"].apply(lambda x: str(x).lower())
-        == df[f"{col_name}1"].apply(lambda x: str(x).lower()),
-        df[f"{col_name}0"],  # Value when True
-        np.nan,  # Value when False
-    )
-    return df
-
-
```
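Note that despite the variadic `*arr` signature, the `Compare` column is only computed between the first two lists (`list0` and `list1`), so this is effectively a two-list comparison. A usage sketch under the same relocation assumption:

```python
from absfuyu.extra.da.df_func import compare_2_list

df = compare_2_list(["a", "b", "c"], ["b", "c", "d"])
# "base" is the sorted union; "list0"/"list1" hold the item or NaN;
# "Compare" keeps an item only where both lists agree (case-insensitive).
print(df)
```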
```diff
-def rename_with_dict(df: pd.DataFrame, col: str, rename_dict: dict) -> pd.DataFrame:
-    """
-    Version: 2.0.0
-    :param df: DataFrame
-    :param col: Column name
-    :param rename_dict: Rename dictionary
-    """
-
-    name = f"{col}_filtered"
-    df[name] = df[col]
-    rename_val = list(rename_dict.keys())
-    df[name] = df[name].apply(lambda x: "Other" if x in rename_val else x)
-    return df
-
-
```
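Worth noting: despite its name, `rename_with_dict` never applies the dictionary's values. It writes a new `<col>_filtered` column in which any value appearing among the dictionary's *keys* is replaced with the literal `"Other"`. A sketch of the actual behavior:

```python
import pandas as pd
from absfuyu.extra.da.df_func import rename_with_dict

df = pd.DataFrame({"fruit": ["apple", "banana", "cherry"]})
out = rename_with_dict(df, "fruit", {"banana": "ignored"})  # dict value unused
print(out["fruit_filtered"].tolist())  # ['apple', 'Other', 'cherry']
```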
```diff
-# Class
-# ---------------------------------------------------------------------------
-class CityData(NamedTuple):
-    """
-    Parameters
-    ----------
-    city : str
-        City name
-
-    region : str
-        Region of the city
-
-    area : str
-        Area of the region
-    """
-
-    city: str
-    region: str
-    area: str
-
-    @staticmethod
-    def _sample_city_data(size: int = 100) -> list:
-        """
-        Generate sample city data (testing purpose)
-        """
-        sample_range = 10 ** len(str(size))
-
-        # Serial list
-        serials: list[str] = []
-        while len(serials) != size:  # Unique serial
-            serial = random.randint(0, sample_range - 1)
-            serial = str(serial).rjust(len(str(size)), "0")  # type: ignore
-            if serial not in serials:  # type: ignore
-                serials.append(serial)  # type: ignore
-
-        ss2 = deque(serials[: int(len(serials) / 2)])  # Cut half for region
-        ss2.rotate(random.randrange(1, 5))
-        [ss2.extend(ss2) for _ in range(2)]  # type: ignore # Extend back
-
-        ss3 = deque(serials[: int(len(serials) / 4)])  # Cut forth for area
-        ss3.rotate(random.randrange(1, 5))
-        [ss3.extend(ss3) for _ in range(4)]  # type: ignore # Extend back
-
-        serials = ["city_" + x for x in serials]
-        ss2 = ["region_" + x for x in ss2]  # type: ignore
-        ss3 = ["area_" + x for x in ss3]  # type: ignore
-
-        ss = list(zip(serials, ss2, ss3))  # Zip back
-        out = list(map(CityData._make, ss))
-
-        return out
-
-
-class SplittedDF(NamedTuple):
-    """
-    DataFrame splitted into contains
-    missing values only and vice versa
-
-    Parameters
-    ----------
-    df : DataFrame
-        DataFrame without missing values
-
-    df_na : DataFrame
-        DataFrame with missing values only
-    """
-
-    df: pd.DataFrame
-    df_na: pd.DataFrame
-
-    @staticmethod
-    def concat_df(
-        df_list: list[pd.DataFrame], join: Literal["inner", "outer"] = "inner"
-    ) -> pd.DataFrame:
-        """
-        Concat the list of DataFrame (static method)
-
-        Parameters
-        ----------
-        df_list : list[DataFrame]
-            A sequence of DataFrame
-
-        join : str
-            Join type
-            (Default: ``"inner"``)
-
-        Returns
-        -------
-        DataFrame
-            Joined DataFrame
-        """
-        df: pd.DataFrame = pd.concat(df_list, axis=0, join=join).reset_index()
-        df.drop(columns=["index"], inplace=True)
-        return df
-
-    def concat(self, join: Literal["inner", "outer"] = "inner") -> pd.DataFrame:
-        """
-        Concat the splitted DataFrame
-
-        Parameters
-        ----------
-        join : str
-            Join type
-            (Default: ``"inner"``)
-
-        Returns
-        -------
-        DataFrame
-            Joined DataFrame
-        """
-        return self.concat_df(self, join=join)  # type: ignore
-
-    @staticmethod
-    def divide_dataframe(df: pd.DataFrame, by_column: str) -> list[pd.DataFrame]:
-        """
-        Divide DataFrame into a list of DataFrame
-
-        Parameters
-        ----------
-        df : DataFrame
-            DataFrame
-
-        by_column : str
-            By which column
-
-        Returns
-        -------
-        list[DataFrame]
-            Splitted DataFrame
-        """
-        divided = [x for _, x in df.groupby(by_column)]
-        return divided
-
-
-##
```
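`SplittedDF` bundles a frame without missing values together with its missing-value complement, and also carries the group-split/concat helpers. A sketch of the two static helpers, assuming the class now lives in the new `da` subpackage (likely `dadf_base.py`, which this diff does not expand):

```python
import pandas as pd
from absfuyu.extra.da.dadf_base import SplittedDF  # assumed new home

df = pd.DataFrame({"g": ["x", "x", "y"], "v": [1, 2, 3]})
parts = SplittedDF.divide_dataframe(df, "g")  # one DataFrame per "g" value
whole = SplittedDF.concat_df(parts)           # stitched back, index reset
assert whole.shape == df.shape
```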
```diff
-class PLTFormatString(NamedTuple):
-    """Matplotlib format string"""
-
-    marker: str
-    line_style: str
-    color: str
-
-
-class _DictToAtrr:
-    """Convert `keys` or `values` of `dict` into attribute"""
-
-    def __init__(
-        self,
-        dict_data: dict,
-        *,
-        key_as_atrribute: bool = True,
-        remove_char: str = r"( ) [ ] { }",
-    ) -> None:
-        """
-        dict_data: Dictionary to convert
-        key_as_atrribute: Use `dict.keys()` as atrribute when True, else use `dict.values()`
-        remove_char: Characters that excluded from attribute name
-        """
-        self._data = dict_data
-
-        if key_as_atrribute:
-            # temp = list(map(self._remove_space, self._data.keys()))
-            temp = [self._remove_space(x, remove_char) for x in self._data.keys()]
-            [self.__setattr__(k, v) for k, v in zip(temp, self._data.values())]  # type: ignore
-        else:
-            temp = [self._remove_space(x, remove_char) for x in self._data.values()]
-            [self.__setattr__(k, v) for k, v in zip(temp, self._data.keys())]  # type: ignore
-        self._keys = temp
-
-    def __str__(self) -> str:
-        return f"{self.__class__.__name__}({self._keys})"
-
-    def __repr__(self) -> str:
-        return self.__str__()
-
-    @staticmethod
-    def _remove_space(value: str, remove_char: str) -> str:
-        """
-        Remove special characters and replace space with underscore
-        """
-        remove_char = remove_char.split(" ")  # type: ignore
-        logger.debug(remove_char)
-        for x in remove_char:
-            value = value.replace(x, "")
-        value = value.replace(" ", "_")
-        return value
-
-
-class MatplotlibFormatString:
-    """
-    Format string format: `[marker][line][color]` or `[color][marker][line]`
-    """
-
-    MARKER_LIST: ClassVar[dict[str, str]] = {
-        ".": "point marker",
-        ",": "pixel marker",
-        "o": "circle marker",
-        "v": "triangle_down marker",
-        "^": "triangle_up marker",
-        "<": "triangle_left marker",
-        ">": "triangle_right marker",
-        "1": "tri_down marker",
-        "2": "tri_up marker",
-        "3": "tri_left marker",
-        "4": "tri_right marker",
-        "8": "octagon marker",
-        "s": "square marker",
-        "p": "pentagon marker",
-        "P": "plus (filled) marker",
-        "*": "star marker",
-        "h": "hexagon1 marker",
-        "H": "hexagon2 marker",
-        "+": "plus marker",
-        "x": "x marker",
-        "X": "x (filled) marker",
-        "D": "diamond marker",
-        "d": "thin_diamond marker",
-        "|": "vline marker",
-        "_": "hline marker",
-    }
-    LINE_STYLE_LIST: ClassVar[dict[str, str]] = {
-        "-": "solid line style",
-        "--": "dashed line style",
-        "-.": "dash-dot line style",
-        ":": "dotted line style",
-    }
-    COLOR_LIST: ClassVar[dict[str, str]] = {
-        "b": "blue",
-        "g": "green",
-        "r": "red",
-        "c": "cyan",
-        "m": "magenta",
-        "y": "yellow",
-        "k": "black",
-        "w": "white",
-    }
-    Marker = _DictToAtrr(MARKER_LIST, key_as_atrribute=False)
-    LineStyle = _DictToAtrr(LINE_STYLE_LIST, key_as_atrribute=False)
-    Color = _DictToAtrr(COLOR_LIST, key_as_atrribute=False)
-
-    @classmethod
-    def all_format_string(cls) -> list[PLTFormatString]:
-        fmt_str = [
-            cls.MARKER_LIST,
-            cls.LINE_STYLE_LIST,
-            cls.COLOR_LIST,
-        ]
-        return [PLTFormatString._make(x) for x in list(product(*fmt_str))]
-
-    @staticmethod
-    def get_random(alt: bool = False) -> str:
-        temp = random.choice(__class__.all_format_string())  # type: ignore
-        if alt:
-            return f"{temp.marker}{temp.line_style}{temp.color}"
-        else:
-            return f"{temp.color}{temp.marker}{temp.line_style}"
-
```
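`MatplotlibFormatString` enumerates every `[color][marker][line]` combination: 25 markers x 4 line styles x 8 colors = 800 format strings, with `_DictToAtrr` providing reverse lookups (one attribute per dict value). A sketch, assuming the class moved to the new `mplt.py` module added in this release:

```python
from absfuyu.extra.da.mplt import MatplotlibFormatString  # assumed new home

assert len(MatplotlibFormatString.all_format_string()) == 800  # 25 * 4 * 8
fmt = MatplotlibFormatString.get_random()  # e.g. "go--": green, circle, dashed
MatplotlibFormatString.Color.blue                   # -> "b"  (reverse lookup)
MatplotlibFormatString.LineStyle.dashed_line_style  # -> "--"
```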
```diff
+from absfuyu.extra.da.dadf import DADF  # noqa
+from absfuyu.extra.da.df_func import (  # noqa
+    compare_2_list,
+    equalize_df,
+    rename_with_dict,
+)
 
 # Class - DA
-#
```
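The six added lines above are now the entire body of the module: `data_analysis.py` becomes a backward-compatibility shim, so existing imports keep resolving to the relocated objects:

```python
from absfuyu.extra.da.dadf import DADF
from absfuyu.extra.data_analysis import DADF as LegacyDADF  # deprecated path

assert DADF is LegacyDADF  # the shim re-exports the very same class
```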
```diff
-class DataAnalystDataFrame(ShowAllMethodsMixin, pd.DataFrame):
-    """
-    Data Analyst ``pd.DataFrame``
-    """
-
-    # Support
-    # ================================================================
-    # Rearrange column
-    def rearrange_column(self, insert_to_col: str, num_of_cols: int = 1) -> Self:
-        """
-        Move right-most columns to selected position
-
-        Parameters
-        ----------
-        insert_to_col : str
-            Name of the column that the right-most column will be moved next to
-
-        num_of_cols : int
-            Number of columns moved
-
-        Returns
-        -------
-        DataAnalystDataFrame
-            Modified DataFrame
-        """
-        cols = self.columns.to_list()  # List of columns
-        num_of_cols = int(set_min_max(num_of_cols, min_value=1, max_value=len(cols)))
-        col_index = cols.index(insert_to_col)
-        cols = (
-            cols[: col_index + 1]
-            + cols[-num_of_cols:]
-            + cols[col_index + 1 : len(cols) - num_of_cols]
-        )
-        self = self.__class__(self[cols])
-        return self
-
```
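`rearrange_column` moves the `num_of_cols` right-most columns to sit directly after `insert_to_col`; it is the positioning primitive the converters below rely on. A sketch, assuming the method moved unchanged into `absfuyu.extra.da.dadf`:

```python
import pandas as pd
from absfuyu.extra.da.dadf import DADF

df = DADF(pd.DataFrame({"a": [1], "b": [2], "new": [3]}))
df = df.rearrange_column("a")   # move right-most column next to "a"
print(df.columns.to_list())     # ['a', 'new', 'b']
```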
```diff
-    # Drop a list of column
-    def drop_columns(self, columns: list[str]) -> Self:
-        """
-        Drop columns in DataFrame
-
-        Parameters
-        ----------
-        columns : list[str]
-            List of columns need to drop
-
-        Returns
-        -------
-        DataAnalystDataFrame
-            Modified DataFrame
-        """
-        for column in columns:
-            try:
-                self.drop(columns=[column], inplace=True)
-            except Exception:
-                logger.debug(f"{column} column does not exist")
-                # pass
-        return self
-
-    # Drop right-most columns
-    def drop_rightmost(self, num_of_cols: int = 1) -> Self:
-        """
-        Drop ``num_of_cols`` right-most columns
-
-        Parameters
-        ----------
-        num_of_cols : int
-            Number of columns to drop
-
-        Returns
-        -------
-        DataAnalystDataFrame
-            Modified DataFrame
-        """
-        # Restrain
-        # if num_of_cols < 1:
-        #     num_of_cols = 1
-        # if num_of_cols > self.shape[1]:
-        #     num_of_cols = self.shape[1]
-        num_of_cols = int(
-            set_min_max(num_of_cols, min_value=1, max_value=self.shape[1])
-        )
-
-        # Logic
-        for _ in range(num_of_cols):
-            self.drop(self.columns[len(self.columns) - 1], axis=1, inplace=True)
-        return self
-
-    # Add blank column
-    def add_blank_column(self, column_name: str, fill: Any) -> Self:
-        """
-        Add a blank column
-
-        Parameters
-        ----------
-        column_name : str
-            Name of the column to add
-
-        fill : Any
-            Fill the column with data
-
-        Returns
-        -------
-        DataAnalystDataFrame
-            Modified DataFrame
-        """
-        self[column_name] = [fill] * self.shape[0]
-        return self
-
-    # Modify
-    # ================================================================
-    # Convert city
-    def convert_city(
-        self,
-        city_column: str,
-        city_list: list[CityData],
-        *,
-        mode: str = "ra",
-    ) -> Self:
-        """
-        Get ``region`` and ``area`` of a city
-
-        Parameters
-        ----------
-        city_column : str
-            Column contains city data
-
-        city_list : list[CityData]
-            List of city in correct format
-            (Default: ``None``)
-
-        mode : str
-            | Detailed column to add
-            | ``r``: region
-            | ``a``: area
-            | (Default: ``"ra"``)
-
-        Returns
-        -------
-        DataAnalystDataFrame
-            Modified DataFrame
-        """
-
-        # Support function
-        def _convert_city_support(value: str) -> CityData:
-            for x in city_list:
-                if x.city.lower().startswith(value.lower()):
-                    return x
-            return CityData(city=value, region=np.nan, area=np.nan)  # type: ignore
-
-        # Convert
-        col_counter = 0
-        if mode.find("r") != -1:
-            logger.debug("Mode: 'region'")
-            self["region"] = self[city_column].apply(
-                lambda x: _convert_city_support(x).region
-            )
-            col_counter += 1
-        if mode.find("a") != -1:
-            logger.debug("Mode: 'area'")
-            self["area"] = self[city_column].apply(
-                lambda x: _convert_city_support(x).area
-            )
-            col_counter += 1
-
-        # Rearrange
-        return self.rearrange_column(city_column, col_counter)
-
-    # Date related
-    def add_date_from_month(self, month_column: str, *, col_name: str = "date") -> Self:
-        """
-        Add dummy ``date`` column from ``month`` column
-
-        Parameters
-        ----------
-        month_column : str
-            Month column
-
-        col_name : str
-            New date column name
-            (Default: ``"date"``)
-
-        Returns
-        -------
-        DataAnalystDataFrame
-            Modified DataFrame
-        """
-        _this_year = datetime.now().year
-        self[col_name] = pd.to_datetime(
-            f"{_this_year}-" + self[month_column].astype(int).astype(str) + "-1",
-            format="%Y-%m-%d",
-        )
-        # Rearrange
-        return self.rearrange_column(month_column)
-
-    def add_detail_date(self, date_column: str, mode: str = "dwmy") -> Self:
-        """
-        Add these columns from ``date_column``:
-        - ``date`` (won't add if ``date_column`` value is ``"date"``)
-        - ``day`` (overwrite if already exist)
-        - ``week`` (overwrite if already exist)
-        - ``month`` (overwrite if already exist)
-        - ``year`` (overwrite if already exist)
-
-        Parameters
-        ----------
-        date_column : str
-            Date column
-
-        mode : str
-            | Detailed column to add
-            | ``d``: day
-            | ``w``: week number
-            | ``m``: month
-            | ``y``: year
-            | (Default: ``"dwmy"``)
-
-        Returns
-        -------
-        DataAnalystDataFrame
-            Modified DataFrame
-        """
-        # Convert to datetime
-        self["date"] = pd.to_datetime(self[date_column])
-
-        # Logic
-        col_counter = 0
-        # self["weekday"] = self["day"].dt.isocalendar().day  # Weekday
-        if mode.find("d") != -1:
-            logger.debug("Mode: 'day'")
-            self["day"] = self["date"].dt.day
-            col_counter += 1
-        if mode.find("w") != -1:
-            logger.debug("Mode: 'weekday'")
-            self["week"] = self["date"].dt.isocalendar().week
-            col_counter += 1
-        if mode.find("m") != -1:
-            logger.debug("Mode: 'month'")
-            self["month"] = self["date"].dt.month
-            col_counter += 1
-        if mode.find("y") != -1:
-            logger.debug("Mode: 'year'")
-            self["year"] = self["date"].dt.year
-            col_counter += 1
-
-        # Return
-        return self.rearrange_column(date_column, col_counter)
-
-    def delta_date(
-        self,
-        date_column: str,
-        mode: Literal["now", "between_row"] = "now",
-        *,
-        col_name: str = "delta_date",
-    ) -> Self:
-        """
-        Calculate date interval
-
-        Parameters
-        ----------
-        date_column : str
-            Date column
-
-        mode : str
-            | Mode to calculate
-            | ``"between_row"``: Calculate date interval between each row
-            | ``"now"``: Calculate date interval to current date
-            | (Default: ``"now"``)
-
-        col_name : str
-            | New delta date column name
-            | (Default: ``"delta_date"``)
-
-        Returns
-        -------
-        DataAnalystDataFrame
-            Modified DataFrame
-        """
-        if mode.lower().startswith("between_row"):
-            dated = self[date_column].to_list()
-            cal = []
-            for i in range(len(dated)):
-                if i == 0:
-                    cal.append(dated[i] - dated[i])
-                    # cal.append(relativedelta(dated[i], dated[i]))
-                else:
-                    cal.append(dated[i] - dated[i - 1])
-                    # cal.append(relativedelta(dated[i], dated[i - 1]))
-            self[col_name] = [x.days for x in cal]
-            return self
-        else:  # mode="now"
-            self[col_name] = self[date_column].apply(
-                lambda x: (datetime.now() - x).days
-            )
-            return self
-
```
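The date helpers chain naturally: `add_detail_date` normalizes a column into `date` and derives day/week/month/year parts, and `delta_date` turns it into day intervals. A sketch under the same relocation assumption:

```python
import pandas as pd
from absfuyu.extra.da.dadf import DADF

df = DADF(pd.DataFrame({"when": ["2025-01-06", "2025-03-10"]}))
df = df.add_detail_date("when", mode="my")      # adds date, month, year
df = df.delta_date("date", mode="between_row")  # day gap to previous row
print(df["delta_date"].tolist())                # [0, 63]
```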
```diff
-    # Fill missing value
-    def fill_missing_values(
-        self, column_name: str, fill: Any = np.nan, *, fill_when_not_exist: Any = np.nan
-    ) -> Self:
-        """
-        Fill missing values in specified column
-
-        Parameters
-        ----------
-        column_name : str
-            Column name
-
-        fill : Any
-            Fill the missing values with
-            (Default: ``np.nan``)
-
-        fill_when_not_exist : Any
-            When ``column_name`` does not exist,
-            create a new column and fill with ``fill_when_not_exist``
-            (Default: ``np.nan``)
-
-        Returns
-        -------
-        DataAnalystDataFrame
-            Modified DataFrame
-        """
-        try:
-            self[column_name] = self[column_name].fillna(fill)
-        except Exception:
-            self.add_blank_column(column_name, fill_when_not_exist)
-        return self
-
-    # Split DataFrame
-    def split_na(self, by_column: str) -> SplittedDF:
-        """
-        Split DataFrame into 2 parts:
-        - Without missing value in specified column
-        - With missing value in specified column
-
-        Parameters
-        ----------
-        by_column : str
-            Split by column
-
-        Returns
-        -------
-        SplittedDF
-            Splitted DataFrame
-        """
-        out = SplittedDF(
-            df=self[~self[by_column].isna()],  # DF
-            df_na=self[self[by_column].isna()],  # DF w/o NA
-        )
-        return out
-
```
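`fill_missing_values` and `split_na` are the two missing-data entry points: fill in place, or split the frame into its complete and incomplete halves as a `SplittedDF`. Sketch:

```python
import numpy as np
import pandas as pd
from absfuyu.extra.da.dadf import DADF

df = DADF(pd.DataFrame({"v": [1.0, np.nan, 3.0]}))
parts = df.split_na("v")   # SplittedDF(df=2 complete rows, df_na=1 N/A row)
assert parts.df.shape[0] == 2 and parts.df_na.shape[0] == 1
```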
```diff
-    # Threshold filter
-    # @versionchanged(version="3.2.0", reason="Optimized the code")
-    def threshold_filter(
-        self,
-        destination_column: str,
-        threshold: int | float = 10,
-        *,
-        top: int | None = None,
-        replace_with: Any = "Other",
-    ) -> Self:
-        """
-        Filter out percentage of data that smaller than the ``threshold``,
-        replace all of the smaller data to ``replace_with``.
-        As a result, pie chart is less messy.
-
-        Parameters
-        ----------
-        destination_column : str
-            Column to be filtered
-
-        threshold : int | float
-            Which percentage to cut-off
-            (Default: 10%)
-
-        top : int
-            Only show top ``x`` categories in pie chart
-            (replace threshold mode)
-            (Default: ``None``)
-
-        replace_with : Any
-            Replace all of the smaller data with specified value
-
-        Returns
-        -------
-        DataAnalystDataFrame
-            Modified DataFrame
-        """
-        # Clean
-        try:
-            self[destination_column] = self[
-                destination_column
-            ].str.strip()  # Remove trailing space
-        except Exception:
-            pass
-
-        # Logic
-        col_df = self.show_distribution(destination_column)
-
-        # Rename
-        if top is not None:
-            list_of_keep: list = (
-                col_df[destination_column]
-                .head(set_min_max(top - 1, min_value=1, max_value=col_df.shape[0]))
-                .to_list()
-            )
-            # logger.debug(list_of_keep)
-        else:
-            list_of_keep = col_df[col_df["percentage"] >= threshold][
-                destination_column
-            ].to_list()  # values that will not be renamed
-        self[f"{destination_column}_filtered"] = self[destination_column].apply(
-            lambda x: replace_with if x not in list_of_keep else x
-        )
-
-        # Return
-        return self
-
```
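`threshold_filter` piggybacks on `show_distribution`: categories whose share falls below `threshold` percent (or outside the `top` largest, when given) are folded into `replace_with` in a new `<column>_filtered` column. Sketch:

```python
import pandas as pd
from absfuyu.extra.da.dadf import DADF

df = DADF(pd.DataFrame({"cat": ["a"] * 8 + ["b", "c"]}))
df = df.threshold_filter("cat", threshold=20)  # "b", "c" are each 10% < 20%
print(df["cat_filtered"].value_counts().to_dict())  # {'a': 8, 'Other': 2}
```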
```diff
-    # Info
-    # ================================================================
-    # Total observation
-    @property
-    @versionadded("3.2.0")
-    def total_observation(self) -> int:
-        """
-        Returns total observation of the DataFrame
-        """
-        return self.shape[0] * self.shape[1]  # type: ignore
-
-    # Quick info
-    @versionadded("3.2.0")
-    def qinfo(self) -> str:
-        """
-        Show quick infomation about DataFrame
-        """
-        mv = self.isnull().sum().sum()  # missing values
-        to = self.total_observation
-        info = (
-            f"Dataset Information:\n"
-            f"- Number of Rows: {self.shape[0]:,}\n"
-            f"- Number of Columns: {self.shape[1]:,}\n"
-            f"- Total observation: {to:,}\n"
-            f"- Missing value: {mv:,} ({(mv / to * 100):.2f}%)\n\n"
-            f"Column names:\n{self.columns.to_list()}"
-        )
-        return info
-
-    # Quick describe
-    @versionadded("3.2.0")
-    def qdescribe(self) -> pd.DataFrame:
-        """
-        Quick ``describe()`` that exclude ``object`` and ``datetime`` dtype
-        """
-        return self[
-            self.select_dtypes(exclude=["object", "datetime"]).columns
-        ].describe()
-
```
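The inspection trio reads well together: `total_observation` is simply rows times columns, `qinfo` renders the counts plus missing-value share, and `qdescribe` is `describe()` restricted to non-object, non-datetime columns. Sketch:

```python
from absfuyu.extra.da.dadf import DADF

df = DADF.sample_df(size=50)   # 50 rows x 6 columns
assert df.total_observation == 300
print(df.qinfo())  # rows, columns, missing-value percentage, column names
```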
```diff
-    # Missing values analyze
-    def get_missing_values(
-        self, hightlight: bool = True, *, percentage_round_up: int = 2
-    ) -> pd.DataFrame:
-        """
-        Get a DataFrame contains count of missing values for each column
-
-        Parameters
-        ----------
-        hightlight : bool
-            Shows only columns with missing values when ``True``
-            (Default: ``True``)
-
-        percentage_round_up : int
-            Round up to which decimals
-            (Default: ``2``)
-
-        Returns
-        -------
-        DataFrame
-            Missing value DataFrame
-        """
-        # Check for missing value
-        df_na = self.isnull().sum().sort_values(ascending=False)
-        if hightlight:
-            out = df_na[df_na != 0].to_frame()
-        else:
-            out = df_na.to_frame()
-        out.rename(columns={0: "Num of N/A"}, inplace=True)
-        out["Percentage"] = (out["Num of N/A"] / self.shape[0] * 100).round(
-            percentage_round_up
-        )
-
-        # logger.debug(
-        #     f"Percentage of N/A over entire DF: "
-        #     f"{(self.isnull().sum().sum() / (self.shape[0] * self.shape[1]) * 100).round(percentage_round_up)}%"
-        # )
-        return out
-
-    # Show distribution
-    @versionadded("3.2.0")
-    def show_distribution(
-        self,
-        column_name: str,
-        dropna: bool = True,
-        *,
-        show_percentage: bool = True,
-        percentage_round_up: int = 2,
-    ) -> pd.DataFrame:
-        """
-        Show distribution of a column
-
-        Parameters
-        ----------
-        column_name : str
-            Column to show distribution
-
-        dropna : bool
-            Count N/A when ``False``
-            (Default: ``True``)
-
-        show_percentage : bool
-            Show proportion in range 0% - 100% instead of [0, 1]
-            (Default: ``True``)
-
-        percentage_round_up : int
-            Round up to which decimals
-            (Default: ``2``)
-
-        Returns
-        -------
-        DataFrame
-            Distribution DataFrame
-
-
-        Example:
-        --------
-        >>> DataAnalystDataFrame.sample_df().show_distribution("number_range")
-          number_range  count  percentage
-        0          900     16        16.0
-        1          700     15        15.0
-        2          300     12        12.0
-        3          200     12        12.0
-        4          400     11        11.0
-        5          600     11        11.0
-        6          800     10        10.0
-        7          100      9         9.0
-        8          500      4         4.0
-
-
-        """
-        out = self[column_name].value_counts(dropna=dropna).to_frame().reset_index()
-        if show_percentage:
-            out["percentage"] = (out["count"] / self.shape[0] * 100).round(
-                percentage_round_up
-            )
-        else:
-            out["percentage"] = (out["count"] / self.shape[0]).round(
-                percentage_round_up
-            )
-        return out
-
```
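`get_missing_values` and `show_distribution` are the column-level reports that the filters above consume. Sketch:

```python
from absfuyu.extra.da.dadf import DADF

df = DADF.sample_df()           # includes a "missing_value" column
print(df.get_missing_values())  # N/A count + percentage, N/A columns only
print(df.show_distribution("number_range"))  # count + percentage per value
```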
```diff
-    # Help
-    @classmethod
-    def dadf_help(cls) -> list[str]:
-        """
-        Show all available method of DataAnalystDataFrame
-        """
-        list_of_method = list(set(dir(cls)) - set(dir(pd.DataFrame)))
-        return sorted(list_of_method)
-
-    # Sample DataFrame
-    @classmethod
-    def sample_df(cls, size: int = 100) -> Self:
-        """
-        Create sample DataFrame
-
-        Parameters
-        ----------
-        size : int
-            Number of observations
-            (Default: ``100``)
-
-        Returns
-        -------
-        DataAnalystDataFrame
-            DataFrame with these columns:
-            [number, number_big, number_range, missing_value, text, date]
-
-
-        Example:
-        --------
-        >>> DataAnalystDataFrame.sample_df()
-              number  number_big number_range  missing_value      text       date
-        0  -2.089770         785          700            NaN  vwnlqoql 2013-11-20
-        1  -0.526689         182          100           24.0  prjjcvqc 2007-04-13
-        2  -1.596514         909          900            8.0  cbcpzlac 2023-05-24
-        3   2.982191         989          900           21.0  ivwqwuvd 2022-04-28
-        4   1.687803         878          800            NaN  aajtncum 2005-10-05
-        ..       ...         ...          ...            ...       ...        ...
-        95 -1.295145         968          900           16.0  mgqunkhi 2016-04-12
-        96  1.296795         255          200            NaN  lwvytego 2014-05-10
-        97  1.440746         297          200            5.0  lqsoykun 2010-04-03
-        98  0.327702         845          800            NaN  leadkvsy 2005-08-05
-        99  0.556720         981          900           36.0  bozmxixy 2004-02-22
-
-        [100 rows x 6 columns]
-        """
-        # Restrain
-        size = int(set_min(size, min_value=1))
-
-        # Number col
-        df = pd.DataFrame(np.random.randn(size, 1), columns=["number"])
-        df["number_big"] = [
-            random.choice(range(100, 999)) for _ in range(size)
-        ]  # Big number in range 100-999
-        df["number_range"] = df["number_big"].apply(lambda x: str(x)[0] + "00")
-
-        # Missing value col
-        na_rate = random.randint(1, 99)
-        d = [random.randint(1, 99) for _ in range(size)]
-        df["missing_value"] = list(map(lambda x: x if x < na_rate else np.nan, d))
-        # df["missing_value"] = [random.choice([random.randint(1, 99), np.nan]) for _ in range(observations)]
-
-        # Text col
-        df["text"] = [
-            "".join([random.choice(string.ascii_lowercase) for _ in range(8)])
-            for _ in range(size)
-        ]
-
-        # Random date col
-        df["date"] = [
-            datetime(
-                year=random.randint(datetime.now().year - 20, datetime.now().year),
-                month=random.randint(1, 12),
-                day=random.randint(1, 28),
-            )
-            for _ in range(size)
-        ]
-
-        # Return
-        return cls(df)
-
-
-class DADF(DataAnalystDataFrame):
-    """Short name for ``DataAnalystDataFrame``"""
-
-    pass
-
-
-class DADF_WIP(DADF):
-    """W.I.P"""
-
-    @versionadded("4.0.0")
-    def subtract_df(self, other: Self | pd.DataFrame) -> Self:
-        """
-        Subtract DF to find the different rows
-        """
-        temp = self.copy()
-        out = (
-            temp.merge(other, indicator=True, how="right")
-            .query("_merge=='right_only'")
-            .drop("_merge", axis=1)
-        )
-        return self.__class__(out)
-
-    @versionadded("4.0.0")
-    def merge_left(
-        self,
-        other: Self | pd.DataFrame,
-        on: str,
-        columns: list[str] | None = None,
-    ) -> Self:
-        """
-        Merge left of 2 dfs
-
-        :param columns: Columns to take from df2
-        """
-
-        if columns is not None:
-            current_col = [on]
-            current_col.extend(columns)
-            col = other.columns.to_list()
-            cols = list(set(col) - set(current_col))
-            self.drop_columns(cols)
-
-        out = self.merge(other, how="left", on=on)
-        return self.__class__(out)
+# TODO: split column df[['A','B']]=df['AB'].str.split(' ',n=1,expand=True) | drop dups | Combine: row with data, row NaN
```
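The single added line, the `TODO`, closes out the new shim and sketches candidate helpers for a future release. The split-column idiom it cites is plain pandas and works as-is:

```python
import pandas as pd

df = pd.DataFrame({"AB": ["x 1", "y 2"]})
df[["A", "B"]] = df["AB"].str.split(" ", n=1, expand=True)  # split on first space
print(df[["A", "B"]].values.tolist())  # [['x', '1'], ['y', '2']]
```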