feilian 1.2.2__py3-none-any.whl → 1.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of feilian might be problematic. Click here for more details.
- feilian/_dist_ver.py +2 -2
- feilian/dataframe.py +22 -4
- feilian/process.py +12 -4
- {feilian-1.2.2.dist-info → feilian-1.2.3.dist-info}/METADATA +2 -2
- {feilian-1.2.2.dist-info → feilian-1.2.3.dist-info}/RECORD +7 -7
- {feilian-1.2.2.dist-info → feilian-1.2.3.dist-info}/WHEEL +0 -0
- {feilian-1.2.2.dist-info → feilian-1.2.3.dist-info}/top_level.txt +0 -0
feilian/_dist_ver.py
CHANGED
feilian/dataframe.py
CHANGED
|
@@ -25,9 +25,18 @@ if pd_version[0] < 1 or (pd_version[0] == 1 and pd_version[1] < 5):
|
|
|
25
25
|
FILE_FORMAT = Literal['csv', 'tsv', 'json', 'xlsx', 'parquet']
|
|
26
26
|
COMPRESSION_FORMAT = Literal[None, 'infer', 'snappy', 'gzip', 'brotli', 'bz2', 'zip', 'xz']
|
|
27
27
|
|
|
28
|
+
def _drop_na_values(data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], axis: Literal['columns', 'rows']):
|
|
29
|
+
if isinstance(data, pd.DataFrame):
|
|
30
|
+
data.dropna(axis=axis, how='all', inplace=True)
|
|
31
|
+
else:
|
|
32
|
+
assert isinstance(data, dict)
|
|
33
|
+
for df in data.values():
|
|
34
|
+
df.dropna(axis=axis, how='all', inplace=True)
|
|
35
|
+
|
|
28
36
|
def read_dataframe(file: str, *args, sheet_name=0,
|
|
29
37
|
file_format: FILE_FORMAT = None,
|
|
30
38
|
jsonl=False, dtype: type = None,
|
|
39
|
+
drop_na_columns=False, drop_na_rows=False,
|
|
31
40
|
**kwargs) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
|
|
32
41
|
"""
|
|
33
42
|
read file as pandas `DataFrame`
|
|
@@ -37,6 +46,8 @@ def read_dataframe(file: str, *args, sheet_name=0,
|
|
|
37
46
|
:param file_format: csv, tsv, json ,xlsx, parquet
|
|
38
47
|
:param jsonl: jsonl format or not, only used in json format
|
|
39
48
|
:param dtype: `dtype` for `pd.read_xx()`
|
|
49
|
+
:param drop_na_columns: drop column if all values of the column is na
|
|
50
|
+
:param drop_na_rows: drop row if all values of the row is na
|
|
40
51
|
:param kwargs: extra kwargs for `pd.read_xx()`
|
|
41
52
|
"""
|
|
42
53
|
# decide the file format
|
|
@@ -61,16 +72,23 @@ def read_dataframe(file: str, *args, sheet_name=0,
|
|
|
61
72
|
jsonl = True
|
|
62
73
|
|
|
63
74
|
if file_format == 'csv':
|
|
64
|
-
|
|
75
|
+
df = pd.read_csv(file, *args, dtype=dtype, **kwargs)
|
|
65
76
|
elif file_format == 'xlsx':
|
|
66
|
-
|
|
77
|
+
df = pd.read_excel(file, *args, sheet_name=sheet_name, dtype=dtype, **kwargs)
|
|
67
78
|
elif file_format == 'json':
|
|
68
|
-
|
|
79
|
+
df = pd.read_json(file, *args, lines=jsonl, dtype=dtype, **kwargs)
|
|
69
80
|
elif file_format == 'parquet':
|
|
70
|
-
|
|
81
|
+
df = pd.read_parquet(file, *args, **kwargs)
|
|
71
82
|
else:
|
|
72
83
|
raise IOError(f"Unknown file format: {file}")
|
|
73
84
|
|
|
85
|
+
if drop_na_columns:
|
|
86
|
+
_drop_na_values(df, axis='columns')
|
|
87
|
+
if drop_na_rows:
|
|
88
|
+
_drop_na_values(df, axis='rows')
|
|
89
|
+
|
|
90
|
+
return df
|
|
91
|
+
|
|
74
92
|
def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[str]'],
|
|
75
93
|
df: Union[pd.DataFrame, Iterable[Union[pd.Series, Dict[str, Any]]]],
|
|
76
94
|
*args, sheet_name='Sheet1',
|
feilian/process.py
CHANGED
|
@@ -1,8 +1,14 @@
|
|
|
1
1
|
import abc
|
|
2
2
|
import tqdm
|
|
3
3
|
import pandas as pd
|
|
4
|
-
from typing import
|
|
5
|
-
|
|
4
|
+
from typing import (
|
|
5
|
+
Any, Dict, Hashable, List,
|
|
6
|
+
Tuple, Union, Iterable, Optional,
|
|
7
|
+
)
|
|
8
|
+
from .dataframe import (
|
|
9
|
+
read_dataframe,
|
|
10
|
+
save_dataframe,
|
|
11
|
+
)
|
|
6
12
|
|
|
7
13
|
class BaseProcessor(abc.ABC):
|
|
8
14
|
"""
|
|
@@ -74,9 +80,10 @@ class DataframeProcessor(BaseProcessor, abc.ABC):
|
|
|
74
80
|
save_dataframe(filepath, result)
|
|
75
81
|
|
|
76
82
|
@abc.abstractmethod
|
|
77
|
-
def process_row(self, i: Hashable, row: pd.Series) -> Dict[str, Any]:
|
|
83
|
+
def process_row(self, i: Hashable, row: pd.Series) -> Optional[Dict[str, Any]]:
|
|
78
84
|
"""
|
|
79
85
|
Process a single row of data.
|
|
86
|
+
:return: if `None`, ignore this row
|
|
80
87
|
"""
|
|
81
88
|
|
|
82
89
|
def process(self, data: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -84,6 +91,7 @@ class DataframeProcessor(BaseProcessor, abc.ABC):
|
|
|
84
91
|
if self.progress:
|
|
85
92
|
desc = "process" if self.progress is True else self.progress
|
|
86
93
|
bar = tqdm.tqdm(bar, total=len(data), desc=desc)
|
|
87
|
-
res =
|
|
94
|
+
res = (self.process_row(i, row) for i, row in bar)
|
|
95
|
+
res = (x for x in res if x is not None)
|
|
88
96
|
return pd.DataFrame(res)
|
|
89
97
|
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: feilian
|
|
3
|
-
Version: 1.2.2
|
|
3
|
+
Version: 1.2.3
|
|
4
4
|
Summary: General data processing tool.
|
|
5
5
|
Author-email: darkpeath <darkpeath@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/darkpeath/feilian
|
|
7
7
|
Description-Content-Type: text/markdown
|
|
8
8
|
Requires-Dist: pandas
|
|
9
9
|
Provides-Extra: extra
|
|
10
|
-
Requires-Dist: tqdm
|
|
10
|
+
Requires-Dist: tqdm; extra == "extra"
|
|
11
11
|
|
|
12
12
|
# feilian
|
|
13
13
|
|
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
feilian/__init__.py,sha256=97Rvz2O6LknvCWInAjINH4GeqG4-QPPcep9n-V7KD8k,911
|
|
2
|
-
feilian/_dist_ver.py,sha256=
|
|
2
|
+
feilian/_dist_ver.py,sha256=WDsMjivFi6Plqpo4JtGV48YaZNQnJ56AduVgKWA5Klo,148
|
|
3
3
|
feilian/arg.py,sha256=n2nIcmC_3rb9A6BOzm9C5z3-T4lnubaGzH2sFhtqwZQ,8402
|
|
4
|
-
feilian/dataframe.py,sha256=
|
|
4
|
+
feilian/dataframe.py,sha256=BtW6IQrXTF9hQCeccIXkedSjrpfuSKkzWGxd0VxiVHU,11390
|
|
5
5
|
feilian/datetime.py,sha256=IONvWhLeGEy9IVe6GWKEW3FhrfRrShyhGP8-RTf9r3c,763
|
|
6
6
|
feilian/io.py,sha256=aYN3QwWcLoRKzhGMNutqdkmxArVcXfeWXzxCB07LcFc,155
|
|
7
7
|
feilian/json.py,sha256=1FkQ6e4JmbccpxhMobXpsGg-f7uVrTtUmu6jDCXCFTQ,1406
|
|
8
|
-
feilian/process.py,sha256=
|
|
8
|
+
feilian/process.py,sha256=3uig0s_X_espK26tD1cKkapi1gatT1M3zoiFwAws6PU,3153
|
|
9
9
|
feilian/string.py,sha256=G_X3dnR0Oxmi4hXF-6E5jm5M7GPjGoMYrSMyI1dj6Z4,370
|
|
10
10
|
feilian/utils.py,sha256=pzzGEgngidVkYvuNJWY7KWjkdgGRuH5_ENaLS6kxBtk,2648
|
|
11
11
|
feilian/version.py,sha256=oH_DvE7jRCWlCCX9SSadwxwRJXFas_rIisYLBGPYZn4,350
|
|
12
|
-
feilian-1.2.
|
|
13
|
-
feilian-1.2.
|
|
14
|
-
feilian-1.2.
|
|
15
|
-
feilian-1.2.
|
|
12
|
+
feilian-1.2.3.dist-info/METADATA,sha256=1yull6iP_bypv8etP-9wxoc0zOMdYgXwvlixByFN4xs,3722
|
|
13
|
+
feilian-1.2.3.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
|
|
14
|
+
feilian-1.2.3.dist-info/top_level.txt,sha256=1Q2-B6KJrcTr7drW_kik35PTVEUJLPP4wVrn0kYKwGw,8
|
|
15
|
+
feilian-1.2.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|