feilian-1.3.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- feilian/__init__.py +31 -0
- feilian/_dist_ver.py +5 -0
- feilian/_typing.py +5 -0
- feilian/arg.py +179 -0
- feilian/dataframe.py +311 -0
- feilian/datetime.py +21 -0
- feilian/excel.py +83 -0
- feilian/io.py +6 -0
- feilian/json.py +262 -0
- feilian/process.py +99 -0
- feilian/string.py +13 -0
- feilian/txt.py +54 -0
- feilian/utils.py +82 -0
- feilian/version.py +12 -0
- feilian-1.3.4.dist-info/METADATA +212 -0
- feilian-1.3.4.dist-info/RECORD +18 -0
- feilian-1.3.4.dist-info/WHEEL +5 -0
- feilian-1.3.4.dist-info/top_level.txt +1 -0
feilian/__init__.py
ADDED
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+
+from .io import ensure_parent_dir_exist
+from .dataframe import read_dataframe, save_dataframe, extract_dataframe_sample, merge_dataframe_rows, iter_dataframe
+from .dataframe import is_empty_text, is_nonempty_text, is_blank_text, is_non_blank_text
+from .datetime import format_time, format_date
+from .arg import ArgValueParser
+from .json import read_json, save_json, write_json, read_big_json
+from .txt import (
+    detect_stream_encoding, detect_file_encoding, get_file_encoding,
+    read_txt, save_txt, write_txt,
+)
+from .process import DataframeProcessor
+from .excel import save_excel, write_excel
+from .utils import flatten_dict, flatten_list
+from .version import __version__
+
+__all__ = [
+    'ensure_parent_dir_exist',
+    'read_dataframe', 'save_dataframe', 'extract_dataframe_sample', 'merge_dataframe_rows', 'iter_dataframe',
+    'is_empty_text', 'is_nonempty_text', 'is_blank_text', 'is_non_blank_text',
+    'format_time', 'format_date',
+    'ArgValueParser',
+    'read_json', 'save_json', 'write_json', 'read_big_json',
+    'detect_stream_encoding', 'detect_file_encoding', 'get_file_encoding',
+    'read_txt', 'save_txt', 'write_txt',
+    'save_excel', 'write_excel',
+    'DataframeProcessor',
+    'flatten_dict', 'flatten_list',
+    '__version__',
+]
feilian/_dist_ver.py
ADDED
feilian/_typing.py
ADDED
feilian/arg.py
ADDED
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+
+from ._typing import (
+    Union, List, Any, Iterable,
+    Callable, Set, Optional, Tuple,
+    Dict, Hashable, Sequence,
+    Literal,
+)
+
+_build_in_na_checkers = {
+    'always_na': lambda x: True,
+    'never_na': lambda x: False,
+    'is_none': lambda x: x is None,
+}
+_NA_CHECKER_TYPES = Union[Callable[[Any], bool], Literal['always_na', 'never_na', 'is_none']]
+
+_build_in_na_converters = {
+    'none': lambda x: None,
+    'self': lambda x: x,
+    'empty': lambda x: [],
+    'single': lambda x: [x],
+}
+_NA_CONVERTER_TYPES = Union[Callable[[Any], Any], Literal['none', 'self', 'empty', 'single']]
+
+def _get_or_default(value: Any, mapping: Dict[Hashable, Any], default_key: Any) -> Any:
+    if value is None:
+        return mapping[default_key]
+    return mapping.get(value, value)
+
+class ArgValueParser(object):
+    @classmethod
+    def split_and_parse_strs(cls, strings: Union[Sequence[str], str, None],
+                             func: Callable[[str], Any] = None,
+                             sep=',', do_trim=True, ignore_blank=True) -> Iterable[Any]:
+        """
+        split and parse multiple string values
+        :param strings: sequence of strings
+        :param func: function to parse a single string value
+        :param sep: separator to split a single string
+        :param do_trim: trim every word or not
+        :param ignore_blank: ignore blank words or not; in some cases, this must be `True`
+        """
+        if isinstance(strings, str):
+            strings = [strings]
+        if strings:
+            for value in strings:
+                for x in value.split(sep):
+                    if do_trim:
+                        x = x.strip()
+                    if not x and ignore_blank:
+                        continue
+                    yield func(x) if func else x
+
+    @classmethod
+    def split_strs_to_set(cls, values: Union[Sequence[str], str, None],
+                          func: Callable[[str], Any] = None,
+                          sep=',', do_trim=True, ignore_blank=True) -> Optional[Set[Any]]:
+        """
+        split multiple string values into a set of words
+        """
+        return set(cls.split_and_parse_strs(values, func, sep, do_trim, ignore_blank))
+
+    @classmethod
+    def split_strs_to_list(cls, values: Union[Sequence[str], str, None],
+                           func: Callable[[str], Any] = None,
+                           sep=',', do_trim=True, ignore_blank=True) -> Optional[List[Any]]:
+        """
+        split multiple string values into a list of words
+        """
+        return list(cls.split_and_parse_strs(values, func, sep, do_trim, ignore_blank))
+
+    @staticmethod
+    def bound_collection_if_singleton(value: Any, collection_type: type,
+                                      elem_type: Union[type, Tuple[type]] = (),
+                                      allowed_type=(list, tuple, set)) -> Any:
+        """
+        if `value` is a singleton element, wrap it in a collection type.
+        :param value: the input value that may be wrapped
+        :param collection_type: may be `list`, `tuple` or `set`
+        :param elem_type: if `value` is an instance of `elem_type` but not of `allowed_type`, wrap it.
+        :param allowed_type: if `value` is an instance of `allowed_type`, return `value` unchanged.
+        """
+        if isinstance(value, allowed_type):
+            return value
+        if isinstance(value, elem_type):
+            return collection_type([value])
+        return value
+
+    @classmethod
+    def bound_list_if_singleton(cls, value: Any, elem_type: Union[type, Tuple[type]] = (),
+                                allowed_type=(list, tuple, set)) -> Any:
+        """
+        If `value` is a singleton element, wrap it in a `list`.
+        See more arg docs in `bound_collection_if_singleton()`.
+        """
+        return cls.bound_collection_if_singleton(value, collection_type=list,
+                                                 elem_type=elem_type, allowed_type=allowed_type)
+
+    @classmethod
+    def bound_tuple_if_singleton(cls, value: Any, elem_type: Union[type, Tuple[type]] = (),
+                                 allowed_type=(list, tuple, set)) -> Any:
+        """
+        If `value` is a singleton element, wrap it in a `tuple`.
+        See more arg docs in `bound_collection_if_singleton()`.
+        """
+        return cls.bound_collection_if_singleton(value, collection_type=tuple,
+                                                 elem_type=elem_type, allowed_type=allowed_type)
+
+    @classmethod
+    def bound_set_if_singleton(cls, value: Any, elem_type: Union[type, Tuple[type]] = (),
+                               allowed_type=(list, tuple, set)) -> Any:
+        """
+        If `value` is a singleton element, wrap it in a `set`.
+        See more arg docs in `bound_collection_if_singleton()`.
+        """
+        return cls.bound_collection_if_singleton(value, collection_type=set,
+                                                 elem_type=elem_type, allowed_type=allowed_type)
+
+    @staticmethod
+    def ensure_collection(value: Any, expected_type: type, collection_type: Union[type, Tuple[type, ...]],
+                          na_checker: _NA_CHECKER_TYPES = None, na_converter: _NA_CONVERTER_TYPES = None) -> Any:
+        """
+        Ensure the value to be a list, tuple or set.
+        :param value: a value of any type
+        :param expected_type: expected return type, can be list, tuple or set
+        :param collection_type: other collection types to be converted
+        :param na_checker: checks whether `value` is na, default is 'is_none';
+            a str value selects one of the built-in checkers:
+            always_na: always treat the value as na
+            never_na: never treat the value as na
+            is_none: test if the value is `None`
+        :param na_converter: if `value` is na, return the output of this function, default is 'self';
+            a str value selects one of the built-in converters:
+            none: `None`
+            self: the value unchanged
+            empty: an empty list
+            single: a single-value list: `[value]`
+        :return: expected to be an instance of `expected_type`, or `None` under some conditions
+        """
+        na_checker = _get_or_default(na_checker, _build_in_na_checkers, 'is_none')
+        if na_checker(value):
+            na_converter = _get_or_default(na_converter, _build_in_na_converters, 'self')
+            return na_converter(value)
+        if isinstance(value, expected_type):
+            return value
+        if isinstance(value, collection_type):
+            return expected_type(value)
+        return expected_type([value])
+
+    @classmethod
+    def ensure_list(cls, value: Any, na_checker: _NA_CHECKER_TYPES = None,
+                    na_converter: _NA_CONVERTER_TYPES = None) -> Optional[List[Any]]:
+        """
+        Ensure the value to be a list.
+        See more arg docs in `ensure_collection()`.
+        """
+        return cls.ensure_collection(value, expected_type=list, collection_type=(tuple, set),
+                                     na_checker=na_checker, na_converter=na_converter)
+
+    @classmethod
+    def ensure_tuple(cls, value: Any, na_checker: _NA_CHECKER_TYPES = None,
+                     na_converter: _NA_CONVERTER_TYPES = None) -> Optional[Tuple[Any]]:
+        """
+        Ensure the value to be a tuple.
+        See more arg docs in `ensure_collection()`.
+        """
+        return cls.ensure_collection(value, expected_type=tuple, collection_type=(list, set),
+                                     na_checker=na_checker, na_converter=na_converter)
+
+    @classmethod
+    def ensure_set(cls, value: Any, na_checker: _NA_CHECKER_TYPES = None,
+                   na_converter: _NA_CONVERTER_TYPES = None) -> Optional[Set[Any]]:
+        """
+        Ensure the value to be a set.
+        See more arg docs in `ensure_collection()`.
+        """
+        return cls.ensure_collection(value, expected_type=set, collection_type=(list, tuple),
+                                     na_checker=na_checker, na_converter=na_converter)
+
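A quick usage sketch of `ArgValueParser`, based only on the signatures and logic shown above (the sample values are illustrative, not from the package):

# Illustrative only; assumes the API exactly as in the diff above.
from feilian import ArgValueParser

# Split comma-separated values into a flat list, trimming and skipping blanks.
ArgValueParser.split_strs_to_list(['a, b', 'c,,d'])   # -> ['a', 'b', 'c', 'd']

# Parse each word while splitting.
ArgValueParser.split_strs_to_list('1,2,3', func=int)  # -> [1, 2, 3]

# Normalize a possibly-scalar argument into a collection.
ArgValueParser.ensure_list('x')         # -> ['x']
ArgValueParser.ensure_list(('x', 'y'))  # -> ['x', 'y']
ArgValueParser.ensure_list(None)        # -> None (default 'is_none' checker + 'self' converter)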
feilian/dataframe.py
ADDED
@@ -0,0 +1,311 @@
+# -*- coding: utf-8 -*-
+
+"""
+Encapsulate methods for pandas `DataFrame`.
+"""
+
+import io
+import os
+import pathlib
+import pandas as pd
+import random
+import collections
+from ._typing import Union, Iterable, Dict, List, Any, Sequence, Callable, Tuple, Hashable, Literal
+from .io import ensure_parent_dir_exist
+from .txt import detect_stream_encoding, detect_file_encoding
+
+# Compatible with different pandas versions
+PD_PARAM_NEWLINE = 'lineterminator'
+pd_version = [int(x) for x in pd.__version__.split('.')]
+if pd_version[0] < 1 or (pd_version[0] == 1 and pd_version[1] < 5):
+    PD_PARAM_NEWLINE = 'line_terminator'
+
+FILE_FORMAT = Literal['csv', 'tsv', 'json', 'xlsx', 'parquet']
+COMPRESSION_FORMAT = Literal[None, 'infer', 'snappy', 'gzip', 'brotli', 'bz2', 'zip', 'xz']
+
+def _drop_na_values(data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], axis: Literal['columns', 'rows']):
+    if isinstance(data, pd.DataFrame):
+        data.dropna(axis=axis, how='all', inplace=True)
+    else:
+        assert isinstance(data, dict)
+        for df in data.values():
+            df.dropna(axis=axis, how='all', inplace=True)
+
+def _infer_file_format(file) -> str:
+    if isinstance(file, pd.ExcelWriter):
+        return 'xlsx'
+    elif isinstance(file, str):
+        return os.path.splitext(file)[1].lower()[1:]
+    elif isinstance(file, pathlib.PurePath):
+        suf = file.suffix
+        return suf[1:] if suf.startswith('.') else suf
+    elif isinstance(file, os.PathLike):
+        return os.path.splitext(os.fspath(file))[1].lower().lstrip('.')
+    else:
+        raise ValueError(f"Cannot infer format for type: {type(file)}")
+
+def read_dataframe(file: Union[str, os.PathLike, io.IOBase], *args, sheet_name=0,
+                   file_format: FILE_FORMAT = None, encoding='auto',
+                   jsonl=False, dtype: type = None,
+                   drop_na_columns=False, drop_na_rows=False,
+                   **kwargs) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
+    """
+    read a file as a pandas `DataFrame`
+    :param file: the file to be read
+    :param args: extra args for `pd.read_xx()`
+    :param sheet_name: `sheet_name` for `pd.read_excel()`
+    :param file_format: csv, tsv, json, xlsx, parquet
+    :param encoding: text file encoding
+    :param jsonl: jsonl format or not, only used for the json format
+    :param dtype: `dtype` for `pd.read_xx()`
+    :param drop_na_columns: drop a column if all of its values are na
+    :param drop_na_rows: drop a row if all of its values are na
+    :param kwargs: extra kwargs for `pd.read_xx()`
+    """
+    # decide the file format
+    if not file_format:
+        file_format = _infer_file_format(file)
+
+    for key in ['lines', 'line_delimited_json_format']:
+        if key in kwargs and kwargs.pop(key):
+            jsonl = True
+
+    # handle special formats
+    if file_format == 'tsv':
+        # the tsv format is actually csv with a tab delimiter
+        file_format = 'csv'
+        if 'sep' in kwargs:
+            kwargs.pop('sep')
+        kwargs['delimiter'] = '\t'
+    elif file_format == 'jsonl':
+        file_format = 'json'
+        jsonl = True
+
+    # detect encoding
+    if encoding == 'auto' and file_format in ['csv', 'json']:
+        if isinstance(file, (str, os.PathLike)):
+            encoding = detect_file_encoding(file)
+        elif isinstance(file, io.IOBase) and file.seekable():
+            tell = file.tell()
+            encoding = detect_stream_encoding(file)
+            file.seek(tell)
+        else:
+            # reading a non-seekable stream would consume its content, so we cannot detect the encoding
+            encoding = None
+
+    if file_format == 'csv':
+        df = pd.read_csv(file, *args, encoding=encoding, dtype=dtype, **kwargs)
+    elif file_format == 'xlsx':
+        df = pd.read_excel(file, *args, sheet_name=sheet_name, dtype=dtype, **kwargs)
+    elif file_format == 'json':
+        try:
+            df = pd.read_json(file, *args, encoding=encoding, lines=jsonl, dtype=dtype, **kwargs)
+        except Exception as e:
+            # if that failed, try again with the opposite `lines` arg
+            try:
+                df = pd.read_json(file, *args, lines=not jsonl, dtype=dtype, **kwargs)
+            except Exception:
+                raise e
+    elif file_format == 'parquet':
+        df = pd.read_parquet(file, *args, **kwargs)
+    else:
+        raise IOError(f"Unknown file format: {file}")
+
+    if drop_na_columns:
+        _drop_na_values(df, axis='columns')
+    if drop_na_rows:
+        _drop_na_values(df, axis='rows')
+
+    return df
+
+def save_dataframe(file: Union[str, os.PathLike, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[str]'],
+                   df: Union[pd.DataFrame, Iterable[Union[pd.Series, Dict[str, Any]]]],
+                   *args, sheet_name='Sheet1',
+                   file_format: FILE_FORMAT = None,
+                   compression: COMPRESSION_FORMAT = None,
+                   index=False, index_label=None,
+                   encoding='utf-8', newline='\n',
+                   force_ascii=False,
+                   orient='records', jsonl=True, indent=None,
+                   column_mapper: Union[Dict[str, str], Sequence[str]] = None,
+                   include_columns: Sequence[str] = None,
+                   exclude_columns: Sequence[str] = None,
+                   **kwargs):
+    """
+    save data into a file
+    :param file: where to save the data
+    :param df: the data
+    :param args: extra args for `df.to_xx()`
+    :param sheet_name: `sheet_name` for the excel format
+    :param file_format: csv, tsv, json, xlsx, parquet
+    :param compression: name of the compression to use;
+        use `None` for no compression
+    :param index: save the index or not, see docs in `df.to_csv()`;
+        if set as a str and `index_label` is not set, `index_label` will be set to this value
+    :param index_label: header for the index when `index` is `True`
+    :param encoding: text file encoding
+    :param newline: text file newline
+    :param force_ascii: `force_ascii` for the json format
+    :param orient: `orient` for the json format
+    :param jsonl: jsonl format or not
+    :param indent: indent for the json format
+    :param column_mapper: rename columns; if set, columns not listed here will be ignored
+    :param include_columns: if set, columns not listed here will be ignored
+    :param exclude_columns: if set, columns listed here will be ignored
+    :param kwargs: extra kwargs for `df.to_xx()`
+    """
+    # decide the file format
+    if not file_format:
+        file_format = _infer_file_format(file)
+
+    # convert the data to a dataframe
+    if not isinstance(df, pd.DataFrame):
+        df = pd.DataFrame(df)
+
+    for key in ['lines', 'line_delimited_json_format']:
+        if key in kwargs and kwargs.pop(key):
+            jsonl = True
+
+    # deal with columns
+    if column_mapper:
+        df = df.rename(columns=column_mapper)
+    if exclude_columns:
+        df = df.drop(exclude_columns, axis=1)
+    if include_columns:
+        df = df.reindex(include_columns, axis=1)
+
+    # ensure the parent dir exists
+    if isinstance(file, (str, os.PathLike)):
+        ensure_parent_dir_exist(file)
+
+    # compatibility: allow setting the index label via the `index` arg alone
+    if index_label is None and isinstance(index, str):
+        index, index_label = True, index
+
+    # handle special formats
+    if file_format == 'tsv':
+        # tsv is actually csv with a tab separator
+        file_format = 'csv'
+        kwargs['sep'] = '\t'
+    elif file_format == 'jsonl':
+        file_format = 'json'
+        jsonl = True
+
+    # save to file for each format
+    if file_format == 'csv':
+        kwargs[PD_PARAM_NEWLINE] = newline
+        df.to_csv(file, *args, compression=compression, index=index, index_label=index_label,
+                  encoding=encoding, **kwargs)
+    elif file_format == 'xlsx':
+        df.to_excel(file, *args, index=index, index_label=index_label, sheet_name=sheet_name, **kwargs)
+    elif file_format == 'json':
+        if jsonl:
+            orient = 'records'
+            indent = None
+        if orient not in ['split', 'table']:
+            index = True
+        df.to_json(file, *args, compression=compression, index=index,
+                   force_ascii=force_ascii, orient=orient, lines=jsonl,
+                   indent=indent, **kwargs)
+    elif file_format == 'parquet':
+        df.to_parquet(file, *args, compression=compression, index=index, **kwargs)
+    else:
+        raise IOError(f"Unknown file format: {file}")
+
+def iter_dataframe(data: pd.DataFrame,
+                   progress_bar: Union[bool, str, 'tqdm', Callable[[Iterable[Any]], 'tqdm']] = False
+                   ) -> Iterable[Tuple[Hashable, pd.Series]]:
+    """
+    iterate over dataframe rows, optionally showing a progress bar
+    :param data: dataframe
+    :param progress_bar: show a progress bar or not;
+        if set to a non-empty string, the string will be used as the progress bar description
+    """
+    rows = data.iterrows()
+    if progress_bar:
+        from tqdm import tqdm
+        if isinstance(progress_bar, tqdm):
+            progress_bar.iterable = rows
+            rows = progress_bar
+        elif isinstance(progress_bar, str):
+            rows = tqdm(rows, total=len(data), desc=progress_bar)
+        elif callable(progress_bar):
+            rows = progress_bar(rows)
+        else:
+            rows = tqdm(rows, total=len(data))
+    return rows
+
+def extract_dataframe_sample(data: pd.DataFrame,
+                             filter_func: Callable[[pd.Series], bool],
+                             size=0, shuffle=False,
+                             return_format: Literal['df', 'dataframe', 'list'] = 'dataframe',
+                             progress_bar=False) -> Union[pd.DataFrame, List[pd.Series]]:
+    """
+    extract a sample from a dataframe
+    :param data: original data
+    :param filter_func: bool function; `True` means the row is kept
+    :param size: max size of the result
+    :param shuffle: shuffle the result or not
+    :param progress_bar: passed to `iter_dataframe()`
+    :param return_format: one of {'dataframe', 'list'}
+    """
+    result = [row for _, row in iter_dataframe(data, progress_bar=progress_bar) if filter_func(row)]
+    if shuffle:
+        random.shuffle(result)
+    if 0 < size < len(result):
+        result = result[:size]
+    if return_format == 'df' or return_format == 'dataframe':
+        try:
+            return pd.DataFrame(result)
+        except pd.errors.InvalidIndexError:
+            return pd.DataFrame([{k: v for k, v in x.items()} for x in result])
+    elif return_format == 'list':
+        return result
+    raise ValueError("Param 'return_format' should be one of {'dataframe', 'list'}.")
+
+def is_empty_text(s: str) -> bool:
+    return pd.isna(s) or not s
+
+def is_nonempty_text(s: str) -> bool:
+    return pd.notna(s) and isinstance(s, str) and s
+
+def is_blank_text(s: str) -> bool:
+    return pd.isna(s) or isinstance(s, str) and not s.strip()
+
+def is_non_blank_text(s: str) -> bool:
+    return pd.notna(s) and isinstance(s, str) and s.strip()
+
+def join_values(values: Sequence[Any], sep=None) -> str:
+    if not values:
+        return ''
+    if len(values) == 1:
+        return str(values[0])
+    return sep.join(map(str, values)) if sep else values
+
+def merge_dataframe_rows(data: pd.DataFrame, col_id='ID', na=None, join_sep=None, progress_bar=False) -> pd.DataFrame:
+    """
+    merge rows with the same id into one row, similar to GROUP BY in SQL
+    :param data: original data
+    :param col_id: column name of the id column
+    :param na: values to be treated as na
+    :param join_sep: separator to join multiple values
+    :param progress_bar: passed to `iter_dataframe()`
+    """
+    if na is None:
+        na = set()
+    elif isinstance(na, str):
+        na = {na}
+    else:
+        na = set(na)
+    counts = collections.defaultdict(lambda: collections.defaultdict(collections.Counter))
+    rows = iter_dataframe(data, progress_bar=progress_bar)
+    for i, row in rows:
+        eid = row[col_id]
+        for k, v in row.items():
+            if pd.notna(v) and v not in na:
+                counts[eid][k][v] += 1
+    result = []
+    for x in counts.values():
+        item = {col: join_values(list(values.keys()), sep=join_sep) for col, values in x.items()}
+        result.append(item)
+    return pd.DataFrame(result)
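A minimal round-trip sketch of the dataframe helpers above; the file path and sample rows are made up for illustration:

# Illustrative only; paths and data are hypothetical.
import pandas as pd
from feilian import read_dataframe, save_dataframe, merge_dataframe_rows

df = pd.DataFrame([
    {'ID': 1, 'name': 'a', 'tag': 'x'},
    {'ID': 1, 'name': 'a', 'tag': 'y'},
    {'ID': 2, 'name': 'b', 'tag': 'z'},
])

# The format is inferred from the extension; the parent dir is created if missing.
save_dataframe('out/data.jsonl', df)    # JSON Lines (csv/tsv/xlsx/parquet also work)
df2 = read_dataframe('out/data.jsonl')  # encoding is auto-detected for csv/json

# Collapse rows sharing an ID; repeated values in a column are joined with `join_sep`.
merged = merge_dataframe_rows(df, col_id='ID', join_sep='|')
# -> two rows; ID 1's tag column becomes 'x|y' (values are stringified when joined)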
feilian/datetime.py
ADDED
@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+
+from ._typing import Union
+import pandas as pd
+import datetime
+
+def format_time(time: Union[str, int, float, datetime.datetime] = None, fmt='%Y-%m-%d %H:%M:%S') -> str:
+    if time is None:
+        time = datetime.datetime.now()
+    elif isinstance(time, (int, float)):
+        time = datetime.datetime.fromtimestamp(time)
+    elif isinstance(time, str):
+        time = pd.to_datetime(time)
+    else:
+        if not isinstance(time, datetime.datetime):
+            raise ValueError(f"Unexpected type: {type(time)}")
+    return time.strftime(fmt)
+
+# when formatting a date, using no separator is the more common style
+def format_date(date: Union[str, int, float, datetime.datetime] = None, sep='') -> str:
+    return format_time(date, fmt=sep.join(['%Y', '%m', '%d']))
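For reference, a hedged sketch of the two formatters above (exact outputs depend on the current clock and local time zone):

from feilian import format_time, format_date

format_time()                       # current time, e.g. '2024-01-31 12:34:56'
format_time(0)                      # a unix timestamp, rendered in the local time zone
format_time('2024-01-31 12:34:56')  # strings are parsed via pd.to_datetime()
format_date('2024-01-31')           # default sep='' -> '20240131'
format_date('2024-01-31', sep='-')  # -> '2024-01-31'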
feilian/excel.py
ADDED
@@ -0,0 +1,83 @@
+import pandas as pd
+from ._typing import Union, Iterable, Dict, Sequence, Any, List, Tuple
+from .dataframe import save_dataframe
+
+def _save_excel(file, df, *args, **kwargs):
+    # if df is a list of dataframes, save each dataframe into its own sheet
+    if isinstance(df, (list, tuple)) and df and all(isinstance(x, pd.DataFrame) for x in df):
+        if 'sheet_name' in kwargs:
+            kwargs.pop('sheet_name')
+        with pd.ExcelWriter(file) as writer:
+            for i, x in enumerate(df, 1):
+                save_dataframe(writer, x, *args, sheet_name=f"Sheet{i}", **kwargs)
+    elif isinstance(df, dict) and df and all(isinstance(x, pd.DataFrame) for x in df.values()):
+        if 'sheet_name' in kwargs:
+            kwargs.pop('sheet_name')
+        with pd.ExcelWriter(file) as writer:
+            for name, x in df.items():
+                save_dataframe(writer, x, *args, sheet_name=name, **kwargs)
+    else:
+        return save_dataframe(file, df, *args, **kwargs)
+
+_FILE_TYPES = Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[str]']
+_DATA_TYPES = Union[
+    pd.DataFrame, Iterable[Union[pd.Series, Dict[str, Any]]],
+    List[pd.DataFrame], Tuple[pd.DataFrame], Dict[str, pd.DataFrame]
+]
+
+def save_excel(file: _FILE_TYPES, df: _DATA_TYPES,
+               *args, sheet_name='Sheet1',
+               header: Union[Sequence[str], bool] = True,
+               index=False, index_label=None,
+               column_mapper: Union[Dict[str, str], Sequence[str]] = None,
+               include_columns: Sequence[str] = None,
+               exclude_columns: Sequence[str] = None,
+               **kwargs):
+    """
+    save data into a file
+    :param file: where to save the data
+    :param df: the data
+    :param args: extra args for `df.to_xx()`
+    :param sheet_name: `sheet_name` for the excel format
+    :param header: `header` for the excel format
+    :param index: save the index or not, see docs in `df.to_csv()`;
+        if set as a str and `index_label` is not set, `index_label` will be set to this value
+    :param index_label: header for the index when `index` is `True`
+    :param column_mapper: rename columns; if set, columns not listed here will be ignored
+    :param include_columns: if set, columns not listed here will be ignored
+    :param exclude_columns: if set, columns listed here will be ignored
+    :param kwargs: extra kwargs for `df.to_xx()`
+    """
+    _save_excel(
+        file, df, *args,
+        sheet_name=sheet_name,
+        header=header,
+        index=index,
+        index_label=index_label,
+        column_mapper=column_mapper,
+        include_columns=include_columns,
+        exclude_columns=exclude_columns,
+        **kwargs
+    )
+
+def write_excel(
+        file: _FILE_TYPES, df: _DATA_TYPES,
+        *args, sheet_name='Sheet1',
+        header: Union[Sequence[str], bool] = True,
+        index=False, index_label=None,
+        column_mapper: Union[Dict[str, str], Sequence[str]] = None,
+        include_columns: Sequence[str] = None,
+        exclude_columns: Sequence[str] = None,
+        **kwargs
+):
+    save_excel(
+        file, df, *args,
+        sheet_name=sheet_name,
+        header=header,
+        index=index,
+        index_label=index_label,
+        column_mapper=column_mapper,
+        include_columns=include_columns,
+        exclude_columns=exclude_columns,
+        **kwargs
+    )
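A short sketch of the multi-sheet behavior of `save_excel`; the file names are illustrative, and writing .xlsx requires an Excel engine such as openpyxl:

# Illustrative only; file names are hypothetical.
import pandas as pd
from feilian import save_excel

a = pd.DataFrame({'x': [1, 2]})
b = pd.DataFrame({'y': [3, 4]})

save_excel('single.xlsx', a)                  # one sheet, default name 'Sheet1'
save_excel('numbered.xlsx', [a, b])           # list -> sheets 'Sheet1', 'Sheet2'
save_excel('named.xlsx', {'xs': a, 'ys': b})  # dict -> sheet names taken from the keys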