feilian 1.2.2.tar.gz → 1.3.4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {feilian-1.2.2 → feilian-1.3.4}/PKG-INFO +4 -2
- {feilian-1.2.2 → feilian-1.3.4}/feilian/__init__.py +10 -2
- {feilian-1.2.2 → feilian-1.3.4}/feilian/_dist_ver.py +2 -2
- feilian-1.3.4/feilian/_typing.py +5 -0
- {feilian-1.2.2 → feilian-1.3.4}/feilian/arg.py +2 -5
- {feilian-1.2.2 → feilian-1.3.4}/feilian/dataframe.py +68 -24
- {feilian-1.2.2 → feilian-1.3.4}/feilian/datetime.py +1 -1
- feilian-1.3.4/feilian/excel.py +83 -0
- feilian-1.3.4/feilian/json.py +262 -0
- {feilian-1.2.2 → feilian-1.3.4}/feilian/process.py +16 -6
- {feilian-1.2.2 → feilian-1.3.4}/feilian/string.py +1 -1
- feilian-1.3.4/feilian/txt.py +54 -0
- {feilian-1.2.2 → feilian-1.3.4}/feilian/utils.py +1 -1
- {feilian-1.2.2 → feilian-1.3.4}/feilian.egg-info/PKG-INFO +4 -2
- {feilian-1.2.2 → feilian-1.3.4}/feilian.egg-info/SOURCES.txt +3 -0
- {feilian-1.2.2 → feilian-1.3.4}/feilian.egg-info/requires.txt +2 -0
- {feilian-1.2.2 → feilian-1.3.4}/pyproject.toml +3 -1
- {feilian-1.2.2 → feilian-1.3.4}/requirements.txt +3 -1
- feilian-1.2.2/feilian/json.py +0 -39
- {feilian-1.2.2 → feilian-1.3.4}/README.md +0 -0
- {feilian-1.2.2 → feilian-1.3.4}/build.sh +0 -0
- {feilian-1.2.2 → feilian-1.3.4}/feilian/io.py +0 -0
- {feilian-1.2.2 → feilian-1.3.4}/feilian/version.py +0 -0
- {feilian-1.2.2 → feilian-1.3.4}/feilian.egg-info/dependency_links.txt +0 -0
- {feilian-1.2.2 → feilian-1.3.4}/setup.cfg +0 -0
{feilian-1.2.2 → feilian-1.3.4}/PKG-INFO

@@ -1,13 +1,15 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: feilian
-Version: 1.2.2
+Version: 1.3.4
 Summary: General data processing tool.
 Author-email: darkpeath <darkpeath@gmail.com>
 Project-URL: Homepage, https://github.com/darkpeath/feilian
 Description-Content-Type: text/markdown
+Requires-Dist: chardet
 Requires-Dist: pandas
 Provides-Extra: extra
 Requires-Dist: tqdm; extra == "extra"
+Requires-Dist: ijson; extra == "extra"

 # feilian

{feilian-1.2.2 → feilian-1.3.4}/feilian/__init__.py

@@ -5,8 +5,13 @@ from .dataframe import read_dataframe, save_dataframe, extract_dataframe_sample,
 from .dataframe import is_empty_text, is_nonempty_text, is_blank_text, is_non_blank_text
 from .datetime import format_time, format_date
 from .arg import ArgValueParser
-from .json import read_json, save_json
+from .json import read_json, save_json, write_json, read_big_json
+from .txt import (
+    detect_stream_encoding, detect_file_encoding, get_file_encoding,
+    read_txt, save_txt, write_txt,
+)
 from .process import DataframeProcessor
+from .excel import save_excel, write_excel
 from .utils import flatten_dict, flatten_list
 from .version import __version__

@@ -16,7 +21,10 @@ __all__ = [
     'is_empty_text', 'is_nonempty_text', 'is_blank_text', 'is_non_blank_text',
     'format_time', 'format_date',
     'ArgValueParser',
-    'read_json', 'save_json',
+    'read_json', 'save_json', 'write_json', 'read_big_json',
+    'detect_stream_encoding', 'detect_file_encoding', 'get_file_encoding',
+    'read_txt', 'save_txt', 'write_txt',
+    'save_excel', 'write_excel',
     'DataframeProcessor',
     'flatten_dict', 'flatten_list',
     '__version__',
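Taken together, the new imports and `__all__` entries above expand the top-level API. A minimal sketch of how the re-exported helpers can be called, based only on the signatures in this diff (file names are illustrative):

    import pandas as pd
    import feilian

    # text helpers re-exported from feilian.txt
    feilian.save_txt("notes.txt", "hello feilian")
    print(feilian.get_file_encoding("notes.txt", encoding="auto"))  # detected via chardet

    # json helpers re-exported from feilian.json (write_json simply forwards to save_json)
    feilian.write_json("items.jsonl", [{"id": 1}, {"id": 2}])
    print(feilian.read_json("items.jsonl"))  # jsonl format inferred from the suffix

    # excel helpers re-exported from feilian.excel (one sheet per dict entry;
    # needs an Excel engine such as openpyxl installed)
    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    feilian.write_excel("report.xlsx", {"summary": df, "detail": df})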
{feilian-1.2.2 → feilian-1.3.4}/feilian/arg.py

@@ -1,14 +1,11 @@
 # -*- coding: utf-8 -*-

-from typing import (
+from ._typing import (
     Union, List, Any, Iterable,
     Callable, Set, Optional, Tuple,
     Dict, Hashable, Sequence,
+    Literal,
 )
-try:
-    from typing import Literal
-except ImportError:
-    from typing_extensions import Literal

 _build_in_na_checkers = {
     'always_na': lambda x: True,
{feilian-1.2.2 → feilian-1.3.4}/feilian/dataframe.py

@@ -4,17 +4,15 @@
 Encapsulate methods for pandas `DataFrame`.
 """

-
-try:
-    from typing import Literal
-except ImportError:
-    from typing_extensions import Literal
-
+import io
 import os
+import pathlib
 import pandas as pd
 import random
 import collections
+from ._typing import Union, Iterable, Dict, List, Any, Sequence, Callable, Tuple, Hashable, Literal
 from .io import ensure_parent_dir_exist
+from .txt import detect_stream_encoding, detect_file_encoding

 # Compatible with different pandas versions
 PD_PARAM_NEWLINE = 'lineterminator'
@@ -25,9 +23,31 @@ if pd_version[0] < 1 or (pd_version[0] == 1 and pd_version[1] < 5):
 FILE_FORMAT = Literal['csv', 'tsv', 'json', 'xlsx', 'parquet']
 COMPRESSION_FORMAT = Literal[None, 'infer', 'snappy', 'gzip', 'brotli', 'bz2', 'zip', 'xz']

-def read_dataframe(file: str, *args, sheet_name=0,
-                   file_format: FILE_FORMAT = None,
+def _drop_na_values(data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], axis: Literal['columns', 'rows']):
+    if isinstance(data, pd.DataFrame):
+        data.dropna(axis=axis, how='all', inplace=True)
+    else:
+        assert isinstance(data, dict)
+        for df in data.values():
+            df.dropna(axis=axis, how='all', inplace=True)
+
+def _infer_file_format(file) -> str:
+    if isinstance(file, pd.ExcelWriter):
+        return 'xlsx'
+    elif isinstance(file, str):
+        return os.path.splitext(file)[1].lower()[1:]
+    elif isinstance(file, pathlib.PurePath):
+        suf = file.suffix
+        return suf[1:] if suf.startswith('.') else suf
+    elif isinstance(file, os.PathLike):
+        return os.path.splitext(os.fspath(file))[1].lower().lstrip('.')
+    else:
+        raise ValueError(f"Cannot infer format for type: {type(file)}")
+
+def read_dataframe(file: Union[str, os.PathLike, io.IOBase], *args, sheet_name=0,
+                   file_format: FILE_FORMAT = None, encoding='auto',
                    jsonl=False, dtype: type = None,
+                   drop_na_columns=False, drop_na_rows=False,
                    **kwargs) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
     """
     read file as pandas `DataFrame`
@@ -35,15 +55,16 @@ def read_dataframe(file: str, *args, sheet_name=0,
     :param args: extra args for `pd.read_xx()`
     :param sheet_name: `sheet_name` for `pd.read_excel()`
     :param file_format: csv, tsv, json ,xlsx, parquet
+    :param encoding: text file encoding
     :param jsonl: jsonl format or not, only used in json format
     :param dtype: `dtype` for `pd.read_xx()`
+    :param drop_na_columns: drop column if all values of the column is na
+    :param drop_na_rows: drop row if all values of the row is na
     :param kwargs: extra kwargs for `pd.read_xx()`
     """
     # decide the file format
     if not file_format:
-
-            raise ValueError("Format should given!")
-        file_format = os.path.splitext(file)[1].lower()[1:]
+        file_format = _infer_file_format(file)

     for key in ['lines', 'line_delimited_json_format']:
         if key in kwargs and kwargs.pop(key):
@@ -60,18 +81,44 @@ def read_dataframe(file: str, *args, sheet_name=0,
             file_format = 'json'
             jsonl = True

+    # detect encoding
+    if encoding == 'auto' and file_format in ['csv', 'json']:
+        if isinstance(file, (str, os.PathLike)):
+            encoding = detect_file_encoding(file)
+        elif isinstance(file, io.IOBase) and file.seekable():
+            tell = file.tell()
+            encoding = detect_stream_encoding(file)
+            file.seek(tell)
+        else:
+            # read file may cause content change, so we cannot detect the encoding
+            encoding = None
+
     if file_format == 'csv':
-
+        df = pd.read_csv(file, *args, encoding=encoding, dtype=dtype, **kwargs)
     elif file_format == 'xlsx':
-
+        df = pd.read_excel(file, *args, sheet_name=sheet_name, dtype=dtype, **kwargs)
     elif file_format == 'json':
-
+        try:
+            df = pd.read_json(file, *args, encoding=encoding, lines=jsonl, dtype=dtype, **kwargs)
+        except Exception as e:
+            # if failed, try again with different arg `lines`
+            try:
+                df = pd.read_json(file, *args, lines=not jsonl, dtype=dtype, **kwargs)
+            except Exception:
+                raise e
     elif file_format == 'parquet':
-
+        df = pd.read_parquet(file, *args, **kwargs)
     else:
         raise IOError(f"Unknown file format: {file}")

-def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[str]'],
+    if drop_na_columns:
+        _drop_na_values(df, axis='columns')
+    if drop_na_rows:
+        _drop_na_values(df, axis='rows')
+
+    return df
+
+def save_dataframe(file: Union[str, os.PathLike, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[str]'],
                    df: Union[pd.DataFrame, Iterable[Union[pd.Series, Dict[str, Any]]]],
                    *args, sheet_name='Sheet1',
                    file_format: FILE_FORMAT = None,
@@ -79,7 +126,7 @@ def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[st
                    index=False, index_label=None,
                    encoding='utf-8', newline='\n',
                    force_ascii=False,
-                   orient='records', jsonl=True,
+                   orient='records', jsonl=True, indent=None,
                    column_mapper: Union[Dict[str, str], Sequence[str]] = None,
                    include_columns: Sequence[str] = None,
                    exclude_columns: Sequence[str] = None,
@@ -101,6 +148,7 @@ def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[st
     :param force_ascii: `force_ascii` for json format
     :param orient: `orient` for json format
     :param jsonl: jsonl format or not
+    :param indent: indent for json format
     :param column_mapper: rename columns; if set, columns not list here will be ignored
     :param include_columns: if set, columns not list here will be ignored
     :param exclude_columns: if set, columns list here will be ignored
@@ -108,12 +156,7 @@ def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[st
     """
     # decide file format
     if not file_format:
-
-            file_format = os.path.splitext(file)[1].lower()[1:]
-        elif isinstance(file, pd.ExcelWriter):
-            file_format = 'xlsx'
-        else:
-            raise ValueError("Format should given!")
+        file_format = _infer_file_format(file)

     # convert data to be a dataframe
     if not isinstance(df, pd.DataFrame):
@@ -158,11 +201,12 @@ def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[st
     elif file_format == 'json':
         if jsonl:
             orient = 'records'
+            indent = None
         if orient not in ['split', 'table']:
             index = True
         df.to_json(file, *args, compression=compression, index=index,
                    force_ascii=force_ascii, orient=orient, lines=jsonl,
-                   **kwargs)
+                   indent=indent, **kwargs)
     elif file_format == 'parquet':
         df.to_parquet(file, *args, compression=compression, index=index, **kwargs)
     else:
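In short, `read_dataframe` now accepts paths, `os.PathLike` objects and streams, can auto-detect text encoding for csv/json input, and can drop all-NA rows or columns, while `save_dataframe` gains an `indent` option for JSON output. A rough usage sketch based on the signatures above (file name illustrative; `indent` is forced to `None` when `jsonl=True`):

    import pandas as pd
    from feilian import read_dataframe, save_dataframe

    df = pd.DataFrame({"name": ["alice", "bob"], "score": [90, None]})

    # indent only matters for non-jsonl JSON output
    save_dataframe("scores.json", df, jsonl=False, indent=2)

    # encoding='auto' runs chardet on csv/json paths or seekable streams;
    # drop_na_rows/drop_na_columns call dropna(how='all') on the result
    out = read_dataframe("scores.json", encoding="auto", drop_na_rows=True)
    print(out)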
feilian-1.3.4/feilian/excel.py

@@ -0,0 +1,83 @@
+import pandas as pd
+from ._typing import Union, Iterable, Dict, Sequence, Any, List, Tuple
+from .dataframe import save_dataframe
+
+def _save_excel(file, df, *args, **kwargs):
+    # if df is a list of dataframe, then save each dataframe into a sheet
+    if isinstance(df, (list, tuple)) and df and all(isinstance(x, pd.DataFrame) for x in df):
+        if 'sheet_name' in kwargs:
+            kwargs.pop('sheet_name')
+        with pd.ExcelWriter(file) as writer:
+            for i, x in enumerate(df, 1):
+                save_dataframe(writer, x, *args, sheet_name=f"Sheet{i}", **kwargs)
+    elif isinstance(df, dict) and df and all(isinstance(x, pd.DataFrame) for x in df.values()):
+        if 'sheet_name' in kwargs:
+            kwargs.pop('sheet_name')
+        with pd.ExcelWriter(file) as writer:
+            for name, x in df.items():
+                save_dataframe(writer, x, *args, sheet_name=name, **kwargs)
+    else:
+        return save_dataframe(file, df, *args, **kwargs)
+
+_FILE_TYPES = Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[str]']
+_DATA_TYPES = Union[
+    pd.DataFrame, Iterable[Union[pd.Series, Dict[str, Any]]],
+    List[pd.DataFrame], Tuple[pd.DataFrame], Dict[str, pd.DataFrame]
+]
+
+def save_excel(file: _FILE_TYPES, df: _DATA_TYPES,
+               *args, sheet_name='Sheet1',
+               header: Union[Sequence[str], bool] = True,
+               index=False, index_label=None,
+               column_mapper: Union[Dict[str, str], Sequence[str]] = None,
+               include_columns: Sequence[str] = None,
+               exclude_columns: Sequence[str] = None,
+               **kwargs):
+    """
+    save data into file
+    :param file: where to save the data to
+    :param df: the data
+    :param args: extra args for df.to_xx()
+    :param sheet_name: `sheet_name` for excel format
+    :param header: `header` for excel format
+    :param index: save index or not, see docs in df.to_csv();
+                  if set as str and `index_label` not set, `index_label` will be set as this
+    :param index_label: header for the index when `index` is `True`
+    :param column_mapper: rename columns; if set, columns not list here will be ignored
+    :param include_columns: if set, columns not list here will be ignored
+    :param exclude_columns: if set, columns list here will be ignored
+    :param kwargs: extra kwargs for df.to_xx()
+    """
+    _save_excel(
+        file, df, *args,
+        sheet_name=sheet_name,
+        header=header,
+        index=index,
+        index_label=index_label,
+        column_mapper=column_mapper,
+        include_columns=include_columns,
+        exclude_columns=exclude_columns,
+        **kwargs
+    )
+
+def write_excel(
+    file: _FILE_TYPES, df: _DATA_TYPES,
+    *args, sheet_name='Sheet1',
+    header: Union[Sequence[str], bool] = True,
+    index=False, index_label=None,
+    column_mapper: Union[Dict[str, str], Sequence[str]] = None,
+    include_columns: Sequence[str] = None,
+    exclude_columns: Sequence[str] = None,
+    **kwargs
+):
+    save_excel(
+        file, df, *args,
+        sheet_name=sheet_name,
+        header=header,
+        index=index,
+        index_label=index_label,
+        column_mapper=column_mapper,
+        include_columns=include_columns,
+        exclude_columns=exclude_columns,
+        **kwargs
+    )
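The new `feilian/excel.py` is a thin layer over `save_dataframe`: a list or tuple of DataFrames goes to Sheet1, Sheet2, ... and a dict of DataFrames goes to sheets named after its keys; anything else falls through to `save_dataframe` unchanged. A small sketch (output paths illustrative; an Excel engine such as openpyxl must be installed for pandas' ExcelWriter):

    import pandas as pd
    from feilian import save_excel

    summary = pd.DataFrame({"metric": ["rows", "cols"], "value": [100, 8]})
    detail = pd.DataFrame({"id": [1, 2, 3], "ok": [True, False, True]})

    save_excel("report.xlsx", {"summary": summary, "detail": detail})  # named sheets
    save_excel("report_numbered.xlsx", [summary, detail])              # Sheet1, Sheet2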
feilian-1.3.4/feilian/json.py

@@ -0,0 +1,262 @@
+# -*- coding: utf-8 -*-
+
+from typing import Dict, List, Union, Any
+from pathlib import Path
+import os
+import abc
+import json
+from decimal import Decimal
+from .io import ensure_parent_dir_exist
+from .txt import get_file_encoding
+try:
+    import ijson
+except ImportError as e:
+    ijson = None
+
+def _read_json(filepath: Union[str, os.PathLike], jsonl: bool, encoding='utf-8', **kwargs):
+    """
+    The actual read function.
+    """
+    encoding = get_file_encoding(filepath, encoding=encoding)
+    with open(filepath, encoding=encoding) as f:
+        if jsonl:
+            return [json.loads(x, **kwargs) for x in f]
+        else:
+            return json.load(f, **kwargs)
+
+def _is_jsonl(filepath: Union[str, os.PathLike], jsonl: bool = None) -> bool:
+    if jsonl is None:
+        filepath = Path(filepath)
+        jsonl = filepath.suffix.lower() == '.jsonl'
+    return jsonl
+
+def read_json(
+        filepath: Union[str, os.PathLike],
+        jsonl: bool = None,
+        encoding: str = 'auto',
+        **kwargs
+) -> Union[Dict[str, Any], List[Any]]:
+    """
+    An agent for `json.load()` with some default value.
+    """
+    jsonl = _is_jsonl(filepath, jsonl)
+    try:
+        return _read_json(filepath, jsonl=jsonl, encoding=encoding, **kwargs)
+    except Exception as e:
+        # if failed, try again with different arg `jsonl`
+        try:
+            return _read_json(filepath, jsonl=not jsonl, encoding=encoding, **kwargs)
+        except Exception:
+            raise e
+
+def save_json(
+        filepath: Union[str, os.PathLike],
+        data: Union[Dict[str, Any], List[Any]],
+        jsonl: bool = None,
+        encoding: str = 'utf-8',
+        newline: str = '\n',
+        indent: int = 2,
+        ensure_ascii: bool = False,
+        **kwargs
+):
+    """
+    An agent for `json.dump()` with some default value.
+    """
+    jsonl = _is_jsonl(filepath, jsonl)
+    if jsonl and not isinstance(data, list):
+        # data should be a list
+        raise ValueError("data should be a list when save as jsonl format")
+    ensure_parent_dir_exist(filepath)
+    with open(filepath, 'w', encoding=encoding, newline=newline) as f:
+        if jsonl:
+            for x in data:
+                f.write(json.dumps(x, ensure_ascii=ensure_ascii, **kwargs))
+                f.write(newline)
+        else:
+            json.dump(data, f, indent=indent, ensure_ascii=ensure_ascii, **kwargs)
+
+def write_json(
+        filepath: Union[str, os.PathLike],
+        data: Union[Dict[str, Any], List[Any]],
+        jsonl: bool = None,
+        encoding: str = 'utf-8',
+        newline: str = '\n',
+        indent: int = 2,
+        ensure_ascii: bool = False,
+        **kwargs
+):
+    save_json(
+        filepath=filepath,
+        data=data,
+        jsonl=jsonl,
+        encoding=encoding,
+        newline=newline,
+        indent=indent,
+        ensure_ascii=ensure_ascii,
+        **kwargs
+    )
+
+
+class _JsonNode:
+    def __init__(self, type: str = '', parent: '_JsonNode' = None):
+        self._type = ''
+        self._value = None
+        self._parent = parent
+        if type:
+            self.type = type
+
+    def clear(self):
+        if self._type == 'map':
+            self._value.clear()
+        elif self._type == 'array':
+            self._value.clear()
+
+    @property
+    def parent(self):
+        return self._parent
+
+    @property
+    def type(self):
+        return self._type
+
+    @type.setter
+    def type(self, value):
+        if self._type:
+            raise ValueError('type is already set')
+        self._type = value
+        if value == 'map':
+            self._value = {}
+        elif value == 'array':
+            self._value = []
+
+    @property
+    def value(self):
+        if not self._type:
+            raise ValueError('type is not set')
+        if self._type == 'dummy':
+            assert isinstance(self._value, _JsonNode)
+            return self._value.value
+        if self._type == 'map':
+            assert isinstance(self._value, dict)
+            return {k: v.value for k, v in self._value.items()}
+        if self._type == 'array':
+            assert isinstance(self._value, list)
+            return [v.value for v in self._value]
+        return self._value
+
+    @value.setter
+    def value(self, value):
+        if not self._type:
+            raise ValueError('type is not set')
+        if self._type in ['dummy', 'map', 'array']:
+            raise RuntimeError('cannot set value for dummy, map, array')
+        self._value = value
+
+    def __repr__(self):
+        return str(self.value)
+
+    def __str__(self):
+        return str(self.value)
+
+class StreamJsonReader(abc.ABC):
+    """
+    Iterate over a json file.
+    """
+    def __init__(self, filepath: Union[str, os.PathLike], encoding: str = None, limit: int = float('inf')):
+        self.filepath = filepath
+        self.encoding = encoding
+        self.limit = limit
+        self._data_type = ''  # dict or list
+
+    @property
+    def data_type(self):
+        return self._data_type
+
+    def __iter__(self):
+        raise NotImplementedError
+
+class BigJsonReader(StreamJsonReader):
+    def __iter__(self):
+        with open(self.filepath, 'rb') as f:
+            parser = ijson.parse(f)
+            dummy = node = _JsonNode('')
+            cnt = 0
+            for prefix, event, value in parser:
+                if event == 'start_map':
+                    if node.type == 'array':
+                        child = _JsonNode(type='map', parent=node)
+                        node._value.append(child)
+                        node = child
+                    else:
+                        node.type = 'map'
+                elif event == 'end_map':
+                    node = node.parent
+                elif event == 'start_array':
+                    node.type = 'array'
+                elif event == 'end_array':
+                    node = node.parent
+                elif event == 'map_key':
+                    assert node.type == 'map', f"{event} {value} {prefix}"
+                    child = _JsonNode(parent=node)
+                    node._value[value] = child
+                    node = child
+                else:
+                    assert event in ['null', 'boolean', 'integer', 'double', 'number', 'string']
+                    if isinstance(value, Decimal):
+                        value = float(value)
+                    if node.type == 'array':
+                        child = _JsonNode(type=event, parent=node)
+                        child.value = value
+                        node._value.append(child)
+                    else:
+                        assert not node.type
+                        node.type = event
+                        node.value = value
+                        node = node.parent
+                if node == dummy and event not in ['start_map', 'start_array']:
+                    assert node.type in ['map', 'array']
+                    if node.type == 'map':
+                        value = node.value
+                        assert isinstance(value, dict)
+                        assert len(value) == 1
+                        k, v = list(value.items())[0]
+                        self._data_type = 'dict'
+                        yield k, v
+                        node.clear()
+                    elif node.type == 'array':
+                        value = node.value
+                        assert isinstance(value, list)
+                        assert len(value) == 1
+                        self._data_type = 'list'
+                        yield value[0]
+                        node.clear()
+                    cnt += 1
+                    if cnt >= self.limit:
+                        break
+
+class JsonlReader(StreamJsonReader):
+    @property
+    def data_type(self):
+        return 'list'
+
+    def __iter__(self):
+        with open(self.filepath, encoding=self.encoding) as f:
+            for i, line in enumerate(f, 1):
+                yield json.loads(line)
+                if i >= self.limit:
+                    break
+
+def read_big_json(
+        filepath: Union[str, os.PathLike],
+        jsonl: bool = None,
+        encoding: str = 'auto',
+) -> StreamJsonReader:
+    jsonl = _is_jsonl(filepath, jsonl)
+    encoding = get_file_encoding(filepath, encoding=encoding)
+    if jsonl:
+        return JsonlReader(filepath, encoding=encoding)
+    else:
+        if ijson is None:
+            raise ImportError('ijson is not installed')
+        return BigJsonReader(filepath)
+
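The rewritten `feilian/json.py` keeps `read_json`/`save_json` (now with optional encoding detection and a `write_json` alias) and adds `read_big_json`, which returns a lazy reader instead of loading everything: `.jsonl` files are streamed line by line, and plain `.json` files are walked with ijson (an optional dependency), yielding top-level array elements one at a time, or `(key, value)` pairs for a top-level object. A sketch under those assumptions (file names illustrative):

    from feilian import save_json, read_big_json

    # a top-level array: elements are yielded one by one (needs ijson for plain .json)
    save_json("events.json", [{"id": i} for i in range(3)])
    for item in read_big_json("events.json"):
        print(item)  # {'id': 0}, {'id': 1}, {'id': 2}

    # jsonl input streams without ijson
    save_json("events.jsonl", [{"id": i} for i in range(3)])
    reader = read_big_json("events.jsonl", encoding="auto")
    print(list(reader), reader.data_type)  # [...] 'list'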
{feilian-1.2.2 → feilian-1.3.4}/feilian/process.py

@@ -1,8 +1,14 @@
 import abc
 import tqdm
 import pandas as pd
-from
-
+from ._typing import (
+    Any, Dict, Hashable, List,
+    Tuple, Union, Iterable, Optional,
+)
+from .dataframe import (
+    read_dataframe,
+    save_dataframe,
+)

 class BaseProcessor(abc.ABC):
     """
@@ -55,11 +61,13 @@ class BaseProcessor(abc.ABC):
         self.save_result(output_path or input_path, result)

 class DataframeProcessor(BaseProcessor, abc.ABC):
-    def __init__(self, input_dtype=None, progress=False, read_args: Dict[str, Any] = None):
+    def __init__(self, input_dtype=None, progress=False, read_args: Dict[str, Any] = None,
+                 write_args: Dict[str, Any] = None):
         self.progress = progress
         self.read_args = read_args or {}
         if input_dtype is not None:
            self.read_args['dtype'] = input_dtype
+        self.write_args = write_args or {}

     def read_single_file(self, filepath: str) -> pd.DataFrame:
         return read_dataframe(filepath, **self.read_args)
@@ -71,12 +79,13 @@ class DataframeProcessor(BaseProcessor, abc.ABC):
         return super().read_data(filepath)

     def save_result(self, filepath: str, result: pd.DataFrame):
-        save_dataframe(filepath, result)
+        save_dataframe(filepath, result, **self.write_args)

     @abc.abstractmethod
-    def process_row(self, i: Hashable, row: pd.Series) -> Dict[str, Any]:
+    def process_row(self, i: Hashable, row: pd.Series) -> Optional[Dict[str, Any]]:
         """
         Process a single row of data.
+        :return: if `None`, ignore this row
         """

     def process(self, data: pd.DataFrame) -> pd.DataFrame:
@@ -84,6 +93,7 @@ class DataframeProcessor(BaseProcessor, abc.ABC):
         if self.progress:
             desc = "process" if self.progress is True else self.progress
             bar = tqdm.tqdm(bar, total=len(data), desc=desc)
-        res =
+        res = (self.process_row(i, row) for i, row in bar)
+        res = (x for x in res if x is not None)
         return pd.DataFrame(res)

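With these changes, `DataframeProcessor.__init__` accepts `write_args` that are forwarded to `save_dataframe`, and rows for which `process_row` returns `None` are dropped from the result. A hedged sketch of a subclass, assuming `process_row` is the only abstract hook a concrete processor still has to implement (column names are illustrative):

    import pandas as pd
    from feilian import DataframeProcessor

    class ScoreFilter(DataframeProcessor):
        def process_row(self, i, row: pd.Series):
            if pd.isna(row["score"]):
                return None  # None rows are skipped in process()
            return {"name": row["name"], "score": float(row["score"])}

    # write_args is passed through to save_dataframe when results are written
    proc = ScoreFilter(write_args={"jsonl": True})
    print(proc.process(pd.DataFrame({"name": ["a", "b"], "score": [1.0, None]})))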
feilian-1.3.4/feilian/txt.py

@@ -0,0 +1,54 @@
+from ._typing import Union, Literal
+import os
+import io
+import inspect
+import chardet
+
+_DEFAULT_CHUNK_SIZE = 1024
+
+if 'should_rename_legacy' in inspect.signature(chardet.UniversalDetector).parameters:
+    def _create_detector(should_rename_legacy: bool):
+        return chardet.UniversalDetector(should_rename_legacy=should_rename_legacy)
+else:
+    def _create_detector(should_rename_legacy: bool):
+        return chardet.UniversalDetector()
+
+def detect_stream_encoding(stream: io.IOBase, chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
+    detector = _create_detector(should_rename_legacy=should_rename_legacy)
+    while True:
+        raw = stream.read(chunk_size)
+        if not raw:
+            break
+        detector.feed(raw)
+        if detector.done:
+            break
+    detector.close()
+    return detector.result.get('encoding')
+
+def detect_text_encoding(raw: bytes, chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
+    return detect_stream_encoding(io.BytesIO(raw), chunk_size=chunk_size, should_rename_legacy=should_rename_legacy)
+
+def detect_file_encoding(path: Union[str, os.PathLike], chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
+    with open(path, 'rb') as f:
+        return detect_stream_encoding(f, chunk_size=chunk_size, should_rename_legacy=should_rename_legacy)
+
+def get_file_encoding(path: Union[str, os.PathLike], encoding: Union[None, Literal['auto'], str] = None) -> str:
+    if encoding == 'auto':
+        encoding = detect_file_encoding(path)
+    return encoding
+
+def read_txt(path: Union[str, os.PathLike], encoding: Union[None, Literal['auto'], str] = None) -> str:
+    if encoding == 'auto':
+        with open(path, 'rb') as f:
+            raw = f.read()
+        encoding = detect_stream_encoding(io.BytesIO(raw))
+        return raw.decode(encoding)
+    with open(path, 'r', encoding=encoding) as f:
+        return f.read()
+
+def save_txt(path: Union[str, os.PathLike], content: str, encoding: str = 'utf-8'):
+    with open(path, 'w', encoding=encoding) as f:
+        f.write(content)
+
+def write_txt(path: Union[str, os.PathLike], content: str, encoding: str = 'utf-8'):
+    save_txt(path=path, content=content, encoding=encoding)
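The new `feilian/txt.py` wraps chardet's incremental `UniversalDetector`: `detect_stream_encoding` feeds 1024-byte chunks until the detector is done, `detect_file_encoding` applies that to a path, and `read_txt(..., encoding='auto')` decodes the raw bytes with whatever was detected. A small sketch (file name illustrative; the detection result depends on chardet):

    from feilian import save_txt, detect_file_encoding, read_txt

    save_txt("greeting.txt", "你好, feilian")          # written as utf-8 by default
    print(detect_file_encoding("greeting.txt"))        # chardet's guess, e.g. 'utf-8'
    print(read_txt("greeting.txt", encoding="auto"))   # decoded with the detected encoding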
{feilian-1.2.2 → feilian-1.3.4}/feilian/utils.py

@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 # coding: utf-8

-from typing import Dict, Any, Union, Collection, List
+from ._typing import Dict, Any, Union, Collection, List

 def flatten_dict(data: Dict[str, Any], prefix="", joiner=".",
                  exclude: Union[None, str, Collection[str]] = None,
{feilian-1.2.2 → feilian-1.3.4}/feilian.egg-info/PKG-INFO

@@ -1,13 +1,15 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: feilian
-Version: 1.2.2
+Version: 1.3.4
 Summary: General data processing tool.
 Author-email: darkpeath <darkpeath@gmail.com>
 Project-URL: Homepage, https://github.com/darkpeath/feilian
 Description-Content-Type: text/markdown
+Requires-Dist: chardet
 Requires-Dist: pandas
 Provides-Extra: extra
 Requires-Dist: tqdm; extra == "extra"
+Requires-Dist: ijson; extra == "extra"

 # feilian

{feilian-1.2.2 → feilian-1.3.4}/feilian.egg-info/SOURCES.txt

@@ -4,13 +4,16 @@ pyproject.toml
 requirements.txt
 feilian/__init__.py
 feilian/_dist_ver.py
+feilian/_typing.py
 feilian/arg.py
 feilian/dataframe.py
 feilian/datetime.py
+feilian/excel.py
 feilian/io.py
 feilian/json.py
 feilian/process.py
 feilian/string.py
+feilian/txt.py
 feilian/utils.py
 feilian/version.py
 feilian.egg-info/PKG-INFO
{feilian-1.2.2 → feilian-1.3.4}/pyproject.toml

@@ -11,12 +11,14 @@ authors = [
     {name = "darkpeath", email = "darkpeath@gmail.com"}
 ]
 dependencies = [
+    "chardet",
     "pandas",
 ]

 [project.optional-dependencies]
 extra = [
     "tqdm",
+    "ijson",
 ]

 [project.urls]
@@ -33,4 +35,4 @@ write_to_template = """
 # don't change, don't track in version control
 VERSION = {version_tuple}
 __version__ = '{version}'
-"""
+"""
feilian-1.2.2/feilian/json.py
DELETED

@@ -1,39 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from typing import Dict, List, Union, Any
-import json
-from .io import ensure_parent_dir_exist
-
-def _is_jsonl(filepath: str, jsonl=None) -> bool:
-    if jsonl is None:
-        jsonl = filepath.lower().endswith('.jsonl')
-    return jsonl
-
-def read_json(filepath: str, jsonl=None, encoding='utf-8', **kwargs):
-    """
-    An agent for `json.load()` with some default value.
-    """
-    jsonl = _is_jsonl(filepath, jsonl)
-    with open(filepath, encoding=encoding) as f:
-        if jsonl:
-            return [json.loads(x) for x in f]
-        else:
-            return json.load(f, **kwargs)
-
-def save_json(filepath: str, data: Union[Dict[str, Any], List[Any]], jsonl=False,
-              encoding='utf-8', newline='\n', indent=2, ensure_ascii=False, **kwargs):
-    """
-    An agent for `json.dump()` with some default value.
-    """
-    jsonl = _is_jsonl(filepath, jsonl)
-    if jsonl and not isinstance(data, list):
-        # data should be a list
-        raise ValueError("data should be a list when save as jsonl format")
-    ensure_parent_dir_exist(filepath)
-    with open(filepath, 'w', encoding=encoding, newline=newline) as f:
-        if jsonl:
-            for x in data:
-                f.write(json.dumps(x, ensure_ascii=ensure_ascii, **kwargs))
-                f.write(newline)
-        else:
-            json.dump(data, f, indent=indent, ensure_ascii=ensure_ascii, **kwargs)

Files without changes: README.md, build.sh, feilian/io.py, feilian/version.py, feilian.egg-info/dependency_links.txt, setup.cfg.