feilian 1.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
feilian/json.py ADDED
@@ -0,0 +1,262 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from typing import Dict, List, Union, Any
4
+ from pathlib import Path
5
+ import os
6
+ import abc
7
+ import json
8
+ from decimal import Decimal
9
+ from .io import ensure_parent_dir_exist
10
+ from .txt import get_file_encoding
11
+ try:
12
+ import ijson
13
+ except ImportError as e:
14
+ ijson = None
15
+
16
def _read_json(filepath: Union[str, os.PathLike], jsonl: bool, encoding='utf-8', **kwargs):
    """
    The actual read function.

    :param filepath: path of the json/jsonl file
    :param jsonl: if `True`, parse one json document per line (jsonl),
        otherwise parse the whole file as a single document
    :param encoding: text encoding; resolved through `get_file_encoding`
        (so the literal 'auto' triggers detection)
    :param kwargs: forwarded to `json.loads` / `json.load`
    """
    encoding = get_file_encoding(filepath, encoding=encoding)
    with open(filepath, encoding=encoding) as f:
        if jsonl:
            # skip blank/whitespace-only lines (e.g. a trailing newline);
            # they are not valid json documents on their own
            return [json.loads(x, **kwargs) for x in f if x.strip()]
        else:
            return json.load(f, **kwargs)
26
+
27
+ def _is_jsonl(filepath: Union[str, os.PathLike], jsonl: bool = None) -> bool:
28
+ if jsonl is None:
29
+ filepath = Path(filepath)
30
+ jsonl = filepath.suffix.lower() == '.jsonl'
31
+ return jsonl
32
+
33
def read_json(
    filepath: Union[str, os.PathLike],
    jsonl: bool = None,
    encoding: str = 'auto',
    **kwargs
) -> Union[Dict[str, Any], List[Any]]:
    """
    An agent for `json.load()` with some default value.

    The file is first parsed with the detected (or explicit) *jsonl*
    flag; if that fails, one retry with the flag flipped is attempted
    before the original error is re-raised.
    """
    jsonl = _is_jsonl(filepath, jsonl)
    try:
        return _read_json(filepath, jsonl=jsonl, encoding=encoding, **kwargs)
    except Exception as first_error:
        try:
            # the suffix may lie about the actual format: retry with the
            # opposite interpretation before giving up
            return _read_json(filepath, jsonl=not jsonl, encoding=encoding, **kwargs)
        except Exception:
            raise first_error
51
+
52
def save_json(
    filepath: Union[str, os.PathLike],
    data: Union[Dict[str, Any], List[Any]],
    jsonl: bool = None,
    encoding: str = 'utf-8',
    newline: str = '\n',
    indent: int = 2,
    ensure_ascii: bool = False,
    **kwargs
):
    """
    An agent for `json.dump()` with some default value.

    When the target is jsonl (by flag or ``.jsonl`` suffix), *data* must
    be a list and each item is written as one json line; otherwise the
    whole object is dumped as a single pretty-printed document.
    """
    jsonl = _is_jsonl(filepath, jsonl)
    if jsonl and not isinstance(data, list):
        raise ValueError("data should be a list when save as jsonl format")
    ensure_parent_dir_exist(filepath)
    with open(filepath, 'w', encoding=encoding, newline=newline) as f:
        if jsonl:
            # one compact document per line; `indent` is ignored here
            f.writelines(
                json.dumps(item, ensure_ascii=ensure_ascii, **kwargs) + newline
                for item in data
            )
        else:
            json.dump(data, f, indent=indent, ensure_ascii=ensure_ascii, **kwargs)
77
+
78
def write_json(
    filepath: Union[str, os.PathLike],
    data: Union[Dict[str, Any], List[Any]],
    jsonl: bool = None,
    encoding: str = 'utf-8',
    newline: str = '\n',
    indent: int = 2,
    ensure_ascii: bool = False,
    **kwargs
):
    """Alias of `save_json`, kept for naming symmetry with `read_json`."""
    save_json(
        filepath=filepath, data=data, jsonl=jsonl, encoding=encoding,
        newline=newline, indent=indent, ensure_ascii=ensure_ascii, **kwargs,
    )
98
+
99
+
100
+ class _JsonNode:
101
+ def __init__(self, type: str = '', parent: '_JsonNode' = None):
102
+ self._type = ''
103
+ self._value = None
104
+ self._parent = parent
105
+ if type:
106
+ self.type = type
107
+
108
+ def clear(self):
109
+ if self._type == 'map':
110
+ self._value.clear()
111
+ elif self._type == 'array':
112
+ self._value.clear()
113
+
114
+ @property
115
+ def parent(self):
116
+ return self._parent
117
+
118
+ @property
119
+ def type(self):
120
+ return self._type
121
+
122
+ @type.setter
123
+ def type(self, value):
124
+ if self._type:
125
+ raise ValueError('type is already set')
126
+ self._type = value
127
+ if value == 'map':
128
+ self._value = {}
129
+ elif value == 'array':
130
+ self._value = []
131
+
132
+ @property
133
+ def value(self):
134
+ if not self._type:
135
+ raise ValueError('type is not set')
136
+ if self._type == 'dummy':
137
+ assert isinstance(self._value, _JsonNode)
138
+ return self._value.value
139
+ if self._type == 'map':
140
+ assert isinstance(self._value, dict)
141
+ return {k: v.value for k, v in self._value.items()}
142
+ if self._type == 'array':
143
+ assert isinstance(self._value, list)
144
+ return [v.value for v in self._value]
145
+ return self._value
146
+
147
+ @value.setter
148
+ def value(self, value):
149
+ if not self._type:
150
+ raise ValueError('type is not set')
151
+ if self._type in ['dummy', 'map', 'array']:
152
+ raise RuntimeError('cannot set value for dummy, map, array')
153
+ self._value = value
154
+
155
+ def __repr__(self):
156
+ return str(self.value)
157
+
158
+ def __str__(self):
159
+ return str(self.value)
160
+
161
class StreamJsonReader(abc.ABC):
    """
    Iterate over a json file.

    :param filepath: path of the file to iterate over
    :param encoding: text encoding of the file (may be `None`)
    :param limit: maximum number of top-level items to yield; unbounded
        by default.  Annotated as `float` (fix: the default
        `float('inf')` is not an `int`), though callers normally pass an int.
    """
    def __init__(self, filepath: Union[str, os.PathLike], encoding: str = None, limit: float = float('inf')):
        self.filepath = filepath
        self.encoding = encoding
        self.limit = limit
        self._data_type = ''  # becomes 'dict' or 'list' once known

    @property
    def data_type(self):
        """Top-level container kind ('dict' or 'list'); empty string until iteration reveals it."""
        return self._data_type

    def __iter__(self):
        # subclasses provide the actual streaming implementation
        raise NotImplementedError
177
+
178
class BigJsonReader(StreamJsonReader):
    # Streams top-level items out of one large json document via ijson's
    # event parser, materializing only a single item at a time.
    # For a top-level object it yields (key, value) pairs; for a
    # top-level array it yields the elements.  `_data_type` is set to
    # 'dict'/'list' on the first yield accordingly.
    def __iter__(self):
        with open(self.filepath, 'rb') as f:
            parser = ijson.parse(f)
            # `dummy` is the root; `node` tracks the current position
            # while events are folded into a _JsonNode tree
            dummy = node = _JsonNode('')
            cnt = 0  # number of top-level items yielded so far
            for prefix, event, value in parser:
                if event == 'start_map':
                    if node.type == 'array':
                        # object inside an array: append a fresh child
                        child = _JsonNode(type='map', parent=node)
                        node._value.append(child)
                        node = child
                    else:
                        # object in value/root position: type the current node
                        node.type = 'map'
                elif event == 'end_map':
                    node = node.parent
                elif event == 'start_array':
                    # NOTE(review): an array nested directly inside another
                    # array would set `type` twice and raise ValueError —
                    # looks like nested arrays are out of scope; confirm
                    node.type = 'array'
                elif event == 'end_array':
                    node = node.parent
                elif event == 'map_key':
                    assert node.type == 'map', f"{event} {value} {prefix}"
                    # descend into an untyped child that will receive the value
                    child = _JsonNode(parent=node)
                    node._value[value] = child
                    node = child
                else:
                    # scalar event
                    assert event in ['null', 'boolean', 'integer', 'double', 'number', 'string']
                    if isinstance(value, Decimal):
                        # ijson reports non-integer numbers as Decimal
                        value = float(value)
                    if node.type == 'array':
                        # scalar array element: append a typed leaf child
                        child = _JsonNode(type=event, parent=node)
                        child.value = value
                        node._value.append(child)
                    else:
                        # scalar in value position: fill the pending child
                        # and climb back to its owner
                        assert not node.type
                        node.type = event
                        node.value = value
                        node = node.parent
                # Back at the root after a complete top-level item (the
                # opening events of the root container itself excluded):
                # emit the single accumulated item, then drop it to keep
                # memory bounded.
                if node == dummy and event not in ['start_map', 'start_array']:
                    assert node.type in ['map', 'array']
                    if node.type == 'map':
                        value = node.value
                        assert isinstance(value, dict)
                        assert len(value) == 1
                        k, v = list(value.items())[0]
                        self._data_type = 'dict'
                        yield k, v
                        node.clear()
                    elif node.type == 'array':
                        value = node.value
                        assert isinstance(value, list)
                        assert len(value) == 1
                        self._data_type = 'list'
                        yield value[0]
                        node.clear()
                    cnt += 1
                    if cnt >= self.limit:
                        break
236
+
237
class JsonlReader(StreamJsonReader):
    """Stream items from a jsonl file, one parsed document per line."""

    @property
    def data_type(self):
        # a jsonl file is by definition a sequence of documents
        return 'list'

    def __iter__(self):
        yielded = 0
        with open(self.filepath, encoding=self.encoding) as f:
            for line in f:
                if not line.strip():
                    # tolerate blank/whitespace-only lines (e.g. a trailing
                    # newline), consistent with `_read_json`'s jsonl path
                    continue
                yield json.loads(line)
                yielded += 1
                if yielded >= self.limit:
                    break
248
+
249
def read_big_json(
    filepath: Union[str, os.PathLike],
    jsonl: bool = None,
    encoding: str = 'auto',
    limit: float = float('inf'),
) -> StreamJsonReader:
    """
    Create a streaming reader over a (potentially huge) json/jsonl file.

    :param filepath: file to iterate over
    :param jsonl: force jsonl mode; `None` means decide by suffix
    :param encoding: text encoding, 'auto' to detect
    :param limit: maximum number of items to yield (new parameter; the
        readers already supported it but it was not exposed here —
        defaults to unbounded, so existing callers are unaffected)
    :raises ImportError: for a plain json file when `ijson` is missing
    """
    jsonl = _is_jsonl(filepath, jsonl)
    encoding = get_file_encoding(filepath, encoding=encoding)
    if jsonl:
        return JsonlReader(filepath, encoding=encoding, limit=limit)
    if ijson is None:
        raise ImportError('ijson is not installed')
    # BigJsonReader opens the file in binary mode, so no encoding is passed
    return BigJsonReader(filepath, limit=limit)
262
+
feilian/process.py ADDED
@@ -0,0 +1,99 @@
1
+ import abc
2
+ import tqdm
3
+ import pandas as pd
4
+ from ._typing import (
5
+ Any, Dict, Hashable, List,
6
+ Tuple, Union, Iterable, Optional,
7
+ )
8
+ from .dataframe import (
9
+ read_dataframe,
10
+ save_dataframe,
11
+ )
12
+
13
class BaseProcessor(abc.ABC):
    """
    Base class for processing data.

    Subclasses implement `read_single_file`, `process` and `save_result`;
    `run` wires them together as read -> process -> (optionally) save.
    """

    @abc.abstractmethod
    def read_single_file(self, filepath: str) -> Any:
        """
        Actual method to read data from a single file.
        """

    def merge_input_data(self, data: Iterable[Any]) -> Any:
        """
        Merge data read from multi files.
        """
        return data

    def read_data(self, filepath: Union[str, List[str], Tuple[str]]) -> Any:
        """
        Read data from input file.

        A list/tuple of paths is read file by file and merged via
        `merge_input_data`; a single path is read directly.
        """
        if not isinstance(filepath, (list, tuple)):
            return self.read_single_file(filepath)
        pieces = (self.read_single_file(p) for p in filepath)
        return self.merge_input_data(pieces)

    @abc.abstractmethod
    def save_result(self, filepath: str, result: Any):
        """
        Save result to output file.
        """

    @abc.abstractmethod
    def process(self, data: Any) -> Any:
        """
        Process data and return result.
        """

    def run(self, input_path: Union[str, List[str], Tuple[str]], output_path: str = None, write_output=True):
        """
        Read from a file, and save result to another file.
        :param input_path: file with the data
        :param output_path: where to save the result, if not given, use input_path
        :param write_output: whether to write the result to the output_file
        """
        result = self.process(self.read_data(input_path))
        if write_output:
            self.save_result(output_path or input_path, result)
62
+
63
class DataframeProcessor(BaseProcessor, abc.ABC):
    """
    Row-wise processor over pandas dataframes.

    Input files are loaded with `read_dataframe`, every row is handed to
    `process_row`, and the surviving results are written back with
    `save_dataframe`.
    """

    def __init__(self, input_dtype=None, progress=False, read_args: Dict[str, Any] = None,
                 write_args: Dict[str, Any] = None):
        # `progress` may be False, True, or a string used as the bar label
        self.progress = progress
        self.read_args = read_args or {}
        if input_dtype is not None:
            # forwarded to the dataframe reader as its `dtype` argument
            self.read_args['dtype'] = input_dtype
        self.write_args = write_args or {}

    def read_single_file(self, filepath: str) -> pd.DataFrame:
        return read_dataframe(filepath, **self.read_args)

    def merge_input_data(self, data: Iterable[pd.DataFrame]) -> pd.DataFrame:
        # stack the per-file frames on top of each other
        return pd.concat(data)

    def read_data(self, filepath: Union[str, List[str], Tuple[str]]) -> pd.DataFrame:
        return super().read_data(filepath)

    def save_result(self, filepath: str, result: pd.DataFrame):
        save_dataframe(filepath, result, **self.write_args)

    @abc.abstractmethod
    def process_row(self, i: Hashable, row: pd.Series) -> Optional[Dict[str, Any]]:
        """
        Process a single row of data.
        :return: if `None`, ignore this row
        """

    def process(self, data: pd.DataFrame) -> pd.DataFrame:
        rows = data.iterrows()
        if self.progress:
            label = "process" if self.progress is True else self.progress
            rows = tqdm.tqdm(rows, total=len(data), desc=label)
        produced = (self.process_row(idx, row) for idx, row in rows)
        kept = (item for item in produced if item is not None)
        return pd.DataFrame(kept)
99
+
feilian/string.py ADDED
@@ -0,0 +1,13 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from ._typing import Any, Callable
4
+
5
def join_values(*values: Any, sep='', func: Callable[[Any], str] = str, do_trim=False, ignore_empty=False):
    """Convert *values* to strings and join them with *sep*.

    :param func: converter applied to each value (default `str`)
    :param do_trim: strip surrounding whitespace from each piece
    :param ignore_empty: drop pieces that end up as empty strings
    """
    pieces = map(func, values)
    if do_trim:
        pieces = (piece.strip() for piece in pieces)
    if ignore_empty:
        pieces = (piece for piece in pieces if piece)
    return sep.join(pieces)
feilian/txt.py ADDED
@@ -0,0 +1,54 @@
1
+ from ._typing import Union, Literal
2
+ import os
3
+ import io
4
+ import inspect
5
+ import chardet
6
+
7
# default number of bytes fed to the detector per read
_DEFAULT_CHUNK_SIZE = 1024

# Feature-detect the installed chardet: newer releases accept a
# `should_rename_legacy` keyword on UniversalDetector, older ones do not,
# so pick the right factory once at import time.
if 'should_rename_legacy' in inspect.signature(chardet.UniversalDetector).parameters:
    def _create_detector(should_rename_legacy: bool):
        # installed chardet supports the flag: forward it
        return chardet.UniversalDetector(should_rename_legacy=should_rename_legacy)
else:
    def _create_detector(should_rename_legacy: bool):
        # older chardet: flag accepted for API compatibility but ignored
        return chardet.UniversalDetector()
15
+
16
def detect_stream_encoding(stream: io.IOBase, chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
    """Detect the text encoding of a binary *stream* with chardet.

    Feeds *chunk_size* bytes at a time and stops as soon as the detector
    is confident or the stream is exhausted.

    :return: the detected encoding name, or `None` when undecidable
    """
    detector = _create_detector(should_rename_legacy=should_rename_legacy)
    while True:
        chunk = stream.read(chunk_size)
        if not chunk:
            break  # end of stream
        detector.feed(chunk)
        if detector.done:
            break  # detector is already confident
    detector.close()
    return detector.result.get('encoding')
27
+
28
def detect_text_encoding(raw: bytes, chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
    """Detect the encoding of an in-memory byte string (see `detect_stream_encoding`)."""
    buffer = io.BytesIO(raw)
    return detect_stream_encoding(buffer, chunk_size=chunk_size, should_rename_legacy=should_rename_legacy)
30
+
31
def detect_file_encoding(path: Union[str, os.PathLike], chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
    """Detect the encoding of the file at *path* (see `detect_stream_encoding`)."""
    with open(path, 'rb') as stream:
        return detect_stream_encoding(stream, chunk_size=chunk_size, should_rename_legacy=should_rename_legacy)
34
+
35
def get_file_encoding(path: Union[str, os.PathLike], encoding: Union[None, Literal['auto'], str] = None) -> str:
    """Resolve *encoding*: the literal 'auto' triggers detection on *path*; anything else passes through."""
    return detect_file_encoding(path) if encoding == 'auto' else encoding
39
+
40
def read_txt(path: Union[str, os.PathLike], encoding: Union[None, Literal['auto'], str] = None) -> str:
    """
    Read a whole text file.

    :param path: file to read
    :param encoding: explicit encoding, `None` for the platform default,
        or 'auto' to detect the encoding with chardet first
    """
    if encoding == 'auto':
        # Resolve the real encoding, then fall through to the normal text
        # path.  (Fix: the previous 'auto' branch decoded the raw bytes
        # directly, skipping the universal-newline translation the
        # explicit-encoding branch performs, so CRLF files came back
        # differently depending on the `encoding` argument.)
        encoding = detect_file_encoding(path)
    with open(path, 'r', encoding=encoding) as f:
        return f.read()
48
+
49
def save_txt(path: Union[str, os.PathLike], content: str, encoding: str = 'utf-8'):
    """Write *content* to the text file at *path*, replacing any existing file."""
    with open(path, 'w', encoding=encoding) as fout:
        fout.write(content)


def write_txt(path: Union[str, os.PathLike], content: str, encoding: str = 'utf-8'):
    """Alias of `save_txt`, kept for naming symmetry with `read_txt`."""
    save_txt(path=path, content=content, encoding=encoding)
feilian/utils.py ADDED
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ from ._typing import Dict, Any, Union, Collection, List
5
+
6
def flatten_dict(data: Dict[str, Any], prefix="", joiner=".",
                 exclude: Union[None, str, Collection[str]] = None,
                 frozen: Union[None, str, Collection[str]] = None,
                 empty_as_default=False, empty_value=None,
                 res: Dict[str, Any] = None) -> Dict[str, Any]:
    """
    flatten dict as a flat one-layer dict
    :param data: origin dict
    :param prefix: prefix for key in the dict
    :param joiner: join symbol for different layer key
    :param exclude: prefix to be excluded from result
    :param frozen: keys not to be flattened
    :param empty_as_default: should set a default value if value is an empty dict
    :param empty_value: if `empty_as_default` is `True`, used as the default value for empty dict
    :param res: the result flat layer dict, create a new one if not given.
    """
    if res is None:
        res = {}
    if isinstance(exclude, str):
        exclude = {exclude}
    if isinstance(frozen, str):
        frozen = {frozen}

    # all keys would start with an excluded prefix: ignore data entirely
    if exclude and prefix in exclude:
        return res

    # all keys in data should be frozen
    if frozen and prefix in frozen:
        for k, v in data.items():
            res[prefix + k] = v
        return res

    for k, v in data.items():
        k = prefix + k

        if exclude and k in exclude:
            # only the key should be excluded
            continue

        if frozen and k in frozen:
            # frozen key, keep it as original value
            res[k] = v
            continue

        if isinstance(v, dict):
            if len(v) == 0:
                # empty dict, set as default value if set
                if empty_as_default:
                    res[k] = empty_value
            else:
                # value is a dict, flatten recursively.
                # fix: propagate empty_as_default/empty_value so empty dicts
                # nested deeper than one level also receive the default
                flatten_dict(v, prefix=k + joiner, joiner=joiner,
                             exclude=exclude, frozen=frozen,
                             empty_as_default=empty_as_default,
                             empty_value=empty_value, res=res)
        else:
            # normal value, keep it as original value
            res[k] = v

    return res
64
+
65
+
66
def flatten_list(data: List[Any], res: List[Any] = None) -> List[Any]:
    """
    Flatten a nested list into a flat one-layer list.
    :param data: a nested list
    :param res: the result flat layer list, create a new one if not given.
    """
    out = [] if res is None else res
    for item in data:
        if isinstance(item, list):
            # recurse into sub-lists, accumulating into the same output
            flatten_list(item, out)
        else:
            out.append(item)
    return out
feilian/version.py ADDED
@@ -0,0 +1,12 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Resolve the package version: prefer the build-generated `_dist_ver`
# module, then fall back to installed distribution metadata.
try:
    from ._dist_ver import VERSION, __version__
except ImportError:
    try:
        # stdlib since Python 3.8 — avoids requiring the third-party
        # backport on modern interpreters
        from importlib.metadata import version, PackageNotFoundError
    except ImportError:
        # older interpreters: use the importlib_metadata backport
        from importlib_metadata import version, PackageNotFoundError
    try:
        __version__ = version('feilian')
    except PackageNotFoundError:
        # package is not installed
        __version__ = "UNKNOWN"
    VERSION = __version__.split('.')