feilian 1.2.2.tar.gz → 1.3.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,15 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: feilian
- Version: 1.2.2
+ Version: 1.3.4
  Summary: General data processing tool.
  Author-email: darkpeath <darkpeath@gmail.com>
  Project-URL: Homepage, https://github.com/darkpeath/feilian
  Description-Content-Type: text/markdown
+ Requires-Dist: chardet
  Requires-Dist: pandas
  Provides-Extra: extra
  Requires-Dist: tqdm; extra == "extra"
+ Requires-Dist: ijson; extra == "extra"

  # feilian

@@ -5,8 +5,13 @@ from .dataframe import read_dataframe, save_dataframe, extract_dataframe_sample,
  from .dataframe import is_empty_text, is_nonempty_text, is_blank_text, is_non_blank_text
  from .datetime import format_time, format_date
  from .arg import ArgValueParser
- from .json import read_json, save_json
+ from .json import read_json, save_json, write_json, read_big_json
+ from .txt import (
+ detect_stream_encoding, detect_file_encoding, get_file_encoding,
+ read_txt, save_txt, write_txt,
+ )
  from .process import DataframeProcessor
+ from .excel import save_excel, write_excel
  from .utils import flatten_dict, flatten_list
  from .version import __version__

@@ -16,7 +21,10 @@ __all__ = [
  'is_empty_text', 'is_nonempty_text', 'is_blank_text', 'is_non_blank_text',
  'format_time', 'format_date',
  'ArgValueParser',
- 'read_json', 'save_json',
+ 'read_json', 'save_json', 'write_json', 'read_big_json',
+ 'detect_stream_encoding', 'detect_file_encoding', 'get_file_encoding',
+ 'read_txt', 'save_txt', 'write_txt',
+ 'save_excel', 'write_excel',
  'DataframeProcessor',
  'flatten_dict', 'flatten_list',
  '__version__',
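
For orientation, a minimal sketch of how the newly exported names could be used from the package top level (file names below are made up):

    # Sketch only: assumes feilian 1.3.4 is installed; paths are hypothetical.
    from feilian import read_json, save_json, detect_file_encoding, read_txt

    enc = detect_file_encoding("notes.txt")          # chardet-based guess, e.g. 'utf-8'
    text = read_txt("notes.txt", encoding="auto")    # decode with the detected encoding
    data = read_json("config.json")                  # json or jsonl, inferred from the suffix
    save_json("out.jsonl", [{"a": 1}, {"a": 2}], jsonl=True)
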
@@ -1,5 +1,5 @@
  # -*- coding: utf-8 -*-
  # file generated by setuptools_scm
  # don't change, don't track in version control
- VERSION = (1, 2, 2)
- __version__ = '1.2.2'
+ VERSION = (1, 3, 4)
+ __version__ = '1.3.4'
@@ -0,0 +1,5 @@
+ from typing import *
+ try:
+ from typing import Literal
+ except ImportError:
+ from typing_extensions import Literal
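
The new feilian/_typing.py is a small compatibility shim: it re-exports everything from typing and falls back to typing_extensions for Literal, which only entered the standard library in Python 3.8. A hedged sketch of how the rest of the package consumes it:

    # Sketch: modules below import typing names from the shim instead of typing directly,
    # so older interpreters only need typing_extensions installed.
    from feilian._typing import Literal

    FORMAT = Literal['csv', 'json']   # typing.Literal on 3.8+, typing_extensions.Literal otherwise
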
@@ -1,14 +1,11 @@
  # -*- coding: utf-8 -*-

- from typing import (
+ from ._typing import (
  Union, List, Any, Iterable,
  Callable, Set, Optional, Tuple,
  Dict, Hashable, Sequence,
+ Literal,
  )
- try:
- from typing import Literal
- except ImportError:
- from typing_extensions import Literal

  _build_in_na_checkers = {
  'always_na': lambda x: True,
@@ -4,17 +4,15 @@
  Encapsulate methods for pandas `DataFrame`.
  """

- from typing import Union, Iterable, Dict, List, Any, Sequence, Callable, Tuple, Hashable
- try:
- from typing import Literal
- except ImportError:
- from typing_extensions import Literal
-
+ import io
  import os
+ import pathlib
  import pandas as pd
  import random
  import collections
+ from ._typing import Union, Iterable, Dict, List, Any, Sequence, Callable, Tuple, Hashable, Literal
  from .io import ensure_parent_dir_exist
+ from .txt import detect_stream_encoding, detect_file_encoding

  # Compatible with different pandas versions
  PD_PARAM_NEWLINE = 'lineterminator'
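
The surrounding context hints at the existing pandas compatibility shim: newer pandas spells the to_csv newline keyword lineterminator, while releases before 1.5 used line_terminator. A rough illustration of that kind of shim (the version condition mirrors the one visible in the following hunk header; the rest is assumed):

    # Illustration only: pandas renamed `line_terminator` to `lineterminator` in 1.5.
    import pandas as pd

    pd_version = tuple(int(x) for x in pd.__version__.split('.')[:2])
    PD_PARAM_NEWLINE = 'lineterminator'
    if pd_version[0] < 1 or (pd_version[0] == 1 and pd_version[1] < 5):
        PD_PARAM_NEWLINE = 'line_terminator'

    pd.DataFrame({'a': [1, 2]}).to_csv('out.csv', **{PD_PARAM_NEWLINE: '\n'})
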
@@ -25,9 +23,31 @@ if pd_version[0] < 1 or (pd_version[0] == 1 and pd_version[1] < 5):
  FILE_FORMAT = Literal['csv', 'tsv', 'json', 'xlsx', 'parquet']
  COMPRESSION_FORMAT = Literal[None, 'infer', 'snappy', 'gzip', 'brotli', 'bz2', 'zip', 'xz']

- def read_dataframe(file: str, *args, sheet_name=0,
- file_format: FILE_FORMAT = None,
+ def _drop_na_values(data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], axis: Literal['columns', 'rows']):
+ if isinstance(data, pd.DataFrame):
+ data.dropna(axis=axis, how='all', inplace=True)
+ else:
+ assert isinstance(data, dict)
+ for df in data.values():
+ df.dropna(axis=axis, how='all', inplace=True)
+
+ def _infer_file_format(file) -> str:
+ if isinstance(file, pd.ExcelWriter):
+ return 'xlsx'
+ elif isinstance(file, str):
+ return os.path.splitext(file)[1].lower()[1:]
+ elif isinstance(file, pathlib.PurePath):
+ suf = file.suffix
+ return suf[1:] if suf.startswith('.') else suf
+ elif isinstance(file, os.PathLike):
+ return os.path.splitext(os.fspath(file))[1].lower().lstrip('.')
+ else:
+ raise ValueError(f"Cannot infer format for type: {type(file)}")
+
+ def read_dataframe(file: Union[str, os.PathLike, io.IOBase], *args, sheet_name=0,
+ file_format: FILE_FORMAT = None, encoding='auto',
  jsonl=False, dtype: type = None,
+ drop_na_columns=False, drop_na_rows=False,
  **kwargs) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
  """
  read file as pandas `DataFrame`
@@ -35,15 +55,16 @@ def read_dataframe(file: str, *args, sheet_name=0,
  :param args: extra args for `pd.read_xx()`
  :param sheet_name: `sheet_name` for `pd.read_excel()`
  :param file_format: csv, tsv, json ,xlsx, parquet
+ :param encoding: text file encoding
  :param jsonl: jsonl format or not, only used in json format
  :param dtype: `dtype` for `pd.read_xx()`
+ :param drop_na_columns: drop column if all values of the column is na
+ :param drop_na_rows: drop row if all values of the row is na
  :param kwargs: extra kwargs for `pd.read_xx()`
  """
  # decide the file format
  if not file_format:
- if not isinstance(file, str):
- raise ValueError("Format should given!")
- file_format = os.path.splitext(file)[1].lower()[1:]
+ file_format = _infer_file_format(file)

  for key in ['lines', 'line_delimited_json_format']:
  if key in kwargs and kwargs.pop(key):
@@ -60,18 +81,44 @@ def read_dataframe(file: str, *args, sheet_name=0,
  file_format = 'json'
  jsonl = True

+ # detect encoding
+ if encoding == 'auto' and file_format in ['csv', 'json']:
+ if isinstance(file, (str, os.PathLike)):
+ encoding = detect_file_encoding(file)
+ elif isinstance(file, io.IOBase) and file.seekable():
+ tell = file.tell()
+ encoding = detect_stream_encoding(file)
+ file.seek(tell)
+ else:
+ # read file may cause content change, so we cannot detect the encoding
+ encoding = None
+
  if file_format == 'csv':
- return pd.read_csv(file, *args, dtype=dtype, **kwargs)
+ df = pd.read_csv(file, *args, encoding=encoding, dtype=dtype, **kwargs)
  elif file_format == 'xlsx':
- return pd.read_excel(file, *args, sheet_name=sheet_name, dtype=dtype, **kwargs)
+ df = pd.read_excel(file, *args, sheet_name=sheet_name, dtype=dtype, **kwargs)
  elif file_format == 'json':
- return pd.read_json(file, *args, lines=jsonl, dtype=dtype, **kwargs)
+ try:
+ df = pd.read_json(file, *args, encoding=encoding, lines=jsonl, dtype=dtype, **kwargs)
+ except Exception as e:
+ # if failed, try again with different arg `lines`
+ try:
+ df = pd.read_json(file, *args, lines=not jsonl, dtype=dtype, **kwargs)
+ except Exception:
+ raise e
  elif file_format == 'parquet':
- return pd.read_parquet(file, *args, **kwargs)
+ df = pd.read_parquet(file, *args, **kwargs)
  else:
  raise IOError(f"Unknown file format: {file}")

- def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[str]'],
+ if drop_na_columns:
+ _drop_na_values(df, axis='columns')
+ if drop_na_rows:
+ _drop_na_values(df, axis='rows')
+
+ return df
+
+ def save_dataframe(file: Union[str, os.PathLike, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[str]'],
  df: Union[pd.DataFrame, Iterable[Union[pd.Series, Dict[str, Any]]]],
  *args, sheet_name='Sheet1',
  file_format: FILE_FORMAT = None,
@@ -79,7 +126,7 @@ def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[st
  index=False, index_label=None,
  encoding='utf-8', newline='\n',
  force_ascii=False,
- orient='records', jsonl=True,
+ orient='records', jsonl=True, indent=None,
  column_mapper: Union[Dict[str, str], Sequence[str]] = None,
  include_columns: Sequence[str] = None,
  exclude_columns: Sequence[str] = None,
@@ -101,6 +148,7 @@ def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[st
  :param force_ascii: `force_ascii` for json format
  :param orient: `orient` for json format
  :param jsonl: jsonl format or not
+ :param indent: indent for json format
  :param column_mapper: rename columns; if set, columns not list here will be ignored
  :param include_columns: if set, columns not list here will be ignored
  :param exclude_columns: if set, columns list here will be ignored
@@ -108,12 +156,7 @@ def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[st
  """
  # decide file format
  if not file_format:
- if isinstance(file, str):
- file_format = os.path.splitext(file)[1].lower()[1:]
- elif isinstance(file, pd.ExcelWriter):
- file_format = 'xlsx'
- else:
- raise ValueError("Format should given!")
+ file_format = _infer_file_format(file)

  # convert data to be a dataframe
  if not isinstance(df, pd.DataFrame):
@@ -158,11 +201,12 @@ def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[st
  elif file_format == 'json':
  if jsonl:
  orient = 'records'
+ indent = None
  if orient not in ['split', 'table']:
  index = True
  df.to_json(file, *args, compression=compression, index=index,
  force_ascii=force_ascii, orient=orient, lines=jsonl,
- **kwargs)
+ indent=indent, **kwargs)
  elif file_format == 'parquet':
  df.to_parquet(file, *args, compression=compression, index=index, **kwargs)
  else:
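
Taken together, a hedged usage sketch of the reworked read/save pair (paths are hypothetical; only arguments visible in the hunks above are used):

    from feilian import read_dataframe, save_dataframe

    # encoding='auto' triggers chardet-based detection for csv/json inputs;
    # drop_na_columns / drop_na_rows prune all-NA columns and rows after reading.
    df = read_dataframe("input.csv", encoding="auto", drop_na_columns=True)

    # jsonl=True forces orient='records' and clears indent; with jsonl=False the new
    # `indent` argument is forwarded to DataFrame.to_json().
    save_dataframe("output.json", df, jsonl=False, indent=2)
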
@@ -1,6 +1,6 @@
  # -*- coding: utf-8 -*-

- from typing import Union
+ from ._typing import Union
  import pandas as pd
  import datetime

@@ -0,0 +1,83 @@
+ import pandas as pd
+ from ._typing import Union, Iterable, Dict, Sequence, Any, List, Tuple
+ from .dataframe import save_dataframe
+
+ def _save_excel(file, df, *args, **kwargs):
+ # if df is a list of dataframe, then save each dataframe into a sheet
+ if isinstance(df, (list, tuple)) and df and all(isinstance(x, pd.DataFrame) for x in df):
+ if 'sheet_name' in kwargs:
+ kwargs.pop('sheet_name')
+ with pd.ExcelWriter(file) as writer:
+ for i, x in enumerate(df, 1):
+ save_dataframe(writer, x, *args, sheet_name=f"Sheet{i}", **kwargs)
+ elif isinstance(df, dict) and df and all(isinstance(x, pd.DataFrame) for x in df.values()):
+ if 'sheet_name' in kwargs:
+ kwargs.pop('sheet_name')
+ with pd.ExcelWriter(file) as writer:
+ for name, x in df.items():
+ save_dataframe(writer, x, *args, sheet_name=name, **kwargs)
+ else:
+ return save_dataframe(file, df, *args, **kwargs)
+
+ _FILE_TYPES = Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[str]']
+ _DATA_TYPES = Union[
+ pd.DataFrame, Iterable[Union[pd.Series, Dict[str, Any]]],
+ List[pd.DataFrame], Tuple[pd.DataFrame], Dict[str, pd.DataFrame]
+ ]
+
+ def save_excel(file: _FILE_TYPES, df: _DATA_TYPES,
+ *args, sheet_name='Sheet1',
+ header: Union[Sequence[str], bool] = True,
+ index=False, index_label=None,
+ column_mapper: Union[Dict[str, str], Sequence[str]] = None,
+ include_columns: Sequence[str] = None,
+ exclude_columns: Sequence[str] = None,
+ **kwargs):
+ """
+ save data into file
+ :param file: where to save the data to
+ :param df: the data
+ :param args: extra args for df.to_xx()
+ :param sheet_name: `sheet_name` for excel format
+ :param header: `header` for excel format
+ :param index: save index or not, see docs in df.to_csv();
+ if set as str and `index_label` not set, `index_label` will be set as this
+ :param index_label: header for the index when `index` is `True`
+ :param column_mapper: rename columns; if set, columns not list here will be ignored
+ :param include_columns: if set, columns not list here will be ignored
+ :param exclude_columns: if set, columns list here will be ignored
+ :param kwargs: extra kwargs for df.to_xx()
+ """
+ _save_excel(
+ file, df, *args,
+ sheet_name=sheet_name,
+ header=header,
+ index=index,
+ index_label=index_label,
+ column_mapper=column_mapper,
+ include_columns=include_columns,
+ exclude_columns=exclude_columns,
+ **kwargs
+ )
+
+ def write_excel(
+ file: _FILE_TYPES, df: _DATA_TYPES,
+ *args, sheet_name='Sheet1',
+ header: Union[Sequence[str], bool] = True,
+ index=False, index_label=None,
+ column_mapper: Union[Dict[str, str], Sequence[str]] = None,
+ include_columns: Sequence[str] = None,
+ exclude_columns: Sequence[str] = None,
+ **kwargs
+ ):
+ save_excel(
+ file, df, *args,
+ sheet_name=sheet_name,
+ header=header,
+ index=index,
+ index_label=index_label,
+ column_mapper=column_mapper,
+ include_columns=include_columns,
+ exclude_columns=exclude_columns,
+ **kwargs
+ )
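
A hedged sketch of the new save_excel helper: a dict of DataFrames writes one sheet per key, a list or tuple becomes Sheet1, Sheet2, ..., and anything else falls through to save_dataframe:

    # Sketch: output paths are hypothetical; writing .xlsx needs an engine such as openpyxl.
    import pandas as pd
    from feilian import save_excel

    sheets = {
        "users": pd.DataFrame({"name": ["a", "b"]}),
        "orders": pd.DataFrame({"total": [10.0, 2.5]}),
    }
    save_excel("report.xlsx", sheets)             # one sheet per dict key
    save_excel("single.xlsx", sheets["users"])    # plain DataFrame -> save_dataframe path
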
@@ -0,0 +1,262 @@
+ # -*- coding: utf-8 -*-
+
+ from typing import Dict, List, Union, Any
+ from pathlib import Path
+ import os
+ import abc
+ import json
+ from decimal import Decimal
+ from .io import ensure_parent_dir_exist
+ from .txt import get_file_encoding
+ try:
+ import ijson
+ except ImportError as e:
+ ijson = None
+
+ def _read_json(filepath: Union[str, os.PathLike], jsonl: bool, encoding='utf-8', **kwargs):
+ """
+ The actual read function.
+ """
+ encoding = get_file_encoding(filepath, encoding=encoding)
+ with open(filepath, encoding=encoding) as f:
+ if jsonl:
+ return [json.loads(x, **kwargs) for x in f]
+ else:
+ return json.load(f, **kwargs)
+
+ def _is_jsonl(filepath: Union[str, os.PathLike], jsonl: bool = None) -> bool:
+ if jsonl is None:
+ filepath = Path(filepath)
+ jsonl = filepath.suffix.lower() == '.jsonl'
+ return jsonl
+
+ def read_json(
+ filepath: Union[str, os.PathLike],
+ jsonl: bool = None,
+ encoding: str = 'auto',
+ **kwargs
+ ) -> Union[Dict[str, Any], List[Any]]:
+ """
+ An agent for `json.load()` with some default value.
+ """
+ jsonl = _is_jsonl(filepath, jsonl)
+ try:
+ return _read_json(filepath, jsonl=jsonl, encoding=encoding, **kwargs)
+ except Exception as e:
+ # if failed, try again with different arg `jsonl`
+ try:
+ return _read_json(filepath, jsonl=not jsonl, encoding=encoding, **kwargs)
+ except Exception:
+ raise e
+
+ def save_json(
+ filepath: Union[str, os.PathLike],
+ data: Union[Dict[str, Any], List[Any]],
+ jsonl: bool = None,
+ encoding: str = 'utf-8',
+ newline: str = '\n',
+ indent: int = 2,
+ ensure_ascii: bool = False,
+ **kwargs
+ ):
+ """
+ An agent for `json.dump()` with some default value.
+ """
+ jsonl = _is_jsonl(filepath, jsonl)
+ if jsonl and not isinstance(data, list):
+ # data should be a list
+ raise ValueError("data should be a list when save as jsonl format")
+ ensure_parent_dir_exist(filepath)
+ with open(filepath, 'w', encoding=encoding, newline=newline) as f:
+ if jsonl:
+ for x in data:
+ f.write(json.dumps(x, ensure_ascii=ensure_ascii, **kwargs))
+ f.write(newline)
+ else:
+ json.dump(data, f, indent=indent, ensure_ascii=ensure_ascii, **kwargs)
+
+ def write_json(
+ filepath: Union[str, os.PathLike],
+ data: Union[Dict[str, Any], List[Any]],
+ jsonl: bool = None,
+ encoding: str = 'utf-8',
+ newline: str = '\n',
+ indent: int = 2,
+ ensure_ascii: bool = False,
+ **kwargs
+ ):
+ save_json(
+ filepath=filepath,
+ data=data,
+ jsonl=jsonl,
+ encoding=encoding,
+ newline=newline,
+ indent=indent,
+ ensure_ascii=ensure_ascii,
+ **kwargs
+ )
+
+
+ class _JsonNode:
+ def __init__(self, type: str = '', parent: '_JsonNode' = None):
+ self._type = ''
+ self._value = None
+ self._parent = parent
+ if type:
+ self.type = type
+
+ def clear(self):
+ if self._type == 'map':
+ self._value.clear()
+ elif self._type == 'array':
+ self._value.clear()
+
+ @property
+ def parent(self):
+ return self._parent
+
+ @property
+ def type(self):
+ return self._type
+
+ @type.setter
+ def type(self, value):
+ if self._type:
+ raise ValueError('type is already set')
+ self._type = value
+ if value == 'map':
+ self._value = {}
+ elif value == 'array':
+ self._value = []
+
+ @property
+ def value(self):
+ if not self._type:
+ raise ValueError('type is not set')
+ if self._type == 'dummy':
+ assert isinstance(self._value, _JsonNode)
+ return self._value.value
+ if self._type == 'map':
+ assert isinstance(self._value, dict)
+ return {k: v.value for k, v in self._value.items()}
+ if self._type == 'array':
+ assert isinstance(self._value, list)
+ return [v.value for v in self._value]
+ return self._value
+
+ @value.setter
+ def value(self, value):
+ if not self._type:
+ raise ValueError('type is not set')
+ if self._type in ['dummy', 'map', 'array']:
+ raise RuntimeError('cannot set value for dummy, map, array')
+ self._value = value
+
+ def __repr__(self):
+ return str(self.value)
+
+ def __str__(self):
+ return str(self.value)
+
+ class StreamJsonReader(abc.ABC):
+ """
+ Iterate over a json file.
+ """
+ def __init__(self, filepath: Union[str, os.PathLike], encoding: str = None, limit: int = float('inf')):
+ self.filepath = filepath
+ self.encoding = encoding
+ self.limit = limit
+ self._data_type = '' # dict or list
+
+ @property
+ def data_type(self):
+ return self._data_type
+
+ def __iter__(self):
+ raise NotImplementedError
+
+ class BigJsonReader(StreamJsonReader):
+ def __iter__(self):
+ with open(self.filepath, 'rb') as f:
+ parser = ijson.parse(f)
+ dummy = node = _JsonNode('')
+ cnt = 0
+ for prefix, event, value in parser:
+ if event == 'start_map':
+ if node.type == 'array':
+ child = _JsonNode(type='map', parent=node)
+ node._value.append(child)
+ node = child
+ else:
+ node.type = 'map'
+ elif event == 'end_map':
+ node = node.parent
+ elif event == 'start_array':
+ node.type = 'array'
+ elif event == 'end_array':
+ node = node.parent
+ elif event == 'map_key':
+ assert node.type == 'map', f"{event} {value} {prefix}"
+ child = _JsonNode(parent=node)
+ node._value[value] = child
+ node = child
+ else:
+ assert event in ['null', 'boolean', 'integer', 'double', 'number', 'string']
+ if isinstance(value, Decimal):
+ value = float(value)
+ if node.type == 'array':
+ child = _JsonNode(type=event, parent=node)
+ child.value = value
+ node._value.append(child)
+ else:
+ assert not node.type
+ node.type = event
+ node.value = value
+ node = node.parent
+ if node == dummy and event not in ['start_map', 'start_array']:
+ assert node.type in ['map', 'array']
+ if node.type == 'map':
+ value = node.value
+ assert isinstance(value, dict)
+ assert len(value) == 1
+ k, v = list(value.items())[0]
+ self._data_type = 'dict'
+ yield k, v
+ node.clear()
+ elif node.type == 'array':
+ value = node.value
+ assert isinstance(value, list)
+ assert len(value) == 1
+ self._data_type = 'list'
+ yield value[0]
+ node.clear()
+ cnt += 1
+ if cnt >= self.limit:
+ break
+
+ class JsonlReader(StreamJsonReader):
+ @property
+ def data_type(self):
+ return 'list'
+
+ def __iter__(self):
+ with open(self.filepath, encoding=self.encoding) as f:
+ for i, line in enumerate(f, 1):
+ yield json.loads(line)
+ if i >= self.limit:
+ break
+
+ def read_big_json(
+ filepath: Union[str, os.PathLike],
+ jsonl: bool = None,
+ encoding: str = 'auto',
+ ) -> StreamJsonReader:
+ jsonl = _is_jsonl(filepath, jsonl)
+ encoding = get_file_encoding(filepath, encoding=encoding)
+ if jsonl:
+ return JsonlReader(filepath, encoding=encoding)
+ else:
+ if ijson is None:
+ raise ImportError('ijson is not installed')
+ return BigJsonReader(filepath)
+
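
read_big_json returns a streaming reader instead of loading the whole document: JsonlReader yields one parsed object per line, while BigJsonReader walks an ijson event stream and yields top-level items (elements of an array, or key/value pairs of an object). A hedged usage sketch:

    # Sketch: file names are hypothetical; the non-jsonl path needs the optional ijson dependency.
    from feilian import read_big_json

    for record in read_big_json("huge.jsonl"):    # one object per line
        ...

    reader = read_big_json("huge.json")           # top-level array or object
    for item in reader:                           # array -> elements; object -> (key, value) pairs
        ...
    print(reader.data_type)                       # 'list' or 'dict', set while iterating
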
@@ -1,8 +1,14 @@
  import abc
  import tqdm
  import pandas as pd
- from typing import Any, Dict, Hashable, List, Tuple, Union, Iterable
- from .dataframe import read_dataframe, save_dataframe
+ from ._typing import (
+ Any, Dict, Hashable, List,
+ Tuple, Union, Iterable, Optional,
+ )
+ from .dataframe import (
+ read_dataframe,
+ save_dataframe,
+ )

  class BaseProcessor(abc.ABC):
  """
@@ -55,11 +61,13 @@ class BaseProcessor(abc.ABC):
  self.save_result(output_path or input_path, result)

  class DataframeProcessor(BaseProcessor, abc.ABC):
- def __init__(self, input_dtype=None, progress=False, read_args: Dict[str, Any] = None):
+ def __init__(self, input_dtype=None, progress=False, read_args: Dict[str, Any] = None,
+ write_args: Dict[str, Any] = None):
  self.progress = progress
  self.read_args = read_args or {}
  if input_dtype is not None:
  self.read_args['dtype'] = input_dtype
+ self.write_args = write_args or {}

  def read_single_file(self, filepath: str) -> pd.DataFrame:
  return read_dataframe(filepath, **self.read_args)
@@ -71,12 +79,13 @@ class DataframeProcessor(BaseProcessor, abc.ABC):
  return super().read_data(filepath)

  def save_result(self, filepath: str, result: pd.DataFrame):
- save_dataframe(filepath, result)
+ save_dataframe(filepath, result, **self.write_args)

  @abc.abstractmethod
- def process_row(self, i: Hashable, row: pd.Series) -> Dict[str, Any]:
+ def process_row(self, i: Hashable, row: pd.Series) -> Optional[Dict[str, Any]]:
  """
  Process a single row of data.
+ :return: if `None`, ignore this row
  """

  def process(self, data: pd.DataFrame) -> pd.DataFrame:
@@ -84,6 +93,7 @@ class DataframeProcessor(BaseProcessor, abc.ABC):
  if self.progress:
  desc = "process" if self.progress is True else self.progress
  bar = tqdm.tqdm(bar, total=len(data), desc=desc)
- res = [self.process_row(i, row) for i, row in bar]
+ res = (self.process_row(i, row) for i, row in bar)
+ res = (x for x in res if x is not None)
  return pd.DataFrame(res)

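A hedged sketch of the updated DataframeProcessor hooks: write_args is forwarded to save_dataframe, and process_row may now return None to drop a row from the result:

    # Sketch: the subclass, column names and data are made up.
    import pandas as pd
    from feilian import DataframeProcessor

    class KeepPositive(DataframeProcessor):
        def process_row(self, i, row):
            if row["value"] <= 0:
                return None                      # row is silently skipped
            return {"id": i, "value": row["value"]}

    # Assumes process_row is the only abstract hook left to implement.
    proc = KeepPositive(progress=True, write_args={"jsonl": True})
    out = proc.process(pd.DataFrame({"value": [3, -1, 5]}))   # None results are filtered out
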
@@ -1,6 +1,6 @@
  # -*- coding: utf-8 -*-

- from typing import Any, Callable
+ from ._typing import Any, Callable

  def join_values(*values: Any, sep='', func: Callable[[Any], str] = str, do_trim=False, ignore_empty=False):
  def f():
@@ -0,0 +1,54 @@
+ from ._typing import Union, Literal
+ import os
+ import io
+ import inspect
+ import chardet
+
+ _DEFAULT_CHUNK_SIZE = 1024
+
+ if 'should_rename_legacy' in inspect.signature(chardet.UniversalDetector).parameters:
+ def _create_detector(should_rename_legacy: bool):
+ return chardet.UniversalDetector(should_rename_legacy=should_rename_legacy)
+ else:
+ def _create_detector(should_rename_legacy: bool):
+ return chardet.UniversalDetector()
+
+ def detect_stream_encoding(stream: io.IOBase, chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
+ detector = _create_detector(should_rename_legacy=should_rename_legacy)
+ while True:
+ raw = stream.read(chunk_size)
+ if not raw:
+ break
+ detector.feed(raw)
+ if detector.done:
+ break
+ detector.close()
+ return detector.result.get('encoding')
+
+ def detect_text_encoding(raw: bytes, chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
+ return detect_stream_encoding(io.BytesIO(raw), chunk_size=chunk_size, should_rename_legacy=should_rename_legacy)
+
+ def detect_file_encoding(path: Union[str, os.PathLike], chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
+ with open(path, 'rb') as f:
+ return detect_stream_encoding(f, chunk_size=chunk_size, should_rename_legacy=should_rename_legacy)
+
+ def get_file_encoding(path: Union[str, os.PathLike], encoding: Union[None, Literal['auto'], str] = None) -> str:
+ if encoding == 'auto':
+ encoding = detect_file_encoding(path)
+ return encoding
+
+ def read_txt(path: Union[str, os.PathLike], encoding: Union[None, Literal['auto'], str] = None) -> str:
+ if encoding == 'auto':
+ with open(path, 'rb') as f:
+ raw = f.read()
+ encoding = detect_stream_encoding(io.BytesIO(raw))
+ return raw.decode(encoding)
+ with open(path, 'r', encoding=encoding) as f:
+ return f.read()
+
+ def save_txt(path: Union[str, os.PathLike], content: str, encoding: str = 'utf-8'):
+ with open(path, 'w', encoding=encoding) as f:
+ f.write(content)
+
+ def write_txt(path: Union[str, os.PathLike], content: str, encoding: str = 'utf-8'):
+ save_txt(path=path, content=content, encoding=encoding)
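
A hedged sketch of the new text helpers (paths are hypothetical; chardet's guess is best-effort):

    from feilian import detect_file_encoding, read_txt, save_txt

    save_txt("legacy.txt", "héllo wörld", encoding="latin-1")
    print(detect_file_encoding("legacy.txt"))        # best-effort guess, e.g. 'ISO-8859-1'
    print(read_txt("legacy.txt", encoding="auto"))   # decode using the detected encoding
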
@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # coding: utf-8

- from typing import Dict, Any, Union, Collection, List
+ from ._typing import Dict, Any, Union, Collection, List

  def flatten_dict(data: Dict[str, Any], prefix="", joiner=".",
  exclude: Union[None, str, Collection[str]] = None,
@@ -1,13 +1,15 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: feilian
- Version: 1.2.2
+ Version: 1.3.4
  Summary: General data processing tool.
  Author-email: darkpeath <darkpeath@gmail.com>
  Project-URL: Homepage, https://github.com/darkpeath/feilian
  Description-Content-Type: text/markdown
+ Requires-Dist: chardet
  Requires-Dist: pandas
  Provides-Extra: extra
  Requires-Dist: tqdm; extra == "extra"
+ Requires-Dist: ijson; extra == "extra"

  # feilian

@@ -4,13 +4,16 @@ pyproject.toml
  requirements.txt
  feilian/__init__.py
  feilian/_dist_ver.py
+ feilian/_typing.py
  feilian/arg.py
  feilian/dataframe.py
  feilian/datetime.py
+ feilian/excel.py
  feilian/io.py
  feilian/json.py
  feilian/process.py
  feilian/string.py
+ feilian/txt.py
  feilian/utils.py
  feilian/version.py
  feilian.egg-info/PKG-INFO
@@ -1,4 +1,6 @@
+ chardet
  pandas

  [extra]
  tqdm
+ ijson
@@ -11,12 +11,14 @@ authors = [
  {name = "darkpeath", email = "darkpeath@gmail.com"}
  ]
  dependencies = [
+ "chardet",
  "pandas",
  ]

  [project.optional-dependencies]
  extra = [
  "tqdm",
+ "ijson",
  ]

  [project.urls]
@@ -33,4 +35,4 @@ write_to_template = """
  # don't change, don't track in version control
  VERSION = {version_tuple}
  __version__ = '{version}'
- """
+ """
@@ -1,4 +1,6 @@
  setuptools>=42
  setuptools_scm[toml]>=3.4
- pandas
+ pandas<2.0.0
  tqdm
+ chardet
+ ijson
@@ -1,39 +0,0 @@
- # -*- coding: utf-8 -*-
-
- from typing import Dict, List, Union, Any
- import json
- from .io import ensure_parent_dir_exist
-
- def _is_jsonl(filepath: str, jsonl=None) -> bool:
- if jsonl is None:
- jsonl = filepath.lower().endswith('.jsonl')
- return jsonl
-
- def read_json(filepath: str, jsonl=None, encoding='utf-8', **kwargs):
- """
- An agent for `json.load()` with some default value.
- """
- jsonl = _is_jsonl(filepath, jsonl)
- with open(filepath, encoding=encoding) as f:
- if jsonl:
- return [json.loads(x) for x in f]
- else:
- return json.load(f, **kwargs)
-
- def save_json(filepath: str, data: Union[Dict[str, Any], List[Any]], jsonl=False,
- encoding='utf-8', newline='\n', indent=2, ensure_ascii=False, **kwargs):
- """
- An agent for `json.dump()` with some default value.
- """
- jsonl = _is_jsonl(filepath, jsonl)
- if jsonl and not isinstance(data, list):
- # data should be a list
- raise ValueError("data should be a list when save as jsonl format")
- ensure_parent_dir_exist(filepath)
- with open(filepath, 'w', encoding=encoding, newline=newline) as f:
- if jsonl:
- for x in data:
- f.write(json.dumps(x, ensure_ascii=ensure_ascii, **kwargs))
- f.write(newline)
- else:
- json.dump(data, f, indent=indent, ensure_ascii=ensure_ascii, **kwargs)