feilian 1.2.1__py3-none-any.whl → 1.2.3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of feilian might be problematic. Click here for more details.

feilian/_dist_ver.py CHANGED
@@ -1,5 +1,5 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  # file generated by setuptools_scm
3
3
  # don't change, don't track in version control
4
- VERSION = (1, 2, 1)
5
- __version__ = '1.2.1'
4
+ VERSION = (1, 2, 3)
5
+ __version__ = '1.2.3'
feilian/dataframe.py CHANGED
@@ -25,9 +25,18 @@ if pd_version[0] < 1 or (pd_version[0] == 1 and pd_version[1] < 5):
25
25
  FILE_FORMAT = Literal['csv', 'tsv', 'json', 'xlsx', 'parquet']
26
26
  COMPRESSION_FORMAT = Literal[None, 'infer', 'snappy', 'gzip', 'brotli', 'bz2', 'zip', 'xz']
27
27
 
28
+ def _drop_na_values(data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], axis: Literal['columns', 'rows']):
29
+ if isinstance(data, pd.DataFrame):
30
+ data.dropna(axis=axis, how='all', inplace=True)
31
+ else:
32
+ assert isinstance(data, dict)
33
+ for df in data.values():
34
+ df.dropna(axis=axis, how='all', inplace=True)
35
+
28
36
  def read_dataframe(file: str, *args, sheet_name=0,
29
37
  file_format: FILE_FORMAT = None,
30
38
  jsonl=False, dtype: type = None,
39
+ drop_na_columns=False, drop_na_rows=False,
31
40
  **kwargs) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
32
41
  """
33
42
  read file as pandas `DataFrame`
@@ -37,6 +46,8 @@ def read_dataframe(file: str, *args, sheet_name=0,
37
46
  :param file_format: csv, tsv, json ,xlsx, parquet
38
47
  :param jsonl: jsonl format or not, only used in json format
39
48
  :param dtype: `dtype` for `pd.read_xx()`
49
+ :param drop_na_columns: drop column if all values of the column is na
50
+ :param drop_na_rows: drop row if all values of the row is na
40
51
  :param kwargs: extra kwargs for `pd.read_xx()`
41
52
  """
42
53
  # decide the file format
@@ -61,16 +72,23 @@ def read_dataframe(file: str, *args, sheet_name=0,
61
72
  jsonl = True
62
73
 
63
74
  if file_format == 'csv':
64
- return pd.read_csv(file, *args, dtype=dtype, **kwargs)
75
+ df = pd.read_csv(file, *args, dtype=dtype, **kwargs)
65
76
  elif file_format == 'xlsx':
66
- return pd.read_excel(file, *args, sheet_name=sheet_name, dtype=dtype, **kwargs)
77
+ df = pd.read_excel(file, *args, sheet_name=sheet_name, dtype=dtype, **kwargs)
67
78
  elif file_format == 'json':
68
- return pd.read_json(file, *args, lines=jsonl, dtype=dtype, **kwargs)
79
+ df = pd.read_json(file, *args, lines=jsonl, dtype=dtype, **kwargs)
69
80
  elif file_format == 'parquet':
70
- return pd.read_parquet(file, *args, **kwargs)
81
+ df = pd.read_parquet(file, *args, **kwargs)
71
82
  else:
72
83
  raise IOError(f"Unknown file format: {file}")
73
84
 
85
+ if drop_na_columns:
86
+ _drop_na_values(df, axis='columns')
87
+ if drop_na_rows:
88
+ _drop_na_values(df, axis='rows')
89
+
90
+ return df
91
+
74
92
  def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[str]'],
75
93
  df: Union[pd.DataFrame, Iterable[Union[pd.Series, Dict[str, Any]]]],
76
94
  *args, sheet_name='Sheet1',
feilian/process.py CHANGED
@@ -1,8 +1,14 @@
1
1
  import abc
2
2
  import tqdm
3
3
  import pandas as pd
4
- from typing import Any, Dict, Hashable
5
- from .dataframe import read_dataframe, save_dataframe
4
+ from typing import (
5
+ Any, Dict, Hashable, List,
6
+ Tuple, Union, Iterable, Optional,
7
+ )
8
+ from .dataframe import (
9
+ read_dataframe,
10
+ save_dataframe,
11
+ )
6
12
 
7
13
  class BaseProcessor(abc.ABC):
8
14
  """
@@ -10,10 +16,25 @@ class BaseProcessor(abc.ABC):
10
16
  """
11
17
 
12
18
  @abc.abstractmethod
13
- def read_data(self, filepath: str) -> Any:
19
+ def read_single_file(self, filepath: str) -> Any:
20
+ """
21
+ Actual method to read data from a single file.
22
+ """
23
+
24
+ def merge_input_data(self, data: Iterable[Any]) -> Any:
25
+ """
26
+ Merge data read from multi files.
27
+ """
28
+ return data
29
+
30
+ def read_data(self, filepath: Union[str, List[str], Tuple[str]]) -> Any:
14
31
  """
15
32
  Read data from input file.
16
33
  """
34
+ if isinstance(filepath, (list, tuple)):
35
+ return self.merge_input_data(self.read_single_file(x) for x in filepath)
36
+ else:
37
+ return self.read_single_file(filepath)
17
38
 
18
39
  @abc.abstractmethod
19
40
  def save_result(self, filepath: str, result: Any):
@@ -27,7 +48,7 @@ class BaseProcessor(abc.ABC):
27
48
  Process data and return result.
28
49
  """
29
50
 
30
- def run(self, input_path: str, output_path: str = None, write_output=True):
51
+ def run(self, input_path: Union[str, List[str], Tuple[str]], output_path: str = None, write_output=True):
31
52
  """
32
53
  Read from a file, and save result to another file.
33
54
  :param input_path: file with the data
@@ -40,20 +61,29 @@ class BaseProcessor(abc.ABC):
40
61
  self.save_result(output_path or input_path, result)
41
62
 
42
63
  class DataframeProcessor(BaseProcessor, abc.ABC):
43
- def __init__(self, input_dtype=None, progress=False):
44
- self.input_dtype = input_dtype
64
+ def __init__(self, input_dtype=None, progress=False, read_args: Dict[str, Any] = None):
45
65
  self.progress = progress
66
+ self.read_args = read_args or {}
67
+ if input_dtype is not None:
68
+ self.read_args['dtype'] = input_dtype
69
+
70
+ def read_single_file(self, filepath: str) -> pd.DataFrame:
71
+ return read_dataframe(filepath, **self.read_args)
72
+
73
+ def merge_input_data(self, data: Iterable[pd.DataFrame]) -> pd.DataFrame:
74
+ return pd.concat(data)
46
75
 
47
- def read_data(self, filepath: str) -> pd.DataFrame:
48
- return read_dataframe(filepath, dtype=self.input_dtype)
76
+ def read_data(self, filepath: Union[str, List[str], Tuple[str]]) -> pd.DataFrame:
77
+ return super().read_data(filepath)
49
78
 
50
79
  def save_result(self, filepath: str, result: pd.DataFrame):
51
80
  save_dataframe(filepath, result)
52
81
 
53
82
  @abc.abstractmethod
54
- def process_row(self, i: Hashable, row: pd.Series) -> Dict[str, Any]:
83
+ def process_row(self, i: Hashable, row: pd.Series) -> Optional[Dict[str, Any]]:
55
84
  """
56
85
  Process a single row of data.
86
+ :return: if `None`, ignore this row
57
87
  """
58
88
 
59
89
  def process(self, data: pd.DataFrame) -> pd.DataFrame:
@@ -61,6 +91,7 @@ class DataframeProcessor(BaseProcessor, abc.ABC):
61
91
  if self.progress:
62
92
  desc = "process" if self.progress is True else self.progress
63
93
  bar = tqdm.tqdm(bar, total=len(data), desc=desc)
64
- res = [self.process_row(i, row) for i, row in bar]
94
+ res = (self.process_row(i, row) for i, row in bar)
95
+ res = (x for x in res if x is not None)
65
96
  return pd.DataFrame(res)
66
97
 
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: feilian
3
- Version: 1.2.1
3
+ Version: 1.2.3
4
4
  Summary: General data processing tool.
5
5
  Author-email: darkpeath <darkpeath@gmail.com>
6
6
  Project-URL: Homepage, https://github.com/darkpeath/feilian
7
7
  Description-Content-Type: text/markdown
8
8
  Requires-Dist: pandas
9
9
  Provides-Extra: extra
10
- Requires-Dist: tqdm ; extra == 'extra'
10
+ Requires-Dist: tqdm; extra == "extra"
11
11
 
12
12
  # feilian
13
13
 
@@ -1,15 +1,15 @@
1
1
  feilian/__init__.py,sha256=97Rvz2O6LknvCWInAjINH4GeqG4-QPPcep9n-V7KD8k,911
2
- feilian/_dist_ver.py,sha256=quKRw8l2uSmWwO0SXPBC-2dE29rT4esN_0UQ9PP_-bk,148
2
+ feilian/_dist_ver.py,sha256=WDsMjivFi6Plqpo4JtGV48YaZNQnJ56AduVgKWA5Klo,148
3
3
  feilian/arg.py,sha256=n2nIcmC_3rb9A6BOzm9C5z3-T4lnubaGzH2sFhtqwZQ,8402
4
- feilian/dataframe.py,sha256=DsUiNRuVAe2H6WwqtZ6TGu6WHiLxFEN_Xjl208xvvnw,10698
4
+ feilian/dataframe.py,sha256=BtW6IQrXTF9hQCeccIXkedSjrpfuSKkzWGxd0VxiVHU,11390
5
5
  feilian/datetime.py,sha256=IONvWhLeGEy9IVe6GWKEW3FhrfRrShyhGP8-RTf9r3c,763
6
6
  feilian/io.py,sha256=aYN3QwWcLoRKzhGMNutqdkmxArVcXfeWXzxCB07LcFc,155
7
7
  feilian/json.py,sha256=1FkQ6e4JmbccpxhMobXpsGg-f7uVrTtUmu6jDCXCFTQ,1406
8
- feilian/process.py,sha256=GLkmogYnhkxi6qf-JX-FwIlJAHmTynmgB7zSAggEe1E,2080
8
+ feilian/process.py,sha256=3uig0s_X_espK26tD1cKkapi1gatT1M3zoiFwAws6PU,3153
9
9
  feilian/string.py,sha256=G_X3dnR0Oxmi4hXF-6E5jm5M7GPjGoMYrSMyI1dj6Z4,370
10
10
  feilian/utils.py,sha256=pzzGEgngidVkYvuNJWY7KWjkdgGRuH5_ENaLS6kxBtk,2648
11
11
  feilian/version.py,sha256=oH_DvE7jRCWlCCX9SSadwxwRJXFas_rIisYLBGPYZn4,350
12
- feilian-1.2.1.dist-info/METADATA,sha256=v0pXur6UQVS3a6U8irka8ucSq9M-wyQ4bgshSv6IBJo,3723
13
- feilian-1.2.1.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
14
- feilian-1.2.1.dist-info/top_level.txt,sha256=1Q2-B6KJrcTr7drW_kik35PTVEUJLPP4wVrn0kYKwGw,8
15
- feilian-1.2.1.dist-info/RECORD,,
12
+ feilian-1.2.3.dist-info/METADATA,sha256=1yull6iP_bypv8etP-9wxoc0zOMdYgXwvlixByFN4xs,3722
13
+ feilian-1.2.3.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
14
+ feilian-1.2.3.dist-info/top_level.txt,sha256=1Q2-B6KJrcTr7drW_kik35PTVEUJLPP4wVrn0kYKwGw,8
15
+ feilian-1.2.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (71.1.0)
2
+ Generator: setuptools (72.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5