feilian 1.2.2.tar.gz → 1.3.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,15 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: feilian
- Version: 1.2.2
+ Version: 1.3.4
  Summary: General data processing tool.
  Author-email: darkpeath <darkpeath@gmail.com>
  Project-URL: Homepage, https://github.com/darkpeath/feilian
  Description-Content-Type: text/markdown
+ Requires-Dist: chardet
  Requires-Dist: pandas
  Provides-Extra: extra
  Requires-Dist: tqdm; extra == "extra"
+ Requires-Dist: ijson; extra == "extra"

  # feilian

@@ -5,8 +5,13 @@ from .dataframe import read_dataframe, save_dataframe, extract_dataframe_sample,
  from .dataframe import is_empty_text, is_nonempty_text, is_blank_text, is_non_blank_text
  from .datetime import format_time, format_date
  from .arg import ArgValueParser
- from .json import read_json, save_json
+ from .json import read_json, save_json, write_json, read_big_json
+ from .txt import (
+ detect_stream_encoding, detect_file_encoding, get_file_encoding,
+ read_txt, save_txt, write_txt,
+ )
  from .process import DataframeProcessor
+ from .excel import save_excel, write_excel
  from .utils import flatten_dict, flatten_list
  from .version import __version__

@@ -16,7 +21,10 @@ __all__ = [
  'is_empty_text', 'is_nonempty_text', 'is_blank_text', 'is_non_blank_text',
  'format_time', 'format_date',
  'ArgValueParser',
- 'read_json', 'save_json',
+ 'read_json', 'save_json', 'write_json', 'read_big_json',
+ 'detect_stream_encoding', 'detect_file_encoding', 'get_file_encoding',
+ 'read_txt', 'save_txt', 'write_txt',
+ 'save_excel', 'write_excel',
  'DataframeProcessor',
  'flatten_dict', 'flatten_list',
  '__version__',
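
For orientation, a minimal sketch of how the newly exported names could be used from the package top level (file names below are made up):

    # Sketch only: assumes feilian 1.3.4 is installed; paths are hypothetical.
    from feilian import read_json, save_json, detect_file_encoding, read_txt

    enc = detect_file_encoding("notes.txt")          # chardet-based guess, e.g. 'utf-8'
    text = read_txt("notes.txt", encoding="auto")    # decode with the detected encoding
    data = read_json("config.json")                  # json or jsonl, inferred from the suffix
    save_json("out.jsonl", [{"a": 1}, {"a": 2}], jsonl=True)
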
@@ -1,5 +1,5 @@
  # -*- coding: utf-8 -*-
  # file generated by setuptools_scm
  # don't change, don't track in version control
- VERSION = (1, 2, 2)
- __version__ = '1.2.2'
+ VERSION = (1, 3, 4)
+ __version__ = '1.3.4'
@@ -0,0 +1,5 @@
+ from typing import *
+ try:
+ from typing import Literal
+ except ImportError:
+ from typing_extensions import Literal
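
The new feilian/_typing.py is a small compatibility shim: it re-exports everything from typing and falls back to typing_extensions for Literal, which only entered the standard library in Python 3.8. A hedged sketch of how the rest of the package consumes it:

    # Sketch: modules below import typing names from the shim instead of typing directly,
    # so older interpreters only need typing_extensions installed.
    from feilian._typing import Literal

    FORMAT = Literal['csv', 'json']   # typing.Literal on 3.8+, typing_extensions.Literal otherwise
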
@@ -1,14 +1,11 @@
  # -*- coding: utf-8 -*-

- from typing import (
+ from ._typing import (
  Union, List, Any, Iterable,
  Callable, Set, Optional, Tuple,
  Dict, Hashable, Sequence,
+ Literal,
  )
- try:
- from typing import Literal
- except ImportError:
- from typing_extensions import Literal

  _build_in_na_checkers = {
  'always_na': lambda x: True,
@@ -4,17 +4,15 @@
  Encapsulate methods for pandas `DataFrame`.
  """

- from typing import Union, Iterable, Dict, List, Any, Sequence, Callable, Tuple, Hashable
- try:
- from typing import Literal
- except ImportError:
- from typing_extensions import Literal
-
+ import io
  import os
+ import pathlib
  import pandas as pd
  import random
  import collections
+ from ._typing import Union, Iterable, Dict, List, Any, Sequence, Callable, Tuple, Hashable, Literal
  from .io import ensure_parent_dir_exist
+ from .txt import detect_stream_encoding, detect_file_encoding

  # Compatible with different pandas versions
  PD_PARAM_NEWLINE = 'lineterminator'
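
The surrounding context hints at the existing pandas compatibility shim: newer pandas spells the to_csv newline keyword lineterminator, while releases before 1.5 used line_terminator. A rough illustration of that kind of shim (the version condition mirrors the one visible in the following hunk header; the rest is assumed):

    # Illustration only: pandas renamed `line_terminator` to `lineterminator` in 1.5.
    import pandas as pd

    pd_version = tuple(int(x) for x in pd.__version__.split('.')[:2])
    PD_PARAM_NEWLINE = 'lineterminator'
    if pd_version[0] < 1 or (pd_version[0] == 1 and pd_version[1] < 5):
        PD_PARAM_NEWLINE = 'line_terminator'

    pd.DataFrame({'a': [1, 2]}).to_csv('out.csv', **{PD_PARAM_NEWLINE: '\n'})
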
@@ -25,9 +23,31 @@ if pd_version[0] < 1 or (pd_version[0] == 1 and pd_version[1] < 5):
  FILE_FORMAT = Literal['csv', 'tsv', 'json', 'xlsx', 'parquet']
  COMPRESSION_FORMAT = Literal[None, 'infer', 'snappy', 'gzip', 'brotli', 'bz2', 'zip', 'xz']

- def read_dataframe(file: str, *args, sheet_name=0,
- file_format: FILE_FORMAT = None,
+ def _drop_na_values(data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], axis: Literal['columns', 'rows']):
+ if isinstance(data, pd.DataFrame):
+ data.dropna(axis=axis, how='all', inplace=True)
+ else:
+ assert isinstance(data, dict)
+ for df in data.values():
+ df.dropna(axis=axis, how='all', inplace=True)
+
+ def _infer_file_format(file) -> str:
+ if isinstance(file, pd.ExcelWriter):
+ return 'xlsx'
+ elif isinstance(file, str):
+ return os.path.splitext(file)[1].lower()[1:]
+ elif isinstance(file, pathlib.PurePath):
+ suf = file.suffix
+ return suf[1:] if suf.startswith('.') else suf
+ elif isinstance(file, os.PathLike):
+ return os.path.splitext(os.fspath(file))[1].lower().lstrip('.')
+ else:
+ raise ValueError(f"Cannot infer format for type: {type(file)}")
+
+ def read_dataframe(file: Union[str, os.PathLike, io.IOBase], *args, sheet_name=0,
+ file_format: FILE_FORMAT = None, encoding='auto',
  jsonl=False, dtype: type = None,
+ drop_na_columns=False, drop_na_rows=False,
  **kwargs) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
  """
  read file as pandas `DataFrame`
@@ -35,15 +55,16 @@ def read_dataframe(file: str, *args, sheet_name=0,
  :param args: extra args for `pd.read_xx()`
  :param sheet_name: `sheet_name` for `pd.read_excel()`
  :param file_format: csv, tsv, json ,xlsx, parquet
+ :param encoding: text file encoding
  :param jsonl: jsonl format or not, only used in json format
  :param dtype: `dtype` for `pd.read_xx()`
+ :param drop_na_columns: drop column if all values of the column is na
+ :param drop_na_rows: drop row if all values of the row is na
  :param kwargs: extra kwargs for `pd.read_xx()`
  """
  # decide the file format
  if not file_format:
- if not isinstance(file, str):
- raise ValueError("Format should given!")
- file_format = os.path.splitext(file)[1].lower()[1:]
+ file_format = _infer_file_format(file)

  for key in ['lines', 'line_delimited_json_format']:
  if key in kwargs and kwargs.pop(key):
@@ -60,18 +81,44 @@ def read_dataframe(file: str, *args, sheet_name=0,
  file_format = 'json'
  jsonl = True

+ # detect encoding
+ if encoding == 'auto' and file_format in ['csv', 'json']:
+ if isinstance(file, (str, os.PathLike)):
+ encoding = detect_file_encoding(file)
+ elif isinstance(file, io.IOBase) and file.seekable():
+ tell = file.tell()
+ encoding = detect_stream_encoding(file)
+ file.seek(tell)
+ else:
+ # read file may cause content change, so we cannot detect the encoding
+ encoding = None
+
  if file_format == 'csv':
- return pd.read_csv(file, *args, dtype=dtype, **kwargs)
+ df = pd.read_csv(file, *args, encoding=encoding, dtype=dtype, **kwargs)
  elif file_format == 'xlsx':
- return pd.read_excel(file, *args, sheet_name=sheet_name, dtype=dtype, **kwargs)
+ df = pd.read_excel(file, *args, sheet_name=sheet_name, dtype=dtype, **kwargs)
  elif file_format == 'json':
- return pd.read_json(file, *args, lines=jsonl, dtype=dtype, **kwargs)
+ try:
+ df = pd.read_json(file, *args, encoding=encoding, lines=jsonl, dtype=dtype, **kwargs)
+ except Exception as e:
+ # if failed, try again with different arg `lines`
+ try:
+ df = pd.read_json(file, *args, lines=not jsonl, dtype=dtype, **kwargs)
+ except Exception:
+ raise e
  elif file_format == 'parquet':
- return pd.read_parquet(file, *args, **kwargs)
+ df = pd.read_parquet(file, *args, **kwargs)
  else:
  raise IOError(f"Unknown file format: {file}")

- def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[str]'],
+ if drop_na_columns:
+ _drop_na_values(df, axis='columns')
+ if drop_na_rows:
+ _drop_na_values(df, axis='rows')
+
+ return df
+
+ def save_dataframe(file: Union[str, os.PathLike, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[str]'],
  df: Union[pd.DataFrame, Iterable[Union[pd.Series, Dict[str, Any]]]],
  *args, sheet_name='Sheet1',
  file_format: FILE_FORMAT = None,
@@ -79,7 +126,7 @@ def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[st
  index=False, index_label=None,
  encoding='utf-8', newline='\n',
  force_ascii=False,
- orient='records', jsonl=True,
+ orient='records', jsonl=True, indent=None,
  column_mapper: Union[Dict[str, str], Sequence[str]] = None,
  include_columns: Sequence[str] = None,
  exclude_columns: Sequence[str] = None,
@@ -101,6 +148,7 @@ def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[st
  :param force_ascii: `force_ascii` for json format
  :param orient: `orient` for json format
  :param jsonl: jsonl format or not
+ :param indent: indent for json format
  :param column_mapper: rename columns; if set, columns not list here will be ignored
  :param include_columns: if set, columns not list here will be ignored
  :param exclude_columns: if set, columns list here will be ignored
@@ -108,12 +156,7 @@ def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[st
  """
  # decide file format
  if not file_format:
- if isinstance(file, str):
- file_format = os.path.splitext(file)[1].lower()[1:]
- elif isinstance(file, pd.ExcelWriter):
- file_format = 'xlsx'
- else:
- raise ValueError("Format should given!")
+ file_format = _infer_file_format(file)

  # convert data to be a dataframe
  if not isinstance(df, pd.DataFrame):
@@ -158,11 +201,12 @@ def save_dataframe(file: Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[st
  elif file_format == 'json':
  if jsonl:
  orient = 'records'
+ indent = None
  if orient not in ['split', 'table']:
  index = True
  df.to_json(file, *args, compression=compression, index=index,
  force_ascii=force_ascii, orient=orient, lines=jsonl,
- **kwargs)
+ indent=indent, **kwargs)
  elif file_format == 'parquet':
  df.to_parquet(file, *args, compression=compression, index=index, **kwargs)
  else:
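
Taken together, a hedged usage sketch of the reworked read/save pair (paths are hypothetical; only arguments visible in the hunks above are used):

    from feilian import read_dataframe, save_dataframe

    # encoding='auto' triggers chardet-based detection for csv/json inputs;
    # drop_na_columns / drop_na_rows prune all-NA columns and rows after reading.
    df = read_dataframe("input.csv", encoding="auto", drop_na_columns=True)

    # jsonl=True forces orient='records' and clears indent; with jsonl=False the new
    # `indent` argument is forwarded to DataFrame.to_json().
    save_dataframe("output.json", df, jsonl=False, indent=2)
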
@@ -1,6 +1,6 @@
  # -*- coding: utf-8 -*-

- from typing import Union
+ from ._typing import Union
  import pandas as pd
  import datetime

@@ -0,0 +1,83 @@
+ import pandas as pd
+ from ._typing import Union, Iterable, Dict, Sequence, Any, List, Tuple
+ from .dataframe import save_dataframe
+
+ def _save_excel(file, df, *args, **kwargs):
+ # if df is a list of dataframe, then save each dataframe into a sheet
+ if isinstance(df, (list, tuple)) and df and all(isinstance(x, pd.DataFrame) for x in df):
+ if 'sheet_name' in kwargs:
+ kwargs.pop('sheet_name')
+ with pd.ExcelWriter(file) as writer:
+ for i, x in enumerate(df, 1):
+ save_dataframe(writer, x, *args, sheet_name=f"Sheet{i}", **kwargs)
+ elif isinstance(df, dict) and df and all(isinstance(x, pd.DataFrame) for x in df.values()):
+ if 'sheet_name' in kwargs:
+ kwargs.pop('sheet_name')
+ with pd.ExcelWriter(file) as writer:
+ for name, x in df.items():
+ save_dataframe(writer, x, *args, sheet_name=name, **kwargs)
+ else:
+ return save_dataframe(file, df, *args, **kwargs)
+
+ _FILE_TYPES = Union[str, 'pd.WriteBuffer[bytes]', 'pd.WriteBuffer[str]']
+ _DATA_TYPES = Union[
+ pd.DataFrame, Iterable[Union[pd.Series, Dict[str, Any]]],
+ List[pd.DataFrame], Tuple[pd.DataFrame], Dict[str, pd.DataFrame]
+ ]
+
+ def save_excel(file: _FILE_TYPES, df: _DATA_TYPES,
+ *args, sheet_name='Sheet1',
+ header: Union[Sequence[str], bool] = True,
+ index=False, index_label=None,
+ column_mapper: Union[Dict[str, str], Sequence[str]] = None,
+ include_columns: Sequence[str] = None,
+ exclude_columns: Sequence[str] = None,
+ **kwargs):
+ """
+ save data into file
+ :param file: where to save the data to
+ :param df: the data
+ :param args: extra args for df.to_xx()
+ :param sheet_name: `sheet_name` for excel format
+ :param header: `header` for excel format
+ :param index: save index or not, see docs in df.to_csv();
+ if set as str and `index_label` not set, `index_label` will be set as this
+ :param index_label: header for the index when `index` is `True`
+ :param column_mapper: rename columns; if set, columns not list here will be ignored
+ :param include_columns: if set, columns not list here will be ignored
+ :param exclude_columns: if set, columns list here will be ignored
+ :param kwargs: extra kwargs for df.to_xx()
+ """
+ _save_excel(
+ file, df, *args,
+ sheet_name=sheet_name,
+ header=header,
+ index=index,
+ index_label=index_label,
+ column_mapper=column_mapper,
+ include_columns=include_columns,
+ exclude_columns=exclude_columns,
+ **kwargs
+ )
+
+ def write_excel(
+ file: _FILE_TYPES, df: _DATA_TYPES,
+ *args, sheet_name='Sheet1',
+ header: Union[Sequence[str], bool] = True,
+ index=False, index_label=None,
+ column_mapper: Union[Dict[str, str], Sequence[str]] = None,
+ include_columns: Sequence[str] = None,
+ exclude_columns: Sequence[str] = None,
+ **kwargs
+ ):
+ save_excel(
+ file, df, *args,
+ sheet_name=sheet_name,
+ header=header,
+ index=index,
+ index_label=index_label,
+ column_mapper=column_mapper,
+ include_columns=include_columns,
+ exclude_columns=exclude_columns,
+ **kwargs
+ )
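
A hedged sketch of the new save_excel helper: a dict of DataFrames writes one sheet per key, a list or tuple becomes Sheet1, Sheet2, ..., and anything else falls through to save_dataframe:

    # Sketch: output paths are hypothetical; writing .xlsx needs an engine such as openpyxl.
    import pandas as pd
    from feilian import save_excel

    sheets = {
        "users": pd.DataFrame({"name": ["a", "b"]}),
        "orders": pd.DataFrame({"total": [10.0, 2.5]}),
    }
    save_excel("report.xlsx", sheets)             # one sheet per dict key
    save_excel("single.xlsx", sheets["users"])    # plain DataFrame -> save_dataframe path
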
@@ -0,0 +1,262 @@
+ # -*- coding: utf-8 -*-
+
+ from typing import Dict, List, Union, Any
+ from pathlib import Path
+ import os
+ import abc
+ import json
+ from decimal import Decimal
+ from .io import ensure_parent_dir_exist
+ from .txt import get_file_encoding
+ try:
+ import ijson
+ except ImportError as e:
+ ijson = None
+
+ def _read_json(filepath: Union[str, os.PathLike], jsonl: bool, encoding='utf-8', **kwargs):
+ """
+ The actual read function.
+ """
+ encoding = get_file_encoding(filepath, encoding=encoding)
+ with open(filepath, encoding=encoding) as f:
+ if jsonl:
+ return [json.loads(x, **kwargs) for x in f]
+ else:
+ return json.load(f, **kwargs)
+
+ def _is_jsonl(filepath: Union[str, os.PathLike], jsonl: bool = None) -> bool:
+ if jsonl is None:
+ filepath = Path(filepath)
+ jsonl = filepath.suffix.lower() == '.jsonl'
+ return jsonl
+
+ def read_json(
+ filepath: Union[str, os.PathLike],
+ jsonl: bool = None,
+ encoding: str = 'auto',
+ **kwargs
+ ) -> Union[Dict[str, Any], List[Any]]:
+ """
+ An agent for `json.load()` with some default value.
+ """
+ jsonl = _is_jsonl(filepath, jsonl)
+ try:
+ return _read_json(filepath, jsonl=jsonl, encoding=encoding, **kwargs)
+ except Exception as e:
+ # if failed, try again with different arg `jsonl`
+ try:
+ return _read_json(filepath, jsonl=not jsonl, encoding=encoding, **kwargs)
+ except Exception:
+ raise e
+
+ def save_json(
+ filepath: Union[str, os.PathLike],
+ data: Union[Dict[str, Any], List[Any]],
+ jsonl: bool = None,
+ encoding: str = 'utf-8',
+ newline: str = '\n',
+ indent: int = 2,
+ ensure_ascii: bool = False,
+ **kwargs
+ ):
+ """
+ An agent for `json.dump()` with some default value.
+ """
+ jsonl = _is_jsonl(filepath, jsonl)
+ if jsonl and not isinstance(data, list):
+ # data should be a list
+ raise ValueError("data should be a list when save as jsonl format")
+ ensure_parent_dir_exist(filepath)
+ with open(filepath, 'w', encoding=encoding, newline=newline) as f:
+ if jsonl:
+ for x in data:
+ f.write(json.dumps(x, ensure_ascii=ensure_ascii, **kwargs))
+ f.write(newline)
+ else:
+ json.dump(data, f, indent=indent, ensure_ascii=ensure_ascii, **kwargs)
+
+ def write_json(
+ filepath: Union[str, os.PathLike],
+ data: Union[Dict[str, Any], List[Any]],
+ jsonl: bool = None,
+ encoding: str = 'utf-8',
+ newline: str = '\n',
+ indent: int = 2,
+ ensure_ascii: bool = False,
+ **kwargs
+ ):
+ save_json(
+ filepath=filepath,
+ data=data,
+ jsonl=jsonl,
+ encoding=encoding,
+ newline=newline,
+ indent=indent,
+ ensure_ascii=ensure_ascii,
+ **kwargs
+ )
+
+
+ class _JsonNode:
+ def __init__(self, type: str = '', parent: '_JsonNode' = None):
+ self._type = ''
+ self._value = None
+ self._parent = parent
+ if type:
+ self.type = type
+
+ def clear(self):
+ if self._type == 'map':
+ self._value.clear()
+ elif self._type == 'array':
+ self._value.clear()
+
+ @property
+ def parent(self):
+ return self._parent
+
+ @property
+ def type(self):
+ return self._type
+
+ @type.setter
+ def type(self, value):
+ if self._type:
+ raise ValueError('type is already set')
+ self._type = value
+ if value == 'map':
+ self._value = {}
+ elif value == 'array':
+ self._value = []
+
+ @property
+ def value(self):
+ if not self._type:
+ raise ValueError('type is not set')
+ if self._type == 'dummy':
+ assert isinstance(self._value, _JsonNode)
+ return self._value.value
+ if self._type == 'map':
+ assert isinstance(self._value, dict)
+ return {k: v.value for k, v in self._value.items()}
+ if self._type == 'array':
+ assert isinstance(self._value, list)
+ return [v.value for v in self._value]
+ return self._value
+
+ @value.setter
+ def value(self, value):
+ if not self._type:
+ raise ValueError('type is not set')
+ if self._type in ['dummy', 'map', 'array']:
+ raise RuntimeError('cannot set value for dummy, map, array')
+ self._value = value
+
+ def __repr__(self):
+ return str(self.value)
+
+ def __str__(self):
+ return str(self.value)
+
+ class StreamJsonReader(abc.ABC):
+ """
+ Iterate over a json file.
+ """
+ def __init__(self, filepath: Union[str, os.PathLike], encoding: str = None, limit: int = float('inf')):
+ self.filepath = filepath
+ self.encoding = encoding
+ self.limit = limit
+ self._data_type = '' # dict or list
+
+ @property
+ def data_type(self):
+ return self._data_type
+
+ def __iter__(self):
+ raise NotImplementedError
+
+ class BigJsonReader(StreamJsonReader):
+ def __iter__(self):
+ with open(self.filepath, 'rb') as f:
+ parser = ijson.parse(f)
+ dummy = node = _JsonNode('')
+ cnt = 0
+ for prefix, event, value in parser:
+ if event == 'start_map':
+ if node.type == 'array':
+ child = _JsonNode(type='map', parent=node)
+ node._value.append(child)
+ node = child
+ else:
+ node.type = 'map'
+ elif event == 'end_map':
+ node = node.parent
+ elif event == 'start_array':
+ node.type = 'array'
+ elif event == 'end_array':
+ node = node.parent
+ elif event == 'map_key':
+ assert node.type == 'map', f"{event} {value} {prefix}"
+ child = _JsonNode(parent=node)
+ node._value[value] = child
+ node = child
+ else:
+ assert event in ['null', 'boolean', 'integer', 'double', 'number', 'string']
+ if isinstance(value, Decimal):
+ value = float(value)
+ if node.type == 'array':
+ child = _JsonNode(type=event, parent=node)
+ child.value = value
+ node._value.append(child)
+ else:
+ assert not node.type
+ node.type = event
+ node.value = value
+ node = node.parent
+ if node == dummy and event not in ['start_map', 'start_array']:
+ assert node.type in ['map', 'array']
+ if node.type == 'map':
+ value = node.value
+ assert isinstance(value, dict)
+ assert len(value) == 1
+ k, v = list(value.items())[0]
+ self._data_type = 'dict'
+ yield k, v
+ node.clear()
+ elif node.type == 'array':
+ value = node.value
+ assert isinstance(value, list)
+ assert len(value) == 1
+ self._data_type = 'list'
+ yield value[0]
+ node.clear()
+ cnt += 1
+ if cnt >= self.limit:
+ break
+
+ class JsonlReader(StreamJsonReader):
+ @property
+ def data_type(self):
+ return 'list'
+
+ def __iter__(self):
+ with open(self.filepath, encoding=self.encoding) as f:
+ for i, line in enumerate(f, 1):
+ yield json.loads(line)
+ if i >= self.limit:
+ break
+
+ def read_big_json(
+ filepath: Union[str, os.PathLike],
+ jsonl: bool = None,
+ encoding: str = 'auto',
+ ) -> StreamJsonReader:
+ jsonl = _is_jsonl(filepath, jsonl)
+ encoding = get_file_encoding(filepath, encoding=encoding)
+ if jsonl:
+ return JsonlReader(filepath, encoding=encoding)
+ else:
+ if ijson is None:
+ raise ImportError('ijson is not installed')
+ return BigJsonReader(filepath)
+
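
read_big_json returns a streaming reader instead of loading the whole document: JsonlReader yields one parsed object per line, while BigJsonReader walks an ijson event stream and yields top-level items (elements of an array, or key/value pairs of an object). A hedged usage sketch:

    # Sketch: file names are hypothetical; the non-jsonl path needs the optional ijson dependency.
    from feilian import read_big_json

    for record in read_big_json("huge.jsonl"):    # one object per line
        ...

    reader = read_big_json("huge.json")           # top-level array or object
    for item in reader:                           # array -> elements; object -> (key, value) pairs
        ...
    print(reader.data_type)                       # 'list' or 'dict', set while iterating
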
@@ -1,8 +1,14 @@
  import abc
  import tqdm
  import pandas as pd
- from typing import Any, Dict, Hashable, List, Tuple, Union, Iterable
- from .dataframe import read_dataframe, save_dataframe
+ from ._typing import (
+ Any, Dict, Hashable, List,
+ Tuple, Union, Iterable, Optional,
+ )
+ from .dataframe import (
+ read_dataframe,
+ save_dataframe,
+ )

  class BaseProcessor(abc.ABC):
  """
@@ -55,11 +61,13 @@ class BaseProcessor(abc.ABC):
  self.save_result(output_path or input_path, result)

  class DataframeProcessor(BaseProcessor, abc.ABC):
- def __init__(self, input_dtype=None, progress=False, read_args: Dict[str, Any] = None):
+ def __init__(self, input_dtype=None, progress=False, read_args: Dict[str, Any] = None,
+ write_args: Dict[str, Any] = None):
  self.progress = progress
  self.read_args = read_args or {}
  if input_dtype is not None:
  self.read_args['dtype'] = input_dtype
+ self.write_args = write_args or {}

  def read_single_file(self, filepath: str) -> pd.DataFrame:
  return read_dataframe(filepath, **self.read_args)
@@ -71,12 +79,13 @@ class DataframeProcessor(BaseProcessor, abc.ABC):
  return super().read_data(filepath)

  def save_result(self, filepath: str, result: pd.DataFrame):
- save_dataframe(filepath, result)
+ save_dataframe(filepath, result, **self.write_args)

  @abc.abstractmethod
- def process_row(self, i: Hashable, row: pd.Series) -> Dict[str, Any]:
+ def process_row(self, i: Hashable, row: pd.Series) -> Optional[Dict[str, Any]]:
  """
  Process a single row of data.
+ :return: if `None`, ignore this row
  """

  def process(self, data: pd.DataFrame) -> pd.DataFrame:
@@ -84,6 +93,7 @@ class DataframeProcessor(BaseProcessor, abc.ABC):
  if self.progress:
  desc = "process" if self.progress is True else self.progress
  bar = tqdm.tqdm(bar, total=len(data), desc=desc)
- res = [self.process_row(i, row) for i, row in bar]
+ res = (self.process_row(i, row) for i, row in bar)
+ res = (x for x in res if x is not None)
  return pd.DataFrame(res)

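A hedged sketch of the updated DataframeProcessor hooks: write_args is forwarded to save_dataframe, and process_row may now return None to drop a row from the result:

    # Sketch: the subclass, column names and data are made up.
    import pandas as pd
    from feilian import DataframeProcessor

    class KeepPositive(DataframeProcessor):
        def process_row(self, i, row):
            if row["value"] <= 0:
                return None                      # row is silently skipped
            return {"id": i, "value": row["value"]}

    # Assumes process_row is the only abstract hook left to implement.
    proc = KeepPositive(progress=True, write_args={"jsonl": True})
    out = proc.process(pd.DataFrame({"value": [3, -1, 5]}))   # None results are filtered out
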
@@ -1,6 +1,6 @@
  # -*- coding: utf-8 -*-

- from typing import Any, Callable
+ from ._typing import Any, Callable

  def join_values(*values: Any, sep='', func: Callable[[Any], str] = str, do_trim=False, ignore_empty=False):
  def f():
@@ -0,0 +1,54 @@
+ from ._typing import Union, Literal
+ import os
+ import io
+ import inspect
+ import chardet
+
+ _DEFAULT_CHUNK_SIZE = 1024
+
+ if 'should_rename_legacy' in inspect.signature(chardet.UniversalDetector).parameters:
+ def _create_detector(should_rename_legacy: bool):
+ return chardet.UniversalDetector(should_rename_legacy=should_rename_legacy)
+ else:
+ def _create_detector(should_rename_legacy: bool):
+ return chardet.UniversalDetector()
+
+ def detect_stream_encoding(stream: io.IOBase, chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
+ detector = _create_detector(should_rename_legacy=should_rename_legacy)
+ while True:
+ raw = stream.read(chunk_size)
+ if not raw:
+ break
+ detector.feed(raw)
+ if detector.done:
+ break
+ detector.close()
+ return detector.result.get('encoding')
+
+ def detect_text_encoding(raw: bytes, chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
+ return detect_stream_encoding(io.BytesIO(raw), chunk_size=chunk_size, should_rename_legacy=should_rename_legacy)
+
+ def detect_file_encoding(path: Union[str, os.PathLike], chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
+ with open(path, 'rb') as f:
+ return detect_stream_encoding(f, chunk_size=chunk_size, should_rename_legacy=should_rename_legacy)
+
+ def get_file_encoding(path: Union[str, os.PathLike], encoding: Union[None, Literal['auto'], str] = None) -> str:
+ if encoding == 'auto':
+ encoding = detect_file_encoding(path)
+ return encoding
+
+ def read_txt(path: Union[str, os.PathLike], encoding: Union[None, Literal['auto'], str] = None) -> str:
+ if encoding == 'auto':
+ with open(path, 'rb') as f:
+ raw = f.read()
+ encoding = detect_stream_encoding(io.BytesIO(raw))
+ return raw.decode(encoding)
+ with open(path, 'r', encoding=encoding) as f:
+ return f.read()
+
+ def save_txt(path: Union[str, os.PathLike], content: str, encoding: str = 'utf-8'):
+ with open(path, 'w', encoding=encoding) as f:
+ f.write(content)
+
+ def write_txt(path: Union[str, os.PathLike], content: str, encoding: str = 'utf-8'):
+ save_txt(path=path, content=content, encoding=encoding)
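
A hedged sketch of the new text helpers (paths are hypothetical; chardet's guess is best-effort):

    from feilian import detect_file_encoding, read_txt, save_txt

    save_txt("legacy.txt", "héllo wörld", encoding="latin-1")
    print(detect_file_encoding("legacy.txt"))        # best-effort guess, e.g. 'ISO-8859-1'
    print(read_txt("legacy.txt", encoding="auto"))   # decode using the detected encoding
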
@@ -1,7 +1,7 @@
  #!/usr/bin/env python
  # coding: utf-8

- from typing import Dict, Any, Union, Collection, List
+ from ._typing import Dict, Any, Union, Collection, List

  def flatten_dict(data: Dict[str, Any], prefix="", joiner=".",
  exclude: Union[None, str, Collection[str]] = None,
@@ -1,13 +1,15 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: feilian
- Version: 1.2.2
+ Version: 1.3.4
  Summary: General data processing tool.
  Author-email: darkpeath <darkpeath@gmail.com>
  Project-URL: Homepage, https://github.com/darkpeath/feilian
  Description-Content-Type: text/markdown
+ Requires-Dist: chardet
  Requires-Dist: pandas
  Provides-Extra: extra
  Requires-Dist: tqdm; extra == "extra"
+ Requires-Dist: ijson; extra == "extra"

  # feilian

@@ -4,13 +4,16 @@ pyproject.toml
  requirements.txt
  feilian/__init__.py
  feilian/_dist_ver.py
+ feilian/_typing.py
  feilian/arg.py
  feilian/dataframe.py
  feilian/datetime.py
+ feilian/excel.py
  feilian/io.py
  feilian/json.py
  feilian/process.py
  feilian/string.py
+ feilian/txt.py
  feilian/utils.py
  feilian/version.py
  feilian.egg-info/PKG-INFO
@@ -1,4 +1,6 @@
+ chardet
  pandas

  [extra]
  tqdm
+ ijson
@@ -11,12 +11,14 @@ authors = [
  {name = "darkpeath", email = "darkpeath@gmail.com"}
  ]
  dependencies = [
+ "chardet",
  "pandas",
  ]

  [project.optional-dependencies]
  extra = [
  "tqdm",
+ "ijson",
  ]

  [project.urls]
@@ -33,4 +35,4 @@ write_to_template = """
  # don't change, don't track in version control
  VERSION = {version_tuple}
  __version__ = '{version}'
- """
+ """
@@ -1,4 +1,6 @@
  setuptools>=42
  setuptools_scm[toml]>=3.4
- pandas
+ pandas<2.0.0
  tqdm
+ chardet
+ ijson
@@ -1,39 +0,0 @@
- # -*- coding: utf-8 -*-
-
- from typing import Dict, List, Union, Any
- import json
- from .io import ensure_parent_dir_exist
-
- def _is_jsonl(filepath: str, jsonl=None) -> bool:
- if jsonl is None:
- jsonl = filepath.lower().endswith('.jsonl')
- return jsonl
-
- def read_json(filepath: str, jsonl=None, encoding='utf-8', **kwargs):
- """
- An agent for `json.load()` with some default value.
- """
- jsonl = _is_jsonl(filepath, jsonl)
- with open(filepath, encoding=encoding) as f:
- if jsonl:
- return [json.loads(x) for x in f]
- else:
- return json.load(f, **kwargs)
-
- def save_json(filepath: str, data: Union[Dict[str, Any], List[Any]], jsonl=False,
- encoding='utf-8', newline='\n', indent=2, ensure_ascii=False, **kwargs):
- """
- An agent for `json.dump()` with some default value.
- """
- jsonl = _is_jsonl(filepath, jsonl)
- if jsonl and not isinstance(data, list):
- # data should be a list
- raise ValueError("data should be a list when save as jsonl format")
- ensure_parent_dir_exist(filepath)
- with open(filepath, 'w', encoding=encoding, newline=newline) as f:
- if jsonl:
- for x in data:
- f.write(json.dumps(x, ensure_ascii=ensure_ascii, **kwargs))
- f.write(newline)
- else:
- json.dump(data, f, indent=indent, ensure_ascii=ensure_ascii, **kwargs)