feilian 1.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
feilian/json.py ADDED
@@ -0,0 +1,262 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from typing import Dict, List, Union, Any
4
+ from pathlib import Path
5
+ import os
6
+ import abc
7
+ import json
8
+ from decimal import Decimal
9
+ from .io import ensure_parent_dir_exist
10
+ from .txt import get_file_encoding
11
+ try:
12
+ import ijson
13
+ except ImportError as e:
14
+ ijson = None
15
+
16
def _read_json(filepath: Union[str, os.PathLike], jsonl: bool, encoding='utf-8', **kwargs):
    """
    The actual read function.

    :param filepath: path of the json/jsonl file
    :param jsonl: if `True`, parse one json document per line (jsonl),
        otherwise parse the whole file as a single document
    :param encoding: text encoding; resolved through `get_file_encoding`
        (so the literal 'auto' triggers detection)
    :param kwargs: forwarded to `json.loads` / `json.load`
    """
    encoding = get_file_encoding(filepath, encoding=encoding)
    with open(filepath, encoding=encoding) as f:
        if jsonl:
            # skip blank/whitespace-only lines (e.g. a trailing newline);
            # they are not valid json documents on their own
            return [json.loads(x, **kwargs) for x in f if x.strip()]
        else:
            return json.load(f, **kwargs)
26
+
27
+ def _is_jsonl(filepath: Union[str, os.PathLike], jsonl: bool = None) -> bool:
28
+ if jsonl is None:
29
+ filepath = Path(filepath)
30
+ jsonl = filepath.suffix.lower() == '.jsonl'
31
+ return jsonl
32
+
33
def read_json(
    filepath: Union[str, os.PathLike],
    jsonl: bool = None,
    encoding: str = 'auto',
    **kwargs
) -> Union[Dict[str, Any], List[Any]]:
    """
    An agent for `json.load()` with some default value.

    The file is first parsed with the detected (or explicit) *jsonl*
    flag; if that fails, one retry with the flag flipped is attempted
    before the original error is re-raised.
    """
    jsonl = _is_jsonl(filepath, jsonl)
    try:
        return _read_json(filepath, jsonl=jsonl, encoding=encoding, **kwargs)
    except Exception as first_error:
        try:
            # the suffix may lie about the actual format: retry with the
            # opposite interpretation before giving up
            return _read_json(filepath, jsonl=not jsonl, encoding=encoding, **kwargs)
        except Exception:
            raise first_error
51
+
52
def save_json(
    filepath: Union[str, os.PathLike],
    data: Union[Dict[str, Any], List[Any]],
    jsonl: bool = None,
    encoding: str = 'utf-8',
    newline: str = '\n',
    indent: int = 2,
    ensure_ascii: bool = False,
    **kwargs
):
    """
    An agent for `json.dump()` with some default value.

    When the target is jsonl (by flag or ``.jsonl`` suffix), *data* must
    be a list and each item is written as one json line; otherwise the
    whole object is dumped as a single pretty-printed document.
    """
    jsonl = _is_jsonl(filepath, jsonl)
    if jsonl and not isinstance(data, list):
        raise ValueError("data should be a list when save as jsonl format")
    ensure_parent_dir_exist(filepath)
    with open(filepath, 'w', encoding=encoding, newline=newline) as f:
        if jsonl:
            # one compact document per line; `indent` is ignored here
            f.writelines(
                json.dumps(item, ensure_ascii=ensure_ascii, **kwargs) + newline
                for item in data
            )
        else:
            json.dump(data, f, indent=indent, ensure_ascii=ensure_ascii, **kwargs)
77
+
78
def write_json(
    filepath: Union[str, os.PathLike],
    data: Union[Dict[str, Any], List[Any]],
    jsonl: bool = None,
    encoding: str = 'utf-8',
    newline: str = '\n',
    indent: int = 2,
    ensure_ascii: bool = False,
    **kwargs
):
    """Alias of `save_json`, kept for naming symmetry with `read_json`."""
    save_json(
        filepath=filepath, data=data, jsonl=jsonl, encoding=encoding,
        newline=newline, indent=indent, ensure_ascii=ensure_ascii, **kwargs,
    )
98
+
99
+
100
+ class _JsonNode:
101
+ def __init__(self, type: str = '', parent: '_JsonNode' = None):
102
+ self._type = ''
103
+ self._value = None
104
+ self._parent = parent
105
+ if type:
106
+ self.type = type
107
+
108
+ def clear(self):
109
+ if self._type == 'map':
110
+ self._value.clear()
111
+ elif self._type == 'array':
112
+ self._value.clear()
113
+
114
+ @property
115
+ def parent(self):
116
+ return self._parent
117
+
118
+ @property
119
+ def type(self):
120
+ return self._type
121
+
122
+ @type.setter
123
+ def type(self, value):
124
+ if self._type:
125
+ raise ValueError('type is already set')
126
+ self._type = value
127
+ if value == 'map':
128
+ self._value = {}
129
+ elif value == 'array':
130
+ self._value = []
131
+
132
+ @property
133
+ def value(self):
134
+ if not self._type:
135
+ raise ValueError('type is not set')
136
+ if self._type == 'dummy':
137
+ assert isinstance(self._value, _JsonNode)
138
+ return self._value.value
139
+ if self._type == 'map':
140
+ assert isinstance(self._value, dict)
141
+ return {k: v.value for k, v in self._value.items()}
142
+ if self._type == 'array':
143
+ assert isinstance(self._value, list)
144
+ return [v.value for v in self._value]
145
+ return self._value
146
+
147
+ @value.setter
148
+ def value(self, value):
149
+ if not self._type:
150
+ raise ValueError('type is not set')
151
+ if self._type in ['dummy', 'map', 'array']:
152
+ raise RuntimeError('cannot set value for dummy, map, array')
153
+ self._value = value
154
+
155
+ def __repr__(self):
156
+ return str(self.value)
157
+
158
+ def __str__(self):
159
+ return str(self.value)
160
+
161
class StreamJsonReader(abc.ABC):
    """
    Iterate over a json file.

    :param filepath: path of the file to iterate over
    :param encoding: text encoding of the file (may be `None`)
    :param limit: maximum number of top-level items to yield; unbounded
        by default.  Annotated as `float` (fix: the default
        `float('inf')` is not an `int`), though callers normally pass an int.
    """
    def __init__(self, filepath: Union[str, os.PathLike], encoding: str = None, limit: float = float('inf')):
        self.filepath = filepath
        self.encoding = encoding
        self.limit = limit
        self._data_type = ''  # becomes 'dict' or 'list' once known

    @property
    def data_type(self):
        """Top-level container kind ('dict' or 'list'); empty string until iteration reveals it."""
        return self._data_type

    def __iter__(self):
        # subclasses provide the actual streaming implementation
        raise NotImplementedError
177
+
178
class BigJsonReader(StreamJsonReader):
    # Streams top-level items out of one large json document via ijson's
    # event parser, materializing only a single item at a time.
    # For a top-level object it yields (key, value) pairs; for a
    # top-level array it yields the elements.  `_data_type` is set to
    # 'dict'/'list' on the first yield accordingly.
    def __iter__(self):
        with open(self.filepath, 'rb') as f:
            parser = ijson.parse(f)
            # `dummy` is the root; `node` tracks the current position
            # while events are folded into a _JsonNode tree
            dummy = node = _JsonNode('')
            cnt = 0  # number of top-level items yielded so far
            for prefix, event, value in parser:
                if event == 'start_map':
                    if node.type == 'array':
                        # object inside an array: append a fresh child
                        child = _JsonNode(type='map', parent=node)
                        node._value.append(child)
                        node = child
                    else:
                        # object in value/root position: type the current node
                        node.type = 'map'
                elif event == 'end_map':
                    node = node.parent
                elif event == 'start_array':
                    # NOTE(review): an array nested directly inside another
                    # array would set `type` twice and raise ValueError —
                    # looks like nested arrays are out of scope; confirm
                    node.type = 'array'
                elif event == 'end_array':
                    node = node.parent
                elif event == 'map_key':
                    assert node.type == 'map', f"{event} {value} {prefix}"
                    # descend into an untyped child that will receive the value
                    child = _JsonNode(parent=node)
                    node._value[value] = child
                    node = child
                else:
                    # scalar event
                    assert event in ['null', 'boolean', 'integer', 'double', 'number', 'string']
                    if isinstance(value, Decimal):
                        # ijson reports non-integer numbers as Decimal
                        value = float(value)
                    if node.type == 'array':
                        # scalar array element: append a typed leaf child
                        child = _JsonNode(type=event, parent=node)
                        child.value = value
                        node._value.append(child)
                    else:
                        # scalar in value position: fill the pending child
                        # and climb back to its owner
                        assert not node.type
                        node.type = event
                        node.value = value
                        node = node.parent
                # Back at the root after a complete top-level item (the
                # opening events of the root container itself excluded):
                # emit the single accumulated item, then drop it to keep
                # memory bounded.
                if node == dummy and event not in ['start_map', 'start_array']:
                    assert node.type in ['map', 'array']
                    if node.type == 'map':
                        value = node.value
                        assert isinstance(value, dict)
                        assert len(value) == 1
                        k, v = list(value.items())[0]
                        self._data_type = 'dict'
                        yield k, v
                        node.clear()
                    elif node.type == 'array':
                        value = node.value
                        assert isinstance(value, list)
                        assert len(value) == 1
                        self._data_type = 'list'
                        yield value[0]
                        node.clear()
                    cnt += 1
                    if cnt >= self.limit:
                        break
236
+
237
class JsonlReader(StreamJsonReader):
    """Stream items from a jsonl file, one parsed document per line."""

    @property
    def data_type(self):
        # a jsonl file is by definition a sequence of documents
        return 'list'

    def __iter__(self):
        yielded = 0
        with open(self.filepath, encoding=self.encoding) as f:
            for line in f:
                if not line.strip():
                    # tolerate blank/whitespace-only lines (e.g. a trailing
                    # newline), consistent with `_read_json`'s jsonl path
                    continue
                yield json.loads(line)
                yielded += 1
                if yielded >= self.limit:
                    break
248
+
249
def read_big_json(
    filepath: Union[str, os.PathLike],
    jsonl: bool = None,
    encoding: str = 'auto',
    limit: float = float('inf'),
) -> StreamJsonReader:
    """
    Create a streaming reader over a (potentially huge) json/jsonl file.

    :param filepath: file to iterate over
    :param jsonl: force jsonl mode; `None` means decide by suffix
    :param encoding: text encoding, 'auto' to detect
    :param limit: maximum number of items to yield (new parameter; the
        readers already supported it but it was not exposed here —
        defaults to unbounded, so existing callers are unaffected)
    :raises ImportError: for a plain json file when `ijson` is missing
    """
    jsonl = _is_jsonl(filepath, jsonl)
    encoding = get_file_encoding(filepath, encoding=encoding)
    if jsonl:
        return JsonlReader(filepath, encoding=encoding, limit=limit)
    if ijson is None:
        raise ImportError('ijson is not installed')
    # BigJsonReader opens the file in binary mode, so no encoding is passed
    return BigJsonReader(filepath, limit=limit)
262
+
feilian/process.py ADDED
@@ -0,0 +1,99 @@
1
+ import abc
2
+ import tqdm
3
+ import pandas as pd
4
+ from ._typing import (
5
+ Any, Dict, Hashable, List,
6
+ Tuple, Union, Iterable, Optional,
7
+ )
8
+ from .dataframe import (
9
+ read_dataframe,
10
+ save_dataframe,
11
+ )
12
+
13
class BaseProcessor(abc.ABC):
    """
    Base class for processing data.

    Subclasses implement `read_single_file`, `process` and `save_result`;
    `run` wires them together as read -> process -> (optionally) save.
    """

    @abc.abstractmethod
    def read_single_file(self, filepath: str) -> Any:
        """
        Actual method to read data from a single file.
        """

    def merge_input_data(self, data: Iterable[Any]) -> Any:
        """
        Merge data read from multi files.
        """
        return data

    def read_data(self, filepath: Union[str, List[str], Tuple[str]]) -> Any:
        """
        Read data from input file.

        A list/tuple of paths is read file by file and merged via
        `merge_input_data`; a single path is read directly.
        """
        if not isinstance(filepath, (list, tuple)):
            return self.read_single_file(filepath)
        pieces = (self.read_single_file(p) for p in filepath)
        return self.merge_input_data(pieces)

    @abc.abstractmethod
    def save_result(self, filepath: str, result: Any):
        """
        Save result to output file.
        """

    @abc.abstractmethod
    def process(self, data: Any) -> Any:
        """
        Process data and return result.
        """

    def run(self, input_path: Union[str, List[str], Tuple[str]], output_path: str = None, write_output=True):
        """
        Read from a file, and save result to another file.
        :param input_path: file with the data
        :param output_path: where to save the result, if not given, use input_path
        :param write_output: whether to write the result to the output_file
        """
        result = self.process(self.read_data(input_path))
        if write_output:
            self.save_result(output_path or input_path, result)
62
+
63
class DataframeProcessor(BaseProcessor, abc.ABC):
    """
    Row-wise processor over pandas dataframes.

    Input files are loaded with `read_dataframe`, every row is handed to
    `process_row`, and the surviving results are written back with
    `save_dataframe`.
    """

    def __init__(self, input_dtype=None, progress=False, read_args: Dict[str, Any] = None,
                 write_args: Dict[str, Any] = None):
        # `progress` may be False, True, or a string used as the bar label
        self.progress = progress
        self.read_args = read_args or {}
        if input_dtype is not None:
            # forwarded to the dataframe reader as its `dtype` argument
            self.read_args['dtype'] = input_dtype
        self.write_args = write_args or {}

    def read_single_file(self, filepath: str) -> pd.DataFrame:
        return read_dataframe(filepath, **self.read_args)

    def merge_input_data(self, data: Iterable[pd.DataFrame]) -> pd.DataFrame:
        # stack the per-file frames on top of each other
        return pd.concat(data)

    def read_data(self, filepath: Union[str, List[str], Tuple[str]]) -> pd.DataFrame:
        return super().read_data(filepath)

    def save_result(self, filepath: str, result: pd.DataFrame):
        save_dataframe(filepath, result, **self.write_args)

    @abc.abstractmethod
    def process_row(self, i: Hashable, row: pd.Series) -> Optional[Dict[str, Any]]:
        """
        Process a single row of data.
        :return: if `None`, ignore this row
        """

    def process(self, data: pd.DataFrame) -> pd.DataFrame:
        rows = data.iterrows()
        if self.progress:
            label = "process" if self.progress is True else self.progress
            rows = tqdm.tqdm(rows, total=len(data), desc=label)
        produced = (self.process_row(idx, row) for idx, row in rows)
        kept = (item for item in produced if item is not None)
        return pd.DataFrame(kept)
99
+
feilian/string.py ADDED
@@ -0,0 +1,13 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from ._typing import Any, Callable
4
+
5
def join_values(*values: Any, sep='', func: Callable[[Any], str] = str, do_trim=False, ignore_empty=False):
    """Convert *values* to strings and join them with *sep*.

    :param func: converter applied to each value (default `str`)
    :param do_trim: strip surrounding whitespace from each piece
    :param ignore_empty: drop pieces that end up as empty strings
    """
    pieces = map(func, values)
    if do_trim:
        pieces = (piece.strip() for piece in pieces)
    if ignore_empty:
        pieces = (piece for piece in pieces if piece)
    return sep.join(pieces)
feilian/txt.py ADDED
@@ -0,0 +1,54 @@
1
+ from ._typing import Union, Literal
2
+ import os
3
+ import io
4
+ import inspect
5
+ import chardet
6
+
7
# default number of bytes fed to the detector per read
_DEFAULT_CHUNK_SIZE = 1024

# Feature-detect the installed chardet: newer releases accept a
# `should_rename_legacy` keyword on UniversalDetector, older ones do not,
# so pick the right factory once at import time.
if 'should_rename_legacy' in inspect.signature(chardet.UniversalDetector).parameters:
    def _create_detector(should_rename_legacy: bool):
        # installed chardet supports the flag: forward it
        return chardet.UniversalDetector(should_rename_legacy=should_rename_legacy)
else:
    def _create_detector(should_rename_legacy: bool):
        # older chardet: flag accepted for API compatibility but ignored
        return chardet.UniversalDetector()
15
+
16
def detect_stream_encoding(stream: io.IOBase, chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
    """Detect the text encoding of a binary *stream* with chardet.

    Feeds *chunk_size* bytes at a time and stops as soon as the detector
    is confident or the stream is exhausted.

    :return: the detected encoding name, or `None` when undecidable
    """
    detector = _create_detector(should_rename_legacy=should_rename_legacy)
    while True:
        chunk = stream.read(chunk_size)
        if not chunk:
            break  # end of stream
        detector.feed(chunk)
        if detector.done:
            break  # detector is already confident
    detector.close()
    return detector.result.get('encoding')
27
+
28
def detect_text_encoding(raw: bytes, chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
    """Detect the encoding of an in-memory byte string (see `detect_stream_encoding`)."""
    buffer = io.BytesIO(raw)
    return detect_stream_encoding(buffer, chunk_size=chunk_size, should_rename_legacy=should_rename_legacy)
30
+
31
def detect_file_encoding(path: Union[str, os.PathLike], chunk_size=_DEFAULT_CHUNK_SIZE, should_rename_legacy=True) -> str:
    """Detect the encoding of the file at *path* (see `detect_stream_encoding`)."""
    with open(path, 'rb') as stream:
        return detect_stream_encoding(stream, chunk_size=chunk_size, should_rename_legacy=should_rename_legacy)
34
+
35
def get_file_encoding(path: Union[str, os.PathLike], encoding: Union[None, Literal['auto'], str] = None) -> str:
    """Resolve *encoding*: the literal 'auto' triggers detection on *path*; anything else passes through."""
    return detect_file_encoding(path) if encoding == 'auto' else encoding
39
+
40
def read_txt(path: Union[str, os.PathLike], encoding: Union[None, Literal['auto'], str] = None) -> str:
    """
    Read a whole text file.

    :param path: file to read
    :param encoding: explicit encoding, `None` for the platform default,
        or 'auto' to detect the encoding with chardet first
    """
    if encoding == 'auto':
        # Resolve the real encoding, then fall through to the normal text
        # path.  (Fix: the previous 'auto' branch decoded the raw bytes
        # directly, skipping the universal-newline translation the
        # explicit-encoding branch performs, so CRLF files came back
        # differently depending on the `encoding` argument.)
        encoding = detect_file_encoding(path)
    with open(path, 'r', encoding=encoding) as f:
        return f.read()
48
+
49
def save_txt(path: Union[str, os.PathLike], content: str, encoding: str = 'utf-8'):
    """Write *content* to the text file at *path*, replacing any existing file."""
    with open(path, 'w', encoding=encoding) as fout:
        fout.write(content)


def write_txt(path: Union[str, os.PathLike], content: str, encoding: str = 'utf-8'):
    """Alias of `save_txt`, kept for naming symmetry with `read_txt`."""
    save_txt(path=path, content=content, encoding=encoding)
feilian/utils.py ADDED
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ from ._typing import Dict, Any, Union, Collection, List
5
+
6
def flatten_dict(data: Dict[str, Any], prefix="", joiner=".",
                 exclude: Union[None, str, Collection[str]] = None,
                 frozen: Union[None, str, Collection[str]] = None,
                 empty_as_default=False, empty_value=None,
                 res: Dict[str, Any] = None) -> Dict[str, Any]:
    """
    flatten dict as a flat one-layer dict
    :param data: origin dict
    :param prefix: prefix for key in the dict
    :param joiner: join symbol for different layer key
    :param exclude: prefix to be excluded from result
    :param frozen: keys not to be flattened
    :param empty_as_default: should set a default value if value is an empty dict
    :param empty_value: if `empty_as_default` is `True`, used as the default value for empty dict
    :param res: the result flat layer dict, create a new one if not given.
    """
    if res is None:
        res = {}
    if isinstance(exclude, str):
        exclude = {exclude}
    if isinstance(frozen, str):
        frozen = {frozen}

    # all keys would start with an excluded prefix: ignore data entirely
    if exclude and prefix in exclude:
        return res

    # all keys in data should be frozen
    if frozen and prefix in frozen:
        for k, v in data.items():
            res[prefix + k] = v
        return res

    for k, v in data.items():
        k = prefix + k

        if exclude and k in exclude:
            # only the key should be excluded
            continue

        if frozen and k in frozen:
            # frozen key, keep it as original value
            res[k] = v
            continue

        if isinstance(v, dict):
            if len(v) == 0:
                # empty dict, set as default value if set
                if empty_as_default:
                    res[k] = empty_value
            else:
                # value is a dict, flatten recursively.
                # fix: propagate empty_as_default/empty_value so empty dicts
                # nested deeper than one level also receive the default
                flatten_dict(v, prefix=k + joiner, joiner=joiner,
                             exclude=exclude, frozen=frozen,
                             empty_as_default=empty_as_default,
                             empty_value=empty_value, res=res)
        else:
            # normal value, keep it as original value
            res[k] = v

    return res
64
+
65
+
66
def flatten_list(data: List[Any], res: List[Any] = None) -> List[Any]:
    """
    Flatten a nested list into a flat one-layer list.
    :param data: a nested list
    :param res: the result flat layer list, create a new one if not given.
    """
    out = [] if res is None else res
    for item in data:
        if isinstance(item, list):
            # recurse into sub-lists, accumulating into the same output
            flatten_list(item, out)
        else:
            out.append(item)
    return out
feilian/version.py ADDED
@@ -0,0 +1,12 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
# Resolve the package version: prefer the build-generated `_dist_ver`
# module, then fall back to installed distribution metadata.
try:
    from ._dist_ver import VERSION, __version__
except ImportError:
    try:
        # stdlib since Python 3.8 — avoids requiring the third-party
        # backport on modern interpreters
        from importlib.metadata import version, PackageNotFoundError
    except ImportError:
        # older interpreters: use the importlib_metadata backport
        from importlib_metadata import version, PackageNotFoundError
    try:
        __version__ = version('feilian')
    except PackageNotFoundError:
        # package is not installed
        __version__ = "UNKNOWN"
    VERSION = __version__.split('.')