purrfectkit 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,102 @@
1
+ Metadata-Version: 2.4
2
+ Name: purrfectkit
3
+ Version: 0.2.1
4
+ Summary: **PurrfectKit** is a Python library for effortless Retrieval-Augmented Generation (RAG) workflows.
5
+ Keywords: rag,nlp,llms,python,ai,ocr,document-processing,multilingual,text-extraction
6
+ Author: SUWALUTIONS
7
+ Author-email: SUWALUTIONS <suwa@suwalutions.com>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Software Development :: Libraries
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Text Processing :: Linguistic
23
+ Classifier: Topic :: Text Processing :: General
24
+ Classifier: Natural Language :: English
25
+ Classifier: Natural Language :: Thai
26
+ Requires-Dist: python-magic<=0.4.27
27
+ Requires-Dist: sentence-transformers<=5.1.0
28
+ Requires-Dist: transformers<=4.52.1
29
+ Requires-Dist: docling<=2.31.1
30
+ Requires-Dist: markitdown<=0.1.1
31
+ Requires-Dist: pymupdf4llm<=0.0.27
32
+ Requires-Dist: pdf2image<=1.17.0
33
+ Requires-Dist: pytesseract<=0.3.13
34
+ Requires-Dist: easyocr<=1.7.2
35
+ Requires-Dist: surya-ocr<=0.14.0
36
+ Requires-Dist: python-doctr<=1.0.0
37
+ Requires-Dist: pandas<=2.3.2
38
+ Requires-Dist: langchain-text-splitters<=1.0.0
39
+ Requires-Dist: tiktoken<=0.12.0
40
+ Requires-Dist: sphinx<=8.2.3 ; extra == 'docs'
41
+ Requires-Dist: sphinx-rtd-theme<=3.0.2 ; extra == 'docs'
42
+ Requires-Dist: pytest<=8.4.2 ; extra == 'test'
43
+ Requires-Dist: pytest-mock<=3.15.1 ; extra == 'test'
44
+ Maintainer: KHARAPSY
45
+ Maintainer-email: KHARAPSY <kharapsy@suwalutions.com>
46
+ Requires-Python: >=3.10
47
+ Project-URL: Documentation, https://suwalutions.github.io/PurrfectKit
48
+ Project-URL: Issues, https://github.com/SUWALUTIONS/PurrfectKit/issues
49
+ Project-URL: Repository, https://github.com/SUWALUTIONS/PurrfectKit
50
+ Provides-Extra: dev
51
+ Provides-Extra: docs
52
+ Provides-Extra: test
53
+ Description-Content-Type: text/markdown
54
+
55
+ ![PurrfectMeow Logo](docs/_static/repo-logo.png)
56
+
57
+ # PurrfectKit
58
+
59
+ [![Docker Image](https://github.com/suwalutions/PurrfectKit/actions/workflows/docker-image.yml/badge.svg)](https://github.com/suwalutions/PurrfectKit/actions/workflows/docker-image.yml)
60
+
61
+ **PurrfectKit** is a toolkit that simplifies Retrieval-Augmented Generation (RAG) into 5 easy steps:
62
+ 1. Suphalak - read content from files
63
+ 2. Malet - split content into chunks
64
+ 3. WichienMaat - embed chunks into vectors
65
+ 4. KhaoManee - search vectors with queries
66
+ 5. Kornja - generate answers from vectors
67
+
68
+ > **_NOTE:_** Each step is inspired by a unique Thai cat breed, making the workflow memorable and fun.
69
+
70
+ ## Quickstart
71
+
72
+ ### Prerequisites
73
+ - python
74
+ - tesseract
75
+ - git
76
+
77
+
78
+ ### Installation
79
+ ```bash
80
+ pip install git+https://github.com/suwalutions/PurrfectKit.git
81
+
82
+ ```
83
+
84
+ ### Usage
85
+ ```python
86
+ from purrfectmeow.meow.felis import DocTemplate, MetaFile
87
+ from purrfectmeow import Suphalak, Malet, WichienMaat, KhaoManee
88
+
89
+ file_path = 'test/test.pdf'
90
+ metadata = MetaFile.get_metadata(file_path)
91
+ content = Suphalak.reading(open(file_path, 'rb').read(), 'test.pdf', loader='PYMUPDF')
92
+ chunks = Malet.chunking(content, chunk_method='token', chunk_size='500', chunk_overlap='25')
93
+ docs = DocTemplate.create_template(chunks, metadata)
94
+ embedding = WichienMaat.embedding(chunks)
95
+ query = WichienMaat.embedding("ทดสอบ")
96
+ KhaoManee.searching(query, embedding, docs, 2)
97
+
98
+ ```
99
+
100
+ ## 📄 License
101
+
102
+ PurrfectKit is released under the [MIT License](LICENSE).
@@ -0,0 +1,24 @@
1
+ purrfectmeow/__init__.py,sha256=XEej-s0VH-Up9aob3XcDQqgS55Ftk_qNoXezdcedFJQ,271
2
+ purrfectmeow/meow/felis.py,sha256=8d1kaizsEisr7dW-MKw8HqsYfOkLBGy-sYTv-4kClQ8,6149
3
+ purrfectmeow/meow/kitty.py,sha256=WaLuh2t1PnigWYDNZlbNfCA_uqXnPYc-xxDuZlFfNNY,1971
4
+ purrfectmeow/tc01_spl/__init__.py,sha256=7ENCidvXhj9YhMQvBcv_mm4XIr3Mwzc1USQxgzLO0Nw,51
5
+ purrfectmeow/tc01_spl/base.py,sha256=iuIZiPUe-ofeF_PmknnCg-4NsJxDoH7rj-SMsqNBTAQ,3308
6
+ purrfectmeow/tc01_spl/markdown.py,sha256=AUCSZ-6W0sXbZwGgZfe6utidbEemQGoi6c4rsLiH928,1861
7
+ purrfectmeow/tc01_spl/ocr.py,sha256=A3orLTIVmu2WYJTi4joWlTmV27IDh3MTa7qc7IRAQkE,4784
8
+ purrfectmeow/tc01_spl/simple.py,sha256=dwecYL2sviKz4BoJcOQntAprXACvaEig-ZbDiwTW-cU,2347
9
+ purrfectmeow/tc02_mlt/__init__.py,sha256=qB2Eyc_wFDVELwj0L7ttG_YOL3IISaqPBRj0zqSJcPo,45
10
+ purrfectmeow/tc02_mlt/base.py,sha256=cz1qFo1AdL-I2wnBPO06MhcYSQh90tcLCN99phIUKWw,1508
11
+ purrfectmeow/tc02_mlt/separate.py,sha256=YQSnC5BODg1cJh4JrPkT_-tO1CbwgpxuCjMvHQwRUNE,1074
12
+ purrfectmeow/tc02_mlt/token.py,sha256=qULVySiTAbDoBQrtWWuvPkO5Zqf5hjRutN1Q7foCwUU,2052
13
+ purrfectmeow/tc03_wcm/__init__.py,sha256=8pXGo04Z5KUNGkhSTONLBlqwVc43LicDGSuQiQDIKIM,57
14
+ purrfectmeow/tc03_wcm/base.py,sha256=pXaaiU8JMLIjI5uJRxMLRnQ1Wmwv3U6EEkQ_IwhPLwg,473
15
+ purrfectmeow/tc03_wcm/local.py,sha256=5AfVSftW_cfaZBZBe-joSMJRRJ55G0g5lf9Qtcl0LUw,1074
16
+ purrfectmeow/tc04_kmn/__init__.py,sha256=FBHZKVu4agf6-p1MdMx0jIgQuKbAy9rsOu7MRIQVwXg,53
17
+ purrfectmeow/tc04_kmn/base.py,sha256=rj3Ar2Pv8VOL7vKvPB-snif8SRwBbGaLbWIpHFpd5b8,224
18
+ purrfectmeow/tc04_kmn/cosine.py,sha256=DaDXVcy6YyNc5jwtPXeQg040FT7607phyt5Ub74E9aw,1147
19
+ purrfectmeow/tc05_knj/__init__.py,sha256=XKwISvOAznPdTUWoTUnFDMBmxZF9Qd6FAi711W6bvZY,47
20
+ purrfectmeow/tc05_knj/base.py,sha256=qN1VCx20G5H7YHcVzmg0YNXMLZM7TPkiD_UMEZykfjE,70
21
+ purrfectkit-0.2.1.dist-info/licenses/LICENSE,sha256=9WlLgfJwKDGb71B1NwKYKKg6uL5u_knAr7ovGwIWvD4,1078
22
+ purrfectkit-0.2.1.dist-info/WHEEL,sha256=5w2T7AS2mz1-rW9CNagNYWRCaB0iQqBMYLwKdlgiR4Q,78
23
+ purrfectkit-0.2.1.dist-info/METADATA,sha256=cSe3NLmt6D8LaZSpilNU1c3G9k0P5XGThncqp6K2Crk,3765
24
+ purrfectkit-0.2.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.9.7
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 SUWALUTIONS CO., LTD.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,15 @@
1
+ from .tc01_spl import Suphalak
2
+ from .tc02_mlt import Malet
3
+ from .tc03_wcm import WichienMaat
4
+ from .tc04_kmn import KhaoManee
5
+ from .tc05_knj import Kornja
6
+
7
+ __all__ = [
8
+ "Suphalak",
9
+ "Malet",
10
+ "WichienMaat",
11
+ "KhaoManee",
12
+ "Kornja",
13
+ ]
14
+
15
+ __version__ = "0.2.1"
@@ -0,0 +1,171 @@
1
+ from typing import Any, Dict, List, Union
2
+ from io import BytesIO
3
+
4
class Document:
    """Lightweight container pairing a text chunk with its metadata dict."""

    def __init__(self, page_content: str, metadata: Dict[str, Any]):
        # Falsy metadata (None, {}) is normalized to an empty dict.
        self.page_content = page_content
        self.metadata = metadata or {}

    def __repr__(self):
        cls_name = self.__class__.__name__
        return f"{cls_name}(page_content={self.page_content!r}, metadata={self.metadata!r})"

    def __getitem__(self, key):
        # Only the two public fields are addressable by key.
        try:
            return {"page_content": self.page_content, "metadata": self.metadata}[key]
        except KeyError:
            raise KeyError(f"{key} is not a valid key. Use 'page_content' or 'metadata'.") from None

    def to_dict(self):
        """Return a plain-dict view of this document."""
        return {"page_content": self.page_content, "metadata": self.metadata}
25
+
26
+ class DocTemplate:
27
+ @staticmethod
28
+ def create_template(chunks: List[str], metadata: Dict[str, Any]) -> List[Document]:
29
+ if not isinstance(chunks, list):
30
+ raise TypeError(f"Expected 'chunks' to be a list, but got {type(chunks).__name__}.")
31
+
32
+ if not isinstance(metadata, dict):
33
+ raise TypeError(f"Expected 'metadata' to be a dict, but got {type(metadata).__name__}.")
34
+
35
+ if not all(isinstance(c, str) for c in chunks):
36
+ raise ValueError("All elements in 'chunks' must be strings.")
37
+
38
+ docs = []
39
+ chunk_hashes = []
40
+
41
+ import uuid
42
+ import hashlib
43
+
44
+ for idx, chunk in enumerate(chunks):
45
+ hash_val = hashlib.md5(chunk.encode()).hexdigest()
46
+ chunk_hashes.append(hash_val)
47
+
48
+ for idx, chunk in enumerate(chunks):
49
+ chunk_number = idx + 1
50
+ chunk_id = uuid.uuid4().hex
51
+ chunk_hash = chunk_hashes[idx]
52
+ prev_hash = chunk_hashes[idx - 1] if idx > 0 else None
53
+ next_hash = chunk_hashes[idx + 1] if idx < len(chunks) - 1 else None
54
+ chunk_size = len(chunk)
55
+
56
+ chunk_info = {
57
+ "chunk_number": chunk_number,
58
+ "chunk_id": chunk_id,
59
+ "chunk_hash": chunk_hash,
60
+ "previous_chunk_hash": prev_hash,
61
+ "next_chunk_hash": next_hash,
62
+ "chunk_size": chunk_size,
63
+ }
64
+
65
+ doc_metadata = {
66
+ "chunk_info": chunk_info,
67
+ "source_info": metadata
68
+ }
69
+
70
+ doc = Document(
71
+ page_content=chunk,
72
+ metadata=doc_metadata
73
+ )
74
+ docs.append(doc)
75
+
76
+ return docs
77
+
78
class MetaFile:
    """Extracts filesystem/MIME metadata for a file given as a path, bytes, or BytesIO."""

    @staticmethod
    def get_metadata(file: Union[str, bytes, BytesIO], **kwargs: Any) -> Dict[str, Union[str, int]]:
        """
        Return metadata for *file*.

        Args:
            file: Path string, raw bytes, or a BytesIO buffer.
                (Annotation fixed: bytes was accepted by the code but missing
                from the declared type.)
            **kwargs: 'file_name' is required when *file* is bytes/BytesIO,
                since the content must be spilled to a temp file on disk.

        Raises:
            ValueError: Bytes/BytesIO input without a 'file_name'.
            TypeError: Unsupported input type.
            RuntimeError: Metadata extraction failed.
        """
        if isinstance(file, bytes):
            file = BytesIO(file)

        if isinstance(file, BytesIO):
            import os

            os.makedirs(".cache/tmp", exist_ok=True)
            file_name = kwargs.get('file_name')

            if not file_name:
                raise ValueError("file_name must be provided when using BytesIO.")

            # Spill the buffer to disk so path-based tools (magic, pdfinfo) can run.
            file_path = os.path.join(".cache/tmp", file_name)
            with open(file_path, 'wb') as f:
                f.write(file.getvalue())

            try:
                return MetaFile._get_metadata_from_path(file_path)
            finally:
                os.remove(file_path)

        elif isinstance(file, str):
            return MetaFile._get_metadata_from_path(file)

        else:
            raise TypeError(f"Unsupported file type: {type(file).__name__}. Expected str, bytes, or BytesIO.")

    @staticmethod
    def _get_metadata_from_path(file_path: str) -> Dict[str, Union[str, int]]:
        """Collect stat, MIME, page-count and MD5 metadata for a file on disk."""
        metadata = {}

        import os
        import re
        import time
        import magic
        import hashlib
        import subprocess

        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File {file_path} does not exist")

            stats = os.stat(file_path)
            metadata["file_name"] = os.path.basename(file_path)
            metadata["file_size"] = stats.st_size
            metadata["file_created_date"] = time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(stats.st_ctime)
            )
            metadata["file_modified_date"] = time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(stats.st_mtime)
            )
            metadata["file_extension"] = os.path.splitext(file_path)[1] or "none"

            # MIME detection is best-effort: fall back to "unknown" rather than fail.
            try:
                mime = magic.Magic(mime=True)
                metadata["file_type"] = mime.from_file(file_path)
                metadata["description"] = magic.from_file(file_path)
            except Exception as e:
                metadata["file_type"] = "unknown"
                metadata["description"] = f"Could not determine file type: {str(e)}"

            if metadata["file_type"].startswith("image/"):
                metadata["total_pages"] = 1
            elif metadata["file_type"].startswith("application/pdf"):
                # Page count comes from the external 'pdfinfo' tool when available.
                try:
                    result = subprocess.run(
                        ['pdfinfo', file_path],
                        stdout=subprocess.PIPE,
                        text=True,
                        check=True
                    )
                    pages_match = re.search(r"Pages:\s*(\d+)", result.stdout)
                    if pages_match:
                        metadata["total_pages"] = int(pages_match.group(1))
                    else:
                        metadata["total_pages"] = "Unknown (could not parse page count)"
                except (subprocess.CalledProcessError, FileNotFoundError):
                    metadata["total_pages"] = "Unknown (pdfinfo not installed or failed)"
            else:
                metadata["total_pages"] = 1

            # Stream in 4 KiB chunks so large files are not loaded into memory.
            with open(file_path, "rb") as f:
                hash_md5 = hashlib.md5()
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)
            metadata["file_md5"] = hash_md5.hexdigest()

            return metadata

        except Exception as e:
            # Chain the original cause instead of discarding it.
            raise RuntimeError(f"Failed to extract metadata: {str(e)}") from e
@@ -0,0 +1,52 @@
1
+ import logging
2
+ from logging.handlers import RotatingFileHandler
3
+ from pathlib import Path
4
+
5
class LevelBasedFormatter(logging.Formatter):
    """Formatter that renders INFO records with a shorter layout than other levels."""

    def __init__(self, default_fmt, info_fmt, datefmt=None):
        super().__init__(datefmt=datefmt)
        # Pre-build one delegate formatter per layout.
        self.default_fmt = logging.Formatter(default_fmt, datefmt)
        self.info_fmt = logging.Formatter(info_fmt, datefmt)

    def format(self, record):
        chosen = self.info_fmt if record.levelno == logging.INFO else self.default_fmt
        return chosen.format(record)

def kitty_logger(name: str, log_file: str = "kitty.log", log_level: str = "INFO") -> logging.Logger:
    """
    Sets up a logger with console and rotating file handlers.

    Args:
        name (str): Name of the logger (usually __name__ of the calling module).
        log_file (str): Path to the log file. Defaults to 'kitty.log'.
        log_level (str): Logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'). Defaults to 'INFO'.

    Returns:
        logging.Logger: Configured logger instance.
    """
    logger = logging.getLogger(name)
    logger.setLevel(getattr(logging, log_level.upper(), logging.INFO))

    # Handlers are attached only on the first call for a given name;
    # later calls just re-apply the level and reuse the same logger.
    if logger.handlers:
        return logger

    fmt_default = "PurrfectKit | %(asctime)s [%(levelname)s] %(name)s:%(lineno)d - %(message)s"
    fmt_info = "PurrfectKit | %(asctime)s [%(levelname)s] - %(message)s"
    formatter = LevelBasedFormatter(fmt_default, fmt_info, "%Y-%m-%d %H:%M:%S")

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    log_dir = Path(".cache/logs")
    log_dir.mkdir(parents=True, exist_ok=True)

    rotating_handler = RotatingFileHandler(
        log_dir / log_file, maxBytes=5 * 1024 * 1024, backupCount=3
    )
    rotating_handler.setFormatter(formatter)
    logger.addHandler(rotating_handler)

    return logger
@@ -0,0 +1,3 @@
1
+ from .base import Suphalak
2
+
3
+ __all__ = ['Suphalak']
@@ -0,0 +1,107 @@
1
+ from typing import Dict, BinaryIO, Any
2
+
3
+ from .markdown import Markdown
4
+ from .ocr import Ocr
5
+ from .simple import Simple
6
+
7
class Suphalak:
    """File-reading step: dispatches a file to the right conversion backend."""

    # Temp directory where incoming content is spilled before conversion.
    tmp_dir = '.cache/tmp'
    DEFAULT_LOADER = "PYMUPDF4LLM"

    # Loader name -> conversion function and the extensions it supports.
    _LOADERS: Dict[str, Dict[str, Any]] = {
        "MARKITDOWN": {
            "func": Markdown.markitdown_convert,
            "ext": ("csv", "docx", "md", "pdf", "pptx", "txt", "xls", "xlsx"),
        },
        "DOCLING": {
            "func": Markdown.docling_convert,
            "ext": ("csv", "docx", "jpg", "md", "pdf", "png", "pptx", "xlsx"),
        },
        "PYMUPDF4LLM": {
            "func": Markdown.pymupdf4llm_convert,
            "ext": ("docx", "pdf", "pptx", "txt", "xlsx"),
        },
        "PYTESSERACT": {
            "func": Ocr.pytesseract_convert,
            "ext": ("gif", "jpg", "pdf", "png"),
        },
        "EASYOCR": {
            "func": Ocr.easyocr_convert,
            "ext": ("gif", "jpg", "pdf", "png"),
        },
        "SURYAOCR": {
            "func": Ocr.suryaocr_convert,
            "ext": ("gif", "jpg", "pdf", "png"),
        },
        "DOCTR": {
            "func": Ocr.doctr_convert,
            "ext": ("gif", "jpg", "pdf", "png"),
        },
        "PYMUPDF": {
            "func": Simple.pymupdf_convert,
            "ext": ("docx", "md", "pdf", "pptx", "xlsx"),
        },
        "PANDAS": {
            "func": Simple.pandas_convert,
            "ext": ("csv", "xls", "xlsx"),
        },
        "ENCODING": {
            "func": Simple.encoding_convert,
            "ext": ("csv", "md", "txt"),
        },
    }

    @classmethod
    def _detect_loader(cls, file_ext: str) -> str:
        """Pick a loader for *file_ext* by priority; fall back to DEFAULT_LOADER."""
        priority = [
            ("PANDAS", ("csv", "xls")),
            ("PYTESSERACT", ("jpg", "png", "gif")),
            ("PYMUPDF", ("pdf", "md")),
            ("PYMUPDF4LLM", ("txt", "xlsx", "pptx", "docx")),
        ]

        for loader, extensions in priority:
            if file_ext in extensions:
                return loader

        return cls.DEFAULT_LOADER

    @classmethod
    def reading(cls, file: BinaryIO, file_name: str, loader: str = None, **kwargs: Any) -> str:
        """
        Read *file* and return its extracted text content.

        Args:
            file: A binary file-like object, or raw bytes.
            file_name: Original file name; its extension drives loader choice.
            loader: Optional key from _LOADERS; auto-detected when omitted.
            **kwargs: Passed through to the loader function.

        Raises:
            ValueError: Unknown loader name.
            TypeError: Extension not supported by the selected loader.
        """
        import os
        file_ext = file_name.split(".")[-1].lower()

        if not loader:
            loader = cls._detect_loader(file_ext)

        if loader not in cls._LOADERS:
            raise ValueError(f"Unsupported loader: '{loader}'")

        loader_conf = cls._LOADERS[loader]
        supported_ext = loader_conf["ext"]

        if file_ext not in supported_ext:
            raise TypeError(f"'{file_ext}' is not supported for '{loader}' loader.")

        os.makedirs(cls.tmp_dir, exist_ok=True)
        file_path = os.path.join(cls.tmp_dir, file_name)

        # BUG FIX: the README passes raw bytes (open(...).read()), which has no
        # .read() method; accept both bytes and file-like objects.
        data = file.read() if hasattr(file, "read") else file

        try:
            with open(file_path, "wb") as f:
                f.write(data)

            text = loader_conf["func"](file_path, **kwargs)

            # Scanned PDFs often yield no text from non-OCR loaders; fall back
            # to Tesseract OCR in that case.
            if (
                file_ext == "pdf"
                and (not text or not str(text).strip())
                and loader not in ("PYTESSERACT", "EASYOCR", "SURYAOCR", "DOCTR")
            ):
                ocr_loader = cls._LOADERS["PYTESSERACT"]
                text = ocr_loader["func"](file_path, **kwargs)

            return text

        finally:
            if os.path.exists(file_path):
                os.remove(file_path)
@@ -0,0 +1,57 @@
1
+ import time
2
+ from typing import Callable
3
+
4
+ from purrfectmeow.meow.kitty import kitty_logger
5
+
6
class Markdown:
    """Converters that render documents to markdown/text via third-party libraries."""

    _logger = kitty_logger(__name__)

    @classmethod
    def _convert(cls, file_path: str, converter: Callable, extractor: Callable) -> str:
        """Run converter.convert(file_path), extract text with *extractor*, log timing."""
        cls._logger.debug(f"Starting conversion for '{file_path}'")
        start = time.time()
        try:
            content = converter.convert(file_path)
            result = extractor(content)

            # FIX: message previously misspelled "Succesfully".
            cls._logger.debug(f"Successfully converted '{file_path}'")

            return result

        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Conversion time spent '{elapsed:.2f}' seconds.")

    @classmethod
    def markitdown_convert(cls, file_path: str) -> str:
        """Convert *file_path* to text using MarkItDown."""
        cls._logger.debug("Using MarkItDown for Conversion")

        from markitdown import MarkItDown

        return cls._convert(file_path, MarkItDown(), lambda content: content.text_content)

    @classmethod
    def docling_convert(cls, file_path: str) -> str:
        """Convert *file_path* to markdown using Docling."""
        cls._logger.debug("Using Docling for Conversion")

        from docling.document_converter import DocumentConverter

        return cls._convert(file_path, DocumentConverter(), lambda content: content.document.export_to_markdown())

    @classmethod
    def pymupdf4llm_convert(cls, file_path: str) -> str:
        """Convert *file_path* to markdown using PyMuPDF4LLM.

        pymupdf4llm exposes a function (not a .convert object), so timing is
        handled inline instead of via _convert.
        """
        cls._logger.debug("Using PyMuPDF4LLM for Conversion")
        cls._logger.debug(f"Starting conversion for '{file_path}'")
        start = time.time()

        import pymupdf4llm

        try:
            res = pymupdf4llm.to_markdown(file_path)
            # FIX: message previously misspelled "Succesfully".
            cls._logger.debug(f"Successfully converted '{file_path}'")

            return res
        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Conversion time spent '{elapsed:.2f}' seconds.")
@@ -0,0 +1,141 @@
1
+ import time
2
+ from typing import Callable
3
+
4
+ from purrfectmeow.meow.kitty import kitty_logger
5
+
6
+ class Ocr:
7
+
8
+ _logger = kitty_logger(__name__)
9
+ _image_type = [
10
+ ".apng", ".png",
11
+ ".avif",
12
+ ".gif",
13
+ ".jpg", ".jpeg", ".jfif", ".pjpeg", ".pjp",
14
+ ".png",
15
+ ".svg",
16
+ ".webp",
17
+ ".bmp",
18
+ ".ico", ".cur",
19
+ ".tif", ".tiff"
20
+ ]
21
+
22
+ @classmethod
23
+ def _convert(cls, file_path: str, converter: Callable) -> str:
24
+ cls._logger.debug(f"Starting conversion for '{file_path}'")
25
+ start = time.time()
26
+
27
+ try:
28
+ content = []
29
+ match file_path.lower():
30
+ case path if path.endswith(".pdf"):
31
+
32
+ from pdf2image import convert_from_path
33
+
34
+ images = convert_from_path(file_path, fmt="png")
35
+ for idx, image in enumerate(images):
36
+ try:
37
+ text = converter(image)
38
+ cls._logger.debug(f"Text: {text}")
39
+ content.append(text)
40
+ cls._logger.debug(f"Page {idx+1} processed")
41
+ except Exception as e:
42
+ cls._logger.exception(f"Page {idx+1} failed: {e}")
43
+ raise
44
+ case path if path.endswith(tuple(cls._image_type)):
45
+
46
+ from PIL import Image
47
+
48
+ image = Image.open(file_path)
49
+ try:
50
+ text = converter(image)
51
+ cls._logger.debug(f"Text: {text}")
52
+ content.append(text)
53
+ cls._logger.debug("Page 1 processed")
54
+ except Exception as e:
55
+ cls._logger.debug(f"Page 1 failed: {e}")
56
+ raise
57
+
58
+ cls._logger.debug(f"Successfully converted '{file_path}'")
59
+ return "\n".join(content)
60
+
61
+ finally:
62
+ elasped = time.time() - start
63
+ cls._logger.debug(f"Conversion time spent '{elasped:.2f}' seconds.")
64
+
65
+ @classmethod
66
+ def pytesseract_convert(cls, file_path: str) -> str:
67
+ cls._logger.debug("Using PyTesseract for Conversion")
68
+
69
+ def converter(image):
70
+ import pytesseract
71
+
72
+ return pytesseract.image_to_string(image, lang="tha+eng")
73
+
74
+ return cls._convert(file_path, converter)
75
+
76
+ @classmethod
77
+ def easyocr_convert(cls, file_path: str) -> str:
78
+ cls._logger.debug("Using EasyOCR for Conversion")
79
+
80
+ def converter(image):
81
+ import easyocr
82
+ import numpy
83
+
84
+ reader = easyocr.Reader(
85
+ ['th', 'en'],
86
+ gpu=False
87
+ )
88
+ res = reader.readtext(numpy.array(image))
89
+ return "\n".join(text for _, text, _ in res)
90
+ return cls._convert(file_path, converter)
91
+
92
+ @classmethod
93
+ def suryaocr_convert(cls, file_path: str) -> str:
94
+ cls._logger.debug("Using SuryaOCR for Conversion")
95
+
96
+ def converter(image):
97
+ from surya.recognition import RecognitionPredictor
98
+ from surya.detection import DetectionPredictor
99
+
100
+ rec_pred = RecognitionPredictor()
101
+ det_pred = DetectionPredictor()
102
+
103
+ prediction = rec_pred(
104
+ [image],
105
+ det_predictor=det_pred,
106
+ detection_batch_size=1,
107
+ recognition_batch_size=1,
108
+ )
109
+ return "\n".join(line.text for line in prediction[0].text_lines)
110
+ return cls._convert(file_path, converter)
111
+
112
+ @classmethod
113
+ def doctr_convert(cls, file_path: str) -> str:
114
+ cls._logger.debug("Using docTR for Conversion")
115
+
116
+ def converter(image):
117
+ import os
118
+ import tempfile
119
+ from doctr.io import DocumentFile
120
+ from doctr.models import ocr_predictor
121
+
122
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
123
+ image.save(tmp.name)
124
+ temp_image_path = tmp.name
125
+
126
+ model = ocr_predictor(pretrained=True)
127
+ doc = DocumentFile.from_images(temp_image_path)
128
+ result = model(doc)
129
+ data = result.export()
130
+ combined_text = "\n".join(
131
+ word["value"]
132
+ for page in data["pages"]
133
+ for block in page.get('blocks', [])
134
+ for line in block.get('lines', [])
135
+ for word in line.get('words', [])
136
+ if "value" in word
137
+ )
138
+ if os.path.exists(temp_image_path):
139
+ os.remove(temp_image_path)
140
+ return combined_text
141
+ return cls._convert(file_path, converter)
@@ -0,0 +1,64 @@
1
+ import time
2
+ from typing import Callable
3
+
4
+ from purrfectmeow.meow.kitty import kitty_logger
5
+
6
class Simple:
    """Plain-text converters using UTF-8 reads, PyMuPDF, or pandas."""

    _logger = kitty_logger(__name__)

    @classmethod
    def _convert(cls, file_path: str, converter: Callable) -> str:
        """Run *converter* on *file_path* with debug logging and timing."""
        cls._logger.debug(f"Starting conversion for '{file_path}'")
        start = time.time()

        try:
            res = converter(file_path)

            cls._logger.debug(f"Successfully converted '{file_path}'")
            return res

        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Conversion time spent '{elapsed:.2f}' seconds.")

    @classmethod
    def encoding_convert(cls, file_path: str) -> str:
        """Read the file as UTF-8 text."""
        cls._logger.debug("Using Encoding for Conversion")

        def reader(file_path):
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        return cls._convert(file_path, reader)

    @classmethod
    def pymupdf_convert(cls, file_path: str) -> str:
        """Extract text with PyMuPDF; text-ish files are opened with filetype='txt'."""
        cls._logger.debug("Using PyMuPDF for Conversion")

        def reader(file_path):
            import pymupdf

            if file_path.endswith(('.txt', '.md', '.json', '.html', '.xml')):
                return "".join(page.get_text() for page in pymupdf.open(file_path, filetype="txt"))
            else:
                return "".join(page.get_text() for page in pymupdf.open(file_path))
        return cls._convert(file_path, reader)

    @classmethod
    def pandas_convert(cls, file_path: str) -> str:
        """Render a tabular file to a string via pandas."""
        cls._logger.debug("Using Pandas for Conversion")

        def reader(file_path):
            import pandas

            if file_path.endswith(('.xls', '.xlsx')):
                return pandas.read_excel(file_path).to_string(index=False)
            elif file_path.endswith('.csv'):
                return pandas.read_csv(file_path).to_string(index=False)
            elif file_path.endswith('.json'):
                return pandas.read_json(file_path).to_string(index=False)
            elif file_path.endswith('.html'):
                # BUG FIX: read_html returns a list of DataFrames; render each
                # and join so the declared -> str contract actually holds.
                tables = pandas.read_html(file_path)
                return "\n\n".join(t.to_string(index=False) for t in tables)
            elif file_path.endswith('.xml'):
                return pandas.read_xml(file_path).to_string(index=False)
            # Unsupported extensions fall through to None, matching the
            # original behaviour relied on by callers checking falsy text.
        return cls._convert(file_path, reader)
@@ -0,0 +1,3 @@
1
+ from .base import Malet
2
+
3
+ __all__ = ["Malet"]
@@ -0,0 +1,34 @@
1
+ from typing import Any, List, Literal, Optional
2
+
3
+ from .token import TokenSplit
4
+ from .separate import SeparateSplit
5
+
6
+ class Malet:
7
+ DEFAULT_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
8
+ DEFAULT_CHUNK_SIZE = 500
9
+ DEFAULT_CHUNK_OVERLAP = 0
10
+ DEFAULT_CHUNK_SEPARATOR = '\n\n'
11
+
12
+ @staticmethod
13
+ def _get_kwarg(kwargs: dict, keys: List[str], default: Any = None) -> Any:
14
+ for key in keys:
15
+ if key in kwargs:
16
+ return kwargs[key]
17
+ return default
18
+
19
+ @classmethod
20
+ def chunking(cls, text: str, chunk_method: Optional[Literal["token", "separate"]] = "token", **kwargs: Any) -> List[str]:
21
+ match chunk_method:
22
+ case "token":
23
+ model_name = cls._get_kwarg(kwargs, ["model_name", "ModelName", "modelName"], cls.DEFAULT_MODEL_NAME)
24
+ chunk_size = cls._get_kwarg(kwargs, ["chunk_size", "ChunkSize", "chunkSize"], cls.DEFAULT_CHUNK_SIZE)
25
+ chunk_overlap = cls._get_kwarg(kwargs, ["chunk_overlap", "ChunkOverlap", "chunkOverlap"], cls.DEFAULT_CHUNK_OVERLAP)
26
+
27
+ method = TokenSplit.splitter(model_name, chunk_size, chunk_overlap)
28
+ return method.split_text(text)
29
+
30
+ case "separate":
31
+ chunk_separator = cls._get_kwarg(kwargs, ["chunk_separator", "ChunkSeparator", "chunkSeparator"], cls.DEFAULT_CHUNK_SEPARATOR)
32
+
33
+ method = SeparateSplit.splitter(chunk_separator)
34
+ return method.split_text(text)
@@ -0,0 +1,32 @@
1
+ import time
2
+
3
+ from purrfectmeow.meow.kitty import kitty_logger
4
+
5
class SeparateSplit:
    """Builds a simple splitter that cuts text on a fixed separator string."""

    _logger = kitty_logger(__name__)

    @classmethod
    def splitter(cls, chunk_separator: str):
        """Return a CharacterSeparator configured with *chunk_separator*."""
        cls._logger.debug("Initializing separate splitter")
        start = time.time()

        try:
            splitter = cls.CharacterSeparator(chunk_separator)

            cls._logger.debug("Separator splitter successfully initialized.")
            return splitter
        except Exception as e:
            cls._logger.exception(f"Failed to initialize separate splitter: {e}")
            raise
        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Separate splitting completed in {elapsed:.2f} seconds.")

    class CharacterSeparator:
        """Splits on *separator*, keeping it at the end of every chunk but the last."""

        def __init__(self, separator: str):
            self.separator = separator

        def split_text(self, text: str):
            chunks = [chunk + self.separator for chunk in text.split(self.separator)]
            # BUG FIX: rstrip(separator) stripped any run of characters from the
            # separator's character set (corrupting the last chunk when it ended
            # with such characters); removesuffix drops exactly one separator.
            chunks[-1] = chunks[-1].removesuffix(self.separator)
            return chunks
@@ -0,0 +1,55 @@
1
+ import time
2
+
3
+ from purrfectmeow.meow.kitty import kitty_logger
4
+
5
class TokenSplit:
    """Builds token-based text splitters backed by OpenAI or HuggingFace tokenizers."""

    _logger = kitty_logger(__name__)

    # Models tokenized directly via tiktoken.
    _OPENAI_EMBED_MODEL = {
        'text-embedding-ada-002',
        'text-embedding-3-small',
        'text-embedding-3-large'
    }
    # HuggingFace-hosted ports of OpenAI tokenizers (need GPT2TokenizerFast).
    _OPENAI_HF_MODEL = {
        'Xenova/text-embedding-ada-002'
    }
    _HF_MODEL_DIR = '.cache/huggingface/hub/'

    @classmethod
    def splitter(cls, model_name: str, chunk_size: int, chunk_overlap: int):
        """
        Build a TokenTextSplitter for *model_name* with the given chunking params.

        Raises:
            Exception: Re-raised after logging if tokenizer/splitter setup fails.
        """
        cls._logger.debug("Initializing token splitter")
        start = time.time()

        try:
            from langchain_text_splitters import TokenTextSplitter
            if model_name in cls._OPENAI_EMBED_MODEL:
                # BUG FIX: this message used to be logged unconditionally,
                # before the branch check, so it also fired for HF models.
                cls._logger.debug(f"Using OpenAI model tokenizer: {model_name}")
                splitter = TokenTextSplitter.from_tiktoken_encoder(
                    model_name=model_name,
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap
                )
            else:
                cls._logger.debug(f"Using HuggingFace tokenizer: {model_name}")
                from transformers import AutoTokenizer, GPT2TokenizerFast
                if model_name in cls._OPENAI_HF_MODEL:
                    tokenizer = GPT2TokenizerFast.from_pretrained(model_name, cache_dir=cls._HF_MODEL_DIR)
                else:
                    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cls._HF_MODEL_DIR)
                splitter = TokenTextSplitter.from_huggingface_tokenizer(
                    tokenizer=tokenizer,
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap
                )

            cls._logger.debug("Token splitter successfully initialized.")
            return splitter

        except Exception as e:
            cls._logger.exception(f"Failed to initialize token splitter: {e}")
            raise

        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Token splitting completed in {elapsed:.2f} seconds.")
55
+
@@ -0,0 +1,3 @@
1
+ from .base import WichienMaat
2
+
3
+ __all__ = ["WichienMaat"]
@@ -0,0 +1,14 @@
1
+ from typing import List, Optional
2
+ import numpy
3
+
4
+ from .local import Local
5
+
6
class WichienMaat:
    """Embedding step: encodes text into vectors with a local sentence-transformer."""

    DEFAULT_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

    @classmethod
    def embedding(cls, sentence: str | List[str], model_name: Optional[str] = None) -> numpy.ndarray:
        """Encode *sentence* (one string or a list); falls back to the default model."""
        chosen_model = model_name or cls.DEFAULT_MODEL_NAME
        return Local.model_encode(sentence, chosen_model)
@@ -0,0 +1,33 @@
1
+ import time
2
+ from typing import List
3
+
4
+ from purrfectmeow.meow.kitty import kitty_logger
5
+
6
class Local:
    """Encodes sentences with a locally cached SentenceTransformer model."""

    _logger = kitty_logger(__name__)

    # Models are downloaded/cached under the project-local HF hub directory.
    _HF_MODEL_DIR = '.cache/huggingface/hub/'

    @classmethod
    def model_encode(cls, sentence: str | List[str], model_name: str, **kwargs):
        """Encode *sentence* (one string or a list) into numpy embeddings."""
        cls._logger.debug("Initializing local model encode")
        start = time.time()
        try:
            from sentence_transformers import SentenceTransformer

            encoder = SentenceTransformer(
                model_name,
                cache_folder=cls._HF_MODEL_DIR,
            )
            vectors = encoder.encode(sentence, convert_to_numpy=True)

            cls._logger.debug("Local model encode successfully initialized.")
            return vectors
        except Exception as e:
            cls._logger.exception(f"Failed to initialize local model encode: {e}")
            raise
        finally:
            cls._logger.debug(f"Local model encode completed in {time.time() - start:.2f} seconds.")
@@ -0,0 +1,3 @@
1
+ from .base import KhaoManee
2
+
3
+ __all__ = ["KhaoManee"]
@@ -0,0 +1,8 @@
1
+
2
+ from .cosine import ConsineSim
3
class KhaoManee:
    """Vector-search step: ranks documents by cosine similarity to a query."""

    @classmethod
    def searching(cls, query_embed, sentence_embed, document, top_k):
        """Delegate to ConsineSim.vector_search; returns the top_k scored documents."""
        return ConsineSim.vector_search(query_embed, sentence_embed, document, top_k)
@@ -0,0 +1,40 @@
1
+ import time
2
+ from typing import List
3
+
4
+ import numpy
5
+ from purrfectmeow.meow.felis import Document
6
+
7
+ from purrfectmeow.meow.kitty import kitty_logger
8
+
9
class ConsineSim:
    """Cosine-similarity search over precomputed sentence embeddings."""

    _logger = kitty_logger(__name__)

    @classmethod
    def vector_search(
        cls,
        embed_query: numpy.ndarray,
        embed_sentence: numpy.ndarray | List[numpy.ndarray],
        document: List[Document],
        top_k: int
    ):
        """
        Return the *top_k* documents most similar to the query.

        Args:
            embed_query: 1-D query embedding.
            embed_sentence: 2-D array (or list of 1-D arrays) of document
                embeddings, row-aligned with *document*.
            document: Documents aligned with the rows of *embed_sentence*.
                (FIX: was annotated as a single Document, but it is indexed
                with integer positions and must be a sequence — a lone
                Document's __getitem__ rejects integer keys.)
            top_k: Number of results to return.

        Returns:
            List of {"score": float, "document": Document} dicts, best first.
        """
        cls._logger.debug("Initializing vector search")
        start = time.time()
        try:
            from sklearn.metrics.pairwise import cosine_similarity

            sims = cosine_similarity([embed_query], embed_sentence)[0]
            # argsort ascending, reversed -> indices of the highest scores first.
            top_indices = numpy.argsort(sims)[::-1][:top_k]

            return [{
                "score": float(sims[i]),
                "document": document[i]
            } for i in top_indices]
        except Exception as e:
            cls._logger.exception(f"Failed to initialize vector search: {e}")
            raise
        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Vector search completed in {elapsed:.2f} seconds.")
40
+
@@ -0,0 +1,3 @@
1
+ from .base import Kornja
2
+
3
+ __all__ = ["Kornja"]
@@ -0,0 +1,6 @@
1
+
2
class Kornja:
    """Answer-generation step of the RAG pipeline (not yet implemented)."""

    @classmethod
    def generating(cls):
        """Placeholder: returns None until generation is implemented."""
        ...