purrfectkit 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 SUWALUTIONS CO., LTD.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,114 @@
1
+ Metadata-Version: 2.4
2
+ Name: purrfectkit
3
+ Version: 0.2.2
4
+ Summary: **PurrfectKit** is a Python library for effortless Retrieval-Augmented Generation (RAG) workflows.
5
+ Keywords: rag,nlp,llms,python,ai,ocr,document-processing,multilingual,text-extraction
6
+ Author: SUWALUTIONS
7
+ Author-email: SUWALUTIONS <suwa@suwalutions.com>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Software Development :: Libraries
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Text Processing :: Linguistic
23
+ Classifier: Topic :: Text Processing :: General
24
+ Classifier: Natural Language :: English
25
+ Classifier: Natural Language :: Thai
26
+ Requires-Dist: python-magic<=0.4.27
27
+ Requires-Dist: sentence-transformers<=5.1.0
28
+ Requires-Dist: transformers<=4.53.0
29
+ Requires-Dist: docling<=2.31.1
30
+ Requires-Dist: markitdown<=0.1.1
31
+ Requires-Dist: pymupdf4llm<=0.0.27
32
+ Requires-Dist: pdf2image<=1.17.0
33
+ Requires-Dist: pytesseract<=0.3.13
34
+ Requires-Dist: easyocr<=1.7.2
35
+ Requires-Dist: surya-ocr<=0.14.0
36
+ Requires-Dist: python-doctr<=1.0.0
37
+ Requires-Dist: pandas<=2.3.2
38
+ Requires-Dist: langchain-text-splitters<=1.0.0
39
+ Requires-Dist: tiktoken<=0.12.0
40
+ Requires-Dist: ruff<=0.6.0 ; extra == 'dev'
41
+ Requires-Dist: mypy<=1.11.0 ; extra == 'dev'
42
+ Requires-Dist: pre-commit<=3.8.0 ; extra == 'dev'
43
+ Requires-Dist: detect-secrets<=1.5.0 ; extra == 'dev'
44
+ Requires-Dist: codecov-cli<=11.2.4 ; extra == 'dev'
45
+ Requires-Dist: sphinx<=8.2.3 ; extra == 'docs'
46
+ Requires-Dist: sphinx-rtd-theme<=3.0.2 ; extra == 'docs'
47
+ Requires-Dist: pytest<=8.4.2 ; extra == 'test'
48
+ Requires-Dist: pytest-cov<=7.0.0 ; extra == 'test'
49
+ Requires-Dist: pytest-mock<=3.15.1 ; extra == 'test'
50
+ Maintainer: KHARAPSY
51
+ Maintainer-email: KHARAPSY <kharapsy@suwalutions.com>
52
+ Requires-Python: >=3.10
53
+ Project-URL: Documentation, https://suwalutions.github.io/PurrfectKit
54
+ Project-URL: Issues, https://github.com/SUWALUTIONS/PurrfectKit/issues
55
+ Project-URL: Repository, https://github.com/SUWALUTIONS/PurrfectKit
56
+ Provides-Extra: dev
57
+ Provides-Extra: docs
58
+ Provides-Extra: test
59
+ Description-Content-Type: text/markdown
60
+
61
+ ![PurrfectMeow Logo](https://github.com/suwalutions/PurrfectKit/blob/meow/docs/_static/repo-logo.png)
62
+
63
+ # PurrfectKit
64
+
65
+ [![Python 3.10–3.13](https://img.shields.io/badge/python-3.10–3.13-blue)](https://www.python.org)
66
+ [![PyPI](https://img.shields.io/pypi/v/purrfectkit?color=gold&label=PyPI)](https://pypi.org/project/purrfectkit/)
67
+ [![Downloads](https://img.shields.io/pypi/dm/purrfectkit?color=purple)](https://pypistats.org/packages/purrfectkit)
68
+ [![codecov](https://codecov.io/github/suwalutions/PurrfectKit/branch/meow/graph/badge.svg?token=Z6YETHJXCL)](https://codecov.io/github/suwalutions/PurrfectKit)
69
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/charliermarsh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
70
+ [![Docker](https://img.shields.io/docker/v/suwalutions/purrfectkit?label=docker)](https://ghcr.io/suwalutions/purrfectkit)
71
+ [![License](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
72
+
73
+ **PurrfectKit** is a toolkit that simplifies Retrieval-Augmented Generation (RAG) into 5 easy steps:
74
+ 1. Suphalak - read content from files
75
+ 2. Malet - split content into chunks
76
+ 3. WichienMaat - embed chunks into vectors
77
+ 4. KhaoManee - search vectors with queries
78
+ 5. Kornja - generate answers from vectors
79
+
80
+ > **_NOTE:_** Each step is inspired by a unique Thai cat breed, making the workflow memorable and fun.
81
+
82
+ ## Quickstart
83
+
84
+ ### Prerequisites
85
+ - python
86
+ - tesseract
87
+
88
+
89
+ ### Installation
90
+ ```bash
91
+ pip install purrfectkit
92
+
93
+ ```
94
+
95
+ ### Usage
96
+ ```python
97
+ from purrfectmeow.meow.felis import DocTemplate, MetaFile
98
+ from purrfectmeow import Suphalak, Malet, WichienMaat, KhaoManee
99
+
100
+ file_path = 'test/test.pdf'
101
+ metadata = MetaFile.get_metadata(file_path)
102
+ with open(file_path, 'rb') as f:
103
+ content = Suphalak.reading(f, 'test.pdf')
104
+ chunks = Malet.chunking(content, chunk_method='token', chunk_size='500', chunk_overlap='25')
105
+ docs = DocTemplate.create_template(chunks, metadata)
106
+ embedding = WichienMaat.embedding(chunks)
107
+ query = WichienMaat.embedding("ทดสอบ")
108
+ KhaoManee.searching(query, embedding, docs, 2)
109
+
110
+ ```
111
+
112
+ ## License
113
+
114
+ PurrfectKit is released under the [MIT License](LICENSE).
@@ -0,0 +1,54 @@
1
+ ![PurrfectMeow Logo](https://github.com/suwalutions/PurrfectKit/blob/meow/docs/_static/repo-logo.png)
2
+
3
+ # PurrfectKit
4
+
5
+ [![Python 3.10–3.13](https://img.shields.io/badge/python-3.10–3.13-blue)](https://www.python.org)
6
+ [![PyPI](https://img.shields.io/pypi/v/purrfectkit?color=gold&label=PyPI)](https://pypi.org/project/purrfectkit/)
7
+ [![Downloads](https://img.shields.io/pypi/dm/purrfectkit?color=purple)](https://pypistats.org/packages/purrfectkit)
8
+ [![codecov](https://codecov.io/github/suwalutions/PurrfectKit/branch/meow/graph/badge.svg?token=Z6YETHJXCL)](https://codecov.io/github/suwalutions/PurrfectKit)
9
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/charliermarsh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
10
+ [![Docker](https://img.shields.io/docker/v/suwalutions/purrfectkit?label=docker)](https://ghcr.io/suwalutions/purrfectkit)
11
+ [![License](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
12
+
13
+ **PurrfectKit** is a toolkit that simplifies Retrieval-Augmented Generation (RAG) into 5 easy steps:
14
+ 1. Suphalak - read content from files
15
+ 2. Malet - split content into chunks
16
+ 3. WichienMaat - embed chunks into vectors
17
+ 4. KhaoManee - search vectors with queries
18
+ 5. Kornja - generate answers from vectors
19
+
20
+ > **_NOTE:_** Each step is inspired by a unique Thai cat breed, making the workflow memorable and fun.
21
+
22
+ ## Quickstart
23
+
24
+ ### Prerequisites
25
+ - python
26
+ - tesseract
27
+
28
+
29
+ ### Installation
30
+ ```bash
31
+ pip install purrfectkit
32
+
33
+ ```
34
+
35
+ ### Usage
36
+ ```python
37
+ from purrfectmeow.meow.felis import DocTemplate, MetaFile
38
+ from purrfectmeow import Suphalak, Malet, WichienMaat, KhaoManee
39
+
40
+ file_path = 'test/test.pdf'
41
+ metadata = MetaFile.get_metadata(file_path)
42
+ with open(file_path, 'rb') as f:
43
+ content = Suphalak.reading(f, 'test.pdf')
44
+ chunks = Malet.chunking(content, chunk_method='token', chunk_size='500', chunk_overlap='25')
45
+ docs = DocTemplate.create_template(chunks, metadata)
46
+ embedding = WichienMaat.embedding(chunks)
47
+ query = WichienMaat.embedding("ทดสอบ")
48
+ KhaoManee.searching(query, embedding, docs, 2)
49
+
50
+ ```
51
+
52
+ ## License
53
+
54
+ PurrfectKit is released under the [MIT License](LICENSE).
@@ -0,0 +1,15 @@
1
+ from .tc01_spl import Suphalak
2
+ from .tc02_mlt import Malet
3
+ from .tc03_wcm import WichienMaat
4
+ from .tc04_kmn import KhaoManee
5
+ from .tc05_knj import Kornja
6
+
7
+ __all__ = [
8
+ "Suphalak",
9
+ "Malet",
10
+ "WichienMaat",
11
+ "KhaoManee",
12
+ "Kornja",
13
+ ]
14
+
15
+ __version__ = "0.2.2"
@@ -0,0 +1,20 @@
1
+ from typing import TypedDict
2
+
3
+ from .felis import Document
4
+
5
+
6
+ class FileMetadata(TypedDict, total=False):
7
+ file_name: str
8
+ file_size: int
9
+ file_created_date: str
10
+ file_modified_date: str
11
+ file_extension: str
12
+ file_type: str
13
+ description: str
14
+ total_pages: int | str
15
+ file_md5: str
16
+
17
+
18
+ class SimilarityResult(TypedDict, total=False):
19
+ score: float | str
20
+ document: Document
@@ -0,0 +1,159 @@
1
+ from io import BytesIO
2
+ from typing import Any
3
+
4
+ from .chaus import FileMetadata
5
+
6
+
7
+ class Document:
8
+ def __init__(self, page_content: str, metadata: dict[str, Any]) -> None:
9
+ self.page_content = page_content
10
+ self.metadata = metadata or {}
11
+
12
+ def __repr__(self) -> str:
13
+ return f"{self.__class__.__name__}(page_content={self.page_content!r}, metadata={self.metadata!r})"
14
+
15
+ def __getitem__(self, key: str) -> Any:
16
+ if key == "page_content":
17
+ return self.page_content
18
+ elif key == "metadata":
19
+ return self.metadata
20
+ else:
21
+ raise KeyError(f"{key} is not a valid key. Use 'page_content' or 'metadata'.")
22
+
23
+ def to_dict(self) -> dict[str, Any]:
24
+ return {"page_content": self.page_content, "metadata": self.metadata}
25
+
26
+
27
+ class DocTemplate:
28
+ @staticmethod
29
+ def create_template(chunks: list[str], metadata: dict[str, Any]) -> list[Document]:
30
+ if not isinstance(chunks, list):
31
+ raise TypeError(f"Expected 'chunks' to be a list, but got {type(chunks).__name__}.")
32
+
33
+ if not isinstance(metadata, dict):
34
+ raise TypeError(f"Expected 'metadata' to be a dict, but got {type(metadata).__name__}.")
35
+
36
+ if not all(isinstance(c, str) for c in chunks):
37
+ raise ValueError("All elements in 'chunks' must be strings.")
38
+
39
+ docs = []
40
+ chunk_hashes = []
41
+
42
+ import hashlib
43
+ import uuid
44
+
45
+ for _, chunk in enumerate(chunks):
46
+ hash_val = hashlib.md5(chunk.encode()).hexdigest()
47
+ chunk_hashes.append(hash_val)
48
+
49
+ for idx, chunk in enumerate(chunks):
50
+ chunk_number = idx + 1
51
+ chunk_id = uuid.uuid4().hex
52
+ chunk_hash = chunk_hashes[idx]
53
+ prev_hash = chunk_hashes[idx - 1] if idx > 0 else None
54
+ next_hash = chunk_hashes[idx + 1] if idx < len(chunks) - 1 else None
55
+ chunk_size = len(chunk)
56
+
57
+ chunk_info = {
58
+ "chunk_number": chunk_number,
59
+ "chunk_id": chunk_id,
60
+ "chunk_hash": chunk_hash,
61
+ "previous_chunk_hash": prev_hash,
62
+ "next_chunk_hash": next_hash,
63
+ "chunk_size": chunk_size,
64
+ }
65
+
66
+ doc_metadata = {"chunk_info": chunk_info, "source_info": metadata}
67
+
68
+ doc = Document(page_content=chunk, metadata=doc_metadata)
69
+ docs.append(doc)
70
+
71
+ return docs
72
+
73
+
74
+ class MetaFile:
75
+ @staticmethod
76
+ def get_metadata(file: str | BytesIO, **kwargs: Any) -> FileMetadata:
77
+ if isinstance(file, bytes):
78
+ file = BytesIO(file)
79
+
80
+ if isinstance(file, BytesIO):
81
+ import os
82
+
83
+ os.makedirs(".cache/tmp", exist_ok=True)
84
+ file_name = kwargs.get("file_name")
85
+
86
+ if not file_name:
87
+ raise ValueError("file_name must be provided when using BytesIO.")
88
+
89
+ file_path = os.path.join(".cache/tmp", file_name)
90
+ with open(file_path, "wb") as f:
91
+ f.write(file.getvalue())
92
+
93
+ try:
94
+ return MetaFile._get_metadata_from_path(file_path)
95
+ finally:
96
+ os.remove(file_path)
97
+
98
+ elif isinstance(file, str):
99
+ return MetaFile._get_metadata_from_path(file)
100
+
101
+ else:
102
+ raise TypeError(f"Unsupported file type: {type(file).__name__}. Expected str, bytes, or BytesIO.")
103
+
104
+ @staticmethod
105
+ def _get_metadata_from_path(file_path: str) -> FileMetadata:
106
+ metadata: FileMetadata = {}
107
+
108
+ import hashlib
109
+ import os
110
+ import re
111
+ import subprocess
112
+ import time
113
+
114
+ import magic
115
+
116
+ try:
117
+ if not os.path.exists(file_path):
118
+ raise FileNotFoundError(f"File {file_path} does not exist")
119
+
120
+ stats = os.stat(file_path)
121
+ metadata["file_name"] = os.path.basename(file_path)
122
+ metadata["file_size"] = stats.st_size
123
+ metadata["file_created_date"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stats.st_ctime))
124
+ metadata["file_modified_date"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stats.st_mtime))
125
+ metadata["file_extension"] = os.path.splitext(file_path)[1] or "none"
126
+
127
+ try:
128
+ mime = magic.Magic(mime=True)
129
+ metadata["file_type"] = mime.from_file(file_path)
130
+ metadata["description"] = magic.from_file(file_path)
131
+ except Exception as e:
132
+ metadata["file_type"] = "unknown"
133
+ metadata["description"] = f"Could not determine file type: {str(e)}"
134
+
135
+ if metadata["file_type"].startswith("image/"):
136
+ metadata["total_pages"] = 1
137
+ elif metadata["file_type"].startswith("application/pdf"):
138
+ try:
139
+ result = subprocess.run(["pdfinfo", file_path], stdout=subprocess.PIPE, text=True, check=True)
140
+ pages_match = re.search(r"Pages:\s*(\d+)", result.stdout)
141
+ if pages_match:
142
+ metadata["total_pages"] = int(pages_match.group(1))
143
+ else:
144
+ metadata["total_pages"] = "Unknown (could not parse page count)"
145
+ except (subprocess.CalledProcessError, FileNotFoundError):
146
+ metadata["total_pages"] = "Unknown (pdfinfo not installed or failed)"
147
+ else:
148
+ metadata["total_pages"] = 1
149
+
150
+ with open(file_path, "rb") as f:
151
+ hash_md5 = hashlib.md5()
152
+ for chunk in iter(lambda: f.read(4096), b""):
153
+ hash_md5.update(chunk)
154
+ metadata["file_md5"] = hash_md5.hexdigest()
155
+
156
+ return metadata
157
+
158
+ except Exception as e:
159
+ raise RuntimeError(f"Failed to extract metadata: {e}") from e
@@ -0,0 +1,52 @@
1
+ import logging
2
+ from logging.handlers import RotatingFileHandler
3
+ from pathlib import Path
4
+
5
+
6
+ class LevelBasedFormatter(logging.Formatter):
7
+ def __init__(self, default_fmt: str, info_fmt: str, datefmt: str | None = None) -> None:
8
+ super().__init__(datefmt=datefmt)
9
+ self.default_fmt: logging.Formatter = logging.Formatter(default_fmt, datefmt)
10
+ self.info_fmt: logging.Formatter = logging.Formatter(info_fmt, datefmt)
11
+
12
+ def format(self, record: logging.LogRecord) -> str:
13
+ if record.levelno == logging.INFO:
14
+ return self.info_fmt.format(record)
15
+ return self.default_fmt.format(record)
16
+
17
+
18
+ def kitty_logger(name: str, log_file: str = "kitty.log", log_level: str = "INFO") -> logging.Logger:
19
+ """
20
+ Sets up a logger with console and rotating file handlers.
21
+
22
+ Args:
23
+ name (str): Name of the logger (usually __name__ of the calling module).
24
+ log_file (str): Path to the log file. Defaults to 'kitty.log'.
25
+ log_level (str): Logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'). Defaults to 'INFO'.
26
+
27
+ Returns:
28
+ logging.Logger: Configured logger instance.
29
+ """
30
+ logger = logging.getLogger(name)
31
+ logger.setLevel(getattr(logging, log_level.upper(), logging.INFO))
32
+
33
+ if not logger.handlers:
34
+ default_fmt = "PurrfectKit | %(asctime)s [%(levelname)s] %(name)s:%(lineno)d - %(message)s"
35
+ info_fmt = "PurrfectKit | %(asctime)s [%(levelname)s] - %(message)s"
36
+ datefmt = "%Y-%m-%d %H:%M:%S"
37
+
38
+ formatter = LevelBasedFormatter(default_fmt, info_fmt, datefmt)
39
+
40
+ console_handler = logging.StreamHandler()
41
+ console_handler.setFormatter(formatter)
42
+ logger.addHandler(console_handler)
43
+
44
+ log_dir = Path(".cache/logs")
45
+ log_dir.mkdir(parents=True, exist_ok=True)
46
+ log_path = log_dir / log_file
47
+
48
+ file_handler = RotatingFileHandler(log_path, maxBytes=5 * 1024 * 1024, backupCount=3)
49
+ file_handler.setFormatter(formatter)
50
+ logger.addHandler(file_handler)
51
+
52
+ return logger
@@ -0,0 +1,3 @@
1
+ from .base import Suphalak
2
+
3
+ __all__ = ['Suphalak']
@@ -0,0 +1,110 @@
1
+ from typing import Any, BinaryIO
2
+
3
+ from .markdown import Markdown
4
+ from .ocr import Ocr
5
+ from .simple import Simple
6
+
7
+
8
+ class Suphalak:
9
+ tmp_dir = ".cache/tmp"
10
+ DEFAULT_LOADER = "PYMUPDF4LLM"
11
+
12
+ _LOADERS: dict[str, dict[str, Any]] = {
13
+ "MARKITDOWN": {
14
+ "func": Markdown.markitdown_convert,
15
+ "ext": ("csv", "docx", "md", "pdf", "pptx", "txt", "xls", "xlsx"),
16
+ },
17
+ "DOCLING": {
18
+ "func": Markdown.docling_convert,
19
+ "ext": ("csv", "docx", "jpg", "md", "pdf", "png", "pptx", "xlsx"),
20
+ },
21
+ "PYMUPDF4LLM": {
22
+ "func": Markdown.pymupdf4llm_convert,
23
+ "ext": ("docx", "pdf", "pptx", "txt", "xlsx"),
24
+ },
25
+ "PYTESSERACT": {
26
+ "func": Ocr.pytesseract_convert,
27
+ "ext": ("gif", "jpg", "pdf", "png"),
28
+ },
29
+ "EASYOCR": {
30
+ "func": Ocr.easyocr_convert,
31
+ "ext": ("gif", "jpg", "pdf", "png"),
32
+ },
33
+ "SURYAOCR": {
34
+ "func": Ocr.suryaocr_convert,
35
+ "ext": ("gif", "jpg", "pdf", "png"),
36
+ },
37
+ "DOCTR": {
38
+ "func": Ocr.doctr_convert,
39
+ "ext": ("gif", "jpg", "pdf", "png"),
40
+ },
41
+ "PYMUPDF": {
42
+ "func": Simple.pymupdf_convert,
43
+ "ext": ("docx", "md", "pdf", "pptx", "xlsx"),
44
+ },
45
+ "PANDAS": {
46
+ "func": Simple.pandas_convert,
47
+ "ext": ("csv", "xls", "xlsx"),
48
+ },
49
+ "ENCODING": {
50
+ "func": Simple.encoding_convert,
51
+ "ext": ("csv", "md", "txt"),
52
+ },
53
+ }
54
+
55
+ @classmethod
56
+ def _detect_loader(cls, file_ext: str) -> str:
57
+ priority = [
58
+ ("PANDAS", ("csv", "xls")),
59
+ ("PYTESSERACT", ("jpg", "png", "gif")),
60
+ ("PYMUPDF", ("pdf", "md")),
61
+ ("PYMUPDF4LLM", ("txt", "xlsx", "pptx", "docx")),
62
+ ]
63
+
64
+ for loader, extensions in priority:
65
+ if file_ext in extensions:
66
+ return loader
67
+
68
+ return cls.DEFAULT_LOADER
69
+
70
+ @classmethod
71
+ def reading(cls, file: BinaryIO, file_name: str, loader: str | None = None, **kwargs: Any) -> str:
72
+ import os
73
+
74
+ file_ext = file_name.split(".")[-1].lower()
75
+
76
+ if not loader:
77
+ loader = cls._detect_loader(file_ext)
78
+
79
+ if loader not in cls._LOADERS:
80
+ raise ValueError(f"Unsupported loader: '{loader}'")
81
+
82
+ loader_conf = cls._LOADERS[loader]
83
+ supported_ext = loader_conf["ext"]
84
+
85
+ if file_ext not in supported_ext:
86
+ raise TypeError(f"'{file_ext}' is not supported for '{loader}' loader.")
87
+
88
+ os.makedirs(cls.tmp_dir, exist_ok=True)
89
+ file_path = os.path.join(cls.tmp_dir, file_name)
90
+
91
+ try:
92
+ text: str
93
+ with open(file_path, "wb") as f:
94
+ f.write(file.read())
95
+
96
+ text = loader_conf["func"](file_path, **kwargs)
97
+
98
+ if (
99
+ file_ext == "pdf"
100
+ and (not text or not str(text).strip())
101
+ and loader not in ("PYTESSERACT", "EASYOCR", "SURYAOCR", "DOCTR")
102
+ ):
103
+ ocr_loader = cls._LOADERS["PYTESSERACT"]
104
+ text = ocr_loader["func"](file_path, **kwargs)
105
+
106
+ return text
107
+
108
+ finally:
109
+ if os.path.exists(file_path):
110
+ os.remove(file_path)
@@ -0,0 +1,64 @@
1
+ import time
2
+ from collections.abc import Callable
3
+ from typing import Any
4
+
5
+ from purrfectmeow.meow.kitty import kitty_logger
6
+
7
+
8
+ class Markdown:
9
+ _logger = kitty_logger(__name__)
10
+
11
+ @classmethod
12
+ def _convert(cls, file_path: str, converter: Callable[[str], Any], extractor: Callable[[Any], str]) -> str:
13
+ cls._logger.debug(f"Starting conversion for '{file_path}'")
14
+ start = time.time()
15
+ try:
16
+ raw_content: Any = converter(file_path)
17
+ result: str = extractor(raw_content)
18
+
19
+ cls._logger.debug(f"Succesfully converted '{file_path}'")
20
+
21
+ return result
22
+
23
+ finally:
24
+ elapsed = time.time() - start
25
+ cls._logger.debug(f"Conversion time spent '{elapsed:.2f}' seconds.")
26
+
27
+ @classmethod
28
+ def markitdown_convert(cls, file_path: str) -> str:
29
+ cls._logger.debug("Using MarkItDown for Conversion")
30
+
31
+ from markitdown import MarkItDown
32
+
33
+ mid = MarkItDown()
34
+
35
+ return cls._convert(file_path, lambda path: mid.convert(path), lambda content: content.text_content)
36
+
37
+ @classmethod
38
+ def docling_convert(cls, file_path: str) -> str:
39
+ cls._logger.debug("Using Docling for Conversion")
40
+
41
+ from docling.document_converter import DocumentConverter
42
+
43
+ dcl = DocumentConverter()
44
+
45
+ return cls._convert(
46
+ file_path, lambda path: dcl.convert(path).document, lambda content: content.document.export_to_markdown()
47
+ )
48
+
49
+ @classmethod
50
+ def pymupdf4llm_convert(cls, file_path: str) -> str:
51
+ cls._logger.debug("Using PyMuPDF4LLM for Conversion")
52
+ cls._logger.debug(f"Starting conversion for '{file_path}'")
53
+ start = time.time()
54
+
55
+ import pymupdf4llm
56
+
57
+ try:
58
+ res: str = pymupdf4llm.to_markdown(file_path)
59
+ cls._logger.debug(f"Succesfully converted '{file_path}'")
60
+
61
+ return res
62
+ finally:
63
+ elapsed = time.time() - start
64
+ cls._logger.debug(f"Conversion time spent '{elapsed:.2f}' seconds.")
@@ -0,0 +1,149 @@
1
+ import time
2
+ from collections.abc import Callable
3
+ from typing import Any
4
+
5
+ from purrfectmeow.meow.kitty import kitty_logger
6
+
7
+
8
+ class Ocr:
9
+ _logger = kitty_logger(__name__)
10
+ _image_type = [
11
+ ".apng",
12
+ ".png",
13
+ ".avif",
14
+ ".gif",
15
+ ".jpg",
16
+ ".jpeg",
17
+ ".jfif",
18
+ ".pjpeg",
19
+ ".pjp",
20
+ ".png",
21
+ ".svg",
22
+ ".webp",
23
+ ".bmp",
24
+ ".ico",
25
+ ".cur",
26
+ ".tif",
27
+ ".tiff",
28
+ ]
29
+
30
+ @classmethod
31
+ def _convert(cls, file_path: str, converter: Callable[[str], Any]) -> str:
32
+ cls._logger.debug(f"Starting conversion for '{file_path}'")
33
+ start = time.time()
34
+
35
+ try:
36
+ content = []
37
+ match file_path.lower():
38
+ case path if path.endswith(".pdf"):
39
+ from pdf2image import convert_from_path
40
+
41
+ images = convert_from_path(file_path, fmt="png")
42
+ for idx, image in enumerate(images):
43
+ try:
44
+ text = converter(image)
45
+ cls._logger.debug(f"Text: {text}")
46
+ content.append(text)
47
+ cls._logger.debug(f"Page {idx + 1} processed")
48
+ except Exception as e:
49
+ cls._logger.exception(f"Page {idx + 1} failed: {e}")
50
+ raise
51
+ case path if path.endswith(tuple(cls._image_type)):
52
+ from PIL import Image
53
+
54
+ image = Image.open(file_path)
55
+ try:
56
+ text = converter(image)
57
+ cls._logger.debug(f"Text: {text}")
58
+ content.append(text)
59
+ cls._logger.debug("Page 1 processed")
60
+ except Exception as e:
61
+ cls._logger.debug(f"Page 1 failed: {e}")
62
+ raise
63
+
64
+ cls._logger.debug(f"Successfully converted '{file_path}'")
65
+ return "\n".join(content)
66
+
67
+ finally:
68
+ elasped = time.time() - start
69
+ cls._logger.debug(f"Conversion time spent '{elasped:.2f}' seconds.")
70
+
71
+ @classmethod
72
+ def pytesseract_convert(cls, file_path: str) -> str:
73
+ cls._logger.debug("Using PyTesseract for Conversion")
74
+
75
+ def converter(image: str) -> Any:
76
+ import pytesseract
77
+
78
+ return pytesseract.image_to_string(image, lang="tha+eng")
79
+
80
+ return cls._convert(file_path, converter)
81
+
82
+ @classmethod
83
+ def easyocr_convert(cls, file_path: str) -> str:
84
+ cls._logger.debug("Using EasyOCR for Conversion")
85
+
86
+ def converter(image: str) -> Any:
87
+ import easyocr
88
+ import numpy
89
+
90
+ reader = easyocr.Reader(["th", "en"], gpu=False)
91
+ res = reader.readtext(numpy.array(image))
92
+ return "\n".join(text for _, text, _ in res)
93
+
94
+ return cls._convert(file_path, converter)
95
+
96
+ @classmethod
97
+ def suryaocr_convert(cls, file_path: str) -> str:
98
+ cls._logger.debug("Using SuryaOCR for Conversion")
99
+
100
+ def converter(image: str) -> Any:
101
+ from surya.detection import DetectionPredictor
102
+ from surya.recognition import RecognitionPredictor
103
+
104
+ rec_pred = RecognitionPredictor()
105
+ det_pred = DetectionPredictor()
106
+
107
+ prediction = rec_pred(
108
+ [image],
109
+ det_predictor=det_pred,
110
+ detection_batch_size=1,
111
+ recognition_batch_size=1,
112
+ )
113
+ return "\n".join(line.text for line in prediction[0].text_lines)
114
+
115
+ return cls._convert(file_path, converter)
116
+
117
+ @classmethod
118
+ def doctr_convert(cls, file_path: str) -> str:
119
+ cls._logger.debug("Using docTR for Conversion")
120
+
121
+ def converter(image: str) -> str:
122
+ import os
123
+ import shutil
124
+ import tempfile
125
+
126
+ from doctr.io import DocumentFile
127
+ from doctr.models import ocr_predictor
128
+
129
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
130
+ shutil.copy(image, tmp.name)
131
+ temp_image_path = tmp.name
132
+
133
+ model = ocr_predictor(pretrained=True)
134
+ doc = DocumentFile.from_images(temp_image_path)
135
+ result = model(doc)
136
+ data = result.export()
137
+ combined_text = "\n".join(
138
+ word["value"]
139
+ for page in data["pages"]
140
+ for block in page.get("blocks", [])
141
+ for line in block.get("lines", [])
142
+ for word in line.get("words", [])
143
+ if "value" in word
144
+ )
145
+ if os.path.exists(temp_image_path):
146
+ os.remove(temp_image_path)
147
+ return combined_text
148
+
149
+ return cls._convert(file_path, converter)
@@ -0,0 +1,75 @@
1
+ import time
2
+ from collections.abc import Callable
3
+ from typing import Any
4
+
5
+ from purrfectmeow.meow.kitty import kitty_logger
6
+
7
+
8
+ class Simple:
9
+ _logger = kitty_logger(__name__)
10
+
11
+ @classmethod
12
+ def _convert(cls, file_path: str, converter: Callable[[str], Any]) -> str | Any:
13
+ cls._logger.debug(f"Starting conversion for '{file_path}'")
14
+ start = time.time()
15
+
16
+ try:
17
+ res = converter(file_path)
18
+
19
+ cls._logger.debug(f"Successfully converted '{file_path}'")
20
+ return res
21
+
22
+ finally:
23
+ elasped = time.time() - start
24
+ cls._logger.debug(f"Conversion time spent '{elasped:.2f}' seconds.")
25
+
26
+ @classmethod
27
+ def encoding_convert(cls, file_path: str) -> str:
28
+ cls._logger.debug("Using Encoding for Conversion")
29
+
30
+ def reader(file_path: str) -> str:
31
+ with open(file_path, encoding="utf-8") as f:
32
+ return f.read()
33
+
34
+ return cls._convert(file_path, lambda file_path: reader(file_path))
35
+
36
+ @classmethod
37
+ def pymupdf_convert(cls, file_path: str) -> str:
38
+ cls._logger.debug("Using PyMuPDF for Conversion")
39
+
40
+ def reader(file_path: str) -> str:
41
+ import pymupdf
42
+
43
+ if file_path.endswith((".txt", ".md", ".json", ".html", ".xml")):
44
+ return "".join(page.get_text() for page in pymupdf.open(file_path, filetype="txt"))
45
+ else:
46
+ return "".join(page.get_text() for page in pymupdf.open(file_path))
47
+
48
+ return cls._convert(file_path, lambda file_path: reader(file_path))
49
+
50
+ @classmethod
51
+ def pandas_convert(cls, file_path: str) -> str:
52
+ cls._logger.debug("Using Pandas for Conversion")
53
+
54
+ def reader(file_path: str) -> Any:
55
+ import pandas
56
+
57
+ if file_path.endswith((".xls", ".xlsx")):
58
+ df_x: pandas.DataFrame = pandas.read_excel(file_path)
59
+ return df_x.to_string(index=False)
60
+ elif file_path.endswith(".csv"):
61
+ df_c: pandas.DataFrame = pandas.read_csv(file_path)
62
+ return df_c.to_string(index=False)
63
+ elif file_path.endswith(".json"):
64
+ df_j: pandas.DataFrame = pandas.read_json(file_path)
65
+ return df_j.to_string(index=False)
66
+ elif file_path.endswith(".html"):
67
+ df_h: list[pandas.DataFrame] = pandas.read_html(file_path)
68
+ return "".join(df.to_string(index=False) for df in df_h)
69
+ elif file_path.endswith(".xml"):
70
+ df_m: pandas.DataFrame = pandas.read_xml(file_path)
71
+ return df_m.to_string(index=False)
72
+ else:
73
+ return ""
74
+
75
+ return cls._convert(file_path, lambda file_path: reader(file_path))
@@ -0,0 +1,3 @@
1
+ from .base import Malet
2
+
3
+ __all__ = ["Malet"]
@@ -0,0 +1,43 @@
1
+ from typing import Any, Literal
2
+
3
+ from langchain_text_splitters import TokenTextSplitter
4
+
5
+ from .separate import SeparateSplit
6
+ from .token import TokenSplit
7
+
8
+
9
+ class Malet:
10
+ DEFAULT_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
11
+ DEFAULT_CHUNK_SIZE = 500
12
+ DEFAULT_CHUNK_OVERLAP = 0
13
+ DEFAULT_CHUNK_SEPARATOR = "\n\n"
14
+
15
+ @staticmethod
16
+ def _get_kwarg(kwargs: dict[str, Any], keys: list[str], default: Any = None) -> Any:
17
+ for key in keys:
18
+ if key in kwargs:
19
+ return kwargs[key]
20
+ return default
21
+
22
+ @classmethod
23
+ def chunking(
24
+ cls, text: str, chunk_method: Literal["token", "separate"] | None = "token", **kwargs: Any
25
+ ) -> TokenTextSplitter | SeparateSplit.CharacterSeparator:
26
+ match chunk_method:
27
+ case "token":
28
+ model_name = cls._get_kwarg(kwargs, ["model_name", "ModelName", "modelName"], cls.DEFAULT_MODEL_NAME)
29
+ chunk_size = cls._get_kwarg(kwargs, ["chunk_size", "ChunkSize", "chunkSize"], cls.DEFAULT_CHUNK_SIZE)
30
+ chunk_overlap = cls._get_kwarg(
31
+ kwargs, ["chunk_overlap", "ChunkOverlap", "chunkOverlap"], cls.DEFAULT_CHUNK_OVERLAP
32
+ )
33
+
34
+ method = TokenSplit.splitter(model_name, chunk_size, chunk_overlap)
35
+
36
+ case "separate":
37
+ chunk_separator = cls._get_kwarg(
38
+ kwargs, ["chunk_separator", "ChunkSeparator", "chunkSeparator"], cls.DEFAULT_CHUNK_SEPARATOR
39
+ )
40
+
41
+ method = SeparateSplit.splitter(chunk_separator)
42
+
43
+ return method.split_text(text)
@@ -0,0 +1,35 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+
5
+ from purrfectmeow.meow.kitty import kitty_logger
6
+
7
+
8
+ class SeparateSplit:
9
+ _logger = kitty_logger(__name__)
10
+
11
+ @classmethod
12
+ def splitter(cls, chunk_separator: str) -> CharacterSeparator:
13
+ cls._logger.debug("Initializing separate splitter")
14
+ start = time.time()
15
+
16
+ try:
17
+ splitter = cls.CharacterSeparator(chunk_separator)
18
+
19
+ cls._logger.debug("Separator splitter successfully initialized.")
20
+ return splitter
21
+ except Exception as e:
22
+ cls._logger.exception(f"Failed to initialize separate splitter: {e}")
23
+ raise
24
+ finally:
25
+ elapsed = time.time() - start
26
+ cls._logger.debug(f"Separate splitting completed in {elapsed:.2f} seconds.")
27
+
28
+ class CharacterSeparator:
29
+ def __init__(self, separator: str):
30
+ self.separator = separator
31
+
32
+ def split_text(self, text: str) -> list[str]:
33
+ chunks = [chunk + self.separator for chunk in text.split(self.separator)]
34
+ chunks[-1] = chunks[-1].rstrip(self.separator)
35
+ return chunks
@@ -0,0 +1,47 @@
1
+ import time
2
+
3
+ from langchain_text_splitters import TokenTextSplitter
4
+
5
+ from purrfectmeow.meow.kitty import kitty_logger
6
+
7
+
8
class TokenSplit:
    """Chunking strategy that splits text into fixed-size token windows."""

    _logger = kitty_logger(__name__)

    # Embedding models tokenized via tiktoken's built-in encodings.
    _OPENAI_EMBED_MODEL = {"text-embedding-ada-002", "text-embedding-3-small", "text-embedding-3-large"}
    # OpenAI-equivalent tokenizers mirrored on the HuggingFace hub.
    _OPENAI_HF_MODEL = {"Xenova/text-embedding-ada-002"}
    _HF_MODEL_DIR = ".cache/huggingface/hub/"

    @classmethod
    def splitter(cls, model_name: str, chunk_size: int, chunk_overlap: int) -> TokenTextSplitter:
        """Build a ``TokenTextSplitter`` backed by the tokenizer for ``model_name``.

        Args:
            model_name: Embedding/tokenizer model id. Members of
                ``_OPENAI_EMBED_MODEL`` use tiktoken; anything else is loaded
                as a HuggingFace tokenizer (with a GPT-2 fast path for
                ``_OPENAI_HF_MODEL``).
            chunk_size: Maximum tokens per chunk.
            chunk_overlap: Tokens shared between consecutive chunks.

        Returns:
            A configured ``TokenTextSplitter``.

        Raises:
            Exception: Re-raised after logging if tokenizer setup fails.
        """
        cls._logger.debug("Initializing token splitter")
        start = time.time()

        try:
            if model_name in cls._OPENAI_EMBED_MODEL:
                # BUG FIX: this debug line previously ran *before* the branch
                # check, so HuggingFace models were mislogged as OpenAI
                # tokenizers on every call.
                cls._logger.debug(f"Using OpenAI model tokenizer: {model_name}")
                splitter = TokenTextSplitter.from_tiktoken_encoder(
                    model_name=model_name, chunk_size=chunk_size, chunk_overlap=chunk_overlap
                )
            else:
                cls._logger.debug(f"Using HuggingFace tokenizer: {model_name}")
                # Imported lazily: transformers is heavy and only needed here.
                from transformers import AutoTokenizer, GPT2TokenizerFast

                if model_name in cls._OPENAI_HF_MODEL:
                    tokenizer = GPT2TokenizerFast.from_pretrained(model_name, cache_dir=cls._HF_MODEL_DIR)
                else:
                    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cls._HF_MODEL_DIR)
                splitter = TokenTextSplitter.from_huggingface_tokenizer(
                    tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap
                )

            cls._logger.debug("Token splitter successfully initialized.")
            return splitter

        except Exception as e:
            cls._logger.exception(f"Failed to initialize token splitter: {e}")
            raise

        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Token splitting completed in {elapsed:.2f} seconds.")
@@ -0,0 +1,3 @@
1
+ from .base import WichienMaat
2
+
3
+ __all__ = ["WichienMaat"]
@@ -0,0 +1,14 @@
1
+ from typing import List, Optional
2
+ import numpy
3
+
4
+ from .local import Local
5
+
6
class WichienMaat:
    """Embedding facade: encodes sentences with a local sentence-transformers model."""

    # Multilingual default, suited to the library's Thai/English focus.
    DEFAULT_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

    @classmethod
    def embedding(cls, sentence: str | List[str], model_name: Optional[str] = None) -> numpy.ndarray:
        """Encode one sentence or a list of sentences into embedding vectors.

        Args:
            sentence: A single string or a list of strings to embed.
            model_name: Optional model override; falls back to
                ``DEFAULT_MODEL_NAME`` when ``None`` (or empty).

        Returns:
            ``numpy.ndarray`` of embeddings produced by ``Local.model_encode``.
        """
        # DRY fix: the original duplicated the Local.model_encode call in two
        # branches. `or` mirrors the original `if model_name:` truthiness test,
        # so an empty string also falls back to the default model.
        return Local.model_encode(sentence, model_name or cls.DEFAULT_MODEL_NAME)
@@ -0,0 +1,35 @@
1
+ import time
2
+ from typing import Any
3
+
4
+ import numpy
5
+
6
+ from purrfectmeow.meow.kitty import kitty_logger
7
+
8
+
9
class Local:
    """Runs sentence embedding with a locally cached SentenceTransformer model."""

    _logger = kitty_logger(__name__)
    # Shared HuggingFace cache location (relative to the working directory).
    _HF_MODEL_DIR = ".cache/huggingface/hub/"

    @classmethod
    def model_encode(cls, sentence: str | list[str], model_name: str, **kwargs: Any) -> numpy.ndarray:
        """Encode ``sentence`` with ``model_name`` and return numpy embeddings.

        Args:
            sentence: A single string or a list of strings.
            model_name: SentenceTransformer model id to load from the cache dir.
            **kwargs: Accepted for interface compatibility; currently unused.

        Raises:
            Exception: Re-raised after logging if loading or encoding fails.
        """
        cls._logger.debug("Initializing local model encode")
        start = time.time()
        try:
            # Imported lazily so the heavy dependency is only paid when used.
            from sentence_transformers import SentenceTransformer

            transformer = SentenceTransformer(
                model_name,
                cache_folder=cls._HF_MODEL_DIR,
                # local_files_only=True
            )
            embeddings = transformer.encode(sentence, convert_to_numpy=True)

            cls._logger.debug("Local model encode successfully initialized.")
            return embeddings
        except Exception as e:
            cls._logger.exception(f"Failed to initialize local model encode: {e}")
            raise
        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Local model encode completed in {elapsed:.2f} seconds.")
@@ -0,0 +1,3 @@
1
+ from .base import KhaoManee
2
+
3
+ __all__ = ["KhaoManee"]
@@ -0,0 +1,18 @@
1
+ import numpy
2
+
3
+ from purrfectmeow.meow.chaus import SimilarityResult
4
+ from purrfectmeow.meow.felis import Document
5
+
6
+ from .cosine import CosineSim
7
+
8
+
9
class KhaoManee:
    """Search facade: ranks documents against a query embedding."""

    @classmethod
    def searching(
        cls,
        query_embed: numpy.ndarray,
        sentence_embed: numpy.ndarray | list[numpy.ndarray],
        documents: list[Document],
        top_k: int,
    ) -> list[SimilarityResult]:
        """Return the ``top_k`` best-matching documents for ``query_embed``.

        Delegates the ranking to ``CosineSim.vector_search``; ``documents``
        must be aligned index-for-index with ``sentence_embed``.
        """
        results = CosineSim.vector_search(query_embed, sentence_embed, documents, top_k)
        return results
@@ -0,0 +1,39 @@
1
+ import time
2
+
3
+ import numpy
4
+
5
+ from purrfectmeow.meow.chaus import SimilarityResult
6
+ from purrfectmeow.meow.felis import Document
7
+ from purrfectmeow.meow.kitty import kitty_logger
8
+
9
+
10
class CosineSim:
    """Cosine-similarity ranking over precomputed embeddings."""

    _logger = kitty_logger(__name__)

    @classmethod
    def vector_search(
        cls,
        embed_query: numpy.ndarray,
        embed_sentence: numpy.ndarray | list[numpy.ndarray],
        documents: list[Document],
        top_k: int,
    ) -> list[SimilarityResult]:
        """Return the ``top_k`` documents ranked by cosine similarity to the query.

        Args:
            embed_query: 1-D query embedding.
            embed_sentence: 2-D array (or list of 1-D arrays) of document
                embeddings, aligned index-for-index with ``documents``.
            documents: Candidate documents, one per embedding row.
            top_k: Number of highest-scoring results to return.

        Raises:
            Exception: Re-raised after logging if the similarity computation fails.
        """
        cls._logger.debug("Initializing vector search")
        start = time.time()
        try:
            # Imported lazily to keep scikit-learn off the import path until needed.
            from sklearn.metrics.pairwise import cosine_similarity

            scores = cosine_similarity([embed_query], embed_sentence)[0]
            # Ascending argsort, reversed and truncated: indices of the
            # top_k highest scores, best first.
            ranked = numpy.argsort(scores)[::-1][:top_k]

            return [
                SimilarityResult(score=float(scores[idx]), document=documents[idx]) for idx in ranked
            ]
        except Exception as e:
            cls._logger.exception(f"Failed to initialize vector search: {e}")
            raise
        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Vector search completed in {elapsed:.2f} seconds.")
@@ -0,0 +1,3 @@
1
+ from .base import Kornja
2
+
3
+ __all__ = ["Kornja"]
@@ -0,0 +1,4 @@
1
class Kornja:
    """Generation facade; placeholder with no implementation yet."""

    @classmethod
    def generating(cls) -> None:
        """Placeholder for the text-generation step; currently a no-op."""
        pass
@@ -0,0 +1,114 @@
1
[build-system]
# NOTE(review): consider pinning the backend (e.g. "uv_build>=0.7,<0.9") so
# source builds stay reproducible across uv releases — confirm the supported
# range before changing.
requires = ["uv_build"]
build-backend = "uv_build"

[project]
name = "purrfectkit"
version = "0.2.2"
authors = [{ name = "SUWALUTIONS", email = "suwa@suwalutions.com" }]
maintainers = [{ name = "KHARAPSY", email = "kharapsy@suwalutions.com" }]
description = "**PurrfectKit** is a Python library for effortless Retrieval-Augmented Generation (RAG) workflows."
keywords = ["rag", "nlp", "llms", "python", "ai", "ocr", "document-processing", "multilingual", "text-extraction"]
readme = "README.md"
license = "MIT"
license-files = ["LICEN[CS]E*"]
requires-python = ">=3.10"
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Software Development :: Libraries",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Text Processing :: Linguistic",
    "Topic :: Text Processing :: General",
    "Natural Language :: English",
    "Natural Language :: Thai",
]

dependencies = [
    "python-magic<=0.4.27",
    "sentence_transformers<=5.1.0",
    "transformers<=4.53.0",
    "docling<=2.31.1",
    "markitdown<=0.1.1",
    "pymupdf4llm<=0.0.27",
    "pdf2image<=1.17.0",
    "pytesseract<=0.3.13",
    "easyocr<=1.7.2",
    "surya-ocr<=0.14.0",
    "python-doctr<=1.0.0",
    "pandas<=2.3.2",
    "langchain-text-splitters<=1.0.0",
    "tiktoken<=0.12.0",
]

[project.optional-dependencies]
# Extra names are normalized case-insensitively (PEP 685), so
# `pip install purrfectkit[dev]` and `[Dev]` both resolve here.
Dev = [
    "ruff<=0.6.0",
    "mypy<=1.11.0",
    "pre-commit<=3.8.0",
    "detect-secrets<=1.5.0",
    "codecov-cli<=11.2.4",
]
Docs = [
    "sphinx<=8.2.3",
    "sphinx-rtd-theme<=3.0.2",
]
Test = [
    "pytest<=8.4.2",
    "pytest-cov<=7.0.0",
    "pytest-mock<=3.15.1",
]

[project.urls]
Documentation = "https://suwalutions.github.io/PurrfectKit"
Repository = "https://github.com/SUWALUTIONS/PurrfectKit"
Issues = "https://github.com/SUWALUTIONS/PurrfectKit/issues"

[tool.uv.build-backend]
module-root = ""
module-name = "purrfectmeow"
include = ["LICENSE", "README.md", "pyproject.toml"]
exclude = [
    ".bumpversion.cfg",
    ".cache/",
    ".git/",
    ".github/",
    ".pytest_cache/",
    ".venv/",
    "build/",
    "dist/",
    "docs/",
    "test",
    "tests/",
    ".bumpversion",
    ".dockerignore",
    ".gitignore",
    ".pre-commit-config.yaml",
    ".secrets.baseline",
    "Dockerfile",
    "Makefile",
    "*.txt",
    "*.tar",
    "uv.lock",
]

[tool.ruff]
line-length = 120

# FIX: `select`, `ignore`, and `fixable` previously sat at the `[tool.ruff]`
# top level. Those keys are deprecated since ruff 0.2 — and the pin
# "ruff<=0.6.0" resolves to 0.6.x, which warns on them — so they belong
# under `[tool.ruff.lint]`.
[tool.ruff.lint]
select = ["E", "F", "W", "I", "B", "UP"]
ignore = []
fixable = ["ALL"]

[tool.mypy]
strict = true
ignore_missing_imports = true

[tool.pytest.ini_options]
addopts = "-ra -q"