purrfectkit 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,102 @@
1
+ Metadata-Version: 2.4
2
+ Name: purrfectkit
3
+ Version: 0.2.1
4
+ Summary: **PurrfectKit** is a Python library for effortless Retrieval-Augmented Generation (RAG) workflows.
5
+ Keywords: rag,nlp,llms,python,ai,ocr,document-processing,multilingual,text-extraction
6
+ Author: SUWALUTIONS
7
+ Author-email: SUWALUTIONS <suwa@suwalutions.com>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Software Development :: Libraries
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Text Processing :: Linguistic
23
+ Classifier: Topic :: Text Processing :: General
24
+ Classifier: Natural Language :: English
25
+ Classifier: Natural Language :: Thai
26
+ Requires-Dist: python-magic<=0.4.27
27
+ Requires-Dist: sentence-transformers<=5.1.0
28
+ Requires-Dist: transformers<=4.52.1
29
+ Requires-Dist: docling<=2.31.1
30
+ Requires-Dist: markitdown<=0.1.1
31
+ Requires-Dist: pymupdf4llm<=0.0.27
32
+ Requires-Dist: pdf2image<=1.17.0
33
+ Requires-Dist: pytesseract<=0.3.13
34
+ Requires-Dist: easyocr<=1.7.2
35
+ Requires-Dist: surya-ocr<=0.14.0
36
+ Requires-Dist: python-doctr<=1.0.0
37
+ Requires-Dist: pandas<=2.3.2
38
+ Requires-Dist: langchain-text-splitters<=1.0.0
39
+ Requires-Dist: tiktoken<=0.12.0
40
+ Requires-Dist: sphinx<=8.2.3 ; extra == 'docs'
41
+ Requires-Dist: sphinx-rtd-theme<=3.0.2 ; extra == 'docs'
42
+ Requires-Dist: pytest<=8.4.2 ; extra == 'test'
43
+ Requires-Dist: pytest-mock<=3.15.1 ; extra == 'test'
44
+ Maintainer: KHARAPSY
45
+ Maintainer-email: KHARAPSY <kharapsy@suwalutions.com>
46
+ Requires-Python: >=3.10
47
+ Project-URL: Documentation, https://suwalutions.github.io/PurrfectKit
48
+ Project-URL: Issues, https://github.com/SUWALUTIONS/PurrfectKit/issues
49
+ Project-URL: Repository, https://github.com/SUWALUTIONS/PurrfectKit
50
+ Provides-Extra: dev
51
+ Provides-Extra: docs
52
+ Provides-Extra: test
53
+ Description-Content-Type: text/markdown
54
+
55
+ ![PurrfectMeow Logo](docs/_static/repo-logo.png)
56
+
57
+ # PurrfectKit
58
+
59
+ [![Docker Image](https://github.com/suwalutions/PurrfectKit/actions/workflows/docker-image.yml/badge.svg)](https://github.com/suwalutions/PurrfectKit/actions/workflows/docker-image.yml)
60
+
61
+ **PurrfectKit** is a toolkit that simplifies Retrieval-Augmented Generation (RAG) into 5 easy steps:
62
+ 1. Suphalak - read content from files
63
+ 2. Malet - split content into chunks
64
+ 3. WichienMaat - embed chunks into vectors
65
+ 4. KhaoManee - search vectors with queries
66
+ 5. Kornja - generate answers from vectors
67
+
68
+ > **_NOTE:_** Each step is inspired by a unique Thai cat breed, making the workflow memorable and fun.
69
+
70
+ ## Quickstart
71
+
72
+ ### Prerequisites
73
+ - python
74
+ - tesseract
75
+ - git
76
+
77
+
78
+ ### Installation
79
+ ```bash
80
+ pip install git+https://github.com/suwalutions/PurrfectKit.git
81
+
82
+ ```
83
+
84
+ ### Usage
85
+ ```python
86
+ from purrfectmeow.meow.felis import DocTemplate, MetaFile
87
+ from purrfectmeow import Suphalak, Malet, WichienMaat, KhaoManee
88
+
89
+ file_path = 'test/test.pdf'
90
+ metadata = MetaFile.get_metadata(file_path)
91
+ content = Suphalak.reading(open(file_path, 'rb').read(), 'test.pdf', loader='PYMUPDF')
92
+ chunks = Malet.chunking(content, chunk_method='token', chunk_size='500', chunk_overlap='25')
93
+ docs = DocTemplate.create_template(chunks, metadata)
94
+ embedding = WichienMaat.embedding(chunks)
95
+ query = WichienMaat.embedding("ทดสอบ")
96
+ KhaoManee.searching(query, embedding, docs, 2)
97
+
98
+ ```
99
+
100
+ ## 📄 License
101
+
102
+ PurrfectKit is released under the [MIT License](LICENSE).
@@ -0,0 +1,24 @@
1
+ purrfectmeow/__init__.py,sha256=XEej-s0VH-Up9aob3XcDQqgS55Ftk_qNoXezdcedFJQ,271
2
+ purrfectmeow/meow/felis.py,sha256=8d1kaizsEisr7dW-MKw8HqsYfOkLBGy-sYTv-4kClQ8,6149
3
+ purrfectmeow/meow/kitty.py,sha256=WaLuh2t1PnigWYDNZlbNfCA_uqXnPYc-xxDuZlFfNNY,1971
4
+ purrfectmeow/tc01_spl/__init__.py,sha256=7ENCidvXhj9YhMQvBcv_mm4XIr3Mwzc1USQxgzLO0Nw,51
5
+ purrfectmeow/tc01_spl/base.py,sha256=iuIZiPUe-ofeF_PmknnCg-4NsJxDoH7rj-SMsqNBTAQ,3308
6
+ purrfectmeow/tc01_spl/markdown.py,sha256=AUCSZ-6W0sXbZwGgZfe6utidbEemQGoi6c4rsLiH928,1861
7
+ purrfectmeow/tc01_spl/ocr.py,sha256=A3orLTIVmu2WYJTi4joWlTmV27IDh3MTa7qc7IRAQkE,4784
8
+ purrfectmeow/tc01_spl/simple.py,sha256=dwecYL2sviKz4BoJcOQntAprXACvaEig-ZbDiwTW-cU,2347
9
+ purrfectmeow/tc02_mlt/__init__.py,sha256=qB2Eyc_wFDVELwj0L7ttG_YOL3IISaqPBRj0zqSJcPo,45
10
+ purrfectmeow/tc02_mlt/base.py,sha256=cz1qFo1AdL-I2wnBPO06MhcYSQh90tcLCN99phIUKWw,1508
11
+ purrfectmeow/tc02_mlt/separate.py,sha256=YQSnC5BODg1cJh4JrPkT_-tO1CbwgpxuCjMvHQwRUNE,1074
12
+ purrfectmeow/tc02_mlt/token.py,sha256=qULVySiTAbDoBQrtWWuvPkO5Zqf5hjRutN1Q7foCwUU,2052
13
+ purrfectmeow/tc03_wcm/__init__.py,sha256=8pXGo04Z5KUNGkhSTONLBlqwVc43LicDGSuQiQDIKIM,57
14
+ purrfectmeow/tc03_wcm/base.py,sha256=pXaaiU8JMLIjI5uJRxMLRnQ1Wmwv3U6EEkQ_IwhPLwg,473
15
+ purrfectmeow/tc03_wcm/local.py,sha256=5AfVSftW_cfaZBZBe-joSMJRRJ55G0g5lf9Qtcl0LUw,1074
16
+ purrfectmeow/tc04_kmn/__init__.py,sha256=FBHZKVu4agf6-p1MdMx0jIgQuKbAy9rsOu7MRIQVwXg,53
17
+ purrfectmeow/tc04_kmn/base.py,sha256=rj3Ar2Pv8VOL7vKvPB-snif8SRwBbGaLbWIpHFpd5b8,224
18
+ purrfectmeow/tc04_kmn/cosine.py,sha256=DaDXVcy6YyNc5jwtPXeQg040FT7607phyt5Ub74E9aw,1147
19
+ purrfectmeow/tc05_knj/__init__.py,sha256=XKwISvOAznPdTUWoTUnFDMBmxZF9Qd6FAi711W6bvZY,47
20
+ purrfectmeow/tc05_knj/base.py,sha256=qN1VCx20G5H7YHcVzmg0YNXMLZM7TPkiD_UMEZykfjE,70
21
+ purrfectkit-0.2.1.dist-info/licenses/LICENSE,sha256=9WlLgfJwKDGb71B1NwKYKKg6uL5u_knAr7ovGwIWvD4,1078
22
+ purrfectkit-0.2.1.dist-info/WHEEL,sha256=5w2T7AS2mz1-rW9CNagNYWRCaB0iQqBMYLwKdlgiR4Q,78
23
+ purrfectkit-0.2.1.dist-info/METADATA,sha256=cSe3NLmt6D8LaZSpilNU1c3G9k0P5XGThncqp6K2Crk,3765
24
+ purrfectkit-0.2.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.9.7
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 SUWALUTIONS CO., LTD.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,15 @@
1
+ from .tc01_spl import Suphalak
2
+ from .tc02_mlt import Malet
3
+ from .tc03_wcm import WichienMaat
4
+ from .tc04_kmn import KhaoManee
5
+ from .tc05_knj import Kornja
6
+
7
+ __all__ = [
8
+ "Suphalak",
9
+ "Malet",
10
+ "WichienMaat",
11
+ "KhaoManee",
12
+ "Kornja",
13
+ ]
14
+
15
+ __version__ = "0.2.1"
@@ -0,0 +1,171 @@
1
+ from typing import Any, Dict, List, Union
2
+ from io import BytesIO
3
+
4
class Document:
    """Lightweight container pairing a text chunk with its metadata dict."""

    def __init__(self, page_content: str, metadata: Dict[str, Any]):
        # Falsy metadata (None, {}) is normalized to an empty dict.
        self.page_content = page_content
        self.metadata = metadata or {}

    def __repr__(self):
        cls_name = self.__class__.__name__
        return f"{cls_name}(page_content={self.page_content!r}, metadata={self.metadata!r})"

    def __getitem__(self, key):
        # Only the two public fields are addressable by key.
        try:
            return {"page_content": self.page_content, "metadata": self.metadata}[key]
        except KeyError:
            raise KeyError(f"{key} is not a valid key. Use 'page_content' or 'metadata'.") from None

    def to_dict(self):
        """Return a plain-dict view of this document."""
        return {"page_content": self.page_content, "metadata": self.metadata}
25
+
26
+ class DocTemplate:
27
+ @staticmethod
28
+ def create_template(chunks: List[str], metadata: Dict[str, Any]) -> List[Document]:
29
+ if not isinstance(chunks, list):
30
+ raise TypeError(f"Expected 'chunks' to be a list, but got {type(chunks).__name__}.")
31
+
32
+ if not isinstance(metadata, dict):
33
+ raise TypeError(f"Expected 'metadata' to be a dict, but got {type(metadata).__name__}.")
34
+
35
+ if not all(isinstance(c, str) for c in chunks):
36
+ raise ValueError("All elements in 'chunks' must be strings.")
37
+
38
+ docs = []
39
+ chunk_hashes = []
40
+
41
+ import uuid
42
+ import hashlib
43
+
44
+ for idx, chunk in enumerate(chunks):
45
+ hash_val = hashlib.md5(chunk.encode()).hexdigest()
46
+ chunk_hashes.append(hash_val)
47
+
48
+ for idx, chunk in enumerate(chunks):
49
+ chunk_number = idx + 1
50
+ chunk_id = uuid.uuid4().hex
51
+ chunk_hash = chunk_hashes[idx]
52
+ prev_hash = chunk_hashes[idx - 1] if idx > 0 else None
53
+ next_hash = chunk_hashes[idx + 1] if idx < len(chunks) - 1 else None
54
+ chunk_size = len(chunk)
55
+
56
+ chunk_info = {
57
+ "chunk_number": chunk_number,
58
+ "chunk_id": chunk_id,
59
+ "chunk_hash": chunk_hash,
60
+ "previous_chunk_hash": prev_hash,
61
+ "next_chunk_hash": next_hash,
62
+ "chunk_size": chunk_size,
63
+ }
64
+
65
+ doc_metadata = {
66
+ "chunk_info": chunk_info,
67
+ "source_info": metadata
68
+ }
69
+
70
+ doc = Document(
71
+ page_content=chunk,
72
+ metadata=doc_metadata
73
+ )
74
+ docs.append(doc)
75
+
76
+ return docs
77
+
78
class MetaFile:
    """Extracts filesystem/MIME metadata for a file given as a path, bytes, or BytesIO."""

    @staticmethod
    def get_metadata(file: Union[str, bytes, BytesIO], **kwargs: Any) -> Dict[str, Union[str, int]]:
        """
        Return metadata for *file*.

        Args:
            file: Path string, raw bytes, or a BytesIO buffer.
                (Annotation fixed: bytes was accepted by the code but missing
                from the declared type.)
            **kwargs: 'file_name' is required when *file* is bytes/BytesIO,
                since the content must be spilled to a temp file on disk.

        Raises:
            ValueError: Bytes/BytesIO input without a 'file_name'.
            TypeError: Unsupported input type.
            RuntimeError: Metadata extraction failed.
        """
        if isinstance(file, bytes):
            file = BytesIO(file)

        if isinstance(file, BytesIO):
            import os

            os.makedirs(".cache/tmp", exist_ok=True)
            file_name = kwargs.get('file_name')

            if not file_name:
                raise ValueError("file_name must be provided when using BytesIO.")

            # Spill the buffer to disk so path-based tools (magic, pdfinfo) can run.
            file_path = os.path.join(".cache/tmp", file_name)
            with open(file_path, 'wb') as f:
                f.write(file.getvalue())

            try:
                return MetaFile._get_metadata_from_path(file_path)
            finally:
                os.remove(file_path)

        elif isinstance(file, str):
            return MetaFile._get_metadata_from_path(file)

        else:
            raise TypeError(f"Unsupported file type: {type(file).__name__}. Expected str, bytes, or BytesIO.")

    @staticmethod
    def _get_metadata_from_path(file_path: str) -> Dict[str, Union[str, int]]:
        """Collect stat, MIME, page-count and MD5 metadata for a file on disk."""
        metadata = {}

        import os
        import re
        import time
        import magic
        import hashlib
        import subprocess

        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File {file_path} does not exist")

            stats = os.stat(file_path)
            metadata["file_name"] = os.path.basename(file_path)
            metadata["file_size"] = stats.st_size
            metadata["file_created_date"] = time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(stats.st_ctime)
            )
            metadata["file_modified_date"] = time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(stats.st_mtime)
            )
            metadata["file_extension"] = os.path.splitext(file_path)[1] or "none"

            # MIME detection is best-effort: fall back to "unknown" rather than fail.
            try:
                mime = magic.Magic(mime=True)
                metadata["file_type"] = mime.from_file(file_path)
                metadata["description"] = magic.from_file(file_path)
            except Exception as e:
                metadata["file_type"] = "unknown"
                metadata["description"] = f"Could not determine file type: {str(e)}"

            if metadata["file_type"].startswith("image/"):
                metadata["total_pages"] = 1
            elif metadata["file_type"].startswith("application/pdf"):
                # Page count comes from the external 'pdfinfo' tool when available.
                try:
                    result = subprocess.run(
                        ['pdfinfo', file_path],
                        stdout=subprocess.PIPE,
                        text=True,
                        check=True
                    )
                    pages_match = re.search(r"Pages:\s*(\d+)", result.stdout)
                    if pages_match:
                        metadata["total_pages"] = int(pages_match.group(1))
                    else:
                        metadata["total_pages"] = "Unknown (could not parse page count)"
                except (subprocess.CalledProcessError, FileNotFoundError):
                    metadata["total_pages"] = "Unknown (pdfinfo not installed or failed)"
            else:
                metadata["total_pages"] = 1

            # Stream in 4 KiB chunks so large files are not loaded into memory.
            with open(file_path, "rb") as f:
                hash_md5 = hashlib.md5()
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)
            metadata["file_md5"] = hash_md5.hexdigest()

            return metadata

        except Exception as e:
            # Chain the original cause instead of discarding it.
            raise RuntimeError(f"Failed to extract metadata: {str(e)}") from e
@@ -0,0 +1,52 @@
1
+ import logging
2
+ from logging.handlers import RotatingFileHandler
3
+ from pathlib import Path
4
+
5
class LevelBasedFormatter(logging.Formatter):
    """Formatter that renders INFO records with a shorter layout than other levels."""

    def __init__(self, default_fmt, info_fmt, datefmt=None):
        super().__init__(datefmt=datefmt)
        # Pre-build one delegate formatter per layout.
        self.default_fmt = logging.Formatter(default_fmt, datefmt)
        self.info_fmt = logging.Formatter(info_fmt, datefmt)

    def format(self, record):
        chosen = self.info_fmt if record.levelno == logging.INFO else self.default_fmt
        return chosen.format(record)

def kitty_logger(name: str, log_file: str = "kitty.log", log_level: str = "INFO") -> logging.Logger:
    """
    Sets up a logger with console and rotating file handlers.

    Args:
        name (str): Name of the logger (usually __name__ of the calling module).
        log_file (str): Path to the log file. Defaults to 'kitty.log'.
        log_level (str): Logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'). Defaults to 'INFO'.

    Returns:
        logging.Logger: Configured logger instance.
    """
    logger = logging.getLogger(name)
    logger.setLevel(getattr(logging, log_level.upper(), logging.INFO))

    # Handlers are attached only on the first call for a given name;
    # later calls just re-apply the level and reuse the same logger.
    if logger.handlers:
        return logger

    fmt_default = "PurrfectKit | %(asctime)s [%(levelname)s] %(name)s:%(lineno)d - %(message)s"
    fmt_info = "PurrfectKit | %(asctime)s [%(levelname)s] - %(message)s"
    formatter = LevelBasedFormatter(fmt_default, fmt_info, "%Y-%m-%d %H:%M:%S")

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    log_dir = Path(".cache/logs")
    log_dir.mkdir(parents=True, exist_ok=True)

    rotating_handler = RotatingFileHandler(
        log_dir / log_file, maxBytes=5 * 1024 * 1024, backupCount=3
    )
    rotating_handler.setFormatter(formatter)
    logger.addHandler(rotating_handler)

    return logger
@@ -0,0 +1,3 @@
1
+ from .base import Suphalak
2
+
3
+ __all__ = ['Suphalak']
@@ -0,0 +1,107 @@
1
+ from typing import Dict, BinaryIO, Any
2
+
3
+ from .markdown import Markdown
4
+ from .ocr import Ocr
5
+ from .simple import Simple
6
+
7
class Suphalak:
    """File-reading step: dispatches a file to the right conversion backend."""

    # Temp directory where incoming content is spilled before conversion.
    tmp_dir = '.cache/tmp'
    DEFAULT_LOADER = "PYMUPDF4LLM"

    # Loader name -> conversion function and the extensions it supports.
    _LOADERS: Dict[str, Dict[str, Any]] = {
        "MARKITDOWN": {
            "func": Markdown.markitdown_convert,
            "ext": ("csv", "docx", "md", "pdf", "pptx", "txt", "xls", "xlsx"),
        },
        "DOCLING": {
            "func": Markdown.docling_convert,
            "ext": ("csv", "docx", "jpg", "md", "pdf", "png", "pptx", "xlsx"),
        },
        "PYMUPDF4LLM": {
            "func": Markdown.pymupdf4llm_convert,
            "ext": ("docx", "pdf", "pptx", "txt", "xlsx"),
        },
        "PYTESSERACT": {
            "func": Ocr.pytesseract_convert,
            "ext": ("gif", "jpg", "pdf", "png"),
        },
        "EASYOCR": {
            "func": Ocr.easyocr_convert,
            "ext": ("gif", "jpg", "pdf", "png"),
        },
        "SURYAOCR": {
            "func": Ocr.suryaocr_convert,
            "ext": ("gif", "jpg", "pdf", "png"),
        },
        "DOCTR": {
            "func": Ocr.doctr_convert,
            "ext": ("gif", "jpg", "pdf", "png"),
        },
        "PYMUPDF": {
            "func": Simple.pymupdf_convert,
            "ext": ("docx", "md", "pdf", "pptx", "xlsx"),
        },
        "PANDAS": {
            "func": Simple.pandas_convert,
            "ext": ("csv", "xls", "xlsx"),
        },
        "ENCODING": {
            "func": Simple.encoding_convert,
            "ext": ("csv", "md", "txt"),
        },
    }

    @classmethod
    def _detect_loader(cls, file_ext: str) -> str:
        """Pick a loader for *file_ext* by priority; fall back to DEFAULT_LOADER."""
        priority = [
            ("PANDAS", ("csv", "xls")),
            ("PYTESSERACT", ("jpg", "png", "gif")),
            ("PYMUPDF", ("pdf", "md")),
            ("PYMUPDF4LLM", ("txt", "xlsx", "pptx", "docx")),
        ]

        for loader, extensions in priority:
            if file_ext in extensions:
                return loader

        return cls.DEFAULT_LOADER

    @classmethod
    def reading(cls, file: BinaryIO, file_name: str, loader: str = None, **kwargs: Any) -> str:
        """
        Read *file* and return its extracted text content.

        Args:
            file: A binary file-like object, or raw bytes.
            file_name: Original file name; its extension drives loader choice.
            loader: Optional key from _LOADERS; auto-detected when omitted.
            **kwargs: Passed through to the loader function.

        Raises:
            ValueError: Unknown loader name.
            TypeError: Extension not supported by the selected loader.
        """
        import os
        file_ext = file_name.split(".")[-1].lower()

        if not loader:
            loader = cls._detect_loader(file_ext)

        if loader not in cls._LOADERS:
            raise ValueError(f"Unsupported loader: '{loader}'")

        loader_conf = cls._LOADERS[loader]
        supported_ext = loader_conf["ext"]

        if file_ext not in supported_ext:
            raise TypeError(f"'{file_ext}' is not supported for '{loader}' loader.")

        os.makedirs(cls.tmp_dir, exist_ok=True)
        file_path = os.path.join(cls.tmp_dir, file_name)

        # BUG FIX: the README passes raw bytes (open(...).read()), which has no
        # .read() method; accept both bytes and file-like objects.
        data = file.read() if hasattr(file, "read") else file

        try:
            with open(file_path, "wb") as f:
                f.write(data)

            text = loader_conf["func"](file_path, **kwargs)

            # Scanned PDFs often yield no text from non-OCR loaders; fall back
            # to Tesseract OCR in that case.
            if (
                file_ext == "pdf"
                and (not text or not str(text).strip())
                and loader not in ("PYTESSERACT", "EASYOCR", "SURYAOCR", "DOCTR")
            ):
                ocr_loader = cls._LOADERS["PYTESSERACT"]
                text = ocr_loader["func"](file_path, **kwargs)

            return text

        finally:
            if os.path.exists(file_path):
                os.remove(file_path)
@@ -0,0 +1,57 @@
1
+ import time
2
+ from typing import Callable
3
+
4
+ from purrfectmeow.meow.kitty import kitty_logger
5
+
6
class Markdown:
    """Converters that render documents to markdown/text via third-party libraries."""

    _logger = kitty_logger(__name__)

    @classmethod
    def _convert(cls, file_path: str, converter: Callable, extractor: Callable) -> str:
        """Run converter.convert(file_path), extract text with *extractor*, log timing."""
        cls._logger.debug(f"Starting conversion for '{file_path}'")
        start = time.time()
        try:
            content = converter.convert(file_path)
            result = extractor(content)

            # FIX: message previously misspelled "Succesfully".
            cls._logger.debug(f"Successfully converted '{file_path}'")

            return result

        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Conversion time spent '{elapsed:.2f}' seconds.")

    @classmethod
    def markitdown_convert(cls, file_path: str) -> str:
        """Convert *file_path* to text using MarkItDown."""
        cls._logger.debug("Using MarkItDown for Conversion")

        from markitdown import MarkItDown

        return cls._convert(file_path, MarkItDown(), lambda content: content.text_content)

    @classmethod
    def docling_convert(cls, file_path: str) -> str:
        """Convert *file_path* to markdown using Docling."""
        cls._logger.debug("Using Docling for Conversion")

        from docling.document_converter import DocumentConverter

        return cls._convert(file_path, DocumentConverter(), lambda content: content.document.export_to_markdown())

    @classmethod
    def pymupdf4llm_convert(cls, file_path: str) -> str:
        """Convert *file_path* to markdown using PyMuPDF4LLM.

        pymupdf4llm exposes a function (not a .convert object), so timing is
        handled inline instead of via _convert.
        """
        cls._logger.debug("Using PyMuPDF4LLM for Conversion")
        cls._logger.debug(f"Starting conversion for '{file_path}'")
        start = time.time()

        import pymupdf4llm

        try:
            res = pymupdf4llm.to_markdown(file_path)
            # FIX: message previously misspelled "Succesfully".
            cls._logger.debug(f"Successfully converted '{file_path}'")

            return res
        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Conversion time spent '{elapsed:.2f}' seconds.")
@@ -0,0 +1,141 @@
1
+ import time
2
+ from typing import Callable
3
+
4
+ from purrfectmeow.meow.kitty import kitty_logger
5
+
6
+ class Ocr:
7
+
8
+ _logger = kitty_logger(__name__)
9
+ _image_type = [
10
+ ".apng", ".png",
11
+ ".avif",
12
+ ".gif",
13
+ ".jpg", ".jpeg", ".jfif", ".pjpeg", ".pjp",
14
+ ".png",
15
+ ".svg",
16
+ ".webp",
17
+ ".bmp",
18
+ ".ico", ".cur",
19
+ ".tif", ".tiff"
20
+ ]
21
+
22
+ @classmethod
23
+ def _convert(cls, file_path: str, converter: Callable) -> str:
24
+ cls._logger.debug(f"Starting conversion for '{file_path}'")
25
+ start = time.time()
26
+
27
+ try:
28
+ content = []
29
+ match file_path.lower():
30
+ case path if path.endswith(".pdf"):
31
+
32
+ from pdf2image import convert_from_path
33
+
34
+ images = convert_from_path(file_path, fmt="png")
35
+ for idx, image in enumerate(images):
36
+ try:
37
+ text = converter(image)
38
+ cls._logger.debug(f"Text: {text}")
39
+ content.append(text)
40
+ cls._logger.debug(f"Page {idx+1} processed")
41
+ except Exception as e:
42
+ cls._logger.exception(f"Page {idx+1} failed: {e}")
43
+ raise
44
+ case path if path.endswith(tuple(cls._image_type)):
45
+
46
+ from PIL import Image
47
+
48
+ image = Image.open(file_path)
49
+ try:
50
+ text = converter(image)
51
+ cls._logger.debug(f"Text: {text}")
52
+ content.append(text)
53
+ cls._logger.debug("Page 1 processed")
54
+ except Exception as e:
55
+ cls._logger.debug(f"Page 1 failed: {e}")
56
+ raise
57
+
58
+ cls._logger.debug(f"Successfully converted '{file_path}'")
59
+ return "\n".join(content)
60
+
61
+ finally:
62
+ elasped = time.time() - start
63
+ cls._logger.debug(f"Conversion time spent '{elasped:.2f}' seconds.")
64
+
65
+ @classmethod
66
+ def pytesseract_convert(cls, file_path: str) -> str:
67
+ cls._logger.debug("Using PyTesseract for Conversion")
68
+
69
+ def converter(image):
70
+ import pytesseract
71
+
72
+ return pytesseract.image_to_string(image, lang="tha+eng")
73
+
74
+ return cls._convert(file_path, converter)
75
+
76
+ @classmethod
77
+ def easyocr_convert(cls, file_path: str) -> str:
78
+ cls._logger.debug("Using EasyOCR for Conversion")
79
+
80
+ def converter(image):
81
+ import easyocr
82
+ import numpy
83
+
84
+ reader = easyocr.Reader(
85
+ ['th', 'en'],
86
+ gpu=False
87
+ )
88
+ res = reader.readtext(numpy.array(image))
89
+ return "\n".join(text for _, text, _ in res)
90
+ return cls._convert(file_path, converter)
91
+
92
+ @classmethod
93
+ def suryaocr_convert(cls, file_path: str) -> str:
94
+ cls._logger.debug("Using SuryaOCR for Conversion")
95
+
96
+ def converter(image):
97
+ from surya.recognition import RecognitionPredictor
98
+ from surya.detection import DetectionPredictor
99
+
100
+ rec_pred = RecognitionPredictor()
101
+ det_pred = DetectionPredictor()
102
+
103
+ prediction = rec_pred(
104
+ [image],
105
+ det_predictor=det_pred,
106
+ detection_batch_size=1,
107
+ recognition_batch_size=1,
108
+ )
109
+ return "\n".join(line.text for line in prediction[0].text_lines)
110
+ return cls._convert(file_path, converter)
111
+
112
+ @classmethod
113
+ def doctr_convert(cls, file_path: str) -> str:
114
+ cls._logger.debug("Using docTR for Conversion")
115
+
116
+ def converter(image):
117
+ import os
118
+ import tempfile
119
+ from doctr.io import DocumentFile
120
+ from doctr.models import ocr_predictor
121
+
122
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
123
+ image.save(tmp.name)
124
+ temp_image_path = tmp.name
125
+
126
+ model = ocr_predictor(pretrained=True)
127
+ doc = DocumentFile.from_images(temp_image_path)
128
+ result = model(doc)
129
+ data = result.export()
130
+ combined_text = "\n".join(
131
+ word["value"]
132
+ for page in data["pages"]
133
+ for block in page.get('blocks', [])
134
+ for line in block.get('lines', [])
135
+ for word in line.get('words', [])
136
+ if "value" in word
137
+ )
138
+ if os.path.exists(temp_image_path):
139
+ os.remove(temp_image_path)
140
+ return combined_text
141
+ return cls._convert(file_path, converter)
@@ -0,0 +1,64 @@
1
+ import time
2
+ from typing import Callable
3
+
4
+ from purrfectmeow.meow.kitty import kitty_logger
5
+
6
class Simple:
    """Plain-text converters using UTF-8 reads, PyMuPDF, or pandas."""

    _logger = kitty_logger(__name__)

    @classmethod
    def _convert(cls, file_path: str, converter: Callable) -> str:
        """Run *converter* on *file_path* with debug logging and timing."""
        cls._logger.debug(f"Starting conversion for '{file_path}'")
        start = time.time()

        try:
            res = converter(file_path)

            cls._logger.debug(f"Successfully converted '{file_path}'")
            return res

        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Conversion time spent '{elapsed:.2f}' seconds.")

    @classmethod
    def encoding_convert(cls, file_path: str) -> str:
        """Read the file as UTF-8 text."""
        cls._logger.debug("Using Encoding for Conversion")

        def reader(file_path):
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        return cls._convert(file_path, reader)

    @classmethod
    def pymupdf_convert(cls, file_path: str) -> str:
        """Extract text with PyMuPDF; text-ish files are opened with filetype='txt'."""
        cls._logger.debug("Using PyMuPDF for Conversion")

        def reader(file_path):
            import pymupdf

            if file_path.endswith(('.txt', '.md', '.json', '.html', '.xml')):
                return "".join(page.get_text() for page in pymupdf.open(file_path, filetype="txt"))
            else:
                return "".join(page.get_text() for page in pymupdf.open(file_path))
        return cls._convert(file_path, reader)

    @classmethod
    def pandas_convert(cls, file_path: str) -> str:
        """Render a tabular file to a string via pandas."""
        cls._logger.debug("Using Pandas for Conversion")

        def reader(file_path):
            import pandas

            if file_path.endswith(('.xls', '.xlsx')):
                return pandas.read_excel(file_path).to_string(index=False)
            elif file_path.endswith('.csv'):
                return pandas.read_csv(file_path).to_string(index=False)
            elif file_path.endswith('.json'):
                return pandas.read_json(file_path).to_string(index=False)
            elif file_path.endswith('.html'):
                # BUG FIX: read_html returns a list of DataFrames; render each
                # and join so the declared -> str contract actually holds.
                tables = pandas.read_html(file_path)
                return "\n\n".join(t.to_string(index=False) for t in tables)
            elif file_path.endswith('.xml'):
                return pandas.read_xml(file_path).to_string(index=False)
            # Unsupported extensions fall through to None, matching the
            # original behaviour relied on by callers checking falsy text.
        return cls._convert(file_path, reader)
@@ -0,0 +1,3 @@
1
+ from .base import Malet
2
+
3
+ __all__ = ["Malet"]
@@ -0,0 +1,34 @@
1
+ from typing import Any, List, Literal, Optional
2
+
3
+ from .token import TokenSplit
4
+ from .separate import SeparateSplit
5
+
6
+ class Malet:
7
+ DEFAULT_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
8
+ DEFAULT_CHUNK_SIZE = 500
9
+ DEFAULT_CHUNK_OVERLAP = 0
10
+ DEFAULT_CHUNK_SEPARATOR = '\n\n'
11
+
12
+ @staticmethod
13
+ def _get_kwarg(kwargs: dict, keys: List[str], default: Any = None) -> Any:
14
+ for key in keys:
15
+ if key in kwargs:
16
+ return kwargs[key]
17
+ return default
18
+
19
+ @classmethod
20
+ def chunking(cls, text: str, chunk_method: Optional[Literal["token", "separate"]] = "token", **kwargs: Any) -> List[str]:
21
+ match chunk_method:
22
+ case "token":
23
+ model_name = cls._get_kwarg(kwargs, ["model_name", "ModelName", "modelName"], cls.DEFAULT_MODEL_NAME)
24
+ chunk_size = cls._get_kwarg(kwargs, ["chunk_size", "ChunkSize", "chunkSize"], cls.DEFAULT_CHUNK_SIZE)
25
+ chunk_overlap = cls._get_kwarg(kwargs, ["chunk_overlap", "ChunkOverlap", "chunkOverlap"], cls.DEFAULT_CHUNK_OVERLAP)
26
+
27
+ method = TokenSplit.splitter(model_name, chunk_size, chunk_overlap)
28
+ return method.split_text(text)
29
+
30
+ case "separate":
31
+ chunk_separator = cls._get_kwarg(kwargs, ["chunk_separator", "ChunkSeparator", "chunkSeparator"], cls.DEFAULT_CHUNK_SEPARATOR)
32
+
33
+ method = SeparateSplit.splitter(chunk_separator)
34
+ return method.split_text(text)
@@ -0,0 +1,32 @@
1
+ import time
2
+
3
+ from purrfectmeow.meow.kitty import kitty_logger
4
+
5
class SeparateSplit:
    """Builds a simple splitter that cuts text on a fixed separator string."""

    _logger = kitty_logger(__name__)

    @classmethod
    def splitter(cls, chunk_separator: str):
        """Return a CharacterSeparator configured with *chunk_separator*."""
        cls._logger.debug("Initializing separate splitter")
        start = time.time()

        try:
            splitter = cls.CharacterSeparator(chunk_separator)

            cls._logger.debug("Separator splitter successfully initialized.")
            return splitter
        except Exception as e:
            cls._logger.exception(f"Failed to initialize separate splitter: {e}")
            raise
        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Separate splitting completed in {elapsed:.2f} seconds.")

    class CharacterSeparator:
        """Splits on *separator*, keeping it at the end of every chunk but the last."""

        def __init__(self, separator: str):
            self.separator = separator

        def split_text(self, text: str):
            chunks = [chunk + self.separator for chunk in text.split(self.separator)]
            # BUG FIX: rstrip(separator) stripped any run of characters from the
            # separator's character set (corrupting the last chunk when it ended
            # with such characters); removesuffix drops exactly one separator.
            chunks[-1] = chunks[-1].removesuffix(self.separator)
            return chunks
@@ -0,0 +1,55 @@
1
+ import time
2
+
3
+ from purrfectmeow.meow.kitty import kitty_logger
4
+
5
class TokenSplit:
    """Builds token-based text splitters backed by OpenAI or HuggingFace tokenizers."""

    _logger = kitty_logger(__name__)

    # Models tokenized directly via tiktoken.
    _OPENAI_EMBED_MODEL = {
        'text-embedding-ada-002',
        'text-embedding-3-small',
        'text-embedding-3-large'
    }
    # HuggingFace-hosted ports of OpenAI tokenizers (need GPT2TokenizerFast).
    _OPENAI_HF_MODEL = {
        'Xenova/text-embedding-ada-002'
    }
    _HF_MODEL_DIR = '.cache/huggingface/hub/'

    @classmethod
    def splitter(cls, model_name: str, chunk_size: int, chunk_overlap: int):
        """
        Build a TokenTextSplitter for *model_name* with the given chunking params.

        Raises:
            Exception: Re-raised after logging if tokenizer/splitter setup fails.
        """
        cls._logger.debug("Initializing token splitter")
        start = time.time()

        try:
            from langchain_text_splitters import TokenTextSplitter
            if model_name in cls._OPENAI_EMBED_MODEL:
                # BUG FIX: this message used to be logged unconditionally,
                # before the branch check, so it also fired for HF models.
                cls._logger.debug(f"Using OpenAI model tokenizer: {model_name}")
                splitter = TokenTextSplitter.from_tiktoken_encoder(
                    model_name=model_name,
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap
                )
            else:
                cls._logger.debug(f"Using HuggingFace tokenizer: {model_name}")
                from transformers import AutoTokenizer, GPT2TokenizerFast
                if model_name in cls._OPENAI_HF_MODEL:
                    tokenizer = GPT2TokenizerFast.from_pretrained(model_name, cache_dir=cls._HF_MODEL_DIR)
                else:
                    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cls._HF_MODEL_DIR)
                splitter = TokenTextSplitter.from_huggingface_tokenizer(
                    tokenizer=tokenizer,
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap
                )

            cls._logger.debug("Token splitter successfully initialized.")
            return splitter

        except Exception as e:
            cls._logger.exception(f"Failed to initialize token splitter: {e}")
            raise

        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Token splitting completed in {elapsed:.2f} seconds.")
55
+
@@ -0,0 +1,3 @@
1
+ from .base import WichienMaat
2
+
3
+ __all__ = ["WichienMaat"]
@@ -0,0 +1,14 @@
1
+ from typing import List, Optional
2
+ import numpy
3
+
4
+ from .local import Local
5
+
6
class WichienMaat:
    """Embedding step: encodes text into vectors with a local sentence-transformer."""

    DEFAULT_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

    @classmethod
    def embedding(cls, sentence: str | List[str], model_name: Optional[str] = None) -> numpy.ndarray:
        """Encode *sentence* (one string or a list); falls back to the default model."""
        chosen_model = model_name or cls.DEFAULT_MODEL_NAME
        return Local.model_encode(sentence, chosen_model)
@@ -0,0 +1,33 @@
1
+ import time
2
+ from typing import List
3
+
4
+ from purrfectmeow.meow.kitty import kitty_logger
5
+
6
class Local:
    """Encodes sentences with a locally cached SentenceTransformer model."""

    _logger = kitty_logger(__name__)

    # Models are downloaded/cached under the project-local HF hub directory.
    _HF_MODEL_DIR = '.cache/huggingface/hub/'

    @classmethod
    def model_encode(cls, sentence: str | List[str], model_name: str, **kwargs):
        """Encode *sentence* (one string or a list) into numpy embeddings."""
        cls._logger.debug("Initializing local model encode")
        start = time.time()
        try:
            from sentence_transformers import SentenceTransformer

            encoder = SentenceTransformer(
                model_name,
                cache_folder=cls._HF_MODEL_DIR,
            )
            vectors = encoder.encode(sentence, convert_to_numpy=True)

            cls._logger.debug("Local model encode successfully initialized.")
            return vectors
        except Exception as e:
            cls._logger.exception(f"Failed to initialize local model encode: {e}")
            raise
        finally:
            cls._logger.debug(f"Local model encode completed in {time.time() - start:.2f} seconds.")
@@ -0,0 +1,3 @@
1
+ from .base import KhaoManee
2
+
3
+ __all__ = ["KhaoManee"]
@@ -0,0 +1,8 @@
1
+
2
+ from .cosine import ConsineSim
3
class KhaoManee:
    """Vector-search step: ranks documents by cosine similarity to a query."""

    @classmethod
    def searching(cls, query_embed, sentence_embed, document, top_k):
        """Delegate to ConsineSim.vector_search; returns the top_k scored documents."""
        return ConsineSim.vector_search(query_embed, sentence_embed, document, top_k)
@@ -0,0 +1,40 @@
1
+ import time
2
+ from typing import List
3
+
4
+ import numpy
5
+ from purrfectmeow.meow.felis import Document
6
+
7
+ from purrfectmeow.meow.kitty import kitty_logger
8
+
9
class ConsineSim:
    """Cosine-similarity search over precomputed sentence embeddings."""

    _logger = kitty_logger(__name__)

    @classmethod
    def vector_search(
        cls,
        embed_query: numpy.ndarray,
        embed_sentence: numpy.ndarray | List[numpy.ndarray],
        document: List[Document],
        top_k: int
    ):
        """
        Return the *top_k* documents most similar to the query.

        Args:
            embed_query: 1-D query embedding.
            embed_sentence: 2-D array (or list of 1-D arrays) of document
                embeddings, row-aligned with *document*.
            document: Documents aligned with the rows of *embed_sentence*.
                (FIX: was annotated as a single Document, but it is indexed
                with integer positions and must be a sequence — a lone
                Document's __getitem__ rejects integer keys.)
            top_k: Number of results to return.

        Returns:
            List of {"score": float, "document": Document} dicts, best first.
        """
        cls._logger.debug("Initializing vector search")
        start = time.time()
        try:
            from sklearn.metrics.pairwise import cosine_similarity

            sims = cosine_similarity([embed_query], embed_sentence)[0]
            # argsort ascending, reversed -> indices of the highest scores first.
            top_indices = numpy.argsort(sims)[::-1][:top_k]

            return [{
                "score": float(sims[i]),
                "document": document[i]
            } for i in top_indices]
        except Exception as e:
            cls._logger.exception(f"Failed to initialize vector search: {e}")
            raise
        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Vector search completed in {elapsed:.2f} seconds.")
40
+
@@ -0,0 +1,3 @@
1
+ from .base import Kornja
2
+
3
+ __all__ = ["Kornja"]
@@ -0,0 +1,6 @@
1
+
2
class Kornja:
    """Answer-generation step of the RAG pipeline (not yet implemented)."""

    @classmethod
    def generating(cls):
        """Placeholder: returns None until generation is implemented."""
        ...