purrfectkit 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- purrfectkit-0.2.2/LICENSE +21 -0
- purrfectkit-0.2.2/PKG-INFO +114 -0
- purrfectkit-0.2.2/README.md +54 -0
- purrfectkit-0.2.2/purrfectmeow/__init__.py +15 -0
- purrfectkit-0.2.2/purrfectmeow/meow/chaus.py +20 -0
- purrfectkit-0.2.2/purrfectmeow/meow/felis.py +159 -0
- purrfectkit-0.2.2/purrfectmeow/meow/kitty.py +52 -0
- purrfectkit-0.2.2/purrfectmeow/tc01_spl/__init__.py +3 -0
- purrfectkit-0.2.2/purrfectmeow/tc01_spl/base.py +110 -0
- purrfectkit-0.2.2/purrfectmeow/tc01_spl/markdown.py +64 -0
- purrfectkit-0.2.2/purrfectmeow/tc01_spl/ocr.py +149 -0
- purrfectkit-0.2.2/purrfectmeow/tc01_spl/simple.py +75 -0
- purrfectkit-0.2.2/purrfectmeow/tc02_mlt/__init__.py +3 -0
- purrfectkit-0.2.2/purrfectmeow/tc02_mlt/base.py +43 -0
- purrfectkit-0.2.2/purrfectmeow/tc02_mlt/separate.py +35 -0
- purrfectkit-0.2.2/purrfectmeow/tc02_mlt/token.py +47 -0
- purrfectkit-0.2.2/purrfectmeow/tc03_wcm/__init__.py +3 -0
- purrfectkit-0.2.2/purrfectmeow/tc03_wcm/base.py +14 -0
- purrfectkit-0.2.2/purrfectmeow/tc03_wcm/local.py +35 -0
- purrfectkit-0.2.2/purrfectmeow/tc04_kmn/__init__.py +3 -0
- purrfectkit-0.2.2/purrfectmeow/tc04_kmn/base.py +18 -0
- purrfectkit-0.2.2/purrfectmeow/tc04_kmn/cosine.py +39 -0
- purrfectkit-0.2.2/purrfectmeow/tc05_knj/__init__.py +3 -0
- purrfectkit-0.2.2/purrfectmeow/tc05_knj/base.py +4 -0
- purrfectkit-0.2.2/pyproject.toml +114 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 SUWALUTIONS CO., LTD.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: purrfectkit
|
|
3
|
+
Version: 0.2.2
|
|
4
|
+
Summary: **PurrfectKit** is a Python library for effortless Retrieval-Augmented Generation (RAG) workflows.
|
|
5
|
+
Keywords: rag,nlp,llms,python,ai,ocr,document-processing,multilingual,text-extraction
|
|
6
|
+
Author: SUWALUTIONS
|
|
7
|
+
Author-email: SUWALUTIONS <suwa@suwalutions.com>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
23
|
+
Classifier: Topic :: Text Processing :: General
|
|
24
|
+
Classifier: Natural Language :: English
|
|
25
|
+
Classifier: Natural Language :: Thai
|
|
26
|
+
Requires-Dist: python-magic<=0.4.27
|
|
27
|
+
Requires-Dist: sentence-transformers<=5.1.0
|
|
28
|
+
Requires-Dist: transformers<=4.53.0
|
|
29
|
+
Requires-Dist: docling<=2.31.1
|
|
30
|
+
Requires-Dist: markitdown<=0.1.1
|
|
31
|
+
Requires-Dist: pymupdf4llm<=0.0.27
|
|
32
|
+
Requires-Dist: pdf2image<=1.17.0
|
|
33
|
+
Requires-Dist: pytesseract<=0.3.13
|
|
34
|
+
Requires-Dist: easyocr<=1.7.2
|
|
35
|
+
Requires-Dist: surya-ocr<=0.14.0
|
|
36
|
+
Requires-Dist: python-doctr<=1.0.0
|
|
37
|
+
Requires-Dist: pandas<=2.3.2
|
|
38
|
+
Requires-Dist: langchain-text-splitters<=1.0.0
|
|
39
|
+
Requires-Dist: tiktoken<=0.12.0
|
|
40
|
+
Requires-Dist: ruff<=0.6.0 ; extra == 'dev'
|
|
41
|
+
Requires-Dist: mypy<=1.11.0 ; extra == 'dev'
|
|
42
|
+
Requires-Dist: pre-commit<=3.8.0 ; extra == 'dev'
|
|
43
|
+
Requires-Dist: detect-secrets<=1.5.0 ; extra == 'dev'
|
|
44
|
+
Requires-Dist: codecov-cli<=11.2.4 ; extra == 'dev'
|
|
45
|
+
Requires-Dist: sphinx<=8.2.3 ; extra == 'docs'
|
|
46
|
+
Requires-Dist: sphinx-rtd-theme<=3.0.2 ; extra == 'docs'
|
|
47
|
+
Requires-Dist: pytest<=8.4.2 ; extra == 'test'
|
|
48
|
+
Requires-Dist: pytest-cov<=7.0.0 ; extra == 'test'
|
|
49
|
+
Requires-Dist: pytest-mock<=3.15.1 ; extra == 'test'
|
|
50
|
+
Maintainer: KHARAPSY
|
|
51
|
+
Maintainer-email: KHARAPSY <kharapsy@suwalutions.com>
|
|
52
|
+
Requires-Python: >=3.10
|
|
53
|
+
Project-URL: Documentation, https://suwalutions.github.io/PurrfectKit
|
|
54
|
+
Project-URL: Issues, https://github.com/SUWALUTIONS/PurrfectKit/issues
|
|
55
|
+
Project-URL: Repository, https://github.com/SUWALUTIONS/PurrfectKit
|
|
56
|
+
Provides-Extra: dev
|
|
57
|
+
Provides-Extra: docs
|
|
58
|
+
Provides-Extra: test
|
|
59
|
+
Description-Content-Type: text/markdown
|
|
60
|
+
|
|
61
|
+

|
|
62
|
+
|
|
63
|
+
# PurrfectKit
|
|
64
|
+
|
|
65
|
+
[](https://www.python.org)
|
|
66
|
+
[](https://pypi.org/project/purrfectkit/)
|
|
67
|
+
[](https://pypistats.org/packages/purrfectkit)
|
|
68
|
+
[](https://codecov.io/github/suwalutions/PurrfectKit)
|
|
69
|
+
[](https://github.com/astral-sh/ruff)
|
|
70
|
+
[](https://ghcr.io/suwalutions/purrfectkit)
|
|
71
|
+
[](LICENSE)
|
|
72
|
+
|
|
73
|
+
**PurrfectKit** is a toolkit that simplifies Retrieval-Augmented Generation (RAG) into 5 easy steps:
|
|
74
|
+
1. Suphalak - read content from files
|
|
75
|
+
2. Malet - split content into chunks
|
|
76
|
+
3. WichienMaat - embed chunks into vectors
|
|
77
|
+
4. KhaoManee - search vectors with queries
|
|
78
|
+
5. Kornja - generate answers from vectors
|
|
79
|
+
|
|
80
|
+
> **_NOTE:_** Each step is inspired by a unique Thai cat breed, making the workflow memorable and fun.
|
|
81
|
+
|
|
82
|
+
## Quickstart
|
|
83
|
+
|
|
84
|
+
### Prerequisites
|
|
85
|
+
- python
|
|
86
|
+
- tesseract
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
### Installation
|
|
90
|
+
```bash
|
|
91
|
+
pip install purrfectkit
|
|
92
|
+
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Usage
|
|
96
|
+
```python
|
|
97
|
+
from purrfectmeow.meow.felis import DocTemplate, MetaFile
|
|
98
|
+
from purrfectmeow import Suphalak, Malet, WichienMaat, KhaoManee
|
|
99
|
+
|
|
100
|
+
file_path = 'test/test.pdf'
|
|
101
|
+
metadata = MetaFile.get_metadata(file_path)
|
|
102
|
+
with open(file_path, 'rb') as f:
|
|
103
|
+
content = Suphalak.reading(f, 'test.pdf')
|
|
104
|
+
chunks = Malet.chunking(content, chunk_method='token', chunk_size='500', chunk_overlap='25')
|
|
105
|
+
docs = DocTemplate.create_template(chunks, metadata)
|
|
106
|
+
embedding = WichienMaat.embedding(chunks)
|
|
107
|
+
query = WichienMaat.embedding("ทดสอบ")
|
|
108
|
+
KhaoManee.searching(query, embedding, docs, 2)
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## License
|
|
113
|
+
|
|
114
|
+
PurrfectKit is released under the [MIT License](LICENSE).
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+

|
|
2
|
+
|
|
3
|
+
# PurrfectKit
|
|
4
|
+
|
|
5
|
+
[](https://www.python.org)
|
|
6
|
+
[](https://pypi.org/project/purrfectkit/)
|
|
7
|
+
[](https://pypistats.org/packages/purrfectkit)
|
|
8
|
+
[](https://codecov.io/github/suwalutions/PurrfectKit)
|
|
9
|
+
[](https://github.com/astral-sh/ruff)
|
|
10
|
+
[](https://ghcr.io/suwalutions/purrfectkit)
|
|
11
|
+
[](LICENSE)
|
|
12
|
+
|
|
13
|
+
**PurrfectKit** is a toolkit that simplifies Retrieval-Augmented Generation (RAG) into 5 easy steps:
|
|
14
|
+
1. Suphalak - read content from files
|
|
15
|
+
2. Malet - split content into chunks
|
|
16
|
+
3. WichienMaat - embed chunks into vectors
|
|
17
|
+
4. KhaoManee - search vectors with queries
|
|
18
|
+
5. Kornja - generate answers from vectors
|
|
19
|
+
|
|
20
|
+
> **_NOTE:_** Each step is inspired by a unique Thai cat breed, making the workflow memorable and fun.
|
|
21
|
+
|
|
22
|
+
## Quickstart
|
|
23
|
+
|
|
24
|
+
### Prerequisites
|
|
25
|
+
- python
|
|
26
|
+
- tesseract
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
### Installation
|
|
30
|
+
```bash
|
|
31
|
+
pip install purrfectkit
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Usage
|
|
36
|
+
```python
|
|
37
|
+
from purrfectmeow.meow.felis import DocTemplate, MetaFile
|
|
38
|
+
from purrfectmeow import Suphalak, Malet, WichienMaat, KhaoManee
|
|
39
|
+
|
|
40
|
+
file_path = 'test/test.pdf'
|
|
41
|
+
metadata = MetaFile.get_metadata(file_path)
|
|
42
|
+
with open(file_path, 'rb') as f:
|
|
43
|
+
content = Suphalak.reading(f, 'test.pdf')
|
|
44
|
+
chunks = Malet.chunking(content, chunk_method='token', chunk_size='500', chunk_overlap='25')
|
|
45
|
+
docs = DocTemplate.create_template(chunks, metadata)
|
|
46
|
+
embedding = WichienMaat.embedding(chunks)
|
|
47
|
+
query = WichienMaat.embedding("ทดสอบ")
|
|
48
|
+
KhaoManee.searching(query, embedding, docs, 2)
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## License
|
|
53
|
+
|
|
54
|
+
PurrfectKit is released under the [MIT License](LICENSE).
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from .tc01_spl import Suphalak
|
|
2
|
+
from .tc02_mlt import Malet
|
|
3
|
+
from .tc03_wcm import WichienMaat
|
|
4
|
+
from .tc04_kmn import KhaoManee
|
|
5
|
+
from .tc05_knj import Kornja
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"Suphalak",
|
|
9
|
+
"Malet",
|
|
10
|
+
"WichienMaat",
|
|
11
|
+
"KhaoManee",
|
|
12
|
+
"Kornja",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
__version__ = "0.2.2"
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from typing import TypedDict
|
|
2
|
+
|
|
3
|
+
from .felis import Document
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class FileMetadata(TypedDict, total=False):
|
|
7
|
+
file_name: str
|
|
8
|
+
file_size: int
|
|
9
|
+
file_created_date: str
|
|
10
|
+
file_modified_date: str
|
|
11
|
+
file_extension: str
|
|
12
|
+
file_type: str
|
|
13
|
+
description: str
|
|
14
|
+
total_pages: int | str
|
|
15
|
+
file_md5: str
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SimilarityResult(TypedDict, total=False):
|
|
19
|
+
score: float | str
|
|
20
|
+
document: Document
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
from io import BytesIO
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from .chaus import FileMetadata
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Document:
|
|
8
|
+
def __init__(self, page_content: str, metadata: dict[str, Any]) -> None:
|
|
9
|
+
self.page_content = page_content
|
|
10
|
+
self.metadata = metadata or {}
|
|
11
|
+
|
|
12
|
+
def __repr__(self) -> str:
|
|
13
|
+
return f"{self.__class__.__name__}(page_content={self.page_content!r}, metadata={self.metadata!r})"
|
|
14
|
+
|
|
15
|
+
def __getitem__(self, key: str) -> Any:
|
|
16
|
+
if key == "page_content":
|
|
17
|
+
return self.page_content
|
|
18
|
+
elif key == "metadata":
|
|
19
|
+
return self.metadata
|
|
20
|
+
else:
|
|
21
|
+
raise KeyError(f"{key} is not a valid key. Use 'page_content' or 'metadata'.")
|
|
22
|
+
|
|
23
|
+
def to_dict(self) -> dict[str, Any]:
|
|
24
|
+
return {"page_content": self.page_content, "metadata": self.metadata}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class DocTemplate:
|
|
28
|
+
@staticmethod
|
|
29
|
+
def create_template(chunks: list[str], metadata: dict[str, Any]) -> list[Document]:
|
|
30
|
+
if not isinstance(chunks, list):
|
|
31
|
+
raise TypeError(f"Expected 'chunks' to be a list, but got {type(chunks).__name__}.")
|
|
32
|
+
|
|
33
|
+
if not isinstance(metadata, dict):
|
|
34
|
+
raise TypeError(f"Expected 'metadata' to be a dict, but got {type(metadata).__name__}.")
|
|
35
|
+
|
|
36
|
+
if not all(isinstance(c, str) for c in chunks):
|
|
37
|
+
raise ValueError("All elements in 'chunks' must be strings.")
|
|
38
|
+
|
|
39
|
+
docs = []
|
|
40
|
+
chunk_hashes = []
|
|
41
|
+
|
|
42
|
+
import hashlib
|
|
43
|
+
import uuid
|
|
44
|
+
|
|
45
|
+
for _, chunk in enumerate(chunks):
|
|
46
|
+
hash_val = hashlib.md5(chunk.encode()).hexdigest()
|
|
47
|
+
chunk_hashes.append(hash_val)
|
|
48
|
+
|
|
49
|
+
for idx, chunk in enumerate(chunks):
|
|
50
|
+
chunk_number = idx + 1
|
|
51
|
+
chunk_id = uuid.uuid4().hex
|
|
52
|
+
chunk_hash = chunk_hashes[idx]
|
|
53
|
+
prev_hash = chunk_hashes[idx - 1] if idx > 0 else None
|
|
54
|
+
next_hash = chunk_hashes[idx + 1] if idx < len(chunks) - 1 else None
|
|
55
|
+
chunk_size = len(chunk)
|
|
56
|
+
|
|
57
|
+
chunk_info = {
|
|
58
|
+
"chunk_number": chunk_number,
|
|
59
|
+
"chunk_id": chunk_id,
|
|
60
|
+
"chunk_hash": chunk_hash,
|
|
61
|
+
"previous_chunk_hash": prev_hash,
|
|
62
|
+
"next_chunk_hash": next_hash,
|
|
63
|
+
"chunk_size": chunk_size,
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
doc_metadata = {"chunk_info": chunk_info, "source_info": metadata}
|
|
67
|
+
|
|
68
|
+
doc = Document(page_content=chunk, metadata=doc_metadata)
|
|
69
|
+
docs.append(doc)
|
|
70
|
+
|
|
71
|
+
return docs
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class MetaFile:
|
|
75
|
+
@staticmethod
|
|
76
|
+
def get_metadata(file: str | BytesIO, **kwargs: Any) -> FileMetadata:
|
|
77
|
+
if isinstance(file, bytes):
|
|
78
|
+
file = BytesIO(file)
|
|
79
|
+
|
|
80
|
+
if isinstance(file, BytesIO):
|
|
81
|
+
import os
|
|
82
|
+
|
|
83
|
+
os.makedirs(".cache/tmp", exist_ok=True)
|
|
84
|
+
file_name = kwargs.get("file_name")
|
|
85
|
+
|
|
86
|
+
if not file_name:
|
|
87
|
+
raise ValueError("file_name must be provided when using BytesIO.")
|
|
88
|
+
|
|
89
|
+
file_path = os.path.join(".cache/tmp", file_name)
|
|
90
|
+
with open(file_path, "wb") as f:
|
|
91
|
+
f.write(file.getvalue())
|
|
92
|
+
|
|
93
|
+
try:
|
|
94
|
+
return MetaFile._get_metadata_from_path(file_path)
|
|
95
|
+
finally:
|
|
96
|
+
os.remove(file_path)
|
|
97
|
+
|
|
98
|
+
elif isinstance(file, str):
|
|
99
|
+
return MetaFile._get_metadata_from_path(file)
|
|
100
|
+
|
|
101
|
+
else:
|
|
102
|
+
raise TypeError(f"Unsupported file type: {type(file).__name__}. Expected str, bytes, or BytesIO.")
|
|
103
|
+
|
|
104
|
+
@staticmethod
|
|
105
|
+
def _get_metadata_from_path(file_path: str) -> FileMetadata:
|
|
106
|
+
metadata: FileMetadata = {}
|
|
107
|
+
|
|
108
|
+
import hashlib
|
|
109
|
+
import os
|
|
110
|
+
import re
|
|
111
|
+
import subprocess
|
|
112
|
+
import time
|
|
113
|
+
|
|
114
|
+
import magic
|
|
115
|
+
|
|
116
|
+
try:
|
|
117
|
+
if not os.path.exists(file_path):
|
|
118
|
+
raise FileNotFoundError(f"File {file_path} does not exist")
|
|
119
|
+
|
|
120
|
+
stats = os.stat(file_path)
|
|
121
|
+
metadata["file_name"] = os.path.basename(file_path)
|
|
122
|
+
metadata["file_size"] = stats.st_size
|
|
123
|
+
metadata["file_created_date"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stats.st_ctime))
|
|
124
|
+
metadata["file_modified_date"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stats.st_mtime))
|
|
125
|
+
metadata["file_extension"] = os.path.splitext(file_path)[1] or "none"
|
|
126
|
+
|
|
127
|
+
try:
|
|
128
|
+
mime = magic.Magic(mime=True)
|
|
129
|
+
metadata["file_type"] = mime.from_file(file_path)
|
|
130
|
+
metadata["description"] = magic.from_file(file_path)
|
|
131
|
+
except Exception as e:
|
|
132
|
+
metadata["file_type"] = "unknown"
|
|
133
|
+
metadata["description"] = f"Could not determine file type: {str(e)}"
|
|
134
|
+
|
|
135
|
+
if metadata["file_type"].startswith("image/"):
|
|
136
|
+
metadata["total_pages"] = 1
|
|
137
|
+
elif metadata["file_type"].startswith("application/pdf"):
|
|
138
|
+
try:
|
|
139
|
+
result = subprocess.run(["pdfinfo", file_path], stdout=subprocess.PIPE, text=True, check=True)
|
|
140
|
+
pages_match = re.search(r"Pages:\s*(\d+)", result.stdout)
|
|
141
|
+
if pages_match:
|
|
142
|
+
metadata["total_pages"] = int(pages_match.group(1))
|
|
143
|
+
else:
|
|
144
|
+
metadata["total_pages"] = "Unknown (could not parse page count)"
|
|
145
|
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
|
146
|
+
metadata["total_pages"] = "Unknown (pdfinfo not installed or failed)"
|
|
147
|
+
else:
|
|
148
|
+
metadata["total_pages"] = 1
|
|
149
|
+
|
|
150
|
+
with open(file_path, "rb") as f:
|
|
151
|
+
hash_md5 = hashlib.md5()
|
|
152
|
+
for chunk in iter(lambda: f.read(4096), b""):
|
|
153
|
+
hash_md5.update(chunk)
|
|
154
|
+
metadata["file_md5"] = hash_md5.hexdigest()
|
|
155
|
+
|
|
156
|
+
return metadata
|
|
157
|
+
|
|
158
|
+
except Exception as e:
|
|
159
|
+
raise RuntimeError(f"Failed to extract metadata: {e}") from e
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from logging.handlers import RotatingFileHandler
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class LevelBasedFormatter(logging.Formatter):
|
|
7
|
+
def __init__(self, default_fmt: str, info_fmt: str, datefmt: str | None = None) -> None:
|
|
8
|
+
super().__init__(datefmt=datefmt)
|
|
9
|
+
self.default_fmt: logging.Formatter = logging.Formatter(default_fmt, datefmt)
|
|
10
|
+
self.info_fmt: logging.Formatter = logging.Formatter(info_fmt, datefmt)
|
|
11
|
+
|
|
12
|
+
def format(self, record: logging.LogRecord) -> str:
|
|
13
|
+
if record.levelno == logging.INFO:
|
|
14
|
+
return self.info_fmt.format(record)
|
|
15
|
+
return self.default_fmt.format(record)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def kitty_logger(name: str, log_file: str = "kitty.log", log_level: str = "INFO") -> logging.Logger:
|
|
19
|
+
"""
|
|
20
|
+
Sets up a logger with console and rotating file handlers.
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
name (str): Name of the logger (usually __name__ of the calling module).
|
|
24
|
+
log_file (str): Path to the log file. Defaults to 'kitty.log'.
|
|
25
|
+
log_level (str): Logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'). Defaults to 'INFO'.
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
logging.Logger: Configured logger instance.
|
|
29
|
+
"""
|
|
30
|
+
logger = logging.getLogger(name)
|
|
31
|
+
logger.setLevel(getattr(logging, log_level.upper(), logging.INFO))
|
|
32
|
+
|
|
33
|
+
if not logger.handlers:
|
|
34
|
+
default_fmt = "PurrfectKit | %(asctime)s [%(levelname)s] %(name)s:%(lineno)d - %(message)s"
|
|
35
|
+
info_fmt = "PurrfectKit | %(asctime)s [%(levelname)s] - %(message)s"
|
|
36
|
+
datefmt = "%Y-%m-%d %H:%M:%S"
|
|
37
|
+
|
|
38
|
+
formatter = LevelBasedFormatter(default_fmt, info_fmt, datefmt)
|
|
39
|
+
|
|
40
|
+
console_handler = logging.StreamHandler()
|
|
41
|
+
console_handler.setFormatter(formatter)
|
|
42
|
+
logger.addHandler(console_handler)
|
|
43
|
+
|
|
44
|
+
log_dir = Path(".cache/logs")
|
|
45
|
+
log_dir.mkdir(parents=True, exist_ok=True)
|
|
46
|
+
log_path = log_dir / log_file
|
|
47
|
+
|
|
48
|
+
file_handler = RotatingFileHandler(log_path, maxBytes=5 * 1024 * 1024, backupCount=3)
|
|
49
|
+
file_handler.setFormatter(formatter)
|
|
50
|
+
logger.addHandler(file_handler)
|
|
51
|
+
|
|
52
|
+
return logger
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from typing import Any, BinaryIO
|
|
2
|
+
|
|
3
|
+
from .markdown import Markdown
|
|
4
|
+
from .ocr import Ocr
|
|
5
|
+
from .simple import Simple
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Suphalak:
|
|
9
|
+
tmp_dir = ".cache/tmp"
|
|
10
|
+
DEFAULT_LOADER = "PYMUPDF4LLM"
|
|
11
|
+
|
|
12
|
+
_LOADERS: dict[str, dict[str, Any]] = {
|
|
13
|
+
"MARKITDOWN": {
|
|
14
|
+
"func": Markdown.markitdown_convert,
|
|
15
|
+
"ext": ("csv", "docx", "md", "pdf", "pptx", "txt", "xls", "xlsx"),
|
|
16
|
+
},
|
|
17
|
+
"DOCLING": {
|
|
18
|
+
"func": Markdown.docling_convert,
|
|
19
|
+
"ext": ("csv", "docx", "jpg", "md", "pdf", "png", "pptx", "xlsx"),
|
|
20
|
+
},
|
|
21
|
+
"PYMUPDF4LLM": {
|
|
22
|
+
"func": Markdown.pymupdf4llm_convert,
|
|
23
|
+
"ext": ("docx", "pdf", "pptx", "txt", "xlsx"),
|
|
24
|
+
},
|
|
25
|
+
"PYTESSERACT": {
|
|
26
|
+
"func": Ocr.pytesseract_convert,
|
|
27
|
+
"ext": ("gif", "jpg", "pdf", "png"),
|
|
28
|
+
},
|
|
29
|
+
"EASYOCR": {
|
|
30
|
+
"func": Ocr.easyocr_convert,
|
|
31
|
+
"ext": ("gif", "jpg", "pdf", "png"),
|
|
32
|
+
},
|
|
33
|
+
"SURYAOCR": {
|
|
34
|
+
"func": Ocr.suryaocr_convert,
|
|
35
|
+
"ext": ("gif", "jpg", "pdf", "png"),
|
|
36
|
+
},
|
|
37
|
+
"DOCTR": {
|
|
38
|
+
"func": Ocr.doctr_convert,
|
|
39
|
+
"ext": ("gif", "jpg", "pdf", "png"),
|
|
40
|
+
},
|
|
41
|
+
"PYMUPDF": {
|
|
42
|
+
"func": Simple.pymupdf_convert,
|
|
43
|
+
"ext": ("docx", "md", "pdf", "pptx", "xlsx"),
|
|
44
|
+
},
|
|
45
|
+
"PANDAS": {
|
|
46
|
+
"func": Simple.pandas_convert,
|
|
47
|
+
"ext": ("csv", "xls", "xlsx"),
|
|
48
|
+
},
|
|
49
|
+
"ENCODING": {
|
|
50
|
+
"func": Simple.encoding_convert,
|
|
51
|
+
"ext": ("csv", "md", "txt"),
|
|
52
|
+
},
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
@classmethod
|
|
56
|
+
def _detect_loader(cls, file_ext: str) -> str:
|
|
57
|
+
priority = [
|
|
58
|
+
("PANDAS", ("csv", "xls")),
|
|
59
|
+
("PYTESSERACT", ("jpg", "png", "gif")),
|
|
60
|
+
("PYMUPDF", ("pdf", "md")),
|
|
61
|
+
("PYMUPDF4LLM", ("txt", "xlsx", "pptx", "docx")),
|
|
62
|
+
]
|
|
63
|
+
|
|
64
|
+
for loader, extensions in priority:
|
|
65
|
+
if file_ext in extensions:
|
|
66
|
+
return loader
|
|
67
|
+
|
|
68
|
+
return cls.DEFAULT_LOADER
|
|
69
|
+
|
|
70
|
+
@classmethod
|
|
71
|
+
def reading(cls, file: BinaryIO, file_name: str, loader: str | None = None, **kwargs: Any) -> str:
|
|
72
|
+
import os
|
|
73
|
+
|
|
74
|
+
file_ext = file_name.split(".")[-1].lower()
|
|
75
|
+
|
|
76
|
+
if not loader:
|
|
77
|
+
loader = cls._detect_loader(file_ext)
|
|
78
|
+
|
|
79
|
+
if loader not in cls._LOADERS:
|
|
80
|
+
raise ValueError(f"Unsupported loader: '{loader}'")
|
|
81
|
+
|
|
82
|
+
loader_conf = cls._LOADERS[loader]
|
|
83
|
+
supported_ext = loader_conf["ext"]
|
|
84
|
+
|
|
85
|
+
if file_ext not in supported_ext:
|
|
86
|
+
raise TypeError(f"'{file_ext}' is not supported for '{loader}' loader.")
|
|
87
|
+
|
|
88
|
+
os.makedirs(cls.tmp_dir, exist_ok=True)
|
|
89
|
+
file_path = os.path.join(cls.tmp_dir, file_name)
|
|
90
|
+
|
|
91
|
+
try:
|
|
92
|
+
text: str
|
|
93
|
+
with open(file_path, "wb") as f:
|
|
94
|
+
f.write(file.read())
|
|
95
|
+
|
|
96
|
+
text = loader_conf["func"](file_path, **kwargs)
|
|
97
|
+
|
|
98
|
+
if (
|
|
99
|
+
file_ext == "pdf"
|
|
100
|
+
and (not text or not str(text).strip())
|
|
101
|
+
and loader not in ("PYTESSERACT", "EASYOCR", "SURYAOCR", "DOCTR")
|
|
102
|
+
):
|
|
103
|
+
ocr_loader = cls._LOADERS["PYTESSERACT"]
|
|
104
|
+
text = ocr_loader["func"](file_path, **kwargs)
|
|
105
|
+
|
|
106
|
+
return text
|
|
107
|
+
|
|
108
|
+
finally:
|
|
109
|
+
if os.path.exists(file_path):
|
|
110
|
+
os.remove(file_path)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from collections.abc import Callable
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from purrfectmeow.meow.kitty import kitty_logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Markdown:
|
|
9
|
+
_logger = kitty_logger(__name__)
|
|
10
|
+
|
|
11
|
+
@classmethod
|
|
12
|
+
def _convert(cls, file_path: str, converter: Callable[[str], Any], extractor: Callable[[Any], str]) -> str:
|
|
13
|
+
cls._logger.debug(f"Starting conversion for '{file_path}'")
|
|
14
|
+
start = time.time()
|
|
15
|
+
try:
|
|
16
|
+
raw_content: Any = converter(file_path)
|
|
17
|
+
result: str = extractor(raw_content)
|
|
18
|
+
|
|
19
|
+
cls._logger.debug(f"Succesfully converted '{file_path}'")
|
|
20
|
+
|
|
21
|
+
return result
|
|
22
|
+
|
|
23
|
+
finally:
|
|
24
|
+
elapsed = time.time() - start
|
|
25
|
+
cls._logger.debug(f"Conversion time spent '{elapsed:.2f}' seconds.")
|
|
26
|
+
|
|
27
|
+
@classmethod
|
|
28
|
+
def markitdown_convert(cls, file_path: str) -> str:
|
|
29
|
+
cls._logger.debug("Using MarkItDown for Conversion")
|
|
30
|
+
|
|
31
|
+
from markitdown import MarkItDown
|
|
32
|
+
|
|
33
|
+
mid = MarkItDown()
|
|
34
|
+
|
|
35
|
+
return cls._convert(file_path, lambda path: mid.convert(path), lambda content: content.text_content)
|
|
36
|
+
|
|
37
|
+
@classmethod
|
|
38
|
+
def docling_convert(cls, file_path: str) -> str:
|
|
39
|
+
cls._logger.debug("Using Docling for Conversion")
|
|
40
|
+
|
|
41
|
+
from docling.document_converter import DocumentConverter
|
|
42
|
+
|
|
43
|
+
dcl = DocumentConverter()
|
|
44
|
+
|
|
45
|
+
return cls._convert(
|
|
46
|
+
file_path, lambda path: dcl.convert(path).document, lambda content: content.document.export_to_markdown()
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
@classmethod
|
|
50
|
+
def pymupdf4llm_convert(cls, file_path: str) -> str:
|
|
51
|
+
cls._logger.debug("Using PyMuPDF4LLM for Conversion")
|
|
52
|
+
cls._logger.debug(f"Starting conversion for '{file_path}'")
|
|
53
|
+
start = time.time()
|
|
54
|
+
|
|
55
|
+
import pymupdf4llm
|
|
56
|
+
|
|
57
|
+
try:
|
|
58
|
+
res: str = pymupdf4llm.to_markdown(file_path)
|
|
59
|
+
cls._logger.debug(f"Succesfully converted '{file_path}'")
|
|
60
|
+
|
|
61
|
+
return res
|
|
62
|
+
finally:
|
|
63
|
+
elapsed = time.time() - start
|
|
64
|
+
cls._logger.debug(f"Conversion time spent '{elapsed:.2f}' seconds.")
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from collections.abc import Callable
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from purrfectmeow.meow.kitty import kitty_logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Ocr:
|
|
9
|
+
_logger = kitty_logger(__name__)
|
|
10
|
+
_image_type = [
|
|
11
|
+
".apng",
|
|
12
|
+
".png",
|
|
13
|
+
".avif",
|
|
14
|
+
".gif",
|
|
15
|
+
".jpg",
|
|
16
|
+
".jpeg",
|
|
17
|
+
".jfif",
|
|
18
|
+
".pjpeg",
|
|
19
|
+
".pjp",
|
|
20
|
+
".png",
|
|
21
|
+
".svg",
|
|
22
|
+
".webp",
|
|
23
|
+
".bmp",
|
|
24
|
+
".ico",
|
|
25
|
+
".cur",
|
|
26
|
+
".tif",
|
|
27
|
+
".tiff",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
@classmethod
|
|
31
|
+
def _convert(cls, file_path: str, converter: Callable[[str], Any]) -> str:
|
|
32
|
+
cls._logger.debug(f"Starting conversion for '{file_path}'")
|
|
33
|
+
start = time.time()
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
content = []
|
|
37
|
+
match file_path.lower():
|
|
38
|
+
case path if path.endswith(".pdf"):
|
|
39
|
+
from pdf2image import convert_from_path
|
|
40
|
+
|
|
41
|
+
images = convert_from_path(file_path, fmt="png")
|
|
42
|
+
for idx, image in enumerate(images):
|
|
43
|
+
try:
|
|
44
|
+
text = converter(image)
|
|
45
|
+
cls._logger.debug(f"Text: {text}")
|
|
46
|
+
content.append(text)
|
|
47
|
+
cls._logger.debug(f"Page {idx + 1} processed")
|
|
48
|
+
except Exception as e:
|
|
49
|
+
cls._logger.exception(f"Page {idx + 1} failed: {e}")
|
|
50
|
+
raise
|
|
51
|
+
case path if path.endswith(tuple(cls._image_type)):
|
|
52
|
+
from PIL import Image
|
|
53
|
+
|
|
54
|
+
image = Image.open(file_path)
|
|
55
|
+
try:
|
|
56
|
+
text = converter(image)
|
|
57
|
+
cls._logger.debug(f"Text: {text}")
|
|
58
|
+
content.append(text)
|
|
59
|
+
cls._logger.debug("Page 1 processed")
|
|
60
|
+
except Exception as e:
|
|
61
|
+
cls._logger.debug(f"Page 1 failed: {e}")
|
|
62
|
+
raise
|
|
63
|
+
|
|
64
|
+
cls._logger.debug(f"Successfully converted '{file_path}'")
|
|
65
|
+
return "\n".join(content)
|
|
66
|
+
|
|
67
|
+
finally:
|
|
68
|
+
elasped = time.time() - start
|
|
69
|
+
cls._logger.debug(f"Conversion time spent '{elasped:.2f}' seconds.")
|
|
70
|
+
|
|
71
|
+
@classmethod
|
|
72
|
+
def pytesseract_convert(cls, file_path: str) -> str:
|
|
73
|
+
cls._logger.debug("Using PyTesseract for Conversion")
|
|
74
|
+
|
|
75
|
+
def converter(image: str) -> Any:
|
|
76
|
+
import pytesseract
|
|
77
|
+
|
|
78
|
+
return pytesseract.image_to_string(image, lang="tha+eng")
|
|
79
|
+
|
|
80
|
+
return cls._convert(file_path, converter)
|
|
81
|
+
|
|
82
|
+
@classmethod
|
|
83
|
+
def easyocr_convert(cls, file_path: str) -> str:
|
|
84
|
+
cls._logger.debug("Using EasyOCR for Conversion")
|
|
85
|
+
|
|
86
|
+
def converter(image: str) -> Any:
|
|
87
|
+
import easyocr
|
|
88
|
+
import numpy
|
|
89
|
+
|
|
90
|
+
reader = easyocr.Reader(["th", "en"], gpu=False)
|
|
91
|
+
res = reader.readtext(numpy.array(image))
|
|
92
|
+
return "\n".join(text for _, text, _ in res)
|
|
93
|
+
|
|
94
|
+
return cls._convert(file_path, converter)
|
|
95
|
+
|
|
96
|
+
@classmethod
|
|
97
|
+
def suryaocr_convert(cls, file_path: str) -> str:
|
|
98
|
+
cls._logger.debug("Using SuryaOCR for Conversion")
|
|
99
|
+
|
|
100
|
+
def converter(image: str) -> Any:
|
|
101
|
+
from surya.detection import DetectionPredictor
|
|
102
|
+
from surya.recognition import RecognitionPredictor
|
|
103
|
+
|
|
104
|
+
rec_pred = RecognitionPredictor()
|
|
105
|
+
det_pred = DetectionPredictor()
|
|
106
|
+
|
|
107
|
+
prediction = rec_pred(
|
|
108
|
+
[image],
|
|
109
|
+
det_predictor=det_pred,
|
|
110
|
+
detection_batch_size=1,
|
|
111
|
+
recognition_batch_size=1,
|
|
112
|
+
)
|
|
113
|
+
return "\n".join(line.text for line in prediction[0].text_lines)
|
|
114
|
+
|
|
115
|
+
return cls._convert(file_path, converter)
|
|
116
|
+
|
|
117
|
+
@classmethod
|
|
118
|
+
def doctr_convert(cls, file_path: str) -> str:
|
|
119
|
+
cls._logger.debug("Using docTR for Conversion")
|
|
120
|
+
|
|
121
|
+
def converter(image: str) -> str:
|
|
122
|
+
import os
|
|
123
|
+
import shutil
|
|
124
|
+
import tempfile
|
|
125
|
+
|
|
126
|
+
from doctr.io import DocumentFile
|
|
127
|
+
from doctr.models import ocr_predictor
|
|
128
|
+
|
|
129
|
+
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
|
|
130
|
+
shutil.copy(image, tmp.name)
|
|
131
|
+
temp_image_path = tmp.name
|
|
132
|
+
|
|
133
|
+
model = ocr_predictor(pretrained=True)
|
|
134
|
+
doc = DocumentFile.from_images(temp_image_path)
|
|
135
|
+
result = model(doc)
|
|
136
|
+
data = result.export()
|
|
137
|
+
combined_text = "\n".join(
|
|
138
|
+
word["value"]
|
|
139
|
+
for page in data["pages"]
|
|
140
|
+
for block in page.get("blocks", [])
|
|
141
|
+
for line in block.get("lines", [])
|
|
142
|
+
for word in line.get("words", [])
|
|
143
|
+
if "value" in word
|
|
144
|
+
)
|
|
145
|
+
if os.path.exists(temp_image_path):
|
|
146
|
+
os.remove(temp_image_path)
|
|
147
|
+
return combined_text
|
|
148
|
+
|
|
149
|
+
return cls._convert(file_path, converter)
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from collections.abc import Callable
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from purrfectmeow.meow.kitty import kitty_logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Simple:
    """Plain text-extraction backends (raw encoding, PyMuPDF, pandas) sharing
    a common timing/logging wrapper around each conversion."""

    _logger = kitty_logger(__name__)

    @classmethod
    def _convert(cls, file_path: str, converter: Callable[[str], Any]) -> str | Any:
        """Run ``converter`` on ``file_path`` and return its result.

        Logs the start, success, and — in all cases, including failure — the
        elapsed wall-clock time. Exceptions from ``converter`` propagate
        unchanged.
        """
        cls._logger.debug(f"Starting conversion for '{file_path}'")
        start = time.time()

        try:
            res = converter(file_path)

            cls._logger.debug(f"Successfully converted '{file_path}'")
            return res

        finally:
            # Fixed local-variable typo: was 'elasped'.
            elapsed = time.time() - start
            cls._logger.debug(f"Conversion time spent '{elapsed:.2f}' seconds.")

    @classmethod
    def encoding_convert(cls, file_path: str) -> str:
        """Read ``file_path`` as UTF-8 text and return its full contents."""
        cls._logger.debug("Using Encoding for Conversion")

        def reader(file_path: str) -> str:
            with open(file_path, encoding="utf-8") as f:
                return f.read()

        # The reader already takes the path argument; no lambda wrapper needed.
        return cls._convert(file_path, reader)

    @classmethod
    def pymupdf_convert(cls, file_path: str) -> str:
        """Extract text from a document using PyMuPDF.

        Plain-text-style extensions are opened with ``filetype="txt"`` so
        PyMuPDF treats them as text rather than sniffing the content.
        """
        cls._logger.debug("Using PyMuPDF for Conversion")

        def reader(file_path: str) -> str:
            import pymupdf

            if file_path.endswith((".txt", ".md", ".json", ".html", ".xml")):
                return "".join(page.get_text() for page in pymupdf.open(file_path, filetype="txt"))
            return "".join(page.get_text() for page in pymupdf.open(file_path))

        return cls._convert(file_path, reader)

    @classmethod
    def pandas_convert(cls, file_path: str) -> str:
        """Render a tabular file (Excel/CSV/JSON/HTML/XML) to a string via pandas.

        Unsupported extensions yield an empty string rather than raising.
        """
        cls._logger.debug("Using Pandas for Conversion")

        def reader(file_path: str) -> Any:
            import pandas

            if file_path.endswith((".xls", ".xlsx")):
                return pandas.read_excel(file_path).to_string(index=False)
            if file_path.endswith(".csv"):
                return pandas.read_csv(file_path).to_string(index=False)
            if file_path.endswith(".json"):
                return pandas.read_json(file_path).to_string(index=False)
            if file_path.endswith(".html"):
                # read_html returns one DataFrame per table found on the page.
                return "".join(df.to_string(index=False) for df in pandas.read_html(file_path))
            if file_path.endswith(".xml"):
                return pandas.read_xml(file_path).to_string(index=False)
            return ""

        return cls._convert(file_path, reader)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from typing import Any, Literal
|
|
2
|
+
|
|
3
|
+
from langchain_text_splitters import TokenTextSplitter
|
|
4
|
+
|
|
5
|
+
from .separate import SeparateSplit
|
|
6
|
+
from .token import TokenSplit
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Malet:
|
|
10
|
+
DEFAULT_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
|
|
11
|
+
DEFAULT_CHUNK_SIZE = 500
|
|
12
|
+
DEFAULT_CHUNK_OVERLAP = 0
|
|
13
|
+
DEFAULT_CHUNK_SEPARATOR = "\n\n"
|
|
14
|
+
|
|
15
|
+
@staticmethod
|
|
16
|
+
def _get_kwarg(kwargs: dict[str, Any], keys: list[str], default: Any = None) -> Any:
|
|
17
|
+
for key in keys:
|
|
18
|
+
if key in kwargs:
|
|
19
|
+
return kwargs[key]
|
|
20
|
+
return default
|
|
21
|
+
|
|
22
|
+
@classmethod
|
|
23
|
+
def chunking(
|
|
24
|
+
cls, text: str, chunk_method: Literal["token", "separate"] | None = "token", **kwargs: Any
|
|
25
|
+
) -> TokenTextSplitter | SeparateSplit.CharacterSeparator:
|
|
26
|
+
match chunk_method:
|
|
27
|
+
case "token":
|
|
28
|
+
model_name = cls._get_kwarg(kwargs, ["model_name", "ModelName", "modelName"], cls.DEFAULT_MODEL_NAME)
|
|
29
|
+
chunk_size = cls._get_kwarg(kwargs, ["chunk_size", "ChunkSize", "chunkSize"], cls.DEFAULT_CHUNK_SIZE)
|
|
30
|
+
chunk_overlap = cls._get_kwarg(
|
|
31
|
+
kwargs, ["chunk_overlap", "ChunkOverlap", "chunkOverlap"], cls.DEFAULT_CHUNK_OVERLAP
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
method = TokenSplit.splitter(model_name, chunk_size, chunk_overlap)
|
|
35
|
+
|
|
36
|
+
case "separate":
|
|
37
|
+
chunk_separator = cls._get_kwarg(
|
|
38
|
+
kwargs, ["chunk_separator", "ChunkSeparator", "chunkSeparator"], cls.DEFAULT_CHUNK_SEPARATOR
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
method = SeparateSplit.splitter(chunk_separator)
|
|
42
|
+
|
|
43
|
+
return method.split_text(text)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
|
|
5
|
+
from purrfectmeow.meow.kitty import kitty_logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SeparateSplit:
    """Separator-based text splitter, with timing/logging around construction."""

    _logger = kitty_logger(__name__)

    @classmethod
    def splitter(cls, chunk_separator: str) -> CharacterSeparator:
        """Build a ``CharacterSeparator`` for ``chunk_separator``.

        Logs initialization, failures (re-raised), and elapsed time.
        """
        cls._logger.debug("Initializing separate splitter")
        start = time.time()

        try:
            splitter = cls.CharacterSeparator(chunk_separator)

            cls._logger.debug("Separator splitter successfully initialized.")
            return splitter
        except Exception as e:
            cls._logger.exception(f"Failed to initialize separate splitter: {e}")
            raise
        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Separate splitting completed in {elapsed:.2f} seconds.")

    class CharacterSeparator:
        """Splits text on a literal separator, keeping the separator appended
        to every chunk except the last."""

        def __init__(self, separator: str):
            self.separator = separator

        def split_text(self, text: str) -> list[str]:
            chunks = [chunk + self.separator for chunk in text.split(self.separator)]
            # Bug fix: str.rstrip(sep) strips a *character set*, so content in
            # the final chunk ending with any separator character was also
            # removed (e.g. a trailing "\n" with separator "\n\n").
            # removesuffix drops exactly the one separator appended above.
            chunks[-1] = chunks[-1].removesuffix(self.separator)
            return chunks
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
3
|
+
from langchain_text_splitters import TokenTextSplitter
|
|
4
|
+
|
|
5
|
+
from purrfectmeow.meow.kitty import kitty_logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TokenSplit:
    """Token-count text splitter backed by tiktoken (OpenAI embedding models)
    or a HuggingFace tokenizer (everything else)."""

    _logger = kitty_logger(__name__)

    # Model names tokenized via tiktoken.
    _OPENAI_EMBED_MODEL = {"text-embedding-ada-002", "text-embedding-3-small", "text-embedding-3-large"}
    # HuggingFace re-hosts of OpenAI tokenizers, needing GPT2TokenizerFast.
    _OPENAI_HF_MODEL = {"Xenova/text-embedding-ada-002"}
    _HF_MODEL_DIR = ".cache/huggingface/hub/"

    @classmethod
    def splitter(cls, model_name: str, chunk_size: int, chunk_overlap: int) -> TokenTextSplitter:
        """Create a ``TokenTextSplitter`` appropriate for ``model_name``.

        Logs initialization, failures (re-raised), and elapsed time.
        HuggingFace tokenizers are cached under ``_HF_MODEL_DIR``.
        """
        cls._logger.debug("Initializing token splitter")
        start = time.time()

        try:
            if model_name in cls._OPENAI_EMBED_MODEL:
                # Log moved inside this branch: it previously ran before the
                # check and claimed an OpenAI tokenizer for every model.
                cls._logger.debug(f"Using OpenAI model tokenizer: {model_name}")
                splitter = TokenTextSplitter.from_tiktoken_encoder(
                    model_name=model_name, chunk_size=chunk_size, chunk_overlap=chunk_overlap
                )
            else:
                cls._logger.debug(f"Using HuggingFace tokenizer: {model_name}")
                from transformers import AutoTokenizer, GPT2TokenizerFast

                if model_name in cls._OPENAI_HF_MODEL:
                    tokenizer = GPT2TokenizerFast.from_pretrained(model_name, cache_dir=cls._HF_MODEL_DIR)
                else:
                    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cls._HF_MODEL_DIR)
                splitter = TokenTextSplitter.from_huggingface_tokenizer(
                    tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap
                )

            cls._logger.debug("Token splitter successfully initialized.")
            return splitter

        except Exception as e:
            cls._logger.exception(f"Failed to initialize token splitter: {e}")
            raise

        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Token splitting completed in {elapsed:.2f} seconds.")
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from typing import List, Optional
|
|
2
|
+
import numpy
|
|
3
|
+
|
|
4
|
+
from .local import Local
|
|
5
|
+
|
|
6
|
+
class WichienMaat:
    """Facade for sentence embedding via the local backend."""

    DEFAULT_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

    @classmethod
    def embedding(cls, sentence: str | list[str], model_name: str | None = None) -> numpy.ndarray:
        """Embed ``sentence`` with ``model_name``, defaulting to
        ``DEFAULT_MODEL_NAME`` when it is None or empty.

        Annotations modernized (``list[str]``, ``str | None``) for consistency
        with the rest of the package; behavior unchanged.
        """
        return Local.model_encode(sentence, model_name or cls.DEFAULT_MODEL_NAME)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
import numpy
|
|
5
|
+
|
|
6
|
+
from purrfectmeow.meow.kitty import kitty_logger
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Local:
    """Sentence embedding via a locally cached SentenceTransformer model."""

    _logger = kitty_logger(__name__)
    # Download cache for HuggingFace model weights.
    _HF_MODEL_DIR = ".cache/huggingface/hub/"

    @classmethod
    def model_encode(cls, sentence: str | list[str], model_name: str, **kwargs: Any) -> numpy.ndarray:
        """Encode ``sentence`` with ``model_name`` and return a numpy array.

        Failures are logged with a traceback and re-raised; elapsed time is
        logged in every case.
        """
        cls._logger.debug("Initializing local model encode")
        started_at = time.time()
        try:
            from sentence_transformers import SentenceTransformer

            encoder = SentenceTransformer(
                model_name,
                cache_folder=cls._HF_MODEL_DIR,
                # local_files_only=True
            )

            vectors = encoder.encode(sentence, convert_to_numpy=True)

            cls._logger.debug("Local model encode successfully initialized.")
            return vectors
        except Exception as e:
            cls._logger.exception(f"Failed to initialize local model encode: {e}")
            raise
        finally:
            cls._logger.debug(f"Local model encode completed in {time.time() - started_at:.2f} seconds.")
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import numpy
|
|
2
|
+
|
|
3
|
+
from purrfectmeow.meow.chaus import SimilarityResult
|
|
4
|
+
from purrfectmeow.meow.felis import Document
|
|
5
|
+
|
|
6
|
+
from .cosine import CosineSim
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class KhaoManee:
    """Facade for similarity search over embedded documents."""

    @classmethod
    def searching(
        cls,
        query_embed: numpy.ndarray,
        sentence_embed: numpy.ndarray | list[numpy.ndarray],
        documents: list[Document],
        top_k: int,
    ) -> list[SimilarityResult]:
        """Return the ``top_k`` documents most similar to ``query_embed``,
        delegating to the cosine-similarity backend."""
        return CosineSim.vector_search(
            query_embed,
            sentence_embed,
            documents,
            top_k,
        )
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
3
|
+
import numpy
|
|
4
|
+
|
|
5
|
+
from purrfectmeow.meow.chaus import SimilarityResult
|
|
6
|
+
from purrfectmeow.meow.felis import Document
|
|
7
|
+
from purrfectmeow.meow.kitty import kitty_logger
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class CosineSim:
    """Cosine-similarity ranking of document embeddings against a query."""

    _logger = kitty_logger(__name__)

    @classmethod
    def vector_search(
        cls,
        embed_query: numpy.ndarray,
        embed_sentence: numpy.ndarray | list[numpy.ndarray],
        documents: list[Document],
        top_k: int,
    ) -> list[SimilarityResult]:
        """Score ``documents`` by cosine similarity of their embeddings to
        ``embed_query`` and return the ``top_k`` highest-scoring results.

        Failures are logged with a traceback and re-raised; elapsed time is
        logged in every case.
        """
        cls._logger.debug("Initializing vector search")
        started_at = time.time()
        try:
            from sklearn.metrics.pairwise import cosine_similarity

            # Wrap the query so sklearn sees a (1, dim) matrix; take row 0 back.
            scores = cosine_similarity([embed_query], embed_sentence)[0]
            # Descending similarity order, truncated to the k best.
            ranked = numpy.argsort(scores)[::-1][:top_k]

            return [SimilarityResult(score=float(scores[idx]), document=documents[idx]) for idx in ranked]
        except Exception as e:
            cls._logger.exception(f"Failed to initialize vector search: {e}")
            raise
        finally:
            cls._logger.debug(f"Vector search completed in {time.time() - started_at:.2f} seconds.")
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
[build-system]
requires = ["uv_build"]
build-backend = "uv_build"

[project]
name = "purrfectkit"
version = "0.2.2"
authors = [{ name = "SUWALUTIONS", email = "suwa@suwalutions.com" }]
maintainers = [{ name = "KHARAPSY", email = "kharapsy@suwalutions.com" }]
description = "**PurrfectKit** is a Python library for effortless Retrieval-Augmented Generation (RAG) workflows."
keywords = ["rag", "nlp", "llms", "python", "ai", "ocr", "document-processing", "multilingual", "text-extraction"]
readme = "README.md"
license = "MIT"
license-files = ["LICEN[CS]E*"]
requires-python = ">=3.10"
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Software Development :: Libraries",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Text Processing :: Linguistic",
    "Topic :: Text Processing :: General",
    "Natural Language :: English",
    "Natural Language :: Thai",
]

dependencies = [
    "python-magic<=0.4.27",
    "sentence_transformers<=5.1.0",
    "transformers<=4.53.0",
    "docling<=2.31.1",
    "markitdown<=0.1.1",
    "pymupdf4llm<=0.0.27",
    "pdf2image<=1.17.0",
    "pytesseract<=0.3.13",
    "easyocr<=1.7.2",
    "surya-ocr<=0.14.0",
    "python-doctr<=1.0.0",
    "pandas<=2.3.2",
    "langchain-text-splitters<=1.0.0",
    "tiktoken<=0.12.0",
]

[project.optional-dependencies]
Dev = [
    "ruff<=0.6.0",
    "mypy<=1.11.0",
    "pre-commit<=3.8.0",
    "detect-secrets<=1.5.0",
    "codecov-cli<=11.2.4"
]
Docs = [
    "sphinx<=8.2.3",
    "sphinx-rtd-theme<=3.0.2",
]
Test = [
    "pytest<=8.4.2",
    "pytest-cov<=7.0.0",
    "pytest-mock<=3.15.1",
]

[project.urls]
Documentation = "https://suwalutions.github.io/PurrfectKit"
Repository = "https://github.com/SUWALUTIONS/PurrfectKit"
Issues = "https://github.com/SUWALUTIONS/PurrfectKit/issues"

[tool.uv.build-backend]
module-root = ""
module-name = "purrfectmeow"
include = ["LICENSE", "README.md", "pyproject.toml"]
exclude = [
    ".bumpversion.cfg",
    ".cache/",
    ".git/",
    ".github/",
    ".pytest_cache/",
    ".venv/",
    "build/",
    "dist/",
    "docs/",
    "test",
    "tests/",
    ".bumpversion",
    ".dockerignore",
    ".gitignore",
    ".pre-commit-config.yaml",
    ".secrets.baseline",
    "Dockerfile",
    "Makefile",
    "*.txt",
    "*.tar",
    "uv.lock",
]

[tool.ruff]
line-length = 120

# Lint settings belong under [tool.ruff.lint]; the top-level `select`/
# `ignore`/`fixable` form is deprecated in the pinned ruff version.
[tool.ruff.lint]
select = ["E", "F", "W", "I", "B", "UP"]
ignore = []
fixable = ["ALL"]

[tool.mypy]
strict = true
ignore_missing_imports = true

[tool.pytest.ini_options]
addopts = "-ra -q"
|