purrfectkit 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {purrfectkit-0.2.1.dist-info → purrfectkit-0.2.2.dist-info}/METADATA +20 -8
- purrfectkit-0.2.2.dist-info/RECORD +25 -0
- {purrfectkit-0.2.1.dist-info → purrfectkit-0.2.2.dist-info}/WHEEL +1 -1
- purrfectmeow/__init__.py +1 -1
- purrfectmeow/meow/chaus.py +20 -0
- purrfectmeow/meow/felis.py +33 -45
- purrfectmeow/meow/kitty.py +7 -7
- purrfectmeow/tc01_spl/base.py +7 -4
- purrfectmeow/tc01_spl/markdown.py +18 -11
- purrfectmeow/tc01_spl/ocr.py +36 -28
- purrfectmeow/tc01_spl/simple.py +30 -19
- purrfectmeow/tc02_mlt/base.py +19 -10
- purrfectmeow/tc02_mlt/separate.py +7 -4
- purrfectmeow/tc02_mlt/token.py +11 -19
- purrfectmeow/tc03_wcm/local.py +9 -7
- purrfectmeow/tc04_kmn/base.py +15 -5
- purrfectmeow/tc04_kmn/cosine.py +13 -14
- purrfectmeow/tc05_knj/base.py +2 -4
- purrfectkit-0.2.1.dist-info/RECORD +0 -24
- {purrfectkit-0.2.1.dist-info → purrfectkit-0.2.2.dist-info}/licenses/LICENSE +0 -0
{purrfectkit-0.2.1.dist-info → purrfectkit-0.2.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
Metadata-Version: 2.4
Name: purrfectkit
-Version: 0.2.1
+Version: 0.2.2
Summary: **PurrfectKit** is a Python library for effortless Retrieval-Augmented Generation (RAG) workflows.
Keywords: rag,nlp,llms,python,ai,ocr,document-processing,multilingual,text-extraction
Author: SUWALUTIONS
@@ -25,7 +25,7 @@ Classifier: Natural Language :: English
Classifier: Natural Language :: Thai
Requires-Dist: python-magic<=0.4.27
Requires-Dist: sentence-transformers<=5.1.0
-Requires-Dist: transformers<=4.
+Requires-Dist: transformers<=4.53.0
Requires-Dist: docling<=2.31.1
Requires-Dist: markitdown<=0.1.1
Requires-Dist: pymupdf4llm<=0.0.27
@@ -37,9 +37,15 @@ Requires-Dist: python-doctr<=1.0.0
Requires-Dist: pandas<=2.3.2
Requires-Dist: langchain-text-splitters<=1.0.0
Requires-Dist: tiktoken<=0.12.0
+Requires-Dist: ruff<=0.6.0 ; extra == 'dev'
+Requires-Dist: mypy<=1.11.0 ; extra == 'dev'
+Requires-Dist: pre-commit<=3.8.0 ; extra == 'dev'
+Requires-Dist: detect-secrets<=1.5.0 ; extra == 'dev'
+Requires-Dist: codecov-cli<=11.2.4 ; extra == 'dev'
Requires-Dist: sphinx<=8.2.3 ; extra == 'docs'
Requires-Dist: sphinx-rtd-theme<=3.0.2 ; extra == 'docs'
Requires-Dist: pytest<=8.4.2 ; extra == 'test'
+Requires-Dist: pytest-cov<=7.0.0 ; extra == 'test'
Requires-Dist: pytest-mock<=3.15.1 ; extra == 'test'
Maintainer: KHARAPSY
Maintainer-email: KHARAPSY <kharapsy@suwalutions.com>
@@ -52,11 +58,17 @@ Provides-Extra: docs
Provides-Extra: test
Description-Content-Type: text/markdown

-
+

# PurrfectKit

-[](https://www.python.org)
+[](https://pypi.org/project/purrfectkit/)
+[](https://pypistats.org/packages/purrfectkit)
+[](https://codecov.io/github/suwalutions/PurrfectKit)
+[](https://github.com/astral-sh/ruff)
+[](https://ghcr.io/suwalutions/purrfectkit)
+[](LICENSE)

**PurrfectKit** is a toolkit that simplifies Retrieval-Augmented Generation (RAG) into 5 easy steps:
1. Suphalak - read content from files
@@ -72,12 +84,11 @@ Description-Content-Type: text/markdown
### Prerequisites
- python
- tesseract
-- git


### Installation
```bash
-pip install
+pip install purrfectkit

```

@@ -88,7 +99,8 @@ from purrfectmeow import Suphalak, Malet, WichienMaat, KhaoManee

file_path = 'test/test.pdf'
metadata = MetaFile.get_metadata(file_path)
-
+with open(file_path, 'rb') as f:
+    content = Suphalak.reading(f, 'test.pdf')
chunks = Malet.chunking(content, chunk_method='token', chunk_size='500', chunk_overlap='25')
docs = DocTemplate.create_template(chunks, metadata)
embedding = WichienMaat.embedding(chunks)
@@ -97,6 +109,6 @@ KhaoManee.searching(query, embedding, docs, 2)

```

-##
+## License

PurrfectKit is released under the [MIT License](LICENSE).
purrfectkit-0.2.2.dist-info/RECORD
ADDED
@@ -0,0 +1,25 @@
+purrfectmeow/__init__.py,sha256=t6Bq_9cB4lsYL3gBoAnSMubeITCqqoYzcC1I8wkD8QY,271
+purrfectmeow/meow/chaus.py,sha256=PG95kQaMgIqQdd2MFUxCGUpz1-8yq7FrY1G9Imz7BEA,402
+purrfectmeow/meow/felis.py,sha256=sIz4kjyH-Y1Qfzy-NLcUkSx5tiairFJmWjbmutGq8YM,5844
+purrfectmeow/meow/kitty.py,sha256=ygbG8L29XwzC9CCz5BoZg5wKuWEWENRPHUPEhRwYSMY,2047
+purrfectmeow/tc01_spl/__init__.py,sha256=7ENCidvXhj9YhMQvBcv_mm4XIr3Mwzc1USQxgzLO0Nw,51
+purrfectmeow/tc01_spl/base.py,sha256=OUZy8u7avz1nlJ9hKVyFYeVkloSagGPW01O_zxyiLwI,3333
+purrfectmeow/tc01_spl/markdown.py,sha256=WEyO8zjXgNJnb082dmvb9lpzJ8cqyOhdJV-Tos8SzPA,2027
+purrfectmeow/tc01_spl/ocr.py,sha256=pWRd3C5K53SyTS8J1QXBAAg_ldtJSVDReyD3nKPkcCQ,4878
+purrfectmeow/tc01_spl/simple.py,sha256=Am1lnuj9QLu5g78HU1QpJk1OjYg-cvetpeaebLJd8z4,2744
+purrfectmeow/tc02_mlt/__init__.py,sha256=qB2Eyc_wFDVELwj0L7ttG_YOL3IISaqPBRj0zqSJcPo,45
+purrfectmeow/tc02_mlt/base.py,sha256=FC_0FiVYd7D8MkpCYdEHlDlOuxEqiOr1T8xdULWGhL4,1635
+purrfectmeow/tc02_mlt/separate.py,sha256=xUM2-qGF9psgJBbbJTLgFsiXuE3ftd4dTNhcVdBuXHE,1134
+purrfectmeow/tc02_mlt/token.py,sha256=siGciepOFgHtaUzTQbaGPmd-Bl7fN2B6Cp6zlOai8Sk,1931
+purrfectmeow/tc03_wcm/__init__.py,sha256=8pXGo04Z5KUNGkhSTONLBlqwVc43LicDGSuQiQDIKIM,57
+purrfectmeow/tc03_wcm/base.py,sha256=pXaaiU8JMLIjI5uJRxMLRnQ1Wmwv3U6EEkQ_IwhPLwg,473
+purrfectmeow/tc03_wcm/local.py,sha256=gfqXqAEDoozhi0EHnDXNLOlWZPzFE9RTeaHjGNVAFQI,1109
+purrfectmeow/tc04_kmn/__init__.py,sha256=FBHZKVu4agf6-p1MdMx0jIgQuKbAy9rsOu7MRIQVwXg,53
+purrfectmeow/tc04_kmn/base.py,sha256=InNetlSjwP9Need94IYvNrSmRYWgcD59KWb6NBrQCkk,482
+purrfectmeow/tc04_kmn/cosine.py,sha256=_zAvnnDH6N0Urz-rScRHxM7umandMODbddzCfTfIwh4,1225
+purrfectmeow/tc05_knj/__init__.py,sha256=XKwISvOAznPdTUWoTUnFDMBmxZF9Qd6FAi711W6bvZY,47
+purrfectmeow/tc05_knj/base.py,sha256=9itMmUvSYAI7G8DdM2H7GyTRC2LEXOsBc1QZf6HiImU,77
+purrfectkit-0.2.2.dist-info/licenses/LICENSE,sha256=9WlLgfJwKDGb71B1NwKYKKg6uL5u_knAr7ovGwIWvD4,1078
+purrfectkit-0.2.2.dist-info/WHEEL,sha256=DpNsHFUm_gffZe1FgzmqwuqiuPC6Y-uBCzibcJcdupM,78
+purrfectkit-0.2.2.dist-info/METADATA,sha256=4CJdWyS8cZVy7M3BdE7_XDW2kOz5OzT9OTSfg3eSh8c,4721
+purrfectkit-0.2.2.dist-info/RECORD,,
purrfectmeow/__init__.py
CHANGED
purrfectmeow/meow/chaus.py
ADDED
@@ -0,0 +1,20 @@
+from typing import TypedDict
+
+from .felis import Document
+
+
+class FileMetadata(TypedDict, total=False):
+    file_name: str
+    file_size: int
+    file_created_date: str
+    file_modified_date: str
+    file_extension: str
+    file_type: str
+    description: str
+    total_pages: int | str
+    file_md5: str
+
+
+class SimilarityResult(TypedDict, total=False):
+    score: float | str
+    document: Document
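For orientation, a minimal sketch of how the two new TypedDicts compose with `Document`; it is not part of the diff, and the literal values are invented for illustration.

```python
from purrfectmeow.meow.chaus import FileMetadata, SimilarityResult
from purrfectmeow.meow.felis import Document

# Both TypedDicts use total=False, so any subset of keys is acceptable.
meta: FileMetadata = {
    "file_name": "test.pdf",
    "file_size": 10_240,
    "file_type": "application/pdf",
    "total_pages": 3,
}

hit: SimilarityResult = {
    "score": 0.87,
    "document": Document(page_content="hello", metadata={"source_info": meta}),
}
print(hit["score"], hit["document"].page_content)
```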
purrfectmeow/meow/felis.py
CHANGED
@@ -1,15 +1,18 @@
-from typing import Any, Dict, List, Union
from io import BytesIO
+from typing import Any
+
+from .chaus import FileMetadata
+

class Document:
-    def __init__(self, page_content: str, metadata:
+    def __init__(self, page_content: str, metadata: dict[str, Any]) -> None:
        self.page_content = page_content
        self.metadata = metadata or {}

-    def __repr__(self):
+    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(page_content={self.page_content!r}, metadata={self.metadata!r})"

-    def __getitem__(self, key):
+    def __getitem__(self, key: str) -> Any:
        if key == "page_content":
            return self.page_content
        elif key == "metadata":
@@ -17,31 +20,29 @@ class Document:
        else:
            raise KeyError(f"{key} is not a valid key. Use 'page_content' or 'metadata'.")

-    def to_dict(self):
-        return {
-
-            "metadata": self.metadata
-        }
+    def to_dict(self) -> dict[str, Any]:
+        return {"page_content": self.page_content, "metadata": self.metadata}
+

class DocTemplate:
    @staticmethod
-    def create_template(chunks:
+    def create_template(chunks: list[str], metadata: dict[str, Any]) -> list[Document]:
        if not isinstance(chunks, list):
            raise TypeError(f"Expected 'chunks' to be a list, but got {type(chunks).__name__}.")

        if not isinstance(metadata, dict):
            raise TypeError(f"Expected 'metadata' to be a dict, but got {type(metadata).__name__}.")
-
+
        if not all(isinstance(c, str) for c in chunks):
            raise ValueError("All elements in 'chunks' must be strings.")

        docs = []
        chunk_hashes = []

-        import uuid
        import hashlib
+        import uuid

-        for
+        for _, chunk in enumerate(chunks):
            hash_val = hashlib.md5(chunk.encode()).hexdigest()
            chunk_hashes.append(hash_val)

@@ -62,22 +63,17 @@ class DocTemplate:
                "chunk_size": chunk_size,
            }

-            doc_metadata = {
-                "chunk_info": chunk_info,
-                "source_info": metadata
-            }
+            doc_metadata = {"chunk_info": chunk_info, "source_info": metadata}

-            doc = Document(
-                page_content=chunk,
-                metadata=doc_metadata
-            )
+            doc = Document(page_content=chunk, metadata=doc_metadata)
            docs.append(doc)

        return docs

+
class MetaFile:
    @staticmethod
-    def get_metadata(file:
+    def get_metadata(file: str | BytesIO, **kwargs: Any) -> FileMetadata:
        if isinstance(file, bytes):
            file = BytesIO(file)

@@ -85,13 +81,13 @@ class MetaFile:
            import os

            os.makedirs(".cache/tmp", exist_ok=True)
-            file_name = kwargs.get(
+            file_name = kwargs.get("file_name")

            if not file_name:
                raise ValueError("file_name must be provided when using BytesIO.")
-
+
            file_path = os.path.join(".cache/tmp", file_name)
-            with open(file_path,
+            with open(file_path, "wb") as f:
                f.write(file.getvalue())

            try:
@@ -101,21 +97,22 @@ class MetaFile:

        elif isinstance(file, str):
            return MetaFile._get_metadata_from_path(file)
-
+
        else:
            raise TypeError(f"Unsupported file type: {type(file).__name__}. Expected str, bytes, or BytesIO.")

    @staticmethod
-    def _get_metadata_from_path(file_path: str) ->
-        metadata = {}
-
+    def _get_metadata_from_path(file_path: str) -> FileMetadata:
+        metadata: FileMetadata = {}
+
+        import hashlib
        import os
        import re
+        import subprocess
        import time
+
        import magic
-
-        import subprocess
-
+
        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File {file_path} does not exist")
@@ -123,12 +120,8 @@ class MetaFile:
            stats = os.stat(file_path)
            metadata["file_name"] = os.path.basename(file_path)
            metadata["file_size"] = stats.st_size
-            metadata["file_created_date"] = time.strftime(
-
-            )
-            metadata["file_modified_date"] = time.strftime(
-                '%Y-%m-%d %H:%M:%S', time.localtime(stats.st_mtime)
-            )
+            metadata["file_created_date"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stats.st_ctime))
+            metadata["file_modified_date"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stats.st_mtime))
            metadata["file_extension"] = os.path.splitext(file_path)[1] or "none"

            try:
@@ -143,12 +136,7 @@ class MetaFile:
                metadata["total_pages"] = 1
            elif metadata["file_type"].startswith("application/pdf"):
                try:
-                    result = subprocess.run(
-                        ['pdfinfo', file_path],
-                        stdout=subprocess.PIPE,
-                        text=True,
-                        check=True
-                    )
+                    result = subprocess.run(["pdfinfo", file_path], stdout=subprocess.PIPE, text=True, check=True)
                    pages_match = re.search(r"Pages:\s*(\d+)", result.stdout)
                    if pages_match:
                        metadata["total_pages"] = int(pages_match.group(1))
@@ -168,4 +156,4 @@ class MetaFile:
            return metadata

        except Exception as e:
-            raise RuntimeError(f"Failed to extract metadata: {
+            raise RuntimeError(f"Failed to extract metadata: {e}") from e
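A hedged usage sketch of the reworked `felis` API shown above (`MetaFile.get_metadata` on a path, then `DocTemplate.create_template`); the file path is illustrative, and metadata extraction still relies on `python-magic` and `pdfinfo` at runtime.

```python
from purrfectmeow.meow.felis import DocTemplate, MetaFile

metadata = MetaFile.get_metadata("test/test.pdf")  # path branch; BytesIO input needs file_name=...

chunks = ["first chunk of text", "second chunk of text"]
docs = DocTemplate.create_template(chunks, metadata)

for doc in docs:
    # __getitem__ accepts "page_content" / "metadata"; chunk_info is attached per chunk.
    print(doc["page_content"], doc.metadata["chunk_info"]["chunk_size"])
```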
purrfectmeow/meow/kitty.py
CHANGED
@@ -2,17 +2,19 @@ import logging
from logging.handlers import RotatingFileHandler
from pathlib import Path

+
class LevelBasedFormatter(logging.Formatter):
-    def __init__(self, default_fmt, info_fmt, datefmt=None):
+    def __init__(self, default_fmt: str, info_fmt: str, datefmt: str | None = None) -> None:
        super().__init__(datefmt=datefmt)
-        self.default_fmt = logging.Formatter(default_fmt, datefmt)
-        self.info_fmt = logging.Formatter(info_fmt, datefmt)
+        self.default_fmt: logging.Formatter = logging.Formatter(default_fmt, datefmt)
+        self.info_fmt: logging.Formatter = logging.Formatter(info_fmt, datefmt)

-    def format(self, record):
+    def format(self, record: logging.LogRecord) -> str:
        if record.levelno == logging.INFO:
            return self.info_fmt.format(record)
        return self.default_fmt.format(record)

+
def kitty_logger(name: str, log_file: str = "kitty.log", log_level: str = "INFO") -> logging.Logger:
    """
    Sets up a logger with console and rotating file handlers.
@@ -43,9 +45,7 @@ def kitty_logger(name: str, log_file: str = "kitty.log", log_level: str = "INFO"
    log_dir.mkdir(parents=True, exist_ok=True)
    log_path = log_dir / log_file

-    file_handler = RotatingFileHandler(
-        log_path, maxBytes=5 * 1024 * 1024, backupCount=3
-    )
+    file_handler = RotatingFileHandler(log_path, maxBytes=5 * 1024 * 1024, backupCount=3)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
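A small sketch of the `kitty_logger` factory whose handlers were reformatted above; the file name is illustrative.

```python
from purrfectmeow.meow.kitty import kitty_logger

# Console plus rotating file handler (5 MB, 3 backups per the diff above).
logger = kitty_logger(__name__, log_file="demo.log", log_level="DEBUG")
logger.info("pipeline started")          # INFO records use the info format
logger.debug("verbose diagnostic line")  # other levels use the default format
```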
purrfectmeow/tc01_spl/base.py
CHANGED
@@ -1,14 +1,15 @@
-from typing import
+from typing import Any, BinaryIO

from .markdown import Markdown
from .ocr import Ocr
from .simple import Simple

+
class Suphalak:
-    tmp_dir =
+    tmp_dir = ".cache/tmp"
    DEFAULT_LOADER = "PYMUPDF4LLM"

-    _LOADERS:
+    _LOADERS: dict[str, dict[str, Any]] = {
        "MARKITDOWN": {
            "func": Markdown.markitdown_convert,
            "ext": ("csv", "docx", "md", "pdf", "pptx", "txt", "xls", "xlsx"),
@@ -67,8 +68,9 @@ class Suphalak:
            return cls.DEFAULT_LOADER

    @classmethod
-    def reading(cls, file: BinaryIO, file_name: str, loader: str = None, **kwargs: Any) -> str:
+    def reading(cls, file: BinaryIO, file_name: str, loader: str | None = None, **kwargs: Any) -> str:
        import os
+
        file_ext = file_name.split(".")[-1].lower()

        if not loader:
@@ -87,6 +89,7 @@ class Suphalak:
        file_path = os.path.join(cls.tmp_dir, file_name)

        try:
+            text: str
            with open(file_path, "wb") as f:
                f.write(file.read())
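A usage sketch for `Suphalak.reading` matching the new `loader: str | None = None` signature; the path is illustrative, and the loader falls back to `DEFAULT_LOADER` ("PYMUPDF4LLM") when none is given.

```python
from purrfectmeow import Suphalak

with open("test/test.pdf", "rb") as f:
    # Override the inferred backend with loader="MARKITDOWN" if desired.
    content = Suphalak.reading(f, "test.pdf")

print(content[:200])
```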
purrfectmeow/tc01_spl/markdown.py
CHANGED
@@ -1,24 +1,25 @@
import time
-from
+from collections.abc import Callable
+from typing import Any

from purrfectmeow.meow.kitty import kitty_logger

+
class Markdown:
-
    _logger = kitty_logger(__name__)

    @classmethod
-    def _convert(cls, file_path: str, converter: Callable, extractor: Callable) -> str:
+    def _convert(cls, file_path: str, converter: Callable[[str], Any], extractor: Callable[[Any], str]) -> str:
        cls._logger.debug(f"Starting conversion for '{file_path}'")
        start = time.time()
        try:
-
-            result = extractor(
+            raw_content: Any = converter(file_path)
+            result: str = extractor(raw_content)

            cls._logger.debug(f"Succesfully converted '{file_path}'")

            return result
-
+
        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Conversion time spent '{elapsed:.2f}' seconds.")
@@ -28,16 +29,22 @@ class Markdown:
        cls._logger.debug("Using MarkItDown for Conversion")

        from markitdown import MarkItDown
-
-
+
+        mid = MarkItDown()
+
+        return cls._convert(file_path, lambda path: mid.convert(path), lambda content: content.text_content)

    @classmethod
    def docling_convert(cls, file_path: str) -> str:
        cls._logger.debug("Using Docling for Conversion")
-
+
        from docling.document_converter import DocumentConverter

-
+        dcl = DocumentConverter()
+
+        return cls._convert(
+            file_path, lambda path: dcl.convert(path).document, lambda content: content.document.export_to_markdown()
+        )

    @classmethod
    def pymupdf4llm_convert(cls, file_path: str) -> str:
@@ -48,7 +55,7 @@ class Markdown:
        import pymupdf4llm

        try:
-            res = pymupdf4llm.to_markdown(file_path)
+            res: str = pymupdf4llm.to_markdown(file_path)
            cls._logger.debug(f"Succesfully converted '{file_path}'")

            return res
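The converter/extractor pair passed to `Markdown._convert` is what changed most above; a hedged sketch of the public classmethods that wrap it (paths illustrative):

```python
from purrfectmeow.tc01_spl.markdown import Markdown

md_text = Markdown.pymupdf4llm_convert("test/test.pdf")
# md_text = Markdown.markitdown_convert("test/test.docx")
# md_text = Markdown.docling_convert("test/test.pdf")
print(md_text[:200])
```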
purrfectmeow/tc01_spl/ocr.py
CHANGED
@@ -1,26 +1,34 @@
import time
-from
+from collections.abc import Callable
+from typing import Any

from purrfectmeow.meow.kitty import kitty_logger

-class Ocr:

+class Ocr:
    _logger = kitty_logger(__name__)
    _image_type = [
-        ".apng",
+        ".apng",
+        ".png",
        ".avif",
        ".gif",
-        ".jpg",
+        ".jpg",
+        ".jpeg",
+        ".jfif",
+        ".pjpeg",
+        ".pjp",
        ".png",
        ".svg",
        ".webp",
        ".bmp",
-        ".ico",
-        ".
+        ".ico",
+        ".cur",
+        ".tif",
+        ".tiff",
    ]

    @classmethod
-    def _convert(cls, file_path: str, converter: Callable) -> str:
+    def _convert(cls, file_path: str, converter: Callable[[str], Any]) -> str:
        cls._logger.debug(f"Starting conversion for '{file_path}'")
        start = time.time()

@@ -28,7 +36,6 @@ class Ocr:
            content = []
            match file_path.lower():
                case path if path.endswith(".pdf"):
-
                    from pdf2image import convert_from_path

                    images = convert_from_path(file_path, fmt="png")
@@ -37,12 +44,11 @@ class Ocr:
                            text = converter(image)
                            cls._logger.debug(f"Text: {text}")
                            content.append(text)
-                            cls._logger.debug(f"Page {idx+1} processed")
+                            cls._logger.debug(f"Page {idx + 1} processed")
                        except Exception as e:
-                            cls._logger.exception(f"Page {idx+1} failed: {e}")
+                            cls._logger.exception(f"Page {idx + 1} failed: {e}")
                            raise
                case path if path.endswith(tuple(cls._image_type)):
-
                    from PIL import Image

                    image = Image.open(file_path)
@@ -61,41 +67,39 @@ class Ocr:
        finally:
            elasped = time.time() - start
            cls._logger.debug(f"Conversion time spent '{elasped:.2f}' seconds.")
-
+
    @classmethod
    def pytesseract_convert(cls, file_path: str) -> str:
        cls._logger.debug("Using PyTesseract for Conversion")

-        def converter(image):
+        def converter(image: str) -> Any:
            import pytesseract

            return pytesseract.image_to_string(image, lang="tha+eng")
-
+
        return cls._convert(file_path, converter)

    @classmethod
    def easyocr_convert(cls, file_path: str) -> str:
        cls._logger.debug("Using EasyOCR for Conversion")
-
-        def converter(image):
+
+        def converter(image: str) -> Any:
            import easyocr
            import numpy

-            reader = easyocr.Reader(
-                ['th', 'en'],
-                gpu=False
-            )
+            reader = easyocr.Reader(["th", "en"], gpu=False)
            res = reader.readtext(numpy.array(image))
            return "\n".join(text for _, text, _ in res)
+
        return cls._convert(file_path, converter)

    @classmethod
    def suryaocr_convert(cls, file_path: str) -> str:
        cls._logger.debug("Using SuryaOCR for Conversion")
-
-        def converter(image):
-            from surya.recognition import RecognitionPredictor
+
+        def converter(image: str) -> Any:
            from surya.detection import DetectionPredictor
+            from surya.recognition import RecognitionPredictor

            rec_pred = RecognitionPredictor()
            det_pred = DetectionPredictor()
@@ -107,20 +111,23 @@ class Ocr:
                recognition_batch_size=1,
            )
            return "\n".join(line.text for line in prediction[0].text_lines)
+
        return cls._convert(file_path, converter)

    @classmethod
    def doctr_convert(cls, file_path: str) -> str:
        cls._logger.debug("Using docTR for Conversion")

-        def converter(image):
+        def converter(image: str) -> str:
            import os
+            import shutil
            import tempfile
+
            from doctr.io import DocumentFile
            from doctr.models import ocr_predictor

            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-
+                shutil.copy(image, tmp.name)
                temp_image_path = tmp.name

            model = ocr_predictor(pretrained=True)
@@ -130,12 +137,13 @@ class Ocr:
            combined_text = "\n".join(
                word["value"]
                for page in data["pages"]
-                for block in page.get(
-                for line in block.get(
-                for word in line.get(
+                for block in page.get("blocks", [])
+                for line in block.get("lines", [])
+                for word in line.get("words", [])
                if "value" in word
            )
            if os.path.exists(temp_image_path):
                os.remove(temp_image_path)
            return combined_text
+
        return cls._convert(file_path, converter)
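A sketch of the OCR entry points reformatted above; each one wraps `Ocr._convert`, which rasterises PDFs with pdf2image and opens images with PIL. The path is illustrative, and `pytesseract` requires a local tesseract install (a stated prerequisite of the package).

```python
from purrfectmeow.tc01_spl.ocr import Ocr

text = Ocr.pytesseract_convert("scan/page.png")
# text = Ocr.easyocr_convert("scan/page.png")
print(text)
```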
purrfectmeow/tc01_spl/simple.py
CHANGED
@@ -1,20 +1,21 @@
import time
-from
+from collections.abc import Callable
+from typing import Any

from purrfectmeow.meow.kitty import kitty_logger

-class Simple:

+class Simple:
    _logger = kitty_logger(__name__)

    @classmethod
-    def _convert(cls, file_path: str, converter: Callable) -> str:
+    def _convert(cls, file_path: str, converter: Callable[[str], Any]) -> str | Any:
        cls._logger.debug(f"Starting conversion for '{file_path}'")
        start = time.time()

        try:
            res = converter(file_path)
-
+
            cls._logger.debug(f"Successfully converted '{file_path}'")
            return res

@@ -26,39 +27,49 @@ class Simple:
    def encoding_convert(cls, file_path: str) -> str:
        cls._logger.debug("Using Encoding for Conversion")

-        def reader(file_path):
-            with open(file_path,
+        def reader(file_path: str) -> str:
+            with open(file_path, encoding="utf-8") as f:
                return f.read()
+
        return cls._convert(file_path, lambda file_path: reader(file_path))

    @classmethod
    def pymupdf_convert(cls, file_path: str) -> str:
        cls._logger.debug("Using PyMuPDF for Conversion")

-        def reader(file_path):
+        def reader(file_path: str) -> str:
            import pymupdf

-            if file_path.endswith((
+            if file_path.endswith((".txt", ".md", ".json", ".html", ".xml")):
                return "".join(page.get_text() for page in pymupdf.open(file_path, filetype="txt"))
            else:
                return "".join(page.get_text() for page in pymupdf.open(file_path))
+
        return cls._convert(file_path, lambda file_path: reader(file_path))

    @classmethod
    def pandas_convert(cls, file_path: str) -> str:
        cls._logger.debug("Using Pandas for Conversion")

-        def reader(file_path):
+        def reader(file_path: str) -> Any:
            import pandas

-            if file_path.endswith((
-
-
-
-
-            return
-            elif file_path.endswith(
-
-
-
+            if file_path.endswith((".xls", ".xlsx")):
+                df_x: pandas.DataFrame = pandas.read_excel(file_path)
+                return df_x.to_string(index=False)
+            elif file_path.endswith(".csv"):
+                df_c: pandas.DataFrame = pandas.read_csv(file_path)
+                return df_c.to_string(index=False)
+            elif file_path.endswith(".json"):
+                df_j: pandas.DataFrame = pandas.read_json(file_path)
+                return df_j.to_string(index=False)
+            elif file_path.endswith(".html"):
+                df_h: list[pandas.DataFrame] = pandas.read_html(file_path)
+                return "".join(df.to_string(index=False) for df in df_h)
+            elif file_path.endswith(".xml"):
+                df_m: pandas.DataFrame = pandas.read_xml(file_path)
+                return df_m.to_string(index=False)
+            else:
+                return ""
+
        return cls._convert(file_path, lambda file_path: reader(file_path))
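A sketch of the `Simple` readers whose pandas branch was filled out above; `pandas_convert` dispatches on the extension (.xls/.xlsx, .csv, .json, .html, .xml) and falls back to an empty string. Paths are illustrative.

```python
from purrfectmeow.tc01_spl.simple import Simple

table_text = Simple.pandas_convert("data/table.csv")
plain_text = Simple.encoding_convert("notes/readme.txt")
print(table_text.splitlines()[:3])
```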
purrfectmeow/tc02_mlt/base.py
CHANGED
@@ -1,34 +1,43 @@
-from typing import Any,
+from typing import Any, Literal
+
+from langchain_text_splitters import TokenTextSplitter

-from .token import TokenSplit
from .separate import SeparateSplit
+from .token import TokenSplit
+

class Malet:
-    DEFAULT_MODEL_NAME =
+    DEFAULT_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    DEFAULT_CHUNK_SIZE = 500
    DEFAULT_CHUNK_OVERLAP = 0
-    DEFAULT_CHUNK_SEPARATOR =
+    DEFAULT_CHUNK_SEPARATOR = "\n\n"

    @staticmethod
-    def _get_kwarg(kwargs: dict, keys:
+    def _get_kwarg(kwargs: dict[str, Any], keys: list[str], default: Any = None) -> Any:
        for key in keys:
            if key in kwargs:
                return kwargs[key]
        return default

    @classmethod
-    def chunking(
+    def chunking(
+        cls, text: str, chunk_method: Literal["token", "separate"] | None = "token", **kwargs: Any
+    ) -> TokenTextSplitter | SeparateSplit.CharacterSeparator:
        match chunk_method:
            case "token":
                model_name = cls._get_kwarg(kwargs, ["model_name", "ModelName", "modelName"], cls.DEFAULT_MODEL_NAME)
                chunk_size = cls._get_kwarg(kwargs, ["chunk_size", "ChunkSize", "chunkSize"], cls.DEFAULT_CHUNK_SIZE)
-                chunk_overlap = cls._get_kwarg(
+                chunk_overlap = cls._get_kwarg(
+                    kwargs, ["chunk_overlap", "ChunkOverlap", "chunkOverlap"], cls.DEFAULT_CHUNK_OVERLAP
+                )

                method = TokenSplit.splitter(model_name, chunk_size, chunk_overlap)
-                return method.split_text(text)

            case "separate":
-                chunk_separator = cls._get_kwarg(
+                chunk_separator = cls._get_kwarg(
+                    kwargs, ["chunk_separator", "ChunkSeparator", "chunkSeparator"], cls.DEFAULT_CHUNK_SEPARATOR
+                )

                method = SeparateSplit.splitter(chunk_separator)
-
+
+        return method.split_text(text)
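A hedged sketch of `Malet.chunking` after the refactor above (the return now happens once, after the `match`); the chunk sizes are illustrative, and the CamelCase kwarg aliases remain accepted via `_get_kwarg`.

```python
from purrfectmeow import Malet

text = "PurrfectKit simplifies RAG.\n\nIt reads, chunks, embeds and searches."

token_chunks = Malet.chunking(text, chunk_method="token", chunk_size=50, chunk_overlap=5)
para_chunks = Malet.chunking(text, chunk_method="separate")  # splits on the default "\n\n"

print(len(token_chunks), len(para_chunks))
```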
purrfectmeow/tc02_mlt/separate.py
CHANGED
@@ -1,12 +1,15 @@
+from __future__ import annotations
+
import time

from purrfectmeow.meow.kitty import kitty_logger

+
class SeparateSplit:
    _logger = kitty_logger(__name__)
-
+
    @classmethod
-    def splitter(cls, chunk_separator: str):
+    def splitter(cls, chunk_separator: str) -> CharacterSeparator:
        cls._logger.debug("Initializing separate splitter")
        start = time.time()

@@ -25,8 +28,8 @@ class SeparateSplit:
    class CharacterSeparator:
        def __init__(self, separator: str):
            self.separator = separator
-
-        def split_text(self, text: str):
+
+        def split_text(self, text: str) -> list[str]:
            chunks = [chunk + self.separator for chunk in text.split(self.separator)]
            chunks[-1] = chunks[-1].rstrip(self.separator)
            return chunks
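A short sketch of the separator splitter whose return type was annotated above:

```python
from purrfectmeow.tc02_mlt.separate import SeparateSplit

splitter = SeparateSplit.splitter("\n\n")            # returns a CharacterSeparator
chunks = splitter.split_text("intro\n\nbody\n\noutro")
# Every chunk keeps its trailing separator except the last one.
print(chunks)
```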
purrfectmeow/tc02_mlt/token.py
CHANGED
@@ -1,45 +1,38 @@
import time

+from langchain_text_splitters import TokenTextSplitter
+
from purrfectmeow.meow.kitty import kitty_logger

+
class TokenSplit:
    _logger = kitty_logger(__name__)

-    _OPENAI_EMBED_MODEL = {
-
-
-        'text-embedding-3-large'
-    }
-    _OPENAI_HF_MODEL = {
-        'Xenova/text-embedding-ada-002'
-    }
-    _HF_MODEL_DIR = '.cache/huggingface/hub/'
+    _OPENAI_EMBED_MODEL = {"text-embedding-ada-002", "text-embedding-3-small", "text-embedding-3-large"}
+    _OPENAI_HF_MODEL = {"Xenova/text-embedding-ada-002"}
+    _HF_MODEL_DIR = ".cache/huggingface/hub/"

    @classmethod
-    def splitter(cls, model_name: str, chunk_size: int, chunk_overlap: int):
+    def splitter(cls, model_name: str, chunk_size: int, chunk_overlap: int) -> TokenTextSplitter:
        cls._logger.debug("Initializing token splitter")
        start = time.time()
-
+
        try:
            cls._logger.debug(f"Using OpenAI model tokenizer: {model_name}")
-            from langchain_text_splitters import TokenTextSplitter
            if model_name in cls._OPENAI_EMBED_MODEL:
                splitter = TokenTextSplitter.from_tiktoken_encoder(
-                    model_name=model_name,
-                    chunk_size=chunk_size,
-                    chunk_overlap=chunk_overlap
+                    model_name=model_name, chunk_size=chunk_size, chunk_overlap=chunk_overlap
                )
            else:
                cls._logger.debug(f"Using HuggingFace tokenizer: {model_name}")
                from transformers import AutoTokenizer, GPT2TokenizerFast
+
                if model_name in cls._OPENAI_HF_MODEL:
                    tokenizer = GPT2TokenizerFast.from_pretrained(model_name, cache_dir=cls._HF_MODEL_DIR)
                else:
                    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cls._HF_MODEL_DIR)
                splitter = TokenTextSplitter.from_huggingface_tokenizer(
-                    tokenizer=tokenizer,
-                    chunk_size=chunk_size,
-                    chunk_overlap=chunk_overlap
+                    tokenizer=tokenizer, chunk_size=chunk_size, chunk_overlap=chunk_overlap
                )

            cls._logger.debug("Token splitter successfully initialized.")
@@ -52,4 +45,3 @@ class TokenSplit:
        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Token splitting completed in {elapsed:.2f} seconds.")
-
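A sketch of `TokenSplit.splitter`, which now returns a `TokenTextSplitter` imported at module level; OpenAI embedding model names go through tiktoken, while anything else is loaded as a HuggingFace tokenizer cached under `.cache/huggingface/hub/`. The model name and sizes here are illustrative.

```python
from purrfectmeow.tc02_mlt.token import TokenSplit

splitter = TokenSplit.splitter(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", 500, 25
)
chunks = splitter.split_text("some long multilingual document text ...")
print(len(chunks))
```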
purrfectmeow/tc03_wcm/local.py
CHANGED
@@ -1,24 +1,26 @@
import time
-from typing import
+from typing import Any
+
+import numpy

from purrfectmeow.meow.kitty import kitty_logger

-class Local:

+class Local:
    _logger = kitty_logger(__name__)
-
-    _HF_MODEL_DIR = '.cache/huggingface/hub/'
+    _HF_MODEL_DIR = ".cache/huggingface/hub/"

    @classmethod
-    def model_encode(cls, sentence: str |
+    def model_encode(cls, sentence: str | list[str], model_name: str, **kwargs: Any) -> numpy.ndarray:
        cls._logger.debug("Initializing local model encode")
        start = time.time()
        try:
            from sentence_transformers import SentenceTransformer
+
            model = SentenceTransformer(
-                model_name,
+                model_name,
                cache_folder=cls._HF_MODEL_DIR,
-                #local_files_only=True
+                # local_files_only=True
            )

            embed = model.encode(sentence, convert_to_numpy=True)
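A sketch of `Local.model_encode` with the new `numpy.ndarray` return annotation; the model name matches the default used elsewhere in the package, and the sentences are illustrative.

```python
from purrfectmeow.tc03_wcm.local import Local

vectors = Local.model_encode(
    ["hello world", "สวัสดีครับ"],
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)
print(vectors.shape)  # one embedding row per input sentence
```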
purrfectmeow/tc04_kmn/base.py
CHANGED
@@ -1,8 +1,18 @@
+import numpy

-from .
-
+from purrfectmeow.meow.chaus import SimilarityResult
+from purrfectmeow.meow.felis import Document
+
+from .cosine import CosineSim

-@classmethod
-def searching(cls, query_embed, sentence_embed, document, top_k):

-
+class KhaoManee:
+    @classmethod
+    def searching(
+        cls,
+        query_embed: numpy.ndarray,
+        sentence_embed: numpy.ndarray | list[numpy.ndarray],
+        documents: list[Document],
+        top_k: int,
+    ) -> list[SimilarityResult]:
+        return CosineSim.vector_search(query_embed, sentence_embed, documents, top_k)
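A hedged end-to-end sketch of the new `KhaoManee.searching` signature. It follows the README quick-start and assumes `WichienMaat.embedding` also accepts a single query string, which this diff does not show.

```python
from purrfectmeow import KhaoManee, WichienMaat
from purrfectmeow.meow.felis import DocTemplate

chunks = ["cats are great", "dogs are loyal", "parrots can talk"]
docs = DocTemplate.create_template(chunks, {"file_name": "pets.txt"})

sentence_embed = WichienMaat.embedding(chunks)
query_embed = WichienMaat.embedding("which pet talks?")   # assumption: str input is supported

for hit in KhaoManee.searching(query_embed, sentence_embed, docs, top_k=2):
    print(hit["score"], hit["document"].page_content)
```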
purrfectmeow/tc04_kmn/cosine.py
CHANGED
@@ -1,22 +1,23 @@
import time
-from typing import List

import numpy
-from purrfectmeow.meow.felis import Document

+from purrfectmeow.meow.chaus import SimilarityResult
+from purrfectmeow.meow.felis import Document
from purrfectmeow.meow.kitty import kitty_logger

-
+
+class CosineSim:
    _logger = kitty_logger(__name__)

    @classmethod
    def vector_search(
-        cls,
-        embed_query: numpy.ndarray,
-        embed_sentence: numpy.ndarray |
-
-        top_k: int
-    ):
+        cls,
+        embed_query: numpy.ndarray,
+        embed_sentence: numpy.ndarray | list[numpy.ndarray],
+        documents: list[Document],
+        top_k: int,
+    ) -> list[SimilarityResult]:
        cls._logger.debug("Initializing vector search")
        start = time.time()
        try:
@@ -25,10 +26,9 @@ class ConsineSim:
            sims = cosine_similarity([embed_query], embed_sentence)[0]
            top_indices = numpy.argsort(sims)[::-1][:top_k]

-            results = [
-
-
-            } for i in top_indices]
+            results: list[SimilarityResult] = [
+                SimilarityResult(score=float(sims[i]), document=documents[i]) for i in top_indices
+            ]

            return results
        except Exception as e:
@@ -37,4 +37,3 @@ class ConsineSim:
        finally:
            elapsed = time.time() - start
            cls._logger.debug(f"Vector search completed in {elapsed:.2f} seconds.")
-
purrfectmeow/tc05_knj/base.py
CHANGED
purrfectkit-0.2.1.dist-info/RECORD
REMOVED
@@ -1,24 +0,0 @@
-purrfectmeow/__init__.py,sha256=XEej-s0VH-Up9aob3XcDQqgS55Ftk_qNoXezdcedFJQ,271
-purrfectmeow/meow/felis.py,sha256=8d1kaizsEisr7dW-MKw8HqsYfOkLBGy-sYTv-4kClQ8,6149
-purrfectmeow/meow/kitty.py,sha256=WaLuh2t1PnigWYDNZlbNfCA_uqXnPYc-xxDuZlFfNNY,1971
-purrfectmeow/tc01_spl/__init__.py,sha256=7ENCidvXhj9YhMQvBcv_mm4XIr3Mwzc1USQxgzLO0Nw,51
-purrfectmeow/tc01_spl/base.py,sha256=iuIZiPUe-ofeF_PmknnCg-4NsJxDoH7rj-SMsqNBTAQ,3308
-purrfectmeow/tc01_spl/markdown.py,sha256=AUCSZ-6W0sXbZwGgZfe6utidbEemQGoi6c4rsLiH928,1861
-purrfectmeow/tc01_spl/ocr.py,sha256=A3orLTIVmu2WYJTi4joWlTmV27IDh3MTa7qc7IRAQkE,4784
-purrfectmeow/tc01_spl/simple.py,sha256=dwecYL2sviKz4BoJcOQntAprXACvaEig-ZbDiwTW-cU,2347
-purrfectmeow/tc02_mlt/__init__.py,sha256=qB2Eyc_wFDVELwj0L7ttG_YOL3IISaqPBRj0zqSJcPo,45
-purrfectmeow/tc02_mlt/base.py,sha256=cz1qFo1AdL-I2wnBPO06MhcYSQh90tcLCN99phIUKWw,1508
-purrfectmeow/tc02_mlt/separate.py,sha256=YQSnC5BODg1cJh4JrPkT_-tO1CbwgpxuCjMvHQwRUNE,1074
-purrfectmeow/tc02_mlt/token.py,sha256=qULVySiTAbDoBQrtWWuvPkO5Zqf5hjRutN1Q7foCwUU,2052
-purrfectmeow/tc03_wcm/__init__.py,sha256=8pXGo04Z5KUNGkhSTONLBlqwVc43LicDGSuQiQDIKIM,57
-purrfectmeow/tc03_wcm/base.py,sha256=pXaaiU8JMLIjI5uJRxMLRnQ1Wmwv3U6EEkQ_IwhPLwg,473
-purrfectmeow/tc03_wcm/local.py,sha256=5AfVSftW_cfaZBZBe-joSMJRRJ55G0g5lf9Qtcl0LUw,1074
-purrfectmeow/tc04_kmn/__init__.py,sha256=FBHZKVu4agf6-p1MdMx0jIgQuKbAy9rsOu7MRIQVwXg,53
-purrfectmeow/tc04_kmn/base.py,sha256=rj3Ar2Pv8VOL7vKvPB-snif8SRwBbGaLbWIpHFpd5b8,224
-purrfectmeow/tc04_kmn/cosine.py,sha256=DaDXVcy6YyNc5jwtPXeQg040FT7607phyt5Ub74E9aw,1147
-purrfectmeow/tc05_knj/__init__.py,sha256=XKwISvOAznPdTUWoTUnFDMBmxZF9Qd6FAi711W6bvZY,47
-purrfectmeow/tc05_knj/base.py,sha256=qN1VCx20G5H7YHcVzmg0YNXMLZM7TPkiD_UMEZykfjE,70
-purrfectkit-0.2.1.dist-info/licenses/LICENSE,sha256=9WlLgfJwKDGb71B1NwKYKKg6uL5u_knAr7ovGwIWvD4,1078
-purrfectkit-0.2.1.dist-info/WHEEL,sha256=5w2T7AS2mz1-rW9CNagNYWRCaB0iQqBMYLwKdlgiR4Q,78
-purrfectkit-0.2.1.dist-info/METADATA,sha256=cSe3NLmt6D8LaZSpilNU1c3G9k0P5XGThncqp6K2Crk,3765
-purrfectkit-0.2.1.dist-info/RECORD,,
{purrfectkit-0.2.1.dist-info → purrfectkit-0.2.2.dist-info}/licenses/LICENSE
File without changes