llmflowstack 1.1.0__tar.gz → 1.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/PKG-INFO +1 -1
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/models/LLaMA4.py +2 -1
- llmflowstack-1.1.2/llmflowstack/rag/pipeline.py +279 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/pyproject.toml +1 -1
- llmflowstack-1.1.0/llmflowstack/rag/pipeline.py +0 -114
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/.github/workflows/python-publish.yml +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/.gitignore +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/LICENSE +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/README.md +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/__init__.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/base/__init__.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/base/base.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/callbacks/__init__.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/callbacks/log_collector.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/callbacks/stop_on_token.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/models/GPT_OSS.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/models/Gemma.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/models/LLaMA3.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/models/MedGemma.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/models/__init__.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/rag/__iinit__.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/schemas/__init__.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/schemas/params.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/utils/__init__.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/utils/evaluation_methods.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/utils/exceptions.py +0 -0
- {llmflowstack-1.1.0 → llmflowstack-1.1.2}/llmflowstack/utils/generation_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: llmflowstack
|
|
3
|
-
Version: 1.1.0
|
|
3
|
+
Version: 1.1.2
|
|
4
4
|
Summary: LLMFlowStack is a framework for training and using LLMs (LLaMA, GPT-OSS, Gemma, ...). Supports DAPT, fine-tuning, and distributed inference. Public fork without institution-specific components.
|
|
5
5
|
Author-email: Gustavo Henrique Ferreira Cruz <gustavohferreiracruz@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
import uuid
|
|
2
|
+
|
|
3
|
+
import chromadb
|
|
4
|
+
import chromadb.config
|
|
5
|
+
from langchain_chroma import Chroma
|
|
6
|
+
from langchain_core.documents import Document
|
|
7
|
+
from langchain_core.embeddings import Embeddings
|
|
8
|
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
9
|
+
from sentence_transformers import SentenceTransformer
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class EncoderWrapper(Embeddings):
    """
    Adapter that exposes a SentenceTransformer through LangChain's `Embeddings` interface.

    Both embedding methods call `model.encode` with `task="retrieval"` and the progress bar disabled, then convert the result with `.tolist()`.
    """

    def __init__(self, model: SentenceTransformer) -> None:
        """
        Stores the encoder used by every embedding call.

        Args:
            model (SentenceTransformer): Encoder whose `encode` method produces the vectors.
        """
        self.model = model

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """
        Embeds a batch of texts.

        Args:
            texts (list[str]): Texts to encode in one `encode` call.

        Returns:
            list[list[float]]: The encoder output converted with `.tolist()`.
        """
        # NOTE(review): presumably the checkpoint's encode() accepts a `task` argument — confirm per model.
        return self.model.encode(
            texts,
            task="retrieval",
            show_progress_bar=False,
        ).tolist()

    def embed_query(self, text: str) -> list[float]:
        """
        Embeds a single query string.

        Args:
            text (str): Query text to encode.

        Returns:
            list[float]: The encoder output converted with `.tolist()`.
        """
        return self.model.encode(
            text,
            task="retrieval",
            show_progress_bar=False,
        ).tolist()
|
|
32
|
+
|
|
33
|
+
class RAGPipeline:
|
|
34
|
+
"""
|
|
35
|
+
A modular Retrieval-Augmented Generation (RAG) pipeline for embedding, indexing, and retrieving scientific or textual data using SentenceTransformers and Chroma as a vector store.
|
|
36
|
+
|
|
37
|
+
Supports both persistent (disk-based) and transient (in-memory) modes depending on whether `persist_directory` is provided.
|
|
38
|
+
"""
|
|
39
|
+
def __init__(
    self,
    checkpoint: str,
    collection_name: str = "rag_memory",
    persist_directory: str | None = None,
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> None:
    """
    Builds the encoder, the Chroma-backed collection and the text splitter.

    Args:
        checkpoint (str): Path or name of the SentenceTransformer checkpoint.
        collection_name (str): Name of the Chroma collection to create or load.
        persist_directory (str | None): Directory handed to Chroma for storage. None is passed through unchanged (documented by this class as in-memory mode).
        chunk_size (int): Maximum size (in characters) for text chunks during indexing.
        chunk_overlap (int): Overlap (in characters) between consecutive text chunks.
    """
    # NOTE(review): trust_remote_code=True allows the checkpoint to run its own
    # Python code while loading — only use checkpoints from trusted sources.
    encoder = SentenceTransformer(checkpoint, trust_remote_code=True)
    self.encoder = encoder

    # Telemetry is switched off explicitly through the client settings.
    self.collection = Chroma(
        collection_name=collection_name,
        embedding_function=EncoderWrapper(encoder),
        persist_directory=persist_directory,
        client_settings=chromadb.config.Settings(anonymized_telemetry=False),
    )

    # add_start_index=True asks the splitter to record each chunk's start offset in its metadata.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "add_start_index": True,
    }
    self.splitter = RecursiveCharacterTextSplitter(**splitter_options)
|
|
75
|
+
|
|
76
|
+
def index_documents(
    self,
    docs: list[Document],
    ids: list[str],
    can_split: bool = True
) -> None:
    """
    Indexes a list of documents into the Chroma vector store.

    Each document is tagged with its `source_id` (taken from `ids`), optionally split into chunks, and every chunk is stored under the id `"<source_id>_<position>"`, where position is the chunk's index within this call.

    Args:
        docs (list[Document]): List of LangChain `Document` objects to index. Their `metadata` is updated in place with `source_id`.
        ids (list[str]): Unique identifiers, one per document, in the same order as `docs`.
        can_split (bool): Whether to split documents into smaller chunks before indexing. Set to False to index each document as a single entry.

    Returns:
        None

    Raises:
        ValueError: If `docs` and `ids` do not have the same length.
    """
    # zip() would silently drop the surplus items and leave some documents
    # without their source_id, so reject mismatched input up front.
    if len(docs) != len(ids):
        raise ValueError(
            f"docs and ids must have the same length (got {len(docs)} and {len(ids)})"
        )

    for doc, src_id in zip(docs, ids):
        if doc.metadata is None:
            doc.metadata = {}
        doc.metadata["source_id"] = src_id

    splits = self.splitter.split_documents(docs) if can_split else docs
    if not splits:
        # Nothing to store; skip the empty write to the collection.
        return

    split_ids = []
    for position, chunk in enumerate(splits):
        source = chunk.metadata.get("source_id", "unknown")
        split_ids.append(f"{source}_{position}")

    self.collection.add_texts(
        texts=[chunk.page_content for chunk in splits],
        ids=split_ids,
        # Copy so the stored metadata is decoupled from the chunk objects.
        metadatas=[chunk.metadata.copy() for chunk in splits],
    )
|
|
123
|
+
|
|
124
|
+
def create(
    self,
    information: str,
    other_info: dict[str, str] | None = None,
    doc_id: str | None = None,
    should_index: bool = True,
    can_split: bool = True
) -> Document:
    """
    Creates a new `Document` and optionally indexes it in the collection.

    The document's metadata is `other_info` plus a `source_id` entry holding the document id. `source_id` is reserved: a value supplied through `other_info` is replaced by `doc_id`, so the returned document always agrees with the id used for indexing.

    Args:
        information (str): Main textual content of the document.
        other_info (dict[str, str] | None): Optional metadata fields to include.
        doc_id (str | None): Custom document identifier. If None, a UUID is generated.
        should_index (bool): Whether to immediately add the document to the vector store.
        can_split (bool): Whether to allow splitting before indexing.

    Returns:
        Document: The created LangChain `Document` object (indexed if specified).
    """
    if doc_id is None:
        doc_id = str(uuid.uuid4())

    metadata = {"source_id": doc_id, **(other_info or {})}
    # Re-assert the reserved key in case other_info carried its own
    # "source_id"; update() looks documents up by this value.
    metadata["source_id"] = doc_id

    doc = Document(page_content=information, metadata=metadata)

    if should_index:
        self.index_documents(
            docs=[doc],
            ids=[doc_id],
            can_split=can_split
        )

    return doc
|
|
167
|
+
|
|
168
|
+
def update(
    self,
    doc_id: str,
    new_information: str,
    other_info: dict[str, str] | None = None,
    can_split: bool = True
) -> Document:
    """
    Updates an existing document in the collection with new content and metadata.

    All entries whose `source_id` metadata equals `doc_id` are deleted, then a new document is created and indexed under the same id via `create()`.

    Args:
        doc_id (str): Identifier of the document to update.
        new_information (str): Updated text content for the document.
        other_info (dict[str, str] | None): Optional new metadata to associate.
        can_split (bool): Whether the replacement may be split into chunks. Pass False for documents that were indexed as a single entry. Defaults to True, matching the previous behavior.

    Returns:
        Document: The newly created (updated) `Document` object.
    """
    # Chunk ids carry a positional suffix, so find them through the
    # source_id metadata rather than by the bare document id.
    existing = self.collection.get(where={"source_id": doc_id})
    stale_ids = existing.get("ids", [])

    if stale_ids:
        self.collection.delete(ids=stale_ids)

    return self.create(
        information=new_information,
        other_info={} if other_info is None else other_info,
        doc_id=doc_id,
        can_split=can_split
    )
|
|
206
|
+
|
|
207
|
+
def delete(
    self,
    doc_id: str
) -> None:
    """
    Deletes all indexed entries associated with a specific document ID.

    Entries are stored under ids of the form `"<doc_id>_<position>"`, so the bare `doc_id` never matches a stored id. The entries are therefore located through their `source_id` metadata and removed by their actual ids.

    Args:
        doc_id (str): Identifier of the document to delete.

    Returns:
        None
    """
    matches = self.collection.get(where={"source_id": doc_id})
    ids_to_delete = matches.get("ids", [])

    # Skip the delete call when the document is not indexed.
    if ids_to_delete:
        self.collection.delete(ids=ids_to_delete)
|
|
223
|
+
|
|
224
|
+
def rquery(
    self,
    query: str,
    k: int = 4,
    filter: dict | None = None
) -> list[Document]:
    """
    Run a **raw semantic search** against the collection.

    The arguments are forwarded unchanged to the collection's `similarity_search` and its result is returned as-is.

    Args:
        query (str): The natural-language query text to embed and search for.
        k (int, optional): Number of top results to return. Defaults to 4.
        filter (dict | None, optional): Metadata filter applied during search
            (e.g., {"type": "article"}). Defaults to None.

    Returns:
        list[Document]: The documents returned by the similarity search.
    """
    search_args = {"query": query, "k": k, "filter": filter}
    return self.collection.similarity_search(**search_args)
|
|
249
|
+
|
|
250
|
+
def query(
    self,
    query: str,
    k: int = 4,
    filter: dict | None = None
) -> str:
    """
    Perform a **semantic search** and return the combined text content.

    Delegates the search to `rquery()` and joins the `page_content` of the returned documents into a single string, suitable for direct use in downstream LLM prompts or text processing.

    Args:
        query (str): The natural-language query text to search for.
        k (int, optional): Number of top results to return. Defaults to 4.
        filter (dict | None, optional): Metadata filter applied during search. None or an empty dict means no filtering.

    Returns:
        str: The page contents of the retrieved documents, separated by blank lines.
    """
    # An empty filter is treated as "no filter", as the truthiness check in
    # the previous implementation did.
    docs = self.rquery(query=query, k=k, filter=filter or None)

    return "\n\n".join(doc.page_content for doc in docs)
|
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
import uuid
|
|
2
|
-
|
|
3
|
-
from langchain_chroma import Chroma
|
|
4
|
-
from langchain_core.documents import Document
|
|
5
|
-
from langchain_core.embeddings import Embeddings
|
|
6
|
-
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
7
|
-
from sentence_transformers import SentenceTransformer
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class EncoderWrapper(Embeddings):
    """
    Adapter that exposes a SentenceTransformer through LangChain's `Embeddings` interface.

    Both methods call `model.encode` with `task="retrieval"` and the progress bar enabled, and return the result converted with `.tolist()`.
    """

    def __init__(self, model: SentenceTransformer) -> None:
        """
        Stores the encoder used by every embedding call.

        Args:
            model (SentenceTransformer): Encoder whose `encode` method produces the vectors.
        """
        self.model = model

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """
        Embeds a batch of texts.

        Args:
            texts (list[str]): Texts to encode in one `encode` call.

        Returns:
            list[list[float]]: The encoder output converted with `.tolist()`.
        """
        vectors = self.model.encode(texts, task="retrieval", show_progress_bar=True)
        return vectors.tolist()

    def embed_query(self, text: str) -> list[float]:
        """
        Embeds a single query string.

        Args:
            text (str): Query text to encode.

        Returns:
            list[float]: The encoder output converted with `.tolist()`.
        """
        vector = self.model.encode(text, task="retrieval", show_progress_bar=True)
        return vector.tolist()
|
|
28
|
-
|
|
29
|
-
class RAGPipeline:
|
|
30
|
-
def __init__(
    self,
    checkpoint: str,
    collection_name: str = "rag_memory",
    persist_directory: str = "./chroma_store",
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> None:
    """
    Builds the encoder, the Chroma vector store and the text splitter.

    Args:
        checkpoint (str): Path or name of the SentenceTransformer checkpoint.
        collection_name (str): Name of the Chroma collection.
        persist_directory (str): Directory handed to Chroma for storage.
        chunk_size (int): Chunk size passed to the text splitter.
        chunk_overlap (int): Chunk overlap passed to the text splitter.
    """
    # NOTE(review): trust_remote_code=True allows the checkpoint to run its own
    # Python code while loading — only use checkpoints from trusted sources.
    encoder = SentenceTransformer(checkpoint, trust_remote_code=True)
    self.encoder = encoder

    self.vector_store = Chroma(
        collection_name=collection_name,
        embedding_function=EncoderWrapper(encoder),
        persist_directory=persist_directory,
    )

    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "add_start_index": True,
    }
    self.splitter = RecursiveCharacterTextSplitter(**splitter_options)
|
|
52
|
-
|
|
53
|
-
def index_documents(
    self,
    docs: list[Document],
    ids: list[str]
) -> None:
    """
    Splits the documents into chunks and adds them to the vector store.

    Each document is split on its own so its chunks are stored under `"<its id>_<chunk index>"`. Previously every chunk was prefixed with `ids[0]`, whichever document it came from. For a single document the generated ids are unchanged.

    Args:
        docs (list[Document]): Documents to split and index.
        ids (list[str]): One identifier per document, in the same order as `docs`.

    Returns:
        None

    Raises:
        ValueError: If `docs` and `ids` do not have the same length.
    """
    if len(docs) != len(ids):
        raise ValueError(
            f"docs and ids must have the same length (got {len(docs)} and {len(ids)})"
        )

    chunks = []
    chunk_ids = []
    for doc, doc_id in zip(docs, ids):
        parts = self.splitter.split_documents([doc])
        chunks.extend(parts)
        chunk_ids.extend(f"{doc_id}_{i}" for i in range(len(parts)))

    # Nothing to store for an empty batch; skip the write.
    if chunks:
        self.vector_store.add_documents(chunks, ids=chunk_ids)
|
|
61
|
-
|
|
62
|
-
def create(
    self,
    information: str,
    other_info: dict[str, str] | None = None,
    doc_id: str | None = None,
    should_index: bool = True
) -> Document:
    """
    Creates a `Document` and optionally indexes it.

    The document's metadata is `other_info` plus an `id` entry holding the document id. `id` is reserved: a value supplied through `other_info` is replaced by `doc_id`.

    Args:
        information (str): Text content of the document.
        other_info (dict[str, str] | None): Optional extra metadata. Defaults to None (no extra metadata) instead of a shared mutable `{}`.
        doc_id (str | None): Document identifier. If None, a UUID is generated.
        should_index (bool): Whether to add the document to the vector store.

    Returns:
        Document: The created document.
    """
    if doc_id is None:
        doc_id = str(uuid.uuid4())

    metadata = {"id": doc_id, **(other_info or {})}
    # Re-assert the reserved key in case other_info carried its own "id".
    metadata["id"] = doc_id

    doc = Document(page_content=information, metadata=metadata)

    if should_index:
        self.index_documents([doc], ids=[doc_id])

    return doc
|
|
81
|
-
|
|
82
|
-
def update(
    self,
    doc_id: str,
    new_information: str,
    other_info: dict[str, str] | None = None
) -> Document:
    """
    Replaces the indexed content of a document.

    Chunks are stored under `"<doc_id>_<index>"`, so deleting by the bare `doc_id` removed nothing. The stale chunks are now located through the `id` metadata written by `create()` and deleted by their actual ids before the new content is indexed.

    Args:
        doc_id (str): Identifier of the document to update.
        new_information (str): New text content.
        other_info (dict[str, str] | None): Optional new metadata. Defaults to None instead of a shared mutable `{}`.

    Returns:
        Document: The newly created document.
    """
    # NOTE(review): relies on the vector store's get(where=...) returning a
    # mapping with an "ids" entry — confirm against the langchain_chroma API.
    stale_ids = self.vector_store.get(where={"id": doc_id}).get("ids", [])
    if stale_ids:
        self.vector_store.delete(ids=stale_ids)

    return self.create(
        information=new_information,
        other_info={} if other_info is None else other_info,
        doc_id=doc_id
    )
|
|
95
|
-
|
|
96
|
-
def delete(
    self, doc_id: str
) -> None:
    """
    Deletes every indexed chunk that belongs to a document.

    Chunks are stored under `"<doc_id>_<index>"`, so the bare `doc_id` never matches a stored id. The chunks are located through the `id` metadata written by `create()` and removed by their actual ids.

    Args:
        doc_id (str): Identifier of the document to delete.

    Returns:
        None
    """
    # NOTE(review): relies on the vector store's get(where=...) returning a
    # mapping with an "ids" entry — confirm against the langchain_chroma API.
    ids_to_delete = self.vector_store.get(where={"id": doc_id}).get("ids", [])
    if ids_to_delete:
        self.vector_store.delete(ids=ids_to_delete)
|
|
100
|
-
|
|
101
|
-
def query(
    self,
    query: str,
    k: int = 4,
    category: str | None = None
) -> str:
    """
    Runs a similarity search and returns the combined text of the hits.

    Args:
        query (str): Query text to search for.
        k (int): Number of results to request. Defaults to 4.
        category (str | None): When truthy, restricts the search with the metadata filter `{"category": category}`.

    Returns:
        str: The `page_content` of the retrieved documents, separated by blank lines.
    """
    search_kwargs = {"k": k}
    if category:
        # The filter is only passed when a category was requested.
        search_kwargs["filter"] = {"category": category}

    hits = self.vector_store.similarity_search(query, **search_kwargs)

    return "\n\n".join(hit.page_content for hit in hits)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|