llmflowstack 1.2.6__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llmflowstack-1.2.6 → llmflowstack-1.3.0}/PKG-INFO +33 -132
- {llmflowstack-1.2.6 → llmflowstack-1.3.0}/README.md +27 -120
- llmflowstack-1.3.0/llmflowstack/__init__.py +27 -0
- llmflowstack-1.3.0/llmflowstack/callbacks/force_json.py +428 -0
- llmflowstack-1.3.0/llmflowstack/collators/multimodal_causal.py +122 -0
- llmflowstack-1.3.0/llmflowstack/decoders/__init__.py +18 -0
- llmflowstack-1.3.0/llmflowstack/decoders/base_decoder.py +694 -0
- llmflowstack-1.3.0/llmflowstack/decoders/gemma_3.py +143 -0
- llmflowstack-1.3.0/llmflowstack/decoders/gpt_2.py +106 -0
- llmflowstack-1.3.0/llmflowstack/decoders/gpt_oss.py +174 -0
- llmflowstack-1.3.0/llmflowstack/decoders/llama_3.py +123 -0
- llmflowstack-1.3.0/llmflowstack/decoders/llama_4.py +134 -0
- llmflowstack-1.3.0/llmflowstack/decoders/medgemma.py +169 -0
- llmflowstack-1.3.0/llmflowstack/decoders/qwen_3.py +194 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.0}/llmflowstack/rag/VectorDatabase.py +48 -14
- {llmflowstack-1.2.6 → llmflowstack-1.3.0}/llmflowstack/rag/__init__.py +0 -4
- llmflowstack-1.3.0/llmflowstack/schemas/__init__.py +6 -0
- llmflowstack-1.3.0/llmflowstack/schemas/params.py +106 -0
- llmflowstack-1.3.0/pyproject.toml +37 -0
- llmflowstack-1.2.6/llmflowstack/__init__.py +0 -23
- llmflowstack-1.2.6/llmflowstack/callbacks/stop_on_token.py +0 -16
- llmflowstack-1.2.6/llmflowstack/decoders/BaseDecoder.py +0 -487
- llmflowstack-1.2.6/llmflowstack/decoders/GPT_OSS.py +0 -300
- llmflowstack-1.2.6/llmflowstack/decoders/Gemma.py +0 -327
- llmflowstack-1.2.6/llmflowstack/decoders/LLaMA3.py +0 -244
- llmflowstack-1.2.6/llmflowstack/decoders/LLaMA4.py +0 -324
- llmflowstack-1.2.6/llmflowstack/decoders/MedGemma.py +0 -275
- llmflowstack-1.2.6/llmflowstack/decoders/__init__.py +0 -13
- llmflowstack-1.2.6/llmflowstack/schemas/__init__.py +0 -9
- llmflowstack-1.2.6/llmflowstack/schemas/params.py +0 -40
- llmflowstack-1.2.6/llmflowstack/utils/generation_utils.py +0 -30
- llmflowstack-1.2.6/pyproject.toml +0 -43
- {llmflowstack-1.2.6 → llmflowstack-1.3.0}/.github/workflows/python-publish.yml +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.0}/.gitignore +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.0}/LICENSE +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.0}/llmflowstack/callbacks/__init__.py +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.0}/llmflowstack/callbacks/log_collector.py +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.0}/llmflowstack/utils/__init__.py +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.0}/llmflowstack/utils/evaluation_methods.py +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.0}/llmflowstack/utils/exceptions.py +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.0}/llmflowstack/utils/logging.py +0 -0
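The decoder modules are renamed in this release: 1.2.6 shipped CamelCase files (`GPT_OSS.py`, `LLaMA3.py`, `Gemma.py`, ...) that 1.3.0 replaces with snake_case modules (`gpt_oss.py`, `llama_3.py`, `gemma_3.py`, ...) whose classes are re-exported from the new `llmflowstack/__init__.py` shown at the end of this diff. A minimal migration sketch using only names that appear in this diff; the checkpoint paths are placeholders:

```python
# 1.2.6-style imports used the old class names (per the removed modules and hunk context):
# from llmflowstack import GPT_OSS, LLaMA3

# 1.3.0-style imports use the renamed exports from llmflowstack/__init__.py:
from llmflowstack import GenerationParams, GptOss, Llama3

# Placeholder checkpoint paths; substitute your local model directories.
llama = Llama3(checkpoint="/llama-3.1-8b-Instruct")
llama.generate("Why is the sky blue?")

gpt_oss = GptOss(checkpoint="/gpt-oss-20b", quantization=True, seed=1234)
gpt_oss.generate(
    data=gpt_oss.build_input(input_text="Tell me a joke!"),
    params=GenerationParams(mode="sample", max_new_tokens=1024, temperature=0.3),
)
```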
{llmflowstack-1.2.6 → llmflowstack-1.3.0}/PKG-INFO

@@ -1,35 +1,29 @@
 Metadata-Version: 2.4
 Name: llmflowstack
-Version: 1.2.6
-Summary: LLMFlowStack is a framework for training and using LLMs (LLaMA, GPT-OSS, Gemma, ...). Supports DAPT, fine-tuning, and distributed inference.
+Version: 1.3.0
+Summary: LLMFlowStack is a framework for training and using LLMs (LLaMA, GPT-OSS, Gemma, ...). Supports DAPT, fine-tuning, and distributed inference.
 Author-email: Gustavo Henrique Ferreira Cruz <gustavohferreiracruz@gmail.com>
 License: MIT
 License-File: LICENSE
 Requires-Python: >=3.12
 Requires-Dist: accelerate
 Requires-Dist: bert-score
-Requires-Dist: bitsandbytes
 Requires-Dist: chromadb
 Requires-Dist: datasets
 Requires-Dist: evaluate
-Requires-Dist:
+Requires-Dist: fbgemm-gpu-genai
 Requires-Dist: kernels
 Requires-Dist: langchain-chroma
 Requires-Dist: langchain-community
 Requires-Dist: nltk
-Requires-Dist: numpy
-Requires-Dist: openai-harmony
-Requires-Dist: pandas
 Requires-Dist: peft
+Requires-Dist: pillow
 Requires-Dist: rouge-score
 Requires-Dist: safetensors
-Requires-Dist: scikit-learn
-Requires-Dist: scipy
 Requires-Dist: sentence-transformers
 Requires-Dist: torch
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: transformers
+Requires-Dist: torchao
+Requires-Dist: transformers==4.57.6
 Requires-Dist: triton
 Requires-Dist: trl
 Description-Content-Type: text/markdown
@@ -53,32 +47,23 @@ The goal is to make experimentation with LLMs more accessible, without the need
 This framework is designed to provide flexibility when working with different open-source and commercial LLMs. Currently, the following models are supported:

 - **GPT-OSS**
-
   - [`GPT-OSS 20B`](https://huggingface.co/openai/gpt-oss-20b)
   - [`GPT-OSS 120B`](https://huggingface.co/openai/gpt-oss-120b)
-  > Fine-Tuning, DAPT and Inference Available

 - **LLaMA 3**
-
   - [`LLaMA 3.1 8B - Instruct`](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
   - [`LLaMA 3.1 70B - Instruct`](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct)
   - [`LLaMA 3.3 70B - Instruct`](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct)
   - [`LLaMA 3.3 405B - Instruct`](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct)
-  > Fine-Tuning, DAPT and Inference Available

 - **LLaMA 4**
-
   - [`LLaMA 4 Scout - Instruct`](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct)
-  > DAPT and Inference Available

 - **Gemma**
-
   - [`Gemma 3 27B - Instruct`](https://huggingface.co/google/gemma-3-27b-it)
-  > DAPT and Inference Available

 - **MedGemma**
-  - [`MedGemma 27B
-  > Fine-Tuning, DAPT and Inference Available
+  - [`MedGemma 27B - Instruct`](https://huggingface.co/google/medgemma-27b-it)

 > Other architectures based on those **may** function correctly.

@@ -101,22 +86,22 @@ This section presents a bit of what you can do with the framework.
 You can load as many models as your hardware allows (H100 GPU recommended)...

 ```python
-from llmflowstack import
+from llmflowstack import GptOss, Llama3

-# Loading a
-first_model =
+# Loading a Llama model
+first_model = Llama3()
 first_model.load_checkpoint(
   checkpoint="/llama-3.1-8b-Instruct",
 )

-# Loading a quantized
-second_model =
+# Loading a quantized Llama model
+second_model = Llama3(
   checkpoint="/llama-3.3-70b-Instruct",
   quantization="4bit"
 )

 # Loading a GPT-OSS, quantized and with seed
-thrid_model =
+thrid_model = GptOss(
   checkpoint="/gpt-oss-20b",
   quantization=True,
   seed=1234
@@ -126,32 +111,31 @@ thrid_model = GPT_OSS(
 ### Inference Examples

 ```python
-> from llmflowstack import
+> from llmflowstack import GptOss, GenerationParams

-> gpt_oss_model =
+> gpt_oss_model = GptOss(checkpoint="/gpt-oss-120b")

 > gpt_oss_model.generate("Tell me a joke!")
 'Why did the scarecrow become a successful motivational speaker? Because he was outstanding **in** his field! 🌾😄'

 # Exclusive for GPT-OSS
-> gpt_oss_model.set_reasoning_level("High")
+> gpt_oss_model.set_reasoning_level("High") # Low, Medium, High, Off

 > custom_input = gpt_oss_model.build_input(
   input_text="Tell me another joke!",
   developer_message="You are a clown and after every joke, you should say 'HONK HONK'"
 )
 > gpt_oss_model.generate(
-
+  data=custom_input,
   params=GenerationParams(
+    mode="sample", # greedy, sample or beam
     max_new_tokens=1024,
-
-    temperature=0.3
-  )
+    temperature=0.3
   )
 )
 'Why did the scarecrow win an award? Because he was outstanding in his field! \n\nHONK HONK'

-> llama_model =
+> llama_model = Llama3(checkpoint="/llama-3.3-70B-Instruct", quantization="4bit")
 > llama_model.generate("Why is the sky blue?")
 'The sky appears blue because of a phenomenon called Rayleigh scattering, which is the scattering of light'

@@ -162,7 +146,7 @@ thrid_model = GPT_OSS(
 You can also generate tokens using a streamer, that is, receiving one token at a time by using the iterator version of the generate function:

 ```python
-llama_4 =
+llama_4 = Llama4(
   checkpoint="llama-4-scout-17b-16e-instruct"
 )

@@ -175,10 +159,10 @@ for text in it:
 ### Training Examples (DAPT & Fine-tune)

 ```python
-from llmflowstack import
+from llmflowstack import Llama3
 from llmflowstack.schemas import TrainParams

-model =
+model = Llama3(
   checkpoint="llama-3.1-8b-Instruct"
 )

@@ -186,28 +170,29 @@ model = LLaMA3(
 dataset = []
 dataset.append(model.build_input(
   input_text="Chico is a cat, which color he is?",
-
+  output_text="Black!"
 ))

 dataset.append(model.build_input(
   input_text="Fred is a dog, which color he is?",
-
+  output_text="White!"
 ))

 # Does the DAPT in the full model
-model.
-
+model.train(
+  train_data=dataset,
   params=TrainParams(
     batch_size=1,
     epochs=3,
     gradient_accumulation=1,
     lr=2e-5
-  )
+  ),
+  mode="DAPT"
 )

 # Does the fine-tune this time
-model.
-
+model.train(
+  train_data=dataset,
   params=TrainParams(
     batch_size=1,
     gradient_accumulation=1,
@@ -216,7 +201,8 @@ model.fine_tune(
   ),
   save_at_end=True,
   # It will save the model
-  save_path="./output"
+  save_path="./output",
+  mode="FT"
 )

 # Saving the final result
@@ -224,88 +210,3 @@ model.save_checkpoint(
   path="./model-output"
 )
 ```
-
-### RAG Pipeline
-
-A prototype of a RAG pipeline is also available. You can instantiate and use it as follows:
-
-```python
-from llmflowstack import VectorDatabase
-
-vector_db = VectorDatabase(
-  checkpoint="jina-embeddings-v4",
-  chunk_size=1000,
-  chunk_overlap=200
-)
-
-# Create or load an existing collection
-vector_db.get_collection(
-  collection_name="memory_rag",
-  persist_directory="./memory"
-)
-
-vector_db.get_collection(
-  collection_name="files_rag",
-  persist_directory="./files"
-)
-
-# You may also omit the persist directory; in this case, the RAG data will be stored in memory
-vector_db.get_collection(
-  collection_name="files_rag"
-)
-
-# To create a new document in a collection
-vector_db.create(
-  collection_name="memory_rag",
-  information="User loves Pizza!", # Main information to be indexed in the vector database
-  other_info={"category": "food"},
-  can_split=False, # Indicates whether the information can be split into chunks
-  should_index=True # Defaults to True — defines whether the document should be indexed or only returned as a Document instance
-)
-
-# After adding documents, you can query the database
-query_result = vector_db.query(
-  collection_name="memory_rag",
-  query="pizza",
-  filter={"category": "food"},
-  k=3 # Number of chunks to retrieve
-)
-
-print(query_result)
-# > "User loves Pizza!"
-```
-
-### NLP Evaluation
-
-> **Disclaimer**
-> These evaluation functions are designed for batch processing. Models and encoders are loaded internally on each call, which may be inefficient for per-sample or streaming evaluation.
-
-```python
-> from llmflowstack import text_evaluation
-> from llmflowstack.utils import (bert_score_evaluation, bleu_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)
-
-# Predictions from some model
-> predictions = ["Chico is a dog, and he is orange!", "Fred is a cat, and he is white!"]
-# References text (ground truth)
-> references = ["Chico is a cat, and he is black!", "Fred is a dog, and he is white!"]
-
-# BERT Score Evaluation
-> bert_score_evaluation(predictions, references)
-{'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773}
-
-# Bleu Score Evaluation
-> bleu_score_evaluation(predictions, references)
-{'bleu_score': 0.3656}
-
-# Cosine Similarity Evaluation
-> cosine_similarity_evaluation(predictions, references)
-{'cosine_similarity': 0.7443}
-
-# Rouge Score Evaluation
-> rouge_evaluation(predictions, references)
-{'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}
-
-# All-in-one function
-> text_evaluation(predictions, references)
-{'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773, 'bleu_score': 0.3656, 'cosine_similarity': 0.7443, 'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}
-```
{llmflowstack-1.2.6 → llmflowstack-1.3.0}/README.md

@@ -17,32 +17,23 @@ The goal is to make experimentation with LLMs more accessible, without the need
 This framework is designed to provide flexibility when working with different open-source and commercial LLMs. Currently, the following models are supported:

 - **GPT-OSS**
-
   - [`GPT-OSS 20B`](https://huggingface.co/openai/gpt-oss-20b)
   - [`GPT-OSS 120B`](https://huggingface.co/openai/gpt-oss-120b)
-  > Fine-Tuning, DAPT and Inference Available

 - **LLaMA 3**
-
   - [`LLaMA 3.1 8B - Instruct`](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
   - [`LLaMA 3.1 70B - Instruct`](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct)
   - [`LLaMA 3.3 70B - Instruct`](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct)
   - [`LLaMA 3.3 405B - Instruct`](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct)
-  > Fine-Tuning, DAPT and Inference Available

 - **LLaMA 4**
-
   - [`LLaMA 4 Scout - Instruct`](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct)
-  > DAPT and Inference Available

 - **Gemma**
-
   - [`Gemma 3 27B - Instruct`](https://huggingface.co/google/gemma-3-27b-it)
-  > DAPT and Inference Available

 - **MedGemma**
-  - [`MedGemma 27B
-  > Fine-Tuning, DAPT and Inference Available
+  - [`MedGemma 27B - Instruct`](https://huggingface.co/google/medgemma-27b-it)

 > Other architectures based on those **may** function correctly.

@@ -65,22 +56,22 @@ This section presents a bit of what you can do with the framework.
 You can load as many models as your hardware allows (H100 GPU recommended)...

 ```python
-from llmflowstack import
+from llmflowstack import GptOss, Llama3

-# Loading a
-first_model =
+# Loading a Llama model
+first_model = Llama3()
 first_model.load_checkpoint(
   checkpoint="/llama-3.1-8b-Instruct",
 )

-# Loading a quantized
-second_model =
+# Loading a quantized Llama model
+second_model = Llama3(
   checkpoint="/llama-3.3-70b-Instruct",
   quantization="4bit"
 )

 # Loading a GPT-OSS, quantized and with seed
-thrid_model =
+thrid_model = GptOss(
   checkpoint="/gpt-oss-20b",
   quantization=True,
   seed=1234
@@ -90,32 +81,31 @@ thrid_model = GPT_OSS(
 ### Inference Examples

 ```python
-> from llmflowstack import
+> from llmflowstack import GptOss, GenerationParams

-> gpt_oss_model =
+> gpt_oss_model = GptOss(checkpoint="/gpt-oss-120b")

 > gpt_oss_model.generate("Tell me a joke!")
 'Why did the scarecrow become a successful motivational speaker? Because he was outstanding **in** his field! 🌾😄'

 # Exclusive for GPT-OSS
-> gpt_oss_model.set_reasoning_level("High")
+> gpt_oss_model.set_reasoning_level("High") # Low, Medium, High, Off

 > custom_input = gpt_oss_model.build_input(
   input_text="Tell me another joke!",
   developer_message="You are a clown and after every joke, you should say 'HONK HONK'"
 )
 > gpt_oss_model.generate(
-
+  data=custom_input,
   params=GenerationParams(
+    mode="sample", # greedy, sample or beam
     max_new_tokens=1024,
-
-    temperature=0.3
-  )
+    temperature=0.3
   )
 )
 'Why did the scarecrow win an award? Because he was outstanding in his field! \n\nHONK HONK'

-> llama_model =
+> llama_model = Llama3(checkpoint="/llama-3.3-70B-Instruct", quantization="4bit")
 > llama_model.generate("Why is the sky blue?")
 'The sky appears blue because of a phenomenon called Rayleigh scattering, which is the scattering of light'

@@ -126,7 +116,7 @@ thrid_model = GPT_OSS(
 You can also generate tokens using a streamer, that is, receiving one token at a time by using the iterator version of the generate function:

 ```python
-llama_4 =
+llama_4 = Llama4(
   checkpoint="llama-4-scout-17b-16e-instruct"
 )

@@ -139,10 +129,10 @@ for text in it:
 ### Training Examples (DAPT & Fine-tune)

 ```python
-from llmflowstack import
+from llmflowstack import Llama3
 from llmflowstack.schemas import TrainParams

-model =
+model = Llama3(
   checkpoint="llama-3.1-8b-Instruct"
 )

@@ -150,28 +140,29 @@ model = LLaMA3(
 dataset = []
 dataset.append(model.build_input(
   input_text="Chico is a cat, which color he is?",
-
+  output_text="Black!"
 ))

 dataset.append(model.build_input(
   input_text="Fred is a dog, which color he is?",
-
+  output_text="White!"
 ))

 # Does the DAPT in the full model
-model.
-
+model.train(
+  train_data=dataset,
   params=TrainParams(
     batch_size=1,
     epochs=3,
     gradient_accumulation=1,
     lr=2e-5
-  )
+  ),
+  mode="DAPT"
 )

 # Does the fine-tune this time
-model.
-
+model.train(
+  train_data=dataset,
   params=TrainParams(
     batch_size=1,
     gradient_accumulation=1,
@@ -180,7 +171,8 @@ model.fine_tune(
   ),
   save_at_end=True,
   # It will save the model
-  save_path="./output"
+  save_path="./output",
+  mode="FT"
 )

 # Saving the final result
@@ -188,88 +180,3 @@ model.save_checkpoint(
   path="./model-output"
 )
 ```
-
-### RAG Pipeline
-
-A prototype of a RAG pipeline is also available. You can instantiate and use it as follows:
-
-```python
-from llmflowstack import VectorDatabase
-
-vector_db = VectorDatabase(
-  checkpoint="jina-embeddings-v4",
-  chunk_size=1000,
-  chunk_overlap=200
-)
-
-# Create or load an existing collection
-vector_db.get_collection(
-  collection_name="memory_rag",
-  persist_directory="./memory"
-)
-
-vector_db.get_collection(
-  collection_name="files_rag",
-  persist_directory="./files"
-)
-
-# You may also omit the persist directory; in this case, the RAG data will be stored in memory
-vector_db.get_collection(
-  collection_name="files_rag"
-)
-
-# To create a new document in a collection
-vector_db.create(
-  collection_name="memory_rag",
-  information="User loves Pizza!", # Main information to be indexed in the vector database
-  other_info={"category": "food"},
-  can_split=False, # Indicates whether the information can be split into chunks
-  should_index=True # Defaults to True — defines whether the document should be indexed or only returned as a Document instance
-)
-
-# After adding documents, you can query the database
-query_result = vector_db.query(
-  collection_name="memory_rag",
-  query="pizza",
-  filter={"category": "food"},
-  k=3 # Number of chunks to retrieve
-)
-
-print(query_result)
-# > "User loves Pizza!"
-```
-
-### NLP Evaluation
-
-> **Disclaimer**
-> These evaluation functions are designed for batch processing. Models and encoders are loaded internally on each call, which may be inefficient for per-sample or streaming evaluation.
-
-```python
-> from llmflowstack import text_evaluation
-> from llmflowstack.utils import (bert_score_evaluation, bleu_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)
-
-# Predictions from some model
-> predictions = ["Chico is a dog, and he is orange!", "Fred is a cat, and he is white!"]
-# References text (ground truth)
-> references = ["Chico is a cat, and he is black!", "Fred is a dog, and he is white!"]
-
-# BERT Score Evaluation
-> bert_score_evaluation(predictions, references)
-{'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773}
-
-# Bleu Score Evaluation
-> bleu_score_evaluation(predictions, references)
-{'bleu_score': 0.3656}
-
-# Cosine Similarity Evaluation
-> cosine_similarity_evaluation(predictions, references)
-{'cosine_similarity': 0.7443}
-
-# Rouge Score Evaluation
-> rouge_evaluation(predictions, references)
-{'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}
-
-# All-in-one function
-> text_evaluation(predictions, references)
-{'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773, 'bleu_score': 0.3656, 'cosine_similarity': 0.7443, 'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}
-```
llmflowstack-1.3.0/llmflowstack/__init__.py (new file)

@@ -0,0 +1,27 @@
+from .decoders.gemma_3 import Gemma3
+from .decoders.gpt_2 import Gpt2
+from .decoders.gpt_oss import GptOss
+from .decoders.llama_3 import Llama3
+from .decoders.llama_4 import Llama4
+from .decoders.medgemma import MedGemma
+#from .decoders.qwen_3 import Qwen3
+from .rag.VectorDatabase import VectorDatabase
+from .schemas.params import GenerationParams, TrainParams
+from .utils.evaluation_methods import text_evaluation
+
+__all__ = [
+  "Gemma3",
+  "Gpt2",
+  "GptOss",
+  "Llama3",
+  "Llama4",
+  "MedGemma",
+  # "Qwen3",
+
+  "VectorDatabase",
+
+  "GenerationParams",
+  "TrainParams",
+
+  "text_evaluation"
+]