llmflowstack 1.2.6__tar.gz → 1.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llmflowstack-1.2.6 → llmflowstack-1.3.1}/PKG-INFO +34 -132
- {llmflowstack-1.2.6 → llmflowstack-1.3.1}/README.md +27 -120
- llmflowstack-1.3.1/llmflowstack/__init__.py +27 -0
- llmflowstack-1.3.1/llmflowstack/callbacks/force_json.py +428 -0
- llmflowstack-1.3.1/llmflowstack/collators/multimodal_causal.py +122 -0
- llmflowstack-1.3.1/llmflowstack/decoders/__init__.py +18 -0
- llmflowstack-1.3.1/llmflowstack/decoders/base_decoder.py +694 -0
- llmflowstack-1.3.1/llmflowstack/decoders/gemma_3.py +143 -0
- llmflowstack-1.3.1/llmflowstack/decoders/gpt_2.py +106 -0
- llmflowstack-1.3.1/llmflowstack/decoders/gpt_oss.py +174 -0
- llmflowstack-1.3.1/llmflowstack/decoders/llama_3.py +123 -0
- llmflowstack-1.3.1/llmflowstack/decoders/llama_4.py +134 -0
- llmflowstack-1.3.1/llmflowstack/decoders/medgemma.py +169 -0
- llmflowstack-1.3.1/llmflowstack/decoders/qwen_3.py +194 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.1}/llmflowstack/rag/VectorDatabase.py +48 -14
- {llmflowstack-1.2.6 → llmflowstack-1.3.1}/llmflowstack/rag/__init__.py +0 -4
- llmflowstack-1.3.1/llmflowstack/schemas/__init__.py +6 -0
- llmflowstack-1.3.1/llmflowstack/schemas/params.py +106 -0
- llmflowstack-1.3.1/pyproject.toml +38 -0
- llmflowstack-1.2.6/llmflowstack/__init__.py +0 -23
- llmflowstack-1.2.6/llmflowstack/callbacks/stop_on_token.py +0 -16
- llmflowstack-1.2.6/llmflowstack/decoders/BaseDecoder.py +0 -487
- llmflowstack-1.2.6/llmflowstack/decoders/GPT_OSS.py +0 -300
- llmflowstack-1.2.6/llmflowstack/decoders/Gemma.py +0 -327
- llmflowstack-1.2.6/llmflowstack/decoders/LLaMA3.py +0 -244
- llmflowstack-1.2.6/llmflowstack/decoders/LLaMA4.py +0 -324
- llmflowstack-1.2.6/llmflowstack/decoders/MedGemma.py +0 -275
- llmflowstack-1.2.6/llmflowstack/decoders/__init__.py +0 -13
- llmflowstack-1.2.6/llmflowstack/schemas/__init__.py +0 -9
- llmflowstack-1.2.6/llmflowstack/schemas/params.py +0 -40
- llmflowstack-1.2.6/llmflowstack/utils/generation_utils.py +0 -30
- llmflowstack-1.2.6/pyproject.toml +0 -43
- {llmflowstack-1.2.6 → llmflowstack-1.3.1}/.github/workflows/python-publish.yml +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.1}/.gitignore +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.1}/LICENSE +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.1}/llmflowstack/callbacks/__init__.py +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.1}/llmflowstack/callbacks/log_collector.py +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.1}/llmflowstack/utils/__init__.py +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.1}/llmflowstack/utils/evaluation_methods.py +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.1}/llmflowstack/utils/exceptions.py +0 -0
- {llmflowstack-1.2.6 → llmflowstack-1.3.1}/llmflowstack/utils/logging.py +0 -0
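Reading the file list together with the diffs below: the decoder modules move from CamelCase file names (`BaseDecoder.py`, `GPT_OSS.py`, `LLaMA3.py`, ...) to snake_case ones (`base_decoder.py`, `gpt_oss.py`, `llama_3.py`, ...), the exported classes are renamed accordingly (`GPT_OSS` → `GptOss`, `LLaMA3` → `Llama3`, ...), and the separate DAPT/fine-tune entry points appear to be folded into a single `train(..., mode=...)` call. The following is a minimal migration sketch assembled only from the README snippets shown further down; exact 1.3.1 signatures may differ, so treat every name and argument here as illustrative rather than authoritative.

```python
# Sketch of moving a 1.2.6 script to 1.3.1, inferred from the README diff below.
# Class names, GenerationParams.mode and train(..., mode=...) are taken from the
# updated README; they are assumptions about the API, not a reference.
from llmflowstack import GenerationParams, GptOss, Llama3, TrainParams

# 1.2.6 exported GPT_OSS / LLaMA3; 1.3.1 exports GptOss / Llama3 instead.
model = Llama3(checkpoint="/llama-3.1-8b-Instruct")

# GenerationParams now carries an explicit decoding mode ("greedy", "sample" or "beam").
answer = model.generate(
    "Why is the sky blue?",
    params=GenerationParams(mode="greedy", max_new_tokens=256),
)

# DAPT and fine-tuning both go through train(..., mode="DAPT" | "FT") in 1.3.1.
dataset = [
    model.build_input(input_text="Chico is a cat, which color he is?", output_text="Black!"),
]
model.train(
    train_data=dataset,
    params=TrainParams(batch_size=1, epochs=3, gradient_accumulation=1, lr=2e-5),
    mode="DAPT",
)
```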
{llmflowstack-1.2.6 → llmflowstack-1.3.1}/PKG-INFO

@@ -1,35 +1,30 @@
 Metadata-Version: 2.4
 Name: llmflowstack
-Version: 1.2.6
-Summary: LLMFlowStack is a framework for training and using LLMs (LLaMA, GPT-OSS, Gemma, ...). Supports DAPT, fine-tuning, and distributed inference.
+Version: 1.3.1
+Summary: LLMFlowStack is a framework for training and using LLMs (LLaMA, GPT-OSS, Gemma, ...). Supports DAPT, fine-tuning, and distributed inference.
 Author-email: Gustavo Henrique Ferreira Cruz <gustavohferreiracruz@gmail.com>
 License: MIT
 License-File: LICENSE
 Requires-Python: >=3.12
 Requires-Dist: accelerate
 Requires-Dist: bert-score
-Requires-Dist: bitsandbytes
 Requires-Dist: chromadb
 Requires-Dist: datasets
 Requires-Dist: evaluate
-Requires-Dist:
+Requires-Dist: fbgemm-gpu-genai
 Requires-Dist: kernels
 Requires-Dist: langchain-chroma
 Requires-Dist: langchain-community
+Requires-Dist: mslk-cuda==1.0.0
 Requires-Dist: nltk
-Requires-Dist: numpy
-Requires-Dist: openai-harmony
-Requires-Dist: pandas
 Requires-Dist: peft
+Requires-Dist: pillow
 Requires-Dist: rouge-score
 Requires-Dist: safetensors
-Requires-Dist: scikit-learn
-Requires-Dist: scipy
 Requires-Dist: sentence-transformers
 Requires-Dist: torch
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: transformers
+Requires-Dist: torchao==0.16.0
+Requires-Dist: transformers==4.57.6
 Requires-Dist: triton
 Requires-Dist: trl
 Description-Content-Type: text/markdown

@@ -53,32 +48,23 @@ The goal is to make experimentation with LLMs more accessible, without the need
 This framework is designed to provide flexibility when working with different open-source and commercial LLMs. Currently, the following models are supported:

 - **GPT-OSS**
-
 - [`GPT-OSS 20B`](https://huggingface.co/openai/gpt-oss-20b)
 - [`GPT-OSS 120B`](https://huggingface.co/openai/gpt-oss-120b)
-> Fine-Tuning, DAPT and Inference Available

 - **LLaMA 3**
-
 - [`LLaMA 3.1 8B - Instruct`](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
 - [`LLaMA 3.1 70B - Instruct`](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct)
 - [`LLaMA 3.3 70B - Instruct`](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct)
 - [`LLaMA 3.3 405B - Instruct`](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct)
-> Fine-Tuning, DAPT and Inference Available

 - **LLaMA 4**
-
 - [`LLaMA 4 Scout - Instruct`](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct)
-> DAPT and Inference Available

 - **Gemma**
-
 - [`Gemma 3 27B - Instruct`](https://huggingface.co/google/gemma-3-27b-it)
-> DAPT and Inference Available

 - **MedGemma**
-- [`MedGemma 27B
-> Fine-Tuning, DAPT and Inference Available
+- [`MedGemma 27B - Instruct`](https://huggingface.co/google/medgemma-27b-it)

 > Other architectures based on those **may** function correctly.


@@ -101,22 +87,22 @@ This section presents a bit of what you can do with the framework.
 You can load as many models as your hardware allows (H100 GPU recommended)...

 ```python
-from llmflowstack import
+from llmflowstack import GptOss, Llama3

-# Loading a
-first_model =
+# Loading a Llama model
+first_model = Llama3()
 first_model.load_checkpoint(
     checkpoint="/llama-3.1-8b-Instruct",
 )

-# Loading a quantized
-second_model =
+# Loading a quantized Llama model
+second_model = Llama3(
     checkpoint="/llama-3.3-70b-Instruct",
     quantization="4bit"
 )

 # Loading a GPT-OSS, quantized and with seed
-thrid_model =
+thrid_model = GptOss(
     checkpoint="/gpt-oss-20b",
     quantization=True,
     seed=1234

@@ -126,32 +112,31 @@ thrid_model = GPT_OSS(
 ### Inference Examples

 ```python
-> from llmflowstack import
+> from llmflowstack import GptOss, GenerationParams

-> gpt_oss_model =
+> gpt_oss_model = GptOss(checkpoint="/gpt-oss-120b")

 > gpt_oss_model.generate("Tell me a joke!")
 'Why did the scarecrow become a successful motivational speaker? Because he was outstanding **in** his field! 🌾😄'

 # Exclusive for GPT-OSS
-> gpt_oss_model.set_reasoning_level("High")
+> gpt_oss_model.set_reasoning_level("High") # Low, Medium, High, Off

 > custom_input = gpt_oss_model.build_input(
     input_text="Tell me another joke!",
     developer_message="You are a clown and after every joke, you should say 'HONK HONK'"
 )
 > gpt_oss_model.generate(
-
+    data=custom_input,
     params=GenerationParams(
+        mode="sample", # greedy, sample or beam
         max_new_tokens=1024,
-
-        temperature=0.3
-    )
+        temperature=0.3
     )
 )
 'Why did the scarecrow win an award? Because he was outstanding in his field! \n\nHONK HONK'

-> llama_model =
+> llama_model = Llama3(checkpoint="/llama-3.3-70B-Instruct", quantization="4bit")
 > llama_model.generate("Why is the sky blue?")
 'The sky appears blue because of a phenomenon called Rayleigh scattering, which is the scattering of light'


@@ -162,7 +147,7 @@ thrid_model = GPT_OSS(
 You can also generate tokens using a streamer, that is, receiving one token at a time by using the iterator version of the generate function:

 ```python
-llama_4 =
+llama_4 = Llama4(
     checkpoint="llama-4-scout-17b-16e-instruct"
 )


@@ -175,10 +160,10 @@ for text in it:
 ### Training Examples (DAPT & Fine-tune)

 ```python
-from llmflowstack import
+from llmflowstack import Llama3
 from llmflowstack.schemas import TrainParams

-model =
+model = Llama3(
     checkpoint="llama-3.1-8b-Instruct"
 )


@@ -186,28 +171,29 @@ model = LLaMA3(
 dataset = []
 dataset.append(model.build_input(
     input_text="Chico is a cat, which color he is?",
-
+    output_text="Black!"
 ))

 dataset.append(model.build_input(
     input_text="Fred is a dog, which color he is?",
-
+    output_text="White!"
 ))

 # Does the DAPT in the full model
-model.
-
+model.train(
+    train_data=dataset,
     params=TrainParams(
         batch_size=1,
         epochs=3,
         gradient_accumulation=1,
         lr=2e-5
-    )
+    ),
+    mode="DAPT"
 )

 # Does the fine-tune this time
-model.
-
+model.train(
+    train_data=dataset,
     params=TrainParams(
         batch_size=1,
         gradient_accumulation=1,

@@ -216,7 +202,8 @@ model.fine_tune(
     ),
     save_at_end=True,
     # It will save the model
-    save_path="./output"
+    save_path="./output",
+    mode="FT"
 )

 # Saving the final result

@@ -224,88 +211,3 @@ model.save_checkpoint(
     path="./model-output"
 )
 ```
-
-### RAG Pipeline
-
-A prototype of a RAG pipeline is also available. You can instantiate and use it as follows:
-
-```python
-from llmflowstack import VectorDatabase
-
-vector_db = VectorDatabase(
-    checkpoint="jina-embeddings-v4",
-    chunk_size=1000,
-    chunk_overlap=200
-)
-
-# Create or load an existing collection
-vector_db.get_collection(
-    collection_name="memory_rag",
-    persist_directory="./memory"
-)
-
-vector_db.get_collection(
-    collection_name="files_rag",
-    persist_directory="./files"
-)
-
-# You may also omit the persist directory; in this case, the RAG data will be stored in memory
-vector_db.get_collection(
-    collection_name="files_rag"
-)
-
-# To create a new document in a collection
-vector_db.create(
-    collection_name="memory_rag",
-    information="User loves Pizza!", # Main information to be indexed in the vector database
-    other_info={"category": "food"},
-    can_split=False, # Indicates whether the information can be split into chunks
-    should_index=True # Defaults to True — defines whether the document should be indexed or only returned as a Document instance
-)
-
-# After adding documents, you can query the database
-query_result = vector_db.query(
-    collection_name="memory_rag",
-    query="pizza",
-    filter={"category": "food"},
-    k=3 # Number of chunks to retrieve
-)
-
-print(query_result)
-# > "User loves Pizza!"
-```
-
-### NLP Evaluation
-
-> **Disclaimer**
-> These evaluation functions are designed for batch processing. Models and encoders are loaded internally on each call, which may be inefficient for per-sample or streaming evaluation.
-
-```python
-> from llmflowstack import text_evaluation
-> from llmflowstack.utils import (bert_score_evaluation, bleu_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)
-
-# Predictions from some model
-> predictions = ["Chico is a dog, and he is orange!", "Fred is a cat, and he is white!"]
-# References text (ground truth)
-> references = ["Chico is a cat, and he is black!", "Fred is a dog, and he is white!"]
-
-# BERT Score Evaluation
-> bert_score_evaluation(predictions, references)
-{'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773}
-
-# Bleu Score Evaluation
-> bleu_score_evaluation(predictions, references)
-{'bleu_score': 0.3656}
-
-# Cosine Similarity Evaluation
-> cosine_similarity_evaluation(predictions, references)
-{'cosine_similarity': 0.7443}
-
-# Rouge Score Evaluation
-> rouge_evaluation(predictions, references)
-{'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}
-
-# All-in-one function
-> text_evaluation(predictions, references)
-{'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773, 'bleu_score': 0.3656, 'cosine_similarity': 0.7443, 'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}
-```
{llmflowstack-1.2.6 → llmflowstack-1.3.1}/README.md

@@ -17,32 +17,23 @@ The goal is to make experimentation with LLMs more accessible, without the need
 This framework is designed to provide flexibility when working with different open-source and commercial LLMs. Currently, the following models are supported:

 - **GPT-OSS**
-
 - [`GPT-OSS 20B`](https://huggingface.co/openai/gpt-oss-20b)
 - [`GPT-OSS 120B`](https://huggingface.co/openai/gpt-oss-120b)
-> Fine-Tuning, DAPT and Inference Available

 - **LLaMA 3**
-
 - [`LLaMA 3.1 8B - Instruct`](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
 - [`LLaMA 3.1 70B - Instruct`](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct)
 - [`LLaMA 3.3 70B - Instruct`](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct)
 - [`LLaMA 3.3 405B - Instruct`](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct)
-> Fine-Tuning, DAPT and Inference Available

 - **LLaMA 4**
-
 - [`LLaMA 4 Scout - Instruct`](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct)
-> DAPT and Inference Available

 - **Gemma**
-
 - [`Gemma 3 27B - Instruct`](https://huggingface.co/google/gemma-3-27b-it)
-> DAPT and Inference Available

 - **MedGemma**
-- [`MedGemma 27B
-> Fine-Tuning, DAPT and Inference Available
+- [`MedGemma 27B - Instruct`](https://huggingface.co/google/medgemma-27b-it)

 > Other architectures based on those **may** function correctly.


@@ -65,22 +56,22 @@ This section presents a bit of what you can do with the framework.
 You can load as many models as your hardware allows (H100 GPU recommended)...

 ```python
-from llmflowstack import
+from llmflowstack import GptOss, Llama3

-# Loading a
-first_model =
+# Loading a Llama model
+first_model = Llama3()
 first_model.load_checkpoint(
     checkpoint="/llama-3.1-8b-Instruct",
 )

-# Loading a quantized
-second_model =
+# Loading a quantized Llama model
+second_model = Llama3(
     checkpoint="/llama-3.3-70b-Instruct",
     quantization="4bit"
 )

 # Loading a GPT-OSS, quantized and with seed
-thrid_model =
+thrid_model = GptOss(
     checkpoint="/gpt-oss-20b",
     quantization=True,
     seed=1234

@@ -90,32 +81,31 @@ thrid_model = GPT_OSS(
 ### Inference Examples

 ```python
-> from llmflowstack import
+> from llmflowstack import GptOss, GenerationParams

-> gpt_oss_model =
+> gpt_oss_model = GptOss(checkpoint="/gpt-oss-120b")

 > gpt_oss_model.generate("Tell me a joke!")
 'Why did the scarecrow become a successful motivational speaker? Because he was outstanding **in** his field! 🌾😄'

 # Exclusive for GPT-OSS
-> gpt_oss_model.set_reasoning_level("High")
+> gpt_oss_model.set_reasoning_level("High") # Low, Medium, High, Off

 > custom_input = gpt_oss_model.build_input(
     input_text="Tell me another joke!",
     developer_message="You are a clown and after every joke, you should say 'HONK HONK'"
 )
 > gpt_oss_model.generate(
-
+    data=custom_input,
     params=GenerationParams(
+        mode="sample", # greedy, sample or beam
         max_new_tokens=1024,
-
-        temperature=0.3
-    )
+        temperature=0.3
     )
 )
 'Why did the scarecrow win an award? Because he was outstanding in his field! \n\nHONK HONK'

-> llama_model =
+> llama_model = Llama3(checkpoint="/llama-3.3-70B-Instruct", quantization="4bit")
 > llama_model.generate("Why is the sky blue?")
 'The sky appears blue because of a phenomenon called Rayleigh scattering, which is the scattering of light'


@@ -126,7 +116,7 @@ thrid_model = GPT_OSS(
 You can also generate tokens using a streamer, that is, receiving one token at a time by using the iterator version of the generate function:

 ```python
-llama_4 =
+llama_4 = Llama4(
     checkpoint="llama-4-scout-17b-16e-instruct"
 )


@@ -139,10 +129,10 @@ for text in it:
 ### Training Examples (DAPT & Fine-tune)

 ```python
-from llmflowstack import
+from llmflowstack import Llama3
 from llmflowstack.schemas import TrainParams

-model =
+model = Llama3(
     checkpoint="llama-3.1-8b-Instruct"
 )


@@ -150,28 +140,29 @@ model = LLaMA3(
 dataset = []
 dataset.append(model.build_input(
     input_text="Chico is a cat, which color he is?",
-
+    output_text="Black!"
 ))

 dataset.append(model.build_input(
     input_text="Fred is a dog, which color he is?",
-
+    output_text="White!"
 ))

 # Does the DAPT in the full model
-model.
-
+model.train(
+    train_data=dataset,
     params=TrainParams(
         batch_size=1,
         epochs=3,
         gradient_accumulation=1,
         lr=2e-5
-    )
+    ),
+    mode="DAPT"
 )

 # Does the fine-tune this time
-model.
-
+model.train(
+    train_data=dataset,
     params=TrainParams(
         batch_size=1,
         gradient_accumulation=1,

@@ -180,7 +171,8 @@ model.fine_tune(
     ),
     save_at_end=True,
     # It will save the model
-    save_path="./output"
+    save_path="./output",
+    mode="FT"
 )

 # Saving the final result

@@ -188,88 +180,3 @@ model.save_checkpoint(
     path="./model-output"
 )
 ```
-
-### RAG Pipeline
-
-A prototype of a RAG pipeline is also available. You can instantiate and use it as follows:
-
-```python
-from llmflowstack import VectorDatabase
-
-vector_db = VectorDatabase(
-    checkpoint="jina-embeddings-v4",
-    chunk_size=1000,
-    chunk_overlap=200
-)
-
-# Create or load an existing collection
-vector_db.get_collection(
-    collection_name="memory_rag",
-    persist_directory="./memory"
-)
-
-vector_db.get_collection(
-    collection_name="files_rag",
-    persist_directory="./files"
-)
-
-# You may also omit the persist directory; in this case, the RAG data will be stored in memory
-vector_db.get_collection(
-    collection_name="files_rag"
-)
-
-# To create a new document in a collection
-vector_db.create(
-    collection_name="memory_rag",
-    information="User loves Pizza!", # Main information to be indexed in the vector database
-    other_info={"category": "food"},
-    can_split=False, # Indicates whether the information can be split into chunks
-    should_index=True # Defaults to True — defines whether the document should be indexed or only returned as a Document instance
-)
-
-# After adding documents, you can query the database
-query_result = vector_db.query(
-    collection_name="memory_rag",
-    query="pizza",
-    filter={"category": "food"},
-    k=3 # Number of chunks to retrieve
-)
-
-print(query_result)
-# > "User loves Pizza!"
-```
-
-### NLP Evaluation
-
-> **Disclaimer**
-> These evaluation functions are designed for batch processing. Models and encoders are loaded internally on each call, which may be inefficient for per-sample or streaming evaluation.
-
-```python
-> from llmflowstack import text_evaluation
-> from llmflowstack.utils import (bert_score_evaluation, bleu_score_evaluation, cosine_similarity_evaluation, rouge_evaluation)
-
-# Predictions from some model
-> predictions = ["Chico is a dog, and he is orange!", "Fred is a cat, and he is white!"]
-# References text (ground truth)
-> references = ["Chico is a cat, and he is black!", "Fred is a dog, and he is white!"]
-
-# BERT Score Evaluation
-> bert_score_evaluation(predictions, references)
-{'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773}
-
-# Bleu Score Evaluation
-> bleu_score_evaluation(predictions, references)
-{'bleu_score': 0.3656}
-
-# Cosine Similarity Evaluation
-> cosine_similarity_evaluation(predictions, references)
-{'cosine_similarity': 0.7443}
-
-# Rouge Score Evaluation
-> rouge_evaluation(predictions, references)
-{'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}
-
-# All-in-one function
-> text_evaluation(predictions, references)
-{'bertscore_precision': 0.9773, 'bertscore_recall': 0.9773, 'bertscore_f1': 0.9773, 'bleu_score': 0.3656, 'cosine_similarity': 0.7443, 'rouge1': 0.8125, 'rouge2': 0.6429, 'rougeL': 0.8125}
-```
llmflowstack-1.3.1/llmflowstack/__init__.py

@@ -0,0 +1,27 @@
+from .decoders.gemma_3 import Gemma3
+from .decoders.gpt_2 import Gpt2
+from .decoders.gpt_oss import GptOss
+from .decoders.llama_3 import Llama3
+from .decoders.llama_4 import Llama4
+from .decoders.medgemma import MedGemma
+#from .decoders.qwen_3 import Qwen3
+from .rag.VectorDatabase import VectorDatabase
+from .schemas.params import GenerationParams, TrainParams
+from .utils.evaluation_methods import text_evaluation
+
+__all__ = [
+    "Gemma3",
+    "Gpt2",
+    "GptOss",
+    "Llama3",
+    "Llama4",
+    "MedGemma",
+    # "Qwen3",
+
+    "VectorDatabase",
+
+    "GenerationParams",
+    "TrainParams",
+
+    "text_evaluation"
+]