llmflowstack 1.2.0__tar.gz → 1.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/PKG-INFO +1 -1
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/GPT_OSS.py +21 -7
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/Gemma.py +10 -1
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/LLaMA3.py +21 -13
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/LLaMA4.py +10 -1
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/MedGemma.py +10 -1
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/pyproject.toml +1 -1
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/.github/workflows/python-publish.yml +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/.gitignore +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/LICENSE +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/README.md +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/__init__.py +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/callbacks/__init__.py +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/callbacks/log_collector.py +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/callbacks/stop_on_token.py +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/BaseDecoder.py +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/__init__.py +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/rag/VectorDatabase.py +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/rag/__init__.py +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/schemas/__init__.py +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/schemas/params.py +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/utils/__init__.py +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/utils/evaluation_methods.py +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/utils/exceptions.py +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/utils/generation_utils.py +0 -0
- {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/utils/logging.py +0 -0
--- llmflowstack-1.2.0/PKG-INFO
+++ llmflowstack-1.2.2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmflowstack
-Version: 1.2.0
+Version: 1.2.2
 Summary: LLMFlowStack is a framework for training and using LLMs (LLaMA, GPT-OSS, Gemma, ...). Supports DAPT, fine-tuning, and distributed inference. Public fork without institution-specific components.
 Author-email: Gustavo Henrique Ferreira Cruz <gustavohferreiracruz@gmail.com>
 License: MIT
--- llmflowstack-1.2.0/llmflowstack/decoders/GPT_OSS.py
+++ llmflowstack-1.2.2/llmflowstack/decoders/GPT_OSS.py
@@ -24,11 +24,11 @@ class GPTOSSInput(TypedDict):
     developer_message: str | None
     expected_answer: str | None
     reasoning_message: str | None
-    reasoning_level: Literal["Low", "Medium", "High"] | None
+    reasoning_level: Literal["Low", "Medium", "High", "Off"] | None
 
 class GPT_OSS(BaseDecoder):
     model: GptOssForCausalLM | None = None
-    reasoning_level: Literal["Low", "Medium", "High"] = "Low"
+    reasoning_level: Literal["Low", "Medium", "High", "Off"] = "Low"
     question_fields = ["input_text", "developer_message", "system_message"]
     answer_fields = ["expected_answer", "reasoning_message"]
 
@@ -102,6 +102,8 @@ class GPT_OSS(BaseDecoder):
 
         system_message = data.get("system_message", "")
         system_text = f"<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\n\nReasoning: {reasoning}\n\n{system_message}# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|>"
+        if reasoning == "Off":
+            system_text = f"<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\n\n{system_message}# Valid channels: final. Channel must be included for every message.<|end|>"
 
         developer_text = ""
         developer_message = data.get("developer_message", "")
@@ -117,6 +119,9 @@ class GPT_OSS(BaseDecoder):
         if expected_answer:
             assistant_text += f"<|start|>assistant<|channel|>final<|message|>{expected_answer}<|return|>"
 
+        if not expected_answer and reasoning == "Off":
+            assistant_text = "<|start|>assistant<|channel|>analysis<|message|><|end|><|start|>assistant<|channel|>final<|message|>"
+
         return (
             f"{system_text}{developer_text}"
             f"<|start|>user<|message|>{data["input_text"]}<|end|>"
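Taken together, the two prompt hunks above implement the new "Off" level: the system turn drops the `Reasoning: {reasoning}` line and restricts the valid channels to `final`, and when no expected answer is present the assistant turn is pre-seeded with an empty `analysis` channel so decoding starts directly in the `final` channel. A minimal standalone sketch of that assembly (`build_prompt` is a hypothetical name, not part of the llmflowstack API, and the developer turn is omitted):

```python
# Standalone sketch of the "Off" prompt scaffolding shown in the hunks above.
def build_prompt(input_text: str, system_message: str = "", reasoning: str = "Off") -> str:
    if reasoning == "Off":
        # No "Reasoning:" line, only the final channel is allowed, and the
        # assistant turn is pre-seeded with an empty analysis channel.
        system_text = (
            "<|start|>system<|message|>You are ChatGPT, a large language model "
            "trained by OpenAI.\nKnowledge cutoff: 2024-06\n\n"
            f"{system_message}# Valid channels: final. "
            "Channel must be included for every message.<|end|>"
        )
        assistant_text = (
            "<|start|>assistant<|channel|>analysis<|message|><|end|>"
            "<|start|>assistant<|channel|>final<|message|>"
        )
    else:
        system_text = (
            "<|start|>system<|message|>You are ChatGPT, a large language model "
            "trained by OpenAI.\nKnowledge cutoff: 2024-06\n\n"
            f"Reasoning: {reasoning}\n\n{system_message}"
            "# Valid channels: analysis, commentary, final. "
            "Channel must be included for every message.<|end|>"
        )
        assistant_text = ""
    return f"{system_text}<|start|>user<|message|>{input_text}<|end|>{assistant_text}"

print(build_prompt("What is 2 + 2?"))
```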
@@ -130,7 +135,7 @@ class GPT_OSS(BaseDecoder):
         developer_message: str | None = None,
         expected_answer: str | None = None,
         reasoning_message: str | None = None,
-        reasoning_level: Literal["Low", "Medium", "High"] | None = None
+        reasoning_level: Literal["Low", "Medium", "High", "Off"] | None = None
     ) -> GPTOSSInput:
         if not self.tokenizer:
             raise MissingEssentialProp("Could not find tokenizer.")
@@ -146,7 +151,7 @@ class GPT_OSS(BaseDecoder):
 
     def set_reasoning_level(
         self,
-        level: Literal["Low", "Medium", "High"]
+        level: Literal["Low", "Medium", "High", "Off"]
     ) -> None:
         self.reasoning_level = level
 
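With `set_reasoning_level` widened accordingly, callers can now disable reasoning at runtime. A hypothetical usage sketch (the construction step is an assumption; only the method and its widened `Literal` come from this diff):

```python
from llmflowstack.decoders.GPT_OSS import GPT_OSS

model = GPT_OSS()                  # constructor arguments assumed
model.set_reasoning_level("Off")   # new in 1.2.2; previously only "Low"/"Medium"/"High"
```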
@@ -229,6 +234,8 @@ class GPT_OSS(BaseDecoder):
             yield ""
             return
 
+        self._log(f"Processing received input...'")
+
         if params is None:
             params = GenerationParams(max_new_tokens=32768)
         elif params.max_new_tokens is None:
@@ -268,19 +275,26 @@ class GPT_OSS(BaseDecoder):
             stopping_criteria=StoppingCriteriaList([StopOnToken(self.stop_token_ids)])
         )
 
+        start = time()
+
         thread = threading.Thread(target=generate_fn)
         thread.start()
 
-        done_thinking = False
+        done_thinking = self.reasoning_level == "Off"
         buffer = ""
 
         for new_text in streamer:
             buffer += new_text
 
-            if "final" in buffer:
+            if "final" in buffer and not done_thinking:
                 done_thinking = True
                 buffer = buffer.split("final", 1)[1]
 
             if done_thinking:
                 yield buffer
-                buffer = ""
+                buffer = ""
+
+        end = time()
+        total_time = end - start
+
+        self._log(f"Response generated in {total_time:.4f} seconds")
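Two things change in this streaming loop: `done_thinking` now starts as `True` when reasoning is off (there is no analysis channel to skip), and the split on the literal `"final"` marker is guarded with `not done_thinking`, which appears to prevent a later occurrence of the word "final" in the answer itself from re-triggering the split and dropping text. A self-contained sketch of the same pattern:

```python
from typing import Iterator

def stream_final_only(chunks: Iterator[str], reasoning_off: bool) -> Iterator[str]:
    # Mirrors the loop above: buffer text until the "final" channel marker,
    # then stream everything after it. With reasoning off there is nothing
    # to skip, so streaming starts immediately.
    done_thinking = reasoning_off
    buffer = ""
    for new_text in chunks:
        buffer += new_text
        if "final" in buffer and not done_thinking:
            done_thinking = True
            buffer = buffer.split("final", 1)[1]
        if done_thinking:
            yield buffer
            buffer = ""

chunks = iter(["<|channel|>final<|message|>the ", "final answer is 4"])
print("".join(stream_final_only(chunks, reasoning_off=False)))
# -> "<|message|>the final answer is 4"; without the not-done_thinking guard,
#    the second "final" would trigger another split and be dropped.
```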
--- llmflowstack-1.2.0/llmflowstack/decoders/Gemma.py
+++ llmflowstack-1.2.2/llmflowstack/decoders/Gemma.py
@@ -270,6 +270,8 @@ class Gemma3(BaseDecoder):
         if False:
             yield ""
             return
+
+        self._log(f"Processing received input...'")
 
         if params is None:
             params = GenerationParams(max_new_tokens=32768)
@@ -311,8 +313,15 @@ class Gemma3(BaseDecoder):
             stopping_criteria=StoppingCriteriaList([StopOnToken(self.stop_token_ids)])
         )
 
+        start = time()
+
         thread = threading.Thread(target=generate_fn)
         thread.start()
 
         for new_text in streamer:
-            yield new_text
+            yield new_text
+
+        end = time()
+        total_time = end - start
+
+        self._log(f"Response generated in {total_time:.4f} seconds")
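The same `start`/`end` instrumentation is added to every decoder in this release. Because the timer brackets the loop that drains the streamer, `total_time` is wall-clock time from dispatching the generation thread until the stream is exhausted, including any time the consumer spends between chunks. A minimal sketch of the pattern in isolation (names are ours, not the package's):

```python
from time import time
from typing import Iterable, Iterator

def timed_stream(chunks: Iterable[str]) -> Iterator[str]:
    # Same shape as the diff: time the full streamed generation. Note the
    # log line never runs if the caller abandons the generator early.
    start = time()
    for chunk in chunks:
        yield chunk
    total_time = time() - start
    print(f"Response generated in {total_time:.4f} seconds")

for piece in timed_stream(["Hello", ", ", "world"]):
    pass  # consume to completion so the timing line is emitted
```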
--- llmflowstack-1.2.0/llmflowstack/decoders/LLaMA3.py
+++ llmflowstack-1.2.2/llmflowstack/decoders/LLaMA3.py
@@ -1,4 +1,5 @@
 import threading
+from functools import partial
 from time import time
 from typing import Iterator, Literal, TypedDict, cast
 
@@ -187,6 +188,8 @@ class LLaMA3(BaseDecoder):
             yield ""
             return
 
+        self._log(f"Processing received input...'")
+
         if params is None:
             params = GenerationParams(max_new_tokens=8192)
         elif params.max_new_tokens is None:
@@ -217,20 +220,25 @@ class LLaMA3(BaseDecoder):
             skip_special_tokens=True
         )
 
-
-
-
-
-
-
-
-
-
-            stopping_criteria=StoppingCriteriaList([StopOnToken(self.stop_token_ids)])
-        )
+        generate_fn = partial(
+            self.model.generate,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            use_cache=True,
+            eos_token_id=None,
+            streamer=streamer,
+            stopping_criteria=StoppingCriteriaList([StopOnToken(self.stop_token_ids)])
+        )
 
-        thread = threading.Thread(target=generate_fn)
+        start = time()
+
+        thread = threading.Thread(target=generate_fn)
         thread.start()
 
         for new_text in streamer:
-            yield new_text
+            yield new_text
+
+        end = time()
+        total_time = end - start
+
+        self._log(f"Response generated in {total_time:.4f} seconds")
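LLaMA3's generation call is now bound with `functools.partial` and handed to the worker thread as a zero-argument callable, matching the `generate_fn` shape used by the other decoders. `threading.Thread(target=..., kwargs=...)` would achieve the same; `partial` simply packages the call as one object. A runnable sketch with a stand-in for `self.model.generate`:

```python
import threading
from functools import partial

def generate(**kwargs) -> None:
    # Stand-in for self.model.generate; just shows the bound arguments.
    print("generating with", sorted(kwargs))

# Bind the keyword arguments once, then hand the callable to the thread.
generate_fn = partial(generate, use_cache=True, eos_token_id=None)
thread = threading.Thread(target=generate_fn)
thread.start()
thread.join()
```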
--- llmflowstack-1.2.0/llmflowstack/decoders/LLaMA4.py
+++ llmflowstack-1.2.2/llmflowstack/decoders/LLaMA4.py
@@ -268,6 +268,8 @@ class LLaMA4(BaseDecoder):
             yield ""
             return
 
+        self._log(f"Processing received input...'")
+
         if params is None:
             params = GenerationParams(max_new_tokens=32768)
         elif params.max_new_tokens is None:
@@ -308,8 +310,15 @@ class LLaMA4(BaseDecoder):
             stopping_criteria=StoppingCriteriaList([StopOnToken(self.stop_token_ids)])
         )
 
+        start = time()
+
         thread = threading.Thread(target=generate_fn)
         thread.start()
 
         for new_text in streamer:
-            yield new_text
+            yield new_text
+
+        end = time()
+        total_time = end - start
+
+        self._log(f"Response generated in {total_time:.4f} seconds")
--- llmflowstack-1.2.0/llmflowstack/decoders/MedGemma.py
+++ llmflowstack-1.2.2/llmflowstack/decoders/MedGemma.py
@@ -199,6 +199,8 @@ class MedGemma(BaseDecoder):
             yield ""
             return
 
+        self._log(f"Processing received input...'")
+
         if params is None:
             params = GenerationParams(max_new_tokens=32768)
         elif params.max_new_tokens is None:
@@ -239,6 +241,8 @@ class MedGemma(BaseDecoder):
             stopping_criteria=StoppingCriteriaList([StopOnToken(self.stop_token_ids)])
         )
 
+        start = time()
+
         thread = threading.Thread(target=generate_fn)
         thread.start()
 
@@ -263,4 +267,9 @@ class MedGemma(BaseDecoder):
             else:
                 if buffer.find("<unused95>") != -1:
                     is_thinking = False
-                    buffer = buffer.split("<unused95>", 1)[1]
+                    buffer = buffer.split("<unused95>", 1)[1]
+
+        end = time()
+        total_time = end - start
+
+        self._log(f"Response generated in {total_time:.4f} seconds")
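MedGemma's streaming loop uses the `<unused95>` token text as its end-of-thinking sentinel, and the buffering is what lets that sentinel arrive split across streamed chunks. An illustrative analogue (the function name is ours, and the real loop has additional branches not visible in this hunk):

```python
from typing import Iterator

def strip_thinking(chunks: Iterator[str], sentinel: str = "<unused95>") -> Iterator[str]:
    # Suppress output while the model is "thinking"; once the sentinel
    # appears, emit only what follows it.
    is_thinking = True
    buffer = ""
    for new_text in chunks:
        buffer += new_text
        if is_thinking and buffer.find(sentinel) != -1:
            is_thinking = False
            buffer = buffer.split(sentinel, 1)[1]
        if not is_thinking:
            yield buffer
            buffer = ""

# The sentinel can straddle a chunk boundary, hence the buffer:
print("".join(strip_thinking(iter(["thinking...<unus", "ed95>42"]))))  # -> "42"
```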