llmflowstack 1.2.0__tar.gz → 1.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/PKG-INFO +1 -1
  2. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/GPT_OSS.py +21 -7
  3. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/Gemma.py +10 -1
  4. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/LLaMA3.py +21 -13
  5. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/LLaMA4.py +10 -1
  6. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/MedGemma.py +10 -1
  7. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/pyproject.toml +1 -1
  8. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/.github/workflows/python-publish.yml +0 -0
  9. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/.gitignore +0 -0
  10. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/LICENSE +0 -0
  11. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/README.md +0 -0
  12. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/__init__.py +0 -0
  13. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/callbacks/__init__.py +0 -0
  14. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/callbacks/log_collector.py +0 -0
  15. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/callbacks/stop_on_token.py +0 -0
  16. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/BaseDecoder.py +0 -0
  17. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/__init__.py +0 -0
  18. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/rag/VectorDatabase.py +0 -0
  19. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/rag/__init__.py +0 -0
  20. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/schemas/__init__.py +0 -0
  21. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/schemas/params.py +0 -0
  22. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/utils/__init__.py +0 -0
  23. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/utils/evaluation_methods.py +0 -0
  24. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/utils/exceptions.py +0 -0
  25. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/utils/generation_utils.py +0 -0
  26. {llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/utils/logging.py +0 -0
{llmflowstack-1.2.0 → llmflowstack-1.2.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llmflowstack
-Version: 1.2.0
+Version: 1.2.2
 Summary: LLMFlowStack is a framework for training and using LLMs (LLaMA, GPT-OSS, Gemma, ...). Supports DAPT, fine-tuning, and distributed inference. Public fork without institution-specific components.
 Author-email: Gustavo Henrique Ferreira Cruz <gustavohferreiracruz@gmail.com>
 License: MIT
{llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/GPT_OSS.py
@@ -24,11 +24,11 @@ class GPTOSSInput(TypedDict):
     developer_message: str | None
     expected_answer: str | None
     reasoning_message: str | None
-    reasoning_level: Literal["Low", "Medium", "High"] | None
+    reasoning_level: Literal["Low", "Medium", "High", "Off"] | None

 class GPT_OSS(BaseDecoder):
     model: GptOssForCausalLM | None = None
-    reasoning_level: Literal["Low", "Medium", "High"] = "Low"
+    reasoning_level: Literal["Low", "Medium", "High", "Off"] = "Low"
     question_fields = ["input_text", "developer_message", "system_message"]
     answer_fields = ["expected_answer", "reasoning_message"]

@@ -102,6 +102,8 @@ class GPT_OSS(BaseDecoder):

         system_message = data.get("system_message", "")
         system_text = f"<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\n\nReasoning: {reasoning}\n\n{system_message}# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|>"
+        if reasoning == "Off":
+            system_text = f"<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\n\n{system_message}# Valid channels: final. Channel must be included for every message.<|end|>"

         developer_text = ""
         developer_message = data.get("developer_message", "")
@@ -117,6 +119,9 @@ class GPT_OSS(BaseDecoder):
         if expected_answer:
             assistant_text += f"<|start|>assistant<|channel|>final<|message|>{expected_answer}<|return|>"

+        if not expected_answer and reasoning == "Off":
+            assistant_text = "<|start|>assistant<|channel|>analysis<|message|><|end|><|start|>assistant<|channel|>final<|message|>"
+
         return (
             f"{system_text}{developer_text}"
             f"<|start|>user<|message|>{data["input_text"]}<|end|>"
@@ -130,7 +135,7 @@ class GPT_OSS(BaseDecoder):
         developer_message: str | None = None,
         expected_answer: str | None = None,
         reasoning_message: str | None = None,
-        reasoning_level: Literal["Low", "Medium", "High"] | None = None
+        reasoning_level: Literal["Low", "Medium", "High", "Off"] | None = None
     ) -> GPTOSSInput:
         if not self.tokenizer:
             raise MissingEssentialProp("Could not find tokenizer.")
@@ -146,7 +151,7 @@ class GPT_OSS(BaseDecoder):

     def set_reasoning_level(
         self,
-        level: Literal["Low", "Medium", "High"]
+        level: Literal["Low", "Medium", "High", "Off"]
     ) -> None:
         self.reasoning_level = level

@@ -229,6 +234,8 @@ class GPT_OSS(BaseDecoder):
             yield ""
             return

+        self._log(f"Processing received input...'")
+
         if params is None:
             params = GenerationParams(max_new_tokens=32768)
         elif params.max_new_tokens is None:
@@ -268,19 +275,26 @@ class GPT_OSS(BaseDecoder):
             stopping_criteria=StoppingCriteriaList([StopOnToken(self.stop_token_ids)])
         )

+        start = time()
+
         thread = threading.Thread(target=generate_fn)
         thread.start()

-        done_thinking = False
+        done_thinking = self.reasoning_level == "Off"
         buffer = ""

         for new_text in streamer:
             buffer += new_text

-            if "final" in buffer:
+            if "final" in buffer and not done_thinking:
                 done_thinking = True
                 buffer = buffer.split("final", 1)[1]

             if done_thinking:
                 yield buffer
-                buffer = ""
+                buffer = ""
+
+        end = time()
+        total_time = end - start
+
+        self._log(f"Response generated in {total_time:.4f} seconds")
{llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/Gemma.py
@@ -270,6 +270,8 @@ class Gemma3(BaseDecoder):
         if False:
             yield ""
             return
+
+        self._log(f"Processing received input...'")

         if params is None:
             params = GenerationParams(max_new_tokens=32768)
@@ -311,8 +313,15 @@ class Gemma3(BaseDecoder):
             stopping_criteria=StoppingCriteriaList([StopOnToken(self.stop_token_ids)])
         )

+        start = time()
+
         thread = threading.Thread(target=generate_fn)
         thread.start()

         for new_text in streamer:
-            yield new_text
+            yield new_text
+
+        end = time()
+        total_time = end - start
+
+        self._log(f"Response generated in {total_time:.4f} seconds")
{llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/LLaMA3.py
@@ -1,4 +1,5 @@
 import threading
+from functools import partial
 from time import time
 from typing import Iterator, Literal, TypedDict, cast

@@ -187,6 +188,8 @@ class LLaMA3(BaseDecoder):
             yield ""
             return

+        self._log(f"Processing received input...'")
+
         if params is None:
             params = GenerationParams(max_new_tokens=8192)
         elif params.max_new_tokens is None:
@@ -217,20 +220,25 @@ class LLaMA3(BaseDecoder):
             skip_special_tokens=True
         )

-        def _generate() -> None:
-            assert self.model is not None
-            with torch.no_grad():
-                self.model.generate(
-                    input_ids=input_ids,
-                    attention_mask=attention_mask,
-                    use_cache=True,
-                    eos_token_id=None,
-                    streamer=streamer,
-                    stopping_criteria=StoppingCriteriaList([StopOnToken(self.stop_token_ids)])
-                )
+        generate_fn = partial(
+            self.model.generate,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            use_cache=True,
+            eos_token_id=None,
+            streamer=streamer,
+            stopping_criteria=StoppingCriteriaList([StopOnToken(self.stop_token_ids)])
+        )

-        thread = threading.Thread(target=_generate)
+        start = time()
+
+        thread = threading.Thread(target=generate_fn)
         thread.start()

         for new_text in streamer:
-            yield new_text
+            yield new_text
+
+        end = time()
+        total_time = end - start
+
+        self._log(f"Response generated in {total_time:.4f} seconds")
{llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/LLaMA4.py
@@ -268,6 +268,8 @@ class LLaMA4(BaseDecoder):
             yield ""
             return

+        self._log(f"Processing received input...'")
+
         if params is None:
             params = GenerationParams(max_new_tokens=32768)
         elif params.max_new_tokens is None:
@@ -308,8 +310,15 @@ class LLaMA4(BaseDecoder):
             stopping_criteria=StoppingCriteriaList([StopOnToken(self.stop_token_ids)])
         )

+        start = time()
+
         thread = threading.Thread(target=generate_fn)
         thread.start()

         for new_text in streamer:
-            yield new_text
+            yield new_text
+
+        end = time()
+        total_time = end - start
+
+        self._log(f"Response generated in {total_time:.4f} seconds")
{llmflowstack-1.2.0 → llmflowstack-1.2.2}/llmflowstack/decoders/MedGemma.py
@@ -199,6 +199,8 @@ class MedGemma(BaseDecoder):
             yield ""
             return

+        self._log(f"Processing received input...'")
+
         if params is None:
             params = GenerationParams(max_new_tokens=32768)
         elif params.max_new_tokens is None:
@@ -239,6 +241,8 @@ class MedGemma(BaseDecoder):
             stopping_criteria=StoppingCriteriaList([StopOnToken(self.stop_token_ids)])
         )

+        start = time()
+
         thread = threading.Thread(target=generate_fn)
         thread.start()

@@ -263,4 +267,9 @@ class MedGemma(BaseDecoder):
             else:
                 if buffer.find("<unused95>") != -1:
                     is_thinking = False
-                    buffer = buffer.split("<unused95>", 1)[1]
+                    buffer = buffer.split("<unused95>", 1)[1]
+
+        end = time()
+        total_time = end - start
+
+        self._log(f"Response generated in {total_time:.4f} seconds")
{llmflowstack-1.2.0 → llmflowstack-1.2.2}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "llmflowstack"
-version = "1.2.0"
+version = "1.2.2"
 authors = [
     { name = "Gustavo Henrique Ferreira Cruz", email = "gustavohferreiracruz@gmail.com" }
 ]