DeepFabric 4.10.0__py3-none-any.whl → 4.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepfabric/cli.py +83 -27
- deepfabric/cloud_upload.py +1 -1
- deepfabric/config.py +6 -4
- deepfabric/constants.py +1 -1
- deepfabric/dataset_manager.py +264 -62
- deepfabric/generator.py +687 -82
- deepfabric/graph.py +25 -1
- deepfabric/llm/retry_handler.py +28 -9
- deepfabric/progress.py +42 -0
- deepfabric/topic_manager.py +22 -2
- deepfabric/topic_model.py +26 -0
- deepfabric/tree.py +41 -16
- deepfabric/tui.py +448 -349
- deepfabric/utils.py +4 -1
- {deepfabric-4.10.0.dist-info → deepfabric-4.11.0.dist-info}/METADATA +4 -2
- {deepfabric-4.10.0.dist-info → deepfabric-4.11.0.dist-info}/RECORD +19 -19
- {deepfabric-4.10.0.dist-info → deepfabric-4.11.0.dist-info}/licenses/LICENSE +1 -1
- {deepfabric-4.10.0.dist-info → deepfabric-4.11.0.dist-info}/WHEEL +0 -0
- {deepfabric-4.10.0.dist-info → deepfabric-4.11.0.dist-info}/entry_points.txt +0 -0
deepfabric/graph.py
CHANGED
|
@@ -26,7 +26,7 @@ from .prompts import (
|
|
|
26
26
|
)
|
|
27
27
|
from .schemas import GraphSubtopics
|
|
28
28
|
from .stream_simulator import simulate_stream
|
|
29
|
-
from .topic_model import TopicModel, TopicPath
|
|
29
|
+
from .topic_model import Topic, TopicModel, TopicPath
|
|
30
30
|
|
|
31
31
|
if TYPE_CHECKING: # only for type hints to avoid runtime cycles
|
|
32
32
|
from .progress import ProgressReporter
|
|
@@ -615,6 +615,30 @@ class Graph(TopicModel):
|
|
|
615
615
|
|
|
616
616
|
visited.remove(node.id)
|
|
617
617
|
|
|
618
|
+
def get_unique_topics(self) -> list[Topic]:
    """Collect one ``Topic`` per distinct node UUID in the graph.

    Walks ``self.nodes`` in its iteration order. The root node is left
    out, as is any node whose metadata has no truthy ``"uuid"`` entry.
    When several nodes carry the same UUID, only the first one seen is
    kept.

    Returns:
        List of ``Topic(uuid, topic)`` tuples, one per distinct UUID, in
        first-seen order.
    """
    # dicts preserve insertion order, so this doubles as the "seen" set
    # and the ordered result.
    by_uuid: dict[str, Topic] = {}

    for candidate in self.nodes.values():
        # The root holds the generation seed prompt rather than a topic.
        if candidate.id == self.root.id:
            continue

        identifier = candidate.metadata.get("uuid")
        if not identifier or identifier in by_uuid:
            continue

        by_uuid[identifier] = Topic(uuid=identifier, topic=candidate.topic)

    return list(by_uuid.values())
|
|
641
|
+
|
|
618
642
|
def _dfs_paths(
|
|
619
643
|
self, node: Node, current_path: list[str], paths: list[list[str]], visited: set[int]
|
|
620
644
|
) -> None:
|
deepfabric/llm/retry_handler.py
CHANGED
|
@@ -7,15 +7,21 @@ import time
|
|
|
7
7
|
|
|
8
8
|
from collections.abc import Callable, Coroutine
|
|
9
9
|
from functools import wraps
|
|
10
|
-
from typing import Any, TypeVar
|
|
10
|
+
from typing import TYPE_CHECKING, Any, TypeVar
|
|
11
11
|
|
|
12
12
|
from .rate_limit_config import BackoffStrategy, RateLimitConfig
|
|
13
13
|
from .rate_limit_detector import RateLimitDetector
|
|
14
14
|
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from deepfabric.progress import ProgressReporter
|
|
17
|
+
|
|
15
18
|
logger = logging.getLogger(__name__)
|
|
16
19
|
|
|
17
20
|
T = TypeVar("T")
|
|
18
21
|
|
|
22
|
+
# Max chars for error summaries emitted through progress reporter
|
|
23
|
+
_ERROR_SUMMARY_MAX_LENGTH = 200
|
|
24
|
+
|
|
19
25
|
|
|
20
26
|
class RetryHandler:
|
|
21
27
|
"""Intelligent retry handler for LLM API calls with provider-aware backoff."""
|
|
@@ -30,6 +36,7 @@ class RetryHandler:
|
|
|
30
36
|
self.config = config
|
|
31
37
|
self.provider = provider
|
|
32
38
|
self.detector = RateLimitDetector()
|
|
39
|
+
self.progress_reporter: ProgressReporter | None = None
|
|
33
40
|
|
|
34
41
|
def should_retry(self, exception: Exception) -> bool:
|
|
35
42
|
"""Determine if an exception should trigger a retry.
|
|
@@ -126,14 +133,26 @@ class RetryHandler:
|
|
|
126
133
|
if quota_info.quota_type:
|
|
127
134
|
quota_info_str = f" (quota_type: {quota_info.quota_type})"
|
|
128
135
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
136
|
+
if self.progress_reporter:
|
|
137
|
+
error_summary = str(exception)
|
|
138
|
+
if len(error_summary) > _ERROR_SUMMARY_MAX_LENGTH:
|
|
139
|
+
error_summary = error_summary[:_ERROR_SUMMARY_MAX_LENGTH] + "..."
|
|
140
|
+
self.progress_reporter.emit_llm_retry(
|
|
141
|
+
provider=self.provider,
|
|
142
|
+
attempt=tries,
|
|
143
|
+
wait=wait,
|
|
144
|
+
error_summary=error_summary,
|
|
145
|
+
quota_type=quota_info_str.strip(" ()") if quota_info_str else "",
|
|
146
|
+
)
|
|
147
|
+
else:
|
|
148
|
+
logger.warning(
|
|
149
|
+
"Rate limit/transient error for %s on attempt %d, backing off %.2fs%s: %s",
|
|
150
|
+
self.provider,
|
|
151
|
+
tries,
|
|
152
|
+
wait,
|
|
153
|
+
quota_info_str,
|
|
154
|
+
exception,
|
|
155
|
+
)
|
|
137
156
|
|
|
138
157
|
def on_giveup_handler(self, details: dict[str, Any]) -> None:
|
|
139
158
|
"""Callback when giving up after max retries.
|
deepfabric/progress.py
CHANGED
|
@@ -81,6 +81,25 @@ class StreamObserver(Protocol):
|
|
|
81
81
|
"""
|
|
82
82
|
...
|
|
83
83
|
|
|
84
|
+
def on_llm_retry(
    self,
    provider: str,
    attempt: int,
    wait: float,
    error_summary: str,
    metadata: dict[str, Any],
) -> None:
    """Observer hook: an LLM API call is about to be retried.

    Fired for rate-limit and transient-error retries.

    Args:
        provider: Name of the LLM provider, e.g. "gemini" or "openai".
        attempt: 1-based number of the current attempt.
        wait: Backoff delay, in seconds, before the next attempt.
        error_summary: Short description of the triggering error.
        metadata: Extra context for the event (e.g. ``quota_type``).
    """
    ...
|
|
102
|
+
|
|
84
103
|
|
|
85
104
|
class ProgressReporter:
|
|
86
105
|
"""Central progress reporter that notifies observers of generation events.
|
|
@@ -184,6 +203,29 @@ class ProgressReporter:
|
|
|
184
203
|
if hasattr(observer, "on_retry"):
|
|
185
204
|
observer.on_retry(sample_idx, attempt, max_attempts, error_summary, metadata)
|
|
186
205
|
|
|
206
|
+
def emit_llm_retry(
    self,
    provider: str,
    attempt: int,
    wait: float,
    error_summary: str,
    **metadata,
) -> None:
    """Broadcast an LLM retry event to the attached observers.

    Observers that do not define ``on_llm_retry`` are skipped. The extra
    keyword arguments are handed to each observer as a single dict.

    Args:
        provider: Name of the LLM provider.
        attempt: 1-based number of the current attempt.
        wait: Backoff delay in seconds.
        error_summary: Short description of the error.
        **metadata: Additional context, forwarded as a dict.
    """
    # Lazy filter: each observer is checked right before it is notified.
    listeners = (obs for obs in self._observers if hasattr(obs, "on_llm_retry"))
    for listener in listeners:
        listener.on_llm_retry(provider, attempt, wait, error_summary, metadata)
|
|
228
|
+
|
|
187
229
|
def emit_tool_execution(
|
|
188
230
|
self,
|
|
189
231
|
tool_name: str,
|
deepfabric/topic_manager.py
CHANGED
|
@@ -45,6 +45,8 @@ async def _process_graph_events(graph: Graph, debug: bool = False) -> dict | Non
|
|
|
45
45
|
progress_reporter = ProgressReporter()
|
|
46
46
|
progress_reporter.attach(tui)
|
|
47
47
|
graph.progress_reporter = progress_reporter
|
|
48
|
+
if hasattr(graph, "llm_client"):
|
|
49
|
+
graph.llm_client.retry_handler.progress_reporter = progress_reporter
|
|
48
50
|
|
|
49
51
|
tui_started = False
|
|
50
52
|
|
|
@@ -116,6 +118,8 @@ async def _process_tree_events(tree: Tree, debug: bool = False) -> dict | None:
|
|
|
116
118
|
progress_reporter = ProgressReporter()
|
|
117
119
|
progress_reporter.attach(tui)
|
|
118
120
|
tree.progress_reporter = progress_reporter
|
|
121
|
+
if hasattr(tree, "llm_client"):
|
|
122
|
+
tree.llm_client.retry_handler.progress_reporter = progress_reporter
|
|
119
123
|
|
|
120
124
|
final_event = None
|
|
121
125
|
try:
|
|
@@ -129,6 +133,8 @@ async def _process_tree_events(tree: Tree, debug: bool = False) -> dict | None:
|
|
|
129
133
|
tui.add_failure()
|
|
130
134
|
if debug and "error" in event:
|
|
131
135
|
get_tui().error(f"Debug: Tree generation failure - {event['error']}")
|
|
136
|
+
else:
|
|
137
|
+
tui.advance_simple_progress()
|
|
132
138
|
elif event["event"] == "build_complete":
|
|
133
139
|
total_paths = (
|
|
134
140
|
int(event["total_paths"]) if isinstance(event["total_paths"], str | int) else 0
|
|
@@ -233,8 +239,22 @@ def load_or_build_topic_model(
|
|
|
233
239
|
tui = get_tui()
|
|
234
240
|
|
|
235
241
|
if topics_load:
|
|
236
|
-
#
|
|
237
|
-
is_graph = config.topics.mode == "graph"
|
|
242
|
+
# Config mode takes precedence; file extension is only used to warn on mismatch
|
|
243
|
+
is_graph = config.topics.mode == "graph"
|
|
244
|
+
|
|
245
|
+
# Warn if file extension doesn't match the configured mode
|
|
246
|
+
if not is_graph and topics_load.endswith(".json"):
|
|
247
|
+
tui.warning(
|
|
248
|
+
f"File '{topics_load}' has .json extension (typically a graph) "
|
|
249
|
+
f"but mode is '{config.topics.mode}'. "
|
|
250
|
+
"If this is a graph set mode: graph in config."
|
|
251
|
+
)
|
|
252
|
+
elif is_graph and topics_load.endswith(".jsonl"):
|
|
253
|
+
tui.warning(
|
|
254
|
+
f"File '{topics_load}' has .jsonl extension (typically a tree) "
|
|
255
|
+
"but mode is 'graph'. "
|
|
256
|
+
"If this is a tree set mode: tree in config."
|
|
257
|
+
)
|
|
238
258
|
|
|
239
259
|
if is_graph:
|
|
240
260
|
tui.info(f"Reading topic graph from JSON file: {topics_load}")
|
deepfabric/topic_model.py
CHANGED
|
@@ -9,6 +9,18 @@ class TopicPath(NamedTuple):
|
|
|
9
9
|
topic_id: str
|
|
10
10
|
|
|
11
11
|
|
|
12
|
+
class Topic(NamedTuple):
    """One distinct topic: its identifier paired with its text.

    This is the deduplicated, per-UUID view used when generating over
    topics instead of paths. It matters for graphs, where several paths
    may end at the same topic node.
    """

    # Identifier that distinguishes this topic from every other one.
    uuid: str
    # The topic text/content.
    topic: str
|
|
22
|
+
|
|
23
|
+
|
|
12
24
|
class TopicModel(ABC):
|
|
13
25
|
"""Abstract base class for topic models like Tree and Graph."""
|
|
14
26
|
|
|
@@ -37,6 +49,20 @@ class TopicModel(ABC):
|
|
|
37
49
|
"""
|
|
38
50
|
raise NotImplementedError
|
|
39
51
|
|
|
52
|
+
@abstractmethod
def get_unique_topics(self) -> list[Topic]:
    """Return the model's topics with duplicates (by UUID) removed.

    Generation walks unique topics rather than paths. In a graph,
    several paths can reach the same topic node, and only one sample
    should be produced per unique topic.

    Returns:
        List of ``Topic(uuid, topic)`` tuples in which every UUID occurs
        exactly once.
    """
    raise NotImplementedError
|
|
65
|
+
|
|
40
66
|
def get_path_by_id(self, topic_id: str) -> list[str] | None:
|
|
41
67
|
"""Look up a path by its topic_id.
|
|
42
68
|
|
deepfabric/tree.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import hashlib
|
|
2
3
|
import json
|
|
3
4
|
import time
|
|
4
5
|
import warnings
|
|
@@ -21,7 +22,7 @@ from .metrics import trace
|
|
|
21
22
|
from .prompts import TreePromptBuilder
|
|
22
23
|
from .schemas import TopicList
|
|
23
24
|
from .stream_simulator import simulate_stream
|
|
24
|
-
from .topic_model import TopicModel, TopicPath
|
|
25
|
+
from .topic_model import Topic, TopicModel, TopicPath
|
|
25
26
|
|
|
26
27
|
warnings.filterwarnings("ignore", message=".*Pydantic serializer warnings:.*")
|
|
27
28
|
|
|
@@ -242,24 +243,41 @@ class Tree(TopicModel):
|
|
|
242
243
|
"""Returns all the paths in the topic model."""
|
|
243
244
|
return self.tree_paths
|
|
244
245
|
|
|
246
|
+
@staticmethod
|
|
247
|
+
def _path_to_id(path: list[str]) -> str:
|
|
248
|
+
"""Compute a deterministic topic ID from a tree path."""
|
|
249
|
+
return hashlib.sha256(json.dumps(path).encode()).hexdigest()[:16]
|
|
250
|
+
|
|
251
|
+
def _add_path(self, path: list[str]) -> None:
|
|
252
|
+
"""Add a path to the tree.
|
|
253
|
+
|
|
254
|
+
Args:
|
|
255
|
+
path: The topic path to add.
|
|
256
|
+
"""
|
|
257
|
+
self.tree_paths.append(path)
|
|
258
|
+
|
|
245
259
|
def get_all_paths_with_ids(self) -> list[TopicPath]:
    """Pair every stored path with its identifier.

    Returns:
        List of ``TopicPath(path, topic_id)`` tuples, one per entry of
        ``self.tree_paths`` and in the same order. Each ``topic_id`` is
        computed from the path content by ``_path_to_id``.
    """
    labelled: list[TopicPath] = []
    for path in self.tree_paths:
        labelled.append(TopicPath(path=path, topic_id=self._path_to_id(path)))
    return labelled
|
|
255
267
|
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
268
|
+
def get_unique_topics(self) -> list[Topic]:
    """Return the leaf topic of every stored path, with a computed ID.

    Each entry of ``self.tree_paths`` yields one ``Topic`` whose ``uuid``
    comes from ``_path_to_id`` and whose ``topic`` is the last element of
    the path (an empty string for an empty path).

    Returns:
        List of ``Topic(uuid, topic)`` tuples, in ``tree_paths`` order.
    """
    # NOTE(review): no deduplication happens here; this relies on
    # tree_paths holding no repeated path — confirm against callers.
    leaves: list[Topic] = []
    for path in self.tree_paths:
        leaf_text = path[-1] if path else ""
        leaves.append(Topic(uuid=self._path_to_id(path), topic=leaf_text))
    return leaves
|
|
263
281
|
|
|
264
282
|
async def get_subtopics(
|
|
265
283
|
self, system_prompt: str, node_path: list[str], num_subtopics: int
|
|
@@ -361,7 +379,7 @@ class Tree(TopicModel):
|
|
|
361
379
|
yield {"event": "subtree_start", "node_path": node_path, "depth": current_depth}
|
|
362
380
|
|
|
363
381
|
if current_depth > total_depth:
|
|
364
|
-
self.
|
|
382
|
+
self._add_path(node_path)
|
|
365
383
|
yield {"event": "leaf_reached", "path": node_path}
|
|
366
384
|
return
|
|
367
385
|
|
|
@@ -383,7 +401,7 @@ class Tree(TopicModel):
|
|
|
383
401
|
yield event
|
|
384
402
|
|
|
385
403
|
if not subtopics:
|
|
386
|
-
self.
|
|
404
|
+
self._add_path(node_path)
|
|
387
405
|
yield {"event": "leaf_reached", "path": node_path}
|
|
388
406
|
return
|
|
389
407
|
|
|
@@ -403,7 +421,11 @@ class Tree(TopicModel):
|
|
|
403
421
|
yield child_event
|
|
404
422
|
|
|
405
423
|
def save(self, save_path: str) -> None:
|
|
406
|
-
"""Save the topic tree to a file.
|
|
424
|
+
"""Save the topic tree to a file.
|
|
425
|
+
|
|
426
|
+
Format: {"path": [...]}
|
|
427
|
+
IDs are computed on-the-fly from path content, not persisted.
|
|
428
|
+
"""
|
|
407
429
|
from pathlib import Path # noqa: PLC0415
|
|
408
430
|
|
|
409
431
|
Path(save_path).parent.mkdir(parents=True, exist_ok=True)
|
|
@@ -446,8 +468,11 @@ class Tree(TopicModel):
|
|
|
446
468
|
def from_dict_list(self, dict_list: list[dict[str, Any]]) -> None:
|
|
447
469
|
"""Construct the topic tree from a list of dictionaries.
|
|
448
470
|
|
|
471
|
+
Accepts both the current format (``{"path": [...]}``) and the
|
|
472
|
+
legacy format that included a ``leaf_uuid`` field (silently ignored).
|
|
473
|
+
|
|
449
474
|
Args:
|
|
450
|
-
dict_list
|
|
475
|
+
dict_list: The list of dictionaries representing the topic tree.
|
|
451
476
|
"""
|
|
452
477
|
# Clear existing data
|
|
453
478
|
self.tree_paths = []
|