DeepFabric 4.8.1-py3-none-any.whl → 4.8.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepfabric/config.py +5 -0
- deepfabric/evaluation/backends/transformers_backend.py +39 -17
- deepfabric/graph.py +51 -11
- deepfabric/metrics.py +2 -2
- deepfabric/prompts.py +134 -0
- deepfabric/schemas.py +13 -3
- deepfabric/utils.py +33 -0
- {deepfabric-4.8.1.dist-info → deepfabric-4.8.3.dist-info}/METADATA +6 -5
- {deepfabric-4.8.1.dist-info → deepfabric-4.8.3.dist-info}/RECORD +12 -12
- {deepfabric-4.8.1.dist-info → deepfabric-4.8.3.dist-info}/WHEEL +0 -0
- {deepfabric-4.8.1.dist-info → deepfabric-4.8.3.dist-info}/entry_points.txt +0 -0
- {deepfabric-4.8.1.dist-info → deepfabric-4.8.3.dist-info}/licenses/LICENSE +0 -0
deepfabric/config.py
CHANGED
@@ -109,6 +109,10 @@ class TopicsConfig(BaseModel):
         description="Maximum concurrent LLM calls during graph expansion (helps avoid rate limits)",
     )
     save_as: str | None = Field(default=None, description="Where to save the generated topics")
+    prompt_style: Literal["default", "isolated", "anchored"] = Field(
+        default="default",
+        description="For graph mode: 'default' enables cross-connections with generic prompts, 'isolated' disables connections with generic prompts, 'anchored' disables connections and uses domain-aware prompts with examples for focused topic generation",
+    )
 
     # Optional LLM overrides (inherits from top-level llm if not specified)
     llm: LLMConfig | None = Field(
@@ -603,6 +607,7 @@ See documentation for full examples.
             "depth": self.topics.depth,
             "degree": self.topics.degree,
             "max_concurrent": self.topics.max_concurrent,
+            "prompt_style": self.topics.prompt_style,
         }
 
         # Handle overrides
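The new field is an ordinary pydantic `Literal` field, so an unknown style is rejected when the config is loaded rather than mid-generation. A minimal sketch of that behavior (the model below is a trimmed, hypothetical stand-in, not the real `TopicsConfig`):

```python
from typing import Literal

from pydantic import BaseModel, Field, ValidationError


class TopicsConfigSketch(BaseModel):
    """Trimmed stand-in for deepfabric's TopicsConfig (illustrative only)."""

    prompt_style: Literal["default", "isolated", "anchored"] = Field(default="default")


print(TopicsConfigSketch().prompt_style)                         # default
print(TopicsConfigSketch(prompt_style="anchored").prompt_style)  # anchored

try:
    TopicsConfigSketch(prompt_style="fancy")  # rejected by the Literal constraint
except ValidationError as err:
    print(err.error_count(), "validation error")
```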
deepfabric/evaluation/backends/transformers_backend.py
CHANGED
@@ -2,13 +2,13 @@ import json
 import logging
 import sys
 
+from functools import cached_property
 from typing import Any
 
-import torch
-
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 
 from ...schemas import ToolDefinition
+from ...utils import import_optional_dependency
 from ..inference import InferenceBackend, InferenceConfig, ModelResponse
 from .tool_call_parsers import ToolCallParser, get_parser
 
@@ -29,6 +29,30 @@ logger = logging.getLogger(__name__)
 class TransformersBackend(InferenceBackend):
     """Inference backend using HuggingFace Transformers."""
 
+    @cached_property
+    def _torch(self) -> Any:
+        """Dynamically import 'torch' and verify its availability.
+
+        Returns:
+            The imported torch module.
+
+        Raises:
+            ModuleNotFoundError: If 'torch' is not installed in the environment.
+        """
+        return import_optional_dependency("torch", "training")
+
+    @cached_property
+    def _peft(self) -> Any:
+        """Dynamically import 'peft' and verify its availability.
+
+        Returns:
+            The imported peft module.
+
+        Raises:
+            ModuleNotFoundError: If 'peft' is not installed in the environment.
+        """
+        return import_optional_dependency("peft", "training")
+
     def __init__(self, config: InferenceConfig):
         """Initialize Transformers backend.
 
@@ -47,22 +71,22 @@ class TransformersBackend(InferenceBackend):
             # Get device from pre-loaded model
             self.device = str(next(config.model.parameters()).device)
         # Auto-detect best available device
-        elif torch.cuda.is_available():
+        elif self._torch.cuda.is_available():
             self.device = "cuda"
-        elif torch.backends.mps.is_available():
+        elif self._torch.backends.mps.is_available():
             self.device = "mps"
         else:
             self.device = "cpu"
 
         # Determine dtype based on device
         if self.device == "cuda" or self.device.startswith("cuda:"):
-            dtype = torch.float16
+            dtype = self._torch.float16
             device_map = "auto"
         elif self.device == "mps":
-            dtype = torch.float32
+            dtype = self._torch.float32  # MPS works best with float32
             device_map = None
         else:
-            dtype = torch.float32
+            dtype = self._torch.float32
             device_map = None
 
         # Handle pre-loaded model case - skip all loading logic
@@ -138,9 +162,9 @@ class TransformersBackend(InferenceBackend):
                 load_in_4bit=config.load_in_4bit,
             )
             # Load LoRA adapter using PEFT
-
-
-
+            self.model = self._peft.PeftModel.from_pretrained(
+                self.model, config.adapter_path
+            )
         else:
             # Load merged model or base model directly
             self.model, self.tokenizer = FastLanguageModel.from_pretrained(
@@ -172,9 +196,7 @@ class TransformersBackend(InferenceBackend):
 
         # Load PEFT adapter if provided
         if config.adapter_path:
-
-
-            self.model = PeftModel.from_pretrained(self.model, config.adapter_path)
+            self.model = self._peft.PeftModel.from_pretrained(self.model, config.adapter_path)
 
         # Move to device if not using device_map
         if self.device in ("cpu", "mps"):
@@ -217,7 +239,7 @@ class TransformersBackend(InferenceBackend):
         ).to(self.model.device)
 
         # Generate with optimizations
-        with torch.no_grad():
+        with self._torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
                 max_new_tokens=self.config.max_tokens,
@@ -273,7 +295,7 @@ class TransformersBackend(InferenceBackend):
         ).to(self.model.device)
 
         # Generate batch with optimizations
-        with torch.no_grad():
+        with self._torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
                 max_new_tokens=self.config.max_tokens,
@@ -316,8 +338,8 @@ class TransformersBackend(InferenceBackend):
         del self.model
         if hasattr(self, "tokenizer"):
             del self.tokenizer
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
+        if self._torch.cuda.is_available():
+            self._torch.cuda.empty_cache()
 
     def _format_prompt(
         self,
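The recurring `self._torch...` references above rely on `functools.cached_property`, so torch is imported once, on first use, and only if it is actually needed. A standalone sketch of the idiom (using `json` as the stand-in module so the snippet runs anywhere):

```python
import importlib
from functools import cached_property
from typing import Any


class LazyBackend:
    """Illustrates the cached_property lazy-import idiom used above."""

    @cached_property
    def _json(self) -> Any:
        # Runs on first attribute access, then the module is cached on the
        # instance; a missing module would raise here, not at import time.
        return importlib.import_module("json")

    def dumps(self, obj: Any) -> str:
        return self._json.dumps(obj)


backend = LazyBackend()
print(backend.dumps({"device": "cpu"}))  # {"device": "cpu"}
```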
deepfabric/graph.py
CHANGED
@@ -5,7 +5,7 @@ import textwrap
 import uuid
 
 from datetime import datetime, timezone
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Literal
 
 from pydantic import BaseModel, ConfigDict, Field
 
@@ -19,7 +19,11 @@ from .constants import (
 from .llm import LLMClient
 from .llm.rate_limit_detector import RateLimitDetector
 from .metrics import trace
-from .prompts import GRAPH_EXPANSION_PROMPT
+from .prompts import (
+    GRAPH_EXPANSION_PROMPT,
+    GRAPH_EXPANSION_PROMPT_NO_CONNECTIONS,
+    GraphPromptBuilder,
+)
 from .schemas import GraphSubtopics
 from .stream_simulator import simulate_stream
 from .topic_model import TopicModel
@@ -70,6 +74,10 @@ class GraphConfig(BaseModel):
         default=None,
         description="Base URL for API endpoint (e.g., custom OpenAI-compatible servers)",
     )
+    prompt_style: Literal["default", "isolated", "anchored"] = Field(
+        default="default",
+        description="Prompt style: 'default' (cross-connections, generic), 'isolated' (no connections, generic), 'anchored' (no connections, domain-aware)",
+    )
 
 
 class GraphMetadata(BaseModel):
@@ -148,6 +156,7 @@ class Graph(TopicModel):
         self.degree = self.config.degree
         self.depth = self.config.depth
         self.max_concurrent = self.config.max_concurrent
+        self.prompt_style = self.config.prompt_style
 
         # Initialize LLM client
         llm_kwargs = {}
@@ -493,6 +502,17 @@ class Graph(TopicModel):
         )
         return None
 
+    def _get_path_to_node(self, node: Node) -> list[str]:
+        """Get the topic path from root to the given node."""
+        path = []
+        current = node
+        while current is not None:
+            path.append(current.topic)
+            # First parent is the primary parent from tree expansion;
+            # cross-connections are added later and appear after index 0
+            current = current.parents[0] if current.parents else None
+        return list(reversed(path))
+
     async def get_subtopics_and_connections(
         self, parent_node: Node, num_subtopics: int
     ) -> tuple[int, int]:
@@ -505,15 +525,35 @@ class Graph(TopicModel):
         Returns:
             A tuple of (subtopics_added, connections_added).
         """
-
-
-
-
-
-
-
-
-
+        # Choose prompt based on prompt_style setting
+        if self.prompt_style == "anchored":
+            # Domain-aware prompts with examples for focused generation
+            topic_path = self._get_path_to_node(parent_node)
+            domain = GraphPromptBuilder.detect_domain(self.model_system_prompt, topic_path)
+            graph_prompt = GraphPromptBuilder.build_anchored_prompt(
+                topic_path=topic_path,
+                num_subtopics=num_subtopics,
+                system_prompt=self.model_system_prompt,
+                domain=domain,
+            )
+        elif self.prompt_style == "isolated":
+            # No connections, generic prompt
+            graph_prompt = GRAPH_EXPANSION_PROMPT_NO_CONNECTIONS.replace(
+                "{{current_topic}}", parent_node.topic
+            )
+            graph_prompt = graph_prompt.replace("{{num_subtopics}}", str(num_subtopics))
+        else:
+            # default: cross-connections enabled, generic prompt
+            graph_summary = (
+                self.to_json()
+                if len(self.nodes) <= TOPIC_GRAPH_SUMMARY
+                else "Graph too large to display"
+            )
+            graph_prompt = GRAPH_EXPANSION_PROMPT.replace(
+                "{{current_graph_summary}}", graph_summary
+            )
+            graph_prompt = graph_prompt.replace("{{current_topic}}", parent_node.topic)
+            graph_prompt = graph_prompt.replace("{{num_subtopics}}", str(num_subtopics))
 
         response = await self._generate_subtopics_with_retry(graph_prompt, parent_node)
         if response is None:
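Note that `_get_path_to_node` only ever follows the first parent, which keeps anchored prompts on the original tree spine even after cross-connections are added. A runnable sketch of that walk (the `Node` here is a trimmed stand-in for deepfabric's graph node):

```python
from dataclasses import dataclass, field


@dataclass
class Node:
    """Trimmed stand-in for deepfabric's graph Node (topic + parents only)."""

    topic: str
    parents: list["Node"] = field(default_factory=list)


def get_path_to_node(node: Node) -> list[str]:
    # Same walk as Graph._get_path_to_node: follow the primary parent
    # (index 0) up to the root, then reverse the collected topics.
    path: list[str] = []
    current = node
    while current is not None:
        path.append(current.topic)
        current = current.parents[0] if current.parents else None
    return list(reversed(path))


root = Node("Security Threats")
cred = Node("Credential Access", parents=[root])
leaf = Node("reading .env files", parents=[cred])
print(get_path_to_node(leaf))  # ['Security Threats', 'Credential Access', 'reading .env files']
```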
deepfabric/metrics.py
CHANGED
@@ -19,8 +19,8 @@ except (ImportError, importlib.metadata.PackageNotFoundError):
 
 # Initialize PostHog client
 posthog = Posthog(
-    project_api_key="
-    host="https://
+    project_api_key="phc_JZWiTzIDNnBp6Jj6uUb0JQKuIp3dv0gkay9aU50n38h",
+    host="https://eu.i.posthog.com",
 )
 
 logger = logging.getLogger(__name__)
deepfabric/prompts.py
CHANGED
@@ -264,6 +264,140 @@ Generate a list of {{num_subtopics}} subtopics. For each subtopic, provide:
 2. A "connections" list of IDs of existing topics it should connect to for creating cross-links (use empty list if no connections)
 """
 
+GRAPH_EXPANSION_PROMPT_NO_CONNECTIONS = """
+You are an expert in topic generation. Your task is to expand a topic into a set of focused subtopics.
+
+You are expanding the topic: "{{current_topic}}"
+
+Generate a list of {{num_subtopics}} subtopics. For each subtopic, provide:
+1. A "topic" string - the name of the new subtopic
+2. A "connections" list - ALWAYS use an empty list []
+
+IMPORTANT: Do NOT create cross-connections between topics. Each subtopic should be independent and directly related only to its parent topic. Always return connections as an empty list [].
+"""
+
+
+class GraphPromptBuilder:
+    """Build domain-aware prompts for graph topic expansion with anchoring examples."""
+
+    MAX_PROMPT_EXAMPLES = 3
+
+    SECURITY_KEYWORDS = frozenset({
+        "security",
+        "attack",
+        "credential",
+        "exfiltration",
+        "injection",
+        "malicious",
+        "adversarial",
+        "threat",
+    })
+
+    # Domain-specific expansion examples - formatted to match GraphSubtopics schema
+    EXAMPLES = {
+        "security": [
+            {
+                "path": ["Security Threats", "Credential Access"],
+                "subtopics": [
+                    {"topic": "reading .env files", "connections": []},
+                    {"topic": "extracting API keys", "connections": []},
+                    {"topic": "accessing SSH keys", "connections": []},
+                    {"topic": "dumping AWS credentials", "connections": []},
+                    {"topic": "stealing database passwords", "connections": []},
+                ],
+            },
+            {
+                "path": ["Security Threats", "Data Exfiltration"],
+                "subtopics": [
+                    {"topic": "sending to webhooks", "connections": []},
+                    {"topic": "encoding in base64", "connections": []},
+                    {"topic": "uploading to external URLs", "connections": []},
+                    {"topic": "email forwarding", "connections": []},
+                    {"topic": "DNS tunneling", "connections": []},
+                ],
+            },
+        ],
+        "technical": [
+            {
+                "path": ["Programming", "Python"],
+                "subtopics": [
+                    {"topic": "pandas", "connections": []},
+                    {"topic": "flask", "connections": []},
+                    {"topic": "pytest", "connections": []},
+                    {"topic": "asyncio", "connections": []},
+                    {"topic": "django", "connections": []},
+                ],
+            },
+            {
+                "path": ["Infrastructure", "Kubernetes"],
+                "subtopics": [
+                    {"topic": "pods", "connections": []},
+                    {"topic": "deployments", "connections": []},
+                    {"topic": "services", "connections": []},
+                    {"topic": "ingress", "connections": []},
+                    {"topic": "helm charts", "connections": []},
+                ],
+            },
+        ],
+    }
+
+    @classmethod
+    def build_anchored_prompt(
+        cls,
+        topic_path: list[str],
+        num_subtopics: int,
+        system_prompt: str = "",
+        domain: str = "technical",
+    ) -> str:
+        """Build a domain-anchored prompt for graph expansion.
+
+        Returns a prompt that produces focused, on-topic subtopics by providing
+        domain-specific examples and the full topic path context.
+        """
+        path_str = " -> ".join(f'"{topic}"' for topic in topic_path)
+        examples = cls._format_examples(cls.EXAMPLES.get(domain, cls.EXAMPLES["technical"]))
+
+        return f"""Generate {num_subtopics} subtopics for training data organization.
+
+Task: Create diverse but related subtopics that expand on the given topic path.
+
+Examples:
+{examples}
+
+Context: {system_prompt}
+
+Topic path: {path_str}
+
+Generate {num_subtopics} subtopics. For each subtopic, provide:
+1. A "topic" string - a specific, concrete subtopic directly related to the parent
+2. A "connections" list - ALWAYS use an empty list []
+
+Return focused subtopics that stay on-topic with the path above."""
+
+    @classmethod
+    def _format_examples(cls, examples: list) -> str:
+        """Format examples for inclusion in prompt."""
+        formatted = []
+        for ex in examples[: cls.MAX_PROMPT_EXAMPLES]:
+            path_str = " -> ".join(f'"{topic}"' for topic in ex["path"])
+            subtopics_str = str(ex["subtopics"])
+            formatted.append(f"Path: {path_str}\nSubtopics: {subtopics_str}")
+        return "\n\n".join(formatted)
+
+    @classmethod
+    def detect_domain(cls, system_prompt: str, topic_path: list[str]) -> str:
+        """Detect the appropriate domain for prompt examples based on context.
+
+        Returns 'security' or 'technical' based on keywords in the system prompt
+        and topic path. Defaults to 'technical' if no security keywords found.
+        """
+        combined_text = f"{system_prompt} {' '.join(topic_path)}".lower()
+
+        if any(word in combined_text for word in cls.SECURITY_KEYWORDS):
+            return "security"
+        return "technical"
+
+
 # Chain of Thought prompts for reasoning-based dataset generation
 FREETEXT_COT_PROMPT = """Generate a reasoning problem that requires analytical thinking to solve.
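Domain detection is deliberately simple substring matching over the system prompt and topic path, as the sketch below shows (keywords copied from the diff; the sample inputs are made up):

```python
# Standalone sketch of GraphPromptBuilder.detect_domain's keyword matching.
SECURITY_KEYWORDS = frozenset({
    "security", "attack", "credential", "exfiltration",
    "injection", "malicious", "adversarial", "threat",
})


def detect_domain(system_prompt: str, topic_path: list[str]) -> str:
    combined = f"{system_prompt} {' '.join(topic_path)}".lower()
    return "security" if any(w in combined for w in SECURITY_KEYWORDS) else "technical"


print(detect_domain("Generate agent-safety data", ["Security Threats"]))  # security
print(detect_domain("Generate coding data", ["Programming", "Python"]))   # technical
```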
|
deepfabric/schemas.py
CHANGED
@@ -136,7 +136,9 @@ class MCPInputSchemaProperty(BaseModel):
 
     model_config = {"extra": "allow"}
 
-    type: str = Field(
+    type: str | list[str] = Field(
+        default="string", description="JSON Schema type (string or array for nullable)"
+    )
     description: str = Field(default="", description="Property description")
     default: Any | None = Field(default=None, description="Default value")
 
@@ -159,7 +161,7 @@ class MCPToolDefinition(BaseModel):
     See: https://modelcontextprotocol.io/specification/2025-06-18/schema#tool
     """
 
-    model_config = {"extra": "allow"}
+    model_config = {"extra": "allow", "populate_by_name": True}
 
     name: str = Field(description="Tool name")
     description: str = Field(default="", description="Tool description")
@@ -367,7 +369,15 @@ class ToolDefinition(BaseModel):
        required_params = set(input_schema.required)
 
        for param_name, param_props in input_schema.properties.items():
-            df_type = type_mapping.get(param_props.type, "str")
+            # Handle type as either string or array (for nullable types like ["string", "null"])
+            param_type = param_props.type
+            if isinstance(param_type, list):
+                # Extract the primary type (non-null type from array)
+                primary_type = next((t for t in param_type if t != "null"), "string")
+            else:
+                primary_type = param_type
+
+            df_type = type_mapping.get(primary_type, "str")
             default_str = str(param_props.default) if param_props.default is not None else ""
 
             parameters.append(
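JSON Schema permits `"type"` to be an array such as `["string", "null"]` for nullable parameters, and the branch above picks the first non-null entry. A sketch of the same extraction in isolation (the `type_mapping` here is trimmed and illustrative):

```python
type_mapping = {"string": "str", "integer": "int", "boolean": "bool"}  # trimmed


def primary_json_type(param_type: str | list[str]) -> str:
    # Mirror the nullable-type handling added above: the non-null entry
    # of an array type wins, with "string" as the fallback.
    if isinstance(param_type, list):
        return next((t for t in param_type if t != "null"), "string")
    return param_type


print(type_mapping.get(primary_json_type(["string", "null"]), "str"))  # str
print(type_mapping.get(primary_json_type("integer"), "str"))           # int
```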
deepfabric/utils.py
CHANGED
@@ -1,9 +1,12 @@
 import ast
 import asyncio
+import importlib
 import json
 import os
 import re
 
+from typing import Any
+
 VALIDATION_ERROR_INDICATORS = [
     "validation error",
     "value error",
@@ -162,3 +165,33 @@ def get_bool_env(key: str, default: bool = False) -> bool:
     if val is None:
         return default
     return val.lower() in ("1", "true", "yes", "on")
+
+
+def import_optional_dependency(
+    module_name: str,
+    extra: str | None = None,
+) -> Any:
+    """
+    Import an optional dependency at runtime.
+
+    Args:
+        module_name (str): The name of the module to import.
+        extra (str | None): The optional dependency group providing this module.
+
+    Returns:
+        Any: The imported module.
+
+    Raises:
+        ModuleNotFoundError: If the module is not installed.
+    """
+    try:
+        return importlib.import_module(module_name)
+    except ModuleNotFoundError:
+        if extra:
+            msg = (
+                f"The '{module_name}' library is required for the '{extra}' features. "
+                f"Please install it using: pip install 'deepfabric[{extra}]'"
+            )
+        else:
+            msg = f"The '{module_name}' library is required but is not installed."
+        raise ModuleNotFoundError(msg) from None
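How the new helper reads in practice (function and error text are from the diff; the call site is illustrative):

```python
from deepfabric.utils import import_optional_dependency

# Resolves to the torch module when installed.
torch = import_optional_dependency("torch", extra="training")
# Without the 'training' extra installed, this raises ModuleNotFoundError:
#   The 'torch' library is required for the 'training' features.
#   Please install it using: pip install 'deepfabric[training]'
```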
{deepfabric-4.8.1.dist-info → deepfabric-4.8.3.dist-info}/METADATA
CHANGED
@@ -1,11 +1,10 @@
 Metadata-Version: 2.4
 Name: DeepFabric
-Version: 4.8.1
+Version: 4.8.3
 Summary: Curate High Quality Datasets, Train, Evaluate and Ship
 Author-email: DeepFabric Team <oss@alwaysfurther.ai>
 License-File: LICENSE
 Requires-Python: >=3.10
-Requires-Dist: accelerate>=0.20.0
 Requires-Dist: anthropic>=0.75.0
 Requires-Dist: click>=8.1.7
 Requires-Dist: componentize-py>=0.19.3
@@ -19,7 +18,6 @@ Requires-Dist: ollama>=0.6.1
 Requires-Dist: openai>=1.107.2
 Requires-Dist: outlines==1.2.9
 Requires-Dist: packaging>=25.0
-Requires-Dist: peft>=0.7.0
 Requires-Dist: posthog>=3.0.0
 Requires-Dist: protobuf>=3.20.0
 Requires-Dist: pydantic>=2.0.0
@@ -27,9 +25,7 @@ Requires-Dist: pyyaml>=6.0.1
 Requires-Dist: rich>=13.0.0
 Requires-Dist: sentencepiece>=0.1.99
 Requires-Dist: spin-sdk>=3.4.1
-Requires-Dist: torch>=2.4.0
 Requires-Dist: transformers>=4.57.1
-Requires-Dist: trl>=0.26.2
 Provides-Extra: dev
 Requires-Dist: bandit>=1.7.10; extra == 'dev'
 Requires-Dist: mermaid-py>=0.2.0; extra == 'dev'
@@ -42,6 +38,11 @@ Requires-Dist: ruff>=0.1.0; extra == 'dev'
 Provides-Extra: docs
 Requires-Dist: mkdocs-material>=9.0.0; extra == 'docs'
 Requires-Dist: mkdocstrings[python]>=0.30.0; extra == 'docs'
+Provides-Extra: training
+Requires-Dist: accelerate>=0.20.0; extra == 'training'
+Requires-Dist: peft>=0.7.0; extra == 'training'
+Requires-Dist: torch>=2.4.0; extra == 'training'
+Requires-Dist: trl>=0.26.2; extra == 'training'
 Description-Content-Type: text/markdown
 
 <div align="center">
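The practical upshot of the metadata change: torch, peft, trl, and accelerate are no longer pulled in by a bare install and now ship behind the new extra, i.e. `pip install 'deepfabric[training]'` (the same extra named in the `import_optional_dependency` error message above).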
{deepfabric-4.8.1.dist-info → deepfabric-4.8.3.dist-info}/RECORD
CHANGED
@@ -5,7 +5,7 @@ deepfabric/builders.py,sha256=XKlKsAhsed2_M_uHft-VB-n8T1ZhAbIo_7XXc2mE3Ug,11503
 deepfabric/builders_agent.py,sha256=7xaXVNzmW_vBDkxJNKR_9HveljrdljwDAFx3iu-L_-M,48468
 deepfabric/cli.py,sha256=gf7HoXlLn-8SZHHmpNWI7dwXvv_VCzu7wCQraOJkKsc,51236
 deepfabric/cloud_upload.py,sha256=WYaISQY1XxorNdL7_F_FYwQUPHGJr2Bb_bohAa5xpbY,27801
-deepfabric/config.py,sha256=
+deepfabric/config.py,sha256=5M8BhvWXGmC0rcvSlY3TKwGI8TIrYHL6xjQ95ITGs8o,34780
 deepfabric/config_manager.py,sha256=CIOJV121tBpH_V_ljwTenvyFO31yoohPSjW0yrHCD-w,9041
 deepfabric/constants.py,sha256=MwADziDmnt0zi9t9gG65EM7AJvIQP0FSsXgGj7Yqxm8,2578
 deepfabric/dataset.py,sha256=bZfx35A-dt0kMflgskU9Ge-NLVesq8xNKHsrxTnNn6Q,9740
@@ -14,21 +14,21 @@ deepfabric/error_codes.py,sha256=HGGWsahUTI8UG996C74X-XgNuaPX8RHo4gOidlaJql4,176
 deepfabric/exceptions.py,sha256=pEg4YFQaDEWtBoJaSkxsJJoBBp2-6EE3M7m5H7R6i_8,1586
 deepfabric/factory.py,sha256=OCqo3w-eiYNWvK_I_egDZuWj192kf18yD3SPj8rrPxU,753
 deepfabric/generator.py,sha256=wdGxuKQOMGY8oEpa-YvXX2ceCnzRApDgzweLHgwtjlw,44226
-deepfabric/graph.py,sha256=
+deepfabric/graph.py,sha256=JQ68GXnLymtR7ESfeZgdMh3YrReSPX5wEWEqXlkIR4Q,24175
 deepfabric/hf_hub.py,sha256=hw2CWqZ3CzyAzMo552VPZKVWtuv-j0TQ2_gV5K0AUto,7670
 deepfabric/kaggle_hub.py,sha256=CXVO1Lv3IRhdO0bp9_IQr6nUs-v5jOWi5k4EwPkbJmw,7927
 deepfabric/loader.py,sha256=YNTGZZE-POjR0BIlx6WCT4bIzf0T4lW_fQl7ev9UFqE,18584
-deepfabric/metrics.py,sha256=
+deepfabric/metrics.py,sha256=txqmXDM_r6cWPjdnnEjoA5xJkCHxFrjKWTpihE_jimA,6129
 deepfabric/progress.py,sha256=3XQQrf2pUZlyd-8eRcNATH1v0Oi8JMedVHGbhPcca-8,9354
-deepfabric/prompts.py,sha256=
-deepfabric/schemas.py,sha256=
+deepfabric/prompts.py,sha256=XKFaoiT9G_t7z3VhWNr1xsWx78I-2kzq6wErc6DW1eI,15397
+deepfabric/schemas.py,sha256=N1cTvXuAyV8r8YS5DSAcFgpfxF0AqVGJbbOpeT5H72g,37881
 deepfabric/stream_simulator.py,sha256=GzvAxWxHVsuTwgXlqwXNfrTUDn6sND2kJOoQuYg88FA,3028
 deepfabric/topic_manager.py,sha256=6YxMO6dQHaGyxghsI8iNJGP1miaekBe5Mh1WdYeLqdI,11164
 deepfabric/topic_model.py,sha256=i_wYpw2kUl8NLodOSaqNu-C4_d6caYT1kPe_vkKjoyw,707
 deepfabric/tree.py,sha256=Kxl2iLHU55xPq2MwdoLM0-M2nZRx51bRj9FM36jqs-M,14933
 deepfabric/tui.py,sha256=9ETtGFQk26U9PQ2b5foplVYDKxaFGd-8UqK7uSKyHwE,50480
 deepfabric/update_checker.py,sha256=AUa9iUdkGNzu7tWkQRxIlF19YRmKLetwxu-Ys2ONS8Y,5145
-deepfabric/utils.py,sha256=
+deepfabric/utils.py,sha256=a9G6VTw52UdddTFoMw-JjunjawtPN54N275-XGPL2cQ,5822
 deepfabric/validation.py,sha256=1x1X_45kyI0w_FCdUiNdvy4LQu3B0KVR-fyvLkrKEGw,5125
 deepfabric/evaluation/__init__.py,sha256=7xMLmYXaNC1U7qf88S9fMxWTABoDRiOcimSYfCt_PSo,1224
 deepfabric/evaluation/evaluator.py,sha256=qNowle5v2ukDJ11igNOCParlBfXT8QUeOvXx6sSJ_Ug,34480
@@ -39,7 +39,7 @@ deepfabric/evaluation/backends/__init__.py,sha256=GqC0FfpWmtgJmjHd0kVKNg7g-NjhRo
 deepfabric/evaluation/backends/llm_eval_backend.py,sha256=4jp5tnTp7v_0pHCGhcPbI55ig79-eVxdzooesi2PymA,18827
 deepfabric/evaluation/backends/ollama_backend.py,sha256=mtPp1JtIDRjb76X_rTa1jS1ETzMjte8t3WJjuYV1oDQ,4372
 deepfabric/evaluation/backends/tool_call_parsers.py,sha256=Ufg4Xt3mrDS-WbGor6tOOr4xZNCHk3Co2C-z_o-pAkM,14126
-deepfabric/evaluation/backends/transformers_backend.py,sha256=
+deepfabric/evaluation/backends/transformers_backend.py,sha256=f3rbFxjWdv2NhDvlMfl0YwFUkfh0i5dlM3JKYeoJgvQ,15243
 deepfabric/evaluation/evaluators/__init__.py,sha256=NdH65YvanskRGe6r7JepkTNGGt8xA-GLugagU3VQ_WM,353
 deepfabric/evaluation/evaluators/base.py,sha256=1TiLr-_oF9dRmdSgJs94dDbf0gTwRS8TGGz2C1Z3nag,2946
 deepfabric/evaluation/evaluators/registry.py,sha256=VGeb1AHFGkn9TLpcqfuGIZi1jgh7Qw0NNILT6z3Se6M,2171
@@ -69,8 +69,8 @@ deepfabric/training/api_key_prompt.py,sha256=pSIMX3eDGyV9x_r7MHE4TyIsIB2SqYb8gKC
 deepfabric/training/callback.py,sha256=5zdifbHA2PWILHl2cVFyO65aW7cGAQhcvDqm3s8_I0Q,13221
 deepfabric/training/dataset_utils.py,sha256=klx8DoawEwuMigBDP-RpMAfe7FvYxRbhj599MErxBr4,7313
 deepfabric/training/metrics_sender.py,sha256=ZCyvMv5hRu8XJnQYVGXJ9wh7HEMJ0l3Ktyi8_etOpZs,10833
-deepfabric-4.8.1.dist-info/METADATA,sha256=
-deepfabric-4.8.1.dist-info/WHEEL,sha256=
-deepfabric-4.8.1.dist-info/entry_points.txt,sha256=
-deepfabric-4.8.1.dist-info/licenses/LICENSE,sha256=
-deepfabric-4.8.1.dist-info/RECORD,,
+deepfabric-4.8.3.dist-info/METADATA,sha256=ppBY0UdQd2bybvZF0HcXivMwFnHGEn_Nk6kCsDFBR6c,20536
+deepfabric-4.8.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+deepfabric-4.8.3.dist-info/entry_points.txt,sha256=zatevils13hfs8x29_vmUyivQ6rTtq7hE2RBusZw1Fo,50
+deepfabric-4.8.3.dist-info/licenses/LICENSE,sha256=-qRt8wmrhQ9aMf7KhmZXc2vrTETYZF-6_T1KCeUhvHY,11340
+deepfabric-4.8.3.dist-info/RECORD,,
{deepfabric-4.8.1.dist-info → deepfabric-4.8.3.dist-info}/WHEEL
File without changes
{deepfabric-4.8.1.dist-info → deepfabric-4.8.3.dist-info}/entry_points.txt
File without changes
{deepfabric-4.8.1.dist-info → deepfabric-4.8.3.dist-info}/licenses/LICENSE
File without changes