DeepFabric 4.8.2__py3-none-any.whl → 4.8.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deepfabric/config.py CHANGED
@@ -109,6 +109,10 @@ class TopicsConfig(BaseModel):
109
109
  description="Maximum concurrent LLM calls during graph expansion (helps avoid rate limits)",
110
110
  )
111
111
  save_as: str | None = Field(default=None, description="Where to save the generated topics")
112
+ prompt_style: Literal["default", "isolated", "anchored"] = Field(
113
+ default="default",
114
+ description="For graph mode: 'default' enables cross-connections with generic prompts, 'isolated' disables connections with generic prompts, 'anchored' disables connections and uses domain-aware prompts with examples for focused topic generation",
115
+ )
112
116
 
113
117
  # Optional LLM overrides (inherits from top-level llm if not specified)
114
118
  llm: LLMConfig | None = Field(
@@ -603,6 +607,7 @@ See documentation for full examples.
603
607
  "depth": self.topics.depth,
604
608
  "degree": self.topics.degree,
605
609
  "max_concurrent": self.topics.max_concurrent,
610
+ "prompt_style": self.topics.prompt_style,
606
611
  }
607
612
 
608
613
  # Handle overrides
deepfabric/graph.py CHANGED
@@ -5,7 +5,7 @@ import textwrap
5
5
  import uuid
6
6
 
7
7
  from datetime import datetime, timezone
8
- from typing import TYPE_CHECKING, Any
8
+ from typing import TYPE_CHECKING, Any, Literal
9
9
 
10
10
  from pydantic import BaseModel, ConfigDict, Field
11
11
 
@@ -19,7 +19,11 @@ from .constants import (
19
19
  from .llm import LLMClient
20
20
  from .llm.rate_limit_detector import RateLimitDetector
21
21
  from .metrics import trace
22
- from .prompts import GRAPH_EXPANSION_PROMPT
22
+ from .prompts import (
23
+ GRAPH_EXPANSION_PROMPT,
24
+ GRAPH_EXPANSION_PROMPT_NO_CONNECTIONS,
25
+ GraphPromptBuilder,
26
+ )
23
27
  from .schemas import GraphSubtopics
24
28
  from .stream_simulator import simulate_stream
25
29
  from .topic_model import TopicModel
@@ -70,6 +74,10 @@ class GraphConfig(BaseModel):
70
74
  default=None,
71
75
  description="Base URL for API endpoint (e.g., custom OpenAI-compatible servers)",
72
76
  )
77
+ prompt_style: Literal["default", "isolated", "anchored"] = Field(
78
+ default="default",
79
+ description="Prompt style: 'default' (cross-connections, generic), 'isolated' (no connections, generic), 'anchored' (no connections, domain-aware)",
80
+ )
73
81
 
74
82
 
75
83
  class GraphMetadata(BaseModel):
@@ -148,6 +156,7 @@ class Graph(TopicModel):
148
156
  self.degree = self.config.degree
149
157
  self.depth = self.config.depth
150
158
  self.max_concurrent = self.config.max_concurrent
159
+ self.prompt_style = self.config.prompt_style
151
160
 
152
161
  # Initialize LLM client
153
162
  llm_kwargs = {}
@@ -493,6 +502,17 @@ class Graph(TopicModel):
493
502
  )
494
503
  return None
495
504
 
505
def _get_path_to_node(self, node: "Node") -> list[str]:
    """Get the topic path from root to the given node.

    Walks upward from ``node`` by repeatedly following the first entry of the
    current node's ``parents`` list until a node with no parents is reached,
    then returns the collected topics ordered from the top-most ancestor down
    to ``node`` itself.

    Args:
        node: The node whose ancestry is wanted. Only its ``topic`` and
            ``parents`` attributes are read.

    Returns:
        Topic strings from the top-most ancestor to ``node`` (inclusive).
    """
    path: list[str] = []
    # Nodes already walked, tracked by identity: a cyclic primary-parent chain
    # would otherwise keep this loop (and ``path``) growing forever.
    visited: set[int] = set()
    current = node
    while current is not None and id(current) not in visited:
        visited.add(id(current))
        path.append(current.topic)
        # First parent is the primary parent from tree expansion;
        # cross-connections are added later and appear after index 0
        current = current.parents[0] if current.parents else None
    path.reverse()
    return path
515
+
496
516
  async def get_subtopics_and_connections(
497
517
  self, parent_node: Node, num_subtopics: int
498
518
  ) -> tuple[int, int]:
@@ -505,15 +525,35 @@ class Graph(TopicModel):
505
525
  Returns:
506
526
  A tuple of (subtopics_added, connections_added).
507
527
  """
508
- graph_summary = (
509
- self.to_json()
510
- if len(self.nodes) <= TOPIC_GRAPH_SUMMARY
511
- else "Graph too large to display"
512
- )
513
-
514
- graph_prompt = GRAPH_EXPANSION_PROMPT.replace("{{current_graph_summary}}", graph_summary)
515
- graph_prompt = graph_prompt.replace("{{current_topic}}", parent_node.topic)
516
- graph_prompt = graph_prompt.replace("{{num_subtopics}}", str(num_subtopics))
528
+ # Choose prompt based on prompt_style setting
529
+ if self.prompt_style == "anchored":
530
+ # Domain-aware prompts with examples for focused generation
531
+ topic_path = self._get_path_to_node(parent_node)
532
+ domain = GraphPromptBuilder.detect_domain(self.model_system_prompt, topic_path)
533
+ graph_prompt = GraphPromptBuilder.build_anchored_prompt(
534
+ topic_path=topic_path,
535
+ num_subtopics=num_subtopics,
536
+ system_prompt=self.model_system_prompt,
537
+ domain=domain,
538
+ )
539
+ elif self.prompt_style == "isolated":
540
+ # No connections, generic prompt
541
+ graph_prompt = GRAPH_EXPANSION_PROMPT_NO_CONNECTIONS.replace(
542
+ "{{current_topic}}", parent_node.topic
543
+ )
544
+ graph_prompt = graph_prompt.replace("{{num_subtopics}}", str(num_subtopics))
545
+ else:
546
+ # default: cross-connections enabled, generic prompt
547
+ graph_summary = (
548
+ self.to_json()
549
+ if len(self.nodes) <= TOPIC_GRAPH_SUMMARY
550
+ else "Graph too large to display"
551
+ )
552
+ graph_prompt = GRAPH_EXPANSION_PROMPT.replace(
553
+ "{{current_graph_summary}}", graph_summary
554
+ )
555
+ graph_prompt = graph_prompt.replace("{{current_topic}}", parent_node.topic)
556
+ graph_prompt = graph_prompt.replace("{{num_subtopics}}", str(num_subtopics))
517
557
 
518
558
  response = await self._generate_subtopics_with_retry(graph_prompt, parent_node)
519
559
  if response is None:
deepfabric/metrics.py CHANGED
@@ -19,8 +19,8 @@ except (ImportError, importlib.metadata.PackageNotFoundError):
19
19
 
20
20
  # Initialize PostHog client
21
21
  posthog = Posthog(
22
- project_api_key="phc_Kn8hKQIXHm5OHp5OTxvMvFDUmT7HyOUNlJvWkduB9qO",
23
- host="https://us.i.posthog.com",
22
+ project_api_key="phc_JZWiTzIDNnBp6Jj6uUb0JQKuIp3dv0gkay9aU50n38h",
23
+ host="https://eu.i.posthog.com",
24
24
  )
25
25
 
26
26
  logger = logging.getLogger(__name__)
deepfabric/prompts.py CHANGED
@@ -264,6 +264,140 @@ Generate a list of {{num_subtopics}} subtopics. For each subtopic, provide:
264
264
  2. A "connections" list of IDs of existing topics it should connect to for creating cross-links (use empty list if no connections)
265
265
  """
266
266
 
267
+ GRAPH_EXPANSION_PROMPT_NO_CONNECTIONS = """
268
+ You are an expert in topic generation. Your task is to expand a topic into a set of focused subtopics.
269
+
270
+ You are expanding the topic: "{{current_topic}}"
271
+
272
+ Generate a list of {{num_subtopics}} subtopics. For each subtopic, provide:
273
+ 1. A "topic" string - the name of the new subtopic
274
+ 2. A "connections" list - ALWAYS use an empty list []
275
+
276
+ IMPORTANT: Do NOT create cross-connections between topics. Each subtopic should be independent and directly related only to its parent topic. Always return connections as an empty list [].
277
+ """
278
+
279
+
280
class GraphPromptBuilder:
    """Build domain-aware prompts for graph topic expansion with anchoring examples.

    All behaviour is exposed through classmethods; the class only holds the
    keyword set used for domain detection and the per-domain example
    expansions that are embedded in the generated prompt.
    """

    # Upper bound on how many example expansions are embedded in one prompt.
    MAX_PROMPT_EXAMPLES = 3

    # Substrings whose presence in the system prompt or topic path selects the
    # "security" example set.
    SECURITY_KEYWORDS = frozenset({
        "security",
        "attack",
        "credential",
        "exfiltration",
        "injection",
        "malicious",
        "adversarial",
        "threat",
    })

    # Domain-specific expansion examples - formatted to match GraphSubtopics schema
    EXAMPLES = {
        "security": [
            {
                "path": ["Security Threats", "Credential Access"],
                "subtopics": [
                    {"topic": "reading .env files", "connections": []},
                    {"topic": "extracting API keys", "connections": []},
                    {"topic": "accessing SSH keys", "connections": []},
                    {"topic": "dumping AWS credentials", "connections": []},
                    {"topic": "stealing database passwords", "connections": []},
                ],
            },
            {
                "path": ["Security Threats", "Data Exfiltration"],
                "subtopics": [
                    {"topic": "sending to webhooks", "connections": []},
                    {"topic": "encoding in base64", "connections": []},
                    {"topic": "uploading to external URLs", "connections": []},
                    {"topic": "email forwarding", "connections": []},
                    {"topic": "DNS tunneling", "connections": []},
                ],
            },
        ],
        "technical": [
            {
                "path": ["Programming", "Python"],
                "subtopics": [
                    {"topic": "pandas", "connections": []},
                    {"topic": "flask", "connections": []},
                    {"topic": "pytest", "connections": []},
                    {"topic": "asyncio", "connections": []},
                    {"topic": "django", "connections": []},
                ],
            },
            {
                "path": ["Infrastructure", "Kubernetes"],
                "subtopics": [
                    {"topic": "pods", "connections": []},
                    {"topic": "deployments", "connections": []},
                    {"topic": "services", "connections": []},
                    {"topic": "ingress", "connections": []},
                    {"topic": "helm charts", "connections": []},
                ],
            },
        ],
    }

    @classmethod
    def build_anchored_prompt(
        cls,
        topic_path: list[str],
        num_subtopics: int,
        system_prompt: str = "",
        domain: str = "technical",
    ) -> str:
        """Build a domain-anchored prompt for graph expansion.

        Returns a prompt that produces focused, on-topic subtopics by providing
        domain-specific examples and the full topic path context.

        Args:
            topic_path: Topics from the root down to the node being expanded.
            num_subtopics: How many subtopics the prompt asks for.
            system_prompt: Free-text context inserted after ``Context:``.
                ``None`` is treated as an empty string so the literal text
                "None" never reaches the prompt.
            domain: Key into ``EXAMPLES``; unknown keys fall back to the
                "technical" examples.

        Returns:
            The fully rendered prompt text.
        """
        context = system_prompt or ""
        path_str = " -> ".join(f'"{topic}"' for topic in topic_path)
        examples = cls._format_examples(cls.EXAMPLES.get(domain, cls.EXAMPLES["technical"]))

        return f"""Generate {num_subtopics} subtopics for training data organization.

Task: Create diverse but related subtopics that expand on the given topic path.

Examples:
{examples}

Context: {context}

Topic path: {path_str}

Generate {num_subtopics} subtopics. For each subtopic, provide:
1. A "topic" string - a specific, concrete subtopic directly related to the parent
2. A "connections" list - ALWAYS use an empty list []

Return focused subtopics that stay on-topic with the path above."""

    @classmethod
    def _format_examples(cls, examples: list) -> str:
        """Format examples for inclusion in prompt.

        At most ``MAX_PROMPT_EXAMPLES`` entries are rendered. Each entry becomes
        a ``Path: ...`` line followed by a ``Subtopics: ...`` line, and entries
        are separated by a blank line.

        Args:
            examples: Dicts with a ``"path"`` list of topic strings and a
                ``"subtopics"`` list of JSON-serialisable dicts.

        Returns:
            The formatted examples, or an empty string when ``examples`` is empty.
        """
        import json

        formatted = []
        for ex in examples[: cls.MAX_PROMPT_EXAMPLES]:
            path_str = " -> ".join(f'"{topic}"' for topic in ex["path"])
            # Serialise as JSON rather than Python repr so the example shows
            # double-quoted keys/values, i.e. the same syntax a JSON response uses.
            subtopics_str = json.dumps(ex["subtopics"], ensure_ascii=False)
            formatted.append(f"Path: {path_str}\nSubtopics: {subtopics_str}")
        return "\n\n".join(formatted)

    @classmethod
    def detect_domain(cls, system_prompt: str, topic_path: list[str]) -> str:
        """Detect the appropriate domain for prompt examples based on context.

        Returns 'security' or 'technical' based on keywords in the system prompt
        and topic path. Defaults to 'technical' if no security keywords found.

        Matching is a case-insensitive substring test, so a keyword such as
        "threat" also matches "Threats". A ``None`` system prompt is treated
        as an empty string.
        """
        combined_text = f"{system_prompt or ''} {' '.join(topic_path)}".lower()

        if any(word in combined_text for word in cls.SECURITY_KEYWORDS):
            return "security"
        return "technical"
399
+
400
+
267
401
  # Chain of Thought prompts for reasoning-based dataset generation
268
402
  FREETEXT_COT_PROMPT = """Generate a reasoning problem that requires analytical thinking to solve.
269
403
 
deepfabric/schemas.py CHANGED
@@ -136,7 +136,9 @@ class MCPInputSchemaProperty(BaseModel):
136
136
 
137
137
  model_config = {"extra": "allow"}
138
138
 
139
- type: str | list[str] = Field(default="string", description="JSON Schema type (string or array for nullable)")
139
+ type: str | list[str] = Field(
140
+ default="string", description="JSON Schema type (string or array for nullable)"
141
+ )
140
142
  description: str = Field(default="", description="Property description")
141
143
  default: Any | None = Field(default=None, description="Default value")
142
144
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: DeepFabric
3
- Version: 4.8.2
3
+ Version: 4.8.3
4
4
  Summary: Curate High Quality Datasets, Train, Evaluate and Ship
5
5
  Author-email: DeepFabric Team <oss@alwaysfurther.ai>
6
6
  License-File: LICENSE
@@ -5,7 +5,7 @@ deepfabric/builders.py,sha256=XKlKsAhsed2_M_uHft-VB-n8T1ZhAbIo_7XXc2mE3Ug,11503
5
5
  deepfabric/builders_agent.py,sha256=7xaXVNzmW_vBDkxJNKR_9HveljrdljwDAFx3iu-L_-M,48468
6
6
  deepfabric/cli.py,sha256=gf7HoXlLn-8SZHHmpNWI7dwXvv_VCzu7wCQraOJkKsc,51236
7
7
  deepfabric/cloud_upload.py,sha256=WYaISQY1XxorNdL7_F_FYwQUPHGJr2Bb_bohAa5xpbY,27801
8
- deepfabric/config.py,sha256=Ze0OaCdUqFwX9bmjssf9ffU0XXwa5sJmvlyHr4FENHw,34367
8
+ deepfabric/config.py,sha256=5M8BhvWXGmC0rcvSlY3TKwGI8TIrYHL6xjQ95ITGs8o,34780
9
9
  deepfabric/config_manager.py,sha256=CIOJV121tBpH_V_ljwTenvyFO31yoohPSjW0yrHCD-w,9041
10
10
  deepfabric/constants.py,sha256=MwADziDmnt0zi9t9gG65EM7AJvIQP0FSsXgGj7Yqxm8,2578
11
11
  deepfabric/dataset.py,sha256=bZfx35A-dt0kMflgskU9Ge-NLVesq8xNKHsrxTnNn6Q,9740
@@ -14,14 +14,14 @@ deepfabric/error_codes.py,sha256=HGGWsahUTI8UG996C74X-XgNuaPX8RHo4gOidlaJql4,176
14
14
  deepfabric/exceptions.py,sha256=pEg4YFQaDEWtBoJaSkxsJJoBBp2-6EE3M7m5H7R6i_8,1586
15
15
  deepfabric/factory.py,sha256=OCqo3w-eiYNWvK_I_egDZuWj192kf18yD3SPj8rrPxU,753
16
16
  deepfabric/generator.py,sha256=wdGxuKQOMGY8oEpa-YvXX2ceCnzRApDgzweLHgwtjlw,44226
17
- deepfabric/graph.py,sha256=O-I2Gto9Yxp4dXYMIRrLBVG4ODn7PI4deMzvJFEkCqs,22252
17
+ deepfabric/graph.py,sha256=JQ68GXnLymtR7ESfeZgdMh3YrReSPX5wEWEqXlkIR4Q,24175
18
18
  deepfabric/hf_hub.py,sha256=hw2CWqZ3CzyAzMo552VPZKVWtuv-j0TQ2_gV5K0AUto,7670
19
19
  deepfabric/kaggle_hub.py,sha256=CXVO1Lv3IRhdO0bp9_IQr6nUs-v5jOWi5k4EwPkbJmw,7927
20
20
  deepfabric/loader.py,sha256=YNTGZZE-POjR0BIlx6WCT4bIzf0T4lW_fQl7ev9UFqE,18584
21
- deepfabric/metrics.py,sha256=iwtNHBX4ZTYUg2FZgtFcG3U0e9RlV2c1cm1Kp34FeWU,6129
21
+ deepfabric/metrics.py,sha256=txqmXDM_r6cWPjdnnEjoA5xJkCHxFrjKWTpihE_jimA,6129
22
22
  deepfabric/progress.py,sha256=3XQQrf2pUZlyd-8eRcNATH1v0Oi8JMedVHGbhPcca-8,9354
23
- deepfabric/prompts.py,sha256=JVFMeeBa2qqOMvmP_xx8bWzZ6ot9eyqOP3u8XzzPx3g,10290
24
- deepfabric/schemas.py,sha256=r8qQuu19o9ev0JNxcChjbpWym9m5WLItyxw8szYLSjI,37867
23
+ deepfabric/prompts.py,sha256=XKFaoiT9G_t7z3VhWNr1xsWx78I-2kzq6wErc6DW1eI,15397
24
+ deepfabric/schemas.py,sha256=N1cTvXuAyV8r8YS5DSAcFgpfxF0AqVGJbbOpeT5H72g,37881
25
25
  deepfabric/stream_simulator.py,sha256=GzvAxWxHVsuTwgXlqwXNfrTUDn6sND2kJOoQuYg88FA,3028
26
26
  deepfabric/topic_manager.py,sha256=6YxMO6dQHaGyxghsI8iNJGP1miaekBe5Mh1WdYeLqdI,11164
27
27
  deepfabric/topic_model.py,sha256=i_wYpw2kUl8NLodOSaqNu-C4_d6caYT1kPe_vkKjoyw,707
@@ -69,8 +69,8 @@ deepfabric/training/api_key_prompt.py,sha256=pSIMX3eDGyV9x_r7MHE4TyIsIB2SqYb8gKC
69
69
  deepfabric/training/callback.py,sha256=5zdifbHA2PWILHl2cVFyO65aW7cGAQhcvDqm3s8_I0Q,13221
70
70
  deepfabric/training/dataset_utils.py,sha256=klx8DoawEwuMigBDP-RpMAfe7FvYxRbhj599MErxBr4,7313
71
71
  deepfabric/training/metrics_sender.py,sha256=ZCyvMv5hRu8XJnQYVGXJ9wh7HEMJ0l3Ktyi8_etOpZs,10833
72
- deepfabric-4.8.2.dist-info/METADATA,sha256=15ZBOITSr6pigZYD-GBn_IwRlnzftZ64hwrXldKr1mg,20536
73
- deepfabric-4.8.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
74
- deepfabric-4.8.2.dist-info/entry_points.txt,sha256=zatevils13hfs8x29_vmUyivQ6rTtq7hE2RBusZw1Fo,50
75
- deepfabric-4.8.2.dist-info/licenses/LICENSE,sha256=-qRt8wmrhQ9aMf7KhmZXc2vrTETYZF-6_T1KCeUhvHY,11340
76
- deepfabric-4.8.2.dist-info/RECORD,,
72
+ deepfabric-4.8.3.dist-info/METADATA,sha256=ppBY0UdQd2bybvZF0HcXivMwFnHGEn_Nk6kCsDFBR6c,20536
73
+ deepfabric-4.8.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
74
+ deepfabric-4.8.3.dist-info/entry_points.txt,sha256=zatevils13hfs8x29_vmUyivQ6rTtq7hE2RBusZw1Fo,50
75
+ deepfabric-4.8.3.dist-info/licenses/LICENSE,sha256=-qRt8wmrhQ9aMf7KhmZXc2vrTETYZF-6_T1KCeUhvHY,11340
76
+ deepfabric-4.8.3.dist-info/RECORD,,