deepfabric-4.10.1-py3-none-any.whl → deepfabric-4.11.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deepfabric/generator.py CHANGED
@@ -46,7 +46,7 @@ from .prompts import (
 from .schemas import Conversation, ToolRegistry, get_conversation_schema
 from .tools import BUILTIN_TOOL_REGISTRY
 from .tools.loader import load_tools_from_dict, load_tools_from_endpoint
-from .topic_model import TopicModel, TopicPath
+from .topic_model import Topic, TopicModel, TopicPath
 from .utils import ensure_not_running_loop, get_checkpoint_dir, is_validation_error
 
 # Handle circular import for type hints
@@ -275,13 +275,16 @@ class DataSetGenerator:
 
         # Checkpoint state
         self._checkpoint_samples_since_save = 0
-        self._processed_ids: set[str] = set()  # Track processed topic IDs (UUIDs)
+        self._completed: set[tuple[str, int]] = set()  # Track (uuid, cycle) tuples
         self._checkpoint_metadata_path: Path | None = None
         self._checkpoint_samples_path: Path | None = None
        self._checkpoint_failures_path: Path | None = None
         # Memory optimization: track flushed counts for checkpoint mode
         self._flushed_samples_count = 0
         self._flushed_failures_count = 0
+        # Generation state for cycle-based iteration
+        self._unique_topics: int = 0  # Number of unique topics
+        self._cycles_needed: int = 0  # Total cycles for requested samples
 
         # Graceful stop flag - set by signal handler to stop at next checkpoint
         self.stop_requested = False
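
The move from a flat `set[str]` of topic IDs to `(uuid, cycle)` tuples is what lets a resumed run distinguish "topic finished in pass 0" from "topic finished in every pass". A minimal standalone sketch of the bookkeeping, using hypothetical UUIDs rather than DeepFabric's real ones:

    # Sketch: resume bookkeeping with (uuid, cycle) tuples.
    # "t1"/"t2" are hypothetical topic UUIDs for illustration.
    completed: set[tuple[str, int]] = set()

    completed.add(("t1", 0))  # topic t1 finished in cycle 0
    completed.add(("t2", 0))
    completed.add(("t1", 1))  # t1 also finished in cycle 1

    # Old model: a bare set of UUIDs could only say "t1 was processed",
    # so a resumed run would skip t1 in *every* cycle.
    # New model: t2 still needs cycle 1, even though it completed cycle 0.
    print(("t2", 1) in completed)  # False -> t2 is regenerated in cycle 1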
@@ -383,7 +386,7 @@ class DataSetGenerator:
         self,
         new_samples: list[dict],
         new_failures: list[dict],
-        processed_topic_paths: list[TopicPath | None],
+        completed_items: list[tuple[str, int]],
         flush_memory: bool = True,
     ) -> None:
         """Save checkpoint data incrementally.
@@ -391,7 +394,7 @@ class DataSetGenerator:
         Args:
             new_samples: New successful samples to append
             new_failures: New failed samples to append
-            processed_topic_paths: TopicPath objects that were processed in this batch
+            completed_items: List of (uuid, cycle) tuples that were completed
             flush_memory: If True, clear flushed samples from memory (memory optimization)
         """
         if self._checkpoint_samples_path is None:
@@ -409,10 +412,9 @@ class DataSetGenerator:
            for failure in new_failures:
                f.write(json.dumps(failure, separators=(",", ":")) + "\n")
 
-        # Track processed topic IDs
-        for topic_path in processed_topic_paths:
-            if topic_path is not None:
-                self._processed_ids.add(topic_path.topic_id)
+        # Track completed (uuid, cycle) tuples
+        for item in completed_items:
+            self._completed.add(item)
 
         # Memory optimization: track flushed counts and clear in-memory lists
         # Must happen BEFORE saving metadata so counts are accurate
@@ -427,10 +429,10 @@ class DataSetGenerator:
        self._save_checkpoint_metadata()
 
        logger.debug(
-            "Checkpoint saved: %d samples, %d failures, %d total IDs processed (flushed=%s)",
+            "Checkpoint saved: %d samples, %d failures, %d completed tuples (flushed=%s)",
            len(new_samples),
            len(new_failures),
-            len(self._processed_ids),
+            len(self._completed),
            flush_memory,
        )
 
@@ -443,6 +445,9 @@ class DataSetGenerator:
         total_samples = self._flushed_samples_count + len(self._samples)
         total_failures = self._flushed_failures_count + len(self.failed_samples)
 
+        # Convert completed set of (uuid, cycle) tuples to list of [uuid, cycle] arrays
+        completed_list = [[uuid, cycle] for uuid, cycle in self._completed]
+
         metadata = {
             "version": CHECKPOINT_VERSION,
             "created_at": datetime.now(timezone.utc).isoformat(),
@@ -452,7 +457,9 @@ class DataSetGenerator:
             "reasoning_style": self.config.reasoning_style,
             "total_samples": total_samples,
             "total_failures": total_failures,
-            "processed_ids": list(self._processed_ids),
+            "completed": completed_list,  # List of [uuid, cycle] arrays
+            "unique_topics": self._unique_topics,
+            "cycles_needed": self._cycles_needed,
             "checkpoint_interval": self.config.checkpoint_interval,
             "topics_file": self.config.topics_file,
         }
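
JSON has no tuple type, so the metadata stores the completed set as a list of `[uuid, cycle]` arrays; `load_checkpoint` (below) rebuilds hashable tuples from them. A self-contained round-trip sketch of that convention, with made-up UUIDs:

    import json

    # Save side: set of tuples -> list of [uuid, cycle] arrays (JSON-safe).
    completed = {("t1", 0), ("t1", 1), ("t2", 0)}  # hypothetical UUIDs
    payload = json.dumps({"completed": [[u, c] for u, c in completed]})

    # Load side: arrays come back as lists, so rebuild hashable tuples.
    restored = {(item[0], item[1]) for item in json.loads(payload)["completed"]}
    assert restored == completed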
@@ -528,12 +535,26 @@ class DataSetGenerator:
         version = metadata.get("version")
         if version is None:
             error_msg = "Missing 'version' field in checkpoint metadata"
-        elif version != CHECKPOINT_VERSION:
-            error_msg = f"Unsupported checkpoint version: {version} (expected {CHECKPOINT_VERSION})"
+        elif version < CHECKPOINT_VERSION:
+            # Old checkpoint format - require fresh start
+            error_msg = (
+                f"Checkpoint format v{version} is incompatible with current version v{CHECKPOINT_VERSION}. "
+                "Please delete the checkpoint and restart: rm -rf .checkpoints/"
+            )
+        elif version > CHECKPOINT_VERSION:
+            error_msg = (
+                f"Checkpoint version {version} is newer than supported version {CHECKPOINT_VERSION}"
+            )
 
-        # Check required fields
+        # Check required fields for v4 format
         if error_msg is None:
-            required_fields = ["created_at", "total_samples", "processed_ids"]
+            required_fields = [
+                "created_at",
+                "total_samples",
+                "completed",
+                "unique_topics",
+                "cycles_needed",
+            ]
             for field in required_fields:
                 if field not in metadata:
                     error_msg = f"Missing required field in checkpoint metadata: {field}"
@@ -579,8 +600,7 @@ class DataSetGenerator:
 
         self._initialize_checkpoint_paths()
         return (
-            self._checkpoint_metadata_path is not None
-            and self._checkpoint_metadata_path.exists()
+            self._checkpoint_metadata_path is not None and self._checkpoint_metadata_path.exists()
         )
 
     def load_checkpoint(self, retry_failed: bool = False) -> bool:
@@ -614,8 +634,11 @@ class DataSetGenerator:
             # Validate config compatibility
             self._validate_checkpoint_compatibility(metadata)
 
-            # Restore processed IDs
-            self._processed_ids = set(metadata.get("processed_ids", []))
+            # Restore completed (uuid, cycle) tuples
+            completed_list = metadata.get("completed", [])
+            self._completed = {(item[0], item[1]) for item in completed_list}
+            self._unique_topics = metadata.get("unique_topics", 0)
+            self._cycles_needed = metadata.get("cycles_needed", 0)
 
             # Count existing samples (don't load into memory - they're already on disk)
             # Memory optimization: track as flushed counts instead of loading into RAM
@@ -631,36 +654,47 @@ class DataSetGenerator:
             failed_ids: set[str] = set()
             if self._checkpoint_failures_path and self._checkpoint_failures_path.exists():
                 failure_count = 0
+                failed_tuples: set[tuple[str, int]] = set()
                 with open(self._checkpoint_failures_path, encoding="utf-8") as f:
                     for raw_line in f:
                         stripped = raw_line.strip()
                         if stripped:
                             failure = json.loads(stripped)
                             failure_count += 1
-                            # Track the topic_id that failed for potential retry
+                            # Track failed (topic_id, cycle) for targeted retry
                             if "topic_id" in failure:
                                 failed_ids.add(failure["topic_id"])
+                                if "cycle" in failure:
+                                    failed_tuples.add((failure["topic_id"], failure["cycle"]))
                 self._flushed_failures_count = failure_count
 
-            # If retry_failed is True, remove failed IDs from processed set
+            # If retry_failed is True, remove failed entries from completed set
             # so they will be retried during generation
             if retry_failed and failed_ids:
-                ids_to_retry = self._processed_ids & failed_ids
-                self._processed_ids -= ids_to_retry
+                if failed_tuples:
+                    # Targeted retry: only remove the specific (uuid, cycle) that failed
+                    tuples_to_remove = self._completed & failed_tuples
+                else:
+                    # Legacy fallback: no cycle info, remove all cycles for failed UUIDs
+                    tuples_to_remove = {
+                        (uuid, cycle) for uuid, cycle in self._completed if uuid in failed_ids
+                    }
+                self._completed -= tuples_to_remove
                 # Clear failures file since we're retrying
                 if self._checkpoint_failures_path and self._checkpoint_failures_path.exists():
                     os.remove(self._checkpoint_failures_path)
                 self._flushed_failures_count = 0
                 logger.info(
-                    "Retry mode: %d failed IDs will be retried",
-                    len(ids_to_retry),
+                    "Retry mode: %d failed UUIDs (%d tuples) will be retried",
+                    len(failed_ids),
+                    len(tuples_to_remove),
                 )
 
             logger.info(
-                "Loaded checkpoint: %d samples, %d failures, %d IDs processed",
+                "Loaded checkpoint: %d samples, %d failures, %d completed (uuid, cycle) tuples",
                 self._flushed_samples_count,
                 self._flushed_failures_count,
-                len(self._processed_ids),
+                len(self._completed),
             )
         except Exception as e:  # noqa: BLE001
             logger.warning("Failed to load checkpoint: %s", e)
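
The targeted-retry logic above is plain set algebra over the checkpoint state. A standalone sketch with hypothetical data, showing both the targeted path and the legacy fallback:

    completed = {("t1", 0), ("t1", 1), ("t2", 0), ("t3", 0)}  # hypothetical UUIDs
    failed_tuples = {("t1", 1)}  # modern failure records carry the cycle
    failed_ids = {"t1"}          # legacy records carry only the UUID

    if failed_tuples:
        # Targeted: only the exact (uuid, cycle) that failed is retried.
        to_remove = completed & failed_tuples  # {("t1", 1)}
    else:
        # Legacy fallback: every cycle of a failed UUID is retried.
        to_remove = {(u, c) for u, c in completed if u in failed_ids}

    completed -= to_remove
    print(completed)  # ("t1", 0) survives; only cycle 1 of t1 is regenerated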
@@ -676,7 +710,7 @@ class DataSetGenerator:
            os.remove(self._checkpoint_samples_path)
        if self._checkpoint_failures_path and self._checkpoint_failures_path.exists():
            os.remove(self._checkpoint_failures_path)
-        self._processed_ids.clear()
+        self._completed.clear()
        self._flushed_samples_count = 0
        self._flushed_failures_count = 0
        logger.info("Checkpoint files cleared")
@@ -723,18 +757,31 @@ class DataSetGenerator:
 
         return all_failures
 
-    def _is_topic_processed(self, topic_path: TopicPath | None) -> bool:
-        """Check if a topic has already been processed.
+    def _is_completed(self, uuid: str, cycle: int) -> bool:
+        """Check if a (uuid, cycle) combination has been completed.
 
         Args:
-            topic_path: TopicPath to check
+            uuid: Topic UUID
+            cycle: Cycle number (0-indexed)
 
         Returns:
-            True if topic was already processed in a previous run
+            True if this (uuid, cycle) was already completed
         """
-        if topic_path is None:
-            return False
-        return topic_path.topic_id in self._processed_ids
+        return (uuid, cycle) in self._completed
+
+    def _is_uuid_completed_any_cycle(self, uuid: str) -> bool:
+        """Check if a UUID has been completed in any cycle.
+
+        Used during transition period when step-based generation
+        still needs to check for completed topics.
+
+        Args:
+            uuid: Topic UUID to check
+
+        Returns:
+            True if this UUID has been completed in at least one cycle
+        """
+        return any(u == uuid for u, _cycle in self._completed)
 
     def _validate_create_data_params(
         self,
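
The two predicates trade cost for generality: the exact check is an O(1) set lookup, while the any-cycle check scans every stored tuple. Illustrated with hypothetical entries:

    completed = {("t1", 0), ("t1", 1), ("t2", 0)}  # hypothetical UUIDs

    # O(1): exact (uuid, cycle) membership, used by cycle-based generation.
    print(("t1", 1) in completed)                # True
    # O(n): any-cycle scan, used by the legacy step-based path.
    print(any(u == "t2" for u, _ in completed))  # True
    print(any(u == "t3" for u, _ in completed))  # False -> t3 still pending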
@@ -766,20 +813,109 @@ class DataSetGenerator:
         topic_paths = topic_model.get_all_paths_with_ids()
         total_paths = len(topic_paths)
         required_samples = num_steps * batch_size
+        logger.info(
+            "Topic preparation: total_paths=%d, required_samples=%d, num_steps=%d, batch_size=%d",
+            total_paths,
+            required_samples,
+            num_steps,
+            batch_size,
+        )
 
         if required_samples > total_paths:
             # Cycle through topics to generate more samples than paths
             # Each topic will be used multiple times for even coverage
+            # IMPORTANT: Create unique topic_ids using global index position.
+            # This handles two cases:
+            # 1. Cycling: same path appears multiple times across cycles
+            # 2. Graph duplicates: multiple paths can share the same node topic_id
+            # Using index ensures every position in the list has a unique ID.
             multiplier = math.ceil(required_samples / total_paths)
-            topic_paths = (topic_paths * multiplier)[:required_samples]
+            cycled_paths: list[TopicPath] = []
+            for _cycle in range(multiplier):
+                for _path_idx, tp in enumerate(topic_paths):
+                    if len(cycled_paths) >= required_samples:
+                        break
+                    # Use global index for uniqueness: handles both cycling and graph duplicates
+                    global_idx = len(cycled_paths)
+                    unique_id = f"{tp.topic_id}_idx_{global_idx}"
+                    cycled_paths.append(TopicPath(path=tp.path, topic_id=unique_id))
+                if len(cycled_paths) >= required_samples:
+                    break
+            topic_paths = cycled_paths
+            logger.info(
+                "Topics cycled: %d original paths × %d cycles = %d total (trimmed to %d)",
+                total_paths,
+                multiplier,
+                total_paths * multiplier,
+                len(topic_paths),
+            )
         elif required_samples < total_paths:
             # Sample subset (percentage case or explicit count < total)
             # Bandit: not a security function
             topic_paths = random.sample(topic_paths, required_samples)  # nosec
-        # else: required_samples == total_paths - use all paths as-is
+            # Assign unique IDs to handle graphs with duplicate node IDs
+            topic_paths = [
+                TopicPath(path=tp.path, topic_id=f"{tp.topic_id}_idx_{idx}")
+                for idx, tp in enumerate(topic_paths)
+            ]
+        else:
+            # required_samples == total_paths - use all paths but with unique IDs
+            # Graphs can have duplicate topic_ids (multiple paths to same node)
+            topic_paths = [
+                TopicPath(path=tp.path, topic_id=f"{tp.topic_id}_idx_{idx}")
+                for idx, tp in enumerate(topic_paths)
+            ]
+
+        logger.info("Topic paths after preparation: %d paths", len(topic_paths))
 
         return topic_paths, num_steps
 
+    def _prepare_unique_topics(
+        self,
+        total_samples: int,
+        topic_model: "TopicModel",
+    ) -> tuple[list[Topic], int]:
+        """Prepare unique topics and calculate cycles needed for generation.
+
+        This method supports the new cycle-based generation model where we iterate
+        over unique topics (by UUID) multiple times (cycles) to generate the
+        requested number of samples.
+
+        Args:
+            total_samples: Total number of samples to generate.
+            topic_model: The topic model (Tree or Graph) to extract topics from.
+
+        Returns:
+            Tuple of (unique_topics, cycles_needed):
+            - unique_topics: List of Topic namedtuples with (uuid, topic)
+            - cycles_needed: Number of times to iterate through all topics
+        """
+        unique_topics = topic_model.get_unique_topics()
+        unique_count = len(unique_topics)
+
+        if unique_count == 0:
+            raise DataSetGeneratorError(
+                "Topic model has no unique topics. Ensure the topic model was built successfully."
+            )
+
+        # Calculate cycles needed to cover the requested samples
+        # e.g., 5000 samples from 1875 topics = ceil(5000/1875) = 3 cycles
+        cycles_needed = math.ceil(total_samples / unique_count)
+
+        # Calculate how many samples we'll generate in the final (partial) cycle
+        final_cycle_size = total_samples - (cycles_needed - 1) * unique_count
+
+        logger.info(
+            "Topic preparation: unique_topics=%d, requested_samples=%d, cycles_needed=%d, "
+            "final_cycle_size=%d",
+            unique_count,
+            total_samples,
+            cycles_needed,
+            final_cycle_size,
+        )
+
+        return unique_topics, cycles_needed
+
     def _generate_batch_prompts(
         self,
         batch_size: int,
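
The cycle arithmetic in `_prepare_unique_topics` can be checked against the docstring's own example: 5000 requested samples over 1875 unique topics means 3 cycles, the last of which is partial:

    import math

    total_samples = 5000
    unique_count = 1875

    cycles_needed = math.ceil(total_samples / unique_count)                # 3
    final_cycle_size = total_samples - (cycles_needed - 1) * unique_count  # 1250

    # Two full passes plus a 1250-topic partial pass account for every sample.
    assert (cycles_needed - 1) * unique_count + final_cycle_size == total_samples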
@@ -1050,9 +1186,11 @@ class DataSetGenerator:
         return "malformed_responses"
 
     def summarize_failures(self) -> dict:
-        """Generate a summary of all failures."""
+        """Generate a summary of all failures including those flushed to checkpoint."""
+        # Include both in-memory and flushed failures for accurate total
+        total_failures = self._flushed_failures_count + len(self.failed_samples)
         summary = {
-            "total_failures": len(self.failed_samples),
+            "total_failures": total_failures,
             "failure_types": {k: len(v) for k, v in self.failure_analysis.items()},
             "failure_examples": {},
         }
@@ -1155,22 +1293,59 @@ class DataSetGenerator:
 
         include_sys_msg = sys_msg if sys_msg is not None else self.config.sys_msg
 
-        topic_paths, num_steps = self._prepare_topic_paths(num_steps, batch_size, topic_model)
-
+        # Calculate total samples requested
         total_samples = num_steps * batch_size
         data_creation_prompt = self._get_cot_prompt_template()
 
-        final_result: HFDataset | dict | None = None
-        async for event in self._run_generation_loop_async(
-            num_steps=num_steps,
-            batch_size=batch_size,
-            total_samples=total_samples,
-            topic_paths=topic_paths or [],
-            data_creation_prompt=data_creation_prompt,
-            num_example_demonstrations=num_example_demonstrations,
-            include_sys_msg=include_sys_msg,
+        # Ensure checkpoint_interval is at least as large as concurrency/batch_size
+        # so checkpoints align with batch boundaries
+        if (
+            self.config.checkpoint_interval is not None
+            and self.config.checkpoint_interval < batch_size
         ):
-            final_result = event
+            logger.warning(
+                "checkpoint_interval (%d) is less than concurrency/batch_size (%d), "
+                "adjusting to %d to align with batch boundaries",
+                self.config.checkpoint_interval,
+                batch_size,
+                batch_size,
+            )
+            self.config.checkpoint_interval = batch_size
+
+        final_result: HFDataset | dict | None = None
+
+        # Use cycle-based generation when a topic model is provided
+        if topic_model is not None:
+            unique_topics, cycles_needed = self._prepare_unique_topics(total_samples, topic_model)
+
+            # batch_size becomes concurrency in the new model
+            concurrency = batch_size
+
+            async for event in self._run_cycle_based_generation_async(
+                unique_topics=unique_topics,
+                cycles_needed=cycles_needed,
+                total_samples=total_samples,
+                concurrency=concurrency,
+                data_creation_prompt=data_creation_prompt,
+                num_example_demonstrations=num_example_demonstrations,
+                include_sys_msg=include_sys_msg,
+                topic_model=topic_model,
+            ):
+                final_result = event
+        else:
+            # Fall back to step-based generation when no topic model
+            topic_paths, num_steps = self._prepare_topic_paths(num_steps, batch_size, topic_model)
+
+            async for event in self._run_generation_loop_async(
+                num_steps=num_steps,
+                batch_size=batch_size,
+                total_samples=total_samples,
+                topic_paths=topic_paths or [],
+                data_creation_prompt=data_creation_prompt,
+                num_example_demonstrations=num_example_demonstrations,
+                include_sys_msg=include_sys_msg,
+            ):
+                final_result = event
 
         if isinstance(final_result, HFDataset):
             trace(
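
The clamp above encodes the invariant that checkpoints fire only between batches, so an interval smaller than one batch is unreachable anyway. A sketch of the rule with hypothetical config values:

    # Hypothetical values for illustration.
    checkpoint_interval = 10
    batch_size = 32  # becomes the concurrency in the cycle-based model

    if checkpoint_interval is not None and checkpoint_interval < batch_size:
        # Checkpoints are only evaluated between batches, so the effective
        # minimum interval is one batch; make that explicit.
        checkpoint_interval = batch_size

    print(checkpoint_interval)  # 32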
@@ -1211,29 +1386,67 @@ class DataSetGenerator:
 
         include_sys_msg = sys_msg if sys_msg is not None else self.config.sys_msg
 
-        topic_paths, num_steps = self._prepare_topic_paths(num_steps, batch_size, topic_model)
-
+        # Calculate total samples requested
         total_samples = num_steps * batch_size
         data_creation_prompt = self._get_cot_prompt_template()
 
+        # Ensure checkpoint_interval is at least as large as concurrency/batch_size
+        # so checkpoints align with batch boundaries
+        if (
+            self.config.checkpoint_interval is not None
+            and self.config.checkpoint_interval < batch_size
+        ):
+            logger.warning(
+                "checkpoint_interval (%d) is less than concurrency/batch_size (%d), "
+                "adjusting to %d to align with batch boundaries",
+                self.config.checkpoint_interval,
+                batch_size,
+                batch_size,
+            )
+            self.config.checkpoint_interval = batch_size
+
         root_topic_prompt = None
         topic_model_type = None
         if topic_model is not None:
             root_topic_prompt = getattr(topic_model, "topic_prompt", None)
             topic_model_type = type(topic_model).__name__.lower()
 
-        async for event in self._run_generation_loop_async(
-            num_steps=num_steps,
-            batch_size=batch_size,
-            total_samples=total_samples,
-            topic_paths=topic_paths or [],
-            data_creation_prompt=data_creation_prompt,
-            num_example_demonstrations=num_example_demonstrations,
-            include_sys_msg=include_sys_msg,
-            root_topic_prompt=root_topic_prompt,
-            topic_model_type=topic_model_type,
-        ):
-            yield event
+        # Use cycle-based generation when a topic model is provided
+        if topic_model is not None:
+            unique_topics, cycles_needed = self._prepare_unique_topics(total_samples, topic_model)
+
+            # batch_size becomes concurrency in the new model
+            concurrency = batch_size
+
+            async for event in self._run_cycle_based_generation_async(
+                unique_topics=unique_topics,
+                cycles_needed=cycles_needed,
+                total_samples=total_samples,
+                concurrency=concurrency,
+                data_creation_prompt=data_creation_prompt,
+                num_example_demonstrations=num_example_demonstrations,
+                include_sys_msg=include_sys_msg,
+                root_topic_prompt=root_topic_prompt,
+                topic_model_type=topic_model_type,
+                topic_model=topic_model,
+            ):
+                yield event
+        else:
+            # Fall back to step-based generation when no topic model
+            topic_paths, num_steps = self._prepare_topic_paths(num_steps, batch_size, topic_model)
+
+            async for event in self._run_generation_loop_async(
+                num_steps=num_steps,
+                batch_size=batch_size,
+                total_samples=total_samples,
+                topic_paths=topic_paths or [],
+                data_creation_prompt=data_creation_prompt,
+                num_example_demonstrations=num_example_demonstrations,
+                include_sys_msg=include_sys_msg,
+                root_topic_prompt=root_topic_prompt,
+                topic_model_type=topic_model_type,
+            ):
+                yield event
 
     async def _run_generation_loop_async(  # noqa: PLR0912, PLR0915
         self,
@@ -1248,6 +1461,17 @@ class DataSetGenerator:
         topic_model_type: str | None = None,
     ) -> AsyncGenerator[dict | HFDataset, None]:
         """Run the main generation loop yielding progress events."""
+        # Verify topic paths cover all expected samples (only when a topic model is used)
+        expected_prompts = num_steps * batch_size
+        actual_paths = len(topic_paths) if topic_paths else 0
+        if topic_paths and actual_paths < expected_prompts:
+            logger.warning(
+                "Topic paths (%d) < expected samples (%d). Steps beyond path %d will produce 0 samples.",
+                actual_paths,
+                expected_prompts,
+                actual_paths // batch_size,
+            )
+
         # Initialize checkpoint paths if checkpointing is enabled
         if self.config.checkpoint_interval is not None:
             self._initialize_checkpoint_paths()
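
The `actual_paths // batch_size` in the warning is the (0-indexed) first step that cannot fill a full batch. Worked through with hypothetical numbers:

    # Hypothetical numbers for illustration.
    num_steps, batch_size = 5, 32
    actual_paths = 100
    expected_prompts = num_steps * batch_size  # 160

    first_starved_step = actual_paths // batch_size  # 3
    # Steps 0-2 draw paths 0..95 in full; step 3 only finds paths 96..99,
    # and later steps find none, so 160 - 100 = 60 samples go missing.
    print(first_starved_step, expected_prompts - actual_paths)  # 3 60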
@@ -1257,6 +1481,7 @@ class DataSetGenerator:
         samples_in_current_batch: list[dict] = []
         failures_in_current_batch: list[dict] = []
         topic_paths_in_current_batch: list[TopicPath | None] = []
+        topics_exhausted_count = 0  # Samples not generated due to topic exhaustion
 
         try:
             yield {
@@ -1267,11 +1492,12 @@ class DataSetGenerator:
                 "total_samples": total_samples,
                 "root_topic_prompt": root_topic_prompt,
                 "topic_model_type": topic_model_type,
-                "resumed_from_checkpoint": len(self._processed_ids) > 0,
-                "previously_processed": len(self._processed_ids),
+                "resumed_from_checkpoint": len(self._completed) > 0,
+                "previously_processed": len(self._completed),
                 "resumed_samples": self._flushed_samples_count,
                 "resumed_failures": self._flushed_failures_count,
                 "checkpoint_enabled": self.config.checkpoint_interval is not None,
+                "checkpoint_interval": self.config.checkpoint_interval,
             }
 
             for step in range(num_steps):
@@ -1290,12 +1516,36 @@ class DataSetGenerator:
                     num_example_demonstrations,
                 )
 
-                # Filter out already-processed topics when resuming
-                if self._processed_ids:
+                # Handle topic exhaustion - when we've run out of topics
+                if not prompts and topic_paths:
+                    # Topics exhausted - remaining steps will produce nothing
+                    exhausted_in_step = batch_size  # All samples in this batch had no topics
+                    topics_exhausted_count += exhausted_in_step
+                    logger.warning(
+                        "Step %d: Topics exhausted at index %d (only %d topics available for %d expected)",
+                        step + 1,
+                        start_idx,
+                        len(topic_paths),
+                        total_samples,
+                    )
+                    yield {
+                        "event": "step_complete",
+                        "step": step + 1,
+                        "samples_generated": 0,
+                        "success": True,
+                        "failed_in_step": 0,
+                        "failure_reasons": [],
+                        "topics_exhausted": exhausted_in_step,
+                    }
+                    continue
+
+                # Filter out already-completed topics when resuming
+                if self._completed:
                     filtered_prompts = []
                     filtered_topic_paths: list[TopicPath | None] = []
                     for prompt, tp in zip(prompts, used_topic_paths, strict=False):
-                        if not self._is_topic_processed(tp):
+                        # Check if this topic_id has been completed in any cycle
+                        if tp is None or not self._is_uuid_completed_any_cycle(tp.topic_id):
                             filtered_prompts.append(prompt)
                             filtered_topic_paths.append(tp)
 
@@ -1335,10 +1585,14 @@ class DataSetGenerator:
                     self.config.checkpoint_interval is not None
                     and samples_since_checkpoint >= self.config.checkpoint_interval
                 ):
+                    # Convert topic_paths to (uuid, cycle) tuples (cycle=0 for step-based)
+                    completed_items = [
+                        (tp.topic_id, 0) for tp in topic_paths_in_current_batch if tp is not None
+                    ]
                     self._save_checkpoint(
                         samples_in_current_batch,
                         failures_in_current_batch,
-                        topic_paths_in_current_batch,
+                        completed_items,
                     )
                     samples_in_current_batch = []
                     failures_in_current_batch = []
@@ -1360,11 +1614,11 @@ class DataSetGenerator:
                     }
                     return  # Exit generator cleanly
 
-                failed_in_batch = len(self.failed_samples) - failed_before
+                # Use new_failures captured BEFORE checkpoint clear, not recalculated after
+                failed_in_batch = len(new_failures)
                 failure_reasons: list[str] = []
-                if failed_in_batch > 0 and self.failed_samples:
-                    recent_failures = self.failed_samples[-failed_in_batch:]
-                    for f in recent_failures[:3]:
+                if failed_in_batch > 0 and new_failures:
+                    for f in new_failures[:3]:
                         if isinstance(f, dict):
                             failure_reasons.append(f.get("error", str(f)))
                         else:
@@ -1390,10 +1644,13 @@ class DataSetGenerator:
             if self.config.checkpoint_interval is not None and (
                 samples_in_current_batch or failures_in_current_batch
             ):
+                completed_items = [
+                    (tp.topic_id, 0) for tp in topic_paths_in_current_batch if tp is not None
+                ]
                 self._save_checkpoint(
                     samples_in_current_batch,
                     failures_in_current_batch,
-                    topic_paths_in_current_batch,
+                    completed_items,
                 )
                 yield {
                     "event": "checkpoint_saved",
@@ -1403,13 +1660,30 @@ class DataSetGenerator:
                 }
 
             # Calculate total counts including flushed data
-            total_samples = self._flushed_samples_count + len(self._samples)
-            total_failures = self._flushed_failures_count + len(self.failed_samples)
+            actual_samples = self._flushed_samples_count + len(self._samples)
+            actual_failures = self._flushed_failures_count + len(self.failed_samples)
+            total_accounted = actual_samples + actual_failures + topics_exhausted_count
+            true_unaccounted = total_samples - total_accounted
+
+            # Log accounting summary for debugging
+            logger.info(
+                "Generation complete: expected=%d, generated=%d, failed=%d, topics_exhausted=%d, "
+                "accounted=%d, unaccounted=%d",
+                total_samples,  # This is the parameter passed in (expected count)
+                actual_samples,
+                actual_failures,
+                topics_exhausted_count,
+                total_accounted,
+                true_unaccounted,
+            )
 
             yield {
                 "event": "generation_complete",
-                "total_samples": total_samples,
-                "failed_samples": total_failures,
+                "total_samples": actual_samples,
+                "failed_samples": actual_failures,
+                "expected_samples": total_samples,
+                "topics_exhausted": topics_exhausted_count,
+                "unaccounted": true_unaccounted,
             }
 
         except KeyboardInterrupt:
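
The accounting identity logged above says every expected sample must be either generated, failed, or skipped via topic exhaustion; any remainder is a silent drop. With hypothetical counts:

    # Hypothetical counts for illustration.
    expected = 160
    generated, failed, topics_exhausted = 120, 8, 32

    accounted = generated + failed + topics_exhausted  # 160
    unaccounted = expected - accounted                 # 0 when nothing is dropped

    # A nonzero remainder signals a silent drop somewhere in the pipeline.
    assert unaccounted == 0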
@@ -1417,10 +1691,13 @@ class DataSetGenerator:
             if self.config.checkpoint_interval is not None and (
                 samples_in_current_batch or failures_in_current_batch
             ):
+                completed_items = [
+                    (tp.topic_id, 0) for tp in topic_paths_in_current_batch if tp is not None
+                ]
                 self._save_checkpoint(
                     samples_in_current_batch,
                     failures_in_current_batch,
-                    topic_paths_in_current_batch,
+                    completed_items,
                 )
                 yield {
                     "event": "generation_interrupted",
@@ -1434,10 +1711,327 @@ class DataSetGenerator:
             if self.config.checkpoint_interval is not None and (
                 samples_in_current_batch or failures_in_current_batch
             ):
+                completed_items = [
+                    (tp.topic_id, 0) for tp in topic_paths_in_current_batch if tp is not None
+                ]
                 self._save_checkpoint(
                     samples_in_current_batch,
                     failures_in_current_batch,
-                    topic_paths_in_current_batch,
+                    completed_items,
+                )
+                yield {"event": "generation_error", "error": str(e)}
+                self.print_failure_summary()
+                self._save_samples_to_file(ERROR_DATASET_FILENAME)
+                raise DataSetGeneratorError("failed") from e
+
+        # Build final dataset: if samples were flushed to disk, load them from checkpoint
+        if self._flushed_samples_count > 0:
+            all_samples = self._load_all_samples_from_checkpoint()
+            yield HFDataset.from_list(all_samples) if all_samples else HFDataset.from_list([])
+        else:
+            yield (HFDataset.from_list(self._samples) if self._samples else HFDataset.from_list([]))
+
+    async def _run_cycle_based_generation_async(  # noqa: PLR0912, PLR0915
+        self,
+        unique_topics: list[Topic],
+        cycles_needed: int,
+        total_samples: int,
+        concurrency: int,
+        data_creation_prompt: str,
+        num_example_demonstrations: int,
+        include_sys_msg: bool,
+        root_topic_prompt: str | None = None,
+        topic_model_type: str | None = None,
+        topic_model: "TopicModel | None" = None,
+    ) -> AsyncGenerator[dict | HFDataset, None]:
+        """Run cycle-based generation loop yielding progress events.
+
+        This is the new generation model that iterates over unique topics (by UUID)
+        for multiple cycles, rather than the old step-based batching approach.
+
+        Args:
+            unique_topics: List of Topic namedtuples with (uuid, topic).
+            cycles_needed: Number of cycles to iterate through topics.
+            total_samples: Total number of samples to generate.
+            concurrency: Maximum parallel LLM calls (semaphore limit).
+            data_creation_prompt: The prompt template for data creation.
+            num_example_demonstrations: Number of example demonstrations to include.
+            include_sys_msg: Whether to include system message in output.
+            root_topic_prompt: Original topic prompt for display.
+            topic_model_type: Type of topic model (tree, graph) for display.
+            topic_model: Topic model for path lookup (recovers full path context).
+
+        Yields:
+            Progress event dicts and final HFDataset.
+        """
+        unique_topic_count = len(unique_topics)
+        final_cycle_size = total_samples - (cycles_needed - 1) * unique_topic_count
+
+        # Initialize checkpoint paths if checkpointing is enabled
+        if self.config.checkpoint_interval is not None:
+            self._initialize_checkpoint_paths()
+
+        # Track samples for checkpointing
+        samples_since_checkpoint = 0
+        pending_samples: list[dict] = []
+        pending_failures: list[dict] = []
+        pending_completed: list[tuple[str, int]] = []
+        samples_generated = 0
+
+        # Create semaphore for concurrency control
+        semaphore = asyncio.Semaphore(concurrency)
+
+        try:
+            yield {
+                "event": "generation_start",
+                "model_name": self.model_name,
+                "unique_topics": unique_topic_count,
+                "cycles_needed": cycles_needed,
+                "final_cycle_size": final_cycle_size,
+                "concurrency": concurrency,
+                "total_samples": total_samples,
+                "root_topic_prompt": root_topic_prompt,
+                "topic_model_type": topic_model_type,
+                "resumed_from_checkpoint": len(self._completed) > 0,
+                "previously_completed": len(self._completed),
+                "resumed_samples": self._flushed_samples_count,
+                "resumed_failures": self._flushed_failures_count,
+                "checkpoint_enabled": self.config.checkpoint_interval is not None,
+                "checkpoint_interval": self.config.checkpoint_interval,
+            }
+
+            for cycle in range(cycles_needed):
+                # Determine how many topics to process in this cycle
+                if cycle == cycles_needed - 1:
+                    # Final cycle may be partial
+                    topics_in_cycle = min(final_cycle_size, unique_topic_count)
+                else:
+                    topics_in_cycle = unique_topic_count
+
+                yield {
+                    "event": "cycle_start",
+                    "cycle": cycle + 1,
+                    "total_cycles": cycles_needed,
+                    "topics_in_cycle": topics_in_cycle,
+                }
+
+                # Collect topics to process in this cycle
+                topics_to_process: list[Topic] = []
+                for topic_idx, topic in enumerate(unique_topics):
+                    if cycle == cycles_needed - 1 and topic_idx >= final_cycle_size:
+                        break  # Partial final cycle - stop early
+
+                    if not self._is_completed(topic.uuid, cycle):
+                        topics_to_process.append(topic)
+
+                if not topics_to_process:
+                    # All topics in this cycle already completed (resume scenario)
+                    yield {
+                        "event": "cycle_complete",
+                        "cycle": cycle + 1,
+                        "samples_in_cycle": 0,
+                        "skipped": topics_in_cycle,
+                    }
+                    continue
+
+                # Process topics with concurrency control
+                cycle_samples = 0
+                cycle_failures = 0
+
+                async def process_topic(
+                    topic: Topic, cycle_num: int, sample_idx: int
+                ) -> tuple[tuple[str, int], bool, int]:
+                    """Process a single topic with semaphore-controlled concurrency.
+
+                    Note: Samples/failures are NOT extracted here because concurrent
+                    tasks share self._samples. Extracting per-task would cause
+                    duplicates since each task's slice overlaps with others.
+                    Instead, samples are captured in bulk after asyncio.gather.
+
+                    Returns:
+                        Tuple of (completed_item, success, count)
+                    """
+                    async with semaphore:
+                        # Recover full path context (root -> ... -> leaf) for prompt quality
+                        full_path = topic_model.get_path_by_id(topic.uuid) if topic_model else None
+                        subtopics = full_path if full_path else [topic.topic]
+
+                        # Build prompt for this topic
+                        sample_prompt = self.build_prompt(
+                            data_creation_prompt=data_creation_prompt,
+                            num_example_demonstrations=num_example_demonstrations,
+                            subtopics_list=subtopics,
+                        )
+
+                        # Use existing batch processing for a single sample
+                        topic_path = TopicPath(path=subtopics, topic_id=topic.uuid)
+                        success, count = await self._process_batch_with_retries_async(
+                            prompts=[sample_prompt],
+                            include_sys_msg=include_sys_msg,
+                            start_sample_idx=sample_idx,
+                            topic_paths_for_batch=[topic_path],
+                        )
+
+                        completed_item = (topic.uuid, cycle_num)
+                        return completed_item, success, count
+
+                # Process topics in batches for checkpoint saving
+                for batch_start in range(0, len(topics_to_process), concurrency):
+                    batch_end = min(batch_start + concurrency, len(topics_to_process))
+                    batch_topics = topics_to_process[batch_start:batch_end]
+
+                    # Snapshot list lengths before tasks start so we can
+                    # capture new items in one safe slice after all complete.
+                    samples_before_gather = len(self._samples)
+                    failures_before_gather = len(self.failed_samples)
+
+                    # Create concurrent tasks
+                    # Pass current samples_generated as starting index for each task
+                    tasks = [
+                        asyncio.create_task(process_topic(topic, cycle, samples_generated + i))
+                        for i, topic in enumerate(batch_topics)
+                    ]
+
+                    # Process results as each task finishes (not waiting for
+                    # the whole batch) so the progress bar advances per-sample.
+                    batch_samples = 0
+                    batch_failures = 0
+                    for future in asyncio.as_completed(tasks):
+                        completed_item, success, count = await future
+                        pending_completed.append(completed_item)
+
+                        if success:
+                            cycle_samples += count
+                            samples_generated += count
+                            batch_samples += count
+                        else:
+                            cycle_failures += 1
+                            batch_failures += 1
+
+                        samples_since_checkpoint += 1
+
+                        # Emit per-sample progress so progress bars advance immediately
+                        yield {
+                            "event": "batch_complete",
+                            "samples_generated": count if success else 0,
+                            "samples_failed": 1 if not success else 0,
+                        }
+
+                    # After all tasks complete, capture samples/failures in
+                    # one safe slice (no concurrent tasks are running now).
+                    batch_new_samples = list(self._samples[samples_before_gather:])
+                    batch_new_failures = list(self.failed_samples[failures_before_gather:])
+                    # Tag each failure with the cycle number so retry logic can
+                    # target only the specific (uuid, cycle) that failed
+                    for failure in batch_new_failures:
+                        failure["cycle"] = cycle
+                    pending_samples.extend(batch_new_samples)
+                    pending_failures.extend(batch_new_failures)
+
+                    # Save checkpoint if we've reached the interval
+                    if (
+                        self.config.checkpoint_interval is not None
+                        and samples_since_checkpoint >= self.config.checkpoint_interval
+                    ):
+                        self._save_checkpoint(
+                            pending_samples,
+                            pending_failures,
+                            pending_completed,
+                        )
+                        pending_samples = []
+                        pending_failures = []
+                        pending_completed = []
+                        samples_since_checkpoint = 0
+                        yield {
+                            "event": "checkpoint_saved",
+                            "total_samples": self._flushed_samples_count,
+                            "total_failures": self._flushed_failures_count,
+                        }
+
+                    # Check for graceful stop request
+                    if self.stop_requested:
+                        yield {
+                            "event": "generation_stopped",
+                            "message": "Stopped at checkpoint as requested",
+                            "total_samples": self._flushed_samples_count,
+                            "total_failures": self._flushed_failures_count,
+                        }
+                        return
+
+                yield {
+                    "event": "cycle_complete",
+                    "cycle": cycle + 1,
+                    "samples_in_cycle": cycle_samples,
+                    "failures_in_cycle": cycle_failures,
+                }
+
+            # Save final checkpoint with any remaining samples
+            if self.config.checkpoint_interval is not None and (
+                pending_samples or pending_failures
+            ):
+                self._save_checkpoint(
+                    pending_samples,
+                    pending_failures,
+                    pending_completed,
+                )
+                yield {
+                    "event": "checkpoint_saved",
+                    "total_samples": self._flushed_samples_count,
+                    "total_failures": self._flushed_failures_count,
+                    "final": True,
+                }
+
+            # Calculate total counts including flushed data
+            actual_samples = self._flushed_samples_count + len(self._samples)
+            actual_failures = self._flushed_failures_count + len(self.failed_samples)
+            unaccounted = total_samples - actual_samples - actual_failures
+
+            logger.info(
+                "Generation complete: expected=%d, generated=%d, failed=%d, "
+                "accounted=%d, unaccounted=%d",
+                total_samples,
+                actual_samples,
+                actual_failures,
+                actual_samples + actual_failures,
+                unaccounted,
+            )
+
+            yield {
+                "event": "generation_complete",
+                "total_samples": actual_samples,
+                "failed_samples": actual_failures,
+                "expected_samples": total_samples,
+                "unaccounted": unaccounted,
+                "cycles_completed": cycles_needed,
+                "unique_topics": unique_topic_count,
+            }
+
+        except KeyboardInterrupt:
+            # Save checkpoint on interrupt
+            if self.config.checkpoint_interval is not None and (
+                pending_samples or pending_failures
+            ):
+                self._save_checkpoint(
+                    pending_samples,
+                    pending_failures,
+                    pending_completed,
+                )
+            yield {
+                "event": "generation_interrupted",
+                "message": "Generation interrupted by user.",
+            }
+            self.print_failure_summary()
+            self._save_samples_to_file(INTERRUPTED_DATASET_FILENAME)
+
+        except Exception as e:  # noqa: BLE001
+            # Save checkpoint on error
+            if self.config.checkpoint_interval is not None and (
+                pending_samples or pending_failures
+            ):
+                self._save_checkpoint(
+                    pending_samples,
+                    pending_failures,
+                    pending_completed,
                 )
                 yield {"event": "generation_error", "error": str(e)}
                 self.print_failure_summary()
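
Under the new loop is a generic asyncio pattern: a `Semaphore` caps in-flight LLM calls while `asyncio.as_completed` surfaces each result as soon as it finishes, so progress events can be emitted per sample. A stripped-down, runnable sketch where `work` stands in for the real per-topic generation:

    import asyncio

    async def main() -> None:
        semaphore = asyncio.Semaphore(3)  # cap of 3 concurrent "LLM calls"

        async def work(uuid: str, cycle: int) -> tuple[str, int]:
            async with semaphore:          # at most 3 bodies run at once
                await asyncio.sleep(0.01)  # stand-in for the model call
                return uuid, cycle

        tasks = [asyncio.create_task(work(f"t{i}", 0)) for i in range(10)]

        completed: set[tuple[str, int]] = set()
        for future in asyncio.as_completed(tasks):  # yields in finish order
            completed.add(await future)             # progress can advance here
        print(len(completed))  # 10

    asyncio.run(main())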
@@ -1465,6 +2059,17 @@ class DataSetGenerator:
             prompts, include_sys_msg, start_sample_idx, topic_paths_for_batch
         )
 
+        # Verify all prompts are accounted for (no silent drops)
+        accounted = len(samples) + len(failed_responses)
+        if accounted != len(prompts):
+            logger.warning(
+                "Sample accounting mismatch: %d prompts, %d samples, %d failures (missing %d)",
+                len(prompts),
+                len(samples),
+                len(failed_responses),
+                len(prompts) - accounted,
+            )
+
         # Update failed samples
         self.failed_samples.extend(failed_responses)