recursive_cleaner-0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,628 @@
+ """Main DataCleaner class - the core pipeline."""
+
+ import json
+ import os
+ import time
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Callable, Literal
+
+ from tenacity import retry, stop_after_attempt, wait_exponential
+
+ from .context import build_context
+ from .errors import OutputValidationError, ParseError
+ from .metrics import QualityMetrics, compare_quality, load_structured_data, measure_quality
+ from .parsers import chunk_file
+ from .prompt import build_prompt
+ from .response import parse_response
+ from .schema import format_schema_for_prompt, infer_schema
+ from .types import LLMBackend
+ from .validation import check_code_safety, extract_sample_data, split_holdout, validate_function
+
+ STATE_VERSION = "0.5.0"
+
+
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
+ def call_llm(backend: LLMBackend, prompt: str) -> str:
+     """Call LLM with retry logic."""
+     return backend.generate(prompt)
+
+
+ class DataCleaner:
+     """
+     LLM-powered incremental data cleaning pipeline.
+
+     Processes data in chunks, identifies issues, generates Python
+     cleaning functions one at a time, maintaining awareness of
+     existing solutions through docstring feedback.
+     """
+
+     def __init__(
+         self,
+         llm_backend: LLMBackend,
+         file_path: str,
+         chunk_size: int = 50,
+         instructions: str = "",
+         max_iterations: int = 5,
+         context_budget: int = 8000,
+         on_progress: Callable[[dict], None] | None = None,
+         validate_runtime: bool = True,
+         schema_sample_size: int = 10,
+         state_file: str | None = None,
+         mode: Literal["auto", "structured", "text"] = "auto",
+         chunk_overlap: int = 200,
+         holdout_ratio: float = 0.2,
+         track_metrics: bool = False,
+         sampling_strategy: Literal["sequential", "random", "stratified"] = "sequential",
+         stratify_field: str | None = None,
+         optimize: bool = False,
+         optimize_threshold: int = 10,
+         early_termination: bool = False,
+         saturation_check_interval: int = 20,
+         report_path: str | None = "cleaning_report.md",
+         dry_run: bool = False,
+     ):
+         self.backend = llm_backend
+         self.file_path = file_path
+         self.chunk_size = chunk_size
+         self.instructions = instructions
+         self.max_iterations = max_iterations
+         self.context_budget = context_budget
+         self.on_progress = on_progress
+         self.validate_runtime = validate_runtime
+         self.schema_sample_size = schema_sample_size
+         self.state_file = state_file
+         self.mode = mode
+         self.chunk_overlap = chunk_overlap
+         self.holdout_ratio = holdout_ratio
+         self.track_metrics = track_metrics
+         self.sampling_strategy = sampling_strategy
+         self.stratify_field = stratify_field
+         self.optimize = optimize
+         self.optimize_threshold = optimize_threshold
+         self.early_termination = early_termination
+         self.saturation_check_interval = saturation_check_interval
+         self.report_path = report_path
+         self.dry_run = dry_run
+         self.functions: list[dict] = []  # List of {name, docstring, code}
+         # Track recent function generation for saturation check
+         self._recent_new_function_count = 0
+         self._last_check_function_count = 0
+         self._total_chunks: int = 0  # Set during run()
+         self._schema_str: str = ""  # Formatted schema for prompts
+         self._last_completed_chunk: int = -1  # -1 means no chunks completed yet
+         self._effective_mode: Literal["structured", "text"] = "structured"  # Resolved at run()
+         # Quality metrics (populated when track_metrics=True)
+         self.metrics_before: QualityMetrics | None = None
+         self.metrics_after: QualityMetrics | None = None
+         # Latency tracking for LLM calls
+         self._latency_stats: dict = {
+             "call_count": 0,
+             "total_ms": 0.0,
+             "min_ms": float("inf"),
+             "max_ms": 0.0,
+         }
+
+     def _emit(self, event_type: str, chunk_index: int = 0, **kwargs) -> None:
+         """Emit a progress event to the callback, if set."""
+         if self.on_progress is None:
+             return
+         event = {
+             "type": event_type,
+             "chunk_index": chunk_index,
+             "total_chunks": self._total_chunks,
+             **kwargs,
+         }
+         try:
+             self.on_progress(event)
+         except Exception as e:
+             print(f" Warning: callback error: {e}")
+
+     def _call_llm_timed(self, prompt: str, chunk_index: int = 0) -> str:
+         """Call LLM with timing and emit latency event."""
+         start = time.perf_counter()
+         response = call_llm(self.backend, prompt)
+         elapsed_ms = (time.perf_counter() - start) * 1000
+
+         # Update stats
+         self._latency_stats["call_count"] += 1
+         self._latency_stats["total_ms"] += elapsed_ms
+         self._latency_stats["min_ms"] = min(self._latency_stats["min_ms"], elapsed_ms)
+         self._latency_stats["max_ms"] = max(self._latency_stats["max_ms"], elapsed_ms)
+
+         # Emit event
+         self._emit("llm_call", chunk_index=chunk_index, latency_ms=round(elapsed_ms, 2))
+
+         return response
+
+     def _get_latency_summary(self) -> dict:
+         """Get summary of latency stats with avg calculation."""
+         stats = self._latency_stats.copy()
+         if stats["call_count"] > 0:
+             stats["avg_ms"] = round(stats["total_ms"] / stats["call_count"], 2)
+             stats["min_ms"] = round(stats["min_ms"], 2)
+             stats["max_ms"] = round(stats["max_ms"], 2)
+             stats["total_ms"] = round(stats["total_ms"], 2)
+         else:
+             stats["avg_ms"] = 0.0
+             stats["min_ms"] = 0.0
+         return stats
+
+     def _optimize_functions(self) -> None:
+         """
+         Run two-pass optimization on generated functions.
+
+         1. Group functions by salience (IDF)
+         2. Consolidate each group with agency
+         3. Replace self.functions with optimized result
+         """
+         from .optimizer import consolidate_with_agency, group_by_salience
+
+         self._emit(
+             "optimize_start",
+             function_count=len(self.functions),
+         )
+
+         # Group by IDF
+         groups = group_by_salience(self.functions)
+
+         optimized = []
+         for group_name, group_funcs in groups.items():
+             self._emit(
+                 "optimize_group",
+                 group=group_name,
+                 count=len(group_funcs),
+             )
+
+             # Consolidate with agency
+             consolidated = consolidate_with_agency(group_funcs, self.backend)
+             optimized.extend(consolidated)
+
+         self._emit(
+             "optimize_complete",
+             original=len(self.functions),
+             final=len(optimized),
+         )
+
+         self.functions = optimized
+
+     def _check_saturation(self, chunks_processed: int) -> bool:
+         """
+         Ask LLM if pattern discovery has saturated.
+
+         Returns True if should stop early, False to continue.
+         """
+         from .prompt import SATURATION_CHECK_TEMPLATE
+         from .response import parse_saturation_response
+
+         # Build function summaries (name + first line of docstring)
+         summaries = []
+         for f in self.functions:
+             first_line = f["docstring"].split("\n")[0] if f["docstring"] else ""
+             summaries.append(f"- {f['name']}: {first_line}")
+
+         prompt = SATURATION_CHECK_TEMPLATE.format(
+             count=len(self.functions),
+             function_summaries="\n".join(summaries) or "(none)",
+             total_chunks=chunks_processed,
+             recent_window=self.saturation_check_interval,
+             recent_new_functions=self._recent_new_function_count,
+         )
+
+         try:
+             response = self._call_llm_timed(prompt, chunk_index=chunks_processed - 1)
+             assessment = parse_saturation_response(response)
+         except Exception as e:
+             print(f" Warning: saturation check failed: {e}")
+             return False  # Continue on error
+
+         self._emit(
+             "saturation_check",
+             chunk_index=chunks_processed - 1,
+             saturated=assessment.saturated,
+             confidence=assessment.confidence,
+             recommendation=assessment.recommendation,
+         )
+
+         # Reset counter for next interval
+         self._recent_new_function_count = 0
+
+         # Only stop if saturated with high or medium confidence
+         return assessment.saturated and assessment.confidence != "low"
+
+     def _save_state(self) -> None:
+         """Save current state to JSON file with atomic write."""
+         if self.state_file is None:
+             return
+         state = {
+             "version": STATE_VERSION,
+             "file_path": self.file_path,
+             "instructions": self.instructions,
+             "chunk_size": self.chunk_size,
+             "last_completed_chunk": self._last_completed_chunk,
+             "total_chunks": self._total_chunks,
+             "functions": self.functions,
+             "timestamp": datetime.now(timezone.utc).isoformat(),
+             "optimize": self.optimize,
+             "optimize_threshold": self.optimize_threshold,
+             "early_termination": self.early_termination,
+             "saturation_check_interval": self.saturation_check_interval,
+         }
+         tmp_path = self.state_file + ".tmp"
+         with open(tmp_path, "w") as f:
+             json.dump(state, f, indent=2)
+         os.rename(tmp_path, self.state_file)
+
+     def _load_state(self) -> bool:
+         """Load state from JSON file if it exists. Returns True if loaded."""
+         if self.state_file is None or not os.path.exists(self.state_file):
+             return False
+         try:
+             with open(self.state_file) as f:
+                 state = json.load(f)
+         except json.JSONDecodeError as e:
+             raise ValueError(f"Invalid state file JSON: {e}")
+         # Validate file_path matches
+         if state.get("file_path") != self.file_path:
+             raise ValueError(
+                 f"State file_path mismatch: state has '{state.get('file_path')}', "
+                 f"but current file_path is '{self.file_path}'"
+             )
+         # Load state
+         self.functions = state.get("functions", [])
+         self._last_completed_chunk = state.get("last_completed_chunk", -1)
+         self._total_chunks = state.get("total_chunks", 0)
+         print(f"Resumed from state: {self._last_completed_chunk + 1}/{self._total_chunks} chunks completed")
+         return True
+
+     @classmethod
+     def resume(cls, state_file: str, llm_backend: LLMBackend) -> "DataCleaner":
+         """
+         Resume processing from a saved state file.
+
+         Args:
+             state_file: Path to state JSON file
+             llm_backend: LLM backend to use (not saved in state)
+
+         Returns:
+             DataCleaner instance ready to continue processing
+
+         Raises:
+             FileNotFoundError: If state file doesn't exist
+             ValueError: If state file is invalid
+         """
+         if not os.path.exists(state_file):
+             raise FileNotFoundError(f"State file not found: {state_file}")
+         try:
+             with open(state_file) as f:
+                 state = json.load(f)
+         except json.JSONDecodeError as e:
+             raise ValueError(f"Invalid state file JSON: {e}")
+         # Create instance with saved parameters
+         instance = cls(
+             llm_backend=llm_backend,
+             file_path=state["file_path"],
+             chunk_size=state.get("chunk_size", 50),
+             instructions=state.get("instructions", ""),
+             state_file=state_file,
+             optimize=state.get("optimize", False),
+             optimize_threshold=state.get("optimize_threshold", 10),
+             early_termination=state.get("early_termination", False),
+             saturation_check_interval=state.get("saturation_check_interval", 20),
+         )
+         # Restore state
+         instance.functions = state.get("functions", [])
+         instance._last_completed_chunk = state.get("last_completed_chunk", -1)
+         instance._total_chunks = state.get("total_chunks", 0)
+         return instance
+
+     def _detect_mode(self) -> Literal["structured", "text"]:
+         """Detect mode from file extension."""
+         suffix = Path(self.file_path).suffix.lower()
+         structured_extensions = {".jsonl", ".csv", ".json"}
+         if suffix in structured_extensions:
+             return "structured"
+         return "text"
+
+     def run(self) -> None:
+         """Run the cleaning pipeline."""
+         # Resolve effective mode
+         if self.mode == "auto":
+             self._effective_mode = self._detect_mode()
+         else:
+             self._effective_mode = self.mode
+
+         chunks = chunk_file(
+             self.file_path,
+             self.chunk_size,
+             mode=self._effective_mode,
+             chunk_overlap=self.chunk_overlap,
+             sampling_strategy=self.sampling_strategy,
+             stratify_field=self.stratify_field,
+         )
+
+         if not chunks:
+             print("No data to process.")
+             return
+
+         # Try to load existing state
+         resumed = self._load_state()
+
+         # Infer schema only for structured mode
+         if self._effective_mode == "structured":
+             schema = infer_schema(self.file_path, self.schema_sample_size)
+             self._schema_str = format_schema_for_prompt(schema)
+             # Measure initial quality metrics if tracking enabled
+             if self.track_metrics:
+                 data = load_structured_data(self.file_path)
+                 self.metrics_before = measure_quality(data)
+         else:
+             self._schema_str = ""  # No schema for text mode
+
+         self._total_chunks = len(chunks)
+
+         for i, chunk in enumerate(chunks):
+             # Skip already completed chunks
+             if i <= self._last_completed_chunk:
+                 if resumed:
+                     print(f"Skipping chunk {i + 1}/{len(chunks)} (already completed)")
+                 continue
+             print(f"Processing chunk {i + 1}/{len(chunks)}...")
+             self._process_chunk(chunk, i)
+             # Mark chunk as completed and save state
+             self._last_completed_chunk = i
+             self._save_state()
+
+             # Check for early termination (saturation detection)
+             if (
+                 self.early_termination
+                 and i > 0
+                 and (i + 1) % self.saturation_check_interval == 0
+             ):
+                 if self._check_saturation(i + 1):
+                     self._emit("early_termination", chunk_index=i)
+                     print(f"Early termination: pattern discovery saturated at chunk {i + 1}")
+                     break
+
+         # Skip optimization and output in dry_run mode
+         if self.dry_run:
+             self._emit(
+                 "dry_run_complete",
+                 chunk_index=self._total_chunks - 1,
+                 latency_stats=self._get_latency_summary(),
+             )
+             print("Dry run complete. No functions generated or saved.")
+             return
+
+         # Two-pass optimization (if enabled and enough functions)
+         if self.optimize and len(self.functions) >= self.optimize_threshold:
+             self._optimize_functions()
+
+         self._write_output()
+         self._write_report()
+         self._emit(
+             "complete",
+             chunk_index=self._total_chunks - 1,
+             latency_stats=self._get_latency_summary(),
+         )
+         print(f"Done! Generated {len(self.functions)} functions.")
+
+     def _process_chunk(self, chunk: str, chunk_idx: int) -> None:
+         """Process a single chunk, iterating until clean or max iterations."""
+         self._emit("chunk_start", chunk_index=chunk_idx)
+         error_feedback = ""
+
+         # Dry run mode: just detect issues, don't generate functions
+         if self.dry_run:
+             self._process_chunk_dry_run(chunk, chunk_idx)
+             return
+
+         # Split chunk for holdout validation if enabled
+         use_holdout = self.validate_runtime and self.holdout_ratio > 0
+         if use_holdout:
+             gen_chunk, holdout_chunk = split_holdout(
+                 chunk, self.holdout_ratio, mode=self._effective_mode
+             )
+         else:
+             gen_chunk, holdout_chunk = chunk, ""
+
+         for iteration in range(self.max_iterations):
+             self._emit("iteration", chunk_index=chunk_idx, iteration=iteration)
+             context = build_context(self.functions, self.context_budget)
+             prompt = build_prompt(
+                 self.instructions,
+                 context,
+                 gen_chunk,
+                 self._schema_str,
+                 mode=self._effective_mode,
+             )
+
+             if error_feedback:
+                 prompt += f"\n\nYour previous response had an error: {error_feedback}\nPlease fix and try again."
+
+             try:
+                 response = self._call_llm_timed(prompt, chunk_index=chunk_idx)
+                 result = parse_response(response)
+                 error_feedback = ""  # Clear on success
+             except ParseError as e:
+                 error_feedback = str(e)
+                 continue
+
+             if result["status"] == "clean":
+                 self._emit("chunk_done", chunk_index=chunk_idx)
+                 return
+
+             if result["code"]:
+                 # Safety check: reject dangerous patterns before execution
+                 safe, safety_error = check_code_safety(result["code"])
+                 if not safe:
+                     error_feedback = f"Code safety check failed: {safety_error}. Data cleaning functions should not access filesystem, network, or use eval/exec."
+                     self._emit(
+                         "safety_failed",
+                         chunk_index=chunk_idx,
+                         function_name=result["name"],
+                         error=safety_error,
+                     )
+                     print(f" Safety check failed: {safety_error}")
+                     continue
+
+                 # Runtime validation if enabled
+                 if self.validate_runtime:
+                     # Use holdout data if available, else sample from generation chunk
+                     if use_holdout and holdout_chunk:
+                         sample_data = extract_sample_data(
+                             holdout_chunk, mode=self._effective_mode
+                         )
+                     else:
+                         sample_data = extract_sample_data(
+                             gen_chunk, mode=self._effective_mode
+                         )
+                     valid, error_msg = validate_function(
+                         result["code"],
+                         sample_data,
+                         result["name"],
+                         mode=self._effective_mode,
+                     )
+                     if not valid:
+                         error_feedback = f"Runtime validation failed: {error_msg}"
+                         self._emit(
+                             "validation_failed",
+                             chunk_index=chunk_idx,
+                             function_name=result["name"],
+                             error=error_msg,
+                         )
+                         print(f" Validation failed: {error_msg}")
+                         continue
+
+                 self.functions.append({
+                     "name": result["name"],
+                     "docstring": result["docstring"],
+                     "code": result["code"],
+                 })
+                 # Track for saturation check
+                 self._recent_new_function_count += 1
+                 self._emit(
+                     "function_generated",
+                     chunk_index=chunk_idx,
+                     function_name=result["name"],
+                 )
+                 print(f" Generated: {result['name']}")
+             else:
+                 # LLM said needs_more_work but didn't provide code
+                 print(f" Warning: iteration {iteration + 1} produced no function")
+
+         print(f" Warning: chunk {chunk_idx} hit max iterations ({self.max_iterations})")
+         self._emit("chunk_done", chunk_index=chunk_idx)
+
+     def _process_chunk_dry_run(self, chunk: str, chunk_idx: int) -> None:
+         """Process chunk in dry run mode - detect issues only."""
+         context = build_context(self.functions, self.context_budget)
+         prompt = build_prompt(
+             self.instructions,
+             context,
+             chunk,
+             self._schema_str,
+             mode=self._effective_mode,
+         )
+
+         try:
+             response = self._call_llm_timed(prompt, chunk_index=chunk_idx)
+             result = parse_response(response)
+         except ParseError as e:
+             print(f" Warning: parse error in dry run: {e}")
+             self._emit("chunk_done", chunk_index=chunk_idx)
+             return
+
+         # Extract issues from result
+         issues = result.get("issues", [])
+         self._emit(
+             "issues_detected",
+             chunk_index=chunk_idx,
+             issues=issues,
+         )
+
+         if issues:
+             unsolved = [i for i in issues if not i.get("solved", False)]
+             print(f" Found {len(issues)} issues ({len(unsolved)} unsolved)")
+         else:
+             print(" No issues detected")
+
+         self._emit("chunk_done", chunk_index=chunk_idx)
+
+     def _write_output(self) -> None:
+         """Write generated functions to cleaning_functions.py."""
+         from .output import write_cleaning_file
+
+         try:
+             write_cleaning_file(self.functions)
+         except OutputValidationError as e:
+             print(f" Error: {e}")
+             print(" Attempting to write valid functions only...")
+             # Try writing functions one by one, skipping invalid ones
+             valid_functions = []
+             for f in self.functions:
+                 try:
+                     import ast
+                     ast.parse(f["code"])
+                     valid_functions.append(f)
+                 except SyntaxError:
+                     print(f" Skipping invalid function: {f['name']}")
+             if valid_functions:
+                 write_cleaning_file(valid_functions)
+             else:
+                 print(" No valid functions to write.")
+
+     def _write_report(self) -> None:
+         """Write cleaning report if report_path is set."""
+         if self.report_path is None:
+             return
+
+         from .report import write_report
+
+         # Prepare quality metrics if available
+         quality_before = None
+         quality_after = None
+         if self.metrics_before:
+             quality_before = {
+                 "null_count": self.metrics_before.null_count,
+                 "empty_string_count": self.metrics_before.empty_string_count,
+             }
+         if self.metrics_after:
+             quality_after = {
+                 "null_count": self.metrics_after.null_count,
+                 "empty_string_count": self.metrics_after.empty_string_count,
+             }
+
+         write_report(
+             report_path=self.report_path,
+             file_path=self.file_path,
+             total_chunks=self._total_chunks,
+             functions=self.functions,
+             latency_stats=self._get_latency_summary(),
+             quality_before=quality_before,
+             quality_after=quality_after,
+         )
+
+     def get_improvement_report(self) -> dict | None:
+         """
+         Get a comparison report of before/after quality metrics.
+
+         Returns:
+             Dictionary with improvement statistics, or None if metrics
+             weren't tracked or after metrics aren't available yet.
+         """
+         if self.metrics_before is None:
+             return None
+         if self.metrics_after is None:
+             # Return partial report with just before metrics
+             return {
+                 "status": "incomplete",
+                 "metrics_before": {
+                     "null_count": self.metrics_before.null_count,
+                     "empty_string_count": self.metrics_before.empty_string_count,
+                     "unique_values": self.metrics_before.unique_values,
+                     "total_records": self.metrics_before.total_records,
+                 },
+                 "metrics_after": None,
+             }
+         return compare_quality(self.metrics_before, self.metrics_after)
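
For orientation, the following is a minimal usage sketch of the DataCleaner pipeline above. The import path and the MyBackend stub are assumptions made for illustration and are not part of this diff; the only interface the pipeline requires of a backend is a generate(prompt) method returning the model's text response, as used by call_llm.

# Hypothetical usage sketch: import path and MyBackend are assumed, not part of the package diff.
from recursive_cleaner import DataCleaner  # assumed public import path

class MyBackend:
    """Minimal LLMBackend: the pipeline only ever calls backend.generate(prompt)."""
    def generate(self, prompt: str) -> str:
        ...  # call your model here and return its raw text response

cleaner = DataCleaner(
    llm_backend=MyBackend(),
    file_path="records.jsonl",         # structured mode is auto-detected from the extension
    instructions="Normalize dates and drop empty records",
    state_file="cleaning_state.json",  # enables resuming after an interruption
    on_progress=lambda event: print(event["type"], event["chunk_index"]),
)
cleaner.run()  # writes cleaning_functions.py and cleaning_report.md by default

# A previously interrupted run can be continued from the saved state:
resumed = DataCleaner.resume("cleaning_state.json", llm_backend=MyBackend())
resumed.run()
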
@@ -0,0 +1,27 @@
+ """Context management for docstring registry."""
+
+
+ def build_context(functions: list[dict], max_chars: int = 8000) -> str:
+     """
+     Build context string from generated functions for LLM prompt.
+
+     Uses FIFO eviction - keeps most recent functions that fit within budget.
+
+     Args:
+         functions: List of dicts with 'name' and 'docstring' keys
+         max_chars: Maximum character budget for context
+
+     Returns:
+         Formatted string of function docstrings, or placeholder if empty
+     """
+     if not functions:
+         return "(No functions generated yet)"
+
+     ctx = ""
+     for f in reversed(functions):
+         entry = f"## {f['name']}\n{f['docstring']}\n\n"
+         if len(ctx) + len(entry) > max_chars:
+             break
+         ctx = entry + ctx
+
+     return ctx if ctx else "(No functions generated yet)"
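
As a short illustration of the FIFO budget behaviour described in the docstring, here is a hypothetical example; the module path is assumed from the package layout. Entries are assembled newest-first, and once the character budget is exceeded the oldest docstrings are the ones left out.

# Hypothetical example: module path assumed, sample functions invented for illustration.
from recursive_cleaner.context import build_context

functions = [
    {"name": "strip_whitespace", "docstring": "Trim leading/trailing spaces from string fields."},
    {"name": "fix_dates", "docstring": "Normalize date strings to ISO 8601."},
    {"name": "drop_empty_rows", "docstring": "Remove records where all fields are empty."},
]

print(build_context(functions))                 # all three fit in the default 8000-char budget
print(build_context(functions, max_chars=120))  # only the most recent entries that fit are kept
print(build_context([]))                        # "(No functions generated yet)"
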