mirage-benchmark 1.0.4 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,1003 @@
1
+ """
2
+ Multi-hop Context Completion Agent
3
+
4
+ This agent iteratively verifies chunk completeness and retrieves additional context
5
+ when needed, using a breadth-first approach with depth limits.
6
+
7
+ Architecture:
8
+ - Max Breadth: MAX_BREADTH (default 20) search strings per verification (fewer is better)
9
+ - Max Depth: MAX_DEPTH (default 20) iterations
10
+ - Chunks per Search: Top 2 most relevant chunks
11
+
12
+ This module also includes simple retrieval functions for single-query use cases.
13
+ """
14
+
15
+ import json
16
+ import re
17
+ import torch
18
+ import faiss
19
+ import numpy as np
20
+ from typing import List, Dict, Tuple, Optional, Union
21
+ from pathlib import Path
22
+ from mirage.core.llm import call_vlm_interweaved, setup_logging, batch_call_vlm_interweaved
23
+ from mirage.embeddings.models import NomicVLEmbed as NomicEmbedder
24
+ from mirage.embeddings.rerankers_multimodal import MonoVLMReranker, VLMReranker, TextEmbeddingReranker
25
+ from mirage.core.prompts import PROMPTS_CHUNK
26
+
27
+ # ============================================================================
28
+ # CONFIGURATION
29
+ # ============================================================================
30
+
31
+ # Multi-hop context completion parameters
32
+ # Generous caps - in practice the loop runs until the context is COMPLETE
33
+ MAX_DEPTH = 20 # Maximum iterative search rounds (safety cap)
34
+ MAX_BREADTH = 20 # Maximum search strings per verification (safety cap)
35
+ CHUNKS_PER_SEARCH = 2 # Number of chunks to retrieve per search string
36
+ # Chunk addition mode: "EXPLANATORY" (only direct answers) or "RELATED" (includes related chunks)
37
+ CHUNK_ADDITION_MODE = "RELATED" # Default: include both EXPLANATORY and RELATED chunks
38
+
39
+ # Simple retrieval parameters
40
+ RETRIEVAL_METHOD = "top_k" # Options: "top_k" or "top_p"
41
+ RETRIEVAL_K = 20 # Number of chunks for top-k retrieval (increased for better recall)
42
+ RETRIEVAL_P = 0.9 # Cumulative probability threshold for top-p retrieval
43
+ RERANK_TOP_K = 10 # Number of chunks to rerank (increased proportionally)
44
+ CONTEXT_SIZE = 2 # Number of chunks to use as final context
45
+
46
+ # Paths (configured via main.py or config.yaml)
47
+ EMBEDDINGS_DIR = "output/results/embeddings"
48
+ CHUNKS_FILE = "output/results/chunks.json"
49
+ IMAGE_BASE_DIR = "output/results/markdown"
50
+
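+ # Illustrative sketch (not part of the released module): how a driver script such
+ # as main.py might override the paths above and prime the in-memory caches that
+ # retrieve_and_rerank() looks for (_cached_chunk_index, _cached_chunk_ids,
+ # _cached_embedder, _cached_reranker, _gpu_lock). The attribute names follow the
+ # hasattr/getattr checks below; the module alias and index filename are assumptions
+ # based on the default naming used in this file, and the cached path is only taken
+ # when an importable `main` module sets CACHE_EMBEDDINGS = True.
+ #
+ #     import json, threading, faiss
+ #     import multihop_agent as agent  # hypothetical import name for this file
+ #
+ #     agent.EMBEDDINGS_DIR = "output/results/embeddings"
+ #     agent.CHUNKS_FILE = "output/results/chunks.json"
+ #     agent._cached_chunk_index = faiss.read_index(
+ #         f"{agent.EMBEDDINGS_DIR}/bge_m3_index.faiss")
+ #     with open(f"{agent.EMBEDDINGS_DIR}/bge_m3_metadata.json") as f:
+ #         agent._cached_chunk_ids = json.load(f)["chunk_ids"]
+ #     agent._gpu_lock = threading.Lock()
+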
51
+ # ============================================================================
52
+ # UTILITY FUNCTIONS
53
+ # ============================================================================
54
+
55
+ def extract_image_paths(artifact_text, file_name=None):
56
+ """Extract all image paths from artifact field
57
+
58
+ Args:
59
+ artifact_text: The artifact string (e.g., "![Image](path1) ![Image](path2)")
60
+ file_name: The source document name (used for directory structure)
61
+
62
+ Returns:
63
+ List of absolute image paths (empty list if no artifacts found)
64
+ Note: Returns paths even if files don't exist (for multimodal detection)
65
+ """
66
+ if artifact_text == "None" or not artifact_text:
67
+ return []
68
+
69
+ # Find all image references: ![Image](path) or ![alt](path)
70
+ matches = re.findall(r'!\[[^\]]*\]\(([^)]+)\)', artifact_text)
71
+
72
+ image_paths = []
73
+ for rel_path in matches:
74
+ # If file_name is provided, assume images are in a subdirectory named after the file
75
+ if file_name and file_name != 'unknown':
76
+ # Structure: IMAGE_BASE_DIR / file_name / rel_path
77
+ abs_path = f"{IMAGE_BASE_DIR}/{file_name}/{rel_path}"
78
+ else:
79
+ # Fallback to direct concatenation (legacy behavior)
80
+ abs_path = f"{IMAGE_BASE_DIR}/{rel_path}"
81
+
82
+ image_paths.append(abs_path)
83
+
84
+ return image_paths
85
+
86
+
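+ # Example (illustrative, based on the regex and IMAGE_BASE_DIR above; the document
+ # name and image filenames are invented):
+ #
+ #     >>> extract_image_paths("![Image](figs/a.png) ![Image](figs/b.png)", "iec_60034")
+ #     ['output/results/markdown/iec_60034/figs/a.png',
+ #      'output/results/markdown/iec_60034/figs/b.png']
+ #     >>> extract_image_paths("None")
+ #     []
+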
87
+ def extract_image_path(artifact_text, file_name=None):
88
+ """Extract first image path from artifact field (for backward compatibility)
89
+
90
+ Args:
91
+ artifact_text: The artifact string (e.g., "![Image](path)")
92
+ file_name: The source document name (used for directory structure)
93
+
94
+ Returns:
95
+ Absolute path to first image, or None if no artifact found
96
+ Note: Returns path even if file doesn't exist (for multimodal detection)
97
+ """
98
+ paths = extract_image_paths(artifact_text, file_name)
99
+ return paths[0] if paths else None
100
+
101
+
102
+ # ============================================================================
103
+ # SIMPLE RETRIEVAL FUNCTIONS
104
+ # ============================================================================
105
+
106
+ def retrieve_and_rerank(query: str, model_name: str = None,
107
+ retrieval_method: str = "top_k", retrieval_k: int = 10, retrieval_p: float = 0.9,
108
+ rerank_top_k: int = 5, context_size: int = 2):
109
+ """Retrieve top chunks and rerank them (simple single-query retrieval)
110
+
111
+ Args:
112
+ query: Query string
113
+ model_name: Embedding model name
114
+ retrieval_method: "top_k" or "top_p"
115
+ retrieval_k: Number of chunks for top-k retrieval
116
+ retrieval_p: Cumulative probability threshold for top-p retrieval
117
+ rerank_top_k: Number of chunks to rerank
118
+ context_size: Number of chunks to use as final context
119
+
120
+ Returns:
121
+ List of tuples: (orig_idx, relevance, chunk_dict)
122
+ """
123
+
124
+ # Use cached embeddings and index if available (much faster)
125
+ import sys
126
+ this_module = sys.modules[__name__]
127
+
128
+ # Get model name from main's config if not specified
129
+ if model_name is None:
130
+ try:
131
+ import main
132
+ model_name = getattr(main, 'EMBEDDING_MODEL', 'bge_m3')
133
+ except Exception:
134
+ model_name = 'bge_m3' # Default fallback
135
+
136
+ # Check if caching is enabled and cached embeddings/index are available
137
+ cache_enabled = False
138
+ try:
139
+ import main
140
+ cache_enabled = hasattr(main, 'CACHE_EMBEDDINGS') and main.CACHE_EMBEDDINGS
141
+ except Exception:
142
+ pass
143
+
144
+ # Check for cached embeddings and index in memory (only if caching is enabled)
145
+ if (cache_enabled and
146
+ hasattr(this_module, '_cached_chunk_index') and
147
+ this_module._cached_chunk_index is not None and
148
+ hasattr(this_module, '_cached_chunk_ids') and
149
+ this_module._cached_chunk_ids is not None):
150
+
151
+ # Use cached in-memory index and embeddings
152
+ index = this_module._cached_chunk_index
153
+ chunk_ids = this_module._cached_chunk_ids
154
+
155
+ # Load chunks (still need to load for content)
156
+ print(f"Loading chunks from {CHUNKS_FILE}...")
157
+ with open(CHUNKS_FILE, 'r') as f:
158
+ chunks = json.load(f)
159
+ print(f"Loaded {len(chunks)} chunks")
160
+ print(f"✅ Using cached embeddings index in memory (fast retrieval)")
161
+
162
+ # Use cached embedder
163
+ if hasattr(this_module, '_cached_embedder') and this_module._cached_embedder is not None:
164
+ embedder = this_module._cached_embedder
165
+ else:
166
+ print(f"Loading {model_name} embedder...")
167
+ embedder = NomicEmbedder()
168
+
169
+ # Embed query only (chunk embeddings already cached)
170
+ # Use GPU lock if available for thread-safe access
171
+ # Use convert_to_numpy=True to avoid device mismatch when model is on CPU
172
+ gpu_lock = getattr(this_module, '_gpu_lock', None)
173
+ if gpu_lock:
174
+ with gpu_lock:
175
+ query_embedding = embedder.encode(query, convert_to_numpy=True)
176
+ else:
177
+ query_embedding = embedder.encode(query, convert_to_numpy=True)
178
+ if isinstance(query_embedding, torch.Tensor):
179
+ query_array = query_embedding.cpu().float().numpy()
180
+ else:
181
+ query_array = np.array(query_embedding, dtype=np.float32)
182
+ if query_array.ndim == 1:
183
+ query_array = query_array.reshape(1, -1)
184
+ faiss.normalize_L2(query_array)
185
+ else:
186
+ # Fallback to disk-based loading
187
+ print(f"Loading chunks from {CHUNKS_FILE}...")
188
+ with open(CHUNKS_FILE, 'r') as f:
189
+ chunks = json.load(f)
190
+ print(f"Loaded {len(chunks)} chunks")
191
+
192
+ # Load FAISS index from disk
193
+ index_path = f"{EMBEDDINGS_DIR}/{model_name}_index.faiss"
194
+ print(f"Loading FAISS index from {index_path}...")
195
+ index = faiss.read_index(index_path)
196
+
197
+ # Load metadata
198
+ metadata_path = f"{EMBEDDINGS_DIR}/{model_name}_metadata.json"
199
+ with open(metadata_path, 'r') as f:
200
+ metadata = json.load(f)
201
+ chunk_ids = metadata['chunk_ids']
202
+
203
+ # Use cached embedder if available, otherwise load new one
204
+ if hasattr(this_module, '_cached_embedder') and this_module._cached_embedder is not None:
205
+ embedder = this_module._cached_embedder
206
+ else:
207
+ print(f"Loading {model_name} embedder...")
208
+ embedder = NomicEmbedder()
209
+
210
+ # Embed query with GPU lock for thread-safe access
211
+ # Use convert_to_numpy=True to avoid device mismatch when model is on CPU
212
+ print(f"Embedding query...")
213
+ gpu_lock = getattr(this_module, '_gpu_lock', None)
214
+ if gpu_lock:
215
+ with gpu_lock:
216
+ query_embedding = embedder.encode(query, convert_to_numpy=True)
217
+ else:
218
+ query_embedding = embedder.encode(query, convert_to_numpy=True)
219
+ if isinstance(query_embedding, torch.Tensor):
220
+ query_array = query_embedding.cpu().float().numpy()
221
+ else:
222
+ query_array = np.array(query_embedding, dtype=np.float32)
223
+ if query_array.ndim == 1:
224
+ query_array = query_array.reshape(1, -1)
225
+ faiss.normalize_L2(query_array)
226
+
227
+ # Retrieve based on method
228
+ if retrieval_method == "top_k":
229
+ print(f"\nRetrieving top-{retrieval_k} chunks...")
230
+ scores, indices = index.search(query_array, retrieval_k)
231
+ num_retrieve = retrieval_k
232
+ elif retrieval_method == "top_p":
233
+ # Retrieve more candidates initially for top-p filtering
234
+ max_candidates = min(100, len(chunk_ids))
235
+ scores, indices = index.search(query_array, max_candidates)
236
+
237
+ # Apply top-p filtering
238
+ scores_flat = scores[0]
239
+ # Convert cosine similarity to probability distribution
240
+ probs = np.exp(scores_flat) / np.sum(np.exp(scores_flat))
241
+ cumsum = np.cumsum(probs)
242
+ cutoff_idx = np.searchsorted(cumsum, retrieval_p) + 1
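+ # Illustrative numbers: for similarity scores [0.9, 0.5, 0.1] the softmax gives
+ # roughly [0.47, 0.32, 0.21], so the cumulative sum [0.47, 0.79, 1.00] first
+ # crosses retrieval_p = 0.9 at the third entry and all three chunks are kept.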
243
+
244
+ scores = scores[:, :cutoff_idx]
245
+ indices = indices[:, :cutoff_idx]
246
+ num_retrieve = cutoff_idx
247
+ print(f"\nRetrieving top-p (p={retrieval_p}) chunks: {num_retrieve} chunks selected...")
248
+ else:
249
+ raise ValueError(f"Unknown retrieval method: {retrieval_method}")
250
+
251
+ # Get retrieved chunks
252
+ retrieved_chunks = []
253
+ for idx, score in zip(indices[0], scores[0]):
254
+ chunk_id = chunk_ids[idx]
255
+ chunk = chunks[int(chunk_id) - 1] # chunk_id is 1-indexed
256
+
257
+ # Extract file_name (source document) if available
258
+ file_name = chunk.get('file_name', 'unknown')
259
+
260
+ # Extract all image paths from artifact field (chunks.json contains artifact field)
261
+ artifact_text = chunk.get('artifact', 'None')
262
+ artifact_paths = extract_image_paths(artifact_text, file_name) # List of image paths
263
+ image_path = artifact_paths[0] if artifact_paths else None # First image for backward compatibility
264
+
265
+ retrieved_chunks.append({
266
+ 'text': chunk['content'],
267
+ 'artifact': artifact_paths, # List of image paths (zero, one, or more)
268
+ 'image_path': image_path, # First image path for backward compatibility
269
+ 'chunk_id': chunk_id,
270
+ 'file_name': file_name, # Track source document
271
+ 'score': float(score)
272
+ })
273
+
274
+ print(f"\nTop {num_retrieve} Retrieved Chunks (before reranking):")
275
+ for i, chunk in enumerate(retrieved_chunks, 1):
276
+ file_name = chunk.get('file_name', 'unknown')
277
+ print(f"{i}. Document: {file_name}, Chunk ID: {chunk['chunk_id']}, Score: {chunk['score']:.4f}")
278
+ print(f" Text: {chunk['text'][:150]}...")
279
+ print(f" Has image: {chunk['image_path'] is not None}")
280
+
281
+ # Rerank using MonoVLM (can swap with VLMReranker or TextEmbeddingReranker)
282
+ print(f"\n{'='*60}")
283
+ print(f"Reranking with MonoVLM to get top {rerank_top_k}...")
284
+ print(f"{'='*60}")
285
+
286
+ # Use cached reranker if available
287
+ this_module = sys.modules[__name__]
288
+ if hasattr(this_module, '_cached_reranker') and this_module._cached_reranker is not None:
289
+ reranker = this_module._cached_reranker
290
+ print(f"✅ Using cached MonoVLM reranker")
291
+ else:
292
+ reranker = MonoVLMReranker()
293
+
294
+ # Use GPU lock if available for thread-safe access
295
+ gpu_lock = getattr(this_module, '_gpu_lock', None)
296
+ if gpu_lock:
297
+ with gpu_lock:
298
+ reranked_results = reranker.rerank(query, retrieved_chunks, top_k=rerank_top_k)
299
+ else:
300
+ reranked_results = reranker.rerank(query, retrieved_chunks, top_k=rerank_top_k)
301
+
302
+ print(f"\nTop {rerank_top_k} Reranked Chunks:")
303
+ for i, (orig_idx, relevance, chunk) in enumerate(reranked_results, 1):
304
+ file_name = chunk.get('file_name', 'unknown')
305
+ print(f"{i}. Document: {file_name}, Chunk ID: {chunk['chunk_id']}, Relevance: {relevance:.4f}")
306
+ print(f" Text: {chunk['text'][:150]}...")
307
+ print(f" Has image: {chunk['image_path'] is not None}")
308
+
309
+ # Get final context
310
+ final_context = reranked_results[:context_size]
311
+
312
+ return final_context
313
+
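+ # Illustrative usage sketch (not executed here): retrieve_and_rerank() returns
+ # (orig_idx, relevance, chunk) tuples, so a caller can unpack them directly.
+ # The query string below is invented.
+ #
+ #     results = retrieve_and_rerank(
+ #         "stator winding insulation test voltage",
+ #         retrieval_method="top_k", retrieval_k=20,
+ #         rerank_top_k=10, context_size=2)
+ #     for orig_idx, relevance, chunk in results:
+ #         print(chunk['chunk_id'], relevance, chunk['text'][:80])
+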
314
+ # ============================================================================
315
+ # MULTI-HOP CONTEXT COMPLETION FUNCTIONS
316
+ # ============================================================================
317
+
318
+ def parse_verification_response(response: str) -> Tuple[str, Optional[List[str]], str]:
319
+ """Parse the LLM verification response
320
+
321
+ Returns:
322
+ (status, search_strings, explanation)
323
+ status: "COMPLETE" or "INCOMPLETE"
324
+ search_strings: List of search strings if incomplete, None if complete
325
+ explanation: Explanation text
326
+ """
327
+ # Extract status
328
+ status_match = re.search(r'Status:\s*(COMPLETE|INCOMPLETE)', response, re.IGNORECASE)
329
+ if not status_match:
330
+ raise ValueError(f"Could not parse status from response: {response}")
331
+
332
+ status = status_match.group(1).upper()
333
+
334
+ # Extract explanation first (needed for fallback)
335
+ explanation_match = re.search(r'Explanation:\s*(.+?)(?:\n\n|$)', response, re.DOTALL)
336
+ explanation = explanation_match.group(1).strip() if explanation_match else ""
337
+
338
+ # Extract search strings (if incomplete)
339
+ search_strings = None
340
+ if status == "INCOMPLETE":
341
+ query_match = re.search(r'Query:\s*([^,]+?)(?:,\s*Explanation:|$)', response, re.DOTALL)
342
+ if query_match:
343
+ query_text = query_match.group(1).strip()
344
+ # Split by pipe character
345
+ search_strings = [s.strip() for s in query_text.split('|') if s.strip()]
346
+
347
+ # Fallback: generate search query from explanation if Query was missing
348
+ if not search_strings and explanation:
349
+ # Extract Figure/Table/Annex references from explanation
350
+ # Non-capturing group so findall returns the full reference (e.g. "Figure 3.2"), not just the keyword
+ refs = re.findall(r'(?:Figure|Table|Annex|Formula)\s*[A-Z]?\.?\d+(?:\.\d+)?', explanation, re.IGNORECASE)
351
+ if refs:
352
+ # Create search queries from references
353
+ search_strings = refs[:5] # Limit to 5 references
354
+ print(f" ⚠️ Fallback: Generated search strings from explanation: {search_strings}")
355
+
356
+ return status, search_strings, explanation
357
+
358
+
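+ # Example of a response this parser accepts (shape inferred from the regexes above;
+ # the wording is invented, the actual format is defined by PROMPTS_CHUNK):
+ #
+ #     Status: INCOMPLETE
+ #     Query: Figure 12 efficiency curve | Table 5 load points, Explanation: Figure 12 and Table 5 are cited but not included.
+ #
+ # parse_verification_response() would then return
+ #     ("INCOMPLETE",
+ #      ["Figure 12 efficiency curve", "Table 5 load points"],
+ #      "Figure 12 and Table 5 are cited but not included.")
+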
359
+ def verify_chunk_completeness(chunks: List[Dict], expert_persona: str,
360
+ domain: str) -> Tuple[str, Optional[List[str]], str]:
361
+ """Verify if a set of chunks provide complete context using VLM (interweaved)
362
+
363
+ Args:
364
+ chunks: List of chunks to verify
365
+ expert_persona: Expert role for domain-specific evaluation
366
+ domain: Domain context for evaluation
367
+
368
+ Returns:
369
+ (status, search_strings, explanation)
370
+ """
371
+ prompt_template = PROMPTS_CHUNK["completion_verification"]
372
+ prompt = prompt_template.format(expert_persona=expert_persona, domain=domain)
373
+ prompt += "\n\nAnalyze the following chunks to determine if the context is complete:"
374
+
375
+ response = call_vlm_interweaved(prompt, chunks)
376
+ return parse_verification_response(response)
377
+
378
+
379
+ def parse_addition_verification_response(response: str) -> Tuple[str, str]:
380
+ """Parse the chunk addition verification response
381
+
382
+ Returns:
383
+ (status, explanation)
384
+ status: "EXPLANATORY", "RELATED", or "UNRELATED"
385
+ explanation: Explanation text
386
+ """
387
+ # Extract status - try new format first, then fall back to old format for compatibility
388
+ status_match = re.search(r'Status:\s*(EXPLANATORY|RELATED|UNRELATED)', response, re.IGNORECASE)
389
+ if not status_match:
390
+ # Fallback: check for old HELPFUL/NOT_HELPFUL format
391
+ old_status_match = re.search(r'Status:\s*(HELPFUL|NOT_HELPFUL)', response, re.IGNORECASE)
392
+ if old_status_match:
393
+ old_status = old_status_match.group(1).upper()
394
+ # Map old format to new format
395
+ status = "EXPLANATORY" if old_status == "HELPFUL" else "UNRELATED"
396
+ else:
397
+ # Default to UNRELATED if parsing fails
398
+ return "UNRELATED", f"Could not parse response: {response[:200]}"
399
+ else:
400
+ status = status_match.group(1).upper()
401
+
402
+ # Extract explanation
403
+ explanation_match = re.search(r'Explanation:\s*(.+?)(?:\n\n|$)', response, re.DOTALL)
404
+ explanation = explanation_match.group(1).strip() if explanation_match else ""
405
+
406
+ return status, explanation
407
+
408
+
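+ # Example of an accepted classification response (format inferred from the regexes
+ # above; wording invented):
+ #
+ #     Status: EXPLANATORY
+ #     Explanation: The candidate contains the Table 5 values the original chunk refers to.
+ #
+ # -> ("EXPLANATORY", "The candidate contains the Table 5 values the original chunk refers to.")
+ # Legacy "Status: HELPFUL" / "Status: NOT_HELPFUL" responses are mapped to
+ # EXPLANATORY / UNRELATED respectively.
+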
409
+ def verify_chunk_addition(original_chunks: List[Dict], search_query: str,
410
+ candidate_chunk: Dict, expert_persona: str,
411
+ domain: str) -> Tuple[str, str]:
412
+ """Verify if a candidate chunk should be added to the context for QA generation
413
+
414
+ Args:
415
+ original_chunks: List of current chunks (the original context being built)
416
+ search_query: The search query that was used to find the candidate
417
+ candidate_chunk: The retrieved chunk being considered for addition
418
+ expert_persona: Expert role for domain-specific evaluation
419
+ domain: Domain context for evaluation
420
+
421
+ Returns:
422
+ (status, explanation)
423
+ status: "EXPLANATORY", "RELATED", or "UNRELATED"
424
+ explanation: Classification reasoning
425
+ """
426
+ prompt_template = PROMPTS_CHUNK["chunk_addition_verification"]
427
+ prompt = prompt_template.format(expert_persona=expert_persona, domain=domain)
428
+
429
+ # Build the original chunk content for the prompt
430
+ original_content = "\n\n---\n\n".join([c.get('content', '') for c in original_chunks])
431
+ candidate_content = candidate_chunk.get('content', '')
432
+
433
+ # Format the verification request
434
+ verification_request = f"""
435
+ ORIGINAL CHUNK:
436
+ {original_content}
437
+
438
+ SEARCH QUERY: {search_query}
439
+
440
+ CANDIDATE CHUNK:
441
+ {candidate_content}
442
+ """
443
+
444
+ # Prepare chunks for VLM call (include images if present)
445
+ chunks_for_vlm = []
446
+
447
+ # Add original chunks with their images
448
+ for chunk in original_chunks:
449
+ chunks_for_vlm.append({
450
+ 'content': f"[ORIGINAL CONTEXT]\n{chunk.get('content', '')}",
451
+ 'image_path': chunk.get('image_path')
452
+ })
453
+
454
+ # Add candidate chunk with its image
455
+ chunks_for_vlm.append({
456
+ 'content': f"[CANDIDATE CHUNK for query: {search_query}]\n{candidate_content}",
457
+ 'image_path': candidate_chunk.get('image_path')
458
+ })
459
+
460
+ full_prompt = prompt + "\n\n" + verification_request
461
+
462
+ response = call_vlm_interweaved(full_prompt, chunks_for_vlm)
463
+ return parse_addition_verification_response(response)
464
+
465
+
466
+ def batch_verify_chunk_additions(original_chunks: List[Dict],
467
+ candidates: List[Tuple[str, Dict]],
468
+ expert_persona: str,
469
+ domain: str) -> List[Tuple[str, str]]:
470
+ """Batch verify multiple candidate chunks using concurrent API calls
471
+
472
+ Args:
473
+ original_chunks: List of current chunks (the original context being built)
474
+ candidates: List of (search_query, candidate_chunk) tuples
475
+ expert_persona: Expert role for domain-specific evaluation
476
+ domain: Domain context for evaluation
477
+
478
+ Returns:
479
+ List of (status, explanation) tuples in same order as candidates
480
+ """
481
+ if not candidates:
482
+ return []
483
+
484
+ prompt_template = PROMPTS_CHUNK["chunk_addition_verification"]
485
+ prompt_base = prompt_template.format(expert_persona=expert_persona, domain=domain)
486
+ original_content = "\n\n---\n\n".join([c.get('content', '') for c in original_chunks])
487
+
488
+ # Prepare batch requests
489
+ requests = []
490
+ for search_query, candidate_chunk in candidates:
491
+ candidate_content = candidate_chunk.get('content', '')
492
+
493
+ verification_request = f"""
494
+ ORIGINAL CHUNK:
495
+ {original_content}
496
+
497
+ SEARCH QUERY: {search_query}
498
+
499
+ CANDIDATE CHUNK:
500
+ {candidate_content}
501
+ """
502
+
503
+ # Prepare chunks for VLM call
504
+ chunks_for_vlm = []
505
+ for chunk in original_chunks:
506
+ chunks_for_vlm.append({
507
+ 'content': f"[ORIGINAL CONTEXT]\n{chunk.get('content', '')}",
508
+ 'image_path': chunk.get('image_path')
509
+ })
510
+ chunks_for_vlm.append({
511
+ 'content': f"[CANDIDATE CHUNK for query: {search_query}]\n{candidate_content}",
512
+ 'image_path': candidate_chunk.get('image_path')
513
+ })
514
+
515
+ full_prompt = prompt_base + "\n\n" + verification_request
516
+ requests.append((full_prompt, chunks_for_vlm))
517
+
518
+ # Execute batch call
519
+ print(f" ⚡ Batch verifying {len(requests)} candidates...")
520
+ responses = batch_call_vlm_interweaved(requests, show_progress=False)
521
+
522
+ # Parse all responses
523
+ results = []
524
+ for response in responses:
525
+ if response and not response.startswith("ERROR:"):
526
+ results.append(parse_addition_verification_response(response))
527
+ else:
528
+ results.append(("UNRELATED", f"Error: {response}"))
529
+
530
+ return results
531
+
532
+
533
+ def retrieve_chunks_for_query(query: str, top_k: int = 2) -> List[Dict]:
534
+ """Retrieve top-k chunks for a search query
535
+
536
+ Returns:
537
+ List of chunks with 'text', 'chunk_id', 'image_path', 'score'
538
+ """
539
+ # Use the existing retrieve_and_rerank function
540
+ results = retrieve_and_rerank(
541
+ query=query,
542
+ # model_name uses config default (EMBEDDING_MODEL)
543
+ retrieval_method="top_k",
544
+ retrieval_k=10, # Retrieve more, then rerank
545
+ rerank_top_k=5,
546
+ context_size=top_k
547
+ )
548
+
549
+ # Convert to our format
550
+ # Note: chunks from retrieve_and_rerank already have artifact (list) and image_path set
551
+ chunks = []
552
+ for orig_idx, relevance, chunk in results:
553
+ chunks.append({
554
+ 'content': chunk['text'],
555
+ 'chunk_id': chunk['chunk_id'],
556
+ 'file_name': chunk.get('file_name', 'unknown'),
557
+ 'artifact': chunk.get('artifact', []), # List of image paths
558
+ 'image_path': chunk.get('image_path'), # First image path for backward compatibility
559
+ 'score': relevance
560
+ })
561
+
562
+ return chunks
563
+
564
+
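+ # Illustrative shape of one chunk returned by retrieve_chunks_for_query()
+ # (keys match the dict built above; all values are invented):
+ #
+ #     {'content': 'Table 5 - Load points ...',
+ #      'chunk_id': '42',
+ #      'file_name': 'iec_60034',
+ #      'artifact': ['output/results/markdown/iec_60034/figs/table5.png'],
+ #      'image_path': 'output/results/markdown/iec_60034/figs/table5.png',
+ #      'score': 0.87}
+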
565
+ def build_complete_context(
566
+ initial_chunk: Union[str, Dict],
567
+ max_depth: int = MAX_DEPTH,
568
+ max_breadth: int = MAX_BREADTH,
569
+ chunks_per_search: int = CHUNKS_PER_SEARCH,
570
+ expert_persona: str = None,
571
+ domain: str = None,
572
+ log_details: bool = True,
573
+ chunk_addition_mode: str = CHUNK_ADDITION_MODE
574
+ ) -> Dict:
575
+ """Build complete context through iterative verification and retrieval
576
+
577
+ Args:
578
+ initial_chunk: The starting chunk (string or dict)
579
+ max_depth: Maximum iterations
580
+ max_breadth: Maximum search strings per verification
581
+ chunks_per_search: Number of chunks to retrieve per search string
582
+ expert_persona: Expert role for domain-specific evaluation
583
+ domain: Domain context for evaluation
584
+ log_details: Whether to include detailed iteration logs in output
585
+ chunk_addition_mode: "EXPLANATORY" (only direct answers) or "RELATED" (both)
586
+
587
+ Returns:
588
+ Dict with:
589
+ - 'status': 'COMPLETE' or 'INCOMPLETE'
590
+ - 'context': Combined context text
591
+ - 'chunks': List of chunk dicts used
592
+ - 'depth': Final depth reached
593
+ - 'chunks_added': List of chunk IDs added
594
+ - 'search_history': List of search strings used
595
+ - 'iteration_logs': Detailed logs per iteration (if log_details=True)
596
+ """
597
+ print("="*80)
598
+ print("MULTI-HOP CONTEXT COMPLETION")
599
+ print(f"Chunk Addition Mode: {chunk_addition_mode}")
600
+ print("="*80)
601
+
602
+ # Determine which statuses to accept based on mode
603
+ if chunk_addition_mode.upper() == "EXPLANATORY":
604
+ accepted_statuses = ("EXPLANATORY",)
605
+ else: # Default to RELATED (accept both)
606
+ accepted_statuses = ("EXPLANATORY", "RELATED")
607
+
608
+ # Initialize
609
+ if isinstance(initial_chunk, str):
610
+ # Fallback if string passed
611
+ current_chunks = [{
612
+ 'content': initial_chunk,
613
+ 'artifact': [], # Empty list - no images
614
+ 'image_path': None,
615
+ 'chunk_id': 'initial',
616
+ 'file_name': 'unknown'
617
+ }]
618
+ else:
619
+ # Expect dict with content and artifact/image_path
620
+ chunk_content = initial_chunk.get('content', '')
621
+ file_name = initial_chunk.get('file_name', 'unknown')
622
+
623
+ # Extract artifact as list of image paths
624
+ artifact_text = initial_chunk.get('artifact', 'None')
625
+ artifact_paths = extract_image_paths(artifact_text, file_name) # List of image paths
626
+
627
+ # For backward compatibility, set image_path to first image
628
+ image_path = artifact_paths[0] if artifact_paths else initial_chunk.get('image_path')
629
+
630
+ current_chunks = [{
631
+ 'content': chunk_content,
632
+ 'artifact': artifact_paths, # List of image paths (zero, one, or more)
633
+ 'image_path': image_path, # First image path for backward compatibility
634
+ 'chunk_id': initial_chunk.get('chunk_id', 'initial'),
635
+ 'file_name': file_name # Track source document
636
+ }]
637
+
638
+ depth = 0
639
+ chunks_added = [] # List of dicts: {'file_name': str, 'chunk_id': str}
640
+ search_history = []
641
+ max_breadth_used = 0 # Track maximum search strings used in any iteration
642
+ iteration_logs = [] # Detailed logs for each iteration
643
+
644
+ # Helper to get combined text for return value
645
+ def get_combined_text(chunks):
646
+ return "\n\n".join([c['content'] for c in chunks])
647
+
648
+ # Helper to check if chunk already added (by file_name + chunk_id)
649
+ def is_chunk_added(file_name, chunk_id):
650
+ return any(c.get('file_name') == file_name and c.get('chunk_id') == chunk_id
651
+ for c in chunks_added)
652
+
653
+ # Helper to add chunk to tracking
654
+ def add_chunk_to_tracking(file_name, chunk_id):
655
+ chunks_added.append({'file_name': file_name, 'chunk_id': chunk_id})
656
+
657
+ # Track initial chunk if it has file_name
658
+ initial_file_name = None
659
+ if isinstance(initial_chunk, dict) and 'file_name' in initial_chunk:
660
+ initial_file_name = initial_chunk['file_name']
661
+ elif isinstance(initial_chunk, dict) and 'chunk_id' in initial_chunk:
662
+ # Try to get file_name from the chunk if available
663
+ initial_file_name = initial_chunk.get('file_name', 'unknown')
664
+
665
+ if initial_file_name:
666
+ initial_chunk_id = current_chunks[0].get('chunk_id', 'initial')
667
+ add_chunk_to_tracking(initial_file_name, initial_chunk_id)
668
+
669
+ print(f"\n📄 Initial chunk length: {len(current_chunks[0]['content'])} chars")
670
+
671
+ while depth < max_depth:
672
+ depth += 1
673
+ print(f"\n{'='*80}")
674
+ print(f"DEPTH {depth}/{max_depth}")
675
+ print(f"{'='*80}")
676
+
677
+ # Initialize iteration log
678
+ iter_log = {
679
+ 'depth': depth,
680
+ 'status': None,
681
+ 'explanation': None,
682
+ 'search_strings': None,
683
+ 'retrieved_chunks': [],
684
+ 'chunks_added_this_iteration': []
685
+ }
686
+
687
+ # Verify completeness
688
+ print(f"\n🔍 Verifying chunk completeness...")
689
+ status, search_strings, explanation = verify_chunk_completeness(
690
+ current_chunks, expert_persona=expert_persona, domain=domain
691
+ )
692
+
693
+ print(f"Status: {status}")
694
+ print(f"Explanation: {explanation}")
695
+
696
+ # Log verification result
697
+ iter_log['status'] = status
698
+ iter_log['explanation'] = explanation
699
+ iter_log['search_strings'] = search_strings
700
+
701
+ if status == "COMPLETE":
702
+ hop_count = len(chunks_added) - 1 # Each hop adds one link
703
+ print(f"\n✅ Context is COMPLETE at depth {depth}, hop_count={hop_count}")
704
+ iteration_logs.append(iter_log)
705
+ return {
706
+ 'status': 'COMPLETE',
707
+ 'context': get_combined_text(current_chunks),
708
+ 'chunks': current_chunks,
709
+ 'depth': depth,
710
+ 'hop_count': hop_count,
711
+ 'max_breadth_used': max_breadth_used,
712
+ 'chunks_added': chunks_added,
713
+ 'search_history': search_history,
714
+ 'termination_reason': 'Context verified as complete',
715
+ 'iteration_logs': iteration_logs if log_details else None
716
+ }
717
+
718
+ # Context is incomplete
719
+ if not search_strings:
720
+ print(f"\n⚠️ No search strings generated despite INCOMPLETE status. Stopping.")
721
+ hop_count = len(chunks_added) - 1
722
+ iteration_logs.append(iter_log)
723
+ return {
724
+ 'status': 'INCOMPLETE_NO_SEARCH_STRINGS',
725
+ 'context': get_combined_text(current_chunks),
726
+ 'chunks': current_chunks,
727
+ 'depth': depth,
728
+ 'hop_count': hop_count,
729
+ 'max_breadth_used': max_breadth_used,
730
+ 'chunks_added': chunks_added,
731
+ 'search_history': search_history,
732
+ 'termination_reason': 'LLM marked chunk as INCOMPLETE but failed to generate search strings',
733
+ 'iteration_logs': iteration_logs if log_details else None
734
+ }
735
+
736
+ # Limit breadth and track max used
737
+ search_strings = search_strings[:max_breadth]
738
+ max_breadth_used = max(max_breadth_used, len(search_strings))
739
+ print(f"\n🔎 Generated {len(search_strings)} search strings:")
740
+ for i, s in enumerate(search_strings, 1):
741
+ print(f" {i}. {s}")
742
+
743
+ # Retrieve and verify chunks for each search string - BATCH PROCESSING
744
+ print(f"\n📥 Retrieving top {chunks_per_search} chunks per search string...")
745
+ new_chunks = []
746
+ any_relevant_found = False # Track if ANY chunk was verified as EXPLANATORY or RELATED
747
+
748
+ # Phase 1: Collect all candidates to verify
749
+ candidates_to_verify = [] # List of (search_string, chunk) tuples
750
+ candidate_info = [] # For logging
751
+
752
+ for i, search_string in enumerate(search_strings, 1):
753
+ print(f"\n Search {i}/{len(search_strings)}: {search_string}")
754
+ search_history.append(search_string)
755
+
756
+ try:
757
+ retrieved = retrieve_chunks_for_query(search_string, top_k=chunks_per_search)
758
+
759
+ for j, chunk in enumerate(retrieved, 1):
760
+ chunk_id = chunk['chunk_id']
761
+ file_name = chunk.get('file_name', 'unknown')
762
+
763
+ # Check if chunk already added (by file_name + chunk_id combination)
764
+ initial_chunk_id = current_chunks[0].get('chunk_id', 'initial')
765
+ initial_file_name = current_chunks[0].get('file_name', 'unknown')
766
+
767
+ if (not is_chunk_added(file_name, chunk_id) and
768
+ not (file_name == initial_file_name and chunk_id == initial_chunk_id)):
769
+ candidates_to_verify.append((search_string, chunk))
770
+ candidate_info.append((file_name, chunk_id, chunk['score']))
771
+ print(f" 📋 Queued chunk {file_name}:{chunk_id} (score: {chunk['score']:.4f}) for verification")
772
+ else:
773
+ print(f" ⊘ Chunk {file_name}:{chunk_id} already added")
774
+
775
+ except Exception as e:
776
+ print(f" ⚠️ Error retrieving chunks: {e}")
777
+
778
+ # Phase 2: Batch verify all candidates
779
+ if candidates_to_verify:
780
+ if len(candidates_to_verify) > 1:
781
+ # Use batch verification for multiple candidates
782
+ verification_results = batch_verify_chunk_additions(
783
+ current_chunks, candidates_to_verify,
784
+ expert_persona=expert_persona, domain=domain
785
+ )
786
+ else:
787
+ # Single candidate - use regular verification
788
+ search_string, chunk = candidates_to_verify[0]
789
+ status, explanation = verify_chunk_addition(
790
+ current_chunks, search_string, chunk,
791
+ expert_persona=expert_persona, domain=domain
792
+ )
793
+ verification_results = [(status, explanation)]
794
+
795
+ # Process results and log
796
+ for (search_string, chunk), (status, explanation), (file_name, chunk_id, score) in zip(
797
+ candidates_to_verify, verification_results, candidate_info
798
+ ):
799
+ # Log retrieved chunk info
800
+ chunk_log = {
801
+ 'search_query': search_string,
802
+ 'chunk_id': chunk_id,
803
+ 'file_name': file_name,
804
+ 'score': float(score),
805
+ 'verdict': status,
806
+ 'reason': explanation[:200] if explanation else ''
807
+ }
808
+ iter_log['retrieved_chunks'].append(chunk_log)
809
+
810
+ if status in accepted_statuses:
811
+ add_chunk_to_tracking(file_name, chunk_id)
812
+ # Add classification tag to chunk for context building
813
+ chunk['classification'] = status
814
+ new_chunks.append(chunk)
815
+ any_relevant_found = True
816
+ iter_log['chunks_added_this_iteration'].append({
817
+ 'file_name': file_name,
818
+ 'chunk_id': chunk_id,
819
+ 'classification': status
820
+ })
821
+ emoji = "🎯" if status == "EXPLANATORY" else "🔗"
822
+ print(f" {emoji} {status}: {file_name}:{chunk_id} - {explanation[:100]}...")
823
+ elif status == "RELATED" and chunk_addition_mode.upper() == "EXPLANATORY":
824
+ # Log RELATED chunks that were skipped due to mode
825
+ print(f" ⏭️ RELATED (skipped - mode={chunk_addition_mode}): {file_name}:{chunk_id} - {explanation[:100]}...")
826
+ else:
827
+ print(f" ❌ UNRELATED: {file_name}:{chunk_id} - {explanation[:100]}...")
828
+
829
+ # Save iteration log
830
+ iteration_logs.append(iter_log)
831
+
832
+ # Add verified relevant chunks (EXPLANATORY or RELATED) to context
833
+ if new_chunks:
834
+ explanatory_count = sum(1 for c in new_chunks if c.get('classification') == 'EXPLANATORY')
835
+ related_count = sum(1 for c in new_chunks if c.get('classification') == 'RELATED')
836
+ print(f"\n📝 Adding {len(new_chunks)} verified chunks to context ({explanatory_count} EXPLANATORY, {related_count} RELATED)")
837
+ current_chunks.extend(new_chunks)
838
+ else:
839
+ # No relevant chunks found across ALL search strings
840
+ if not any_relevant_found:
841
+ print(f"\n🛑 No relevant chunks found across all {len(search_strings)} search strings.")
842
+ print(f" The corpus likely does not contain the information needed to complete this context.")
843
+ print(f" Stopping multi-hop context building for this chunk.")
844
+ hop_count = len(chunks_added) - 1
845
+ return {
846
+ 'status': 'INCOMPLETE_NO_RELEVANT_CHUNKS',
847
+ 'context': get_combined_text(current_chunks),
848
+ 'chunks': current_chunks,
849
+ 'depth': depth,
850
+ 'hop_count': hop_count,
851
+ 'max_breadth_used': max_breadth_used,
852
+ 'chunks_added': chunks_added,
853
+ 'search_history': search_history,
854
+ 'termination_reason': 'No retrieved chunks were classified as EXPLANATORY or RELATED',
855
+ 'iteration_logs': iteration_logs if log_details else None
856
+ }
857
+ else:
858
+ print(f"\n⚠️ No new chunks to add (all duplicates). Stopping.")
859
+ hop_count = len(chunks_added) - 1
860
+ return {
861
+ 'status': 'INCOMPLETE_ALL_DUPLICATES',
862
+ 'context': get_combined_text(current_chunks),
863
+ 'chunks': current_chunks,
864
+ 'depth': depth,
865
+ 'hop_count': hop_count,
866
+ 'max_breadth_used': max_breadth_used,
867
+ 'chunks_added': chunks_added,
868
+ 'search_history': search_history,
869
+ 'termination_reason': 'All retrieved chunks were duplicates of already-added chunks',
870
+ 'iteration_logs': iteration_logs if log_details else None
871
+ }
872
+
873
+ # Max depth reached (only if loop completes without early termination)
874
+ hop_count = len(chunks_added) - 1
875
+ print(f"\n⚠️ Max depth {max_depth} reached. Context may still be INCOMPLETE. hop_count={hop_count}")
876
+
877
+ return {
878
+ 'status': 'INCOMPLETE_MAX_DEPTH',
879
+ 'context': get_combined_text(current_chunks),
880
+ 'chunks': current_chunks,
881
+ 'depth': depth,
882
+ 'hop_count': hop_count,
883
+ 'max_breadth_used': max_breadth_used,
884
+ 'chunks_added': chunks_added,
885
+ 'search_history': search_history,
886
+ 'termination_reason': f'Maximum depth of {max_depth} iterations reached',
887
+ 'iteration_logs': iteration_logs if log_details else None
888
+ }
889
+
890
+
891
+ # ============================================================================
892
+ # MAIN
893
+ # ============================================================================
894
+
895
+ if __name__ == "__main__":
896
+ import sys
897
+
898
+ # Setup logging
899
+ setup_logging()
900
+
901
+ # Check command line argument for mode
902
+ mode = sys.argv[1] if len(sys.argv) > 1 else "multihop"
903
+
904
+ if mode == "simple":
905
+ # Test simple retrieval
906
+ print("="*80)
907
+ print("TESTING SIMPLE RETRIEVAL")
908
+ print("="*80)
909
+
910
+ query = "How to minimize the normalized losses of a motor in Y connection?"
911
+
912
+ print(f"\nQUERY: {query}")
913
+ print("="*80)
914
+ print(f"\nHyperparameters:")
915
+ print(f" Retrieval method: {RETRIEVAL_METHOD}")
916
+ if RETRIEVAL_METHOD == "top_k":
917
+ print(f" Retrieval k: {RETRIEVAL_K}")
918
+ else:
919
+ print(f" Retrieval p: {RETRIEVAL_P}")
920
+ print(f" Rerank top-k: {RERANK_TOP_K}")
921
+ print(f" Context size: {CONTEXT_SIZE}")
922
+ print("="*80)
923
+
924
+ # Retrieve and rerank
925
+ context = retrieve_and_rerank(
926
+ query,
927
+ # model_name uses config default (EMBEDDING_MODEL)
928
+ retrieval_method=RETRIEVAL_METHOD,
929
+ retrieval_k=RETRIEVAL_K,
930
+ retrieval_p=RETRIEVAL_P,
931
+ rerank_top_k=RERANK_TOP_K,
932
+ context_size=CONTEXT_SIZE
933
+ )
934
+
935
+ # Print final context
936
+ print("\n" + "="*80)
937
+ print(f"FINAL CONTEXT (Top {CONTEXT_SIZE})")
938
+ print("="*80)
939
+
940
+ for i, (orig_idx, relevance, chunk) in enumerate(context, 1):
941
+ print(f"\n--- Context Chunk {i} ---")
942
+ print(f"Chunk ID: {chunk['chunk_id']}")
943
+ print(f"Relevance Score: {relevance:.4f}")
944
+ print(f"Has Image: {chunk['image_path'] is not None}")
945
+ if chunk['image_path']:
946
+ print(f"Image Path: {chunk['image_path']}")
947
+ print(f"\nContent:\n{chunk['text']}")
948
+ print("-" * 80)
949
+
950
+ else:
951
+ # Test multi-hop context completion
952
+ print("="*80)
953
+ print("TESTING MULTI-HOP CONTEXT COMPLETION")
954
+ print("="*80)
955
+
956
+ # Load first 10 chunks from JSON file
957
+ print(f"\nLoading first 10 chunks from {CHUNKS_FILE}...")
958
+ with open(CHUNKS_FILE, 'r') as f:
959
+ all_chunks = json.load(f)
960
+
961
+ test_chunks = all_chunks[:10] # Match the stated test scope: first 10 chunks only
962
+ print(f"Loaded {len(test_chunks)} chunks for testing")
963
+
964
+ # Test with each chunk
965
+ for i, chunk in enumerate(test_chunks):
966
+ print(f"\n{'='*80}")
967
+ print(f"TESTING CHUNK {i+1}/{len(test_chunks)}")
968
+ print(f"Chunk ID: {chunk['chunk_id']}, Type: {chunk['chunk_type']}")
969
+ print(f"{'='*80}")
970
+
971
+ # Extract image path from artifact if it's a standalone image
972
+ image_path = None
973
+ file_name = chunk.get('file_name', 'unknown')
974
+ if chunk.get('artifact') != "None" and chunk.get('artifact'):
975
+ image_path = extract_image_path(chunk['artifact'], file_name)
976
+
977
+ test_chunk_dict = {
978
+ 'content': chunk['content'],
979
+ 'chunk_id': chunk['chunk_id'],
980
+ 'image_path': image_path,
981
+ 'artifact': chunk.get('artifact'),
982
+ 'file_name': file_name
983
+ }
984
+
985
+ result = build_complete_context(
986
+ initial_chunk=test_chunk_dict,
987
+ max_depth=3,
988
+ max_breadth=3,
989
+ chunks_per_search=2
990
+ )
991
+
992
+ print(f"\n{'-'*80}")
993
+ print(f"RESULT FOR CHUNK {chunk['chunk_id']}")
994
+ print(f"{'-'*80}")
995
+ print(f"Status: {result['status']}")
996
+ print(f"Depth reached: {result['depth']}")
997
+ print(f"Chunks added: {len(result['chunks_added'])}")
998
+ print(f"Chunk IDs: {result['chunks_added']}")
999
+ print(f"Search history: {len(result['search_history'])} searches")
1000
+ print(f"Final context length: {len(result['context'])} chars")
1001
+ print(f"\nFirst 300 chars of final context:")
1002
+ print(result['context'][:300])
1003
+ print("...")