mirage_benchmark-1.0.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mirage-benchmark might be problematic.
- mirage/__init__.py +83 -0
- mirage/cli.py +150 -0
- mirage/core/__init__.py +52 -0
- mirage/core/config.py +248 -0
- mirage/core/llm.py +1745 -0
- mirage/core/prompts.py +884 -0
- mirage/embeddings/__init__.py +31 -0
- mirage/embeddings/models.py +512 -0
- mirage/embeddings/rerankers_multimodal.py +766 -0
- mirage/embeddings/rerankers_text.py +149 -0
- mirage/evaluation/__init__.py +26 -0
- mirage/evaluation/metrics.py +2223 -0
- mirage/evaluation/metrics_optimized.py +2172 -0
- mirage/pipeline/__init__.py +45 -0
- mirage/pipeline/chunker.py +545 -0
- mirage/pipeline/context.py +1003 -0
- mirage/pipeline/deduplication.py +491 -0
- mirage/pipeline/domain.py +514 -0
- mirage/pipeline/pdf_processor.py +598 -0
- mirage/pipeline/qa_generator.py +798 -0
- mirage/utils/__init__.py +31 -0
- mirage/utils/ablation.py +360 -0
- mirage/utils/preflight.py +663 -0
- mirage/utils/stats.py +626 -0
- mirage_benchmark-1.0.4.dist-info/METADATA +490 -0
- mirage_benchmark-1.0.4.dist-info/RECORD +30 -0
- mirage_benchmark-1.0.4.dist-info/WHEEL +5 -0
- mirage_benchmark-1.0.4.dist-info/entry_points.txt +3 -0
- mirage_benchmark-1.0.4.dist-info/licenses/LICENSE +190 -0
- mirage_benchmark-1.0.4.dist-info/top_level.txt +1 -0
mirage/pipeline/__init__.py
@@ -0,0 +1,45 @@
+"""
+Pipeline module for MiRAGE - Document processing, QA generation, and deduplication.
+
+Imports are lazy to avoid loading optional dependencies at import time.
+"""
+
+# Mapping of attribute names to (module_name, attr_name)
+_LAZY_IMPORTS = {
+    # PDF Processing (requires docling, matplotlib - optional)
+    "process_pdf_to_markdown": ("pdf_processor", "process_pdf_to_markdown"),
+    "process_directory": ("pdf_processor", "process_directory"),
+    # Chunking
+    "chunk_markdown_to_semantic": ("chunker", "chunk_markdown_to_semantic"),
+    "process_markdown_file": ("chunker", "process_markdown_file"),
+    # Context
+    "build_complete_context": ("context", "build_complete_context"),
+    "retrieve_similar_chunks": ("context", "retrieve_similar_chunks"),
+    "ContextBuilder": ("context", "ContextBuilder"),
+    # QA Generation
+    "generate_qa_for_chunk": ("qa_generator", "generate_qa_for_chunk"),
+    "verify_qa_pair": ("qa_generator", "verify_qa_pair"),
+    "select_best_qa_pairs": ("qa_generator", "select_best_qa_pairs"),
+    # Domain
+    "fetch_domain_and_role": ("domain", "fetch_domain_and_role"),
+    "load_domain_expert_from_env": ("domain", "load_domain_expert_from_env"),
+    "save_domain_expert_to_env": ("domain", "save_domain_expert_to_env"),
+    "DomainExtractor": ("domain", "DomainExtractor"),
+    # Deduplication
+    "deduplicate_qa_pairs": ("deduplication", "deduplicate_qa_pairs"),
+    "cluster_questions": ("deduplication", "cluster_questions"),
+    "merge_similar_qa": ("deduplication", "merge_similar_qa"),
+}
+
+
+def __getattr__(name):
+    """Lazy import to avoid loading optional dependencies at import time."""
+    if name in _LAZY_IMPORTS:
+        module_name, attr_name = _LAZY_IMPORTS[name]
+        import importlib
+        module = importlib.import_module(f"mirage.pipeline.{module_name}")
+        return getattr(module, attr_name)
+    raise AttributeError(f"module 'mirage.pipeline' has no attribute '{name}'")
+
+
+__all__ = list(_LAZY_IMPORTS.keys())
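The __init__.py hunk above wires up lazy loading through the module-level __getattr__ hook (PEP 562): `import mirage.pipeline` imports no submodules, and each name in _LAZY_IMPORTS is resolved to its owning submodule only on first attribute access. A minimal usage sketch, assuming mirage-benchmark and the relevant optional dependencies are installed and that the listed attribute actually exists in its target submodule:

    import mirage.pipeline  # cheap: no pipeline submodules are imported yet

    # First attribute access looks the name up in _LAZY_IMPORTS and imports only
    # the submodule that owns it (mirage.pipeline.domain in this case).
    extractor_cls = mirage.pipeline.DomainExtractor

    # Names missing from _LAZY_IMPORTS hit the explicit AttributeError in __getattr__.
    try:
        mirage.pipeline.not_a_real_symbol
    except AttributeError as err:
        print(err)  # module 'mirage.pipeline' has no attribute 'not_a_real_symbol'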
mirage/pipeline/chunker.py
@@ -0,0 +1,545 @@
+"""
+Simple Semantic Chunking System
+Uses a single comprehensive prompt to chunk markdown documents semantically.
+"""
+
+import json
+import re
+import logging
+from pathlib import Path
+from typing import List, Dict, Tuple, Optional
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
+from mirage.core import llm as call_llm
+from mirage.core.llm import call_vlm_with_multiple_images, setup_logging, call_llm_simple
+from mirage.core.prompts import PROMPTS_CHUNK
+from tqdm import tqdm
+
+# ============================================================================
+# CONFIGURATION
+# ============================================================================
+
+# Input/output and model configuration
+INPUT_FILE = "output/results/markdown/document/document_ref.md"
+INPUT_DIR = None  # Set to a directory path to process all .md files in it
+OUTPUT_DIR = "output/results/chunks"
+LLM_MODEL_NAME = "gemini-2.0-flash"
+
+# Windowing parameters (chars, not tokens)
+WINDOW_SIZE = 20000  # ~5000 tokens
+OVERLAP_SIZE = 2000  # ~500 tokens
+
+# Parallel processing
+NUM_FILE_WORKERS = 4  # Number of files to process in parallel
+
+def parse_chunks_from_response(response: str) -> List[Dict]:
+    """Parse structured chunks from LLM response
+
+    Expected format per prompt:
+    <chunk_id>VALUE<|#|><chunk_type>VALUE<|#|><content>VALUE<|#|><artifact>VALUE<|#|><status>VALUE<|#|><chunk_end>
+    """
+    chunks = []
+
+    # Split by <chunk_end> marker
+    chunk_blocks = response.split('<chunk_end>')
+
+    for block in chunk_blocks:
+        block = block.strip()
+        if not block:
+            continue
+
+        # Parse fields separated by <|#|>
+        # Format: <field_name>VALUE<|#|>
+        parts = block.split('<|#|>')
+
+        if len(parts) >= 5:
+            # Extract field values by removing the field name prefix
+            # parts[0] = "<chunk_id>VALUE"
+            # parts[1] = "<chunk_type>VALUE"
+            # parts[2] = "<content>VALUE" (may contain newlines)
+            # parts[3] = "<artifact>VALUE"
+            # parts[4] = "<status>VALUE"
+
+            chunk_id = re.sub(r'^<chunk_id>', '', parts[0]).strip()
+            chunk_type = re.sub(r'^<chunk_type>', '', parts[1]).strip()
+            content = re.sub(r'^<content>', '', parts[2]).strip()
+            artifact = re.sub(r'^<artifact>', '', parts[3]).strip()
+            status = re.sub(r'^<status>', '', parts[4]).strip()
+
+            chunks.append({
+                'chunk_id': chunk_id,
+                'chunk_type': chunk_type,
+                'content': content,
+                'artifact': artifact,
+                'status': status
+            })
+        else:
+            logging.warning(f"Skipping malformed chunk block with {len(parts)} parts (expected 5+). Block preview: {block[:200]}")
+            print(f"Skipping malformed chunk block with {len(parts)} parts")
+
+    return chunks
+
+
+def find_overlap(incomplete_content: str, new_window: str, max_search: int = None) -> int:
+    """Find where incomplete content overlaps with new window
+    Returns the position in new_window where unique content starts
+
+    Args:
+        incomplete_content: The content from the incomplete chunk (LLM-parsed markdown)
+        new_window: The raw markdown text from the new window
+        max_search: Maximum search range (defaults to OVERLAP_SIZE * 2 to account for potential formatting differences)
+    """
+    if max_search is None:
+        max_search = OVERLAP_SIZE * 2  # Search up to 2x overlap size to account for formatting differences
+
+    # Try to find overlap by checking last N chars of incomplete content
+    # against beginning of new window (where overlap should be)
+    search_range = min(max_search, len(new_window))
+    incomplete_len = len(incomplete_content)
+
+    # Try multiple snippet lengths, starting from larger to smaller
+    # This helps find the best match even if there's slight formatting difference
+    for length in range(min(max_search, incomplete_len), 50, -50):
+        # Get last N characters from incomplete content
+        search_snippet = incomplete_content[-length:].strip()
+
+        if not search_snippet:
+            continue
+
+        # Search in the first part of new window (where overlap should be)
+        search_text = new_window[:search_range]
+
+        # Try exact match first
+        if search_snippet in search_text:
+            overlap_pos = search_text.find(search_snippet)
+            # Return position after the overlap
+            return overlap_pos + len(search_snippet)
+
+        # Try without leading/trailing whitespace differences
+        search_snippet_normalized = ' '.join(search_snippet.split())
+        search_text_normalized = ' '.join(search_text[:min(len(search_snippet_normalized) * 2, len(search_text))].split())
+
+        if search_snippet_normalized in search_text_normalized:
+            # Find approximate position in original text
+            # Use a shorter snippet to find the position
+            short_snippet = search_snippet[-min(200, len(search_snippet)):]
+            if short_snippet in search_text:
+                overlap_pos = search_text.find(short_snippet)
+                return overlap_pos + len(short_snippet)
+
+    # No overlap found, return 0 (start from beginning)
+    return 0
+
+
+def chunk_with_windows(markdown_text: str) -> Tuple[List[Dict], Dict[int, Dict[str, str]]]:
+    """Process markdown in windows with smart handling of incomplete chunks
+
+    Returns:
+        tuple: (list of chunks, dict of window queries and responses)
+    """
+    print(f"Document size: {len(markdown_text):,} characters")
+    print(f"Window: {WINDOW_SIZE:,} chars, Overlap: {OVERLAP_SIZE:,} chars")
+
+    all_chunks = []
+    position = 0
+    window_num = 0
+    incomplete_chunk = None  # Carry over incomplete chunks
+
+    # Store queries and responses for debugging
+    queries_responses = {}
+
+    while position < len(markdown_text):
+        window_num += 1
+
+        # Calculate window boundaries with overlap
+        window_end = min(position + WINDOW_SIZE, len(markdown_text))
+        window_text = markdown_text[position:window_end]
+
+        # If we have an incomplete chunk from previous window, merge it
+        if incomplete_chunk:
+            print(f"\nMerging incomplete chunk from previous window...")
+            print(f"  Incomplete chunk content length: {len(incomplete_chunk['content']):,} chars")
+            print(f"  New window text length: {len(window_text):,} chars")
+
+            # Find overlap between incomplete chunk and current window
+            # The new window should start with OVERLAP_SIZE chars from previous window
+            overlap_end = find_overlap(incomplete_chunk['content'], window_text)
+
+            if overlap_end > 0:
+                print(f"  Found overlap at position {overlap_end} (expected around 0-{OVERLAP_SIZE*2})")
+
+                # Debug: Show what's being merged
+                overlap_text = window_text[:overlap_end]
+                continuation = window_text[overlap_end:]
+                print(f"  Overlap text (will be skipped): ...{overlap_text[-50:]}...")
+                print(f"  Continuation text (will be appended): {continuation[:50]}...")
+
+                # Remove overlapping portion from window
+                # This handles duplicates by:
+                # 1. Keeping the incomplete chunk's content (which includes the overlapping portion)
+                # 2. Appending only the unique continuation from new window (starting after overlap_end)
+                window_text = incomplete_chunk['content'] + window_text[overlap_end:]
+                print(f"  Merged text length: {len(window_text):,} chars (incomplete: {len(incomplete_chunk['content'])}, continuation: {len(continuation)})")
+            else:
+                print(f"  No overlap found (searched first {OVERLAP_SIZE*2} chars)")
+                print(f"  Debug: Last 100 chars of incomplete: ...{incomplete_chunk['content'][-100:]}")
+                print(f"  Debug: First 100 chars of new window: {window_text[:100]}")
+                # No overlap, just prepend
+                window_text = incomplete_chunk['content'] + "\n\n" + window_text
+
+            incomplete_chunk = None  # Reset
+
+        print(f"\nProcessing window {window_num} (pos {position:,} - {window_end:,})")
+
+        # Call LLM with the semantic chunking prompt
+        try:
+            full_prompt = f"{PROMPTS_CHUNK['semantic_chunking']}\n\nMarkdown QUERY to chunk:\n\n{window_text}"
+
+            response = call_llm_simple(full_prompt)
+
+            # Store query (just the text to chunk) and response for debugging
+            queries_responses[window_num] = {
+                'query': window_text,
+                'response': response
+            }
+            logging.info(f"Window {window_num}: Query {len(window_text)} chars, Response {len(response)} chars")
+            print(f"Stored query ({len(window_text)} chars) and response ({len(response)} chars)")
+
+            # Check for empty response
+            if not response or not response.strip():
+                logging.warning(f"Empty response from LLM for window {window_num}")
+                print(f"Empty response from LLM for window {window_num}, skipping...")
+                incomplete_chunk = None
+                # Move to next window
+                if window_end >= len(markdown_text):
+                    break
+                position = window_end - OVERLAP_SIZE
+                continue
+
+            # Parse chunks from response
+            window_chunks = parse_chunks_from_response(response)
+            print(f"Parsed {len(window_chunks)} chunks from window {window_num}")
+
+            # Print character and word count for each chunk
+            for idx, chunk in enumerate(window_chunks, 1):
+                content = chunk.get('content', '')
+                char_count = len(content)
+                word_count = len(content.split())
+                print(f"  Chunk {idx}: {char_count:,} chars, {word_count:,} words")
+
+            # Check if last chunk is incomplete
+            if window_chunks and window_chunks[-1]['status'].upper() == 'INCOMPLETE':
+                incomplete_chunk = window_chunks[-1]
+                window_chunks = window_chunks[:-1]  # Don't add incomplete chunk yet
+                print(f"  Last chunk marked INCOMPLETE, will merge with next window")
+
+            all_chunks.extend(window_chunks)
+
+        except Exception as e:
+            print(f"Error processing window {window_num}: {e}")
+            incomplete_chunk = None  # Reset on error
+
+        # Move to next window with overlap
+        if window_end >= len(markdown_text):
+            # End of document - add incomplete chunk if any
+            if incomplete_chunk:
+                print(f"  Adding final incomplete chunk as-is")
+                all_chunks.append(incomplete_chunk)
+            break
+
+        position = window_end - OVERLAP_SIZE
+
+    print(f"\nTotal chunks from all windows: {len(all_chunks)}")
+    return all_chunks, queries_responses
+
+def renumber_chunks(chunks: List[Dict], file_name: str) -> List[Dict]:
+    """Renumber chunks with continuous numbering and add file name"""
+    for i, chunk in enumerate(chunks, 1):
+        # Reconstruct dict to ensure order: file_name, chunk_id, ...
+        original = chunk.copy()
+        chunk.clear()
+        chunk['file_name'] = file_name
+        chunk['chunk_id'] = str(i)
+        chunk.update({k: v for k, v in original.items() if k != 'chunk_id'})
+
+    print(f"Renumbered {len(chunks)} chunks and added file name")
+    return chunks
+
+
+def export_to_json(chunks: List[Dict], output_path: Path):
+    """Export chunks to JSON file"""
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(chunks, f, indent=2, ensure_ascii=False)
+
+    print(f"Saved {len(chunks)} chunks to {output_path}")
+
+
+def print_summary(chunks: List[Dict]):
+    """Print summary statistics"""
+    type_counts = {}
+    status_counts = {}
+
+    for chunk in chunks:
+        chunk_type = chunk.get('chunk_type', 'unknown')
+        status = chunk.get('status', 'unknown')
+
+        type_counts[chunk_type] = type_counts.get(chunk_type, 0) + 1
+        status_counts[status] = status_counts.get(status, 0) + 1
+
+    print("\n" + "="*60)
+    print("CHUNKING SUMMARY")
+    print("="*60)
+    print(f"Total chunks: {len(chunks)}")
+    print(f"\nBy type:")
+    for ctype, count in sorted(type_counts.items()):
+        print(f"  • {ctype}: {count}")
+    print(f"\nBy status:")
+    for status, count in sorted(status_counts.items()):
+        print(f"  • {status}: {count}")
+
+    # Calculate and print average word count
+    total_words = 0
+    for chunk in chunks:
+        content = chunk.get('content', '')
+        total_words += len(content.split())
+    avg_words = total_words / len(chunks) if chunks else 0
+    print(f"\nAverage word count per chunk: {avg_words:.1f}")
+    print("="*60)
+
+
+# ============================================================================
+# SINGLE FILE PROCESSING
+# ============================================================================
+
+def process_single_file(input_path: Path, output_dir: Path) -> Dict:
+    """Process a single markdown file and return results.
+
+    Args:
+        input_path: Path to markdown file
+        output_dir: Directory for output files
+
+    Returns:
+        Dict with 'success', 'file', 'chunks_count', 'error' keys
+    """
+    result = {
+        'success': False,
+        'file': str(input_path),
+        'chunks_count': 0,
+        'error': None
+    }
+
+    try:
+        if not input_path.exists():
+            result['error'] = f"File not found: {input_path}"
+            return result
+
+        print(f"\nProcessing: {input_path.name}")
+        markdown_text = input_path.read_text(encoding='utf-8')
+
+        # Chunk with windows
+        chunks, queries_responses = chunk_with_windows(markdown_text)
+
+        # Renumber continuously and add file name
+        chunks = renumber_chunks(chunks, input_path.stem)
+
+        # Create output directory
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Export chunks to JSON
+        output_path = output_dir / f"{input_path.stem}_chunks.json"
+        export_to_json(chunks, output_path)
+
+        # Export queries and responses for debugging
+        queries_responses_path = output_dir / f"{input_path.stem}_queries_responses.json"
+        with open(queries_responses_path, 'w', encoding='utf-8') as f:
+            json.dump(queries_responses, f, indent=2, ensure_ascii=False)
+
+        result['success'] = True
+        result['chunks_count'] = len(chunks)
+        print(f"{input_path.name}: {len(chunks)} chunks")
+
+    except Exception as e:
+        result['error'] = str(e)
+        print(f"{input_path.name}: Error - {e}")
+
+    return result
+
+
+def process_files_parallel(input_files: List[Path], output_dir: Path,
+                           max_workers: int = NUM_FILE_WORKERS) -> List[Dict]:
+    """Process multiple markdown files in parallel.
+
+    Args:
+        input_files: List of markdown file paths
+        output_dir: Base directory for output (subdirs created per file)
+        max_workers: Number of parallel workers
+
+    Returns:
+        List of result dicts from process_single_file
+    """
+    if not input_files:
+        print("No files to process")
+        return []
+
+    print(f"\nProcessing {len(input_files)} files with {max_workers} parallel workers")
+    print("="*60)
+
+    results = []
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Submit all tasks
+        futures = {}
+        for input_path in input_files:
+            # Create per-file output directory
+            file_output_dir = output_dir / input_path.stem
+            future = executor.submit(process_single_file, input_path, file_output_dir)
+            futures[future] = input_path
+
+        # Collect results with progress bar
+        for future in tqdm(as_completed(futures), total=len(futures),
+                           desc="Chunking files"):
+            try:
+                result = future.result()
+                results.append(result)
+            except Exception as e:
+                input_path = futures[future]
+                results.append({
+                    'success': False,
+                    'file': str(input_path),
+                    'chunks_count': 0,
+                    'error': str(e)
+                })
+
+    # Print summary
+    print("\n" + "="*60)
+    print("PARALLEL CHUNKING SUMMARY")
+    print("="*60)
+    successful = [r for r in results if r['success']]
+    failed = [r for r in results if not r['success']]
+    total_chunks = sum(r['chunks_count'] for r in successful)
+
+    print(f"Files processed: {len(results)}")
+    print(f"Successful: {len(successful)}")
+    print(f"Failed: {len(failed)}")
+    print(f"Total chunks generated: {total_chunks}")
+
+    if failed:
+        print("\nFailed files:")
+        for r in failed:
+            print(f"  • {Path(r['file']).name}: {r['error']}")
+
+    print("="*60)
+
+    return results
+
+
+def get_markdown_files(input_path: str) -> List[Path]:
+    """Get list of markdown files from path (file or directory).
+
+    Args:
+        input_path: Path to file or directory
+
+    Returns:
+        List of Path objects for markdown files
+    """
+    path = Path(input_path)
+
+    if path.is_file():
+        return [path] if path.suffix.lower() == '.md' else []
+    elif path.is_dir():
+        # Find all .md files recursively
+        return list(path.glob("**/*.md"))
+    else:
+        return []
+
+
+# ============================================================================
+# MAIN
+# ============================================================================
+
+def main(input_path: Optional[str] = None, output_dir: Optional[str] = None,
+         parallel: bool = True, max_workers: int = NUM_FILE_WORKERS):
+    """Main execution - supports single file or parallel multi-file processing.
+
+    Args:
+        input_path: Path to file or directory (uses INPUT_FILE/INPUT_DIR if None)
+        output_dir: Output directory (uses OUTPUT_DIR if None)
+        parallel: Whether to use parallel processing for multiple files
+        max_workers: Number of parallel workers
+    """
+    # Setup logging
+    setup_logging()
+
+    print("Starting Simple Semantic Chunking")
+    print(f"Using model: {LLM_MODEL_NAME}")
+
+    # Determine input path
+    if input_path is None:
+        input_path = INPUT_DIR if INPUT_DIR else INPUT_FILE
+
+    # Determine output directory
+    if output_dir is None:
+        output_dir = OUTPUT_DIR
+    output_path = Path(output_dir)
+
+    # Get list of markdown files
+    input_files = get_markdown_files(input_path)
+
+    if not input_files:
+        print(f"No markdown files found at: {input_path}")
+        return
+
+    print(f"Found {len(input_files)} markdown file(s)")
+
+    # Process files
+    if len(input_files) == 1:
+        # Single file - process directly
+        result = process_single_file(input_files[0], output_path)
+        if result['success']:
+            print_summary_from_file(output_path / f"{input_files[0].stem}_chunks.json")
+    elif parallel:
+        # Multiple files - process in parallel
+        results = process_files_parallel(input_files, output_path, max_workers)
+    else:
+        # Multiple files - process sequentially
+        print(f"\nProcessing {len(input_files)} files sequentially...")
+        for input_file in tqdm(input_files, desc="Chunking files"):
+            file_output_dir = output_path / input_file.stem
+            process_single_file(input_file, file_output_dir)
+
+    print("\nProcessing complete!")
+    print(f"Output directory: {output_path}")
+    print(f"  • Log file: {call_llm.LOG_FILE if hasattr(call_llm, 'LOG_FILE') else 'N/A'}")
+
+
+def print_summary_from_file(chunks_file: Path):
+    """Print summary from saved chunks file."""
+    if chunks_file.exists():
+        with open(chunks_file, 'r', encoding='utf-8') as f:
+            chunks = json.load(f)
+        print_summary(chunks)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Semantic chunking for markdown files")
+    parser.add_argument("--input", "-i", type=str, default=None,
+                        help="Input file or directory path")
+    parser.add_argument("--output", "-o", type=str, default=None,
+                        help="Output directory")
+    parser.add_argument("--workers", "-w", type=int, default=NUM_FILE_WORKERS,
+                        help=f"Number of parallel workers (default: {NUM_FILE_WORKERS})")
+    parser.add_argument("--sequential", "-s", action="store_true",
+                        help="Process files sequentially instead of in parallel")
+
+    args = parser.parse_args()
+
+    main(
+        input_path=args.input,
+        output_dir=args.output,
+        parallel=not args.sequential,
+        max_workers=args.workers
+    )
+
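For reference, parse_chunks_from_response in the chunker.py hunk above splits the LLM output on the <chunk_end> terminator and then on the <|#|> field delimiter described in its docstring. A minimal sketch of that parsing path, assuming the package and the modules imported at the top of chunker.py are importable; the field values below are invented for illustration and are not taken from the package's actual prompts:

    from mirage.pipeline.chunker import parse_chunks_from_response

    # Two chunks in the <|#|>-delimited format, each terminated by <chunk_end>.
    sample_response = (
        "<chunk_id>1<|#|><chunk_type>text<|#|><content>Intro paragraph.<|#|>"
        "<artifact>none<|#|><status>COMPLETE<|#|><chunk_end>"
        "<chunk_id>2<|#|><chunk_type>table<|#|><content>| a | b |<|#|>"
        "<artifact>table_1<|#|><status>INCOMPLETE<|#|><chunk_end>"
    )

    chunks = parse_chunks_from_response(sample_response)
    print(len(chunks))          # 2
    print(chunks[0]["status"])  # COMPLETE
    print(chunks[1]["status"])  # INCOMPLETE: chunk_with_windows carries such a chunk into the next window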