mirage-benchmark 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of mirage-benchmark was flagged as potentially problematic.

@@ -0,0 +1,45 @@
+ """
+ Pipeline module for MiRAGE - Document processing, QA generation, and deduplication.
+
+ Imports are lazy to avoid loading optional dependencies at import time.
+ """
+
+ # Mapping of attribute names to (module_name, attr_name)
+ _LAZY_IMPORTS = {
+     # PDF Processing (requires docling, matplotlib - optional)
+     "process_pdf_to_markdown": ("pdf_processor", "process_pdf_to_markdown"),
+     "process_directory": ("pdf_processor", "process_directory"),
+     # Chunking
+     "chunk_markdown_to_semantic": ("chunker", "chunk_markdown_to_semantic"),
+     "process_markdown_file": ("chunker", "process_markdown_file"),
+     # Context
+     "build_complete_context": ("context", "build_complete_context"),
+     "retrieve_similar_chunks": ("context", "retrieve_similar_chunks"),
+     "ContextBuilder": ("context", "ContextBuilder"),
+     # QA Generation
+     "generate_qa_for_chunk": ("qa_generator", "generate_qa_for_chunk"),
+     "verify_qa_pair": ("qa_generator", "verify_qa_pair"),
+     "select_best_qa_pairs": ("qa_generator", "select_best_qa_pairs"),
+     # Domain
+     "fetch_domain_and_role": ("domain", "fetch_domain_and_role"),
+     "load_domain_expert_from_env": ("domain", "load_domain_expert_from_env"),
+     "save_domain_expert_to_env": ("domain", "save_domain_expert_to_env"),
+     "DomainExtractor": ("domain", "DomainExtractor"),
+     # Deduplication
+     "deduplicate_qa_pairs": ("deduplication", "deduplicate_qa_pairs"),
+     "cluster_questions": ("deduplication", "cluster_questions"),
+     "merge_similar_qa": ("deduplication", "merge_similar_qa"),
+ }
+
+
+ def __getattr__(name):
+     """Lazy import to avoid loading optional dependencies at import time."""
+     if name in _LAZY_IMPORTS:
+         module_name, attr_name = _LAZY_IMPORTS[name]
+         import importlib
+         module = importlib.import_module(f"mirage.pipeline.{module_name}")
+         return getattr(module, attr_name)
+     raise AttributeError(f"module 'mirage.pipeline' has no attribute '{name}'")
+
+
+ __all__ = list(_LAZY_IMPORTS.keys())
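The package __init__ above uses a module-level __getattr__ hook (PEP 562), so a submodule and its optional dependencies load only when one of the mapped names is first accessed. A minimal sketch of the effect, assuming the optional docling dependency is not installed:

    import mirage.pipeline                       # cheap: no pipeline submodules loaded yet
    fn = mirage.pipeline.deduplicate_qa_pairs    # triggers import of mirage.pipeline.deduplication
    mirage.pipeline.process_pdf_to_markdown      # only this access would pull in pdf_processor (and docling)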
@@ -0,0 +1,545 @@
+ """
+ Simple Semantic Chunking System
+ Uses a single comprehensive prompt to chunk markdown documents semantically.
+ """
+
+ import json
+ import re
+ import logging
+ from pathlib import Path
+ from typing import List, Dict, Tuple, Optional
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from mirage.core import llm as call_llm
+ from mirage.core.llm import setup_logging, call_llm_simple
+ from mirage.core.prompts import PROMPTS_CHUNK
+ from tqdm import tqdm
+
+ # ============================================================================
+ # CONFIGURATION
+ # ============================================================================
+
+ # Input/output paths and model name
+ INPUT_FILE = "output/results/markdown/document/document_ref.md"
+ INPUT_DIR = None  # Set to a directory path to process all .md files in it
+ OUTPUT_DIR = "output/results/chunks"
+ LLM_MODEL_NAME = "gemini-2.0-flash"
+
+ # Windowing parameters (chars, not tokens)
+ WINDOW_SIZE = 20000  # ~5,000 tokens
+ OVERLAP_SIZE = 2000  # ~500 tokens
+
+ # Parallel processing
+ NUM_FILE_WORKERS = 4  # Number of files to process in parallel
+
+
+ def parse_chunks_from_response(response: str) -> List[Dict]:
+     """Parse structured chunks from an LLM response.
+
+     Expected format per prompt:
+     <chunk_id>VALUE<|#|><chunk_type>VALUE<|#|><content>VALUE<|#|><artifact>VALUE<|#|><status>VALUE<|#|><chunk_end>
+     """
+     chunks = []
+
+     # Split on the <chunk_end> marker
+     chunk_blocks = response.split('<chunk_end>')
+
+     for block in chunk_blocks:
+         block = block.strip()
+         if not block:
+             continue
+
+         # Fields are separated by <|#|>; each part looks like "<field_name>VALUE"
+         parts = block.split('<|#|>')
+
+         if len(parts) >= 5:
+             # Extract field values by stripping the field-name prefix
+             chunk_id = re.sub(r'^<chunk_id>', '', parts[0]).strip()
+             chunk_type = re.sub(r'^<chunk_type>', '', parts[1]).strip()
+             content = re.sub(r'^<content>', '', parts[2]).strip()  # may contain newlines
+             artifact = re.sub(r'^<artifact>', '', parts[3]).strip()
+             status = re.sub(r'^<status>', '', parts[4]).strip()
+
+             chunks.append({
+                 'chunk_id': chunk_id,
+                 'chunk_type': chunk_type,
+                 'content': content,
+                 'artifact': artifact,
+                 'status': status
+             })
+         else:
+             logging.warning(f"Skipping malformed chunk block with {len(parts)} parts (expected 5+). Block preview: {block[:200]}")
+             print(f"āš ļø Skipping malformed chunk block with {len(parts)} parts")
+
+     return chunks
+
+
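To make the wire format concrete, here is a small worked example (the field values are invented for illustration):

    sample = ("<chunk_id>1<|#|><chunk_type>text<|#|><content>Alpha beta gamma.<|#|>"
              "<artifact>none<|#|><status>COMPLETE<|#|><chunk_end>")
    parse_chunks_from_response(sample)
    # -> [{'chunk_id': '1', 'chunk_type': 'text', 'content': 'Alpha beta gamma.',
    #      'artifact': 'none', 'status': 'COMPLETE'}]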
+ def find_overlap(incomplete_content: str, new_window: str, max_search: int = None) -> int:
+     """Find where incomplete content overlaps with the new window.
+
+     Returns the position in new_window where unique content starts.
+
+     Args:
+         incomplete_content: Content from the incomplete chunk (LLM-parsed markdown)
+         new_window: Raw markdown text of the new window
+         max_search: Maximum search range (defaults to OVERLAP_SIZE * 2 to allow
+             for formatting differences)
+     """
+     if max_search is None:
+         max_search = OVERLAP_SIZE * 2  # Search up to 2x overlap size to account for formatting differences
+
+     # Compare the tail of the incomplete content against the beginning of the
+     # new window, which is where the overlap should sit.
+     search_range = min(max_search, len(new_window))
+     incomplete_len = len(incomplete_content)
+
+     # Try snippet lengths from large to small, stepping down by 50 chars; note
+     # that snippets of 50 chars or fewer are never attempted.
+     for length in range(min(max_search, incomplete_len), 50, -50):
+         # Take the last N characters of the incomplete content
+         search_snippet = incomplete_content[-length:].strip()
+
+         if not search_snippet:
+             continue
+
+         # Search in the first part of the new window (where the overlap should be)
+         search_text = new_window[:search_range]
+
+         # Try an exact match first
+         if search_snippet in search_text:
+             overlap_pos = search_text.find(search_snippet)
+             # Return the position after the overlap
+             return overlap_pos + len(search_snippet)
+
+         # Fall back to a whitespace-normalized comparison
+         search_snippet_normalized = ' '.join(search_snippet.split())
+         search_text_normalized = ' '.join(search_text[:min(len(search_snippet_normalized) * 2, len(search_text))].split())
+
+         if search_snippet_normalized in search_text_normalized:
+             # Locate the approximate position in the original text via a short tail snippet
+             short_snippet = search_snippet[-min(200, len(search_snippet)):]
+             if short_snippet in search_text:
+                 overlap_pos = search_text.find(short_snippet)
+                 return overlap_pos + len(short_snippet)
+
+     # No overlap found: start from the beginning of the window
+     return 0
+
+
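A self-contained illustration of the return value, using invented strings and exercising only the exact-match path above:

    carried = ("The model stores its weights in a sharded checkpoint format. " * 3).strip()
    window = carried[-100:] + " Only this part is new."
    cut = find_overlap(carried, window)  # 100: index in window just past the shared tail
    merged = carried + window[cut:]      # the 100-char overlap appears only once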
+ def chunk_with_windows(markdown_text: str) -> Tuple[List[Dict], Dict[int, Dict[str, str]]]:
+     """Process markdown in windows with smart handling of incomplete chunks.
+
+     Returns:
+         tuple: (list of chunks, dict of window queries and responses)
+     """
+     print(f"šŸ“„ Document size: {len(markdown_text):,} characters")
+     print(f"šŸ”§ Window: {WINDOW_SIZE:,} chars, Overlap: {OVERLAP_SIZE:,} chars")
+
+     all_chunks = []
+     position = 0
+     window_num = 0
+     incomplete_chunk = None  # Carried over between windows
+
+     # Store queries and responses for debugging
+     queries_responses = {}
+
+     while position < len(markdown_text):
+         window_num += 1
+
+         # Calculate window boundaries with overlap
+         window_end = min(position + WINDOW_SIZE, len(markdown_text))
+         window_text = markdown_text[position:window_end]
+
+         # If the previous window left an incomplete chunk, merge it in
+         if incomplete_chunk:
+             print(f"\nšŸ”— Merging incomplete chunk from previous window...")
+             print(f"   Incomplete chunk content length: {len(incomplete_chunk['content']):,} chars")
+             print(f"   New window text length: {len(window_text):,} chars")
+
+             # Find the overlap between the incomplete chunk and the current window;
+             # the new window should start with OVERLAP_SIZE chars from the previous one
+             overlap_end = find_overlap(incomplete_chunk['content'], window_text)
+
+             if overlap_end > 0:
+                 print(f"   āœ… Found overlap at position {overlap_end} (expected around 0-{OVERLAP_SIZE*2})")
+
+                 # Debug: show what's being merged
+                 overlap_text = window_text[:overlap_end]
+                 continuation = window_text[overlap_end:]
+                 print(f"   Overlap text (will be skipped): ...{overlap_text[-50:]}...")
+                 print(f"   Continuation text (will be appended): {continuation[:50]}...")
+
+                 # Deduplicate by keeping the incomplete chunk's content (which
+                 # already includes the overlap) and appending only the unique
+                 # continuation that starts after overlap_end
+                 window_text = incomplete_chunk['content'] + window_text[overlap_end:]
+                 print(f"   Merged text length: {len(window_text):,} chars (incomplete: {len(incomplete_chunk['content'])}, continuation: {len(continuation)})")
+             else:
+                 print(f"   āš ļø No overlap found (searched first {OVERLAP_SIZE*2} chars)")
+                 print(f"   Debug: Last 100 chars of incomplete: ...{incomplete_chunk['content'][-100:]}")
+                 print(f"   Debug: First 100 chars of new window: {window_text[:100]}")
+                 # No overlap, so just prepend the incomplete content
+                 window_text = incomplete_chunk['content'] + "\n\n" + window_text
+
+             incomplete_chunk = None  # Reset
+
+         print(f"\nšŸ”„ Processing window {window_num} (pos {position:,} - {window_end:,})")
+
+         # Call the LLM with the semantic chunking prompt
+         try:
+             full_prompt = f"{PROMPTS_CHUNK['semantic_chunking']}\n\nMarkdown QUERY to chunk:\n\n{window_text}"
+
+             response = call_llm_simple(full_prompt)
+
+             # Store the query (just the text to chunk) and response for debugging
+             queries_responses[window_num] = {
+                 'query': window_text,
+                 'response': response
+             }
+             logging.info(f"Window {window_num}: Query {len(window_text)} chars, Response {len(response)} chars")
+             print(f"šŸ“ Stored query ({len(window_text)} chars) and response ({len(response)} chars)")
+
+             # Check for an empty response
+             if not response or not response.strip():
+                 logging.warning(f"Empty response from LLM for window {window_num}")
+                 print(f"āš ļø Empty response from LLM for window {window_num}, skipping...")
+                 incomplete_chunk = None
+                 # Move to the next window
+                 if window_end >= len(markdown_text):
+                     break
+                 position = window_end - OVERLAP_SIZE
+                 continue
+
+             # Parse chunks from the response
+             window_chunks = parse_chunks_from_response(response)
+             print(f"āœ… Parsed {len(window_chunks)} chunks from window {window_num}")
+
+             # Report character and word counts per chunk
+             for idx, chunk in enumerate(window_chunks, 1):
+                 content = chunk.get('content', '')
+                 char_count = len(content)
+                 word_count = len(content.split())
+                 print(f"   Chunk {idx}: {char_count:,} chars, {word_count:,} words")
+
+             # Check whether the last chunk is incomplete
+             if window_chunks and window_chunks[-1]['status'].upper() == 'INCOMPLETE':
+                 incomplete_chunk = window_chunks[-1]
+                 window_chunks = window_chunks[:-1]  # Hold it back until the next window
+                 print(f"   āš ļø Last chunk marked INCOMPLETE, will merge with next window")
+
+             all_chunks.extend(window_chunks)
+
+         except Exception as e:
+             print(f"āŒ Error processing window {window_num}: {e}")
+             incomplete_chunk = None  # Reset on error
+
+         # Move to the next window with overlap
+         if window_end >= len(markdown_text):
+             # End of document: add the incomplete chunk, if any, as-is
+             if incomplete_chunk:
+                 print(f"   šŸ“ Adding final incomplete chunk as-is")
+                 all_chunks.append(incomplete_chunk)
+             break
+
+         position = window_end - OVERLAP_SIZE
+
+     print(f"\nāœ… Total chunks from all windows: {len(all_chunks)}")
+     return all_chunks, queries_responses
+
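The advance rule at the bottom of the loop (position = window_end - OVERLAP_SIZE) yields overlapping spans. A standalone sketch of the arithmetic for a hypothetical 50,000-character document, using the defaults above:

    doc_len, pos, spans = 50_000, 0, []
    while pos < doc_len:
        end = min(pos + WINDOW_SIZE, doc_len)
        spans.append((pos, end))
        if end >= doc_len:
            break
        pos = end - OVERLAP_SIZE
    # spans == [(0, 20000), (18000, 38000), (36000, 50000)]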
+ def renumber_chunks(chunks: List[Dict], file_name: str) -> List[Dict]:
+     """Renumber chunks with continuous numbering and add the file name."""
+     for i, chunk in enumerate(chunks, 1):
+         # Rebuild the dict in place to enforce key order: file_name, chunk_id, ...
+         original = chunk.copy()
+         chunk.clear()
+         chunk['file_name'] = file_name
+         chunk['chunk_id'] = str(i)
+         chunk.update({k: v for k, v in original.items() if k != 'chunk_id'})
+
+     print(f"šŸ”¢ Renumbered {len(chunks)} chunks and added file name")
+     return chunks
+
+
+ def export_to_json(chunks: List[Dict], output_path: Path):
+     """Export chunks to a JSON file."""
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+
+     with open(output_path, 'w', encoding='utf-8') as f:
+         json.dump(chunks, f, indent=2, ensure_ascii=False)
+
+     print(f"šŸ’¾ Saved {len(chunks)} chunks to {output_path}")
+
+
+ def print_summary(chunks: List[Dict]):
+     """Print summary statistics."""
+     type_counts = {}
+     status_counts = {}
+
+     for chunk in chunks:
+         chunk_type = chunk.get('chunk_type', 'unknown')
+         status = chunk.get('status', 'unknown')
+
+         type_counts[chunk_type] = type_counts.get(chunk_type, 0) + 1
+         status_counts[status] = status_counts.get(status, 0) + 1
+
+     print("\n" + "="*60)
+     print("šŸ“Š CHUNKING SUMMARY")
+     print("="*60)
+     print(f"Total chunks: {len(chunks)}")
+     print(f"\nBy type:")
+     for ctype, count in sorted(type_counts.items()):
+         print(f"  • {ctype}: {count}")
+     print(f"\nBy status:")
+     for status, count in sorted(status_counts.items()):
+         print(f"  • {status}: {count}")
+
+     # Calculate and print the average word count
+     total_words = sum(len(chunk.get('content', '').split()) for chunk in chunks)
+     avg_words = total_words / len(chunks) if chunks else 0
+     print(f"\nAverage word count per chunk: {avg_words:.1f}")
+     print("="*60)
+
+
+ # ============================================================================
+ # SINGLE FILE PROCESSING
+ # ============================================================================
+
+ def process_single_file(input_path: Path, output_dir: Path) -> Dict:
+     """Process a single markdown file and return results.
+
+     Args:
+         input_path: Path to the markdown file
+         output_dir: Directory for output files
+
+     Returns:
+         Dict with 'success', 'file', 'chunks_count', 'error' keys
+     """
+     result = {
+         'success': False,
+         'file': str(input_path),
+         'chunks_count': 0,
+         'error': None
+     }
+
+     try:
+         if not input_path.exists():
+             result['error'] = f"File not found: {input_path}"
+             return result
+
+         print(f"\nšŸ“– Processing: {input_path.name}")
+         markdown_text = input_path.read_text(encoding='utf-8')
+
+         # Chunk with windows
+         chunks, queries_responses = chunk_with_windows(markdown_text)
+
+         # Renumber continuously and add the file name
+         chunks = renumber_chunks(chunks, input_path.stem)
+
+         # Create the output directory
+         output_dir.mkdir(parents=True, exist_ok=True)
+
+         # Export chunks to JSON
+         output_path = output_dir / f"{input_path.stem}_chunks.json"
+         export_to_json(chunks, output_path)
+
+         # Export queries and responses for debugging
+         queries_responses_path = output_dir / f"{input_path.stem}_queries_responses.json"
+         with open(queries_responses_path, 'w', encoding='utf-8') as f:
+             json.dump(queries_responses, f, indent=2, ensure_ascii=False)
+
+         result['success'] = True
+         result['chunks_count'] = len(chunks)
+         print(f"āœ… {input_path.name}: {len(chunks)} chunks")
+
+     except Exception as e:
+         result['error'] = str(e)
+         print(f"āŒ {input_path.name}: Error - {e}")
+
+     return result
+
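For reference, a successful call returns a result dict shaped like this (the path and count are hypothetical):

    result = process_single_file(Path("docs/report.md"), Path("out/report"))
    # {'success': True, 'file': 'docs/report.md', 'chunks_count': 42, 'error': None}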
+
+ def process_files_parallel(input_files: List[Path], output_dir: Path,
+                            max_workers: int = NUM_FILE_WORKERS) -> List[Dict]:
+     """Process multiple markdown files in parallel.
+
+     Args:
+         input_files: List of markdown file paths
+         output_dir: Base directory for output (a subdirectory is created per file)
+         max_workers: Number of parallel workers
+
+     Returns:
+         List of result dicts from process_single_file
+     """
+     if not input_files:
+         print("āŒ No files to process")
+         return []
+
+     print(f"\nšŸš€ Processing {len(input_files)} files with {max_workers} parallel workers")
+     print("="*60)
+
+     results = []
+
+     with ThreadPoolExecutor(max_workers=max_workers) as executor:
+         # Submit all tasks, one per file
+         futures = {}
+         for input_path in input_files:
+             # Create a per-file output directory
+             file_output_dir = output_dir / input_path.stem
+             future = executor.submit(process_single_file, input_path, file_output_dir)
+             futures[future] = input_path
+
+         # Collect results with a progress bar
+         for future in tqdm(as_completed(futures), total=len(futures),
+                            desc="Chunking files"):
+             try:
+                 result = future.result()
+                 results.append(result)
+             except Exception as e:
+                 input_path = futures[future]
+                 results.append({
+                     'success': False,
+                     'file': str(input_path),
+                     'chunks_count': 0,
+                     'error': str(e)
+                 })
+
+     # Print summary
+     print("\n" + "="*60)
+     print("šŸ“Š PARALLEL CHUNKING SUMMARY")
+     print("="*60)
+     successful = [r for r in results if r['success']]
+     failed = [r for r in results if not r['success']]
+     total_chunks = sum(r['chunks_count'] for r in successful)
+
+     print(f"Files processed: {len(results)}")
+     print(f"Successful: {len(successful)}")
+     print(f"Failed: {len(failed)}")
+     print(f"Total chunks generated: {total_chunks}")
+
+     if failed:
+         print("\nāŒ Failed files:")
+         for r in failed:
+             print(f"  • {Path(r['file']).name}: {r['error']}")
+
+     print("="*60)
+
+     return results
+
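A hedged usage sketch of the parallel entry point (file names hypothetical); each input file gets its own subdirectory under the output base:

    from pathlib import Path
    results = process_files_parallel([Path("docs/a.md"), Path("docs/b.md")], Path("out"), max_workers=2)
    # writes out/a/a_chunks.json and out/a/a_queries_responses.json,
    #        out/b/b_chunks.json and out/b/b_queries_responses.json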
+ def get_markdown_files(input_path: str) -> List[Path]:
+     """Get the list of markdown files from a path (file or directory).
+
+     Args:
+         input_path: Path to a file or directory
+
+     Returns:
+         List of Path objects for markdown files
+     """
+     path = Path(input_path)
+
+     if path.is_file():
+         return [path] if path.suffix.lower() == '.md' else []
+     elif path.is_dir():
+         # Find all .md files recursively
+         return list(path.glob("**/*.md"))
+     else:
+         return []
+
+
+ # ============================================================================
+ # MAIN
+ # ============================================================================
+
+ def main(input_path: Optional[str] = None, output_dir: Optional[str] = None,
+          parallel: bool = True, max_workers: int = NUM_FILE_WORKERS):
+     """Main entry point - supports single-file or parallel multi-file processing.
+
+     Args:
+         input_path: Path to a file or directory (uses INPUT_FILE/INPUT_DIR if None)
+         output_dir: Output directory (uses OUTPUT_DIR if None)
+         parallel: Whether to process multiple files in parallel
+         max_workers: Number of parallel workers
+     """
+     # Set up logging
+     setup_logging()
+
+     print("šŸš€ Starting Simple Semantic Chunking")
+     print(f"šŸ¤– Using model: {LLM_MODEL_NAME}")
+
+     # Determine the input path
+     if input_path is None:
+         input_path = INPUT_DIR if INPUT_DIR else INPUT_FILE
+
+     # Determine the output directory
+     if output_dir is None:
+         output_dir = OUTPUT_DIR
+     output_path = Path(output_dir)
+
+     # Get the list of markdown files
+     input_files = get_markdown_files(input_path)
+
+     if not input_files:
+         print(f"āŒ No markdown files found at: {input_path}")
+         return
+
+     print(f"šŸ“‚ Found {len(input_files)} markdown file(s)")
+
+     # Process files
+     if len(input_files) == 1:
+         # Single file - process directly
+         result = process_single_file(input_files[0], output_path)
+         if result['success']:
+             print_summary_from_file(output_path / f"{input_files[0].stem}_chunks.json")
+     elif parallel:
+         # Multiple files - process in parallel
+         process_files_parallel(input_files, output_path, max_workers)
+     else:
+         # Multiple files - process sequentially
+         print(f"\nšŸ”„ Processing {len(input_files)} files sequentially...")
+         for input_file in tqdm(input_files, desc="Chunking files"):
+             file_output_dir = output_path / input_file.stem
+             process_single_file(input_file, file_output_dir)
+
+     print("\nāœ… Processing complete!")
+     print(f"šŸ“ Output directory: {output_path}")
+     print(f"   • Log file: {call_llm.LOG_FILE if hasattr(call_llm, 'LOG_FILE') else 'N/A'}")
+
+
+ def print_summary_from_file(chunks_file: Path):
+     """Print a summary from a saved chunks file."""
+     if chunks_file.exists():
+         with open(chunks_file, 'r', encoding='utf-8') as f:
+             chunks = json.load(f)
+         print_summary(chunks)
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser(description="Semantic chunking for markdown files")
+     parser.add_argument("--input", "-i", type=str, default=None,
+                         help="Input file or directory path")
+     parser.add_argument("--output", "-o", type=str, default=None,
+                         help="Output directory")
+     parser.add_argument("--workers", "-w", type=int, default=NUM_FILE_WORKERS,
+                         help=f"Number of parallel workers (default: {NUM_FILE_WORKERS})")
+     parser.add_argument("--sequential", "-s", action="store_true",
+                         help="Process files sequentially instead of in parallel")
+
+     args = parser.parse_args()
+
+     main(
+         input_path=args.input,
+         output_dir=args.output,
+         parallel=not args.sequential,
+         max_workers=args.workers
+     )
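Assuming the module ships as mirage.pipeline.chunker (the name the package __init__ maps its chunking functions to), typical command-line invocations might look like:

    python -m mirage.pipeline.chunker --input docs/ --output output/results/chunks --workers 8
    python -m mirage.pipeline.chunker -i docs/ --sequential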