rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. rakam_systems_vectorstore/MANIFEST.in +26 -0
  2. rakam_systems_vectorstore/README.md +1071 -0
  3. rakam_systems_vectorstore/__init__.py +93 -0
  4. rakam_systems_vectorstore/components/__init__.py +0 -0
  5. rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  6. rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  7. rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  8. rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  9. rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  10. rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  11. rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  12. rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  13. rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  14. rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  15. rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  16. rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  17. rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  18. rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  19. rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  20. rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  21. rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  22. rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  23. rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  24. rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  25. rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  26. rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  27. rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  28. rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  29. rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  30. rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  31. rakam_systems_vectorstore/config.py +266 -0
  32. rakam_systems_vectorstore/core.py +8 -0
  33. rakam_systems_vectorstore/pyproject.toml +113 -0
  34. rakam_systems_vectorstore/server/README.md +290 -0
  35. rakam_systems_vectorstore/server/__init__.py +20 -0
  36. rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  37. rakam_systems_vectorstore/setup.py +103 -0
  38. rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
  39. rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
  40. rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
rakam_systems_vectorstore/components/loader/md_loader.py
@@ -0,0 +1,622 @@
"""
Markdown Loader for processing Markdown (.md) files.

This loader provides intelligent markdown processing with:
- Header-based section splitting
- Code block preservation
- Metadata extraction (frontmatter)
- Configurable chunking strategies
- Support for common markdown extensions
"""

from __future__ import annotations

import os
import re
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from rakam_systems_core.ai_utils import logging
from rakam_systems_core.ai_core.interfaces.loader import Loader
from rakam_systems_vectorstore.components.chunker import AdvancedChunker
from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile

logger = logging.getLogger(__name__)


class MdLoader(Loader):
    """
    Markdown loader for processing .md files.

    This loader provides markdown processing with support for:
    - Header-based section splitting (preserves document structure)
    - Code block preservation (keeps code blocks intact)
    - YAML frontmatter extraction
    - Advanced text chunking
    - Configurable processing options

    The extracted content is chunked and returned as text or Node objects.
    """

    # Default configuration
    DEFAULT_EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
    DEFAULT_CHUNK_SIZE = 2048
    DEFAULT_CHUNK_OVERLAP = 128

    # Regex patterns
    FRONTMATTER_PATTERN = re.compile(r'^---\s*\n(.*?)\n---\s*\n', re.DOTALL)
    HEADER_PATTERN = re.compile(r'^(#{1,6})\s+(.+)$', re.MULTILINE)
    CODE_BLOCK_PATTERN = re.compile(r'```[\s\S]*?```', re.MULTILINE)
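
    # Illustration of the three patterns (examples are ours, not from the
    # original docs): FRONTMATTER_PATTERN matches a leading block such as
    #     ---
    #     title: My Doc
    #     tags: [a, b]
    #     ---
    # HEADER_PATTERN matches ATX headers like "## Usage" (group 1 = "##",
    # group 2 = "Usage"), and CODE_BLOCK_PATTERN matches fenced ``` blocks.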

    def __init__(
        self,
        name: str = "md_loader",
        config: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize Markdown loader.

        Args:
            name: Component name
            config: Optional configuration with keys:
                - embed_model_id: HuggingFace model ID for tokenization (default: "sentence-transformers/all-MiniLM-L6-v2")
                - chunk_size: Maximum tokens per chunk (default: 2048)
                - chunk_overlap: Overlap between chunks in tokens (default: 128)
                - min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
                - tokenizer: Tokenizer for chunking (default: "character")
                - split_by_headers: Whether to split by headers (default: True)
                - preserve_code_blocks: Whether to keep code blocks intact (default: True)
                - extract_frontmatter: Whether to extract YAML frontmatter (default: True)
                - include_frontmatter_in_chunks: Whether to include frontmatter in chunks (default: False)
                - encoding: File encoding (default: "utf-8")
        """
        super().__init__(name=name, config=config)

        # Extract configuration
        config = config or {}
        self._encoding = config.get('encoding', 'utf-8')
        self._split_by_headers = config.get('split_by_headers', True)
        self._preserve_code_blocks = config.get('preserve_code_blocks', True)
        self._extract_frontmatter = config.get('extract_frontmatter', True)
        self._include_frontmatter_in_chunks = config.get(
            'include_frontmatter_in_chunks', False)

        # Chunking configuration
        self._chunk_size = config.get('chunk_size', self.DEFAULT_CHUNK_SIZE)
        self._chunk_overlap = config.get(
            'chunk_overlap', self.DEFAULT_CHUNK_OVERLAP)
        self._min_sentences_per_chunk = config.get(
            'min_sentences_per_chunk', 1)
        self._tokenizer = config.get('tokenizer', 'character')

        # Initialize advanced chunker
        embed_model_id = config.get(
            'embed_model_id', self.DEFAULT_EMBED_MODEL_ID)
        self._chunker = AdvancedChunker(
            embed_model_id=embed_model_id,
            strategy="default"
        )

        # Store last extraction info
        self._last_frontmatter = None
        self._last_headers = []

        logger.info(
            f"Initialized MdLoader with chunk_size={self._chunk_size}, chunk_overlap={self._chunk_overlap}")

    def run(self, source: str) -> List[str]:
        """
        Execute the primary operation for the component.

        This method satisfies the BaseComponent abstract method requirement
        and delegates to load_as_chunks.

        Args:
            source: Path to Markdown file

        Returns:
            List of text chunks extracted from the Markdown file
        """
        return self.load_as_chunks(source)

    def load_as_text(
        self,
        source: Union[str, Path],
    ) -> str:
        """
        Load Markdown file and return as a single text string.

        This method extracts all text from the Markdown file and returns it
        as a single string without chunking. If frontmatter extraction is
        enabled, the frontmatter block is removed from the returned text and
        made available via get_frontmatter().

        Args:
            source: Path to Markdown file

        Returns:
            Full text content of the Markdown file as a single string

        Raises:
            FileNotFoundError: If source file doesn't exist
            ValueError: If source is not a Markdown file
            Exception: If file processing fails
        """
        # Convert Path to string
        if isinstance(source, Path):
            source = str(source)

        # Validate file exists
        if not os.path.isfile(source):
            raise FileNotFoundError(f"File not found: {source}")

        # Validate file is a Markdown file
        if not self._is_md_file(source):
            raise ValueError(f"File is not a Markdown file: {source}")

        logger.info(f"Loading Markdown as text: {source}")
        start_time = time.time()

        try:
            # Read file content
            with open(source, 'r', encoding=self._encoding) as f:
                content = f.read()

            # Extract and optionally remove frontmatter
            content, frontmatter = self._process_frontmatter(content)
            self._last_frontmatter = frontmatter

            # Extract headers for metadata
            self._last_headers = self._extract_headers(content)

            elapsed = time.time() - start_time
            logger.info(
                f"Markdown loaded as text in {elapsed:.2f}s: {len(content)} characters")

            return content

        except Exception as e:
            logger.error(f"Error loading Markdown as text {source}: {e}")
            raise

    def load_as_chunks(
        self,
        source: Union[str, Path],
    ) -> List[str]:
        """
        Load Markdown file and return as a list of text chunks.

        This method extracts text from the Markdown file, processes it with
        the configured chunker, and returns a list of text chunks.

        Args:
            source: Path to Markdown file

        Returns:
            List of text chunks extracted from the Markdown file

        Raises:
            FileNotFoundError: If source file doesn't exist
            ValueError: If source is not a Markdown file
            Exception: If file processing fails
        """
        # Convert Path to string
        if isinstance(source, Path):
            source = str(source)

        # Validate file exists
        if not os.path.isfile(source):
            raise FileNotFoundError(f"File not found: {source}")

        # Validate file is a Markdown file
        if not self._is_md_file(source):
            raise ValueError(f"File is not a Markdown file: {source}")

        logger.info(f"Loading Markdown file: {source}")
        start_time = time.time()

        try:
            # Read file content
            with open(source, 'r', encoding=self._encoding) as f:
                content = f.read()

            # Extract and optionally remove frontmatter
            content, frontmatter = self._process_frontmatter(content)
            self._last_frontmatter = frontmatter

            # Extract headers for metadata
            self._last_headers = self._extract_headers(content)

            # Chunk the content
            if self._split_by_headers:
                text_chunks = self._chunk_by_headers(content)
            else:
                text_chunks = self._chunk_text(content)

            # Optionally prepend frontmatter to first chunk
            if self._include_frontmatter_in_chunks and frontmatter and text_chunks:
                frontmatter_text = self._frontmatter_to_text(frontmatter)
                text_chunks[0] = frontmatter_text + "\n\n" + text_chunks[0]

            elapsed = time.time() - start_time
            logger.info(
                f"Markdown processed in {elapsed:.2f}s: {len(text_chunks)} chunks")

            return text_chunks

        except Exception as e:
            logger.error(f"Error processing Markdown {source}: {e}")
            raise
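
    # Illustrative call sequence (hypothetical file path, not from the
    # original source):
    #     loader = MdLoader(config={"chunk_size": 1024, "chunk_overlap": 64})
    #     chunks = loader.load_as_chunks("docs/guide.md")
    # chunks is a list of strings; the frontmatter and headers of the last
    # processed file are then available via get_frontmatter()/get_headers().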

    def load_as_nodes(
        self,
        source: Union[str, Path],
        source_id: Optional[str] = None,
        custom_metadata: Optional[Dict[str, Any]] = None
    ) -> List[Node]:
        """
        Load Markdown file and return as Node objects with metadata.

        Args:
            source: Path to Markdown file
            source_id: Optional source identifier (defaults to file path)
            custom_metadata: Optional custom metadata to attach to nodes

        Returns:
            List of Node objects with text chunks and metadata
        """
        # Convert Path to string
        if isinstance(source, Path):
            source = str(source)

        # Load text chunks
        chunks = self.load_as_chunks(source)

        # Determine source ID
        if source_id is None:
            source_id = source

        # Build custom metadata with frontmatter if available
        node_custom_metadata = custom_metadata.copy() if custom_metadata else {}
        if self._last_frontmatter:
            node_custom_metadata['frontmatter'] = self._last_frontmatter
        if self._last_headers:
            node_custom_metadata['headers'] = self._last_headers

        # Create nodes with metadata; each node gets its own copy of the
        # custom dict so mutating one node's metadata cannot leak into others
        nodes = []
        for idx, chunk in enumerate(chunks):
            metadata = NodeMetadata(
                source_file_uuid=source_id,
                position=idx,
                custom=dict(node_custom_metadata)
            )
            node = Node(content=chunk, metadata=metadata)
            nodes.append(node)

        logger.info(f"Created {len(nodes)} nodes from Markdown: {source}")
        return nodes
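
    # Sketch of the resulting metadata for a file with frontmatter (field
    # names follow the NodeMetadata constructor call above; the path is
    # hypothetical):
    #     NodeMetadata(
    #         source_file_uuid="docs/guide.md",  # or the supplied source_id
    #         position=0,                        # chunk index
    #         custom={"frontmatter": {...}, "headers": [...]}
    #     )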

    def load_as_vsfile(
        self,
        file_path: Union[str, Path],
        custom_metadata: Optional[Dict[str, Any]] = None
    ) -> VSFile:
        """
        Load Markdown file and return as VSFile object.

        Args:
            file_path: Path to Markdown file
            custom_metadata: Optional custom metadata

        Returns:
            VSFile object with nodes

        Raises:
            FileNotFoundError: If file doesn't exist
            ValueError: If file is not a Markdown file
        """
        if isinstance(file_path, Path):
            file_path = str(file_path)

        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        if not self._is_md_file(file_path):
            raise ValueError(f"File is not a Markdown file: {file_path}")

        # Create VSFile
        vsfile = VSFile(file_path)

        # Load and create nodes
        nodes = self.load_as_nodes(
            file_path, str(vsfile.uuid), custom_metadata)
        vsfile.nodes = nodes
        vsfile.processed = True

        logger.info(
            f"Created VSFile with {len(nodes)} nodes from: {file_path}")
        return vsfile
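
    # Illustrative use (hypothetical path): the returned VSFile carries its
    # own uuid, which is propagated into each node's source_file_uuid:
    #     vsfile = MdLoader().load_as_vsfile("docs/guide.md")
    #     print(vsfile.uuid, len(vsfile.nodes), vsfile.processed)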

    def get_frontmatter(self) -> Optional[Dict[str, Any]]:
        """
        Get the frontmatter from the last processed file.

        Returns:
            Dictionary of frontmatter key-value pairs, or None if no frontmatter
        """
        return self._last_frontmatter

    def get_headers(self) -> List[Dict[str, Any]]:
        """
        Get the headers from the last processed file.

        Returns:
            List of header dictionaries with 'level' and 'text' keys
        """
        return self._last_headers

    def _is_md_file(self, file_path: str) -> bool:
        """
        Check if file is a Markdown file based on extension.

        Args:
            file_path: Path to file

        Returns:
            True if file is a Markdown file, False otherwise
        """
        path = Path(file_path)
        return path.suffix.lower() in ['.md', '.markdown', '.mdown', '.mkd', '.mkdn']

    def _process_frontmatter(self, content: str) -> tuple[str, Optional[Dict[str, Any]]]:
        """
        Extract and optionally remove YAML frontmatter from content.

        Args:
            content: Raw markdown content

        Returns:
            Tuple of (content without frontmatter, frontmatter dict or None)
        """
        if not self._extract_frontmatter:
            return content, None

        match = self.FRONTMATTER_PATTERN.match(content)
        if not match:
            return content, None

        try:
            import yaml
            frontmatter_text = match.group(1)
            frontmatter = yaml.safe_load(frontmatter_text)

            # Remove frontmatter from content
            content_without_frontmatter = content[match.end():]

            logger.debug(
                f"Extracted frontmatter with {len(frontmatter) if frontmatter else 0} keys")
            return content_without_frontmatter, frontmatter

        except ImportError:
            logger.warning(
                "PyYAML not installed; skipping frontmatter extraction.")
            return content, None
        except Exception as e:
            logger.warning(f"Failed to parse frontmatter: {e}")
            return content, None
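
    # Worked example (illustrative input, not from the original docs):
    #     _process_frontmatter("---\ntitle: Intro\n---\nBody text")
    # returns ("Body text", {"title": "Intro"}); content without a leading
    # "---" block is returned unchanged alongside None.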

    def _frontmatter_to_text(self, frontmatter: Dict[str, Any]) -> str:
        """
        Convert frontmatter dictionary to readable text.

        Args:
            frontmatter: Frontmatter dictionary

        Returns:
            Human-readable text representation
        """
        if not frontmatter:
            return ""

        lines = ["--- Document Metadata ---"]
        for key, value in frontmatter.items():
            if isinstance(value, list):
                value = ", ".join(str(v) for v in value)
            lines.append(f"{key}: {value}")
        lines.append("---")

        return "\n".join(lines)

    def _extract_headers(self, content: str) -> List[Dict[str, Any]]:
        """
        Extract all headers from markdown content.

        Args:
            content: Markdown content

        Returns:
            List of header dictionaries with 'level' and 'text' keys
        """
        headers = []
        for match in self.HEADER_PATTERN.finditer(content):
            level = len(match.group(1))
            text = match.group(2).strip()
            headers.append({
                'level': level,
                'text': text
            })

        return headers
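
    # Example (illustrative): "## Usage" yields {'level': 2, 'text': 'Usage'},
    # since group 1 captures the run of "#" and group 2 the header text.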

    def _chunk_by_headers(self, content: str) -> List[str]:
        """
        Split content by headers while preserving code blocks.

        Args:
            content: Markdown content

        Returns:
            List of text chunks split by headers
        """
        if not content or not content.strip():
            return []

        # Preserve code blocks by replacing them with placeholders
        code_blocks = []
        if self._preserve_code_blocks:
            def replace_code_block(match):
                code_blocks.append(match.group(0))
                return f"__CODE_BLOCK_{len(code_blocks) - 1}__"

            content = self.CODE_BLOCK_PATTERN.sub(replace_code_block, content)

        # Split by headers
        chunks = []
        current_chunk = []

        for line in content.split('\n'):
            header_match = self.HEADER_PATTERN.match(line)

            if header_match:
                # Save previous chunk if it has content
                if current_chunk:
                    chunk_text = '\n'.join(current_chunk).strip()
                    if chunk_text:
                        chunks.append(chunk_text)

                # Start new chunk with this header
                current_chunk = [line]
            else:
                current_chunk.append(line)

        # Add final chunk
        if current_chunk:
            chunk_text = '\n'.join(current_chunk).strip()
            if chunk_text:
                chunks.append(chunk_text)

        # Restore code blocks
        if self._preserve_code_blocks and code_blocks:
            restored_chunks = []
            for chunk in chunks:
                for i, code_block in enumerate(code_blocks):
                    chunk = chunk.replace(f"__CODE_BLOCK_{i}__", code_block)
                restored_chunks.append(chunk)
            chunks = restored_chunks

        # If chunks are too large, further split them
        final_chunks = []
        for chunk in chunks:
            if len(chunk) > self._chunk_size * 4:  # Rough character estimate
                sub_chunks = self._chunk_text(chunk)
                final_chunks.extend(sub_chunks)
            else:
                final_chunks.append(chunk)

        # If no chunks were created, fall back to standard chunking
        if not final_chunks:
            return self._chunk_text(content)

        return final_chunks
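
    # Walkthrough of the placeholder mechanism (illustrative input): a fenced
    # block containing a line like "# not a header" is first swapped for
    # "__CODE_BLOCK_0__", so its "#" line cannot trigger a header split; after
    # splitting, the placeholder is substituted back, leaving the code block
    # intact inside its section's chunk.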

    def _chunk_text(self, text: str) -> List[str]:
        """
        Chunk text using AdvancedChunker's chunk_text method.

        Args:
            text: Full text to chunk

        Returns:
            List of text chunks
        """
        if not text or not text.strip():
            return []

        try:
            # Use AdvancedChunker's chunk_text method for plain text
            chunk_dicts = self._chunker.chunk_text(
                text=text,
                chunk_size=self._chunk_size,
                chunk_overlap=self._chunk_overlap,
                min_sentences_per_chunk=self._min_sentences_per_chunk,
                tokenizer=self._tokenizer
            )

            # Extract just the text from the chunk dictionaries
            text_chunks = [chunk_dict['text'] for chunk_dict in chunk_dicts]

            logger.debug(f"Chunked text into {len(text_chunks)} chunks")
            return text_chunks

        except Exception as e:
            logger.warning(f"Failed to chunk text with AdvancedChunker: {e}")
            # Fall back to returning the whole text as a single chunk
            logger.info("Falling back to single chunk")
            return [text]


def create_md_loader(
    chunk_size: int = 2048,
    chunk_overlap: int = 128,
    min_sentences_per_chunk: int = 1,
    tokenizer: str = "character",
    embed_model_id: str = "sentence-transformers/all-MiniLM-L6-v2",
    split_by_headers: bool = True,
    preserve_code_blocks: bool = True,
    extract_frontmatter: bool = True,
    include_frontmatter_in_chunks: bool = False,
    encoding: str = "utf-8"
) -> MdLoader:
    """
    Factory function to create a Markdown loader.

    Args:
        chunk_size: Maximum tokens per chunk (default: 2048)
        chunk_overlap: Overlap between chunks in tokens (default: 128)
        min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
        tokenizer: Tokenizer for chunking - "character", "gpt2", or HuggingFace model (default: "character")
        embed_model_id: HuggingFace model ID for tokenization (default: "sentence-transformers/all-MiniLM-L6-v2")
        split_by_headers: Whether to split content by headers (default: True)
        preserve_code_blocks: Whether to keep code blocks intact (default: True)
        extract_frontmatter: Whether to extract YAML frontmatter (default: True)
        include_frontmatter_in_chunks: Whether to include frontmatter in chunks (default: False)
        encoding: File encoding (default: "utf-8")

    Returns:
        Configured Markdown loader

    Example:
        >>> loader = create_md_loader(chunk_size=1024, chunk_overlap=64)
        >>> chunks = loader.run("docs/README.md")
        >>> print(f"Extracted {len(chunks)} chunks")

        >>> # Create loader without header splitting
        >>> loader = create_md_loader(split_by_headers=False)
        >>> chunks = loader.run("docs/README.md")

        >>> # Access frontmatter after loading
        >>> loader = create_md_loader()
        >>> chunks = loader.run("docs/article.md")
        >>> frontmatter = loader.get_frontmatter()
        >>> if frontmatter:
        ...     print(f"Title: {frontmatter.get('title')}")
    """
    config = {
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
        'min_sentences_per_chunk': min_sentences_per_chunk,
        'tokenizer': tokenizer,
        'embed_model_id': embed_model_id,
        'split_by_headers': split_by_headers,
        'preserve_code_blocks': preserve_code_blocks,
        'extract_frontmatter': extract_frontmatter,
        'include_frontmatter_in_chunks': include_frontmatter_in_chunks,
        'encoding': encoding
    }

    return MdLoader(config=config)


__all__ = ["MdLoader", "create_md_loader"]