rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. rakam_systems_vectorstore/MANIFEST.in +26 -0
  2. rakam_systems_vectorstore/README.md +1071 -0
  3. rakam_systems_vectorstore/__init__.py +93 -0
  4. rakam_systems_vectorstore/components/__init__.py +0 -0
  5. rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  6. rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  7. rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  8. rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  9. rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  10. rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  11. rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  12. rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  13. rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  14. rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  15. rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  16. rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  17. rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  18. rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  19. rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  20. rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  21. rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  22. rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  23. rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  24. rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  25. rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  26. rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  27. rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  28. rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  29. rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  30. rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  31. rakam_systems_vectorstore/config.py +266 -0
  32. rakam_systems_vectorstore/core.py +8 -0
  33. rakam_systems_vectorstore/pyproject.toml +113 -0
  34. rakam_systems_vectorstore/server/README.md +290 -0
  35. rakam_systems_vectorstore/server/__init__.py +20 -0
  36. rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  37. rakam_systems_vectorstore/setup.py +103 -0
  38. rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
  39. rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
  40. rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
rakam_systems_vectorstore/components/loader/code_loader.py
@@ -0,0 +1,699 @@
+ """
+ Code Loader for processing source code files.
+
+ This loader handles various programming language files and provides:
+ - Language detection based on file extension
+ - Syntax-aware chunking that preserves code structure
+ - Support for multiple languages (Python, JavaScript, TypeScript, Java, C/C++, Go, Rust, etc.)
+ - Comment and docstring extraction
+ - Function/class boundary detection for smarter chunking
+ """
+
+ from __future__ import annotations
+
+ import os
+ import re
+ import time
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Union
+
+ from rakam_systems_core.ai_utils import logging
+ from rakam_systems_core.ai_core.interfaces.loader import Loader
+ from rakam_systems_vectorstore.components.chunker import TextChunker
+ from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile
+
+ logger = logging.getLogger(__name__)
+
+
+ class CodeLoader(Loader):
+     """
+     Code loader for processing source code files.
+
+     This loader provides code file processing with support for:
+     - Multiple programming languages
+     - Language detection based on file extension
+     - Syntax-aware chunking that preserves code structure
+     - Comment and docstring extraction
+
+     The extracted content is chunked and returned as text or Node objects.
+     """
+
+     # Default configuration
+     DEFAULT_CHUNK_SIZE = 2000
+     DEFAULT_CHUNK_OVERLAP = 200
+     DEFAULT_MIN_SENTENCES_PER_CHUNK = 3
+     DEFAULT_TOKENIZER = "character"
+
+     # Language detection by file extension
+     EXTENSION_TO_LANGUAGE = {
+         # Python
+         '.py': 'python',
+         '.pyw': 'python',
+         '.pyi': 'python',
+
+         # JavaScript/TypeScript
+         '.js': 'javascript',
+         '.jsx': 'javascript',
+         '.ts': 'typescript',
+         '.tsx': 'typescript',
+         '.mjs': 'javascript',
+         '.cjs': 'javascript',
+
+         # Java/Kotlin
+         '.java': 'java',
+         '.kt': 'kotlin',
+         '.kts': 'kotlin',
+
+         # C/C++
+         '.c': 'c',
+         '.h': 'c',
+         '.cpp': 'cpp',
+         '.cc': 'cpp',
+         '.cxx': 'cpp',
+         '.hpp': 'cpp',
+         '.hxx': 'cpp',
+
+         # C#
+         '.cs': 'csharp',
+
+         # Go
+         '.go': 'go',
+
+         # Rust
+         '.rs': 'rust',
+
+         # Ruby
+         '.rb': 'ruby',
+         '.rake': 'ruby',
+
+         # PHP
+         '.php': 'php',
+
+         # Swift
+         '.swift': 'swift',
+
+         # Scala
+         '.scala': 'scala',
+
+         # Shell
+         '.sh': 'shell',
+         '.bash': 'shell',
+         '.zsh': 'shell',
+
+         # SQL
+         '.sql': 'sql',
+
+         # R
+         '.r': 'r',
+         '.R': 'r',
+
+         # Lua
+         '.lua': 'lua',
+
+         # Perl
+         '.pl': 'perl',
+         '.pm': 'perl',
+
+         # Haskell
+         '.hs': 'haskell',
+
+         # Elixir/Erlang
+         '.ex': 'elixir',
+         '.exs': 'elixir',
+         '.erl': 'erlang',
+
+         # Dart
+         '.dart': 'dart',
+
+         # YAML
+         '.yaml': 'yaml',
+         '.yml': 'yaml',
+
+         # TOML
+         '.toml': 'toml',
+
+         # Config files
+         '.json': 'json',
+         '.xml': 'xml',
+         '.ini': 'ini',
+         '.cfg': 'ini',
+         '.conf': 'ini',
+     }
+
+     # Supported code file extensions
+     SUPPORTED_EXTENSIONS = set(EXTENSION_TO_LANGUAGE.keys())
+
+     def __init__(
+         self,
+         name: str = "code_loader",
+         config: Optional[Dict[str, Any]] = None
+     ):
+         """
+         Initialize Code loader.
+
+         Args:
+             name: Component name
+             config: Optional configuration with keys:
+                 - chunk_size: Maximum tokens per chunk (default: 2000)
+                 - chunk_overlap: Overlap between chunks in tokens (default: 200)
+                 - min_sentences_per_chunk: Minimum sentences per chunk (default: 3)
+                 - tokenizer: Tokenizer for chunking (default: "character")
+                 - preserve_structure: Whether to preserve code structure in chunks (default: True)
+                 - include_comments: Whether to include comments in output (default: True)
+                 - encoding: File encoding (default: "utf-8")
+         """
+         super().__init__(name=name, config=config)
+
+         # Extract configuration
+         config = config or {}
+         self._chunk_size = config.get('chunk_size', self.DEFAULT_CHUNK_SIZE)
+         self._chunk_overlap = config.get(
+             'chunk_overlap', self.DEFAULT_CHUNK_OVERLAP)
+         self._min_sentences_per_chunk = config.get(
+             'min_sentences_per_chunk', self.DEFAULT_MIN_SENTENCES_PER_CHUNK)
+         self._tokenizer = config.get('tokenizer', self.DEFAULT_TOKENIZER)
+         self._preserve_structure = config.get('preserve_structure', True)
+         self._include_comments = config.get('include_comments', True)
+         self._encoding = config.get('encoding', 'utf-8')
+
+         # Initialize text chunker
+         self._chunker = TextChunker(
+             chunk_size=self._chunk_size,
+             chunk_overlap=self._chunk_overlap,
+             min_sentences_per_chunk=self._min_sentences_per_chunk,
+             tokenizer=self._tokenizer
+         )
+
+         logger.info(
+             f"Initialized CodeLoader with chunk_size={self._chunk_size}, chunk_overlap={self._chunk_overlap}")
+
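+     # Usage sketch (illustrative comment, not part of the packaged file;
+     # paths and values are hypothetical):
+     #
+     #     loader = CodeLoader(config={"chunk_size": 1024, "chunk_overlap": 64})
+     #     chunks = loader.load_as_chunks("src/main.py")
+     #
+     # Any config key left unset falls back to the DEFAULT_* class attributes.
+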
+     def run(self, source: str) -> List[str]:
+         """
+         Execute the primary operation for the component.
+
+         This method satisfies the BaseComponent abstract method requirement
+         and delegates to load_as_chunks.
+
+         Args:
+             source: Path to code file
+
+         Returns:
+             List of text chunks extracted from the code file
+         """
+         return self.load_as_chunks(source)
+
+     def load_as_text(
+         self,
+         source: Union[str, Path],
+     ) -> str:
+         """
+         Load code file and return as a single text string.
+
+         This method extracts all text from the code file and returns it as a single
+         string without chunking. Useful when you need the full code content.
+
+         Args:
+             source: Path to code file
+
+         Returns:
+             Full text content of the code file as a single string
+
+         Raises:
+             FileNotFoundError: If source file doesn't exist
+             ValueError: If source is not a supported code file
+             Exception: If code processing fails
+         """
+         # Convert Path to string
+         if isinstance(source, Path):
+             source = str(source)
+
+         # Validate file exists
+         if not os.path.isfile(source):
+             raise FileNotFoundError(f"File not found: {source}")
+
+         # Validate file is a code file
+         if not self._is_code_file(source):
+             raise ValueError(
+                 f"File is not a supported code file: {source}. Extension: {Path(source).suffix}")
+
+         logger.info(f"Loading code file as text: {source}")
+         start_time = time.time()
+
+         try:
+             # Read file content
+             with open(source, 'r', encoding=self._encoding, errors='replace') as f:
+                 content = f.read()
+
+             elapsed = time.time() - start_time
+             logger.info(
+                 f"Code file loaded as text in {elapsed:.2f}s: {len(content)} characters")
+
+             return content
+
+         except Exception as e:
+             logger.error(f"Error loading code file as text {source}: {e}")
+             raise
+
+     def load_as_chunks(
+         self,
+         source: Union[str, Path],
+     ) -> List[str]:
+         """
+         Load code file and return as a list of text chunks.
+
+         This method extracts text from the code file, processes it with structure-aware
+         chunking, and returns a list of text chunks.
+
+         Args:
+             source: Path to code file
+
+         Returns:
+             List of text chunks extracted from the code file
+
+         Raises:
+             FileNotFoundError: If source file doesn't exist
+             ValueError: If source is not a supported code file
+             Exception: If code processing fails
+         """
+         # Convert Path to string
+         if isinstance(source, Path):
+             source = str(source)
+
+         # Validate file exists
+         if not os.path.isfile(source):
+             raise FileNotFoundError(f"File not found: {source}")
+
+         # Validate file is a code file
+         if not self._is_code_file(source):
+             raise ValueError(
+                 f"File is not a supported code file: {source}. Extension: {Path(source).suffix}")
+
+         logger.info(f"Loading code file: {source}")
+         start_time = time.time()
+
+         try:
+             # Read file content
+             with open(source, 'r', encoding=self._encoding, errors='replace') as f:
+                 content = f.read()
+
+             # Detect language
+             language = self._detect_language(source)
+
+             # Process code with structure-aware chunking
+             if self._preserve_structure:
+                 text_chunks = self._chunk_code_with_structure(
+                     content, language)
+             else:
+                 text_chunks = self._chunk_text(content, language)
+
+             elapsed = time.time() - start_time
+             logger.info(
+                 f"Code file processed in {elapsed:.2f}s: {len(text_chunks)} chunks")
+
+             return text_chunks
+
+         except Exception as e:
+             logger.error(f"Error processing code file {source}: {e}")
+             raise
+
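+     # Editorial note on the two chunking paths above (not part of the file):
+     # with preserve_structure=True the source is first split at function/class
+     # boundaries and small blocks are packed together via
+     # _chunk_code_with_structure; with preserve_structure=False the whole file
+     # is handed to TextChunker as one flat string via _chunk_text.
+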
+     def load_as_nodes(
+         self,
+         source: Union[str, Path],
+         source_id: Optional[str] = None,
+         custom_metadata: Optional[Dict[str, Any]] = None
+     ) -> List[Node]:
+         """
+         Load code file and return as Node objects with metadata.
+
+         Args:
+             source: Path to code file
+             source_id: Optional source identifier (defaults to file path)
+             custom_metadata: Optional custom metadata to attach to nodes
+
+         Returns:
+             List of Node objects with text chunks and metadata
+         """
+         # Convert Path to string
+         if isinstance(source, Path):
+             source = str(source)
+
+         # Load text chunks
+         chunks = self.load_as_chunks(source)
+
+         # Determine source ID
+         if source_id is None:
+             source_id = source
+
+         # Detect language for metadata
+         language = self._detect_language(source)
+
+         # Create nodes with metadata
+         nodes = []
+         for idx, chunk in enumerate(chunks):
+             # Build custom metadata with language info
+             node_custom = custom_metadata.copy() if custom_metadata else {}
+             node_custom['language'] = language
+             node_custom['file_extension'] = Path(source).suffix
+
+             metadata = NodeMetadata(
+                 source_file_uuid=source_id,
+                 position=idx,
+                 custom=node_custom
+             )
+             node = Node(content=chunk, metadata=metadata)
+             nodes.append(node)
+
+         logger.info(f"Created {len(nodes)} nodes from code file: {source}")
+         return nodes
+
+     def load_as_vsfile(
+         self,
+         file_path: Union[str, Path],
+         custom_metadata: Optional[Dict[str, Any]] = None
+     ) -> VSFile:
+         """
+         Load code file and return as VSFile object.
+
+         Args:
+             file_path: Path to code file
+             custom_metadata: Optional custom metadata
+
+         Returns:
+             VSFile object with nodes
+
+         Raises:
+             FileNotFoundError: If file doesn't exist
+             ValueError: If file is not a supported code file
+         """
+         if isinstance(file_path, Path):
+             file_path = str(file_path)
+
+         if not os.path.isfile(file_path):
+             raise FileNotFoundError(f"File not found: {file_path}")
+
+         if not self._is_code_file(file_path):
+             raise ValueError(f"File is not a supported code file: {file_path}")
+
+         # Create VSFile
+         vsfile = VSFile(file_path)
+
+         # Load and create nodes
+         nodes = self.load_as_nodes(
+             file_path, str(vsfile.uuid), custom_metadata)
+         vsfile.nodes = nodes
+         vsfile.processed = True
+
+         logger.info(
+             f"Created VSFile with {len(nodes)} nodes from: {file_path}")
+         return vsfile
+
+     def _is_code_file(self, file_path: str) -> bool:
+         """
+         Check if file is a supported code file based on extension.
+
+         Args:
+             file_path: Path to file
+
+         Returns:
+             True if file is a supported code file, False otherwise
+         """
+         path = Path(file_path)
+         return path.suffix.lower() in self.SUPPORTED_EXTENSIONS
+
+     def _detect_language(self, file_path: str) -> str:
+         """
+         Detect programming language based on file extension.
+
+         Args:
+             file_path: Path to code file
+
+         Returns:
+             Language name string
+         """
+         path = Path(file_path)
+         suffix = path.suffix.lower()
+         return self.EXTENSION_TO_LANGUAGE.get(suffix, 'unknown')
+
+     def _chunk_code_with_structure(self, content: str, language: str) -> List[str]:
+         """
+         Chunk code while preserving structural boundaries.
+
+         This method attempts to split code at logical boundaries like
+         function definitions, class definitions, etc.
+
+         Args:
+             content: Code content
+             language: Programming language
+
+         Returns:
+             List of text chunks
+         """
+         if not content or not content.strip():
+             return []
+
+         # Split by structural elements based on language
+         blocks = self._split_by_structure(content, language)
+
+         # Chunk each block, combining small ones
+         chunks = []
+         current_chunk = []
+         current_size = 0
+
+         for block in blocks:
+             block_size = len(block)
+
+             # If block is too large, chunk it separately
+             if block_size > self._chunk_size:
+                 # Save current accumulated chunk
+                 if current_chunk:
+                     chunks.append('\n\n'.join(current_chunk))
+                     current_chunk = []
+                     current_size = 0
+
+                 # Chunk the large block
+                 sub_chunks = self._chunk_text(block, language)
+                 chunks.extend(sub_chunks)
+
+             # If adding this block would exceed limit, save current and start new
+             elif current_size + block_size > self._chunk_size:
+                 if current_chunk:
+                     chunks.append('\n\n'.join(current_chunk))
+                 current_chunk = [block]
+                 current_size = block_size
+
+             # Add to current chunk
+             else:
+                 current_chunk.append(block)
+                 current_size += block_size
+
+         # Don't forget the last chunk
+         if current_chunk:
+             chunks.append('\n\n'.join(current_chunk))
+
+         return chunks if chunks else [content]
+
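+     # Worked example (hypothetical sizes, not part of the file): with
+     # chunk_size=2000 and blocks of 800, 900, and 600 characters, the first
+     # two blocks pack into one chunk (800 + 900 <= 2000) and the 600-character
+     # block starts a new chunk (1700 + 600 > 2000). A single 5000-character
+     # block would instead be delegated to _chunk_text and split on its own.
+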
+     def _split_by_structure(self, content: str, language: str) -> List[str]:
+         """
+         Split code by structural elements (functions, classes, etc.).
+
+         Args:
+             content: Code content
+             language: Programming language
+
+         Returns:
+             List of code blocks
+         """
+         # Language-specific patterns for structural elements
+         patterns = self._get_structure_patterns(language)
+
+         if not patterns:
+             # Fall back to line-based splitting
+             return self._split_by_blank_lines(content)
+
+         # Find all structural boundaries
+         blocks = []
+         lines = content.split('\n')
+         current_block = []
+
+         for line in lines:
+             # Check if this line starts a new structural element
+             is_boundary = any(re.match(pattern, line) for pattern in patterns)
+
+             if is_boundary and current_block:
+                 # Save current block and start new one
+                 blocks.append('\n'.join(current_block))
+                 current_block = [line]
+             else:
+                 current_block.append(line)
+
+         # Add final block
+         if current_block:
+             blocks.append('\n'.join(current_block))
+
+         return blocks
+
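+     # For example (illustrative): in a Python file with two top-level
+     # functions, each line matching r'^def\s+\w+' closes the running block
+     # and opens a new one, so the file splits into one block per function,
+     # with any leading imports kept in the first block.
+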
+     def _get_structure_patterns(self, language: str) -> List[str]:
+         """
+         Get regex patterns for structural elements in a language.
+
+         Args:
+             language: Programming language
+
+         Returns:
+             List of regex patterns
+         """
+         patterns = {
+             'python': [
+                 r'^class\s+\w+',  # class definition
+                 r'^def\s+\w+',  # function definition
+                 r'^async\s+def\s+\w+',  # async function
+                 # decorator (start of decorated block)
+                 r'^@\w+',
+             ],
+             'javascript': [
+                 r'^(export\s+)?(async\s+)?function\s+\w+',  # function
+                 r'^(export\s+)?class\s+\w+',  # class
+                 # arrow function
+                 r'^(export\s+)?(const|let|var)\s+\w+\s*=\s*(async\s+)?\(?\w*\)?\s*=>',
+             ],
+             'typescript': [
+                 r'^(export\s+)?(async\s+)?function\s+\w+',
+                 r'^(export\s+)?class\s+\w+',
+                 r'^(export\s+)?(const|let|var)\s+\w+\s*=\s*(async\s+)?\(?\w*\)?\s*=>',
+                 r'^(export\s+)?interface\s+\w+',
+                 r'^(export\s+)?type\s+\w+',
+             ],
+             'java': [
+                 r'^(public|private|protected)?\s*(static\s+)?class\s+\w+',
+                 r'^(public|private|protected)?\s*(static\s+)?\w+\s+\w+\s*\(',
+                 r'^(public|private|protected)?\s*interface\s+\w+',
+             ],
+             'go': [
+                 r'^func\s+(\(\w+\s+\*?\w+\)\s+)?\w+',  # function or method
+                 r'^type\s+\w+\s+(struct|interface)',  # type definition
+             ],
+             'rust': [
+                 r'^(pub\s+)?fn\s+\w+',  # function
+                 r'^(pub\s+)?struct\s+\w+',  # struct
+                 r'^(pub\s+)?enum\s+\w+',  # enum
+                 r'^(pub\s+)?trait\s+\w+',  # trait
+                 r'^impl\s+',  # impl block
+             ],
+             'cpp': [
+                 r'^class\s+\w+',
+                 r'^(virtual\s+)?(static\s+)?\w+\s+\w+\s*\(',
+                 r'^namespace\s+\w+',
+             ],
+             'c': [
+                 r'^\w+\s+\w+\s*\(',  # function definition
+                 r'^struct\s+\w+',
+                 r'^typedef\s+',
+             ],
+             'ruby': [
+                 r'^class\s+\w+',
+                 r'^module\s+\w+',
+                 r'^def\s+\w+',
+             ],
+             'php': [
+                 r'^(public|private|protected)?\s*(static\s+)?function\s+\w+',
+                 r'^class\s+\w+',
+                 r'^interface\s+\w+',
+                 r'^trait\s+\w+',
+             ],
+         }
+
+         return patterns.get(language, [])
+
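+     # Sanity check (illustrative, not part of the file):
+     # re.match(r'^(pub\s+)?fn\s+\w+', 'pub fn new') matches, so a top-level
+     # Rust 'pub fn' line opens a new block, while an indented body line
+     # matches no pattern and is appended to the current block.
+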
+     def _split_by_blank_lines(self, content: str) -> List[str]:
+         """
+         Split content by blank lines as a fallback.
+
+         Args:
+             content: Code content
+
+         Returns:
+             List of code blocks
+         """
+         # Split by one or more blank lines
+         blocks = re.split(r'\n\s*\n', content)
+         return [block.strip() for block in blocks if block.strip()]
+
+     def _chunk_text(self, text: str, language: str) -> List[str]:
+         """
+         Chunk text using TextChunker.
+
+         Args:
+             text: Full text to chunk
+             language: Programming language for context
+
+         Returns:
+             List of text chunks
+         """
+         if not text or not text.strip():
+             return []
+
+         try:
+             # Use TextChunker's chunk_text method
+             chunk_dicts = self._chunker.chunk_text(
+                 text, context=f"code_{language}")
+
+             # Extract just the text from the chunk dictionaries
+             text_chunks = [chunk_dict['text'] for chunk_dict in chunk_dicts]
+
+             logger.debug(f"Chunked code text into {len(text_chunks)} chunks")
+             return text_chunks
+
+         except Exception as e:
+             logger.warning(f"Failed to chunk text with TextChunker: {e}")
+             # Fall back to returning the whole text as a single chunk
+             logger.info("Falling back to single chunk")
+             return [text]
+
+
+ def create_code_loader(
+     chunk_size: int = 2000,
+     chunk_overlap: int = 200,
+     min_sentences_per_chunk: int = 3,
+     tokenizer: str = "character",
+     preserve_structure: bool = True,
+     include_comments: bool = True,
+     encoding: str = 'utf-8'
+ ) -> CodeLoader:
+     """
+     Factory function to create a code loader.
+
+     Args:
+         chunk_size: Maximum tokens per chunk (default: 2000)
+         chunk_overlap: Overlap between chunks in tokens (default: 200)
+         min_sentences_per_chunk: Minimum sentences per chunk (default: 3)
+         tokenizer: Tokenizer for chunking - "character", "gpt2", or HuggingFace model (default: "character")
+         preserve_structure: Whether to preserve code structure in chunks (default: True)
+         include_comments: Whether to include comments in output (default: True)
+         encoding: File encoding (default: "utf-8")
+
+     Returns:
+         Configured code loader
+
+     Example:
+         >>> loader = create_code_loader(chunk_size=1024, chunk_overlap=64)
+         >>> chunks = loader.run("src/main.py")
+         >>> print(f"Extracted {len(chunks)} chunks")
+
+         >>> # Create loader without structure preservation
+         >>> loader = create_code_loader(preserve_structure=False)
+         >>> chunks = loader.run("src/utils.js")
+     """
+     config = {
+         'chunk_size': chunk_size,
+         'chunk_overlap': chunk_overlap,
+         'min_sentences_per_chunk': min_sentences_per_chunk,
+         'tokenizer': tokenizer,
+         'preserve_structure': preserve_structure,
+         'include_comments': include_comments,
+         'encoding': encoding
+     }
+
+     return CodeLoader(config=config)
+
+
+ __all__ = ["CodeLoader", "create_code_loader"]
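
A minimal end-to-end sketch of the API this file introduces (file paths and metadata values are illustrative, and the rakam_systems_core dependency is assumed to be installed):

from rakam_systems_vectorstore.components.loader.code_loader import (
    CodeLoader,
    create_code_loader,
)

# Factory path: structure-aware chunking with a smaller chunk budget.
loader = create_code_loader(chunk_size=1024, chunk_overlap=64)
chunks = loader.load_as_chunks("src/main.py")  # hypothetical path
print(f"Extracted {len(chunks)} chunks")

# Direct construction with a config dict, then Node output with metadata.
node_loader = CodeLoader(config={"preserve_structure": False})
nodes = node_loader.load_as_nodes("src/utils.ts", custom_metadata={"repo": "demo"})
for node in nodes[:3]:
    print(node.metadata.position, node.metadata.custom["language"])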