rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rakam_systems_vectorstore/MANIFEST.in +26 -0
- rakam_systems_vectorstore/README.md +1071 -0
- rakam_systems_vectorstore/__init__.py +93 -0
- rakam_systems_vectorstore/components/__init__.py +0 -0
- rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
- rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
- rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
- rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
- rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
- rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
- rakam_systems_vectorstore/components/loader/__init__.py +31 -0
- rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
- rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
- rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
- rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
- rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
- rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
- rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
- rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
- rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
- rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
- rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
- rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
- rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
- rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
- rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
- rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
- rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
- rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
- rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
- rakam_systems_vectorstore/config.py +266 -0
- rakam_systems_vectorstore/core.py +8 -0
- rakam_systems_vectorstore/pyproject.toml +113 -0
- rakam_systems_vectorstore/server/README.md +290 -0
- rakam_systems_vectorstore/server/__init__.py +20 -0
- rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
- rakam_systems_vectorstore/setup.py +103 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
- rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,699 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Code Loader for processing source code files.
|
|
3
|
+
|
|
4
|
+
This loader handles various programming language files and provides:
|
|
5
|
+
- Language detection based on file extension
|
|
6
|
+
- Syntax-aware chunking that preserves code structure
|
|
7
|
+
- Support for multiple languages (Python, JavaScript, TypeScript, Java, C/C++, Go, Rust, etc.)
|
|
8
|
+
- Comment and docstring extraction
|
|
9
|
+
- Function/class boundary detection for smarter chunking
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
import time
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any, Dict, List, Optional, Union
|
|
19
|
+
|
|
20
|
+
from rakam_systems_core.ai_utils import logging
|
|
21
|
+
from rakam_systems_core.ai_core.interfaces.loader import Loader
|
|
22
|
+
from rakam_systems_vectorstore.components.chunker import TextChunker
|
|
23
|
+
from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class CodeLoader(Loader):
    """
    Code loader for processing source code files.

    This loader provides code file processing with support for:
    - Multiple programming languages (mapped from file extension via
      ``EXTENSION_TO_LANGUAGE``)
    - Language detection based on file extension
    - Syntax-aware chunking that splits at function/class boundaries
      when ``preserve_structure`` is enabled

    The extracted content is chunked and returned as text chunks,
    ``Node`` objects, or a ``VSFile``.
    """

    # Default chunking configuration. With the default "character"
    # tokenizer these sizes are effectively character counts (the
    # structural pass also compares blocks against len()).
    DEFAULT_CHUNK_SIZE = 2000
    DEFAULT_CHUNK_OVERLAP = 200
    DEFAULT_MIN_SENTENCES_PER_CHUNK = 3
    DEFAULT_TOKENIZER = "character"

    # Maps a file extension to the language name used by
    # _detect_language() and _get_structure_patterns(). Lookups
    # lower-case the suffix first (see _is_code_file/_detect_language).
    EXTENSION_TO_LANGUAGE = {
        # Python
        '.py': 'python',
        '.pyw': 'python',
        '.pyi': 'python',

        # JavaScript/TypeScript
        '.js': 'javascript',
        '.jsx': 'javascript',
        '.ts': 'typescript',
        '.tsx': 'typescript',
        '.mjs': 'javascript',
        '.cjs': 'javascript',

        # Java/Kotlin
        '.java': 'java',
        '.kt': 'kotlin',
        '.kts': 'kotlin',

        # C/C++
        '.c': 'c',
        '.h': 'c',
        '.cpp': 'cpp',
        '.cc': 'cpp',
        '.cxx': 'cpp',
        '.hpp': 'cpp',
        '.hxx': 'cpp',

        # C#
        '.cs': 'csharp',

        # Go
        '.go': 'go',

        # Rust
        '.rs': 'rust',

        # Ruby
        '.rb': 'ruby',
        '.rake': 'ruby',

        # PHP
        '.php': 'php',

        # Swift
        '.swift': 'swift',

        # Scala
        '.scala': 'scala',

        # Shell
        '.sh': 'shell',
        '.bash': 'shell',
        '.zsh': 'shell',

        # SQL
        '.sql': 'sql',

        # R
        '.r': 'r',
        # NOTE(review): '.R' is never matched at lookup time — suffixes
        # are lower-cased first, so '.r' always wins. Kept for the
        # public SUPPORTED_EXTENSIONS set.
        '.R': 'r',

        # Lua
        '.lua': 'lua',

        # Perl
        '.pl': 'perl',
        '.pm': 'perl',

        # Haskell
        '.hs': 'haskell',

        # Elixir/Erlang
        '.ex': 'elixir',
        '.exs': 'elixir',
        '.erl': 'erlang',

        # Dart
        '.dart': 'dart',

        # YAML
        '.yaml': 'yaml',
        '.yml': 'yaml',

        # TOML
        '.toml': 'toml',

        # Config files
        '.json': 'json',
        '.xml': 'xml',
        '.ini': 'ini',
        '.cfg': 'ini',
        '.conf': 'ini',
    }

    # Supported code file extensions (derived from the mapping above).
    SUPPORTED_EXTENSIONS = set(EXTENSION_TO_LANGUAGE.keys())
|
|
145
|
+
|
|
146
|
+
def __init__(
    self,
    name: str = "code_loader",
    config: Optional[Dict[str, Any]] = None
):
    """
    Initialize the code loader.

    Args:
        name: Component name.
        config: Optional configuration dict. Recognized keys:
            - chunk_size: Maximum tokens per chunk (default: 2000)
            - chunk_overlap: Overlap between chunks in tokens (default: 200)
            - min_sentences_per_chunk: Minimum sentences per chunk (default: 3)
            - tokenizer: Tokenizer for chunking (default: "character")
            - preserve_structure: Preserve code structure in chunks (default: True)
            - include_comments: Include comments in output (default: True)
            - encoding: File encoding (default: "utf-8")
    """
    super().__init__(name=name, config=config)

    # Pull settings out of the config, falling back to class defaults.
    cfg = config or {}
    self._chunk_size = cfg.get('chunk_size', self.DEFAULT_CHUNK_SIZE)
    self._chunk_overlap = cfg.get('chunk_overlap', self.DEFAULT_CHUNK_OVERLAP)
    self._min_sentences_per_chunk = cfg.get(
        'min_sentences_per_chunk', self.DEFAULT_MIN_SENTENCES_PER_CHUNK)
    self._tokenizer = cfg.get('tokenizer', self.DEFAULT_TOKENIZER)
    self._preserve_structure = cfg.get('preserve_structure', True)
    # NOTE(review): stored but not read anywhere in this class — confirm
    # whether comment filtering is still planned.
    self._include_comments = cfg.get('include_comments', True)
    self._encoding = cfg.get('encoding', 'utf-8')

    # Actual chunking is delegated to the shared TextChunker component.
    self._chunker = TextChunker(
        chunk_size=self._chunk_size,
        chunk_overlap=self._chunk_overlap,
        min_sentences_per_chunk=self._min_sentences_per_chunk,
        tokenizer=self._tokenizer
    )

    logger.info(
        f"Initialized CodeLoader with chunk_size={self._chunk_size}, chunk_overlap={self._chunk_overlap}")
|
|
189
|
+
|
|
190
|
+
def run(self, source: str) -> List[str]:
    """
    Primary entry point required by the BaseComponent contract.

    Simply forwards to :meth:`load_as_chunks`.

    Args:
        source: Path to a code file.

    Returns:
        List of text chunks extracted from the code file.
    """
    return self.load_as_chunks(source)
|
|
204
|
+
|
|
205
|
+
def load_as_text(
    self,
    source: Union[str, Path],
) -> str:
    """
    Read a code file and return its entire contents as one string.

    No chunking is performed; use this when the full source text is
    needed.

    Args:
        source: Path to code file.

    Returns:
        Full text content of the code file as a single string.

    Raises:
        FileNotFoundError: If source file doesn't exist.
        ValueError: If source is not a supported code file.
        Exception: If reading the file fails.
    """
    # Normalize Path objects to plain strings.
    if isinstance(source, Path):
        source = str(source)

    # Guard clauses: existence and supported extension.
    if not os.path.isfile(source):
        raise FileNotFoundError(f"File not found: {source}")
    if not self._is_code_file(source):
        raise ValueError(
            f"File is not a supported code file: {source}. Extension: {Path(source).suffix}")

    logger.info(f"Loading code file as text: {source}")
    started = time.time()

    try:
        # 'replace' avoids hard failures on stray undecodable bytes.
        with open(source, 'r', encoding=self._encoding, errors='replace') as fh:
            content = fh.read()

        elapsed = time.time() - started
        logger.info(
            f"Code file loaded as text in {elapsed:.2f}s: {len(content)} characters")
        return content
    except Exception as e:
        logger.error(f"Error loading code file as text {source}: {e}")
        raise
|
|
256
|
+
|
|
257
|
+
def load_as_chunks(
    self,
    source: Union[str, Path],
) -> List[str]:
    """
    Read a code file and return its contents as a list of text chunks.

    When ``preserve_structure`` is on, chunking is done at structural
    boundaries (functions, classes, ...); otherwise the plain text
    chunker is used.

    Args:
        source: Path to code file.

    Returns:
        List of text chunks extracted from the code file.

    Raises:
        FileNotFoundError: If source file doesn't exist.
        ValueError: If source is not a supported code file.
        Exception: If code processing fails.
    """
    # Normalize Path objects to plain strings.
    if isinstance(source, Path):
        source = str(source)

    # Guard clauses: existence and supported extension.
    if not os.path.isfile(source):
        raise FileNotFoundError(f"File not found: {source}")
    if not self._is_code_file(source):
        raise ValueError(
            f"File is not a supported code file: {source}. Extension: {Path(source).suffix}")

    logger.info(f"Loading code file: {source}")
    started = time.time()

    try:
        with open(source, 'r', encoding=self._encoding, errors='replace') as fh:
            content = fh.read()

        language = self._detect_language(source)

        # Pick the chunking strategy once, then apply it.
        strategy = (self._chunk_code_with_structure
                    if self._preserve_structure
                    else self._chunk_text)
        text_chunks = strategy(content, language)

        elapsed = time.time() - started
        logger.info(
            f"Code file processed in {elapsed:.2f}s: {len(text_chunks)} chunks")
        return text_chunks
    except Exception as e:
        logger.error(f"Error processing code file {source}: {e}")
        raise
|
|
318
|
+
|
|
319
|
+
def load_as_nodes(
    self,
    source: Union[str, Path],
    source_id: Optional[str] = None,
    custom_metadata: Optional[Dict[str, Any]] = None
) -> List[Node]:
    """
    Load a code file and return it as Node objects with metadata.

    Each chunk becomes one Node; its metadata records the detected
    language and file extension alongside any caller-supplied entries.

    Args:
        source: Path to code file.
        source_id: Optional source identifier (defaults to file path).
        custom_metadata: Optional custom metadata to attach to nodes.

    Returns:
        List of Node objects with text chunks and metadata.
    """
    if isinstance(source, Path):
        source = str(source)

    chunks = self.load_as_chunks(source)

    # The file path doubles as the source id when none was given.
    if source_id is None:
        source_id = source

    language = self._detect_language(source)
    suffix = Path(source).suffix  # same for every node; computed once

    nodes = []
    for position, chunk in enumerate(chunks):
        # Per-node copy so caller-supplied metadata is never mutated.
        extra = dict(custom_metadata) if custom_metadata else {}
        extra['language'] = language
        extra['file_extension'] = suffix

        nodes.append(Node(
            content=chunk,
            metadata=NodeMetadata(
                source_file_uuid=source_id,
                position=position,
                custom=extra
            )
        ))

    logger.info(f"Created {len(nodes)} nodes from code file: {source}")
    return nodes
|
|
368
|
+
|
|
369
|
+
def load_as_vsfile(
    self,
    file_path: Union[str, Path],
    custom_metadata: Optional[Dict[str, Any]] = None
) -> VSFile:
    """
    Load a code file and wrap its nodes in a VSFile object.

    Args:
        file_path: Path to code file.
        custom_metadata: Optional custom metadata.

    Returns:
        VSFile object with nodes, marked as processed.

    Raises:
        FileNotFoundError: If file doesn't exist.
        ValueError: If file is not a supported code file.
    """
    if isinstance(file_path, Path):
        file_path = str(file_path)

    # Guard clauses: existence and supported extension.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    if not self._is_code_file(file_path):
        raise ValueError(f"File is not a supported code file: {file_path}")

    vsfile = VSFile(file_path)

    # The VSFile's uuid ties each node back to its source file.
    nodes = self.load_as_nodes(file_path, str(vsfile.uuid), custom_metadata)
    vsfile.nodes = nodes
    vsfile.processed = True

    logger.info(
        f"Created VSFile with {len(nodes)} nodes from: {file_path}")
    return vsfile
|
|
409
|
+
|
|
410
|
+
def _is_code_file(self, file_path: str) -> bool:
    """
    Report whether the file's extension is a supported code extension.

    Args:
        file_path: Path to file.

    Returns:
        True if the (lower-cased) suffix is in SUPPORTED_EXTENSIONS.
    """
    suffix = Path(file_path).suffix.lower()
    return suffix in self.SUPPORTED_EXTENSIONS
|
|
422
|
+
|
|
423
|
+
def _detect_language(self, file_path: str) -> str:
    """
    Map the file extension to a language name.

    Args:
        file_path: Path to code file.

    Returns:
        Language name string, or 'unknown' for unmapped extensions.
    """
    return self.EXTENSION_TO_LANGUAGE.get(
        Path(file_path).suffix.lower(), 'unknown')
|
|
436
|
+
|
|
437
|
+
def _chunk_code_with_structure(self, content: str, language: str) -> List[str]:
    """
    Chunk code while preserving structural boundaries.

    Structural blocks (functions, classes, ...) are packed greedily into
    chunks of at most ``chunk_size`` characters; any single block larger
    than that is handed to the plain text chunker on its own.

    Args:
        content: Code content.
        language: Programming language.

    Returns:
        List of text chunks (the whole content as one chunk if packing
        produced nothing).
    """
    if not content or not content.strip():
        return []

    chunks: List[str] = []
    pending: List[str] = []
    pending_len = 0

    def flush():
        # Emit the accumulated blocks as one chunk and reset the buffer.
        nonlocal pending, pending_len
        if pending:
            chunks.append('\n\n'.join(pending))
        pending = []
        pending_len = 0

    for block in self._split_by_structure(content, language):
        size = len(block)

        if size > self._chunk_size:
            # Oversized block: flush what we have, then let the text
            # chunker split this block by itself.
            flush()
            chunks.extend(self._chunk_text(block, language))
        elif pending_len + size > self._chunk_size:
            # Block doesn't fit: close the current chunk, start fresh.
            flush()
            pending = [block]
            pending_len = size
        else:
            pending.append(block)
            pending_len += size

    flush()  # emit whatever remains

    return chunks if chunks else [content]
|
|
494
|
+
|
|
495
|
+
def _split_by_structure(self, content: str, language: str) -> List[str]:
    """
    Split code into blocks at structural boundaries.

    A new block starts at any line matching one of the language's
    boundary patterns. Languages without patterns fall back to
    blank-line splitting.

    Args:
        content: Code content.
        language: Programming language.

    Returns:
        List of code blocks.
    """
    boundary_patterns = self._get_structure_patterns(language)

    if not boundary_patterns:
        # No structural knowledge for this language.
        return self._split_by_blank_lines(content)

    blocks: List[str] = []
    buffer: List[str] = []

    for line in content.split('\n'):
        starts_new_element = any(
            re.match(pattern, line) for pattern in boundary_patterns)

        if starts_new_element and buffer:
            # Close the running block and begin a new one at this line.
            blocks.append('\n'.join(buffer))
            buffer = [line]
        else:
            buffer.append(line)

    if buffer:
        blocks.append('\n'.join(buffer))

    return blocks
|
|
534
|
+
|
|
535
|
+
def _get_structure_patterns(self, language: str) -> List[str]:
    """
    Get regex patterns marking structural boundaries in a language.

    Patterns are anchored at line start (``^``) and matched with
    ``re.match`` in ``_split_by_structure``; they only fire on
    unindented (top-level) definitions.

    Args:
        language: Programming language.

    Returns:
        List of regex pattern strings; empty list for languages with
        no patterns (caller falls back to blank-line splitting).
    """
    patterns = {
        'python': [
            r'^class\s+\w+',  # class definition
            r'^def\s+\w+',  # function definition
            r'^async\s+def\s+\w+',  # async function
            # decorator (start of decorated block)
            r'^@\w+',
        ],
        'javascript': [
            r'^(export\s+)?(async\s+)?function\s+\w+',  # function
            r'^(export\s+)?class\s+\w+',  # class
            # arrow function
            r'^(export\s+)?(const|let|var)\s+\w+\s*=\s*(async\s+)?\(?\w*\)?\s*=>',
        ],
        'typescript': [
            r'^(export\s+)?(async\s+)?function\s+\w+',
            r'^(export\s+)?class\s+\w+',
            r'^(export\s+)?(const|let|var)\s+\w+\s*=\s*(async\s+)?\(?\w*\)?\s*=>',
            r'^(export\s+)?interface\s+\w+',
            r'^(export\s+)?type\s+\w+',
        ],
        'java': [
            r'^(public|private|protected)?\s*(static\s+)?class\s+\w+',
            r'^(public|private|protected)?\s*(static\s+)?\w+\s+\w+\s*\(',
            r'^(public|private|protected)?\s*interface\s+\w+',
        ],
        'go': [
            r'^func\s+(\(\w+\s+\*?\w+\)\s+)?\w+',  # function or method
            r'^type\s+\w+\s+(struct|interface)',  # type definition
        ],
        'rust': [
            r'^(pub\s+)?fn\s+\w+',  # function
            r'^(pub\s+)?struct\s+\w+',  # struct
            r'^(pub\s+)?enum\s+\w+',  # enum
            r'^(pub\s+)?trait\s+\w+',  # trait
            r'^impl\s+',  # impl block
        ],
        'cpp': [
            r'^class\s+\w+',
            r'^(virtual\s+)?(static\s+)?\w+\s+\w+\s*\(',
            r'^namespace\s+\w+',
        ],
        'c': [
            r'^\w+\s+\w+\s*\(',  # function definition
            r'^struct\s+\w+',
            r'^typedef\s+',
        ],
        'ruby': [
            r'^class\s+\w+',
            r'^module\s+\w+',
            r'^def\s+\w+',
        ],
        'php': [
            r'^(public|private|protected)?\s*(static\s+)?function\s+\w+',
            r'^class\s+\w+',
            r'^interface\s+\w+',
            r'^trait\s+\w+',
        ],
    }

    return patterns.get(language, [])
|
|
606
|
+
|
|
607
|
+
def _split_by_blank_lines(self, content: str) -> List[str]:
    """
    Split content on blank-line runs (fallback splitter).

    Args:
        content: Code content.

    Returns:
        List of non-empty, stripped code blocks.
    """
    # A boundary is a newline, optional whitespace, then another newline.
    pieces = (piece.strip() for piece in re.split(r'\n\s*\n', content))
    return [piece for piece in pieces if piece]
|
|
620
|
+
|
|
621
|
+
def _chunk_text(self, text: str, language: str) -> List[str]:
    """
    Chunk plain text via the configured TextChunker.

    On any chunker failure the whole text is returned as one chunk so
    processing can continue.

    Args:
        text: Full text to chunk.
        language: Programming language, passed to the chunker as context.

    Returns:
        List of text chunks (empty for blank input).
    """
    if not text or not text.strip():
        return []

    try:
        chunk_dicts = self._chunker.chunk_text(
            text, context=f"code_{language}")
        # The chunker returns dicts; we only need the text payloads.
        text_chunks = [entry['text'] for entry in chunk_dicts]
        logger.debug(f"Chunked code text into {len(text_chunks)} chunks")
        return text_chunks
    except Exception as e:
        # Best-effort: degrade to a single chunk rather than failing.
        logger.warning(f"Failed to chunk text with TextChunker: {e}")
        logger.info("Falling back to single chunk")
        return [text]
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
def create_code_loader(
    chunk_size: int = 2000,
    chunk_overlap: int = 200,
    min_sentences_per_chunk: int = 3,
    tokenizer: str = "character",
    preserve_structure: bool = True,
    include_comments: bool = True,
    encoding: str = 'utf-8'
) -> CodeLoader:
    """
    Factory function to create a code loader.

    Args:
        chunk_size: Maximum tokens per chunk (default: 2000).
        chunk_overlap: Overlap between chunks in tokens (default: 200).
        min_sentences_per_chunk: Minimum sentences per chunk (default: 3).
        tokenizer: Tokenizer for chunking - "character", "gpt2", or
            HuggingFace model (default: "character").
        preserve_structure: Preserve code structure in chunks (default: True).
        include_comments: Include comments in output (default: True).
        encoding: File encoding (default: "utf-8").

    Returns:
        Configured code loader.

    Example:
        >>> loader = create_code_loader(chunk_size=1024, chunk_overlap=64)
        >>> chunks = loader.run("src/main.py")
        >>> print(f"Extracted {len(chunks)} chunks")

        >>> # Create loader without structure preservation
        >>> loader = create_code_loader(preserve_structure=False)
        >>> chunks = loader.run("src/utils.js")
    """
    # Forward every argument straight into the loader's config dict.
    return CodeLoader(config={
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
        'min_sentences_per_chunk': min_sentences_per_chunk,
        'tokenizer': tokenizer,
        'preserve_structure': preserve_structure,
        'include_comments': include_comments,
        'encoding': encoding,
    })
|
|
697
|
+
|
|
698
|
+
|
|
699
|
+
__all__ = ["CodeLoader", "create_code_loader"]
|