agent-brain-rag 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,506 @@
1
+ """Document loading from various file formats using LlamaIndex."""
2
+
3
+ import logging
4
+ import re
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Any, Optional
8
+
9
+ from llama_index.core import Document, SimpleDirectoryReader
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
@dataclass
class LoadedDocument:
    """
    A single loaded document together with its file-level metadata.

    Attributes:
        text: Text content of the document.
        source: Origin of the document as supplied by the loader.
        file_name: Name of the file.
        file_path: Path of the file.
        file_size: Size of the file in bytes.
        metadata: Free-form extra metadata; each instance gets its own dict.
    """

    # Content
    text: str

    # Where the content came from
    source: str
    file_name: str
    file_path: str
    file_size: int

    # A default_factory is used so instances never share one mutable dict.
    metadata: dict[str, Any] = field(default_factory=dict)
24
+
25
+
26
class LanguageDetector:
    """
    Utility for detecting programming languages from file paths and content.

    Supports the languages with tree-sitter parsers:
    - Python, TypeScript, JavaScript, Kotlin, C, C++, Java, Go, Rust, Swift, C#

    Detection is a two-step process: the file extension is consulted first,
    and regex-based content heuristics are used only as a fallback.
    """

    # Language detection by file extension (keys are lower-case, with the dot)
    EXTENSION_TO_LANGUAGE = {
        # Python
        ".py": "python",
        ".pyw": "python",
        ".pyi": "python",
        # TypeScript/JavaScript
        ".ts": "typescript",
        ".tsx": "typescript",
        ".js": "javascript",
        ".jsx": "javascript",
        ".mjs": "javascript",
        ".cjs": "javascript",
        # Kotlin
        ".kt": "kotlin",
        ".kts": "kotlin",
        # C/C++
        ".c": "c",
        ".h": "c",
        ".cpp": "cpp",
        ".cc": "cpp",
        ".cxx": "cpp",
        ".hpp": "cpp",
        ".hxx": "cpp",
        # Java
        ".java": "java",
        # Go
        ".go": "go",
        # Rust
        ".rs": "rust",
        # Swift
        ".swift": "swift",
        # C#
        ".cs": "csharp",
        ".csx": "csharp",
    }

    # Language detection by content patterns (fallback).
    # The key set of this mapping also defines which languages are "supported".
    CONTENT_PATTERNS = {
        "python": [
            re.compile(r"^\s*import\s+\w+", re.MULTILINE),
            re.compile(r"^\s*from\s+\w+\s+import", re.MULTILINE),
            re.compile(r"^\s*def\s+\w+\s*\(", re.MULTILINE),
            re.compile(r"^\s*class\s+\w+", re.MULTILINE),
        ],
        "javascript": [
            re.compile(r"^\s*(const|let|var)\s+\w+\s*=", re.MULTILINE),
            re.compile(r"^\s*function\s+\w+\s*\(", re.MULTILINE),
            # Arrow functions. Not anchored to the start of the line: the
            # arrow normally follows a parameter list, e.g. `(x) => {`.
            re.compile(r"=>\s*\{", re.MULTILINE),
        ],
        "typescript": [
            re.compile(r"^\s*interface\s+\w+", re.MULTILINE),
            re.compile(r"^\s*type\s+\w+\s*=", re.MULTILINE),
            re.compile(r":\s*(string|number|boolean|any)", re.MULTILINE),
        ],
        "java": [
            re.compile(r"^\s*public\s+class\s+\w+", re.MULTILINE),
            re.compile(r"^\s*package\s+\w+", re.MULTILINE),
            re.compile(r"^\s*import\s+java\.", re.MULTILINE),
        ],
        "kotlin": [
            re.compile(r"^\s*fun\s+\w+\s*\(", re.MULTILINE),
            re.compile(r"^\s*class\s+\w+", re.MULTILINE),
            re.compile(r":\s*(String|Int|Boolean)", re.MULTILINE),
        ],
        "cpp": [
            re.compile(r"^\s*#include\s*<", re.MULTILINE),
            re.compile(r"^\s*using\s+namespace", re.MULTILINE),
            re.compile(r"^\s*std::", re.MULTILINE),
        ],
        "c": [
            re.compile(r"^\s*#include\s*<", re.MULTILINE),
            re.compile(r"^\s*int\s+main\s*\(", re.MULTILINE),
            re.compile(r"^\s*printf\s*\(", re.MULTILINE),
        ],
        "go": [
            re.compile(r"^\s*package\s+\w+", re.MULTILINE),
            re.compile(r"^\s*import\s*\(", re.MULTILINE),
            re.compile(r"^\s*func\s+\w+\s*\(", re.MULTILINE),
        ],
        "rust": [
            re.compile(r"^\s*fn\s+\w+\s*\(", re.MULTILINE),
            re.compile(r"^\s*use\s+\w+::", re.MULTILINE),
            re.compile(r"^\s*let\s+(mut\s+)?\w+", re.MULTILINE),
        ],
        "swift": [
            re.compile(r"^\s*import\s+Foundation", re.MULTILINE),
            re.compile(r"^\s*func\s+\w+\s*\(", re.MULTILINE),
            re.compile(r"^\s*class\s+\w+\s*:", re.MULTILINE),
        ],
        "csharp": [
            re.compile(r"^\s*using\s+System", re.MULTILINE),
            re.compile(r"^\s*namespace\s+\w+", re.MULTILINE),
            re.compile(r"\{\s*get\s*;\s*(set\s*;)?\s*\}", re.MULTILINE),
            re.compile(r"\[[\w]+(\(.*\))?\]", re.MULTILINE),
            re.compile(
                r"^\s*public\s+(class|interface|struct|record|enum)\s+\w+",
                re.MULTILINE,
            ),
        ],
    }

    @classmethod
    def detect_from_path(cls, file_path: str) -> Optional[str]:
        """
        Detect language from file path/extension.

        The extension is lower-cased before the lookup, so the match is
        case-insensitive.

        Args:
            file_path: Path to the file.

        Returns:
            Language name or None if the extension is not in
            EXTENSION_TO_LANGUAGE (including files without an extension).
        """
        return cls.EXTENSION_TO_LANGUAGE.get(Path(file_path).suffix.lower())

    @classmethod
    def detect_from_content(
        cls, content: str, top_n: int = 3
    ) -> list[tuple[str, float]]:
        """
        Detect language from file content using pattern matching.

        Each pattern contributes ``min(matches / 10, 1.0)``; a language's
        confidence is the sum of its pattern contributions divided by the
        number of patterns registered for it, so it lies in ``(0, 1]``.
        Languages without any match are omitted.

        Args:
            content: File content to analyze.
            top_n: Number of top matches to return. Non-positive values
                yield an empty list.

        Returns:
            List of (language, confidence) tuples, sorted by confidence in
            descending order. Ties keep the order of CONTENT_PATTERNS.
        """
        # A negative top_n would otherwise slice results off the *end* of the
        # ranking, which is never what a caller means.
        if not content or top_n <= 0:
            return []

        scores: dict[str, float] = {}
        for language, patterns in cls.CONTENT_PATTERNS.items():
            # Cap each pattern at 1.0 so one very frequent construct cannot
            # dominate the language's score.
            total_score = sum(
                min(len(pattern.findall(content)) / 10.0, 1.0)
                for pattern in patterns
            )
            if total_score > 0:
                # Normalize by pattern count so languages with more patterns
                # are not favored.
                scores[language] = total_score / len(patterns)

        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return ranked[:top_n]

    @classmethod
    def detect_language(
        cls, file_path: str, content: Optional[str] = None
    ) -> Optional[str]:
        """
        Detect programming language using both path and content analysis.

        Args:
            file_path: Path to the file.
            content: Optional file content for fallback detection.

        Returns:
            Detected language name or None.
        """
        # First try extension-based detection (fast and reliable)
        language = cls.detect_from_path(file_path)
        if language:
            return language

        # Fallback to content analysis if content is provided
        if content:
            best = cls.detect_from_content(content, top_n=1)
            # Minimum confidence threshold: weak matches are treated as unknown
            if best and best[0][1] > 0.1:
                return best[0][0]

        return None

    @classmethod
    def is_supported_language(cls, language: str) -> bool:
        """
        Check if a language is supported by our tree-sitter parsers.

        Support is defined by membership in CONTENT_PATTERNS.

        Args:
            language: Language name to check.

        Returns:
            True if supported, False otherwise.
        """
        return language in cls.CONTENT_PATTERNS

    @classmethod
    def get_supported_languages(cls) -> list[str]:
        """Get list of all supported programming languages."""
        return list(cls.CONTENT_PATTERNS)
231
+
232
+
233
class DocumentLoader:
    """
    Loads documents and code files from a folder supporting multiple file formats.

    Supported document formats: .txt, .md, .pdf, .docx, .html, .rst
    Supported code formats: .py, .pyw, .pyi, .ts, .tsx, .js, .jsx, .mjs, .cjs,
                           .kt, .kts, .c, .h, .cpp, .cc, .cxx, .hpp, .hxx,
                           .java, .go, .rs, .swift, .cs, .csx
    """

    # Document formats
    DOCUMENT_EXTENSIONS: set[str] = {".txt", ".md", ".pdf", ".docx", ".html", ".rst"}

    # Code formats (supported by tree-sitter)
    CODE_EXTENSIONS: set[str] = {
        ".py",
        ".pyw",
        ".pyi",  # Python
        ".ts",
        ".tsx",  # TypeScript
        ".js",
        ".jsx",
        ".mjs",
        ".cjs",  # JavaScript
        ".kt",
        ".kts",  # Kotlin
        ".c",
        ".h",  # C
        ".cpp",
        ".cc",
        ".cxx",
        ".hpp",
        ".hxx",  # C++
        ".java",  # Java
        ".go",  # Go
        ".rs",  # Rust
        ".swift",  # Swift
        ".cs",
        ".csx",  # C#
    }

    SUPPORTED_EXTENSIONS: set[str] = DOCUMENT_EXTENSIONS | CODE_EXTENSIONS

    def __init__(
        self,
        supported_extensions: Optional[set[str]] = None,
    ):
        """
        Initialize the document loader.

        Args:
            supported_extensions: Set of file extensions to load.
                                  Defaults to SUPPORTED_EXTENSIONS. An empty
                                  set also falls back to the default.
        """
        self.extensions = supported_extensions or self.SUPPORTED_EXTENSIONS

    def _classify_source(
        self, file_path: str, text: str
    ) -> tuple[str, Optional[str]]:
        """Return (source_type, language) for a file; language is None for documents."""
        if file_path and Path(file_path).suffix.lower() in self.CODE_EXTENSIONS:
            return "code", LanguageDetector.detect_language(file_path, text)
        return "doc", None

    async def load_from_folder(
        self,
        folder_path: str,
        recursive: bool = True,
    ) -> "list[LoadedDocument]":
        """
        Load all supported documents from a folder.

        One LoadedDocument is produced per Document returned by the reader.

        Args:
            folder_path: Path to the folder containing documents.
            recursive: Whether to scan subdirectories recursively.

        Returns:
            List of LoadedDocument objects.

        Raises:
            ValueError: If the path exists but is not a directory.
            FileNotFoundError: If the folder doesn't exist.
        """
        path = Path(folder_path)

        if not path.exists():
            raise FileNotFoundError(f"Folder not found: {folder_path}")

        if not path.is_dir():
            raise ValueError(f"Path is not a directory: {folder_path}")

        logger.info(f"Loading documents from: {folder_path} (recursive={recursive})")

        # Use LlamaIndex's SimpleDirectoryReader.
        # NOTE: load_data() is called synchronously inside this coroutine.
        try:
            reader = SimpleDirectoryReader(
                input_dir=str(path),
                recursive=recursive,
                required_exts=list(self.extensions),
                filename_as_id=True,
            )
            llama_documents: list[Document] = reader.load_data()
        except Exception as e:
            # Log for visibility, then let the caller decide how to handle it.
            logger.error(f"Failed to load documents: {e}")
            raise

        # Convert to our LoadedDocument format
        loaded_docs: list[LoadedDocument] = []

        for doc in llama_documents:
            file_path = doc.metadata.get("file_path", "")
            file_name = doc.metadata.get(
                "file_name", Path(file_path).name if file_path else "unknown"
            )

            # File size is best-effort: an unreadable file is recorded as 0.
            try:
                file_size = Path(file_path).stat().st_size if file_path else 0
            except OSError:
                file_size = 0

            # Detect language for code files
            source_type, language = self._classify_source(file_path, doc.text)

            loaded_docs.append(
                LoadedDocument(
                    text=doc.text,
                    source=file_path,
                    file_name=file_name,
                    file_path=file_path,
                    file_size=file_size,
                    metadata={
                        **doc.metadata,
                        "doc_id": doc.doc_id,
                        "source_type": source_type,
                        "language": language,
                    },
                )
            )

        logger.info(f"Loaded {len(loaded_docs)} documents from {folder_path}")
        return loaded_docs

    async def load_single_file(self, file_path: str) -> "LoadedDocument":
        """
        Load a single document file.

        If the reader returns several Document parts for the file, their
        texts are joined with blank lines so no content is dropped. The
        reader metadata and ``doc_id`` are taken from the first part.

        Args:
            file_path: Path to the file.

        Returns:
            LoadedDocument object.

        Raises:
            ValueError: If the file type is not supported or nothing was loaded.
            FileNotFoundError: If the file doesn't exist.
        """
        path = Path(file_path)

        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        if path.suffix.lower() not in self.extensions:
            # Sort so the message is stable regardless of set iteration order.
            raise ValueError(
                f"Unsupported file type: {path.suffix}. "
                f"Supported: {', '.join(sorted(self.extensions))}"
            )

        reader = SimpleDirectoryReader(
            input_files=[str(path)],
            filename_as_id=True,
        )
        docs = reader.load_data()

        if not docs:
            raise ValueError(f"No content loaded from file: {file_path}")

        first = docs[0]
        # Keeping only docs[0] would silently discard every later part.
        text = first.text if len(docs) == 1 else "\n\n".join(d.text for d in docs)

        # Detect language for code files
        source_type, language = self._classify_source(str(path), text)

        return LoadedDocument(
            text=text,
            source=file_path,
            file_name=path.name,
            file_path=str(path),
            file_size=path.stat().st_size,
            metadata={
                **first.metadata,
                "doc_id": first.doc_id,
                "source_type": source_type,
                "language": language,
            },
        )

    async def load_files(
        self,
        folder_path: str,
        recursive: bool = True,
        include_code: bool = False,
    ) -> "list[LoadedDocument]":
        """
        Load documents and optionally code files from a folder.

        The extension set is chosen from the class-level constants based on
        ``include_code``; the instance's own ``extensions`` are not used here.

        Args:
            folder_path: Path to the folder containing files to load.
            recursive: Whether to scan subdirectories recursively.
            include_code: Whether to include source code files alongside documents.

        Returns:
            List of LoadedDocument objects with proper metadata.

        Raises:
            ValueError: If folder path is invalid.
            FileNotFoundError: If folder doesn't exist.
        """
        # Docs + code when requested, otherwise document formats only
        if include_code:
            effective_extensions = self.SUPPORTED_EXTENSIONS
        else:
            effective_extensions = self.DOCUMENT_EXTENSIONS

        # Create a temporary loader with the effective extensions
        temp_loader = DocumentLoader(supported_extensions=effective_extensions)

        # Load files using the configured extensions
        loaded_docs = await temp_loader.load_from_folder(folder_path, recursive)

        # Safety net: load_from_folder already sets source_type on every
        # document, so this only fills in entries where it is missing/empty.
        for doc in loaded_docs:
            if doc.metadata.get("source_type"):
                continue
            path_ext = Path(doc.source).suffix.lower()
            if path_ext in self.CODE_EXTENSIONS:
                doc.metadata["source_type"] = "code"
                # Detect language for code files
                language = LanguageDetector.detect_language(doc.source, doc.text)
                if language:
                    doc.metadata["language"] = language
            else:
                doc.metadata["source_type"] = "doc"
                doc.metadata["language"] = "markdown"  # Default for documents

        return loaded_docs

    def get_supported_files(
        self,
        folder_path: str,
        recursive: bool = True,
    ) -> list[Path]:
        """
        Get list of supported files in a folder without loading them.

        Args:
            folder_path: Path to the folder.
            recursive: Whether to scan subdirectories.

        Returns:
            List of Path objects for supported files; empty if the folder
            does not exist or is not a directory.
        """
        root = Path(folder_path)

        # is_dir() is already False for a path that does not exist.
        if not root.is_dir():
            return []

        candidates = root.rglob("*") if recursive else root.glob("*")
        return [
            f
            for f in candidates
            if f.is_file() and f.suffix.lower() in self.extensions
        ]