agent-brain-rag 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_brain_rag-1.1.0.dist-info/METADATA +202 -0
- agent_brain_rag-1.1.0.dist-info/RECORD +31 -0
- agent_brain_rag-1.1.0.dist-info/WHEEL +4 -0
- agent_brain_rag-1.1.0.dist-info/entry_points.txt +3 -0
- doc_serve_server/__init__.py +3 -0
- doc_serve_server/api/__init__.py +5 -0
- doc_serve_server/api/main.py +332 -0
- doc_serve_server/api/routers/__init__.py +11 -0
- doc_serve_server/api/routers/health.py +100 -0
- doc_serve_server/api/routers/index.py +208 -0
- doc_serve_server/api/routers/query.py +96 -0
- doc_serve_server/config/__init__.py +5 -0
- doc_serve_server/config/settings.py +92 -0
- doc_serve_server/indexing/__init__.py +19 -0
- doc_serve_server/indexing/bm25_index.py +166 -0
- doc_serve_server/indexing/chunking.py +831 -0
- doc_serve_server/indexing/document_loader.py +506 -0
- doc_serve_server/indexing/embedding.py +274 -0
- doc_serve_server/locking.py +133 -0
- doc_serve_server/models/__init__.py +18 -0
- doc_serve_server/models/health.py +126 -0
- doc_serve_server/models/index.py +157 -0
- doc_serve_server/models/query.py +191 -0
- doc_serve_server/project_root.py +85 -0
- doc_serve_server/runtime.py +112 -0
- doc_serve_server/services/__init__.py +11 -0
- doc_serve_server/services/indexing_service.py +476 -0
- doc_serve_server/services/query_service.py +414 -0
- doc_serve_server/storage/__init__.py +5 -0
- doc_serve_server/storage/vector_store.py +320 -0
- doc_serve_server/storage_paths.py +72 -0
|
@@ -0,0 +1,506 @@
|
|
|
1
|
+
"""Document loading from various file formats using LlamaIndex."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Optional
|
|
8
|
+
|
|
9
|
+
from llama_index.core import Document, SimpleDirectoryReader
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class LoadedDocument:
    """Represents a loaded document with metadata."""

    # Full text content extracted from the file.
    text: str
    # Origin identifier; DocumentLoader populates this with the file path.
    source: str
    # Base name of the file (e.g. "readme.md"); "unknown" if undeterminable.
    file_name: str
    # Path to the file on disk (may be "" if the reader supplied none).
    file_path: str
    # File size in bytes; 0 when the size could not be stat'ed.
    file_size: int
    # Reader metadata merged with loader keys: "doc_id", "source_type"
    # ("code" or "doc"), and "language" (detected language name or None).
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class LanguageDetector:
    """
    Detects the programming language of a file from its path and/or content.

    Covers the languages with tree-sitter parsers: Python, TypeScript,
    JavaScript, Kotlin, C, C++, Java, Go, Rust, Swift, and C#.
    """

    # Primary detection table: file extension -> language name.
    EXTENSION_TO_LANGUAGE = {
        ".py": "python", ".pyw": "python", ".pyi": "python",
        ".ts": "typescript", ".tsx": "typescript",
        ".js": "javascript", ".jsx": "javascript",
        ".mjs": "javascript", ".cjs": "javascript",
        ".kt": "kotlin", ".kts": "kotlin",
        ".c": "c", ".h": "c",
        ".cpp": "cpp", ".cc": "cpp", ".cxx": "cpp",
        ".hpp": "cpp", ".hxx": "cpp",
        ".java": "java",
        ".go": "go",
        ".rs": "rust",
        ".swift": "swift",
        ".cs": "csharp", ".csx": "csharp",
    }

    # Fallback detection: characteristic source patterns per language.
    CONTENT_PATTERNS = {
        "python": [
            re.compile(r"^\s*import\s+\w+", re.MULTILINE),
            re.compile(r"^\s*from\s+\w+\s+import", re.MULTILINE),
            re.compile(r"^\s*def\s+\w+\s*\(", re.MULTILINE),
            re.compile(r"^\s*class\s+\w+", re.MULTILINE),
        ],
        "javascript": [
            re.compile(r"^\s*(const|let|var)\s+\w+\s*=", re.MULTILINE),
            re.compile(r"^\s*function\s+\w+\s*\(", re.MULTILINE),
            re.compile(r"^\s*=>\s*\{", re.MULTILINE),  # arrow functions
        ],
        "typescript": [
            re.compile(r"^\s*interface\s+\w+", re.MULTILINE),
            re.compile(r"^\s*type\s+\w+\s*=", re.MULTILINE),
            re.compile(r":\s*(string|number|boolean|any)", re.MULTILINE),
        ],
        "java": [
            re.compile(r"^\s*public\s+class\s+\w+", re.MULTILINE),
            re.compile(r"^\s*package\s+\w+", re.MULTILINE),
            re.compile(r"^\s*import\s+java\.", re.MULTILINE),
        ],
        "kotlin": [
            re.compile(r"^\s*fun\s+\w+\s*\(", re.MULTILINE),
            re.compile(r"^\s*class\s+\w+", re.MULTILINE),
            re.compile(r":\s*(String|Int|Boolean)", re.MULTILINE),
        ],
        "cpp": [
            re.compile(r"^\s*#include\s*<", re.MULTILINE),
            re.compile(r"^\s*using\s+namespace", re.MULTILINE),
            re.compile(r"^\s*std::", re.MULTILINE),
        ],
        "c": [
            re.compile(r"^\s*#include\s*<", re.MULTILINE),
            re.compile(r"^\s*int\s+main\s*\(", re.MULTILINE),
            re.compile(r"^\s*printf\s*\(", re.MULTILINE),
        ],
        "go": [
            re.compile(r"^\s*package\s+\w+", re.MULTILINE),
            re.compile(r"^\s*import\s*\(", re.MULTILINE),
            re.compile(r"^\s*func\s+\w+\s*\(", re.MULTILINE),
        ],
        "rust": [
            re.compile(r"^\s*fn\s+\w+\s*\(", re.MULTILINE),
            re.compile(r"^\s*use\s+\w+::", re.MULTILINE),
            re.compile(r"^\s*let\s+(mut\s+)?\w+", re.MULTILINE),
        ],
        "swift": [
            re.compile(r"^\s*import\s+Foundation", re.MULTILINE),
            re.compile(r"^\s*func\s+\w+\s*\(", re.MULTILINE),
            re.compile(r"^\s*class\s+\w+\s*:", re.MULTILINE),
        ],
        "csharp": [
            re.compile(r"^\s*using\s+System", re.MULTILINE),
            re.compile(r"^\s*namespace\s+\w+", re.MULTILINE),
            re.compile(r"\{\s*get\s*;\s*(set\s*;)?\s*\}", re.MULTILINE),
            re.compile(r"\[[\w]+(\(.*\))?\]", re.MULTILINE),
            re.compile(
                r"^\s*public\s+(class|interface|struct|record|enum)\s+\w+",
                re.MULTILINE,
            ),
        ],
    }

    @classmethod
    def detect_from_path(cls, file_path: str) -> Optional[str]:
        """
        Detect language from the file's extension.

        Args:
            file_path: Path to the file.

        Returns:
            Language name, or None when the extension is unknown.
        """
        return cls.EXTENSION_TO_LANGUAGE.get(Path(file_path).suffix.lower())

    @classmethod
    def detect_from_content(
        cls, content: str, top_n: int = 3
    ) -> list[tuple[str, float]]:
        """
        Detect language from file content via regex pattern matching.

        Each pattern contributes min(match_count / 10, 1.0) and the sum is
        normalized by the number of patterns for that language.

        Args:
            content: File content to analyze.
            top_n: Number of top matches to return.

        Returns:
            Up to top_n (language, confidence) pairs, best first.
        """
        scores: dict[str, float] = {}
        for language, patterns in cls.CONTENT_PATTERNS.items():
            contributions = [
                min(len(pattern.findall(content)) / 10.0, 1.0)
                for pattern in patterns
            ]
            raw = sum(contributions)
            if raw > 0:
                scores[language] = raw / len(patterns)

        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return ranked[:top_n]

    @classmethod
    def detect_language(
        cls, file_path: str, content: Optional[str] = None
    ) -> Optional[str]:
        """
        Detect programming language using path first, content as fallback.

        Args:
            file_path: Path to the file.
            content: Optional file content for fallback detection.

        Returns:
            Detected language name, or None.
        """
        # Extension lookup is fast and reliable; prefer it.
        by_extension = cls.detect_from_path(file_path)
        if by_extension:
            return by_extension

        if not content:
            return None

        # Content analysis must clear a minimum confidence threshold.
        ranked = cls.detect_from_content(content, top_n=1)
        if ranked and ranked[0][1] > 0.1:
            return ranked[0][0]
        return None

    @classmethod
    def is_supported_language(cls, language: str) -> bool:
        """
        Check whether a language has a tree-sitter parser available.

        Args:
            language: Language name to check.

        Returns:
            True if supported, False otherwise.
        """
        return language in cls.CONTENT_PATTERNS

    @classmethod
    def get_supported_languages(cls) -> list[str]:
        """Return the names of all supported programming languages."""
        return list(cls.CONTENT_PATTERNS)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class DocumentLoader:
|
|
234
|
+
"""
|
|
235
|
+
Loads documents and code files from a folder supporting multiple file formats.
|
|
236
|
+
|
|
237
|
+
Supported document formats: .txt, .md, .pdf, .docx, .html, .rst
|
|
238
|
+
Supported code formats: .py, .ts, .tsx, .js, .jsx, .kt, .c, .cpp,
|
|
239
|
+
.java, .go, .rs, .swift
|
|
240
|
+
"""
|
|
241
|
+
|
|
242
|
+
# Document formats
|
|
243
|
+
DOCUMENT_EXTENSIONS: set[str] = {".txt", ".md", ".pdf", ".docx", ".html", ".rst"}
|
|
244
|
+
|
|
245
|
+
# Code formats (supported by tree-sitter)
|
|
246
|
+
CODE_EXTENSIONS: set[str] = {
|
|
247
|
+
".py",
|
|
248
|
+
".pyw",
|
|
249
|
+
".pyi", # Python
|
|
250
|
+
".ts",
|
|
251
|
+
".tsx", # TypeScript
|
|
252
|
+
".js",
|
|
253
|
+
".jsx",
|
|
254
|
+
".mjs",
|
|
255
|
+
".cjs", # JavaScript
|
|
256
|
+
".kt",
|
|
257
|
+
".kts", # Kotlin
|
|
258
|
+
".c",
|
|
259
|
+
".h", # C
|
|
260
|
+
".cpp",
|
|
261
|
+
".cc",
|
|
262
|
+
".cxx",
|
|
263
|
+
".hpp",
|
|
264
|
+
".hxx", # C++
|
|
265
|
+
".java", # Java
|
|
266
|
+
".go", # Go
|
|
267
|
+
".rs", # Rust
|
|
268
|
+
".swift", # Swift
|
|
269
|
+
".cs",
|
|
270
|
+
".csx", # C#
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
SUPPORTED_EXTENSIONS: set[str] = DOCUMENT_EXTENSIONS | CODE_EXTENSIONS
|
|
274
|
+
|
|
275
|
+
def __init__(
|
|
276
|
+
self,
|
|
277
|
+
supported_extensions: Optional[set[str]] = None,
|
|
278
|
+
):
|
|
279
|
+
"""
|
|
280
|
+
Initialize the document loader.
|
|
281
|
+
|
|
282
|
+
Args:
|
|
283
|
+
supported_extensions: Set of file extensions to load.
|
|
284
|
+
Defaults to SUPPORTED_EXTENSIONS.
|
|
285
|
+
"""
|
|
286
|
+
self.extensions = supported_extensions or self.SUPPORTED_EXTENSIONS
|
|
287
|
+
|
|
288
|
+
async def load_from_folder(
|
|
289
|
+
self,
|
|
290
|
+
folder_path: str,
|
|
291
|
+
recursive: bool = True,
|
|
292
|
+
) -> list[LoadedDocument]:
|
|
293
|
+
"""
|
|
294
|
+
Load all supported documents from a folder.
|
|
295
|
+
|
|
296
|
+
Args:
|
|
297
|
+
folder_path: Path to the folder containing documents.
|
|
298
|
+
recursive: Whether to scan subdirectories recursively.
|
|
299
|
+
|
|
300
|
+
Returns:
|
|
301
|
+
List of LoadedDocument objects.
|
|
302
|
+
|
|
303
|
+
Raises:
|
|
304
|
+
ValueError: If the folder path is invalid.
|
|
305
|
+
FileNotFoundError: If the folder doesn't exist.
|
|
306
|
+
"""
|
|
307
|
+
path = Path(folder_path)
|
|
308
|
+
|
|
309
|
+
if not path.exists():
|
|
310
|
+
raise FileNotFoundError(f"Folder not found: {folder_path}")
|
|
311
|
+
|
|
312
|
+
if not path.is_dir():
|
|
313
|
+
raise ValueError(f"Path is not a directory: {folder_path}")
|
|
314
|
+
|
|
315
|
+
logger.info(f"Loading documents from: {folder_path} (recursive={recursive})")
|
|
316
|
+
|
|
317
|
+
# Use LlamaIndex's SimpleDirectoryReader
|
|
318
|
+
try:
|
|
319
|
+
reader = SimpleDirectoryReader(
|
|
320
|
+
input_dir=str(path),
|
|
321
|
+
recursive=recursive,
|
|
322
|
+
required_exts=list(self.extensions),
|
|
323
|
+
filename_as_id=True,
|
|
324
|
+
)
|
|
325
|
+
llama_documents: list[Document] = reader.load_data()
|
|
326
|
+
except Exception as e:
|
|
327
|
+
logger.error(f"Failed to load documents: {e}")
|
|
328
|
+
raise
|
|
329
|
+
|
|
330
|
+
# Convert to our LoadedDocument format
|
|
331
|
+
loaded_docs: list[LoadedDocument] = []
|
|
332
|
+
|
|
333
|
+
for doc in llama_documents:
|
|
334
|
+
file_path = doc.metadata.get("file_path", "")
|
|
335
|
+
file_name = doc.metadata.get(
|
|
336
|
+
"file_name", Path(file_path).name if file_path else "unknown"
|
|
337
|
+
)
|
|
338
|
+
|
|
339
|
+
# Get file size
|
|
340
|
+
try:
|
|
341
|
+
file_size = Path(file_path).stat().st_size if file_path else 0
|
|
342
|
+
except OSError:
|
|
343
|
+
file_size = 0
|
|
344
|
+
|
|
345
|
+
# Detect language for code files
|
|
346
|
+
language = None
|
|
347
|
+
source_type = "doc" # Default to document
|
|
348
|
+
if file_path:
|
|
349
|
+
path_ext = Path(file_path).suffix.lower()
|
|
350
|
+
if path_ext in self.CODE_EXTENSIONS:
|
|
351
|
+
source_type = "code"
|
|
352
|
+
language = LanguageDetector.detect_language(file_path, doc.text)
|
|
353
|
+
|
|
354
|
+
loaded_doc = LoadedDocument(
|
|
355
|
+
text=doc.text,
|
|
356
|
+
source=file_path,
|
|
357
|
+
file_name=file_name,
|
|
358
|
+
file_path=file_path,
|
|
359
|
+
file_size=file_size,
|
|
360
|
+
metadata={
|
|
361
|
+
**doc.metadata,
|
|
362
|
+
"doc_id": doc.doc_id,
|
|
363
|
+
"source_type": source_type,
|
|
364
|
+
"language": language,
|
|
365
|
+
},
|
|
366
|
+
)
|
|
367
|
+
loaded_docs.append(loaded_doc)
|
|
368
|
+
|
|
369
|
+
logger.info(f"Loaded {len(loaded_docs)} documents from {folder_path}")
|
|
370
|
+
return loaded_docs
|
|
371
|
+
|
|
372
|
+
async def load_single_file(self, file_path: str) -> LoadedDocument:
|
|
373
|
+
"""
|
|
374
|
+
Load a single document file.
|
|
375
|
+
|
|
376
|
+
Args:
|
|
377
|
+
file_path: Path to the file.
|
|
378
|
+
|
|
379
|
+
Returns:
|
|
380
|
+
LoadedDocument object.
|
|
381
|
+
|
|
382
|
+
Raises:
|
|
383
|
+
ValueError: If the file type is not supported.
|
|
384
|
+
FileNotFoundError: If the file doesn't exist.
|
|
385
|
+
"""
|
|
386
|
+
path = Path(file_path)
|
|
387
|
+
|
|
388
|
+
if not path.exists():
|
|
389
|
+
raise FileNotFoundError(f"File not found: {file_path}")
|
|
390
|
+
|
|
391
|
+
if path.suffix.lower() not in self.extensions:
|
|
392
|
+
raise ValueError(
|
|
393
|
+
f"Unsupported file type: {path.suffix}. "
|
|
394
|
+
f"Supported: {', '.join(self.extensions)}"
|
|
395
|
+
)
|
|
396
|
+
|
|
397
|
+
reader = SimpleDirectoryReader(
|
|
398
|
+
input_files=[str(path)],
|
|
399
|
+
filename_as_id=True,
|
|
400
|
+
)
|
|
401
|
+
docs = reader.load_data()
|
|
402
|
+
|
|
403
|
+
if not docs:
|
|
404
|
+
raise ValueError(f"No content loaded from file: {file_path}")
|
|
405
|
+
|
|
406
|
+
doc = docs[0]
|
|
407
|
+
|
|
408
|
+
# Detect language for code files
|
|
409
|
+
language = None
|
|
410
|
+
source_type = "doc" # Default to document
|
|
411
|
+
path_ext = path.suffix.lower()
|
|
412
|
+
if path_ext in self.CODE_EXTENSIONS:
|
|
413
|
+
source_type = "code"
|
|
414
|
+
language = LanguageDetector.detect_language(str(path), doc.text)
|
|
415
|
+
|
|
416
|
+
return LoadedDocument(
|
|
417
|
+
text=doc.text,
|
|
418
|
+
source=file_path,
|
|
419
|
+
file_name=path.name,
|
|
420
|
+
file_path=str(path),
|
|
421
|
+
file_size=path.stat().st_size,
|
|
422
|
+
metadata={
|
|
423
|
+
**doc.metadata,
|
|
424
|
+
"doc_id": doc.doc_id,
|
|
425
|
+
"source_type": source_type,
|
|
426
|
+
"language": language,
|
|
427
|
+
},
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
async def load_files(
|
|
431
|
+
self,
|
|
432
|
+
folder_path: str,
|
|
433
|
+
recursive: bool = True,
|
|
434
|
+
include_code: bool = False,
|
|
435
|
+
) -> list[LoadedDocument]:
|
|
436
|
+
"""
|
|
437
|
+
Load documents and optionally code files from a folder.
|
|
438
|
+
|
|
439
|
+
Args:
|
|
440
|
+
folder_path: Path to the folder containing files to load.
|
|
441
|
+
recursive: Whether to scan subdirectories recursively.
|
|
442
|
+
include_code: Whether to include source code files alongside documents.
|
|
443
|
+
|
|
444
|
+
Returns:
|
|
445
|
+
List of LoadedDocument objects with proper metadata.
|
|
446
|
+
|
|
447
|
+
Raises:
|
|
448
|
+
ValueError: If folder path is invalid.
|
|
449
|
+
FileNotFoundError: If folder doesn't exist.
|
|
450
|
+
"""
|
|
451
|
+
# Configure extensions based on include_code flag
|
|
452
|
+
if include_code:
|
|
453
|
+
# Use all supported extensions (docs + code)
|
|
454
|
+
effective_extensions = self.SUPPORTED_EXTENSIONS
|
|
455
|
+
else:
|
|
456
|
+
# Use only document extensions
|
|
457
|
+
effective_extensions = self.DOCUMENT_EXTENSIONS
|
|
458
|
+
|
|
459
|
+
# Create a temporary loader with the effective extensions
|
|
460
|
+
temp_loader = DocumentLoader(supported_extensions=effective_extensions)
|
|
461
|
+
|
|
462
|
+
# Load files using the configured extensions
|
|
463
|
+
loaded_docs = await temp_loader.load_from_folder(folder_path, recursive)
|
|
464
|
+
|
|
465
|
+
# Ensure all documents have proper source_type metadata
|
|
466
|
+
for doc in loaded_docs:
|
|
467
|
+
if not doc.metadata.get("source_type"):
|
|
468
|
+
path_ext = Path(doc.source).suffix.lower()
|
|
469
|
+
if path_ext in self.CODE_EXTENSIONS:
|
|
470
|
+
doc.metadata["source_type"] = "code"
|
|
471
|
+
# Detect language for code files
|
|
472
|
+
language = LanguageDetector.detect_language(doc.source, doc.text)
|
|
473
|
+
if language:
|
|
474
|
+
doc.metadata["language"] = language
|
|
475
|
+
else:
|
|
476
|
+
doc.metadata["source_type"] = "doc"
|
|
477
|
+
doc.metadata["language"] = "markdown" # Default for documents
|
|
478
|
+
|
|
479
|
+
return loaded_docs
|
|
480
|
+
|
|
481
|
+
def get_supported_files(
|
|
482
|
+
self,
|
|
483
|
+
folder_path: str,
|
|
484
|
+
recursive: bool = True,
|
|
485
|
+
) -> list[Path]:
|
|
486
|
+
"""
|
|
487
|
+
Get list of supported files in a folder without loading them.
|
|
488
|
+
|
|
489
|
+
Args:
|
|
490
|
+
folder_path: Path to the folder.
|
|
491
|
+
recursive: Whether to scan subdirectories.
|
|
492
|
+
|
|
493
|
+
Returns:
|
|
494
|
+
List of Path objects for supported files.
|
|
495
|
+
"""
|
|
496
|
+
path = Path(folder_path)
|
|
497
|
+
|
|
498
|
+
if not path.exists() or not path.is_dir():
|
|
499
|
+
return []
|
|
500
|
+
|
|
501
|
+
if recursive:
|
|
502
|
+
files = list(path.rglob("*"))
|
|
503
|
+
else:
|
|
504
|
+
files = list(path.glob("*"))
|
|
505
|
+
|
|
506
|
+
return [f for f in files if f.is_file() and f.suffix.lower() in self.extensions]
|