code-graph-rag 0.0.79__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cgr/__init__.py +14 -0
- code_graph_rag-0.0.79.dist-info/METADATA +948 -0
- code_graph_rag-0.0.79.dist-info/RECORD +34 -0
- code_graph_rag-0.0.79.dist-info/WHEEL +5 -0
- code_graph_rag-0.0.79.dist-info/entry_points.txt +3 -0
- code_graph_rag-0.0.79.dist-info/licenses/LICENSE +21 -0
- code_graph_rag-0.0.79.dist-info/top_level.txt +3 -0
- codebase_rag/__init__.py +0 -0
- codebase_rag/cli.py +469 -0
- codebase_rag/cli_help.py +93 -0
- codebase_rag/config.py +370 -0
- codebase_rag/constants.py +2816 -0
- codebase_rag/cypher_queries.py +128 -0
- codebase_rag/decorators.py +160 -0
- codebase_rag/embedder.py +48 -0
- codebase_rag/exceptions.py +59 -0
- codebase_rag/graph_loader.py +154 -0
- codebase_rag/graph_updater.py +470 -0
- codebase_rag/language_spec.py +427 -0
- codebase_rag/logs.py +624 -0
- codebase_rag/main.py +1074 -0
- codebase_rag/models.py +94 -0
- codebase_rag/parser_loader.py +292 -0
- codebase_rag/prompts.py +273 -0
- codebase_rag/readme_sections.py +248 -0
- codebase_rag/schema_builder.py +41 -0
- codebase_rag/schemas.py +88 -0
- codebase_rag/tool_errors.py +71 -0
- codebase_rag/types_defs.py +558 -0
- codebase_rag/unixcoder.py +278 -0
- codebase_rag/vector_store.py +80 -0
- codec/__init__.py +0 -0
- codec/schema_pb2.py +61 -0
- codec/schema_pb2.pyi +293 -0
codebase_rag/models.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TYPE_CHECKING, NamedTuple
|
|
5
|
+
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
|
|
8
|
+
from .constants import SupportedLanguage
|
|
9
|
+
from .types_defs import MCPHandlerType, MCPInputSchema, PropertyValue
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from tree_sitter import Node
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class SessionState:
    """Mutable per-session flags and settings."""

    # Defaults to True; presumably controls whether edits need confirmation
    # (verify against callers).
    confirm_edits: bool = True
    # Optional log file path; None when no file is configured.
    log_file: Path | None = None
    # Cancellation flag; cleared by reset_cancelled().
    cancelled: bool = False

    def reset_cancelled(self) -> None:
        """Set ``cancelled`` back to False."""
        self.cancelled = False
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _default_console() -> Console:
    """Create the Console used as the default for ``AppContext.console``.

    The console is built with ``width=None`` and ``force_terminal=True``.
    """
    console = Console(width=None, force_terminal=True)
    return console
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class AppContext:
    """Bundle of the session state and the console object."""

    # A fresh SessionState is created per AppContext instance.
    session: SessionState = field(default_factory=SessionState)
    # Built by _default_console() unless a console is supplied.
    console: Console = field(default_factory=_default_console)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class GraphNode:
    """A graph node: integer id, list of labels and a property mapping."""

    node_id: int
    labels: list[str]
    # Property name -> value (value types are defined by PropertyValue).
    properties: dict[str, PropertyValue]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class GraphRelationship:
    """A typed edge between two node ids, with a property mapping."""

    # presumably these refer to GraphNode.node_id values — TODO confirm
    from_id: int
    to_id: int
    # Relationship type name. NOTE: the field name shadows the builtin `type`
    # inside the class body; it is part of the public interface.
    type: str
    properties: dict[str, PropertyValue]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class FQNSpec(NamedTuple):
    """Per-language hooks and node-type sets.

    NOTE(review): the name suggests these drive fully-qualified-name
    construction; confirm against the code that consumes this tuple.
    """

    scope_node_types: frozenset[str]
    function_node_types: frozenset[str]
    # Maps a tree-sitter Node to a name, or None when it has none.
    # "Node" is a string forward reference: it is imported only under
    # TYPE_CHECKING.
    get_name: Callable[["Node"], str | None]
    # Takes two paths and returns a list of module path parts
    # (argument roles are not visible here — verify against implementations).
    file_to_module_parts: Callable[[Path, Path], list[str]]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass(frozen=True)
class LanguageSpec:
    """Immutable description of how one language is parsed.

    The ``*_node_types`` tuples hold tree-sitter node type names. The
    ``*_query`` strings are optional explicit query patterns; every field from
    ``call_node_types`` onward has a default.
    """

    language: SupportedLanguage | str
    file_extensions: tuple[str, ...]
    function_node_types: tuple[str, ...]
    class_node_types: tuple[str, ...]
    module_node_types: tuple[str, ...]
    call_node_types: tuple[str, ...] = ()
    import_node_types: tuple[str, ...] = ()
    import_from_node_types: tuple[str, ...] = ()
    # presumably tree-sitter field names for a definition's name and body
    # — TODO confirm against the consumers of these fields
    name_field: str = "name"
    body_field: str = "body"
    package_indicators: tuple[str, ...] = ()
    # Optional explicit query strings; None means no explicit query is set.
    function_query: str | None = None
    class_query: str | None = None
    call_query: str | None = None
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass
class Dependency:
    """A named dependency with a spec string and optional string properties."""

    name: str
    # presumably a version/requirement specifier — TODO confirm
    spec: str
    # Each instance gets its own empty dict by default.
    properties: dict[str, str] = field(default_factory=dict)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass
class MethodModifiersAndAnnotations:
    """Two string lists, ``modifiers`` and ``annotations``; both default to empty."""

    # default_factory gives every instance its own list.
    modifiers: list[str] = field(default_factory=list)
    annotations: list[str] = field(default_factory=list)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass
class ToolMetadata:
    """Descriptor for one tool: name, description, input schema and handler."""

    name: str
    description: str
    # Schema of the tool's input, typed as MCPInputSchema.
    input_schema: MCPInputSchema
    # Callable/handler object, typed as MCPHandlerType.
    handler: MCPHandlerType
    # presumably whether the handler's result is JSON — TODO confirm
    returns_json: bool
|
|
codebase_rag/parser_loader.py
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
import subprocess
|
|
3
|
+
import sys
|
|
4
|
+
from copy import deepcopy
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from loguru import logger
|
|
8
|
+
from tree_sitter import Language, Parser, Query
|
|
9
|
+
|
|
10
|
+
from . import constants as cs
|
|
11
|
+
from . import exceptions as ex
|
|
12
|
+
from . import logs as ls
|
|
13
|
+
from .language_spec import LANGUAGE_SPECS, LanguageSpec
|
|
14
|
+
from .types_defs import LanguageImport, LanguageLoader, LanguageQueries
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _try_load_from_submodule(lang_name: cs.SupportedLanguage) -> LanguageLoader:
    """Try to load a language loader from a local grammar checkout.

    Looks for a bindings directory under
    ``GRAMMARS_DIR/<TREE_SITTER_PREFIX><lang_name>/<BINDINGS_DIR>/<PYTHON>``.
    When the checkout contains a ``SETUP_PY`` file, the bindings are first
    built in place in a subprocess. The binding module is then imported with
    the bindings directory temporarily placed at the front of ``sys.path``,
    and the first matching language attribute on the module is returned.

    Args:
        lang_name: Language whose grammar checkout should be loaded.

    Returns:
        The attribute found on the imported module, or ``None`` when the
        bindings directory is missing, the build fails, the import raises, or
        the module exposes none of the expected attributes. All failures are
        logged at debug level and never raised.
    """
    submodule_path = Path(cs.GRAMMARS_DIR) / f"{cs.TREE_SITTER_PREFIX}{lang_name}"
    python_bindings_path = (
        submodule_path / cs.BINDINGS_DIR / cs.SupportedLanguage.PYTHON
    )

    if not python_bindings_path.exists():
        return None

    python_bindings_str = str(python_bindings_path)
    # Remember whether this call added the entry, so cleanup never removes a
    # path that was already on sys.path before we ran.
    path_added = False
    try:
        if python_bindings_str not in sys.path:
            sys.path.insert(0, python_bindings_str)
            path_added = True

        try:
            module_name = f"{cs.TREE_SITTER_MODULE_PREFIX}{lang_name.replace('-', '_')}"

            setup_py_path = submodule_path / cs.SETUP_PY
            if setup_py_path.exists():
                logger.debug(ls.BUILDING_BINDINGS.format(lang=lang_name))
                # check=False: a failed build is reported via returncode and
                # logged below instead of raising CalledProcessError.
                result = subprocess.run(
                    [sys.executable, cs.SETUP_PY, cs.BUILD_EXT_CMD, cs.INPLACE_FLAG],
                    check=False,
                    cwd=str(submodule_path),
                    capture_output=True,
                    text=True,
                )

                if result.returncode != 0:
                    logger.debug(
                        ls.BUILD_FAILED.format(
                            lang=lang_name, stdout=result.stdout, stderr=result.stderr
                        )
                    )
                    return None
                logger.debug(ls.BUILD_SUCCESS.format(lang=lang_name))

            logger.debug(ls.IMPORTING_MODULE.format(module=module_name))
            module = importlib.import_module(module_name)

            # Candidate attribute names, tried in order; the last one is the
            # hyphen-to-underscore variant of the language name.
            language_attrs: list[str] = [
                cs.QUERY_LANGUAGE,
                f"{cs.LANG_ATTR_PREFIX}{lang_name}",
                f"{cs.LANG_ATTR_PREFIX}{lang_name.replace('-', '_')}",
            ]

            for attr_name in language_attrs:
                if hasattr(module, attr_name):
                    logger.debug(
                        ls.LOADED_FROM_SUBMODULE.format(lang=lang_name, attr=attr_name)
                    )
                    loader: LanguageLoader = getattr(module, attr_name)
                    return loader

            logger.debug(
                ls.NO_LANG_ATTR.format(module=module_name, available=dir(module))
            )

        finally:
            # Runs on every exit path above, including the early returns.
            if path_added and python_bindings_str in sys.path:
                sys.path.remove(python_bindings_str)

    except Exception as e:
        # Deliberate best-effort: any failure means "not available".
        logger.debug(ls.SUBMODULE_LOAD_FAILED.format(lang=lang_name, error=e))

    return None
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _try_import_language(
    module_path: str, attr_name: str, lang_name: cs.SupportedLanguage
) -> LanguageLoader:
    """Import a language loader from an installed module, with a fallback.

    Args:
        module_path: Dotted name of the module to import.
        attr_name: Attribute on that module holding the loader.
        lang_name: Language passed to ``_try_load_from_submodule`` when the
            installed module cannot provide the loader.

    Returns:
        ``getattr(module, attr_name)`` when the import and lookup succeed,
        otherwise whatever ``_try_load_from_submodule(lang_name)`` returns.
    """
    try:
        module = importlib.import_module(module_path)
        loader: LanguageLoader = getattr(module, attr_name)
        return loader
    except (ImportError, AttributeError):
        # AttributeError: the module is installed but does not expose
        # attr_name. Treat it like a missing package instead of letting it
        # escape from the module-level loader table construction.
        return _try_load_from_submodule(lang_name)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _import_language_loaders() -> dict[cs.SupportedLanguage, LanguageLoader]:
    """Build the mapping from language to its loader.

    Each built-in language is first resolved through ``_try_import_language``.
    Afterwards every key of ``LANGUAGE_SPECS`` that is absent or mapped to
    ``None`` gets one more attempt through ``_try_load_from_submodule``.

    Returns:
        Dict keyed by language; values may still be ``None`` when no loader
        could be obtained.
    """
    # (language, binding module, attribute holding the loader). The submodule
    # name given to LanguageImport is the language itself for every entry.
    table = (
        (cs.SupportedLanguage.PYTHON, cs.TreeSitterModule.PYTHON, cs.QUERY_LANGUAGE),
        (cs.SupportedLanguage.JS, cs.TreeSitterModule.JS, cs.QUERY_LANGUAGE),
        (cs.SupportedLanguage.TS, cs.TreeSitterModule.TS, cs.LANG_ATTR_TYPESCRIPT),
        (cs.SupportedLanguage.RUST, cs.TreeSitterModule.RUST, cs.QUERY_LANGUAGE),
        (cs.SupportedLanguage.GO, cs.TreeSitterModule.GO, cs.QUERY_LANGUAGE),
        (cs.SupportedLanguage.SCALA, cs.TreeSitterModule.SCALA, cs.QUERY_LANGUAGE),
        (cs.SupportedLanguage.JAVA, cs.TreeSitterModule.JAVA, cs.QUERY_LANGUAGE),
        (cs.SupportedLanguage.CPP, cs.TreeSitterModule.CPP, cs.QUERY_LANGUAGE),
        (cs.SupportedLanguage.LUA, cs.TreeSitterModule.LUA, cs.QUERY_LANGUAGE),
    )
    language_imports: list[LanguageImport] = [
        LanguageImport(lang, module, attr, lang) for lang, module, attr in table
    ]

    loaders: dict[cs.SupportedLanguage, LanguageLoader] = {}
    for entry in language_imports:
        loaders[entry.lang_key] = _try_import_language(
            entry.module_path,
            entry.attr_name,
            entry.submodule_name,
        )

    # Second pass: languages known to LANGUAGE_SPECS that still have no loader.
    for spec_key in LANGUAGE_SPECS:
        lang_name = cs.SupportedLanguage(spec_key)
        if loaders.get(lang_name) is None:
            loaders[lang_name] = _try_load_from_submodule(lang_name)

    return loaders
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# Resolved once when this module is imported. NOTE: this imports the grammar
# modules and, through _try_load_from_submodule, may run a setup.py build in a
# subprocess as an import-time side effect.
_language_loaders = _import_language_loaders()

# Public name for the same dict object, keyed by language.
LANGUAGE_LIBRARIES: dict[cs.SupportedLanguage, LanguageLoader] = _language_loaders
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _build_query_pattern(node_types: tuple[str, ...], capture_name: str) -> str:
|
|
176
|
+
return " ".join([f"({node_type}) @{capture_name}" for node_type in node_types])
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _get_locals_pattern(lang_name: cs.SupportedLanguage) -> str | None:
    """Return the locals query pattern for a language, if one is defined.

    Only JS and TS have a pattern; every other language yields ``None``.
    """
    if lang_name == cs.SupportedLanguage.JS:
        return cs.JS_LOCALS_PATTERN
    if lang_name == cs.SupportedLanguage.TS:
        return cs.TS_LOCALS_PATTERN
    return None
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _build_combined_import_pattern(lang_config: LanguageSpec) -> str:
    """Combine the import and import-from patterns into one query string.

    A pattern is included only when it is not blank; the import-from pattern
    is additionally skipped when it equals the import pattern.

    Returns:
        The included patterns joined by a single space (possibly empty).
    """
    plain = _build_query_pattern(lang_config.import_node_types, cs.CAPTURE_IMPORT)
    from_style = _build_query_pattern(
        lang_config.import_from_node_types, cs.CAPTURE_IMPORT_FROM
    )

    parts: list[str] = [plain] if plain.strip() else []
    if from_style.strip() and from_style != plain:
        parts.append(from_style)
    return " ".join(parts)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _create_optional_query(language: Language, pattern: str | None) -> Query | None:
|
|
206
|
+
return Query(language, pattern) if pattern else None
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _create_locals_query(
    language: Language, lang_name: cs.SupportedLanguage
) -> Query | None:
    """Compile the locals query for ``lang_name`` when it has a pattern.

    Returns:
        The compiled Query, or ``None`` when the language has no locals
        pattern or compiling it raises (the error is logged at debug level).
    """
    pattern = _get_locals_pattern(lang_name)
    if pattern:
        try:
            return Query(language, pattern)
        except Exception as exc:
            logger.debug(ls.LOCALS_QUERY_FAILED.format(lang=lang_name, error=exc))
    return None
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _create_language_queries(
    language: Language,
    parser: Parser,
    lang_config: LanguageSpec,
    lang_name: cs.SupportedLanguage,
) -> LanguageQueries:
    """Assemble the LanguageQueries bundle for one language.

    For functions, classes and calls the explicit query string on
    ``lang_config`` is used when it is truthy; otherwise a pattern is generated
    from the matching node-type tuple. Import patterns are always generated.
    Empty patterns produce ``None`` entries.
    """
    # Resolve every pattern string before compiling any query.
    fn_pattern = lang_config.function_query
    if not fn_pattern:
        fn_pattern = _build_query_pattern(
            lang_config.function_node_types, cs.CAPTURE_FUNCTION
        )

    cls_pattern = lang_config.class_query
    if not cls_pattern:
        cls_pattern = _build_query_pattern(
            lang_config.class_node_types, cs.CAPTURE_CLASS
        )

    call_pattern = lang_config.call_query
    if not call_pattern:
        call_pattern = _build_query_pattern(
            lang_config.call_node_types, cs.CAPTURE_CALL
        )

    import_pattern = _build_combined_import_pattern(lang_config)

    # Compile in the same order as the keyword arguments below.
    functions_q = _create_optional_query(language, fn_pattern)
    classes_q = _create_optional_query(language, cls_pattern)
    calls_q = _create_optional_query(language, call_pattern)
    imports_q = _create_optional_query(language, import_pattern)
    locals_q = _create_locals_query(language, lang_name)

    return LanguageQueries(
        functions=functions_q,
        classes=classes_q,
        calls=calls_q,
        imports=imports_q,
        locals=locals_q,
        config=lang_config,
        language=language,
        parser=parser,
    )
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _process_language(
    lang_name: cs.SupportedLanguage,
    lang_config: LanguageSpec,
    parsers: dict[cs.SupportedLanguage, Parser],
    queries: dict[cs.SupportedLanguage, LanguageQueries],
) -> bool:
    """Create the parser and queries for one language and register both.

    Args:
        lang_name: Language to set up.
        lang_config: Spec used to build the queries.
        parsers: Output mapping; receives the Parser on success.
        queries: Output mapping; receives the LanguageQueries on success.

    Returns:
        True when both entries were registered. False when no loader is
        available for the language or setup raised; in that case neither
        mapping is modified.
    """
    lang_lib = LANGUAGE_LIBRARIES.get(lang_name)
    if not lang_lib:
        logger.debug(ls.LIB_NOT_AVAILABLE.format(lang=lang_name))
        return False

    try:
        language = Language(lang_lib())
        parser = Parser(language)
        # Build the queries before touching the output dicts, so a failure
        # here cannot leave a parser registered without matching queries.
        lang_queries = _create_language_queries(
            language, parser, lang_config, lang_name
        )
        parsers[lang_name] = parser
        queries[lang_name] = lang_queries
        logger.success(ls.GRAMMAR_LOADED.format(lang=lang_name))
        return True
    except Exception as e:
        # Best-effort: one broken grammar must not stop the others loading.
        logger.warning(ls.GRAMMAR_LOAD_FAILED.format(lang=lang_name, error=e))
        return False
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def load_parsers() -> tuple[
    dict[cs.SupportedLanguage, Parser], dict[cs.SupportedLanguage, LanguageQueries]
]:
    """Set up parsers and queries for every language in ``LANGUAGE_SPECS``.

    Iterates over a deep copy of ``LANGUAGE_SPECS`` and collects the languages
    that were set up successfully.

    Returns:
        ``(parsers, queries)``, both keyed by language.

    Raises:
        RuntimeError: When not a single language could be loaded.
    """
    parsers: dict[cs.SupportedLanguage, Parser] = {}
    queries: dict[cs.SupportedLanguage, LanguageQueries] = {}
    available_languages: list[cs.SupportedLanguage] = []

    specs = deepcopy(LANGUAGE_SPECS)
    for spec_key, spec in specs.items():
        lang_name = cs.SupportedLanguage(spec_key)
        if not _process_language(lang_name, spec, parsers, queries):
            continue
        available_languages.append(lang_name)

    if not available_languages:
        raise RuntimeError(ex.NO_LANGUAGES)

    loaded = ", ".join(available_languages)
    logger.info(ls.INITIALIZED_PARSERS.format(languages=loaded))
    return parsers, queries
|
codebase_rag/prompts.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING
|
|
2
|
+
|
|
3
|
+
from .cypher_queries import (
|
|
4
|
+
CYPHER_EXAMPLE_CLASS_METHODS,
|
|
5
|
+
CYPHER_EXAMPLE_CONTENT_BY_PATH,
|
|
6
|
+
CYPHER_EXAMPLE_DECORATED_FUNCTIONS,
|
|
7
|
+
CYPHER_EXAMPLE_FILES_IN_FOLDER,
|
|
8
|
+
CYPHER_EXAMPLE_FIND_FILE,
|
|
9
|
+
CYPHER_EXAMPLE_KEYWORD_SEARCH,
|
|
10
|
+
CYPHER_EXAMPLE_LIMIT_ONE,
|
|
11
|
+
CYPHER_EXAMPLE_PYTHON_FILES,
|
|
12
|
+
CYPHER_EXAMPLE_README,
|
|
13
|
+
CYPHER_EXAMPLE_TASKS,
|
|
14
|
+
)
|
|
15
|
+
from .schema_builder import GRAPH_SCHEMA_DEFINITION
|
|
16
|
+
from .types_defs import ToolNames
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from pydantic_ai import Tool
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def extract_tool_names(tools: list["Tool"]) -> ToolNames:
    """Build the ToolNames record used to fill tool names into prompts.

    Each field is looked up by its default tool name in a ``name -> name``
    mapping built from ``tools``, falling back to that same default. Every
    field therefore ends up as its default name whether or not the tool is
    present in ``tools``.
    """
    registered = {tool.name: tool.name for tool in tools}

    def resolve(default_name: str) -> str:
        return registered.get(default_name, default_name)

    return ToolNames(
        query_graph=resolve("query_codebase_knowledge_graph"),
        read_file=resolve("read_file_content"),
        analyze_document=resolve("analyze_document"),
        semantic_search=resolve("semantic_code_search"),
        create_file=resolve("create_new_file"),
        edit_file=resolve("replace_code_surgically"),
        shell_command=resolve("execute_shell_command"),
    )
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
CYPHER_QUERY_RULES = """**2. Critical Cypher Query Rules**
|
|
38
|
+
|
|
39
|
+
- **ALWAYS Return Specific Properties with Aliases**: Do NOT return whole nodes (e.g., `RETURN n`). You MUST return specific properties with clear aliases (e.g., `RETURN n.name AS name`).
|
|
40
|
+
- **Use `STARTS WITH` for Paths**: When matching paths, always use `STARTS WITH` for robustness (e.g., `WHERE n.path STARTS WITH 'workflows/src'`). Do not use `=`.
|
|
41
|
+
- **Use `ENDS WITH` for qualified_name**: The `qualified_name` property contains full paths like `'Project.folder.subfolder.ClassName'`. When users mention a class, function, or method by its short name (e.g., "VatManager"), use `ENDS WITH` to match: `WHERE c.qualified_name ENDS WITH '.VatManager'`. Do NOT use `{name: 'VatManager'}` equality matching.
|
|
42
|
+
- **Use `toLower()` for Searches**: For case-insensitive searching on string properties, use `toLower()`.
|
|
43
|
+
- **Querying Lists**: To check if a list property (like `decorators`) contains an item, use the `ANY` or `IN` clause (e.g., `WHERE 'flow' IN n.decorators`)."""
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def build_graph_schema_and_rules() -> str:
    """Return the shared prompt preamble.

    The text consists of an introduction, section 1 (the graph schema taken
    from ``GRAPH_SCHEMA_DEFINITION``) and section 2 (``CYPHER_QUERY_RULES``).
    """
    return f"""You are an expert AI assistant for analyzing codebases using a **hybrid retrieval system**: a **Memgraph knowledge graph** for structural queries and a **semantic code search engine** for intent-based discovery.

**1. Graph Schema Definition**
The database contains information about a codebase, structured with the following nodes and relationships.

{GRAPH_SCHEMA_DEFINITION}

{CYPHER_QUERY_RULES}
"""


# Preamble computed once at import time; embedded in both Cypher system prompts.
GRAPH_SCHEMA_AND_RULES = build_graph_schema_and_rules()
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def build_rag_orchestrator_prompt(tools: list["Tool"]) -> str:
    """Return the orchestrator system prompt with tool names filled in.

    Tool names come from ``extract_tool_names(tools)`` and are interpolated
    wherever the prompt refers to a tool.
    """
    t = extract_tool_names(tools)
    # NOTE(review): leading indentation of the prompt's nested list items was
    # not visible in the reviewed text — confirm against the repository copy.
    return f"""You are an expert AI assistant for analyzing codebases. Your answers are based **EXCLUSIVELY** on information retrieved using your tools.

**CRITICAL RULES:**
1. **TOOL-ONLY ANSWERS**: You must ONLY use information from the tools provided. Do not use external knowledge.
2. **NATURAL LANGUAGE QUERIES**: When using the `{t.query_graph}` tool, ALWAYS use natural language questions. NEVER write Cypher queries directly - the tool will translate your natural language into the appropriate database query.
3. **HONESTY**: If a tool fails or returns no results, you MUST state that clearly and report any error messages. Do not invent answers.
4. **CHOOSE THE RIGHT TOOL FOR THE FILE TYPE**:
- For source code files (.py, .ts, etc.), use `{t.read_file}`.
- For documents like PDFs, use the `{t.analyze_document}` tool. This is more effective than trying to read them as plain text.

**Your General Approach:**
1. **Analyze Documents**: If the user asks a question about a document (like a PDF), you **MUST** use the `{t.analyze_document}` tool. Provide both the `file_path` and the user's `question` to the tool.
2. **Deep Dive into Code**: When you identify a relevant component (e.g., a folder), you must go beyond documentation.
a. First, check if documentation files like `README.md` exist and read them for context. For configuration, look for files appropriate to the language (e.g., `pyproject.toml` for Python, `package.json` for Node.js).
b. **Then, you MUST dive into the source code.** Explore the `src` directory (or equivalent). Identify and read key files (e.g., `main.py`, `index.ts`, `app.ts`) to understand the implementation details, logic, and functionality.
c. Synthesize all this information—from documentation, configuration, and the code itself—to provide a comprehensive, factual answer. Do not just describe the files; explain what the code *does*.
d. Only ask for clarification if, after a thorough investigation, the user's intent is still unclear.
3. **Choose the Right Search Strategy - SEMANTIC FIRST for Intent**:
a. **WHEN TO USE SEMANTIC SEARCH FIRST**: Always start with `{t.semantic_search}` for ANY of these patterns:
- "main entry point", "startup", "initialization", "bootstrap", "launcher"
- "error handling", "validation", "authentication"
- "where is X done", "how does Y work", "find Z logic"
- Any question about PURPOSE, INTENT, or FUNCTIONALITY

**Entry Point Recognition Patterns**:
- Python: `if __name__ == "__main__"`, `main()` function, CLI scripts, `app.run()`
- JavaScript/TypeScript: `index.js`, `main.ts`, `app.js`, `server.js`, package.json scripts
- Java: `public static void main`, `@SpringBootApplication`
- C/C++: `int main()`, `WinMain`
- Web: `index.html`, routing configurations, startup middleware

b. **WHEN TO USE GRAPH DIRECTLY**: Only use `{t.query_graph}` directly for pure structural queries:
- "What does function X call?" (when you already know X's name)
- "List methods of User class" (when you know the exact class name)
- "Show files in folder Y" (when you know the exact folder path)

c. **HYBRID APPROACH (RECOMMENDED)**: For most queries, use this sequence:
1. Use `{t.semantic_search}` to find relevant code elements by intent/meaning
2. Then use `{t.query_graph}` to explore structural relationships
3. **CRITICAL**: Always read the actual files using `{t.read_file}` to examine source code
4. For entry points specifically: Look for `if __name__ == "__main__"`, `main()` functions, or CLI entry points

d. **Tool Chaining Example**: For "main entry point and what it calls":
1. `{t.semantic_search}` for focused terms like "main entry startup" (not overly broad)
2. `{t.query_graph}` to find specific function relationships
3. `{t.read_file}` for main.py with targeted sections (use offset/limit for large files)
4. Look for the true application entry point (main function, __main__ block, CLI commands)
5. If you find CLI frameworks (typer, click, argparse), read relevant command sections only
6. Summarize execution flow concisely rather than showing all details
4. **Plan Before Writing or Modifying**:
a. Before using `{t.create_file}`, `{t.edit_file}`, or modifying files, you MUST explore the codebase to find the correct location and file structure.
b. For shell commands: If `{t.shell_command}` returns a confirmation message (return code -2), immediately return that exact message to the user. When they respond "yes", call the tool again with `user_confirmed=True`.
5. **Execute Shell Commands**: The `{t.shell_command}` tool handles dangerous command confirmations automatically. If it returns a confirmation prompt, pass it directly to the user.
6. **Complete the Investigation Cycle**: For entry point queries, you MUST:
a. Find candidate functions via semantic search
b. Explore their relationships via graph queries
c. **AUTOMATICALLY read main.py** (or main entry file) - NEVER ask the user for permission
d. Look for the ACTUAL startup code: `if __name__ == "__main__"`, CLI commands, `main()` functions
e. If CLI framework detected (typer, click, argparse), examine command functions
f. Distinguish between helper functions and the real application entry point
g. Show the complete execution flow from the true entry point through initialization
7. **Token Management**: Be efficient with context usage:
a. For semantic search, use focused queries (not overly broad terms)
b. For file reading, read specific sections when possible using offset/limit
c. Summarize large results rather than including full content
d. Prioritize most relevant findings over comprehensive coverage
8. **Synthesize Answer**: Analyze and explain the retrieved content. Cite your sources (file paths or qualified names). Report any errors gracefully.
"""
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# System prompt for translating natural-language questions into Cypher.
# Sections 1-2 come from GRAPH_SCHEMA_AND_RULES; sections 3-5 are defined here,
# with example queries interpolated from the CYPHER_EXAMPLE_* constants.
# NOTE(review): leading indentation of nested list items was not visible in the
# reviewed text — confirm against the repository copy.
CYPHER_SYSTEM_PROMPT = f"""
You are an expert translator that converts natural language questions about code structure into precise Neo4j Cypher queries.

{GRAPH_SCHEMA_AND_RULES}

**3. Query Optimization Rules**

- **LIMIT Results**: ALWAYS add `LIMIT 50` to queries that list items. This prevents overwhelming responses.
- **Aggregation Queries**: When asked "how many", "count", or "total", return ONLY the count, not all items:
- CORRECT: `MATCH (c:Class) RETURN count(c) AS total`
- WRONG: `MATCH (c:Class) RETURN c.name, c.path, count(c) AS total` (returns all items!)
- **List vs Count**: If asked to "list" or "show", return items with LIMIT. If asked to "count" or "how many", return only the count.

**4. Query Patterns & Examples**
When listing items, return the `name`, `path`, and `qualified_name` with a LIMIT.

**Pattern: Counting Items**
cypher// "How many classes are there?" or "Count all functions"
MATCH (c:Class) RETURN count(c) AS total

**Pattern: Finding Decorated Functions/Methods (e.g., Workflows, Tasks)**
cypher// "Find all prefect flows" or "what are the workflows?" or "show me the tasks"
// Use the 'IN' operator to check the 'decorators' list property.
{CYPHER_EXAMPLE_DECORATED_FUNCTIONS}

**Pattern: Finding Content by Path (Robustly)**
cypher// "what is in the 'workflows/src' directory?" or "list files in workflows"
// Use `STARTS WITH` for path matching.
{CYPHER_EXAMPLE_CONTENT_BY_PATH}

**Pattern: Keyword & Concept Search (Fallback for general terms)**
cypher// "find things related to 'database'"
{CYPHER_EXAMPLE_KEYWORD_SEARCH}

**Pattern: Finding a Specific File**
cypher// "Find the main README.md"
{CYPHER_EXAMPLE_FIND_FILE}

**Pattern: Finding Methods of a Class by Short Name**
cypher// "What methods does UserService have?" or "Show me methods in UserService" or "List UserService methods"
// Use `ENDS WITH` to match the class by short name since qualified_name contains full path.
{CYPHER_EXAMPLE_CLASS_METHODS}

**5. Output Format**
Provide only the Cypher query.
"""
|
|
179
|
+
|
|
180
|
+
# (H) Stricter prompt for less capable open-source/local models (e.g., Ollama)
# Shares GRAPH_SCHEMA_AND_RULES with CYPHER_SYSTEM_PROMPT, then adds numbered
# generation rules and worked examples filled from the CYPHER_EXAMPLE_* constants.
# NOTE(review): leading indentation of nested list items and fenced examples was
# not visible in the reviewed text — confirm against the repository copy.
LOCAL_CYPHER_SYSTEM_PROMPT = f"""
You are a Neo4j Cypher query generator. You ONLY respond with a valid Cypher query. Do not add explanations or markdown.

{GRAPH_SCHEMA_AND_RULES}

**CRITICAL RULES FOR QUERY GENERATION:**
1. **NO `UNION`**: Never use the `UNION` clause. Generate a single, simple `MATCH` query.
2. **BIND and ALIAS**: You must bind every node you use to a variable (e.g., `MATCH (f:File)`). You must use that variable to access properties and alias every returned property (e.g., `RETURN f.path AS path`).
3. **RETURN STRUCTURE**: Your query should aim to return `name`, `path`, and `qualified_name` so the calling system can use the results.
- For `File` nodes, return `f.path AS path`.
- For code nodes (`Class`, `Function`, etc.), return `n.qualified_name AS qualified_name`.
4. **KEEP IT SIMPLE**: Do not try to be clever. A simple query that returns a few relevant nodes is better than a complex one that fails.
5. **CLAUSE ORDER**: You MUST follow the standard Cypher clause order: `MATCH`, `WHERE`, `RETURN`, `LIMIT`.
6. **ALWAYS ADD LIMIT**: For queries that list items, ALWAYS add `LIMIT 50` to prevent overwhelming responses.
7. **AGGREGATION QUERIES**: When asked "how many" or "count", return ONLY the count:
- CORRECT: `MATCH (c:Class) RETURN count(c) AS total`
- WRONG: `MATCH (c:Class) RETURN c.name, count(c) AS total` (returns all items!)

**Examples:**

* **Natural Language:** "How many classes are there?"
* **Cypher Query:**
```cypher
MATCH (c:Class) RETURN count(c) AS total
```

* **Natural Language:** "Find the main README file"
* **Cypher Query:**
```cypher
{CYPHER_EXAMPLE_README}
```

* **Natural Language:** "Find all python files"
* **Cypher Query (Note the '.' in extension):**
```cypher
{CYPHER_EXAMPLE_PYTHON_FILES}
```

* **Natural Language:** "show me the tasks"
* **Cypher Query:**
```cypher
{CYPHER_EXAMPLE_TASKS}
```

* **Natural Language:** "list files in the services folder"
* **Cypher Query:**
```cypher
{CYPHER_EXAMPLE_FILES_IN_FOLDER}
```

* **Natural Language:** "Find just one file to test"
* **Cypher Query:**
```cypher
{CYPHER_EXAMPLE_LIMIT_ONE}
```

* **Natural Language:** "What methods does UserService have?" or "Show me methods in UserService" or "List UserService methods"
* **Cypher Query (Use ENDS WITH to match class by short name):**
```cypher
{CYPHER_EXAMPLE_CLASS_METHODS}
```
"""
|
|
243
|
+
|
|
244
|
+
OPTIMIZATION_PROMPT = """
|
|
245
|
+
I want you to analyze my {language} codebase and propose specific optimizations based on best practices.
|
|
246
|
+
|
|
247
|
+
Please:
|
|
248
|
+
1. Use your code retrieval and graph querying tools to understand the codebase structure
|
|
249
|
+
2. Read relevant source files to identify optimization opportunities
|
|
250
|
+
3. Reference established patterns and best practices for {language}
|
|
251
|
+
4. Propose specific, actionable optimizations with file references
|
|
252
|
+
5. IMPORTANT: Do not make any changes yet - just propose them and wait for approval
|
|
253
|
+
6. After approval, use your file editing tools to implement the changes
|
|
254
|
+
|
|
255
|
+
Start by analyzing the codebase structure and identifying the main areas that could benefit from optimization.
|
|
256
|
+
Remember: Propose changes first, wait for my approval, then implement.
|
|
257
|
+
"""
|
|
258
|
+
|
|
259
|
+
OPTIMIZATION_PROMPT_WITH_REFERENCE = """
|
|
260
|
+
I want you to analyze my {language} codebase and propose specific optimizations based on best practices.
|
|
261
|
+
|
|
262
|
+
Please:
|
|
263
|
+
1. Use your code retrieval and graph querying tools to understand the codebase structure
|
|
264
|
+
2. Read relevant source files to identify optimization opportunities
|
|
265
|
+
3. Use the analyze_document tool to reference best practices from {reference_document}
|
|
266
|
+
4. Reference established patterns and best practices for {language}
|
|
267
|
+
5. Propose specific, actionable optimizations with file references
|
|
268
|
+
6. IMPORTANT: Do not make any changes yet - just propose them and wait for approval
|
|
269
|
+
7. After approval, use your file editing tools to implement the changes
|
|
270
|
+
|
|
271
|
+
Start by analyzing the codebase structure and identifying the main areas that could benefit from optimization.
|
|
272
|
+
Remember: Propose changes first, wait for my approval, then implement.
|
|
273
|
+
"""
|