code-graph-rag 0.0.79__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codebase_rag/models.py ADDED
@@ -0,0 +1,94 @@
1
+ from collections.abc import Callable
2
+ from dataclasses import dataclass, field
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, NamedTuple
5
+
6
+ from rich.console import Console
7
+
8
+ from .constants import SupportedLanguage
9
+ from .types_defs import MCPHandlerType, MCPInputSchema, PropertyValue
10
+
11
+ if TYPE_CHECKING:
12
+ from tree_sitter import Node
13
+
14
+
15
+ @dataclass
16
+ class SessionState:
17
+ confirm_edits: bool = True
18
+ log_file: Path | None = None
19
+ cancelled: bool = False
20
+
21
+ def reset_cancelled(self) -> None:
22
+ self.cancelled = False
23
+
24
+
25
def _default_console() -> Console:
    """Build the default rich ``Console``.

    The console is created with ``width=None`` and ``force_terminal=True``.
    """
    console = Console(width=None, force_terminal=True)
    return console
27
+
28
+
29
@dataclass
class AppContext:
    """Bundle of the session state and the console used for output.

    Both fields use a ``default_factory`` so every ``AppContext()`` gets its
    own ``SessionState`` and its own ``Console`` instance.
    """

    # Fresh SessionState per context (never shared between instances).
    session: SessionState = field(default_factory=SessionState)
    # Console built by _default_console() unless one is supplied.
    console: Console = field(default_factory=_default_console)
33
+
34
+
35
@dataclass
class GraphNode:
    """A single graph node: an integer id, its labels and its properties."""

    # Integer identifier of the node.
    node_id: int
    # Labels attached to the node.
    labels: list[str]
    # Property name -> value mapping.
    properties: dict[str, PropertyValue]
40
+
41
+
42
@dataclass
class GraphRelationship:
    """A typed relationship between two nodes, identified by their ids."""

    # Id of the node the relationship starts from.
    from_id: int
    # Id of the node the relationship points to.
    to_id: int
    # Relationship type name (field name kept as-is: it is part of the interface).
    type: str
    # Property name -> value mapping.
    properties: dict[str, PropertyValue]
48
+
49
+
50
+ class FQNSpec(NamedTuple):
51
+ scope_node_types: frozenset[str]
52
+ function_node_types: frozenset[str]
53
+ get_name: Callable[["Node"], str | None]
54
+ file_to_module_parts: Callable[[Path, Path], list[str]]
55
+
56
+
57
@dataclass(frozen=True)
class LanguageSpec:
    """Immutable description of one language's node types and queries.

    The first five fields are required; every other field has a default.
    The ``*_query`` fields default to ``None``.
    """

    # --- required ---
    language: SupportedLanguage | str
    file_extensions: tuple[str, ...]
    function_node_types: tuple[str, ...]
    class_node_types: tuple[str, ...]
    module_node_types: tuple[str, ...]

    # --- optional node-type groups (empty by default) ---
    call_node_types: tuple[str, ...] = ()
    import_node_types: tuple[str, ...] = ()
    import_from_node_types: tuple[str, ...] = ()

    # --- field names used to look up a node's name and body ---
    name_field: str = "name"
    body_field: str = "body"

    package_indicators: tuple[str, ...] = ()

    # --- optional hand-written query strings ---
    function_query: str | None = None
    class_query: str | None = None
    call_query: str | None = None
73
+
74
+
75
@dataclass
class Dependency:
    """A named dependency with its spec string and extra properties."""

    # Dependency name.
    name: str
    # Spec string for the dependency.
    spec: str
    # Extra string properties; each instance gets its own empty dict.
    properties: dict[str, str] = field(default_factory=dict)
80
+
81
+
82
@dataclass
class MethodModifiersAndAnnotations:
    """Two string lists, one for modifiers and one for annotations.

    Both lists default to a fresh empty list per instance.
    """

    modifiers: list[str] = field(default_factory=list)
    annotations: list[str] = field(default_factory=list)
86
+
87
+
88
@dataclass
class ToolMetadata:
    """Descriptor for a tool: its name, description, schema and handler."""

    # Tool name.
    name: str
    # Human-readable description of the tool.
    description: str
    # Input schema for the tool.
    input_schema: MCPInputSchema
    # Handler object for the tool.
    handler: MCPHandlerType
    # Boolean flag named after JSON results.
    # NOTE(review): how consumers interpret it is not visible here.
    returns_json: bool
@@ -0,0 +1,292 @@
1
+ import importlib
2
+ import subprocess
3
+ import sys
4
+ from copy import deepcopy
5
+ from pathlib import Path
6
+
7
+ from loguru import logger
8
+ from tree_sitter import Language, Parser, Query
9
+
10
+ from . import constants as cs
11
+ from . import exceptions as ex
12
+ from . import logs as ls
13
+ from .language_spec import LANGUAGE_SPECS, LanguageSpec
14
+ from .types_defs import LanguageImport, LanguageLoader, LanguageQueries
15
+
16
+
17
def _try_load_from_submodule(lang_name: cs.SupportedLanguage) -> LanguageLoader:
    """Try to load a grammar loader from a local grammar submodule checkout.

    The submodule is expected under ``cs.GRAMMARS_DIR`` with Python bindings
    in its bindings directory. When a ``setup.py`` is present the extension
    is built in place first, then the bindings module is imported and the
    first matching language attribute is returned.

    Args:
        lang_name: Language whose grammar should be loaded.

    Returns:
        The loader attribute found on the bindings module, or ``None`` when
        the bindings directory is missing, the build fails, no known
        attribute exists, or any exception occurs (all failures are logged
        at debug level; this function is deliberately best-effort).
    """
    submodule_path = Path(cs.GRAMMARS_DIR) / f"{cs.TREE_SITTER_PREFIX}{lang_name}"
    python_bindings_path = (
        submodule_path / cs.BINDINGS_DIR / cs.SupportedLanguage.PYTHON
    )

    if not python_bindings_path.exists():
        return None

    python_bindings_str = str(python_bindings_path)
    # Remember whether *we* add the entry: an entry that was already on
    # sys.path before this call belongs to someone else and must be kept.
    path_added = python_bindings_str not in sys.path
    try:
        if path_added:
            sys.path.insert(0, python_bindings_str)

        try:
            normalized_name = lang_name.replace("-", "_")
            module_name = f"{cs.TREE_SITTER_MODULE_PREFIX}{normalized_name}"

            setup_py_path = submodule_path / cs.SETUP_PY
            if setup_py_path.exists():
                logger.debug(ls.BUILDING_BINDINGS.format(lang=lang_name))
                # List-form argv (no shell); failure is reported via returncode.
                result = subprocess.run(
                    [sys.executable, cs.SETUP_PY, cs.BUILD_EXT_CMD, cs.INPLACE_FLAG],
                    check=False,
                    cwd=str(submodule_path),
                    capture_output=True,
                    text=True,
                )

                if result.returncode != 0:
                    logger.debug(
                        ls.BUILD_FAILED.format(
                            lang=lang_name, stdout=result.stdout, stderr=result.stderr
                        )
                    )
                    return None
                logger.debug(ls.BUILD_SUCCESS.format(lang=lang_name))

            logger.debug(ls.IMPORTING_MODULE.format(module=module_name))
            module = importlib.import_module(module_name)

            # Candidate attribute names, tried in order.
            language_attrs: list[str] = [
                cs.QUERY_LANGUAGE,
                f"{cs.LANG_ATTR_PREFIX}{lang_name}",
                f"{cs.LANG_ATTR_PREFIX}{normalized_name}",
            ]

            for attr_name in language_attrs:
                if hasattr(module, attr_name):
                    logger.debug(
                        ls.LOADED_FROM_SUBMODULE.format(lang=lang_name, attr=attr_name)
                    )
                    loader: LanguageLoader = getattr(module, attr_name)
                    return loader

            logger.debug(
                ls.NO_LANG_ATTR.format(module=module_name, available=dir(module))
            )

        finally:
            # Undo only our own insertion so pre-existing entries survive.
            if path_added and python_bindings_str in sys.path:
                sys.path.remove(python_bindings_str)

    except Exception as e:
        logger.debug(ls.SUBMODULE_LOAD_FAILED.format(lang=lang_name, error=e))

    return None
83
+
84
+
85
def _try_import_language(
    module_path: str, attr_name: str, lang_name: cs.SupportedLanguage
) -> LanguageLoader:
    """Import a grammar loader from an installed module, with a fallback.

    Args:
        module_path: Importable module name that provides the grammar.
        attr_name: Attribute on that module holding the loader.
        lang_name: Language passed to the submodule fallback.

    Returns:
        The attribute ``attr_name`` of the imported module. If the module
        cannot be imported, or it lacks ``attr_name``, the result of
        ``_try_load_from_submodule(lang_name)`` is returned instead (which
        may be ``None``).
    """
    try:
        module = importlib.import_module(module_path)
        loader: LanguageLoader = getattr(module, attr_name)
    except (ImportError, AttributeError):
        # A module without the expected attribute is treated like a missing
        # module: this runs at import time, so raising would break the package.
        return _try_load_from_submodule(lang_name)
    return loader
94
+
95
+
96
def _import_language_loaders() -> dict[cs.SupportedLanguage, LanguageLoader]:
    """Resolve a grammar loader for every known language.

    Languages listed in the table below are first tried through their
    module/attribute pair (``_try_import_language``). Afterwards every key
    of ``LANGUAGE_SPECS`` that still has no loader (missing or ``None``) is
    tried through ``_try_load_from_submodule``.

    Returns:
        Mapping from language to its loader; the value may be ``None`` when
        no loader could be obtained.
    """
    # (language, module path, attribute name) for each directly importable grammar.
    importable = (
        (cs.SupportedLanguage.PYTHON, cs.TreeSitterModule.PYTHON, cs.QUERY_LANGUAGE),
        (cs.SupportedLanguage.JS, cs.TreeSitterModule.JS, cs.QUERY_LANGUAGE),
        (cs.SupportedLanguage.TS, cs.TreeSitterModule.TS, cs.LANG_ATTR_TYPESCRIPT),
        (cs.SupportedLanguage.RUST, cs.TreeSitterModule.RUST, cs.QUERY_LANGUAGE),
        (cs.SupportedLanguage.GO, cs.TreeSitterModule.GO, cs.QUERY_LANGUAGE),
        (cs.SupportedLanguage.SCALA, cs.TreeSitterModule.SCALA, cs.QUERY_LANGUAGE),
        (cs.SupportedLanguage.JAVA, cs.TreeSitterModule.JAVA, cs.QUERY_LANGUAGE),
        (cs.SupportedLanguage.CPP, cs.TreeSitterModule.CPP, cs.QUERY_LANGUAGE),
        (cs.SupportedLanguage.LUA, cs.TreeSitterModule.LUA, cs.QUERY_LANGUAGE),
    )
    # The language doubles as the fourth positional argument (the submodule name).
    language_imports: list[LanguageImport] = [
        LanguageImport(language, module_path, attr_name, language)
        for language, module_path, attr_name in importable
    ]

    loaders: dict[cs.SupportedLanguage, LanguageLoader] = {}
    for entry in language_imports:
        loaders[entry.lang_key] = _try_import_language(
            entry.module_path,
            entry.attr_name,
            entry.submodule_name,
        )

    # Second pass: anything in LANGUAGE_SPECS still without a loader.
    for spec_key in LANGUAGE_SPECS:
        language = cs.SupportedLanguage(spec_key)
        if loaders.get(language) is None:
            loaders[language] = _try_load_from_submodule(language)

    return loaders
168
+
169
+
170
# Loaders are resolved once, when this module is imported. Both names refer
# to the same dictionary object.
LANGUAGE_LIBRARIES: dict[cs.SupportedLanguage, LanguageLoader] = (
    _import_language_loaders()
)

_language_loaders = LANGUAGE_LIBRARIES
173
+
174
+
175
+ def _build_query_pattern(node_types: tuple[str, ...], capture_name: str) -> str:
176
+ return " ".join([f"({node_type}) @{capture_name}" for node_type in node_types])
177
+
178
+
179
def _get_locals_pattern(lang_name: cs.SupportedLanguage) -> str | None:
    """Return the locals query pattern for a language, if one is defined.

    Only JS and TS have a locals pattern; every other language yields
    ``None``.
    """
    if lang_name == cs.SupportedLanguage.JS:
        return cs.JS_LOCALS_PATTERN
    if lang_name == cs.SupportedLanguage.TS:
        return cs.TS_LOCALS_PATTERN
    return None
187
+
188
+
189
def _build_combined_import_pattern(lang_config: LanguageSpec) -> str:
    """Combine the import and import-from patterns into one query string.

    Args:
        lang_config: Spec providing ``import_node_types`` and
            ``import_from_node_types``.

    Returns:
        The non-blank patterns joined by a single space. The import-from
        pattern is left out when it is identical to the import pattern.
        Returns an empty string when both are blank.
    """
    plain = _build_query_pattern(lang_config.import_node_types, cs.CAPTURE_IMPORT)
    from_style = _build_query_pattern(
        lang_config.import_from_node_types, cs.CAPTURE_IMPORT_FROM
    )

    fragments: list[str] = [plain] if plain.strip() else []
    # Skip an exact duplicate so the same pattern is not emitted twice.
    if from_style.strip() and from_style != plain:
        fragments.append(from_style)
    return " ".join(fragments)
203
+
204
+
205
def _create_optional_query(language: Language, pattern: str | None) -> Query | None:
    """Compile ``pattern`` into a ``Query``, or return ``None`` if it is empty.

    Both ``None`` and the empty string count as "no pattern".
    """
    if not pattern:
        return None
    return Query(language, pattern)
207
+
208
+
209
def _create_locals_query(
    language: Language, lang_name: cs.SupportedLanguage
) -> Query | None:
    """Compile the locals query for a language, tolerating failures.

    Returns:
        The compiled ``Query``, or ``None`` when the language has no locals
        pattern or compiling it raises (the error is logged at debug level).
    """
    pattern = _get_locals_pattern(lang_name)
    if pattern:
        try:
            return Query(language, pattern)
        except Exception as exc:
            # Best effort: a failed locals query must not fail the caller.
            logger.debug(ls.LOCALS_QUERY_FAILED.format(lang=lang_name, error=exc))
    return None
220
+
221
+
222
def _create_language_queries(
    language: Language,
    parser: Parser,
    lang_config: LanguageSpec,
    lang_name: cs.SupportedLanguage,
) -> LanguageQueries:
    """Assemble the ``LanguageQueries`` bundle for one language.

    For functions, classes and calls, an explicit query string on
    ``lang_config`` takes precedence; otherwise a pattern is generated from
    the matching node-type tuple. An empty pattern yields ``None`` for that
    query.

    Args:
        language: Grammar to compile queries against.
        parser: Parser stored in the result.
        lang_config: Spec supplying node types and optional query strings.
        lang_name: Language used to pick the locals pattern.
    """
    function_patterns = lang_config.function_query
    if not function_patterns:
        function_patterns = _build_query_pattern(
            lang_config.function_node_types, cs.CAPTURE_FUNCTION
        )

    class_patterns = lang_config.class_query
    if not class_patterns:
        class_patterns = _build_query_pattern(
            lang_config.class_node_types, cs.CAPTURE_CLASS
        )

    call_patterns = lang_config.call_query
    if not call_patterns:
        call_patterns = _build_query_pattern(
            lang_config.call_node_types, cs.CAPTURE_CALL
        )

    import_patterns = _build_combined_import_pattern(lang_config)

    # Compile in a fixed order: functions, classes, calls, imports, locals.
    functions_query = _create_optional_query(language, function_patterns)
    classes_query = _create_optional_query(language, class_patterns)
    calls_query = _create_optional_query(language, call_patterns)
    imports_query = _create_optional_query(language, import_patterns)
    locals_query = _create_locals_query(language, lang_name)

    return LanguageQueries(
        functions=functions_query,
        classes=classes_query,
        calls=calls_query,
        imports=imports_query,
        locals=locals_query,
        config=lang_config,
        language=language,
        parser=parser,
    )
249
+
250
+
251
def _process_language(
    lang_name: cs.SupportedLanguage,
    lang_config: LanguageSpec,
    parsers: dict[cs.SupportedLanguage, Parser],
    queries: dict[cs.SupportedLanguage, LanguageQueries],
) -> bool:
    """Create the parser and queries for one language and register them.

    Args:
        lang_name: Language to set up.
        lang_config: Spec used to build the queries.
        parsers: Output mapping; receives the parser on success.
        queries: Output mapping; receives the query bundle on success.

    Returns:
        ``True`` when both the parser and the queries were created and
        stored. ``False`` when no loader is available for the language or
        when setup raised; in the latter case a warning is logged.
    """
    lang_lib = LANGUAGE_LIBRARIES.get(lang_name)
    if not lang_lib:
        logger.debug(ls.LIB_NOT_AVAILABLE.format(lang=lang_name))
        return False

    try:
        language = Language(lang_lib())
        parser = Parser(language)
        lang_queries = _create_language_queries(
            language, parser, lang_config, lang_name
        )
        # Register only after everything was built, so a failing query build
        # cannot leave a parser behind for a language reported as failed.
        parsers[lang_name] = parser
        queries[lang_name] = lang_queries
        logger.success(ls.GRAMMAR_LOADED.format(lang=lang_name))
        return True
    except Exception as e:
        logger.warning(ls.GRAMMAR_LOAD_FAILED.format(lang=lang_name, error=e))
        return False
274
+
275
+
276
def load_parsers() -> tuple[
    dict[cs.SupportedLanguage, Parser], dict[cs.SupportedLanguage, LanguageQueries]
]:
    """Create parsers and queries for every language that can be loaded.

    Iterates over a deep copy of ``LANGUAGE_SPECS`` and sets up each
    language in turn.

    Returns:
        A ``(parsers, queries)`` pair of dictionaries keyed by language.

    Raises:
        RuntimeError: If not a single language could be set up.
    """
    parsers: dict[cs.SupportedLanguage, Parser] = {}
    queries: dict[cs.SupportedLanguage, LanguageQueries] = {}
    loaded: list[cs.SupportedLanguage] = []

    for spec_key, spec in deepcopy(LANGUAGE_SPECS).items():
        language = cs.SupportedLanguage(spec_key)
        if not _process_language(language, spec, parsers, queries):
            continue
        loaded.append(language)

    if len(loaded) == 0:
        raise RuntimeError(ex.NO_LANGUAGES)

    summary = ", ".join(loaded)
    logger.info(ls.INITIALIZED_PARSERS.format(languages=summary))
    return parsers, queries
@@ -0,0 +1,273 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ from .cypher_queries import (
4
+ CYPHER_EXAMPLE_CLASS_METHODS,
5
+ CYPHER_EXAMPLE_CONTENT_BY_PATH,
6
+ CYPHER_EXAMPLE_DECORATED_FUNCTIONS,
7
+ CYPHER_EXAMPLE_FILES_IN_FOLDER,
8
+ CYPHER_EXAMPLE_FIND_FILE,
9
+ CYPHER_EXAMPLE_KEYWORD_SEARCH,
10
+ CYPHER_EXAMPLE_LIMIT_ONE,
11
+ CYPHER_EXAMPLE_PYTHON_FILES,
12
+ CYPHER_EXAMPLE_README,
13
+ CYPHER_EXAMPLE_TASKS,
14
+ )
15
+ from .schema_builder import GRAPH_SCHEMA_DEFINITION
16
+ from .types_defs import ToolNames
17
+
18
+ if TYPE_CHECKING:
19
+ from pydantic_ai import Tool
20
+
21
+
22
def extract_tool_names(tools: list["Tool"]) -> ToolNames:
    """Build the ``ToolNames`` record used to fill tool names into prompts.

    Each field is looked up by its canonical tool name in a name -> name map
    built from ``tools``, falling back to that same canonical name. Since the
    map's values equal its keys, every field ends up holding the canonical
    name whether or not the tool is present in ``tools``.

    Args:
        tools: Tools whose ``name`` attribute is read.
    """
    registered = {tool.name: tool.name for tool in tools}

    def resolve(canonical: str) -> str:
        # Hit and miss both yield ``canonical`` (values mirror the keys).
        return registered.get(canonical, canonical)

    return ToolNames(
        query_graph=resolve("query_codebase_knowledge_graph"),
        read_file=resolve("read_file_content"),
        analyze_document=resolve("analyze_document"),
        semantic_search=resolve("semantic_code_search"),
        create_file=resolve("create_new_file"),
        edit_file=resolve("replace_code_surgically"),
        shell_command=resolve("execute_shell_command"),
    )
35
+
36
+
37
+ CYPHER_QUERY_RULES = """**2. Critical Cypher Query Rules**
38
+
39
+ - **ALWAYS Return Specific Properties with Aliases**: Do NOT return whole nodes (e.g., `RETURN n`). You MUST return specific properties with clear aliases (e.g., `RETURN n.name AS name`).
40
+ - **Use `STARTS WITH` for Paths**: When matching paths, always use `STARTS WITH` for robustness (e.g., `WHERE n.path STARTS WITH 'workflows/src'`). Do not use `=`.
41
+ - **Use `ENDS WITH` for qualified_name**: The `qualified_name` property contains full paths like `'Project.folder.subfolder.ClassName'`. When users mention a class, function, or method by its short name (e.g., "VatManager"), use `ENDS WITH` to match: `WHERE c.qualified_name ENDS WITH '.VatManager'`. Do NOT use `{name: 'VatManager'}` equality matching.
42
+ - **Use `toLower()` for Searches**: For case-insensitive searching on string properties, use `toLower()`.
43
+ - **Querying Lists**: To check if a list property (like `decorators`) contains an item, use the `ANY` or `IN` clause (e.g., `WHERE 'flow' IN n.decorators`)."""
44
+
45
+
46
def build_graph_schema_and_rules() -> str:
    """Return the shared prompt preamble: role, graph schema and query rules.

    The result consists of an introductory sentence, the section-1 heading
    followed by ``GRAPH_SCHEMA_DEFINITION``, and ``CYPHER_QUERY_RULES``, each
    separated by a blank line and terminated by a newline.
    """
    intro = (
        "You are an expert AI assistant for analyzing codebases using a "
        "**hybrid retrieval system**: a **Memgraph knowledge graph** for "
        "structural queries and a **semantic code search engine** for "
        "intent-based discovery."
    )
    schema_heading = (
        "**1. Graph Schema Definition**\n"
        "The database contains information about a codebase, structured "
        "with the following nodes and relationships."
    )
    return (
        f"{intro}\n\n"
        f"{schema_heading}\n\n"
        f"{GRAPH_SCHEMA_DEFINITION}\n\n"
        f"{CYPHER_QUERY_RULES}\n"
    )


# Built once at import time and reused by the prompts defined below.
GRAPH_SCHEMA_AND_RULES = build_graph_schema_and_rules()
59
+
60
+
61
def build_rag_orchestrator_prompt(tools: list["Tool"]) -> str:
    """Render the system prompt for the RAG orchestrator agent.

    The tool names referenced in the prompt text come from
    ``extract_tool_names(tools)``.

    Args:
        tools: Tools passed through to ``extract_tool_names``.

    Returns:
        The full prompt text with tool names substituted.
    """
    names = extract_tool_names(tools)
    return f"""You are an expert AI assistant for analyzing codebases. Your answers are based **EXCLUSIVELY** on information retrieved using your tools.

**CRITICAL RULES:**
1. **TOOL-ONLY ANSWERS**: You must ONLY use information from the tools provided. Do not use external knowledge.
2. **NATURAL LANGUAGE QUERIES**: When using the `{names.query_graph}` tool, ALWAYS use natural language questions. NEVER write Cypher queries directly - the tool will translate your natural language into the appropriate database query.
3. **HONESTY**: If a tool fails or returns no results, you MUST state that clearly and report any error messages. Do not invent answers.
4. **CHOOSE THE RIGHT TOOL FOR THE FILE TYPE**:
- For source code files (.py, .ts, etc.), use `{names.read_file}`.
- For documents like PDFs, use the `{names.analyze_document}` tool. This is more effective than trying to read them as plain text.

**Your General Approach:**
1. **Analyze Documents**: If the user asks a question about a document (like a PDF), you **MUST** use the `{names.analyze_document}` tool. Provide both the `file_path` and the user's `question` to the tool.
2. **Deep Dive into Code**: When you identify a relevant component (e.g., a folder), you must go beyond documentation.
a. First, check if documentation files like `README.md` exist and read them for context. For configuration, look for files appropriate to the language (e.g., `pyproject.toml` for Python, `package.json` for Node.js).
b. **Then, you MUST dive into the source code.** Explore the `src` directory (or equivalent). Identify and read key files (e.g., `main.py`, `index.ts`, `app.ts`) to understand the implementation details, logic, and functionality.
c. Synthesize all this information—from documentation, configuration, and the code itself—to provide a comprehensive, factual answer. Do not just describe the files; explain what the code *does*.
d. Only ask for clarification if, after a thorough investigation, the user's intent is still unclear.
3. **Choose the Right Search Strategy - SEMANTIC FIRST for Intent**:
a. **WHEN TO USE SEMANTIC SEARCH FIRST**: Always start with `{names.semantic_search}` for ANY of these patterns:
- "main entry point", "startup", "initialization", "bootstrap", "launcher"
- "error handling", "validation", "authentication"
- "where is X done", "how does Y work", "find Z logic"
- Any question about PURPOSE, INTENT, or FUNCTIONALITY

**Entry Point Recognition Patterns**:
- Python: `if __name__ == "__main__"`, `main()` function, CLI scripts, `app.run()`
- JavaScript/TypeScript: `index.js`, `main.ts`, `app.js`, `server.js`, package.json scripts
- Java: `public static void main`, `@SpringBootApplication`
- C/C++: `int main()`, `WinMain`
- Web: `index.html`, routing configurations, startup middleware

b. **WHEN TO USE GRAPH DIRECTLY**: Only use `{names.query_graph}` directly for pure structural queries:
- "What does function X call?" (when you already know X's name)
- "List methods of User class" (when you know the exact class name)
- "Show files in folder Y" (when you know the exact folder path)

c. **HYBRID APPROACH (RECOMMENDED)**: For most queries, use this sequence:
1. Use `{names.semantic_search}` to find relevant code elements by intent/meaning
2. Then use `{names.query_graph}` to explore structural relationships
3. **CRITICAL**: Always read the actual files using `{names.read_file}` to examine source code
4. For entry points specifically: Look for `if __name__ == "__main__"`, `main()` functions, or CLI entry points

d. **Tool Chaining Example**: For "main entry point and what it calls":
1. `{names.semantic_search}` for focused terms like "main entry startup" (not overly broad)
2. `{names.query_graph}` to find specific function relationships
3. `{names.read_file}` for main.py with targeted sections (use offset/limit for large files)
4. Look for the true application entry point (main function, __main__ block, CLI commands)
5. If you find CLI frameworks (typer, click, argparse), read relevant command sections only
6. Summarize execution flow concisely rather than showing all details
4. **Plan Before Writing or Modifying**:
a. Before using `{names.create_file}`, `{names.edit_file}`, or modifying files, you MUST explore the codebase to find the correct location and file structure.
b. For shell commands: If `{names.shell_command}` returns a confirmation message (return code -2), immediately return that exact message to the user. When they respond "yes", call the tool again with `user_confirmed=True`.
5. **Execute Shell Commands**: The `{names.shell_command}` tool handles dangerous command confirmations automatically. If it returns a confirmation prompt, pass it directly to the user.
6. **Complete the Investigation Cycle**: For entry point queries, you MUST:
a. Find candidate functions via semantic search
b. Explore their relationships via graph queries
c. **AUTOMATICALLY read main.py** (or main entry file) - NEVER ask the user for permission
d. Look for the ACTUAL startup code: `if __name__ == "__main__"`, CLI commands, `main()` functions
e. If CLI framework detected (typer, click, argparse), examine command functions
f. Distinguish between helper functions and the real application entry point
g. Show the complete execution flow from the true entry point through initialization
7. **Token Management**: Be efficient with context usage:
a. For semantic search, use focused queries (not overly broad terms)
b. For file reading, read specific sections when possible using offset/limit
c. Summarize large results rather than including full content
d. Prioritize most relevant findings over comprehensive coverage
8. **Synthesize Answer**: Analyze and explain the retrieved content. Cite your sources (file paths or qualified names). Report any errors gracefully.
"""
131
+
132
+
133
+ CYPHER_SYSTEM_PROMPT = f"""
134
+ You are an expert translator that converts natural language questions about code structure into precise Neo4j Cypher queries.
135
+
136
+ {GRAPH_SCHEMA_AND_RULES}
137
+
138
+ **3. Query Optimization Rules**
139
+
140
+ - **LIMIT Results**: ALWAYS add `LIMIT 50` to queries that list items. This prevents overwhelming responses.
141
+ - **Aggregation Queries**: When asked "how many", "count", or "total", return ONLY the count, not all items:
142
+ - CORRECT: `MATCH (c:Class) RETURN count(c) AS total`
143
+ - WRONG: `MATCH (c:Class) RETURN c.name, c.path, count(c) AS total` (returns all items!)
144
+ - **List vs Count**: If asked to "list" or "show", return items with LIMIT. If asked to "count" or "how many", return only the count.
145
+
146
+ **4. Query Patterns & Examples**
147
+ When listing items, return the `name`, `path`, and `qualified_name` with a LIMIT.
148
+
149
+ **Pattern: Counting Items**
150
+ cypher// "How many classes are there?" or "Count all functions"
151
+ MATCH (c:Class) RETURN count(c) AS total
152
+
153
+ **Pattern: Finding Decorated Functions/Methods (e.g., Workflows, Tasks)**
154
+ cypher// "Find all prefect flows" or "what are the workflows?" or "show me the tasks"
155
+ // Use the 'IN' operator to check the 'decorators' list property.
156
+ {CYPHER_EXAMPLE_DECORATED_FUNCTIONS}
157
+
158
+ **Pattern: Finding Content by Path (Robustly)**
159
+ cypher// "what is in the 'workflows/src' directory?" or "list files in workflows"
160
+ // Use `STARTS WITH` for path matching.
161
+ {CYPHER_EXAMPLE_CONTENT_BY_PATH}
162
+
163
+ **Pattern: Keyword & Concept Search (Fallback for general terms)**
164
+ cypher// "find things related to 'database'"
165
+ {CYPHER_EXAMPLE_KEYWORD_SEARCH}
166
+
167
+ **Pattern: Finding a Specific File**
168
+ cypher// "Find the main README.md"
169
+ {CYPHER_EXAMPLE_FIND_FILE}
170
+
171
+ **Pattern: Finding Methods of a Class by Short Name**
172
+ cypher// "What methods does UserService have?" or "Show me methods in UserService" or "List UserService methods"
173
+ // Use `ENDS WITH` to match the class by short name since qualified_name contains full path.
174
+ {CYPHER_EXAMPLE_CLASS_METHODS}
175
+
176
+ **5. Output Format**
177
+ Provide only the Cypher query.
178
+ """
179
+
180
+ # (H) Stricter prompt for less capable open-source/local models (e.g., Ollama)
181
+ LOCAL_CYPHER_SYSTEM_PROMPT = f"""
182
+ You are a Neo4j Cypher query generator. You ONLY respond with a valid Cypher query. Do not add explanations or markdown.
183
+
184
+ {GRAPH_SCHEMA_AND_RULES}
185
+
186
+ **CRITICAL RULES FOR QUERY GENERATION:**
187
+ 1. **NO `UNION`**: Never use the `UNION` clause. Generate a single, simple `MATCH` query.
188
+ 2. **BIND and ALIAS**: You must bind every node you use to a variable (e.g., `MATCH (f:File)`). You must use that variable to access properties and alias every returned property (e.g., `RETURN f.path AS path`).
189
+ 3. **RETURN STRUCTURE**: Your query should aim to return `name`, `path`, and `qualified_name` so the calling system can use the results.
190
+ - For `File` nodes, return `f.path AS path`.
191
+ - For code nodes (`Class`, `Function`, etc.), return `n.qualified_name AS qualified_name`.
192
+ 4. **KEEP IT SIMPLE**: Do not try to be clever. A simple query that returns a few relevant nodes is better than a complex one that fails.
193
+ 5. **CLAUSE ORDER**: You MUST follow the standard Cypher clause order: `MATCH`, `WHERE`, `RETURN`, `LIMIT`.
194
+ 6. **ALWAYS ADD LIMIT**: For queries that list items, ALWAYS add `LIMIT 50` to prevent overwhelming responses.
195
+ 7. **AGGREGATION QUERIES**: When asked "how many" or "count", return ONLY the count:
196
+ - CORRECT: `MATCH (c:Class) RETURN count(c) AS total`
197
+ - WRONG: `MATCH (c:Class) RETURN c.name, count(c) AS total` (returns all items!)
198
+
199
+ **Examples:**
200
+
201
+ * **Natural Language:** "How many classes are there?"
202
+ * **Cypher Query:**
203
+ ```cypher
204
+ MATCH (c:Class) RETURN count(c) AS total
205
+ ```
206
+
207
+ * **Natural Language:** "Find the main README file"
208
+ * **Cypher Query:**
209
+ ```cypher
210
+ {CYPHER_EXAMPLE_README}
211
+ ```
212
+
213
+ * **Natural Language:** "Find all python files"
214
+ * **Cypher Query (Note the '.' in extension):**
215
+ ```cypher
216
+ {CYPHER_EXAMPLE_PYTHON_FILES}
217
+ ```
218
+
219
+ * **Natural Language:** "show me the tasks"
220
+ * **Cypher Query:**
221
+ ```cypher
222
+ {CYPHER_EXAMPLE_TASKS}
223
+ ```
224
+
225
+ * **Natural Language:** "list files in the services folder"
226
+ * **Cypher Query:**
227
+ ```cypher
228
+ {CYPHER_EXAMPLE_FILES_IN_FOLDER}
229
+ ```
230
+
231
+ * **Natural Language:** "Find just one file to test"
232
+ * **Cypher Query:**
233
+ ```cypher
234
+ {CYPHER_EXAMPLE_LIMIT_ONE}
235
+ ```
236
+
237
+ * **Natural Language:** "What methods does UserService have?" or "Show me methods in UserService" or "List UserService methods"
238
+ * **Cypher Query (Use ENDS WITH to match class by short name):**
239
+ ```cypher
240
+ {CYPHER_EXAMPLE_CLASS_METHODS}
241
+ ```
242
+ """
243
+
244
+ OPTIMIZATION_PROMPT = """
245
+ I want you to analyze my {language} codebase and propose specific optimizations based on best practices.
246
+
247
+ Please:
248
+ 1. Use your code retrieval and graph querying tools to understand the codebase structure
249
+ 2. Read relevant source files to identify optimization opportunities
250
+ 3. Reference established patterns and best practices for {language}
251
+ 4. Propose specific, actionable optimizations with file references
252
+ 5. IMPORTANT: Do not make any changes yet - just propose them and wait for approval
253
+ 6. After approval, use your file editing tools to implement the changes
254
+
255
+ Start by analyzing the codebase structure and identifying the main areas that could benefit from optimization.
256
+ Remember: Propose changes first, wait for my approval, then implement.
257
+ """
258
+
259
+ OPTIMIZATION_PROMPT_WITH_REFERENCE = """
260
+ I want you to analyze my {language} codebase and propose specific optimizations based on best practices.
261
+
262
+ Please:
263
+ 1. Use your code retrieval and graph querying tools to understand the codebase structure
264
+ 2. Read relevant source files to identify optimization opportunities
265
+ 3. Use the analyze_document tool to reference best practices from {reference_document}
266
+ 4. Reference established patterns and best practices for {language}
267
+ 5. Propose specific, actionable optimizations with file references
268
+ 6. IMPORTANT: Do not make any changes yet - just propose them and wait for approval
269
+ 7. After approval, use your file editing tools to implement the changes
270
+
271
+ Start by analyzing the codebase structure and identifying the main areas that could benefit from optimization.
272
+ Remember: Propose changes first, wait for my approval, then implement.
273
+ """