shotgun_sh-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of shotgun-sh might be problematic.
- shotgun/__init__.py +5 -0
- shotgun/agents/__init__.py +1 -0
- shotgun/agents/agent_manager.py +651 -0
- shotgun/agents/common.py +549 -0
- shotgun/agents/config/__init__.py +13 -0
- shotgun/agents/config/constants.py +17 -0
- shotgun/agents/config/manager.py +294 -0
- shotgun/agents/config/models.py +185 -0
- shotgun/agents/config/provider.py +206 -0
- shotgun/agents/conversation_history.py +106 -0
- shotgun/agents/conversation_manager.py +105 -0
- shotgun/agents/export.py +96 -0
- shotgun/agents/history/__init__.py +5 -0
- shotgun/agents/history/compaction.py +85 -0
- shotgun/agents/history/constants.py +19 -0
- shotgun/agents/history/context_extraction.py +108 -0
- shotgun/agents/history/history_building.py +104 -0
- shotgun/agents/history/history_processors.py +426 -0
- shotgun/agents/history/message_utils.py +84 -0
- shotgun/agents/history/token_counting.py +429 -0
- shotgun/agents/history/token_estimation.py +138 -0
- shotgun/agents/messages.py +35 -0
- shotgun/agents/models.py +275 -0
- shotgun/agents/plan.py +98 -0
- shotgun/agents/research.py +108 -0
- shotgun/agents/specify.py +98 -0
- shotgun/agents/tasks.py +96 -0
- shotgun/agents/tools/__init__.py +34 -0
- shotgun/agents/tools/codebase/__init__.py +28 -0
- shotgun/agents/tools/codebase/codebase_shell.py +256 -0
- shotgun/agents/tools/codebase/directory_lister.py +141 -0
- shotgun/agents/tools/codebase/file_read.py +144 -0
- shotgun/agents/tools/codebase/models.py +252 -0
- shotgun/agents/tools/codebase/query_graph.py +67 -0
- shotgun/agents/tools/codebase/retrieve_code.py +81 -0
- shotgun/agents/tools/file_management.py +218 -0
- shotgun/agents/tools/user_interaction.py +37 -0
- shotgun/agents/tools/web_search/__init__.py +60 -0
- shotgun/agents/tools/web_search/anthropic.py +144 -0
- shotgun/agents/tools/web_search/gemini.py +85 -0
- shotgun/agents/tools/web_search/openai.py +98 -0
- shotgun/agents/tools/web_search/utils.py +20 -0
- shotgun/build_constants.py +20 -0
- shotgun/cli/__init__.py +1 -0
- shotgun/cli/codebase/__init__.py +5 -0
- shotgun/cli/codebase/commands.py +202 -0
- shotgun/cli/codebase/models.py +21 -0
- shotgun/cli/config.py +275 -0
- shotgun/cli/export.py +81 -0
- shotgun/cli/models.py +10 -0
- shotgun/cli/plan.py +73 -0
- shotgun/cli/research.py +85 -0
- shotgun/cli/specify.py +69 -0
- shotgun/cli/tasks.py +78 -0
- shotgun/cli/update.py +152 -0
- shotgun/cli/utils.py +25 -0
- shotgun/codebase/__init__.py +12 -0
- shotgun/codebase/core/__init__.py +46 -0
- shotgun/codebase/core/change_detector.py +358 -0
- shotgun/codebase/core/code_retrieval.py +243 -0
- shotgun/codebase/core/ingestor.py +1497 -0
- shotgun/codebase/core/language_config.py +297 -0
- shotgun/codebase/core/manager.py +1662 -0
- shotgun/codebase/core/nl_query.py +331 -0
- shotgun/codebase/core/parser_loader.py +128 -0
- shotgun/codebase/models.py +111 -0
- shotgun/codebase/service.py +206 -0
- shotgun/logging_config.py +227 -0
- shotgun/main.py +167 -0
- shotgun/posthog_telemetry.py +158 -0
- shotgun/prompts/__init__.py +5 -0
- shotgun/prompts/agents/__init__.py +1 -0
- shotgun/prompts/agents/export.j2 +350 -0
- shotgun/prompts/agents/partials/codebase_understanding.j2 +87 -0
- shotgun/prompts/agents/partials/common_agent_system_prompt.j2 +37 -0
- shotgun/prompts/agents/partials/content_formatting.j2 +65 -0
- shotgun/prompts/agents/partials/interactive_mode.j2 +26 -0
- shotgun/prompts/agents/plan.j2 +144 -0
- shotgun/prompts/agents/research.j2 +69 -0
- shotgun/prompts/agents/specify.j2 +51 -0
- shotgun/prompts/agents/state/codebase/codebase_graphs_available.j2 +19 -0
- shotgun/prompts/agents/state/system_state.j2 +31 -0
- shotgun/prompts/agents/tasks.j2 +143 -0
- shotgun/prompts/codebase/__init__.py +1 -0
- shotgun/prompts/codebase/cypher_query_patterns.j2 +223 -0
- shotgun/prompts/codebase/cypher_system.j2 +28 -0
- shotgun/prompts/codebase/enhanced_query_context.j2 +10 -0
- shotgun/prompts/codebase/partials/cypher_rules.j2 +24 -0
- shotgun/prompts/codebase/partials/graph_schema.j2 +30 -0
- shotgun/prompts/codebase/partials/temporal_context.j2 +21 -0
- shotgun/prompts/history/__init__.py +1 -0
- shotgun/prompts/history/incremental_summarization.j2 +53 -0
- shotgun/prompts/history/summarization.j2 +46 -0
- shotgun/prompts/loader.py +140 -0
- shotgun/py.typed +0 -0
- shotgun/sdk/__init__.py +13 -0
- shotgun/sdk/codebase.py +219 -0
- shotgun/sdk/exceptions.py +17 -0
- shotgun/sdk/models.py +189 -0
- shotgun/sdk/services.py +23 -0
- shotgun/sentry_telemetry.py +87 -0
- shotgun/telemetry.py +93 -0
- shotgun/tui/__init__.py +0 -0
- shotgun/tui/app.py +116 -0
- shotgun/tui/commands/__init__.py +76 -0
- shotgun/tui/components/prompt_input.py +69 -0
- shotgun/tui/components/spinner.py +86 -0
- shotgun/tui/components/splash.py +25 -0
- shotgun/tui/components/vertical_tail.py +13 -0
- shotgun/tui/screens/chat.py +782 -0
- shotgun/tui/screens/chat.tcss +43 -0
- shotgun/tui/screens/chat_screen/__init__.py +0 -0
- shotgun/tui/screens/chat_screen/command_providers.py +219 -0
- shotgun/tui/screens/chat_screen/hint_message.py +40 -0
- shotgun/tui/screens/chat_screen/history.py +221 -0
- shotgun/tui/screens/directory_setup.py +113 -0
- shotgun/tui/screens/provider_config.py +221 -0
- shotgun/tui/screens/splash.py +31 -0
- shotgun/tui/styles.tcss +10 -0
- shotgun/tui/utils/__init__.py +5 -0
- shotgun/tui/utils/mode_progress.py +257 -0
- shotgun/utils/__init__.py +5 -0
- shotgun/utils/env_utils.py +35 -0
- shotgun/utils/file_system_utils.py +36 -0
- shotgun/utils/update_checker.py +375 -0
- shotgun_sh-0.1.0.dist-info/METADATA +466 -0
- shotgun_sh-0.1.0.dist-info/RECORD +130 -0
- shotgun_sh-0.1.0.dist-info/WHEEL +4 -0
- shotgun_sh-0.1.0.dist-info/entry_points.txt +2 -0
- shotgun_sh-0.1.0.dist-info/licenses/LICENSE +21 -0

shotgun/codebase/core/ingestor.py
@@ -0,0 +1,1497 @@
"""Kuzu graph ingestor for building code knowledge graphs."""

import hashlib
import os
import time
import uuid
from collections import defaultdict
from pathlib import Path
from typing import Any

import kuzu
from tree_sitter import Node, Parser, QueryCursor

from shotgun.codebase.core.language_config import LANGUAGE_CONFIGS, get_language_config
from shotgun.codebase.core.parser_loader import load_parsers
from shotgun.logging_config import get_logger

logger = get_logger(__name__)


# Default ignore patterns
IGNORE_PATTERNS = {
    ".git",
    "venv",
    ".venv",
    "__pycache__",
    "node_modules",
    "build",
    "dist",
    ".eggs",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    ".claude",
    ".idea",
    ".vscode",
}


class Ingestor:
    """Handles all communication and ingestion with the Kuzu database."""

    def __init__(self, connection: kuzu.Connection):
        self.conn = connection
        self.node_buffer: list[tuple[str, dict[str, Any]]] = []
        self.relationship_buffer: list[
            tuple[str, str, Any, str, str, str, Any, dict[str, Any] | None]
        ] = []
        self.batch_size = 1000

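    # Typical lifecycle (mirrored by SimpleGraphBuilder.run below): create the
    # schema once, buffer nodes/relationships via the ensure_*_batch methods,
    # then call flush_all() so every node exists before its relationships.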
    def create_schema(self) -> None:
        """Create the graph schema in Kuzu."""
        logger.info("Creating Kuzu schema...")

        # Node tables
        node_schemas = [
            "CREATE NODE TABLE Project(name STRING PRIMARY KEY, repo_path STRING, graph_id STRING, created_at INT64, updated_at INT64, schema_version STRING, build_options STRING, node_count INT64, relationship_count INT64, stats_updated_at INT64, status STRING, current_operation_id STRING, last_operation STRING, indexed_from_cwds STRING)",
            "CREATE NODE TABLE Package(qualified_name STRING PRIMARY KEY, name STRING, path STRING)",
            "CREATE NODE TABLE Folder(path STRING PRIMARY KEY, name STRING)",
            "CREATE NODE TABLE File(path STRING PRIMARY KEY, name STRING, extension STRING)",
            "CREATE NODE TABLE Module(qualified_name STRING PRIMARY KEY, name STRING, path STRING, created_at INT64, updated_at INT64)",
            "CREATE NODE TABLE Class(qualified_name STRING PRIMARY KEY, name STRING, decorators STRING[], line_start INT64, line_end INT64, created_at INT64, updated_at INT64, docstring STRING)",
            "CREATE NODE TABLE Function(qualified_name STRING PRIMARY KEY, name STRING, decorators STRING[], line_start INT64, line_end INT64, created_at INT64, updated_at INT64, docstring STRING)",
            "CREATE NODE TABLE Method(qualified_name STRING PRIMARY KEY, name STRING, decorators STRING[], line_start INT64, line_end INT64, created_at INT64, updated_at INT64, docstring STRING)",
            "CREATE NODE TABLE ExternalPackage(name STRING PRIMARY KEY, version_spec STRING)",
            "CREATE NODE TABLE FileMetadata(filepath STRING PRIMARY KEY, mtime INT64, hash STRING, last_updated INT64)",
            "CREATE NODE TABLE DeletionLog(id STRING PRIMARY KEY, entity_type STRING, entity_qualified_name STRING, deleted_from_file STRING, deleted_at INT64, deletion_reason STRING)",
        ]

        # Relationship tables - need separate tables for each source/target combination
        rel_schemas = [
            # CONTAINS_PACKAGE relationships
            "CREATE REL TABLE CONTAINS_PACKAGE(FROM Project TO Package)",
            "CREATE REL TABLE CONTAINS_PACKAGE_PKG(FROM Package TO Package)",
            "CREATE REL TABLE CONTAINS_PACKAGE_FOLDER(FROM Folder TO Package)",
            # CONTAINS_FOLDER relationships
            "CREATE REL TABLE CONTAINS_FOLDER(FROM Project TO Folder)",
            "CREATE REL TABLE CONTAINS_FOLDER_PKG(FROM Package TO Folder)",
            "CREATE REL TABLE CONTAINS_FOLDER_FOLDER(FROM Folder TO Folder)",
            # CONTAINS_FILE relationships
            "CREATE REL TABLE CONTAINS_FILE(FROM Project TO File)",
            "CREATE REL TABLE CONTAINS_FILE_PKG(FROM Package TO File)",
            "CREATE REL TABLE CONTAINS_FILE_FOLDER(FROM Folder TO File)",
            # CONTAINS_MODULE relationships
            "CREATE REL TABLE CONTAINS_MODULE(FROM Project TO Module)",
            "CREATE REL TABLE CONTAINS_MODULE_PKG(FROM Package TO Module)",
            "CREATE REL TABLE CONTAINS_MODULE_FOLDER(FROM Folder TO Module)",
            # Other relationships
            "CREATE REL TABLE DEFINES(FROM Module TO Class)",
            "CREATE REL TABLE DEFINES_FUNC(FROM Module TO Function)",
            "CREATE REL TABLE DEFINES_METHOD(FROM Class TO Method)",
            "CREATE REL TABLE INHERITS(FROM Class TO Class)",
            "CREATE REL TABLE OVERRIDES(FROM Method TO Method)",
            "CREATE REL TABLE DEPENDS_ON_EXTERNAL(FROM Project TO ExternalPackage)",
            # CALLS relationships (all combinations)
            "CREATE REL TABLE CALLS(FROM Function TO Function)",
            "CREATE REL TABLE CALLS_FM(FROM Function TO Method)",
            "CREATE REL TABLE CALLS_MF(FROM Method TO Function)",
            "CREATE REL TABLE CALLS_MM(FROM Method TO Method)",
            # IMPORTS
            "CREATE REL TABLE IMPORTS(FROM Module TO Module)",
            # TRACKS relationships (FileMetadata to nodes)
            "CREATE REL TABLE TRACKS_Module(FROM FileMetadata TO Module)",
            "CREATE REL TABLE TRACKS_Class(FROM FileMetadata TO Class)",
            "CREATE REL TABLE TRACKS_Function(FROM FileMetadata TO Function)",
            "CREATE REL TABLE TRACKS_Method(FROM FileMetadata TO Method)",
        ]

        # Create all schemas
        for schema in node_schemas + rel_schemas:
            try:
                self.conn.execute(schema)
                logger.debug(f"Created: {schema.split('(')[0]}")
            except Exception as e:
                if "already exists" not in str(e):
                    logger.error(f"Failed to create schema: {schema}, error: {e}")

        logger.info("Schema creation complete.")

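    # Nodes are de-duplicated in the buffer by primary key and upserted with
    # MERGE at flush time; relationship flushing is deferred to flush_all().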
    def ensure_node_batch(self, label: str, properties: dict[str, Any]) -> None:
        """Add a node to the buffer for batch insertion."""
        # Check for duplicates based on primary key
        primary_key = self._get_primary_key(label, properties)
        if primary_key and self._is_duplicate_node(label, primary_key):
            return

        self.node_buffer.append((label, properties))

        if len(self.node_buffer) >= self.batch_size:
            self.flush_nodes()

    def _get_primary_key(self, label: str, properties: dict[str, Any]) -> str | None:
        """Get the primary key value for a node."""
        primary_key_field = self._get_primary_key_field(label)
        return properties.get(primary_key_field) if primary_key_field else None

    def _get_primary_key_field(self, label: str) -> str | None:
        """Get the primary key field name for a node type."""
        if label == "Project":
            return "name"
        elif label in ["Package", "Module", "Class", "Function", "Method"]:
            return "qualified_name"
        elif label in ["Folder", "File"]:
            return "path"
        elif label == "FileMetadata":
            return "filepath"
        elif label == "ExternalPackage":
            return "name"
        elif label == "DeletionLog":
            return "id"
        return None

    def _is_duplicate_node(self, label: str, primary_key: str) -> bool:
        """Check if a node with the given primary key already exists in the buffer."""
        for buffered_label, buffered_props in self.node_buffer:
            if buffered_label == label:
                buffered_key = self._get_primary_key(buffered_label, buffered_props)
                if buffered_key == primary_key:
                    return True
        return False

    def flush_nodes(self) -> None:
        """Flush pending node insertions to the database."""
        if not self.node_buffer:
            return

        # Group nodes by label
        nodes_by_label: dict[str, list[dict[str, Any]]] = defaultdict(list)
        for label, properties in self.node_buffer:
            nodes_by_label[label].append(properties)

        # Insert each group
        for label, nodes in nodes_by_label.items():
            try:
                # Build CREATE query for batch insertion
                for node_props in nodes:
                    # Create individual nodes
                    prop_names = list(node_props.keys())
                    prop_values = [node_props[k] for k in prop_names]

                    # Build query - use MERGE to handle duplicates
                    primary_key_field = self._get_primary_key_field(label)
                    if primary_key_field and primary_key_field in node_props:
                        # Use MERGE for nodes with primary keys
                        merge_props = f"{primary_key_field}: ${primary_key_field}"
                        set_props = ", ".join(
                            [
                                f"n.{k} = ${k}"
                                for k in prop_names
                                if k != primary_key_field
                            ]
                        )
                        query = f"MERGE (n:{label} {{{merge_props}}}) SET {set_props}"
                    else:
                        # Use CREATE for nodes without primary keys
                        props_str = ", ".join([f"{k}: ${k}" for k in prop_names])
                        query = f"CREATE (n:{label} {{{props_str}}})"

                    # Execute with parameters
                    params = dict(zip(prop_names, prop_values, strict=False))
                    self.conn.execute(query, params)

            except Exception as e:
                logger.error(f"Failed to insert {label} nodes: {e}")

        # Log node counts by type
        node_type_counts: dict[str, int] = {}
        for label, _ in self.node_buffer:
            node_type_counts[label] = node_type_counts.get(label, 0) + 1

        logger.info(f"Flushed {len(self.node_buffer)} nodes:")
        for label, count in sorted(node_type_counts.items()):
            logger.info(f" {label}: {count}")

        self.node_buffer.clear()

    def ensure_relationship_batch(
        self,
        from_label: str,
        from_key: str,
        from_value: Any,
        rel_type: str,
        to_label: str,
        to_key: str,
        to_value: Any,
        properties: dict[str, Any] | None = None,
    ) -> None:
        """Add a relationship to the buffer for batch insertion."""
        self.relationship_buffer.append(
            (
                from_label,
                from_key,
                from_value,
                rel_type,
                to_label,
                to_key,
                to_value,
                properties,
            )
        )

        # Don't auto-flush relationships - wait for explicit flush_all() to ensure nodes exist first

    def flush_relationships(self) -> None:
        """Flush pending relationship insertions to the database."""
        if not self.relationship_buffer:
            return

        # Group relationships by type
        rels_by_type: dict[
            str, list[tuple[str, str, Any, str, str, str, Any, dict[str, Any] | None]]
        ] = defaultdict(list)

        for rel_data in self.relationship_buffer:
            (
                from_label,
                from_key,
                from_value,
                rel_type,
                to_label,
                to_key,
                to_value,
                properties,
            ) = rel_data

            # Determine actual table name
            table_name = self._get_relationship_table_name(
                rel_type, from_label, to_label
            )
            if table_name:
                rels_by_type[table_name].append(rel_data)

        # Insert each group
        relationship_counts = {}
        for table_name, relationships in rels_by_type.items():
            success_count = 0
            try:
                for rel_data in relationships:
                    (
                        from_label,
                        from_key,
                        from_value,
                        _,
                        to_label,
                        to_key,
                        to_value,
                        properties,
                    ) = rel_data

                    # Build MATCH and MERGE query (use MERGE to avoid duplicate relationships)
                    query = f"""
                        MATCH (a:{from_label} {{{from_key}: $from_val}}),
                              (b:{to_label} {{{to_key}: $to_val}})
                        MERGE (a)-[:{table_name}]->(b)
                    """

                    params = {"from_val": from_value, "to_val": to_value}
                    try:
                        self.conn.execute(query, params)
                        success_count += 1
                    except Exception as e:
                        logger.error(
                            f"Failed to create single relationship {table_name}: {from_label}({from_value}) -> {to_label}({to_value})"
                        )
                        logger.error(f"Error: {e}")
                        raise

                relationship_counts[table_name] = success_count
                if table_name == "DEFINES_METHOD":
                    logger.info(
                        f"Successfully created {success_count} DEFINES_METHOD relationships"
                    )

            except Exception as e:
                logger.error(f"Failed to insert {table_name} relationships: {e}")
                logger.error(
                    f"Failed on relationship #{success_count + 1} of {len(relationships)}"
                )
                logger.error(f"Query was: {query}")
                logger.error(f"Params were: {params}")
                # Don't swallow the exception - let it propagate
                raise

        # Log summary of flushed relationships
        logger.info(
            f"Flushed {len(self.relationship_buffer)} relationships: {relationship_counts}"
        )
        self.relationship_buffer.clear()

    def _get_relationship_table_name(
        self, rel_type: str, from_label: str, to_label: str
    ) -> str | None:
        """Determine the actual relationship table name based on source and target."""
        # Mapping of relationship types and from_labels to table names
        table_mapping = {
            "CONTAINS_PACKAGE": {
                "Project": "CONTAINS_PACKAGE",
                "Package": "CONTAINS_PACKAGE_PKG",
                "Folder": "CONTAINS_PACKAGE_FOLDER",
            },
            "CONTAINS_FOLDER": {
                "Project": "CONTAINS_FOLDER",
                "Package": "CONTAINS_FOLDER_PKG",
                "Folder": "CONTAINS_FOLDER_FOLDER",
            },
            "CONTAINS_FILE": {
                "Project": "CONTAINS_FILE",
                "Package": "CONTAINS_FILE_PKG",
                "Folder": "CONTAINS_FILE_FOLDER",
            },
            "CONTAINS_MODULE": {
                "Project": "CONTAINS_MODULE",
                "Package": "CONTAINS_MODULE_PKG",
                "Folder": "CONTAINS_MODULE_FOLDER",
            },
        }

        if rel_type in table_mapping:
            return table_mapping[rel_type].get(from_label)
        elif rel_type == "DEFINES":
            if to_label == "Function":
                return "DEFINES_FUNC"
            else:
                return "DEFINES"
        elif rel_type == "CALLS":
            if from_label == "Function" and to_label == "Function":
                return "CALLS"
            elif from_label == "Function" and to_label == "Method":
                return "CALLS_FM"
            elif from_label == "Method" and to_label == "Function":
                return "CALLS_MF"
            elif from_label == "Method" and to_label == "Method":
                return "CALLS_MM"
        elif rel_type.startswith("TRACKS_"):
            # TRACKS relationships already have the correct table name
            return rel_type
        else:
            # Default to the relationship type
            return rel_type
        return None

    def flush_all(self) -> None:
        """Flush all pending operations."""
        logger.info(
            f"Starting flush_all: {len(self.node_buffer)} nodes, {len(self.relationship_buffer)} relationships buffered"
        )

        # IMPORTANT: Flush nodes first to ensure they exist before creating relationships
        self.flush_nodes()

        # Now flush relationships - all nodes should exist
        self.flush_relationships()

        logger.info("flush_all completed successfully")

    def ensure_file_metadata(
        self, filepath: str, mtime: int, hash_value: str, last_updated: int
    ) -> None:
        """Create or update FileMetadata node."""
        self.ensure_node_batch(
            "FileMetadata",
            {
                "filepath": filepath,
                "mtime": mtime,
                "hash": hash_value,
                "last_updated": last_updated,
            },
        )

    def log_deletion(
        self,
        entity_type: str,
        entity_qn: str,
        filepath: str,
        reason: str = "file_modified",
    ) -> None:
        """Log a deletion to the DeletionLog table."""
        deletion_id = str(uuid.uuid4())
        current_time = int(time.time())

        try:
            self.conn.execute(
                """
                CREATE (d:DeletionLog {
                    id: $id,
                    entity_type: $type,
                    entity_qualified_name: $qn,
                    deleted_from_file: $file,
                    deleted_at: $time,
                    deletion_reason: $reason
                })
                """,
                {
                    "id": deletion_id,
                    "type": entity_type,
                    "qn": entity_qn,
                    "file": filepath,
                    "time": current_time,
                    "reason": reason,
                },
            )
        except Exception as e:
            logger.error(f"Failed to log deletion of {entity_qn}: {e}")

    def ensure_tracks_relationship(
        self, filepath: str, node_type: str, node_qn: str
    ) -> None:
        """Create TRACKS relationship between FileMetadata and a node."""
        rel_type = f"TRACKS_{node_type}"
        self.ensure_relationship_batch(
            "FileMetadata",
            "filepath",
            filepath,
            rel_type,
            node_type,
            "qualified_name",
            node_qn,
        )

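    # Incremental-update support: FileMetadata stores mtime/hash per source
    # file, the TRACKS_* edges tie it to the entities parsed from that file,
    # and DeletionLog records whatever delete_file_nodes() removes.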
    def delete_file_nodes(self, filepath: str) -> dict[str, int]:
        """Delete all nodes tracked by a FileMetadata.

        Args:
            filepath: Relative file path

        Returns:
            Statistics of deleted entities
        """
        stats = {"modules": 0, "classes": 0, "functions": 0, "methods": 0}

        # Delete each type of node tracked by this file
        for node_type, rel_type, stat_key in [
            ("Module", "TRACKS_Module", "modules"),
            ("Class", "TRACKS_Class", "classes"),
            ("Function", "TRACKS_Function", "functions"),
            ("Method", "TRACKS_Method", "methods"),
        ]:
            try:
                # First get the nodes to delete (for logging)
                result = self.conn.execute(
                    f"""
                    MATCH (f:FileMetadata {{filepath: $path}})-[:{rel_type}]->(n:{node_type})
                    RETURN n.qualified_name
                    """,
                    {"path": filepath},
                )

                nodes_to_delete = []
                if hasattr(result, "has_next") and not isinstance(result, list):
                    while result.has_next():
                        row = result.get_next()
                        if isinstance(row, list | tuple) and len(row) > 0:
                            nodes_to_delete.append(row[0])

                # Log deletions
                for node_qn in nodes_to_delete:
                    self.log_deletion(node_type, node_qn, filepath, "file_modified")

                # Delete the nodes and their relationships
                self.conn.execute(
                    f"""
                    MATCH (f:FileMetadata {{filepath: $path}})-[:{rel_type}]->(n:{node_type})
                    DETACH DELETE n
                    """,
                    {"path": filepath},
                )

                stats[stat_key] = len(nodes_to_delete)

            except Exception as e:
                logger.error(f"Failed to delete {node_type} nodes for {filepath}: {e}")

        # Delete the FileMetadata node itself
        try:
            self.conn.execute(
                """
                MATCH (f:FileMetadata {filepath: $path})
                DETACH DELETE f
                """,
                {"path": filepath},
            )
        except Exception as e:
            logger.error(f"Failed to delete FileMetadata for {filepath}: {e}")

        return stats


class SimpleGraphBuilder:
    """Simplified version of GraphUpdater for building the code graph."""

    def __init__(
        self,
        ingestor: Ingestor,
        repo_path: Path,
        parsers: dict[str, Parser],
        queries: dict[str, Any],
        exclude_patterns: list[str] | None = None,
    ):
        self.ingestor = ingestor
        self.repo_path = repo_path
        self.parsers = parsers
        self.queries = queries
        self.project_name = repo_path.name
        self.ignore_dirs = IGNORE_PATTERNS
        if exclude_patterns:
            self.ignore_dirs = self.ignore_dirs.union(set(exclude_patterns))

        # Caches
        self.structural_elements: dict[Path, str | None] = {}
        self.ast_cache: dict[Path, tuple[Node, str]] = {}
        self.function_registry: dict[str, str] = {}  # qualified_name -> type
        self.simple_name_lookup: dict[str, set[str]] = defaultdict(set)
        self.class_inheritance: dict[str, list[str]] = {}  # class_qn -> [parent_qns]

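    # The passes below mirror run(): structure (packages/folders), then
    # definitions (files, modules, classes, functions, methods), then
    # relationships (inheritance and calls) resolved from the cached ASTs.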
    def run(self) -> None:
        """Run the three-pass graph building process."""
        logger.info(f"Building graph for project: {self.project_name}")

        # Pass 1: Structure
        logger.info("Pass 1: Identifying packages and folders...")
        self._identify_structure()

        # Pass 2: Definitions
        logger.info("Pass 2: Processing files and extracting definitions...")
        self._process_files()

        # Pass 3: Relationships
        logger.info("Pass 3: Processing relationships (calls, imports)...")
        self._process_relationships()

        # Flush all pending operations
        logger.info("Flushing all data to database...")
        self.ingestor.flush_all()
        logger.info("Graph building complete!")

    def _identify_structure(self) -> None:
        """First pass: Walk directory to find packages and folders."""
        for root_str, dirs, _ in os.walk(self.repo_path, topdown=True):
            dirs[:] = [d for d in dirs if d not in self.ignore_dirs]
            root = Path(root_str)
            relative_root = root.relative_to(self.repo_path)

            # Skip root directory
            if root == self.repo_path:
                continue

            parent_rel_path = relative_root.parent
            parent_container_qn = self.structural_elements.get(parent_rel_path)

            # Check if this is a package
            is_package = False
            package_indicators = set()

            # Collect package indicators from all languages
            for lang_name, lang_config in LANGUAGE_CONFIGS.items():
                if lang_name in self.queries:
                    package_indicators.update(lang_config.package_indicators)

            # Check for package indicators
            for indicator in package_indicators:
                if (root / indicator).exists():
                    is_package = True
                    break

            if is_package:
                # Create package
                package_qn = ".".join([self.project_name] + list(relative_root.parts))
                self.ingestor.ensure_node_batch(
                    "Package",
                    {
                        "qualified_name": package_qn,
                        "name": relative_root.name,
                        "path": str(relative_root).replace(os.sep, "/"),
                    },
                )

                # Create containment relationship
                if parent_container_qn:
                    # Parent is a package
                    self.ingestor.ensure_relationship_batch(
                        "Package",
                        "qualified_name",
                        parent_container_qn,
                        "CONTAINS_PACKAGE",
                        "Package",
                        "qualified_name",
                        package_qn,
                    )
                else:
                    # Parent is project root
                    self.ingestor.ensure_relationship_batch(
                        "Project",
                        "name",
                        self.project_name,
                        "CONTAINS_PACKAGE",
                        "Package",
                        "qualified_name",
                        package_qn,
                    )

                self.structural_elements[relative_root] = package_qn
            else:
                # Create folder
                self.ingestor.ensure_node_batch(
                    "Folder",
                    {
                        "path": str(relative_root).replace(os.sep, "/"),
                        "name": relative_root.name,
                    },
                )

                # Create containment relationship
                if parent_container_qn:
                    # Parent is a package
                    self.ingestor.ensure_relationship_batch(
                        "Package",
                        "qualified_name",
                        parent_container_qn,
                        "CONTAINS_FOLDER",
                        "Folder",
                        "path",
                        str(relative_root).replace(os.sep, "/"),
                    )
                elif parent_rel_path == Path("."):
                    # Parent is project root
                    self.ingestor.ensure_relationship_batch(
                        "Project",
                        "name",
                        self.project_name,
                        "CONTAINS_FOLDER",
                        "Folder",
                        "path",
                        str(relative_root).replace(os.sep, "/"),
                    )
                else:
                    # Parent is another folder
                    self.ingestor.ensure_relationship_batch(
                        "Folder",
                        "path",
                        str(parent_rel_path).replace(os.sep, "/"),
                        "CONTAINS_FOLDER",
                        "Folder",
                        "path",
                        str(relative_root).replace(os.sep, "/"),
                    )

                self.structural_elements[relative_root] = None

    def _process_files(self) -> None:
        """Second pass: Process files and extract definitions."""
        file_count = 0
        for root_str, _, files in os.walk(self.repo_path):
            root = Path(root_str)

            # Skip ignored directories
            if any(part in self.ignore_dirs for part in root.parts):
                continue

            for filename in files:
                filepath = root / filename

                # Check if this is a supported file
                ext = filepath.suffix
                lang_config = get_language_config(ext)

                if lang_config and lang_config.name in self.parsers:
                    self._process_single_file(filepath, lang_config.name)
                    file_count += 1

                    if file_count % 100 == 0:
                        logger.info(f" Processed {file_count} files...")

        logger.info(f" Total files processed: {file_count}")

    def _process_single_file(self, filepath: Path, language: str) -> None:
        """Process a single file."""
        relative_path = filepath.relative_to(self.repo_path)
        relative_path_str = str(relative_path).replace(os.sep, "/")

        # Create File node
        self.ingestor.ensure_node_batch(
            "File",
            {
                "path": relative_path_str,
                "name": filepath.name,
                "extension": filepath.suffix,
            },
        )

        # Create containment relationship
        parent_rel_path = relative_path.parent
        if parent_rel_path == Path("."):
            # File in project root
            self.ingestor.ensure_relationship_batch(
                "Project",
                "name",
                self.project_name,
                "CONTAINS_FILE",
                "File",
                "path",
                relative_path_str,
            )
        else:
            self.ingestor.ensure_relationship_batch(
                "Folder",
                "path",
                str(parent_rel_path).replace(os.sep, "/"),
                "CONTAINS_FILE",
                "File",
                "path",
                relative_path_str,
            )

        # Parse file
        try:
            with open(filepath, "rb") as f:
                content = f.read()

            parser = self.parsers[language]
            tree = parser.parse(content)
            root_node = tree.root_node

            # Cache AST for later
            self.ast_cache[filepath] = (root_node, language)

            # Create module
            if filepath.name == "__init__.py":
                module_qn = ".".join(
                    [self.project_name] + list(relative_path.parent.parts)
                )
            else:
                module_qn = ".".join(
                    [self.project_name] + list(relative_path.with_suffix("").parts)
                )

            current_time = int(time.time())
            self.ingestor.ensure_node_batch(
                "Module",
                {
                    "qualified_name": module_qn,
                    "name": filepath.stem,
                    "path": relative_path_str,
                    "created_at": current_time,
                    "updated_at": current_time,
                },
            )

            # Create module containment
            parent_container = self.structural_elements.get(parent_rel_path)
            if parent_container:
                # Parent is a package
                self.ingestor.ensure_relationship_batch(
                    "Package",
                    "qualified_name",
                    parent_container,
                    "CONTAINS_MODULE",
                    "Module",
                    "qualified_name",
                    module_qn,
                )
            elif parent_rel_path == Path("."):
                # Parent is project root
                self.ingestor.ensure_relationship_batch(
                    "Project",
                    "name",
                    self.project_name,
                    "CONTAINS_MODULE",
                    "Module",
                    "qualified_name",
                    module_qn,
                )
            else:
                # Parent is a folder
                self.ingestor.ensure_relationship_batch(
                    "Folder",
                    "path",
                    str(parent_rel_path).replace(os.sep, "/"),
                    "CONTAINS_MODULE",
                    "Module",
                    "qualified_name",
                    module_qn,
                )

            # Create file metadata
            mtime = int(filepath.stat().st_mtime)
            hash_value = hashlib.sha256(content).hexdigest()
            self.ingestor.ensure_file_metadata(
                relative_path_str, mtime, hash_value, current_time
            )

            # Track module
            self.ingestor.ensure_tracks_relationship(
                relative_path_str, "Module", module_qn
            )

            # Extract definitions
            self._extract_definitions(filepath, root_node, module_qn, language)

        except Exception as e:
            logger.error(f"Failed to process {filepath}: {e}")

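    # tree-sitter note: QueryCursor.matches() yields (pattern_index, captures)
    # tuples, which is why the loops below read the capture dict from match[1].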
    def _extract_definitions(
        self, filepath: Path, root_node: Node, module_qn: str, language: str
    ) -> None:
        """Extract function and class definitions from AST."""
        lang_queries = self.queries.get(language, {})
        relative_path_str = str(filepath.relative_to(self.repo_path)).replace(
            os.sep, "/"
        )

        # Extract classes
        if "class_query" in lang_queries:
            cursor = QueryCursor(lang_queries["class_query"])
            for match in cursor.matches(root_node):
                class_node = None
                class_name = None

                captures = match[1]  # Get captures dictionary from tuple
                for capture_name, nodes in captures.items():
                    for node in nodes:
                        if capture_name in ["class", "interface", "type_alias"]:
                            class_node = node
                        elif capture_name == "class_name" and node.text:
                            class_name = node.text.decode("utf-8")

                if class_node and class_name:
                    class_qn = f"{module_qn}.{class_name}"

                    # Extract decorators
                    decorators = self._extract_decorators(class_node, language)

                    # Extract docstring
                    docstring = self._extract_docstring(class_node, language)

                    current_time = int(time.time())
                    self.ingestor.ensure_node_batch(
                        "Class",
                        {
                            "qualified_name": class_qn,
                            "name": class_name,
                            "decorators": decorators,
                            "line_start": class_node.start_point.row + 1,
                            "line_end": class_node.end_point.row + 1,
                            "created_at": current_time,
                            "updated_at": current_time,
                            "docstring": docstring,
                        },
                    )

                    # Create DEFINES relationship
                    logger.debug(
                        f"Creating DEFINES relationship: Module({module_qn}) -> Class({class_qn})"
                    )
                    self.ingestor.ensure_relationship_batch(
                        "Module",
                        "qualified_name",
                        module_qn,
                        "DEFINES",
                        "Class",
                        "qualified_name",
                        class_qn,
                    )

                    # Track class
                    self.ingestor.ensure_tracks_relationship(
                        relative_path_str, "Class", class_qn
                    )

                    # Register for lookup
                    self.function_registry[class_qn] = "Class"
                    self.simple_name_lookup[class_name].add(class_qn)

                    # Extract inheritance
                    parent_names = self._extract_inheritance(class_node, language)
                    if parent_names:
                        self.class_inheritance[class_qn] = parent_names

        # Extract functions
        if "function_query" in lang_queries:
            cursor = QueryCursor(lang_queries["function_query"])
            matches = list(cursor.matches(root_node))
            logger.debug(f"Found {len(matches)} function matches in {filepath}")
            for match in matches:
                func_node = None
                func_name = None

                captures = match[1]  # Get captures dictionary from tuple
                for capture_name, nodes in captures.items():
                    for node in nodes:
                        if capture_name == "function":
                            func_node = node
                        elif capture_name == "function_name" and node.text:
                            func_name = node.text.decode("utf-8")

                if func_node and func_name:
                    # Log what we found
                    logger.debug(
                        f"Found function: {func_name} at line {func_node.start_point.row + 1}"
                    )

                    # Check if this is a method inside a class
                    parent_class = self._find_parent_class(func_node, module_qn)

                    if parent_class:
                        # This is a method
                        method_qn = f"{parent_class}.{func_name}"
                        decorators = self._extract_decorators(func_node, language)

                        # Extract docstring
                        docstring = self._extract_docstring(func_node, language)

                        current_time = int(time.time())
                        self.ingestor.ensure_node_batch(
                            "Method",
                            {
                                "qualified_name": method_qn,
                                "name": func_name,
                                "decorators": decorators,
                                "line_start": func_node.start_point.row + 1,
                                "line_end": func_node.end_point.row + 1,
                                "created_at": current_time,
                                "updated_at": current_time,
                                "docstring": docstring,
                            },
                        )

                        # Create DEFINES_METHOD relationship
                        self.ingestor.ensure_relationship_batch(
                            "Class",
                            "qualified_name",
                            parent_class,
                            "DEFINES_METHOD",
                            "Method",
                            "qualified_name",
                            method_qn,
                        )

                        # Track method
                        self.ingestor.ensure_tracks_relationship(
                            relative_path_str, "Method", method_qn
                        )

                        # Register for lookup
                        self.function_registry[method_qn] = "Method"
                        self.simple_name_lookup[func_name].add(method_qn)
                    else:
                        # This is a standalone function
                        func_qn = f"{module_qn}.{func_name}"
                        decorators = self._extract_decorators(func_node, language)

                        # Extract docstring
                        docstring = self._extract_docstring(func_node, language)

                        current_time = int(time.time())
                        self.ingestor.ensure_node_batch(
                            "Function",
                            {
                                "qualified_name": func_qn,
                                "name": func_name,
                                "decorators": decorators,
                                "line_start": func_node.start_point.row + 1,
                                "line_end": func_node.end_point.row + 1,
                                "created_at": current_time,
                                "updated_at": current_time,
                                "docstring": docstring,
                            },
                        )

                        # Create DEFINES relationship
                        self.ingestor.ensure_relationship_batch(
                            "Module",
                            "qualified_name",
                            module_qn,
                            "DEFINES_FUNC",
                            "Function",
                            "qualified_name",
                            func_qn,
                        )

                        # Track function
                        self.ingestor.ensure_tracks_relationship(
                            relative_path_str, "Function", func_qn
                        )

                        # Register for lookup
                        self.function_registry[func_qn] = "Function"
                        self.simple_name_lookup[func_name].add(func_qn)

    def _extract_decorators(self, node: Node, language: str) -> list[str]:
        """Extract decorators from a function/class node."""
        decorators = []

        if language == "python":
            # Look for decorator nodes
            for child in node.children:
                if child.type == "decorator":
                    # Extract decorator name
                    for grandchild in child.children:
                        if grandchild.type == "identifier" and grandchild.text:
                            decorators.append(grandchild.text.decode("utf-8"))
                            break
                        elif grandchild.type == "attribute":
                            # Handle @module.decorator
                            attr_node = grandchild.child_by_field_name("attribute")
                            if attr_node and attr_node.text:
                                decorators.append(attr_node.text.decode("utf-8"))
                            break

        return decorators

    def _extract_docstring(self, node: Node, language: str) -> str | None:
        """Extract docstring from function/class node."""
        if language == "python":
            # Get the body node
            body_node = node.child_by_field_name("body")
            if not body_node or not body_node.children:
                return None

            # Check if first statement is a string (docstring)
            first_statement = body_node.children[0]
            if first_statement.type == "expression_statement":
                # Check if it contains a string
                for child in first_statement.children:
                    if child.type == "string" and child.text:
                        # Extract and clean the docstring
                        docstring = child.text.decode("utf-8")
                        # Remove quotes (handle various quote styles)
                        docstring = docstring.strip()
                        if (
                            docstring.startswith('"""')
                            and docstring.endswith('"""')
                            or docstring.startswith("'''")
                            and docstring.endswith("'''")
                        ):
                            docstring = docstring[3:-3]
                        elif (
                            docstring.startswith('"')
                            and docstring.endswith('"')
                            or docstring.startswith("'")
                            and docstring.endswith("'")
                        ):
                            docstring = docstring[1:-1]
                        return docstring.strip()
        # Add support for other languages later
        return None

    def _extract_inheritance(self, class_node: Node, language: str) -> list[str]:
        """Extract parent class names from class definition."""
        parent_names = []

        if language == "python":
            # Look for argument_list in class definition
            for child in class_node.children:
                if child.type == "argument_list":
                    # Each argument is a parent class
                    for arg in child.children:
                        if arg.type == "identifier" and arg.text:
                            parent_names.append(arg.text.decode("utf-8"))
                        elif arg.type == "attribute":
                            # Handle module.Class inheritance
                            full_name_parts: list[str] = []
                            self._extract_full_name(arg, full_name_parts)
                            if full_name_parts:
                                parent_names.append(".".join(full_name_parts))

        return parent_names

    def _extract_full_name(self, node: Node, parts: list[str]) -> None:
        """Recursively extract full qualified name from attribute access."""
        if node.type == "identifier" and node.text:
            parts.insert(0, node.text.decode("utf-8"))
        elif node.type == "attribute":
            # Get attribute name
            attr_node = node.child_by_field_name("attribute")
            if attr_node and attr_node.text:
                parts.insert(0, attr_node.text.decode("utf-8"))

            # Get object name
            obj_node = node.child_by_field_name("object")
            if obj_node:
                self._extract_full_name(obj_node, parts)

    def _find_parent_class(self, func_node: Node, module_qn: str) -> str | None:
        """Find the parent class of a function node."""
        # Walk up the tree to find containing class
        current = func_node.parent

        while current:
            if current.type in ["class_definition", "class_declaration"]:
                # Extract class name
                for child in current.children:
                    if child.type == "identifier" and child.text:
                        class_name = child.text.decode("utf-8")
                        return f"{module_qn}.{class_name}"

            current = current.parent

        return None

    def _process_relationships(self) -> None:
        """Third pass: Process function calls and imports."""
        # Process inheritance relationships first
        self._process_inheritance()

        # Then process function calls
        logger.info(f"Processing function calls for {len(self.ast_cache)} files...")
        logger.info(f"Function registry has {len(self.function_registry)} entries")
        logger.info(
            f"Simple name lookup has {len(self.simple_name_lookup)} unique names"
        )

        # Log some examples from simple_name_lookup
        if self.simple_name_lookup:
            example_names = list(self.simple_name_lookup.keys())[:5]
            for name in example_names:
                logger.debug(
                    f" Example: '{name}' -> {list(self.simple_name_lookup[name])[:3]}"
                )

        for filepath, (root_node, language) in self.ast_cache.items():
            self._process_calls(filepath, root_node, language)
        # TODO: Add import processing; IMPORTS relationships are not yet created here.

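    # Parent resolution below is two-step: exact qualified-name hits in
    # function_registry first, then a simple-name fallback that is only
    # accepted when it is unambiguous (exactly one candidate).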
    def _process_inheritance(self) -> None:
        """Process inheritance relationships between classes."""
        logger.info("Processing inheritance relationships...")

        for child_qn, parent_qns in self.class_inheritance.items():
            for parent_qn in parent_qns:
                # Check if parent exists in our registry
                if parent_qn in self.function_registry:
                    # Create INHERITS relationship
                    self.ingestor.ensure_relationship_batch(
                        "Class",
                        "qualified_name",
                        child_qn,
                        "INHERITS",
                        "Class",
                        "qualified_name",
                        parent_qn,
                    )
                    logger.debug(
                        f" Created inheritance: {child_qn} INHERITS {parent_qn}"
                    )
                else:
                    # Try to find parent by simple name lookup
                    parent_simple_name = parent_qn.split(".")[-1]
                    possible_parents = self.simple_name_lookup.get(
                        parent_simple_name, set()
                    )

                    # If we find exactly one match, use it
                    if len(possible_parents) == 1:
                        actual_parent_qn = list(possible_parents)[0]
                        self.ingestor.ensure_relationship_batch(
                            "Class",
                            "qualified_name",
                            child_qn,
                            "INHERITS",
                            "Class",
                            "qualified_name",
                            actual_parent_qn,
                        )
                        logger.debug(
                            f" Created inheritance: {child_qn} INHERITS {actual_parent_qn}"
                        )
                    else:
                        logger.debug(
                            f" Could not resolve parent class: {parent_qn} for {child_qn}"
                        )

    def _process_calls(self, filepath: Path, root_node: Node, language: str) -> None:
        """Process function calls in a file."""
        lang_queries = self.queries.get(language, {})

        if "call_query" not in lang_queries:
            return

        # Get the module qualified name
        relative_path = filepath.relative_to(self.repo_path)
        if filepath.name == "__init__.py":
            module_qn = ".".join([self.project_name] + list(relative_path.parent.parts))
        else:
            module_qn = ".".join(
                [self.project_name] + list(relative_path.with_suffix("").parts)
            )

        # Find all call expressions
        cursor = QueryCursor(lang_queries["call_query"])
        matches = list(cursor.matches(root_node))
        logger.debug(f"Found {len(matches)} call matches in {filepath}")
        for match in matches:
            call_node = None

            captures = match[1]  # Get captures dictionary from tuple
            for capture_name, nodes in captures.items():
                for node in nodes:
                    if capture_name == "call":
                        call_node = node
                        break

            if call_node:
                self._process_single_call(call_node, module_qn, language)

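    # Call resolution is heuristic: every known definition sharing the callee's
    # simple name is scored by _calculate_callee_confidence() and the highest
    # scorer wins; near-ties are only logged, not recorded as extra edges.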
+    def _process_single_call(
+        self, call_node: Node, module_qn: str, language: str
+    ) -> None:
+        """Process a single function call with smart resolution."""
+        # Extract called function name and context (simplified)
+        callee_name = None
+        object_name = None  # For method calls like obj.method()
+
+        if language in ["python", "javascript", "typescript"]:
+            # Look for function/method name
+            for child in call_node.children:
+                if child.type == "identifier" and child.text:
+                    callee_name = child.text.decode("utf-8")
+                    break
+                elif child.type == "attribute":
+                    # Handle method calls like obj.method()
+                    obj_node = child.child_by_field_name("object")
+                    attr_node = child.child_by_field_name("attribute")
+                    if obj_node and obj_node.text:
+                        object_name = obj_node.text.decode("utf-8")
+                    if attr_node and attr_node.text:
+                        callee_name = attr_node.text.decode("utf-8")
+                    break
+
+        if not callee_name:
+            logger.debug(
+                f" Could not extract callee name from call at line {call_node.start_point[0]}"
+            )
+            return
+
+        logger.debug(f" Processing call to {callee_name} (object: {object_name})")
+
+        # Find caller function
+        caller_qn = self._find_containing_function(call_node, module_qn)
+        if not caller_qn:
+            logger.debug(
+                f" Could not find containing function for call at line {call_node.start_point[0]}"
+            )
+            return
+
+        # Get all possible callees
+        possible_callees = self.simple_name_lookup.get(callee_name, set())
+        if not possible_callees:
+            logger.debug(f" No functions found with name: {callee_name}")
+            return
+
+        logger.debug(
+            f" Found {len(possible_callees)} possible callees for {callee_name}"
+        )
+
+        # Calculate confidence scores for each possible callee
+        scored_callees = []
+        for possible_qn in possible_callees:
+            score = self._calculate_callee_confidence(
+                caller_qn, possible_qn, module_qn, object_name
+            )
+            scored_callees.append((possible_qn, score))
+
+        # Sort by confidence score (highest first)
+        scored_callees.sort(key=lambda x: x[1], reverse=True)
+
+        # Use the highest confidence match
+        callee_qn, confidence = scored_callees[0]
+
+        # Create CALLS relationship with metadata
+        caller_type = self.function_registry.get(caller_qn)
+        callee_type = self.function_registry.get(callee_qn)
+
+        if caller_type and callee_type:
+            # Create the primary CALLS relationship
+            self.ingestor.ensure_relationship_batch(
+                caller_type,
+                "qualified_name",
+                caller_qn,
+                "CALLS",
+                callee_type,
+                "qualified_name",
+                callee_qn,
+            )
+
+            # Log with confidence information
+            alternatives = len(scored_callees) - 1
+            logger.info(
+                f" Created CALLS relationship: {caller_qn} -> {callee_qn} (confidence: {confidence:.2f}, alternatives: {alternatives})"
+            )
+
+            # If multiple alternatives exist with similar confidence, log them
+            if alternatives > 0 and confidence < 1.0:
+                similar_alternatives = [
+                    qn for qn, score in scored_callees[1:4] if score >= confidence * 0.8
+                ]  # Top 3 alternatives within 80% of the best score
+                if similar_alternatives:
+                    logger.debug(
+                        f" Alternative matches: {', '.join(similar_alternatives)}"
+                    )
+        else:
+            logger.warning(
+                f" Failed to create CALLS relationship - caller_type: {caller_type}, callee_type: {callee_type}"
+            )
+
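To make the rank-and-pick step concrete: candidates are sorted by score, the best one becomes the CALLS edge, and near-ties (within 80% of the best score) surface only in debug logs. A small self-contained sketch, with hypothetical qualified names and made-up scores:

# Hypothetical candidates for a call to `save`; scores are illustrative.
scored_callees = [
    ("myproj.models.User.save", 0.7),
    ("myproj.db.Session.save", 0.6),
    ("myproj.cache.save", 0.2),
]
scored_callees.sort(key=lambda x: x[1], reverse=True)

callee_qn, confidence = scored_callees[0]  # winner: User.save at 0.7
similar = [qn for qn, s in scored_callees[1:4] if s >= confidence * 0.8]
print(callee_qn, confidence, similar)
# -> myproj.models.User.save 0.7 ['myproj.db.Session.save']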
+    def _calculate_callee_confidence(
+        self, caller_qn: str, callee_qn: str, module_qn: str, object_name: str | None
+    ) -> float:
+        """Calculate confidence score for a potential callee match.
+
+        Args:
+            caller_qn: Qualified name of the calling function
+            callee_qn: Qualified name of the potential callee
+            module_qn: Qualified name of the current module
+            object_name: Object name for method calls (e.g., 'obj' in obj.method())
+
+        Returns:
+            Confidence score between 0.0 and 1.0
+        """
+        score = 0.0
+
+        # 1. Module locality - functions in the same module are most likely
+        if callee_qn.startswith(module_qn + "."):
+            score += 0.5
+
+            # Even higher if in the same class
+            caller_parts = caller_qn.split(".")
+            callee_parts = callee_qn.split(".")
+            if len(caller_parts) >= 3 and len(callee_parts) >= 3:
+                if caller_parts[:-1] == callee_parts[:-1]:  # Same class
+                    score += 0.2
+
+        # 2. Package locality - functions in the same package hierarchy
+        elif "." in module_qn:
+            package = module_qn.rsplit(".", 1)[0]
+            if callee_qn.startswith(package + "."):
+                score += 0.3
+
+        # 3. Object/class match for method calls
+        if object_name:
+            # Check if callee is a method of a class matching the object name
+            callee_parts = callee_qn.split(".")
+            if len(callee_parts) >= 2:
+                # Simple heuristic: check if class name matches object name
+                # (In reality, we'd need type inference for accuracy)
+                class_name = callee_parts[-2]
+                if class_name.lower() == object_name.lower():
+                    score += 0.3
+                elif object_name == "self" and callee_qn.startswith(
+                    caller_qn.rsplit(".", 1)[0]
+                ):
+                    # 'self' refers to the same class
+                    score += 0.4
+
+        # 4. Import presence check (simplified - would need import tracking)
+        # For now, we'll give a small boost to standard library functions
+        if callee_qn.startswith(("builtins.", "typing.", "collections.")):
+            score += 0.1
+
+        # 5. Name similarity for disambiguation
+        # If function names are unique enough, boost confidence
+        possible_count = len(
+            self.simple_name_lookup.get(callee_qn.split(".")[-1], set())
+        )
+        if possible_count == 1:
+            score += 0.2
+        elif possible_count <= 3:
+            score += 0.1
+
+        # Normalize to [0, 1]
+        return min(score, 1.0)
+
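The weights above combine additively and are clamped at 1.0. A hypothetical worked example: a callee defined in the caller's module (+0.5) and class (+0.2), whose simple name is unique in the registry (+0.2), scores 0.9:

score = 0.0
score += 0.5  # module locality: callee_qn starts with module_qn + "."
score += 0.2  # same class: caller and callee share their class prefix
score += 0.2  # uniqueness: only one registered function has this simple name
print(min(score, 1.0))  # -> 0.9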
+    def _find_containing_function(self, node: Node, module_qn: str) -> str | None:
+        """Find the containing function/method of a node."""
+        current = node.parent
+
+        while current:
+            if current.type in [
+                "function_definition",
+                "method_definition",
+                "arrow_function",
+            ]:
+                # Extract function name
+                for child in current.children:
+                    if child.type == "identifier" and child.text:
+                        func_name = child.text.decode("utf-8")
+
+                        # Check if this is inside a class
+                        parent_class = self._find_parent_class(current, module_qn)
+                        if parent_class:
+                            return f"{parent_class}.{func_name}"
+                        else:
+                            return f"{module_qn}.{func_name}"
+
+            current = current.parent
+
+        return None
+
+
+class CodebaseIngestor:
+    """Main ingestor class for building code knowledge graphs."""
+
+    def __init__(
+        self,
+        db_path: str,
+        project_name: str | None = None,
+        exclude_patterns: list[str] | None = None,
+    ):
+        """Initialize the ingestor.
+
+        Args:
+            db_path: Path to Kuzu database
+            project_name: Optional project name
+            exclude_patterns: Patterns to exclude from processing
+        """
+        self.db_path = Path(db_path)
+        self.project_name = project_name
+        self.exclude_patterns = exclude_patterns or []
+
+    def build_graph_from_directory(self, repo_path: str) -> None:
+        """Build a code knowledge graph from a directory.
+
+        Args:
+            repo_path: Path to repository directory
+        """
+        repo_path_obj = Path(repo_path)
+
+        # Use directory name as project name if not specified
+        if not self.project_name:
+            self.project_name = repo_path_obj.name
+
+        try:
+            # Create database
+            logger.info(f"Creating Kuzu database at: {self.db_path}")
+            db = kuzu.Database(str(self.db_path))
+            conn = kuzu.Connection(db)
+
+            # Initialize ingestor
+            ingestor = Ingestor(conn)
+            ingestor.create_schema()
+
+            # Load parsers
+            logger.info("Loading language parsers...")
+            parsers, queries = load_parsers()
+
+            # Build graph
+            builder = SimpleGraphBuilder(
+                ingestor, repo_path_obj, parsers, queries, self.exclude_patterns
+            )
+            if self.project_name:
+                builder.project_name = self.project_name
+            builder.run()
+
+            logger.info(f"Graph successfully created at: {self.db_path}")
+
+        except Exception as e:
+            logger.error(f"Failed to build graph: {e}")
+            raise
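Taken together, the class above would be driven roughly like this; the paths, project name, and exclude patterns are hypothetical placeholders, while the constructor and method names come straight from the diff:

# Hypothetical usage; the db and repo paths are placeholders.
ingestor = CodebaseIngestor(
    db_path="/tmp/example-graph.kuzu",
    project_name="example-project",
    exclude_patterns=["tests/*", "*.md"],
)
ingestor.build_graph_from_directory("/path/to/example-project")

Note that build_graph_from_directory falls back to the directory name when no project_name was given, and re-raises any failure after logging it.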