shotgun-sh 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of shotgun-sh might be problematic; see the registry's advisory page for details.

Files changed (130) hide show
  1. shotgun/__init__.py +5 -0
  2. shotgun/agents/__init__.py +1 -0
  3. shotgun/agents/agent_manager.py +651 -0
  4. shotgun/agents/common.py +549 -0
  5. shotgun/agents/config/__init__.py +13 -0
  6. shotgun/agents/config/constants.py +17 -0
  7. shotgun/agents/config/manager.py +294 -0
  8. shotgun/agents/config/models.py +185 -0
  9. shotgun/agents/config/provider.py +206 -0
  10. shotgun/agents/conversation_history.py +106 -0
  11. shotgun/agents/conversation_manager.py +105 -0
  12. shotgun/agents/export.py +96 -0
  13. shotgun/agents/history/__init__.py +5 -0
  14. shotgun/agents/history/compaction.py +85 -0
  15. shotgun/agents/history/constants.py +19 -0
  16. shotgun/agents/history/context_extraction.py +108 -0
  17. shotgun/agents/history/history_building.py +104 -0
  18. shotgun/agents/history/history_processors.py +426 -0
  19. shotgun/agents/history/message_utils.py +84 -0
  20. shotgun/agents/history/token_counting.py +429 -0
  21. shotgun/agents/history/token_estimation.py +138 -0
  22. shotgun/agents/messages.py +35 -0
  23. shotgun/agents/models.py +275 -0
  24. shotgun/agents/plan.py +98 -0
  25. shotgun/agents/research.py +108 -0
  26. shotgun/agents/specify.py +98 -0
  27. shotgun/agents/tasks.py +96 -0
  28. shotgun/agents/tools/__init__.py +34 -0
  29. shotgun/agents/tools/codebase/__init__.py +28 -0
  30. shotgun/agents/tools/codebase/codebase_shell.py +256 -0
  31. shotgun/agents/tools/codebase/directory_lister.py +141 -0
  32. shotgun/agents/tools/codebase/file_read.py +144 -0
  33. shotgun/agents/tools/codebase/models.py +252 -0
  34. shotgun/agents/tools/codebase/query_graph.py +67 -0
  35. shotgun/agents/tools/codebase/retrieve_code.py +81 -0
  36. shotgun/agents/tools/file_management.py +218 -0
  37. shotgun/agents/tools/user_interaction.py +37 -0
  38. shotgun/agents/tools/web_search/__init__.py +60 -0
  39. shotgun/agents/tools/web_search/anthropic.py +144 -0
  40. shotgun/agents/tools/web_search/gemini.py +85 -0
  41. shotgun/agents/tools/web_search/openai.py +98 -0
  42. shotgun/agents/tools/web_search/utils.py +20 -0
  43. shotgun/build_constants.py +20 -0
  44. shotgun/cli/__init__.py +1 -0
  45. shotgun/cli/codebase/__init__.py +5 -0
  46. shotgun/cli/codebase/commands.py +202 -0
  47. shotgun/cli/codebase/models.py +21 -0
  48. shotgun/cli/config.py +275 -0
  49. shotgun/cli/export.py +81 -0
  50. shotgun/cli/models.py +10 -0
  51. shotgun/cli/plan.py +73 -0
  52. shotgun/cli/research.py +85 -0
  53. shotgun/cli/specify.py +69 -0
  54. shotgun/cli/tasks.py +78 -0
  55. shotgun/cli/update.py +152 -0
  56. shotgun/cli/utils.py +25 -0
  57. shotgun/codebase/__init__.py +12 -0
  58. shotgun/codebase/core/__init__.py +46 -0
  59. shotgun/codebase/core/change_detector.py +358 -0
  60. shotgun/codebase/core/code_retrieval.py +243 -0
  61. shotgun/codebase/core/ingestor.py +1497 -0
  62. shotgun/codebase/core/language_config.py +297 -0
  63. shotgun/codebase/core/manager.py +1662 -0
  64. shotgun/codebase/core/nl_query.py +331 -0
  65. shotgun/codebase/core/parser_loader.py +128 -0
  66. shotgun/codebase/models.py +111 -0
  67. shotgun/codebase/service.py +206 -0
  68. shotgun/logging_config.py +227 -0
  69. shotgun/main.py +167 -0
  70. shotgun/posthog_telemetry.py +158 -0
  71. shotgun/prompts/__init__.py +5 -0
  72. shotgun/prompts/agents/__init__.py +1 -0
  73. shotgun/prompts/agents/export.j2 +350 -0
  74. shotgun/prompts/agents/partials/codebase_understanding.j2 +87 -0
  75. shotgun/prompts/agents/partials/common_agent_system_prompt.j2 +37 -0
  76. shotgun/prompts/agents/partials/content_formatting.j2 +65 -0
  77. shotgun/prompts/agents/partials/interactive_mode.j2 +26 -0
  78. shotgun/prompts/agents/plan.j2 +144 -0
  79. shotgun/prompts/agents/research.j2 +69 -0
  80. shotgun/prompts/agents/specify.j2 +51 -0
  81. shotgun/prompts/agents/state/codebase/codebase_graphs_available.j2 +19 -0
  82. shotgun/prompts/agents/state/system_state.j2 +31 -0
  83. shotgun/prompts/agents/tasks.j2 +143 -0
  84. shotgun/prompts/codebase/__init__.py +1 -0
  85. shotgun/prompts/codebase/cypher_query_patterns.j2 +223 -0
  86. shotgun/prompts/codebase/cypher_system.j2 +28 -0
  87. shotgun/prompts/codebase/enhanced_query_context.j2 +10 -0
  88. shotgun/prompts/codebase/partials/cypher_rules.j2 +24 -0
  89. shotgun/prompts/codebase/partials/graph_schema.j2 +30 -0
  90. shotgun/prompts/codebase/partials/temporal_context.j2 +21 -0
  91. shotgun/prompts/history/__init__.py +1 -0
  92. shotgun/prompts/history/incremental_summarization.j2 +53 -0
  93. shotgun/prompts/history/summarization.j2 +46 -0
  94. shotgun/prompts/loader.py +140 -0
  95. shotgun/py.typed +0 -0
  96. shotgun/sdk/__init__.py +13 -0
  97. shotgun/sdk/codebase.py +219 -0
  98. shotgun/sdk/exceptions.py +17 -0
  99. shotgun/sdk/models.py +189 -0
  100. shotgun/sdk/services.py +23 -0
  101. shotgun/sentry_telemetry.py +87 -0
  102. shotgun/telemetry.py +93 -0
  103. shotgun/tui/__init__.py +0 -0
  104. shotgun/tui/app.py +116 -0
  105. shotgun/tui/commands/__init__.py +76 -0
  106. shotgun/tui/components/prompt_input.py +69 -0
  107. shotgun/tui/components/spinner.py +86 -0
  108. shotgun/tui/components/splash.py +25 -0
  109. shotgun/tui/components/vertical_tail.py +13 -0
  110. shotgun/tui/screens/chat.py +782 -0
  111. shotgun/tui/screens/chat.tcss +43 -0
  112. shotgun/tui/screens/chat_screen/__init__.py +0 -0
  113. shotgun/tui/screens/chat_screen/command_providers.py +219 -0
  114. shotgun/tui/screens/chat_screen/hint_message.py +40 -0
  115. shotgun/tui/screens/chat_screen/history.py +221 -0
  116. shotgun/tui/screens/directory_setup.py +113 -0
  117. shotgun/tui/screens/provider_config.py +221 -0
  118. shotgun/tui/screens/splash.py +31 -0
  119. shotgun/tui/styles.tcss +10 -0
  120. shotgun/tui/utils/__init__.py +5 -0
  121. shotgun/tui/utils/mode_progress.py +257 -0
  122. shotgun/utils/__init__.py +5 -0
  123. shotgun/utils/env_utils.py +35 -0
  124. shotgun/utils/file_system_utils.py +36 -0
  125. shotgun/utils/update_checker.py +375 -0
  126. shotgun_sh-0.1.0.dist-info/METADATA +466 -0
  127. shotgun_sh-0.1.0.dist-info/RECORD +130 -0
  128. shotgun_sh-0.1.0.dist-info/WHEEL +4 -0
  129. shotgun_sh-0.1.0.dist-info/entry_points.txt +2 -0
  130. shotgun_sh-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1497 @@
1
+ """Kuzu graph ingestor for building code knowledge graphs."""
2
+
3
+ import hashlib
4
+ import os
5
+ import time
6
+ import uuid
7
+ from collections import defaultdict
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import kuzu
12
+ from tree_sitter import Node, Parser, QueryCursor
13
+
14
+ from shotgun.codebase.core.language_config import LANGUAGE_CONFIGS, get_language_config
15
+ from shotgun.codebase.core.parser_loader import load_parsers
16
+ from shotgun.logging_config import get_logger
17
+
18
+ logger = get_logger(__name__)
19
+
20
+
21
# Default ignore patterns
# Directory names skipped during repository walks: VCS metadata, virtual
# environments, dependency/build output, tool caches, and editor settings.
# SimpleGraphBuilder may extend this set with user-supplied exclude patterns.
IGNORE_PATTERNS = {
    ".git",
    "venv",
    ".venv",
    "__pycache__",
    "node_modules",
    "build",
    "dist",
    ".eggs",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    ".claude",
    ".idea",
    ".vscode",
}
38
+
39
+
40
class Ingestor:
    """Handles all communication and ingestion with the Kuzu database."""

    def __init__(self, connection: kuzu.Connection):
        # Open Kuzu connection used for every schema/DML statement.
        self.conn = connection
        # Pending node inserts as (label, properties) pairs.
        self.node_buffer: list[tuple[str, dict[str, Any]]] = []
        # Pending relationship inserts as
        # (from_label, from_key, from_value, rel_type,
        #  to_label, to_key, to_value, properties).
        self.relationship_buffer: list[
            tuple[str, str, Any, str, str, str, Any, dict[str, Any] | None]
        ] = []
        # Node buffer auto-flushes once it reaches this size; relationships
        # are only flushed explicitly (see flush_all).
        self.batch_size = 1000
50
+
51
def create_schema(self) -> None:
    """Create the graph schema in Kuzu.

    Issues every CREATE NODE TABLE / CREATE REL TABLE statement in turn.
    Kuzu requires one REL table per (source, target) label pair, hence the
    suffixed variants (_PKG, _FOLDER, _FM, ...). "already exists" errors
    are expected on re-runs and silently ignored; any other failure is
    logged but does not abort schema creation.
    """
    logger.info("Creating Kuzu schema...")

    # Node tables
    node_schemas = [
        "CREATE NODE TABLE Project(name STRING PRIMARY KEY, repo_path STRING, graph_id STRING, created_at INT64, updated_at INT64, schema_version STRING, build_options STRING, node_count INT64, relationship_count INT64, stats_updated_at INT64, status STRING, current_operation_id STRING, last_operation STRING, indexed_from_cwds STRING)",
        "CREATE NODE TABLE Package(qualified_name STRING PRIMARY KEY, name STRING, path STRING)",
        "CREATE NODE TABLE Folder(path STRING PRIMARY KEY, name STRING)",
        "CREATE NODE TABLE File(path STRING PRIMARY KEY, name STRING, extension STRING)",
        "CREATE NODE TABLE Module(qualified_name STRING PRIMARY KEY, name STRING, path STRING, created_at INT64, updated_at INT64)",
        "CREATE NODE TABLE Class(qualified_name STRING PRIMARY KEY, name STRING, decorators STRING[], line_start INT64, line_end INT64, created_at INT64, updated_at INT64, docstring STRING)",
        "CREATE NODE TABLE Function(qualified_name STRING PRIMARY KEY, name STRING, decorators STRING[], line_start INT64, line_end INT64, created_at INT64, updated_at INT64, docstring STRING)",
        "CREATE NODE TABLE Method(qualified_name STRING PRIMARY KEY, name STRING, decorators STRING[], line_start INT64, line_end INT64, created_at INT64, updated_at INT64, docstring STRING)",
        "CREATE NODE TABLE ExternalPackage(name STRING PRIMARY KEY, version_spec STRING)",
        "CREATE NODE TABLE FileMetadata(filepath STRING PRIMARY KEY, mtime INT64, hash STRING, last_updated INT64)",
        "CREATE NODE TABLE DeletionLog(id STRING PRIMARY KEY, entity_type STRING, entity_qualified_name STRING, deleted_from_file STRING, deleted_at INT64, deletion_reason STRING)",
    ]

    # Relationship tables - need separate tables for each source/target combination
    rel_schemas = [
        # CONTAINS_PACKAGE relationships
        "CREATE REL TABLE CONTAINS_PACKAGE(FROM Project TO Package)",
        "CREATE REL TABLE CONTAINS_PACKAGE_PKG(FROM Package TO Package)",
        "CREATE REL TABLE CONTAINS_PACKAGE_FOLDER(FROM Folder TO Package)",
        # CONTAINS_FOLDER relationships
        "CREATE REL TABLE CONTAINS_FOLDER(FROM Project TO Folder)",
        "CREATE REL TABLE CONTAINS_FOLDER_PKG(FROM Package TO Folder)",
        "CREATE REL TABLE CONTAINS_FOLDER_FOLDER(FROM Folder TO Folder)",
        # CONTAINS_FILE relationships
        "CREATE REL TABLE CONTAINS_FILE(FROM Project TO File)",
        "CREATE REL TABLE CONTAINS_FILE_PKG(FROM Package TO File)",
        "CREATE REL TABLE CONTAINS_FILE_FOLDER(FROM Folder TO File)",
        # CONTAINS_MODULE relationships
        "CREATE REL TABLE CONTAINS_MODULE(FROM Project TO Module)",
        "CREATE REL TABLE CONTAINS_MODULE_PKG(FROM Package TO Module)",
        "CREATE REL TABLE CONTAINS_MODULE_FOLDER(FROM Folder TO Module)",
        # Other relationships
        "CREATE REL TABLE DEFINES(FROM Module TO Class)",
        "CREATE REL TABLE DEFINES_FUNC(FROM Module TO Function)",
        "CREATE REL TABLE DEFINES_METHOD(FROM Class TO Method)",
        "CREATE REL TABLE INHERITS(FROM Class TO Class)",
        "CREATE REL TABLE OVERRIDES(FROM Method TO Method)",
        "CREATE REL TABLE DEPENDS_ON_EXTERNAL(FROM Project TO ExternalPackage)",
        # CALLS relationships (all combinations)
        "CREATE REL TABLE CALLS(FROM Function TO Function)",
        "CREATE REL TABLE CALLS_FM(FROM Function TO Method)",
        "CREATE REL TABLE CALLS_MF(FROM Method TO Function)",
        "CREATE REL TABLE CALLS_MM(FROM Method TO Method)",
        # IMPORTS
        "CREATE REL TABLE IMPORTS(FROM Module TO Module)",
        # TRACKS relationships (FileMetadata to nodes)
        "CREATE REL TABLE TRACKS_Module(FROM FileMetadata TO Module)",
        "CREATE REL TABLE TRACKS_Class(FROM FileMetadata TO Class)",
        "CREATE REL TABLE TRACKS_Function(FROM FileMetadata TO Function)",
        "CREATE REL TABLE TRACKS_Method(FROM FileMetadata TO Method)",
    ]

    # Create all schemas; tolerate re-creation of existing tables.
    for schema in node_schemas + rel_schemas:
        try:
            self.conn.execute(schema)
            logger.debug(f"Created: {schema.split('(')[0]}")
        except Exception as e:
            if "already exists" not in str(e):
                logger.error(f"Failed to create schema: {schema}, error: {e}")

    logger.info("Schema creation complete.")
119
+
120
def ensure_node_batch(self, label: str, properties: dict[str, Any]) -> None:
    """Queue a node for batched insertion, skipping in-buffer duplicates.

    The buffer is flushed automatically once it reaches ``batch_size``.
    """
    # Drop the node if one with the same primary key is already queued.
    key = self._get_primary_key(label, properties)
    if key and self._is_duplicate_node(label, key):
        return

    self.node_buffer.append((label, properties))

    # Auto-flush keeps memory bounded while ingesting large repositories.
    if len(self.node_buffer) >= self.batch_size:
        self.flush_nodes()
131
+
132
+ def _get_primary_key(self, label: str, properties: dict[str, Any]) -> str | None:
133
+ """Get the primary key value for a node."""
134
+ primary_key_field = self._get_primary_key_field(label)
135
+ return properties.get(primary_key_field) if primary_key_field else None
136
+
137
+ def _get_primary_key_field(self, label: str) -> str | None:
138
+ """Get the primary key field name for a node type."""
139
+ if label == "Project":
140
+ return "name"
141
+ elif label in ["Package", "Module", "Class", "Function", "Method"]:
142
+ return "qualified_name"
143
+ elif label in ["Folder", "File"]:
144
+ return "path"
145
+ elif label == "FileMetadata":
146
+ return "filepath"
147
+ elif label == "ExternalPackage":
148
+ return "name"
149
+ elif label == "DeletionLog":
150
+ return "id"
151
+ return None
152
+
153
+ def _is_duplicate_node(self, label: str, primary_key: str) -> bool:
154
+ """Check if a node with the given primary key already exists in the buffer."""
155
+ for buffered_label, buffered_props in self.node_buffer:
156
+ if buffered_label == label:
157
+ buffered_key = self._get_primary_key(buffered_label, buffered_props)
158
+ if buffered_key == primary_key:
159
+ return True
160
+ return False
161
+
162
def flush_nodes(self) -> None:
    """Flush pending node insertions to the database.

    Nodes whose label has a primary key are upserted with MERGE; the rest
    are CREATEd. Failures are logged per label and do not abort the flush;
    the buffer is always cleared at the end.
    """
    if not self.node_buffer:
        return

    # Group nodes by label so per-label failures are isolated.
    nodes_by_label: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for label, properties in self.node_buffer:
        nodes_by_label[label].append(properties)

    # Insert each group
    for label, nodes in nodes_by_label.items():
        try:
            primary_key_field = self._get_primary_key_field(label)
            for node_props in nodes:
                prop_names = list(node_props.keys())

                if primary_key_field and primary_key_field in node_props:
                    # Upsert keyed on the primary key.
                    merge_props = f"{primary_key_field}: ${primary_key_field}"
                    set_props = ", ".join(
                        f"n.{k} = ${k}"
                        for k in prop_names
                        if k != primary_key_field
                    )
                    # BUG FIX: a node whose only property is its primary key
                    # previously produced "MERGE (...) SET " — a dangling,
                    # empty SET clause that is invalid. Emit SET only when
                    # there is something to set.
                    query = f"MERGE (n:{label} {{{merge_props}}})"
                    if set_props:
                        query += f" SET {set_props}"
                else:
                    # No primary key known: plain CREATE.
                    props_str = ", ".join(f"{k}: ${k}" for k in prop_names)
                    query = f"CREATE (n:{label} {{{props_str}}})"

                # All properties double as query parameters.
                self.conn.execute(query, dict(node_props))

        except Exception as e:
            logger.error(f"Failed to insert {label} nodes: {e}")

    # Log node counts by type for observability.
    node_type_counts: dict[str, int] = {}
    for label, _ in self.node_buffer:
        node_type_counts[label] = node_type_counts.get(label, 0) + 1

    logger.info(f"Flushed {len(self.node_buffer)} nodes:")
    for label, count in sorted(node_type_counts.items()):
        logger.info(f"  {label}: {count}")

    self.node_buffer.clear()
216
+
217
+ def ensure_relationship_batch(
218
+ self,
219
+ from_label: str,
220
+ from_key: str,
221
+ from_value: Any,
222
+ rel_type: str,
223
+ to_label: str,
224
+ to_key: str,
225
+ to_value: Any,
226
+ properties: dict[str, Any] | None = None,
227
+ ) -> None:
228
+ """Add a relationship to the buffer for batch insertion."""
229
+ self.relationship_buffer.append(
230
+ (
231
+ from_label,
232
+ from_key,
233
+ from_value,
234
+ rel_type,
235
+ to_label,
236
+ to_key,
237
+ to_value,
238
+ properties,
239
+ )
240
+ )
241
+
242
+ # Don't auto-flush relationships - wait for explicit flush_all() to ensure nodes exist first
243
+
244
def flush_relationships(self) -> None:
    """Flush pending relationship insertions to the database.

    Buffered relationships are grouped by their concrete Kuzu table
    (resolved from the logical type plus endpoint labels); combinations
    with no table are silently dropped. Both endpoint nodes must already
    exist — flush_nodes() is expected to have run first. Unlike node
    flushing, a failure here is logged with full context and re-raised.
    """
    if not self.relationship_buffer:
        return

    # Group relationships by concrete table name.
    rels_by_type: dict[
        str, list[tuple[str, str, Any, str, str, str, Any, dict[str, Any] | None]]
    ] = defaultdict(list)

    for rel_data in self.relationship_buffer:
        (
            from_label,
            from_key,
            from_value,
            rel_type,
            to_label,
            to_key,
            to_value,
            properties,
        ) = rel_data

        # Determine actual table name; None means no table exists for this
        # endpoint combination and the relationship is dropped.
        table_name = self._get_relationship_table_name(
            rel_type, from_label, to_label
        )
        if table_name:
            rels_by_type[table_name].append(rel_data)

    # Insert each group
    relationship_counts = {}
    for table_name, relationships in rels_by_type.items():
        success_count = 0
        try:
            for rel_data in relationships:
                (
                    from_label,
                    from_key,
                    from_value,
                    _,
                    to_label,
                    to_key,
                    to_value,
                    properties,
                ) = rel_data

                # Build MATCH and MERGE query (use MERGE to avoid duplicate relationships)
                query = f"""
                    MATCH (a:{from_label} {{{from_key}: $from_val}}),
                          (b:{to_label} {{{to_key}: $to_val}})
                    MERGE (a)-[:{table_name}]->(b)
                """

                params = {"from_val": from_value, "to_val": to_value}
                try:
                    self.conn.execute(query, params)
                    success_count += 1
                except Exception as e:
                    # Log the failing edge before re-raising into the outer
                    # handler, which adds batch-level context.
                    logger.error(
                        f"Failed to create single relationship {table_name}: {from_label}({from_value}) -> {to_label}({to_value})"
                    )
                    logger.error(f"Error: {e}")
                    raise

            relationship_counts[table_name] = success_count
            # Extra visibility for DEFINES_METHOD, historically error-prone.
            if table_name == "DEFINES_METHOD":
                logger.info(
                    f"Successfully created {success_count} DEFINES_METHOD relationships"
                )

        except Exception as e:
            logger.error(f"Failed to insert {table_name} relationships: {e}")
            logger.error(
                f"Failed on relationship #{success_count + 1} of {len(relationships)}"
            )
            logger.error(f"Query was: {query}")
            logger.error(f"Params were: {params}")
            # Don't swallow the exception - let it propagate
            raise

    # Log summary of flushed relationships
    logger.info(
        f"Flushed {len(self.relationship_buffer)} relationships: {relationship_counts}"
    )
    self.relationship_buffer.clear()
329
+
330
+ def _get_relationship_table_name(
331
+ self, rel_type: str, from_label: str, to_label: str
332
+ ) -> str | None:
333
+ """Determine the actual relationship table name based on source and target."""
334
+ # Mapping of relationship types and from_labels to table names
335
+ table_mapping = {
336
+ "CONTAINS_PACKAGE": {
337
+ "Project": "CONTAINS_PACKAGE",
338
+ "Package": "CONTAINS_PACKAGE_PKG",
339
+ "Folder": "CONTAINS_PACKAGE_FOLDER",
340
+ },
341
+ "CONTAINS_FOLDER": {
342
+ "Project": "CONTAINS_FOLDER",
343
+ "Package": "CONTAINS_FOLDER_PKG",
344
+ "Folder": "CONTAINS_FOLDER_FOLDER",
345
+ },
346
+ "CONTAINS_FILE": {
347
+ "Project": "CONTAINS_FILE",
348
+ "Package": "CONTAINS_FILE_PKG",
349
+ "Folder": "CONTAINS_FILE_FOLDER",
350
+ },
351
+ "CONTAINS_MODULE": {
352
+ "Project": "CONTAINS_MODULE",
353
+ "Package": "CONTAINS_MODULE_PKG",
354
+ "Folder": "CONTAINS_MODULE_FOLDER",
355
+ },
356
+ }
357
+
358
+ if rel_type in table_mapping:
359
+ return table_mapping[rel_type].get(from_label)
360
+ elif rel_type == "DEFINES":
361
+ if to_label == "Function":
362
+ return "DEFINES_FUNC"
363
+ else:
364
+ return "DEFINES"
365
+ elif rel_type == "CALLS":
366
+ if from_label == "Function" and to_label == "Function":
367
+ return "CALLS"
368
+ elif from_label == "Function" and to_label == "Method":
369
+ return "CALLS_FM"
370
+ elif from_label == "Method" and to_label == "Function":
371
+ return "CALLS_MF"
372
+ elif from_label == "Method" and to_label == "Method":
373
+ return "CALLS_MM"
374
+ elif rel_type.startswith("TRACKS_"):
375
+ # TRACKS relationships already have the correct table name
376
+ return rel_type
377
+ else:
378
+ # Default to the relationship type
379
+ return rel_type
380
+ return None
381
+
382
def flush_all(self) -> None:
    """Flush every buffered write: nodes first, then relationships.

    Ordering matters — the relationship MERGE queries require both
    endpoint nodes to already exist in the database.
    """
    pending_nodes = len(self.node_buffer)
    pending_rels = len(self.relationship_buffer)
    logger.info(
        f"Starting flush_all: {pending_nodes} nodes, {pending_rels} relationships buffered"
    )

    self.flush_nodes()
    self.flush_relationships()

    logger.info("flush_all completed successfully")
395
+
396
def ensure_file_metadata(
    self, filepath: str, mtime: int, hash_value: str, last_updated: int
) -> None:
    """Queue creation or update of the FileMetadata node for a file."""
    props = {
        "filepath": filepath,
        "mtime": mtime,
        "hash": hash_value,
        "last_updated": last_updated,
    }
    self.ensure_node_batch("FileMetadata", props)
409
+
410
def log_deletion(
    self,
    entity_type: str,
    entity_qn: str,
    filepath: str,
    reason: str = "file_modified",
) -> None:
    """Record an entity deletion in the DeletionLog audit table.

    A fresh UUID and the current UNIX timestamp identify the entry.
    Failures are logged and swallowed — auditing must not block deletion.
    """
    params = {
        "id": str(uuid.uuid4()),
        "type": entity_type,
        "qn": entity_qn,
        "file": filepath,
        "time": int(time.time()),
        "reason": reason,
    }
    query = """
        CREATE (d:DeletionLog {
            id: $id,
            entity_type: $type,
            entity_qualified_name: $qn,
            deleted_from_file: $file,
            deleted_at: $time,
            deletion_reason: $reason
        })
    """
    try:
        self.conn.execute(query, params)
    except Exception as e:
        logger.error(f"Failed to log deletion of {entity_qn}: {e}")
444
+
445
def ensure_tracks_relationship(
    self, filepath: str, node_type: str, node_qn: str
) -> None:
    """Queue a TRACKS_<node_type> edge from a file's metadata to one of its nodes."""
    self.ensure_relationship_batch(
        "FileMetadata",
        "filepath",
        filepath,
        f"TRACKS_{node_type}",
        node_type,
        "qualified_name",
        node_qn,
    )
459
+
460
def delete_file_nodes(self, filepath: str) -> dict[str, int]:
    """Delete all nodes tracked by a FileMetadata.

    For each tracked node type the qualified names are fetched first (so
    each removal can be audited via log_deletion), then the nodes are
    DETACH-deleted; finally the FileMetadata node itself is removed.
    Per-type failures are logged and do not abort the remaining deletions.

    Args:
        filepath: Relative file path

    Returns:
        Statistics of deleted entities (keys: modules, classes,
        functions, methods)
    """
    stats = {"modules": 0, "classes": 0, "functions": 0, "methods": 0}

    # Delete each type of node tracked by this file
    for node_type, rel_type, stat_key in [
        ("Module", "TRACKS_Module", "modules"),
        ("Class", "TRACKS_Class", "classes"),
        ("Function", "TRACKS_Function", "functions"),
        ("Method", "TRACKS_Method", "methods"),
    ]:
        try:
            # First get the nodes to delete (for logging)
            result = self.conn.execute(
                f"""
                MATCH (f:FileMetadata {{filepath: $path}})-[:{rel_type}]->(n:{node_type})
                RETURN n.qualified_name
                """,
                {"path": filepath},
            )

            nodes_to_delete = []
            # Guard against non-cursor results: only iterate objects that
            # expose has_next()/get_next() and are not plain lists.
            if hasattr(result, "has_next") and not isinstance(result, list):
                while result.has_next():
                    row = result.get_next()
                    if isinstance(row, list | tuple) and len(row) > 0:
                        nodes_to_delete.append(row[0])

            # Log deletions before removing the nodes themselves.
            for node_qn in nodes_to_delete:
                self.log_deletion(node_type, node_qn, filepath, "file_modified")

            # Delete the nodes and their relationships
            self.conn.execute(
                f"""
                MATCH (f:FileMetadata {{filepath: $path}})-[:{rel_type}]->(n:{node_type})
                DETACH DELETE n
                """,
                {"path": filepath},
            )

            stats[stat_key] = len(nodes_to_delete)

        except Exception as e:
            logger.error(f"Failed to delete {node_type} nodes for {filepath}: {e}")

    # Delete the FileMetadata node itself
    try:
        self.conn.execute(
            """
            MATCH (f:FileMetadata {filepath: $path})
            DETACH DELETE f
            """,
            {"path": filepath},
        )
    except Exception as e:
        logger.error(f"Failed to delete FileMetadata for {filepath}: {e}")

    return stats
526
+
527
+
528
class SimpleGraphBuilder:
    """Simplified version of GraphUpdater for building the code graph."""

    def __init__(
        self,
        ingestor: Ingestor,
        repo_path: Path,
        parsers: dict[str, Parser],
        queries: dict[str, Any],
        exclude_patterns: list[str] | None = None,
    ):
        # Sink for all node/relationship writes.
        self.ingestor = ingestor
        self.repo_path = repo_path
        # Tree-sitter parsers and query sets keyed by language name.
        self.parsers = parsers
        self.queries = queries
        # The repository directory name doubles as the graph's project name.
        self.project_name = repo_path.name
        self.ignore_dirs = IGNORE_PATTERNS
        if exclude_patterns:
            # union() returns a new set, so the shared module-level
            # IGNORE_PATTERNS constant is never mutated.
            self.ignore_dirs = self.ignore_dirs.union(set(exclude_patterns))

        # Caches
        # Relative dir path -> package qualified name (None for plain folders).
        self.structural_elements: dict[Path, str | None] = {}
        # Parsed AST root node and language per file, reused by later passes.
        self.ast_cache: dict[Path, tuple[Node, str]] = {}
        self.function_registry: dict[str, str] = {}  # qualified_name -> type
        # Simple (unqualified) name -> set of qualified names sharing it.
        self.simple_name_lookup: dict[str, set[str]] = defaultdict(set)
        self.class_inheritance: dict[str, list[str]] = {}  # class_qn -> [parent_qns]
554
+
555
def run(self) -> None:
    """Run the three-pass graph building process.

    Pass 1 registers packages/folders, pass 2 parses files and extracts
    definitions, pass 3 resolves cross-references (calls, imports).
    All buffered writes are flushed at the end — nodes before edges.
    """
    logger.info(f"Building graph for project: {self.project_name}")

    # Pass 1: Structure
    logger.info("Pass 1: Identifying packages and folders...")
    self._identify_structure()

    # Pass 2: Definitions
    logger.info("Pass 2: Processing files and extracting definitions...")
    self._process_files()

    # Pass 3: Relationships (method defined outside this view)
    logger.info("Pass 3: Processing relationships (calls, imports)...")
    self._process_relationships()

    # Flush all pending operations
    logger.info("Flushing all data to database...")
    self.ingestor.flush_all()
    logger.info("Graph building complete!")
575
+
576
+ def _identify_structure(self) -> None:
577
+ """First pass: Walk directory to find packages and folders."""
578
+ for root_str, dirs, _ in os.walk(self.repo_path, topdown=True):
579
+ dirs[:] = [d for d in dirs if d not in self.ignore_dirs]
580
+ root = Path(root_str)
581
+ relative_root = root.relative_to(self.repo_path)
582
+
583
+ # Skip root directory
584
+ if root == self.repo_path:
585
+ continue
586
+
587
+ parent_rel_path = relative_root.parent
588
+ parent_container_qn = self.structural_elements.get(parent_rel_path)
589
+
590
+ # Check if this is a package
591
+ is_package = False
592
+ package_indicators = set()
593
+
594
+ # Collect package indicators from all languages
595
+ for lang_name, lang_config in LANGUAGE_CONFIGS.items():
596
+ if lang_name in self.queries:
597
+ package_indicators.update(lang_config.package_indicators)
598
+
599
+ # Check for package indicators
600
+ for indicator in package_indicators:
601
+ if (root / indicator).exists():
602
+ is_package = True
603
+ break
604
+
605
+ if is_package:
606
+ # Create package
607
+ package_qn = ".".join([self.project_name] + list(relative_root.parts))
608
+ self.ingestor.ensure_node_batch(
609
+ "Package",
610
+ {
611
+ "qualified_name": package_qn,
612
+ "name": relative_root.name,
613
+ "path": str(relative_root).replace(os.sep, "/"),
614
+ },
615
+ )
616
+
617
+ # Create containment relationship
618
+ if parent_container_qn:
619
+ # Parent is a package
620
+ self.ingestor.ensure_relationship_batch(
621
+ "Package",
622
+ "qualified_name",
623
+ parent_container_qn,
624
+ "CONTAINS_PACKAGE",
625
+ "Package",
626
+ "qualified_name",
627
+ package_qn,
628
+ )
629
+ else:
630
+ # Parent is project root
631
+ self.ingestor.ensure_relationship_batch(
632
+ "Project",
633
+ "name",
634
+ self.project_name,
635
+ "CONTAINS_PACKAGE",
636
+ "Package",
637
+ "qualified_name",
638
+ package_qn,
639
+ )
640
+
641
+ self.structural_elements[relative_root] = package_qn
642
+ else:
643
+ # Create folder
644
+ self.ingestor.ensure_node_batch(
645
+ "Folder",
646
+ {
647
+ "path": str(relative_root).replace(os.sep, "/"),
648
+ "name": relative_root.name,
649
+ },
650
+ )
651
+
652
+ # Create containment relationship
653
+ if parent_container_qn:
654
+ # Parent is a package
655
+ self.ingestor.ensure_relationship_batch(
656
+ "Package",
657
+ "qualified_name",
658
+ parent_container_qn,
659
+ "CONTAINS_FOLDER",
660
+ "Folder",
661
+ "path",
662
+ str(relative_root).replace(os.sep, "/"),
663
+ )
664
+ elif parent_rel_path == Path("."):
665
+ # Parent is project root
666
+ self.ingestor.ensure_relationship_batch(
667
+ "Project",
668
+ "name",
669
+ self.project_name,
670
+ "CONTAINS_FOLDER",
671
+ "Folder",
672
+ "path",
673
+ str(relative_root).replace(os.sep, "/"),
674
+ )
675
+ else:
676
+ # Parent is another folder
677
+ self.ingestor.ensure_relationship_batch(
678
+ "Folder",
679
+ "path",
680
+ str(parent_rel_path).replace(os.sep, "/"),
681
+ "CONTAINS_FOLDER",
682
+ "Folder",
683
+ "path",
684
+ str(relative_root).replace(os.sep, "/"),
685
+ )
686
+
687
+ self.structural_elements[relative_root] = None
688
+
689
+ def _process_files(self) -> None:
690
+ """Second pass: Process files and extract definitions."""
691
+ file_count = 0
692
+ for root_str, _, files in os.walk(self.repo_path):
693
+ root = Path(root_str)
694
+
695
+ # Skip ignored directories
696
+ if any(part in self.ignore_dirs for part in root.parts):
697
+ continue
698
+
699
+ for filename in files:
700
+ filepath = root / filename
701
+
702
+ # Check if this is a supported file
703
+ ext = filepath.suffix
704
+ lang_config = get_language_config(ext)
705
+
706
+ if lang_config and lang_config.name in self.parsers:
707
+ self._process_single_file(filepath, lang_config.name)
708
+ file_count += 1
709
+
710
+ if file_count % 100 == 0:
711
+ logger.info(f" Processed {file_count} files...")
712
+
713
+ logger.info(f" Total files processed: {file_count}")
714
+
715
    def _process_single_file(self, filepath: Path, language: str) -> None:
        """Ingest one source file: File/Module nodes, containment edges, metadata.

        Args:
            filepath: Absolute path to a file under ``self.repo_path``.
            language: Parser key into ``self.parsers`` (e.g. "python").

        Side effects: batches graph nodes/relationships via ``self.ingestor``,
        caches the parsed AST in ``self.ast_cache``, and delegates definition
        extraction to ``_extract_definitions``. Parse/IO failures are logged
        and swallowed so one bad file does not abort the whole pass.
        """
        relative_path = filepath.relative_to(self.repo_path)
        # Normalize separators so graph keys are OS-independent.
        relative_path_str = str(relative_path).replace(os.sep, "/")

        # Create File node
        self.ingestor.ensure_node_batch(
            "File",
            {
                "path": relative_path_str,
                "name": filepath.name,
                "extension": filepath.suffix,
            },
        )

        # Create containment relationship (Project or Folder -> File).
        parent_rel_path = relative_path.parent
        if parent_rel_path == Path("."):
            # File in project root
            self.ingestor.ensure_relationship_batch(
                "Project",
                "name",
                self.project_name,
                "CONTAINS_FILE",
                "File",
                "path",
                relative_path_str,
            )
        else:
            self.ingestor.ensure_relationship_batch(
                "Folder",
                "path",
                str(parent_rel_path).replace(os.sep, "/"),
                "CONTAINS_FILE",
                "File",
                "path",
                relative_path_str,
            )

        # Parse file (bytes: tree-sitter parses raw bytes, and the same
        # buffer is hashed for the metadata node below).
        try:
            with open(filepath, "rb") as f:
                content = f.read()

            parser = self.parsers[language]
            tree = parser.parse(content)
            root_node = tree.root_node

            # Cache AST so the relationship pass can reuse it without re-parsing.
            self.ast_cache[filepath] = (root_node, language)

            # Compute the module qualified name. __init__.py represents the
            # package itself, so its qualified name omits the file stem.
            if filepath.name == "__init__.py":
                module_qn = ".".join(
                    [self.project_name] + list(relative_path.parent.parts)
                )
            else:
                module_qn = ".".join(
                    [self.project_name] + list(relative_path.with_suffix("").parts)
                )

            current_time = int(time.time())
            self.ingestor.ensure_node_batch(
                "Module",
                {
                    "qualified_name": module_qn,
                    "name": filepath.stem,
                    "path": relative_path_str,
                    "created_at": current_time,
                    "updated_at": current_time,
                },
            )

            # Create module containment: Package, Project root, or plain Folder.
            parent_container = self.structural_elements.get(parent_rel_path)
            if parent_container:
                # Parent is a package
                self.ingestor.ensure_relationship_batch(
                    "Package",
                    "qualified_name",
                    parent_container,
                    "CONTAINS_MODULE",
                    "Module",
                    "qualified_name",
                    module_qn,
                )
            elif parent_rel_path == Path("."):
                # Parent is project root
                self.ingestor.ensure_relationship_batch(
                    "Project",
                    "name",
                    self.project_name,
                    "CONTAINS_MODULE",
                    "Module",
                    "qualified_name",
                    module_qn,
                )
            else:
                # Parent is a folder (non-package directory)
                self.ingestor.ensure_relationship_batch(
                    "Folder",
                    "path",
                    str(parent_rel_path).replace(os.sep, "/"),
                    "CONTAINS_MODULE",
                    "Module",
                    "qualified_name",
                    module_qn,
                )

            # File metadata (mtime + content hash) enables change detection.
            mtime = int(filepath.stat().st_mtime)
            hash_value = hashlib.sha256(content).hexdigest()
            self.ingestor.ensure_file_metadata(
                relative_path_str, mtime, hash_value, current_time
            )

            # Track module so it can be invalidated when the file changes.
            self.ingestor.ensure_tracks_relationship(
                relative_path_str, "Module", module_qn
            )

            # Extract class/function definitions from the parsed AST.
            self._extract_definitions(filepath, root_node, module_qn, language)

        except Exception as e:
            # Best-effort: log and continue with the next file.
            logger.error(f"Failed to process {filepath}: {e}")
    def _extract_definitions(
        self, filepath: Path, root_node: Node, module_qn: str, language: str
    ) -> None:
        """Extract class and function definitions from a parsed AST.

        Runs the per-language tree-sitter ``class_query`` and
        ``function_query`` over ``root_node``, creating Class / Method /
        Function nodes plus DEFINES / DEFINES_METHOD / DEFINES_FUNC edges,
        and populating the in-memory registries used later for call and
        inheritance resolution (``function_registry``, ``simple_name_lookup``,
        ``class_inheritance``).
        """
        lang_queries = self.queries.get(language, {})
        relative_path_str = str(filepath.relative_to(self.repo_path)).replace(
            os.sep, "/"
        )

        # Extract classes
        if "class_query" in lang_queries:
            cursor = QueryCursor(lang_queries["class_query"])
            for match in cursor.matches(root_node):
                class_node = None
                class_name = None

                # matches() yields (pattern_index, captures) tuples.
                captures = match[1]
                for capture_name, nodes in captures.items():
                    for node in nodes:
                        # Interfaces and type aliases are ingested as Class too.
                        if capture_name in ["class", "interface", "type_alias"]:
                            class_node = node
                        elif capture_name == "class_name" and node.text:
                            class_name = node.text.decode("utf-8")

                if class_node and class_name:
                    class_qn = f"{module_qn}.{class_name}"

                    # Extract decorators
                    decorators = self._extract_decorators(class_node, language)

                    # Extract docstring
                    docstring = self._extract_docstring(class_node, language)

                    current_time = int(time.time())
                    # tree-sitter rows are 0-based; graph stores 1-based lines.
                    self.ingestor.ensure_node_batch(
                        "Class",
                        {
                            "qualified_name": class_qn,
                            "name": class_name,
                            "decorators": decorators,
                            "line_start": class_node.start_point.row + 1,
                            "line_end": class_node.end_point.row + 1,
                            "created_at": current_time,
                            "updated_at": current_time,
                            "docstring": docstring,
                        },
                    )

                    # Create DEFINES relationship
                    logger.debug(
                        f"Creating DEFINES relationship: Module({module_qn}) -> Class({class_qn})"
                    )
                    self.ingestor.ensure_relationship_batch(
                        "Module",
                        "qualified_name",
                        module_qn,
                        "DEFINES",
                        "Class",
                        "qualified_name",
                        class_qn,
                    )

                    # Track class against its file for change invalidation.
                    self.ingestor.ensure_tracks_relationship(
                        relative_path_str, "Class", class_qn
                    )

                    # Register for later call/inheritance resolution.
                    self.function_registry[class_qn] = "Class"
                    self.simple_name_lookup[class_name].add(class_qn)

                    # Record parent names; resolved in _process_inheritance.
                    parent_names = self._extract_inheritance(class_node, language)
                    if parent_names:
                        self.class_inheritance[class_qn] = parent_names

        # Extract functions (both module-level functions and class methods;
        # the distinction is made per match via _find_parent_class).
        if "function_query" in lang_queries:
            cursor = QueryCursor(lang_queries["function_query"])
            matches = list(cursor.matches(root_node))
            logger.debug(f"Found {len(matches)} function matches in {filepath}")
            for match in matches:
                func_node = None
                func_name = None

                captures = match[1]  # Get captures dictionary from tuple
                for capture_name, nodes in captures.items():
                    for node in nodes:
                        if capture_name == "function":
                            func_node = node
                        elif capture_name == "function_name" and node.text:
                            func_name = node.text.decode("utf-8")

                if func_node and func_name:
                    # Log what we found
                    logger.debug(
                        f"Found function: {func_name} at line {func_node.start_point.row + 1}"
                    )

                    # Check if this is a method inside a class
                    parent_class = self._find_parent_class(func_node, module_qn)

                    if parent_class:
                        # This is a method: qualify by the enclosing class.
                        method_qn = f"{parent_class}.{func_name}"
                        decorators = self._extract_decorators(func_node, language)

                        # Extract docstring
                        docstring = self._extract_docstring(func_node, language)

                        current_time = int(time.time())
                        self.ingestor.ensure_node_batch(
                            "Method",
                            {
                                "qualified_name": method_qn,
                                "name": func_name,
                                "decorators": decorators,
                                "line_start": func_node.start_point.row + 1,
                                "line_end": func_node.end_point.row + 1,
                                "created_at": current_time,
                                "updated_at": current_time,
                                "docstring": docstring,
                            },
                        )

                        # Create DEFINES_METHOD relationship
                        self.ingestor.ensure_relationship_batch(
                            "Class",
                            "qualified_name",
                            parent_class,
                            "DEFINES_METHOD",
                            "Method",
                            "qualified_name",
                            method_qn,
                        )

                        # Track method
                        self.ingestor.ensure_tracks_relationship(
                            relative_path_str, "Method", method_qn
                        )

                        # Register for lookup
                        self.function_registry[method_qn] = "Method"
                        self.simple_name_lookup[func_name].add(method_qn)
                    else:
                        # This is a standalone function: qualify by the module.
                        func_qn = f"{module_qn}.{func_name}"
                        decorators = self._extract_decorators(func_node, language)

                        # Extract docstring
                        docstring = self._extract_docstring(func_node, language)

                        current_time = int(time.time())
                        self.ingestor.ensure_node_batch(
                            "Function",
                            {
                                "qualified_name": func_qn,
                                "name": func_name,
                                "decorators": decorators,
                                "line_start": func_node.start_point.row + 1,
                                "line_end": func_node.end_point.row + 1,
                                "created_at": current_time,
                                "updated_at": current_time,
                                "docstring": docstring,
                            },
                        )

                        # Create DEFINES relationship (edge type DEFINES_FUNC).
                        self.ingestor.ensure_relationship_batch(
                            "Module",
                            "qualified_name",
                            module_qn,
                            "DEFINES_FUNC",
                            "Function",
                            "qualified_name",
                            func_qn,
                        )

                        # Track function
                        self.ingestor.ensure_tracks_relationship(
                            relative_path_str, "Function", func_qn
                        )

                        # Register for lookup
                        self.function_registry[func_qn] = "Function"
                        self.simple_name_lookup[func_name].add(func_qn)
+ def _extract_decorators(self, node: Node, language: str) -> list[str]:
1030
+ """Extract decorators from a function/class node."""
1031
+ decorators = []
1032
+
1033
+ if language == "python":
1034
+ # Look for decorator nodes
1035
+ for child in node.children:
1036
+ if child.type == "decorator":
1037
+ # Extract decorator name
1038
+ for grandchild in child.children:
1039
+ if grandchild.type == "identifier" and grandchild.text:
1040
+ decorators.append(grandchild.text.decode("utf-8"))
1041
+ break
1042
+ elif grandchild.type == "attribute":
1043
+ # Handle @module.decorator
1044
+ attr_node = grandchild.child_by_field_name("attribute")
1045
+ if attr_node and attr_node.text:
1046
+ decorators.append(attr_node.text.decode("utf-8"))
1047
+ break
1048
+
1049
+ return decorators
1050
+
1051
+ def _extract_docstring(self, node: Node, language: str) -> str | None:
1052
+ """Extract docstring from function/class node."""
1053
+ if language == "python":
1054
+ # Get the body node
1055
+ body_node = node.child_by_field_name("body")
1056
+ if not body_node or not body_node.children:
1057
+ return None
1058
+
1059
+ # Check if first statement is a string (docstring)
1060
+ first_statement = body_node.children[0]
1061
+ if first_statement.type == "expression_statement":
1062
+ # Check if it contains a string
1063
+ for child in first_statement.children:
1064
+ if child.type == "string" and child.text:
1065
+ # Extract and clean the docstring
1066
+ docstring = child.text.decode("utf-8")
1067
+ # Remove quotes (handle various quote styles)
1068
+ docstring = docstring.strip()
1069
+ if (
1070
+ docstring.startswith('"""')
1071
+ and docstring.endswith('"""')
1072
+ or docstring.startswith("'''")
1073
+ and docstring.endswith("'''")
1074
+ ):
1075
+ docstring = docstring[3:-3]
1076
+ elif (
1077
+ docstring.startswith('"')
1078
+ and docstring.endswith('"')
1079
+ or docstring.startswith("'")
1080
+ and docstring.endswith("'")
1081
+ ):
1082
+ docstring = docstring[1:-1]
1083
+ return docstring.strip()
1084
+ # Add support for other languages later
1085
+ return None
1086
+
1087
+ def _extract_inheritance(self, class_node: Node, language: str) -> list[str]:
1088
+ """Extract parent class names from class definition."""
1089
+ parent_names = []
1090
+
1091
+ if language == "python":
1092
+ # Look for argument_list in class definition
1093
+ for child in class_node.children:
1094
+ if child.type == "argument_list":
1095
+ # Each argument is a parent class
1096
+ for arg in child.children:
1097
+ if arg.type == "identifier" and arg.text:
1098
+ parent_names.append(arg.text.decode("utf-8"))
1099
+ elif arg.type == "attribute":
1100
+ # Handle module.Class inheritance
1101
+ full_name_parts: list[str] = []
1102
+ self._extract_full_name(arg, full_name_parts)
1103
+ if full_name_parts:
1104
+ parent_names.append(".".join(full_name_parts))
1105
+
1106
+ return parent_names
1107
+
1108
+ def _extract_full_name(self, node: Node, parts: list[str]) -> None:
1109
+ """Recursively extract full qualified name from attribute access."""
1110
+ if node.type == "identifier" and node.text:
1111
+ parts.insert(0, node.text.decode("utf-8"))
1112
+ elif node.type == "attribute":
1113
+ # Get attribute name
1114
+ attr_node = node.child_by_field_name("attribute")
1115
+ if attr_node and attr_node.text:
1116
+ parts.insert(0, attr_node.text.decode("utf-8"))
1117
+
1118
+ # Get object name
1119
+ obj_node = node.child_by_field_name("object")
1120
+ if obj_node:
1121
+ self._extract_full_name(obj_node, parts)
1122
+
1123
+ def _find_parent_class(self, func_node: Node, module_qn: str) -> str | None:
1124
+ """Find the parent class of a function node."""
1125
+ # Walk up the tree to find containing class
1126
+ current = func_node.parent
1127
+
1128
+ while current:
1129
+ if current.type in ["class_definition", "class_declaration"]:
1130
+ # Extract class name
1131
+ for child in current.children:
1132
+ if child.type == "identifier" and child.text:
1133
+ class_name = child.text.decode("utf-8")
1134
+ return f"{module_qn}.{class_name}"
1135
+
1136
+ current = current.parent
1137
+
1138
+ return None
1139
+
1140
+ def _process_relationships(self) -> None:
1141
+ """Third pass: Process function calls and imports."""
1142
+ # Process inheritance relationships first
1143
+ self._process_inheritance()
1144
+
1145
+ # Then process function calls
1146
+ logger.info(f"Processing function calls for {len(self.ast_cache)} files...")
1147
+ logger.info(f"Function registry has {len(self.function_registry)} entries")
1148
+ logger.info(
1149
+ f"Simple name lookup has {len(self.simple_name_lookup)} unique names"
1150
+ )
1151
+
1152
+ # Log some examples from simple_name_lookup
1153
+ if self.simple_name_lookup:
1154
+ example_names = list(self.simple_name_lookup.keys())[:5]
1155
+ for name in example_names:
1156
+ logger.debug(
1157
+ f" Example: '{name}' -> {list(self.simple_name_lookup[name])[:3]}"
1158
+ )
1159
+
1160
+ for filepath, (root_node, language) in self.ast_cache.items():
1161
+ self._process_calls(filepath, root_node, language)
1162
+ # NOTE: Add import processing. wtf does this mean?
1163
+
1164
+ def _process_inheritance(self) -> None:
1165
+ """Process inheritance relationships between classes."""
1166
+ logger.info("Processing inheritance relationships...")
1167
+
1168
+ for child_qn, parent_qns in self.class_inheritance.items():
1169
+ for parent_qn in parent_qns:
1170
+ # Check if parent exists in our registry
1171
+ if parent_qn in self.function_registry:
1172
+ # Create INHERITS relationship
1173
+ self.ingestor.ensure_relationship_batch(
1174
+ "Class",
1175
+ "qualified_name",
1176
+ child_qn,
1177
+ "INHERITS",
1178
+ "Class",
1179
+ "qualified_name",
1180
+ parent_qn,
1181
+ )
1182
+ logger.debug(
1183
+ f" Created inheritance: {child_qn} INHERITS {parent_qn}"
1184
+ )
1185
+ else:
1186
+ # Try to find parent by simple name lookup
1187
+ parent_simple_name = parent_qn.split(".")[-1]
1188
+ possible_parents = self.simple_name_lookup.get(
1189
+ parent_simple_name, set()
1190
+ )
1191
+
1192
+ # If we find exactly one match, use it
1193
+ if len(possible_parents) == 1:
1194
+ actual_parent_qn = list(possible_parents)[0]
1195
+ self.ingestor.ensure_relationship_batch(
1196
+ "Class",
1197
+ "qualified_name",
1198
+ child_qn,
1199
+ "INHERITS",
1200
+ "Class",
1201
+ "qualified_name",
1202
+ actual_parent_qn,
1203
+ )
1204
+ logger.debug(
1205
+ f" Created inheritance: {child_qn} INHERITS {actual_parent_qn}"
1206
+ )
1207
+ else:
1208
+ logger.debug(
1209
+ f" Could not resolve parent class: {parent_qn} for {child_qn}"
1210
+ )
1211
+
1212
+ def _process_calls(self, filepath: Path, root_node: Node, language: str) -> None:
1213
+ """Process function calls in a file."""
1214
+ lang_queries = self.queries.get(language, {})
1215
+
1216
+ if "call_query" not in lang_queries:
1217
+ return
1218
+
1219
+ # Get the module qualified name
1220
+ relative_path = filepath.relative_to(self.repo_path)
1221
+ if filepath.name == "__init__.py":
1222
+ module_qn = ".".join([self.project_name] + list(relative_path.parent.parts))
1223
+ else:
1224
+ module_qn = ".".join(
1225
+ [self.project_name] + list(relative_path.with_suffix("").parts)
1226
+ )
1227
+
1228
+ # Find all call expressions
1229
+ cursor = QueryCursor(lang_queries["call_query"])
1230
+ matches = list(cursor.matches(root_node))
1231
+ logger.debug(f"Found {len(matches)} call matches in {filepath}")
1232
+ for match in matches:
1233
+ call_node = None
1234
+
1235
+ captures = match[1] # Get captures dictionary from tuple
1236
+ for capture_name, nodes in captures.items():
1237
+ for node in nodes:
1238
+ if capture_name == "call":
1239
+ call_node = node
1240
+ break
1241
+
1242
+ if call_node:
1243
+ self._process_single_call(call_node, module_qn, language)
1244
+
1245
    def _process_single_call(
        self, call_node: Node, module_qn: str, language: str
    ) -> None:
        """Resolve one call expression and record a CALLS edge.

        Pipeline: extract the callee name (and receiver for method calls),
        find the enclosing caller function, score every registered function
        sharing the callee's simple name, and link the caller to the
        highest-confidence candidate. Unresolvable steps log and bail out.
        """
        # Extract called function name and context (simplified)
        callee_name = None
        object_name = None  # For method calls like obj.method()

        if language in ["python", "javascript", "typescript"]:
            # Look for function/method name
            for child in call_node.children:
                if child.type == "identifier" and child.text:
                    # Plain call: foo(...)
                    callee_name = child.text.decode("utf-8")
                    break
                elif child.type == "attribute":
                    # Method call: obj.method(...) — keep both parts.
                    obj_node = child.child_by_field_name("object")
                    attr_node = child.child_by_field_name("attribute")
                    if obj_node and obj_node.text:
                        object_name = obj_node.text.decode("utf-8")
                    if attr_node and attr_node.text:
                        callee_name = attr_node.text.decode("utf-8")
                    break

        if not callee_name:
            logger.debug(
                f" Could not extract callee name from call at line {call_node.start_point[0]}"
            )
            return

        logger.debug(f" Processing call to {callee_name} (object: {object_name})")

        # Find caller function; calls at module top level are skipped.
        caller_qn = self._find_containing_function(call_node, module_qn)
        if not caller_qn:
            logger.debug(
                f" Could not find containing function for call at line {call_node.start_point[0]}"
            )
            return

        # Get all possible callees sharing this simple name.
        possible_callees = self.simple_name_lookup.get(callee_name, set())
        if not possible_callees:
            logger.debug(f" No functions found with name: {callee_name}")
            return

        logger.debug(
            f" Found {len(possible_callees)} possible callees for {callee_name}"
        )

        # Calculate confidence scores for each possible callee
        scored_callees = []
        for possible_qn in possible_callees:
            score = self._calculate_callee_confidence(
                caller_qn, possible_qn, module_qn, object_name
            )
            scored_callees.append((possible_qn, score))

        # Sort by confidence score (highest first)
        scored_callees.sort(key=lambda x: x[1], reverse=True)

        # Use the highest confidence match
        callee_qn, confidence = scored_callees[0]

        # Both endpoints must be registered to know their node labels.
        caller_type = self.function_registry.get(caller_qn)
        callee_type = self.function_registry.get(callee_qn)

        if caller_type and callee_type:
            # Create the primary CALLS relationship
            self.ingestor.ensure_relationship_batch(
                caller_type,
                "qualified_name",
                caller_qn,
                "CALLS",
                callee_type,
                "qualified_name",
                callee_qn,
            )

            # Log with confidence information
            alternatives = len(scored_callees) - 1
            logger.info(
                f" Created CALLS relationship: {caller_qn} -> {callee_qn} (confidence: {confidence:.2f}, alternatives: {alternatives})"
            )

            # If multiple alternatives exist with similar confidence, log them
            if alternatives > 0 and confidence < 1.0:
                # Up to the next 3 candidates scoring within 80% of the best.
                similar_alternatives = [
                    qn for qn, score in scored_callees[1:4] if score >= confidence * 0.8
                ]
                if similar_alternatives:
                    logger.debug(
                        f" Alternative matches: {', '.join(similar_alternatives)}"
                    )
        else:
            logger.warning(
                f" Failed to create CALLS relationship - caller_type: {caller_type}, callee_type: {callee_type}"
            )
+ def _calculate_callee_confidence(
1346
+ self, caller_qn: str, callee_qn: str, module_qn: str, object_name: str | None
1347
+ ) -> float:
1348
+ """Calculate confidence score for a potential callee match.
1349
+
1350
+ Args:
1351
+ caller_qn: Qualified name of the calling function
1352
+ callee_qn: Qualified name of the potential callee
1353
+ module_qn: Qualified name of the current module
1354
+ object_name: Object name for method calls (e.g., 'obj' in obj.method())
1355
+
1356
+ Returns:
1357
+ Confidence score between 0.0 and 1.0
1358
+ """
1359
+ score = 0.0
1360
+
1361
+ # 1. Module locality - functions in the same module are most likely
1362
+ if callee_qn.startswith(module_qn + "."):
1363
+ score += 0.5
1364
+
1365
+ # Even higher if in the same class
1366
+ caller_parts = caller_qn.split(".")
1367
+ callee_parts = callee_qn.split(".")
1368
+ if len(caller_parts) >= 3 and len(callee_parts) >= 3:
1369
+ if caller_parts[:-1] == callee_parts[:-1]: # Same class
1370
+ score += 0.2
1371
+
1372
+ # 2. Package locality - functions in the same package hierarchy
1373
+ elif "." in module_qn:
1374
+ package = module_qn.rsplit(".", 1)[0]
1375
+ if callee_qn.startswith(package + "."):
1376
+ score += 0.3
1377
+
1378
+ # 3. Object/class match for method calls
1379
+ if object_name:
1380
+ # Check if callee is a method of a class matching the object name
1381
+ callee_parts = callee_qn.split(".")
1382
+ if len(callee_parts) >= 2:
1383
+ # Simple heuristic: check if class name matches object name
1384
+ # (In reality, we'd need type inference for accuracy)
1385
+ class_name = callee_parts[-2]
1386
+ if class_name.lower() == object_name.lower():
1387
+ score += 0.3
1388
+ elif object_name == "self" and callee_qn.startswith(
1389
+ caller_qn.rsplit(".", 1)[0]
1390
+ ):
1391
+ # 'self' refers to the same class
1392
+ score += 0.4
1393
+
1394
+ # 4. Import presence check (simplified - would need import tracking)
1395
+ # For now, we'll give a small boost to standard library functions
1396
+ if callee_qn.startswith(("builtins.", "typing.", "collections.")):
1397
+ score += 0.1
1398
+
1399
+ # 5. Name similarity for disambiguation
1400
+ # If function names are unique enough, boost confidence
1401
+ possible_count = len(
1402
+ self.simple_name_lookup.get(callee_qn.split(".")[-1], set())
1403
+ )
1404
+ if possible_count == 1:
1405
+ score += 0.2
1406
+ elif possible_count <= 3:
1407
+ score += 0.1
1408
+
1409
+ # Normalize to [0, 1]
1410
+ return min(score, 1.0)
1411
+
1412
+ def _find_containing_function(self, node: Node, module_qn: str) -> str | None:
1413
+ """Find the containing function/method of a node."""
1414
+ current = node.parent
1415
+
1416
+ while current:
1417
+ if current.type in [
1418
+ "function_definition",
1419
+ "method_definition",
1420
+ "arrow_function",
1421
+ ]:
1422
+ # Extract function name
1423
+ for child in current.children:
1424
+ if child.type == "identifier" and child.text:
1425
+ func_name = child.text.decode("utf-8")
1426
+
1427
+ # Check if this is inside a class
1428
+ parent_class = self._find_parent_class(current, module_qn)
1429
+ if parent_class:
1430
+ return f"{parent_class}.{func_name}"
1431
+ else:
1432
+ return f"{module_qn}.{func_name}"
1433
+
1434
+ current = current.parent
1435
+
1436
+ return None
1437
+
1438
+
1439
+ class CodebaseIngestor:
1440
+ """Main ingestor class for building code knowledge graphs."""
1441
+
1442
+ def __init__(
1443
+ self,
1444
+ db_path: str,
1445
+ project_name: str | None = None,
1446
+ exclude_patterns: list[str] | None = None,
1447
+ ):
1448
+ """Initialize the ingestor.
1449
+
1450
+ Args:
1451
+ db_path: Path to Kuzu database
1452
+ project_name: Optional project name
1453
+ exclude_patterns: Patterns to exclude from processing
1454
+ """
1455
+ self.db_path = Path(db_path)
1456
+ self.project_name = project_name
1457
+ self.exclude_patterns = exclude_patterns or []
1458
+
1459
+ def build_graph_from_directory(self, repo_path: str) -> None:
1460
+ """Build a code knowledge graph from a directory.
1461
+
1462
+ Args:
1463
+ repo_path: Path to repository directory
1464
+ """
1465
+ repo_path_obj = Path(repo_path)
1466
+
1467
+ # Use directory name as project name if not specified
1468
+ if not self.project_name:
1469
+ self.project_name = repo_path_obj.name
1470
+
1471
+ try:
1472
+ # Create database
1473
+ logger.info(f"Creating Kuzu database at: {self.db_path}")
1474
+ db = kuzu.Database(str(self.db_path))
1475
+ conn = kuzu.Connection(db)
1476
+
1477
+ # Initialize ingestor
1478
+ ingestor = Ingestor(conn)
1479
+ ingestor.create_schema()
1480
+
1481
+ # Load parsers
1482
+ logger.info("Loading language parsers...")
1483
+ parsers, queries = load_parsers()
1484
+
1485
+ # Build graph
1486
+ builder = SimpleGraphBuilder(
1487
+ ingestor, repo_path_obj, parsers, queries, self.exclude_patterns
1488
+ )
1489
+ if self.project_name:
1490
+ builder.project_name = self.project_name
1491
+ builder.run()
1492
+
1493
+ logger.info(f"Graph successfully created at: {self.db_path}")
1494
+
1495
+ except Exception as e:
1496
+ logger.error(f"Failed to build graph: {e}")
1497
+ raise