shotgun-sh 0.2.29.dev2__py3-none-any.whl → 0.6.1.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of shotgun-sh might be problematic. Click here for more details.
- shotgun/agents/agent_manager.py +497 -30
- shotgun/agents/cancellation.py +103 -0
- shotgun/agents/common.py +90 -77
- shotgun/agents/config/README.md +0 -1
- shotgun/agents/config/manager.py +52 -8
- shotgun/agents/config/models.py +48 -45
- shotgun/agents/config/provider.py +44 -29
- shotgun/agents/conversation/history/file_content_deduplication.py +66 -43
- shotgun/agents/conversation/history/token_counting/base.py +51 -9
- shotgun/agents/export.py +12 -13
- shotgun/agents/file_read.py +176 -0
- shotgun/agents/messages.py +15 -3
- shotgun/agents/models.py +90 -2
- shotgun/agents/plan.py +12 -13
- shotgun/agents/research.py +13 -10
- shotgun/agents/router/__init__.py +47 -0
- shotgun/agents/router/models.py +384 -0
- shotgun/agents/router/router.py +185 -0
- shotgun/agents/router/tools/__init__.py +18 -0
- shotgun/agents/router/tools/delegation_tools.py +557 -0
- shotgun/agents/router/tools/plan_tools.py +403 -0
- shotgun/agents/runner.py +17 -2
- shotgun/agents/specify.py +12 -13
- shotgun/agents/tasks.py +12 -13
- shotgun/agents/tools/__init__.py +8 -0
- shotgun/agents/tools/codebase/directory_lister.py +27 -39
- shotgun/agents/tools/codebase/file_read.py +26 -35
- shotgun/agents/tools/codebase/query_graph.py +9 -0
- shotgun/agents/tools/codebase/retrieve_code.py +9 -0
- shotgun/agents/tools/file_management.py +81 -3
- shotgun/agents/tools/file_read_tools/__init__.py +7 -0
- shotgun/agents/tools/file_read_tools/multimodal_file_read.py +167 -0
- shotgun/agents/tools/markdown_tools/__init__.py +62 -0
- shotgun/agents/tools/markdown_tools/insert_section.py +148 -0
- shotgun/agents/tools/markdown_tools/models.py +86 -0
- shotgun/agents/tools/markdown_tools/remove_section.py +114 -0
- shotgun/agents/tools/markdown_tools/replace_section.py +119 -0
- shotgun/agents/tools/markdown_tools/utils.py +453 -0
- shotgun/agents/tools/registry.py +41 -0
- shotgun/agents/tools/web_search/__init__.py +1 -2
- shotgun/agents/tools/web_search/gemini.py +1 -3
- shotgun/agents/tools/web_search/openai.py +42 -23
- shotgun/attachments/__init__.py +41 -0
- shotgun/attachments/errors.py +60 -0
- shotgun/attachments/models.py +107 -0
- shotgun/attachments/parser.py +257 -0
- shotgun/attachments/processor.py +193 -0
- shotgun/cli/clear.py +2 -2
- shotgun/cli/codebase/commands.py +181 -65
- shotgun/cli/compact.py +2 -2
- shotgun/cli/context.py +2 -2
- shotgun/cli/run.py +90 -0
- shotgun/cli/spec/backup.py +2 -1
- shotgun/cli/spec/commands.py +2 -0
- shotgun/cli/spec/models.py +18 -0
- shotgun/cli/spec/pull_service.py +122 -68
- shotgun/codebase/__init__.py +2 -0
- shotgun/codebase/benchmarks/__init__.py +35 -0
- shotgun/codebase/benchmarks/benchmark_runner.py +309 -0
- shotgun/codebase/benchmarks/exporters.py +119 -0
- shotgun/codebase/benchmarks/formatters/__init__.py +49 -0
- shotgun/codebase/benchmarks/formatters/base.py +34 -0
- shotgun/codebase/benchmarks/formatters/json_formatter.py +106 -0
- shotgun/codebase/benchmarks/formatters/markdown.py +136 -0
- shotgun/codebase/benchmarks/models.py +129 -0
- shotgun/codebase/core/__init__.py +4 -0
- shotgun/codebase/core/call_resolution.py +91 -0
- shotgun/codebase/core/change_detector.py +11 -6
- shotgun/codebase/core/errors.py +159 -0
- shotgun/codebase/core/extractors/__init__.py +23 -0
- shotgun/codebase/core/extractors/base.py +138 -0
- shotgun/codebase/core/extractors/factory.py +63 -0
- shotgun/codebase/core/extractors/go/__init__.py +7 -0
- shotgun/codebase/core/extractors/go/extractor.py +122 -0
- shotgun/codebase/core/extractors/javascript/__init__.py +7 -0
- shotgun/codebase/core/extractors/javascript/extractor.py +132 -0
- shotgun/codebase/core/extractors/protocol.py +109 -0
- shotgun/codebase/core/extractors/python/__init__.py +7 -0
- shotgun/codebase/core/extractors/python/extractor.py +141 -0
- shotgun/codebase/core/extractors/rust/__init__.py +7 -0
- shotgun/codebase/core/extractors/rust/extractor.py +139 -0
- shotgun/codebase/core/extractors/types.py +15 -0
- shotgun/codebase/core/extractors/typescript/__init__.py +7 -0
- shotgun/codebase/core/extractors/typescript/extractor.py +92 -0
- shotgun/codebase/core/gitignore.py +252 -0
- shotgun/codebase/core/ingestor.py +644 -354
- shotgun/codebase/core/kuzu_compat.py +119 -0
- shotgun/codebase/core/language_config.py +239 -0
- shotgun/codebase/core/manager.py +256 -46
- shotgun/codebase/core/metrics_collector.py +310 -0
- shotgun/codebase/core/metrics_types.py +347 -0
- shotgun/codebase/core/parallel_executor.py +424 -0
- shotgun/codebase/core/work_distributor.py +254 -0
- shotgun/codebase/core/worker.py +768 -0
- shotgun/codebase/indexing_state.py +86 -0
- shotgun/codebase/models.py +94 -0
- shotgun/codebase/service.py +13 -0
- shotgun/exceptions.py +1 -1
- shotgun/main.py +2 -10
- shotgun/prompts/agents/export.j2 +2 -0
- shotgun/prompts/agents/file_read.j2 +48 -0
- shotgun/prompts/agents/partials/common_agent_system_prompt.j2 +20 -28
- shotgun/prompts/agents/partials/content_formatting.j2 +12 -33
- shotgun/prompts/agents/partials/interactive_mode.j2 +9 -32
- shotgun/prompts/agents/partials/router_delegation_mode.j2 +35 -0
- shotgun/prompts/agents/plan.j2 +43 -1
- shotgun/prompts/agents/research.j2 +75 -20
- shotgun/prompts/agents/router.j2 +713 -0
- shotgun/prompts/agents/specify.j2 +94 -4
- shotgun/prompts/agents/state/codebase/codebase_graphs_available.j2 +14 -1
- shotgun/prompts/agents/state/system_state.j2 +24 -15
- shotgun/prompts/agents/tasks.j2 +77 -23
- shotgun/settings.py +44 -0
- shotgun/shotgun_web/shared_specs/upload_pipeline.py +38 -0
- shotgun/tui/app.py +90 -23
- shotgun/tui/commands/__init__.py +9 -1
- shotgun/tui/components/attachment_bar.py +87 -0
- shotgun/tui/components/mode_indicator.py +120 -25
- shotgun/tui/components/prompt_input.py +23 -28
- shotgun/tui/components/status_bar.py +5 -4
- shotgun/tui/dependencies.py +58 -8
- shotgun/tui/protocols.py +37 -0
- shotgun/tui/screens/chat/chat.tcss +24 -1
- shotgun/tui/screens/chat/chat_screen.py +1374 -211
- shotgun/tui/screens/chat/codebase_index_prompt_screen.py +8 -4
- shotgun/tui/screens/chat_screen/attachment_hint.py +40 -0
- shotgun/tui/screens/chat_screen/command_providers.py +0 -97
- shotgun/tui/screens/chat_screen/history/agent_response.py +7 -3
- shotgun/tui/screens/chat_screen/history/chat_history.py +49 -6
- shotgun/tui/screens/chat_screen/history/formatters.py +75 -15
- shotgun/tui/screens/chat_screen/history/partial_response.py +11 -1
- shotgun/tui/screens/chat_screen/history/user_question.py +25 -3
- shotgun/tui/screens/chat_screen/messages.py +219 -0
- shotgun/tui/screens/database_locked_dialog.py +219 -0
- shotgun/tui/screens/database_timeout_dialog.py +158 -0
- shotgun/tui/screens/kuzu_error_dialog.py +135 -0
- shotgun/tui/screens/model_picker.py +14 -9
- shotgun/tui/screens/models.py +11 -0
- shotgun/tui/screens/shotgun_auth.py +50 -0
- shotgun/tui/screens/spec_pull.py +2 -0
- shotgun/tui/state/processing_state.py +19 -0
- shotgun/tui/utils/mode_progress.py +20 -86
- shotgun/tui/widgets/__init__.py +2 -1
- shotgun/tui/widgets/approval_widget.py +152 -0
- shotgun/tui/widgets/cascade_confirmation_widget.py +203 -0
- shotgun/tui/widgets/plan_panel.py +129 -0
- shotgun/tui/widgets/step_checkpoint_widget.py +180 -0
- shotgun/tui/widgets/widget_coordinator.py +18 -0
- shotgun/utils/file_system_utils.py +4 -1
- {shotgun_sh-0.2.29.dev2.dist-info → shotgun_sh-0.6.1.dev1.dist-info}/METADATA +88 -34
- shotgun_sh-0.6.1.dev1.dist-info/RECORD +292 -0
- shotgun/cli/export.py +0 -81
- shotgun/cli/plan.py +0 -73
- shotgun/cli/research.py +0 -93
- shotgun/cli/specify.py +0 -70
- shotgun/cli/tasks.py +0 -78
- shotgun/tui/screens/onboarding.py +0 -580
- shotgun_sh-0.2.29.dev2.dist-info/RECORD +0 -229
- {shotgun_sh-0.2.29.dev2.dist-info → shotgun_sh-0.6.1.dev1.dist-info}/WHEEL +0 -0
- {shotgun_sh-0.2.29.dev2.dist-info → shotgun_sh-0.6.1.dev1.dist-info}/entry_points.txt +0 -0
- {shotgun_sh-0.2.29.dev2.dist-info → shotgun_sh-0.6.1.dev1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,80 +1,73 @@
|
|
|
1
1
|
"""Kuzu graph ingestor for building code knowledge graphs."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import asyncio
|
|
4
6
|
import hashlib
|
|
7
|
+
import multiprocessing
|
|
5
8
|
import os
|
|
6
9
|
import time
|
|
7
10
|
import uuid
|
|
8
11
|
from collections import defaultdict
|
|
9
|
-
from collections.abc import Callable
|
|
12
|
+
from collections.abc import Callable, Generator
|
|
13
|
+
from contextlib import contextmanager
|
|
10
14
|
from pathlib import Path
|
|
11
|
-
from typing import Any
|
|
15
|
+
from typing import TYPE_CHECKING, Any
|
|
12
16
|
|
|
13
17
|
import aiofiles
|
|
14
|
-
import kuzu
|
|
15
18
|
from tree_sitter import Node, Parser, QueryCursor
|
|
16
19
|
|
|
17
|
-
from shotgun.codebase.core.
|
|
20
|
+
from shotgun.codebase.core.call_resolution import calculate_callee_confidence
|
|
21
|
+
from shotgun.codebase.core.gitignore import GitignoreManager
|
|
22
|
+
from shotgun.codebase.core.kuzu_compat import get_kuzu
|
|
23
|
+
from shotgun.codebase.core.metrics_collector import MetricsCollector
|
|
24
|
+
from shotgun.codebase.core.metrics_types import (
|
|
25
|
+
FileInfo,
|
|
26
|
+
IndexingPhase,
|
|
27
|
+
ParallelExecutionResult,
|
|
28
|
+
)
|
|
29
|
+
from shotgun.codebase.core.parallel_executor import ParallelExecutor
|
|
30
|
+
from shotgun.codebase.core.work_distributor import WorkDistributor, get_worker_count
|
|
31
|
+
from shotgun.codebase.models import (
|
|
32
|
+
IgnoreReason,
|
|
33
|
+
IndexingStats,
|
|
34
|
+
IndexProgress,
|
|
35
|
+
NodeLabel,
|
|
36
|
+
ProgressPhase,
|
|
37
|
+
RelationshipType,
|
|
38
|
+
)
|
|
39
|
+
from shotgun.posthog_telemetry import track_event
|
|
40
|
+
from shotgun.settings import settings
|
|
41
|
+
|
|
42
|
+
if TYPE_CHECKING:
|
|
43
|
+
import real_ladybug as kuzu
|
|
44
|
+
|
|
45
|
+
from shotgun.codebase.core.language_config import (
|
|
46
|
+
LANGUAGE_CONFIGS,
|
|
47
|
+
get_all_ignore_directories,
|
|
48
|
+
get_language_config,
|
|
49
|
+
is_path_ignored,
|
|
50
|
+
should_ignore_directory,
|
|
51
|
+
)
|
|
18
52
|
from shotgun.codebase.core.parser_loader import load_parsers
|
|
19
53
|
from shotgun.logging_config import get_logger
|
|
20
54
|
|
|
21
55
|
logger = get_logger(__name__)
|
|
22
56
|
|
|
57
|
+
# For backwards compatibility, expose IGNORE_PATTERNS from this module
|
|
58
|
+
IGNORE_PATTERNS = get_all_ignore_directories()
|
|
23
59
|
|
|
24
|
-
#
|
|
25
|
-
|
|
26
|
-
"
|
|
27
|
-
"
|
|
28
|
-
"
|
|
29
|
-
"
|
|
30
|
-
"
|
|
31
|
-
"
|
|
32
|
-
"
|
|
33
|
-
"
|
|
34
|
-
|
|
35
|
-
".idea",
|
|
36
|
-
".vscode",
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
# Well-known build output directories to skip when determining source files
|
|
40
|
-
BUILD_ARTIFACT_DIRECTORIES = {
|
|
41
|
-
"node_modules",
|
|
42
|
-
".next",
|
|
43
|
-
".nuxt",
|
|
44
|
-
".vite",
|
|
45
|
-
".yarn",
|
|
46
|
-
".svelte-kit",
|
|
47
|
-
".output",
|
|
48
|
-
".turbo",
|
|
49
|
-
".parcel-cache",
|
|
50
|
-
".vercel",
|
|
51
|
-
".serverless",
|
|
52
|
-
"build",
|
|
53
|
-
"dist",
|
|
54
|
-
"out",
|
|
55
|
-
"tmp",
|
|
56
|
-
"coverage",
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
# Default ignore patterns combines base directories and build artifacts
|
|
60
|
-
IGNORE_PATTERNS = BASE_IGNORE_DIRECTORIES | BUILD_ARTIFACT_DIRECTORIES
|
|
61
|
-
|
|
62
|
-
# Directory prefixes that should always be ignored
|
|
63
|
-
IGNORED_DIRECTORY_PREFIXES = (".",)
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def should_ignore_directory(name: str, ignore_patterns: set[str] | None = None) -> bool:
|
|
67
|
-
"""Return True if the directory name should be ignored."""
|
|
68
|
-
patterns = IGNORE_PATTERNS if ignore_patterns is None else ignore_patterns
|
|
69
|
-
if name in patterns:
|
|
70
|
-
return True
|
|
71
|
-
return name.startswith(IGNORED_DIRECTORY_PREFIXES)
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def is_path_ignored(path: Path, ignore_patterns: set[str] | None = None) -> bool:
|
|
75
|
-
"""Return True if any part of the path should be ignored."""
|
|
76
|
-
patterns = IGNORE_PATTERNS if ignore_patterns is None else ignore_patterns
|
|
77
|
-
return any(should_ignore_directory(part, patterns) for part in path.parts)
|
|
60
|
+
# Explicit re-exports for type checkers
|
|
61
|
+
__all__ = [
|
|
62
|
+
"IGNORE_PATTERNS",
|
|
63
|
+
"LANGUAGE_CONFIGS",
|
|
64
|
+
"Ingestor",
|
|
65
|
+
"SimpleGraphBuilder",
|
|
66
|
+
"get_all_ignore_directories",
|
|
67
|
+
"get_language_config",
|
|
68
|
+
"is_path_ignored",
|
|
69
|
+
"should_ignore_directory",
|
|
70
|
+
]
|
|
78
71
|
|
|
79
72
|
|
|
80
73
|
class Ingestor:
|
|
@@ -87,6 +80,8 @@ class Ingestor:
|
|
|
87
80
|
tuple[str, str, Any, str, str, str, Any, dict[str, Any] | None]
|
|
88
81
|
] = []
|
|
89
82
|
self.batch_size = 1000
|
|
83
|
+
# Track seen primary keys to avoid O(n²) duplicate checking
|
|
84
|
+
self._seen_node_keys: set[tuple[str, str]] = set()
|
|
90
85
|
|
|
91
86
|
def create_schema(self) -> None:
|
|
92
87
|
"""Create the graph schema in Kuzu."""
|
|
@@ -159,10 +154,13 @@ class Ingestor:
|
|
|
159
154
|
|
|
160
155
|
def ensure_node_batch(self, label: str, properties: dict[str, Any]) -> None:
|
|
161
156
|
"""Add a node to the buffer for batch insertion."""
|
|
162
|
-
# Check for duplicates based on primary key
|
|
157
|
+
# Check for duplicates based on primary key using O(1) set lookup
|
|
163
158
|
primary_key = self._get_primary_key(label, properties)
|
|
164
|
-
if primary_key
|
|
165
|
-
|
|
159
|
+
if primary_key:
|
|
160
|
+
key = (label, primary_key)
|
|
161
|
+
if key in self._seen_node_keys:
|
|
162
|
+
return
|
|
163
|
+
self._seen_node_keys.add(key)
|
|
166
164
|
|
|
167
165
|
self.node_buffer.append((label, properties))
|
|
168
166
|
|
|
@@ -176,29 +174,26 @@ class Ingestor:
|
|
|
176
174
|
|
|
177
175
|
def _get_primary_key_field(self, label: str) -> str | None:
|
|
178
176
|
"""Get the primary key field name for a node type."""
|
|
179
|
-
if label ==
|
|
177
|
+
if label == NodeLabel.PROJECT:
|
|
180
178
|
return "name"
|
|
181
|
-
elif label in [
|
|
179
|
+
elif label in [
|
|
180
|
+
NodeLabel.PACKAGE,
|
|
181
|
+
NodeLabel.MODULE,
|
|
182
|
+
NodeLabel.CLASS,
|
|
183
|
+
NodeLabel.FUNCTION,
|
|
184
|
+
NodeLabel.METHOD,
|
|
185
|
+
]:
|
|
182
186
|
return "qualified_name"
|
|
183
|
-
elif label in [
|
|
187
|
+
elif label in [NodeLabel.FOLDER, NodeLabel.FILE]:
|
|
184
188
|
return "path"
|
|
185
|
-
elif label ==
|
|
189
|
+
elif label == NodeLabel.FILE_METADATA:
|
|
186
190
|
return "filepath"
|
|
187
|
-
elif label ==
|
|
191
|
+
elif label == NodeLabel.EXTERNAL_PACKAGE:
|
|
188
192
|
return "name"
|
|
189
|
-
elif label ==
|
|
193
|
+
elif label == NodeLabel.DELETION_LOG:
|
|
190
194
|
return "id"
|
|
191
195
|
return None
|
|
192
196
|
|
|
193
|
-
def _is_duplicate_node(self, label: str, primary_key: str) -> bool:
|
|
194
|
-
"""Check if a node with the given primary key already exists in the buffer."""
|
|
195
|
-
for buffered_label, buffered_props in self.node_buffer:
|
|
196
|
-
if buffered_label == label:
|
|
197
|
-
buffered_key = self._get_primary_key(buffered_label, buffered_props)
|
|
198
|
-
if buffered_key == primary_key:
|
|
199
|
-
return True
|
|
200
|
-
return False
|
|
201
|
-
|
|
202
197
|
def flush_nodes(
|
|
203
198
|
self,
|
|
204
199
|
progress_callback: Callable[[int, int], None] | None = None,
|
|
@@ -272,6 +267,7 @@ class Ingestor:
|
|
|
272
267
|
logger.info(f" {label}: {count}")
|
|
273
268
|
|
|
274
269
|
self.node_buffer.clear()
|
|
270
|
+
self._seen_node_keys.clear()
|
|
275
271
|
|
|
276
272
|
def ensure_relationship_batch(
|
|
277
273
|
self,
|
|
@@ -410,45 +406,46 @@ class Ingestor:
|
|
|
410
406
|
) -> str | None:
|
|
411
407
|
"""Determine the actual relationship table name based on source and target."""
|
|
412
408
|
# Mapping of relationship types and from_labels to table names
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
409
|
+
# Keys use enum values for type-safe comparisons
|
|
410
|
+
table_mapping: dict[str, dict[str, str]] = {
|
|
411
|
+
RelationshipType.CONTAINS_PACKAGE: {
|
|
412
|
+
NodeLabel.PROJECT: "CONTAINS_PACKAGE",
|
|
413
|
+
NodeLabel.PACKAGE: "CONTAINS_PACKAGE_PKG",
|
|
414
|
+
NodeLabel.FOLDER: "CONTAINS_PACKAGE_FOLDER",
|
|
418
415
|
},
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
416
|
+
RelationshipType.CONTAINS_FOLDER: {
|
|
417
|
+
NodeLabel.PROJECT: "CONTAINS_FOLDER",
|
|
418
|
+
NodeLabel.PACKAGE: "CONTAINS_FOLDER_PKG",
|
|
419
|
+
NodeLabel.FOLDER: "CONTAINS_FOLDER_FOLDER",
|
|
423
420
|
},
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
421
|
+
RelationshipType.CONTAINS_FILE: {
|
|
422
|
+
NodeLabel.PROJECT: "CONTAINS_FILE",
|
|
423
|
+
NodeLabel.PACKAGE: "CONTAINS_FILE_PKG",
|
|
424
|
+
NodeLabel.FOLDER: "CONTAINS_FILE_FOLDER",
|
|
428
425
|
},
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
426
|
+
RelationshipType.CONTAINS_MODULE: {
|
|
427
|
+
NodeLabel.PROJECT: "CONTAINS_MODULE",
|
|
428
|
+
NodeLabel.PACKAGE: "CONTAINS_MODULE_PKG",
|
|
429
|
+
NodeLabel.FOLDER: "CONTAINS_MODULE_FOLDER",
|
|
433
430
|
},
|
|
434
431
|
}
|
|
435
432
|
|
|
436
433
|
if rel_type in table_mapping:
|
|
437
434
|
return table_mapping[rel_type].get(from_label)
|
|
438
|
-
elif rel_type ==
|
|
439
|
-
if to_label ==
|
|
440
|
-
return
|
|
435
|
+
elif rel_type == RelationshipType.DEFINES:
|
|
436
|
+
if to_label == NodeLabel.FUNCTION:
|
|
437
|
+
return RelationshipType.DEFINES_FUNC
|
|
441
438
|
else:
|
|
442
|
-
return
|
|
443
|
-
elif rel_type ==
|
|
444
|
-
if from_label ==
|
|
445
|
-
return
|
|
446
|
-
elif from_label ==
|
|
447
|
-
return
|
|
448
|
-
elif from_label ==
|
|
449
|
-
return
|
|
450
|
-
elif from_label ==
|
|
451
|
-
return
|
|
439
|
+
return RelationshipType.DEFINES
|
|
440
|
+
elif rel_type == RelationshipType.CALLS:
|
|
441
|
+
if from_label == NodeLabel.FUNCTION and to_label == NodeLabel.FUNCTION:
|
|
442
|
+
return RelationshipType.CALLS
|
|
443
|
+
elif from_label == NodeLabel.FUNCTION and to_label == NodeLabel.METHOD:
|
|
444
|
+
return RelationshipType.CALLS_FM
|
|
445
|
+
elif from_label == NodeLabel.METHOD and to_label == NodeLabel.FUNCTION:
|
|
446
|
+
return RelationshipType.CALLS_MF
|
|
447
|
+
elif from_label == NodeLabel.METHOD and to_label == NodeLabel.METHOD:
|
|
448
|
+
return RelationshipType.CALLS_MM
|
|
452
449
|
elif rel_type.startswith("TRACKS_"):
|
|
453
450
|
# TRACKS relationships already have the correct table name
|
|
454
451
|
return rel_type
|
|
@@ -548,10 +545,10 @@ class Ingestor:
|
|
|
548
545
|
|
|
549
546
|
# Delete each type of node tracked by this file
|
|
550
547
|
for node_type, rel_type, stat_key in [
|
|
551
|
-
(
|
|
552
|
-
(
|
|
553
|
-
(
|
|
554
|
-
(
|
|
548
|
+
(NodeLabel.MODULE, RelationshipType.TRACKS_MODULE, "modules"),
|
|
549
|
+
(NodeLabel.CLASS, RelationshipType.TRACKS_CLASS, "classes"),
|
|
550
|
+
(NodeLabel.FUNCTION, RelationshipType.TRACKS_FUNCTION, "functions"),
|
|
551
|
+
(NodeLabel.METHOD, RelationshipType.TRACKS_METHOD, "methods"),
|
|
555
552
|
]:
|
|
556
553
|
try:
|
|
557
554
|
# First get the nodes to delete (for logging)
|
|
@@ -614,6 +611,9 @@ class SimpleGraphBuilder:
|
|
|
614
611
|
queries: dict[str, Any],
|
|
615
612
|
exclude_patterns: list[str] | None = None,
|
|
616
613
|
progress_callback: Any | None = None,
|
|
614
|
+
respect_gitignore: bool = True,
|
|
615
|
+
metrics_collector: MetricsCollector | None = None,
|
|
616
|
+
enable_parallel: bool = True,
|
|
617
617
|
):
|
|
618
618
|
self.ingestor = ingestor
|
|
619
619
|
self.repo_path = repo_path
|
|
@@ -624,10 +624,26 @@ class SimpleGraphBuilder:
|
|
|
624
624
|
if exclude_patterns:
|
|
625
625
|
self.ignore_dirs = self.ignore_dirs.union(set(exclude_patterns))
|
|
626
626
|
self.progress_callback = progress_callback
|
|
627
|
+
self.metrics_collector = metrics_collector
|
|
628
|
+
|
|
629
|
+
# Initialize gitignore support
|
|
630
|
+
self.respect_gitignore = respect_gitignore
|
|
631
|
+
self.gitignore_manager: GitignoreManager | None = None
|
|
632
|
+
if respect_gitignore:
|
|
633
|
+
self.gitignore_manager = GitignoreManager(repo_path)
|
|
634
|
+
if self.gitignore_manager.stats.patterns_loaded > 0:
|
|
635
|
+
logger.info(
|
|
636
|
+
f"Loaded gitignore patterns - "
|
|
637
|
+
f"files: {self.gitignore_manager.stats.gitignore_files_loaded}, "
|
|
638
|
+
f"patterns: {self.gitignore_manager.stats.patterns_loaded}"
|
|
639
|
+
)
|
|
627
640
|
|
|
628
641
|
# Generate unique session ID for correlating timing events in PostHog
|
|
629
642
|
self._index_session_id = str(uuid.uuid4())[:8]
|
|
630
643
|
|
|
644
|
+
# Statistics for tracking what was indexed vs skipped
|
|
645
|
+
self._index_stats = IndexingStats()
|
|
646
|
+
|
|
631
647
|
# Caches
|
|
632
648
|
self.structural_elements: dict[Path, str | None] = {}
|
|
633
649
|
self.ast_cache: dict[Path, tuple[Node, str]] = {}
|
|
@@ -635,6 +651,153 @@ class SimpleGraphBuilder:
|
|
|
635
651
|
self.simple_name_lookup: dict[str, set[str]] = defaultdict(set)
|
|
636
652
|
self.class_inheritance: dict[str, list[str]] = {} # class_qn -> [parent_qns]
|
|
637
653
|
|
|
654
|
+
# Parallel execution support
|
|
655
|
+
self.enable_parallel = enable_parallel
|
|
656
|
+
self.parallel_executor: ParallelExecutor | None = None
|
|
657
|
+
self._parallel_mode_active = False # Track if parallel was used for this run
|
|
658
|
+
self._worker_count = 0
|
|
659
|
+
self._init_parallel_executor()
|
|
660
|
+
|
|
661
|
+
def _init_parallel_executor(self) -> None:
|
|
662
|
+
"""Initialize parallel executor if conditions are met.
|
|
663
|
+
|
|
664
|
+
Conditions for parallel execution:
|
|
665
|
+
1. enable_parallel=True (constructor parameter)
|
|
666
|
+
2. settings.indexing.index_parallel is True
|
|
667
|
+
3. CPU count >= 4
|
|
668
|
+
"""
|
|
669
|
+
# Check settings override
|
|
670
|
+
if not settings.indexing.index_parallel:
|
|
671
|
+
logger.info("Parallel indexing disabled via SHOTGUN_INDEX_PARALLEL=false")
|
|
672
|
+
return
|
|
673
|
+
|
|
674
|
+
if not self.enable_parallel:
|
|
675
|
+
logger.debug("Parallel indexing disabled via enable_parallel=False")
|
|
676
|
+
return
|
|
677
|
+
|
|
678
|
+
cpu_count = multiprocessing.cpu_count()
|
|
679
|
+
if cpu_count < 4:
|
|
680
|
+
logger.info(f"Parallel indexing disabled: CPU count ({cpu_count}) < 4")
|
|
681
|
+
return
|
|
682
|
+
|
|
683
|
+
worker_count = get_worker_count()
|
|
684
|
+
self.parallel_executor = ParallelExecutor(
|
|
685
|
+
worker_count=worker_count,
|
|
686
|
+
metrics_collector=self.metrics_collector,
|
|
687
|
+
)
|
|
688
|
+
self._worker_count = worker_count
|
|
689
|
+
logger.info(f"Parallel indexing enabled with {worker_count} workers")
|
|
690
|
+
|
|
691
|
+
def _build_file_infos(
|
|
692
|
+
self, files_to_process: list[tuple[Path, str]]
|
|
693
|
+
) -> list[FileInfo]:
|
|
694
|
+
"""Convert files_to_process list to FileInfo objects for parallel execution.
|
|
695
|
+
|
|
696
|
+
Args:
|
|
697
|
+
files_to_process: List of (filepath, language) tuples
|
|
698
|
+
|
|
699
|
+
Returns:
|
|
700
|
+
List of FileInfo objects ready for WorkDistributor
|
|
701
|
+
"""
|
|
702
|
+
file_infos: list[FileInfo] = []
|
|
703
|
+
|
|
704
|
+
for filepath, language in files_to_process:
|
|
705
|
+
relative_path = filepath.relative_to(self.repo_path)
|
|
706
|
+
|
|
707
|
+
# Compute module_qn (same logic as _process_single_file)
|
|
708
|
+
if filepath.name == "__init__.py":
|
|
709
|
+
module_qn = ".".join(
|
|
710
|
+
[self.project_name] + list(relative_path.parent.parts)
|
|
711
|
+
)
|
|
712
|
+
else:
|
|
713
|
+
module_qn = ".".join(
|
|
714
|
+
[self.project_name] + list(relative_path.with_suffix("").parts)
|
|
715
|
+
)
|
|
716
|
+
|
|
717
|
+
# Get container qualified name from structural elements
|
|
718
|
+
parent_rel_path = relative_path.parent
|
|
719
|
+
container_qn = self.structural_elements.get(parent_rel_path)
|
|
720
|
+
|
|
721
|
+
try:
|
|
722
|
+
file_size = filepath.stat().st_size
|
|
723
|
+
except OSError:
|
|
724
|
+
file_size = 0
|
|
725
|
+
|
|
726
|
+
file_infos.append(
|
|
727
|
+
FileInfo(
|
|
728
|
+
file_path=filepath,
|
|
729
|
+
relative_path=relative_path,
|
|
730
|
+
language=language,
|
|
731
|
+
module_qn=module_qn,
|
|
732
|
+
container_qn=container_qn,
|
|
733
|
+
file_size_bytes=file_size,
|
|
734
|
+
)
|
|
735
|
+
)
|
|
736
|
+
|
|
737
|
+
return file_infos
|
|
738
|
+
|
|
739
|
+
def _merge_parallel_results(self, result: ParallelExecutionResult) -> None:
|
|
740
|
+
"""Merge parallel execution results into Ingestor buffers and local caches.
|
|
741
|
+
|
|
742
|
+
Args:
|
|
743
|
+
result: ParallelExecutionResult containing all parsed file data
|
|
744
|
+
"""
|
|
745
|
+
# Merge nodes and direct relationships from each file
|
|
746
|
+
for file_result in result.results:
|
|
747
|
+
if not file_result.success:
|
|
748
|
+
logger.warning(
|
|
749
|
+
f"File {file_result.task.file_path} failed: {file_result.error}"
|
|
750
|
+
)
|
|
751
|
+
continue
|
|
752
|
+
|
|
753
|
+
# Add nodes to buffer
|
|
754
|
+
for node in file_result.nodes:
|
|
755
|
+
self.ingestor.ensure_node_batch(node.label, node.properties)
|
|
756
|
+
|
|
757
|
+
# Add direct relationships to buffer
|
|
758
|
+
for rel in file_result.relationships:
|
|
759
|
+
self.ingestor.ensure_relationship_batch(
|
|
760
|
+
rel.from_label,
|
|
761
|
+
rel.from_key,
|
|
762
|
+
rel.from_value,
|
|
763
|
+
rel.rel_type,
|
|
764
|
+
rel.to_label,
|
|
765
|
+
rel.to_key,
|
|
766
|
+
rel.to_value,
|
|
767
|
+
rel.properties,
|
|
768
|
+
)
|
|
769
|
+
|
|
770
|
+
# Add resolved relationships (calls, inheritance) from aggregation
|
|
771
|
+
for rel in result.resolved_relationships:
|
|
772
|
+
self.ingestor.ensure_relationship_batch(
|
|
773
|
+
rel.from_label,
|
|
774
|
+
rel.from_key,
|
|
775
|
+
rel.from_value,
|
|
776
|
+
rel.rel_type,
|
|
777
|
+
rel.to_label,
|
|
778
|
+
rel.to_key,
|
|
779
|
+
rel.to_value,
|
|
780
|
+
rel.properties,
|
|
781
|
+
)
|
|
782
|
+
|
|
783
|
+
# Merge registries into local caches
|
|
784
|
+
self.function_registry.update(result.function_registry)
|
|
785
|
+
for name, qns in result.simple_name_lookup.items():
|
|
786
|
+
for qn in qns:
|
|
787
|
+
self.simple_name_lookup[name].add(qn)
|
|
788
|
+
|
|
789
|
+
# Merge inheritance data for potential future use
|
|
790
|
+
for file_result in result.results:
|
|
791
|
+
if file_result.success:
|
|
792
|
+
for inh in file_result.inheritance_data:
|
|
793
|
+
self.class_inheritance[inh.child_class_qn] = inh.parent_simple_names
|
|
794
|
+
|
|
795
|
+
logger.info(
|
|
796
|
+
f"Merged parallel results: {result.successful_files} files, "
|
|
797
|
+
f"{len(result.function_registry)} registry entries, "
|
|
798
|
+
f"{len(result.resolved_relationships)} resolved relationships"
|
|
799
|
+
)
|
|
800
|
+
|
|
638
801
|
def _report_progress(
|
|
639
802
|
self,
|
|
640
803
|
phase: str,
|
|
@@ -648,9 +811,6 @@ class SimpleGraphBuilder:
|
|
|
648
811
|
return
|
|
649
812
|
|
|
650
813
|
try:
|
|
651
|
-
# Import here to avoid circular dependency
|
|
652
|
-
from shotgun.codebase.models import IndexProgress, ProgressPhase
|
|
653
|
-
|
|
654
814
|
progress = IndexProgress(
|
|
655
815
|
phase=ProgressPhase(phase),
|
|
656
816
|
phase_name=phase_name,
|
|
@@ -671,8 +831,6 @@ class SimpleGraphBuilder:
|
|
|
671
831
|
extra_props: dict[str, Any] | None = None,
|
|
672
832
|
) -> None:
|
|
673
833
|
"""Log timing data to PostHog for analysis."""
|
|
674
|
-
from shotgun.posthog_telemetry import track_event
|
|
675
|
-
|
|
676
834
|
properties: dict[str, Any] = {
|
|
677
835
|
"session_id": self._index_session_id,
|
|
678
836
|
"phase": phase,
|
|
@@ -692,8 +850,6 @@ class SimpleGraphBuilder:
|
|
|
692
850
|
total_relationships: int,
|
|
693
851
|
) -> None:
|
|
694
852
|
"""Log indexing summary event to PostHog."""
|
|
695
|
-
from shotgun.posthog_telemetry import track_event
|
|
696
|
-
|
|
697
853
|
track_event(
|
|
698
854
|
"codebase_index_completed",
|
|
699
855
|
{
|
|
@@ -705,6 +861,24 @@ class SimpleGraphBuilder:
|
|
|
705
861
|
},
|
|
706
862
|
)
|
|
707
863
|
|
|
864
|
+
@contextmanager
|
|
865
|
+
def _track_phase(
|
|
866
|
+
self, phase: IndexingPhase, get_items_count: Callable[[], int]
|
|
867
|
+
) -> Generator[None, None, None]:
|
|
868
|
+
"""Context manager for tracking phase metrics.
|
|
869
|
+
|
|
870
|
+
Args:
|
|
871
|
+
phase: The indexing phase to track
|
|
872
|
+
get_items_count: Callable that returns the items processed count
|
|
873
|
+
"""
|
|
874
|
+
if self.metrics_collector:
|
|
875
|
+
self.metrics_collector.start_phase(phase)
|
|
876
|
+
try:
|
|
877
|
+
yield
|
|
878
|
+
finally:
|
|
879
|
+
if self.metrics_collector:
|
|
880
|
+
self.metrics_collector.end_phase(phase, get_items_count())
|
|
881
|
+
|
|
708
882
|
async def run(self) -> None:
|
|
709
883
|
"""Run the three-pass graph building process."""
|
|
710
884
|
logger.info(f"Building graph for project: {self.project_name}")
|
|
@@ -712,17 +886,25 @@ class SimpleGraphBuilder:
|
|
|
712
886
|
# Pass 1: Structure
|
|
713
887
|
logger.info("Pass 1: Identifying packages and folders...")
|
|
714
888
|
t0 = time.time()
|
|
715
|
-
self.
|
|
889
|
+
with self._track_phase(
|
|
890
|
+
IndexingPhase.STRUCTURE, lambda: self._index_stats.dirs_scanned
|
|
891
|
+
):
|
|
892
|
+
self._identify_structure()
|
|
716
893
|
t1 = time.time()
|
|
717
|
-
self._log_timing(
|
|
894
|
+
self._log_timing(
|
|
895
|
+
IndexingPhase.STRUCTURE, t1 - t0, len(self.structural_elements)
|
|
896
|
+
)
|
|
718
897
|
|
|
719
898
|
# Pass 2: Definitions
|
|
720
899
|
logger.info("Pass 2: Processing files and extracting definitions...")
|
|
721
900
|
t2 = time.time()
|
|
722
|
-
|
|
901
|
+
with self._track_phase(
|
|
902
|
+
IndexingPhase.DEFINITIONS, lambda: self._index_stats.files_processed
|
|
903
|
+
):
|
|
904
|
+
await self._process_files()
|
|
723
905
|
t3 = time.time()
|
|
724
906
|
self._log_timing(
|
|
725
|
-
|
|
907
|
+
IndexingPhase.DEFINITIONS,
|
|
726
908
|
t3 - t2,
|
|
727
909
|
len(self.ast_cache),
|
|
728
910
|
{"file_count": len(self.ast_cache)},
|
|
@@ -731,52 +913,71 @@ class SimpleGraphBuilder:
|
|
|
731
913
|
# Pass 3: Relationships
|
|
732
914
|
logger.info("Pass 3: Processing relationships (calls, imports)...")
|
|
733
915
|
t4 = time.time()
|
|
734
|
-
self.
|
|
916
|
+
with self._track_phase(
|
|
917
|
+
IndexingPhase.RELATIONSHIPS, lambda: self._index_stats.files_processed
|
|
918
|
+
):
|
|
919
|
+
self._process_relationships()
|
|
735
920
|
t5 = time.time()
|
|
736
|
-
self._log_timing(
|
|
921
|
+
self._log_timing(IndexingPhase.RELATIONSHIPS, t5 - t4, len(self.ast_cache))
|
|
737
922
|
|
|
738
923
|
# Flush all pending operations
|
|
739
924
|
logger.info("Flushing all data to database...")
|
|
740
|
-
t6 = time.time()
|
|
741
925
|
node_count = len(self.ingestor.node_buffer)
|
|
742
926
|
|
|
743
927
|
# Create progress callback for flush_nodes
|
|
744
928
|
def node_progress(current: int, total: int) -> None:
|
|
745
929
|
self._report_progress(
|
|
746
|
-
|
|
930
|
+
IndexingPhase.FLUSH_NODES, "Flushing nodes to database", current, total
|
|
747
931
|
)
|
|
748
932
|
|
|
749
|
-
|
|
933
|
+
t6 = time.time()
|
|
934
|
+
with self._track_phase(IndexingPhase.FLUSH_NODES, lambda: node_count):
|
|
935
|
+
self.ingestor.flush_nodes(progress_callback=node_progress)
|
|
936
|
+
t7 = time.time()
|
|
750
937
|
self._report_progress(
|
|
751
|
-
|
|
938
|
+
IndexingPhase.FLUSH_NODES,
|
|
939
|
+
"Flushing nodes to database",
|
|
940
|
+
node_count,
|
|
941
|
+
node_count,
|
|
942
|
+
True,
|
|
943
|
+
)
|
|
944
|
+
self._log_timing(
|
|
945
|
+
IndexingPhase.FLUSH_NODES, t7 - t6, node_count, {"node_count": node_count}
|
|
752
946
|
)
|
|
753
|
-
t7 = time.time()
|
|
754
|
-
self._log_timing("flush_nodes", t7 - t6, node_count, {"node_count": node_count})
|
|
755
947
|
|
|
756
948
|
rel_count = len(self.ingestor.relationship_buffer)
|
|
757
949
|
|
|
758
950
|
# Create progress callback for flush_relationships
|
|
759
951
|
def rel_progress(current: int, total: int) -> None:
|
|
760
952
|
self._report_progress(
|
|
761
|
-
|
|
953
|
+
IndexingPhase.FLUSH_RELATIONSHIPS,
|
|
762
954
|
"Flushing relationships to database",
|
|
763
955
|
current,
|
|
764
956
|
total,
|
|
765
957
|
)
|
|
766
958
|
|
|
767
|
-
|
|
959
|
+
t8_start = time.time()
|
|
960
|
+
with self._track_phase(IndexingPhase.FLUSH_RELATIONSHIPS, lambda: rel_count):
|
|
961
|
+
self.ingestor.flush_relationships(progress_callback=rel_progress)
|
|
962
|
+
t8 = time.time()
|
|
768
963
|
self._report_progress(
|
|
769
|
-
|
|
964
|
+
IndexingPhase.FLUSH_RELATIONSHIPS,
|
|
770
965
|
"Flushing relationships to database",
|
|
771
966
|
rel_count,
|
|
772
967
|
rel_count,
|
|
773
968
|
True,
|
|
774
969
|
)
|
|
775
|
-
t8 = time.time()
|
|
776
970
|
self._log_timing(
|
|
777
|
-
|
|
971
|
+
IndexingPhase.FLUSH_RELATIONSHIPS,
|
|
972
|
+
t8 - t8_start,
|
|
973
|
+
rel_count,
|
|
974
|
+
{"relationship_count": rel_count},
|
|
778
975
|
)
|
|
779
976
|
|
|
977
|
+
# Update metrics collector with totals
|
|
978
|
+
if self.metrics_collector:
|
|
979
|
+
self.metrics_collector.set_totals(node_count, rel_count)
|
|
980
|
+
|
|
780
981
|
# Track summary event with totals (no PII - only numeric metadata)
|
|
781
982
|
total_duration = t8 - t0
|
|
782
983
|
self._log_summary(
|
|
@@ -786,15 +987,79 @@ class SimpleGraphBuilder:
|
|
|
786
987
|
total_relationships=rel_count,
|
|
787
988
|
)
|
|
788
989
|
|
|
990
|
+
# Log final indexing statistics
|
|
991
|
+
logger.info("=== Indexing Statistics ===")
|
|
992
|
+
logger.info(f" Directories scanned: {self._index_stats.dirs_scanned}")
|
|
993
|
+
logger.info(
|
|
994
|
+
f" Directories ignored (hardcoded patterns): {self._index_stats.dirs_ignored_hardcoded}"
|
|
995
|
+
)
|
|
996
|
+
logger.info(
|
|
997
|
+
f" Directories ignored (gitignore): {self._index_stats.dirs_ignored_gitignore}"
|
|
998
|
+
)
|
|
999
|
+
logger.info(f" Files scanned: {self._index_stats.files_scanned}")
|
|
1000
|
+
logger.info(
|
|
1001
|
+
f" Files ignored (hardcoded patterns): {self._index_stats.files_ignored_hardcoded}"
|
|
1002
|
+
)
|
|
1003
|
+
logger.info(
|
|
1004
|
+
f" Files ignored (gitignore): {self._index_stats.files_ignored_gitignore}"
|
|
1005
|
+
)
|
|
1006
|
+
logger.info(
|
|
1007
|
+
f" Files ignored (no parser): {self._index_stats.files_ignored_no_parser}"
|
|
1008
|
+
)
|
|
1009
|
+
logger.info(f" Files processed: {self._index_stats.files_processed}")
|
|
1010
|
+
|
|
1011
|
+
# Log gitignore manager stats if available
|
|
1012
|
+
if self.gitignore_manager:
|
|
1013
|
+
logger.info(f" {self.gitignore_manager.get_stats_summary()}")
|
|
1014
|
+
|
|
789
1015
|
logger.info("Graph building complete!")
|
|
790
1016
|
|
|
1017
|
+
def _should_ignore_directory(
|
|
1018
|
+
self, dir_path: Path, dir_name: str
|
|
1019
|
+
) -> tuple[bool, IgnoreReason | None]:
|
|
1020
|
+
"""Check if a directory should be ignored.
|
|
1021
|
+
|
|
1022
|
+
Args:
|
|
1023
|
+
dir_path: Full path to the directory
|
|
1024
|
+
dir_name: Name of the directory
|
|
1025
|
+
|
|
1026
|
+
Returns:
|
|
1027
|
+
Tuple of (should_ignore, reason)
|
|
1028
|
+
"""
|
|
1029
|
+
# Check hardcoded patterns first (fastest)
|
|
1030
|
+
if should_ignore_directory(dir_name, self.ignore_dirs):
|
|
1031
|
+
return True, IgnoreReason.HARDCODED
|
|
1032
|
+
|
|
1033
|
+
# Check gitignore patterns
|
|
1034
|
+
if self.gitignore_manager:
|
|
1035
|
+
try:
|
|
1036
|
+
relative_path = dir_path.relative_to(self.repo_path)
|
|
1037
|
+
if self.gitignore_manager.is_directory_ignored(relative_path):
|
|
1038
|
+
return True, IgnoreReason.GITIGNORE
|
|
1039
|
+
except ValueError:
|
|
1040
|
+
pass
|
|
1041
|
+
|
|
1042
|
+
return False, None
|
|
1043
|
+
|
|
791
1044
|
def _identify_structure(self) -> None:
|
|
792
1045
|
"""First pass: Walk directory to find packages and folders."""
|
|
793
1046
|
dir_count = 0
|
|
794
1047
|
for root_str, dirs, _ in os.walk(self.repo_path, topdown=True):
|
|
795
|
-
dirs
|
|
796
|
-
|
|
797
|
-
|
|
1048
|
+
# Filter directories - modifying dirs in-place affects os.walk traversal
|
|
1049
|
+
filtered_dirs = []
|
|
1050
|
+
for d in dirs:
|
|
1051
|
+
dir_path = Path(root_str) / d
|
|
1052
|
+
should_ignore, reason = self._should_ignore_directory(dir_path, d)
|
|
1053
|
+
if should_ignore:
|
|
1054
|
+
if reason == IgnoreReason.HARDCODED:
|
|
1055
|
+
self._index_stats.dirs_ignored_hardcoded += 1
|
|
1056
|
+
elif reason == IgnoreReason.GITIGNORE:
|
|
1057
|
+
self._index_stats.dirs_ignored_gitignore += 1
|
|
1058
|
+
else:
|
|
1059
|
+
filtered_dirs.append(d)
|
|
1060
|
+
self._index_stats.dirs_scanned += 1
|
|
1061
|
+
|
|
1062
|
+
dirs[:] = filtered_dirs
|
|
798
1063
|
root = Path(root_str)
|
|
799
1064
|
relative_root = root.relative_to(self.repo_path)
|
|
800
1065
|
|
|
@@ -831,7 +1096,7 @@ class SimpleGraphBuilder:
|
|
|
831
1096
|
# Create package
|
|
832
1097
|
package_qn = ".".join([self.project_name] + list(relative_root.parts))
|
|
833
1098
|
self.ingestor.ensure_node_batch(
|
|
834
|
-
|
|
1099
|
+
NodeLabel.PACKAGE,
|
|
835
1100
|
{
|
|
836
1101
|
"qualified_name": package_qn,
|
|
837
1102
|
"name": relative_root.name,
|
|
@@ -843,22 +1108,22 @@ class SimpleGraphBuilder:
|
|
|
843
1108
|
if parent_container_qn:
|
|
844
1109
|
# Parent is a package
|
|
845
1110
|
self.ingestor.ensure_relationship_batch(
|
|
846
|
-
|
|
1111
|
+
NodeLabel.PACKAGE,
|
|
847
1112
|
"qualified_name",
|
|
848
1113
|
parent_container_qn,
|
|
849
|
-
|
|
850
|
-
|
|
1114
|
+
RelationshipType.CONTAINS_PACKAGE,
|
|
1115
|
+
NodeLabel.PACKAGE,
|
|
851
1116
|
"qualified_name",
|
|
852
1117
|
package_qn,
|
|
853
1118
|
)
|
|
854
1119
|
else:
|
|
855
1120
|
# Parent is project root
|
|
856
1121
|
self.ingestor.ensure_relationship_batch(
|
|
857
|
-
|
|
1122
|
+
NodeLabel.PROJECT,
|
|
858
1123
|
"name",
|
|
859
1124
|
self.project_name,
|
|
860
|
-
|
|
861
|
-
|
|
1125
|
+
RelationshipType.CONTAINS_PACKAGE,
|
|
1126
|
+
NodeLabel.PACKAGE,
|
|
862
1127
|
"qualified_name",
|
|
863
1128
|
package_qn,
|
|
864
1129
|
)
|
|
@@ -867,7 +1132,7 @@ class SimpleGraphBuilder:
|
|
|
867
1132
|
else:
|
|
868
1133
|
# Create folder
|
|
869
1134
|
self.ingestor.ensure_node_batch(
|
|
870
|
-
|
|
1135
|
+
NodeLabel.FOLDER,
|
|
871
1136
|
{
|
|
872
1137
|
"path": str(relative_root).replace(os.sep, "/"),
|
|
873
1138
|
"name": relative_root.name,
|
|
@@ -878,33 +1143,33 @@ class SimpleGraphBuilder:
|
|
|
878
1143
|
if parent_container_qn:
|
|
879
1144
|
# Parent is a package
|
|
880
1145
|
self.ingestor.ensure_relationship_batch(
|
|
881
|
-
|
|
1146
|
+
NodeLabel.PACKAGE,
|
|
882
1147
|
"qualified_name",
|
|
883
1148
|
parent_container_qn,
|
|
884
|
-
|
|
885
|
-
|
|
1149
|
+
RelationshipType.CONTAINS_FOLDER,
|
|
1150
|
+
NodeLabel.FOLDER,
|
|
886
1151
|
"path",
|
|
887
1152
|
str(relative_root).replace(os.sep, "/"),
|
|
888
1153
|
)
|
|
889
1154
|
elif parent_rel_path == Path("."):
|
|
890
1155
|
# Parent is project root
|
|
891
1156
|
self.ingestor.ensure_relationship_batch(
|
|
892
|
-
|
|
1157
|
+
NodeLabel.PROJECT,
|
|
893
1158
|
"name",
|
|
894
1159
|
self.project_name,
|
|
895
|
-
|
|
896
|
-
|
|
1160
|
+
RelationshipType.CONTAINS_FOLDER,
|
|
1161
|
+
NodeLabel.FOLDER,
|
|
897
1162
|
"path",
|
|
898
1163
|
str(relative_root).replace(os.sep, "/"),
|
|
899
1164
|
)
|
|
900
1165
|
else:
|
|
901
1166
|
# Parent is another folder
|
|
902
1167
|
self.ingestor.ensure_relationship_batch(
|
|
903
|
-
|
|
1168
|
+
NodeLabel.FOLDER,
|
|
904
1169
|
"path",
|
|
905
1170
|
str(parent_rel_path).replace(os.sep, "/"),
|
|
906
|
-
|
|
907
|
-
|
|
1171
|
+
RelationshipType.CONTAINS_FOLDER,
|
|
1172
|
+
NodeLabel.FOLDER,
|
|
908
1173
|
"path",
|
|
909
1174
|
str(relative_root).replace(os.sep, "/"),
|
|
910
1175
|
)
|
|
@@ -919,62 +1184,195 @@ class SimpleGraphBuilder:
|
|
|
919
1184
|
phase_complete=True,
|
|
920
1185
|
)
|
|
921
1186
|
|
|
1187
|
+
def _should_ignore_file(self, filepath: Path) -> tuple[bool, IgnoreReason | None]:
|
|
1188
|
+
"""Check if a file should be ignored.
|
|
1189
|
+
|
|
1190
|
+
Args:
|
|
1191
|
+
filepath: Full path to the file
|
|
1192
|
+
|
|
1193
|
+
Returns:
|
|
1194
|
+
Tuple of (should_ignore, reason)
|
|
1195
|
+
"""
|
|
1196
|
+
# Check hardcoded directory patterns in path
|
|
1197
|
+
if is_path_ignored(filepath, self.ignore_dirs):
|
|
1198
|
+
return True, IgnoreReason.HARDCODED
|
|
1199
|
+
|
|
1200
|
+
# Check gitignore patterns
|
|
1201
|
+
if self.gitignore_manager:
|
|
1202
|
+
try:
|
|
1203
|
+
relative_path = filepath.relative_to(self.repo_path)
|
|
1204
|
+
if self.gitignore_manager.is_ignored(relative_path):
|
|
1205
|
+
return True, IgnoreReason.GITIGNORE
|
|
1206
|
+
except ValueError:
|
|
1207
|
+
pass
|
|
1208
|
+
|
|
1209
|
+
return False, None
|
|
1210
|
+
|
|
922
1211
|
async def _process_files(self) -> None:
|
|
923
1212
|
"""Second pass: Process files and extract definitions."""
|
|
924
|
-
# First pass: Count total files
|
|
1213
|
+
# First pass: Count total files (respecting all ignore patterns)
|
|
925
1214
|
total_files = 0
|
|
926
|
-
|
|
1215
|
+
files_to_process: list[tuple[Path, str]] = []
|
|
1216
|
+
|
|
1217
|
+
for root_str, dirs, files in os.walk(self.repo_path, topdown=True):
|
|
927
1218
|
root = Path(root_str)
|
|
928
1219
|
|
|
929
|
-
#
|
|
930
|
-
|
|
931
|
-
|
|
1220
|
+
# Filter directories in-place to prevent os.walk from descending
|
|
1221
|
+
filtered_dirs = []
|
|
1222
|
+
for d in dirs:
|
|
1223
|
+
dir_path = root / d
|
|
1224
|
+
should_ignore, _ = self._should_ignore_directory(dir_path, d)
|
|
1225
|
+
if not should_ignore:
|
|
1226
|
+
filtered_dirs.append(d)
|
|
1227
|
+
dirs[:] = filtered_dirs
|
|
932
1228
|
|
|
933
1229
|
for filename in files:
|
|
934
1230
|
filepath = root / filename
|
|
1231
|
+
self._index_stats.files_scanned += 1
|
|
1232
|
+
|
|
1233
|
+
# Check if file should be ignored
|
|
1234
|
+
should_ignore, reason = self._should_ignore_file(filepath)
|
|
1235
|
+
if should_ignore:
|
|
1236
|
+
if reason == IgnoreReason.HARDCODED:
|
|
1237
|
+
self._index_stats.files_ignored_hardcoded += 1
|
|
1238
|
+
elif reason == IgnoreReason.GITIGNORE:
|
|
1239
|
+
self._index_stats.files_ignored_gitignore += 1
|
|
1240
|
+
continue
|
|
1241
|
+
|
|
1242
|
+
# Check if this is a supported file
|
|
935
1243
|
ext = filepath.suffix
|
|
936
1244
|
lang_config = get_language_config(ext)
|
|
937
1245
|
|
|
938
1246
|
if lang_config and lang_config.name in self.parsers:
|
|
1247
|
+
files_to_process.append((filepath, lang_config.name))
|
|
939
1248
|
total_files += 1
|
|
1249
|
+
else:
|
|
1250
|
+
self._index_stats.files_ignored_no_parser += 1
|
|
940
1251
|
|
|
941
|
-
#
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
1252
|
+
# Log what we're about to process
|
|
1253
|
+
logger.info(
|
|
1254
|
+
f"Index statistics: "
|
|
1255
|
+
f"scanned {self._index_stats.files_scanned} files, "
|
|
1256
|
+
f"processing {total_files}, "
|
|
1257
|
+
f"skipped {self._index_stats.files_ignored_hardcoded} (hardcoded), "
|
|
1258
|
+
f"{self._index_stats.files_ignored_gitignore} (gitignore), "
|
|
1259
|
+
f"{self._index_stats.files_ignored_no_parser} (no parser)"
|
|
1260
|
+
)
|
|
945
1261
|
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
1262
|
+
# Decide on parallel vs sequential execution
|
|
1263
|
+
# Use parallel if executor available and enough files to benefit
|
|
1264
|
+
if self.parallel_executor and total_files >= 10:
|
|
1265
|
+
await self._process_files_parallel(files_to_process, total_files)
|
|
1266
|
+
else:
|
|
1267
|
+
await self._process_files_sequential(files_to_process, total_files)
|
|
949
1268
|
|
|
950
|
-
|
|
951
|
-
|
|
1269
|
+
async def _process_files_parallel(
|
|
1270
|
+
self, files_to_process: list[tuple[Path, str]], total_files: int
|
|
1271
|
+
) -> None:
|
|
1272
|
+
"""Process files using parallel execution.
|
|
952
1273
|
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
1274
|
+
Args:
|
|
1275
|
+
files_to_process: List of (filepath, language) tuples
|
|
1276
|
+
total_files: Total number of files to process
|
|
1277
|
+
"""
|
|
1278
|
+
logger.info(f"Using parallel execution with {self._worker_count} workers")
|
|
1279
|
+
self._parallel_mode_active = True
|
|
956
1280
|
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
1281
|
+
try:
|
|
1282
|
+
# Build FileInfo objects for WorkDistributor
|
|
1283
|
+
file_infos = self._build_file_infos(files_to_process)
|
|
1284
|
+
|
|
1285
|
+
# Create work batches using WorkDistributor
|
|
1286
|
+
distributor = WorkDistributor(worker_count=self._worker_count)
|
|
1287
|
+
batches = distributor.create_batches(file_infos)
|
|
1288
|
+
|
|
1289
|
+
logger.info(f"Created {len(batches)} batches for {len(file_infos)} files")
|
|
960
1290
|
|
|
961
|
-
|
|
1291
|
+
# Track progress for UI
|
|
1292
|
+
files_completed = 0
|
|
1293
|
+
|
|
1294
|
+
def parallel_progress(completed_batches: int, total_batches: int) -> None:
|
|
1295
|
+
nonlocal files_completed
|
|
1296
|
+
# Estimate files completed based on batch progress
|
|
1297
|
+
if total_batches > 0:
|
|
1298
|
+
estimated = int((completed_batches / total_batches) * total_files)
|
|
1299
|
+
files_completed = estimated
|
|
962
1300
|
self._report_progress(
|
|
963
1301
|
"definitions",
|
|
964
|
-
"Processing files
|
|
965
|
-
|
|
1302
|
+
f"Processing files (Parallel, {self._worker_count} workers)",
|
|
1303
|
+
files_completed,
|
|
966
1304
|
total_files,
|
|
967
1305
|
)
|
|
968
1306
|
|
|
969
|
-
|
|
970
|
-
|
|
1307
|
+
# Execute in parallel - run blocking executor in thread pool
|
|
1308
|
+
loop = asyncio.get_event_loop()
|
|
1309
|
+
result = await loop.run_in_executor(
|
|
1310
|
+
None,
|
|
1311
|
+
lambda: self.parallel_executor.execute(batches, parallel_progress), # type: ignore[union-attr]
|
|
1312
|
+
)
|
|
1313
|
+
|
|
1314
|
+
# Merge results into Ingestor buffers and local caches
|
|
1315
|
+
self._merge_parallel_results(result)
|
|
1316
|
+
|
|
1317
|
+
# Update stats
|
|
1318
|
+
self._index_stats.files_processed = result.successful_files
|
|
1319
|
+
|
|
1320
|
+
# Report phase completion
|
|
1321
|
+
self._report_progress(
|
|
1322
|
+
"definitions",
|
|
1323
|
+
f"Processing files (Parallel, {self._worker_count} workers)",
|
|
1324
|
+
result.successful_files,
|
|
1325
|
+
total_files,
|
|
1326
|
+
phase_complete=True,
|
|
1327
|
+
)
|
|
1328
|
+
|
|
1329
|
+
logger.info(
|
|
1330
|
+
f"Parallel processing complete: {result.successful_files}/{total_files} "
|
|
1331
|
+
f"files, {result.failed_files} failures"
|
|
1332
|
+
)
|
|
1333
|
+
|
|
1334
|
+
except Exception as e:
|
|
1335
|
+
logger.warning(
|
|
1336
|
+
f"Parallel execution failed: {e}. Falling back to sequential."
|
|
1337
|
+
)
|
|
1338
|
+
self._parallel_mode_active = False
|
|
1339
|
+
await self._process_files_sequential(files_to_process, total_files)
|
|
1340
|
+
|
|
1341
|
+
async def _process_files_sequential(
|
|
1342
|
+
self, files_to_process: list[tuple[Path, str]], total_files: int
|
|
1343
|
+
) -> None:
|
|
1344
|
+
"""Process files using sequential execution (original behavior).
|
|
1345
|
+
|
|
1346
|
+
Args:
|
|
1347
|
+
files_to_process: List of (filepath, language) tuples
|
|
1348
|
+
total_files: Total number of files to process
|
|
1349
|
+
"""
|
|
1350
|
+
logger.info("Using sequential execution")
|
|
1351
|
+
self._parallel_mode_active = False
|
|
1352
|
+
|
|
1353
|
+
file_count = 0
|
|
1354
|
+
for filepath, language in files_to_process:
|
|
1355
|
+
await self._process_single_file(filepath, language)
|
|
1356
|
+
file_count += 1
|
|
1357
|
+
self._index_stats.files_processed += 1
|
|
1358
|
+
|
|
1359
|
+
# Report progress after each file
|
|
1360
|
+
self._report_progress(
|
|
1361
|
+
"definitions",
|
|
1362
|
+
"Processing files (Sequential)",
|
|
1363
|
+
file_count,
|
|
1364
|
+
total_files,
|
|
1365
|
+
)
|
|
1366
|
+
|
|
1367
|
+
if file_count % 100 == 0:
|
|
1368
|
+
logger.info(f" Processed {file_count}/{total_files} files...")
|
|
971
1369
|
|
|
972
1370
|
logger.info(f" Total files processed: {file_count}/{total_files}")
|
|
973
1371
|
|
|
974
1372
|
# Report phase completion
|
|
975
1373
|
self._report_progress(
|
|
976
1374
|
"definitions",
|
|
977
|
-
"Processing files
|
|
1375
|
+
"Processing files (Sequential)",
|
|
978
1376
|
file_count,
|
|
979
1377
|
total_files,
|
|
980
1378
|
phase_complete=True,
|
|
@@ -987,7 +1385,7 @@ class SimpleGraphBuilder:
|
|
|
987
1385
|
|
|
988
1386
|
# Create File node
|
|
989
1387
|
self.ingestor.ensure_node_batch(
|
|
990
|
-
|
|
1388
|
+
NodeLabel.FILE,
|
|
991
1389
|
{
|
|
992
1390
|
"path": relative_path_str,
|
|
993
1391
|
"name": filepath.name,
|
|
@@ -1000,21 +1398,21 @@ class SimpleGraphBuilder:
|
|
|
1000
1398
|
if parent_rel_path == Path("."):
|
|
1001
1399
|
# File in project root
|
|
1002
1400
|
self.ingestor.ensure_relationship_batch(
|
|
1003
|
-
|
|
1401
|
+
NodeLabel.PROJECT,
|
|
1004
1402
|
"name",
|
|
1005
1403
|
self.project_name,
|
|
1006
|
-
|
|
1007
|
-
|
|
1404
|
+
RelationshipType.CONTAINS_FILE,
|
|
1405
|
+
NodeLabel.FILE,
|
|
1008
1406
|
"path",
|
|
1009
1407
|
relative_path_str,
|
|
1010
1408
|
)
|
|
1011
1409
|
else:
|
|
1012
1410
|
self.ingestor.ensure_relationship_batch(
|
|
1013
|
-
|
|
1411
|
+
NodeLabel.FOLDER,
|
|
1014
1412
|
"path",
|
|
1015
1413
|
str(parent_rel_path).replace(os.sep, "/"),
|
|
1016
|
-
|
|
1017
|
-
|
|
1414
|
+
RelationshipType.CONTAINS_FILE,
|
|
1415
|
+
NodeLabel.FILE,
|
|
1018
1416
|
"path",
|
|
1019
1417
|
relative_path_str,
|
|
1020
1418
|
)
|
|
@@ -1043,7 +1441,7 @@ class SimpleGraphBuilder:
|
|
|
1043
1441
|
|
|
1044
1442
|
current_time = int(time.time())
|
|
1045
1443
|
self.ingestor.ensure_node_batch(
|
|
1046
|
-
|
|
1444
|
+
NodeLabel.MODULE,
|
|
1047
1445
|
{
|
|
1048
1446
|
"qualified_name": module_qn,
|
|
1049
1447
|
"name": filepath.stem,
|
|
@@ -1058,33 +1456,33 @@ class SimpleGraphBuilder:
|
|
|
1058
1456
|
if parent_container:
|
|
1059
1457
|
# Parent is a package
|
|
1060
1458
|
self.ingestor.ensure_relationship_batch(
|
|
1061
|
-
|
|
1459
|
+
NodeLabel.PACKAGE,
|
|
1062
1460
|
"qualified_name",
|
|
1063
1461
|
parent_container,
|
|
1064
|
-
|
|
1065
|
-
|
|
1462
|
+
RelationshipType.CONTAINS_MODULE,
|
|
1463
|
+
NodeLabel.MODULE,
|
|
1066
1464
|
"qualified_name",
|
|
1067
1465
|
module_qn,
|
|
1068
1466
|
)
|
|
1069
1467
|
elif parent_rel_path == Path("."):
|
|
1070
1468
|
# Parent is project root
|
|
1071
1469
|
self.ingestor.ensure_relationship_batch(
|
|
1072
|
-
|
|
1470
|
+
NodeLabel.PROJECT,
|
|
1073
1471
|
"name",
|
|
1074
1472
|
self.project_name,
|
|
1075
|
-
|
|
1076
|
-
|
|
1473
|
+
RelationshipType.CONTAINS_MODULE,
|
|
1474
|
+
NodeLabel.MODULE,
|
|
1077
1475
|
"qualified_name",
|
|
1078
1476
|
module_qn,
|
|
1079
1477
|
)
|
|
1080
1478
|
else:
|
|
1081
1479
|
# Parent is a folder
|
|
1082
1480
|
self.ingestor.ensure_relationship_batch(
|
|
1083
|
-
|
|
1481
|
+
NodeLabel.FOLDER,
|
|
1084
1482
|
"path",
|
|
1085
1483
|
str(parent_rel_path).replace(os.sep, "/"),
|
|
1086
|
-
|
|
1087
|
-
|
|
1484
|
+
RelationshipType.CONTAINS_MODULE,
|
|
1485
|
+
NodeLabel.MODULE,
|
|
1088
1486
|
"qualified_name",
|
|
1089
1487
|
module_qn,
|
|
1090
1488
|
)
|
|
@@ -1098,7 +1496,7 @@ class SimpleGraphBuilder:
|
|
|
1098
1496
|
|
|
1099
1497
|
# Track module
|
|
1100
1498
|
self.ingestor.ensure_tracks_relationship(
|
|
1101
|
-
relative_path_str,
|
|
1499
|
+
relative_path_str, NodeLabel.MODULE, module_qn
|
|
1102
1500
|
)
|
|
1103
1501
|
|
|
1104
1502
|
# Extract definitions
|
|
@@ -1142,7 +1540,7 @@ class SimpleGraphBuilder:
|
|
|
1142
1540
|
|
|
1143
1541
|
current_time = int(time.time())
|
|
1144
1542
|
self.ingestor.ensure_node_batch(
|
|
1145
|
-
|
|
1543
|
+
NodeLabel.CLASS,
|
|
1146
1544
|
{
|
|
1147
1545
|
"qualified_name": class_qn,
|
|
1148
1546
|
"name": class_name,
|
|
@@ -1156,26 +1554,23 @@ class SimpleGraphBuilder:
|
|
|
1156
1554
|
)
|
|
1157
1555
|
|
|
1158
1556
|
# Create DEFINES relationship
|
|
1159
|
-
logger.debug(
|
|
1160
|
-
f"Creating DEFINES relationship: Module({module_qn}) -> Class({class_qn})"
|
|
1161
|
-
)
|
|
1162
1557
|
self.ingestor.ensure_relationship_batch(
|
|
1163
|
-
|
|
1558
|
+
NodeLabel.MODULE,
|
|
1164
1559
|
"qualified_name",
|
|
1165
1560
|
module_qn,
|
|
1166
|
-
|
|
1167
|
-
|
|
1561
|
+
RelationshipType.DEFINES,
|
|
1562
|
+
NodeLabel.CLASS,
|
|
1168
1563
|
"qualified_name",
|
|
1169
1564
|
class_qn,
|
|
1170
1565
|
)
|
|
1171
1566
|
|
|
1172
1567
|
# Track class
|
|
1173
1568
|
self.ingestor.ensure_tracks_relationship(
|
|
1174
|
-
relative_path_str,
|
|
1569
|
+
relative_path_str, NodeLabel.CLASS, class_qn
|
|
1175
1570
|
)
|
|
1176
1571
|
|
|
1177
1572
|
# Register for lookup
|
|
1178
|
-
self.function_registry[class_qn] =
|
|
1573
|
+
self.function_registry[class_qn] = NodeLabel.CLASS
|
|
1179
1574
|
self.simple_name_lookup[class_name].add(class_qn)
|
|
1180
1575
|
|
|
1181
1576
|
# Extract inheritance
|
|
@@ -1187,7 +1582,6 @@ class SimpleGraphBuilder:
|
|
|
1187
1582
|
if "function_query" in lang_queries:
|
|
1188
1583
|
cursor = QueryCursor(lang_queries["function_query"])
|
|
1189
1584
|
matches = list(cursor.matches(root_node))
|
|
1190
|
-
logger.debug(f"Found {len(matches)} function matches in {filepath}")
|
|
1191
1585
|
for match in matches:
|
|
1192
1586
|
func_node = None
|
|
1193
1587
|
func_name = None
|
|
@@ -1201,11 +1595,6 @@ class SimpleGraphBuilder:
|
|
|
1201
1595
|
func_name = node.text.decode("utf-8")
|
|
1202
1596
|
|
|
1203
1597
|
if func_node and func_name:
|
|
1204
|
-
# Log what we found
|
|
1205
|
-
logger.debug(
|
|
1206
|
-
f"Found function: {func_name} at line {func_node.start_point.row + 1}"
|
|
1207
|
-
)
|
|
1208
|
-
|
|
1209
1598
|
# Check if this is a method inside a class
|
|
1210
1599
|
parent_class = self._find_parent_class(func_node, module_qn)
|
|
1211
1600
|
|
|
@@ -1219,7 +1608,7 @@ class SimpleGraphBuilder:
|
|
|
1219
1608
|
|
|
1220
1609
|
current_time = int(time.time())
|
|
1221
1610
|
self.ingestor.ensure_node_batch(
|
|
1222
|
-
|
|
1611
|
+
NodeLabel.METHOD,
|
|
1223
1612
|
{
|
|
1224
1613
|
"qualified_name": method_qn,
|
|
1225
1614
|
"name": func_name,
|
|
@@ -1234,22 +1623,22 @@ class SimpleGraphBuilder:
|
|
|
1234
1623
|
|
|
1235
1624
|
# Create DEFINES_METHOD relationship
|
|
1236
1625
|
self.ingestor.ensure_relationship_batch(
|
|
1237
|
-
|
|
1626
|
+
NodeLabel.CLASS,
|
|
1238
1627
|
"qualified_name",
|
|
1239
1628
|
parent_class,
|
|
1240
|
-
|
|
1241
|
-
|
|
1629
|
+
RelationshipType.DEFINES_METHOD,
|
|
1630
|
+
NodeLabel.METHOD,
|
|
1242
1631
|
"qualified_name",
|
|
1243
1632
|
method_qn,
|
|
1244
1633
|
)
|
|
1245
1634
|
|
|
1246
1635
|
# Track method
|
|
1247
1636
|
self.ingestor.ensure_tracks_relationship(
|
|
1248
|
-
relative_path_str,
|
|
1637
|
+
relative_path_str, NodeLabel.METHOD, method_qn
|
|
1249
1638
|
)
|
|
1250
1639
|
|
|
1251
1640
|
# Register for lookup
|
|
1252
|
-
self.function_registry[method_qn] =
|
|
1641
|
+
self.function_registry[method_qn] = NodeLabel.METHOD
|
|
1253
1642
|
self.simple_name_lookup[func_name].add(method_qn)
|
|
1254
1643
|
else:
|
|
1255
1644
|
# This is a standalone function
|
|
@@ -1261,7 +1650,7 @@ class SimpleGraphBuilder:
|
|
|
1261
1650
|
|
|
1262
1651
|
current_time = int(time.time())
|
|
1263
1652
|
self.ingestor.ensure_node_batch(
|
|
1264
|
-
|
|
1653
|
+
NodeLabel.FUNCTION,
|
|
1265
1654
|
{
|
|
1266
1655
|
"qualified_name": func_qn,
|
|
1267
1656
|
"name": func_name,
|
|
@@ -1276,22 +1665,22 @@ class SimpleGraphBuilder:
|
|
|
1276
1665
|
|
|
1277
1666
|
# Create DEFINES relationship
|
|
1278
1667
|
self.ingestor.ensure_relationship_batch(
|
|
1279
|
-
|
|
1668
|
+
NodeLabel.MODULE,
|
|
1280
1669
|
"qualified_name",
|
|
1281
1670
|
module_qn,
|
|
1282
|
-
|
|
1283
|
-
|
|
1671
|
+
RelationshipType.DEFINES_FUNC,
|
|
1672
|
+
NodeLabel.FUNCTION,
|
|
1284
1673
|
"qualified_name",
|
|
1285
1674
|
func_qn,
|
|
1286
1675
|
)
|
|
1287
1676
|
|
|
1288
1677
|
# Track function
|
|
1289
1678
|
self.ingestor.ensure_tracks_relationship(
|
|
1290
|
-
relative_path_str,
|
|
1679
|
+
relative_path_str, NodeLabel.FUNCTION, func_qn
|
|
1291
1680
|
)
|
|
1292
1681
|
|
|
1293
1682
|
# Register for lookup
|
|
1294
|
-
self.function_registry[func_qn] =
|
|
1683
|
+
self.function_registry[func_qn] = NodeLabel.FUNCTION
|
|
1295
1684
|
self.simple_name_lookup[func_name].add(func_qn)
|
|
1296
1685
|
|
|
1297
1686
|
def _extract_decorators(self, node: Node, language: str) -> list[str]:
|
|
@@ -1407,6 +1796,25 @@ class SimpleGraphBuilder:
|
|
|
1407
1796
|
|
|
1408
1797
|
def _process_relationships(self) -> None:
|
|
1409
1798
|
"""Third pass: Process function calls and imports."""
|
|
1799
|
+
# If parallel mode was used, relationships are already resolved
|
|
1800
|
+
# by ParallelExecutor during the definitions phase
|
|
1801
|
+
if self._parallel_mode_active:
|
|
1802
|
+
logger.info(
|
|
1803
|
+
"Skipping relationship processing "
|
|
1804
|
+
"(already resolved during parallel execution)"
|
|
1805
|
+
)
|
|
1806
|
+
# Report progress as complete for UI consistency
|
|
1807
|
+
total = len(self.function_registry)
|
|
1808
|
+
self._report_progress(
|
|
1809
|
+
"relationships",
|
|
1810
|
+
"Relationships resolved during parallel execution",
|
|
1811
|
+
total,
|
|
1812
|
+
total,
|
|
1813
|
+
phase_complete=True,
|
|
1814
|
+
)
|
|
1815
|
+
return
|
|
1816
|
+
|
|
1817
|
+
# Sequential mode - process relationships normally
|
|
1410
1818
|
# Process inheritance relationships first
|
|
1411
1819
|
self._process_inheritance()
|
|
1412
1820
|
|
|
@@ -1418,18 +1826,10 @@ class SimpleGraphBuilder:
|
|
|
1418
1826
|
f"Simple name lookup has {len(self.simple_name_lookup)} unique names"
|
|
1419
1827
|
)
|
|
1420
1828
|
|
|
1421
|
-
# Log some examples from simple_name_lookup
|
|
1422
|
-
if self.simple_name_lookup:
|
|
1423
|
-
example_names = list(self.simple_name_lookup.keys())[:5]
|
|
1424
|
-
for name in example_names:
|
|
1425
|
-
logger.debug(
|
|
1426
|
-
f" Example: '{name}' -> {list(self.simple_name_lookup[name])[:3]}"
|
|
1427
|
-
)
|
|
1428
|
-
|
|
1429
1829
|
file_count = 0
|
|
1430
1830
|
for filepath, (root_node, language) in self.ast_cache.items():
|
|
1431
1831
|
self._process_calls(filepath, root_node, language)
|
|
1432
|
-
#
|
|
1832
|
+
# TODO(future): Add import statement processing for IMPORTS relationships
|
|
1433
1833
|
|
|
1434
1834
|
file_count += 1
|
|
1435
1835
|
# Report progress after each file
|
|
@@ -1459,17 +1859,14 @@ class SimpleGraphBuilder:
|
|
|
1459
1859
|
if parent_qn in self.function_registry:
|
|
1460
1860
|
# Create INHERITS relationship
|
|
1461
1861
|
self.ingestor.ensure_relationship_batch(
|
|
1462
|
-
|
|
1862
|
+
NodeLabel.CLASS,
|
|
1463
1863
|
"qualified_name",
|
|
1464
1864
|
child_qn,
|
|
1465
|
-
|
|
1466
|
-
|
|
1865
|
+
RelationshipType.INHERITS,
|
|
1866
|
+
NodeLabel.CLASS,
|
|
1467
1867
|
"qualified_name",
|
|
1468
1868
|
parent_qn,
|
|
1469
1869
|
)
|
|
1470
|
-
logger.debug(
|
|
1471
|
-
f" Created inheritance: {child_qn} INHERITS {parent_qn}"
|
|
1472
|
-
)
|
|
1473
1870
|
else:
|
|
1474
1871
|
# Try to find parent by simple name lookup
|
|
1475
1872
|
parent_simple_name = parent_qn.split(".")[-1]
|
|
@@ -1481,21 +1878,14 @@ class SimpleGraphBuilder:
|
|
|
1481
1878
|
if len(possible_parents) == 1:
|
|
1482
1879
|
actual_parent_qn = list(possible_parents)[0]
|
|
1483
1880
|
self.ingestor.ensure_relationship_batch(
|
|
1484
|
-
|
|
1881
|
+
NodeLabel.CLASS,
|
|
1485
1882
|
"qualified_name",
|
|
1486
1883
|
child_qn,
|
|
1487
|
-
|
|
1488
|
-
|
|
1884
|
+
RelationshipType.INHERITS,
|
|
1885
|
+
NodeLabel.CLASS,
|
|
1489
1886
|
"qualified_name",
|
|
1490
1887
|
actual_parent_qn,
|
|
1491
1888
|
)
|
|
1492
|
-
logger.debug(
|
|
1493
|
-
f" Created inheritance: {child_qn} INHERITS {actual_parent_qn}"
|
|
1494
|
-
)
|
|
1495
|
-
else:
|
|
1496
|
-
logger.debug(
|
|
1497
|
-
f" Could not resolve parent class: {parent_qn} for {child_qn}"
|
|
1498
|
-
)
|
|
1499
1889
|
|
|
1500
1890
|
def _process_calls(self, filepath: Path, root_node: Node, language: str) -> None:
|
|
1501
1891
|
"""Process function calls in a file."""
|
|
@@ -1516,7 +1906,6 @@ class SimpleGraphBuilder:
|
|
|
1516
1906
|
# Find all call expressions
|
|
1517
1907
|
cursor = QueryCursor(lang_queries["call_query"])
|
|
1518
1908
|
matches = list(cursor.matches(root_node))
|
|
1519
|
-
logger.debug(f"Found {len(matches)} call matches in {filepath}")
|
|
1520
1909
|
for match in matches:
|
|
1521
1910
|
call_node = None
|
|
1522
1911
|
|
|
@@ -1555,36 +1944,23 @@ class SimpleGraphBuilder:
|
|
|
1555
1944
|
break
|
|
1556
1945
|
|
|
1557
1946
|
if not callee_name:
|
|
1558
|
-
logger.debug(
|
|
1559
|
-
f" Could not extract callee name from call at line {call_node.start_point[0]}"
|
|
1560
|
-
)
|
|
1561
1947
|
return
|
|
1562
1948
|
|
|
1563
|
-
logger.debug(f" Processing call to {callee_name} (object: {object_name})")
|
|
1564
|
-
|
|
1565
1949
|
# Find caller function
|
|
1566
1950
|
caller_qn = self._find_containing_function(call_node, module_qn)
|
|
1567
1951
|
if not caller_qn:
|
|
1568
|
-
logger.debug(
|
|
1569
|
-
f" Could not find containing function for call at line {call_node.start_point[0]}"
|
|
1570
|
-
)
|
|
1571
1952
|
return
|
|
1572
1953
|
|
|
1573
1954
|
# Get all possible callees
|
|
1574
1955
|
possible_callees = self.simple_name_lookup.get(callee_name, set())
|
|
1575
1956
|
if not possible_callees:
|
|
1576
|
-
logger.debug(f" No functions found with name: {callee_name}")
|
|
1577
1957
|
return
|
|
1578
1958
|
|
|
1579
|
-
logger.debug(
|
|
1580
|
-
f" Found {len(possible_callees)} possible callees for {callee_name}"
|
|
1581
|
-
)
|
|
1582
|
-
|
|
1583
1959
|
# Calculate confidence scores for each possible callee
|
|
1584
1960
|
scored_callees = []
|
|
1585
1961
|
for possible_qn in possible_callees:
|
|
1586
|
-
score =
|
|
1587
|
-
caller_qn, possible_qn, module_qn, object_name
|
|
1962
|
+
score = calculate_callee_confidence(
|
|
1963
|
+
caller_qn, possible_qn, module_qn, object_name, self.simple_name_lookup
|
|
1588
1964
|
)
|
|
1589
1965
|
scored_callees.append((possible_qn, score))
|
|
1590
1966
|
|
|
@@ -1610,93 +1986,6 @@ class SimpleGraphBuilder:
|
|
|
1610
1986
|
callee_qn,
|
|
1611
1987
|
)
|
|
1612
1988
|
|
|
1613
|
-
# Log with confidence information
|
|
1614
|
-
alternatives = len(scored_callees) - 1
|
|
1615
|
-
logger.info(
|
|
1616
|
-
f" Created CALLS relationship: {caller_qn} -> {callee_qn} (confidence: {confidence:.2f}, alternatives: {alternatives})"
|
|
1617
|
-
)
|
|
1618
|
-
|
|
1619
|
-
# If multiple alternatives exist with similar confidence, log them
|
|
1620
|
-
if alternatives > 0 and confidence < 1.0:
|
|
1621
|
-
similar_alternatives = [
|
|
1622
|
-
qn for qn, score in scored_callees[1:4] if score >= confidence * 0.8
|
|
1623
|
-
] # Top 3 alternatives # Within 80% of best score
|
|
1624
|
-
if similar_alternatives:
|
|
1625
|
-
logger.debug(
|
|
1626
|
-
f" Alternative matches: {', '.join(similar_alternatives)}"
|
|
1627
|
-
)
|
|
1628
|
-
else:
|
|
1629
|
-
logger.warning(
|
|
1630
|
-
f" Failed to create CALLS relationship - caller_type: {caller_type}, callee_type: {callee_type}"
|
|
1631
|
-
)
|
|
1632
|
-
|
|
1633
|
-
def _calculate_callee_confidence(
|
|
1634
|
-
self, caller_qn: str, callee_qn: str, module_qn: str, object_name: str | None
|
|
1635
|
-
) -> float:
|
|
1636
|
-
"""Calculate confidence score for a potential callee match.
|
|
1637
|
-
|
|
1638
|
-
Args:
|
|
1639
|
-
caller_qn: Qualified name of the calling function
|
|
1640
|
-
callee_qn: Qualified name of the potential callee
|
|
1641
|
-
module_qn: Qualified name of the current module
|
|
1642
|
-
object_name: Object name for method calls (e.g., 'obj' in obj.method())
|
|
1643
|
-
|
|
1644
|
-
Returns:
|
|
1645
|
-
Confidence score between 0.0 and 1.0
|
|
1646
|
-
"""
|
|
1647
|
-
score = 0.0
|
|
1648
|
-
|
|
1649
|
-
# 1. Module locality - functions in the same module are most likely
|
|
1650
|
-
if callee_qn.startswith(module_qn + "."):
|
|
1651
|
-
score += 0.5
|
|
1652
|
-
|
|
1653
|
-
# Even higher if in the same class
|
|
1654
|
-
caller_parts = caller_qn.split(".")
|
|
1655
|
-
callee_parts = callee_qn.split(".")
|
|
1656
|
-
if len(caller_parts) >= 3 and len(callee_parts) >= 3:
|
|
1657
|
-
if caller_parts[:-1] == callee_parts[:-1]: # Same class
|
|
1658
|
-
score += 0.2
|
|
1659
|
-
|
|
1660
|
-
# 2. Package locality - functions in the same package hierarchy
|
|
1661
|
-
elif "." in module_qn:
|
|
1662
|
-
package = module_qn.rsplit(".", 1)[0]
|
|
1663
|
-
if callee_qn.startswith(package + "."):
|
|
1664
|
-
score += 0.3
|
|
1665
|
-
|
|
1666
|
-
# 3. Object/class match for method calls
|
|
1667
|
-
if object_name:
|
|
1668
|
-
# Check if callee is a method of a class matching the object name
|
|
1669
|
-
callee_parts = callee_qn.split(".")
|
|
1670
|
-
if len(callee_parts) >= 2:
|
|
1671
|
-
# Simple heuristic: check if class name matches object name
|
|
1672
|
-
# (In reality, we'd need type inference for accuracy)
|
|
1673
|
-
class_name = callee_parts[-2]
|
|
1674
|
-
if class_name.lower() == object_name.lower():
|
|
1675
|
-
score += 0.3
|
|
1676
|
-
elif object_name == "self" and callee_qn.startswith(
|
|
1677
|
-
caller_qn.rsplit(".", 1)[0]
|
|
1678
|
-
):
|
|
1679
|
-
# 'self' refers to the same class
|
|
1680
|
-
score += 0.4
|
|
1681
|
-
|
|
1682
|
-
# 4. Import presence check (simplified - would need import tracking)
|
|
1683
|
-
# For now, we'll give a small boost to standard library functions
|
|
1684
|
-
if callee_qn.startswith(("builtins.", "typing.", "collections.")):
|
|
1685
|
-
score += 0.1
|
|
1686
|
-
|
|
1687
|
-
# 5. Name similarity for disambiguation
|
|
1688
|
-
# If function names are unique enough, boost confidence
|
|
1689
|
-
possible_count = len(
|
|
1690
|
-
self.simple_name_lookup.get(callee_qn.split(".")[-1], set())
|
|
1691
|
-
)
|
|
1692
|
-
if possible_count == 1:
|
|
1693
|
-
score += 0.2
|
|
1694
|
-
elif possible_count <= 3:
|
|
1695
|
-
score += 0.1
|
|
1696
|
-
|
|
1697
|
-
# Normalize to [0, 1]
|
|
1698
|
-
return min(score, 1.0)
|
|
1699
|
-
|
|
1700
1989
|
def _find_containing_function(self, node: Node, module_qn: str) -> str | None:
|
|
1701
1990
|
"""Find the containing function/method of a node."""
|
|
1702
1991
|
current = node.parent
|
|
@@ -1760,7 +2049,8 @@ class CodebaseIngestor:
|
|
|
1760
2049
|
self.project_name = repo_path_obj.name
|
|
1761
2050
|
|
|
1762
2051
|
try:
|
|
1763
|
-
# Create database
|
|
2052
|
+
# Create database (lazy import kuzu for Windows compatibility)
|
|
2053
|
+
kuzu = get_kuzu()
|
|
1764
2054
|
logger.info(f"Creating Kuzu database at: {self.db_path}")
|
|
1765
2055
|
db = kuzu.Database(str(self.db_path))
|
|
1766
2056
|
conn = kuzu.Connection(db)
|