emdash-core 0.1.7 (emdash_core-0.1.7-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emdash_core/__init__.py +3 -0
- emdash_core/agent/__init__.py +37 -0
- emdash_core/agent/agents.py +225 -0
- emdash_core/agent/code_reviewer.py +476 -0
- emdash_core/agent/compaction.py +143 -0
- emdash_core/agent/context_manager.py +140 -0
- emdash_core/agent/events.py +338 -0
- emdash_core/agent/handlers.py +224 -0
- emdash_core/agent/inprocess_subagent.py +377 -0
- emdash_core/agent/mcp/__init__.py +50 -0
- emdash_core/agent/mcp/client.py +346 -0
- emdash_core/agent/mcp/config.py +302 -0
- emdash_core/agent/mcp/manager.py +496 -0
- emdash_core/agent/mcp/tool_factory.py +213 -0
- emdash_core/agent/prompts/__init__.py +38 -0
- emdash_core/agent/prompts/main_agent.py +104 -0
- emdash_core/agent/prompts/subagents.py +131 -0
- emdash_core/agent/prompts/workflow.py +136 -0
- emdash_core/agent/providers/__init__.py +34 -0
- emdash_core/agent/providers/base.py +143 -0
- emdash_core/agent/providers/factory.py +80 -0
- emdash_core/agent/providers/models.py +220 -0
- emdash_core/agent/providers/openai_provider.py +463 -0
- emdash_core/agent/providers/transformers_provider.py +217 -0
- emdash_core/agent/research/__init__.py +81 -0
- emdash_core/agent/research/agent.py +143 -0
- emdash_core/agent/research/controller.py +254 -0
- emdash_core/agent/research/critic.py +428 -0
- emdash_core/agent/research/macros.py +469 -0
- emdash_core/agent/research/planner.py +449 -0
- emdash_core/agent/research/researcher.py +436 -0
- emdash_core/agent/research/state.py +523 -0
- emdash_core/agent/research/synthesizer.py +594 -0
- emdash_core/agent/reviewer_profile.py +475 -0
- emdash_core/agent/rules.py +123 -0
- emdash_core/agent/runner.py +601 -0
- emdash_core/agent/session.py +262 -0
- emdash_core/agent/spec_schema.py +66 -0
- emdash_core/agent/specification.py +479 -0
- emdash_core/agent/subagent.py +397 -0
- emdash_core/agent/subagent_prompts.py +13 -0
- emdash_core/agent/toolkit.py +482 -0
- emdash_core/agent/toolkits/__init__.py +64 -0
- emdash_core/agent/toolkits/base.py +96 -0
- emdash_core/agent/toolkits/explore.py +47 -0
- emdash_core/agent/toolkits/plan.py +55 -0
- emdash_core/agent/tools/__init__.py +141 -0
- emdash_core/agent/tools/analytics.py +436 -0
- emdash_core/agent/tools/base.py +131 -0
- emdash_core/agent/tools/coding.py +484 -0
- emdash_core/agent/tools/github_mcp.py +592 -0
- emdash_core/agent/tools/history.py +13 -0
- emdash_core/agent/tools/modes.py +153 -0
- emdash_core/agent/tools/plan.py +206 -0
- emdash_core/agent/tools/plan_write.py +135 -0
- emdash_core/agent/tools/search.py +412 -0
- emdash_core/agent/tools/spec.py +341 -0
- emdash_core/agent/tools/task.py +262 -0
- emdash_core/agent/tools/task_output.py +204 -0
- emdash_core/agent/tools/tasks.py +454 -0
- emdash_core/agent/tools/traversal.py +588 -0
- emdash_core/agent/tools/web.py +179 -0
- emdash_core/analytics/__init__.py +5 -0
- emdash_core/analytics/engine.py +1286 -0
- emdash_core/api/__init__.py +5 -0
- emdash_core/api/agent.py +308 -0
- emdash_core/api/agents.py +154 -0
- emdash_core/api/analyze.py +264 -0
- emdash_core/api/auth.py +173 -0
- emdash_core/api/context.py +77 -0
- emdash_core/api/db.py +121 -0
- emdash_core/api/embed.py +131 -0
- emdash_core/api/feature.py +143 -0
- emdash_core/api/health.py +93 -0
- emdash_core/api/index.py +162 -0
- emdash_core/api/plan.py +110 -0
- emdash_core/api/projectmd.py +210 -0
- emdash_core/api/query.py +320 -0
- emdash_core/api/research.py +122 -0
- emdash_core/api/review.py +161 -0
- emdash_core/api/router.py +76 -0
- emdash_core/api/rules.py +116 -0
- emdash_core/api/search.py +119 -0
- emdash_core/api/spec.py +99 -0
- emdash_core/api/swarm.py +223 -0
- emdash_core/api/tasks.py +109 -0
- emdash_core/api/team.py +120 -0
- emdash_core/auth/__init__.py +17 -0
- emdash_core/auth/github.py +389 -0
- emdash_core/config.py +74 -0
- emdash_core/context/__init__.py +52 -0
- emdash_core/context/models.py +50 -0
- emdash_core/context/providers/__init__.py +11 -0
- emdash_core/context/providers/base.py +74 -0
- emdash_core/context/providers/explored_areas.py +183 -0
- emdash_core/context/providers/touched_areas.py +360 -0
- emdash_core/context/registry.py +73 -0
- emdash_core/context/reranker.py +199 -0
- emdash_core/context/service.py +260 -0
- emdash_core/context/session.py +352 -0
- emdash_core/core/__init__.py +104 -0
- emdash_core/core/config.py +454 -0
- emdash_core/core/exceptions.py +55 -0
- emdash_core/core/models.py +265 -0
- emdash_core/core/review_config.py +57 -0
- emdash_core/db/__init__.py +67 -0
- emdash_core/db/auth.py +134 -0
- emdash_core/db/models.py +91 -0
- emdash_core/db/provider.py +222 -0
- emdash_core/db/providers/__init__.py +5 -0
- emdash_core/db/providers/supabase.py +452 -0
- emdash_core/embeddings/__init__.py +24 -0
- emdash_core/embeddings/indexer.py +534 -0
- emdash_core/embeddings/models.py +192 -0
- emdash_core/embeddings/providers/__init__.py +7 -0
- emdash_core/embeddings/providers/base.py +112 -0
- emdash_core/embeddings/providers/fireworks.py +141 -0
- emdash_core/embeddings/providers/openai.py +104 -0
- emdash_core/embeddings/registry.py +146 -0
- emdash_core/embeddings/service.py +215 -0
- emdash_core/graph/__init__.py +26 -0
- emdash_core/graph/builder.py +134 -0
- emdash_core/graph/connection.py +692 -0
- emdash_core/graph/schema.py +416 -0
- emdash_core/graph/writer.py +667 -0
- emdash_core/ingestion/__init__.py +7 -0
- emdash_core/ingestion/change_detector.py +150 -0
- emdash_core/ingestion/git/__init__.py +5 -0
- emdash_core/ingestion/git/commit_analyzer.py +196 -0
- emdash_core/ingestion/github/__init__.py +6 -0
- emdash_core/ingestion/github/pr_fetcher.py +296 -0
- emdash_core/ingestion/github/task_extractor.py +100 -0
- emdash_core/ingestion/orchestrator.py +540 -0
- emdash_core/ingestion/parsers/__init__.py +10 -0
- emdash_core/ingestion/parsers/base_parser.py +66 -0
- emdash_core/ingestion/parsers/call_graph_builder.py +121 -0
- emdash_core/ingestion/parsers/class_extractor.py +154 -0
- emdash_core/ingestion/parsers/function_extractor.py +202 -0
- emdash_core/ingestion/parsers/import_analyzer.py +119 -0
- emdash_core/ingestion/parsers/python_parser.py +123 -0
- emdash_core/ingestion/parsers/registry.py +72 -0
- emdash_core/ingestion/parsers/ts_ast_parser.js +313 -0
- emdash_core/ingestion/parsers/typescript_parser.py +278 -0
- emdash_core/ingestion/repository.py +346 -0
- emdash_core/models/__init__.py +38 -0
- emdash_core/models/agent.py +68 -0
- emdash_core/models/index.py +77 -0
- emdash_core/models/query.py +113 -0
- emdash_core/planning/__init__.py +7 -0
- emdash_core/planning/agent_api.py +413 -0
- emdash_core/planning/context_builder.py +265 -0
- emdash_core/planning/feature_context.py +232 -0
- emdash_core/planning/feature_expander.py +646 -0
- emdash_core/planning/llm_explainer.py +198 -0
- emdash_core/planning/similarity.py +509 -0
- emdash_core/planning/team_focus.py +821 -0
- emdash_core/server.py +153 -0
- emdash_core/sse/__init__.py +5 -0
- emdash_core/sse/stream.py +196 -0
- emdash_core/swarm/__init__.py +17 -0
- emdash_core/swarm/merge_agent.py +383 -0
- emdash_core/swarm/session_manager.py +274 -0
- emdash_core/swarm/swarm_runner.py +226 -0
- emdash_core/swarm/task_definition.py +137 -0
- emdash_core/swarm/worker_spawner.py +319 -0
- emdash_core/swarm/worktree_manager.py +278 -0
- emdash_core/templates/__init__.py +10 -0
- emdash_core/templates/defaults/agent-builder.md.template +82 -0
- emdash_core/templates/defaults/focus.md.template +115 -0
- emdash_core/templates/defaults/pr-review-enhanced.md.template +309 -0
- emdash_core/templates/defaults/pr-review.md.template +80 -0
- emdash_core/templates/defaults/project.md.template +85 -0
- emdash_core/templates/defaults/research_critic.md.template +112 -0
- emdash_core/templates/defaults/research_planner.md.template +85 -0
- emdash_core/templates/defaults/research_synthesizer.md.template +128 -0
- emdash_core/templates/defaults/reviewer.md.template +81 -0
- emdash_core/templates/defaults/spec.md.template +41 -0
- emdash_core/templates/defaults/tasks.md.template +78 -0
- emdash_core/templates/loader.py +296 -0
- emdash_core/utils/__init__.py +45 -0
- emdash_core/utils/git.py +84 -0
- emdash_core/utils/image.py +502 -0
- emdash_core/utils/logger.py +51 -0
- emdash_core-0.1.7.dist-info/METADATA +35 -0
- emdash_core-0.1.7.dist-info/RECORD +187 -0
- emdash_core-0.1.7.dist-info/WHEEL +4 -0
- emdash_core-0.1.7.dist-info/entry_points.txt +3 -0

emdash_core/ingestion/github/task_extractor.py
@@ -0,0 +1,100 @@
+"""Extract subtasks from PR descriptions."""
+
+import re
+from typing import Optional
+
+from ...core.models import PullRequestEntity, TaskEntity
+from ...utils.logger import log
+
+
+class TaskExtractor:
+    """Extracts subtasks from PR descriptions (markdown checkboxes)."""
+
+    # Pattern to match markdown checkboxes: - [ ] or - [x] or * [ ] etc.
+    CHECKBOX_PATTERN = re.compile(
+        r"^[\s]*[-*]\s*\[([ xX])\]\s*(.+?)$",
+        re.MULTILINE
+    )
+
+    def extract_tasks(self, pr: PullRequestEntity) -> list[TaskEntity]:
+        """Extract tasks from a PR description.
+
+        Args:
+            pr: PullRequestEntity with description
+
+        Returns:
+            List of TaskEntity objects extracted from checkboxes
+        """
+        if not pr.description:
+            return []
+
+        tasks = []
+        matches = self.CHECKBOX_PATTERN.findall(pr.description)
+
+        for index, (checkbox, description) in enumerate(matches):
+            is_completed = checkbox.lower() == "x"
+            description = description.strip()
+
+            # Skip empty descriptions
+            if not description:
+                continue
+
+            task = TaskEntity(
+                id=f"pr_{pr.number}_task_{index}",
+                pr_number=pr.number,
+                description=description,
+                is_completed=is_completed,
+                order=index,
+            )
+            tasks.append(task)
+
+        if tasks:
+            log.debug(f"Extracted {len(tasks)} tasks from PR #{pr.number}")
+
+        return tasks
+
+    def extract_tasks_from_prs(
+        self,
+        prs: list[PullRequestEntity],
+    ) -> list[TaskEntity]:
+        """Extract tasks from multiple PRs.
+
+        Args:
+            prs: List of PullRequestEntity objects
+
+        Returns:
+            List of all TaskEntity objects from all PRs
+        """
+        all_tasks = []
+
+        for pr in prs:
+            tasks = self.extract_tasks(pr)
+            all_tasks.extend(tasks)
+
+        log.info(f"Extracted {len(all_tasks)} tasks from {len(prs)} PRs")
+        return all_tasks
+
+
+def extract_task_summary(description: str) -> Optional[str]:
+    """Extract a summary from a task description.
+
+    Useful for getting a short version of long task descriptions.
+
+    Args:
+        description: Full task description
+
+    Returns:
+        Shortened summary (first sentence or first N chars)
+    """
+    if not description:
+        return None
+
+    # Get first sentence
+    first_sentence = description.split(".")[0].strip()
+
+    # If still too long, truncate
+    max_length = 100
+    if len(first_sentence) > max_length:
+        return first_sentence[:max_length - 3] + "..."
+
+    return first_sentence
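
To illustrate how the checkbox pattern above behaves, here is a small standalone sketch that applies the same regex to a made-up PR description; the sample text, variable names, and printed output are illustrative only and not part of the package.

import re

# Same pattern as TaskExtractor.CHECKBOX_PATTERN above.
CHECKBOX_PATTERN = re.compile(r"^[\s]*[-*]\s*\[([ xX])\]\s*(.+?)$", re.MULTILINE)

# Hypothetical PR description used only for this example.
sample_description = """
Implements the new parser registry.

- [x] Add registry lookup
- [ ] Add TypeScript support
* [ ] Update docs
"""

for index, (checkbox, text) in enumerate(CHECKBOX_PATTERN.findall(sample_description)):
    done = checkbox.lower() == "x"
    print(f"task {index}: done={done} text={text.strip()}")
# Expected output:
# task 0: done=True text=Add registry lookup
# task 1: done=False text=Add TypeScript support
# task 2: done=False text=Update docs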

emdash_core/ingestion/orchestrator.py
@@ -0,0 +1,540 @@
+"""Orchestrates the complete ingestion pipeline."""
+
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from pathlib import Path
+from typing import Callable, Optional
+
+from tqdm import tqdm
+
+from ..core.config import get_config
+from ..core.models import CodebaseEntities, FileEntities, GitData
+from ..graph.connection import get_connection, write_lock_context
+from ..graph.builder import GraphBuilder
+from ..graph.schema import SchemaManager
+from .repository import RepositoryManager
+from .parsers.registry import ParserRegistry
+from .parsers.base_parser import BaseLanguageParser
+from .git.commit_analyzer import CommitAnalyzer
+from .github.pr_fetcher import PRFetcher
+from .github.task_extractor import TaskExtractor
+from .change_detector import ChangeDetector, ChangedFiles
+from ..graph.writer import GraphWriter
+from ..utils.logger import log
+
+
+def _parse_file_worker(file_path: Path, repo_root: Path) -> FileEntities:
+    """Worker function for parallel file parsing.
+
+    This is a module-level function so it can be pickled for ProcessPoolExecutor.
+
+    Args:
+        file_path: Path to source file
+        repo_root: Repository root path
+
+    Returns:
+        FileEntities
+    """
+    # Get parser from registry
+    parser_class = ParserRegistry.get_parser(file_path)
+
+    if parser_class is None:
+        log.warning(f"No parser available for {file_path.suffix}")
+        return FileEntities()
+
+    try:
+        parser = parser_class(file_path, repo_root)
+        return parser.parse()
+    except Exception as e:
+        log.warning(f"Failed to parse {file_path}: {e}")
+        return FileEntities()
+
+
+class IngestionOrchestrator:
+    """Coordinates the full ingestion pipeline."""
+
+    def __init__(self):
+        """Initialize orchestrator."""
+        self.config = get_config()
+        self.repo_manager = RepositoryManager()
+        self.connection = get_connection()
+        self.graph_builder = GraphBuilder(
+            self.connection,
+            batch_size=self.config.ingestion.batch_size
+        )
+
+    def index(
+        self,
+        repo_path: str,
+        incremental: bool = False,
+        changed_only: bool = False,
+        skip_git: Optional[bool] = None,
+        pr_limit: int = 100,
+        progress_callback: Optional[Callable[[str, float], None]] = None,
+    ):
+        """Execute the full indexing pipeline.
+
+        Args:
+            repo_path: URL or local path to repository
+            incremental: Whether to perform incremental update (legacy)
+            changed_only: Only index files changed since last index (uses git diff)
+            skip_git: Whether to skip git history analysis (and GitHub analysis).
+                Defaults to config.ingestion.ast_only (AST_ONLY env var).
+            pr_limit: Maximum number of PRs to fetch per state (open/merged)
+            progress_callback: Optional callback(step: str, percent: float) for progress
+
+        Raises:
+            Various exceptions if indexing fails
+        """
+        # Use config ast_only as default if skip_git not explicitly set
+        if skip_git is None:
+            skip_git = self.config.ingestion.ast_only
+
+        # Store callback for use in parsing
+        self._progress_callback = progress_callback
+
+        # Acquire write lock for the entire indexing operation
+        with write_lock_context("indexing", timeout=120):
+            self._run_index(repo_path, incremental, changed_only, skip_git, pr_limit)
+
+    def _run_index(
+        self,
+        repo_path: str,
+        incremental: bool,
+        changed_only: bool,
+        skip_git: bool,
+        pr_limit: int,
+    ):
+        """Internal indexing implementation (called with write lock held)."""
+        log.info(f"Starting indexing: {repo_path}")
+        if changed_only:
+            log.info("Incremental mode: only indexing changed files")
+        if skip_git:
+            log.info("AST_ONLY mode: skipping Layer B (git) and Layer C (analytics)")
+
+        # Ensure Kuzu connection and schema
+        self._ensure_database_ready()
+
+        # Step 1: Get or clone repository
+        log.info("Step 1: Fetching repository...")
+        repo, repo_entity = self.repo_manager.get_or_clone(
+            repo_path,
+            skip_commit_count=skip_git
+        )
+
+        # For incremental indexing, detect changes
+        changed_files: Optional[ChangedFiles] = None
+        current_commit = None
+        if changed_only:
+            last_indexed_commit = self._get_last_indexed_commit(repo_entity.url)
+            current_commit = repo.head.commit.hexsha
+
+            if not last_indexed_commit:
+                log.info("No previous index found - performing full index")
+                changed_only = False
+            elif last_indexed_commit == current_commit:
+                log.info("No changes since last index - nothing to do")
+                return
+            else:
+                extensions = ParserRegistry.get_all_extensions()
+                detector = ChangeDetector(repo, last_indexed_commit)
+                changed_files = detector.get_changed_files(extensions)
+
+                if not changed_files:
+                    log.info("No relevant file changes detected - nothing to do")
+                    return
+
+                log.info(f"Changes detected: {changed_files.total_changes} files")
+
+        # Step 2: Parse codebase (Layer A)
+        if changed_only and changed_files:
+            log.info("Step 2: Parsing changed files (Layer A)...")
+
+            # Delete removed files from graph first
+            if changed_files.deleted:
+                deleted_paths = [str(p) for p in changed_files.deleted]
+                self.graph_builder.delete_files(deleted_paths)
+
+            # Parse only added and modified files
+            entities = self._parse_codebase(repo, file_filter=changed_files.all_to_index)
+        else:
+            log.info("Step 2: Parsing codebase (Layer A)...")
+            entities = self._parse_codebase(repo)
+
+        # Step 3: Extract git history (Layer B)
+        if skip_git:
+            log.info("Step 3: Skipping git history (code-only mode)")
+            git_data = GitData(repository=repo_entity)
+        else:
+            log.info("Step 3: Analyzing git history (Layer B)...")
+            git_data = self._analyze_git_history(repo, repo_entity)
+
+        # Step 4: Build graph
+        log.info("Step 4: Building graph...")
+        self.graph_builder.build_code_graph(entities)
+        if not skip_git:
+            self.graph_builder.build_git_graph(git_data)
+
+        # Step 5: Fetch PRs from GitHub (if token available and not skipped)
+        prs = []
+        tasks = []
+        if not skip_git and self.config.github.is_available and repo_entity.owner and pr_limit > 0:
+            log.info(f"Step 5: Fetching pull requests from GitHub (limit: {pr_limit} per state)...")
+            prs, tasks = self._fetch_pull_requests(repo_entity, pr_limit=pr_limit)
+        else:
+            if skip_git:
+                log.info("Step 5: Skipping GitHub layer (code-only mode)")
+            elif not self.config.github.is_available:
+                log.info("Step 5: Skipping PR fetch (no GitHub token)")
+            elif pr_limit == 0:
+                log.info("Step 5: Skipping PR fetch (pr_limit=0)")
+            else:
+                log.info("Step 5: Skipping PR fetch (not a GitHub repo)")
+
+        # Step 6: Update repository metadata
+        log.info("Step 6: Updating repository metadata...")
+        self._update_repository_metadata(
+            repo_entity,
+            len(entities.files),
+            last_indexed_commit=current_commit or repo.head.commit.hexsha
+        )
+
+        log.info("✓ Indexing complete!")
+        self._print_summary(entities, git_data, prs)
+
+    def _ensure_database_ready(self):
+        """Ensure database connection and schema are ready."""
+        log.info("Connecting to Kuzu and ensuring schema...")
+
+        self.connection.connect()
+
+        # Initialize schema if needed
+        schema_manager = SchemaManager(self.connection)
+
+        # Check if schema exists (check for tables)
+        try:
+            result = self.connection.execute("CALL show_tables()")
+            if not result:
+                log.info("Schema not found, initializing...")
+                schema_manager.initialize_schema()
+        except Exception:
+            # If we can't check tables, try to initialize anyway
+            schema_manager.initialize_schema()
+
+    def _get_last_indexed_commit(self, repo_url: str) -> Optional[str]:
+        """Get the commit SHA from the last successful index.
+
+        Args:
+            repo_url: Repository URL
+
+        Returns:
+            Commit SHA or None if not found
+        """
+        try:
+            result = self.connection.execute(
+                "MATCH (r:Repository {url: $url}) RETURN r.last_indexed_commit",
+                {"url": repo_url}
+            )
+            if result and result.has_next():
+                row = result.get_next()
+                return row[0] if row[0] else None
+            return None
+        except Exception as e:
+            log.debug(f"Could not get last indexed commit: {e}")
+            return None
+
+    def _parse_codebase(
+        self,
+        repo,
+        file_filter: Optional[list[Path]] = None
+    ) -> CodebaseEntities:
+        """Parse source files in the repository.
+
+        Args:
+            repo: Git repository
+            file_filter: Optional list of specific files to parse (for incremental mode)
+
+        Returns:
+            CodebaseEntities
+        """
+        repo_root = Path(repo.working_dir)
+
+        # Use file filter if provided (incremental mode)
+        if file_filter is not None:
+            source_files = [f for f in file_filter if f.exists()]
+            if not source_files:
+                log.info("No files to parse (all filtered files may have been deleted)")
+                return CodebaseEntities()
+        else:
+            # Get all supported extensions from registry
+            extensions = ParserRegistry.get_all_extensions()
+
+            if not extensions:
+                log.warning("No parsers registered")
+                return CodebaseEntities()
+
+            # Get source files (multi-language)
+            source_files = self.repo_manager.get_source_files(
+                repo,
+                extensions,
+                self.config.ingestion.ignore_patterns
+            )
+
+            if not source_files:
+                log.warning(f"No source files found (supported: {', '.join(extensions)})")
+                return CodebaseEntities()
+
+        log.info(f"Parsing {len(source_files)} source files...")
+
+        # Parse files (with optional parallelization)
+        if self.config.ingestion.max_workers > 1:
+            entities = self._parse_files_parallel(source_files, repo_root)
+        else:
+            entities = self._parse_files_sequential(source_files, repo_root)
+
+        log.info(f"Extracted: {len(entities.files)} files, {len(entities.classes)} classes, "
+                 f"{len(entities.functions)} functions")
+
+        return entities
+
+    def _parse_single_file(self, file_path: Path, repo_root: Path) -> FileEntities:
+        """Parse a single file using appropriate parser.
+
+        Args:
+            file_path: Path to source file
+            repo_root: Repository root path
+
+        Returns:
+            FileEntities
+        """
+        # Get parser from registry
+        parser_class = ParserRegistry.get_parser(file_path)
+
+        if parser_class is None:
+            log.warning(f"No parser available for {file_path.suffix}")
+            return FileEntities()
+
+        try:
+            parser = parser_class(file_path, repo_root)
+            return parser.parse()
+        except Exception as e:
+            log.warning(f"Failed to parse {file_path}: {e}")
+            return FileEntities()
+
+    def _parse_files_sequential(
+        self,
+        source_files: list[Path],
+        repo_root: Path
+    ) -> CodebaseEntities:
+        """Parse files sequentially with progress bar.
+
+        Args:
+            source_files: List of source files to parse
+            repo_root: Repository root path
+
+        Returns:
+            CodebaseEntities
+        """
+        results = []
+        total = len(source_files)
+        last_reported_percent = 0
+
+        for i, file_path in enumerate(tqdm(source_files, desc="Parsing", unit="file")):
+            file_entities = self._parse_single_file(file_path, repo_root)
+            results.append(file_entities)
+
+            # Report progress every 10%
+            if total > 0 and hasattr(self, '_progress_callback') and self._progress_callback:
+                current_percent = int((i + 1) / total * 100)
+                # Report at 10% intervals (10, 20, 30, etc.)
+                if current_percent >= last_reported_percent + 10:
+                    last_reported_percent = (current_percent // 10) * 10
+                    # Map parsing progress (0-100) to overall progress (10-70)
+                    overall_percent = 10 + (last_reported_percent * 0.6)
+                    self._progress_callback(f"Parsing files ({last_reported_percent}%)", overall_percent)
+
+        return CodebaseEntities.merge(results)
+
+    def _parse_files_parallel(
+        self,
+        source_files: list[Path],
+        repo_root: Path
+    ) -> CodebaseEntities:
+        """Parse files in parallel with progress bar.
+
+        Args:
+            source_files: List of source files to parse
+            repo_root: Repository root path
+
+        Returns:
+            CodebaseEntities
+        """
+        results = []
+        total = len(source_files)
+        last_reported_percent = 0
+        completed = 0
+
+        with ProcessPoolExecutor(max_workers=self.config.ingestion.max_workers) as executor:
+            # Submit all tasks
+            futures = {
+                executor.submit(_parse_file_worker, file_path, repo_root): file_path
+                for file_path in source_files
+            }
+
+            # Process results with progress bar
+            for future in tqdm(
+                as_completed(futures),
+                total=len(futures),
+                desc="Parsing",
+                unit="file"
+            ):
+                try:
+                    file_entities = future.result()
+                    results.append(file_entities)
+                except Exception as e:
+                    file_path = futures[future]
+                    log.warning(f"Failed to parse {file_path}: {e}")
+
+                # Report progress every 10%
+                completed += 1
+                if total > 0 and hasattr(self, '_progress_callback') and self._progress_callback:
+                    current_percent = int(completed / total * 100)
+                    if current_percent >= last_reported_percent + 10:
+                        last_reported_percent = (current_percent // 10) * 10
+                        overall_percent = 10 + (last_reported_percent * 0.6)
+                        self._progress_callback(f"Parsing files ({last_reported_percent}%)", overall_percent)
+
+        return CodebaseEntities.merge(results)
+
+    def _analyze_git_history(self, repo, repo_entity):
+        """Analyze Git commit history.
+
+        Args:
+            repo: Git repository
+            repo_entity: Repository entity
+
+        Returns:
+            GitData
+        """
+        analyzer = CommitAnalyzer(repo, max_commits=self.config.ingestion.git_depth)
+        return analyzer.analyze(repo_entity)
+
+    def _update_repository_metadata(
+        self,
+        repo_entity,
+        file_count: int,
+        last_indexed_commit: Optional[str] = None
+    ):
+        """Update repository metadata in the graph.
+
+        Args:
+            repo_entity: Repository entity
+            file_count: Number of files processed
+            last_indexed_commit: Commit SHA at time of indexing
+        """
+        from datetime import datetime
+
+        query = """
+        MERGE (r:Repository {url: $url})
+        SET r.name = $name,
+            r.owner = $owner,
+            r.default_branch = $default_branch,
+            r.last_ingested = timestamp($last_ingested),
+            r.last_indexed_commit = $last_indexed_commit,
+            r.ingestion_status = 'completed',
+            r.commit_count = $commit_count,
+            r.file_count = $file_count,
+            r.primary_language = 'Python'
+        """
+
+        self.connection.execute_write(
+            query,
+            {
+                "url": repo_entity.url,
+                "name": repo_entity.name,
+                "owner": repo_entity.owner,
+                "default_branch": repo_entity.default_branch,
+                "last_ingested": datetime.now().isoformat(),
+                "last_indexed_commit": last_indexed_commit,
+                "commit_count": repo_entity.commit_count,
+                "file_count": file_count,
+            }
+        )
+
+    def _fetch_pull_requests(self, repo_entity, pr_limit: int = 100):
+        """Fetch pull requests from GitHub.
+
+        Args:
+            repo_entity: Repository entity with owner and name
+            pr_limit: Maximum PRs to fetch per state (open/merged)
+
+        Returns:
+            Tuple of (list[PullRequestEntity], list[TaskEntity])
+        """
+        try:
+            # Initialize fetcher and get PRs
+            fetcher = PRFetcher(
+                owner=repo_entity.owner,
+                repo=repo_entity.name,
+                token=self.config.github.token
+            )
+
+            # Fetch open and merged PRs separately with the limit
+            open_prs = fetcher.fetch_prs(state="open", limit=pr_limit)
+            merged_prs = fetcher.fetch_prs(state="merged", limit=pr_limit)
+            prs = open_prs + merged_prs
+            log.info(f"Fetched {len(open_prs)} open + {len(merged_prs)} merged PRs")
+
+            if not prs:
+                log.info("No pull requests found")
+                return [], []
+
+            # Extract tasks from PR descriptions
+            extractor = TaskExtractor()
+            all_tasks = []
+            for pr in prs:
+                tasks = extractor.extract_tasks(pr)
+                all_tasks.extend(tasks)
+
+            if all_tasks:
+                log.info(f"Extracted {len(all_tasks)} tasks from PR descriptions")
+
+            # Write to graph using connection directly
+            writer = GraphWriter(self.connection)
+            writer.write_pull_requests(prs)
+            writer.write_tasks(all_tasks)
+
+            # Link PRs to commits and files
+            for pr in prs:
+                if pr.commit_shas:
+                    writer.write_pr_commit_links(pr.number, pr.commit_shas)
+                if pr.files_changed:
+                    writer.write_pr_file_links(pr.number, pr.files_changed)
+
+            return prs, all_tasks
+
+        except Exception as e:
+            log.warning(f"Failed to fetch pull requests: {e}")
+            return [], []
+
+    def _print_summary(self, entities: CodebaseEntities, git_data, prs=None):
+        """Print indexing summary.
+
+        Args:
+            entities: Codebase entities
+            git_data: Git data
+            prs: Optional list of pull requests
+        """
+        log.info("\n" + "=" * 60)
+        log.info("INDEXING SUMMARY")
+        log.info("=" * 60)
+        log.info(f"Files: {len(entities.files)}")
+        log.info(f"Classes: {len(entities.classes)}")
+        log.info(f"Functions: {len(entities.functions)}")
+        log.info(f"Modules: {len(entities.modules)}")
+        log.info(f"Commits: {len(git_data.commits)}")
+        log.info(f"Authors: {len(git_data.authors)}")
+        if prs:
+            log.info(f"PRs: {len(prs)}")
+        else:
+            log.info("PRs: 0 (GitHub layer skipped)")
+        log.info("=" * 60)
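
For orientation, here is a minimal sketch of how the pipeline above might be driven, assuming the package's surrounding configuration (Kuzu database, parser registration, optional GitHub token) is already in place; the repository path and callback are placeholders, not part of the package.

from emdash_core.ingestion.orchestrator import IngestionOrchestrator

def report(step: str, percent: float) -> None:
    # Illustrative progress sink; the orchestrator calls it as callback(step, percent).
    print(f"[{percent:5.1f}%] {step}")

orchestrator = IngestionOrchestrator()

# Full index of a local checkout, skipping the git/GitHub layers (AST only);
# pr_limit would cap PRs per state if the GitHub layer were enabled.
orchestrator.index(
    repo_path="/path/to/repo",  # placeholder path
    changed_only=False,
    skip_git=True,
    pr_limit=50,
    progress_callback=report,
)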

emdash_core/ingestion/parsers/__init__.py
@@ -0,0 +1,10 @@
+"""Language parsers for code extraction."""
+
+from .base_parser import BaseLanguageParser
+from .registry import ParserRegistry
+from .python_parser import PythonParser
+from .typescript_parser import TypeScriptParser
+
+# Parsers auto-register when imported above
+
+__all__ = ['BaseLanguageParser', 'ParserRegistry', 'PythonParser', 'TypeScriptParser']
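
A short sketch of the registry dispatch that the orchestrator's worker relies on: ParserRegistry.get_parser returns a parser class (or None) for a given file, and the class is instantiated with the file path and repository root before calling parse(). Importing the package registers the bundled Python and TypeScript parsers, per the comment above; the paths below are placeholders.

from pathlib import Path

from emdash_core.ingestion.parsers import ParserRegistry

repo_root = Path("/path/to/repo")          # placeholder repository root
file_path = repo_root / "src" / "app.py"   # placeholder source file

parser_class = ParserRegistry.get_parser(file_path)
if parser_class is None:
    print(f"No parser registered for {file_path.suffix}")
else:
    # Mirrors _parse_file_worker in orchestrator.py: instantiate and parse.
    entities = parser_class(file_path, repo_root).parse()
    print(type(entities).__name__)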