emdash-core 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. emdash_core/__init__.py +3 -0
  2. emdash_core/agent/__init__.py +37 -0
  3. emdash_core/agent/agents.py +225 -0
  4. emdash_core/agent/code_reviewer.py +476 -0
  5. emdash_core/agent/compaction.py +143 -0
  6. emdash_core/agent/context_manager.py +140 -0
  7. emdash_core/agent/events.py +338 -0
  8. emdash_core/agent/handlers.py +224 -0
  9. emdash_core/agent/inprocess_subagent.py +377 -0
  10. emdash_core/agent/mcp/__init__.py +50 -0
  11. emdash_core/agent/mcp/client.py +346 -0
  12. emdash_core/agent/mcp/config.py +302 -0
  13. emdash_core/agent/mcp/manager.py +496 -0
  14. emdash_core/agent/mcp/tool_factory.py +213 -0
  15. emdash_core/agent/prompts/__init__.py +38 -0
  16. emdash_core/agent/prompts/main_agent.py +104 -0
  17. emdash_core/agent/prompts/subagents.py +131 -0
  18. emdash_core/agent/prompts/workflow.py +136 -0
  19. emdash_core/agent/providers/__init__.py +34 -0
  20. emdash_core/agent/providers/base.py +143 -0
  21. emdash_core/agent/providers/factory.py +80 -0
  22. emdash_core/agent/providers/models.py +220 -0
  23. emdash_core/agent/providers/openai_provider.py +463 -0
  24. emdash_core/agent/providers/transformers_provider.py +217 -0
  25. emdash_core/agent/research/__init__.py +81 -0
  26. emdash_core/agent/research/agent.py +143 -0
  27. emdash_core/agent/research/controller.py +254 -0
  28. emdash_core/agent/research/critic.py +428 -0
  29. emdash_core/agent/research/macros.py +469 -0
  30. emdash_core/agent/research/planner.py +449 -0
  31. emdash_core/agent/research/researcher.py +436 -0
  32. emdash_core/agent/research/state.py +523 -0
  33. emdash_core/agent/research/synthesizer.py +594 -0
  34. emdash_core/agent/reviewer_profile.py +475 -0
  35. emdash_core/agent/rules.py +123 -0
  36. emdash_core/agent/runner.py +601 -0
  37. emdash_core/agent/session.py +262 -0
  38. emdash_core/agent/spec_schema.py +66 -0
  39. emdash_core/agent/specification.py +479 -0
  40. emdash_core/agent/subagent.py +397 -0
  41. emdash_core/agent/subagent_prompts.py +13 -0
  42. emdash_core/agent/toolkit.py +482 -0
  43. emdash_core/agent/toolkits/__init__.py +64 -0
  44. emdash_core/agent/toolkits/base.py +96 -0
  45. emdash_core/agent/toolkits/explore.py +47 -0
  46. emdash_core/agent/toolkits/plan.py +55 -0
  47. emdash_core/agent/tools/__init__.py +141 -0
  48. emdash_core/agent/tools/analytics.py +436 -0
  49. emdash_core/agent/tools/base.py +131 -0
  50. emdash_core/agent/tools/coding.py +484 -0
  51. emdash_core/agent/tools/github_mcp.py +592 -0
  52. emdash_core/agent/tools/history.py +13 -0
  53. emdash_core/agent/tools/modes.py +153 -0
  54. emdash_core/agent/tools/plan.py +206 -0
  55. emdash_core/agent/tools/plan_write.py +135 -0
  56. emdash_core/agent/tools/search.py +412 -0
  57. emdash_core/agent/tools/spec.py +341 -0
  58. emdash_core/agent/tools/task.py +262 -0
  59. emdash_core/agent/tools/task_output.py +204 -0
  60. emdash_core/agent/tools/tasks.py +454 -0
  61. emdash_core/agent/tools/traversal.py +588 -0
  62. emdash_core/agent/tools/web.py +179 -0
  63. emdash_core/analytics/__init__.py +5 -0
  64. emdash_core/analytics/engine.py +1286 -0
  65. emdash_core/api/__init__.py +5 -0
  66. emdash_core/api/agent.py +308 -0
  67. emdash_core/api/agents.py +154 -0
  68. emdash_core/api/analyze.py +264 -0
  69. emdash_core/api/auth.py +173 -0
  70. emdash_core/api/context.py +77 -0
  71. emdash_core/api/db.py +121 -0
  72. emdash_core/api/embed.py +131 -0
  73. emdash_core/api/feature.py +143 -0
  74. emdash_core/api/health.py +93 -0
  75. emdash_core/api/index.py +162 -0
  76. emdash_core/api/plan.py +110 -0
  77. emdash_core/api/projectmd.py +210 -0
  78. emdash_core/api/query.py +320 -0
  79. emdash_core/api/research.py +122 -0
  80. emdash_core/api/review.py +161 -0
  81. emdash_core/api/router.py +76 -0
  82. emdash_core/api/rules.py +116 -0
  83. emdash_core/api/search.py +119 -0
  84. emdash_core/api/spec.py +99 -0
  85. emdash_core/api/swarm.py +223 -0
  86. emdash_core/api/tasks.py +109 -0
  87. emdash_core/api/team.py +120 -0
  88. emdash_core/auth/__init__.py +17 -0
  89. emdash_core/auth/github.py +389 -0
  90. emdash_core/config.py +74 -0
  91. emdash_core/context/__init__.py +52 -0
  92. emdash_core/context/models.py +50 -0
  93. emdash_core/context/providers/__init__.py +11 -0
  94. emdash_core/context/providers/base.py +74 -0
  95. emdash_core/context/providers/explored_areas.py +183 -0
  96. emdash_core/context/providers/touched_areas.py +360 -0
  97. emdash_core/context/registry.py +73 -0
  98. emdash_core/context/reranker.py +199 -0
  99. emdash_core/context/service.py +260 -0
  100. emdash_core/context/session.py +352 -0
  101. emdash_core/core/__init__.py +104 -0
  102. emdash_core/core/config.py +454 -0
  103. emdash_core/core/exceptions.py +55 -0
  104. emdash_core/core/models.py +265 -0
  105. emdash_core/core/review_config.py +57 -0
  106. emdash_core/db/__init__.py +67 -0
  107. emdash_core/db/auth.py +134 -0
  108. emdash_core/db/models.py +91 -0
  109. emdash_core/db/provider.py +222 -0
  110. emdash_core/db/providers/__init__.py +5 -0
  111. emdash_core/db/providers/supabase.py +452 -0
  112. emdash_core/embeddings/__init__.py +24 -0
  113. emdash_core/embeddings/indexer.py +534 -0
  114. emdash_core/embeddings/models.py +192 -0
  115. emdash_core/embeddings/providers/__init__.py +7 -0
  116. emdash_core/embeddings/providers/base.py +112 -0
  117. emdash_core/embeddings/providers/fireworks.py +141 -0
  118. emdash_core/embeddings/providers/openai.py +104 -0
  119. emdash_core/embeddings/registry.py +146 -0
  120. emdash_core/embeddings/service.py +215 -0
  121. emdash_core/graph/__init__.py +26 -0
  122. emdash_core/graph/builder.py +134 -0
  123. emdash_core/graph/connection.py +692 -0
  124. emdash_core/graph/schema.py +416 -0
  125. emdash_core/graph/writer.py +667 -0
  126. emdash_core/ingestion/__init__.py +7 -0
  127. emdash_core/ingestion/change_detector.py +150 -0
  128. emdash_core/ingestion/git/__init__.py +5 -0
  129. emdash_core/ingestion/git/commit_analyzer.py +196 -0
  130. emdash_core/ingestion/github/__init__.py +6 -0
  131. emdash_core/ingestion/github/pr_fetcher.py +296 -0
  132. emdash_core/ingestion/github/task_extractor.py +100 -0
  133. emdash_core/ingestion/orchestrator.py +540 -0
  134. emdash_core/ingestion/parsers/__init__.py +10 -0
  135. emdash_core/ingestion/parsers/base_parser.py +66 -0
  136. emdash_core/ingestion/parsers/call_graph_builder.py +121 -0
  137. emdash_core/ingestion/parsers/class_extractor.py +154 -0
  138. emdash_core/ingestion/parsers/function_extractor.py +202 -0
  139. emdash_core/ingestion/parsers/import_analyzer.py +119 -0
  140. emdash_core/ingestion/parsers/python_parser.py +123 -0
  141. emdash_core/ingestion/parsers/registry.py +72 -0
  142. emdash_core/ingestion/parsers/ts_ast_parser.js +313 -0
  143. emdash_core/ingestion/parsers/typescript_parser.py +278 -0
  144. emdash_core/ingestion/repository.py +346 -0
  145. emdash_core/models/__init__.py +38 -0
  146. emdash_core/models/agent.py +68 -0
  147. emdash_core/models/index.py +77 -0
  148. emdash_core/models/query.py +113 -0
  149. emdash_core/planning/__init__.py +7 -0
  150. emdash_core/planning/agent_api.py +413 -0
  151. emdash_core/planning/context_builder.py +265 -0
  152. emdash_core/planning/feature_context.py +232 -0
  153. emdash_core/planning/feature_expander.py +646 -0
  154. emdash_core/planning/llm_explainer.py +198 -0
  155. emdash_core/planning/similarity.py +509 -0
  156. emdash_core/planning/team_focus.py +821 -0
  157. emdash_core/server.py +153 -0
  158. emdash_core/sse/__init__.py +5 -0
  159. emdash_core/sse/stream.py +196 -0
  160. emdash_core/swarm/__init__.py +17 -0
  161. emdash_core/swarm/merge_agent.py +383 -0
  162. emdash_core/swarm/session_manager.py +274 -0
  163. emdash_core/swarm/swarm_runner.py +226 -0
  164. emdash_core/swarm/task_definition.py +137 -0
  165. emdash_core/swarm/worker_spawner.py +319 -0
  166. emdash_core/swarm/worktree_manager.py +278 -0
  167. emdash_core/templates/__init__.py +10 -0
  168. emdash_core/templates/defaults/agent-builder.md.template +82 -0
  169. emdash_core/templates/defaults/focus.md.template +115 -0
  170. emdash_core/templates/defaults/pr-review-enhanced.md.template +309 -0
  171. emdash_core/templates/defaults/pr-review.md.template +80 -0
  172. emdash_core/templates/defaults/project.md.template +85 -0
  173. emdash_core/templates/defaults/research_critic.md.template +112 -0
  174. emdash_core/templates/defaults/research_planner.md.template +85 -0
  175. emdash_core/templates/defaults/research_synthesizer.md.template +128 -0
  176. emdash_core/templates/defaults/reviewer.md.template +81 -0
  177. emdash_core/templates/defaults/spec.md.template +41 -0
  178. emdash_core/templates/defaults/tasks.md.template +78 -0
  179. emdash_core/templates/loader.py +296 -0
  180. emdash_core/utils/__init__.py +45 -0
  181. emdash_core/utils/git.py +84 -0
  182. emdash_core/utils/image.py +502 -0
  183. emdash_core/utils/logger.py +51 -0
  184. emdash_core-0.1.7.dist-info/METADATA +35 -0
  185. emdash_core-0.1.7.dist-info/RECORD +187 -0
  186. emdash_core-0.1.7.dist-info/WHEEL +4 -0
  187. emdash_core-0.1.7.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,100 @@
+"""Extract subtasks from PR descriptions."""
+
+import re
+from typing import Optional
+
+from ...core.models import PullRequestEntity, TaskEntity
+from ...utils.logger import log
+
+
+class TaskExtractor:
+    """Extracts subtasks from PR descriptions (markdown checkboxes)."""
+
+    # Pattern to match markdown checkboxes: - [ ] or - [x] or * [ ] etc.
+    CHECKBOX_PATTERN = re.compile(
+        r"^[\s]*[-*]\s*\[([ xX])\]\s*(.+?)$",
+        re.MULTILINE
+    )
+
+    def extract_tasks(self, pr: PullRequestEntity) -> list[TaskEntity]:
+        """Extract tasks from a PR description.
+
+        Args:
+            pr: PullRequestEntity with description
+
+        Returns:
+            List of TaskEntity objects extracted from checkboxes
+        """
+        if not pr.description:
+            return []
+
+        tasks = []
+        matches = self.CHECKBOX_PATTERN.findall(pr.description)
+
+        for index, (checkbox, description) in enumerate(matches):
+            is_completed = checkbox.lower() == "x"
+            description = description.strip()
+
+            # Skip empty descriptions
+            if not description:
+                continue
+
+            task = TaskEntity(
+                id=f"pr_{pr.number}_task_{index}",
+                pr_number=pr.number,
+                description=description,
+                is_completed=is_completed,
+                order=index,
+            )
+            tasks.append(task)
+
+        if tasks:
+            log.debug(f"Extracted {len(tasks)} tasks from PR #{pr.number}")
+
+        return tasks
+
+    def extract_tasks_from_prs(
+        self,
+        prs: list[PullRequestEntity],
+    ) -> list[TaskEntity]:
+        """Extract tasks from multiple PRs.
+
+        Args:
+            prs: List of PullRequestEntity objects
+
+        Returns:
+            List of all TaskEntity objects from all PRs
+        """
+        all_tasks = []
+
+        for pr in prs:
+            tasks = self.extract_tasks(pr)
+            all_tasks.extend(tasks)
+
+        log.info(f"Extracted {len(all_tasks)} tasks from {len(prs)} PRs")
+        return all_tasks
+
+
+def extract_task_summary(description: str) -> Optional[str]:
+    """Extract a summary from a task description.
+
+    Useful for getting a short version of long task descriptions.
+
+    Args:
+        description: Full task description
+
+    Returns:
+        Shortened summary (first sentence or first N chars)
+    """
+    if not description:
+        return None
+
+    # Get first sentence
+    first_sentence = description.split(".")[0].strip()
+
+    # If still too long, truncate
+    max_length = 100
+    if len(first_sentence) > max_length:
+        return first_sentence[:max_length - 3] + "..."
+
+    return first_sentence
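A minimal usage sketch of the extractor above. The PullRequestEntity construction is an assumption (only the number and description fields that extract_tasks actually reads are shown), so treat it as illustrative rather than the package's documented API:

    from emdash_core.core.models import PullRequestEntity
    from emdash_core.ingestion.github.task_extractor import TaskExtractor, extract_task_summary

    # Assumed constructor: only the fields extract_tasks() reads are shown here.
    pr = PullRequestEntity(
        number=42,
        description="- [x] Add parser registry\n- [ ] Wire up progress reporting",
    )

    tasks = TaskExtractor().extract_tasks(pr)
    # Yields two TaskEntity objects: "pr_42_task_0" (completed) and "pr_42_task_1" (open)
    for task in tasks:
        print(task.id, task.is_completed, extract_task_summary(task.description))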
@@ -0,0 +1,540 @@
+"""Orchestrates the complete ingestion pipeline."""
+
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from pathlib import Path
+from typing import Callable, Optional
+
+from tqdm import tqdm
+
+from ..core.config import get_config
+from ..core.models import CodebaseEntities, FileEntities, GitData
+from ..graph.connection import get_connection, write_lock_context
+from ..graph.builder import GraphBuilder
+from ..graph.schema import SchemaManager
+from .repository import RepositoryManager
+from .parsers.registry import ParserRegistry
+from .parsers.base_parser import BaseLanguageParser
+from .git.commit_analyzer import CommitAnalyzer
+from .github.pr_fetcher import PRFetcher
+from .github.task_extractor import TaskExtractor
+from .change_detector import ChangeDetector, ChangedFiles
+from ..graph.writer import GraphWriter
+from ..utils.logger import log
+
+
+def _parse_file_worker(file_path: Path, repo_root: Path) -> FileEntities:
+    """Worker function for parallel file parsing.
+
+    This is a module-level function so it can be pickled for ProcessPoolExecutor.
+
+    Args:
+        file_path: Path to source file
+        repo_root: Repository root path
+
+    Returns:
+        FileEntities
+    """
+    # Get parser from registry
+    parser_class = ParserRegistry.get_parser(file_path)
+
+    if parser_class is None:
+        log.warning(f"No parser available for {file_path.suffix}")
+        return FileEntities()
+
+    try:
+        parser = parser_class(file_path, repo_root)
+        return parser.parse()
+    except Exception as e:
+        log.warning(f"Failed to parse {file_path}: {e}")
+        return FileEntities()
+
+
+class IngestionOrchestrator:
+    """Coordinates the full ingestion pipeline."""
+
+    def __init__(self):
+        """Initialize orchestrator."""
+        self.config = get_config()
+        self.repo_manager = RepositoryManager()
+        self.connection = get_connection()
+        self.graph_builder = GraphBuilder(
+            self.connection,
+            batch_size=self.config.ingestion.batch_size
+        )
+
+    def index(
+        self,
+        repo_path: str,
+        incremental: bool = False,
+        changed_only: bool = False,
+        skip_git: Optional[bool] = None,
+        pr_limit: int = 100,
+        progress_callback: Optional[Callable[[str, float], None]] = None,
+    ):
+        """Execute the full indexing pipeline.
+
+        Args:
+            repo_path: URL or local path to repository
+            incremental: Whether to perform incremental update (legacy)
+            changed_only: Only index files changed since last index (uses git diff)
+            skip_git: Whether to skip git history analysis (and GitHub analysis).
+                Defaults to config.ingestion.ast_only (AST_ONLY env var).
+            pr_limit: Maximum number of PRs to fetch per state (open/merged)
+            progress_callback: Optional callback(step: str, percent: float) for progress
+
+        Raises:
+            Various exceptions if indexing fails
+        """
+        # Use config ast_only as default if skip_git not explicitly set
+        if skip_git is None:
+            skip_git = self.config.ingestion.ast_only
+
+        # Store callback for use in parsing
+        self._progress_callback = progress_callback
+
+        # Acquire write lock for the entire indexing operation
+        with write_lock_context("indexing", timeout=120):
+            self._run_index(repo_path, incremental, changed_only, skip_git, pr_limit)
+
+    def _run_index(
+        self,
+        repo_path: str,
+        incremental: bool,
+        changed_only: bool,
+        skip_git: bool,
+        pr_limit: int,
+    ):
+        """Internal indexing implementation (called with write lock held)."""
+        log.info(f"Starting indexing: {repo_path}")
+        if changed_only:
+            log.info("Incremental mode: only indexing changed files")
+        if skip_git:
+            log.info("AST_ONLY mode: skipping Layer B (git) and Layer C (analytics)")
+
+        # Ensure Kuzu connection and schema
+        self._ensure_database_ready()
+
+        # Step 1: Get or clone repository
+        log.info("Step 1: Fetching repository...")
+        repo, repo_entity = self.repo_manager.get_or_clone(
+            repo_path,
+            skip_commit_count=skip_git
+        )
+
+        # For incremental indexing, detect changes
+        changed_files: Optional[ChangedFiles] = None
+        current_commit = None
+        if changed_only:
+            last_indexed_commit = self._get_last_indexed_commit(repo_entity.url)
+            current_commit = repo.head.commit.hexsha
+
+            if not last_indexed_commit:
+                log.info("No previous index found - performing full index")
+                changed_only = False
+            elif last_indexed_commit == current_commit:
+                log.info("No changes since last index - nothing to do")
+                return
+            else:
+                extensions = ParserRegistry.get_all_extensions()
+                detector = ChangeDetector(repo, last_indexed_commit)
+                changed_files = detector.get_changed_files(extensions)
+
+                if not changed_files:
+                    log.info("No relevant file changes detected - nothing to do")
+                    return
+
+                log.info(f"Changes detected: {changed_files.total_changes} files")
+
+        # Step 2: Parse codebase (Layer A)
+        if changed_only and changed_files:
+            log.info("Step 2: Parsing changed files (Layer A)...")
+
+            # Delete removed files from graph first
+            if changed_files.deleted:
+                deleted_paths = [str(p) for p in changed_files.deleted]
+                self.graph_builder.delete_files(deleted_paths)
+
+            # Parse only added and modified files
+            entities = self._parse_codebase(repo, file_filter=changed_files.all_to_index)
+        else:
+            log.info("Step 2: Parsing codebase (Layer A)...")
+            entities = self._parse_codebase(repo)
+
+        # Step 3: Extract git history (Layer B)
+        if skip_git:
+            log.info("Step 3: Skipping git history (code-only mode)")
+            git_data = GitData(repository=repo_entity)
+        else:
+            log.info("Step 3: Analyzing git history (Layer B)...")
+            git_data = self._analyze_git_history(repo, repo_entity)
+
+        # Step 4: Build graph
+        log.info("Step 4: Building graph...")
+        self.graph_builder.build_code_graph(entities)
+        if not skip_git:
+            self.graph_builder.build_git_graph(git_data)
+
+        # Step 5: Fetch PRs from GitHub (if token available and not skipped)
+        prs = []
+        tasks = []
+        if not skip_git and self.config.github.is_available and repo_entity.owner and pr_limit > 0:
+            log.info(f"Step 5: Fetching pull requests from GitHub (limit: {pr_limit} per state)...")
+            prs, tasks = self._fetch_pull_requests(repo_entity, pr_limit=pr_limit)
+        else:
+            if skip_git:
+                log.info("Step 5: Skipping GitHub layer (code-only mode)")
+            elif not self.config.github.is_available:
+                log.info("Step 5: Skipping PR fetch (no GitHub token)")
+            elif pr_limit == 0:
+                log.info("Step 5: Skipping PR fetch (pr_limit=0)")
+            else:
+                log.info("Step 5: Skipping PR fetch (not a GitHub repo)")
+
+        # Step 6: Update repository metadata
+        log.info("Step 6: Updating repository metadata...")
+        self._update_repository_metadata(
+            repo_entity,
+            len(entities.files),
+            last_indexed_commit=current_commit or repo.head.commit.hexsha
+        )
+
+        log.info("✓ Indexing complete!")
+        self._print_summary(entities, git_data, prs)
+
+    def _ensure_database_ready(self):
+        """Ensure database connection and schema are ready."""
+        log.info("Connecting to Kuzu and ensuring schema...")
+
+        self.connection.connect()
+
+        # Initialize schema if needed
+        schema_manager = SchemaManager(self.connection)
+
+        # Check if schema exists (check for tables)
+        try:
+            result = self.connection.execute("CALL show_tables()")
+            if not result:
+                log.info("Schema not found, initializing...")
+                schema_manager.initialize_schema()
+        except Exception:
+            # If we can't check tables, try to initialize anyway
+            schema_manager.initialize_schema()
+
+    def _get_last_indexed_commit(self, repo_url: str) -> Optional[str]:
+        """Get the commit SHA from the last successful index.
+
+        Args:
+            repo_url: Repository URL
+
+        Returns:
+            Commit SHA or None if not found
+        """
+        try:
+            result = self.connection.execute(
+                "MATCH (r:Repository {url: $url}) RETURN r.last_indexed_commit",
+                {"url": repo_url}
+            )
+            if result and result.has_next():
+                row = result.get_next()
+                return row[0] if row[0] else None
+            return None
+        except Exception as e:
+            log.debug(f"Could not get last indexed commit: {e}")
+            return None
+
+    def _parse_codebase(
+        self,
+        repo,
+        file_filter: Optional[list[Path]] = None
+    ) -> CodebaseEntities:
+        """Parse source files in the repository.
+
+        Args:
+            repo: Git repository
+            file_filter: Optional list of specific files to parse (for incremental mode)
+
+        Returns:
+            CodebaseEntities
+        """
+        repo_root = Path(repo.working_dir)
+
+        # Use file filter if provided (incremental mode)
+        if file_filter is not None:
+            source_files = [f for f in file_filter if f.exists()]
+            if not source_files:
+                log.info("No files to parse (all filtered files may have been deleted)")
+                return CodebaseEntities()
+        else:
+            # Get all supported extensions from registry
+            extensions = ParserRegistry.get_all_extensions()
+
+            if not extensions:
+                log.warning("No parsers registered")
+                return CodebaseEntities()
+
+            # Get source files (multi-language)
+            source_files = self.repo_manager.get_source_files(
+                repo,
+                extensions,
+                self.config.ingestion.ignore_patterns
+            )
+
+            if not source_files:
+                log.warning(f"No source files found (supported: {', '.join(extensions)})")
+                return CodebaseEntities()
+
+        log.info(f"Parsing {len(source_files)} source files...")
+
+        # Parse files (with optional parallelization)
+        if self.config.ingestion.max_workers > 1:
+            entities = self._parse_files_parallel(source_files, repo_root)
+        else:
+            entities = self._parse_files_sequential(source_files, repo_root)
+
+        log.info(f"Extracted: {len(entities.files)} files, {len(entities.classes)} classes, "
+                 f"{len(entities.functions)} functions")
+
+        return entities
+
+    def _parse_single_file(self, file_path: Path, repo_root: Path) -> FileEntities:
+        """Parse a single file using appropriate parser.
+
+        Args:
+            file_path: Path to source file
+            repo_root: Repository root path
+
+        Returns:
+            FileEntities
+        """
+        # Get parser from registry
+        parser_class = ParserRegistry.get_parser(file_path)
+
+        if parser_class is None:
+            log.warning(f"No parser available for {file_path.suffix}")
+            return FileEntities()
+
+        try:
+            parser = parser_class(file_path, repo_root)
+            return parser.parse()
+        except Exception as e:
+            log.warning(f"Failed to parse {file_path}: {e}")
+            return FileEntities()
+
+    def _parse_files_sequential(
+        self,
+        source_files: list[Path],
+        repo_root: Path
+    ) -> CodebaseEntities:
+        """Parse files sequentially with progress bar.
+
+        Args:
+            source_files: List of source files to parse
+            repo_root: Repository root path
+
+        Returns:
+            CodebaseEntities
+        """
+        results = []
+        total = len(source_files)
+        last_reported_percent = 0
+
+        for i, file_path in enumerate(tqdm(source_files, desc="Parsing", unit="file")):
+            file_entities = self._parse_single_file(file_path, repo_root)
+            results.append(file_entities)
+
+            # Report progress every 10%
+            if total > 0 and hasattr(self, '_progress_callback') and self._progress_callback:
+                current_percent = int((i + 1) / total * 100)
+                # Report at 10% intervals (10, 20, 30, etc.)
+                if current_percent >= last_reported_percent + 10:
+                    last_reported_percent = (current_percent // 10) * 10
+                    # Map parsing progress (0-100) to overall progress (10-70)
+                    overall_percent = 10 + (last_reported_percent * 0.6)
+                    self._progress_callback(f"Parsing files ({last_reported_percent}%)", overall_percent)
+
+        return CodebaseEntities.merge(results)
+
+    def _parse_files_parallel(
+        self,
+        source_files: list[Path],
+        repo_root: Path
+    ) -> CodebaseEntities:
+        """Parse files in parallel with progress bar.
+
+        Args:
+            source_files: List of source files to parse
+            repo_root: Repository root path
+
+        Returns:
+            CodebaseEntities
+        """
+        results = []
+        total = len(source_files)
+        last_reported_percent = 0
+        completed = 0
+
+        with ProcessPoolExecutor(max_workers=self.config.ingestion.max_workers) as executor:
+            # Submit all tasks
+            futures = {
+                executor.submit(_parse_file_worker, file_path, repo_root): file_path
+                for file_path in source_files
+            }
+
+            # Process results with progress bar
+            for future in tqdm(
+                as_completed(futures),
+                total=len(futures),
+                desc="Parsing",
+                unit="file"
+            ):
+                try:
+                    file_entities = future.result()
+                    results.append(file_entities)
+                except Exception as e:
+                    file_path = futures[future]
+                    log.warning(f"Failed to parse {file_path}: {e}")
+
+                # Report progress every 10%
+                completed += 1
+                if total > 0 and hasattr(self, '_progress_callback') and self._progress_callback:
+                    current_percent = int(completed / total * 100)
+                    if current_percent >= last_reported_percent + 10:
+                        last_reported_percent = (current_percent // 10) * 10
+                        overall_percent = 10 + (last_reported_percent * 0.6)
+                        self._progress_callback(f"Parsing files ({last_reported_percent}%)", overall_percent)
+
+        return CodebaseEntities.merge(results)
+
+    def _analyze_git_history(self, repo, repo_entity):
+        """Analyze Git commit history.
+
+        Args:
+            repo: Git repository
+            repo_entity: Repository entity
+
+        Returns:
+            GitData
+        """
+        analyzer = CommitAnalyzer(repo, max_commits=self.config.ingestion.git_depth)
+        return analyzer.analyze(repo_entity)
+
+    def _update_repository_metadata(
+        self,
+        repo_entity,
+        file_count: int,
+        last_indexed_commit: Optional[str] = None
+    ):
+        """Update repository metadata in the graph.
+
+        Args:
+            repo_entity: Repository entity
+            file_count: Number of files processed
+            last_indexed_commit: Commit SHA at time of indexing
+        """
+        from datetime import datetime
+
+        query = """
+        MERGE (r:Repository {url: $url})
+        SET r.name = $name,
+            r.owner = $owner,
+            r.default_branch = $default_branch,
+            r.last_ingested = timestamp($last_ingested),
+            r.last_indexed_commit = $last_indexed_commit,
+            r.ingestion_status = 'completed',
+            r.commit_count = $commit_count,
+            r.file_count = $file_count,
+            r.primary_language = 'Python'
+        """
+
+        self.connection.execute_write(
+            query,
+            {
+                "url": repo_entity.url,
+                "name": repo_entity.name,
+                "owner": repo_entity.owner,
+                "default_branch": repo_entity.default_branch,
+                "last_ingested": datetime.now().isoformat(),
+                "last_indexed_commit": last_indexed_commit,
+                "commit_count": repo_entity.commit_count,
+                "file_count": file_count,
+            }
+        )
+
+    def _fetch_pull_requests(self, repo_entity, pr_limit: int = 100):
+        """Fetch pull requests from GitHub.
+
+        Args:
+            repo_entity: Repository entity with owner and name
+            pr_limit: Maximum PRs to fetch per state (open/merged)
+
+        Returns:
+            Tuple of (list[PullRequestEntity], list[TaskEntity])
+        """
+        try:
+            # Initialize fetcher and get PRs
+            fetcher = PRFetcher(
+                owner=repo_entity.owner,
+                repo=repo_entity.name,
+                token=self.config.github.token
+            )
+
+            # Fetch open and merged PRs separately with the limit
+            open_prs = fetcher.fetch_prs(state="open", limit=pr_limit)
+            merged_prs = fetcher.fetch_prs(state="merged", limit=pr_limit)
+            prs = open_prs + merged_prs
+            log.info(f"Fetched {len(open_prs)} open + {len(merged_prs)} merged PRs")
+
+            if not prs:
+                log.info("No pull requests found")
+                return [], []
+
+            # Extract tasks from PR descriptions
+            extractor = TaskExtractor()
+            all_tasks = []
+            for pr in prs:
+                tasks = extractor.extract_tasks(pr)
+                all_tasks.extend(tasks)
+
+            if all_tasks:
+                log.info(f"Extracted {len(all_tasks)} tasks from PR descriptions")
+
+            # Write to graph using connection directly
+            writer = GraphWriter(self.connection)
+            writer.write_pull_requests(prs)
+            writer.write_tasks(all_tasks)
+
+            # Link PRs to commits and files
+            for pr in prs:
+                if pr.commit_shas:
+                    writer.write_pr_commit_links(pr.number, pr.commit_shas)
+                if pr.files_changed:
+                    writer.write_pr_file_links(pr.number, pr.files_changed)
+
+            return prs, all_tasks
+
+        except Exception as e:
+            log.warning(f"Failed to fetch pull requests: {e}")
+            return [], []
+
+    def _print_summary(self, entities: CodebaseEntities, git_data, prs=None):
+        """Print indexing summary.
+
+        Args:
+            entities: Codebase entities
+            git_data: Git data
+            prs: Optional list of pull requests
+        """
+        log.info("\n" + "=" * 60)
+        log.info("INDEXING SUMMARY")
+        log.info("=" * 60)
+        log.info(f"Files: {len(entities.files)}")
+        log.info(f"Classes: {len(entities.classes)}")
+        log.info(f"Functions: {len(entities.functions)}")
+        log.info(f"Modules: {len(entities.modules)}")
+        log.info(f"Commits: {len(git_data.commits)}")
+        log.info(f"Authors: {len(git_data.authors)}")
+        if prs:
+            log.info(f"PRs: {len(prs)}")
+        else:
+            log.info("PRs: 0 (GitHub layer skipped)")
+        log.info("=" * 60)
@@ -0,0 +1,10 @@
+"""Language parsers for code extraction."""
+
+from .base_parser import BaseLanguageParser
+from .registry import ParserRegistry
+from .python_parser import PythonParser
+from .typescript_parser import TypeScriptParser
+
+# Parsers auto-register when imported above
+
+__all__ = ['BaseLanguageParser', 'ParserRegistry', 'PythonParser', 'TypeScriptParser']
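A small sketch of how the registry is used for dispatch, mirroring the calls made by the orchestrator (get_all_extensions, get_parser, and the parser(file_path, repo_root).parse() construction); the paths are hypothetical:

    from pathlib import Path

    # Importing the package also registers the bundled Python and TypeScript parsers
    from emdash_core.ingestion.parsers import ParserRegistry

    repo_root = Path("/path/to/repo")          # hypothetical checkout
    target = repo_root / "emdash_core" / "server.py"

    print(ParserRegistry.get_all_extensions())  # the file extensions with a registered parser
    parser_class = ParserRegistry.get_parser(target)
    if parser_class is not None:
        entities = parser_class(target, repo_root).parse()  # FileEntities, as used by the orchestrator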