PyPI - codexa - Versions diffs - 0.4.0__py3-none-any.whl - Mend

codexa 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (189) hide show

codexa-0.4.0.dist-info/METADATA +650 -0
codexa-0.4.0.dist-info/RECORD +189 -0
codexa-0.4.0.dist-info/WHEEL +5 -0
codexa-0.4.0.dist-info/entry_points.txt +2 -0
codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
codexa-0.4.0.dist-info/top_level.txt +1 -0
semantic_code_intelligence/__init__.py +5 -0
semantic_code_intelligence/analysis/__init__.py +21 -0
semantic_code_intelligence/analysis/ai_features.py +351 -0
semantic_code_intelligence/bridge/__init__.py +28 -0
semantic_code_intelligence/bridge/context_provider.py +245 -0
semantic_code_intelligence/bridge/protocol.py +167 -0
semantic_code_intelligence/bridge/server.py +348 -0
semantic_code_intelligence/bridge/vscode.py +271 -0
semantic_code_intelligence/ci/__init__.py +13 -0
semantic_code_intelligence/ci/hooks.py +98 -0
semantic_code_intelligence/ci/hotspots.py +272 -0
semantic_code_intelligence/ci/impact.py +246 -0
semantic_code_intelligence/ci/metrics.py +591 -0
semantic_code_intelligence/ci/pr.py +412 -0
semantic_code_intelligence/ci/quality.py +557 -0
semantic_code_intelligence/ci/templates.py +164 -0
semantic_code_intelligence/ci/trace.py +224 -0
semantic_code_intelligence/cli/__init__.py +0 -0
semantic_code_intelligence/cli/commands/__init__.py +0 -0
semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
semantic_code_intelligence/cli/main.py +65 -0
semantic_code_intelligence/cli/router.py +92 -0
semantic_code_intelligence/config/__init__.py +0 -0
semantic_code_intelligence/config/settings.py +260 -0
semantic_code_intelligence/context/__init__.py +19 -0
semantic_code_intelligence/context/engine.py +429 -0
semantic_code_intelligence/context/memory.py +253 -0
semantic_code_intelligence/daemon/__init__.py +1 -0
semantic_code_intelligence/daemon/watcher.py +515 -0
semantic_code_intelligence/docs/__init__.py +1080 -0
semantic_code_intelligence/embeddings/__init__.py +0 -0
semantic_code_intelligence/embeddings/enhanced.py +131 -0
semantic_code_intelligence/embeddings/generator.py +149 -0
semantic_code_intelligence/embeddings/model_registry.py +100 -0
semantic_code_intelligence/evolution/__init__.py +1 -0
semantic_code_intelligence/evolution/budget_guard.py +111 -0
semantic_code_intelligence/evolution/commit_manager.py +88 -0
semantic_code_intelligence/evolution/context_builder.py +131 -0
semantic_code_intelligence/evolution/engine.py +249 -0
semantic_code_intelligence/evolution/patch_generator.py +229 -0
semantic_code_intelligence/evolution/task_selector.py +214 -0
semantic_code_intelligence/evolution/test_runner.py +111 -0
semantic_code_intelligence/indexing/__init__.py +0 -0
semantic_code_intelligence/indexing/chunker.py +174 -0
semantic_code_intelligence/indexing/parallel.py +86 -0
semantic_code_intelligence/indexing/scanner.py +146 -0
semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
semantic_code_intelligence/llm/__init__.py +62 -0
semantic_code_intelligence/llm/cache.py +219 -0
semantic_code_intelligence/llm/cached_provider.py +145 -0
semantic_code_intelligence/llm/conversation.py +190 -0
semantic_code_intelligence/llm/cross_refactor.py +272 -0
semantic_code_intelligence/llm/investigation.py +274 -0
semantic_code_intelligence/llm/mock_provider.py +77 -0
semantic_code_intelligence/llm/ollama_provider.py +122 -0
semantic_code_intelligence/llm/openai_provider.py +100 -0
semantic_code_intelligence/llm/provider.py +92 -0
semantic_code_intelligence/llm/rate_limiter.py +164 -0
semantic_code_intelligence/llm/reasoning.py +438 -0
semantic_code_intelligence/llm/safety.py +110 -0
semantic_code_intelligence/llm/streaming.py +251 -0
semantic_code_intelligence/lsp/__init__.py +609 -0
semantic_code_intelligence/mcp/__init__.py +393 -0
semantic_code_intelligence/parsing/__init__.py +19 -0
semantic_code_intelligence/parsing/parser.py +375 -0
semantic_code_intelligence/plugins/__init__.py +255 -0
semantic_code_intelligence/plugins/examples/__init__.py +1 -0
semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
semantic_code_intelligence/scalability/__init__.py +205 -0
semantic_code_intelligence/search/__init__.py +0 -0
semantic_code_intelligence/search/formatter.py +123 -0
semantic_code_intelligence/search/grep.py +361 -0
semantic_code_intelligence/search/hybrid_search.py +170 -0
semantic_code_intelligence/search/keyword_search.py +311 -0
semantic_code_intelligence/search/section_expander.py +103 -0
semantic_code_intelligence/services/__init__.py +0 -0
semantic_code_intelligence/services/indexing_service.py +630 -0
semantic_code_intelligence/services/search_service.py +269 -0
semantic_code_intelligence/storage/__init__.py +0 -0
semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
semantic_code_intelligence/storage/hash_store.py +66 -0
semantic_code_intelligence/storage/index_manifest.py +85 -0
semantic_code_intelligence/storage/index_stats.py +138 -0
semantic_code_intelligence/storage/query_history.py +160 -0
semantic_code_intelligence/storage/symbol_registry.py +209 -0
semantic_code_intelligence/storage/vector_store.py +297 -0
semantic_code_intelligence/tests/__init__.py +0 -0
semantic_code_intelligence/tests/test_ai_features.py +351 -0
semantic_code_intelligence/tests/test_chunker.py +119 -0
semantic_code_intelligence/tests/test_cli.py +188 -0
semantic_code_intelligence/tests/test_config.py +154 -0
semantic_code_intelligence/tests/test_context.py +381 -0
semantic_code_intelligence/tests/test_embeddings.py +73 -0
semantic_code_intelligence/tests/test_endtoend.py +1142 -0
semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
semantic_code_intelligence/tests/test_hash_store.py +79 -0
semantic_code_intelligence/tests/test_logging.py +55 -0
semantic_code_intelligence/tests/test_new_cli.py +138 -0
semantic_code_intelligence/tests/test_parser.py +495 -0
semantic_code_intelligence/tests/test_phase10.py +355 -0
semantic_code_intelligence/tests/test_phase11.py +593 -0
semantic_code_intelligence/tests/test_phase12.py +375 -0
semantic_code_intelligence/tests/test_phase13.py +663 -0
semantic_code_intelligence/tests/test_phase14.py +568 -0
semantic_code_intelligence/tests/test_phase15.py +814 -0
semantic_code_intelligence/tests/test_phase16.py +792 -0
semantic_code_intelligence/tests/test_phase17.py +815 -0
semantic_code_intelligence/tests/test_phase18.py +934 -0
semantic_code_intelligence/tests/test_phase19.py +986 -0
semantic_code_intelligence/tests/test_phase20.py +2753 -0
semantic_code_intelligence/tests/test_phase20b.py +2058 -0
semantic_code_intelligence/tests/test_phase20c.py +962 -0
semantic_code_intelligence/tests/test_phase21.py +428 -0
semantic_code_intelligence/tests/test_phase22.py +799 -0
semantic_code_intelligence/tests/test_phase23.py +783 -0
semantic_code_intelligence/tests/test_phase24.py +715 -0
semantic_code_intelligence/tests/test_phase25.py +496 -0
semantic_code_intelligence/tests/test_phase26.py +251 -0
semantic_code_intelligence/tests/test_phase27.py +531 -0
semantic_code_intelligence/tests/test_phase8.py +592 -0
semantic_code_intelligence/tests/test_phase9.py +643 -0
semantic_code_intelligence/tests/test_plugins.py +293 -0
semantic_code_intelligence/tests/test_priority_features.py +727 -0
semantic_code_intelligence/tests/test_router.py +41 -0
semantic_code_intelligence/tests/test_scalability.py +138 -0
semantic_code_intelligence/tests/test_scanner.py +125 -0
semantic_code_intelligence/tests/test_search.py +160 -0
semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
semantic_code_intelligence/tests/test_tools.py +182 -0
semantic_code_intelligence/tests/test_vector_store.py +151 -0
semantic_code_intelligence/tests/test_watcher.py +211 -0
semantic_code_intelligence/tools/__init__.py +442 -0
semantic_code_intelligence/tools/executor.py +232 -0
semantic_code_intelligence/tools/protocol.py +200 -0
semantic_code_intelligence/tui/__init__.py +454 -0
semantic_code_intelligence/utils/__init__.py +0 -0
semantic_code_intelligence/utils/logging.py +112 -0
semantic_code_intelligence/version.py +3 -0
semantic_code_intelligence/web/__init__.py +11 -0
semantic_code_intelligence/web/api.py +289 -0
semantic_code_intelligence/web/server.py +397 -0
semantic_code_intelligence/web/ui.py +659 -0
semantic_code_intelligence/web/visualize.py +226 -0
semantic_code_intelligence/workspace/__init__.py +427 -0

semantic_code_intelligence/evolution/task_selector.py ADDED Viewed

@@ -0,0 +1,214 @@
+"""Task selector — chooses the next small improvement task.
+Analyses the current repository state (git diff, failing tests, code
+quality signals) and picks a single, well-scoped task for the LLM to
+implement.  Every task targets **≤3 files** and **≤200 lines changed**.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from pathlib import Path
+from semantic_code_intelligence.evolution.commit_manager import CommitManager
+from semantic_code_intelligence.evolution.test_runner import TestResult, TestRunner
+from semantic_code_intelligence.utils.logging import get_logger
+# Module-level logger for the task selector.
+logger = get_logger("evolution.task_selector")
+# Priority-ordered task categories.  Each constant is the string stored in
+# ``EvolutionTask.category``; they are declared highest priority first.
+TASK_FIX_TESTS = "fix_failing_tests"
+TASK_TYPE_HINTS = "add_missing_type_hints"
+TASK_ERROR_HANDLING = "improve_error_handling"
+TASK_REDUCE_DUPLICATION = "reduce_duplication"
+TASK_SMALL_OPTIMISATION = "small_performance_optimisation"
+# All categories in one list, in the same priority order as above.
+TASK_PRIORITIES: list[str] = [
+    TASK_FIX_TESTS,
+    TASK_TYPE_HINTS,
+    TASK_ERROR_HANDLING,
+    TASK_REDUCE_DUPLICATION,
+    TASK_SMALL_OPTIMISATION,
+]
+@dataclass
+class EvolutionTask:
+    """A single, well-scoped improvement task.
+    Attributes:
+        category: Task category identifier (e.g. one of the ``TASK_*`` strings).
+        description: Human-readable statement of what should be done.
+        target_files: Paths of the files the task is expected to touch.
+        context_hint: Free-form extra context for whoever implements the task.
+    """
+    category: str
+    description: str
+    target_files: list[str] = field(default_factory=list)
+    context_hint: str = ""
+    def to_dict(self) -> dict[str, object]:
+        """Serialise the task to a plain dictionary.
+        The ``target_files`` entry is a copy of the task's list, so mutating
+        the returned dictionary never alters the task itself.
+        """
+        return {
+            "category": self.category,
+            "description": self.description,
+            "target_files": list(self.target_files),
+            "context_hint": self.context_hint,
+        }
+class TaskSelector:
+    """Selects the next evolution task based on repo state.
+    Tasks are derived from the last test result and from text/regex
+    heuristics over the project's ``.py`` sources.
+    """
+    def __init__(
+        self,
+        project_root: Path,
+        test_runner: TestRunner,
+        commit_manager: CommitManager,
+    ) -> None:
+        """Store the resolved project root and the collaborating helpers."""
+        self._root = project_root.resolve()
+        self._runner = test_runner
+        self._git = commit_manager
+    def select(self, last_test_result: TestResult | None = None) -> EvolutionTask:
+        """Choose the highest-priority actionable task.
+        1. If tests are failing → fix them
+        2. Else scan for missing type hints
+        3. Else scan for bare excepts / weak error handling
+        4. Else look for obvious duplication
+        5. Fallback: small quality improvement
+        """
+        # Priority 1: fix failing tests
+        if last_test_result is not None and not last_test_result.passed:
+            return self._task_from_failures(last_test_result)
+        # Priority 2–4: static analysis of source files, tried in order
+        src_dir = self._root / "semantic_code_intelligence"
+        py_files = self._collect_py_files(src_dir)
+        finders = (
+            self._find_type_hint_task,
+            self._find_error_handling_task,
+            self._find_duplication_task,
+        )
+        for finder in finders:
+            task = finder(py_files)
+            if task is not None:
+                return task
+        # Priority 5: fallback
+        return EvolutionTask(
+            category=TASK_SMALL_OPTIMISATION,
+            description="Look for a small quality or performance improvement in the codebase.",
+            target_files=[],
+            context_hint="Focus on hot-path functions or frequently used utilities.",
+        )
+    # ------------------------------------------------------------------ #
+    # Task builders
+    # ------------------------------------------------------------------ #
+    def _task_from_failures(self, result: TestResult) -> EvolutionTask:
+        """Build a fix-tests task from failing test output.
+        Scans the output for pytest node IDs (``path.py::test_name``),
+        optionally preceded by a ``FAILED``/``ERROR`` status word, and
+        targets the first three distinct ``.py`` files found.  The last 40
+        non-empty output lines are attached as the context hint.
+        """
+        failing_files: list[str] = []
+        for line in result.output.splitlines():
+            stripped = line.strip()
+            if not (stripped.startswith("FAILED ") or "::" in stripped):
+                continue
+            # Keep only the file part of the node ID.
+            fpath = stripped.split("::", 1)[0]
+            # Drop the status word so the bare path remains; "ERROR " lines
+            # would otherwise be recorded as "ERROR path/to/test.py".
+            for status in ("FAILED ", "ERROR "):
+                if fpath.startswith(status):
+                    fpath = fpath[len(status):]
+                    break
+            fpath = fpath.strip()
+            if fpath.endswith(".py") and fpath not in failing_files:
+                failing_files.append(fpath)
+        return EvolutionTask(
+            category=TASK_FIX_TESTS,
+            description=f"Fix {result.failures} failing test(s).",
+            target_files=failing_files[:3],
+            context_hint=_last_n_lines(result.output, 40),
+        )
+    def _find_type_hint_task(self, files: list[Path]) -> EvolutionTask | None:
+        """Find a source file with functions lacking return type annotations.
+        Returns a task for the first file with at least two single-line
+        ``def`` signatures that have no ``->`` annotation, or ``None``.
+        This is a regex heuristic: signatures spanning several lines or
+        containing ``)`` inside the parameter list are not recognised.
+        """
+        import re
+        # ")" followed directly by ":" means there is no "->" annotation, so
+        # this pattern matches *only* unannotated defs (not all defs).
+        untyped_def = re.compile(r"^\s*(?:async\s+)?def\s+\w+\([^)]*\)\s*:", re.MULTILINE)
+        typed_def = re.compile(r"^\s*(?:async\s+)?def\s+\w+\([^)]*\)\s*->\s*", re.MULTILINE)
+        for fpath in files:
+            try:
+                text = fpath.read_text(encoding="utf-8", errors="replace")
+            except OSError:
+                continue
+            untyped = len(untyped_def.findall(text))
+            typed = len(typed_def.findall(text))
+            if untyped >= 2:
+                rel = str(fpath.relative_to(self._root))
+                return EvolutionTask(
+                    category=TASK_TYPE_HINTS,
+                    description=f"Add return type hints to {untyped} function(s) in {rel}.",
+                    target_files=[rel],
+                    context_hint=f"File has {untyped + typed} defs, {typed} typed.",
+                )
+        return None
+    def _find_error_handling_task(self, files: list[Path]) -> EvolutionTask | None:
+        """Find a file with bare ``except:`` or ``except Exception:`` blocks.
+        Handlers are matched at any indentation level, so ``try`` blocks
+        nested inside functions and methods are detected too.
+        """
+        import re
+        broad_except = re.compile(r"^[ \t]*except(?:\s+Exception)?\s*:", re.MULTILINE)
+        for fpath in files:
+            try:
+                text = fpath.read_text(encoding="utf-8", errors="replace")
+            except OSError:
+                continue
+            if broad_except.search(text):
+                rel = str(fpath.relative_to(self._root))
+                return EvolutionTask(
+                    category=TASK_ERROR_HANDLING,
+                    description=f"Replace bare/broad except blocks with specific exceptions in {rel}.",
+                    target_files=[rel],
+                    context_hint="Catch only the exceptions that can actually occur.",
+                )
+        return None
+    def _find_duplication_task(self, files: list[Path]) -> EvolutionTask | None:
+        """Very lightweight duplication detector — looks for repeated blocks.
+        Only files of 300 lines or more are examined.  Every window of five
+        consecutive lines whose stripped text is longer than 60 characters is
+        counted; a file with at least two distinct windows that occur more
+        than once produces a task.
+        """
+        for fpath in files:
+            try:
+                lines = fpath.read_text(encoding="utf-8", errors="replace").splitlines()
+            except OSError:
+                continue
+            if len(lines) < 300:
+                continue
+            blocks: dict[str, int] = {}
+            for i in range(len(lines) - 4):
+                block = "\n".join(lines[i : i + 5]).strip()
+                # Short windows (blank lines, lone brackets) are just noise.
+                if len(block) > 60:
+                    blocks[block] = blocks.get(block, 0) + 1
+            dups = sum(1 for count in blocks.values() if count >= 2)
+            if dups >= 2:
+                rel = str(fpath.relative_to(self._root))
+                return EvolutionTask(
+                    category=TASK_REDUCE_DUPLICATION,
+                    description=f"Extract duplicated logic into helper functions in {rel}.",
+                    target_files=[rel],
+                    context_hint=f"Found {dups} repeated 5-line blocks.",
+                )
+        return None
+    # ------------------------------------------------------------------ #
+    # Helpers
+    # ------------------------------------------------------------------ #
+    def _collect_py_files(self, src_dir: Path) -> list[Path]:
+        """Collect .py source files, excluding tests and __pycache__.
+        Files are returned in sorted order.  A file is skipped when any
+        component of its path relative to the project root is exactly
+        ``tests`` or ``__pycache__``; names that merely contain those words
+        (``contests.py``) are kept.  A missing *src_dir* yields an empty list.
+        """
+        results: list[Path] = []
+        if not src_dir.exists():
+            return results
+        excluded = {"tests", "__pycache__"}
+        for fpath in sorted(src_dir.rglob("*.py")):
+            if excluded.intersection(fpath.relative_to(self._root).parts):
+                continue
+            results.append(fpath)
+        return results
+def _last_n_lines(text: str, n: int) -> str:
+    """Return the last *n* non-empty lines of *text*.
+    Lines that are empty or whitespace-only are dropped before counting.
+    A non-positive *n* yields an empty string.
+    """
+    if n <= 0:
+        # ``lines[-0:]`` is the whole list, so zero needs explicit handling.
+        return ""
+    lines = [line for line in text.splitlines() if line.strip()]
+    return "\n".join(lines[-n:])

semantic_code_intelligence/evolution/test_runner.py ADDED Viewed

@@ -0,0 +1,111 @@
+"""Test runner — executes pytest and returns structured results.
+Runs ``pytest`` as a subprocess to avoid polluting the current process
+with imported test modules.  Returns a structured ``TestResult`` that
+the engine can use to decide whether to commit or revert.
+"""
+from __future__ import annotations
+import subprocess
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+from semantic_code_intelligence.utils.logging import get_logger
+# Module-level logger for the test runner.
+logger = get_logger("evolution.test_runner")
+@dataclass
+class TestResult:
+    """Outcome of one test run: pass flag, counts, raw output and exit code."""
+    # The ``Test`` prefix would otherwise make pytest try to collect this class.
+    __test__ = False
+    passed: bool = False
+    total: int = 0
+    failures: int = 0
+    errors: int = 0
+    output: str = ""
+    return_code: int = -1
+    def summary_line(self) -> str:
+        """Render a one-line ``[PASS]``/``[FAIL]`` summary with the counts."""
+        if self.passed:
+            label = "PASS"
+        else:
+            label = "FAIL"
+        counts = f"{self.total} tests, {self.failures} failures, {self.errors} errors"
+        return f"[{label}] {counts}"
+class TestRunner:
+    """Runs the project test suite via ``pytest``."""
+    # The ``Test`` prefix would otherwise make pytest try to collect this class.
+    __test__ = False
+    def __init__(self, project_root: Path, timeout: int = 120) -> None:
+        """Remember the resolved project root and the timeout in seconds."""
+        self._root = project_root.resolve()
+        self._timeout = timeout
+    def run(self, extra_args: list[str] | None = None) -> TestResult:
+        """Run pytest and return a :class:`TestResult`.
+        The suite under ``<root>/semantic_code_intelligence/tests`` is run in
+        a subprocess with the current interpreter, using the project root as
+        working directory.  A timeout produces a failed result whose output
+        states the timeout.
+        Parameters
+        ----------
+        extra_args
+            Additional pytest CLI arguments (e.g. ``["-x", "--tb=short"]``).
+        """
+        tests_dir = self._root / "semantic_code_intelligence" / "tests"
+        cmd = [sys.executable, "-m", "pytest", str(tests_dir)]
+        cmd += ["-q", "--tb=line", "--no-header"]
+        cmd += extra_args or []
+        logger.info("Running: %s", " ".join(cmd))
+        try:
+            proc = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=self._timeout,
+                cwd=str(self._root),
+            )
+        except subprocess.TimeoutExpired:
+            return TestResult(
+                passed=False,
+                output=f"pytest timed out after {self._timeout}s",
+                return_code=-1,
+            )
+        combined = proc.stdout + proc.stderr
+        # Counts come from a summary line like "2258 passed, 3 warnings in 20.05s"
+        total, failures, errors = _parse_summary(combined)
+        return TestResult(
+            passed=proc.returncode == 0,
+            total=total,
+            failures=failures,
+            errors=errors,
+            output=combined,
+            return_code=proc.returncode,
+        )
+def _parse_summary(output: str) -> tuple[int, int, int]:
+    """Extract ``(total, failures, errors)`` from pytest output.
+    Scans from the end for the last line that carries at least one
+    ``<N> passed`` / ``<N> failed`` / ``<N> error`` count; ``total`` is the
+    sum of the counts found on that line.  Lines that merely mention one of
+    those words without a count (for example stderr text that follows the
+    summary) are skipped.  ``(0, 0, 0)`` is returned when no line qualifies.
+    """
+    import re
+    for line in reversed(output.splitlines()):
+        lowered = line.strip().lower()
+        m_passed = re.search(r"(\d+)\s+passed", lowered)
+        m_failed = re.search(r"(\d+)\s+failed", lowered)
+        m_error = re.search(r"(\d+)\s+error", lowered)
+        if not (m_passed or m_failed or m_error):
+            # Keyword-only lines are not the summary; keep looking upwards.
+            continue
+        passed = int(m_passed.group(1)) if m_passed else 0
+        failures = int(m_failed.group(1)) if m_failed else 0
+        errors = int(m_error.group(1)) if m_error else 0
+        return passed + failures + errors, failures, errors
+    return 0, 0, 0

semantic_code_intelligence/indexing/__init__.py ADDED Viewed

File without changes

semantic_code_intelligence/indexing/chunker.py ADDED Viewed

@@ -0,0 +1,174 @@
+"""Code chunker — splits source files into meaningful chunks for embedding."""
+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+@dataclass
+class CodeChunk:
+    """A chunk of code extracted from a source file."""
+    # Path of the file the chunk was taken from.
+    file_path: str
+    # Text of the chunk.
+    content: str
+    # First line covered by the chunk (1-indexed as populated by ``chunk_code``).
+    start_line: int
+    # Last line covered by the chunk (1-indexed as populated by ``chunk_code``).
+    end_line: int
+    # Position of the chunk within its file (``chunk_code`` counts from 0).
+    chunk_index: int
+    # Language name, e.g. as returned by ``detect_language``.
+    language: str
+# Map file extensions to language names.  Keys are lowercase and include the
+# leading dot; ``detect_language`` lowercases the suffix before the lookup and
+# reports "unknown" for any extension that is not listed here.
+EXTENSION_TO_LANGUAGE: dict[str, str] = {
+    ".py": "python",
+    ".js": "javascript",
+    ".ts": "typescript",
+    ".jsx": "javascript",
+    ".tsx": "typescript",
+    ".java": "java",
+    ".go": "go",
+    ".rs": "rust",
+    ".c": "c",
+    ".cpp": "cpp",
+    ".h": "c",
+    ".hpp": "cpp",
+    ".rb": "ruby",
+    ".php": "php",
+    ".cs": "csharp",
+    ".swift": "swift",
+    ".kt": "kotlin",
+    ".scala": "scala",
+    ".sh": "shell",
+    ".bash": "shell",
+    ".sql": "sql",
+    ".r": "r",
+    ".lua": "lua",
+    ".dart": "dart",
+    ".ex": "elixir",
+    ".exs": "elixir",
+}
+def detect_language(file_path: str) -> str:
+    """Map a file's extension to a language name.
+    The extension is compared case-insensitively.
+    Args:
+        file_path: Path to the source file.
+    Returns:
+        The language name, or 'unknown' when the extension is not mapped.
+    """
+    suffix = Path(file_path).suffix
+    try:
+        return EXTENSION_TO_LANGUAGE[suffix.lower()]
+    except KeyError:
+        return "unknown"
+def chunk_code(
+    content: str,
+    file_path: str,
+    chunk_size: int = 512,
+    chunk_overlap: int = 64,
+) -> list[CodeChunk]:
+    """Split source code into overlapping chunks by line boundaries.
+    Whole lines are accumulated until the running size reaches
+    ``chunk_size`` characters, at which point a chunk is emitted; because
+    lines are never split, a chunk can exceed ``chunk_size`` by part of its
+    final line.  The next chunk starts with the shortest run of trailing
+    lines of the previous chunk that totals at least ``chunk_overlap``
+    characters (no lines are carried over when ``chunk_overlap`` is not
+    positive or the chunk is shorter than the requested overlap).
+    A final partial chunk is emitted only if it holds lines that no earlier
+    chunk covers, so the overlap alone is never emitted as its own chunk.
+    Args:
+        content: The full source code string.
+        file_path: Path to the source file (for metadata).
+        chunk_size: Character count at which the current chunk is closed.
+        chunk_overlap: Characters of overlap between consecutive chunks.
+    Returns:
+        List of CodeChunk objects with 1-indexed, inclusive line ranges;
+        empty when *content* is blank.
+    """
+    if not content.strip():
+        return []
+    language = detect_language(file_path)
+    lines = content.splitlines(keepends=True)
+    chunks: list[CodeChunk] = []
+    chunk_lines: list[str] = []
+    chunk_start_line = 0  # 0-indexed line number of chunk_lines[0]
+    current_chars = 0
+    new_lines = 0  # lines added since the last emitted chunk
+    for i, line in enumerate(lines):
+        chunk_lines.append(line)
+        current_chars += len(line)
+        new_lines += 1
+        if current_chars < chunk_size:
+            continue
+        chunks.append(
+            CodeChunk(
+                file_path=file_path,
+                content="".join(chunk_lines),
+                start_line=chunk_start_line + 1,  # 1-indexed
+                end_line=i + 1,
+                chunk_index=len(chunks),
+                language=language,
+            )
+        )
+        # Calculate overlap: walk backwards until we have enough overlap chars
+        overlap_start = len(chunk_lines)
+        if chunk_overlap > 0:
+            overlap_chars = 0
+            for j in range(len(chunk_lines) - 1, -1, -1):
+                overlap_chars += len(chunk_lines[j])
+                if overlap_chars >= chunk_overlap:
+                    overlap_start = j
+                    break
+        chunk_lines = chunk_lines[overlap_start:]
+        chunk_start_line = (i + 1) - len(chunk_lines)
+        current_chars = sum(len(kept) for kept in chunk_lines)
+        new_lines = 0
+    # Emit the last chunk only if it contains content not yet emitted
+    if new_lines:
+        chunk_text = "".join(chunk_lines)
+        if chunk_text.strip():
+            chunks.append(
+                CodeChunk(
+                    file_path=file_path,
+                    content=chunk_text,
+                    start_line=chunk_start_line + 1,
+                    end_line=len(lines),
+                    chunk_index=len(chunks),
+                    language=language,
+                )
+            )
+    return chunks
+def chunk_file(
+    file_path: Path,
+    chunk_size: int = 512,
+    chunk_overlap: int = 64,
+) -> list[CodeChunk]:
+    """Load a file as UTF-8 text and chunk it with ``chunk_code``.
+    Undecodable bytes are replaced rather than raising.
+    Args:
+        file_path: Path to the source file.
+        chunk_size: Maximum characters per chunk.
+        chunk_overlap: Characters of overlap.
+    Returns:
+        List of CodeChunk objects; empty when the file cannot be read.
+    """
+    try:
+        source = file_path.read_text(encoding="utf-8", errors="replace")
+    except OSError:
+        # PermissionError is an OSError subclass, so this covers it as well.
+        return []
+    return chunk_code(source, str(file_path), chunk_size, chunk_overlap)

semantic_code_intelligence/indexing/parallel.py ADDED Viewed

@@ -0,0 +1,86 @@
+"""Parallel indexing utilities — concurrent file I/O and chunking.
+Speeds up the scanning and chunking phases by processing files in
+parallel using a thread pool, while embedding generation is batched
+through the model (which already uses efficient GPU/CPU batching).
+"""
+from __future__ import annotations
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import Any
+from semantic_code_intelligence.indexing.chunker import CodeChunk, chunk_file
+from semantic_code_intelligence.indexing.scanner import ScannedFile, compute_file_hash
+from semantic_code_intelligence.utils.logging import get_logger
+# Module-level logger for the parallel indexing helpers.
+logger = get_logger("indexing.parallel")
+# Sensible default: don't overwhelm disk or CPU.  Used as the default
+# ``max_workers`` of the thread pools created in this module.
+DEFAULT_WORKERS = 4
+def parallel_chunk_files(
+    files: list[ScannedFile],
+    chunk_size: int = 512,
+    chunk_overlap: int = 64,
+    max_workers: int = DEFAULT_WORKERS,
+) -> list[tuple[ScannedFile, list[CodeChunk]]]:
+    """Chunk several files concurrently on a thread pool.
+    Each file's ``path`` is passed to ``chunk_file``; results are gathered
+    as the workers finish and then returned in input order.
+    Args:
+        files: List of scanned files to chunk.
+        chunk_size: Max characters per chunk.
+        chunk_overlap: Overlap between consecutive chunks.
+        max_workers: Number of threads.
+    Returns:
+        List of (ScannedFile, chunks) tuples in original order.
+    """
+    if not files:
+        return []
+    by_index: dict[int, tuple[ScannedFile, list[CodeChunk]]] = {}
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        # Map each future back to the position of its file in *files*.
+        pending = {
+            pool.submit(chunk_file, sf.path, chunk_size=chunk_size, chunk_overlap=chunk_overlap): pos
+            for pos, sf in enumerate(files)
+        }
+        for done in as_completed(pending):
+            pos = pending[done]
+            by_index[pos] = (files[pos], done.result())
+    return [by_index[pos] for pos in range(len(files))]
+def parallel_scan_hashes(
+    file_paths: list[Path],
+    max_workers: int = DEFAULT_WORKERS,
+) -> dict[Path, str]:
+    """Hash several files concurrently on a thread pool.
+    Each path is passed to ``compute_file_hash``; entries are added to the
+    result as the individual hashes complete.
+    Args:
+        file_paths: Files to hash.
+        max_workers: Number of threads.
+    Returns:
+        Mapping of path → hash string returned by ``compute_file_hash``.
+    """
+    hashes: dict[Path, str] = {}
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        # Remember which path each future belongs to.
+        pending = {pool.submit(compute_file_hash, path): path for path in file_paths}
+        for done in as_completed(pending):
+            hashes[pending[done]] = done.result()
+    return hashes