claude-code-workflow 6.2.7 → 6.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/CLAUDE.md +16 -1
- package/.claude/workflows/cli-templates/protocols/analysis-protocol.md +11 -4
- package/.claude/workflows/cli-templates/protocols/write-protocol.md +10 -75
- package/.claude/workflows/cli-tools-usage.md +14 -24
- package/.codex/AGENTS.md +51 -1
- package/.codex/prompts/compact.md +378 -0
- package/.gemini/GEMINI.md +57 -20
- package/ccw/dist/cli.d.ts.map +1 -1
- package/ccw/dist/cli.js +21 -8
- package/ccw/dist/cli.js.map +1 -1
- package/ccw/dist/commands/cli.d.ts +2 -0
- package/ccw/dist/commands/cli.d.ts.map +1 -1
- package/ccw/dist/commands/cli.js +129 -8
- package/ccw/dist/commands/cli.js.map +1 -1
- package/ccw/dist/commands/hook.d.ts.map +1 -1
- package/ccw/dist/commands/hook.js +3 -2
- package/ccw/dist/commands/hook.js.map +1 -1
- package/ccw/dist/config/litellm-api-config-manager.d.ts +180 -0
- package/ccw/dist/config/litellm-api-config-manager.d.ts.map +1 -0
- package/ccw/dist/config/litellm-api-config-manager.js +770 -0
- package/ccw/dist/config/litellm-api-config-manager.js.map +1 -0
- package/ccw/dist/config/provider-models.d.ts +73 -0
- package/ccw/dist/config/provider-models.d.ts.map +1 -0
- package/ccw/dist/config/provider-models.js +172 -0
- package/ccw/dist/config/provider-models.js.map +1 -0
- package/ccw/dist/core/cache-manager.d.ts.map +1 -1
- package/ccw/dist/core/cache-manager.js +3 -5
- package/ccw/dist/core/cache-manager.js.map +1 -1
- package/ccw/dist/core/dashboard-generator.d.ts.map +1 -1
- package/ccw/dist/core/dashboard-generator.js +3 -1
- package/ccw/dist/core/dashboard-generator.js.map +1 -1
- package/ccw/dist/core/routes/cli-routes.d.ts.map +1 -1
- package/ccw/dist/core/routes/cli-routes.js +169 -0
- package/ccw/dist/core/routes/cli-routes.js.map +1 -1
- package/ccw/dist/core/routes/codexlens-routes.d.ts.map +1 -1
- package/ccw/dist/core/routes/codexlens-routes.js +234 -18
- package/ccw/dist/core/routes/codexlens-routes.js.map +1 -1
- package/ccw/dist/core/routes/hooks-routes.d.ts.map +1 -1
- package/ccw/dist/core/routes/hooks-routes.js +30 -32
- package/ccw/dist/core/routes/hooks-routes.js.map +1 -1
- package/ccw/dist/core/routes/litellm-api-routes.d.ts +21 -0
- package/ccw/dist/core/routes/litellm-api-routes.d.ts.map +1 -0
- package/ccw/dist/core/routes/litellm-api-routes.js +780 -0
- package/ccw/dist/core/routes/litellm-api-routes.js.map +1 -0
- package/ccw/dist/core/routes/litellm-routes.d.ts +20 -0
- package/ccw/dist/core/routes/litellm-routes.d.ts.map +1 -0
- package/ccw/dist/core/routes/litellm-routes.js +85 -0
- package/ccw/dist/core/routes/litellm-routes.js.map +1 -0
- package/ccw/dist/core/routes/mcp-routes.js +2 -2
- package/ccw/dist/core/routes/mcp-routes.js.map +1 -1
- package/ccw/dist/core/routes/status-routes.d.ts.map +1 -1
- package/ccw/dist/core/routes/status-routes.js +39 -0
- package/ccw/dist/core/routes/status-routes.js.map +1 -1
- package/ccw/dist/core/routes/system-routes.js +1 -1
- package/ccw/dist/core/routes/system-routes.js.map +1 -1
- package/ccw/dist/core/server.d.ts.map +1 -1
- package/ccw/dist/core/server.js +15 -1
- package/ccw/dist/core/server.js.map +1 -1
- package/ccw/dist/mcp-server/index.js +1 -1
- package/ccw/dist/mcp-server/index.js.map +1 -1
- package/ccw/dist/tools/claude-cli-tools.d.ts +82 -0
- package/ccw/dist/tools/claude-cli-tools.d.ts.map +1 -0
- package/ccw/dist/tools/claude-cli-tools.js +216 -0
- package/ccw/dist/tools/claude-cli-tools.js.map +1 -0
- package/ccw/dist/tools/cli-executor.d.ts.map +1 -1
- package/ccw/dist/tools/cli-executor.js +76 -14
- package/ccw/dist/tools/cli-executor.js.map +1 -1
- package/ccw/dist/tools/codex-lens.d.ts +9 -2
- package/ccw/dist/tools/codex-lens.d.ts.map +1 -1
- package/ccw/dist/tools/codex-lens.js +114 -9
- package/ccw/dist/tools/codex-lens.js.map +1 -1
- package/ccw/dist/tools/context-cache-store.d.ts +136 -0
- package/ccw/dist/tools/context-cache-store.d.ts.map +1 -0
- package/ccw/dist/tools/context-cache-store.js +256 -0
- package/ccw/dist/tools/context-cache-store.js.map +1 -0
- package/ccw/dist/tools/context-cache.d.ts +56 -0
- package/ccw/dist/tools/context-cache.d.ts.map +1 -0
- package/ccw/dist/tools/context-cache.js +294 -0
- package/ccw/dist/tools/context-cache.js.map +1 -0
- package/ccw/dist/tools/core-memory.d.ts.map +1 -1
- package/ccw/dist/tools/core-memory.js +33 -19
- package/ccw/dist/tools/core-memory.js.map +1 -1
- package/ccw/dist/tools/index.d.ts.map +1 -1
- package/ccw/dist/tools/index.js +2 -0
- package/ccw/dist/tools/index.js.map +1 -1
- package/ccw/dist/tools/litellm-client.d.ts +85 -0
- package/ccw/dist/tools/litellm-client.d.ts.map +1 -0
- package/ccw/dist/tools/litellm-client.js +188 -0
- package/ccw/dist/tools/litellm-client.js.map +1 -0
- package/ccw/dist/tools/litellm-executor.d.ts +34 -0
- package/ccw/dist/tools/litellm-executor.d.ts.map +1 -0
- package/ccw/dist/tools/litellm-executor.js +192 -0
- package/ccw/dist/tools/litellm-executor.js.map +1 -0
- package/ccw/dist/tools/pattern-parser.d.ts +55 -0
- package/ccw/dist/tools/pattern-parser.d.ts.map +1 -0
- package/ccw/dist/tools/pattern-parser.js +237 -0
- package/ccw/dist/tools/pattern-parser.js.map +1 -0
- package/ccw/dist/tools/smart-search.d.ts +1 -0
- package/ccw/dist/tools/smart-search.d.ts.map +1 -1
- package/ccw/dist/tools/smart-search.js +117 -41
- package/ccw/dist/tools/smart-search.js.map +1 -1
- package/ccw/dist/types/litellm-api-config.d.ts +294 -0
- package/ccw/dist/types/litellm-api-config.d.ts.map +1 -0
- package/ccw/dist/types/litellm-api-config.js +8 -0
- package/ccw/dist/types/litellm-api-config.js.map +1 -0
- package/ccw/src/cli.ts +258 -244
- package/ccw/src/commands/cli.ts +153 -9
- package/ccw/src/commands/hook.ts +3 -2
- package/ccw/src/config/.litellm-api-config-manager.ts.2025-12-23T11-57-43-727Z.bak +441 -0
- package/ccw/src/config/litellm-api-config-manager.ts +1012 -0
- package/ccw/src/config/provider-models.ts +222 -0
- package/ccw/src/core/cache-manager.ts +292 -294
- package/ccw/src/core/dashboard-generator.ts +3 -1
- package/ccw/src/core/routes/cli-routes.ts +192 -0
- package/ccw/src/core/routes/codexlens-routes.ts +241 -19
- package/ccw/src/core/routes/hooks-routes.ts +399 -405
- package/ccw/src/core/routes/litellm-api-routes.ts +930 -0
- package/ccw/src/core/routes/litellm-routes.ts +107 -0
- package/ccw/src/core/routes/mcp-routes.ts +1271 -1271
- package/ccw/src/core/routes/status-routes.ts +51 -0
- package/ccw/src/core/routes/system-routes.ts +1 -1
- package/ccw/src/core/server.ts +15 -1
- package/ccw/src/mcp-server/index.ts +1 -1
- package/ccw/src/templates/dashboard-css/12-cli-legacy.css +44 -0
- package/ccw/src/templates/dashboard-css/31-api-settings.css +2265 -0
- package/ccw/src/templates/dashboard-js/components/cli-history.js +15 -8
- package/ccw/src/templates/dashboard-js/components/cli-status.js +323 -9
- package/ccw/src/templates/dashboard-js/components/navigation.js +329 -313
- package/ccw/src/templates/dashboard-js/i18n.js +583 -1
- package/ccw/src/templates/dashboard-js/views/api-settings.js +3362 -0
- package/ccw/src/templates/dashboard-js/views/cli-manager.js +199 -24
- package/ccw/src/templates/dashboard-js/views/codexlens-manager.js +1265 -27
- package/ccw/src/templates/dashboard.html +840 -831
- package/ccw/src/tools/claude-cli-tools.ts +300 -0
- package/ccw/src/tools/cli-executor.ts +83 -14
- package/ccw/src/tools/codex-lens.ts +146 -9
- package/ccw/src/tools/context-cache-store.ts +368 -0
- package/ccw/src/tools/context-cache.ts +393 -0
- package/ccw/src/tools/core-memory.ts +33 -19
- package/ccw/src/tools/index.ts +2 -0
- package/ccw/src/tools/litellm-client.ts +246 -0
- package/ccw/src/tools/litellm-executor.ts +241 -0
- package/ccw/src/tools/pattern-parser.ts +329 -0
- package/ccw/src/tools/smart-search.ts +142 -41
- package/ccw/src/types/litellm-api-config.ts +402 -0
- package/ccw-litellm/README.md +180 -0
- package/ccw-litellm/pyproject.toml +35 -0
- package/ccw-litellm/src/ccw_litellm/__init__.py +47 -0
- package/ccw-litellm/src/ccw_litellm/__pycache__/__init__.cpython-313.pyc +0 -0
- package/ccw-litellm/src/ccw_litellm/__pycache__/cli.cpython-313.pyc +0 -0
- package/ccw-litellm/src/ccw_litellm/cli.py +108 -0
- package/ccw-litellm/src/ccw_litellm/clients/__init__.py +12 -0
- package/ccw-litellm/src/ccw_litellm/clients/__pycache__/__init__.cpython-313.pyc +0 -0
- package/ccw-litellm/src/ccw_litellm/clients/__pycache__/litellm_embedder.cpython-313.pyc +0 -0
- package/ccw-litellm/src/ccw_litellm/clients/__pycache__/litellm_llm.cpython-313.pyc +0 -0
- package/ccw-litellm/src/ccw_litellm/clients/litellm_embedder.py +251 -0
- package/ccw-litellm/src/ccw_litellm/clients/litellm_llm.py +165 -0
- package/ccw-litellm/src/ccw_litellm/config/__init__.py +22 -0
- package/ccw-litellm/src/ccw_litellm/config/__pycache__/__init__.cpython-313.pyc +0 -0
- package/ccw-litellm/src/ccw_litellm/config/__pycache__/loader.cpython-313.pyc +0 -0
- package/ccw-litellm/src/ccw_litellm/config/__pycache__/models.cpython-313.pyc +0 -0
- package/ccw-litellm/src/ccw_litellm/config/loader.py +316 -0
- package/ccw-litellm/src/ccw_litellm/config/models.py +130 -0
- package/ccw-litellm/src/ccw_litellm/interfaces/__init__.py +14 -0
- package/ccw-litellm/src/ccw_litellm/interfaces/__pycache__/__init__.cpython-313.pyc +0 -0
- package/ccw-litellm/src/ccw_litellm/interfaces/__pycache__/embedder.cpython-313.pyc +0 -0
- package/ccw-litellm/src/ccw_litellm/interfaces/__pycache__/llm.cpython-313.pyc +0 -0
- package/ccw-litellm/src/ccw_litellm/interfaces/embedder.py +52 -0
- package/ccw-litellm/src/ccw_litellm/interfaces/llm.py +45 -0
- package/codex-lens/src/codexlens/__pycache__/config.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/cli/__pycache__/commands.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/cli/__pycache__/embedding_manager.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/cli/__pycache__/model_manager.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/cli/__pycache__/output.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/cli/commands.py +378 -23
- package/codex-lens/src/codexlens/cli/embedding_manager.py +660 -56
- package/codex-lens/src/codexlens/cli/model_manager.py +31 -18
- package/codex-lens/src/codexlens/cli/output.py +12 -1
- package/codex-lens/src/codexlens/config.py +93 -0
- package/codex-lens/src/codexlens/search/__pycache__/chain_search.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/search/__pycache__/hybrid_search.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/search/__pycache__/ranking.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/search/chain_search.py +6 -2
- package/codex-lens/src/codexlens/search/hybrid_search.py +44 -21
- package/codex-lens/src/codexlens/search/ranking.py +1 -1
- package/codex-lens/src/codexlens/semantic/__init__.py +42 -0
- package/codex-lens/src/codexlens/semantic/__pycache__/__init__.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/__pycache__/base.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/__pycache__/chunker.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/__pycache__/embedder.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/__pycache__/factory.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/__pycache__/gpu_support.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/__pycache__/litellm_embedder.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/__pycache__/vector_store.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/base.py +61 -0
- package/codex-lens/src/codexlens/semantic/chunker.py +43 -20
- package/codex-lens/src/codexlens/semantic/embedder.py +60 -13
- package/codex-lens/src/codexlens/semantic/factory.py +98 -0
- package/codex-lens/src/codexlens/semantic/gpu_support.py +225 -3
- package/codex-lens/src/codexlens/semantic/litellm_embedder.py +144 -0
- package/codex-lens/src/codexlens/semantic/rotational_embedder.py +434 -0
- package/codex-lens/src/codexlens/semantic/vector_store.py +33 -8
- package/codex-lens/src/codexlens/storage/__pycache__/path_mapper.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/storage/migrations/__pycache__/migration_004_dual_fts.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/storage/path_mapper.py +27 -1
- package/package.json +15 -5
- package/.codex/prompts.zip +0 -0
- package/ccw/package.json +0 -65
|
@@ -1,27 +1,36 @@
|
|
|
1
1
|
"""Embedding Manager - Manage semantic embeddings for code indexes."""
|
|
2
2
|
|
|
3
3
|
import gc
|
|
4
|
+
import json
|
|
4
5
|
import logging
|
|
5
6
|
import sqlite3
|
|
6
7
|
import time
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
7
9
|
from itertools import islice
|
|
8
10
|
from pathlib import Path
|
|
9
|
-
from typing import Dict, Generator, List, Optional, Tuple
|
|
11
|
+
from typing import Any, Dict, Generator, List, Optional, Tuple
|
|
10
12
|
|
|
11
13
|
try:
|
|
12
|
-
from codexlens.semantic import SEMANTIC_AVAILABLE
|
|
13
|
-
if SEMANTIC_AVAILABLE:
|
|
14
|
-
from codexlens.semantic.embedder import Embedder, get_embedder, clear_embedder_cache
|
|
15
|
-
from codexlens.semantic.vector_store import VectorStore
|
|
16
|
-
from codexlens.semantic.chunker import Chunker, ChunkConfig
|
|
14
|
+
from codexlens.semantic import SEMANTIC_AVAILABLE, is_embedding_backend_available
|
|
17
15
|
except ImportError:
|
|
18
16
|
SEMANTIC_AVAILABLE = False
|
|
17
|
+
def is_embedding_backend_available(_backend: str): # type: ignore[no-redef]
|
|
18
|
+
return False, "codexlens.semantic not available"
|
|
19
19
|
|
|
20
20
|
logger = logging.getLogger(__name__)
|
|
21
21
|
|
|
22
22
|
# Embedding batch size - larger values improve throughput on modern hardware
|
|
23
23
|
# Benchmark: 256 gives ~2.35x speedup over 64 with DirectML GPU acceleration
|
|
24
|
-
EMBEDDING_BATCH_SIZE = 256
|
|
24
|
+
EMBEDDING_BATCH_SIZE = 256
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _cleanup_fastembed_resources() -> None:
|
|
28
|
+
"""Best-effort cleanup for fastembed/ONNX resources (no-op for other backends)."""
|
|
29
|
+
try:
|
|
30
|
+
from codexlens.semantic.embedder import clear_embedder_cache
|
|
31
|
+
clear_embedder_cache()
|
|
32
|
+
except Exception:
|
|
33
|
+
pass
|
|
25
34
|
|
|
26
35
|
|
|
27
36
|
def _generate_chunks_from_cursor(
|
|
@@ -79,6 +88,44 @@ def _generate_chunks_from_cursor(
|
|
|
79
88
|
failed_files.append((file_path, str(e)))
|
|
80
89
|
|
|
81
90
|
|
|
91
|
+
def _create_token_aware_batches(
|
|
92
|
+
chunk_generator: Generator,
|
|
93
|
+
max_tokens_per_batch: int = 8000,
|
|
94
|
+
) -> Generator[List[Tuple], None, None]:
|
|
95
|
+
"""Group chunks by total token count instead of fixed count.
|
|
96
|
+
|
|
97
|
+
Uses fast token estimation (len(content) // 4) for efficiency.
|
|
98
|
+
Yields batches when approaching the token limit.
|
|
99
|
+
|
|
100
|
+
Args:
|
|
101
|
+
chunk_generator: Generator yielding (chunk, file_path) tuples
|
|
102
|
+
max_tokens_per_batch: Maximum tokens per batch (default: 8000)
|
|
103
|
+
|
|
104
|
+
Yields:
|
|
105
|
+
List of (chunk, file_path) tuples representing a batch
|
|
106
|
+
"""
|
|
107
|
+
current_batch = []
|
|
108
|
+
current_tokens = 0
|
|
109
|
+
|
|
110
|
+
for chunk, file_path in chunk_generator:
|
|
111
|
+
# Fast token estimation: len(content) // 4
|
|
112
|
+
chunk_tokens = len(chunk.content) // 4
|
|
113
|
+
|
|
114
|
+
# If adding this chunk would exceed limit and we have items, yield current batch
|
|
115
|
+
if current_tokens + chunk_tokens > max_tokens_per_batch and current_batch:
|
|
116
|
+
yield current_batch
|
|
117
|
+
current_batch = []
|
|
118
|
+
current_tokens = 0
|
|
119
|
+
|
|
120
|
+
# Add chunk to current batch
|
|
121
|
+
current_batch.append((chunk, file_path))
|
|
122
|
+
current_tokens += chunk_tokens
|
|
123
|
+
|
|
124
|
+
# Yield final batch if not empty
|
|
125
|
+
if current_batch:
|
|
126
|
+
yield current_batch
|
|
127
|
+
|
|
128
|
+
|
|
82
129
|
def _get_path_column(conn: sqlite3.Connection) -> str:
|
|
83
130
|
"""Detect whether files table uses 'path' or 'full_path' column.
|
|
84
131
|
|
|
@@ -189,33 +236,110 @@ def check_index_embeddings(index_path: Path) -> Dict[str, any]:
|
|
|
189
236
|
}
|
|
190
237
|
|
|
191
238
|
|
|
239
|
+
def _get_embedding_defaults() -> tuple[str, str, bool, List, str, float]:
|
|
240
|
+
"""Get default embedding settings from config.
|
|
241
|
+
|
|
242
|
+
Returns:
|
|
243
|
+
Tuple of (backend, model, use_gpu, endpoints, strategy, cooldown)
|
|
244
|
+
"""
|
|
245
|
+
try:
|
|
246
|
+
from codexlens.config import Config
|
|
247
|
+
config = Config.load()
|
|
248
|
+
return (
|
|
249
|
+
config.embedding_backend,
|
|
250
|
+
config.embedding_model,
|
|
251
|
+
config.embedding_use_gpu,
|
|
252
|
+
config.embedding_endpoints,
|
|
253
|
+
config.embedding_strategy,
|
|
254
|
+
config.embedding_cooldown,
|
|
255
|
+
)
|
|
256
|
+
except Exception:
|
|
257
|
+
return "fastembed", "code", True, [], "latency_aware", 60.0
|
|
258
|
+
|
|
259
|
+
|
|
192
260
|
def generate_embeddings(
|
|
193
261
|
index_path: Path,
|
|
194
|
-
|
|
262
|
+
embedding_backend: Optional[str] = None,
|
|
263
|
+
model_profile: Optional[str] = None,
|
|
195
264
|
force: bool = False,
|
|
196
265
|
chunk_size: int = 2000,
|
|
266
|
+
overlap: int = 200,
|
|
197
267
|
progress_callback: Optional[callable] = None,
|
|
268
|
+
use_gpu: Optional[bool] = None,
|
|
269
|
+
max_tokens_per_batch: Optional[int] = None,
|
|
270
|
+
max_workers: Optional[int] = None,
|
|
271
|
+
endpoints: Optional[List] = None,
|
|
272
|
+
strategy: Optional[str] = None,
|
|
273
|
+
cooldown: Optional[float] = None,
|
|
198
274
|
) -> Dict[str, any]:
|
|
199
275
|
"""Generate embeddings for an index using memory-efficient batch processing.
|
|
200
276
|
|
|
201
277
|
This function processes files in small batches to keep memory usage under 2GB,
|
|
202
|
-
regardless of the total project size.
|
|
278
|
+
regardless of the total project size. Supports concurrent API calls for
|
|
279
|
+
LiteLLM backend to improve throughput.
|
|
203
280
|
|
|
204
281
|
Args:
|
|
205
282
|
index_path: Path to _index.db file
|
|
206
|
-
|
|
283
|
+
embedding_backend: Embedding backend to use (fastembed or litellm).
|
|
284
|
+
Defaults to config setting.
|
|
285
|
+
model_profile: Model profile for fastembed (fast, code, multilingual, balanced)
|
|
286
|
+
or model name for litellm (e.g., qwen3-embedding).
|
|
287
|
+
Defaults to config setting.
|
|
207
288
|
force: If True, regenerate even if embeddings exist
|
|
208
289
|
chunk_size: Maximum chunk size in characters
|
|
290
|
+
overlap: Overlap size in characters for sliding window chunking (default: 200)
|
|
209
291
|
progress_callback: Optional callback for progress updates
|
|
292
|
+
use_gpu: Whether to use GPU acceleration (fastembed only).
|
|
293
|
+
Defaults to config setting.
|
|
294
|
+
max_tokens_per_batch: Maximum tokens per batch for token-aware batching.
|
|
295
|
+
If None, attempts to get from embedder.max_tokens,
|
|
296
|
+
then falls back to 8000. If set, overrides automatic detection.
|
|
297
|
+
max_workers: Maximum number of concurrent API calls.
|
|
298
|
+
If None, uses dynamic defaults based on backend and endpoint count.
|
|
299
|
+
endpoints: Optional list of endpoint configurations for multi-API load balancing.
|
|
300
|
+
Each dict has keys: model, api_key, api_base, weight.
|
|
301
|
+
strategy: Selection strategy for multi-endpoint mode (round_robin, latency_aware).
|
|
302
|
+
cooldown: Default cooldown seconds for rate-limited endpoints.
|
|
210
303
|
|
|
211
304
|
Returns:
|
|
212
305
|
Result dictionary with generation statistics
|
|
213
306
|
"""
|
|
214
|
-
if not
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
307
|
+
# Get defaults from config if not specified
|
|
308
|
+
(default_backend, default_model, default_gpu,
|
|
309
|
+
default_endpoints, default_strategy, default_cooldown) = _get_embedding_defaults()
|
|
310
|
+
|
|
311
|
+
if embedding_backend is None:
|
|
312
|
+
embedding_backend = default_backend
|
|
313
|
+
if model_profile is None:
|
|
314
|
+
model_profile = default_model
|
|
315
|
+
if use_gpu is None:
|
|
316
|
+
use_gpu = default_gpu
|
|
317
|
+
if endpoints is None:
|
|
318
|
+
endpoints = default_endpoints
|
|
319
|
+
if strategy is None:
|
|
320
|
+
strategy = default_strategy
|
|
321
|
+
if cooldown is None:
|
|
322
|
+
cooldown = default_cooldown
|
|
323
|
+
|
|
324
|
+
# Calculate endpoint count for worker scaling
|
|
325
|
+
endpoint_count = len(endpoints) if endpoints else 1
|
|
326
|
+
|
|
327
|
+
# Set dynamic max_workers default based on backend type and endpoint count
|
|
328
|
+
# - FastEmbed: CPU-bound, sequential is optimal (1 worker)
|
|
329
|
+
# - LiteLLM single endpoint: 4 workers default
|
|
330
|
+
# - LiteLLM multi-endpoint: workers = endpoint_count * 2 (to saturate all APIs)
|
|
331
|
+
if max_workers is None:
|
|
332
|
+
if embedding_backend == "litellm":
|
|
333
|
+
if endpoint_count > 1:
|
|
334
|
+
max_workers = endpoint_count * 2 # No cap, scale with endpoints
|
|
335
|
+
else:
|
|
336
|
+
max_workers = 4
|
|
337
|
+
else:
|
|
338
|
+
max_workers = 1
|
|
339
|
+
|
|
340
|
+
backend_available, backend_error = is_embedding_backend_available(embedding_backend)
|
|
341
|
+
if not backend_available:
|
|
342
|
+
return {"success": False, "error": backend_error or "Embedding backend not available"}
|
|
219
343
|
|
|
220
344
|
if not index_path.exists():
|
|
221
345
|
return {
|
|
@@ -253,13 +377,43 @@ def generate_embeddings(
|
|
|
253
377
|
|
|
254
378
|
# Initialize components
|
|
255
379
|
try:
|
|
256
|
-
#
|
|
257
|
-
|
|
380
|
+
# Import factory function to support both backends
|
|
381
|
+
from codexlens.semantic.factory import get_embedder as get_embedder_factory
|
|
382
|
+
from codexlens.semantic.vector_store import VectorStore
|
|
383
|
+
from codexlens.semantic.chunker import Chunker, ChunkConfig
|
|
384
|
+
|
|
385
|
+
# Initialize embedder using factory (supports fastembed, litellm, and rotational)
|
|
386
|
+
# For fastembed: model_profile is a profile name (fast/code/multilingual/balanced)
|
|
387
|
+
# For litellm: model_profile is a model name (e.g., qwen3-embedding)
|
|
388
|
+
# For multi-endpoint: endpoints list enables load balancing
|
|
389
|
+
if embedding_backend == "fastembed":
|
|
390
|
+
embedder = get_embedder_factory(backend="fastembed", profile=model_profile, use_gpu=use_gpu)
|
|
391
|
+
elif embedding_backend == "litellm":
|
|
392
|
+
embedder = get_embedder_factory(
|
|
393
|
+
backend="litellm",
|
|
394
|
+
model=model_profile,
|
|
395
|
+
endpoints=endpoints if endpoints else None,
|
|
396
|
+
strategy=strategy,
|
|
397
|
+
cooldown=cooldown,
|
|
398
|
+
)
|
|
399
|
+
else:
|
|
400
|
+
return {
|
|
401
|
+
"success": False,
|
|
402
|
+
"error": f"Invalid embedding backend: {embedding_backend}. Must be 'fastembed' or 'litellm'.",
|
|
403
|
+
}
|
|
404
|
+
|
|
258
405
|
# skip_token_count=True: Use fast estimation (len/4) instead of expensive tiktoken
|
|
259
406
|
# This significantly reduces CPU usage with minimal impact on metadata accuracy
|
|
260
|
-
chunker = Chunker(config=ChunkConfig(
|
|
407
|
+
chunker = Chunker(config=ChunkConfig(
|
|
408
|
+
max_chunk_size=chunk_size,
|
|
409
|
+
overlap=overlap,
|
|
410
|
+
skip_token_count=True
|
|
411
|
+
))
|
|
261
412
|
|
|
413
|
+
# Log embedder info with endpoint count for multi-endpoint mode
|
|
262
414
|
if progress_callback:
|
|
415
|
+
if endpoint_count > 1:
|
|
416
|
+
progress_callback(f"Using {endpoint_count} API endpoints with {strategy} strategy")
|
|
263
417
|
progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
|
|
264
418
|
|
|
265
419
|
except Exception as e:
|
|
@@ -292,7 +446,7 @@ def generate_embeddings(
|
|
|
292
446
|
|
|
293
447
|
# Set/update model configuration for this index
|
|
294
448
|
vector_store.set_model_config(
|
|
295
|
-
model_profile, embedder.model_name, embedder.embedding_dim
|
|
449
|
+
model_profile, embedder.model_name, embedder.embedding_dim, backend=embedding_backend
|
|
296
450
|
)
|
|
297
451
|
# Use bulk insert mode for efficient batch ANN index building
|
|
298
452
|
# This defers ANN updates until end_bulk_insert() is called
|
|
@@ -319,42 +473,203 @@ def generate_embeddings(
|
|
|
319
473
|
cursor, chunker, path_column, FILE_BATCH_SIZE, failed_files
|
|
320
474
|
)
|
|
321
475
|
|
|
476
|
+
# Determine max tokens per batch
|
|
477
|
+
# Priority: explicit parameter > embedder.max_tokens > default 8000
|
|
478
|
+
if max_tokens_per_batch is None:
|
|
479
|
+
max_tokens_per_batch = getattr(embedder, 'max_tokens', 8000)
|
|
480
|
+
|
|
481
|
+
# Create token-aware batches or fall back to fixed-size batching
|
|
482
|
+
if max_tokens_per_batch:
|
|
483
|
+
batch_generator = _create_token_aware_batches(
|
|
484
|
+
chunk_generator, max_tokens_per_batch
|
|
485
|
+
)
|
|
486
|
+
else:
|
|
487
|
+
# Fallback to fixed-size batching for backward compatibility
|
|
488
|
+
def fixed_size_batches():
|
|
489
|
+
while True:
|
|
490
|
+
batch = list(islice(chunk_generator, EMBEDDING_BATCH_SIZE))
|
|
491
|
+
if not batch:
|
|
492
|
+
break
|
|
493
|
+
yield batch
|
|
494
|
+
batch_generator = fixed_size_batches()
|
|
495
|
+
|
|
322
496
|
batch_number = 0
|
|
323
497
|
files_seen = set()
|
|
324
498
|
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
chunk_batch = list(islice(chunk_generator, EMBEDDING_BATCH_SIZE))
|
|
328
|
-
if not chunk_batch:
|
|
329
|
-
break
|
|
499
|
+
def compute_embeddings_only(batch_data: Tuple[int, List[Tuple]]):
|
|
500
|
+
"""Compute embeddings for a batch (no DB write) with retry logic.
|
|
330
501
|
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
# Track unique files for progress
|
|
334
|
-
for _, file_path in chunk_batch:
|
|
335
|
-
files_seen.add(file_path)
|
|
502
|
+
Args:
|
|
503
|
+
batch_data: Tuple of (batch_number, chunk_batch)
|
|
336
504
|
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
505
|
+
Returns:
|
|
506
|
+
Tuple of (batch_num, chunk_batch, embeddings_numpy, batch_files, error)
|
|
507
|
+
"""
|
|
508
|
+
import random
|
|
341
509
|
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
510
|
+
batch_num, chunk_batch = batch_data
|
|
511
|
+
batch_files = set()
|
|
512
|
+
for _, file_path in chunk_batch:
|
|
513
|
+
batch_files.add(file_path)
|
|
514
|
+
|
|
515
|
+
max_retries = 5
|
|
516
|
+
base_delay = 2.0
|
|
517
|
+
|
|
518
|
+
for attempt in range(max_retries + 1):
|
|
519
|
+
try:
|
|
520
|
+
batch_contents = [chunk.content for chunk, _ in chunk_batch]
|
|
521
|
+
embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=EMBEDDING_BATCH_SIZE)
|
|
522
|
+
return batch_num, chunk_batch, embeddings_numpy, batch_files, None
|
|
523
|
+
|
|
524
|
+
except Exception as e:
|
|
525
|
+
error_str = str(e).lower()
|
|
526
|
+
# Check for retryable errors (rate limit, connection, backend issues)
|
|
527
|
+
# Note: Some backends (e.g., ModelScope) return 400 with nested 500 errors
|
|
528
|
+
is_retryable = any(x in error_str for x in [
|
|
529
|
+
"429", "rate limit", "connection", "timeout",
|
|
530
|
+
"502", "503", "504", "service unavailable",
|
|
531
|
+
"500", "400", "badrequesterror", "internal server error",
|
|
532
|
+
"11434" # Ollama port - indicates backend routing issue
|
|
533
|
+
])
|
|
534
|
+
|
|
535
|
+
if attempt < max_retries and is_retryable:
|
|
536
|
+
sleep_time = base_delay * (2 ** attempt) + random.uniform(0, 0.5)
|
|
537
|
+
logger.warning(f"Batch {batch_num} failed (attempt {attempt+1}/{max_retries+1}). "
|
|
538
|
+
f"Retrying in {sleep_time:.1f}s. Error: {e}")
|
|
539
|
+
time.sleep(sleep_time)
|
|
540
|
+
continue
|
|
541
|
+
|
|
542
|
+
error_msg = f"Batch {batch_num}: {str(e)}"
|
|
543
|
+
logger.error(f"Failed to compute embeddings for batch {batch_num}: {str(e)}")
|
|
544
|
+
return batch_num, chunk_batch, None, batch_files, error_msg
|
|
545
|
+
|
|
546
|
+
# Should not reach here, but just in case
|
|
547
|
+
return batch_num, chunk_batch, None, batch_files, f"Batch {batch_num}: Max retries exceeded"
|
|
548
|
+
|
|
549
|
+
# Process batches based on max_workers setting
|
|
550
|
+
if max_workers <= 1:
|
|
551
|
+
# Sequential processing - stream directly from generator (no pre-materialization)
|
|
552
|
+
for chunk_batch in batch_generator:
|
|
553
|
+
batch_number += 1
|
|
554
|
+
|
|
555
|
+
# Track files in this batch
|
|
556
|
+
batch_files = set()
|
|
557
|
+
for _, file_path in chunk_batch:
|
|
558
|
+
batch_files.add(file_path)
|
|
559
|
+
|
|
560
|
+
# Retry logic for transient backend errors
|
|
561
|
+
max_retries = 5
|
|
562
|
+
base_delay = 2.0
|
|
563
|
+
success = False
|
|
564
|
+
|
|
565
|
+
for attempt in range(max_retries + 1):
|
|
566
|
+
try:
|
|
567
|
+
# Generate embeddings
|
|
568
|
+
batch_contents = [chunk.content for chunk, _ in chunk_batch]
|
|
569
|
+
embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=EMBEDDING_BATCH_SIZE)
|
|
570
|
+
|
|
571
|
+
# Store embeddings
|
|
572
|
+
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
|
|
573
|
+
|
|
574
|
+
files_seen.update(batch_files)
|
|
575
|
+
total_chunks_created += len(chunk_batch)
|
|
576
|
+
total_files_processed = len(files_seen)
|
|
577
|
+
success = True
|
|
578
|
+
break
|
|
579
|
+
|
|
580
|
+
except Exception as e:
|
|
581
|
+
error_str = str(e).lower()
|
|
582
|
+
# Check for retryable errors (rate limit, connection, backend issues)
|
|
583
|
+
is_retryable = any(x in error_str for x in [
|
|
584
|
+
"429", "rate limit", "connection", "timeout",
|
|
585
|
+
"502", "503", "504", "service unavailable",
|
|
586
|
+
"500", "400", "badrequesterror", "internal server error",
|
|
587
|
+
"11434" # Ollama port - indicates backend routing issue
|
|
588
|
+
])
|
|
589
|
+
|
|
590
|
+
if attempt < max_retries and is_retryable:
|
|
591
|
+
import random
|
|
592
|
+
sleep_time = base_delay * (2 ** attempt) + random.uniform(0, 0.5)
|
|
593
|
+
logger.warning(f"Batch {batch_number} failed (attempt {attempt+1}/{max_retries+1}). "
|
|
594
|
+
f"Retrying in {sleep_time:.1f}s. Error: {e}")
|
|
595
|
+
time.sleep(sleep_time)
|
|
596
|
+
continue
|
|
597
|
+
|
|
598
|
+
logger.error(f"Failed to process batch {batch_number}: {str(e)}")
|
|
599
|
+
files_seen.update(batch_files)
|
|
600
|
+
break
|
|
601
|
+
|
|
602
|
+
if success and progress_callback and batch_number % 10 == 0:
|
|
349
603
|
progress_callback(f" Batch {batch_number}: {total_chunks_created} chunks, {total_files_processed} files")
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
604
|
+
else:
|
|
605
|
+
# Concurrent processing - main thread iterates batches (SQLite safe),
|
|
606
|
+
# workers compute embeddings (parallel), main thread writes to DB (serial)
|
|
607
|
+
if progress_callback:
|
|
608
|
+
progress_callback(f"Processing with {max_workers} concurrent embedding workers...")
|
|
609
|
+
|
|
610
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
611
|
+
pending_futures = {} # future -> (batch_num, chunk_batch)
|
|
612
|
+
completed_batches = 0
|
|
613
|
+
last_reported_batch = 0
|
|
614
|
+
|
|
615
|
+
def process_completed_futures():
|
|
616
|
+
"""Process any completed futures and write to DB."""
|
|
617
|
+
nonlocal total_chunks_created, total_files_processed, completed_batches, last_reported_batch
|
|
618
|
+
done_futures = [f for f in pending_futures if f.done()]
|
|
619
|
+
for f in done_futures:
|
|
620
|
+
try:
|
|
621
|
+
batch_num, chunk_batch, embeddings_numpy, batch_files, error = f.result()
|
|
622
|
+
if embeddings_numpy is not None and error is None:
|
|
623
|
+
# Write to DB in main thread (no contention)
|
|
624
|
+
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
|
|
625
|
+
total_chunks_created += len(chunk_batch)
|
|
626
|
+
files_seen.update(batch_files)
|
|
627
|
+
total_files_processed = len(files_seen)
|
|
628
|
+
completed_batches += 1
|
|
629
|
+
except Exception as e:
|
|
630
|
+
logger.error(f"Future raised exception: {e}")
|
|
631
|
+
completed_batches += 1
|
|
632
|
+
del pending_futures[f]
|
|
633
|
+
|
|
634
|
+
# Report progress based on completed batches (every 5 batches)
|
|
635
|
+
if progress_callback and completed_batches >= last_reported_batch + 5:
|
|
636
|
+
progress_callback(f" Batch {completed_batches}: {total_chunks_created} chunks, {total_files_processed} files")
|
|
637
|
+
last_reported_batch = completed_batches
|
|
638
|
+
|
|
639
|
+
# Iterate batches in main thread (SQLite cursor is main-thread bound)
|
|
640
|
+
for chunk_batch in batch_generator:
|
|
641
|
+
batch_number += 1
|
|
642
|
+
|
|
643
|
+
# Submit compute task to worker pool
|
|
644
|
+
future = executor.submit(compute_embeddings_only, (batch_number, chunk_batch))
|
|
645
|
+
pending_futures[future] = batch_number
|
|
646
|
+
|
|
647
|
+
# Process any completed futures to free memory and write to DB
|
|
648
|
+
process_completed_futures()
|
|
649
|
+
|
|
650
|
+
# Backpressure: wait if too many pending
|
|
651
|
+
while len(pending_futures) >= max_workers * 2:
|
|
652
|
+
process_completed_futures()
|
|
653
|
+
if len(pending_futures) >= max_workers * 2:
|
|
654
|
+
time.sleep(0.1) # time is imported at module level
|
|
655
|
+
|
|
656
|
+
# Wait for remaining futures
|
|
657
|
+
for future in as_completed(list(pending_futures.keys())):
|
|
658
|
+
try:
|
|
659
|
+
batch_num, chunk_batch, embeddings_numpy, batch_files, error = future.result()
|
|
660
|
+
if embeddings_numpy is not None and error is None:
|
|
661
|
+
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
|
|
662
|
+
total_chunks_created += len(chunk_batch)
|
|
663
|
+
files_seen.update(batch_files)
|
|
664
|
+
total_files_processed = len(files_seen)
|
|
665
|
+
completed_batches += 1
|
|
666
|
+
|
|
667
|
+
# Report progress for remaining batches
|
|
668
|
+
if progress_callback and completed_batches >= last_reported_batch + 5:
|
|
669
|
+
progress_callback(f" Batch {completed_batches}: {total_chunks_created} chunks, {total_files_processed} files")
|
|
670
|
+
last_reported_batch = completed_batches
|
|
671
|
+
except Exception as e:
|
|
672
|
+
logger.error(f"Future raised exception: {e}")
|
|
358
673
|
|
|
359
674
|
# Notify before ANN index finalization (happens when bulk_insert context exits)
|
|
360
675
|
if progress_callback:
|
|
@@ -363,7 +678,7 @@ def generate_embeddings(
|
|
|
363
678
|
except Exception as e:
|
|
364
679
|
# Cleanup on error to prevent process hanging
|
|
365
680
|
try:
|
|
366
|
-
|
|
681
|
+
_cleanup_fastembed_resources()
|
|
367
682
|
gc.collect()
|
|
368
683
|
except Exception:
|
|
369
684
|
pass
|
|
@@ -374,7 +689,7 @@ def generate_embeddings(
|
|
|
374
689
|
# Final cleanup: release ONNX resources to allow process exit
|
|
375
690
|
# This is critical - without it, ONNX Runtime threads prevent Python from exiting
|
|
376
691
|
try:
|
|
377
|
-
|
|
692
|
+
_cleanup_fastembed_resources()
|
|
378
693
|
gc.collect()
|
|
379
694
|
except Exception:
|
|
380
695
|
pass
|
|
@@ -427,23 +742,76 @@ def find_all_indexes(scan_dir: Path) -> List[Path]:
|
|
|
427
742
|
|
|
428
743
|
def generate_embeddings_recursive(
|
|
429
744
|
index_root: Path,
|
|
430
|
-
|
|
745
|
+
embedding_backend: Optional[str] = None,
|
|
746
|
+
model_profile: Optional[str] = None,
|
|
431
747
|
force: bool = False,
|
|
432
748
|
chunk_size: int = 2000,
|
|
749
|
+
overlap: int = 200,
|
|
433
750
|
progress_callback: Optional[callable] = None,
|
|
751
|
+
use_gpu: Optional[bool] = None,
|
|
752
|
+
max_tokens_per_batch: Optional[int] = None,
|
|
753
|
+
max_workers: Optional[int] = None,
|
|
754
|
+
endpoints: Optional[List] = None,
|
|
755
|
+
strategy: Optional[str] = None,
|
|
756
|
+
cooldown: Optional[float] = None,
|
|
434
757
|
) -> Dict[str, any]:
|
|
435
758
|
"""Generate embeddings for all index databases in a project recursively.
|
|
436
759
|
|
|
437
760
|
Args:
|
|
438
761
|
index_root: Root index directory containing _index.db files
|
|
439
|
-
|
|
762
|
+
embedding_backend: Embedding backend to use (fastembed or litellm).
|
|
763
|
+
Defaults to config setting.
|
|
764
|
+
model_profile: Model profile for fastembed (fast, code, multilingual, balanced)
|
|
765
|
+
or model name for litellm (e.g., qwen3-embedding).
|
|
766
|
+
Defaults to config setting.
|
|
440
767
|
force: If True, regenerate even if embeddings exist
|
|
441
768
|
chunk_size: Maximum chunk size in characters
|
|
769
|
+
overlap: Overlap size in characters for sliding window chunking (default: 200)
|
|
442
770
|
progress_callback: Optional callback for progress updates
|
|
771
|
+
use_gpu: Whether to use GPU acceleration (fastembed only).
|
|
772
|
+
Defaults to config setting.
|
|
773
|
+
max_tokens_per_batch: Maximum tokens per batch for token-aware batching.
|
|
774
|
+
If None, attempts to get from embedder.max_tokens,
|
|
775
|
+
then falls back to 8000. If set, overrides automatic detection.
|
|
776
|
+
max_workers: Maximum number of concurrent API calls.
|
|
777
|
+
If None, uses dynamic defaults based on backend and endpoint count.
|
|
778
|
+
endpoints: Optional list of endpoint configurations for multi-API load balancing.
|
|
779
|
+
strategy: Selection strategy for multi-endpoint mode.
|
|
780
|
+
cooldown: Default cooldown seconds for rate-limited endpoints.
|
|
443
781
|
|
|
444
782
|
Returns:
|
|
445
783
|
Aggregated result dictionary with generation statistics
|
|
446
784
|
"""
|
|
785
|
+
# Get defaults from config if not specified
|
|
786
|
+
(default_backend, default_model, default_gpu,
|
|
787
|
+
default_endpoints, default_strategy, default_cooldown) = _get_embedding_defaults()
|
|
788
|
+
|
|
789
|
+
if embedding_backend is None:
|
|
790
|
+
embedding_backend = default_backend
|
|
791
|
+
if model_profile is None:
|
|
792
|
+
model_profile = default_model
|
|
793
|
+
if use_gpu is None:
|
|
794
|
+
use_gpu = default_gpu
|
|
795
|
+
if endpoints is None:
|
|
796
|
+
endpoints = default_endpoints
|
|
797
|
+
if strategy is None:
|
|
798
|
+
strategy = default_strategy
|
|
799
|
+
if cooldown is None:
|
|
800
|
+
cooldown = default_cooldown
|
|
801
|
+
|
|
802
|
+
# Calculate endpoint count for worker scaling
|
|
803
|
+
endpoint_count = len(endpoints) if endpoints else 1
|
|
804
|
+
|
|
805
|
+
# Set dynamic max_workers default based on backend type and endpoint count
|
|
806
|
+
if max_workers is None:
|
|
807
|
+
if embedding_backend == "litellm":
|
|
808
|
+
if endpoint_count > 1:
|
|
809
|
+
max_workers = endpoint_count * 2 # No cap, scale with endpoints
|
|
810
|
+
else:
|
|
811
|
+
max_workers = 4
|
|
812
|
+
else:
|
|
813
|
+
max_workers = 1
|
|
814
|
+
|
|
447
815
|
# Discover all _index.db files
|
|
448
816
|
index_files = discover_all_index_dbs(index_root)
|
|
449
817
|
|
|
@@ -473,10 +841,18 @@ def generate_embeddings_recursive(
|
|
|
473
841
|
|
|
474
842
|
result = generate_embeddings(
|
|
475
843
|
index_path,
|
|
844
|
+
embedding_backend=embedding_backend,
|
|
476
845
|
model_profile=model_profile,
|
|
477
846
|
force=force,
|
|
478
847
|
chunk_size=chunk_size,
|
|
848
|
+
overlap=overlap,
|
|
479
849
|
progress_callback=None, # Don't cascade callbacks
|
|
850
|
+
use_gpu=use_gpu,
|
|
851
|
+
max_tokens_per_batch=max_tokens_per_batch,
|
|
852
|
+
max_workers=max_workers,
|
|
853
|
+
endpoints=endpoints,
|
|
854
|
+
strategy=strategy,
|
|
855
|
+
cooldown=cooldown,
|
|
480
856
|
)
|
|
481
857
|
|
|
482
858
|
all_results.append({
|
|
@@ -497,9 +873,8 @@ def generate_embeddings_recursive(
|
|
|
497
873
|
# Final cleanup after processing all indexes
|
|
498
874
|
# Each generate_embeddings() call does its own cleanup, but do a final one to be safe
|
|
499
875
|
try:
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
gc.collect()
|
|
876
|
+
_cleanup_fastembed_resources()
|
|
877
|
+
gc.collect()
|
|
503
878
|
except Exception:
|
|
504
879
|
pass
|
|
505
880
|
|
|
@@ -525,7 +900,7 @@ def get_embeddings_status(index_root: Path) -> Dict[str, any]:
|
|
|
525
900
|
index_root: Root index directory
|
|
526
901
|
|
|
527
902
|
Returns:
|
|
528
|
-
Aggregated status with coverage statistics
|
|
903
|
+
Aggregated status with coverage statistics, model info, and timestamps
|
|
529
904
|
"""
|
|
530
905
|
index_files = discover_all_index_dbs(index_root)
|
|
531
906
|
|
|
@@ -541,6 +916,7 @@ def get_embeddings_status(index_root: Path) -> Dict[str, any]:
|
|
|
541
916
|
"coverage_percent": 0.0,
|
|
542
917
|
"indexes_with_embeddings": 0,
|
|
543
918
|
"indexes_without_embeddings": 0,
|
|
919
|
+
"model_info": None,
|
|
544
920
|
},
|
|
545
921
|
}
|
|
546
922
|
|
|
@@ -548,6 +924,8 @@ def get_embeddings_status(index_root: Path) -> Dict[str, any]:
|
|
|
548
924
|
files_with_embeddings = 0
|
|
549
925
|
total_chunks = 0
|
|
550
926
|
indexes_with_embeddings = 0
|
|
927
|
+
model_info = None
|
|
928
|
+
latest_updated_at = None
|
|
551
929
|
|
|
552
930
|
for index_path in index_files:
|
|
553
931
|
status = check_index_embeddings(index_path)
|
|
@@ -559,6 +937,40 @@ def get_embeddings_status(index_root: Path) -> Dict[str, any]:
|
|
|
559
937
|
if result["has_embeddings"]:
|
|
560
938
|
indexes_with_embeddings += 1
|
|
561
939
|
|
|
940
|
+
# Get model config from first index with embeddings (they should all match)
|
|
941
|
+
if model_info is None:
|
|
942
|
+
try:
|
|
943
|
+
from codexlens.semantic.vector_store import VectorStore
|
|
944
|
+
with VectorStore(index_path) as vs:
|
|
945
|
+
config = vs.get_model_config()
|
|
946
|
+
if config:
|
|
947
|
+
model_info = {
|
|
948
|
+
"model_profile": config.get("model_profile"),
|
|
949
|
+
"model_name": config.get("model_name"),
|
|
950
|
+
"embedding_dim": config.get("embedding_dim"),
|
|
951
|
+
"backend": config.get("backend"),
|
|
952
|
+
"created_at": config.get("created_at"),
|
|
953
|
+
"updated_at": config.get("updated_at"),
|
|
954
|
+
}
|
|
955
|
+
latest_updated_at = config.get("updated_at")
|
|
956
|
+
except Exception:
|
|
957
|
+
pass
|
|
958
|
+
else:
|
|
959
|
+
# Track the latest updated_at across all indexes
|
|
960
|
+
try:
|
|
961
|
+
from codexlens.semantic.vector_store import VectorStore
|
|
962
|
+
with VectorStore(index_path) as vs:
|
|
963
|
+
config = vs.get_model_config()
|
|
964
|
+
if config and config.get("updated_at"):
|
|
965
|
+
if latest_updated_at is None or config["updated_at"] > latest_updated_at:
|
|
966
|
+
latest_updated_at = config["updated_at"]
|
|
967
|
+
except Exception:
|
|
968
|
+
pass
|
|
969
|
+
|
|
970
|
+
# Update model_info with latest timestamp
|
|
971
|
+
if model_info and latest_updated_at:
|
|
972
|
+
model_info["updated_at"] = latest_updated_at
|
|
973
|
+
|
|
562
974
|
return {
|
|
563
975
|
"success": True,
|
|
564
976
|
"result": {
|
|
@@ -570,6 +982,7 @@ def get_embeddings_status(index_root: Path) -> Dict[str, any]:
|
|
|
570
982
|
"coverage_percent": round((files_with_embeddings / total_files * 100) if total_files > 0 else 0, 1),
|
|
571
983
|
"indexes_with_embeddings": indexes_with_embeddings,
|
|
572
984
|
"indexes_without_embeddings": len(index_files) - indexes_with_embeddings,
|
|
985
|
+
"model_info": model_info,
|
|
573
986
|
},
|
|
574
987
|
}
|
|
575
988
|
|
|
@@ -633,3 +1046,194 @@ def get_embedding_stats_summary(index_root: Path) -> Dict[str, any]:
|
|
|
633
1046
|
"indexes": index_stats,
|
|
634
1047
|
},
|
|
635
1048
|
}
|
|
1049
|
+
|
|
1050
|
+
|
|
1051
|
+
def scan_for_model_conflicts(
    index_root: Path,
    target_backend: str,
    target_model: str,
) -> Dict[str, any]:
    """Scan for model conflicts across all indexes in a directory.

    Checks if any existing embeddings were generated with a different
    backend or model than the target configuration.

    Args:
        index_root: Root index directory to scan
        target_backend: Target embedding backend (fastembed or litellm)
        target_model: Target model profile/name

    Returns:
        Dictionary with:
        - has_conflict: True if any index has different model config
        - existing_config: Config from first index with embeddings (if any)
        - target_config: The requested configuration
        - conflicts: List of conflicting index paths with their configs
        - indexes_with_embeddings: Count of indexes that have embeddings
    """
    target_config = {"backend": target_backend, "model": target_model}

    index_files = discover_all_index_dbs(index_root)

    if not index_files:
        return {
            "has_conflict": False,
            "existing_config": None,
            "target_config": target_config,
            "conflicts": [],
            "indexes_with_embeddings": 0,
        }

    # Hoisted out of the per-index loop: the import is loop-invariant.
    # Kept lazy (function scope) — presumably to avoid import cycles at
    # module load time — and guarded so an unavailable VectorStore degrades
    # to "no conflicts found", matching the original per-iteration behavior.
    try:
        from codexlens.semantic.vector_store import VectorStore
    except Exception as e:
        logger.debug(f"VectorStore unavailable, skipping model conflict scan: {e}")
        return {
            "has_conflict": False,
            "existing_config": None,
            "target_config": target_config,
            "conflicts": [],
            "indexes_with_embeddings": 0,
        }

    conflicts = []
    existing_config = None
    indexes_with_embeddings = 0

    for index_path in index_files:
        try:
            with VectorStore(index_path) as vs:
                config = vs.get_model_config()
                # Only indexes that recorded a model_profile count as having
                # embeddings for conflict purposes.
                if config and config.get("model_profile"):
                    indexes_with_embeddings += 1

                    # Store first existing config as reference
                    if existing_config is None:
                        existing_config = {
                            "backend": config.get("backend"),
                            "model": config.get("model_profile"),
                            "model_name": config.get("model_name"),
                            "embedding_dim": config.get("embedding_dim"),
                        }

                    # Check for conflict: different backend OR different model
                    existing_backend = config.get("backend", "")
                    existing_model = config.get("model_profile", "")

                    if existing_backend != target_backend or existing_model != target_model:
                        conflicts.append({
                            "path": str(index_path),
                            "existing": {
                                "backend": existing_backend,
                                "model": existing_model,
                                "model_name": config.get("model_name"),
                            },
                        })
        except Exception as e:
            # Best-effort scan: an unreadable index must not abort the whole pass.
            logger.debug(f"Failed to check model config for {index_path}: {e}")
            continue

    return {
        "has_conflict": len(conflicts) > 0,
        "existing_config": existing_config,
        "target_config": target_config,
        "conflicts": conflicts,
        "indexes_with_embeddings": indexes_with_embeddings,
    }
|
|
1131
|
+
|
|
1132
|
+
|
|
1133
|
+
def _get_global_settings_path() -> Path:
|
|
1134
|
+
"""Get the path to global embedding settings file."""
|
|
1135
|
+
return Path.home() / ".codexlens" / "embedding_lock.json"
|
|
1136
|
+
|
|
1137
|
+
|
|
1138
|
+
def get_locked_model_config() -> Optional[Dict[str, Any]]:
    """Read the globally locked embedding model configuration.

    Returns:
        Dictionary with ``backend``, ``model`` and ``locked_at`` if a lock
        is recorded; ``None`` when the lock file is absent, unreadable,
        malformed, or not marked as locked.
    """
    lock_file = _get_global_settings_path()
    if not lock_file.exists():
        return None

    try:
        data = json.loads(lock_file.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError):
        # Corrupt or unreadable lock file is treated as "no lock".
        return None

    if not data.get("locked"):
        return None

    return {
        "backend": data.get("backend"),
        "model": data.get("model"),
        "locked_at": data.get("locked_at"),
    }
|
|
1161
|
+
|
|
1162
|
+
|
|
1163
|
+
def set_locked_model_config(backend: str, model: str) -> None:
    """Persist the globally locked embedding model configuration.

    Called after the first successful embedding generation so that all
    future operations keep using the same backend/model combination.

    Args:
        backend: Embedding backend (fastembed or litellm)
        model: Model profile/name
    """
    import datetime

    lock_file = _get_global_settings_path()
    lock_file.parent.mkdir(parents=True, exist_ok=True)

    payload = {
        "locked": True,
        "backend": backend,
        "model": model,
        "locked_at": datetime.datetime.now().isoformat(),
    }
    lock_file.write_text(json.dumps(payload, indent=2), encoding="utf-8")
|
|
1187
|
+
|
|
1188
|
+
|
|
1189
|
+
def clear_locked_model_config() -> bool:
    """Clear the globally locked embedding model configuration.

    Returns:
        True if a lock file existed and was removed, False if no lock existed.
    """
    settings_path = _get_global_settings_path()
    # EAFP: unlink directly and treat a missing file as "no lock" instead of
    # the racy exists()-then-unlink() pattern, which raises FileNotFoundError
    # if another process removes the file between the two calls.
    try:
        settings_path.unlink()
        return True
    except FileNotFoundError:
        return False
|
|
1200
|
+
|
|
1201
|
+
|
|
1202
|
+
def check_global_model_lock(
    target_backend: str,
    target_model: str,
) -> Dict[str, Any]:
    """Compare the requested embedding model against the global lock.

    Args:
        target_backend: Requested embedding backend
        target_model: Requested model profile/name

    Returns:
        Dictionary with:
        - is_locked: True if a global lock exists
        - has_conflict: True if target differs from locked config
        - locked_config: The locked configuration (if any)
        - target_config: The requested configuration
    """
    requested = {"backend": target_backend, "model": target_model}
    locked = get_locked_model_config()

    if locked is None:
        # No lock recorded: the requested configuration is always acceptable.
        return {
            "is_locked": False,
            "has_conflict": False,
            "locked_config": None,
            "target_config": requested,
        }

    mismatch = (locked["backend"], locked["model"]) != (target_backend, target_model)

    return {
        "is_locked": True,
        "has_conflict": mismatch,
        "locked_config": locked,
        "target_config": requested,
    }
|