claude-code-workflow 6.3.13 → 6.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/agents/issue-plan-agent.md +57 -103
- package/.claude/agents/issue-queue-agent.md +69 -120
- package/.claude/commands/issue/new.md +217 -473
- package/.claude/commands/issue/plan.md +76 -154
- package/.claude/commands/issue/queue.md +208 -259
- package/.claude/skills/issue-manage/SKILL.md +63 -22
- package/.claude/workflows/cli-templates/schemas/discovery-finding-schema.json +3 -3
- package/.claude/workflows/cli-templates/schemas/issues-jsonl-schema.json +3 -3
- package/.claude/workflows/cli-templates/schemas/queue-schema.json +0 -5
- package/.codex/prompts/issue-plan.md +16 -19
- package/.codex/prompts/issue-queue.md +0 -1
- package/README.md +1 -0
- package/ccw/dist/cli.d.ts.map +1 -1
- package/ccw/dist/cli.js +3 -1
- package/ccw/dist/cli.js.map +1 -1
- package/ccw/dist/commands/cli.d.ts.map +1 -1
- package/ccw/dist/commands/cli.js +45 -3
- package/ccw/dist/commands/cli.js.map +1 -1
- package/ccw/dist/commands/issue.d.ts +3 -1
- package/ccw/dist/commands/issue.d.ts.map +1 -1
- package/ccw/dist/commands/issue.js +383 -30
- package/ccw/dist/commands/issue.js.map +1 -1
- package/ccw/dist/core/routes/issue-routes.d.ts.map +1 -1
- package/ccw/dist/core/routes/issue-routes.js +77 -16
- package/ccw/dist/core/routes/issue-routes.js.map +1 -1
- package/ccw/dist/tools/cli-executor.d.ts.map +1 -1
- package/ccw/dist/tools/cli-executor.js +117 -4
- package/ccw/dist/tools/cli-executor.js.map +1 -1
- package/ccw/dist/tools/litellm-executor.d.ts +4 -0
- package/ccw/dist/tools/litellm-executor.d.ts.map +1 -1
- package/ccw/dist/tools/litellm-executor.js +54 -1
- package/ccw/dist/tools/litellm-executor.js.map +1 -1
- package/ccw/dist/tools/ui-generate-preview.d.ts +18 -0
- package/ccw/dist/tools/ui-generate-preview.d.ts.map +1 -1
- package/ccw/dist/tools/ui-generate-preview.js +26 -10
- package/ccw/dist/tools/ui-generate-preview.js.map +1 -1
- package/ccw/src/cli.ts +3 -1
- package/ccw/src/commands/cli.ts +47 -3
- package/ccw/src/commands/issue.ts +442 -34
- package/ccw/src/core/routes/issue-routes.ts +82 -16
- package/ccw/src/tools/cli-executor.ts +125 -4
- package/ccw/src/tools/litellm-executor.ts +107 -24
- package/ccw/src/tools/ui-generate-preview.js +60 -37
- package/codex-lens/src/codexlens/__pycache__/config.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/__pycache__/entities.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/config.py +25 -2
- package/codex-lens/src/codexlens/entities.py +5 -1
- package/codex-lens/src/codexlens/indexing/__pycache__/symbol_extractor.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/indexing/symbol_extractor.py +243 -243
- package/codex-lens/src/codexlens/parsers/__pycache__/factory.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/parsers/__pycache__/treesitter_parser.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/parsers/factory.py +256 -256
- package/codex-lens/src/codexlens/parsers/treesitter_parser.py +335 -335
- package/codex-lens/src/codexlens/search/__pycache__/chain_search.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/search/__pycache__/hybrid_search.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/search/__pycache__/ranking.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/search/chain_search.py +30 -1
- package/codex-lens/src/codexlens/semantic/__pycache__/__init__.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/__pycache__/embedder.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/__pycache__/reranker.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/__pycache__/vector_store.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/embedder.py +6 -9
- package/codex-lens/src/codexlens/semantic/vector_store.py +271 -200
- package/codex-lens/src/codexlens/storage/__pycache__/dir_index.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/storage/__pycache__/index_tree.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/storage/__pycache__/sqlite_store.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/storage/sqlite_store.py +184 -108
- package/package.json +6 -1
- package/.claude/commands/issue/manage.md +0 -113
|
@@ -9,12 +9,13 @@ Optimized for high-performance similarity search using:
|
|
|
9
9
|
|
|
10
10
|
from __future__ import annotations
|
|
11
11
|
|
|
12
|
-
import json
|
|
13
|
-
import logging
|
|
14
|
-
import
|
|
15
|
-
import
|
|
16
|
-
|
|
17
|
-
from
|
|
12
|
+
import json
|
|
13
|
+
import logging
|
|
14
|
+
import sys
|
|
15
|
+
import sqlite3
|
|
16
|
+
import threading
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
18
19
|
|
|
19
20
|
from codexlens.entities import SearchResult, SemanticChunk
|
|
20
21
|
from codexlens.errors import StorageError
|
|
@@ -39,6 +40,34 @@ logger = logging.getLogger(__name__)
|
|
|
39
40
|
# Epsilon used to guard against floating point precision edge cases (e.g., near-zero norms).
|
|
40
41
|
EPSILON = 1e-10
|
|
41
42
|
|
|
43
|
+
# SQLite INTEGER PRIMARY KEY uses signed 64-bit rowids.
|
|
44
|
+
SQLITE_INTEGER_MAX = (1 << 63) - 1
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _validate_chunk_id_range(start_id: int, count: int) -> None:
|
|
48
|
+
"""Validate that a batch insert can safely generate sequential chunk IDs."""
|
|
49
|
+
if count <= 0:
|
|
50
|
+
return
|
|
51
|
+
|
|
52
|
+
last_id = start_id + count - 1
|
|
53
|
+
if last_id > sys.maxsize or last_id > SQLITE_INTEGER_MAX:
|
|
54
|
+
raise ValueError(
|
|
55
|
+
"Chunk ID range overflow: "
|
|
56
|
+
f"start_id={start_id}, count={count} would allocate up to {last_id}, "
|
|
57
|
+
f"exceeding limits (sys.maxsize={sys.maxsize}, sqlite_max={SQLITE_INTEGER_MAX}). "
|
|
58
|
+
"Consider cleaning up the index database or creating a new index database."
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _validate_sql_placeholders(placeholders: str, expected_count: int) -> None:
|
|
63
|
+
"""Validate the placeholder string used for a parameterized SQL IN clause."""
|
|
64
|
+
expected = ",".join("?" * expected_count)
|
|
65
|
+
if placeholders != expected:
|
|
66
|
+
raise ValueError(
|
|
67
|
+
"Invalid SQL placeholders for IN clause. "
|
|
68
|
+
f"Expected {expected_count} '?' placeholders."
|
|
69
|
+
)
|
|
70
|
+
|
|
42
71
|
|
|
43
72
|
def _cosine_similarity(a: List[float], b: List[float]) -> float:
|
|
44
73
|
"""Compute cosine similarity between two vectors."""
|
|
@@ -443,11 +472,11 @@ class VectorStore:
|
|
|
443
472
|
self._invalidate_cache()
|
|
444
473
|
return ids
|
|
445
474
|
|
|
446
|
-
def add_chunks_batch(
|
|
447
|
-
self,
|
|
448
|
-
chunks_with_paths: List[Tuple[SemanticChunk, str]],
|
|
449
|
-
update_ann: bool = True,
|
|
450
|
-
auto_save_ann: bool = True,
|
|
475
|
+
def add_chunks_batch(
|
|
476
|
+
self,
|
|
477
|
+
chunks_with_paths: List[Tuple[SemanticChunk, str]],
|
|
478
|
+
update_ann: bool = True,
|
|
479
|
+
auto_save_ann: bool = True,
|
|
451
480
|
) -> List[int]:
|
|
452
481
|
"""Batch insert chunks from multiple files in a single transaction.
|
|
453
482
|
|
|
@@ -459,16 +488,18 @@ class VectorStore:
|
|
|
459
488
|
auto_save_ann: If True, save ANN index after update (default: True).
|
|
460
489
|
Set to False for bulk inserts to reduce I/O overhead.
|
|
461
490
|
|
|
462
|
-
Returns:
|
|
463
|
-
List of inserted chunk IDs
|
|
464
|
-
"""
|
|
465
|
-
if not chunks_with_paths:
|
|
466
|
-
return []
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
491
|
+
Returns:
|
|
492
|
+
List of inserted chunk IDs
|
|
493
|
+
"""
|
|
494
|
+
if not chunks_with_paths:
|
|
495
|
+
return []
|
|
496
|
+
|
|
497
|
+
batch_size = len(chunks_with_paths)
|
|
498
|
+
|
|
499
|
+
# Prepare batch data
|
|
500
|
+
batch_data = []
|
|
501
|
+
embeddings_list = []
|
|
502
|
+
for chunk, file_path in chunks_with_paths:
|
|
472
503
|
if chunk.embedding is None:
|
|
473
504
|
raise ValueError("All chunks must have embeddings")
|
|
474
505
|
# Optimize: avoid repeated np.array() if already numpy
|
|
@@ -481,49 +512,51 @@ class VectorStore:
|
|
|
481
512
|
batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))
|
|
482
513
|
embeddings_list.append(embedding_arr)
|
|
483
514
|
|
|
484
|
-
# Batch insert to SQLite in single transaction
|
|
485
|
-
with sqlite3.connect(self.db_path) as conn:
|
|
486
|
-
# Get starting ID before insert
|
|
487
|
-
row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone()
|
|
488
|
-
start_id = (row[0] or 0) + 1
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
515
|
+
# Batch insert to SQLite in single transaction
|
|
516
|
+
with sqlite3.connect(self.db_path) as conn:
|
|
517
|
+
# Get starting ID before insert
|
|
518
|
+
row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone()
|
|
519
|
+
start_id = (row[0] or 0) + 1
|
|
520
|
+
|
|
521
|
+
_validate_chunk_id_range(start_id, batch_size)
|
|
522
|
+
|
|
523
|
+
conn.executemany(
|
|
524
|
+
"""
|
|
525
|
+
INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
|
|
526
|
+
VALUES (?, ?, ?, ?)
|
|
494
527
|
""",
|
|
495
528
|
batch_data
|
|
496
|
-
)
|
|
497
|
-
conn.commit()
|
|
498
|
-
# Calculate inserted IDs based on starting ID
|
|
499
|
-
ids = list(range(start_id, start_id +
|
|
500
|
-
|
|
501
|
-
# Handle ANN index updates
|
|
502
|
-
if embeddings_list and update_ann and self._ensure_ann_index(len(embeddings_list[0])):
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
self.
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
try:
|
|
511
|
-
embeddings_matrix = np.vstack(embeddings_list)
|
|
512
|
-
self._ann_index.add_vectors(ids, embeddings_matrix)
|
|
513
|
-
if auto_save_ann:
|
|
514
|
-
self._ann_index.save()
|
|
515
|
-
except Exception as e:
|
|
516
|
-
logger.warning("Failed to add batch to ANN index: %s", e)
|
|
529
|
+
)
|
|
530
|
+
conn.commit()
|
|
531
|
+
# Calculate inserted IDs based on starting ID
|
|
532
|
+
ids = list(range(start_id, start_id + batch_size))
|
|
533
|
+
|
|
534
|
+
# Handle ANN index updates
|
|
535
|
+
if embeddings_list and update_ann and self._ensure_ann_index(len(embeddings_list[0])):
|
|
536
|
+
with self._ann_write_lock:
|
|
537
|
+
# In bulk insert mode, accumulate for later batch update
|
|
538
|
+
if self._bulk_insert_mode:
|
|
539
|
+
self._bulk_insert_ids.extend(ids)
|
|
540
|
+
self._bulk_insert_embeddings.extend(embeddings_list)
|
|
541
|
+
else:
|
|
542
|
+
# Normal mode: update immediately
|
|
543
|
+
try:
|
|
544
|
+
embeddings_matrix = np.vstack(embeddings_list)
|
|
545
|
+
self._ann_index.add_vectors(ids, embeddings_matrix)
|
|
546
|
+
if auto_save_ann:
|
|
547
|
+
self._ann_index.save()
|
|
548
|
+
except Exception as e:
|
|
549
|
+
logger.warning("Failed to add batch to ANN index: %s", e)
|
|
517
550
|
|
|
518
551
|
# Invalidate cache after modification
|
|
519
552
|
self._invalidate_cache()
|
|
520
553
|
return ids
|
|
521
554
|
|
|
522
|
-
def add_chunks_batch_numpy(
|
|
523
|
-
self,
|
|
524
|
-
chunks_with_paths: List[Tuple[SemanticChunk, str]],
|
|
525
|
-
embeddings_matrix: np.ndarray,
|
|
526
|
-
update_ann: bool = True,
|
|
555
|
+
def add_chunks_batch_numpy(
|
|
556
|
+
self,
|
|
557
|
+
chunks_with_paths: List[Tuple[SemanticChunk, str]],
|
|
558
|
+
embeddings_matrix: np.ndarray,
|
|
559
|
+
update_ann: bool = True,
|
|
527
560
|
auto_save_ann: bool = True,
|
|
528
561
|
) -> List[int]:
|
|
529
562
|
"""Batch insert chunks with pre-computed numpy embeddings matrix.
|
|
@@ -537,16 +570,18 @@ class VectorStore:
|
|
|
537
570
|
update_ann: If True, update ANN index with new vectors (default: True)
|
|
538
571
|
auto_save_ann: If True, save ANN index after update (default: True)
|
|
539
572
|
|
|
540
|
-
Returns:
|
|
541
|
-
List of inserted chunk IDs
|
|
542
|
-
"""
|
|
543
|
-
if not chunks_with_paths:
|
|
544
|
-
return []
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
573
|
+
Returns:
|
|
574
|
+
List of inserted chunk IDs
|
|
575
|
+
"""
|
|
576
|
+
if not chunks_with_paths:
|
|
577
|
+
return []
|
|
578
|
+
|
|
579
|
+
batch_size = len(chunks_with_paths)
|
|
580
|
+
|
|
581
|
+
if len(chunks_with_paths) != embeddings_matrix.shape[0]:
|
|
582
|
+
raise ValueError(
|
|
583
|
+
f"Mismatch: {len(chunks_with_paths)} chunks but "
|
|
584
|
+
f"{embeddings_matrix.shape[0]} embeddings"
|
|
550
585
|
)
|
|
551
586
|
|
|
552
587
|
# Ensure float32 format
|
|
@@ -560,45 +595,47 @@ class VectorStore:
|
|
|
560
595
|
metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
|
|
561
596
|
batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))
|
|
562
597
|
|
|
563
|
-
# Batch insert to SQLite in single transaction
|
|
564
|
-
with sqlite3.connect(self.db_path) as conn:
|
|
565
|
-
# Get starting ID before insert
|
|
566
|
-
row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone()
|
|
567
|
-
start_id = (row[0] or 0) + 1
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
598
|
+
# Batch insert to SQLite in single transaction
|
|
599
|
+
with sqlite3.connect(self.db_path) as conn:
|
|
600
|
+
# Get starting ID before insert
|
|
601
|
+
row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone()
|
|
602
|
+
start_id = (row[0] or 0) + 1
|
|
603
|
+
|
|
604
|
+
_validate_chunk_id_range(start_id, batch_size)
|
|
605
|
+
|
|
606
|
+
conn.executemany(
|
|
607
|
+
"""
|
|
608
|
+
INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
|
|
609
|
+
VALUES (?, ?, ?, ?)
|
|
573
610
|
""",
|
|
574
611
|
batch_data
|
|
575
|
-
)
|
|
576
|
-
conn.commit()
|
|
577
|
-
# Calculate inserted IDs based on starting ID
|
|
578
|
-
ids = list(range(start_id, start_id +
|
|
579
|
-
|
|
580
|
-
# Handle ANN index updates
|
|
581
|
-
if update_ann and self._ensure_ann_index(embeddings_matrix.shape[1]):
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
self.
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
try:
|
|
591
|
-
self._ann_index.add_vectors(ids, embeddings_matrix)
|
|
592
|
-
if auto_save_ann:
|
|
593
|
-
self._ann_index.save()
|
|
594
|
-
except Exception as e:
|
|
595
|
-
logger.warning("Failed to add batch to ANN index: %s", e)
|
|
612
|
+
)
|
|
613
|
+
conn.commit()
|
|
614
|
+
# Calculate inserted IDs based on starting ID
|
|
615
|
+
ids = list(range(start_id, start_id + batch_size))
|
|
616
|
+
|
|
617
|
+
# Handle ANN index updates
|
|
618
|
+
if update_ann and self._ensure_ann_index(embeddings_matrix.shape[1]):
|
|
619
|
+
with self._ann_write_lock:
|
|
620
|
+
# In bulk insert mode, accumulate for later batch update
|
|
621
|
+
if self._bulk_insert_mode:
|
|
622
|
+
self._bulk_insert_ids.extend(ids)
|
|
623
|
+
# Split matrix into individual arrays for accumulation
|
|
624
|
+
self._bulk_insert_embeddings.extend([embeddings_matrix[i] for i in range(len(ids))])
|
|
625
|
+
else:
|
|
626
|
+
# Normal mode: update immediately
|
|
627
|
+
try:
|
|
628
|
+
self._ann_index.add_vectors(ids, embeddings_matrix)
|
|
629
|
+
if auto_save_ann:
|
|
630
|
+
self._ann_index.save()
|
|
631
|
+
except Exception as e:
|
|
632
|
+
logger.warning("Failed to add batch to ANN index: %s", e)
|
|
596
633
|
|
|
597
634
|
# Invalidate cache after modification
|
|
598
635
|
self._invalidate_cache()
|
|
599
636
|
return ids
|
|
600
637
|
|
|
601
|
-
def begin_bulk_insert(self) -> None:
|
|
638
|
+
def begin_bulk_insert(self) -> None:
|
|
602
639
|
"""Begin bulk insert mode - disable ANN auto-update for better performance.
|
|
603
640
|
|
|
604
641
|
Usage:
|
|
@@ -614,42 +651,45 @@ class VectorStore:
|
|
|
614
651
|
for batch in batches:
|
|
615
652
|
store.add_chunks_batch(batch)
|
|
616
653
|
"""
|
|
617
|
-
self.
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
654
|
+
with self._ann_write_lock:
|
|
655
|
+
self._bulk_insert_mode = True
|
|
656
|
+
self._bulk_insert_ids.clear()
|
|
657
|
+
self._bulk_insert_embeddings.clear()
|
|
658
|
+
logger.debug("Entered bulk insert mode")
|
|
659
|
+
|
|
660
|
+
def end_bulk_insert(self) -> None:
|
|
623
661
|
"""End bulk insert mode and rebuild ANN index from accumulated data.
|
|
624
662
|
|
|
625
663
|
This method should be called after all bulk inserts are complete to
|
|
626
664
|
update the ANN index in a single batch operation.
|
|
627
665
|
"""
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
)
|
|
646
|
-
|
|
647
|
-
logger.
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
666
|
+
with self._ann_write_lock:
|
|
667
|
+
if not self._bulk_insert_mode:
|
|
668
|
+
logger.warning("end_bulk_insert called but not in bulk insert mode")
|
|
669
|
+
return
|
|
670
|
+
|
|
671
|
+
self._bulk_insert_mode = False
|
|
672
|
+
bulk_ids = list(self._bulk_insert_ids)
|
|
673
|
+
bulk_embeddings = list(self._bulk_insert_embeddings)
|
|
674
|
+
self._bulk_insert_ids.clear()
|
|
675
|
+
self._bulk_insert_embeddings.clear()
|
|
676
|
+
|
|
677
|
+
# Update ANN index with accumulated data.
|
|
678
|
+
if bulk_ids and bulk_embeddings:
|
|
679
|
+
if self._ensure_ann_index(len(bulk_embeddings[0])):
|
|
680
|
+
with self._ann_write_lock:
|
|
681
|
+
try:
|
|
682
|
+
embeddings_matrix = np.vstack(bulk_embeddings)
|
|
683
|
+
self._ann_index.add_vectors(bulk_ids, embeddings_matrix)
|
|
684
|
+
self._ann_index.save()
|
|
685
|
+
logger.info(
|
|
686
|
+
"Bulk insert complete: added %d vectors to ANN index",
|
|
687
|
+
len(bulk_ids),
|
|
688
|
+
)
|
|
689
|
+
except Exception as e:
|
|
690
|
+
logger.error("Failed to update ANN index after bulk insert: %s", e)
|
|
691
|
+
|
|
692
|
+
logger.debug("Exited bulk insert mode")
|
|
653
693
|
|
|
654
694
|
class BulkInsertContext:
|
|
655
695
|
"""Context manager for bulk insert operations."""
|
|
@@ -712,34 +752,39 @@ class VectorStore:
|
|
|
712
752
|
self._invalidate_cache()
|
|
713
753
|
return deleted
|
|
714
754
|
|
|
715
|
-
def search_similar(
|
|
716
|
-
self,
|
|
717
|
-
query_embedding: List[float],
|
|
718
|
-
top_k: int = 10,
|
|
719
|
-
min_score: float = 0.0,
|
|
720
|
-
return_full_content: bool = True,
|
|
721
|
-
) -> List[SearchResult]:
|
|
722
|
-
"""Find chunks most similar to query embedding.
|
|
755
|
+
def search_similar(
|
|
756
|
+
self,
|
|
757
|
+
query_embedding: List[float],
|
|
758
|
+
top_k: int = 10,
|
|
759
|
+
min_score: float = 0.0,
|
|
760
|
+
return_full_content: bool = True,
|
|
761
|
+
) -> List[SearchResult]:
|
|
762
|
+
"""Find chunks most similar to query embedding.
|
|
723
763
|
|
|
724
764
|
Uses HNSW index for O(log N) search when available, falls back to
|
|
725
765
|
brute-force NumPy search otherwise.
|
|
726
766
|
|
|
727
|
-
Args:
|
|
728
|
-
query_embedding: Query vector.
|
|
729
|
-
top_k: Maximum results to return.
|
|
730
|
-
min_score: Minimum similarity score
|
|
731
|
-
return_full_content: If True, return full code block content.
|
|
732
|
-
|
|
733
|
-
Returns:
|
|
734
|
-
List of SearchResult ordered by similarity (highest first).
|
|
735
|
-
"""
|
|
736
|
-
query_vec = np.array(query_embedding, dtype=np.float32)
|
|
767
|
+
Args:
|
|
768
|
+
query_embedding: Query vector.
|
|
769
|
+
top_k: Maximum results to return.
|
|
770
|
+
min_score: Minimum cosine similarity score in [0.0, 1.0].
|
|
771
|
+
return_full_content: If True, return full code block content.
|
|
737
772
|
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
773
|
+
Returns:
|
|
774
|
+
List of SearchResult ordered by similarity (highest first).
|
|
775
|
+
"""
|
|
776
|
+
query_vec = np.array(query_embedding, dtype=np.float32)
|
|
777
|
+
|
|
778
|
+
if not 0.0 <= min_score <= 1.0:
|
|
779
|
+
raise ValueError(
|
|
780
|
+
f"Invalid min_score: {min_score}. Must be within [0.0, 1.0] for cosine similarity."
|
|
781
|
+
)
|
|
782
|
+
|
|
783
|
+
# Try HNSW search first (O(log N))
|
|
784
|
+
if (
|
|
785
|
+
HNSWLIB_AVAILABLE
|
|
786
|
+
and self._ann_index is not None
|
|
787
|
+
and self._ann_index.is_loaded
|
|
743
788
|
and self._ann_index.count() > 0
|
|
744
789
|
):
|
|
745
790
|
try:
|
|
@@ -754,20 +799,20 @@ class VectorStore:
|
|
|
754
799
|
query_vec, top_k, min_score, return_full_content
|
|
755
800
|
)
|
|
756
801
|
|
|
757
|
-
def _search_with_ann(
|
|
758
|
-
self,
|
|
759
|
-
query_vec: np.ndarray,
|
|
760
|
-
top_k: int,
|
|
761
|
-
min_score: float,
|
|
762
|
-
return_full_content: bool,
|
|
763
|
-
) -> List[SearchResult]:
|
|
764
|
-
"""Search using HNSW index (O(log N)).
|
|
765
|
-
|
|
766
|
-
Args:
|
|
767
|
-
query_vec: Query vector as numpy array
|
|
768
|
-
top_k: Maximum results to return
|
|
769
|
-
min_score: Minimum similarity score
|
|
770
|
-
return_full_content: If True, return full code block content
|
|
802
|
+
def _search_with_ann(
|
|
803
|
+
self,
|
|
804
|
+
query_vec: np.ndarray,
|
|
805
|
+
top_k: int,
|
|
806
|
+
min_score: float,
|
|
807
|
+
return_full_content: bool,
|
|
808
|
+
) -> List[SearchResult]:
|
|
809
|
+
"""Search using HNSW index (O(log N)).
|
|
810
|
+
|
|
811
|
+
Args:
|
|
812
|
+
query_vec: Query vector as numpy array
|
|
813
|
+
top_k: Maximum results to return
|
|
814
|
+
min_score: Minimum cosine similarity score in [0.0, 1.0]
|
|
815
|
+
return_full_content: If True, return full code block content
|
|
771
816
|
|
|
772
817
|
Returns:
|
|
773
818
|
List of SearchResult ordered by similarity (highest first)
|
|
@@ -779,15 +824,36 @@ class VectorStore:
|
|
|
779
824
|
if effective_top_k == 0:
|
|
780
825
|
return []
|
|
781
826
|
|
|
782
|
-
# HNSW search returns (ids, distances)
|
|
783
|
-
# For cosine space: distance = 1 - similarity
|
|
784
|
-
ids, distances = self._ann_index.search(query_vec, effective_top_k)
|
|
785
|
-
|
|
786
|
-
if
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
827
|
+
# HNSW search returns (ids, distances)
|
|
828
|
+
# For cosine space: distance = 1 - similarity
|
|
829
|
+
ids, distances = self._ann_index.search(query_vec, effective_top_k)
|
|
830
|
+
|
|
831
|
+
if ids is None or distances is None:
|
|
832
|
+
logger.debug(
|
|
833
|
+
"ANN search returned null results (ids=%s, distances=%s)",
|
|
834
|
+
ids,
|
|
835
|
+
distances,
|
|
836
|
+
)
|
|
837
|
+
return []
|
|
838
|
+
|
|
839
|
+
if len(ids) == 0 or len(distances) == 0:
|
|
840
|
+
logger.debug(
|
|
841
|
+
"ANN search returned empty results (ids=%s, distances=%s)",
|
|
842
|
+
ids,
|
|
843
|
+
distances,
|
|
844
|
+
)
|
|
845
|
+
return []
|
|
846
|
+
|
|
847
|
+
if len(ids) != len(distances):
|
|
848
|
+
logger.warning(
|
|
849
|
+
"ANN search returned mismatched result lengths (%d ids, %d distances)",
|
|
850
|
+
len(ids),
|
|
851
|
+
len(distances),
|
|
852
|
+
)
|
|
853
|
+
return []
|
|
854
|
+
|
|
855
|
+
# Convert distances to similarity scores
|
|
856
|
+
scores = [1.0 - d for d in distances]
|
|
791
857
|
|
|
792
858
|
# Filter by min_score
|
|
793
859
|
filtered = [
|
|
@@ -805,20 +871,20 @@ class VectorStore:
|
|
|
805
871
|
# Fetch content from SQLite
|
|
806
872
|
return self._fetch_results_by_ids(top_ids, top_scores, return_full_content)
|
|
807
873
|
|
|
808
|
-
def _search_brute_force(
|
|
809
|
-
self,
|
|
810
|
-
query_vec: np.ndarray,
|
|
811
|
-
top_k: int,
|
|
812
|
-
min_score: float,
|
|
813
|
-
return_full_content: bool,
|
|
814
|
-
) -> List[SearchResult]:
|
|
815
|
-
"""Brute-force search using NumPy (O(N) fallback).
|
|
816
|
-
|
|
817
|
-
Args:
|
|
818
|
-
query_vec: Query vector as numpy array
|
|
819
|
-
top_k: Maximum results to return
|
|
820
|
-
min_score: Minimum similarity score
|
|
821
|
-
return_full_content: If True, return full code block content
|
|
874
|
+
def _search_brute_force(
|
|
875
|
+
self,
|
|
876
|
+
query_vec: np.ndarray,
|
|
877
|
+
top_k: int,
|
|
878
|
+
min_score: float,
|
|
879
|
+
return_full_content: bool,
|
|
880
|
+
) -> List[SearchResult]:
|
|
881
|
+
"""Brute-force search using NumPy (O(N) fallback).
|
|
882
|
+
|
|
883
|
+
Args:
|
|
884
|
+
query_vec: Query vector as numpy array
|
|
885
|
+
top_k: Maximum results to return
|
|
886
|
+
min_score: Minimum cosine similarity score in [0.0, 1.0]
|
|
887
|
+
return_full_content: If True, return full code block content
|
|
822
888
|
|
|
823
889
|
Returns:
|
|
824
890
|
List of SearchResult ordered by similarity (highest first)
|
|
@@ -885,16 +951,21 @@ class VectorStore:
|
|
|
885
951
|
Returns:
|
|
886
952
|
List of SearchResult objects.
|
|
887
953
|
"""
|
|
888
|
-
if not chunk_ids:
|
|
889
|
-
return []
|
|
890
|
-
|
|
891
|
-
# Build parameterized query for IN clause
|
|
892
|
-
placeholders = ",".join("?" * len(chunk_ids))
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
954
|
+
if not chunk_ids:
|
|
955
|
+
return []
|
|
956
|
+
|
|
957
|
+
# Build parameterized query for IN clause
|
|
958
|
+
placeholders = ",".join("?" * len(chunk_ids))
|
|
959
|
+
_validate_sql_placeholders(placeholders, len(chunk_ids))
|
|
960
|
+
|
|
961
|
+
# SQL injection prevention:
|
|
962
|
+
# - Only a validated placeholders string (commas + '?') is interpolated into the query.
|
|
963
|
+
# - User-provided values are passed separately via sqlite3 parameters.
|
|
964
|
+
query = """
|
|
965
|
+
SELECT id, file_path, content, metadata
|
|
966
|
+
FROM semantic_chunks
|
|
967
|
+
WHERE id IN ({placeholders})
|
|
968
|
+
""".format(placeholders=placeholders)
|
|
898
969
|
|
|
899
970
|
with sqlite3.connect(self.db_path) as conn:
|
|
900
971
|
conn.execute("PRAGMA mmap_size = 30000000000")
|
|
Binary file
|
|
Binary file
|