atomicmemory 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atomicmemory/__init__.py +166 -0
- atomicmemory/_version.py +3 -0
- atomicmemory/client/__init__.py +22 -0
- atomicmemory/client/async_memory_client.py +202 -0
- atomicmemory/client/atomic_memory_client.py +181 -0
- atomicmemory/client/memory_client.py +292 -0
- atomicmemory/core/__init__.py +34 -0
- atomicmemory/core/errors.py +122 -0
- atomicmemory/core/events.py +65 -0
- atomicmemory/core/logging.py +37 -0
- atomicmemory/core/retry.py +124 -0
- atomicmemory/core/validation.py +22 -0
- atomicmemory/embeddings/__init__.py +16 -0
- atomicmemory/embeddings/base.py +39 -0
- atomicmemory/embeddings/sentence_transformers.py +104 -0
- atomicmemory/kv_cache/__init__.py +17 -0
- atomicmemory/kv_cache/adapter.py +50 -0
- atomicmemory/kv_cache/memory_storage.py +98 -0
- atomicmemory/kv_cache/sqlite_storage.py +122 -0
- atomicmemory/memory/__init__.py +82 -0
- atomicmemory/memory/filters.py +68 -0
- atomicmemory/memory/pipeline.py +42 -0
- atomicmemory/memory/provider.py +397 -0
- atomicmemory/memory/registry.py +95 -0
- atomicmemory/memory/service.py +199 -0
- atomicmemory/memory/types.py +398 -0
- atomicmemory/providers/__init__.py +5 -0
- atomicmemory/providers/atomicmemory/__init__.py +43 -0
- atomicmemory/providers/atomicmemory/agents.py +156 -0
- atomicmemory/providers/atomicmemory/async_handle_impl.py +198 -0
- atomicmemory/providers/atomicmemory/async_provider.py +245 -0
- atomicmemory/providers/atomicmemory/audit.py +74 -0
- atomicmemory/providers/atomicmemory/config.py +38 -0
- atomicmemory/providers/atomicmemory/config_handle.py +123 -0
- atomicmemory/providers/atomicmemory/handle.py +513 -0
- atomicmemory/providers/atomicmemory/handle_impl.py +325 -0
- atomicmemory/providers/atomicmemory/http.py +255 -0
- atomicmemory/providers/atomicmemory/lessons.py +133 -0
- atomicmemory/providers/atomicmemory/lifecycle.py +202 -0
- atomicmemory/providers/atomicmemory/mappers.py +125 -0
- atomicmemory/providers/atomicmemory/path.py +20 -0
- atomicmemory/providers/atomicmemory/provider.py +300 -0
- atomicmemory/providers/atomicmemory/scope_mapper.py +98 -0
- atomicmemory/providers/mem0/__init__.py +41 -0
- atomicmemory/providers/mem0/async_provider.py +191 -0
- atomicmemory/providers/mem0/config.py +51 -0
- atomicmemory/providers/mem0/http.py +195 -0
- atomicmemory/providers/mem0/mappers.py +145 -0
- atomicmemory/providers/mem0/provider.py +202 -0
- atomicmemory/py.typed +0 -0
- atomicmemory/search/__init__.py +47 -0
- atomicmemory/search/chunking.py +161 -0
- atomicmemory/search/ranking.py +94 -0
- atomicmemory/search/semantic_search.py +130 -0
- atomicmemory/search/similarity.py +110 -0
- atomicmemory/storage/__init__.py +63 -0
- atomicmemory/storage/_mapping.py +305 -0
- atomicmemory/storage/async_client.py +208 -0
- atomicmemory/storage/client.py +339 -0
- atomicmemory/storage/errors.py +115 -0
- atomicmemory/storage/types.py +305 -0
- atomicmemory/utils/__init__.py +5 -0
- atomicmemory/utils/environment.py +23 -0
- atomicmemory-1.0.0.dist-info/METADATA +146 -0
- atomicmemory-1.0.0.dist-info/RECORD +67 -0
- atomicmemory-1.0.0.dist-info/WHEEL +4 -0
- atomicmemory-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""Text chunking helpers.
|
|
2
|
+
|
|
3
|
+
Port of `atomicmemory-sdk/src/utils/chunking.ts`. Three strategies:
|
|
4
|
+
|
|
5
|
+
- :func:`chunk_text_with_metadata` — character-window with optional
|
|
6
|
+
word-boundary preservation and overlap.
|
|
7
|
+
- :func:`chunk_by_sentences` — split on ``.!?`` then re-group with
|
|
8
|
+
overlap.
|
|
9
|
+
- :func:`chunk_by_paragraphs` — split on blank lines then re-group with
|
|
10
|
+
overlap.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True)
class ChunkOptions:
    """Configuration for :func:`chunk_text_with_metadata`.

    Instances are immutable (``frozen=True``). The numeric constraints noted
    below are enforced by ``_validate`` when chunking runs, not when the
    object is constructed.
    """

    # Maximum window length in characters; must be positive.
    chunk_size: int
    # Characters re-read at the start of the next window; must satisfy
    # 0 <= chunk_overlap < chunk_size.
    chunk_overlap: int
    # When true, a window that is not the last one is cut back to the final
    # occurrence of ``separator`` inside it, provided that position lies
    # beyond the effective minimum chunk size.
    preserve_words: bool = True
    # Boundary string searched for when ``preserve_words`` is enabled.
    separator: str = " "
    # Smallest trimmed chunk that is kept. ``None`` selects
    # ``max(1, chunk_size // 10)``; the effective value is clamped to
    # the range [1, chunk_size].
    min_chunk_size: int | None = None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True)
class ChunkMetadata:
    """Immutable per-chunk statistics attached to a :class:`ChunkResult`."""

    # Number of whitespace-separated tokens in the chunk text.
    word_count: int
    # Character count recorded by the chunker for this chunk.
    char_count: int
    # Whether the chunker marked this chunk as overlapping its predecessor.
    has_overlap: bool
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass(frozen=True)
class ChunkResult:
    """One immutable chunk produced by :func:`chunk_text_with_metadata`."""

    # Chunk content with surrounding whitespace stripped.
    text: str
    # Zero-based position of the chunk in the returned list.
    index: int
    # Offsets into the original input of the window the chunk was cut from,
    # measured before stripping (start inclusive, end exclusive).
    start_offset: int
    end_offset: int
    # Word/character statistics for this chunk.
    metadata: ChunkMetadata
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _word_count(text: str) -> int:
    """Count the whitespace-delimited tokens in ``text``.

    Empty fragments produced by leading or trailing whitespace are not
    counted, so an empty or all-whitespace string yields ``0``.
    """
    return sum(1 for token in re.split(r"\s+", text) if token)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _validate(options: ChunkOptions) -> int:
    """Check ``options`` and return the effective minimum chunk size.

    Args:
        options: Chunking configuration to check.

    Returns:
        ``options.min_chunk_size`` (or ``max(1, chunk_size // 10)`` when it
        is ``None``), clamped to the range ``[1, chunk_size]``.

    Raises:
        ValueError: When ``chunk_size`` is not positive, ``chunk_overlap`` is
            negative, or ``chunk_overlap`` is not smaller than ``chunk_size``.
    """
    size = options.chunk_size
    if size <= 0:
        raise ValueError("chunk_size must be positive")

    overlap = options.chunk_overlap
    if overlap < 0:
        raise ValueError("chunk_overlap must be >= 0")
    if overlap >= size:
        raise ValueError("chunk_overlap must be less than chunk_size")

    requested = options.min_chunk_size
    if requested is None:
        # Default: a tenth of the window, but never below one character.
        requested = max(1, size // 10)
    return max(1, min(requested, size))
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def chunk_text(text: str, options: ChunkOptions) -> list[str]:
    """Chunk ``text`` and return only the chunk strings.

    Delegates to :func:`chunk_text_with_metadata` and discards the offsets
    and metadata of each result.
    """
    results = chunk_text_with_metadata(text, options)
    return [result.text for result in results]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def chunk_text_with_metadata(text: str, options: ChunkOptions) -> list[ChunkResult]:
    """Sliding-window chunker with optional word-boundary preservation.

    Args:
        text: Input to split. Non-string or empty input yields ``[]`` before
            ``options`` is validated.
        options: Window size, overlap and boundary settings.

    Returns:
        A single chunk when the whole input fits in one window
        (``len(text) <= options.chunk_size``), otherwise the windows produced
        by ``_chunk_window``.

    Raises:
        ValueError: When ``options`` fails the checks in ``_validate``.
    """
    if not isinstance(text, str) or text == "":
        return []
    min_chunk_size = _validate(options)
    if len(text) > options.chunk_size:
        return _chunk_window(text, options, min_chunk_size)

    # Single-window path. The minimum-size filter is not applied here, so a
    # short input is always returned as exactly one chunk.
    trimmed = text.strip()
    return [
        ChunkResult(
            text=trimmed,
            index=0,
            # Offsets describe the untrimmed window, as in the windowed path.
            start_offset=0,
            end_offset=len(text),
            metadata=ChunkMetadata(
                word_count=_word_count(trimmed),
                # Count the characters of the stored (trimmed) text so the
                # value agrees with ``text`` and with the windowed path.
                char_count=len(trimmed),
                has_overlap=False,
            ),
        )
    ]
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _chunk_window(text: str, options: ChunkOptions, min_chunk_size: int) -> list[ChunkResult]:
    """Split ``text`` into overlapping character windows.

    Args:
        text: Input string to split.
        options: Already-validated chunking options.
        min_chunk_size: Effective minimum chunk size returned by ``_validate``.

    Returns:
        Chunks in input order. A window whose stripped text is shorter than
        ``min_chunk_size`` is not emitted. ``has_overlap`` is true only when
        the chunk's window starts before the end of the previously emitted
        chunk's window.
    """
    chunks: list[ChunkResult] = []
    text_length = len(text)
    current_offset = 0
    # End offset of the most recently emitted chunk; None until one exists.
    previous_end: int | None = None
    while current_offset < text_length:
        end_offset = min(current_offset + options.chunk_size, text_length)
        chunk_text_slice = text[current_offset:end_offset]
        if options.preserve_words and end_offset < text_length:
            # Pull the cut back to the last separator, unless that would
            # leave a window no longer than the minimum chunk size.
            last_sep = chunk_text_slice.rfind(options.separator)
            if last_sep > min_chunk_size:
                end_offset = current_offset + last_sep
                chunk_text_slice = text[current_offset:end_offset]
        trimmed = chunk_text_slice.strip()
        if len(trimmed) >= min_chunk_size:
            chunks.append(
                ChunkResult(
                    text=trimmed,
                    index=len(chunks),
                    start_offset=current_offset,
                    end_offset=end_offset,
                    metadata=ChunkMetadata(
                        word_count=_word_count(trimmed),
                        char_count=len(trimmed),
                        # Report overlap from the real offsets: with
                        # chunk_overlap == 0, or when the overlap step was
                        # skipped below, consecutive windows share nothing.
                        has_overlap=previous_end is not None and current_offset < previous_end,
                    ),
                )
            )
            previous_end = end_offset
        next_offset = end_offset - options.chunk_overlap
        # Never move backwards or stand still: if stepping back by the
        # overlap would not advance, continue from the end of this window.
        current_offset = end_offset if next_offset <= current_offset else next_offset
        if end_offset >= text_length:
            break
    return chunks
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def chunk_by_sentences(text: str, max_sentences: int, overlap_sentences: int = 1) -> list[str]:
    """Group sentences with overlap; sentence boundary = ``[.!?]+``.

    Args:
        text: Input to split. Non-string or empty input yields ``[]``.
        max_sentences: Maximum number of sentences per chunk.
        overlap_sentences: Sentences repeated at the start of the next chunk.
            Clamped to ``[0, max_sentences - 1]`` so every chunk advances by
            at least one sentence and no sentence is skipped or dropped.

    Returns:
        ``[text]`` unchanged when it holds at most ``max_sentences``
        sentences. Otherwise the rebuilt chunks, where the sentences of a
        chunk are joined with ``". "`` and terminated with ``"."`` (the
        original terminators are not preserved). A non-positive
        ``max_sentences`` with at least one sentence yields ``[]``.
    """
    if not isinstance(text, str) or text == "":
        return []
    sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
    if len(sentences) <= max_sentences:
        return [text]
    if max_sentences <= 0:
        # No sentence fits in a chunk of non-positive size.
        return []
    # An overlap >= max_sentences would stop after the first chunk and lose
    # the rest of the input; a negative overlap would skip sentences.
    overlap = min(max(overlap_sentences, 0), max_sentences - 1)
    step = max_sentences - overlap
    total = len(sentences)
    chunks: list[str] = []
    for start in range(0, total, step):
        end = min(start + max_sentences, total)
        chunks.append(". ".join(sentences[start:end]) + ".")
        if end >= total:
            # The final sentence has been emitted; a further window would
            # only repeat overlap material.
            break
    return chunks
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def chunk_by_paragraphs(text: str, max_paragraphs: int, overlap_paragraphs: int = 0) -> list[str]:
    """Group paragraphs with overlap; paragraph boundary = blank lines.

    Args:
        text: Input to split. Non-string or empty input yields ``[]``.
        max_paragraphs: Maximum number of paragraphs per chunk.
        overlap_paragraphs: Paragraphs repeated at the start of the next
            chunk. Clamped to ``[0, max_paragraphs - 1]`` so every chunk
            advances by at least one paragraph and none is skipped or dropped.

    Returns:
        ``[text]`` unchanged when it holds at most ``max_paragraphs``
        paragraphs. Otherwise the chunks, each made of stripped paragraphs
        joined with a blank line. A non-positive ``max_paragraphs`` with at
        least one paragraph yields ``[]``.
    """
    if not isinstance(text, str) or text == "":
        return []
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
    if len(paragraphs) <= max_paragraphs:
        return [text]
    if max_paragraphs <= 0:
        # No paragraph fits in a chunk of non-positive size.
        return []
    # An overlap >= max_paragraphs would stop after the first chunk and lose
    # the rest of the input; a negative overlap would skip paragraphs.
    overlap = min(max(overlap_paragraphs, 0), max_paragraphs - 1)
    step = max_paragraphs - overlap
    total = len(paragraphs)
    chunks: list[str] = []
    for start in range(0, total, step):
        end = min(start + max_paragraphs, total)
        chunks.append("\n\n".join(paragraphs[start:end]))
        if end >= total:
            # The final paragraph has been emitted; stop before producing a
            # window that only repeats overlap material.
            break
    return chunks
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Heuristic reranking for local semantic search results.
|
|
2
|
+
|
|
3
|
+
Port of the rerank heuristics in
|
|
4
|
+
`atomicmemory-sdk/src/search/semantic-search.ts:rerankResults`.
|
|
5
|
+
Three signals: short-content boost, long-content penalty, recency
|
|
6
|
+
boost. Pure functions; no I/O.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from datetime import datetime, timezone
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
class RankingConfig:
    """Tunable reranking knobs.

    Instances are immutable (``frozen=True``). The length and recency
    multipliers are combined by multiplication in ``rerank``.
    """

    # Content strictly shorter than this many characters is multiplied by
    # ``short_boost``.
    short_threshold: int = 500
    short_boost: float = 1.1
    # Content strictly longer than this many characters is multiplied by
    # ``long_penalty``.
    long_threshold: int = 2000
    long_penalty: float = 0.9
    # Hits whose timestamp is less than this many days before the reference
    # time are multiplied by ``recency_boost``.
    recency_window_days: int = 7
    recency_boost: float = 1.05
    # Upper bound applied to every adjusted score.
    score_cap: float = 1.0
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class RankableHit:
    """Minimal shape rerank operates on; field names match the TS port.

    ``original_index`` lets callers map the reranked output back to their
    own list without relying on content/score equality (rerank mutates
    score, and content can collide between hits).
    """

    # Text whose character length drives the short/long adjustment.
    content: str
    # Relevance score before (input) or after (output) reranking.
    score: float
    # Subtracted from ``now.timestamp()`` during reranking, i.e. treated as
    # seconds since the Unix epoch; ``None`` disables the recency adjustment.
    timestamp_seconds: float | None = None
    # Caller-assigned position, copied unchanged onto the reranked hit.
    original_index: int | None = None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _length_factor(content: str, config: RankingConfig) -> float:
    """Return the score multiplier derived from the length of ``content``.

    Content shorter than ``config.short_threshold`` gets
    ``config.short_boost``; content longer than ``config.long_threshold``
    gets ``config.long_penalty``; anything in between is left at ``1.0``.
    The short check takes precedence when both conditions hold.
    """
    size = len(content)
    multiplier = 1.0
    if size < config.short_threshold:
        multiplier = config.short_boost
    elif size > config.long_threshold:
        multiplier = config.long_penalty
    return multiplier
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _recency_factor(timestamp_seconds: float | None, *, now: datetime, config: RankingConfig) -> float:
    """Return the score multiplier derived from the age of a hit.

    Args:
        timestamp_seconds: Value compared against ``now.timestamp()``, or
            ``None`` when the hit carries no timestamp.
        now: Reference time.
        config: Supplies ``recency_window_days`` and ``recency_boost``.

    Returns:
        ``config.recency_boost`` when the age is strictly less than the
        window (this includes timestamps later than ``now``), else ``1.0``.
    """
    if timestamp_seconds is None:
        return 1.0
    age_seconds = now.timestamp() - timestamp_seconds
    window = config.recency_window_days * 24 * 60 * 60
    return config.recency_boost if age_seconds < window else 1.0
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def rerank(
    hits: list[RankableHit],
    *,
    config: RankingConfig | None = None,
    now: datetime | None = None,
) -> list[RankableHit]:
    """Rescore hits with the length and recency heuristics, best first.

    Args:
        hits: Hits to rerank. The input list and its elements are left
            untouched; fresh ``RankableHit`` objects are returned.
        config: Tunable thresholds; defaults to :class:`RankingConfig`.
        now: Reference time for recency. Defaults to UTC now.

    Returns:
        New list of hits whose ``score`` is the original score multiplied by
        the length and recency factors and capped at ``config.score_cap``,
        sorted by that score in descending order.
    """
    cfg = config or RankingConfig()
    reference = now or datetime.now(tz=timezone.utc)

    def _rescored(hit: RankableHit) -> RankableHit:
        # Copy the hit with its score scaled by both heuristics and capped.
        factor = _length_factor(hit.content, cfg) * _recency_factor(hit.timestamp_seconds, now=reference, config=cfg)
        return RankableHit(
            content=hit.content,
            score=min(hit.score * factor, cfg.score_cap),
            timestamp_seconds=hit.timestamp_seconds,
            original_index=hit.original_index,
        )

    return sorted((_rescored(hit) for hit in hits), key=lambda h: h.score, reverse=True)
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Local SemanticSearch orchestrator — embed → score → rank.
|
|
2
|
+
|
|
3
|
+
Port of `atomicmemory-sdk/src/search/semantic-search.ts`. Brute-force
|
|
4
|
+
linear scan against an in-memory list of :class:`StoredContext`. No
|
|
5
|
+
HNSW / IVF — designed for small-to-medium local stores (<10k items)
|
|
6
|
+
where simplicity beats indexing.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from collections.abc import Callable, Sequence
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from atomicmemory.search.ranking import RankableHit, RankingConfig, rerank
|
|
17
|
+
from atomicmemory.search.similarity import batch_cosine_similarity, rank_by_similarity
|
|
18
|
+
|
|
19
|
+
# Type of the embedding hook: takes one string and returns its embedding as
# a list of floats.
EmbedFn = Callable[[str], list[float]]
"""Callable that turns a query string into an embedding vector."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass(frozen=True)
class StoredContext:
    """A single context record indexed for local search."""

    # Caller-chosen identifier; not interpreted by the search code in this module.
    id: str
    # Text of the record; its length feeds the rerank length heuristic.
    content: str
    # Vector compared with the query embedding by cosine similarity; it must
    # have the same length as the vector returned by the embed function.
    embedding: list[float]
    # Free-form extra data; not read by the search code in this module.
    metadata: dict[str, Any] | None = None
    # Passed to the reranker as ``timestamp_seconds`` and compared against
    # ``datetime.timestamp()``, i.e. treated as seconds since the Unix epoch.
    timestamp: float | None = None
    # Not read by the search code in this module; presumably intended for
    # caller-supplied ``filter_fn`` predicates — confirm against callers.
    user_id: str | None = None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(frozen=True)
class SemanticSearchResult:
    """One match returned by :meth:`SemanticSearch.search`."""

    # The matched record, passed through unchanged from the candidate pool.
    context: StoredContext
    # Cosine similarity to the query, or the rerank-adjusted score when
    # reranking was applied.
    score: float
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass(frozen=True)
class SemanticSearchConfig:
    """Search-time knobs."""

    # Result count used when ``search`` is called without ``top_k``.
    default_top_k: int = 10
    # Minimum cosine similarity used when ``search`` is called without
    # ``threshold``.
    default_threshold: float = 0.1
    # Hard upper bound on the number of results, whatever ``top_k`` is.
    max_results: int = 100
    # Whether results are reranked when ``search`` is called without
    # ``rerank_results``.
    reranking_enabled: bool = True
    # Thresholds and multipliers handed to ``rerank``; built per instance via
    # ``default_factory``.
    ranking: RankingConfig = field(default_factory=RankingConfig)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class SemanticSearch:
    """Brute-force semantic search over an in-memory context list.

    Each call embeds the query, scores every candidate by cosine similarity,
    keeps the best matches above a threshold and optionally reranks them
    with the length/recency heuristics.
    """

    def __init__(
        self,
        embed_fn: EmbedFn,
        *,
        config: SemanticSearchConfig | None = None,
    ) -> None:
        """Store the embedding callable and the search configuration.

        Args:
            embed_fn: Callable that turns a query string into a vector.
            config: Search-time knobs; defaults to
                :class:`SemanticSearchConfig`.
        """
        self._embed = embed_fn
        self._config = config or SemanticSearchConfig()

    @property
    def config(self) -> SemanticSearchConfig:
        """The configuration this instance was created with."""
        return self._config

    def search(
        self,
        query: str,
        contexts: Sequence[StoredContext],
        *,
        top_k: int | None = None,
        threshold: float | None = None,
        filter_fn: Callable[[StoredContext], bool] | None = None,
        rerank_results: bool | None = None,
    ) -> list[SemanticSearchResult]:
        """Return up to ``top_k`` matches, optionally reranked.

        Args:
            query: Free-text query.
            contexts: Candidate pool to score.
            top_k: Cap on returned results. Defaults to
                ``config.default_top_k``; clamped to the range
                ``[0, config.max_results]``, so a zero or negative value
                yields no results.
            threshold: Minimum cosine similarity. Defaults to
                ``config.default_threshold``.
            filter_fn: Optional predicate run before scoring (cheap pre-
                filter, e.g. by user_id).
            rerank_results: Override config's ``reranking_enabled``.

        Returns:
            Matches sorted by score descending. The query is not embedded
            when ``contexts`` is empty or the filter removes every candidate.
        """
        if not contexts:
            return []
        candidates = [ctx for ctx in contexts if filter_fn(ctx)] if filter_fn is not None else list(contexts)
        if not candidates:
            return []
        query_embedding = self._embed(query)
        similarities = batch_cosine_similarity(query_embedding, [ctx.embedding for ctx in candidates])
        effective_threshold = threshold if threshold is not None else self._config.default_threshold
        ranked_indices = rank_by_similarity(similarities, threshold=effective_threshold)
        requested_top_k = top_k if top_k is not None else self._config.default_top_k
        # Clamp at zero: a negative value used as a slice stop would drop
        # results from the end of the list instead of returning none.
        effective_top_k = max(0, min(requested_top_k, self._config.max_results))
        ranked_indices = ranked_indices[:effective_top_k]
        primary = [SemanticSearchResult(context=candidates[i], score=similarities[i]) for i in ranked_indices]
        do_rerank = rerank_results if rerank_results is not None else self._config.reranking_enabled
        if not do_rerank or not primary:
            return primary
        return self._apply_rerank(primary)

    def _apply_rerank(self, results: list[SemanticSearchResult]) -> list[SemanticSearchResult]:
        """Rerank ``results`` and return new result objects in the new order."""
        # Tag each rankable with its original-index so the reranked
        # output maps back unambiguously even when scores or content
        # collide.
        rankable = [
            RankableHit(
                content=r.context.content,
                score=r.score,
                timestamp_seconds=r.context.timestamp,
                original_index=i,
            )
            for i, r in enumerate(results)
        ]
        adjusted = rerank(rankable, config=self._config.ranking, now=datetime.now(tz=timezone.utc))
        reordered: list[SemanticSearchResult] = []
        for hit in adjusted:
            if hit.original_index is None:
                # Defensive: every hit built above carries an index.
                continue
            reordered.append(SemanticSearchResult(context=results[hit.original_index].context, score=hit.score))
        return reordered
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Cosine similarity + top-k helpers.
|
|
2
|
+
|
|
3
|
+
Port of `atomicmemory-sdk/src/search/similarity-calculator.ts`. Uses
|
|
4
|
+
numpy for the dot/norm math; correctly handles zero-vectors and length
|
|
5
|
+
mismatches.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Sequence
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def cosine_similarity(a: Sequence[float], b: Sequence[float]) -> float:
    """Compute the cosine of the angle between two vectors of equal length.

    Args:
        a: First embedding vector.
        b: Second embedding vector.

    Returns:
        The dot product divided by the product of the L2 norms, or ``0.0``
        when either vector has zero norm (so no NaN is produced).

    Raises:
        ValueError: When ``a`` and ``b`` have different lengths.
    """
    size_a, size_b = len(a), len(b)
    if size_a != size_b:
        raise ValueError(f"Vector length mismatch: {size_a} != {size_b}")

    left = np.asarray(a, dtype=np.float64)
    right = np.asarray(b, dtype=np.float64)
    left_norm = float(np.linalg.norm(left))
    right_norm = float(np.linalg.norm(right))
    if left_norm == 0.0 or right_norm == 0.0:
        return 0.0
    return float(np.dot(left, right) / (left_norm * right_norm))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def batch_cosine_similarity(
    query: Sequence[float],
    candidates: Sequence[Sequence[float]],
) -> list[float]:
    """Compute cosine similarity between ``query`` and every candidate.

    Args:
        query: Query embedding (one-dimensional).
        candidates: Candidate embeddings, all of the same length as ``query``.

    Returns:
        Similarities in candidate order as plain floats; empty when
        ``candidates`` is empty. Every entry is ``0.0`` when ``query`` is
        empty or has zero norm, and an individual entry is ``0.0`` when its
        denominator is zero (for example a zero-norm candidate).

    Raises:
        ValueError: When ``query`` is not one-dimensional, ``candidates`` is
            not a sequence of equal-length vectors, or the candidate width
            differs from the query length.
    """
    if not candidates:
        return []
    q = np.asarray(query, dtype=np.float64)
    if q.size == 0:
        return [0.0] * len(candidates)
    if q.ndim != 1:
        raise ValueError(f"Query must be a one-dimensional vector; got array with shape {q.shape}")
    matrix = np.asarray(candidates, dtype=np.float64)
    # A flat list of numbers becomes a 1-D array; reject it explicitly
    # instead of failing with IndexError on ``shape[1]`` below.
    if matrix.ndim != 2:
        raise ValueError(f"Candidates must be a sequence of equal-length vectors; got array with shape {matrix.shape}")
    if matrix.shape[1] != q.shape[0]:
        raise ValueError(f"Query length {q.shape[0]} does not match candidate width {matrix.shape[1]}")
    q_norm = float(np.linalg.norm(q))
    if q_norm == 0.0:
        return [0.0] * len(candidates)
    denominators = np.linalg.norm(matrix, axis=1) * q_norm
    similarities = np.zeros(matrix.shape[0], dtype=np.float64)
    # Divide only where the denominator is non-zero; the other rows keep
    # the pre-filled 0.0, so no NaN or inf is produced.
    np.divide(matrix @ q, denominators, out=similarities, where=denominators != 0.0)
    return similarities.tolist()
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def rank_by_similarity(
    similarities: Sequence[float],
    *,
    threshold: float | None = None,
) -> list[int]:
    """Order candidate indices from most to least similar.

    Args:
        similarities: Score of each candidate, indexed by candidate position.
        threshold: When given, indices whose score is strictly below it are
            removed from the result.

    Returns:
        Indices sorted by score descending; equal scores keep their
        original relative order.
    """
    order = sorted(range(len(similarities)), key=lambda idx: similarities[idx], reverse=True)
    if threshold is not None:
        order = [idx for idx in order if similarities[idx] >= threshold]
    return order
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def find_top_k(
    query_embedding: Sequence[float],
    candidates: Sequence[Sequence[float]],
    k: int,
    *,
    metadata: Sequence[Any] | None = None,
    threshold: float | None = None,
) -> list[tuple[int, float, Any]]:
    """Return the best ``k`` candidates as ``(index, similarity, metadata)``.

    Args:
        query_embedding: Query vector.
        candidates: Candidate vectors to score against the query.
        k: Maximum number of entries to return; ``k <= 0`` yields ``[]``
            without scoring anything.
        metadata: Optional values aligned by index with ``candidates``. When
            omitted, the third tuple element is ``None``.
        threshold: Optional minimum similarity.

    Returns:
        Tuples sorted by similarity descending.

    Raises:
        ValueError: When ``metadata`` is supplied and its length differs from
            the number of candidates.
    """
    if k <= 0:
        return []
    scores = batch_cosine_similarity(query_embedding, candidates)
    top = rank_by_similarity(scores, threshold=threshold)[:k]
    if metadata is not None and len(metadata) != len(candidates):
        raise ValueError(f"metadata length {len(metadata)} does not match candidates length {len(candidates)}")
    return [(idx, scores[idx], None if metadata is None else metadata[idx]) for idx in top]
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Backend artifact-storage API.
|
|
2
|
+
|
|
3
|
+
This package is the Python peer of `atomicmemory-sdk/src/storage`. It
|
|
4
|
+
contains the direct artifact-storage clients and types for
|
|
5
|
+
``/v1/storage/artifacts/*``. Local key/value cache adapters live under
|
|
6
|
+
``atomicmemory.kv_cache``.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from atomicmemory.storage.async_client import AsyncStorageClient
|
|
10
|
+
from atomicmemory.storage.client import StorageClient
|
|
11
|
+
from atomicmemory.storage.errors import (
|
|
12
|
+
ArtifactInUseError,
|
|
13
|
+
ArtifactNotFoundError,
|
|
14
|
+
FilecoinDirectStorageNotSupportedError,
|
|
15
|
+
PointerContentNotManagedError,
|
|
16
|
+
StorageClientError,
|
|
17
|
+
UnsupportedCapabilityError,
|
|
18
|
+
)
|
|
19
|
+
from atomicmemory.storage.types import (
|
|
20
|
+
ArtifactHead,
|
|
21
|
+
ArtifactMetadata,
|
|
22
|
+
ArtifactRange,
|
|
23
|
+
ArtifactRef,
|
|
24
|
+
DeleteArtifactOptions,
|
|
25
|
+
DeleteArtifactPolicy,
|
|
26
|
+
DeleteArtifactResult,
|
|
27
|
+
PutArtifactInput,
|
|
28
|
+
PutManagedInput,
|
|
29
|
+
PutPointerInput,
|
|
30
|
+
StorageArtifactStatus,
|
|
31
|
+
StorageCapabilities,
|
|
32
|
+
StorageClientConfig,
|
|
33
|
+
StoredArtifact,
|
|
34
|
+
VerificationResult,
|
|
35
|
+
VerifyArtifactOptions,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
# Public API of the storage package: every name imported above is
# re-exported. Entries are kept in alphabetical order.
__all__ = [
    "ArtifactHead",
    "ArtifactInUseError",
    "ArtifactMetadata",
    "ArtifactNotFoundError",
    "ArtifactRange",
    "ArtifactRef",
    "AsyncStorageClient",
    "DeleteArtifactOptions",
    "DeleteArtifactPolicy",
    "DeleteArtifactResult",
    "FilecoinDirectStorageNotSupportedError",
    "PointerContentNotManagedError",
    "PutArtifactInput",
    "PutManagedInput",
    "PutPointerInput",
    "StorageArtifactStatus",
    "StorageCapabilities",
    "StorageClient",
    "StorageClientConfig",
    "StorageClientError",
    "StoredArtifact",
    "UnsupportedCapabilityError",
    "VerificationResult",
    "VerifyArtifactOptions",
]
|