spatial-memory-mcp 1.0.3__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of spatial-memory-mcp has been flagged as potentially problematic.
- spatial_memory/__init__.py +97 -97
- spatial_memory/__main__.py +241 -2
- spatial_memory/adapters/lancedb_repository.py +74 -5
- spatial_memory/config.py +115 -2
- spatial_memory/core/__init__.py +35 -0
- spatial_memory/core/cache.py +317 -0
- spatial_memory/core/circuit_breaker.py +297 -0
- spatial_memory/core/connection_pool.py +41 -3
- spatial_memory/core/consolidation_strategies.py +402 -0
- spatial_memory/core/database.py +791 -769
- spatial_memory/core/db_idempotency.py +242 -0
- spatial_memory/core/db_indexes.py +575 -0
- spatial_memory/core/db_migrations.py +584 -0
- spatial_memory/core/db_search.py +509 -0
- spatial_memory/core/db_versioning.py +177 -0
- spatial_memory/core/embeddings.py +156 -19
- spatial_memory/core/errors.py +75 -3
- spatial_memory/core/filesystem.py +178 -0
- spatial_memory/core/logging.py +194 -103
- spatial_memory/core/models.py +4 -0
- spatial_memory/core/rate_limiter.py +326 -105
- spatial_memory/core/response_types.py +497 -0
- spatial_memory/core/tracing.py +300 -0
- spatial_memory/core/validation.py +403 -319
- spatial_memory/factory.py +407 -0
- spatial_memory/migrations/__init__.py +40 -0
- spatial_memory/ports/repositories.py +52 -2
- spatial_memory/server.py +329 -188
- spatial_memory/services/export_import.py +61 -43
- spatial_memory/services/lifecycle.py +397 -122
- spatial_memory/services/memory.py +81 -4
- spatial_memory/services/spatial.py +129 -46
- spatial_memory/tools/definitions.py +695 -671
- {spatial_memory_mcp-1.0.3.dist-info → spatial_memory_mcp-1.6.0.dist-info}/METADATA +83 -3
- spatial_memory_mcp-1.6.0.dist-info/RECORD +54 -0
- spatial_memory_mcp-1.0.3.dist-info/RECORD +0 -41
- {spatial_memory_mcp-1.0.3.dist-info → spatial_memory_mcp-1.6.0.dist-info}/WHEEL +0 -0
- {spatial_memory_mcp-1.0.3.dist-info → spatial_memory_mcp-1.6.0.dist-info}/entry_points.txt +0 -0
- {spatial_memory_mcp-1.0.3.dist-info → spatial_memory_mcp-1.6.0.dist-info}/licenses/LICENSE +0 -0

spatial_memory/services/memory.py

@@ -13,7 +13,7 @@ from __future__ import annotations
 
 import logging
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Protocol
 
 from spatial_memory.core.errors import MemoryNotFoundError, ValidationError
 from spatial_memory.core.models import Memory, MemorySource
@@ -22,6 +22,7 @@ from spatial_memory.core.validation import validate_content, validate_importance
 logger = logging.getLogger(__name__)
 
 if TYPE_CHECKING:
+    from spatial_memory.core.database import IdempotencyRecord
     from spatial_memory.core.models import MemoryResult
     from spatial_memory.ports.repositories import (
         EmbeddingServiceProtocol,
@@ -29,6 +30,39 @@ if TYPE_CHECKING:
     )
 
 
+class IdempotencyProviderProtocol(Protocol):
+    """Protocol for idempotency key storage and lookup.
+
+    Implementations should handle key-to-memory-id mappings with TTL support.
+    """
+
+    def get_by_idempotency_key(self, key: str) -> IdempotencyRecord | None:
+        """Look up an idempotency record by key.
+
+        Args:
+            key: The idempotency key to look up.
+
+        Returns:
+            IdempotencyRecord if found and not expired, None otherwise.
+        """
+        ...
+
+    def store_idempotency_key(
+        self,
+        key: str,
+        memory_id: str,
+        ttl_hours: float = 24.0,
+    ) -> None:
+        """Store an idempotency key mapping.
+
+        Args:
+            key: The idempotency key.
+            memory_id: The memory ID that was created.
+            ttl_hours: Time-to-live in hours (default: 24 hours).
+        """
+        ...
+
+
 @dataclass
 class RememberResult:
     """Result of storing a memory."""
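For reference, any object with these two methods satisfies the protocol. The following is a minimal in-memory sketch; the _Record dataclass and its expires_at field are illustrative stand-ins, not the package's actual IdempotencyRecord from spatial_memory.core.database.

from __future__ import annotations

import time
from dataclasses import dataclass


@dataclass
class _Record:
    """Illustrative stand-in for the package's IdempotencyRecord."""
    memory_id: str
    expires_at: float


class InMemoryIdempotencyProvider:
    """Minimal sketch of a provider satisfying IdempotencyProviderProtocol."""

    def __init__(self) -> None:
        self._records: dict[str, _Record] = {}

    def get_by_idempotency_key(self, key: str) -> _Record | None:
        record = self._records.get(key)
        if record is None or record.expires_at < time.time():
            # Missing and expired keys behave the same: no prior request.
            return None
        return record

    def store_idempotency_key(
        self, key: str, memory_id: str, ttl_hours: float = 24.0
    ) -> None:
        self._records[key] = _Record(
            memory_id=memory_id,
            expires_at=time.time() + ttl_hours * 3600,
        )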
@@ -36,6 +70,7 @@ class RememberResult:
     id: str
     content: str
     namespace: str
+    deduplicated: bool = False
 
 
 @dataclass
@@ -80,15 +115,18 @@ class MemoryService:
         self,
         repository: MemoryRepositoryProtocol,
         embeddings: EmbeddingServiceProtocol,
+        idempotency_provider: IdempotencyProviderProtocol | None = None,
     ) -> None:
         """Initialize the memory service.
 
         Args:
             repository: Repository for memory storage.
             embeddings: Service for generating embeddings.
+            idempotency_provider: Optional provider for idempotency key support.
         """
         self._repo = repository
         self._embeddings = embeddings
+        self._idempotency = idempotency_provider
 
     # Use centralized validation functions
     _validate_content = staticmethod(validate_content)
@@ -101,6 +139,7 @@ class MemoryService:
         tags: list[str] | None = None,
         importance: float = 0.5,
         metadata: dict[str, Any] | None = None,
+        idempotency_key: str | None = None,
     ) -> RememberResult:
         """Store a new memory.
 
@@ -110,13 +149,40 @@ class MemoryService:
             tags: Optional list of tags.
             importance: Importance score (0-1).
             metadata: Optional metadata dict.
+            idempotency_key: Optional key for idempotent requests. If provided
+                and a memory was already created with this key, returns the
+                existing memory ID with deduplicated=True.
 
         Returns:
-            RememberResult with the new memory's ID.
+            RememberResult with the new memory's ID. If idempotency_key was
+            provided and matched an existing request, deduplicated=True.
 
         Raises:
             ValidationError: If input validation fails.
         """
+        # Check idempotency key first (before any expensive operations)
+        if idempotency_key and self._idempotency:
+            existing = self._idempotency.get_by_idempotency_key(idempotency_key)
+            if existing:
+                logger.debug(
+                    f"Idempotency key '{idempotency_key}' matched existing "
+                    f"memory '{existing.memory_id}'"
+                )
+                # Return cached result - fetch the memory to get content
+                cached_memory = self._repo.get(existing.memory_id)
+                if cached_memory:
+                    return RememberResult(
+                        id=existing.memory_id,
+                        content=cached_memory.content,
+                        namespace=cached_memory.namespace,
+                        deduplicated=True,
+                    )
+                # Memory was deleted but key exists - proceed with new insert
+                logger.warning(
+                    f"Idempotency key '{idempotency_key}' references deleted "
+                    f"memory '{existing.memory_id}', creating new memory"
+                )
+
         # Validate inputs
         self._validate_content(content)
         self._validate_importance(importance)
@@ -138,10 +204,21 @@ class MemoryService:
         # Store in repository
         memory_id = self._repo.add(memory, vector)
 
+        # Store idempotency key mapping if provided
+        if idempotency_key and self._idempotency:
+            try:
+                self._idempotency.store_idempotency_key(idempotency_key, memory_id)
+            except Exception as e:
+                # Log but don't fail the memory creation
+                logger.warning(
+                    f"Failed to store idempotency key '{idempotency_key}': {e}"
+                )
+
         return RememberResult(
             id=memory_id,
             content=content,
             namespace=namespace,
+            deduplicated=False,
         )
 
     def remember_batch(
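Taken together, these changes let a caller retry a remember call safely. The snippet below is an illustrative usage sketch, not runnable as-is: repo, embeddings, and provider stand for whatever concrete repository, embedding service, and idempotency provider the application wires in (for example via the new factory module).

# Hypothetical wiring; the concrete classes depend on the deployment.
service = MemoryService(
    repository=repo,
    embeddings=embeddings,
    idempotency_provider=provider,
)

first = service.remember(
    "Postgres connection string lives in vault",
    idempotency_key="req-42",
)
retry = service.remember(
    "Postgres connection string lives in vault",
    idempotency_key="req-42",
)

assert retry.id == first.id          # same memory, not a duplicate
assert retry.deduplicated is True    # second call was short-circuited
assert first.deduplicated is False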
@@ -327,9 +404,9 @@ class MemoryService:
         if not memory_ids:
             raise ValidationError("Memory ID list cannot be empty")
 
-        deleted_count = self._repo.delete_batch(memory_ids)
+        deleted_count, deleted_ids = self._repo.delete_batch(memory_ids)
 
         return ForgetResult(
             deleted=deleted_count,
-            ids=
+            ids=deleted_ids,
         )

spatial_memory/services/spatial.py

@@ -63,6 +63,28 @@ except ImportError:
     UMAP_AVAILABLE = False
     logger.debug("UMAP not available - visualize operation will be disabled")
 
+try:
+    from scipy.spatial.distance import cdist
+
+    SCIPY_AVAILABLE = True
+except ImportError:
+    SCIPY_AVAILABLE = False
+    logger.debug("scipy not available - using fallback for similarity calculations")
+
+# Common stop words for keyword extraction (module-level to avoid recreation)
+_STOP_WORDS: frozenset[str] = frozenset({
+    "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
+    "have", "has", "had", "do", "does", "did", "will", "would", "could",
+    "should", "may", "might", "must", "can", "to", "of", "in", "for",
+    "on", "with", "at", "by", "from", "as", "into", "through", "during",
+    "before", "after", "above", "below", "between", "under", "again",
+    "further", "then", "once", "here", "there", "when", "where", "why",
+    "how", "all", "each", "few", "more", "most", "other", "some", "such",
+    "no", "nor", "not", "only", "own", "same", "so", "than", "too",
+    "very", "just", "also", "now", "and", "but", "or", "if", "it", "its",
+    "this", "that", "these", "those", "i", "you", "he", "she", "we", "they",
+})
+
 if TYPE_CHECKING:
     from spatial_memory.ports.repositories import (
         EmbeddingServiceProtocol,
@@ -212,11 +234,12 @@ class SpatialService:
         )
 
         # Find nearest memories for each interpolation point
-        # Use batch search for efficiency
+        # Use batch search for efficiency, include vectors to avoid N+1 queries
         search_results = self._batch_vector_search(
             interpolated_vectors,
             limit_per_query=self._config.journey_neighbors_per_step,
             namespace=namespace,
+            include_vector=True,  # Include vectors to avoid follow-up queries
         )
 
         # Build journey steps
@@ -230,9 +253,15 @@ class SpatialService:
             distance_to_path = float("inf")
             if neighbors:
                 for neighbor in neighbors:
-
-
-
+                    # Use vector from search result (included via include_vector=True)
+                    if neighbor.vector is not None:
+                        neighbor_vec = np.array(neighbor.vector, dtype=np.float32)
+                        dist = self._cosine_distance(interp_vec, neighbor_vec)
+                    else:
+                        # Fallback if vector not included (shouldn't happen)
+                        dist = self._cosine_distance(
+                            interp_vec, self._get_vector_for_memory(neighbor.id)
+                        )
                     if dist < distance_to_path:
                         distance_to_path = dist
                 steps_with_memories += 1
@@ -330,10 +359,12 @@ class SpatialService:
 
         for step_num in range(actual_steps):
             # Find candidates from current position
+            # Include vectors to avoid follow-up get_with_vector queries
             neighbors = self._repo.search(
                 current_vector,
                 limit=self._config.wander_candidates_per_step + len(visited_ids),
                 namespace=namespace,
+                include_vector=True,
             )
 
             # Filter out recently visited
@@ -358,12 +389,16 @@ class SpatialService:
                 candidates, actual_temp
             )
 
-            #
-
-
-
-
-
+            # Get vector from search result (included via include_vector=True)
+            if next_memory.vector is not None:
+                next_vector = np.array(next_memory.vector, dtype=np.float32)
+            else:
+                # Fallback if vector not included (shouldn't happen)
+                next_result = self._repo.get_with_vector(next_memory.id)
+                if next_result is None:
+                    logger.warning(f"Memory {next_memory.id} disappeared during wander")
+                    break
+                _, next_vector = next_result
 
             step_distance = self._cosine_distance(prev_vector, next_vector)
             total_distance += step_distance
@@ -665,18 +700,26 @@ class SpatialService:
         # Build edges if requested
         edges: list[VisualizationEdge] = []
         if include_edges:
-            # Calculate pairwise similarities
-
-
-
-
-
-
-
-
-
-
+            # Calculate pairwise similarities using vectorized operations
+            similarity_matrix = self._compute_pairwise_similarities(vectors)
+            threshold = self._config.visualize_similarity_threshold
+
+            # Extract upper triangle indices where similarity >= threshold
+            # (upper triangle avoids duplicate edges)
+            upper_tri_indices = np.triu_indices(len(vectors), k=1)
+            similarities = similarity_matrix[upper_tri_indices]
+
+            # Filter by threshold and create edges
+            mask = similarities >= threshold
+            for idx in np.where(mask)[0]:
+                i, j = upper_tri_indices[0][idx], upper_tri_indices[1][idx]
+                edges.append(
+                    VisualizationEdge(
+                        from_id=nodes[i].id,
+                        to_id=nodes[j].id,
+                        weight=float(similarities[idx]),
+                    )
+                )
 
         # Calculate bounds
         x_coords = [n.x for n in nodes]
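The upper-triangle trick works because the similarity matrix is symmetric, so each pair (i, j) with i < j only needs to be considered once. A small standalone illustration of the indexing, independent of the service and using made-up vectors:

import numpy as np

vectors = np.array(
    [[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]], dtype=np.float32
)
normalized = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
similarity_matrix = normalized @ normalized.T  # symmetric, 3x3

# k=1 skips the diagonal, leaving the 3 unique pairs (0,1), (0,2), (1,2)
rows, cols = np.triu_indices(len(vectors), k=1)
for i, j, sim in zip(rows, cols, similarity_matrix[rows, cols]):
    print(i, j, round(float(sim), 3))
# Only pairs above the chosen threshold would become visualization edges.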
@@ -783,25 +826,50 @@ class SpatialService:
         vectors: list[np.ndarray],
         limit_per_query: int,
         namespace: str | None,
+        include_vector: bool = False,
     ) -> list[list[MemoryResult]]:
-        """Perform batch vector search.
+        """Perform batch vector search using repository's native batch capability.
 
-
-
+        Uses the repository's batch_vector_search for efficient multi-query
+        searches in a single database operation.
 
         Args:
             vectors: List of query vectors.
             limit_per_query: Results per query.
             namespace: Optional namespace filter.
+            include_vector: Whether to include embedding vectors in results.
+                Defaults to False to reduce response size.
 
         Returns:
-            List of result lists.
+            List of result lists. If include_vector=True, each MemoryResult
+            includes its embedding vector.
         """
-        #
+        # Use native batch search for efficiency
+        raw_results = self._repo.batch_vector_search(
+            query_vectors=vectors,
+            limit_per_query=limit_per_query,
+            namespace=namespace,
+            include_vector=include_vector,
+        )
+
+        # Convert raw dict results to MemoryResult objects
         results: list[list[MemoryResult]] = []
-        for
-
-
+        for query_results in raw_results:
+            memory_results: list[MemoryResult] = []
+            for record in query_results:
+                memory_result = MemoryResult(
+                    id=record["id"],
+                    content=record["content"],
+                    similarity=record.get("similarity", 0.0),
+                    namespace=record.get("namespace", "default"),
+                    tags=record.get("tags", []),
+                    importance=record.get("importance", 0.5),
+                    created_at=record.get("created_at"),
+                    metadata=record.get("metadata", {}),
+                    vector=record.get("vector") if include_vector else None,
+                )
+                memory_results.append(memory_result)
+            results.append(memory_results)
         return results
 
     def _get_vector_for_memory(self, memory_id: str) -> np.ndarray:
@@ -838,6 +906,35 @@ class SpatialService:
         similarity = np.dot(vec1, vec2) / (norm1 * norm2)
         return float(1.0 - similarity)
 
+    def _compute_pairwise_similarities(self, vectors: np.ndarray) -> np.ndarray:
+        """Compute pairwise cosine similarities using vectorized operations.
+
+        Uses scipy.cdist if available for optimal performance, otherwise
+        falls back to numpy matrix operations.
+
+        Args:
+            vectors: 2D array of shape (n_vectors, embedding_dim).
+
+        Returns:
+            Symmetric similarity matrix of shape (n_vectors, n_vectors).
+            Values range from -1 (opposite) to 1 (identical).
+        """
+        # Normalize vectors to unit length
+        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
+        # Avoid division by zero for zero vectors
+        norms = np.where(norms < 1e-10, 1.0, norms)
+        normalized = vectors / norms
+
+        if SCIPY_AVAILABLE:
+            # scipy.cdist with cosine metric returns distances (1 - similarity)
+            distances = cdist(normalized, normalized, metric="cosine")
+            similarities = 1.0 - distances
+        else:
+            # Fallback: use numpy dot product (A @ A.T for normalized vectors)
+            similarities = normalized @ normalized.T
+
+        return similarities
+
     def _temperature_select(
         self,
         candidates: list[MemoryResult],
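Both branches produce the same matrix up to floating-point noise, since the cosine similarity of unit-normalized rows is just their dot product. A quick standalone check of that equivalence (scipy is optional here, mirroring the SCIPY_AVAILABLE guard):

import numpy as np

rng = np.random.default_rng(0)
vectors = rng.normal(size=(5, 8)).astype(np.float32)

norms = np.linalg.norm(vectors, axis=1, keepdims=True)
normalized = vectors / np.where(norms < 1e-10, 1.0, norms)

# numpy fallback: dot products of unit vectors are cosine similarities
sim_numpy = normalized @ normalized.T

try:
    from scipy.spatial.distance import cdist

    # scipy path: cosine *distance* is 1 - similarity, so invert it
    sim_scipy = 1.0 - cdist(normalized, normalized, metric="cosine")
    assert np.allclose(sim_numpy, sim_scipy, atol=1e-5)
except ImportError:
    pass  # scipy not installed; the numpy result stands on its own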
@@ -888,23 +985,9 @@ class SpatialService:
             List of top keywords.
         """
         # Simple keyword extraction using word frequency
-        #
-        stop_words = {
-            "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
-            "have", "has", "had", "do", "does", "did", "will", "would", "could",
-            "should", "may", "might", "must", "can", "to", "of", "in", "for",
-            "on", "with", "at", "by", "from", "as", "into", "through", "during",
-            "before", "after", "above", "below", "between", "under", "again",
-            "further", "then", "once", "here", "there", "when", "where", "why",
-            "how", "all", "each", "few", "more", "most", "other", "some", "such",
-            "no", "nor", "not", "only", "own", "same", "so", "than", "too",
-            "very", "just", "also", "now", "and", "but", "or", "if", "it", "its",
-            "this", "that", "these", "those", "i", "you", "he", "she", "we", "they",
-        }
-
-        # Tokenize and filter
+        # Tokenize and filter using module-level stop words
         words = re.findall(r"\b[a-zA-Z]+\b", text.lower())
-        filtered = [w for w in words if w not in
+        filtered = [w for w in words if w not in _STOP_WORDS and len(w) > 2]
 
         # Count frequencies
         counter = Counter(filtered)
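The refactor only moves the stop-word set to module scope; the extraction logic itself is unchanged. A self-contained sketch of the same frequency-based approach, using a deliberately tiny stop-word set rather than the package's full _STOP_WORDS:

import re
from collections import Counter

_STOP_WORDS = frozenset({"the", "a", "an", "is", "to", "of", "and", "in"})

def extract_keywords(text: str, top_k: int = 5) -> list[str]:
    """Return the most frequent non-stop-word tokens longer than two characters."""
    words = re.findall(r"\b[a-zA-Z]+\b", text.lower())
    filtered = [w for w in words if w not in _STOP_WORDS and len(w) > 2]
    return [word for word, _ in Counter(filtered).most_common(top_k)]

print(extract_keywords("The cache invalidation bug is in the cache layer"))
# ['cache', 'invalidation', 'bug', 'layer']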