solohq-memory 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. solohq_memory-0.1.0/.gitignore +13 -0
  2. solohq_memory-0.1.0/LICENSE +21 -0
  3. solohq_memory-0.1.0/PKG-INFO +108 -0
  4. solohq_memory-0.1.0/README.md +71 -0
  5. solohq_memory-0.1.0/pyproject.toml +41 -0
  6. solohq_memory-0.1.0/src/solohq_memory/__init__.py +90 -0
  7. solohq_memory-0.1.0/src/solohq_memory/indexing/__init__.py +12 -0
  8. solohq_memory-0.1.0/src/solohq_memory/indexing/file_indexer.py +246 -0
  9. solohq_memory-0.1.0/src/solohq_memory/indexing/processors.py +105 -0
  10. solohq_memory-0.1.0/src/solohq_memory/indexing/registry.py +28 -0
  11. solohq_memory-0.1.0/src/solohq_memory/interfaces/__init__.py +11 -0
  12. solohq_memory-0.1.0/src/solohq_memory/interfaces/block_storage.py +15 -0
  13. solohq_memory-0.1.0/src/solohq_memory/interfaces/embedding.py +10 -0
  14. solohq_memory-0.1.0/src/solohq_memory/interfaces/llm.py +21 -0
  15. solohq_memory-0.1.0/src/solohq_memory/interfaces/storage.py +59 -0
  16. solohq_memory-0.1.0/src/solohq_memory/memory/__init__.py +33 -0
  17. solohq_memory-0.1.0/src/solohq_memory/memory/artifact_manager.py +372 -0
  18. solohq_memory-0.1.0/src/solohq_memory/memory/block_contradiction.py +68 -0
  19. solohq_memory-0.1.0/src/solohq_memory/memory/block_summarizer.py +41 -0
  20. solohq_memory-0.1.0/src/solohq_memory/memory/boundary_detector.py +149 -0
  21. solohq_memory-0.1.0/src/solohq_memory/memory/classifier.py +247 -0
  22. solohq_memory-0.1.0/src/solohq_memory/memory/context_graph.py +290 -0
  23. solohq_memory-0.1.0/src/solohq_memory/memory/contradiction.py +143 -0
  24. solohq_memory-0.1.0/src/solohq_memory/memory/episode_recorder.py +46 -0
  25. solohq_memory-0.1.0/src/solohq_memory/memory/manager.py +182 -0
  26. solohq_memory-0.1.0/src/solohq_memory/memory/staleness.py +151 -0
  27. solohq_memory-0.1.0/src/solohq_memory/models/__init__.py +52 -0
  28. solohq_memory-0.1.0/src/solohq_memory/models/artifact.py +49 -0
  29. solohq_memory-0.1.0/src/solohq_memory/models/common.py +42 -0
  30. solohq_memory-0.1.0/src/solohq_memory/models/context.py +31 -0
  31. solohq_memory-0.1.0/src/solohq_memory/models/episode.py +18 -0
  32. solohq_memory-0.1.0/src/solohq_memory/models/message.py +21 -0
  33. solohq_memory-0.1.0/src/solohq_memory/models/relationship.py +29 -0
  34. solohq_memory-0.1.0/src/solohq_memory/models/session_block.py +23 -0
  35. solohq_memory-0.1.0/src/solohq_memory/prompts/__init__.py +27 -0
  36. solohq_memory-0.1.0/src/solohq_memory/prompts/block_boundary.py +30 -0
  37. solohq_memory-0.1.0/src/solohq_memory/prompts/block_contradiction.py +34 -0
  38. solohq_memory-0.1.0/src/solohq_memory/prompts/block_summarizer.py +28 -0
  39. solohq_memory-0.1.0/src/solohq_memory/prompts/classifier.py +46 -0
  40. solohq_memory-0.1.0/src/solohq_memory/prompts/contradiction.py +50 -0
  41. solohq_memory-0.1.0/src/solohq_memory/prompts/relationship.py +33 -0
  42. solohq_memory-0.1.0/src/solohq_memory/prompts/summarizer.py +28 -0
  43. solohq_memory-0.1.0/src/solohq_memory/prompts/updater.py +43 -0
  44. solohq_memory-0.1.0/src/solohq_memory/providers/__init__.py +0 -0
  45. solohq_memory-0.1.0/src/solohq_memory/providers/anthropic.py +77 -0
  46. solohq_memory-0.1.0/src/solohq_memory/providers/google.py +125 -0
  47. solohq_memory-0.1.0/src/solohq_memory/providers/openai.py +89 -0
  48. solohq_memory-0.1.0/src/solohq_memory/py.typed +0 -0
  49. solohq_memory-0.1.0/src/solohq_memory/storage/__init__.py +8 -0
  50. solohq_memory-0.1.0/src/solohq_memory/storage/schema.py +139 -0
  51. solohq_memory-0.1.0/src/solohq_memory/storage/serialization.py +12 -0
  52. solohq_memory-0.1.0/src/solohq_memory/storage/sqlite.py +690 -0
  53. solohq_memory-0.1.0/src/solohq_memory/utils/__init__.py +7 -0
  54. solohq_memory-0.1.0/src/solohq_memory/utils/cache.py +45 -0
  55. solohq_memory-0.1.0/src/solohq_memory/utils/similarity.py +14 -0
  56. solohq_memory-0.1.0/src/solohq_memory/utils/tokens.py +3 -0
  57. solohq_memory-0.1.0/tests/__init__.py +0 -0
  58. solohq_memory-0.1.0/tests/e2e/__init__.py +0 -0
  59. solohq_memory-0.1.0/tests/e2e/conftest.py +149 -0
  60. solohq_memory-0.1.0/tests/e2e/test_e2e_artifact_manager.py +516 -0
  61. solohq_memory-0.1.0/tests/e2e/test_e2e_context_graph.py +648 -0
  62. solohq_memory-0.1.0/tests/e2e/test_e2e_contradiction.py +603 -0
  63. solohq_memory-0.1.0/tests/e2e/test_e2e_edge_cases.py +916 -0
  64. solohq_memory-0.1.0/tests/e2e/test_e2e_messages.py +155 -0
  65. solohq_memory-0.1.0/tests/e2e/test_e2e_phase1.py +844 -0
  66. solohq_memory-0.1.0/tests/e2e/test_e2e_phase2_integration.py +435 -0
  67. solohq_memory-0.1.0/tests/e2e/test_e2e_phase3_integration.py +588 -0
  68. solohq_memory-0.1.0/tests/e2e/test_e2e_staleness.py +632 -0
  69. solohq_memory-0.1.0/tests/unit/__init__.py +0 -0
  70. solohq_memory-0.1.0/tests/unit/test_anthropic_provider.py +177 -0
  71. solohq_memory-0.1.0/tests/unit/test_artifact_manager.py +539 -0
  72. solohq_memory-0.1.0/tests/unit/test_artifact_versioning.py +302 -0
  73. solohq_memory-0.1.0/tests/unit/test_block_contradiction.py +133 -0
  74. solohq_memory-0.1.0/tests/unit/test_block_summarizer.py +132 -0
  75. solohq_memory-0.1.0/tests/unit/test_boundary_detector.py +247 -0
  76. solohq_memory-0.1.0/tests/unit/test_classifier.py +444 -0
  77. solohq_memory-0.1.0/tests/unit/test_context_graph.py +809 -0
  78. solohq_memory-0.1.0/tests/unit/test_context_updater.py +456 -0
  79. solohq_memory-0.1.0/tests/unit/test_contradiction.py +390 -0
  80. solohq_memory-0.1.0/tests/unit/test_file_indexer.py +349 -0
  81. solohq_memory-0.1.0/tests/unit/test_google_provider.py +240 -0
  82. solohq_memory-0.1.0/tests/unit/test_packaging.py +117 -0
  83. solohq_memory-0.1.0/tests/unit/test_performance.py +188 -0
  84. solohq_memory-0.1.0/tests/unit/test_processors.py +209 -0
  85. solohq_memory-0.1.0/tests/unit/test_staleness.py +413 -0
  86. solohq_memory-0.1.0/uv.lock +1513 -0
@@ -0,0 +1,13 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ *.swp
8
+ *.swo
9
+ *~
10
+ .env
11
+ *.db
12
+ .pytest_cache/
13
+ .venv/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 SoloHQ
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,108 @@
1
+ Metadata-Version: 2.4
2
+ Name: solohq-memory
3
+ Version: 0.1.0
4
+ Summary: Context memory system for AI agents — persistent context graph with episodes, artifacts, relationships, and staleness tracking
5
+ Project-URL: Repository, https://github.com/whaleventure13/solohq-agent
6
+ Author: SoloHQ
7
+ License-Expression: MIT
8
+ License-File: LICENSE
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Typing :: Typed
18
+ Requires-Python: >=3.11
19
+ Requires-Dist: apsw>=3.46
20
+ Requires-Dist: pydantic>=2.0
21
+ Requires-Dist: python-ulid>=3.0
22
+ Requires-Dist: sqlite-vec>=0.1.6
23
+ Provides-Extra: all
24
+ Requires-Dist: anthropic>=0.40; extra == 'all'
25
+ Requires-Dist: google-genai>=1.0; extra == 'all'
26
+ Requires-Dist: openai>=1.0; extra == 'all'
27
+ Provides-Extra: anthropic
28
+ Requires-Dist: anthropic>=0.40; extra == 'anthropic'
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
31
+ Requires-Dist: pytest>=8.0; extra == 'dev'
32
+ Provides-Extra: google
33
+ Requires-Dist: google-genai>=1.0; extra == 'google'
34
+ Provides-Extra: openai
35
+ Requires-Dist: openai>=1.0; extra == 'openai'
36
+ Description-Content-Type: text/markdown
37
+
38
+ # solohq-memory
39
+
40
+ Core context memory library for AI agents. Persistent context graph with episodes, artifacts, relationships, and staleness tracking.
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ pip install solohq-memory
46
+
47
+ # With LLM provider extras
48
+ pip install solohq-memory[anthropic]
49
+ pip install solohq-memory[openai]
50
+ pip install solohq-memory[google]
51
+ pip install solohq-memory[all]
52
+ ```
53
+
54
+ ## Quick Start
55
+
56
+ ```python
57
+ from solohq_memory import (
58
+ ContextMemoryManager,
59
+ SqliteVecStorage,
60
+ )
61
+
62
+ # You need to provide LLM and embedding implementations
63
+ # that satisfy LLMProvider and EmbeddingProvider protocols
64
+
65
+ storage = SqliteVecStorage("memory.db", dimensions=1536)
66
+ manager = ContextMemoryManager(
67
+ storage=storage,
68
+ llm=your_llm_provider,
69
+ embedder=your_embedding_provider,
70
+ )
71
+
72
+ # Classify a message to find or create a context
73
+ result = await manager.classify("Tell me about Python decorators")
74
+ context = result.context
75
+
76
+ # Record an episode
77
+ await manager.record_episode(
78
+ context_id=context.id,
79
+ user_message="Tell me about Python decorators",
80
+ assistant_message="Decorators are a way to modify functions...",
81
+ )
82
+
83
+ # Search across all contexts
84
+ results = await manager.search("decorators", limit=5)
85
+ ```
86
+
87
+ ## Key Components
88
+
89
+ - **ContextMemoryManager** — main orchestrator for context operations
90
+ - **SqliteVecStorage** — SQLite + sqlite-vec storage with vector search
91
+ - **LLMClassifier** — 3-tier context routing: stable shortcut, embedding similarity, LLM fallback
92
+ - **ArtifactManager** — versioned content objects with rollback support
93
+ - **EpisodeRecorder** — conversation recording
94
+ - **BoundaryDetector** — topic shift detection (heuristics + LLM)
95
+ - **FileIndexer** — file indexing (on-disk references) and upload (full content)
96
+
97
+ ## Interfaces
98
+
99
+ All providers are protocol-based and pluggable:
100
+
101
+ - `StorageProvider` — CRUD + vector search
102
+ - `LLMProvider` — `complete()` and `complete_json()`
103
+ - `EmbeddingProvider` — `embed()`, `embed_batch()`, `dimensions`
104
+ - `BlockStorageProvider` — session block persistence
105
+
106
+ ## License
107
+
108
+ MIT
@@ -0,0 +1,71 @@
1
+ # solohq-memory
2
+
3
+ Core context memory library for AI agents. Persistent context graph with episodes, artifacts, relationships, and staleness tracking.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install solohq-memory
9
+
10
+ # With LLM provider extras
11
+ pip install solohq-memory[anthropic]
12
+ pip install solohq-memory[openai]
13
+ pip install solohq-memory[google]
14
+ pip install solohq-memory[all]
15
+ ```
16
+
17
+ ## Quick Start
18
+
19
+ ```python
20
+ from solohq_memory import (
21
+ ContextMemoryManager,
22
+ SqliteVecStorage,
23
+ )
24
+
25
+ # You need to provide LLM and embedding implementations
26
+ # that satisfy LLMProvider and EmbeddingProvider protocols
27
+
28
+ storage = SqliteVecStorage("memory.db", dimensions=1536)
29
+ manager = ContextMemoryManager(
30
+ storage=storage,
31
+ llm=your_llm_provider,
32
+ embedder=your_embedding_provider,
33
+ )
34
+
35
+ # Classify a message to find or create a context
36
+ result = await manager.classify("Tell me about Python decorators")
37
+ context = result.context
38
+
39
+ # Record an episode
40
+ await manager.record_episode(
41
+ context_id=context.id,
42
+ user_message="Tell me about Python decorators",
43
+ assistant_message="Decorators are a way to modify functions...",
44
+ )
45
+
46
+ # Search across all contexts
47
+ results = await manager.search("decorators", limit=5)
48
+ ```
49
+
50
+ ## Key Components
51
+
52
+ - **ContextMemoryManager** — main orchestrator for context operations
53
+ - **SqliteVecStorage** — SQLite + sqlite-vec storage with vector search
54
+ - **LLMClassifier** — 3-tier context routing: stable shortcut, embedding similarity, LLM fallback
55
+ - **ArtifactManager** — versioned content objects with rollback support
56
+ - **EpisodeRecorder** — conversation recording
57
+ - **BoundaryDetector** — topic shift detection (heuristics + LLM)
58
+ - **FileIndexer** — file indexing (on-disk references) and upload (full content)
59
+
60
+ ## Interfaces
61
+
62
+ All providers are protocol-based and pluggable:
63
+
64
+ - `StorageProvider` — CRUD + vector search
65
+ - `LLMProvider` — `complete()` and `complete_json()`
66
+ - `EmbeddingProvider` — `embed()`, `embed_batch()`, `dimensions`
67
+ - `BlockStorageProvider` — session block persistence
68
+
69
+ ## License
70
+
71
+ MIT
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "solohq-memory"
7
+ version = "0.1.0"
8
+ description = "Context memory system for AI agents — persistent context graph with episodes, artifacts, relationships, and staleness tracking"
9
+ requires-python = ">=3.11"
10
+ license = "MIT"
11
+ authors = [
12
+ {name = "SoloHQ"},
13
+ ]
14
+ readme = "README.md"
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
23
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
24
+ "Typing :: Typed",
25
+ ]
26
+ dependencies = [
27
+ "pydantic>=2.0",
28
+ "sqlite-vec>=0.1.6",
29
+ "apsw>=3.46",
30
+ "python-ulid>=3.0",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ openai = ["openai>=1.0"]
35
+ anthropic = ["anthropic>=0.40"]
36
+ google = ["google-genai>=1.0"]
37
+ all = ["solohq-memory[openai,anthropic,google]"]
38
+ dev = ["pytest>=8.0", "pytest-asyncio>=0.24"]
39
+
40
+ [project.urls]
41
+ Repository = "https://github.com/whaleventure13/solohq-agent"
@@ -0,0 +1,90 @@
1
+ from .indexing import FileIndexer
2
+ from .interfaces import BlockStorageProvider, EmbeddingProvider, LLMProvider, StorageProvider
3
+ from .memory.artifact_manager import ArtifactManager, CascadeResult, VersionDiff
4
+ from .memory.block_contradiction import BlockConflict, BlockConflictResult, BlockContradictionDetector
5
+ from .memory.block_summarizer import BlockSummarizer
6
+ from .memory.boundary_detector import BoundaryDetector
7
+ from .memory.classifier import ClassificationResult, Confidence, LLMClassifier
8
+ from .memory.context_graph import ContextGraphManager
9
+ from .memory.contradiction import (
10
+ Contradiction,
11
+ ContradictionDetector,
12
+ ContradictionResult,
13
+ Severity,
14
+ )
15
+ from .memory.episode_recorder import EpisodeRecorder
16
+ from .memory.manager import ContextLoad, ContextMemoryManager, SearchResult
17
+ from .memory.staleness import StalenessTracker, StalenessType
18
+ from .models import (
19
+ Artifact,
20
+ ArtifactContext,
21
+ ArtifactFreshness,
22
+ ArtifactVersion,
23
+ SourceMeta,
24
+ Context,
25
+ ContextRelationship,
26
+ ContextState,
27
+ ContextStatus,
28
+ Episode,
29
+ Message,
30
+ RelationType,
31
+ Role,
32
+ SessionBlock,
33
+ SignalType,
34
+ )
35
+ from .storage.sqlite import SqliteVecStorage
36
+ from .utils.cache import EmbeddingCache
37
+
38
+ __all__ = [
39
+ # Core manager
40
+ "ContextMemoryManager",
41
+ "ContextLoad",
42
+ "SearchResult",
43
+ # Memory components
44
+ "ArtifactManager",
45
+ "BlockConflict",
46
+ "BlockConflictResult",
47
+ "BlockContradictionDetector",
48
+ "BlockSummarizer",
49
+ "BoundaryDetector",
50
+ "CascadeResult",
51
+ "ClassificationResult",
52
+ "Confidence",
53
+ "ContextGraphManager",
54
+ "ContradictionDetector",
55
+ "Contradiction",
56
+ "ContradictionResult",
57
+ "EpisodeRecorder",
58
+ "LLMClassifier",
59
+ "Severity",
60
+ "StalenessTracker",
61
+ "StalenessType",
62
+ "VersionDiff",
63
+ # Models
64
+ "Artifact",
65
+ "ArtifactContext",
66
+ "ArtifactFreshness",
67
+ "ArtifactVersion",
68
+ "SourceMeta",
69
+ "Context",
70
+ "ContextRelationship",
71
+ "ContextState",
72
+ "ContextStatus",
73
+ "Episode",
74
+ "Message",
75
+ "RelationType",
76
+ "Role",
77
+ "SessionBlock",
78
+ "SignalType",
79
+ # Interfaces
80
+ "BlockStorageProvider",
81
+ "EmbeddingProvider",
82
+ "LLMProvider",
83
+ "StorageProvider",
84
+ # Indexing
85
+ "FileIndexer",
86
+ # Storage
87
+ "SqliteVecStorage",
88
+ # Utils
89
+ "EmbeddingCache",
90
+ ]
@@ -0,0 +1,12 @@
1
+ from .file_indexer import FileIndexer
2
+ from .processors import BinaryProcessor, CodeProcessor, FileProcessor, TextProcessor
3
+ from .registry import get_processor
4
+
5
+ __all__ = [
6
+ "FileIndexer",
7
+ "FileProcessor",
8
+ "CodeProcessor",
9
+ "TextProcessor",
10
+ "BinaryProcessor",
11
+ "get_processor",
12
+ ]
@@ -0,0 +1,246 @@
1
+ from __future__ import annotations
2
+
3
+ import mimetypes
4
+ import os
5
+ from datetime import datetime, timezone
6
+ from pathlib import Path
7
+
8
+ from solohq_memory.interfaces import EmbeddingProvider, LLMProvider, StorageProvider
9
+ from solohq_memory.models import Artifact, ArtifactContext, ArtifactVersion
10
+ from solohq_memory.models.artifact import SourceMeta
11
+
12
+ from .registry import get_processor
13
+
14
+ DEFAULT_IGNORE = {".git", "node_modules", "__pycache__", ".venv", ".env", ".tox", ".mypy_cache"}
15
+
16
+
17
class FileIndexer:
    """Core engine for indexing and uploading files as artifacts.

    Two persistence modes:

    - ``index_file`` / ``index_directory`` — store an LLM summary plus an
      embedding; the file content itself stays on disk and is read on demand.
    - ``upload_file`` — store the full extracted text inside the artifact.
    """

    def __init__(
        self,
        storage: StorageProvider,
        embedder: EmbeddingProvider,
        llm: LLMProvider,
    ) -> None:
        self._storage = storage
        self._embedder = embedder
        self._llm = llm

    @staticmethod
    def _attach_context(artifact: Artifact, context_id: str) -> bool:
        """Attach a primary context link to *artifact* unless one already exists.

        Returns True when an association was added (caller should persist).
        """
        if any(c.context_id == context_id for c in artifact.contexts):
            return False
        artifact.contexts.append(
            ArtifactContext(context_id=context_id, role="primary", relevance=1.0)
        )
        return True

    async def index_file(
        self, path: str, context_id: str | None = None,
    ) -> Artifact:
        """Index a single file: summary + embedding + save.

        Content stays on disk, read on-demand.

        Raises:
            FileNotFoundError: if *path* does not point at a regular file.
        """
        abs_path = os.path.abspath(os.path.expanduser(path))
        if not os.path.isfile(abs_path):
            raise FileNotFoundError(f"File not found: {abs_path}")

        # Check if already indexed and unchanged (mtime match): skip the
        # expensive re-processing. Bug fix: a context_id passed for an
        # already-indexed file was previously dropped on this path — attach
        # it (idempotently) and persist only when something actually changed.
        existing = await self._storage.find_by_source_path(abs_path)
        if existing and existing.source_meta:
            current_mtime = os.path.getmtime(abs_path)
            if existing.source_meta.mtime == current_mtime:
                if context_id and self._attach_context(existing, context_id):
                    await self._storage.save_artifact(existing)
                return existing

        # Determine processor from MIME type; unknown types fall back to a
        # generic binary processor.
        mime_type = mimetypes.guess_type(abs_path)[0] or "application/octet-stream"
        processor = get_processor(mime_type, abs_path)

        # Extract text for embedding; empty/binary content gets no embedding.
        text = processor.extract_text(abs_path)
        embedding = await self._embedder.embed(text) if text else []

        # LLM summary becomes the artifact's stored content.
        summary = await processor.extract_summary(abs_path, self._llm)

        # Snapshot source metadata for change detection on later runs.
        stat = os.stat(abs_path)
        source_meta = SourceMeta(
            source_path=abs_path,
            mime_type=mime_type,
            size_bytes=stat.st_size,
            mtime=stat.st_mtime,
            processor=type(processor).__name__,
        )

        filename = os.path.basename(abs_path)

        if existing:
            # File changed on disk — refresh the existing artifact in place.
            existing.content = summary
            existing.embedding = embedding
            existing.source_meta = source_meta
            existing.updated_at = datetime.now(timezone.utc)
            # Bug fix: honour a context_id passed for an existing artifact
            # (previously computed but never applied on this branch).
            if context_id:
                self._attach_context(existing, context_id)
            await self._storage.save_artifact(existing)
            return existing

        contexts = []
        if context_id:
            contexts.append(ArtifactContext(context_id=context_id, role="primary", relevance=1.0))

        artifact = Artifact(
            type="indexed_file",
            title=filename,
            content=summary,
            embedding=embedding,
            source_path=abs_path,
            source_meta=source_meta,
            contexts=contexts,
            current_version=1,
            versions=[ArtifactVersion(version=1, content=summary, change_reason="indexed")],
        )
        await self._storage.save_artifact(artifact)
        return artifact

    async def index_directory(
        self,
        path: str,
        context_id: str | None = None,
        patterns: list[str] | None = None,
        ignore_patterns: list[str] | None = None,
        max_files: int = 100,
    ) -> list[Artifact]:
        """Recursively index files in a directory.

        Args:
            path: Directory to walk (``~`` is expanded).
            context_id: Optional context to associate with each artifact.
            patterns: Glob patterns (``rglob`` style); all files when omitted.
            ignore_patterns: Extra directory names to skip, merged with
                ``DEFAULT_IGNORE``.
            max_files: Hard cap on the number of files indexed.

        Raises:
            NotADirectoryError: if *path* is not an existing directory.
        """
        abs_path = os.path.abspath(os.path.expanduser(path))
        if not os.path.isdir(abs_path):
            raise NotADirectoryError(f"Directory not found: {abs_path}")

        ignore = DEFAULT_IGNORE | set(ignore_patterns or [])
        root = Path(abs_path)

        # Collect candidate files; overlapping patterns may yield duplicates,
        # which are removed below.
        files: list[Path] = []
        if patterns:
            for pattern in patterns:
                for match in root.rglob(pattern):
                    if match.is_file() and not _should_ignore(match, root, ignore):
                        files.append(match)
        else:
            for match in root.rglob("*"):
                if match.is_file() and not _should_ignore(match, root, ignore):
                    files.append(match)

        # Deduplicate by resolved path (preserving first-seen order), then cap.
        seen: set[str] = set()
        unique_files: list[Path] = []
        for f in files:
            resolved = str(f.resolve())
            if resolved not in seen:
                seen.add(resolved)
                unique_files.append(f)

        unique_files = unique_files[:max_files]

        results = []
        for file_path in unique_files:
            artifact = await self.index_file(str(file_path), context_id)
            results.append(artifact)

        return results

    async def refresh_file(self, artifact: Artifact) -> Artifact | None:
        """Re-index if the file has changed (mtime check).

        Returns the updated artifact if changed, None if unchanged,
        missing from disk, or lacking source metadata.
        """
        if not artifact.source_meta or not artifact.source_path:
            return None

        if not os.path.isfile(artifact.source_path):
            return None

        current_mtime = os.path.getmtime(artifact.source_path)
        if artifact.source_meta.mtime == current_mtime:
            return None

        # File changed — re-run extraction, embedding, and summary.
        mime_type = artifact.source_meta.mime_type
        processor = get_processor(mime_type, artifact.source_path)

        text = processor.extract_text(artifact.source_path)
        embedding = await self._embedder.embed(text) if text else []
        summary = await processor.extract_summary(artifact.source_path, self._llm)

        stat = os.stat(artifact.source_path)
        artifact.source_meta.mtime = stat.st_mtime
        artifact.source_meta.size_bytes = stat.st_size
        artifact.source_meta.indexed_at = datetime.now(timezone.utc)
        artifact.content = summary
        artifact.embedding = embedding
        artifact.updated_at = datetime.now(timezone.utc)
        await self._storage.save_artifact(artifact)
        return artifact

    async def read_file_content(self, artifact: Artifact, max_chars: int = 50000) -> str:
        """Read current file content from disk.

        Uploaded artifacts already hold their full text, so it is returned
        directly; indexed artifacts are re-read from their source path.
        Missing files are reported as an error string rather than raising,
        so callers can surface the message as-is.
        """
        if artifact.type == "uploaded_file":
            return artifact.content

        if not artifact.source_path:
            return "Error: No source path for this artifact."

        if not os.path.isfile(artifact.source_path):
            return f"Error: File not found: {artifact.source_path}"

        mime_type = "application/octet-stream"
        if artifact.source_meta:
            mime_type = artifact.source_meta.mime_type

        processor = get_processor(mime_type, artifact.source_path)
        return processor.extract_text(artifact.source_path, max_chars=max_chars)

    async def upload_file(
        self, path: str, context_id: str | None = None,
    ) -> Artifact:
        """Upload a file — full content saved in the artifact.

        Raises:
            FileNotFoundError: if *path* does not point at a regular file.
        """
        abs_path = os.path.abspath(os.path.expanduser(path))
        if not os.path.isfile(abs_path):
            raise FileNotFoundError(f"File not found: {abs_path}")

        mime_type = mimetypes.guess_type(abs_path)[0] or "application/octet-stream"
        processor = get_processor(mime_type, abs_path)

        # Full text content stored in the artifact (capped at 100k chars).
        full_text = processor.extract_text(abs_path, max_chars=100000)
        embedding = await self._embedder.embed(full_text) if full_text else []

        stat = os.stat(abs_path)
        source_meta = SourceMeta(
            source_path=abs_path,
            mime_type=mime_type,
            size_bytes=stat.st_size,
            mtime=stat.st_mtime,
            processor=type(processor).__name__,
        )

        filename = os.path.basename(abs_path)

        contexts = []
        if context_id:
            contexts.append(ArtifactContext(context_id=context_id, role="primary", relevance=1.0))

        artifact = Artifact(
            type="uploaded_file",
            title=filename,
            content=full_text,
            embedding=embedding,
            source_path=abs_path,
            source_meta=source_meta,
            contexts=contexts,
            current_version=1,
            versions=[ArtifactVersion(version=1, content=full_text, change_reason="uploaded")],
        )
        await self._storage.save_artifact(artifact)
        return artifact
238
+
239
+
240
+ def _should_ignore(path: Path, root: Path, ignore: set[str]) -> bool:
241
+ """Check if a file should be ignored based on directory names."""
242
+ rel = path.relative_to(root)
243
+ for part in rel.parts:
244
+ if part in ignore:
245
+ return True
246
+ return False