memory_access-0.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memory_access/__init__.py +1 -0
- memory_access/cli.py +131 -0
- memory_access/crawl.py +84 -0
- memory_access/embeddings.py +96 -0
- memory_access/ingest.py +228 -0
- memory_access/models.py +86 -0
- memory_access/normalizer.py +181 -0
- memory_access/server.py +404 -0
- memory_access/storage.py +899 -0
- memory_access-0.1.3.dist-info/METADATA +352 -0
- memory_access-0.1.3.dist-info/RECORD +14 -0
- memory_access-0.1.3.dist-info/WHEEL +4 -0
- memory_access-0.1.3.dist-info/entry_points.txt +2 -0
- memory_access-0.1.3.dist-info/licenses/LICENSE +54 -0
memory_access/__init__.py
ADDED
@@ -0,0 +1 @@
"""Semantic Memory MCP Server — intent-based memory storage and retrieval for AI agents."""
memory_access/cli.py
ADDED
@@ -0,0 +1,131 @@
from __future__ import annotations

import argparse
import asyncio
import sys


def main():
    """Entry point for memory-access CLI."""
    if len(sys.argv) > 1 and sys.argv[1] == "kb":
        return _run_kb_cli()

    # Default: run MCP server
    from .server import main as server_main
    server_main()


def _run_kb_cli():
    parser = argparse.ArgumentParser(prog="memory-access kb")
    sub = parser.add_subparsers(dest="command", required=True)

    # new
    new_p = sub.add_parser("new", help="Create a new knowledge base")
    new_p.add_argument("name", help="Knowledge base name (slug)")
    new_p.add_argument("--crawl", help="URL to crawl")
    new_p.add_argument("--scrape", help="Single URL to scrape")
    new_p.add_argument("--from-dir", dest="from_dir", help="Directory of Firecrawl JSON files")
    new_p.add_argument("--limit", type=int, default=1000, help="Max pages to crawl")
    new_p.add_argument("--description", default="", help="KB description")

    # list
    sub.add_parser("list", help="List knowledge bases")

    # delete
    del_p = sub.add_parser("delete", help="Delete a knowledge base")
    del_p.add_argument("name", help="Knowledge base name")

    # refresh
    ref_p = sub.add_parser("refresh", help="Re-crawl and refresh a knowledge base")
    ref_p.add_argument("name", help="Knowledge base name")
    ref_p.add_argument("--limit", type=int, default=1000, help="Max pages to crawl")

    args = parser.parse_args(sys.argv[2:])
    asyncio.run(_dispatch(args))


async def _dispatch(args):
    from .server import create_app
    from .ingest import Ingestor

    app = await create_app()

    # Only create crawl service when needed (requires firecrawl dependency + API key)
    crawl_service = None
    needs_crawl = args.command == "new" and (getattr(args, "crawl", None) or getattr(args, "scrape", None))
    needs_crawl = needs_crawl or args.command == "refresh"
    if needs_crawl:
        from .crawl import create_crawl_service
        crawl_service = create_crawl_service()

    ingestor = Ingestor(
        store=app.store,
        normalizer=app.normalizer,
        embeddings=app.embeddings,
        crawl_service=crawl_service,
    )

    if args.command == "new":
        await _cmd_new(app, ingestor, args)
    elif args.command == "list":
        await _cmd_list(app)
    elif args.command == "delete":
        await _cmd_delete(app, args)
    elif args.command == "refresh":
        await _cmd_refresh(app, ingestor, args)


async def _cmd_new(app, ingestor, args):
    source_type = "crawl" if args.crawl else "scrape" if args.scrape else "file" if args.from_dir else "text"
    kb_id = await app.store.create_kb(args.name, description=args.description, source_type=source_type)
    print(f"Created knowledge base '{args.name}' ({kb_id})", file=sys.stderr)

    def on_progress(current, total, url):
        print(f" [{current}/{total}] {url}", file=sys.stderr)

    if args.crawl:
        count = await ingestor.ingest_crawl(kb_id, args.crawl, limit=args.limit, on_progress=on_progress)
        print(f"Ingested {count} chunks from {args.crawl}", file=sys.stderr)
    elif args.scrape:
        count = await ingestor.ingest_scrape(kb_id, args.scrape)
        print(f"Ingested {count} chunks from {args.scrape}", file=sys.stderr)
    elif args.from_dir:
        count = await ingestor.ingest_from_directory(kb_id, args.from_dir, on_progress=on_progress)
        print(f"Ingested {count} chunks from {args.from_dir}", file=sys.stderr)


async def _cmd_list(app):
    kbs = await app.store.list_kbs()
    if not kbs:
        print("No knowledge bases found.", file=sys.stderr)
        return
    for kb in kbs:
        desc = f" - {kb.description}" if kb.description else ""
        print(f" {kb.name}{desc} [{kb.source_type or 'unknown'}]", file=sys.stderr)


async def _cmd_delete(app, args):
    kb = await app.store.get_kb_by_name(args.name)
    if kb is None:
        print(f"Knowledge base '{args.name}' not found.", file=sys.stderr)
        return
    await app.store.delete_kb(kb.id)
    print(f"Deleted knowledge base '{args.name}'.", file=sys.stderr)


async def _cmd_refresh(app, ingestor, args):
    kb = await app.store.get_kb_by_name(args.name)
    if kb is None:
        print(f"Knowledge base '{args.name}' not found.", file=sys.stderr)
        return
    if not kb.source_type or kb.source_type == "text":
        print(f"Cannot refresh KB '{args.name}': no crawl/scrape source.", file=sys.stderr)
        return

    deleted = await app.store.delete_kb_chunks(kb.id)
    print(f"Deleted {deleted} existing chunks.", file=sys.stderr)

    # Re-ingest — we need the original URL, but it's not stored on the KB model.
    # For now, require --crawl or --scrape on refresh too, or look for source_url from old chunks.
    # Since we don't store the source URL on the KB, print a message.
    print(f"Chunks cleared. Use 'memory-access kb new' with --crawl/--scrape to re-ingest.", file=sys.stderr)
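The same `kb` subcommands can be driven programmatically through `_dispatch` rather than the console script. A minimal sketch, assuming the package is installed and the environment provides whatever `create_app` and `create_crawl_service` expect (e.g. OPENAI_API_KEY and FIRECRAWL_API_KEY); the KB name and URL below are placeholders, not values from the package.

import asyncio
from argparse import Namespace

from memory_access.cli import _dispatch

# Equivalent to: memory-access kb new example-docs --crawl https://example.com/docs --limit 25
args = Namespace(
    command="new",
    name="example-docs",                 # placeholder KB slug
    crawl="https://example.com/docs",    # placeholder URL
    scrape=None,
    from_dir=None,
    limit=25,
    description="Example knowledge base",
)
asyncio.run(_dispatch(args))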
memory_access/crawl.py
ADDED
@@ -0,0 +1,84 @@
from __future__ import annotations

import os
from abc import ABC, abstractmethod

from .models import CrawledPage


class CrawlService(ABC):
    """Abstract crawl service. Implement for each crawl provider."""

    @abstractmethod
    async def crawl(self, url: str, limit: int = 1000) -> list[CrawledPage]:
        """Crawl a URL and return pages as markdown."""
        ...

    @abstractmethod
    async def scrape(self, url: str) -> CrawledPage:
        """Scrape a single URL and return as markdown."""
        ...


class FirecrawlService(CrawlService):
    """Crawl service using Firecrawl API."""

    def __init__(self, api_key: str | None = None):
        from firecrawl import FirecrawlApp

        self.app = FirecrawlApp(api_key=api_key or os.environ.get("FIRECRAWL_API_KEY"))

    async def crawl(self, url: str, limit: int = 1000) -> list[CrawledPage]:
        """Crawl a URL using Firecrawl. Returns markdown pages."""
        result = self.app.crawl(
            url,
            limit=limit,
            scrape_options={"formats": ["markdown"], "only_main_content": True},
        )

        pages = []
        for doc in result.data:
            # Extract URL from metadata, fallback to base URL
            page_url = url
            if doc.metadata and doc.metadata.url:
                page_url = doc.metadata.url
            elif doc.metadata and doc.metadata.source_url:
                page_url = doc.metadata.source_url

            pages.append(
                CrawledPage(
                    url=page_url,
                    markdown=doc.markdown or "",
                    metadata=doc.metadata.model_dump(exclude_none=True) if doc.metadata else {},
                )
            )
        return pages

    async def scrape(self, url: str) -> CrawledPage:
        """Scrape a single URL using Firecrawl."""
        result = self.app.scrape(
            url,
            formats=["markdown"],
            only_main_content=True,
        )

        # Extract URL from metadata, fallback to input URL
        page_url = url
        if result.metadata and result.metadata.url:
            page_url = result.metadata.url
        elif result.metadata and result.metadata.source_url:
            page_url = result.metadata.source_url

        return CrawledPage(
            url=page_url,
            markdown=result.markdown or "",
            metadata=result.metadata.model_dump(exclude_none=True) if result.metadata else {},
        )


def create_crawl_service(provider: str | None = None, **kwargs) -> CrawlService:
    """Factory to create the appropriate crawl service."""
    provider = provider or os.environ.get("CRAWL_SERVICE", "firecrawl")
    if provider == "firecrawl":
        return FirecrawlService(**kwargs)
    raise ValueError(f"Unknown crawl service: {provider}")
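The CrawlService ABC is the extension point for other providers. A sketch of a hypothetical LocalDirectoryCrawlService that "crawls" a folder of markdown files instead of calling Firecrawl; the class name and directory layout are invented for illustration, and create_crawl_service would still need to be taught about the new provider name.

from pathlib import Path

from memory_access.crawl import CrawlService
from memory_access.models import CrawledPage


class LocalDirectoryCrawlService(CrawlService):
    """Hypothetical provider: serves .md files from a local directory as CrawledPages."""

    def __init__(self, root: str):
        self.root = Path(root)

    async def crawl(self, url: str, limit: int = 1000) -> list[CrawledPage]:
        # "url" is ignored here; pages come from the configured directory.
        pages = []
        for md_file in sorted(self.root.rglob("*.md"))[:limit]:
            pages.append(CrawledPage(url=md_file.as_uri(), markdown=md_file.read_text()))
        return pages

    async def scrape(self, url: str) -> CrawledPage:
        path = Path(url.removeprefix("file://"))
        return CrawledPage(url=url, markdown=path.read_text())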
memory_access/embeddings.py
ADDED
@@ -0,0 +1,96 @@
import json
import os

import numpy as np
import openai


class EmbeddingEngine:
    """Generates normalized embeddings using OpenAI's text-embedding-3-small model."""

    def __init__(self, model: str = "text-embedding-3-small"):
        self._client: openai.OpenAI | None = None
        self._model = model

    @property
    def client(self) -> openai.OpenAI:
        if self._client is None:
            self._client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
        return self._client

    def embed(self, text: str) -> np.ndarray:
        response = self.client.embeddings.create(input=[text], model=self._model)
        vec = np.array(response.data[0].embedding, dtype=np.float32)
        norm = np.linalg.norm(vec)
        return vec / norm if norm > 0 else vec

    def embed_batch(self, texts: list[str]) -> np.ndarray:
        response = self.client.embeddings.create(input=texts, model=self._model)
        vecs = np.array([d.embedding for d in response.data], dtype=np.float32)
        norms = np.linalg.norm(vecs, axis=1, keepdims=True)
        norms[norms == 0] = 1
        return vecs / norms


class BedrockEmbeddingEngine:
    """Generates normalized embeddings using Amazon Titan via AWS Bedrock."""

    def __init__(
        self,
        model: str | None = None,
        aws_region: str | None = None,
        aws_profile: str | None = None,
    ):
        self._client = None
        self._model = model or os.environ.get(
            "BEDROCK_EMBEDDING_MODEL", "amazon.titan-embed-text-v2:0"
        )
        self._aws_region = aws_region or os.environ.get("AWS_REGION", "us-east-1")
        self._aws_profile = aws_profile or os.environ.get("AWS_PROFILE")

    @property
    def client(self):
        if self._client is None:
            import boto3

            session_kwargs = {"region_name": self._aws_region}
            if self._aws_profile:
                session_kwargs["profile_name"] = self._aws_profile
            session = boto3.Session(**session_kwargs)
            self._client = session.client("bedrock-runtime")
        return self._client

    def _invoke(self, text: str) -> list[float]:
        body = json.dumps({"inputText": text})
        response = self.client.invoke_model(
            modelId=self._model,
            contentType="application/json",
            accept="application/json",
            body=body,
        )
        result = json.loads(response["body"].read())
        return result["embedding"]

    def embed(self, text: str) -> np.ndarray:
        vec = np.array(self._invoke(text), dtype=np.float32)
        norm = np.linalg.norm(vec)
        return vec / norm if norm > 0 else vec

    def embed_batch(self, texts: list[str]) -> np.ndarray:
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            results = list(executor.map(self._invoke, texts))
        vecs = np.array(results, dtype=np.float32)
        norms = np.linalg.norm(vecs, axis=1, keepdims=True)
        norms[norms == 0] = 1
        return vecs / norms


def create_embedding_engine(
    provider: str | None = None, **kwargs
) -> EmbeddingEngine | BedrockEmbeddingEngine:
    """Factory to create the appropriate embedding engine based on provider."""
    provider = provider or os.environ.get("EMBEDDING_PROVIDER", "openai")
    if provider == "bedrock":
        return BedrockEmbeddingEngine(**kwargs)
    return EmbeddingEngine(**kwargs)
memory_access/ingest.py
ADDED
@@ -0,0 +1,228 @@
from __future__ import annotations

import json
import os
import logging
from pathlib import Path

from .crawl import CrawlService
from .embeddings import EmbeddingEngine, BedrockEmbeddingEngine
from .models import CrawledPage, KbChunk
from .normalizer import Normalizer
from .storage import InsightStore

logger = logging.getLogger(__name__)


def clean_markdown(text: str) -> str:
    """Strip common boilerplate from crawled markdown.

    Removes navigation headers (before first # heading) and
    feedback footers ("Did you find this page useful?" etc).
    """
    lines = text.split("\n")

    # Find first H1 heading — content starts there
    start = 0
    for i, line in enumerate(lines):
        if line.startswith("# "):
            start = i
            break

    # Find footer markers — content ends before them
    end = len(lines)
    footer_markers = [
        "Did you find this page useful",
        "Thanks for rating this page",
        "Report a problem on this page",
    ]
    for i, line in enumerate(lines[start:], start):
        if any(marker in line for marker in footer_markers):
            end = i
            break

    return "\n".join(lines[start:end]).strip()


def split_markdown(text: str, max_chars: int = 4000) -> list[str]:
    """Split markdown into chunks by ## headings, with max_chars fallback.

    Strategy:
    1. Split on ## headings — each section becomes a chunk
    2. If a section exceeds max_chars, split on paragraphs (double newline)
    3. If a paragraph still exceeds max_chars, split at max_chars boundary
    """
    if not text.strip():
        return []

    # Split on ## headings, preserving the heading with its content
    sections = []
    current = []
    for line in text.split("\n"):
        if line.startswith("## ") and current:
            sections.append("\n".join(current))
            current = [line]
        else:
            current.append(line)
    if current:
        sections.append("\n".join(current))

    # Sub-split oversized sections
    chunks = []
    for section in sections:
        if len(section) <= max_chars:
            chunks.append(section)
        else:
            # Split on paragraphs
            paragraphs = section.split("\n\n")
            current_chunk = ""
            for para in paragraphs:
                if len(current_chunk) + len(para) + 2 > max_chars:
                    if current_chunk:
                        chunks.append(current_chunk)
                    # Handle single paragraphs exceeding max_chars
                    if len(para) > max_chars:
                        for i in range(0, len(para), max_chars):
                            chunks.append(para[i:i + max_chars])
                        current_chunk = ""
                    else:
                        current_chunk = para
                else:
                    current_chunk = current_chunk + "\n\n" + para if current_chunk else para
            if current_chunk:
                chunks.append(current_chunk)

    return [c.strip() for c in chunks if c.strip()]


class Ingestor:
    """Orchestrates: crawl -> split -> normalize -> embed -> store."""

    def __init__(
        self,
        store: InsightStore,
        normalizer: Normalizer,
        embeddings: EmbeddingEngine | BedrockEmbeddingEngine,
        crawl_service: CrawlService | None = None,
    ):
        self.store = store
        self.normalizer = normalizer
        self.embeddings = embeddings
        self.crawl_service = crawl_service

    async def ingest_crawl(
        self,
        kb_id: str,
        url: str,
        limit: int = 1000,
        on_progress: callable | None = None,
    ) -> int:
        """Crawl a URL and ingest all pages into a knowledge base.

        Returns the total number of chunks stored.
        """
        pages = await self.crawl_service.crawl(url, limit=limit)
        total_chunks = 0

        for i, page in enumerate(pages):
            if on_progress:
                on_progress(i + 1, len(pages), page.url)

            chunks_stored = await self.ingest_page(kb_id, page)
            total_chunks += chunks_stored

        return total_chunks

    async def ingest_page(self, kb_id: str, page: CrawledPage) -> int:
        """Ingest a single crawled page into a knowledge base.

        Returns the number of chunks stored.
        """
        cleaned = clean_markdown(page.markdown)
        text_chunks = split_markdown(cleaned)

        # Collect all insights from all chunks
        all_insights = []
        for chunk_text in text_chunks:
            try:
                insights = await self.normalizer.normalize(chunk_text)
                all_insights.extend(insights)
            except Exception as e:
                logger.warning("Failed to normalize chunk from %s: %s", page.url, e)
                continue

        if not all_insights:
            return 0

        # Filter low-confidence insights
        min_threshold = float(os.environ.get("MIN_CONFIDENCE_THRESHOLD", "0.5"))
        filtered = [i for i in all_insights if i.confidence >= min_threshold]
        if len(all_insights) != len(filtered):
            logger.info(
                "Filtered %d/%d insights below confidence threshold %.2f",
                len(all_insights) - len(filtered), len(all_insights), min_threshold,
            )
        all_insights = filtered

        if not all_insights:
            return 0

        # Batch embed all normalized texts in single API call
        texts_to_embed = [i.normalized_text for i in all_insights]
        embeddings = self.embeddings.embed_batch(texts_to_embed)

        # Store with corresponding embeddings
        stored = 0
        for insight, emb in zip(all_insights, embeddings):
            kb_chunk = KbChunk(
                kb_id=kb_id,
                text=insight.text,
                normalized_text=insight.normalized_text,
                frame=insight.frame,
                domains=insight.domains,
                entities=insight.entities,
                problems=insight.problems,
                resolutions=insight.resolutions,
                contexts=insight.contexts,
                confidence=insight.confidence,
                source_url=page.url,
            )
            await self.store.insert_kb_chunk(kb_chunk, emb)
            stored += 1

        return stored

    async def ingest_scrape(self, kb_id: str, url: str) -> int:
        """Scrape a single URL and ingest into a knowledge base."""
        page = await self.crawl_service.scrape(url)
        return await self.ingest_page(kb_id, page)

    async def ingest_from_directory(
        self,
        kb_id: str,
        dir_path: str,
        on_progress: callable | None = None,
    ) -> int:
        """Load Firecrawl JSON files from a directory and ingest into a KB.

        Each JSON file should have {"markdown": "...", "metadata": {"sourceURL": "..."}}.
        Returns total chunks stored.
        """
        path = Path(dir_path)
        files = sorted(path.glob("*.json"))
        total_chunks = 0

        for i, f in enumerate(files):
            data = json.loads(f.read_text())
            markdown = data.get("markdown", "")
            metadata = data.get("metadata", {})
            url = metadata.get("sourceURL") or metadata.get("url", f.stem)

            if on_progress:
                on_progress(i + 1, len(files), url)

            page = CrawledPage(url=url, markdown=markdown, metadata=metadata)
            chunks_stored = await self.ingest_page(kb_id, page)
            total_chunks += chunks_stored

        return total_chunks
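clean_markdown and split_markdown are pure functions, so the cleaning and chunking strategy described in their docstrings can be exercised without any crawler, normalizer, or embedding provider configured. A small sketch with made-up page content:

from memory_access.ingest import clean_markdown, split_markdown

# Made-up crawled page: a breadcrumb line before the first "# " heading and a
# feedback footer after the content, both of which clean_markdown strips.
raw = (
    "Home > Docs > Errors\n"
    "# Error handling\n"
    "Intro paragraph.\n\n"
    "## Retries\n"
    "Use exponential backoff.\n\n"
    "## Timeouts\n"
    "Set client timeouts explicitly.\n\n"
    "Did you find this page useful?\n"
)

cleaned = clean_markdown(raw)
chunks = split_markdown(cleaned)  # one chunk per "## " section; oversized sections fall back to paragraph splits
print(len(chunks))                # -> 3
print([c.splitlines()[0] for c in chunks])  # ['# Error handling', '## Retries', '## Timeouts']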
memory_access/models.py
ADDED
@@ -0,0 +1,86 @@
from enum import Enum
from datetime import datetime
from typing import Optional

from pydantic import BaseModel, Field


class Frame(str, Enum):
    """Canonical semantic frames for normalizing insights."""

    CAUSAL = "causal"
    CONSTRAINT = "constraint"
    PATTERN = "pattern"
    EQUIVALENCE = "equivalence"
    TAXONOMY = "taxonomy"
    PROCEDURE = "procedure"


class Insight(BaseModel):
    """A single atomic insight stored in semantic memory."""

    id: Optional[str] = None
    text: str
    normalized_text: str = ""
    frame: Frame = Frame.CAUSAL
    domains: list[str] = Field(default_factory=list)
    entities: list[str] = Field(default_factory=list)
    problems: list[str] = Field(default_factory=list)
    resolutions: list[str] = Field(default_factory=list)
    contexts: list[str] = Field(default_factory=list)
    confidence: float = 1.0
    source: str = ""
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None


class GitContext(BaseModel):
    """Optional git metadata to associate with insights."""

    repo: str = ""
    pr: str = ""
    author: str = ""
    project: str = ""
    task: str = ""


class SearchResult(BaseModel):
    """An insight with its similarity score from a search query."""

    insight: Insight
    score: float


class KnowledgeBase(BaseModel):
    """A collection of document chunks from an external source."""
    id: Optional[str] = None
    name: str
    description: str = ""
    source_type: str = ""  # 'crawl', 'scrape', 'file', 'text'
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None


class KbChunk(BaseModel):
    """A normalized chunk from a knowledge base document."""
    id: Optional[str] = None
    kb_id: str
    text: str
    normalized_text: str = ""
    frame: Frame = Frame.CAUSAL
    domains: list[str] = Field(default_factory=list)
    entities: list[str] = Field(default_factory=list)
    problems: list[str] = Field(default_factory=list)
    resolutions: list[str] = Field(default_factory=list)
    contexts: list[str] = Field(default_factory=list)
    confidence: float = 1.0
    source_url: str = ""
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None


class CrawledPage(BaseModel):
    """A single page returned by a crawl service."""
    url: str
    markdown: str
    metadata: dict = Field(default_factory=dict)