business-stack 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.python-version +1 -0
- package/backend/.env.example +65 -0
- package/backend/alembic/env.py +63 -0
- package/backend/alembic/script.py.mako +26 -0
- package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
- package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
- package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
- package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
- package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
- package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
- package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
- package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
- package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
- package/backend/alembic.ini +42 -0
- package/backend/app/__init__.py +0 -0
- package/backend/app/config.py +337 -0
- package/backend/app/connectors/__init__.py +13 -0
- package/backend/app/connectors/base.py +39 -0
- package/backend/app/connectors/builtins.py +51 -0
- package/backend/app/connectors/playwright_session.py +146 -0
- package/backend/app/connectors/registry.py +68 -0
- package/backend/app/connectors/thread_expansion/__init__.py +33 -0
- package/backend/app/connectors/thread_expansion/fakes.py +154 -0
- package/backend/app/connectors/thread_expansion/models.py +113 -0
- package/backend/app/connectors/thread_expansion/reddit.py +53 -0
- package/backend/app/connectors/thread_expansion/twitter.py +49 -0
- package/backend/app/db.py +5 -0
- package/backend/app/dependencies.py +34 -0
- package/backend/app/logging_config.py +35 -0
- package/backend/app/main.py +97 -0
- package/backend/app/middleware/__init__.py +0 -0
- package/backend/app/middleware/gateway_identity.py +17 -0
- package/backend/app/middleware/openapi_gateway.py +71 -0
- package/backend/app/middleware/request_id.py +23 -0
- package/backend/app/openapi_config.py +126 -0
- package/backend/app/routers/__init__.py +0 -0
- package/backend/app/routers/admin_pipeline.py +123 -0
- package/backend/app/routers/chat.py +206 -0
- package/backend/app/routers/chunks.py +36 -0
- package/backend/app/routers/entity_extract.py +31 -0
- package/backend/app/routers/example.py +8 -0
- package/backend/app/routers/gemini_embed.py +58 -0
- package/backend/app/routers/health.py +28 -0
- package/backend/app/routers/ingestion.py +146 -0
- package/backend/app/routers/link_expansion.py +34 -0
- package/backend/app/routers/pipeline_status.py +304 -0
- package/backend/app/routers/query.py +63 -0
- package/backend/app/routers/vectors.py +63 -0
- package/backend/app/schemas/__init__.py +0 -0
- package/backend/app/schemas/canonical.py +44 -0
- package/backend/app/schemas/chat.py +50 -0
- package/backend/app/schemas/ingest.py +29 -0
- package/backend/app/schemas/query.py +153 -0
- package/backend/app/schemas/vectors.py +56 -0
- package/backend/app/services/__init__.py +0 -0
- package/backend/app/services/chat_store.py +152 -0
- package/backend/app/services/chunking/__init__.py +3 -0
- package/backend/app/services/chunking/llm_boundaries.py +63 -0
- package/backend/app/services/chunking/schemas.py +30 -0
- package/backend/app/services/chunking/semantic_chunk.py +178 -0
- package/backend/app/services/chunking/splitters.py +214 -0
- package/backend/app/services/embeddings/__init__.py +20 -0
- package/backend/app/services/embeddings/build_inputs.py +140 -0
- package/backend/app/services/embeddings/dlq.py +128 -0
- package/backend/app/services/embeddings/gemini_api.py +207 -0
- package/backend/app/services/embeddings/persist.py +74 -0
- package/backend/app/services/embeddings/types.py +32 -0
- package/backend/app/services/embeddings/worker.py +224 -0
- package/backend/app/services/entities/__init__.py +12 -0
- package/backend/app/services/entities/gliner_extract.py +63 -0
- package/backend/app/services/entities/llm_extract.py +94 -0
- package/backend/app/services/entities/pipeline.py +179 -0
- package/backend/app/services/entities/spacy_extract.py +63 -0
- package/backend/app/services/entities/types.py +15 -0
- package/backend/app/services/gemini_chat.py +113 -0
- package/backend/app/services/hooks/__init__.py +3 -0
- package/backend/app/services/hooks/post_ingest.py +186 -0
- package/backend/app/services/ingestion/__init__.py +0 -0
- package/backend/app/services/ingestion/persist.py +188 -0
- package/backend/app/services/integrations_remote.py +91 -0
- package/backend/app/services/link_expansion/__init__.py +3 -0
- package/backend/app/services/link_expansion/canonical_url.py +45 -0
- package/backend/app/services/link_expansion/domain_policy.py +26 -0
- package/backend/app/services/link_expansion/html_extract.py +72 -0
- package/backend/app/services/link_expansion/rate_limit.py +32 -0
- package/backend/app/services/link_expansion/robots.py +46 -0
- package/backend/app/services/link_expansion/schemas.py +67 -0
- package/backend/app/services/link_expansion/worker.py +458 -0
- package/backend/app/services/normalization/__init__.py +7 -0
- package/backend/app/services/normalization/normalizer.py +331 -0
- package/backend/app/services/normalization/persist_normalized.py +67 -0
- package/backend/app/services/playwright_extract/__init__.py +13 -0
- package/backend/app/services/playwright_extract/__main__.py +96 -0
- package/backend/app/services/playwright_extract/extract.py +181 -0
- package/backend/app/services/retrieval_service.py +351 -0
- package/backend/app/sqlite_ext.py +36 -0
- package/backend/app/storage/__init__.py +3 -0
- package/backend/app/storage/blobs.py +30 -0
- package/backend/app/vectorstore/__init__.py +13 -0
- package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
- package/backend/backend.egg-info/PKG-INFO +18 -0
- package/backend/backend.egg-info/SOURCES.txt +93 -0
- package/backend/backend.egg-info/dependency_links.txt +1 -0
- package/backend/backend.egg-info/entry_points.txt +2 -0
- package/backend/backend.egg-info/requires.txt +15 -0
- package/backend/backend.egg-info/top_level.txt +4 -0
- package/backend/package.json +15 -0
- package/backend/pyproject.toml +52 -0
- package/backend/tests/conftest.py +40 -0
- package/backend/tests/test_chat.py +92 -0
- package/backend/tests/test_chunking.py +132 -0
- package/backend/tests/test_entities.py +170 -0
- package/backend/tests/test_gemini_embed.py +224 -0
- package/backend/tests/test_health.py +24 -0
- package/backend/tests/test_ingest_raw.py +123 -0
- package/backend/tests/test_link_expansion.py +241 -0
- package/backend/tests/test_main.py +12 -0
- package/backend/tests/test_normalizer.py +114 -0
- package/backend/tests/test_openapi_gateway.py +40 -0
- package/backend/tests/test_pipeline_hardening.py +285 -0
- package/backend/tests/test_pipeline_status.py +71 -0
- package/backend/tests/test_playwright_extract.py +80 -0
- package/backend/tests/test_post_ingest_hooks.py +162 -0
- package/backend/tests/test_query.py +165 -0
- package/backend/tests/test_thread_expansion.py +72 -0
- package/backend/tests/test_vectors.py +85 -0
- package/backend/uv.lock +1839 -0
- package/bin/business-stack.cjs +412 -0
- package/frontend/web/.env.example +23 -0
- package/frontend/web/AGENTS.md +5 -0
- package/frontend/web/CLAUDE.md +1 -0
- package/frontend/web/README.md +36 -0
- package/frontend/web/components.json +25 -0
- package/frontend/web/next-env.d.ts +6 -0
- package/frontend/web/next.config.ts +30 -0
- package/frontend/web/package.json +65 -0
- package/frontend/web/postcss.config.mjs +7 -0
- package/frontend/web/skills-lock.json +35 -0
- package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
- package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
- package/frontend/web/src/app/chat/page.tsx +725 -0
- package/frontend/web/src/app/favicon.ico +0 -0
- package/frontend/web/src/app/globals.css +563 -0
- package/frontend/web/src/app/layout.tsx +50 -0
- package/frontend/web/src/app/page.tsx +96 -0
- package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
- package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
- package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
- package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
- package/frontend/web/src/components/home-auth-panel.tsx +49 -0
- package/frontend/web/src/components/providers.tsx +50 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
- package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
- package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
- package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
- package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
- package/frontend/web/src/lib/auth-client.ts +23 -0
- package/frontend/web/src/lib/integrations-config.ts +125 -0
- package/frontend/web/src/lib/ui-utills.tsx +90 -0
- package/frontend/web/src/lib/utils.ts +6 -0
- package/frontend/web/tsconfig.json +36 -0
- package/frontend/web/tsconfig.tsbuildinfo +1 -0
- package/frontend/web/vitest.config.ts +14 -0
- package/gateway/.env.example +23 -0
- package/gateway/README.md +13 -0
- package/gateway/package.json +24 -0
- package/gateway/src/auth.ts +49 -0
- package/gateway/src/index.ts +141 -0
- package/gateway/src/integrations/admin.ts +19 -0
- package/gateway/src/integrations/crypto.ts +52 -0
- package/gateway/src/integrations/handlers.ts +124 -0
- package/gateway/src/integrations/keys.ts +12 -0
- package/gateway/src/integrations/store.ts +106 -0
- package/gateway/src/stack-secrets.ts +35 -0
- package/gateway/tsconfig.json +13 -0
- package/package.json +33 -0
- package/turbo.json +27 -0
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import math
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import UTC, datetime
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from sqlalchemy import text
|
|
11
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
12
|
+
|
|
13
|
+
from app.config import Settings
|
|
14
|
+
from app.services.embeddings.build_inputs import load_blocks_span
|
|
15
|
+
from app.services.embeddings.gemini_api import batch_embed_contents
|
|
16
|
+
from app.services.embeddings.types import TextPart
|
|
17
|
+
from app.services.integrations_remote import resolve_gemini_api_key
|
|
18
|
+
from app.vectorstore import SqliteVecStore, VectorSearchFilters, VectorSearchResult
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _parse_source_weights(raw: str) -> dict[str, float]:
|
|
24
|
+
try:
|
|
25
|
+
data = json.loads(raw)
|
|
26
|
+
except json.JSONDecodeError:
|
|
27
|
+
logger.warning("invalid RETRIEVAL_SOURCE_WEIGHTS_JSON; using default")
|
|
28
|
+
return {"default": 1.0}
|
|
29
|
+
if not isinstance(data, dict):
|
|
30
|
+
return {"default": 1.0}
|
|
31
|
+
out: dict[str, float] = {}
|
|
32
|
+
for k, v in data.items():
|
|
33
|
+
try:
|
|
34
|
+
out[str(k)] = float(v)
|
|
35
|
+
except (TypeError, ValueError):
|
|
36
|
+
continue
|
|
37
|
+
if "default" not in out:
|
|
38
|
+
out["default"] = 1.0
|
|
39
|
+
return out
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _parse_iso_utc(s: str) -> datetime | None:
|
|
43
|
+
try:
|
|
44
|
+
norm = s.replace("Z", "+00:00")
|
|
45
|
+
dt = datetime.fromisoformat(norm)
|
|
46
|
+
if dt.tzinfo is None:
|
|
47
|
+
dt = dt.replace(tzinfo=UTC)
|
|
48
|
+
return dt.astimezone(UTC)
|
|
49
|
+
except (TypeError, ValueError):
|
|
50
|
+
return None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def semantic_score_from_distance(distance: float) -> float:
    """Map sqlite-vec distance to [0, 1]; lower distance → higher score.

    Negative or NaN distances (which should not occur) score 0.0.
    """
    invalid = math.isnan(distance) or distance < 0
    return 0.0 if invalid else 1.0 / (1.0 + float(distance))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def recency_score_from_ingested_at(
|
|
61
|
+
ingested_at: str,
|
|
62
|
+
*,
|
|
63
|
+
now: datetime,
|
|
64
|
+
half_life_days: float,
|
|
65
|
+
) -> float:
|
|
66
|
+
"""
|
|
67
|
+
Exponential half-life decay in [0, 1]: score = 0.5 ** (age_days / half_life).
|
|
68
|
+
"""
|
|
69
|
+
if half_life_days <= 0:
|
|
70
|
+
return 1.0
|
|
71
|
+
parsed = _parse_iso_utc(ingested_at)
|
|
72
|
+
if parsed is None:
|
|
73
|
+
return 0.5
|
|
74
|
+
age = now - parsed
|
|
75
|
+
age_days = max(0.0, age.total_seconds() / 86_400.0)
|
|
76
|
+
return float(0.5 ** (age_days / half_life_days))
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def source_weight_for_connector(
    connector_type: str,
    weights: dict[str, float],
) -> float:
    """Look up the weight for a connector type, clamped to [0, 1].

    Falls back to the ``"default"`` entry when the connector type has no
    explicit weight, and to 1.0 when no default is present either
    (``_parse_source_weights`` normally guarantees one, but callers may pass
    arbitrary maps — avoid a KeyError in that case).
    """
    weight = weights.get(connector_type)
    if weight is None:
        weight = weights.get("default", 1.0)
    return max(0.0, min(1.0, float(weight)))
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def blended_retrieval_score(
    *,
    semantic: float,
    recency: float,
    source_w: float,
    settings: Settings,
) -> float:
    """Blend the three component scores using the weights from settings."""
    weighted_components = (
        (settings.retrieval_score_semantic_weight, semantic),
        (settings.retrieval_score_recency_weight, recency),
        (settings.retrieval_score_source_weight, source_w),
    )
    return sum(weight * value for weight, value in weighted_components)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@dataclass(frozen=True, slots=True)
class _HydratedChunk:
    """A chunk row joined with its parent document and source rows.

    Produced by ``_hydrate_chunks``; field groups mirror the SELECT there.
    """

    # document_chunks columns
    chunk_pk: int  # document_chunks.id (primary key)
    document_id: str
    ordinal: int  # chunk position within the document
    chunk_text: str
    start_block_ordinal: int  # first source block covered by this chunk
    end_block_ordinal: int  # last source block covered by this chunk
    chunk_meta: str | None  # raw meta column; falsy values normalized to None
    # documents columns
    doc_summary: str | None  # falsy values normalized to None
    doc_content_type: str
    doc_timestamp: str
    doc_status: str
    # sources columns
    source_id: int
    source_name: str
    connector_type: str
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
async def _hydrate_chunks(
|
|
122
|
+
session: AsyncSession,
|
|
123
|
+
chunk_pks: list[int],
|
|
124
|
+
) -> dict[int, _HydratedChunk]:
|
|
125
|
+
if not chunk_pks:
|
|
126
|
+
return {}
|
|
127
|
+
placeholders = ", ".join(f":c{i}" for i in range(len(chunk_pks)))
|
|
128
|
+
params = {f"c{i}": pk for i, pk in enumerate(chunk_pks)}
|
|
129
|
+
r = await session.execute(
|
|
130
|
+
text(
|
|
131
|
+
f"""
|
|
132
|
+
SELECT
|
|
133
|
+
dc.id,
|
|
134
|
+
dc.document_id,
|
|
135
|
+
dc.ordinal,
|
|
136
|
+
dc.text,
|
|
137
|
+
dc.start_block_ordinal,
|
|
138
|
+
dc.end_block_ordinal,
|
|
139
|
+
dc.meta,
|
|
140
|
+
d.summary,
|
|
141
|
+
d.content_type,
|
|
142
|
+
d.timestamp,
|
|
143
|
+
d.status,
|
|
144
|
+
s.id,
|
|
145
|
+
s.name,
|
|
146
|
+
s.connector_type
|
|
147
|
+
FROM document_chunks dc
|
|
148
|
+
JOIN documents d ON d.id = dc.document_id
|
|
149
|
+
JOIN sources s ON s.id = d.source_id
|
|
150
|
+
WHERE dc.id IN ({placeholders})
|
|
151
|
+
""",
|
|
152
|
+
),
|
|
153
|
+
params,
|
|
154
|
+
)
|
|
155
|
+
out: dict[int, _HydratedChunk] = {}
|
|
156
|
+
for row in r.fetchall():
|
|
157
|
+
pk = int(row[0])
|
|
158
|
+
out[pk] = _HydratedChunk(
|
|
159
|
+
chunk_pk=pk,
|
|
160
|
+
document_id=str(row[1]),
|
|
161
|
+
ordinal=int(row[2]),
|
|
162
|
+
chunk_text=str(row[3]),
|
|
163
|
+
start_block_ordinal=int(row[4]),
|
|
164
|
+
end_block_ordinal=int(row[5]),
|
|
165
|
+
chunk_meta=row[6] if row[6] else None,
|
|
166
|
+
doc_summary=row[7] if row[7] else None,
|
|
167
|
+
doc_content_type=str(row[8]),
|
|
168
|
+
doc_timestamp=str(row[9]),
|
|
169
|
+
doc_status=str(row[10]),
|
|
170
|
+
source_id=int(row[11]),
|
|
171
|
+
source_name=str(row[12]),
|
|
172
|
+
connector_type=str(row[13]),
|
|
173
|
+
)
|
|
174
|
+
return out
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
async def run_retrieval(
    *,
    session: AsyncSession,
    store: SqliteVecStore,
    settings: Settings,
    query: str,
    k: int,
    filters: VectorSearchFilters | None,
    now: datetime | None = None,
) -> dict[str, Any]:
    """
    Embed query (Gemini), vector search with filters, score, hydrate SQLite rows,
    return ranked candidates + aggregated multimodal context for a generator.

    Args:
        session: DB session used to hydrate chunk/document/source rows.
        store: sqlite-vec store used for the KNN search.
        settings: App settings (embedding model, score weights, multipliers).
        query: Natural-language query; must be non-empty after stripping.
        k: Number of ranked candidates to return.
        filters: Optional metadata filters pushed into the vector search.
        now: Override for "current time" in recency scoring (testing hook);
            defaults to ``datetime.now(UTC)``.

    Returns:
        Dict with ``candidates`` (ranked, with per-component scores and
        attribution), ``context`` (combined text, per-chunk sections, media
        block references), ``embedding_model``, and
        ``vector_candidates_considered``.

    Raises:
        ValueError: if the query is empty after stripping.
        RuntimeError: if no Gemini API key is configured or embedding fails.
    """
    q = query.strip()
    if not q:
        msg = "query must be non-empty"
        raise ValueError(msg)
    api_key = await resolve_gemini_api_key(settings)
    if not api_key:
        msg = "Gemini API key not configured (GEMINI_API_KEY or gateway store)"
        raise RuntimeError(msg)

    now_utc = (now or datetime.now(UTC)).astimezone(UTC)
    source_weights = _parse_source_weights(settings.retrieval_source_weights_json)

    # Embed the query as a single text part, using the query-specific task type.
    vecs = await batch_embed_contents(
        api_key=api_key,
        model=settings.gemini_embedding_model,
        contents=[[TextPart(q)]],
        settings=settings,
        task_type=settings.gemini_query_task_type,
    )
    if not vecs:
        msg = "query embedding failed"
        raise RuntimeError(msg)
    query_vector = vecs[0]

    # Over-fetch (k * multiplier, capped at 500) so the blended re-scoring
    # below works on a wider candidate pool than the final top-k.
    mult = max(1, settings.retrieval_vec_candidate_multiplier)
    vec_k = min(500, max(k, k * mult))
    raw_hits = await store.search(query_vector, vec_k, filters)

    chunk_ids = [h.chunk_id for h in raw_hits]
    hydrated = await _hydrate_chunks(session, chunk_ids)

    # Tuples of (total, hit, semantic, recency, source_weight, row-or-None).
    rescored: list[
        tuple[float, VectorSearchResult, float, float, float, _HydratedChunk | None]
    ] = []
    for h in raw_hits:
        sem = semantic_score_from_distance(h.distance)
        rec = recency_score_from_ingested_at(
            h.ingested_at,
            now=now_utc,
            half_life_days=settings.retrieval_recency_half_life_days,
        )
        # Hydration can miss a hit (e.g. the SQL row disappeared since
        # indexing); fall back to the default source weight in that case.
        row = hydrated.get(h.chunk_id)
        src_w = (
            source_weight_for_connector(row.connector_type, source_weights)
            if row
            else source_weights["default"]
        )
        total = blended_retrieval_score(
            semantic=sem,
            recency=rec,
            source_w=src_w,
            settings=settings,
        )
        rescored.append((total, h, sem, rec, src_w, row))

    # Rank by blended score (descending) and keep only the requested top-k.
    rescored.sort(key=lambda x: x[0], reverse=True)
    top = rescored[:k]

    candidates: list[dict[str, Any]] = []
    context_sections: list[dict[str, Any]] = []
    context_media: list[dict[str, Any]] = []

    for rank, (score, hit, sem, rec, src_w, row) in enumerate(top, start=1):
        # Attribution always carries the vector-store metadata; hydrated SQL
        # fields are merged in below when the row is available.
        att: dict[str, Any] = {
            "vector_rowid": hit.rowid,
            "distance": hit.distance,
            "document_id": hit.document_id,
            "chunk_id": hit.chunk_id,
            "source_id": hit.source_id,
            "modality": hit.modality,
            "ingested_at": hit.ingested_at,
        }
        chunk_payload: dict[str, Any] | None = None
        doc_payload: dict[str, Any] | None = None
        source_payload: dict[str, Any] | None = None

        if row:
            att.update(
                {
                    "source_name": row.source_name,
                    "connector_type": row.connector_type,
                    "document_timestamp": row.doc_timestamp,
                },
            )
            chunk_payload = {
                "ordinal": row.ordinal,
                "text": row.chunk_text,
                "start_block_ordinal": row.start_block_ordinal,
                "end_block_ordinal": row.end_block_ordinal,
                "meta": row.chunk_meta,
            }
            doc_payload = {
                "summary": row.doc_summary,
                "content_type": row.doc_content_type,
                "timestamp": row.doc_timestamp,
                "status": row.doc_status,
            }
            source_payload = {
                "id": row.source_id,
                "name": row.source_name,
                "connector_type": row.connector_type,
            }

        candidates.append(
            {
                "rank": rank,
                "score": score,
                "semantic_score": sem,
                "recency_score": rec,
                "source_weight": src_w,
                "attribution": att,
                "document": doc_payload,
                "chunk": chunk_payload,
                "source": source_payload,
            },
        )

        # Context (text sections + media blocks) is only built for hits whose
        # SQL row hydrated successfully.
        if row:
            context_sections.append(
                {
                    "document_id": row.document_id,
                    "chunk_id": row.chunk_pk,
                    "chunk_ordinal": row.ordinal,
                    "text": row.chunk_text,
                },
            )
            # Collect non-text blocks spanned by this chunk so a multimodal
            # generator can reference them.
            blocks = await load_blocks_span(
                session,
                row.document_id,
                row.start_block_ordinal,
                row.end_block_ordinal,
            )
            for b in blocks:
                if b.type in ("image", "audio", "video", "document"):
                    context_media.append(
                        {
                            "document_id": row.document_id,
                            "chunk_ordinal": row.ordinal,
                            "block_ordinal": b.ordinal,
                            "type": b.type,
                            "mime": b.mime,
                            "storage_uri": b.storage_uri,
                        },
                    )

    # One flat prompt-ready string, each section tagged with its provenance.
    combined = "\n\n".join(
        f"[{s['document_id']}#{s['chunk_ordinal']}] {s['text']}"
        for s in context_sections
        if s.get("text")
    )

    return {
        "candidates": candidates,
        "context": {
            "combined_text": combined,
            "sections": context_sections,
            "media": context_media,
        },
        "embedding_model": settings.gemini_embedding_model,
        "vector_candidates_considered": len(raw_hits),
    }
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Helpers to reach the real sqlite3 connection behind SQLAlchemy + aiosqlite."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sqlite3
|
|
6
|
+
|
|
7
|
+
import aiosqlite
|
|
8
|
+
import sqlite_vec
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def unwrap_sqlite3_connection(dbapi_connection: object) -> sqlite3.Connection:
    """Return the underlying sqlite3.Connection from driver wrappers.

    Probes, in order:
      1. a bare ``aiosqlite.Connection`` (its private ``_conn`` is the real
         sqlite3 connection);
      2. an adapter exposing ``_connection`` (which may itself be an
         aiosqlite connection, or hold the raw connection via ``_conn``);
      3. a ``driver_connection`` attribute (SQLAlchemy's public accessor);
      4. an already-unwrapped ``sqlite3.Connection``.

    NOTE(review): steps 1-2 rely on private attributes of aiosqlite and the
    pool adapter and may break across driver versions; the probe order is
    deliberate — keep it intact.

    Raises:
        TypeError: when no sqlite3.Connection can be resolved.
    """
    if isinstance(dbapi_connection, aiosqlite.Connection):
        return dbapi_connection._conn

    inner = getattr(dbapi_connection, "_connection", None)
    if inner is not None:
        if isinstance(inner, aiosqlite.Connection):
            return inner._conn
        # Some wrappers hold the raw connection one level deeper.
        real = getattr(inner, "_conn", None)
        if isinstance(real, sqlite3.Connection):
            return real
    driver = getattr(dbapi_connection, "driver_connection", None)
    if isinstance(driver, sqlite3.Connection):
        return driver
    if isinstance(dbapi_connection, sqlite3.Connection):
        return dbapi_connection
    msg = f"Cannot resolve sqlite3.Connection from {type(dbapi_connection)!r}"
    raise TypeError(msg)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def load_sqlite_vec_extension(dbapi_connection: object) -> None:
    """Load the sqlite-vec extension into the connection's sqlite3 handle.

    Extension loading is enabled only for the duration of the load and is
    switched back off even when ``sqlite_vec.load`` raises (the original code
    would leave extension loading enabled on failure).
    """
    raw = unwrap_sqlite3_connection(dbapi_connection)
    raw.enable_load_extension(True)
    try:
        sqlite_vec.load(raw)
    finally:
        # Always return to the safe default, even on load failure.
        raw.enable_load_extension(False)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class BlobStore:
    """Content-addressed blob storage on the local filesystem.

    Blobs live under ``root`` fanned out by the first two hex characters of
    their SHA-256 digest, e.g. ``<root>/ab/<full-64-char-digest>``.
    """

    def __init__(self, root: Path) -> None:
        self.root = root.resolve()

    def _path_for(self, sha256_hex: str) -> Path:
        """Return the on-disk path for a 64-character SHA-256 hex digest."""
        if len(sha256_hex) != 64:
            msg = "sha256_hex must be 64 hex characters"
            raise ValueError(msg)
        return self.root / sha256_hex[:2] / sha256_hex

    def write(self, data: bytes) -> tuple[str, str]:
        """Store ``data`` (idempotent); return ``(digest, "blob://<digest>")``."""
        digest = hashlib.sha256(data).hexdigest()
        destination = self._path_for(digest)
        destination.parent.mkdir(parents=True, exist_ok=True)
        if not destination.exists():
            # Content-addressed: identical bytes always map to the same path,
            # so an existing file never needs rewriting.
            destination.write_bytes(data)
        return digest, f"blob://{digest}"

    def read_bytes(self, sha256_hex: str) -> bytes:
        """Return the stored bytes for ``sha256_hex``."""
        return self._path_for(sha256_hex).read_bytes()
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
|
|
6
|
+
import sqlite_vec
|
|
7
|
+
from sqlalchemy import text
|
|
8
|
+
from sqlalchemy.engine import Connection
|
|
9
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from sqlalchemy.ext.asyncio import AsyncEngine
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True, slots=True)
class VectorMeta:
    """Row metadata for sqlite-vec (non-null fields at insert time)."""

    document_id: str  # owning document; required (validated on upsert)
    chunk_id: int = 0  # chunk key; default 0 — presumably "no chunk row"; confirm with callers
    source_id: int = -1  # sources.id; -1 when not supplied
    modality: str = "text"  # content modality label stored alongside the vector
    ingested_at: str = ""  # ISO-8601 text; required (validated on upsert)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(frozen=True, slots=True)
class VectorSearchFilters:
    """Optional filters ANDed into the vector-search WHERE clause."""

    document_id: str | None = None  # exact match on document_id
    source_id: int | None = None  # exact match on source_id
    modality: str | None = None  # exact match on modality
    timestamp_min: str | None = None  # inclusive lower bound on ingested_at (text compare)
    timestamp_max: str | None = None  # inclusive upper bound on ingested_at (text compare)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(frozen=True, slots=True)
class VectorSearchResult:
    """One vector-search hit with its stored metadata columns."""

    rowid: int  # rowid in the vector table
    distance: float  # raw distance from the query vector (lower is closer)
    document_id: str
    chunk_id: int
    source_id: int
    modality: str
    ingested_at: str  # ISO-8601 text as stored
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class SqliteVecStore:
    """sqlite-vec backed vector storage in the app SQLite file.

    Vectors live in a sqlite-vec virtual table (``table_name``) alongside
    filterable metadata columns (document_id, chunk_id, source_id, modality,
    ingested_at). Statements run on the underlying synchronous connection via
    ``run_sync``.
    """

    def __init__(
        self,
        engine: AsyncEngine,
        *,
        dimension: int,
        table_name: str = "kb_vec_embeddings",
    ) -> None:
        self._engine = engine
        self._dimension = dimension
        self._table = table_name

    def _validate_vector(self, v: list[float], *, ctx: str) -> None:
        """Raise ValueError when ``v`` does not match the configured dimension."""
        if len(v) != self._dimension:
            msg = f"{ctx}: expected dimension {self._dimension}, got {len(v)}"
            raise ValueError(msg)

    def _validate_batch(
        self,
        embeddings: list[list[float]],
        metas: list[VectorMeta],
    ) -> set[tuple[str, int]]:
        """Validate an upsert batch; return distinct (document_id, chunk_id) keys.

        Shared by ``upsert`` and ``upsert_for_session`` so both enforce the
        same invariants.

        Raises:
            ValueError: on length mismatch, wrong vector dimension, or a meta
                missing ``document_id``/``ingested_at``.
        """
        if len(embeddings) != len(metas):
            msg = "embeddings and metas must have the same length"
            raise ValueError(msg)
        for i, (emb, meta) in enumerate(zip(embeddings, metas, strict=True)):
            self._validate_vector(emb, ctx=f"embeddings[{i}]")
            if not meta.document_id:
                raise ValueError("VectorMeta.document_id is required")
            if not meta.ingested_at:
                raise ValueError("VectorMeta.ingested_at is required (ISO-8601 text)")
        return {(m.document_id, m.chunk_id) for m in metas}

    async def upsert(
        self,
        embeddings: list[list[float]],
        metas: list[VectorMeta],
    ) -> None:
        """Replace vectors keyed by (document_id, chunk_id) in one transaction."""
        keys = self._validate_batch(embeddings, metas)
        async with self._engine.begin() as conn:
            await conn.run_sync(
                self._sync_upsert_keys,
                keys,
                embeddings,
                metas,
            )

    def _sync_upsert_keys(
        self,
        c: Connection,
        keys: set[tuple[str, int]],
        embeddings: list[list[float]],
        metas: list[VectorMeta],
    ) -> None:
        """Synchronous upsert body: delete existing keys, then insert all rows."""
        # Delete-then-insert keyed by (document_id, chunk_id) stands in for
        # an UPSERT on the virtual table.
        for doc_id, chunk_id in keys:
            c.execute(
                text(
                    f"DELETE FROM {self._table} "
                    "WHERE document_id = :doc AND chunk_id = :chunk",
                ),
                {"doc": doc_id, "chunk": chunk_id},
            )
        cols = "embedding, document_id, chunk_id, source_id, modality, ingested_at"
        vals = ":emb, :document_id, :chunk_id, :source_id, :modality, :ingested_at"
        sql = text(f"INSERT INTO {self._table} ({cols}) VALUES ({vals})")
        for emb, meta in zip(embeddings, metas, strict=True):
            # sqlite-vec expects vectors as packed float32 blobs.
            blob = sqlite_vec.serialize_float32(emb)
            c.execute(
                sql,
                {
                    "emb": blob,
                    "document_id": meta.document_id,
                    "chunk_id": meta.chunk_id,
                    "source_id": meta.source_id,
                    "modality": meta.modality,
                    "ingested_at": meta.ingested_at,
                },
            )

    async def upsert_for_session(
        self,
        session: AsyncSession,
        embeddings: list[list[float]],
        metas: list[VectorMeta],
    ) -> None:
        """Same as ``upsert`` but uses the session's connection (avoids SQLite lock)."""
        keys = self._validate_batch(embeddings, metas)
        async_conn = await session.connection()
        await async_conn.run_sync(
            self._sync_upsert_keys,
            keys,
            embeddings,
            metas,
        )

    async def delete_document(self, document_id: str) -> int:
        """Delete all vectors for ``document_id``; return the number removed."""
        async with self._engine.begin() as conn:
            return await conn.run_sync(self._sync_delete_document, document_id)

    def _sync_delete_document(self, c: Connection, document_id: str) -> int:
        """Synchronous delete body; returns affected row count (0 when unknown)."""
        r = c.execute(
            text(f"DELETE FROM {self._table} WHERE document_id = :doc"),
            {"doc": document_id},
        )
        return r.rowcount or 0

    async def delete_document_for_session(
        self,
        session: AsyncSession,
        document_id: str,
    ) -> int:
        """Same as ``delete_document`` but uses the session's SQLite connection."""
        async_conn = await session.connection()
        return await async_conn.run_sync(self._sync_delete_document, document_id)

    async def search(
        self,
        query_vector: list[float],
        k: int,
        filters: VectorSearchFilters | None = None,
    ) -> list[VectorSearchResult]:
        """KNN search for the ``k`` nearest vectors, optionally filtered.

        Raises:
            ValueError: if ``k < 1`` or ``query_vector`` has the wrong dimension.
        """
        if k < 1:
            raise ValueError("k must be >= 1")
        self._validate_vector(query_vector, ctx="query_vector")
        filters = filters or VectorSearchFilters()

        qblob = sqlite_vec.serialize_float32(query_vector)
        # "embedding MATCH :qvec" + "k = :k" is the sqlite-vec KNN query form;
        # the remaining clauses narrow by the stored metadata columns.
        where_parts = [
            "embedding MATCH :qvec",
            "k = :k",
        ]
        params: dict[str, Any] = {"qvec": qblob, "k": k}

        if filters.document_id is not None:
            where_parts.append("document_id = :document_id")
            params["document_id"] = filters.document_id
        if filters.source_id is not None:
            where_parts.append("source_id = :source_id")
            params["source_id"] = filters.source_id
        if filters.modality is not None:
            where_parts.append("modality = :modality")
            params["modality"] = filters.modality
        if filters.timestamp_min is not None:
            where_parts.append("ingested_at >= :ts_min")
            params["ts_min"] = filters.timestamp_min
        if filters.timestamp_max is not None:
            where_parts.append("ingested_at <= :ts_max")
            params["ts_max"] = filters.timestamp_max

        where_sql = " AND ".join(where_parts)
        stmt = text(
            f"""
            SELECT
                rowid,
                document_id,
                chunk_id,
                source_id,
                modality,
                ingested_at,
                distance
            FROM {self._table}
            WHERE {where_sql}
            """
        )

        async with self._engine.connect() as conn:

            def _search(c: Connection) -> list[VectorSearchResult]:
                rows = c.execute(stmt, params).mappings().all()
                return [
                    VectorSearchResult(
                        rowid=int(row["rowid"]),
                        distance=float(row["distance"]),
                        document_id=str(row["document_id"]),
                        chunk_id=int(row["chunk_id"]),
                        source_id=int(row["source_id"]),
                        modality=str(row["modality"]),
                        ingested_at=str(row["ingested_at"]),
                    )
                    for row in rows
                ]

            return await conn.run_sync(_search)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: backend
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: FastAPI service (internal API behind Hono gateway)
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Requires-Dist: fastapi[standard]==0.113.0
|
|
7
|
+
Requires-Dist: pydantic==2.8.0
|
|
8
|
+
Requires-Dist: pydantic-settings>=2.4.0
|
|
9
|
+
Requires-Dist: sqlalchemy[asyncio]>=2.0.36
|
|
10
|
+
Requires-Dist: aiosqlite>=0.20.0
|
|
11
|
+
Requires-Dist: alembic>=1.14.0
|
|
12
|
+
Requires-Dist: python-json-logger>=2.0.7
|
|
13
|
+
Requires-Dist: sqlite-vec>=0.1.9
|
|
14
|
+
Requires-Dist: httpx>=0.27.0
|
|
15
|
+
Provides-Extra: entities
|
|
16
|
+
Requires-Dist: spacy>=3.7.0; extra == "entities"
|
|
17
|
+
Provides-Extra: playwright
|
|
18
|
+
Requires-Dist: playwright>=1.49.0; extra == "playwright"
|