business-stack 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/.python-version +1 -0
  2. package/backend/.env.example +65 -0
  3. package/backend/alembic/env.py +63 -0
  4. package/backend/alembic/script.py.mako +26 -0
  5. package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
  6. package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
  7. package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
  8. package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
  9. package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
  10. package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
  11. package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
  12. package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
  13. package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
  14. package/backend/alembic.ini +42 -0
  15. package/backend/app/__init__.py +0 -0
  16. package/backend/app/config.py +337 -0
  17. package/backend/app/connectors/__init__.py +13 -0
  18. package/backend/app/connectors/base.py +39 -0
  19. package/backend/app/connectors/builtins.py +51 -0
  20. package/backend/app/connectors/playwright_session.py +146 -0
  21. package/backend/app/connectors/registry.py +68 -0
  22. package/backend/app/connectors/thread_expansion/__init__.py +33 -0
  23. package/backend/app/connectors/thread_expansion/fakes.py +154 -0
  24. package/backend/app/connectors/thread_expansion/models.py +113 -0
  25. package/backend/app/connectors/thread_expansion/reddit.py +53 -0
  26. package/backend/app/connectors/thread_expansion/twitter.py +49 -0
  27. package/backend/app/db.py +5 -0
  28. package/backend/app/dependencies.py +34 -0
  29. package/backend/app/logging_config.py +35 -0
  30. package/backend/app/main.py +97 -0
  31. package/backend/app/middleware/__init__.py +0 -0
  32. package/backend/app/middleware/gateway_identity.py +17 -0
  33. package/backend/app/middleware/openapi_gateway.py +71 -0
  34. package/backend/app/middleware/request_id.py +23 -0
  35. package/backend/app/openapi_config.py +126 -0
  36. package/backend/app/routers/__init__.py +0 -0
  37. package/backend/app/routers/admin_pipeline.py +123 -0
  38. package/backend/app/routers/chat.py +206 -0
  39. package/backend/app/routers/chunks.py +36 -0
  40. package/backend/app/routers/entity_extract.py +31 -0
  41. package/backend/app/routers/example.py +8 -0
  42. package/backend/app/routers/gemini_embed.py +58 -0
  43. package/backend/app/routers/health.py +28 -0
  44. package/backend/app/routers/ingestion.py +146 -0
  45. package/backend/app/routers/link_expansion.py +34 -0
  46. package/backend/app/routers/pipeline_status.py +304 -0
  47. package/backend/app/routers/query.py +63 -0
  48. package/backend/app/routers/vectors.py +63 -0
  49. package/backend/app/schemas/__init__.py +0 -0
  50. package/backend/app/schemas/canonical.py +44 -0
  51. package/backend/app/schemas/chat.py +50 -0
  52. package/backend/app/schemas/ingest.py +29 -0
  53. package/backend/app/schemas/query.py +153 -0
  54. package/backend/app/schemas/vectors.py +56 -0
  55. package/backend/app/services/__init__.py +0 -0
  56. package/backend/app/services/chat_store.py +152 -0
  57. package/backend/app/services/chunking/__init__.py +3 -0
  58. package/backend/app/services/chunking/llm_boundaries.py +63 -0
  59. package/backend/app/services/chunking/schemas.py +30 -0
  60. package/backend/app/services/chunking/semantic_chunk.py +178 -0
  61. package/backend/app/services/chunking/splitters.py +214 -0
  62. package/backend/app/services/embeddings/__init__.py +20 -0
  63. package/backend/app/services/embeddings/build_inputs.py +140 -0
  64. package/backend/app/services/embeddings/dlq.py +128 -0
  65. package/backend/app/services/embeddings/gemini_api.py +207 -0
  66. package/backend/app/services/embeddings/persist.py +74 -0
  67. package/backend/app/services/embeddings/types.py +32 -0
  68. package/backend/app/services/embeddings/worker.py +224 -0
  69. package/backend/app/services/entities/__init__.py +12 -0
  70. package/backend/app/services/entities/gliner_extract.py +63 -0
  71. package/backend/app/services/entities/llm_extract.py +94 -0
  72. package/backend/app/services/entities/pipeline.py +179 -0
  73. package/backend/app/services/entities/spacy_extract.py +63 -0
  74. package/backend/app/services/entities/types.py +15 -0
  75. package/backend/app/services/gemini_chat.py +113 -0
  76. package/backend/app/services/hooks/__init__.py +3 -0
  77. package/backend/app/services/hooks/post_ingest.py +186 -0
  78. package/backend/app/services/ingestion/__init__.py +0 -0
  79. package/backend/app/services/ingestion/persist.py +188 -0
  80. package/backend/app/services/integrations_remote.py +91 -0
  81. package/backend/app/services/link_expansion/__init__.py +3 -0
  82. package/backend/app/services/link_expansion/canonical_url.py +45 -0
  83. package/backend/app/services/link_expansion/domain_policy.py +26 -0
  84. package/backend/app/services/link_expansion/html_extract.py +72 -0
  85. package/backend/app/services/link_expansion/rate_limit.py +32 -0
  86. package/backend/app/services/link_expansion/robots.py +46 -0
  87. package/backend/app/services/link_expansion/schemas.py +67 -0
  88. package/backend/app/services/link_expansion/worker.py +458 -0
  89. package/backend/app/services/normalization/__init__.py +7 -0
  90. package/backend/app/services/normalization/normalizer.py +331 -0
  91. package/backend/app/services/normalization/persist_normalized.py +67 -0
  92. package/backend/app/services/playwright_extract/__init__.py +13 -0
  93. package/backend/app/services/playwright_extract/__main__.py +96 -0
  94. package/backend/app/services/playwright_extract/extract.py +181 -0
  95. package/backend/app/services/retrieval_service.py +351 -0
  96. package/backend/app/sqlite_ext.py +36 -0
  97. package/backend/app/storage/__init__.py +3 -0
  98. package/backend/app/storage/blobs.py +30 -0
  99. package/backend/app/vectorstore/__init__.py +13 -0
  100. package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
  101. package/backend/backend.egg-info/PKG-INFO +18 -0
  102. package/backend/backend.egg-info/SOURCES.txt +93 -0
  103. package/backend/backend.egg-info/dependency_links.txt +1 -0
  104. package/backend/backend.egg-info/entry_points.txt +2 -0
  105. package/backend/backend.egg-info/requires.txt +15 -0
  106. package/backend/backend.egg-info/top_level.txt +4 -0
  107. package/backend/package.json +15 -0
  108. package/backend/pyproject.toml +52 -0
  109. package/backend/tests/conftest.py +40 -0
  110. package/backend/tests/test_chat.py +92 -0
  111. package/backend/tests/test_chunking.py +132 -0
  112. package/backend/tests/test_entities.py +170 -0
  113. package/backend/tests/test_gemini_embed.py +224 -0
  114. package/backend/tests/test_health.py +24 -0
  115. package/backend/tests/test_ingest_raw.py +123 -0
  116. package/backend/tests/test_link_expansion.py +241 -0
  117. package/backend/tests/test_main.py +12 -0
  118. package/backend/tests/test_normalizer.py +114 -0
  119. package/backend/tests/test_openapi_gateway.py +40 -0
  120. package/backend/tests/test_pipeline_hardening.py +285 -0
  121. package/backend/tests/test_pipeline_status.py +71 -0
  122. package/backend/tests/test_playwright_extract.py +80 -0
  123. package/backend/tests/test_post_ingest_hooks.py +162 -0
  124. package/backend/tests/test_query.py +165 -0
  125. package/backend/tests/test_thread_expansion.py +72 -0
  126. package/backend/tests/test_vectors.py +85 -0
  127. package/backend/uv.lock +1839 -0
  128. package/bin/business-stack.cjs +412 -0
  129. package/frontend/web/.env.example +23 -0
  130. package/frontend/web/AGENTS.md +5 -0
  131. package/frontend/web/CLAUDE.md +1 -0
  132. package/frontend/web/README.md +36 -0
  133. package/frontend/web/components.json +25 -0
  134. package/frontend/web/next-env.d.ts +6 -0
  135. package/frontend/web/next.config.ts +30 -0
  136. package/frontend/web/package.json +65 -0
  137. package/frontend/web/postcss.config.mjs +7 -0
  138. package/frontend/web/skills-lock.json +35 -0
  139. package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
  140. package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
  141. package/frontend/web/src/app/chat/page.tsx +725 -0
  142. package/frontend/web/src/app/favicon.ico +0 -0
  143. package/frontend/web/src/app/globals.css +563 -0
  144. package/frontend/web/src/app/layout.tsx +50 -0
  145. package/frontend/web/src/app/page.tsx +96 -0
  146. package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
  147. package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
  148. package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
  149. package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
  150. package/frontend/web/src/components/home-auth-panel.tsx +49 -0
  151. package/frontend/web/src/components/providers.tsx +50 -0
  152. package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
  153. package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
  154. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
  155. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
  156. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
  157. package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
  158. package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
  159. package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
  160. package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
  161. package/frontend/web/src/lib/auth-client.ts +23 -0
  162. package/frontend/web/src/lib/integrations-config.ts +125 -0
  163. package/frontend/web/src/lib/ui-utills.tsx +90 -0
  164. package/frontend/web/src/lib/utils.ts +6 -0
  165. package/frontend/web/tsconfig.json +36 -0
  166. package/frontend/web/tsconfig.tsbuildinfo +1 -0
  167. package/frontend/web/vitest.config.ts +14 -0
  168. package/gateway/.env.example +23 -0
  169. package/gateway/README.md +13 -0
  170. package/gateway/package.json +24 -0
  171. package/gateway/src/auth.ts +49 -0
  172. package/gateway/src/index.ts +141 -0
  173. package/gateway/src/integrations/admin.ts +19 -0
  174. package/gateway/src/integrations/crypto.ts +52 -0
  175. package/gateway/src/integrations/handlers.ts +124 -0
  176. package/gateway/src/integrations/keys.ts +12 -0
  177. package/gateway/src/integrations/store.ts +106 -0
  178. package/gateway/src/stack-secrets.ts +35 -0
  179. package/gateway/tsconfig.json +13 -0
  180. package/package.json +33 -0
  181. package/turbo.json +27 -0
@@ -0,0 +1,351 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import math
6
+ from dataclasses import dataclass
7
+ from datetime import UTC, datetime
8
+ from typing import Any
9
+
10
+ from sqlalchemy import text
11
+ from sqlalchemy.ext.asyncio import AsyncSession
12
+
13
+ from app.config import Settings
14
+ from app.services.embeddings.build_inputs import load_blocks_span
15
+ from app.services.embeddings.gemini_api import batch_embed_contents
16
+ from app.services.embeddings.types import TextPart
17
+ from app.services.integrations_remote import resolve_gemini_api_key
18
+ from app.vectorstore import SqliteVecStore, VectorSearchFilters, VectorSearchResult
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ def _parse_source_weights(raw: str) -> dict[str, float]:
24
+ try:
25
+ data = json.loads(raw)
26
+ except json.JSONDecodeError:
27
+ logger.warning("invalid RETRIEVAL_SOURCE_WEIGHTS_JSON; using default")
28
+ return {"default": 1.0}
29
+ if not isinstance(data, dict):
30
+ return {"default": 1.0}
31
+ out: dict[str, float] = {}
32
+ for k, v in data.items():
33
+ try:
34
+ out[str(k)] = float(v)
35
+ except (TypeError, ValueError):
36
+ continue
37
+ if "default" not in out:
38
+ out["default"] = 1.0
39
+ return out
40
+
41
+
42
+ def _parse_iso_utc(s: str) -> datetime | None:
43
+ try:
44
+ norm = s.replace("Z", "+00:00")
45
+ dt = datetime.fromisoformat(norm)
46
+ if dt.tzinfo is None:
47
+ dt = dt.replace(tzinfo=UTC)
48
+ return dt.astimezone(UTC)
49
+ except (TypeError, ValueError):
50
+ return None
51
+
52
+
53
def semantic_score_from_distance(distance: float) -> float:
    """Convert a sqlite-vec distance into a score in [0, 1].

    Smaller distances score higher; negative or NaN distances score 0.
    """
    invalid = math.isnan(distance) or distance < 0
    return 0.0 if invalid else 1.0 / (1.0 + float(distance))
58
+
59
+
60
def recency_score_from_ingested_at(
    ingested_at: str,
    *,
    now: datetime,
    half_life_days: float,
) -> float:
    """Exponential half-life decay in [0, 1]: score = 0.5 ** (age_days / half_life)."""
    # Non-positive half-life disables recency decay entirely.
    if half_life_days <= 0:
        return 1.0
    moment = _parse_iso_utc(ingested_at)
    if moment is None:
        # Unparseable timestamp: neutral mid-range score rather than 0 or 1.
        return 0.5
    seconds_old = (now - moment).total_seconds()
    # Future timestamps clamp to age 0 (i.e. full score of 1.0).
    days_old = max(0.0, seconds_old / 86_400.0)
    return float(0.5 ** (days_old / half_life_days))
77
+
78
+
79
def source_weight_for_connector(
    connector_type: str,
    weights: dict[str, float],
) -> float:
    """Look up the weight for a connector type, clamped to [0, 1].

    Falls back to the mandatory "default" entry for unknown connector types.
    """
    looked_up = weights.get(connector_type)
    chosen = weights["default"] if looked_up is None else looked_up
    return min(1.0, max(0.0, float(chosen)))
87
+
88
+
89
def blended_retrieval_score(
    *,
    semantic: float,
    recency: float,
    source_w: float,
    settings: Settings,
) -> float:
    """Weighted sum of the three component scores, using configured weights."""
    parts = (
        (settings.retrieval_score_semantic_weight, semantic),
        (settings.retrieval_score_recency_weight, recency),
        (settings.retrieval_score_source_weight, source_w),
    )
    return sum(weight * score for weight, score in parts)
101
+
102
+
103
@dataclass(frozen=True, slots=True)
class _HydratedChunk:
    """Flattened join of a document_chunks row with its document and source.

    Produced by ``_hydrate_chunks``; column coercions there establish the
    types below.
    """

    # document_chunks columns
    chunk_pk: int
    document_id: str
    ordinal: int
    chunk_text: str
    start_block_ordinal: int
    end_block_ordinal: int
    chunk_meta: str | None  # NULL/empty collapse to None during hydration
    # parent documents columns
    doc_summary: str | None  # NULL/empty collapse to None during hydration
    doc_content_type: str
    doc_timestamp: str
    doc_status: str
    # owning sources columns
    source_id: int
    source_name: str
    connector_type: str
119
+
120
+
121
async def _hydrate_chunks(
    session: AsyncSession,
    chunk_pks: list[int],
) -> dict[int, _HydratedChunk]:
    """Load chunk + parent document + source rows for the given chunk PKs.

    Returns a mapping of chunk PK -> _HydratedChunk; PKs without a matching
    row are simply absent from the result.
    """
    if not chunk_pks:
        return {}
    # SQLite has no array binding, so expand one named parameter per PK.
    params = {f"c{i}": pk for i, pk in enumerate(chunk_pks)}
    placeholders = ", ".join(f":{name}" for name in params)
    result = await session.execute(
        text(
            f"""
            SELECT
                dc.id,
                dc.document_id,
                dc.ordinal,
                dc.text,
                dc.start_block_ordinal,
                dc.end_block_ordinal,
                dc.meta,
                d.summary,
                d.content_type,
                d.timestamp,
                d.status,
                s.id,
                s.name,
                s.connector_type
            FROM document_chunks dc
            JOIN documents d ON d.id = dc.document_id
            JOIN sources s ON s.id = d.source_id
            WHERE dc.id IN ({placeholders})
            """,
        ),
        params,
    )
    hydrated: dict[int, _HydratedChunk] = {}
    for row in result.fetchall():
        pk = int(row[0])
        hydrated[pk] = _HydratedChunk(
            chunk_pk=pk,
            document_id=str(row[1]),
            ordinal=int(row[2]),
            chunk_text=str(row[3]),
            start_block_ordinal=int(row[4]),
            end_block_ordinal=int(row[5]),
            # NULL and empty string both collapse to None for optional columns.
            chunk_meta=row[6] if row[6] else None,
            doc_summary=row[7] if row[7] else None,
            doc_content_type=str(row[8]),
            doc_timestamp=str(row[9]),
            doc_status=str(row[10]),
            source_id=int(row[11]),
            source_name=str(row[12]),
            connector_type=str(row[13]),
        )
    return hydrated
175
+
176
+
177
async def run_retrieval(
    *,
    session: AsyncSession,
    store: SqliteVecStore,
    settings: Settings,
    query: str,
    k: int,
    filters: VectorSearchFilters | None,
    now: datetime | None = None,
) -> dict[str, Any]:
    """
    Embed query (Gemini), vector search with filters, score, hydrate SQLite rows,
    return ranked candidates + aggregated multimodal context for a generator.

    Raises:
        ValueError: if ``query`` is empty or whitespace-only.
        RuntimeError: if no Gemini API key is configured, or the embedding
            call returns no vectors.
    """
    q = query.strip()
    if not q:
        msg = "query must be non-empty"
        raise ValueError(msg)
    api_key = await resolve_gemini_api_key(settings)
    if not api_key:
        msg = "Gemini API key not configured (GEMINI_API_KEY or gateway store)"
        raise RuntimeError(msg)

    # ``now`` may be injected (e.g. by tests); normalize to aware UTC either way.
    now_utc = (now or datetime.now(UTC)).astimezone(UTC)
    source_weights = _parse_source_weights(settings.retrieval_source_weights_json)

    # Embed the query text as a single one-part content batch.
    vecs = await batch_embed_contents(
        api_key=api_key,
        model=settings.gemini_embedding_model,
        contents=[[TextPart(q)]],
        settings=settings,
        task_type=settings.gemini_query_task_type,
    )
    if not vecs:
        msg = "query embedding failed"
        raise RuntimeError(msg)
    query_vector = vecs[0]

    # Over-fetch vector candidates (k * multiplier, capped at 500) so the
    # blended re-scoring below has room to reorder beyond raw distance.
    mult = max(1, settings.retrieval_vec_candidate_multiplier)
    vec_k = min(500, max(k, k * mult))
    raw_hits = await store.search(query_vector, vec_k, filters)

    chunk_ids = [h.chunk_id for h in raw_hits]
    hydrated = await _hydrate_chunks(session, chunk_ids)

    # Tuples: (total, hit, semantic, recency, source_weight, hydrated row or None).
    rescored: list[
        tuple[float, VectorSearchResult, float, float, float, _HydratedChunk | None]
    ] = []
    for h in raw_hits:
        sem = semantic_score_from_distance(h.distance)
        rec = recency_score_from_ingested_at(
            h.ingested_at,
            now=now_utc,
            half_life_days=settings.retrieval_recency_half_life_days,
        )
        # A hit may have no hydrated row (e.g. chunk deleted since indexing);
        # fall back to the default source weight in that case.
        row = hydrated.get(h.chunk_id)
        src_w = (
            source_weight_for_connector(row.connector_type, source_weights)
            if row
            else source_weights["default"]
        )
        total = blended_retrieval_score(
            semantic=sem,
            recency=rec,
            source_w=src_w,
            settings=settings,
        )
        rescored.append((total, h, sem, rec, src_w, row))

    # Highest blended score first; keep only the top k.
    rescored.sort(key=lambda x: x[0], reverse=True)
    top = rescored[:k]

    candidates: list[dict[str, Any]] = []
    context_sections: list[dict[str, Any]] = []
    context_media: list[dict[str, Any]] = []

    for rank, (score, hit, sem, rec, src_w, row) in enumerate(top, start=1):
        # Attribution always carries the raw vector-store fields; SQL-derived
        # fields are added below only when hydration found the row.
        att: dict[str, Any] = {
            "vector_rowid": hit.rowid,
            "distance": hit.distance,
            "document_id": hit.document_id,
            "chunk_id": hit.chunk_id,
            "source_id": hit.source_id,
            "modality": hit.modality,
            "ingested_at": hit.ingested_at,
        }
        chunk_payload: dict[str, Any] | None = None
        doc_payload: dict[str, Any] | None = None
        source_payload: dict[str, Any] | None = None

        if row:
            att.update(
                {
                    "source_name": row.source_name,
                    "connector_type": row.connector_type,
                    "document_timestamp": row.doc_timestamp,
                },
            )
            chunk_payload = {
                "ordinal": row.ordinal,
                "text": row.chunk_text,
                "start_block_ordinal": row.start_block_ordinal,
                "end_block_ordinal": row.end_block_ordinal,
                "meta": row.chunk_meta,
            }
            doc_payload = {
                "summary": row.doc_summary,
                "content_type": row.doc_content_type,
                "timestamp": row.doc_timestamp,
                "status": row.doc_status,
            }
            source_payload = {
                "id": row.source_id,
                "name": row.source_name,
                "connector_type": row.connector_type,
            }

        candidates.append(
            {
                "rank": rank,
                "score": score,
                "semantic_score": sem,
                "recency_score": rec,
                "source_weight": src_w,
                "attribution": att,
                "document": doc_payload,
                "chunk": chunk_payload,
                "source": source_payload,
            },
        )

        # Context (text sections + media blocks) only exists for hydrated rows.
        if row:
            context_sections.append(
                {
                    "document_id": row.document_id,
                    "chunk_id": row.chunk_pk,
                    "chunk_ordinal": row.ordinal,
                    "text": row.chunk_text,
                },
            )
            # Pull the chunk's block span to surface any non-text media blocks.
            blocks = await load_blocks_span(
                session,
                row.document_id,
                row.start_block_ordinal,
                row.end_block_ordinal,
            )
            for b in blocks:
                if b.type in ("image", "audio", "video", "document"):
                    context_media.append(
                        {
                            "document_id": row.document_id,
                            "chunk_ordinal": row.ordinal,
                            "block_ordinal": b.ordinal,
                            "type": b.type,
                            "mime": b.mime,
                            "storage_uri": b.storage_uri,
                        },
                    )

    # One prompt-ready string; each section is tagged with its provenance.
    combined = "\n\n".join(
        f"[{s['document_id']}#{s['chunk_ordinal']}] {s['text']}"
        for s in context_sections
        if s.get("text")
    )

    return {
        "candidates": candidates,
        "context": {
            "combined_text": combined,
            "sections": context_sections,
            "media": context_media,
        },
        "embedding_model": settings.gemini_embedding_model,
        "vector_candidates_considered": len(raw_hits),
    }
@@ -0,0 +1,36 @@
1
+ """Helpers to reach the real sqlite3 connection behind SQLAlchemy + aiosqlite."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sqlite3
6
+
7
+ import aiosqlite
8
+ import sqlite_vec
9
+
10
+
11
def unwrap_sqlite3_connection(dbapi_connection: object) -> sqlite3.Connection:
    """Return the underlying sqlite3.Connection from driver wrappers."""
    # Direct aiosqlite connection: the raw handle lives on the private ._conn.
    if isinstance(dbapi_connection, aiosqlite.Connection):
        return dbapi_connection._conn

    # SQLAlchemy adapter: may wrap an aiosqlite connection in ._connection.
    wrapped = getattr(dbapi_connection, "_connection", None)
    if wrapped is not None:
        if isinstance(wrapped, aiosqlite.Connection):
            return wrapped._conn
        candidate = getattr(wrapped, "_conn", None)
        if isinstance(candidate, sqlite3.Connection):
            return candidate
    # SQLAlchemy 2.x exposes the DBAPI handle as .driver_connection.
    driver = getattr(dbapi_connection, "driver_connection", None)
    if isinstance(driver, sqlite3.Connection):
        return driver
    # Already a plain sqlite3 connection: nothing to unwrap.
    if isinstance(dbapi_connection, sqlite3.Connection):
        return dbapi_connection
    msg = f"Cannot resolve sqlite3.Connection from {type(dbapi_connection)!r}"
    raise TypeError(msg)
30
+
31
+
32
def load_sqlite_vec_extension(dbapi_connection: object) -> None:
    """Load the sqlite-vec extension into the raw sqlite3 connection.

    Extension loading is re-disabled in a ``finally`` block so that a failed
    ``sqlite_vec.load`` cannot leave the connection permanently able to load
    arbitrary extensions (the original code skipped the disable on error).
    """
    raw = unwrap_sqlite3_connection(dbapi_connection)
    raw.enable_load_extension(True)
    try:
        sqlite_vec.load(raw)
    finally:
        raw.enable_load_extension(False)
@@ -0,0 +1,3 @@
1
+ from app.storage.blobs import BlobStore
2
+
3
+ __all__ = ["BlobStore"]
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ from pathlib import Path
5
+
6
+
7
+ class BlobStore:
8
+ """Content-addressed files under root (e.g. data/blobs/ab/<sha256>)."""
9
+
10
+ def __init__(self, root: Path) -> None:
11
+ self.root = root.resolve()
12
+
13
+ def _path_for(self, sha256_hex: str) -> Path:
14
+ if len(sha256_hex) != 64:
15
+ msg = "sha256_hex must be 64 hex characters"
16
+ raise ValueError(msg)
17
+ prefix = sha256_hex[:2]
18
+ return self.root / prefix / sha256_hex
19
+
20
+ def write(self, data: bytes) -> tuple[str, str]:
21
+ """Write bytes if missing; return (sha256_hex, storage_uri blob://...)."""
22
+ digest = hashlib.sha256(data).hexdigest()
23
+ path = self._path_for(digest)
24
+ path.parent.mkdir(parents=True, exist_ok=True)
25
+ if not path.exists():
26
+ path.write_bytes(data)
27
+ return digest, f"blob://{digest}"
28
+
29
+ def read_bytes(self, sha256_hex: str) -> bytes:
30
+ return self._path_for(sha256_hex).read_bytes()
@@ -0,0 +1,13 @@
1
+ from app.vectorstore.sqlite_vec_store import (
2
+ SqliteVecStore,
3
+ VectorMeta,
4
+ VectorSearchFilters,
5
+ VectorSearchResult,
6
+ )
7
+
8
+ __all__ = [
9
+ "SqliteVecStore",
10
+ "VectorMeta",
11
+ "VectorSearchFilters",
12
+ "VectorSearchResult",
13
+ ]
@@ -0,0 +1,242 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import TYPE_CHECKING, Any
5
+
6
+ import sqlite_vec
7
+ from sqlalchemy import text
8
+ from sqlalchemy.engine import Connection
9
+ from sqlalchemy.ext.asyncio import AsyncSession
10
+
11
+ if TYPE_CHECKING:
12
+ from sqlalchemy.ext.asyncio import AsyncEngine
13
+
14
+
15
@dataclass(frozen=True, slots=True)
class VectorMeta:
    """Row metadata for sqlite-vec (non-null fields at insert time)."""

    # Owning document id; must be non-empty (enforced by SqliteVecStore upserts).
    document_id: str
    # Chunk identifier within the document; defaults to 0.
    chunk_id: int = 0
    # Source row id; -1 acts as the "unknown" sentinel default.
    source_id: int = -1
    # Content modality of the embedded payload (e.g. "text").
    modality: str = "text"
    # ISO-8601 ingestion timestamp as text; must be non-empty
    # (enforced by SqliteVecStore upserts).
    ingested_at: str = ""
24
+
25
+
26
@dataclass(frozen=True, slots=True)
class VectorSearchFilters:
    """Optional metadata filters ANDed into the vector-search WHERE clause.

    A None field means "do not filter on this column".
    """

    document_id: str | None = None  # exact match on document_id
    source_id: int | None = None  # exact match on source_id
    modality: str | None = None  # exact match on modality
    timestamp_min: str | None = None  # inclusive lower bound on ingested_at (text compare)
    timestamp_max: str | None = None  # inclusive upper bound on ingested_at (text compare)
33
+
34
+
35
@dataclass(frozen=True, slots=True)
class VectorSearchResult:
    """One KNN hit from SqliteVecStore.search, with its stored metadata."""

    # rowid of the matching row in the vector table.
    rowid: int
    # Distance reported by sqlite-vec (lower is closer).
    distance: float
    document_id: str
    chunk_id: int
    source_id: int
    modality: str
    # ISO-8601 ingestion timestamp as stored (text).
    ingested_at: str
44
+
45
+
46
class SqliteVecStore:
    """sqlite-vec backed vector storage in the app SQLite file.

    Upserts are implemented as delete-then-insert keyed on
    ``(document_id, chunk_id)``, making them idempotent. The batch-validation
    logic previously duplicated between ``upsert`` and ``upsert_for_session``
    is factored into ``_validate_batch`` so both enforce identical invariants.
    """

    def __init__(
        self,
        engine: AsyncEngine,
        *,
        dimension: int,
        table_name: str = "kb_vec_embeddings",
    ) -> None:
        self._engine = engine
        self._dimension = dimension
        self._table = table_name

    def _validate_vector(self, v: list[float], *, ctx: str) -> None:
        """Raise ValueError when ``v`` does not match the configured dimension."""
        if len(v) != self._dimension:
            msg = f"{ctx}: expected dimension {self._dimension}, got {len(v)}"
            raise ValueError(msg)

    def _validate_batch(
        self,
        embeddings: list[list[float]],
        metas: list[VectorMeta],
    ) -> set[tuple[str, int]]:
        """Validate an upsert batch; return the distinct (document_id, chunk_id) keys.

        Raises ValueError on length mismatch, wrong vector dimension, or
        missing required metadata fields.
        """
        if len(embeddings) != len(metas):
            msg = "embeddings and metas must have the same length"
            raise ValueError(msg)
        for i, (emb, meta) in enumerate(zip(embeddings, metas, strict=True)):
            self._validate_vector(emb, ctx=f"embeddings[{i}]")
            if not meta.document_id:
                raise ValueError("VectorMeta.document_id is required")
            if not meta.ingested_at:
                raise ValueError("VectorMeta.ingested_at is required (ISO-8601 text)")
        return {(m.document_id, m.chunk_id) for m in metas}

    async def upsert(
        self,
        embeddings: list[list[float]],
        metas: list[VectorMeta],
    ) -> None:
        """Validate and upsert vectors inside a fresh engine transaction."""
        keys = self._validate_batch(embeddings, metas)
        async with self._engine.begin() as conn:
            await conn.run_sync(
                self._sync_upsert_keys,
                keys,
                embeddings,
                metas,
            )

    def _sync_upsert_keys(
        self,
        c: Connection,
        keys: set[tuple[str, int]],
        embeddings: list[list[float]],
        metas: list[VectorMeta],
    ) -> None:
        """Synchronous core: delete existing rows per key, then insert all vectors."""
        for doc_id, chunk_id in keys:
            c.execute(
                text(
                    f"DELETE FROM {self._table} "
                    "WHERE document_id = :doc AND chunk_id = :chunk",
                ),
                {"doc": doc_id, "chunk": chunk_id},
            )
        cols = "embedding, document_id, chunk_id, source_id, modality, ingested_at"
        vals = ":emb, :document_id, :chunk_id, :source_id, :modality, :ingested_at"
        sql = text(f"INSERT INTO {self._table} ({cols}) VALUES ({vals})")
        for emb, meta in zip(embeddings, metas, strict=True):
            # sqlite-vec expects a packed float32 blob, not a Python list.
            blob = sqlite_vec.serialize_float32(emb)
            c.execute(
                sql,
                {
                    "emb": blob,
                    "document_id": meta.document_id,
                    "chunk_id": meta.chunk_id,
                    "source_id": meta.source_id,
                    "modality": meta.modality,
                    "ingested_at": meta.ingested_at,
                },
            )

    async def upsert_for_session(
        self,
        session: AsyncSession,
        embeddings: list[list[float]],
        metas: list[VectorMeta],
    ) -> None:
        """Same as ``upsert`` but uses the session's connection (avoids SQLite lock)."""
        keys = self._validate_batch(embeddings, metas)
        async_conn = await session.connection()
        await async_conn.run_sync(
            self._sync_upsert_keys,
            keys,
            embeddings,
            metas,
        )

    async def delete_document(self, document_id: str) -> int:
        """Delete every vector row for ``document_id``; return rows removed."""
        async with self._engine.begin() as conn:
            return await conn.run_sync(self._sync_delete_document, document_id)

    def _sync_delete_document(self, c: Connection, document_id: str) -> int:
        r = c.execute(
            text(f"DELETE FROM {self._table} WHERE document_id = :doc"),
            {"doc": document_id},
        )
        # rowcount may be None/-1 depending on driver; report 0 in that case.
        return r.rowcount or 0

    async def delete_document_for_session(
        self,
        session: AsyncSession,
        document_id: str,
    ) -> int:
        """Same as ``delete_document`` but uses the session's SQLite connection."""
        async_conn = await session.connection()
        return await async_conn.run_sync(self._sync_delete_document, document_id)

    async def search(
        self,
        query_vector: list[float],
        k: int,
        filters: VectorSearchFilters | None = None,
    ) -> list[VectorSearchResult]:
        """KNN search with optional metadata filters.

        Raises ValueError when ``k`` < 1 or the query vector has the wrong
        dimension.
        """
        if k < 1:
            raise ValueError("k must be >= 1")
        self._validate_vector(query_vector, ctx="query_vector")
        filters = filters or VectorSearchFilters()

        qblob = sqlite_vec.serialize_float32(query_vector)
        # sqlite-vec KNN syntax: MATCH on the embedding column plus "k = :k".
        where_parts = [
            "embedding MATCH :qvec",
            "k = :k",
        ]
        params: dict[str, Any] = {"qvec": qblob, "k": k}

        if filters.document_id is not None:
            where_parts.append("document_id = :document_id")
            params["document_id"] = filters.document_id
        if filters.source_id is not None:
            where_parts.append("source_id = :source_id")
            params["source_id"] = filters.source_id
        if filters.modality is not None:
            where_parts.append("modality = :modality")
            params["modality"] = filters.modality
        if filters.timestamp_min is not None:
            where_parts.append("ingested_at >= :ts_min")
            params["ts_min"] = filters.timestamp_min
        if filters.timestamp_max is not None:
            where_parts.append("ingested_at <= :ts_max")
            params["ts_max"] = filters.timestamp_max

        where_sql = " AND ".join(where_parts)
        stmt = text(
            f"""
            SELECT
                rowid,
                document_id,
                chunk_id,
                source_id,
                modality,
                ingested_at,
                distance
            FROM {self._table}
            WHERE {where_sql}
            """
        )

        async with self._engine.connect() as conn:

            def _search(c: Connection) -> list[VectorSearchResult]:
                rows = c.execute(stmt, params).mappings().all()
                out: list[VectorSearchResult] = []
                for row in rows:
                    out.append(
                        VectorSearchResult(
                            rowid=int(row["rowid"]),
                            distance=float(row["distance"]),
                            document_id=str(row["document_id"]),
                            chunk_id=int(row["chunk_id"]),
                            source_id=int(row["source_id"]),
                            modality=str(row["modality"]),
                            ingested_at=str(row["ingested_at"]),
                        )
                    )
                return out

            return await conn.run_sync(_search)
@@ -0,0 +1,18 @@
1
+ Metadata-Version: 2.4
2
+ Name: backend
3
+ Version: 0.1.0
4
+ Summary: FastAPI service (internal API behind Hono gateway)
5
+ Requires-Python: >=3.12
6
+ Requires-Dist: fastapi[standard]==0.113.0
7
+ Requires-Dist: pydantic==2.8.0
8
+ Requires-Dist: pydantic-settings>=2.4.0
9
+ Requires-Dist: sqlalchemy[asyncio]>=2.0.36
10
+ Requires-Dist: aiosqlite>=0.20.0
11
+ Requires-Dist: alembic>=1.14.0
12
+ Requires-Dist: python-json-logger>=2.0.7
13
+ Requires-Dist: sqlite-vec>=0.1.9
14
+ Requires-Dist: httpx>=0.27.0
15
+ Provides-Extra: entities
16
+ Requires-Dist: spacy>=3.7.0; extra == "entities"
17
+ Provides-Extra: playwright
18
+ Requires-Dist: playwright>=1.49.0; extra == "playwright"