business-stack 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.python-version +1 -0
- package/backend/.env.example +65 -0
- package/backend/alembic/env.py +63 -0
- package/backend/alembic/script.py.mako +26 -0
- package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
- package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
- package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
- package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
- package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
- package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
- package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
- package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
- package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
- package/backend/alembic.ini +42 -0
- package/backend/app/__init__.py +0 -0
- package/backend/app/config.py +337 -0
- package/backend/app/connectors/__init__.py +13 -0
- package/backend/app/connectors/base.py +39 -0
- package/backend/app/connectors/builtins.py +51 -0
- package/backend/app/connectors/playwright_session.py +146 -0
- package/backend/app/connectors/registry.py +68 -0
- package/backend/app/connectors/thread_expansion/__init__.py +33 -0
- package/backend/app/connectors/thread_expansion/fakes.py +154 -0
- package/backend/app/connectors/thread_expansion/models.py +113 -0
- package/backend/app/connectors/thread_expansion/reddit.py +53 -0
- package/backend/app/connectors/thread_expansion/twitter.py +49 -0
- package/backend/app/db.py +5 -0
- package/backend/app/dependencies.py +34 -0
- package/backend/app/logging_config.py +35 -0
- package/backend/app/main.py +97 -0
- package/backend/app/middleware/__init__.py +0 -0
- package/backend/app/middleware/gateway_identity.py +17 -0
- package/backend/app/middleware/openapi_gateway.py +71 -0
- package/backend/app/middleware/request_id.py +23 -0
- package/backend/app/openapi_config.py +126 -0
- package/backend/app/routers/__init__.py +0 -0
- package/backend/app/routers/admin_pipeline.py +123 -0
- package/backend/app/routers/chat.py +206 -0
- package/backend/app/routers/chunks.py +36 -0
- package/backend/app/routers/entity_extract.py +31 -0
- package/backend/app/routers/example.py +8 -0
- package/backend/app/routers/gemini_embed.py +58 -0
- package/backend/app/routers/health.py +28 -0
- package/backend/app/routers/ingestion.py +146 -0
- package/backend/app/routers/link_expansion.py +34 -0
- package/backend/app/routers/pipeline_status.py +304 -0
- package/backend/app/routers/query.py +63 -0
- package/backend/app/routers/vectors.py +63 -0
- package/backend/app/schemas/__init__.py +0 -0
- package/backend/app/schemas/canonical.py +44 -0
- package/backend/app/schemas/chat.py +50 -0
- package/backend/app/schemas/ingest.py +29 -0
- package/backend/app/schemas/query.py +153 -0
- package/backend/app/schemas/vectors.py +56 -0
- package/backend/app/services/__init__.py +0 -0
- package/backend/app/services/chat_store.py +152 -0
- package/backend/app/services/chunking/__init__.py +3 -0
- package/backend/app/services/chunking/llm_boundaries.py +63 -0
- package/backend/app/services/chunking/schemas.py +30 -0
- package/backend/app/services/chunking/semantic_chunk.py +178 -0
- package/backend/app/services/chunking/splitters.py +214 -0
- package/backend/app/services/embeddings/__init__.py +20 -0
- package/backend/app/services/embeddings/build_inputs.py +140 -0
- package/backend/app/services/embeddings/dlq.py +128 -0
- package/backend/app/services/embeddings/gemini_api.py +207 -0
- package/backend/app/services/embeddings/persist.py +74 -0
- package/backend/app/services/embeddings/types.py +32 -0
- package/backend/app/services/embeddings/worker.py +224 -0
- package/backend/app/services/entities/__init__.py +12 -0
- package/backend/app/services/entities/gliner_extract.py +63 -0
- package/backend/app/services/entities/llm_extract.py +94 -0
- package/backend/app/services/entities/pipeline.py +179 -0
- package/backend/app/services/entities/spacy_extract.py +63 -0
- package/backend/app/services/entities/types.py +15 -0
- package/backend/app/services/gemini_chat.py +113 -0
- package/backend/app/services/hooks/__init__.py +3 -0
- package/backend/app/services/hooks/post_ingest.py +186 -0
- package/backend/app/services/ingestion/__init__.py +0 -0
- package/backend/app/services/ingestion/persist.py +188 -0
- package/backend/app/services/integrations_remote.py +91 -0
- package/backend/app/services/link_expansion/__init__.py +3 -0
- package/backend/app/services/link_expansion/canonical_url.py +45 -0
- package/backend/app/services/link_expansion/domain_policy.py +26 -0
- package/backend/app/services/link_expansion/html_extract.py +72 -0
- package/backend/app/services/link_expansion/rate_limit.py +32 -0
- package/backend/app/services/link_expansion/robots.py +46 -0
- package/backend/app/services/link_expansion/schemas.py +67 -0
- package/backend/app/services/link_expansion/worker.py +458 -0
- package/backend/app/services/normalization/__init__.py +7 -0
- package/backend/app/services/normalization/normalizer.py +331 -0
- package/backend/app/services/normalization/persist_normalized.py +67 -0
- package/backend/app/services/playwright_extract/__init__.py +13 -0
- package/backend/app/services/playwright_extract/__main__.py +96 -0
- package/backend/app/services/playwright_extract/extract.py +181 -0
- package/backend/app/services/retrieval_service.py +351 -0
- package/backend/app/sqlite_ext.py +36 -0
- package/backend/app/storage/__init__.py +3 -0
- package/backend/app/storage/blobs.py +30 -0
- package/backend/app/vectorstore/__init__.py +13 -0
- package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
- package/backend/backend.egg-info/PKG-INFO +18 -0
- package/backend/backend.egg-info/SOURCES.txt +93 -0
- package/backend/backend.egg-info/dependency_links.txt +1 -0
- package/backend/backend.egg-info/entry_points.txt +2 -0
- package/backend/backend.egg-info/requires.txt +15 -0
- package/backend/backend.egg-info/top_level.txt +4 -0
- package/backend/package.json +15 -0
- package/backend/pyproject.toml +52 -0
- package/backend/tests/conftest.py +40 -0
- package/backend/tests/test_chat.py +92 -0
- package/backend/tests/test_chunking.py +132 -0
- package/backend/tests/test_entities.py +170 -0
- package/backend/tests/test_gemini_embed.py +224 -0
- package/backend/tests/test_health.py +24 -0
- package/backend/tests/test_ingest_raw.py +123 -0
- package/backend/tests/test_link_expansion.py +241 -0
- package/backend/tests/test_main.py +12 -0
- package/backend/tests/test_normalizer.py +114 -0
- package/backend/tests/test_openapi_gateway.py +40 -0
- package/backend/tests/test_pipeline_hardening.py +285 -0
- package/backend/tests/test_pipeline_status.py +71 -0
- package/backend/tests/test_playwright_extract.py +80 -0
- package/backend/tests/test_post_ingest_hooks.py +162 -0
- package/backend/tests/test_query.py +165 -0
- package/backend/tests/test_thread_expansion.py +72 -0
- package/backend/tests/test_vectors.py +85 -0
- package/backend/uv.lock +1839 -0
- package/bin/business-stack.cjs +412 -0
- package/frontend/web/.env.example +23 -0
- package/frontend/web/AGENTS.md +5 -0
- package/frontend/web/CLAUDE.md +1 -0
- package/frontend/web/README.md +36 -0
- package/frontend/web/components.json +25 -0
- package/frontend/web/next-env.d.ts +6 -0
- package/frontend/web/next.config.ts +30 -0
- package/frontend/web/package.json +65 -0
- package/frontend/web/postcss.config.mjs +7 -0
- package/frontend/web/skills-lock.json +35 -0
- package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
- package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
- package/frontend/web/src/app/chat/page.tsx +725 -0
- package/frontend/web/src/app/favicon.ico +0 -0
- package/frontend/web/src/app/globals.css +563 -0
- package/frontend/web/src/app/layout.tsx +50 -0
- package/frontend/web/src/app/page.tsx +96 -0
- package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
- package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
- package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
- package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
- package/frontend/web/src/components/home-auth-panel.tsx +49 -0
- package/frontend/web/src/components/providers.tsx +50 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
- package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
- package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
- package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
- package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
- package/frontend/web/src/lib/auth-client.ts +23 -0
- package/frontend/web/src/lib/integrations-config.ts +125 -0
- package/frontend/web/src/lib/ui-utills.tsx +90 -0
- package/frontend/web/src/lib/utils.ts +6 -0
- package/frontend/web/tsconfig.json +36 -0
- package/frontend/web/tsconfig.tsbuildinfo +1 -0
- package/frontend/web/vitest.config.ts +14 -0
- package/gateway/.env.example +23 -0
- package/gateway/README.md +13 -0
- package/gateway/package.json +24 -0
- package/gateway/src/auth.ts +49 -0
- package/gateway/src/index.ts +141 -0
- package/gateway/src/integrations/admin.ts +19 -0
- package/gateway/src/integrations/crypto.ts +52 -0
- package/gateway/src/integrations/handlers.ts +124 -0
- package/gateway/src/integrations/keys.ts +12 -0
- package/gateway/src/integrations/store.ts +106 -0
- package/gateway/src/stack-secrets.ts +35 -0
- package/gateway/tsconfig.json +13 -0
- package/package.json +33 -0
- package/turbo.json +27 -0
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import UTC, datetime
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from fastapi import APIRouter, HTTPException, Query
|
|
7
|
+
|
|
8
|
+
from app.dependencies import DbSession, GatewayUserDep, SettingsDep, VectorStoreDep
|
|
9
|
+
from app.schemas.chat import (
|
|
10
|
+
ChatCompleteRequest,
|
|
11
|
+
ChatCompleteResponse,
|
|
12
|
+
ChatMessageOut,
|
|
13
|
+
ChatSessionCreate,
|
|
14
|
+
ChatSessionOut,
|
|
15
|
+
)
|
|
16
|
+
from app.services.chat_store import (
|
|
17
|
+
append_chat_message,
|
|
18
|
+
create_chat_session,
|
|
19
|
+
delete_chat_session,
|
|
20
|
+
get_session_owned,
|
|
21
|
+
list_chat_messages,
|
|
22
|
+
list_chat_sessions,
|
|
23
|
+
maybe_set_title_from_first_message,
|
|
24
|
+
touch_session_updated,
|
|
25
|
+
)
|
|
26
|
+
from app.services.gemini_chat import gemini_chat_reply
|
|
27
|
+
from app.services.retrieval_service import run_retrieval
|
|
28
|
+
|
|
29
|
+
# Router collecting the chat endpoints; every route registered on it is
# served under the "/chat" prefix and tagged "chat".
router = APIRouter(prefix="/chat", tags=["chat"])
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# POST /chat/sessions
# Creates a chat session for the calling user, commits, then reads the row
# back (scoped to the same user) to build the response. If the read-back
# finds nothing the request fails with HTTP 500.
@router.post("/sessions", response_model=ChatSessionOut)
async def create_session(
    body: ChatSessionCreate,
    db: DbSession,
    user_id: GatewayUserDep,
) -> ChatSessionOut:
    new_session_id = await create_chat_session(
        db,
        user_id=user_id,
        title=body.title,
    )
    await db.commit()

    created = await get_session_owned(
        db,
        session_id=new_session_id,
        user_id=user_id,
    )
    if created:
        return ChatSessionOut(
            id=str(created["id"]),
            title=created.get("title"),  # title may be absent; passed through as None
            created_at=str(created["created_at"]),
            updated_at=str(created["updated_at"]),
        )

    # `from None` suppresses any implicit exception context on the 500.
    raise HTTPException(status_code=500, detail="session create failed") from None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# GET /chat/sessions
# Returns the calling user's chat sessions. `limit` is validated to 1..200
# (default 100) and forwarded to the store query.
@router.get("/sessions", response_model=list[ChatSessionOut])
async def sessions_list(
    db: DbSession,
    user_id: GatewayUserDep,
    limit: int = Query(default=100, ge=1, le=200),
) -> list[ChatSessionOut]:
    session_rows = await list_chat_sessions(db, user_id=user_id, limit=limit)

    sessions: list[ChatSessionOut] = []
    for record in session_rows:
        # ids and timestamps are stringified for the response model
        sessions.append(
            ChatSessionOut(
                id=str(record["id"]),
                title=record.get("title"),
                created_at=str(record["created_at"]),
                updated_at=str(record["updated_at"]),
            )
        )
    return sessions
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# DELETE /chat/sessions/{session_id}
# Deletes a session on behalf of the calling user. The commit is issued
# before the outcome is inspected; a falsy outcome is reported as HTTP 404.
@router.delete("/sessions/{session_id}")
async def sessions_delete(
    session_id: str,
    db: DbSession,
    user_id: GatewayUserDep,
) -> dict[str, bool]:
    removed = await delete_chat_session(
        db,
        session_id=session_id,
        user_id=user_id,
    )
    await db.commit()

    if removed:
        return {"deleted": True}
    raise HTTPException(status_code=404, detail="Session not found")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# GET /chat/sessions/{session_id}/messages
# Lists messages of a session after confirming the session is visible to the
# calling user (HTTP 404 otherwise). `limit` is validated to 1..500
# (default 200) and forwarded to the store query.
@router.get(
    "/sessions/{session_id}/messages",
    response_model=list[ChatMessageOut],
)
async def messages_list(
    session_id: str,
    db: DbSession,
    user_id: GatewayUserDep,
    limit: int = Query(default=200, ge=1, le=500),
) -> list[ChatMessageOut]:
    owned_session = await get_session_owned(
        db,
        session_id=session_id,
        user_id=user_id,
    )
    if not owned_session:
        raise HTTPException(status_code=404, detail="Session not found")

    message_rows = await list_chat_messages(db, session_id=session_id, limit=limit)

    messages: list[ChatMessageOut] = []
    for record in message_rows:
        messages.append(
            ChatMessageOut(
                id=int(record["id"]),
                role=str(record["role"]),
                content=str(record["content"]),
                meta_json=record.get("meta_json"),  # optional; None when missing
                created_at=str(record["created_at"]),
            )
        )
    return messages
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _sources_meta(candidates: list[dict[str, Any]], *, max_n: int = 12) -> list[dict[str, Any]]:
|
|
109
|
+
out: list[dict[str, Any]] = []
|
|
110
|
+
for c in candidates[:max_n]:
|
|
111
|
+
att = c.get("attribution") or {}
|
|
112
|
+
if not isinstance(att, dict):
|
|
113
|
+
continue
|
|
114
|
+
out.append(
|
|
115
|
+
{
|
|
116
|
+
"rank": c.get("rank"),
|
|
117
|
+
"score": c.get("score"),
|
|
118
|
+
"document_id": att.get("document_id"),
|
|
119
|
+
"chunk_id": att.get("chunk_id"),
|
|
120
|
+
"source_name": att.get("source_name"),
|
|
121
|
+
},
|
|
122
|
+
)
|
|
123
|
+
return out
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# POST /chat/sessions/{session_id}/complete
# Runs one chat turn: verifies the session belongs to the caller (404
# otherwise), records the user message, runs retrieval, asks Gemini for a
# reply built on the retrieved context, records the assistant message with
# source metadata, and commits once at the end.
#
# Retrieval errors are translated to HTTP errors:
#   ValueError   -> 400
#   RuntimeError -> 503 when the text mentions the Gemini API key, else 502
@router.post(
    "/sessions/{session_id}/complete",
    response_model=ChatCompleteResponse,
)
async def complete_turn(
    session_id: str,
    body: ChatCompleteRequest,
    db: DbSession,
    user_id: GatewayUserDep,
    settings: SettingsDep,
    store: VectorStoreDep,
) -> ChatCompleteResponse:
    owned_session = await get_session_owned(
        db,
        session_id=session_id,
        user_id=user_id,
    )
    if not owned_session:
        raise HTTPException(status_code=404, detail="Session not found")

    # Optional request filters are resolved against the current UTC time.
    if body.filters:
        filters = body.filters.to_vector_filters(now=datetime.now(UTC))
    else:
        filters = None

    # The stored message, the title snippet and the LLM prompt all use the
    # whitespace-stripped text.
    user_text = body.message.strip()

    await append_chat_message(
        db,
        session_id=session_id,
        role="user",
        content=user_text,
        meta=None,
    )
    await maybe_set_title_from_first_message(
        db,
        session_id=session_id,
        snippet=user_text,
    )

    try:
        # Retrieval receives the message exactly as submitted (not stripped).
        retrieval = await run_retrieval(
            session=db,
            store=store,
            settings=settings,
            query=body.message,
            k=body.k,
            filters=filters,
        )
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except RuntimeError as e:
        msg = str(e)
        key_problem = "Gemini API key" in msg or "GEMINI_API_KEY" in msg
        raise HTTPException(
            status_code=503 if key_problem else 502,
            detail=msg,
        ) from e

    context = retrieval["context"]
    combined_text = str(context.get("combined_text") or "")

    reply, reply_source = await gemini_chat_reply(
        settings,
        user_message=user_text,
        context_combined=combined_text,
    )

    considered = retrieval["vector_candidates_considered"]
    assistant_meta = {
        "reply_source": reply_source,
        "sources": _sources_meta(retrieval["candidates"]),
        "vector_candidates_considered": considered,
    }
    assistant_message_id = await append_chat_message(
        db,
        session_id=session_id,
        role="assistant",
        content=reply,
        meta=assistant_meta,
    )
    await touch_session_updated(db, session_id=session_id)
    await db.commit()

    return ChatCompleteResponse(
        assistant_message_id=assistant_message_id,
        reply=reply,
        reply_source=reply_source,
        embedding_model=retrieval["embedding_model"],
        vector_candidates_considered=int(considered),
    )
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from fastapi import APIRouter, HTTPException
|
|
2
|
+
from sqlalchemy import text
|
|
3
|
+
|
|
4
|
+
from app.config import get_settings
|
|
5
|
+
from app.dependencies import DbSession
|
|
6
|
+
from app.services.chunking.schemas import RebuildChunksRequest, RebuildChunksResponse
|
|
7
|
+
from app.services.chunking.semantic_chunk import rebuild_semantic_chunks
|
|
8
|
+
|
|
9
|
+
# Router for the chunk-rebuild endpoint; routes are served under the
# "/ingest" prefix and tagged "chunks".
router = APIRouter(prefix="/ingest", tags=["chunks"])
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# POST /ingest/documents/{document_id}/chunks
# Responds 404 when no `documents` row has the given id; otherwise rebuilds
# the chunks, commits, and reports how many were written.
@router.post(
    "/documents/{document_id}/chunks",
    response_model=RebuildChunksResponse,
)
async def rebuild_document_chunks(
    document_id: str,
    body: RebuildChunksRequest,
    session: DbSession,
) -> RebuildChunksResponse:
    """Rebuild semantic chunks (structure-first; optional LLM for weak structure)."""
    # Existence probe only: the selected value itself is never used.
    exists_query = text("SELECT 1 FROM documents WHERE id = :id LIMIT 1")
    lookup = await session.execute(exists_query, {"id": document_id})
    if lookup.first() is None:
        raise HTTPException(status_code=404, detail="Document not found")

    app_settings = get_settings()
    written = await rebuild_semantic_chunks(
        session,
        document_id=document_id,
        use_llm_weak_structure=body.use_llm_weak_structure,
        settings=app_settings,
    )
    await session.commit()

    return RebuildChunksResponse(document_id=document_id, chunks_written=written)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from fastapi import APIRouter, BackgroundTasks, HTTPException, Request
|
|
2
|
+
from sqlalchemy import text
|
|
3
|
+
|
|
4
|
+
from app.config import get_settings
|
|
5
|
+
from app.services.entities.pipeline import run_entity_extraction_job
|
|
6
|
+
|
|
7
|
+
# Router for the entity-extraction endpoint; routes are served under the
# "/ingest" prefix and tagged "entities".
router = APIRouter(prefix="/ingest", tags=["entities"])
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# POST /ingest/documents/{document_id}/entities
# Checks that the document exists (404 otherwise) using a short-lived session
# from the application's session factory, then registers the extraction job
# as a background task and acknowledges immediately.
@router.post("/documents/{document_id}/entities")
async def queue_entity_extraction(
    document_id: str,
    background_tasks: BackgroundTasks,
    request: Request,
) -> dict[str, str | bool]:
    """Re-run entity extraction for a document (uses current ``document_chunks``)."""
    settings = get_settings()
    session_factory = request.app.state.session_factory

    exists_query = text("SELECT 1 FROM documents WHERE id = :id LIMIT 1")
    async with session_factory() as session:
        lookup = await session.execute(exists_query, {"id": document_id})
        if lookup.first() is None:
            raise HTTPException(status_code=404, detail="Document not found")

    # The job gets the factory (not the session above) so it can open its own.
    background_tasks.add_task(
        run_entity_extraction_job,
        document_id=document_id,
        session_factory=session_factory,
        settings=settings,
    )
    return {"accepted": True, "document_id": document_id}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from fastapi import APIRouter, BackgroundTasks, HTTPException, Request
|
|
2
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
3
|
+
from sqlalchemy import text
|
|
4
|
+
|
|
5
|
+
from app.config import get_settings
|
|
6
|
+
from app.dependencies import VectorStoreDep
|
|
7
|
+
from app.services.embeddings.worker import run_embed_document_job
|
|
8
|
+
from app.services.integrations_remote import resolve_gemini_api_key
|
|
9
|
+
|
|
10
|
+
# Router for the embedding endpoint; routes are served under the "/ingest"
# prefix and tagged "embeddings".
router = APIRouter(prefix="/ingest", tags=["embeddings"])
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Request body model for the document embed endpoint.
class EmbedDocumentRequest(BaseModel):
    # extra="forbid": request bodies carrying unknown fields fail validation.
    model_config = ConfigDict(extra="forbid")

    # Opt-in switch for multimodal embedding input; off unless the caller
    # sets it. The description text below is published in the API schema.
    multimodal: bool = Field(
        default=False,
        description=(
            "If true, build interleaved parts from content_blocks in each "
            "chunk's block span (text + images from blob storage)"
        ),
    )
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# POST /ingest/documents/{document_id}/embed
# Preconditions are checked in this order:
#   1. a Gemini API key must resolve (503 otherwise),
#   2. the document must exist (404 otherwise).
# On success the embedding job is registered as a background task and the
# request is acknowledged immediately.
@router.post("/documents/{document_id}/embed")
async def queue_document_embed(
    document_id: str,
    body: EmbedDocumentRequest,
    background_tasks: BackgroundTasks,
    request: Request,
    store: VectorStoreDep,
) -> dict[str, str | bool]:
    """Schedule Gemini embedding for all ``document_chunks`` (background task)."""
    settings = get_settings()

    api_key = await resolve_gemini_api_key(settings)
    if not api_key:
        missing_key_detail = (
            "Gemini API key is not configured "
            "(GEMINI_API_KEY or gateway integration store)"
        )
        raise HTTPException(status_code=503, detail=missing_key_detail)

    session_factory = request.app.state.session_factory
    exists_query = text("SELECT 1 FROM documents WHERE id = :id LIMIT 1")
    async with session_factory() as session:
        lookup = await session.execute(exists_query, {"id": document_id})
        if lookup.first() is None:
            raise HTTPException(status_code=404, detail="Document not found")

    # The job gets the factory (not the session above) so it can open its own.
    background_tasks.add_task(
        run_embed_document_job,
        document_id=document_id,
        multimodal=body.multimodal,
        session_factory=session_factory,
        settings=settings,
        store=store,
    )
    return {"accepted": True, "document_id": document_id}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from fastapi import APIRouter, Request
|
|
4
|
+
from fastapi.responses import JSONResponse
|
|
5
|
+
from sqlalchemy import text
|
|
6
|
+
|
|
7
|
+
# Router for the health endpoints; no prefix is set, routes are tagged "health".
router = APIRouter(tags=["health"])
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# GET /healthz
# Always answers with a fixed "ok" payload; nothing is checked.
@router.get("/healthz")
async def healthz() -> dict[str, str]:
    payload = {"status": "ok"}
    return payload
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# GET /readyz
# Reports ready only if a connection can be opened on the application's
# engine and `SELECT 1` executes. Any exception during that probe is logged
# with its traceback and answered with HTTP 503 {"status": "not_ready"}.
@router.get("/readyz")
async def readyz(request: Request):
    db_engine = request.app.state.engine
    try:
        async with db_engine.connect() as connection:
            await connection.execute(text("SELECT 1"))
    except Exception:
        logger.exception("readiness check failed")
        return JSONResponse(
            status_code=503,
            content={"status": "not_ready"},
        )
    else:
        return {"status": "ready"}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import traceback
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from fastapi import APIRouter, HTTPException, Query
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
from sqlalchemy import text
|
|
9
|
+
|
|
10
|
+
from app.config import get_settings
|
|
11
|
+
from app.connectors.registry import get_connector_class, init_connectors
|
|
12
|
+
from app.dependencies import DbSession
|
|
13
|
+
from app.schemas.canonical import CanonicalDocument
|
|
14
|
+
from app.schemas.ingest import RawIngestEnvelope
|
|
15
|
+
from app.services.chunking.semantic_chunk import rebuild_semantic_chunks
|
|
16
|
+
from app.services.ingestion.persist import (
|
|
17
|
+
clear_normalization_error,
|
|
18
|
+
load_raw_envelope_for_document,
|
|
19
|
+
persist_raw_envelope,
|
|
20
|
+
)
|
|
21
|
+
from app.services.normalization import (
|
|
22
|
+
normalize_envelope_to_canonical,
|
|
23
|
+
persist_normalized_document,
|
|
24
|
+
)
|
|
25
|
+
from app.storage.blobs import BlobStore
|
|
26
|
+
|
|
27
|
+
# Router for the raw-ingestion endpoint; routes are served under the
# "/ingest" prefix and tagged "ingestion".
router = APIRouter(prefix="/ingest", tags=["ingestion"])
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Response model of the raw-ingest endpoint.
class RawIngestResponse(BaseModel):
    # Id of the document the envelope was stored under (or matched against).
    document_id: str
    # Outcome label set by the endpoint.
    status: str
    # Connector registry key the request was processed with.
    connector: str
    # Output of the connector's normalize_raw() for the prepared envelope.
    normalized: dict[str, Any]
    # JSON dump of the canonical document (or of the fallback document when
    # normalization failed).
    canonical: dict[str, Any]
    deduplicated: bool = Field(
        default=False,
        description="True when ingest matched an existing doc (external_id or hash)",
    )
    normalization_failed: bool = Field(
        default=False,
        description="True when raw was stored but canonical blocks could not be built",
    )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _canonical_fallback(
    *,
    document_id: str,
    envelope: RawIngestEnvelope,
) -> CanonicalDocument:
    """Build a bare canonical document straight from a raw envelope.

    Source, timestamp and payload are copied from ``envelope``; content
    blocks, entities, links and tags are empty lists and the summary is an
    empty string. Callers use this when normalization could not produce a
    real canonical document but the raw envelope is still available.

    Args:
        document_id: Id to assign to the resulting document.
        envelope: Raw envelope providing ``source``, ``timestamp`` and
            ``payload``.

    Returns:
        The minimal ``CanonicalDocument``.
    """
    copied_from_envelope = {
        "source": envelope.source,
        "timestamp": envelope.timestamp,
        "raw_content": envelope.payload,
    }
    empty_derived_fields = {
        "content_blocks": [],
        "entities": [],
        "links": [],
        "tags": [],
        "summary": "",
    }
    return CanonicalDocument(
        id=document_id,
        **copied_from_envelope,
        **empty_derived_fields,
    )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Upper bound on the serialized error stored in documents.normalization_error.
_NORMALIZATION_ERROR_MAX_CHARS = 16000
# Only this many trailing characters of the traceback are considered at all.
_NORMALIZATION_TRACEBACK_MAX_CHARS = 8000


def _normalization_error_json(exc: Exception, traceback_text: str) -> str:
    """Serialize a normalization failure as valid JSON within the size cap.

    The object has the keys ``type``, ``message`` and ``traceback`` (in that
    order). When the encoded form exceeds the cap, the *fields* are shortened
    and the object is re-encoded, instead of slicing the encoded string —
    slicing could cut through the closing quote/brace or an escape sequence
    and leave malformed JSON behind. The message keeps its beginning, the
    traceback keeps its end. When everything already fits, the result is
    exactly ``json.dumps`` of the untruncated object.

    Args:
        exc: The exception raised during normalization.
        traceback_text: Formatted traceback for ``exc``.

    Returns:
        A JSON document of at most ``_NORMALIZATION_ERROR_MAX_CHARS``
        characters (assuming the exception type name alone fits).
    """
    type_name = type(exc).__name__
    message = str(exc)
    tb_tail = traceback_text[-_NORMALIZATION_TRACEBACK_MAX_CHARS:]

    while True:
        encoded = json.dumps(
            {"type": type_name, "message": message, "traceback": tb_tail}
        )
        excess = len(encoded) - _NORMALIZATION_ERROR_MAX_CHARS
        if excess <= 0 or not (message or tb_tail):
            return encoded
        # Every raw character encodes to at least one output character, so
        # dropping `excess` raw characters from the longer field shrinks the
        # output by at least that much (or empties the field).
        if len(message) >= len(tb_tail):
            message = message[: max(0, len(message) - excess)]
        else:
            tb_tail = tb_tail[excess:]


# POST /ingest/raw
# Stores a raw envelope and attempts to normalize it in the same transaction.
#
# Request validation (all answered with HTTP 400):
#   * `connector` must be a known registry key,
#   * metadata.connector_config, when present, must be an object,
#   * the connector's validate_config() must not raise ValueError.
#
# A normalization failure does not fail the request: the error is recorded on
# the document row, a minimal fallback canonical document is returned, and
# `normalization_failed` is set in the response.
@router.post("/raw", response_model=RawIngestResponse)
async def ingest_raw(
    body: RawIngestEnvelope,
    session: DbSession,
    connector: str = Query(
        default="generic",
        description="Registry key for normalize_raw / validate_config",
    ),
) -> RawIngestResponse:
    init_connectors()
    try:
        connector_cls = get_connector_class(connector)
    except KeyError:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown connector: {connector}",
        ) from None

    instance = connector_cls()
    cfg = body.metadata.get("connector_config")
    if cfg is not None and not isinstance(cfg, dict):
        raise HTTPException(
            status_code=400,
            detail="metadata.connector_config must be an object when set",
        )
    # A missing connector_config is treated as an empty configuration.
    config = cfg if isinstance(cfg, dict) else {}
    try:
        instance.validate_config(config)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e

    # Only the "playwright_session" connector has its prepare step moved to a
    # worker thread; every other connector is prepared inline.
    if connector == "playwright_session":
        body = await asyncio.to_thread(instance.prepare_envelope, body, config)
    else:
        body = instance.prepare_envelope(body, config)

    normalized = instance.normalize_raw(body)

    settings = get_settings()
    blob_store = BlobStore(settings.data_dir / "blobs")

    norm_failed = False
    async with session.begin():
        doc_id, deduped = await persist_raw_envelope(session, body)
        # Normalization works from the envelope as loaded back for the document.
        stored = await load_raw_envelope_for_document(session, document_id=doc_id)
        try:
            canonical = normalize_envelope_to_canonical(
                document_id=doc_id,
                envelope=stored,
                blob_store=blob_store,
            )
            # For a deduplicated ingest the canonical form is only returned,
            # not persisted again.
            if not deduped:
                await persist_normalized_document(session, canonical=canonical)
                await clear_normalization_error(session, document_id=doc_id)
                await rebuild_semantic_chunks(
                    session,
                    document_id=doc_id,
                    use_llm_weak_structure=False,
                    settings=settings,
                )
        except Exception as e:
            # Deliberate best-effort: keep the raw document, record why
            # normalization failed, and answer with the fallback document.
            norm_failed = True
            await session.execute(
                text("UPDATE documents SET normalization_error = :err WHERE id = :id"),
                {
                    "err": _normalization_error_json(e, traceback.format_exc()),
                    "id": doc_id,
                },
            )
            canonical = _canonical_fallback(document_id=doc_id, envelope=stored)

    # `status` is "partial" for every outcome of this endpoint.
    return RawIngestResponse(
        document_id=doc_id,
        status="partial",
        connector=connector,
        normalized=normalized,
        canonical=canonical.model_dump(mode="json"),
        deduplicated=deduped,
        normalization_failed=norm_failed,
    )
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from fastapi import APIRouter, BackgroundTasks, HTTPException, Request
|
|
2
|
+
from sqlalchemy import text
|
|
3
|
+
|
|
4
|
+
from app.config import get_settings
|
|
5
|
+
from app.services.link_expansion.schemas import ExpandLinksRequest
|
|
6
|
+
from app.services.link_expansion.worker import run_expand_links_job
|
|
7
|
+
|
|
8
|
+
# Router for the link-expansion endpoint; routes are served under the
# "/ingest" prefix and tagged "link-expansion".
router = APIRouter(prefix="/ingest", tags=["link-expansion"])
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# POST /ingest/expand-links
# Checks that the root document exists (404 otherwise), resolves the request
# options against the application settings, then registers the expansion job
# as a background task and acknowledges immediately.
@router.post("/expand-links")
async def queue_expand_links(
    body: ExpandLinksRequest,
    background_tasks: BackgroundTasks,
    request: Request,
) -> dict[str, str | bool]:
    """Schedule recursive link expansion for a document's outbound links."""
    settings = get_settings()
    session_factory = request.app.state.session_factory
    root_id = body.document_id

    exists_query = text("SELECT 1 FROM documents WHERE id = :id LIMIT 1")
    async with session_factory() as session:
        lookup = await session.execute(exists_query, {"id": root_id})
        if lookup.first() is None:
            raise HTTPException(status_code=404, detail="Document not found")

    resolved_options = body.resolve(settings)
    # The job gets the factory (not the session above) so it can open its own.
    background_tasks.add_task(
        run_expand_links_job,
        root_document_id=root_id,
        options=resolved_options,
        session_factory=session_factory,
        settings=settings,
    )
    return {"accepted": True, "document_id": root_id}
|