business-stack 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/.python-version +1 -0
  2. package/backend/.env.example +65 -0
  3. package/backend/alembic/env.py +63 -0
  4. package/backend/alembic/script.py.mako +26 -0
  5. package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
  6. package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
  7. package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
  8. package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
  9. package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
  10. package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
  11. package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
  12. package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
  13. package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
  14. package/backend/alembic.ini +42 -0
  15. package/backend/app/__init__.py +0 -0
  16. package/backend/app/config.py +337 -0
  17. package/backend/app/connectors/__init__.py +13 -0
  18. package/backend/app/connectors/base.py +39 -0
  19. package/backend/app/connectors/builtins.py +51 -0
  20. package/backend/app/connectors/playwright_session.py +146 -0
  21. package/backend/app/connectors/registry.py +68 -0
  22. package/backend/app/connectors/thread_expansion/__init__.py +33 -0
  23. package/backend/app/connectors/thread_expansion/fakes.py +154 -0
  24. package/backend/app/connectors/thread_expansion/models.py +113 -0
  25. package/backend/app/connectors/thread_expansion/reddit.py +53 -0
  26. package/backend/app/connectors/thread_expansion/twitter.py +49 -0
  27. package/backend/app/db.py +5 -0
  28. package/backend/app/dependencies.py +34 -0
  29. package/backend/app/logging_config.py +35 -0
  30. package/backend/app/main.py +97 -0
  31. package/backend/app/middleware/__init__.py +0 -0
  32. package/backend/app/middleware/gateway_identity.py +17 -0
  33. package/backend/app/middleware/openapi_gateway.py +71 -0
  34. package/backend/app/middleware/request_id.py +23 -0
  35. package/backend/app/openapi_config.py +126 -0
  36. package/backend/app/routers/__init__.py +0 -0
  37. package/backend/app/routers/admin_pipeline.py +123 -0
  38. package/backend/app/routers/chat.py +206 -0
  39. package/backend/app/routers/chunks.py +36 -0
  40. package/backend/app/routers/entity_extract.py +31 -0
  41. package/backend/app/routers/example.py +8 -0
  42. package/backend/app/routers/gemini_embed.py +58 -0
  43. package/backend/app/routers/health.py +28 -0
  44. package/backend/app/routers/ingestion.py +146 -0
  45. package/backend/app/routers/link_expansion.py +34 -0
  46. package/backend/app/routers/pipeline_status.py +304 -0
  47. package/backend/app/routers/query.py +63 -0
  48. package/backend/app/routers/vectors.py +63 -0
  49. package/backend/app/schemas/__init__.py +0 -0
  50. package/backend/app/schemas/canonical.py +44 -0
  51. package/backend/app/schemas/chat.py +50 -0
  52. package/backend/app/schemas/ingest.py +29 -0
  53. package/backend/app/schemas/query.py +153 -0
  54. package/backend/app/schemas/vectors.py +56 -0
  55. package/backend/app/services/__init__.py +0 -0
  56. package/backend/app/services/chat_store.py +152 -0
  57. package/backend/app/services/chunking/__init__.py +3 -0
  58. package/backend/app/services/chunking/llm_boundaries.py +63 -0
  59. package/backend/app/services/chunking/schemas.py +30 -0
  60. package/backend/app/services/chunking/semantic_chunk.py +178 -0
  61. package/backend/app/services/chunking/splitters.py +214 -0
  62. package/backend/app/services/embeddings/__init__.py +20 -0
  63. package/backend/app/services/embeddings/build_inputs.py +140 -0
  64. package/backend/app/services/embeddings/dlq.py +128 -0
  65. package/backend/app/services/embeddings/gemini_api.py +207 -0
  66. package/backend/app/services/embeddings/persist.py +74 -0
  67. package/backend/app/services/embeddings/types.py +32 -0
  68. package/backend/app/services/embeddings/worker.py +224 -0
  69. package/backend/app/services/entities/__init__.py +12 -0
  70. package/backend/app/services/entities/gliner_extract.py +63 -0
  71. package/backend/app/services/entities/llm_extract.py +94 -0
  72. package/backend/app/services/entities/pipeline.py +179 -0
  73. package/backend/app/services/entities/spacy_extract.py +63 -0
  74. package/backend/app/services/entities/types.py +15 -0
  75. package/backend/app/services/gemini_chat.py +113 -0
  76. package/backend/app/services/hooks/__init__.py +3 -0
  77. package/backend/app/services/hooks/post_ingest.py +186 -0
  78. package/backend/app/services/ingestion/__init__.py +0 -0
  79. package/backend/app/services/ingestion/persist.py +188 -0
  80. package/backend/app/services/integrations_remote.py +91 -0
  81. package/backend/app/services/link_expansion/__init__.py +3 -0
  82. package/backend/app/services/link_expansion/canonical_url.py +45 -0
  83. package/backend/app/services/link_expansion/domain_policy.py +26 -0
  84. package/backend/app/services/link_expansion/html_extract.py +72 -0
  85. package/backend/app/services/link_expansion/rate_limit.py +32 -0
  86. package/backend/app/services/link_expansion/robots.py +46 -0
  87. package/backend/app/services/link_expansion/schemas.py +67 -0
  88. package/backend/app/services/link_expansion/worker.py +458 -0
  89. package/backend/app/services/normalization/__init__.py +7 -0
  90. package/backend/app/services/normalization/normalizer.py +331 -0
  91. package/backend/app/services/normalization/persist_normalized.py +67 -0
  92. package/backend/app/services/playwright_extract/__init__.py +13 -0
  93. package/backend/app/services/playwright_extract/__main__.py +96 -0
  94. package/backend/app/services/playwright_extract/extract.py +181 -0
  95. package/backend/app/services/retrieval_service.py +351 -0
  96. package/backend/app/sqlite_ext.py +36 -0
  97. package/backend/app/storage/__init__.py +3 -0
  98. package/backend/app/storage/blobs.py +30 -0
  99. package/backend/app/vectorstore/__init__.py +13 -0
  100. package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
  101. package/backend/backend.egg-info/PKG-INFO +18 -0
  102. package/backend/backend.egg-info/SOURCES.txt +93 -0
  103. package/backend/backend.egg-info/dependency_links.txt +1 -0
  104. package/backend/backend.egg-info/entry_points.txt +2 -0
  105. package/backend/backend.egg-info/requires.txt +15 -0
  106. package/backend/backend.egg-info/top_level.txt +4 -0
  107. package/backend/package.json +15 -0
  108. package/backend/pyproject.toml +52 -0
  109. package/backend/tests/conftest.py +40 -0
  110. package/backend/tests/test_chat.py +92 -0
  111. package/backend/tests/test_chunking.py +132 -0
  112. package/backend/tests/test_entities.py +170 -0
  113. package/backend/tests/test_gemini_embed.py +224 -0
  114. package/backend/tests/test_health.py +24 -0
  115. package/backend/tests/test_ingest_raw.py +123 -0
  116. package/backend/tests/test_link_expansion.py +241 -0
  117. package/backend/tests/test_main.py +12 -0
  118. package/backend/tests/test_normalizer.py +114 -0
  119. package/backend/tests/test_openapi_gateway.py +40 -0
  120. package/backend/tests/test_pipeline_hardening.py +285 -0
  121. package/backend/tests/test_pipeline_status.py +71 -0
  122. package/backend/tests/test_playwright_extract.py +80 -0
  123. package/backend/tests/test_post_ingest_hooks.py +162 -0
  124. package/backend/tests/test_query.py +165 -0
  125. package/backend/tests/test_thread_expansion.py +72 -0
  126. package/backend/tests/test_vectors.py +85 -0
  127. package/backend/uv.lock +1839 -0
  128. package/bin/business-stack.cjs +412 -0
  129. package/frontend/web/.env.example +23 -0
  130. package/frontend/web/AGENTS.md +5 -0
  131. package/frontend/web/CLAUDE.md +1 -0
  132. package/frontend/web/README.md +36 -0
  133. package/frontend/web/components.json +25 -0
  134. package/frontend/web/next-env.d.ts +6 -0
  135. package/frontend/web/next.config.ts +30 -0
  136. package/frontend/web/package.json +65 -0
  137. package/frontend/web/postcss.config.mjs +7 -0
  138. package/frontend/web/skills-lock.json +35 -0
  139. package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
  140. package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
  141. package/frontend/web/src/app/chat/page.tsx +725 -0
  142. package/frontend/web/src/app/favicon.ico +0 -0
  143. package/frontend/web/src/app/globals.css +563 -0
  144. package/frontend/web/src/app/layout.tsx +50 -0
  145. package/frontend/web/src/app/page.tsx +96 -0
  146. package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
  147. package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
  148. package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
  149. package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
  150. package/frontend/web/src/components/home-auth-panel.tsx +49 -0
  151. package/frontend/web/src/components/providers.tsx +50 -0
  152. package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
  153. package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
  154. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
  155. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
  156. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
  157. package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
  158. package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
  159. package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
  160. package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
  161. package/frontend/web/src/lib/auth-client.ts +23 -0
  162. package/frontend/web/src/lib/integrations-config.ts +125 -0
  163. package/frontend/web/src/lib/ui-utills.tsx +90 -0
  164. package/frontend/web/src/lib/utils.ts +6 -0
  165. package/frontend/web/tsconfig.json +36 -0
  166. package/frontend/web/tsconfig.tsbuildinfo +1 -0
  167. package/frontend/web/vitest.config.ts +14 -0
  168. package/gateway/.env.example +23 -0
  169. package/gateway/README.md +13 -0
  170. package/gateway/package.json +24 -0
  171. package/gateway/src/auth.ts +49 -0
  172. package/gateway/src/index.ts +141 -0
  173. package/gateway/src/integrations/admin.ts +19 -0
  174. package/gateway/src/integrations/crypto.ts +52 -0
  175. package/gateway/src/integrations/handlers.ts +124 -0
  176. package/gateway/src/integrations/keys.ts +12 -0
  177. package/gateway/src/integrations/store.ts +106 -0
  178. package/gateway/src/stack-secrets.ts +35 -0
  179. package/gateway/tsconfig.json +13 -0
  180. package/package.json +33 -0
  181. package/turbo.json +27 -0
@@ -0,0 +1,49 @@
1
+ """
2
+ Twitter / X thread expansion — **official APIs only**.
3
+
4
+ Do not use scraping or undocumented HTML/GraphQL access; those may violate the
5
+ platform Terms of Service. Production fetchers must call documented X API
6
+ endpoints under your developer product and agreement.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from abc import ABC, abstractmethod
12
+ from typing import Any
13
+
14
+ from app.connectors.thread_expansion.models import ThreadExpansionResult
15
+
16
+
17
class TwitterThreadExpansionFetcher(ABC):
    """
    Abstract interface for retrieving an entire X/Twitter conversation thread
    through **official APIs**.

    Operational notes for implementers (a checklist, not legal advice):

    - **Credentials**: a developer account is needed (for example a bearer
      token or OAuth 2.0 user context, depending on the endpoint). Keep keys
      server-side and rotate them according to your security policy.
    - **Policies**: the X Developer Agreement, the API rules, and the rate
      limits of your access tier all apply; automated collection has to remain
      inside the documented allowances.
    - **Privacy**: honor user privacy settings and whatever data-retention
      obligations apply to your jurisdiction and use case.

    Scraping the public web UI is not an acceptable implementation strategy.
    """

    @abstractmethod
    async def fetch_full_thread(
        self,
        conversation_id: str,
        *,
        config: dict[str, Any],
    ) -> ThreadExpansionResult:
        """
        Fetch all accessible posts of the thread named by ``conversation_id``
        (usually the root tweet id, as the API reports it).

        ``config`` carries implementation-specific options such as a timeout or
        optional endpoint flags. It may contain secrets: **never** log them.

        The base implementation only raises ``NotImplementedError``; concrete
        subclasses must override this coroutine.
        """
        raise NotImplementedError
@@ -0,0 +1,5 @@
1
+ from sqlalchemy.orm import DeclarativeBase
2
+
3
+
4
class Base(DeclarativeBase):
    """Declarative base class for the application's SQLAlchemy ORM models."""
@@ -0,0 +1,34 @@
1
+ from collections.abc import AsyncGenerator
2
+ from typing import Annotated
3
+
4
+ from fastapi import Depends, Header, HTTPException, Request
5
+ from sqlalchemy.ext.asyncio import AsyncSession
6
+
7
+ from app.config import Settings, get_settings
8
+ from app.vectorstore import SqliteVecStore
9
+
10
+
11
async def get_db(request: Request) -> AsyncGenerator[AsyncSession, None]:
    """Yield a database session for the duration of one request.

    The session is created from the factory stored on ``app.state`` and is
    used as an async context manager, so it is exited once the dependency
    generator is finalized.
    """
    make_session = request.app.state.session_factory
    async with make_session() as db_session:
        yield db_session
15
+
16
+
17
def get_vector_store(request: Request) -> SqliteVecStore:
    """Return the vector store instance kept on ``request.app.state``."""
    app_state = request.app.state
    return app_state.vector_store
19
+
20
+
21
+ SettingsDep = Annotated[Settings, Depends(get_settings)]
22
+ DbSession = Annotated[AsyncSession, Depends(get_db)]
23
+ VectorStoreDep = Annotated[SqliteVecStore, Depends(get_vector_store)]
24
+
25
+
26
async def get_gateway_user_id(
    x_user_id: Annotated[str | None, Header(alias="x-user-id")] = None,
) -> str:
    """Resolve the caller's user id from the ``x-user-id`` request header.

    Returns the header value with surrounding whitespace removed.

    Raises:
        HTTPException: 401 when the header is absent, empty, or only
            whitespace.
    """
    # Normalise once; a missing header collapses to the empty string.
    user_id = str(x_user_id).strip() if x_user_id else ""
    if not user_id:
        raise HTTPException(status_code=401, detail="Missing x-user-id")
    return user_id
32
+
33
+
34
+ GatewayUserDep = Annotated[str, Depends(get_gateway_user_id)]
@@ -0,0 +1,35 @@
1
+ import logging
2
+ import sys
3
+
4
+ from pythonjsonlogger.json import JsonFormatter
5
+
6
+ from app.middleware.request_id import request_id_ctx
7
+
8
+
9
class RequestIdFilter(logging.Filter):
    """Logging filter that guarantees every record has a ``request_id``."""

    def filter(self, record: logging.LogRecord) -> bool:
        """Attach the current request id to ``record`` and always accept it.

        A ``request_id`` already present on the record is left untouched.
        Otherwise the value is read from ``request_id_ctx``; ``"-"`` is used
        when the context variable holds ``None``.
        """
        if hasattr(record, "request_id"):
            return True
        current = request_id_ctx.get()
        record.request_id = "-" if current is None else current
        return True
15
+
16
+
17
def configure_logging(*, log_level: str, log_json: bool) -> None:
    """Reset the root logger to a single stdout handler.

    Args:
        log_level: Level name; it is upper-cased before being applied to the
            root logger.
        log_json: When true, records are formatted with ``JsonFormatter``;
            otherwise a plain-text ``logging.Formatter`` is used.

    Existing root handlers are removed first. The new handler carries a
    ``RequestIdFilter`` so that ``%(request_id)s`` in both format strings
    always resolves.
    """
    root_logger = logging.getLogger()
    root_logger.handlers.clear()
    root_logger.setLevel(log_level.upper())

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.addFilter(RequestIdFilter())

    if not log_json:
        plain_format = (
            "%(asctime)s %(levelname)s %(name)s %(message)s [request_id=%(request_id)s]"
        )
        stdout_handler.setFormatter(logging.Formatter(plain_format))
    else:
        stdout_handler.setFormatter(
            JsonFormatter(
                "%(asctime)s %(levelname)s %(name)s %(message)s %(request_id)s",
            )
        )
    root_logger.addHandler(stdout_handler)
@@ -0,0 +1,97 @@
1
+ from contextlib import asynccontextmanager
2
+
3
+ from fastapi import FastAPI
4
+ from sqlalchemy import event
5
+ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
6
+
7
+ from app.config import get_settings
8
+ from app.connectors.registry import init_connectors
9
+ from app.logging_config import configure_logging
10
+ from app.middleware.gateway_identity import GatewayIdentityMiddleware
11
+ from app.middleware.openapi_gateway import OpenApiGatewayMiddleware
12
+ from app.middleware.request_id import RequestIdMiddleware
13
+ from app.openapi_config import APP_DESCRIPTION, OPENAPI_TAGS, attach_custom_openapi
14
+ from app.routers import (
15
+ admin_pipeline,
16
+ chat,
17
+ chunks,
18
+ entity_extract,
19
+ example,
20
+ gemini_embed,
21
+ health,
22
+ ingestion,
23
+ link_expansion,
24
+ pipeline_status,
25
+ query,
26
+ vectors,
27
+ )
28
+ from app.sqlite_ext import load_sqlite_vec_extension, unwrap_sqlite3_connection
29
+ from app.vectorstore import SqliteVecStore
30
+
31
+
32
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: set up logging, storage dirs, and the DB engine.

    Startup:
      * configures logging from settings,
      * creates ``data_dir`` and ``data_dir / "blobs"`` if missing,
      * initialises connectors,
      * creates the async engine and registers a per-connection hook,
      * stores the engine, vector store, and session factory on ``app.state``.

    Shutdown:
      * disposes the engine. This runs in a ``finally`` block so the engine is
        released even when an exception is thrown into the generator at
        ``yield`` or when setup after engine creation fails.
    """
    settings = get_settings()
    configure_logging(log_level=settings.log_level, log_json=settings.log_json)
    settings.data_dir.mkdir(parents=True, exist_ok=True)
    (settings.data_dir / "blobs").mkdir(parents=True, exist_ok=True)
    init_connectors()
    engine = create_async_engine(
        settings.sqlalchemy_database_url,
        pool_pre_ping=True,
    )

    @event.listens_for(engine.sync_engine, "connect")
    def _sqlite_on_connect(dbapi_connection, _connection_record) -> None:
        # Runs for every new DBAPI connection: apply the PRAGMAs, then load
        # the sqlite-vec extension on that connection.
        raw = unwrap_sqlite3_connection(dbapi_connection)
        cursor = raw.cursor()
        try:
            cursor.execute("PRAGMA foreign_keys=ON")
            cursor.execute("PRAGMA busy_timeout=10000")
        finally:
            # Close the cursor even if a PRAGMA fails.
            cursor.close()
        load_sqlite_vec_extension(dbapi_connection)

    try:
        app.state.engine = engine
        app.state.vector_store = SqliteVecStore(
            engine,
            dimension=settings.vector_embedding_dim,
        )
        app.state.session_factory = async_sessionmaker(
            engine,
            class_=AsyncSession,
            expire_on_commit=False,
        )
        yield
    finally:
        await engine.dispose()
65
+
66
+
67
def create_app() -> FastAPI:
    """Build the FastAPI application with OpenAPI config, middleware, and routers.

    Middleware and routers are registered in a fixed order; the tuples below
    keep that order explicit.
    """
    application = FastAPI(
        title="Business Stack API",
        description=APP_DESCRIPTION,
        version="0.1.0",
        openapi_tags=OPENAPI_TAGS,
        docs_url="/docs",
        redoc_url="/redoc",
        openapi_url="/openapi.json",
        lifespan=lifespan,
    )
    attach_custom_openapi(application)

    # Registration order matters; OpenApiGatewayMiddleware is added last.
    middleware_stack = (
        GatewayIdentityMiddleware,
        RequestIdMiddleware,
        OpenApiGatewayMiddleware,
    )
    for middleware_cls in middleware_stack:
        application.add_middleware(middleware_cls)

    router_modules = (
        health,
        admin_pipeline,
        ingestion,
        pipeline_status,
        link_expansion,
        chunks,
        gemini_embed,
        entity_extract,
        vectors,
        query,
        chat,
        example,
    )
    for module in router_modules:
        application.include_router(module.router)
    return application
95
+
96
+
97
+ app = create_app()
File without changes
@@ -0,0 +1,17 @@
1
+ from starlette.middleware.base import BaseHTTPMiddleware
2
+ from starlette.requests import Request
3
+ from starlette.responses import JSONResponse, Response
4
+
5
+ _UNAUTH_PATHS = frozenset({"/healthz", "/readyz"})
6
+
7
+
8
class GatewayIdentityMiddleware(BaseHTTPMiddleware):
    """Reject requests that lack an ``x-user-id`` header.

    Paths listed in ``_UNAUTH_PATHS`` are exempt and always passed through.
    """

    async def dispatch(self, request: Request, call_next) -> Response:
        """Forward the request, or answer 401 when no user id header is set."""
        exempt = request.url.path in _UNAUTH_PATHS
        if exempt or request.headers.get("x-user-id"):
            return await call_next(request)
        return JSONResponse(
            status_code=401,
            content={"detail": "Unauthorized"},
        )
@@ -0,0 +1,71 @@
1
+ """ASGI middleware: OpenAPI UIs only via trusted gateway + correct URL prefix for Swagger/ReDoc."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+
7
+ from starlette.types import ASGIApp, Receive, Scope, Send
8
+
9
+ from app.config import get_settings
10
+
11
+
12
+ def _is_openapi_public_path(path: str) -> bool:
13
+ if path == "/openapi.json" or path == "/redoc" or path == "/docs":
14
+ return True
15
+ return path.startswith("/docs/")
16
+
17
+
18
+ def _decode_headers(scope: dict) -> dict[str, str]:
19
+ raw = scope.get("headers") or []
20
+ out: dict[str, str] = {}
21
+ for k, v in raw:
22
+ key = k.decode("latin-1").lower()
23
+ out[key] = v.decode("latin-1")
24
+ return out
25
+
26
+
27
class OpenApiGatewayMiddleware:
    """Runs outermost: sets ASGI root_path from X-Forwarded-Prefix; gates OpenAPI routes on X-Gateway-Secret."""

    def __init__(self, app: ASGIApp):
        """Wrap ``app``, the next ASGI application in the chain."""
        self.app = app

    @staticmethod
    def _secret_matches(provided: str | None, expected: str) -> bool:
        """Compare the presented gateway secret with the configured one.

        Uses ``hmac.compare_digest`` so the comparison time does not reveal
        how many leading characters matched. Both sides are encoded to UTF-8
        bytes because ``compare_digest`` only accepts ASCII-only ``str``.
        A missing header is treated as the empty string.
        """
        import hmac  # function-scope import: keeps this block self-contained

        return hmac.compare_digest(
            (provided or "").encode("utf-8"),
            expected.encode("utf-8"),
        )

    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
        """Handle one ASGI event stream.

        Non-HTTP scopes are passed through unchanged. For HTTP:

        * If a gateway secret is configured and the path is an OpenAPI route,
          a request without a matching ``x-gateway-secret`` header gets a 403
          JSON response, and the wrapped app is not called.
        * If ``x-forwarded-prefix`` is present, the wrapped app receives a
          copy of the scope whose ``root_path`` is that prefix, with trailing
          slashes removed.
        """
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return

        settings = get_settings()
        path = scope["path"]
        headers = _decode_headers(scope)

        # An unset or blank secret disables the gate entirely.
        secret = (settings.backend_gateway_secret or "").strip()
        if secret and _is_openapi_public_path(path):
            if not self._secret_matches(headers.get("x-gateway-secret"), secret):
                body = json.dumps(
                    {
                        "detail": (
                            "OpenAPI documentation is only available through the API gateway "
                            "with a valid X-Gateway-Secret."
                        ),
                    },
                ).encode("utf-8")
                await send(
                    {
                        "type": "http.response.start",
                        "status": 403,
                        "headers": [
                            (b"content-type", b"application/json"),
                            (b"content-length", str(len(body)).encode("ascii")),
                        ],
                    },
                )
                await send({"type": "http.response.body", "body": body})
                return

        prefix = (headers.get("x-forwarded-prefix") or "").strip().rstrip("/")
        new_scope = scope
        if prefix:
            # Copy rather than mutate the incoming scope.
            new_scope = {**scope, "root_path": prefix}

        await self.app(new_scope, receive, send)
@@ -0,0 +1,23 @@
1
+ from contextvars import ContextVar
2
+ from uuid import uuid4
3
+
4
+ from starlette.middleware.base import BaseHTTPMiddleware
5
+ from starlette.requests import Request
6
+ from starlette.responses import Response
7
+
8
+ REQUEST_ID_HEADER = "X-Request-ID"
9
+
10
+ request_id_ctx: ContextVar[str | None] = ContextVar("request_id", default=None)
11
+
12
+
13
class RequestIdMiddleware(BaseHTTPMiddleware):
    """Bind a request id to ``request_id_ctx`` and echo it on the response."""

    async def dispatch(self, request: Request, call_next) -> Response:
        """Process one request under a request id.

        The id comes from the incoming ``x-request-id`` header when it is
        non-empty; otherwise a fresh UUID4 string is generated. The context
        variable is reset once handling ends, whether or not it succeeded.
        """
        request_id = request.headers.get("x-request-id") or str(uuid4())
        ctx_token = request_id_ctx.set(request_id)
        try:
            downstream_response = await call_next(request)
            downstream_response.headers[REQUEST_ID_HEADER] = request_id
            return downstream_response
        finally:
            request_id_ctx.reset(ctx_token)
@@ -0,0 +1,126 @@
1
+ """OpenAPI (Swagger) metadata and schema customization for the FastAPI app."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from fastapi import FastAPI
9
+
10
+ from fastapi.openapi.utils import get_openapi
11
+
12
+ # Must stay in sync with _UNAUTH_PATHS in app/middleware/gateway_identity.py
13
+ _UNAUTHENTICATED_PATHS = frozenset({"/healthz", "/readyz"})
14
+
15
+ OPENAPI_TAGS: list[dict[str, str]] = [
16
+ {
17
+ "name": "health",
18
+ "description": "Liveness (`/healthz`) and readiness (`/readyz`) probes. No gateway identity header required.",
19
+ },
20
+ {
21
+ "name": "ingestion",
22
+ "description": "Ingest raw connector payloads, normalize to canonical documents, and persist blobs.",
23
+ },
24
+ {
25
+ "name": "admin",
26
+ "description": "Operational endpoints: retry embedding, re-run normalization, clear DLQ state.",
27
+ },
28
+ {
29
+ "name": "query",
30
+ "description": "RAG retrieval: embed the question (Gemini), search sqlite-vec, return ranked context.",
31
+ },
32
+ {
33
+ "name": "chat",
34
+ "description": (
35
+ "Saved chat sessions (SQLite): list/create sessions, load messages, "
36
+ "complete a turn (RAG + Gemini answer)."
37
+ ),
38
+ },
39
+ {
40
+ "name": "embeddings",
41
+ "description": "Trigger or inspect Gemini embedding jobs for ingested documents.",
42
+ },
43
+ {
44
+ "name": "chunks",
45
+ "description": "Chunk listing and related ingest-time chunk operations.",
46
+ },
47
+ {
48
+ "name": "entities",
49
+ "description": "Entity extraction over document chunks.",
50
+ },
51
+ {
52
+ "name": "link-expansion",
53
+ "description": "Configurable URL fetching and link expansion during ingest.",
54
+ },
55
+ {
56
+ "name": "vectors",
57
+ "description": "Direct vector store utilities (debugging and advanced use).",
58
+ },
59
+ {
60
+ "name": "example",
61
+ "description": "Sample routes for development.",
62
+ },
63
+ ]
64
+
65
+ APP_DESCRIPTION = """Internal HTTP API for the Business Stack knowledge base (RAG).
66
+
67
+ **How to open these docs:** use the Hono gateway URL, e.g. `{gateway}{prefix}/docs` (same prefix as `API_GATEWAY_PREFIX`, default `/api/backend`). The gateway requires a signed-in Better Auth session and injects `X-User-Id` plus `X-Forwarded-Prefix` so Swagger loads `/openapi.json` correctly.
68
+
69
+ **Direct backend access:** if `BACKEND_GATEWAY_SECRET` is set on the backend and gateway, `/docs`, `/redoc`, and `/openapi.json` return **403** unless the gateway adds matching `X-Gateway-Secret` (clients cannot spoof docs by calling the backend with only `x-user-id`). Leave the secret unset in local dev if you want OpenAPI on `http://127.0.0.1:8000/docs` without the gateway.
70
+
71
+ **Gateway identity:** every route except `/healthz` and `/readyz` requires `x-user-id`. The gateway adds it after Better Auth session validation, so **Try it out** works with your login cookies and you do not need **Authorize** unless you call the raw backend URL.
72
+
73
+ **External services:** embeddings and query use **Gemini**. The key may be `GEMINI_API_KEY` or stored in gateway integration settings; the backend uses `INTEGRATIONS_GATEWAY_URL` and `INTEGRATIONS_INTERNAL_SECRET` when env is unset.
74
+ Optional OpenAI is only for LLM-assisted chunking, not `/query`.
75
+ """
76
+
77
+
78
def attach_custom_openapi(app: FastAPI) -> None:
    """Install a custom OpenAPI schema generator on ``app``.

    The generator adds a ``GatewayIdentity`` API-key security scheme (header
    ``x-user-id``) and sets a ``security`` entry on every operation: empty
    for paths in ``_UNAUTHENTICATED_PATHS``, and ``GatewayIdentity`` for all
    others. The generated schema is cached on ``app.openapi_schema``.
    """
    http_methods = frozenset(
        {"get", "post", "put", "patch", "delete", "head", "options"}
    )

    def custom_openapi() -> dict:
        # Serve the cached schema when one has already been built.
        cached = app.openapi_schema
        if cached:
            return cached

        schema = get_openapi(
            title=app.title,
            version=app.version,
            description=app.description,
            routes=app.routes,
            tags=OPENAPI_TAGS,
        )

        schemes = schema.setdefault("components", {}).setdefault(
            "securitySchemes", {}
        )
        schemes["GatewayIdentity"] = {
            "type": "apiKey",
            "in": "header",
            "name": "x-user-id",
            "description": (
                "End-user id propagated by the API gateway. Required on all routes except "
                "`GET /healthz` and `GET /readyz`."
            ),
        }

        for route_path, path_item in schema.get("paths", {}).items():
            if not isinstance(path_item, dict):
                continue
            is_open = route_path in _UNAUTHENTICATED_PATHS
            for method, operation in path_item.items():
                # Skip non-operation keys of the path item.
                if method not in http_methods or not isinstance(operation, dict):
                    continue
                # Build a fresh list per operation so no two operations share one.
                operation["security"] = [] if is_open else [{"GatewayIdentity": []}]

        app.openapi_schema = schema
        return schema

    app.openapi = custom_openapi  # type: ignore[method-assign]
File without changes
@@ -0,0 +1,123 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from datetime import UTC, datetime
5
+
6
+ from fastapi import APIRouter, BackgroundTasks, HTTPException, Request
7
+ from pydantic import BaseModel, ConfigDict, Field
8
+ from sqlalchemy import text
9
+
10
+ from app.config import get_settings
11
+ from app.dependencies import DbSession, VectorStoreDep
12
+ from app.services.embeddings.dlq import (
13
+ cleaned_ingest_meta_dict,
14
+ clear_embedding_dlq,
15
+ merge_ingest_meta,
16
+ )
17
+ from app.services.embeddings.worker import run_embed_document_job
18
+ from app.services.ingestion.persist import (
19
+ clear_normalization_error,
20
+ load_raw_envelope_for_document,
21
+ )
22
+ from app.services.integrations_remote import resolve_gemini_api_key
23
+ from app.services.normalization import (
24
+ normalize_envelope_to_canonical,
25
+ persist_normalized_document,
26
+ )
27
+ from app.storage.blobs import BlobStore
28
+
29
+ router = APIRouter(prefix="/admin", tags=["admin"])
30
+
31
+
32
class RetryEmbedBody(BaseModel):
    """Request body for the admin retry-embedding endpoint."""

    # Unknown fields in the request body are rejected.
    model_config = ConfigDict(extra="forbid")

    multimodal: bool = Field(
        description="Same flag as POST /ingest/documents/{id}/embed",
        default=False,
    )
39
+
40
+
41
@router.post("/documents/{document_id}/retry-embedding")
async def admin_retry_embedding(
    document_id: str,
    body: RetryEmbedBody,
    session: DbSession,
    request: Request,
    store: VectorStoreDep,
    background_tasks: BackgroundTasks,
) -> dict[str, str | bool]:
    """
    Reset a document's embedding state and queue a new embed job.

    Inside one transaction: clears the document's DLQ state, rewrites
    ``ingest_meta`` from its cleaned form plus an ``embedding_admin_retry_at``
    timestamp, and sets ``status`` to ``partial``. After the transaction,
    ``run_embed_document_job`` is scheduled as a background task.

    Raises ``HTTPException`` 503 when no Gemini API key can be resolved, and
    404 when the document does not exist.
    """
    settings = get_settings()
    gemini_key = await resolve_gemini_api_key(settings)
    if not gemini_key:
        raise HTTPException(
            status_code=503,
            detail=(
                "Gemini API key is not configured "
                "(GEMINI_API_KEY or gateway integration store)"
            ),
        )

    async with session.begin():
        exists = await session.execute(
            text("SELECT 1 FROM documents WHERE id = :id LIMIT 1"),
            {"id": document_id},
        )
        if exists.first() is None:
            raise HTTPException(status_code=404, detail="Document not found")

        await clear_embedding_dlq(session, document_id=document_id)

        meta_result = await session.execute(
            text("SELECT ingest_meta FROM documents WHERE id = :id LIMIT 1"),
            {"id": document_id},
        )
        cleaned_meta = cleaned_ingest_meta_dict(meta_result.scalar_one_or_none())
        serialized_meta = json.dumps(cleaned_meta, default=str)
        retry_stamp = {"embedding_admin_retry_at": datetime.now(UTC).isoformat()}
        updated_meta = merge_ingest_meta(serialized_meta, retry_stamp)

        await session.execute(
            text(
                "UPDATE documents SET status = 'partial', ingest_meta = :im "
                "WHERE id = :id",
            ),
            {"im": updated_meta, "id": document_id},
        )

    # Scheduled only after the transaction block has exited.
    background_tasks.add_task(
        run_embed_document_job,
        document_id=document_id,
        multimodal=body.multimodal,
        session_factory=request.app.state.session_factory,
        settings=settings,
        store=store,
    )
    return {"accepted": True, "document_id": document_id}
98
+
99
+
100
@router.post("/documents/{document_id}/reprocess-normalization")
async def admin_reprocess_normalization(
    document_id: str,
    session: DbSession,
) -> dict[str, str | bool]:
    """Re-run normalization for one document from its stored raw envelope.

    Inside one transaction: loads the raw envelope, normalizes it to the
    canonical form, persists the result, and clears the document's
    normalization error.

    Raises ``HTTPException`` 404 when the document does not exist.
    """
    blobs_dir = get_settings().data_dir / "blobs"
    blob_store = BlobStore(blobs_dir)

    async with session.begin():
        exists = await session.execute(
            text("SELECT 1 FROM documents WHERE id = :id LIMIT 1"),
            {"id": document_id},
        )
        if exists.first() is None:
            raise HTTPException(status_code=404, detail="Document not found")

        envelope = await load_raw_envelope_for_document(
            session,
            document_id=document_id,
        )
        canonical_doc = normalize_envelope_to_canonical(
            document_id=document_id,
            envelope=envelope,
            blob_store=blob_store,
        )
        await persist_normalized_document(session, canonical=canonical_doc)
        await clear_normalization_error(session, document_id=document_id)

    return {"ok": True, "document_id": document_id}