business-stack 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.python-version +1 -0
- package/backend/.env.example +65 -0
- package/backend/alembic/env.py +63 -0
- package/backend/alembic/script.py.mako +26 -0
- package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
- package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
- package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
- package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
- package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
- package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
- package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
- package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
- package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
- package/backend/alembic.ini +42 -0
- package/backend/app/__init__.py +0 -0
- package/backend/app/config.py +337 -0
- package/backend/app/connectors/__init__.py +13 -0
- package/backend/app/connectors/base.py +39 -0
- package/backend/app/connectors/builtins.py +51 -0
- package/backend/app/connectors/playwright_session.py +146 -0
- package/backend/app/connectors/registry.py +68 -0
- package/backend/app/connectors/thread_expansion/__init__.py +33 -0
- package/backend/app/connectors/thread_expansion/fakes.py +154 -0
- package/backend/app/connectors/thread_expansion/models.py +113 -0
- package/backend/app/connectors/thread_expansion/reddit.py +53 -0
- package/backend/app/connectors/thread_expansion/twitter.py +49 -0
- package/backend/app/db.py +5 -0
- package/backend/app/dependencies.py +34 -0
- package/backend/app/logging_config.py +35 -0
- package/backend/app/main.py +97 -0
- package/backend/app/middleware/__init__.py +0 -0
- package/backend/app/middleware/gateway_identity.py +17 -0
- package/backend/app/middleware/openapi_gateway.py +71 -0
- package/backend/app/middleware/request_id.py +23 -0
- package/backend/app/openapi_config.py +126 -0
- package/backend/app/routers/__init__.py +0 -0
- package/backend/app/routers/admin_pipeline.py +123 -0
- package/backend/app/routers/chat.py +206 -0
- package/backend/app/routers/chunks.py +36 -0
- package/backend/app/routers/entity_extract.py +31 -0
- package/backend/app/routers/example.py +8 -0
- package/backend/app/routers/gemini_embed.py +58 -0
- package/backend/app/routers/health.py +28 -0
- package/backend/app/routers/ingestion.py +146 -0
- package/backend/app/routers/link_expansion.py +34 -0
- package/backend/app/routers/pipeline_status.py +304 -0
- package/backend/app/routers/query.py +63 -0
- package/backend/app/routers/vectors.py +63 -0
- package/backend/app/schemas/__init__.py +0 -0
- package/backend/app/schemas/canonical.py +44 -0
- package/backend/app/schemas/chat.py +50 -0
- package/backend/app/schemas/ingest.py +29 -0
- package/backend/app/schemas/query.py +153 -0
- package/backend/app/schemas/vectors.py +56 -0
- package/backend/app/services/__init__.py +0 -0
- package/backend/app/services/chat_store.py +152 -0
- package/backend/app/services/chunking/__init__.py +3 -0
- package/backend/app/services/chunking/llm_boundaries.py +63 -0
- package/backend/app/services/chunking/schemas.py +30 -0
- package/backend/app/services/chunking/semantic_chunk.py +178 -0
- package/backend/app/services/chunking/splitters.py +214 -0
- package/backend/app/services/embeddings/__init__.py +20 -0
- package/backend/app/services/embeddings/build_inputs.py +140 -0
- package/backend/app/services/embeddings/dlq.py +128 -0
- package/backend/app/services/embeddings/gemini_api.py +207 -0
- package/backend/app/services/embeddings/persist.py +74 -0
- package/backend/app/services/embeddings/types.py +32 -0
- package/backend/app/services/embeddings/worker.py +224 -0
- package/backend/app/services/entities/__init__.py +12 -0
- package/backend/app/services/entities/gliner_extract.py +63 -0
- package/backend/app/services/entities/llm_extract.py +94 -0
- package/backend/app/services/entities/pipeline.py +179 -0
- package/backend/app/services/entities/spacy_extract.py +63 -0
- package/backend/app/services/entities/types.py +15 -0
- package/backend/app/services/gemini_chat.py +113 -0
- package/backend/app/services/hooks/__init__.py +3 -0
- package/backend/app/services/hooks/post_ingest.py +186 -0
- package/backend/app/services/ingestion/__init__.py +0 -0
- package/backend/app/services/ingestion/persist.py +188 -0
- package/backend/app/services/integrations_remote.py +91 -0
- package/backend/app/services/link_expansion/__init__.py +3 -0
- package/backend/app/services/link_expansion/canonical_url.py +45 -0
- package/backend/app/services/link_expansion/domain_policy.py +26 -0
- package/backend/app/services/link_expansion/html_extract.py +72 -0
- package/backend/app/services/link_expansion/rate_limit.py +32 -0
- package/backend/app/services/link_expansion/robots.py +46 -0
- package/backend/app/services/link_expansion/schemas.py +67 -0
- package/backend/app/services/link_expansion/worker.py +458 -0
- package/backend/app/services/normalization/__init__.py +7 -0
- package/backend/app/services/normalization/normalizer.py +331 -0
- package/backend/app/services/normalization/persist_normalized.py +67 -0
- package/backend/app/services/playwright_extract/__init__.py +13 -0
- package/backend/app/services/playwright_extract/__main__.py +96 -0
- package/backend/app/services/playwright_extract/extract.py +181 -0
- package/backend/app/services/retrieval_service.py +351 -0
- package/backend/app/sqlite_ext.py +36 -0
- package/backend/app/storage/__init__.py +3 -0
- package/backend/app/storage/blobs.py +30 -0
- package/backend/app/vectorstore/__init__.py +13 -0
- package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
- package/backend/backend.egg-info/PKG-INFO +18 -0
- package/backend/backend.egg-info/SOURCES.txt +93 -0
- package/backend/backend.egg-info/dependency_links.txt +1 -0
- package/backend/backend.egg-info/entry_points.txt +2 -0
- package/backend/backend.egg-info/requires.txt +15 -0
- package/backend/backend.egg-info/top_level.txt +4 -0
- package/backend/package.json +15 -0
- package/backend/pyproject.toml +52 -0
- package/backend/tests/conftest.py +40 -0
- package/backend/tests/test_chat.py +92 -0
- package/backend/tests/test_chunking.py +132 -0
- package/backend/tests/test_entities.py +170 -0
- package/backend/tests/test_gemini_embed.py +224 -0
- package/backend/tests/test_health.py +24 -0
- package/backend/tests/test_ingest_raw.py +123 -0
- package/backend/tests/test_link_expansion.py +241 -0
- package/backend/tests/test_main.py +12 -0
- package/backend/tests/test_normalizer.py +114 -0
- package/backend/tests/test_openapi_gateway.py +40 -0
- package/backend/tests/test_pipeline_hardening.py +285 -0
- package/backend/tests/test_pipeline_status.py +71 -0
- package/backend/tests/test_playwright_extract.py +80 -0
- package/backend/tests/test_post_ingest_hooks.py +162 -0
- package/backend/tests/test_query.py +165 -0
- package/backend/tests/test_thread_expansion.py +72 -0
- package/backend/tests/test_vectors.py +85 -0
- package/backend/uv.lock +1839 -0
- package/bin/business-stack.cjs +412 -0
- package/frontend/web/.env.example +23 -0
- package/frontend/web/AGENTS.md +5 -0
- package/frontend/web/CLAUDE.md +1 -0
- package/frontend/web/README.md +36 -0
- package/frontend/web/components.json +25 -0
- package/frontend/web/next-env.d.ts +6 -0
- package/frontend/web/next.config.ts +30 -0
- package/frontend/web/package.json +65 -0
- package/frontend/web/postcss.config.mjs +7 -0
- package/frontend/web/skills-lock.json +35 -0
- package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
- package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
- package/frontend/web/src/app/chat/page.tsx +725 -0
- package/frontend/web/src/app/favicon.ico +0 -0
- package/frontend/web/src/app/globals.css +563 -0
- package/frontend/web/src/app/layout.tsx +50 -0
- package/frontend/web/src/app/page.tsx +96 -0
- package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
- package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
- package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
- package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
- package/frontend/web/src/components/home-auth-panel.tsx +49 -0
- package/frontend/web/src/components/providers.tsx +50 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
- package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
- package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
- package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
- package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
- package/frontend/web/src/lib/auth-client.ts +23 -0
- package/frontend/web/src/lib/integrations-config.ts +125 -0
- package/frontend/web/src/lib/ui-utills.tsx +90 -0
- package/frontend/web/src/lib/utils.ts +6 -0
- package/frontend/web/tsconfig.json +36 -0
- package/frontend/web/tsconfig.tsbuildinfo +1 -0
- package/frontend/web/vitest.config.ts +14 -0
- package/gateway/.env.example +23 -0
- package/gateway/README.md +13 -0
- package/gateway/package.json +24 -0
- package/gateway/src/auth.ts +49 -0
- package/gateway/src/index.ts +141 -0
- package/gateway/src/integrations/admin.ts +19 -0
- package/gateway/src/integrations/crypto.ts +52 -0
- package/gateway/src/integrations/handlers.ts +124 -0
- package/gateway/src/integrations/keys.ts +12 -0
- package/gateway/src/integrations/store.ts +106 -0
- package/gateway/src/stack-secrets.ts +35 -0
- package/gateway/tsconfig.json +13 -0
- package/package.json +33 -0
- package/turbo.json +27 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""Gemini generateContent for chat replies over retrieved context."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import httpx
|
|
9
|
+
|
|
10
|
+
from app.config import Settings
|
|
11
|
+
from app.services.integrations_remote import resolve_gemini_api_key
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
_GEMINI_BASE = "https://generativelanguage.googleapis.com/v1beta"
|
|
16
|
+
_MAX_CONTEXT_CHARS = 100_000
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _fallback_from_context(context_text: str) -> str:
|
|
20
|
+
t = context_text.strip()
|
|
21
|
+
if not t:
|
|
22
|
+
return (
|
|
23
|
+
"No matching passages were found in the knowledge base for this question."
|
|
24
|
+
)
|
|
25
|
+
head = t[:8000]
|
|
26
|
+
suffix = "\n\n…(truncated)" if len(t) > 8000 else ""
|
|
27
|
+
return (
|
|
28
|
+
"Here are the most relevant passages from your knowledge base "
|
|
29
|
+
"(chat model unavailable or failed):\n\n"
|
|
30
|
+
f"{head}{suffix}"
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _extract_text(data: dict[str, Any]) -> str | None:
|
|
35
|
+
cands = data.get("candidates")
|
|
36
|
+
if not isinstance(cands, list) or not cands:
|
|
37
|
+
return None
|
|
38
|
+
first = cands[0]
|
|
39
|
+
if not isinstance(first, dict):
|
|
40
|
+
return None
|
|
41
|
+
content = first.get("content")
|
|
42
|
+
if not isinstance(content, dict):
|
|
43
|
+
return None
|
|
44
|
+
parts = content.get("parts")
|
|
45
|
+
if not isinstance(parts, list):
|
|
46
|
+
return None
|
|
47
|
+
texts: list[str] = []
|
|
48
|
+
for p in parts:
|
|
49
|
+
if isinstance(p, dict) and isinstance(p.get("text"), str):
|
|
50
|
+
texts.append(p["text"])
|
|
51
|
+
out = "\n".join(texts).strip()
|
|
52
|
+
return out or None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
async def gemini_chat_reply(
    settings: Settings,
    *,
    user_message: str,
    context_combined: str,
) -> tuple[str, str]:
    """
    Answer ``user_message`` with Gemini, grounded on ``context_combined``.

    Returns (reply, source) where source is ``model`` or ``retrieval_fallback``.
    The fallback (the retrieved passages themselves) is used when no API key
    can be resolved, the HTTP call fails, the status is not 200, the body is
    not JSON, or no text can be extracted from the response.

    The context is capped at ``_MAX_CONTEXT_CHARS`` characters before it is
    embedded in the system instruction.
    """
    ctx = context_combined[:_MAX_CONTEXT_CHARS]
    api_key = await resolve_gemini_api_key(settings)
    if not api_key:
        return _fallback_from_context(ctx), "retrieval_fallback"

    model = (settings.gemini_chat_model or "gemini-3-flash-preview").strip()
    url = f"{_GEMINI_BASE}/models/{model}:generateContent"
    system_text = (
        "You are a helpful assistant. Answer using ONLY the context below. "
        "If the context does not contain the answer, say so clearly and suggest "
        "what might be missing. Be concise.\n\n"
        f"--- Context ---\n{ctx}\n--- End context ---"
    )
    body: dict[str, Any] = {
        "systemInstruction": {"parts": [{"text": system_text}]},
        "contents": [
            {
                "role": "user",
                "parts": [{"text": user_message}],
            },
        ],
    }
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            r = await client.post(
                url,
                # Send the key as a header rather than ``?key=...`` so the
                # secret is not part of the request URL, which HTTP client
                # and proxy logs may record.
                headers={"x-goog-api-key": api_key},
                json=body,
            )
    except httpx.HTTPError:
        logger.exception("gemini chat HTTP error")
        return _fallback_from_context(ctx), "retrieval_fallback"

    if r.status_code != 200:
        # Only a short prefix of the error body is logged.
        logger.warning(
            "gemini chat HTTP %s: %s",
            r.status_code,
            (r.text or "")[:300],
        )
        return _fallback_from_context(ctx), "retrieval_fallback"

    try:
        data = r.json()
    except ValueError:
        return _fallback_from_context(ctx), "retrieval_fallback"

    text = _extract_text(data) if isinstance(data, dict) else None
    if not text:
        return _fallback_from_context(ctx), "retrieval_fallback"
    return text, "model"
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Post-ingest notifications when a document reaches ``status = 'ok'`` (after embed).
|
|
3
|
+
|
|
4
|
+
Templates are Python ``str.format`` strings. Available placeholders:
|
|
5
|
+
|
|
6
|
+
- ``document_id`` — KB document id
|
|
7
|
+
- ``summary`` — full stored summary (may be long)
|
|
8
|
+
- ``summary_short`` — summary truncated to ``post_ingest_summary_max_chars``
|
|
9
|
+
- ``source_link`` — ``canonical_url``, else first link row, else ``(none)``
|
|
10
|
+
- ``canonical_url`` — raw column or empty string
|
|
11
|
+
- ``source_name`` — ``sources.name``
|
|
12
|
+
|
|
13
|
+
Use ``{{`` and ``}}`` for literal braces in templates.
|
|
14
|
+
|
|
15
|
+
**Logging**: webhook URLs and payloads are never written to logs. Failures log only
|
|
16
|
+
the destination label (``slack`` / ``discord``) and HTTP status or exception type.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import logging
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
import httpx
|
|
25
|
+
from sqlalchemy import text
|
|
26
|
+
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
|
|
27
|
+
|
|
28
|
+
from app.config import Settings
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class _FormatMapping(dict[str, Any]):
|
|
34
|
+
"""Missing keys render as ``{key}`` instead of raising ``KeyError``."""
|
|
35
|
+
|
|
36
|
+
def __missing__(self, key: str) -> str:
|
|
37
|
+
return "{" + key + "}"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _apply_template(template: str, ctx: dict[str, Any]) -> str:
    """Render ``template`` with ``str.format_map`` over ``ctx``.

    Unknown placeholders are left in place by ``_FormatMapping``. If the
    template itself is malformed, the unrendered template is returned and
    only the exception type is logged.

    Besides ``ValueError``/``TypeError`` (bad braces, positional fields, bad
    format specs), attribute and index lookups inside a placeholder such as
    ``{summary.foo}`` or ``{summary[99999]}`` raise ``AttributeError`` /
    ``IndexError`` / ``KeyError``; these are handled the same way so a
    mistyped template cannot abort the hook.
    """
    try:
        return template.format_map(_FormatMapping(**ctx))
    except (ValueError, TypeError, AttributeError, LookupError) as e:
        logger.warning(
            "post-ingest template format failed: %s",
            type(e).__name__,
        )
        return template
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _truncate(s: str | None, max_chars: int) -> str:
|
|
52
|
+
if not s:
|
|
53
|
+
return ""
|
|
54
|
+
t = s.strip()
|
|
55
|
+
if len(t) <= max_chars:
|
|
56
|
+
return t
|
|
57
|
+
return t[: max(0, max_chars - 1)] + "…"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
async def _load_hook_context(
    session: AsyncSession,
    *,
    document_id: str,
    settings: Settings,
) -> dict[str, Any] | None:
    """Collect the template placeholders for one document.

    Returns ``None`` when the document row does not exist or its status is
    not ``ok``. Otherwise returns a dict with ``document_id``, ``summary``,
    ``summary_short``, ``source_link``, ``canonical_url`` and
    ``source_name``. ``source_link`` prefers the stripped canonical URL,
    then the lowest-ordinal row of ``document_links``, then ``(none)``.
    """
    doc_result = await session.execute(
        text(
            "SELECT d.status, d.summary, d.canonical_url, s.name "
            "FROM documents d "
            "JOIN sources s ON s.id = d.source_id "
            "WHERE d.id = :id LIMIT 1",
        ),
        {"id": document_id},
    )
    doc = doc_result.first()
    if doc is None:
        return None
    status, summary, canonical_url, source_name = doc
    if str(status) != "ok":
        return None

    link_result = await session.execute(
        text(
            "SELECT url FROM document_links WHERE document_id = :d "
            "ORDER BY ordinal LIMIT 1",
        ),
        {"d": document_id},
    )
    first_link = link_result.scalar_one_or_none()

    canonical = canonical_url.strip() if canonical_url else ""
    linked = first_link.strip() if first_link else ""
    full_summary = summary or ""

    return {
        "document_id": document_id,
        "summary": full_summary,
        "summary_short": _truncate(
            full_summary,
            settings.post_ingest_summary_max_chars,
        ),
        "source_link": canonical or linked or "(none)",
        "canonical_url": canonical,
        "source_name": str(source_name or ""),
    }
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
async def _post_slack(*, webhook_url: str, text_body: str) -> None:
    """POST ``text_body`` (first 15000 characters) to a Slack webhook.

    Raises ``httpx.HTTPStatusError`` on a 4xx/5xx response; transport
    errors from ``httpx`` propagate unchanged.
    """
    message = {"text": text_body[:15000]}
    async with httpx.AsyncClient(timeout=15.0) as http:
        response = await http.post(webhook_url, json=message)
        response.raise_for_status()
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
async def _post_discord(*, webhook_url: str, content: str) -> None:
    """POST ``content`` (first 2000 characters) to a Discord webhook.

    Raises ``httpx.HTTPStatusError`` on a 4xx/5xx response; transport
    errors from ``httpx`` propagate unchanged.
    """
    message = {"content": content[:2000]}
    async with httpx.AsyncClient(timeout=15.0) as http:
        response = await http.post(webhook_url, json=message)
        response.raise_for_status()
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
async def _send_and_log(label: str, pending: Any, *, document_id: str) -> None:
    """Await one webhook POST and log only the label, status or exception type."""
    try:
        await pending
    except httpx.HTTPStatusError as e:
        logger.warning(
            "post-ingest %s HTTP %s for document_id=%s",
            label,
            e.response.status_code,
            document_id,
        )
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # ``InvalidURL`` is not an ``HTTPError``; without it a malformed
        # webhook URL would propagate and skip the remaining destination.
        logger.warning(
            "post-ingest %s transport %s for document_id=%s",
            label,
            type(e).__name__,
            document_id,
        )
    else:
        logger.info(
            "post-ingest %s dispatch ok for document_id=%s",
            label,
            document_id,
        )


async def dispatch_post_ingest_hooks(
    session_factory: async_sessionmaker[AsyncSession],
    *,
    document_id: str,
    settings: Settings,
) -> None:
    """
    If the document is ``ok``, render templates and POST to configured webhooks.

    Safe to call when no webhooks are configured (no-op). Never logs secrets:
    webhook URLs and rendered bodies are not written to logs, only the
    destination label, HTTP status or exception type, and the document id.

    Each destination is attempted independently, so a failure posting to
    Slack does not prevent the Discord post.
    """
    slack_u = (settings.post_ingest_slack_webhook_url or "").strip()
    discord_u = (settings.post_ingest_discord_webhook_url or "").strip()
    if not slack_u and not discord_u:
        return

    # The session is only needed to load the context; it is closed before
    # any outbound HTTP call is made.
    async with session_factory() as session:
        ctx = await _load_hook_context(
            session,
            document_id=document_id,
            settings=settings,
        )
    if ctx is None:
        logger.debug("post-ingest skipped (missing doc or not ok): %s", document_id)
        return

    if slack_u:
        body = _apply_template(settings.post_ingest_slack_template, ctx)
        await _send_and_log(
            "slack",
            _post_slack(webhook_url=slack_u, text_body=body),
            document_id=document_id,
        )

    if discord_u:
        body = _apply_template(settings.post_ingest_discord_template, ctx)
        await _send_and_log(
            "discord",
            _post_discord(webhook_url=discord_u, content=body),
            document_id=document_id,
        )
|
|
File without changes
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
from uuid import uuid4
|
|
6
|
+
|
|
7
|
+
from sqlalchemy import text
|
|
8
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
9
|
+
|
|
10
|
+
from app.schemas.ingest import RawIngestEnvelope
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def compute_envelope_dedupe_hash(envelope: RawIngestEnvelope) -> str:
    """
    Return the dedupe key used when ``external_id`` is absent.

    A string ``metadata.dedupe_content_hash`` of at least 16 characters
    (after strip/lower) takes precedence and is returned capped at 128
    characters. Otherwise the key is the SHA-256 hex digest of a
    sorted-key JSON dump of source, content type, payload and metadata.
    ``timestamp`` is deliberately left out so a replay with a new event
    time still dedupes.
    """
    override = envelope.metadata.get("dedupe_content_hash")
    if isinstance(override, str):
        normalized = override.strip().lower()
        if len(normalized) >= 16:
            return normalized[:128]

    canonical = json.dumps(
        {
            "source": envelope.source,
            "content_type": envelope.content_type,
            "payload": envelope.payload,
            "metadata": envelope.metadata,
        },
        sort_keys=True,
        default=str,
        separators=(",", ":"),
    )
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def envelope_external_id(envelope: RawIngestEnvelope) -> str | None:
    """Return ``metadata.external_id`` as a stripped string, or ``None``.

    A missing value, ``None``, or a value that is blank after ``str()``
    and stripping all yield ``None``.
    """
    value = envelope.metadata.get("external_id")
    cleaned = "" if value is None else str(value).strip()
    return cleaned if cleaned else None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
async def get_or_create_source_id(
    session: AsyncSession,
    *,
    name: str,
    connector_type: str,
) -> int:
    """Return the id of the ``sources`` row for (name, connector_type).

    Looks the row up first; when none exists, inserts one and returns the
    id produced by ``RETURNING``. The lookup and insert are two separate
    statements on the given session.
    """
    lookup = await session.execute(
        text(
            "SELECT id FROM sources WHERE name = :name AND connector_type = :ct "
            "LIMIT 1",
        ),
        {"name": name, "ct": connector_type},
    )
    found = lookup.first()
    if found is None:
        created = await session.execute(
            text(
                "INSERT INTO sources (name, connector_type) "
                "VALUES (:name, :ct) RETURNING id",
            ),
            {"name": name, "ct": connector_type},
        )
        return int(created.scalar_one())
    return int(found[0])
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
async def find_document_by_dedupe(
    session: AsyncSession,
    *,
    source_id: int,
    external_id: str | None,
    dedupe_hash: str,
) -> str | None:
    """Find an existing document id that duplicates the incoming one.

    When ``external_id`` is given, a document of the same source with that
    external id wins. Failing that (or when no external id is given), a
    document of the same source with the same content hash matches, but
    only among rows whose own ``external_id`` is NULL or blank. Returns
    ``None`` when nothing matches.
    """
    if external_id:
        by_external = await session.execute(
            text(
                "SELECT id FROM documents WHERE source_id = :sid "
                "AND external_id = :eid LIMIT 1",
            ),
            {"sid": source_id, "eid": external_id},
        )
        hit = by_external.first()
        if hit is not None:
            return str(hit[0])

    by_hash = await session.execute(
        text(
            "SELECT id FROM documents WHERE source_id = :sid "
            "AND dedupe_content_hash = :h "
            "AND (external_id IS NULL OR trim(external_id) = '') "
            "LIMIT 1",
        ),
        {"sid": source_id, "h": dedupe_hash},
    )
    hit = by_hash.first()
    if hit is None:
        return None
    return str(hit[0])
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
async def load_raw_envelope_for_document(
    session: AsyncSession,
    *,
    document_id: str,
) -> RawIngestEnvelope:
    """Rebuild the ingest envelope stored in ``documents.raw_content``.

    Raises ``ValueError`` when no document has the given id. The stored
    value is parsed as JSON and validated back into a
    ``RawIngestEnvelope``.
    """
    result = await session.execute(
        text("SELECT raw_content FROM documents WHERE id = :id LIMIT 1"),
        {"id": document_id},
    )
    record = result.first()
    if record is None:
        raise ValueError("document not found")
    decoded = json.loads(str(record[0]))
    return RawIngestEnvelope.model_validate(decoded)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
async def insert_document_partial(
    session: AsyncSession,
    *,
    envelope: RawIngestEnvelope,
    source_id: int,
    external_id: str | None,
    dedupe_content_hash: str,
) -> str:
    """Insert a new ``documents`` row in ``partial`` status and return its id.

    The id is a fresh UUID4 string. The whole envelope is stored as JSON in
    ``raw_content``; ``summary`` and ``normalization_error`` start as NULL.
    """
    doc_id = str(uuid4())
    row_values = {
        "id": doc_id,
        "sid": source_id,
        "ts": envelope.timestamp.isoformat(),
        "ctype": envelope.content_type,
        "raw": json.dumps(envelope.model_dump(mode="json"), default=str),
        "eid": external_id,
        "dhash": dedupe_content_hash,
    }
    insert_stmt = text(
        "INSERT INTO documents "
        "(id, source_id, timestamp, content_type, raw_content, summary, status, "
        "external_id, dedupe_content_hash, normalization_error) "
        "VALUES (:id, :sid, :ts, :ctype, :raw, NULL, 'partial', "
        ":eid, :dhash, NULL)",
    )
    await session.execute(insert_stmt, row_values)
    return doc_id
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
async def persist_raw_envelope(
    session: AsyncSession,
    envelope: RawIngestEnvelope,
) -> tuple[str, bool]:
    """
    Store a raw envelope, or reuse the document it duplicates.

    The envelope's ``source`` is used as both the name and the connector
    type of its ``sources`` row. A document matching on ``source_id`` +
    ``external_id``, or on ``source_id`` + content hash, is reused instead
    of inserting a new one.

    Returns ``(document_id, deduplicated)``.
    """
    source_name = envelope.source
    source_id = await get_or_create_source_id(
        session,
        name=source_name,
        connector_type=source_name,
    )
    external_id = envelope_external_id(envelope)
    content_hash = compute_envelope_dedupe_hash(envelope)

    match_id = await find_document_by_dedupe(
        session,
        source_id=source_id,
        external_id=external_id,
        dedupe_hash=content_hash,
    )
    if match_id is None:
        new_id = await insert_document_partial(
            session,
            envelope=envelope,
            source_id=source_id,
            external_id=external_id,
            dedupe_content_hash=content_hash,
        )
        return new_id, False
    return match_id, True
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
async def clear_normalization_error(session: AsyncSession, *, document_id: str) -> None:
    """Set ``normalization_error`` to NULL for the given document id."""
    reset_stmt = text("UPDATE documents SET normalization_error = NULL WHERE id = :id")
    await session.execute(reset_stmt, {"id": document_id})
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Fetch integration secrets from the Hono gateway (encrypted SQLite)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
import time
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
|
|
12
|
+
from app.config import Settings
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
_TTL_S = 60.0
|
|
17
|
+
_lock = asyncio.Lock()
|
|
18
|
+
_cache_payload: dict[str, Any] | None = None
|
|
19
|
+
_cache_at: float = 0.0
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def clear_remote_integrations_cache() -> None:
    """Test hook: forget the cached GET /internal/integrations response.

    Resets both the cached payload and its timestamp, so the next lookup
    fetches from the gateway again.
    """
    global _cache_payload, _cache_at
    _cache_payload, _cache_at = None, 0.0
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
async def _fetch_integrations_json(settings: Settings) -> dict[str, Any] | None:
|
|
30
|
+
base = (settings.integrations_gateway_url or "").strip().rstrip("/")
|
|
31
|
+
secret = (settings.integrations_internal_secret or "").strip()
|
|
32
|
+
if not base or not secret:
|
|
33
|
+
return None
|
|
34
|
+
url = f"{base}/internal/integrations"
|
|
35
|
+
try:
|
|
36
|
+
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
37
|
+
r = await client.get(
|
|
38
|
+
url,
|
|
39
|
+
headers={"Authorization": f"Bearer {secret}"},
|
|
40
|
+
)
|
|
41
|
+
except httpx.HTTPError:
|
|
42
|
+
logger.warning("integrations gateway request failed: %s", url)
|
|
43
|
+
return None
|
|
44
|
+
if r.status_code != 200:
|
|
45
|
+
logger.warning(
|
|
46
|
+
"integrations gateway HTTP %s for %s",
|
|
47
|
+
r.status_code,
|
|
48
|
+
url,
|
|
49
|
+
)
|
|
50
|
+
return None
|
|
51
|
+
try:
|
|
52
|
+
data = r.json()
|
|
53
|
+
except ValueError:
|
|
54
|
+
return None
|
|
55
|
+
if not isinstance(data, dict):
|
|
56
|
+
return None
|
|
57
|
+
return data
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
async def get_remote_integrations_payload(
    settings: Settings,
) -> dict[str, Any] | None:
    """Return the gateway's plaintext integration dict, cached for ``_TTL_S`` seconds.

    The whole check-and-fetch runs under the module lock. A cached payload
    is reused only while it is non-``None`` and younger than the TTL; a
    ``None`` result is stored but never served from cache, so it is fetched
    again on the next call. The cache timestamp is the monotonic time taken
    before the fetch.
    """
    global _cache_payload, _cache_at
    async with _lock:
        started = time.monotonic()
        cached = _cache_payload
        if cached is not None and (started - _cache_at) < _TTL_S:
            return cached
        fetched = await _fetch_integrations_json(settings)
        _cache_payload, _cache_at = fetched, started
        return fetched
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
async def resolve_gemini_api_key(settings: Settings) -> str | None:
    """
    Resolve the Gemini API key, preferring local configuration.

    A non-blank ``settings.gemini_api_key`` (env ``GEMINI_API_KEY``) wins.
    Otherwise the ``geminiApiKey`` entry of the gateway integration payload
    is used, which is only available when ``INTEGRATIONS_GATEWAY_URL`` and
    ``INTEGRATIONS_INTERNAL_SECRET`` are set. Returns ``None`` when neither
    source yields a non-blank string.
    """
    from_env = (settings.gemini_api_key or "").strip()
    if from_env:
        return from_env

    payload = await get_remote_integrations_payload(settings)
    if not payload:
        return None

    candidate = payload.get("geminiApiKey")
    if isinstance(candidate, str):
        return candidate.strip() or None
    return None
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def host_from_url(url: str) -> str | None:
    """Return the lower-cased host of ``url`` without userinfo or port.

    Returns ``None`` when the URL cannot be parsed or has no host.

    ``urlparse(...).hostname`` is used instead of splitting the netloc on
    ``@`` and ``:`` by hand: manual splitting turned every IPv6 literal such
    as ``http://[::1]:8080/`` into the single character ``[``. IPv6 hosts
    are returned without brackets (``::1``).
    """
    try:
        p = urlparse(url.strip())
    except ValueError:
        return None
    if not p.netloc:
        return None
    # ``hostname`` is already lower-cased; it is None for a netloc with no
    # host part (for example ``:80``).
    return p.hostname or None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def canonicalize_url(url: str) -> str:
    """Normalize URL for dedupe (scheme/host lower, sorted query, no fragment).

    The scheme and netloc are lower-cased, a default port (``:80`` for
    http, ``:443`` for https) is dropped, trailing slashes are removed from
    the path, query pairs are sorted and re-encoded, and path params and
    the fragment are discarded. Input that does not parse, or that has no
    scheme or host, is returned stripped but otherwise unchanged.

    An empty or all-slash path canonicalizes to ``/`` so that
    ``http://h``, ``http://h/`` and ``http://h//`` share one form
    (previously ``//`` collapsed to an empty path).
    """
    raw = url.strip()
    try:
        p = urlparse(raw)
    except ValueError:
        return raw
    if not p.scheme or not p.netloc:
        return raw
    scheme = p.scheme.lower()
    netloc = p.netloc.lower()
    if scheme == "http" and netloc.endswith(":80"):
        netloc = netloc[:-3]
    elif scheme == "https" and netloc.endswith(":443"):
        netloc = netloc[:-4]
    # Strip trailing slashes, falling back to the root path when nothing
    # is left.
    path = p.path.rstrip("/") or "/"
    q = sorted(parse_qsl(p.query, keep_blank_values=True))
    query = urlencode(q)
    return urlunparse((scheme, netloc, path, "", query, ""))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def is_http_url(url: str) -> bool:
    """Return True when ``url`` parses as http or https and has a host part."""
    try:
        parts = urlparse(url.strip())
    except ValueError:
        return False
    if parts.scheme not in ("http", "https"):
        return False
    return bool(parts.netloc)
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def parse_host_csv(csv: str) -> frozenset[str]:
    """Parse a comma-separated host list into a set of lower-cased hosts.

    Blank entries are skipped, and anything from the first ``:`` onward
    (such as a port) is dropped from each entry.
    """
    hosts: set[str] = set()
    for entry in csv.split(","):
        cleaned = entry.strip()
        if cleaned:
            hosts.add(cleaned.lower().split(":")[0])
    return frozenset(hosts)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def host_allowed(
    host: str,
    *,
    allowlist: frozenset[str],
    denylist: frozenset[str],
) -> bool:
    """Decide whether ``host`` may be fetched under the allow/deny lists.

    The host is lower-cased, cut at the first ``:`` and stripped of
    trailing dots before matching, so the fully-qualified form
    ``evil.com.`` cannot slip past a ``evil.com`` denylist entry.

    Rules, in order:

    - an exact denylist match is always refused;
    - an empty allowlist permits every other host;
    - otherwise the host must equal an allowlist entry or be a subdomain
      of one.

    Note that the denylist matches exact hosts only, whereas the allowlist
    also matches subdomains.
    """
    # NOTE(review): cutting at the first ":" mangles bare IPv6 literals
    # such as "::1" -- confirm whether IPv6 hosts need to be supported here.
    h = host.lower().split(":")[0].rstrip(".")
    if h in denylist:
        return False
    if not allowlist:
        return True
    return any(h == allowed or h.endswith(f".{allowed}") for allowed in allowlist)
|