business-stack 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.python-version +1 -0
- package/backend/.env.example +65 -0
- package/backend/alembic/env.py +63 -0
- package/backend/alembic/script.py.mako +26 -0
- package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
- package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
- package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
- package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
- package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
- package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
- package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
- package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
- package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
- package/backend/alembic.ini +42 -0
- package/backend/app/__init__.py +0 -0
- package/backend/app/config.py +337 -0
- package/backend/app/connectors/__init__.py +13 -0
- package/backend/app/connectors/base.py +39 -0
- package/backend/app/connectors/builtins.py +51 -0
- package/backend/app/connectors/playwright_session.py +146 -0
- package/backend/app/connectors/registry.py +68 -0
- package/backend/app/connectors/thread_expansion/__init__.py +33 -0
- package/backend/app/connectors/thread_expansion/fakes.py +154 -0
- package/backend/app/connectors/thread_expansion/models.py +113 -0
- package/backend/app/connectors/thread_expansion/reddit.py +53 -0
- package/backend/app/connectors/thread_expansion/twitter.py +49 -0
- package/backend/app/db.py +5 -0
- package/backend/app/dependencies.py +34 -0
- package/backend/app/logging_config.py +35 -0
- package/backend/app/main.py +97 -0
- package/backend/app/middleware/__init__.py +0 -0
- package/backend/app/middleware/gateway_identity.py +17 -0
- package/backend/app/middleware/openapi_gateway.py +71 -0
- package/backend/app/middleware/request_id.py +23 -0
- package/backend/app/openapi_config.py +126 -0
- package/backend/app/routers/__init__.py +0 -0
- package/backend/app/routers/admin_pipeline.py +123 -0
- package/backend/app/routers/chat.py +206 -0
- package/backend/app/routers/chunks.py +36 -0
- package/backend/app/routers/entity_extract.py +31 -0
- package/backend/app/routers/example.py +8 -0
- package/backend/app/routers/gemini_embed.py +58 -0
- package/backend/app/routers/health.py +28 -0
- package/backend/app/routers/ingestion.py +146 -0
- package/backend/app/routers/link_expansion.py +34 -0
- package/backend/app/routers/pipeline_status.py +304 -0
- package/backend/app/routers/query.py +63 -0
- package/backend/app/routers/vectors.py +63 -0
- package/backend/app/schemas/__init__.py +0 -0
- package/backend/app/schemas/canonical.py +44 -0
- package/backend/app/schemas/chat.py +50 -0
- package/backend/app/schemas/ingest.py +29 -0
- package/backend/app/schemas/query.py +153 -0
- package/backend/app/schemas/vectors.py +56 -0
- package/backend/app/services/__init__.py +0 -0
- package/backend/app/services/chat_store.py +152 -0
- package/backend/app/services/chunking/__init__.py +3 -0
- package/backend/app/services/chunking/llm_boundaries.py +63 -0
- package/backend/app/services/chunking/schemas.py +30 -0
- package/backend/app/services/chunking/semantic_chunk.py +178 -0
- package/backend/app/services/chunking/splitters.py +214 -0
- package/backend/app/services/embeddings/__init__.py +20 -0
- package/backend/app/services/embeddings/build_inputs.py +140 -0
- package/backend/app/services/embeddings/dlq.py +128 -0
- package/backend/app/services/embeddings/gemini_api.py +207 -0
- package/backend/app/services/embeddings/persist.py +74 -0
- package/backend/app/services/embeddings/types.py +32 -0
- package/backend/app/services/embeddings/worker.py +224 -0
- package/backend/app/services/entities/__init__.py +12 -0
- package/backend/app/services/entities/gliner_extract.py +63 -0
- package/backend/app/services/entities/llm_extract.py +94 -0
- package/backend/app/services/entities/pipeline.py +179 -0
- package/backend/app/services/entities/spacy_extract.py +63 -0
- package/backend/app/services/entities/types.py +15 -0
- package/backend/app/services/gemini_chat.py +113 -0
- package/backend/app/services/hooks/__init__.py +3 -0
- package/backend/app/services/hooks/post_ingest.py +186 -0
- package/backend/app/services/ingestion/__init__.py +0 -0
- package/backend/app/services/ingestion/persist.py +188 -0
- package/backend/app/services/integrations_remote.py +91 -0
- package/backend/app/services/link_expansion/__init__.py +3 -0
- package/backend/app/services/link_expansion/canonical_url.py +45 -0
- package/backend/app/services/link_expansion/domain_policy.py +26 -0
- package/backend/app/services/link_expansion/html_extract.py +72 -0
- package/backend/app/services/link_expansion/rate_limit.py +32 -0
- package/backend/app/services/link_expansion/robots.py +46 -0
- package/backend/app/services/link_expansion/schemas.py +67 -0
- package/backend/app/services/link_expansion/worker.py +458 -0
- package/backend/app/services/normalization/__init__.py +7 -0
- package/backend/app/services/normalization/normalizer.py +331 -0
- package/backend/app/services/normalization/persist_normalized.py +67 -0
- package/backend/app/services/playwright_extract/__init__.py +13 -0
- package/backend/app/services/playwright_extract/__main__.py +96 -0
- package/backend/app/services/playwright_extract/extract.py +181 -0
- package/backend/app/services/retrieval_service.py +351 -0
- package/backend/app/sqlite_ext.py +36 -0
- package/backend/app/storage/__init__.py +3 -0
- package/backend/app/storage/blobs.py +30 -0
- package/backend/app/vectorstore/__init__.py +13 -0
- package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
- package/backend/backend.egg-info/PKG-INFO +18 -0
- package/backend/backend.egg-info/SOURCES.txt +93 -0
- package/backend/backend.egg-info/dependency_links.txt +1 -0
- package/backend/backend.egg-info/entry_points.txt +2 -0
- package/backend/backend.egg-info/requires.txt +15 -0
- package/backend/backend.egg-info/top_level.txt +4 -0
- package/backend/package.json +15 -0
- package/backend/pyproject.toml +52 -0
- package/backend/tests/conftest.py +40 -0
- package/backend/tests/test_chat.py +92 -0
- package/backend/tests/test_chunking.py +132 -0
- package/backend/tests/test_entities.py +170 -0
- package/backend/tests/test_gemini_embed.py +224 -0
- package/backend/tests/test_health.py +24 -0
- package/backend/tests/test_ingest_raw.py +123 -0
- package/backend/tests/test_link_expansion.py +241 -0
- package/backend/tests/test_main.py +12 -0
- package/backend/tests/test_normalizer.py +114 -0
- package/backend/tests/test_openapi_gateway.py +40 -0
- package/backend/tests/test_pipeline_hardening.py +285 -0
- package/backend/tests/test_pipeline_status.py +71 -0
- package/backend/tests/test_playwright_extract.py +80 -0
- package/backend/tests/test_post_ingest_hooks.py +162 -0
- package/backend/tests/test_query.py +165 -0
- package/backend/tests/test_thread_expansion.py +72 -0
- package/backend/tests/test_vectors.py +85 -0
- package/backend/uv.lock +1839 -0
- package/bin/business-stack.cjs +412 -0
- package/frontend/web/.env.example +23 -0
- package/frontend/web/AGENTS.md +5 -0
- package/frontend/web/CLAUDE.md +1 -0
- package/frontend/web/README.md +36 -0
- package/frontend/web/components.json +25 -0
- package/frontend/web/next-env.d.ts +6 -0
- package/frontend/web/next.config.ts +30 -0
- package/frontend/web/package.json +65 -0
- package/frontend/web/postcss.config.mjs +7 -0
- package/frontend/web/skills-lock.json +35 -0
- package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
- package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
- package/frontend/web/src/app/chat/page.tsx +725 -0
- package/frontend/web/src/app/favicon.ico +0 -0
- package/frontend/web/src/app/globals.css +563 -0
- package/frontend/web/src/app/layout.tsx +50 -0
- package/frontend/web/src/app/page.tsx +96 -0
- package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
- package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
- package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
- package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
- package/frontend/web/src/components/home-auth-panel.tsx +49 -0
- package/frontend/web/src/components/providers.tsx +50 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
- package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
- package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
- package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
- package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
- package/frontend/web/src/lib/auth-client.ts +23 -0
- package/frontend/web/src/lib/integrations-config.ts +125 -0
- package/frontend/web/src/lib/ui-utills.tsx +90 -0
- package/frontend/web/src/lib/utils.ts +6 -0
- package/frontend/web/tsconfig.json +36 -0
- package/frontend/web/tsconfig.tsbuildinfo +1 -0
- package/frontend/web/vitest.config.ts +14 -0
- package/gateway/.env.example +23 -0
- package/gateway/README.md +13 -0
- package/gateway/package.json +24 -0
- package/gateway/src/auth.ts +49 -0
- package/gateway/src/index.ts +141 -0
- package/gateway/src/integrations/admin.ts +19 -0
- package/gateway/src/integrations/crypto.ts +52 -0
- package/gateway/src/integrations/handlers.ts +124 -0
- package/gateway/src/integrations/keys.ts +12 -0
- package/gateway/src/integrations/store.ts +106 -0
- package/gateway/src/stack-secrets.ts +35 -0
- package/gateway/tsconfig.json +13 -0
- package/package.json +33 -0
- package/turbo.json +27 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import hashlib
|
|
5
|
+
from datetime import UTC, datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import pytest
|
|
9
|
+
|
|
10
|
+
from app.schemas.ingest import RawIngestEnvelope
|
|
11
|
+
from app.services.normalization.normalizer import (
|
|
12
|
+
extract_urls_from_text,
|
|
13
|
+
normalize_envelope_to_canonical,
|
|
14
|
+
parse_blob_sha256,
|
|
15
|
+
)
|
|
16
|
+
from app.storage.blobs import BlobStore
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_text_only_string_payload(tmp_path: Path) -> None:
    """A plain string payload normalizes to a single ``text/plain`` block.

    The canonical document must keep the id, source and timestamp it was
    given, expose no links, and reuse the payload text as both summary and
    raw content.
    """
    message = "Hello from the connector."
    when = datetime(2026, 4, 5, 12, 0, tzinfo=UTC)
    blobs = BlobStore(tmp_path / "blobs")
    envelope = RawIngestEnvelope(
        source="slack",
        timestamp=when,
        content_type="text",
        payload=message,
        metadata={},
    )

    canonical = normalize_envelope_to_canonical(
        document_id="doc-1",
        envelope=envelope,
        blob_store=blobs,
    )

    # Document-level fields.
    assert canonical.id == "doc-1"
    assert canonical.source == "slack"
    assert canonical.timestamp == when

    # Exactly one block, carrying the text inline (no blob reference).
    assert len(canonical.content_blocks) == 1
    only_block = canonical.content_blocks[0]
    assert only_block.type == "text"
    assert only_block.data == message
    assert only_block.raw_input is None
    assert only_block.mime == "text/plain"

    assert canonical.links == []
    assert canonical.summary == message
    assert canonical.raw_content == message
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_text_and_image_multimodal_preserves_blocks_and_blob(tmp_path: Path) -> None:
    """A tweet with a base64 image yields a text block plus a blob-backed image block.

    The image bytes must be written to the blob store under their SHA-256
    digest, and the image block must reference them as ``blob://<sha256>``.
    """
    png_like = b"\x89PNG\r\n\x1a\nfake-png-bytes-for-test"
    encoded = base64.b64encode(png_like).decode("ascii")
    digest = hashlib.sha256(png_like).hexdigest()
    blobs = BlobStore(tmp_path / "blobs")
    envelope = RawIngestEnvelope(
        source="twitter",
        timestamp=datetime(2026, 4, 5, 12, 0, tzinfo=UTC),
        content_type="multimodal",
        payload={
            "tweet": "Hot take + chart",
            "image_base64": encoded,
        },
        metadata={},
    )

    canonical = normalize_envelope_to_canonical(
        document_id="doc-2",
        envelope=envelope,
        blob_store=blobs,
    )

    assert len(canonical.content_blocks) == 2
    text_block = canonical.content_blocks[0]
    image_block = canonical.content_blocks[1]
    assert text_block.type == "text"
    assert text_block.data == "Hot take + chart"
    assert image_block.type == "image"
    assert image_block.raw_input == f"blob://{digest}"
    assert parse_blob_sha256(image_block.raw_input) == digest

    # The decoded bytes must be on disk, byte-for-byte.
    stored = blobs._path_for(digest)
    assert stored.is_file()
    assert stored.read_bytes() == png_like

    assert canonical.summary.startswith("Hot take")
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def test_url_metadata_and_inline_urls_deduped(tmp_path: Path) -> None:
    """Links from metadata and from the payload text are merged without duplicates.

    ``https://example.com/a`` appears both inline and in ``metadata["url"]``
    and must be listed only once in the resulting ``links``.
    """
    envelope = RawIngestEnvelope(
        source="rss",
        timestamp=datetime(2026, 4, 5, 12, 0, tzinfo=UTC),
        content_type="text",
        payload="Read https://example.com/a and also https://example.com/b",
        metadata={
            "url": "https://example.com/a",
            "urls": ["https://example.com/c"],
        },
    )
    canonical = normalize_envelope_to_canonical(
        document_id="doc-3",
        envelope=envelope,
        blob_store=BlobStore(tmp_path / "blobs"),
    )

    expected_links = [
        "https://example.com/a",
        "https://example.com/b",
        "https://example.com/c",
    ]
    assert canonical.links == expected_links
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@pytest.mark.parametrize(
    "text,expected",
    [
        ("no urls here", []),
        ("see https://x.com/foo?q=1", ["https://x.com/foo?q=1"]),
    ],
)
def test_extract_urls_from_text(text: str, expected: list[str]) -> None:
    """``extract_urls_from_text`` returns the URLs found in *text*, or an empty list."""
    found = extract_urls_from_text(text)
    assert found == expected
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from httpx import ASGITransport, AsyncClient
|
|
3
|
+
|
|
4
|
+
from app.config import get_settings
|
|
5
|
+
from app.main import create_app
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@pytest.fixture
def anyio_backend():
    """Return the backend name (``"asyncio"``) for the ``@pytest.mark.anyio`` tests below."""
    backend_name = "asyncio"
    return backend_name
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@pytest.mark.anyio
async def test_openapi_paths_require_gateway_secret_when_configured(monkeypatch):
    """``/docs`` is forbidden without the gateway secret and served with it.

    With ``BACKEND_GATEWAY_SECRET`` set, a request carrying only
    ``x-user-id`` must get 403, while the same request with a matching
    ``x-gateway-secret`` header must get 200.
    """
    secret = "test-gateway-secret"
    monkeypatch.setenv("BACKEND_GATEWAY_SECRET", secret)
    # Settings are cached; drop the cache so create_app() reads the patched env.
    get_settings.cache_clear()
    try:
        app = create_app()
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            r = await client.get("/docs", headers={"x-user-id": "u1"})
            assert r.status_code == 403
            r2 = await client.get(
                "/docs",
                headers={
                    "x-user-id": "u1",
                    "x-gateway-secret": secret,
                },
            )
            assert r2.status_code == 200
    finally:
        # monkeypatch restores the environment variable but not the settings
        # cache; clear it so secret-bearing settings cannot leak into tests
        # that run afterwards.
        get_settings.cache_clear()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@pytest.mark.anyio
async def test_openapi_paths_allow_without_secret(monkeypatch):
    """With no gateway secret configured, ``/docs`` is served given only ``x-user-id``."""
    monkeypatch.delenv("BACKEND_GATEWAY_SECRET", raising=False)
    # Rebuild settings from the patched environment before creating the app.
    get_settings.cache_clear()
    asgi = ASGITransport(app=create_app())
    async with AsyncClient(transport=asgi, base_url="http://test") as http:
        response = await http.get("/docs", headers={"x-user-id": "u1"})
        assert response.status_code == 200
|
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import sqlite3
|
|
7
|
+
from datetime import UTC, datetime
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from unittest.mock import AsyncMock, MagicMock, patch
|
|
10
|
+
|
|
11
|
+
from alembic.config import Config
|
|
12
|
+
from fastapi.testclient import TestClient
|
|
13
|
+
from sqlalchemy import text
|
|
14
|
+
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
|
15
|
+
|
|
16
|
+
from alembic import command
|
|
17
|
+
from app.config import Settings, clear_settings_cache, get_settings
|
|
18
|
+
from app.services.embeddings.worker import run_embed_document_job
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_ingest_dedupe_external_id(migrated_client: TestClient) -> None:
    """Two ingests sharing ``metadata.external_id`` resolve to one document.

    The envelopes differ only in timestamp; the second response must report
    ``deduplicated`` and return the first response's ``document_id``.
    """
    headers = {"x-user-id": "u1"}
    envelope = {
        "source": "dedupe-src",
        "content_type": "text",
        "payload": "same body",
        "metadata": {"external_id": "ext-42"},
    }
    stamps = (
        datetime(2026, 4, 1, 10, 0, 0, tzinfo=UTC),
        datetime(2026, 4, 2, 11, 0, 0, tzinfo=UTC),
    )

    # Post the same envelope twice, in order, varying only the timestamp.
    first, second = (
        migrated_client.post(
            "/ingest/raw?connector=echo",
            headers=headers,
            json={**envelope, "timestamp": stamp.isoformat()},
        )
        for stamp in stamps
    )

    assert first.status_code == 200
    assert second.status_code == 200
    assert first.json()["document_id"] == second.json()["document_id"]
    assert second.json()["deduplicated"] is True
    assert first.json()["deduplicated"] is False
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_ingest_dedupe_content_hash_no_external_id(
    migrated_client: TestClient,
) -> None:
    """Identical envelopes without an ``external_id`` are still deduplicated.

    Both requests carry the same source, content type and payload with empty
    metadata and differ only in timestamp; the second must map to the same
    ``document_id`` and be flagged ``deduplicated``.
    """
    h = {"x-user-id": "u1"}
    ts1 = datetime(2026, 4, 1, 10, 0, 0, tzinfo=UTC)
    ts2 = datetime(2026, 4, 2, 11, 0, 0, tzinfo=UTC)
    body = {
        "source": "hash-src",
        "content_type": "text",
        "payload": "identical",
        "metadata": {},
    }
    r1 = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=h,
        json={**body, "timestamp": ts1.isoformat()},
    )
    r2 = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=h,
        json={**body, "timestamp": ts2.isoformat()},
    )
    # Check the status first so a failed ingest shows up as an HTTP error
    # with its body, not as a KeyError on the response JSON.
    assert r1.status_code == 200, r1.text
    assert r2.status_code == 200, r2.text
    assert r1.json()["document_id"] == r2.json()["document_id"]
    assert r2.json()["deduplicated"] is True
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_ingest_normalization_failure_stores_error_and_partial(
    migrated_client: TestClient,
    monkeypatch,
) -> None:
    """A normalizer crash still returns 200 and records the error on the document.

    The normalizer used by the ingestion router is replaced with one that
    raises ``RuntimeError``. The response must flag ``normalization_failed``
    with no content blocks, and ``documents.normalization_error`` must hold
    JSON whose ``type`` is ``"RuntimeError"``.
    """

    def _explode(*_args, **_kwargs):
        raise RuntimeError("normalize boom")

    monkeypatch.setattr(
        "app.routers.ingestion.normalize_envelope_to_canonical",
        _explode,
    )

    envelope = {
        "source": "norm-fail",
        "timestamp": datetime.now(UTC).isoformat(),
        "content_type": "text",
        "payload": "x",
        "metadata": {},
    }
    response = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers={"x-user-id": "u1"},
        json=envelope,
    )
    assert response.status_code == 200
    payload = response.json()
    assert payload["normalization_failed"] is True
    assert payload["canonical"]["content_blocks"] == []
    doc_id = payload["document_id"]

    # Read the stored error straight from the SQLite file.
    settings = get_settings()
    db_file = settings.data_dir / settings.sqlite_filename
    conn = sqlite3.connect(str(db_file))
    stored = conn.execute(
        "SELECT normalization_error FROM documents WHERE id = ?",
        (doc_id,),
    ).fetchone()
    conn.close()

    assert stored is not None
    assert stored[0] is not None
    recorded = json.loads(stored[0])
    assert recorded["type"] == "RuntimeError"
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_admin_reprocess_normalization(migrated_client: TestClient) -> None:
    """Reprocessing a document whose normalization failed clears the error.

    The document is ingested while the ingestion router's normalizer is
    patched to raise. The admin reprocess endpoint is then called with the
    patch removed; afterwards ``normalization_error`` must be NULL and the
    document must have at least one content block.
    """
    h = {"x-user-id": "u1"}

    def boom(*args, **kwargs):
        raise RuntimeError("temporary")

    # Only the ingest request runs with the failing normalizer.
    with patch(
        "app.routers.ingestion.normalize_envelope_to_canonical",
        side_effect=boom,
    ):
        r = migrated_client.post(
            "/ingest/raw?connector=echo",
            headers=h,
            json={
                "source": "reproc",
                "timestamp": datetime.now(UTC).isoformat(),
                "content_type": "text",
                "payload": "hello reproc",
                "metadata": {},
            },
        )
    assert r.status_code == 200
    doc_id = r.json()["document_id"]

    rr = migrated_client.post(
        f"/admin/documents/{doc_id}/reprocess-normalization",
        headers=h,
    )
    assert rr.status_code == 200

    settings = get_settings()
    conn = sqlite3.connect(str(settings.data_dir / settings.sqlite_filename))
    try:
        doc_row = conn.execute(
            "SELECT normalization_error FROM documents WHERE id = ?",
            (doc_id,),
        ).fetchone()
        blocks = conn.execute(
            "SELECT COUNT(*) FROM content_blocks WHERE document_id = ?",
            (doc_id,),
        ).fetchone()[0]
    finally:
        # Close the connection even when a query fails.
        conn.close()

    # A missing row should fail with a clear message, not a TypeError from
    # indexing None.
    assert doc_row is not None, f"document {doc_id!r} not found after reprocess"
    assert doc_row[0] is None
    assert blocks >= 1
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def test_embed_failure_writes_dlq(tmp_path, monkeypatch) -> None:
    """A failing embed job leaves one ``pending_retry`` row in ``embedding_dlq``.

    The test migrates a fresh SQLite database under ``tmp_path``, seeds one
    source, one document and one chunk, then runs ``run_embed_document_job``
    with ``batch_embed_contents`` patched to raise. The DLQ row for the
    document must have ``attempt_count`` 1 and state ``pending_retry``.
    """
    monkeypatch.setenv("DATA_DIR", str(tmp_path))
    monkeypatch.setenv("GEMINI_API_KEY", "k")
    monkeypatch.setenv("EMBEDDING_DLQ_MAX_ATTEMPTS", "3")
    clear_settings_cache()

    # alembic.ini lives in the backend directory, one level above tests/.
    # NOTE(review): the chdir is presumably needed because the Alembic config
    # uses paths relative to that directory — confirm.
    backend = Path(__file__).resolve().parents[1]
    prev = os.getcwd()
    os.chdir(str(backend))
    try:
        command.upgrade(Config(str(backend / "alembic.ini")), "head")
    finally:
        os.chdir(prev)
    clear_settings_cache()

    settings = Settings()
    engine = create_async_engine(settings.sqlalchemy_database_url)
    factory = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
    doc_id = "dlq-doc-1"

    async def _seed() -> None:
        # Minimal rows the embed job needs: a source, a document, one chunk.
        async with factory() as session:
            await session.execute(
                text("INSERT INTO sources (name, connector_type) VALUES ('s','s')"),
            )
            sid_row = await session.execute(text("SELECT id FROM sources LIMIT 1"))
            sid = int(sid_row.scalar_one())
            await session.execute(
                text(
                    "INSERT INTO documents "
                    "(id, source_id, timestamp, content_type, raw_content, "
                    "summary, status) VALUES "
                    "(:id, :sid, :ts, 'text', '{}', 'x', 'partial')",
                ),
                {"id": doc_id, "sid": sid, "ts": datetime.now(UTC).isoformat()},
            )
            await session.execute(
                text(
                    "INSERT INTO document_chunks "
                    "(document_id, ordinal, text, start_block_ordinal, "
                    "end_block_ordinal, meta) VALUES (:d, 0, 't', 0, 0, NULL)",
                ),
                {"d": doc_id},
            )
            await session.commit()

    async def _fail() -> None:
        # Make the embedding call raise so the job takes its failure path.
        with patch(
            "app.services.embeddings.worker.batch_embed_contents",
            new=AsyncMock(side_effect=RuntimeError("api down")),
        ):
            await run_embed_document_job(
                document_id=doc_id,
                multimodal=False,
                session_factory=factory,
                settings=get_settings(),
                store=MagicMock(),
            )

    try:
        asyncio.run(_seed())
        asyncio.run(_fail())

        db_path = (get_settings().data_dir / get_settings().sqlite_filename).resolve()
        conn = sqlite3.connect(str(db_path))
        try:
            row = conn.execute(
                "SELECT attempt_count, state FROM embedding_dlq WHERE document_id = ?",
                (doc_id,),
            ).fetchone()
        finally:
            conn.close()
        assert row is not None
        assert int(row[0]) == 1
        assert row[1] == "pending_retry"
    finally:
        # Dispose the engine even when seeding, the job or an assertion fails,
        # so its connections are not left open for the rest of the test run.
        asyncio.run(engine.dispose())
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def test_admin_retry_embedding_clears_dlq(
    migrated_client: TestClient,
    monkeypatch,
) -> None:
    """The admin retry endpoint removes the document's DLQ row and triggers a job.

    A ``dead`` ``embedding_dlq`` row is inserted by hand for a freshly
    ingested document. After ``POST .../retry-embedding`` (with the embed job
    replaced by a mock) no DLQ row may remain for that document and the mock
    must have been called exactly once.
    """
    monkeypatch.setenv("GEMINI_API_KEY", "k")
    clear_settings_cache()
    h = {"x-user-id": "u1"}
    ing = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=h,
        json={
            "source": "adm",
            "timestamp": datetime.now(UTC).isoformat(),
            "content_type": "text",
            "payload": "p",
            "metadata": {},
        },
    )
    # Report a failed ingest as an HTTP error, not a KeyError on the JSON.
    assert ing.status_code == 200, ing.text
    doc_id = ing.json()["document_id"]
    settings = get_settings()
    db_path = str(settings.data_dir / settings.sqlite_filename)

    # Seed a dead DLQ entry for the retry endpoint to clear.
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            "INSERT INTO embedding_dlq (document_id, last_error, attempt_count, "
            "next_retry_at, state, multimodal) VALUES (?, ?, 2, NULL, 'dead', 0)",
            (doc_id, "old err"),
        )
        conn.commit()
    finally:
        conn.close()

    mock_job = MagicMock()
    with patch(
        "app.routers.admin_pipeline.run_embed_document_job",
        mock_job,
    ):
        r = migrated_client.post(
            f"/admin/documents/{doc_id}/retry-embedding",
            headers=h,
            json={"multimodal": False},
        )
    assert r.status_code == 200

    conn = sqlite3.connect(db_path)
    try:
        n = conn.execute(
            "SELECT COUNT(*) FROM embedding_dlq WHERE document_id = ?",
            (doc_id,),
        ).fetchone()[0]
    finally:
        conn.close()
    assert n == 0
    mock_job.assert_called_once()
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from fastapi.testclient import TestClient
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_pipeline_status_404(migrated_client: TestClient) -> None:
    """Requesting the pipeline status of an unknown document id returns 404."""
    response = migrated_client.get(
        "/ingest/documents/missing-id/pipeline",
        headers={"x-user-id": "u-pipe"},
    )
    assert response.status_code == 404
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_pipeline_status_after_ingest(migrated_client: TestClient) -> None:
    """After an ingest, the pipeline endpoint reports status for that document.

    The response must echo the document id, contain a ``steps`` list, and
    carry non-negative ``chunk_count`` and ``content_block_count`` values.
    """
    headers = {"x-user-id": "u-pipe"}
    envelope = {
        "source": "test",
        "timestamp": "2026-04-05T00:00:00Z",
        "content_type": "text",
        "payload": "hello pipeline",
        "metadata": {},
    }
    ingest = migrated_client.post(
        "/ingest/raw?connector=generic",
        headers=headers,
        json=envelope,
    )
    assert ingest.status_code == 200
    doc_id = ingest.json()["document_id"]

    status = migrated_client.get(
        f"/ingest/documents/{doc_id}/pipeline",
        headers=headers,
    )
    assert status.status_code == 200
    report = status.json()
    assert report["document_id"] == doc_id
    assert "steps" in report
    assert isinstance(report["steps"], list)
    assert report["chunk_count"] >= 0
    assert report["content_block_count"] >= 0
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_pipeline_response_shape(migrated_client: TestClient) -> None:
    """The pipeline status response exposes all of its expected top-level keys.

    Checked keys: ``normalization_error``, ``ingest_meta``,
    ``vector_row_count``, ``gemini_embedding_row_count`` and ``checked_at``.
    """
    h = {"x-user-id": "u-pipe"}
    env = {
        "source": "test",
        "timestamp": "2026-04-05T00:00:01Z",
        "content_type": "text",
        "payload": "shape",
        "metadata": {},
    }
    ing = migrated_client.post(
        "/ingest/raw?connector=generic",
        headers=h,
        json=env,
    )
    # Report a failed ingest as an HTTP error, not a KeyError on the JSON.
    assert ing.status_code == 200, ing.text
    doc_id = ing.json()["document_id"]
    r = migrated_client.get(
        f"/ingest/documents/{doc_id}/pipeline",
        headers=h,
    )
    assert r.status_code == 200
    j = r.json()
    expected_keys = (
        "normalization_error",
        "ingest_meta",
        "vector_row_count",
        "gemini_embedding_row_count",
        "checked_at",
    )
    # Collect every absent key so one failure lists them all.
    missing = [key for key in expected_keys if key not in j]
    assert not missing, f"pipeline response is missing keys: {missing}"
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import UTC, datetime
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
from fastapi.testclient import TestClient
|
|
7
|
+
|
|
8
|
+
from app.services.playwright_extract.extract import (
|
|
9
|
+
assert_url_allowed,
|
|
10
|
+
host_matches_allowlist,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_host_allowlist_exact() -> None:
    """An exact allowlist entry admits that host and rejects an unrelated one."""
    cases = (
        ("example.com", True),
        ("evil.com", False),
    )
    for host, should_match in cases:
        matched = host_matches_allowlist(host, ["example.com"])
        assert bool(matched) is should_match
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_host_allowlist_suffix() -> None:
    """A ``*.example.com`` entry matches subdomains and the bare domain only.

    A host that merely ends with the same characters (``notexample.com``)
    must not match.
    """
    cases = (
        ("sub.example.com", True),
        ("example.com", True),
        ("notexample.com", False),
    )
    for host, should_match in cases:
        matched = host_matches_allowlist(host, ["*.example.com"])
        assert bool(matched) is should_match
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_assert_url_allowed_rejects_non_http() -> None:
    """A ``file://`` URL is rejected with a ``ValueError`` mentioning the scheme."""
    local_file_url = "file:///etc/passwd"
    allowlist = ["x"]
    with pytest.raises(ValueError, match="scheme"):
        assert_url_allowed(local_file_url, allowlist)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_assert_url_allowed_empty_hosts() -> None:
    """An empty allowlist is rejected with a ``ValueError`` mentioning ``non-empty``."""
    no_hosts = []
    with pytest.raises(ValueError, match="non-empty"):
        assert_url_allowed("https://a.com", no_hosts)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_playwright_ingest_enriches_envelope(
    migrated_client: TestClient,
    monkeypatch,
) -> None:
    """The ``playwright_session`` connector fills the envelope with extracted text.

    The extractor is replaced by a stub that checks it receives the
    allowlisted hosts from ``connector_config`` and returns fixed page text.
    The ingest response must report the extracted character count and contain
    a text block that includes that text.
    """
    from app.services.playwright_extract.extract import PlaywrightExtractResult

    page_text = "hello page"

    def _stub_extract(*_args, **kwargs):
        # The connector must forward the allowlist from connector_config.
        assert kwargs["allowlisted_hosts"] == ["example.com"]
        return PlaywrightExtractResult(
            visible_text=page_text,
            title="Hi",
            final_url="https://example.com/after",
            truncated=False,
            meta={},
        )

    monkeypatch.setattr(
        "app.connectors.playwright_session.extract_visible_text_sync",
        _stub_extract,
    )

    envelope = {
        "source": "pw-test",
        "timestamp": datetime(2026, 3, 1, tzinfo=UTC).isoformat(),
        "content_type": "text",
        "payload": "",
        "metadata": {
            "url": "https://example.com/start",
            "playwright_user_data_dir": r"C:\fake\pw-profile",
            "connector_config": {
                "allowlisted_hosts": ["example.com"],
            },
        },
    }
    response = migrated_client.post(
        "/ingest/raw?connector=playwright_session",
        headers={"x-user-id": "u1"},
        json=envelope,
    )
    assert response.status_code == 200, response.text

    body = response.json()
    assert body["normalized"]["extracted_text_chars"] == len(page_text)

    # At least one text block must carry the extracted page text.
    text_payloads = [
        block.get("data") or ""
        for block in body["canonical"].get("content_blocks", [])
        if block.get("type") == "text"
    ]
    assert any(page_text in payload for payload in text_payloads)
|