business-stack 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. package/.python-version +1 -0
  2. package/backend/.env.example +65 -0
  3. package/backend/alembic/env.py +63 -0
  4. package/backend/alembic/script.py.mako +26 -0
  5. package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
  6. package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
  7. package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
  8. package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
  9. package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
  10. package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
  11. package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
  12. package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
  13. package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
  14. package/backend/alembic.ini +42 -0
  15. package/backend/app/__init__.py +0 -0
  16. package/backend/app/config.py +337 -0
  17. package/backend/app/connectors/__init__.py +13 -0
  18. package/backend/app/connectors/base.py +39 -0
  19. package/backend/app/connectors/builtins.py +51 -0
  20. package/backend/app/connectors/playwright_session.py +146 -0
  21. package/backend/app/connectors/registry.py +68 -0
  22. package/backend/app/connectors/thread_expansion/__init__.py +33 -0
  23. package/backend/app/connectors/thread_expansion/fakes.py +154 -0
  24. package/backend/app/connectors/thread_expansion/models.py +113 -0
  25. package/backend/app/connectors/thread_expansion/reddit.py +53 -0
  26. package/backend/app/connectors/thread_expansion/twitter.py +49 -0
  27. package/backend/app/db.py +5 -0
  28. package/backend/app/dependencies.py +34 -0
  29. package/backend/app/logging_config.py +35 -0
  30. package/backend/app/main.py +97 -0
  31. package/backend/app/middleware/__init__.py +0 -0
  32. package/backend/app/middleware/gateway_identity.py +17 -0
  33. package/backend/app/middleware/openapi_gateway.py +71 -0
  34. package/backend/app/middleware/request_id.py +23 -0
  35. package/backend/app/openapi_config.py +126 -0
  36. package/backend/app/routers/__init__.py +0 -0
  37. package/backend/app/routers/admin_pipeline.py +123 -0
  38. package/backend/app/routers/chat.py +206 -0
  39. package/backend/app/routers/chunks.py +36 -0
  40. package/backend/app/routers/entity_extract.py +31 -0
  41. package/backend/app/routers/example.py +8 -0
  42. package/backend/app/routers/gemini_embed.py +58 -0
  43. package/backend/app/routers/health.py +28 -0
  44. package/backend/app/routers/ingestion.py +146 -0
  45. package/backend/app/routers/link_expansion.py +34 -0
  46. package/backend/app/routers/pipeline_status.py +304 -0
  47. package/backend/app/routers/query.py +63 -0
  48. package/backend/app/routers/vectors.py +63 -0
  49. package/backend/app/schemas/__init__.py +0 -0
  50. package/backend/app/schemas/canonical.py +44 -0
  51. package/backend/app/schemas/chat.py +50 -0
  52. package/backend/app/schemas/ingest.py +29 -0
  53. package/backend/app/schemas/query.py +153 -0
  54. package/backend/app/schemas/vectors.py +56 -0
  55. package/backend/app/services/__init__.py +0 -0
  56. package/backend/app/services/chat_store.py +152 -0
  57. package/backend/app/services/chunking/__init__.py +3 -0
  58. package/backend/app/services/chunking/llm_boundaries.py +63 -0
  59. package/backend/app/services/chunking/schemas.py +30 -0
  60. package/backend/app/services/chunking/semantic_chunk.py +178 -0
  61. package/backend/app/services/chunking/splitters.py +214 -0
  62. package/backend/app/services/embeddings/__init__.py +20 -0
  63. package/backend/app/services/embeddings/build_inputs.py +140 -0
  64. package/backend/app/services/embeddings/dlq.py +128 -0
  65. package/backend/app/services/embeddings/gemini_api.py +207 -0
  66. package/backend/app/services/embeddings/persist.py +74 -0
  67. package/backend/app/services/embeddings/types.py +32 -0
  68. package/backend/app/services/embeddings/worker.py +224 -0
  69. package/backend/app/services/entities/__init__.py +12 -0
  70. package/backend/app/services/entities/gliner_extract.py +63 -0
  71. package/backend/app/services/entities/llm_extract.py +94 -0
  72. package/backend/app/services/entities/pipeline.py +179 -0
  73. package/backend/app/services/entities/spacy_extract.py +63 -0
  74. package/backend/app/services/entities/types.py +15 -0
  75. package/backend/app/services/gemini_chat.py +113 -0
  76. package/backend/app/services/hooks/__init__.py +3 -0
  77. package/backend/app/services/hooks/post_ingest.py +186 -0
  78. package/backend/app/services/ingestion/__init__.py +0 -0
  79. package/backend/app/services/ingestion/persist.py +188 -0
  80. package/backend/app/services/integrations_remote.py +91 -0
  81. package/backend/app/services/link_expansion/__init__.py +3 -0
  82. package/backend/app/services/link_expansion/canonical_url.py +45 -0
  83. package/backend/app/services/link_expansion/domain_policy.py +26 -0
  84. package/backend/app/services/link_expansion/html_extract.py +72 -0
  85. package/backend/app/services/link_expansion/rate_limit.py +32 -0
  86. package/backend/app/services/link_expansion/robots.py +46 -0
  87. package/backend/app/services/link_expansion/schemas.py +67 -0
  88. package/backend/app/services/link_expansion/worker.py +458 -0
  89. package/backend/app/services/normalization/__init__.py +7 -0
  90. package/backend/app/services/normalization/normalizer.py +331 -0
  91. package/backend/app/services/normalization/persist_normalized.py +67 -0
  92. package/backend/app/services/playwright_extract/__init__.py +13 -0
  93. package/backend/app/services/playwright_extract/__main__.py +96 -0
  94. package/backend/app/services/playwright_extract/extract.py +181 -0
  95. package/backend/app/services/retrieval_service.py +351 -0
  96. package/backend/app/sqlite_ext.py +36 -0
  97. package/backend/app/storage/__init__.py +3 -0
  98. package/backend/app/storage/blobs.py +30 -0
  99. package/backend/app/vectorstore/__init__.py +13 -0
  100. package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
  101. package/backend/backend.egg-info/PKG-INFO +18 -0
  102. package/backend/backend.egg-info/SOURCES.txt +93 -0
  103. package/backend/backend.egg-info/dependency_links.txt +1 -0
  104. package/backend/backend.egg-info/entry_points.txt +2 -0
  105. package/backend/backend.egg-info/requires.txt +15 -0
  106. package/backend/backend.egg-info/top_level.txt +4 -0
  107. package/backend/package.json +15 -0
  108. package/backend/pyproject.toml +52 -0
  109. package/backend/tests/conftest.py +40 -0
  110. package/backend/tests/test_chat.py +92 -0
  111. package/backend/tests/test_chunking.py +132 -0
  112. package/backend/tests/test_entities.py +170 -0
  113. package/backend/tests/test_gemini_embed.py +224 -0
  114. package/backend/tests/test_health.py +24 -0
  115. package/backend/tests/test_ingest_raw.py +123 -0
  116. package/backend/tests/test_link_expansion.py +241 -0
  117. package/backend/tests/test_main.py +12 -0
  118. package/backend/tests/test_normalizer.py +114 -0
  119. package/backend/tests/test_openapi_gateway.py +40 -0
  120. package/backend/tests/test_pipeline_hardening.py +285 -0
  121. package/backend/tests/test_pipeline_status.py +71 -0
  122. package/backend/tests/test_playwright_extract.py +80 -0
  123. package/backend/tests/test_post_ingest_hooks.py +162 -0
  124. package/backend/tests/test_query.py +165 -0
  125. package/backend/tests/test_thread_expansion.py +72 -0
  126. package/backend/tests/test_vectors.py +85 -0
  127. package/backend/uv.lock +1839 -0
  128. package/bin/business-stack.cjs +412 -0
  129. package/frontend/web/.env.example +23 -0
  130. package/frontend/web/AGENTS.md +5 -0
  131. package/frontend/web/CLAUDE.md +1 -0
  132. package/frontend/web/README.md +36 -0
  133. package/frontend/web/components.json +25 -0
  134. package/frontend/web/next-env.d.ts +6 -0
  135. package/frontend/web/next.config.ts +30 -0
  136. package/frontend/web/package.json +65 -0
  137. package/frontend/web/postcss.config.mjs +7 -0
  138. package/frontend/web/skills-lock.json +35 -0
  139. package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
  140. package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
  141. package/frontend/web/src/app/chat/page.tsx +725 -0
  142. package/frontend/web/src/app/favicon.ico +0 -0
  143. package/frontend/web/src/app/globals.css +563 -0
  144. package/frontend/web/src/app/layout.tsx +50 -0
  145. package/frontend/web/src/app/page.tsx +96 -0
  146. package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
  147. package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
  148. package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
  149. package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
  150. package/frontend/web/src/components/home-auth-panel.tsx +49 -0
  151. package/frontend/web/src/components/providers.tsx +50 -0
  152. package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
  153. package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
  154. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
  155. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
  156. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
  157. package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
  158. package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
  159. package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
  160. package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
  161. package/frontend/web/src/lib/auth-client.ts +23 -0
  162. package/frontend/web/src/lib/integrations-config.ts +125 -0
  163. package/frontend/web/src/lib/ui-utills.tsx +90 -0
  164. package/frontend/web/src/lib/utils.ts +6 -0
  165. package/frontend/web/tsconfig.json +36 -0
  166. package/frontend/web/tsconfig.tsbuildinfo +1 -0
  167. package/frontend/web/vitest.config.ts +14 -0
  168. package/gateway/.env.example +23 -0
  169. package/gateway/README.md +13 -0
  170. package/gateway/package.json +24 -0
  171. package/gateway/src/auth.ts +49 -0
  172. package/gateway/src/index.ts +141 -0
  173. package/gateway/src/integrations/admin.ts +19 -0
  174. package/gateway/src/integrations/crypto.ts +52 -0
  175. package/gateway/src/integrations/handlers.ts +124 -0
  176. package/gateway/src/integrations/keys.ts +12 -0
  177. package/gateway/src/integrations/store.ts +106 -0
  178. package/gateway/src/stack-secrets.ts +35 -0
  179. package/gateway/tsconfig.json +13 -0
  180. package/package.json +33 -0
  181. package/turbo.json +27 -0
@@ -0,0 +1,114 @@
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import hashlib
5
+ from datetime import UTC, datetime
6
+ from pathlib import Path
7
+
8
+ import pytest
9
+
10
+ from app.schemas.ingest import RawIngestEnvelope
11
+ from app.services.normalization.normalizer import (
12
+ extract_urls_from_text,
13
+ normalize_envelope_to_canonical,
14
+ parse_blob_sha256,
15
+ )
16
+ from app.storage.blobs import BlobStore
17
+
18
+
19
def test_text_only_string_payload(tmp_path: Path) -> None:
    """Normalize a plain string payload and check the single text block it yields."""
    message = "Hello from the connector."
    when = datetime(2026, 4, 5, 12, 0, tzinfo=UTC)
    blob_store = BlobStore(tmp_path / "blobs")
    envelope = RawIngestEnvelope(
        source="slack",
        timestamp=when,
        content_type="text",
        payload=message,
        metadata={},
    )

    canonical = normalize_envelope_to_canonical(
        document_id="doc-1",
        envelope=envelope,
        blob_store=blob_store,
    )

    # Identity fields come from the call arguments and the envelope.
    assert canonical.id == "doc-1"
    assert canonical.source == "slack"
    assert canonical.timestamp == when

    # Exactly one inline text block, with no raw_input reference attached.
    assert len(canonical.content_blocks) == 1
    only_block = canonical.content_blocks[0]
    assert only_block.type == "text"
    assert only_block.data == message
    assert only_block.raw_input is None
    assert only_block.mime == "text/plain"

    # No URLs in the payload, and the short text doubles as summary/raw content.
    assert canonical.links == []
    assert canonical.summary == message
    assert canonical.raw_content == message
45
+
46
+
47
def test_text_and_image_multimodal_preserves_blocks_and_blob(tmp_path: Path) -> None:
    """Normalize a tweet-plus-image payload into a text block, an image block and a blob."""
    png_like = b"\x89PNG\r\n\x1a\nfake-png-bytes-for-test"
    digest = hashlib.sha256(png_like).hexdigest()
    encoded_image = base64.b64encode(png_like).decode("ascii")
    blob_store = BlobStore(tmp_path / "blobs")
    envelope = RawIngestEnvelope(
        source="twitter",
        timestamp=datetime(2026, 4, 5, 12, 0, tzinfo=UTC),
        content_type="multimodal",
        payload={
            "tweet": "Hot take + chart",
            "image_base64": encoded_image,
        },
        metadata={},
    )

    canonical = normalize_envelope_to_canonical(
        document_id="doc-2",
        envelope=envelope,
        blob_store=blob_store,
    )

    assert len(canonical.content_blocks) == 2
    text_block = canonical.content_blocks[0]
    image_block = canonical.content_blocks[1]

    assert text_block.type == "text"
    assert text_block.data == "Hot take + chart"

    # The image block references the stored bytes through a blob:// URI
    # keyed by the SHA-256 of the decoded image.
    assert image_block.type == "image"
    assert image_block.raw_input == f"blob://{digest}"
    assert parse_blob_sha256(image_block.raw_input) == digest

    # The decoded bytes must have been written to the blob store unchanged.
    stored = blob_store._path_for(digest)
    assert stored.is_file()
    assert stored.read_bytes() == png_like

    assert canonical.summary.startswith("Hot take")
79
+
80
+
81
def test_url_metadata_and_inline_urls_deduped(tmp_path: Path) -> None:
    """Links from metadata and from the payload text are merged without duplicates."""
    # ".../a" appears both inline and in metadata["url"]; it must be listed once.
    envelope = RawIngestEnvelope(
        source="rss",
        timestamp=datetime(2026, 4, 5, 12, 0, tzinfo=UTC),
        content_type="text",
        payload="Read https://example.com/a and also https://example.com/b",
        metadata={
            "url": "https://example.com/a",
            "urls": ["https://example.com/c"],
        },
    )

    canonical = normalize_envelope_to_canonical(
        document_id="doc-3",
        envelope=envelope,
        blob_store=BlobStore(tmp_path / "blobs"),
    )

    expected_links = [
        "https://example.com/a",
        "https://example.com/b",
        "https://example.com/c",
    ]
    assert canonical.links == expected_links
104
+
105
+
106
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("no urls here", []),
        ("see https://x.com/foo?q=1", ["https://x.com/foo?q=1"]),
    ],
)
def test_extract_urls_from_text(text: str, expected: list[str]) -> None:
    """URL extraction returns the expected list for text with and without a URL."""
    found = extract_urls_from_text(text)
    assert found == expected
@@ -0,0 +1,40 @@
1
+ import pytest
2
+ from httpx import ASGITransport, AsyncClient
3
+
4
+ from app.config import get_settings
5
+ from app.main import create_app
6
+
7
+
8
@pytest.fixture
def anyio_backend():
    """Name the backend used for tests marked ``pytest.mark.anyio`` in this module."""
    backend_name = "asyncio"
    return backend_name
11
+
12
+
13
@pytest.mark.anyio
async def test_openapi_paths_require_gateway_secret_when_configured(monkeypatch):
    """``/docs`` answers 403 without the gateway secret header and 200 with it.

    The settings cache is cleared before the app is built so the patched
    environment variable is picked up, and cleared again afterwards so the
    cached settings carrying the test secret cannot leak into later tests
    once ``monkeypatch`` restores the environment.
    """
    monkeypatch.setenv("BACKEND_GATEWAY_SECRET", "test-gateway-secret")
    get_settings.cache_clear()
    try:
        app = create_app()
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            # Identity header alone is not enough when a secret is configured.
            r = await client.get("/docs", headers={"x-user-id": "u1"})
            assert r.status_code == 403
            r2 = await client.get(
                "/docs",
                headers={
                    "x-user-id": "u1",
                    "x-gateway-secret": "test-gateway-secret",
                },
            )
            assert r2.status_code == 200
    finally:
        # Drop the Settings instance built from the patched environment.
        get_settings.cache_clear()
30
+
31
+
32
@pytest.mark.anyio
async def test_openapi_paths_allow_without_secret(monkeypatch):
    """``/docs`` answers 200 when no gateway secret is configured.

    The settings cache is cleared before building the app so the removed
    environment variable takes effect, and again afterwards so settings
    built from the patched environment are not reused by later tests.
    """
    monkeypatch.delenv("BACKEND_GATEWAY_SECRET", raising=False)
    get_settings.cache_clear()
    try:
        app = create_app()
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            r = await client.get("/docs", headers={"x-user-id": "u1"})
            assert r.status_code == 200
    finally:
        # Drop the Settings instance built from the patched environment.
        get_settings.cache_clear()
@@ -0,0 +1,285 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ import os
6
+ import sqlite3
7
+ from datetime import UTC, datetime
8
+ from pathlib import Path
9
+ from unittest.mock import AsyncMock, MagicMock, patch
10
+
11
+ from alembic.config import Config
12
+ from fastapi.testclient import TestClient
13
+ from sqlalchemy import text
14
+ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
15
+
16
+ from alembic import command
17
+ from app.config import Settings, clear_settings_cache, get_settings
18
+ from app.services.embeddings.worker import run_embed_document_job
19
+
20
+
21
def test_ingest_dedupe_external_id(migrated_client: TestClient) -> None:
    """Two ingests sharing source and ``external_id`` resolve to one document."""
    headers = {"x-user-id": "u1"}
    envelope = {
        "source": "dedupe-src",
        "content_type": "text",
        "payload": "same body",
        "metadata": {"external_id": "ext-42"},
    }

    def ingest_at(moment: datetime):
        # Only the timestamp differs between the two submissions.
        return migrated_client.post(
            "/ingest/raw?connector=echo",
            headers=headers,
            json={**envelope, "timestamp": moment.isoformat()},
        )

    first = ingest_at(datetime(2026, 4, 1, 10, 0, 0, tzinfo=UTC))
    second = ingest_at(datetime(2026, 4, 2, 11, 0, 0, tzinfo=UTC))

    assert first.status_code == 200
    assert second.status_code == 200
    first_body, second_body = first.json(), second.json()
    assert first_body["document_id"] == second_body["document_id"]
    assert second_body["deduplicated"] is True
    assert first_body["deduplicated"] is False
46
+
47
+
48
def test_ingest_dedupe_content_hash_no_external_id(
    migrated_client: TestClient,
) -> None:
    """Identical payloads without an ``external_id`` resolve to one document.

    Both responses are checked for HTTP 200 before their bodies are read,
    so a failed ingest is reported as a status mismatch (with the response
    text) rather than as a ``KeyError`` on the JSON body.
    """
    h = {"x-user-id": "u1"}
    ts1 = datetime(2026, 4, 1, 10, 0, 0, tzinfo=UTC)
    ts2 = datetime(2026, 4, 2, 11, 0, 0, tzinfo=UTC)
    body = {
        "source": "hash-src",
        "content_type": "text",
        "payload": "identical",
        "metadata": {},
    }
    r1 = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=h,
        json={**body, "timestamp": ts1.isoformat()},
    )
    r2 = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=h,
        json={**body, "timestamp": ts2.isoformat()},
    )
    assert r1.status_code == 200, r1.text
    assert r2.status_code == 200, r2.text
    assert r1.json()["document_id"] == r2.json()["document_id"]
    assert r2.json()["deduplicated"] is True
72
+
73
+
74
def test_ingest_normalization_failure_stores_error_and_partial(
    migrated_client: TestClient,
    monkeypatch,
) -> None:
    """A normalizer crash still returns 200 and records the error on the document.

    The normalizer used by the ingestion router is replaced with a function
    that raises; the response must flag the failure with empty content
    blocks, and the ``documents`` row must hold a JSON error whose ``type``
    is ``RuntimeError``.
    """
    h = {"x-user-id": "u1"}

    def boom(*args, **kwargs):
        raise RuntimeError("normalize boom")

    monkeypatch.setattr(
        "app.routers.ingestion.normalize_envelope_to_canonical",
        boom,
    )
    r = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=h,
        json={
            "source": "norm-fail",
            "timestamp": datetime.now(UTC).isoformat(),
            "content_type": "text",
            "payload": "x",
            "metadata": {},
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["normalization_failed"] is True
    assert data["canonical"]["content_blocks"] == []
    doc_id = data["document_id"]
    settings = get_settings()
    conn = sqlite3.connect(str(settings.data_dir / settings.sqlite_filename))
    try:
        row = conn.execute(
            "SELECT normalization_error FROM documents WHERE id = ?",
            (doc_id,),
        ).fetchone()
    finally:
        # Close even if the query raises, so the database file is released.
        conn.close()
    assert row is not None
    assert row[0] is not None
    err = json.loads(row[0])
    assert err["type"] == "RuntimeError"
114
+
115
+
116
def test_admin_reprocess_normalization(migrated_client: TestClient) -> None:
    """Reprocessing a document whose normalization failed clears the stored error.

    The first ingest runs with the router's normalizer patched to raise.
    The admin reprocess endpoint is then called with the real normalizer
    back in place; afterwards ``normalization_error`` must be NULL and at
    least one content block must exist for the document.
    """
    h = {"x-user-id": "u1"}

    def boom(*args, **kwargs):
        raise RuntimeError("temporary")

    with patch(
        "app.routers.ingestion.normalize_envelope_to_canonical",
        side_effect=boom,
    ):
        r = migrated_client.post(
            "/ingest/raw?connector=echo",
            headers=h,
            json={
                "source": "reproc",
                "timestamp": datetime.now(UTC).isoformat(),
                "content_type": "text",
                "payload": "hello reproc",
                "metadata": {},
            },
        )
    assert r.status_code == 200
    doc_id = r.json()["document_id"]

    rr = migrated_client.post(
        f"/admin/documents/{doc_id}/reprocess-normalization",
        headers=h,
    )
    assert rr.status_code == 200
    settings = get_settings()
    conn = sqlite3.connect(str(settings.data_dir / settings.sqlite_filename))
    try:
        doc_row = conn.execute(
            "SELECT normalization_error FROM documents WHERE id = ?",
            (doc_id,),
        ).fetchone()
        blocks = conn.execute(
            "SELECT COUNT(*) FROM content_blocks WHERE document_id = ?",
            (doc_id,),
        ).fetchone()[0]
    finally:
        # Close even if a query raises, so the database file is released.
        conn.close()
    # A missing row is reported as an assertion failure, not a TypeError.
    assert doc_row is not None
    assert doc_row[0] is None
    assert blocks >= 1
158
+
159
+
160
def test_embed_failure_writes_dlq(tmp_path, monkeypatch) -> None:
    """A failing embedding call leaves a ``pending_retry`` row in ``embedding_dlq``.

    Steps: point the app at a fresh data directory, migrate it to head,
    seed one source/document/chunk, run the embed job with the batch
    embedding call patched to raise, then read the DLQ row through a plain
    sqlite3 connection.

    All async work runs inside one event loop and the engine is disposed in
    that same loop, since a pooled ``AsyncEngine`` should not be shared
    across event loops. The settings cache is cleared on exit so settings
    bound to ``tmp_path`` are not reused by later tests.
    """
    monkeypatch.setenv("DATA_DIR", str(tmp_path))
    monkeypatch.setenv("GEMINI_API_KEY", "k")
    monkeypatch.setenv("EMBEDDING_DLQ_MAX_ATTEMPTS", "3")
    clear_settings_cache()

    # Alembic is run from the backend directory; restore the cwd afterwards.
    backend = Path(__file__).resolve().parents[1]
    prev = os.getcwd()
    os.chdir(str(backend))
    try:
        command.upgrade(Config(str(backend / "alembic.ini")), "head")
    finally:
        os.chdir(prev)
    clear_settings_cache()

    settings = Settings()
    engine = create_async_engine(settings.sqlalchemy_database_url)
    factory = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
    doc_id = "dlq-doc-1"

    async def _seed() -> None:
        async with factory() as session:
            await session.execute(
                text("INSERT INTO sources (name, connector_type) VALUES ('s','s')"),
            )
            sid_row = await session.execute(text("SELECT id FROM sources LIMIT 1"))
            sid = int(sid_row.scalar_one())
            await session.execute(
                text(
                    "INSERT INTO documents "
                    "(id, source_id, timestamp, content_type, raw_content, "
                    "summary, status) VALUES "
                    "(:id, :sid, :ts, 'text', '{}', 'x', 'partial')",
                ),
                {"id": doc_id, "sid": sid, "ts": datetime.now(UTC).isoformat()},
            )
            await session.execute(
                text(
                    "INSERT INTO document_chunks "
                    "(document_id, ordinal, text, start_block_ordinal, "
                    "end_block_ordinal, meta) VALUES (:d, 0, 't', 0, 0, NULL)",
                ),
                {"d": doc_id},
            )
            await session.commit()

    async def _fail() -> None:
        with patch(
            "app.services.embeddings.worker.batch_embed_contents",
            new=AsyncMock(side_effect=RuntimeError("api down")),
        ):
            await run_embed_document_job(
                document_id=doc_id,
                multimodal=False,
                session_factory=factory,
                settings=get_settings(),
                store=MagicMock(),
            )

    async def _scenario() -> None:
        # Seed, fail and dispose in one loop so pooled connections never
        # cross event loops and the engine is released even on error.
        try:
            await _seed()
            await _fail()
        finally:
            await engine.dispose()

    try:
        asyncio.run(_scenario())

        db_path = (get_settings().data_dir / get_settings().sqlite_filename).resolve()
        conn = sqlite3.connect(str(db_path))
        try:
            row = conn.execute(
                "SELECT attempt_count, state FROM embedding_dlq WHERE document_id = ?",
                (doc_id,),
            ).fetchone()
        finally:
            conn.close()
    finally:
        # Do not leave settings that point at tmp_path in the cache.
        clear_settings_cache()

    assert row is not None
    assert int(row[0]) == 1
    assert row[1] == "pending_retry"
235
+
236
+
237
def test_admin_retry_embedding_clears_dlq(
    migrated_client: TestClient,
    monkeypatch,
) -> None:
    """The admin retry endpoint removes the document's DLQ row and starts one job.

    A document is ingested, a ``dead`` row is inserted into
    ``embedding_dlq`` for it, and the retry endpoint is called with the
    embed job replaced by a mock. Afterwards no DLQ row may remain for the
    document and the mocked job must have been called exactly once.
    """
    monkeypatch.setenv("GEMINI_API_KEY", "k")
    clear_settings_cache()
    h = {"x-user-id": "u1"}
    ing = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=h,
        json={
            "source": "adm",
            "timestamp": datetime.now(UTC).isoformat(),
            "content_type": "text",
            "payload": "p",
            "metadata": {},
        },
    )
    # Fail here with the response text rather than on a missing JSON key.
    assert ing.status_code == 200, ing.text
    doc_id = ing.json()["document_id"]
    settings = get_settings()
    db_path = str(settings.data_dir / settings.sqlite_filename)
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            "INSERT INTO embedding_dlq (document_id, last_error, attempt_count, "
            "next_retry_at, state, multimodal) VALUES (?, ?, 2, NULL, 'dead', 0)",
            (doc_id, "old err"),
        )
        conn.commit()
    finally:
        # Release the database file before the endpoint touches it.
        conn.close()

    mock_job = MagicMock()
    with patch(
        "app.routers.admin_pipeline.run_embed_document_job",
        mock_job,
    ):
        r = migrated_client.post(
            f"/admin/documents/{doc_id}/retry-embedding",
            headers=h,
            json={"multimodal": False},
        )
    assert r.status_code == 200
    conn = sqlite3.connect(db_path)
    try:
        n = conn.execute(
            "SELECT COUNT(*) FROM embedding_dlq WHERE document_id = ?",
            (doc_id,),
        ).fetchone()[0]
    finally:
        conn.close()
    assert n == 0
    mock_job.assert_called_once()
@@ -0,0 +1,71 @@
1
+ from fastapi.testclient import TestClient
2
+
3
+
4
def test_pipeline_status_404(migrated_client: TestClient) -> None:
    """Requesting pipeline status for an unknown document id yields 404."""
    response = migrated_client.get(
        "/ingest/documents/missing-id/pipeline",
        headers={"x-user-id": "u-pipe"},
    )
    assert response.status_code == 404
11
+
12
+
13
def test_pipeline_status_after_ingest(migrated_client: TestClient) -> None:
    """After an ingest, the pipeline endpoint reports on the new document."""
    headers = {"x-user-id": "u-pipe"}
    ingest_response = migrated_client.post(
        "/ingest/raw?connector=generic",
        headers=headers,
        json={
            "source": "test",
            "timestamp": "2026-04-05T00:00:00Z",
            "content_type": "text",
            "payload": "hello pipeline",
            "metadata": {},
        },
    )
    assert ingest_response.status_code == 200
    document_id = ingest_response.json()["document_id"]

    status_response = migrated_client.get(
        f"/ingest/documents/{document_id}/pipeline",
        headers=headers,
    )
    assert status_response.status_code == 200

    report = status_response.json()
    assert report["document_id"] == document_id
    # "steps" must be present and be a list; counts only need to be non-negative.
    assert "steps" in report
    assert isinstance(report["steps"], list)
    assert report["chunk_count"] >= 0
    assert report["content_block_count"] >= 0
41
+
42
+
43
def test_pipeline_response_shape(migrated_client: TestClient) -> None:
    """The pipeline status body exposes the expected top-level keys.

    The ingest response is checked for HTTP 200 before its body is read,
    and all missing keys are reported together instead of stopping at the
    first one.
    """
    h = {"x-user-id": "u-pipe"}
    env = {
        "source": "test",
        "timestamp": "2026-04-05T00:00:01Z",
        "content_type": "text",
        "payload": "shape",
        "metadata": {},
    }
    ing = migrated_client.post(
        "/ingest/raw?connector=generic",
        headers=h,
        json=env,
    )
    assert ing.status_code == 200, ing.text
    doc_id = ing.json()["document_id"]
    r = migrated_client.get(
        f"/ingest/documents/{doc_id}/pipeline",
        headers=h,
    )
    assert r.status_code == 200
    j = r.json()
    required_keys = (
        "normalization_error",
        "ingest_meta",
        "vector_row_count",
        "gemini_embedding_row_count",
        "checked_at",
    )
    missing = [key for key in required_keys if key not in j]
    assert not missing, f"missing keys in pipeline response: {missing}"
@@ -0,0 +1,80 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import UTC, datetime
4
+
5
+ import pytest
6
+ from fastapi.testclient import TestClient
7
+
8
+ from app.services.playwright_extract.extract import (
9
+ assert_url_allowed,
10
+ host_matches_allowlist,
11
+ )
12
+
13
+
14
def test_host_allowlist_exact() -> None:
    """A plain allowlist entry matches that host and rejects a different one."""
    listed_matches = host_matches_allowlist("example.com", ["example.com"])
    unlisted_matches = host_matches_allowlist("evil.com", ["example.com"])
    assert listed_matches
    assert not unlisted_matches
17
+
18
+
19
def test_host_allowlist_suffix() -> None:
    """A ``*.example.com`` entry matches subdomains and the bare domain only."""
    # "notexample.com" merely ends with the same characters and must not match.
    subdomain_ok = host_matches_allowlist("sub.example.com", ["*.example.com"])
    apex_ok = host_matches_allowlist("example.com", ["*.example.com"])
    lookalike_ok = host_matches_allowlist("notexample.com", ["*.example.com"])
    assert subdomain_ok
    assert apex_ok
    assert not lookalike_ok
23
+
24
+
25
def test_assert_url_allowed_rejects_non_http() -> None:
    """A ``file://`` URL is refused with a ValueError that mentions the scheme."""
    expect_scheme_error = pytest.raises(ValueError, match="scheme")
    with expect_scheme_error:
        assert_url_allowed("file:///etc/passwd", ["x"])
28
+
29
+
30
def test_assert_url_allowed_empty_hosts() -> None:
    """An empty allowlist is refused with a ValueError mentioning "non-empty"."""
    expect_empty_error = pytest.raises(ValueError, match="non-empty")
    with expect_empty_error:
        assert_url_allowed("https://a.com", [])
33
+
34
+
35
def test_playwright_ingest_enriches_envelope(
    migrated_client: TestClient,
    monkeypatch,
) -> None:
    """Ingest via the ``playwright_session`` connector with extraction stubbed out.

    The stub stands in for ``extract_visible_text_sync`` and returns a fixed
    page; the response must report the extracted character count and carry
    the page text in a text content block.
    """
    from app.services.playwright_extract.extract import PlaywrightExtractResult

    def stub_extract(*args, **kwargs):
        # The allowlist from metadata.connector_config must reach the extractor.
        assert kwargs["allowlisted_hosts"] == ["example.com"]
        return PlaywrightExtractResult(
            visible_text="hello page",
            title="Hi",
            final_url="https://example.com/after",
            truncated=False,
            meta={},
        )

    monkeypatch.setattr(
        "app.connectors.playwright_session.extract_visible_text_sync",
        stub_extract,
    )

    envelope = {
        "source": "pw-test",
        "timestamp": datetime(2026, 3, 1, tzinfo=UTC).isoformat(),
        "content_type": "text",
        "payload": "",
        "metadata": {
            "url": "https://example.com/start",
            "playwright_user_data_dir": r"C:\fake\pw-profile",
            "connector_config": {
                "allowlisted_hosts": ["example.com"],
            },
        },
    }
    response = migrated_client.post(
        "/ingest/raw?connector=playwright_session",
        headers={"x-user-id": "u1"},
        json=envelope,
    )
    assert response.status_code == 200, response.text

    body = response.json()
    assert body["normalized"]["extracted_text_chars"] == len("hello page")

    # At least one text block must contain the stubbed page text.
    matching_blocks = [
        block
        for block in body["canonical"].get("content_blocks", [])
        if block.get("type") == "text" and "hello page" in (block.get("data") or "")
    ]
    assert matching_blocks