business-stack 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/.python-version +1 -0
  2. package/backend/.env.example +65 -0
  3. package/backend/alembic/env.py +63 -0
  4. package/backend/alembic/script.py.mako +26 -0
  5. package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
  6. package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
  7. package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
  8. package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
  9. package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
  10. package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
  11. package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
  12. package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
  13. package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
  14. package/backend/alembic.ini +42 -0
  15. package/backend/app/__init__.py +0 -0
  16. package/backend/app/config.py +337 -0
  17. package/backend/app/connectors/__init__.py +13 -0
  18. package/backend/app/connectors/base.py +39 -0
  19. package/backend/app/connectors/builtins.py +51 -0
  20. package/backend/app/connectors/playwright_session.py +146 -0
  21. package/backend/app/connectors/registry.py +68 -0
  22. package/backend/app/connectors/thread_expansion/__init__.py +33 -0
  23. package/backend/app/connectors/thread_expansion/fakes.py +154 -0
  24. package/backend/app/connectors/thread_expansion/models.py +113 -0
  25. package/backend/app/connectors/thread_expansion/reddit.py +53 -0
  26. package/backend/app/connectors/thread_expansion/twitter.py +49 -0
  27. package/backend/app/db.py +5 -0
  28. package/backend/app/dependencies.py +34 -0
  29. package/backend/app/logging_config.py +35 -0
  30. package/backend/app/main.py +97 -0
  31. package/backend/app/middleware/__init__.py +0 -0
  32. package/backend/app/middleware/gateway_identity.py +17 -0
  33. package/backend/app/middleware/openapi_gateway.py +71 -0
  34. package/backend/app/middleware/request_id.py +23 -0
  35. package/backend/app/openapi_config.py +126 -0
  36. package/backend/app/routers/__init__.py +0 -0
  37. package/backend/app/routers/admin_pipeline.py +123 -0
  38. package/backend/app/routers/chat.py +206 -0
  39. package/backend/app/routers/chunks.py +36 -0
  40. package/backend/app/routers/entity_extract.py +31 -0
  41. package/backend/app/routers/example.py +8 -0
  42. package/backend/app/routers/gemini_embed.py +58 -0
  43. package/backend/app/routers/health.py +28 -0
  44. package/backend/app/routers/ingestion.py +146 -0
  45. package/backend/app/routers/link_expansion.py +34 -0
  46. package/backend/app/routers/pipeline_status.py +304 -0
  47. package/backend/app/routers/query.py +63 -0
  48. package/backend/app/routers/vectors.py +63 -0
  49. package/backend/app/schemas/__init__.py +0 -0
  50. package/backend/app/schemas/canonical.py +44 -0
  51. package/backend/app/schemas/chat.py +50 -0
  52. package/backend/app/schemas/ingest.py +29 -0
  53. package/backend/app/schemas/query.py +153 -0
  54. package/backend/app/schemas/vectors.py +56 -0
  55. package/backend/app/services/__init__.py +0 -0
  56. package/backend/app/services/chat_store.py +152 -0
  57. package/backend/app/services/chunking/__init__.py +3 -0
  58. package/backend/app/services/chunking/llm_boundaries.py +63 -0
  59. package/backend/app/services/chunking/schemas.py +30 -0
  60. package/backend/app/services/chunking/semantic_chunk.py +178 -0
  61. package/backend/app/services/chunking/splitters.py +214 -0
  62. package/backend/app/services/embeddings/__init__.py +20 -0
  63. package/backend/app/services/embeddings/build_inputs.py +140 -0
  64. package/backend/app/services/embeddings/dlq.py +128 -0
  65. package/backend/app/services/embeddings/gemini_api.py +207 -0
  66. package/backend/app/services/embeddings/persist.py +74 -0
  67. package/backend/app/services/embeddings/types.py +32 -0
  68. package/backend/app/services/embeddings/worker.py +224 -0
  69. package/backend/app/services/entities/__init__.py +12 -0
  70. package/backend/app/services/entities/gliner_extract.py +63 -0
  71. package/backend/app/services/entities/llm_extract.py +94 -0
  72. package/backend/app/services/entities/pipeline.py +179 -0
  73. package/backend/app/services/entities/spacy_extract.py +63 -0
  74. package/backend/app/services/entities/types.py +15 -0
  75. package/backend/app/services/gemini_chat.py +113 -0
  76. package/backend/app/services/hooks/__init__.py +3 -0
  77. package/backend/app/services/hooks/post_ingest.py +186 -0
  78. package/backend/app/services/ingestion/__init__.py +0 -0
  79. package/backend/app/services/ingestion/persist.py +188 -0
  80. package/backend/app/services/integrations_remote.py +91 -0
  81. package/backend/app/services/link_expansion/__init__.py +3 -0
  82. package/backend/app/services/link_expansion/canonical_url.py +45 -0
  83. package/backend/app/services/link_expansion/domain_policy.py +26 -0
  84. package/backend/app/services/link_expansion/html_extract.py +72 -0
  85. package/backend/app/services/link_expansion/rate_limit.py +32 -0
  86. package/backend/app/services/link_expansion/robots.py +46 -0
  87. package/backend/app/services/link_expansion/schemas.py +67 -0
  88. package/backend/app/services/link_expansion/worker.py +458 -0
  89. package/backend/app/services/normalization/__init__.py +7 -0
  90. package/backend/app/services/normalization/normalizer.py +331 -0
  91. package/backend/app/services/normalization/persist_normalized.py +67 -0
  92. package/backend/app/services/playwright_extract/__init__.py +13 -0
  93. package/backend/app/services/playwright_extract/__main__.py +96 -0
  94. package/backend/app/services/playwright_extract/extract.py +181 -0
  95. package/backend/app/services/retrieval_service.py +351 -0
  96. package/backend/app/sqlite_ext.py +36 -0
  97. package/backend/app/storage/__init__.py +3 -0
  98. package/backend/app/storage/blobs.py +30 -0
  99. package/backend/app/vectorstore/__init__.py +13 -0
  100. package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
  101. package/backend/backend.egg-info/PKG-INFO +18 -0
  102. package/backend/backend.egg-info/SOURCES.txt +93 -0
  103. package/backend/backend.egg-info/dependency_links.txt +1 -0
  104. package/backend/backend.egg-info/entry_points.txt +2 -0
  105. package/backend/backend.egg-info/requires.txt +15 -0
  106. package/backend/backend.egg-info/top_level.txt +4 -0
  107. package/backend/package.json +15 -0
  108. package/backend/pyproject.toml +52 -0
  109. package/backend/tests/conftest.py +40 -0
  110. package/backend/tests/test_chat.py +92 -0
  111. package/backend/tests/test_chunking.py +132 -0
  112. package/backend/tests/test_entities.py +170 -0
  113. package/backend/tests/test_gemini_embed.py +224 -0
  114. package/backend/tests/test_health.py +24 -0
  115. package/backend/tests/test_ingest_raw.py +123 -0
  116. package/backend/tests/test_link_expansion.py +241 -0
  117. package/backend/tests/test_main.py +12 -0
  118. package/backend/tests/test_normalizer.py +114 -0
  119. package/backend/tests/test_openapi_gateway.py +40 -0
  120. package/backend/tests/test_pipeline_hardening.py +285 -0
  121. package/backend/tests/test_pipeline_status.py +71 -0
  122. package/backend/tests/test_playwright_extract.py +80 -0
  123. package/backend/tests/test_post_ingest_hooks.py +162 -0
  124. package/backend/tests/test_query.py +165 -0
  125. package/backend/tests/test_thread_expansion.py +72 -0
  126. package/backend/tests/test_vectors.py +85 -0
  127. package/backend/uv.lock +1839 -0
  128. package/bin/business-stack.cjs +412 -0
  129. package/frontend/web/.env.example +23 -0
  130. package/frontend/web/AGENTS.md +5 -0
  131. package/frontend/web/CLAUDE.md +1 -0
  132. package/frontend/web/README.md +36 -0
  133. package/frontend/web/components.json +25 -0
  134. package/frontend/web/next-env.d.ts +6 -0
  135. package/frontend/web/next.config.ts +30 -0
  136. package/frontend/web/package.json +65 -0
  137. package/frontend/web/postcss.config.mjs +7 -0
  138. package/frontend/web/skills-lock.json +35 -0
  139. package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
  140. package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
  141. package/frontend/web/src/app/chat/page.tsx +725 -0
  142. package/frontend/web/src/app/favicon.ico +0 -0
  143. package/frontend/web/src/app/globals.css +563 -0
  144. package/frontend/web/src/app/layout.tsx +50 -0
  145. package/frontend/web/src/app/page.tsx +96 -0
  146. package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
  147. package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
  148. package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
  149. package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
  150. package/frontend/web/src/components/home-auth-panel.tsx +49 -0
  151. package/frontend/web/src/components/providers.tsx +50 -0
  152. package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
  153. package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
  154. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
  155. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
  156. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
  157. package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
  158. package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
  159. package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
  160. package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
  161. package/frontend/web/src/lib/auth-client.ts +23 -0
  162. package/frontend/web/src/lib/integrations-config.ts +125 -0
  163. package/frontend/web/src/lib/ui-utills.tsx +90 -0
  164. package/frontend/web/src/lib/utils.ts +6 -0
  165. package/frontend/web/tsconfig.json +36 -0
  166. package/frontend/web/tsconfig.tsbuildinfo +1 -0
  167. package/frontend/web/vitest.config.ts +14 -0
  168. package/gateway/.env.example +23 -0
  169. package/gateway/README.md +13 -0
  170. package/gateway/package.json +24 -0
  171. package/gateway/src/auth.ts +49 -0
  172. package/gateway/src/index.ts +141 -0
  173. package/gateway/src/integrations/admin.ts +19 -0
  174. package/gateway/src/integrations/crypto.ts +52 -0
  175. package/gateway/src/integrations/handlers.ts +124 -0
  176. package/gateway/src/integrations/keys.ts +12 -0
  177. package/gateway/src/integrations/store.ts +106 -0
  178. package/gateway/src/stack-secrets.ts +35 -0
  179. package/gateway/tsconfig.json +13 -0
  180. package/package.json +33 -0
  181. package/turbo.json +27 -0
@@ -0,0 +1,224 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import sqlite3
5
+ import uuid
6
+ from datetime import UTC, datetime
7
+ from unittest.mock import AsyncMock, MagicMock, patch
8
+
9
+ import httpx
10
+ from fastapi.testclient import TestClient
11
+
12
+ from app.config import clear_settings_cache, get_settings
13
+ from app.services.embeddings.gemini_api import batch_embed_contents
14
+ from app.services.embeddings.types import InlineDataPart, TextPart
15
+
16
+
17
+ def _vec1536() -> list[float]:
18
+ v = [0.0] * 1536
19
+ v[0] = 0.25
20
+ return v
21
+
22
+
23
def test_batch_embed_contents_parses_response(monkeypatch) -> None:
    """``batch_embed_contents`` returns one 1536-wide vector per content item.

    The HTTP layer is replaced by an ``httpx.MockTransport`` that always
    answers 200 with two embeddings, so no network traffic occurs.
    """
    monkeypatch.setenv("VECTOR_EMBEDDING_DIM", "1536")
    clear_settings_cache()
    settings = get_settings()

    async def handler(request: httpx.Request) -> httpx.Response:
        # Two embeddings, matching the two content items sent below.
        embeddings = [{"values": _vec1536()} for _ in range(2)]
        return httpx.Response(200, json={"embeddings": embeddings})

    mock_transport = httpx.MockTransport(handler)

    async def _run() -> None:
        async with httpx.AsyncClient(transport=mock_transport) as http_client:
            vectors = await batch_embed_contents(
                api_key="k",
                model="gemini-embedding-001",
                contents=[[TextPart("a")], [TextPart("b"), TextPart("c")]],
                settings=settings,
                client=http_client,
            )
        assert len(vectors) == 2
        assert len(vectors[0]) == 1536

    asyncio.run(_run())
52
+
53
+
54
def test_embed_endpoint_503_without_key(migrated_client: TestClient) -> None:
    """The embed route answers 503 once the settings cache has been cleared.

    NOTE(review): presumably relies on ``GEMINI_API_KEY`` being absent from
    the environment -- confirm the fixture guarantees this.
    """
    clear_settings_cache()
    response = migrated_client.post(
        "/ingest/documents/x/embed",
        headers={"x-user-id": "u1"},
        json={"multimodal": False},
    )
    assert response.status_code == 503
63
+
64
+
65
def test_embed_endpoint_accepts_with_key(
    migrated_client: TestClient,
    monkeypatch,
) -> None:
    """The embed route accepts a request when ``GEMINI_API_KEY`` is set.

    Ingests a document through the ``echo`` connector, chunks it, then posts
    to the embed route with ``run_embed_document_job`` patched to an
    ``AsyncMock`` so the real job is not executed.
    """
    monkeypatch.setenv("GEMINI_API_KEY", "test-key")
    clear_settings_cache()

    h = {"x-user-id": "u1"}
    ing = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=h,
        json={
            "source": "s",
            "timestamp": datetime.now(UTC).isoformat(),
            "content_type": "text",
            "payload": "hello chunk",
            "metadata": {},
        },
    )
    # Surface a rejected ingest as a status mismatch instead of a KeyError
    # on "document_id" below.
    assert ing.status_code == 200
    doc_id = ing.json()["document_id"]

    ch = migrated_client.post(
        f"/ingest/documents/{doc_id}/chunks",
        headers=h,
        json={"use_llm_weak_structure": False},
    )
    assert ch.status_code == 200

    mock_job = AsyncMock()
    with patch(
        "app.routers.gemini_embed.run_embed_document_job",
        mock_job,
    ):
        r = migrated_client.post(
            f"/ingest/documents/{doc_id}/embed",
            headers=h,
            json={"multimodal": False},
        )
    assert r.status_code == 200
    assert r.json()["accepted"] is True
    mock_job.assert_called_once()
105
+
106
+
107
def test_embed_document_persists_vectors_and_rows(
    tmp_path,
    monkeypatch,
) -> None:
    """``embed_document_gemini`` writes one ``embeddings`` row for the document.

    A throwaway SQLite file under ``tmp_path`` is migrated to ``head``, seeded
    with one source, one document and one chunk, and then embedded with
    ``batch_embed_contents`` patched to return a single 1536-wide vector.
    The vector store is a ``MagicMock`` with async stubs. The resulting row
    is read back through a plain ``sqlite3`` connection.
    """
    db_file = tmp_path / "rag.sqlite"
    monkeypatch.setenv(
        "DATABASE_URL",
        f"sqlite+aiosqlite:///{db_file.as_posix()}",
    )
    monkeypatch.setenv("GEMINI_API_KEY", "k")
    monkeypatch.setenv("VECTOR_EMBEDDING_DIM", "1536")
    clear_settings_cache()

    import os
    from pathlib import Path

    from alembic.config import Config

    from alembic import command

    # Run the migration with the backend directory as cwd, restoring the
    # previous cwd afterwards even if the upgrade fails.
    backend = Path(__file__).resolve().parents[1]
    prev = os.getcwd()
    os.chdir(str(backend))
    try:
        command.upgrade(Config(str(backend / "alembic.ini")), "head")
    finally:
        os.chdir(prev)
    clear_settings_cache()

    from sqlalchemy import text
    from sqlalchemy.ext.asyncio import (
        AsyncSession,
        async_sessionmaker,
        create_async_engine,
    )

    from app.config import Settings
    from app.services.embeddings.worker import embed_document_gemini

    settings = Settings()
    engine = create_async_engine(settings.sqlalchemy_database_url)
    factory = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
    store = MagicMock()
    store.delete_document_for_session = AsyncMock(return_value=0)
    store.upsert_for_session = AsyncMock()
    doc_id = f"d-embed-{uuid.uuid4().hex[:12]}"

    async def _seed() -> None:
        async with factory() as session:
            await session.execute(
                text("INSERT INTO sources (name, connector_type) VALUES ('s','s')"),
            )
            sid = int(
                (
                    await session.execute(text("SELECT id FROM sources LIMIT 1"))
                ).scalar_one(),
            )
            await session.execute(
                text(
                    "INSERT INTO documents "
                    "(id, source_id, timestamp, content_type, raw_content, "
                    "summary, status) VALUES "
                    "(:id, :sid, :ts, 'text', '{}', 'x', 'partial')",
                ),
                {"id": doc_id, "sid": sid, "ts": datetime.now(UTC).isoformat()},
            )
            await session.execute(
                text(
                    "INSERT INTO document_chunks "
                    "(document_id, ordinal, text, start_block_ordinal, "
                    "end_block_ordinal, meta) "
                    "VALUES (:d, 0, 'alpha beta', 0, 0, NULL)",
                ),
                {"d": doc_id},
            )
            await session.commit()

    async def _run_embed() -> None:
        with patch(
            "app.services.embeddings.worker.batch_embed_contents",
            new=AsyncMock(return_value=[_vec1536()]),
        ):
            async with factory() as session:
                n = await embed_document_gemini(
                    session,
                    document_id=doc_id,
                    settings=settings,
                    store=store,
                    multimodal=False,
                )
                await session.commit()
        assert n == 1

    # Everything that uses the engine runs under try/finally so the engine is
    # disposed even when seeding, embedding, or an assertion fails.
    try:
        asyncio.run(_seed())
        asyncio.run(_run_embed())

        db_path = db_file.resolve()
        conn = sqlite3.connect(str(db_path))
        try:
            erows = conn.execute(
                "SELECT document_id, model, dim FROM embeddings WHERE document_id = ?",
                (doc_id,),
            ).fetchall()
        finally:
            # Close the raw connection even if the query raises.
            conn.close()
        assert len(erows) == 1
        assert erows[0][1] == "gemini-embedding-001"
        assert erows[0][2] == 1536
    finally:
        asyncio.run(engine.dispose())
216
+
217
+
218
def test_inline_data_part_roundtrip_in_api_dict() -> None:
    """``_part_to_api_dict`` emits ``inlineData`` with a mime type and data."""
    from app.services.embeddings.gemini_api import _part_to_api_dict

    part = InlineDataPart(mime_type="image/png", data=b"\x89PNG\r\n")
    inline = _part_to_api_dict(part)["inlineData"]
    assert inline["mimeType"] == "image/png"
    assert "data" in inline
@@ -0,0 +1,24 @@
1
+ from fastapi.testclient import TestClient
2
+
3
+
4
def test_healthz_no_gateway_header(client: TestClient) -> None:
    """``/healthz`` returns 200 ``{"status": "ok"}`` without any request headers.

    Also checks that the response carries a non-empty ``x-request-id``.
    """
    resp = client.get("/healthz")
    assert resp.status_code == 200
    assert resp.json() == {"status": "ok"}
    request_id = resp.headers.get("x-request-id")
    assert request_id
9
+
10
+
11
def test_readyz_no_gateway_header(client: TestClient) -> None:
    """``/readyz`` returns 200 ``{"status": "ready"}`` without any request headers.

    Also checks that the response carries a non-empty ``x-request-id``.
    """
    resp = client.get("/readyz")
    assert resp.status_code == 200
    assert resp.json() == {"status": "ready"}
    request_id = resp.headers.get("x-request-id")
    assert request_id
16
+
17
+
18
def test_request_id_echo(client: TestClient) -> None:
    """A caller-supplied ``X-Request-ID`` is returned unchanged on the response."""
    probe_headers = {"X-Request-ID": "probe-abc"}
    resp = client.get("/healthz", headers=probe_headers)
    assert resp.status_code == 200
    assert resp.headers.get("x-request-id") == "probe-abc"
@@ -0,0 +1,123 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import sqlite3
5
+ from datetime import UTC, datetime
6
+
7
+ from fastapi.testclient import TestClient
8
+
9
+ from app.config import get_settings
10
+
11
+
12
def test_ingest_raw_requires_auth(migrated_client: TestClient) -> None:
    """``POST /ingest/raw`` without an ``x-user-id`` header is rejected with 401."""
    envelope = {
        "source": "test",
        "timestamp": datetime.now(UTC).isoformat(),
        "content_type": "text",
        "payload": {"x": 1},
        "metadata": {},
    }
    resp = migrated_client.post("/ingest/raw", json=envelope)
    assert resp.status_code == 401
24
+
25
+
26
def test_ingest_raw_unknown_connector(migrated_client: TestClient) -> None:
    """Naming the ``does-not-exist`` connector yields a 400 response."""
    envelope = {
        "source": "test",
        "timestamp": datetime.now(UTC).isoformat(),
        "content_type": "text",
        "payload": {},
        "metadata": {},
    }
    resp = migrated_client.post(
        "/ingest/raw?connector=does-not-exist",
        headers={"x-user-id": "u1"},
        json=envelope,
    )
    assert resp.status_code == 400
40
+
41
+
42
def test_ingest_raw_strict_connector_validation(migrated_client: TestClient) -> None:
    """The ``strict`` connector returns 400 for ``ok: False`` and 200 for ``ok: True``.

    The same envelope is posted twice; only ``metadata.connector_config.ok``
    differs between the two requests.
    """
    url = "/ingest/raw?connector=strict"
    headers = {"x-user-id": "u1"}
    envelope = {
        "source": "src-a",
        "timestamp": datetime.now(UTC).isoformat(),
        "content_type": "text",
        "payload": "hello",
        "metadata": {"connector_config": {"ok": False}},
    }
    rejected = migrated_client.post(url, headers=headers, json=envelope)
    assert rejected.status_code == 400

    # Same envelope (including the timestamp) with the flag flipped.
    accepted_envelope = {**envelope, "metadata": {"connector_config": {"ok": True}}}
    accepted = migrated_client.post(url, headers=headers, json=accepted_envelope)
    assert accepted.status_code == 200
    result = accepted.json()
    assert result["status"] == "partial"
    assert result["normalized"]["strict"] is True
60
+
61
+
62
def test_ingest_raw_persists_envelope(migrated_client: TestClient) -> None:
    """An ``echo`` ingest is reflected in the response and in the SQLite tables.

    Checks the JSON response (status, connector, normalized metadata keys,
    canonical content blocks) and then reads the ``documents`` and
    ``content_blocks`` rows back through a plain ``sqlite3`` connection.
    """
    h = {"x-user-id": "u1"}
    ts = datetime(2024, 3, 15, 10, 0, 0, tzinfo=UTC)
    payload = {"body": "hello"}
    r = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=h,
        json={
            "source": "filesystem",
            "timestamp": ts.isoformat(),
            "content_type": "multimodal",
            "payload": payload,
            "metadata": {"path": "/a/b"},
        },
    )
    assert r.status_code == 200
    data = r.json()
    doc_id = data["document_id"]
    assert data["status"] == "partial"
    assert data["connector"] == "echo"
    assert data["normalized"]["metadata_keys"] == ["path"]
    assert data["canonical"]["id"] == doc_id
    assert len(data["canonical"]["content_blocks"]) == 1
    assert data["canonical"]["content_blocks"][0]["type"] == "text"
    assert data["canonical"]["content_blocks"][0]["data"] == "hello"

    settings = get_settings()
    db_path = (settings.data_dir / settings.sqlite_filename).resolve()
    conn = sqlite3.connect(str(db_path))
    try:
        # Fetch everything first and assert after the connection is closed,
        # so a failing assertion cannot leave the database file open.
        row = conn.execute(
            "SELECT id, status, content_type, raw_content, summary "
            "FROM documents WHERE id = ?",
            (doc_id,),
        ).fetchone()
        blocks = conn.execute(
            "SELECT ordinal, type, meta FROM content_blocks "
            "WHERE document_id = ? ORDER BY ordinal",
            (doc_id,),
        ).fetchall()
    finally:
        conn.close()

    assert row is not None
    assert row[1] == "partial"
    assert row[2] == "multimodal"
    stored = json.loads(row[3])
    assert stored["source"] == "filesystem"
    assert stored["payload"] == payload
    assert row[4] == "hello"

    assert len(blocks) == 1
    assert blocks[0][1] == "text"
    assert json.loads(blocks[0][2])["text"] == "hello"
114
+
115
+
116
def test_registry_includes_entrypoint_connector() -> None:
    """After ``init_connectors()`` the key list has ``generic``, ``strict``, ``echo``."""
    from app.connectors.registry import init_connectors, list_connector_keys

    init_connectors()
    registered = list_connector_keys()
    for expected_key in ("generic", "strict", "echo"):
        assert expected_key in registered
@@ -0,0 +1,241 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import os
5
+ import sqlite3
6
+ from datetime import UTC, datetime
7
+ from pathlib import Path
8
+ from unittest.mock import AsyncMock, patch
9
+
10
+ from alembic.config import Config
11
+ from fastapi.testclient import TestClient
12
+ from sqlalchemy import text
13
+ from sqlalchemy.ext.asyncio import (
14
+ AsyncSession,
15
+ async_sessionmaker,
16
+ create_async_engine,
17
+ )
18
+
19
+ from alembic import command
20
+ from app.config import Settings, clear_settings_cache, get_settings
21
+ from app.services.link_expansion.canonical_url import canonicalize_url, host_from_url
22
+ from app.services.link_expansion.domain_policy import host_allowed, parse_host_csv
23
+ from app.services.link_expansion.schemas import ExpandLinksJobResult, ExpandLinksOptions
24
+ from app.services.link_expansion.worker import expand_links_from_document
25
+
26
+
27
def test_canonicalize_url_strips_fragment_sorts_query() -> None:
    """Scheme and host are lowercased, the fragment dropped, the query sorted."""
    raw_url = "HTTPS://Example.COM/path?b=2&a=1#frag"
    expected = "https://example.com/path?a=1&b=2"
    assert canonicalize_url(raw_url) == expected
30
+
31
+
32
def test_host_allowed_allowlist_denylist() -> None:
    """``host_allowed`` is checked against a table of host/allowlist cases.

    Every case shares the same denylist (``bad.example``); the last case uses
    an empty allowlist.
    """
    allow = frozenset({"good.example"})
    deny = frozenset({"bad.example"})
    cases = (
        ("good.example", allow, True),
        ("sub.good.example", allow, True),
        ("bad.example", allow, False),
        ("other.com", allow, False),
        ("open.com", frozenset(), True),
    )
    for host, allowlist, expected in cases:
        assert host_allowed(host, allowlist=allowlist, denylist=deny) is expected
40
+
41
+
42
def test_parse_host_csv() -> None:
    """A padded, upper-case ``A.COM`` entry is found as ``a.com`` in the result."""
    hosts = parse_host_csv(" A.COM , b.com ")
    assert "a.com" in hosts
44
+
45
+
46
+ class _MockResp:
47
+ def __init__(
48
+ self,
49
+ *,
50
+ content: bytes = b"",
51
+ status: int = 200,
52
+ headers: dict[str, str] | None = None,
53
+ url: str = "",
54
+ ) -> None:
55
+ self.content = content
56
+ self.status_code = status
57
+ self.headers = headers or {}
58
+ self.url = url
59
+
60
+ @property
61
+ def is_success(self) -> bool:
62
+ return 200 <= self.status_code < 300
63
+
64
+ @property
65
+ def text(self) -> str:
66
+ return self.content.decode("utf-8", errors="replace")
67
+
68
+
69
+ class _MockAsyncClient:
70
+ def __init__(self, *args: object, **kwargs: object) -> None:
71
+ pass
72
+
73
+ async def __aenter__(self) -> _MockAsyncClient:
74
+ return self
75
+
76
+ async def __aexit__(self, *args: object) -> None:
77
+ pass
78
+
79
+ async def head(self, url: str, timeout: object = None) -> _MockResp:
80
+ return _MockResp(status=200, headers={}, url=url)
81
+
82
+ async def get(self, url: str, timeout: object = None) -> _MockResp:
83
+ if "robots.txt" in url:
84
+ body = b"User-agent: *\nDisallow:\n"
85
+ return _MockResp(content=body, url=url)
86
+ html = (
87
+ b"<html><body>Hello "
88
+ b'<a href="https://child.example/child">x</a></body></html>'
89
+ )
90
+ return _MockResp(
91
+ content=html,
92
+ headers={"content-type": "text/html"},
93
+ url="https://example.com/page",
94
+ )
95
+
96
+
97
def test_expand_links_creates_child_and_relationship(tmp_path, monkeypatch) -> None:
    """Expanding a seeded root document yields a child and a ``link`` relationship.

    The database under ``tmp_path`` is migrated to ``head`` and seeded with a
    source, a root document, and one ``document_links`` row. The worker's
    ``httpx.AsyncClient`` is patched with ``_MockAsyncClient``, so no network
    traffic occurs. The ``relationships`` table is read back through a plain
    ``sqlite3`` connection.
    """
    monkeypatch.setenv("DATA_DIR", str(tmp_path))
    monkeypatch.setenv("LINK_EXPAND_POLITENESS_DELAY_MS", "0")
    monkeypatch.setenv("LINK_EXPAND_PER_DOMAIN_INTERVAL_MS", "0")
    clear_settings_cache()

    # Run the migration with the backend directory as cwd, restoring the
    # previous cwd afterwards even if the upgrade fails.
    backend = Path(__file__).resolve().parents[1]
    prev = os.getcwd()
    os.chdir(str(backend))
    try:
        cfg = Config(str(backend / "alembic.ini"))
        command.upgrade(cfg, "head")
    finally:
        os.chdir(prev)
    clear_settings_cache()

    settings = Settings()
    engine = create_async_engine(settings.sqlalchemy_database_url)
    factory = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

    root_id = "root-doc-1"

    async def _seed() -> None:
        async with factory() as session:
            await session.execute(
                text("INSERT INTO sources (name, connector_type) VALUES ('t', 't')"),
            )
            res = await session.execute(text("SELECT id FROM sources LIMIT 1"))
            sid = int(res.scalar_one())
            await session.execute(
                text(
                    "INSERT INTO documents "
                    "(id, source_id, timestamp, content_type, raw_content, "
                    "summary, status) "
                    "VALUES (:id, :sid, :ts, 'text', '{}', 's', 'partial')",
                ),
                {
                    "id": root_id,
                    "sid": sid,
                    "ts": datetime.now(UTC).isoformat(),
                },
            )
            await session.execute(
                text(
                    "INSERT INTO document_links (document_id, url, ordinal) "
                    "VALUES (:id, :u, 0)",
                ),
                {"id": root_id, "u": "https://example.com/page"},
            )
            await session.commit()

    opts = ExpandLinksOptions(
        max_depth=2,
        allowlist=frozenset(),
        denylist=frozenset(),
        respect_robots=True,
    )

    async def _expand() -> ExpandLinksJobResult:
        with patch(
            "app.services.link_expansion.worker.httpx.AsyncClient",
            _MockAsyncClient,
        ):
            async with factory() as session:
                out = await expand_links_from_document(
                    session,
                    root_document_id=root_id,
                    options=opts,
                    settings=get_settings(),
                )
                await session.commit()
                return out

    # Everything that uses the engine runs under try/finally so the engine is
    # disposed even when seeding, expansion, or an assertion fails.
    try:
        asyncio.run(_seed())

        result = asyncio.run(_expand())
        assert result.relationships_created >= 1
        assert result.documents_created >= 1

        db_path = (get_settings().data_dir / get_settings().sqlite_filename).resolve()
        conn = sqlite3.connect(str(db_path))
        try:
            rels = conn.execute(
                "SELECT parent_document_id, child_document_id, relation_type "
                "FROM relationships",
            ).fetchall()
        finally:
            # Close the raw connection even if the query raises.
            conn.close()
        assert any(r[0] == root_id and r[2] == "link" for r in rels)
    finally:
        asyncio.run(engine.dispose())
189
+
190
+
191
def test_expand_links_endpoint_404(migrated_client: TestClient) -> None:
    """Requesting expansion for the id ``does-not-exist`` yields 404."""
    resp = migrated_client.post(
        "/ingest/expand-links",
        headers={"x-user-id": "u1"},
        json={"document_id": "does-not-exist"},
    )
    assert resp.status_code == 404
199
+
200
+
201
def test_expand_links_endpoint_accepts(
    migrated_client: TestClient,
    monkeypatch,
) -> None:
    """The expand-links route accepts an ingested document and calls the job once.

    ``run_expand_links_job`` is patched to an ``AsyncMock`` so the real job is
    not executed.
    """
    for env_name in (
        "LINK_EXPAND_POLITENESS_DELAY_MS",
        "LINK_EXPAND_PER_DOMAIN_INTERVAL_MS",
    ):
        monkeypatch.setenv(env_name, "0")
    clear_settings_cache()

    headers = {"x-user-id": "u1"}
    envelope = {
        "source": "t",
        "timestamp": datetime.now(UTC).isoformat(),
        "content_type": "text",
        "payload": "see https://example.com/x",
        "metadata": {},
    }
    ingest_resp = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=headers,
        json=envelope,
    )
    assert ingest_resp.status_code == 200
    doc_id = ingest_resp.json()["document_id"]

    job_stub = AsyncMock()
    with patch("app.routers.link_expansion.run_expand_links_job", job_stub):
        expand_resp = migrated_client.post(
            "/ingest/expand-links",
            headers=headers,
            json={"document_id": doc_id},
        )
    assert expand_resp.status_code == 200
    assert expand_resp.json() == {"accepted": True, "document_id": doc_id}
    job_stub.assert_called_once()
238
+
239
+
240
def test_host_from_url() -> None:
    """The host of ``https://a.B.com/p`` is returned lowercased as ``a.b.com``."""
    extracted = host_from_url("https://a.B.com/p")
    assert extracted == "a.b.com"
@@ -0,0 +1,12 @@
1
+ from fastapi.testclient import TestClient
2
+
3
+
4
def test_hello_world_requires_gateway_identity(client: TestClient) -> None:
    """``GET /hello-world`` with no headers is rejected with 401."""
    resp = client.get("/hello-world")
    assert resp.status_code == 401
7
+
8
+
9
def test_hello_world_with_user_header(client: TestClient) -> None:
    """``GET /hello-world`` with an ``x-user-id`` header returns the greeting."""
    identity = {"x-user-id": "user-1"}
    resp = client.get("/hello-world", headers=identity)
    assert resp.status_code == 200
    assert resp.json() == {"message": "Hello, World"}