business-stack 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.python-version +1 -0
- package/backend/.env.example +65 -0
- package/backend/alembic/env.py +63 -0
- package/backend/alembic/script.py.mako +26 -0
- package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
- package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
- package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
- package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
- package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
- package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
- package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
- package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
- package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
- package/backend/alembic.ini +42 -0
- package/backend/app/__init__.py +0 -0
- package/backend/app/config.py +337 -0
- package/backend/app/connectors/__init__.py +13 -0
- package/backend/app/connectors/base.py +39 -0
- package/backend/app/connectors/builtins.py +51 -0
- package/backend/app/connectors/playwright_session.py +146 -0
- package/backend/app/connectors/registry.py +68 -0
- package/backend/app/connectors/thread_expansion/__init__.py +33 -0
- package/backend/app/connectors/thread_expansion/fakes.py +154 -0
- package/backend/app/connectors/thread_expansion/models.py +113 -0
- package/backend/app/connectors/thread_expansion/reddit.py +53 -0
- package/backend/app/connectors/thread_expansion/twitter.py +49 -0
- package/backend/app/db.py +5 -0
- package/backend/app/dependencies.py +34 -0
- package/backend/app/logging_config.py +35 -0
- package/backend/app/main.py +97 -0
- package/backend/app/middleware/__init__.py +0 -0
- package/backend/app/middleware/gateway_identity.py +17 -0
- package/backend/app/middleware/openapi_gateway.py +71 -0
- package/backend/app/middleware/request_id.py +23 -0
- package/backend/app/openapi_config.py +126 -0
- package/backend/app/routers/__init__.py +0 -0
- package/backend/app/routers/admin_pipeline.py +123 -0
- package/backend/app/routers/chat.py +206 -0
- package/backend/app/routers/chunks.py +36 -0
- package/backend/app/routers/entity_extract.py +31 -0
- package/backend/app/routers/example.py +8 -0
- package/backend/app/routers/gemini_embed.py +58 -0
- package/backend/app/routers/health.py +28 -0
- package/backend/app/routers/ingestion.py +146 -0
- package/backend/app/routers/link_expansion.py +34 -0
- package/backend/app/routers/pipeline_status.py +304 -0
- package/backend/app/routers/query.py +63 -0
- package/backend/app/routers/vectors.py +63 -0
- package/backend/app/schemas/__init__.py +0 -0
- package/backend/app/schemas/canonical.py +44 -0
- package/backend/app/schemas/chat.py +50 -0
- package/backend/app/schemas/ingest.py +29 -0
- package/backend/app/schemas/query.py +153 -0
- package/backend/app/schemas/vectors.py +56 -0
- package/backend/app/services/__init__.py +0 -0
- package/backend/app/services/chat_store.py +152 -0
- package/backend/app/services/chunking/__init__.py +3 -0
- package/backend/app/services/chunking/llm_boundaries.py +63 -0
- package/backend/app/services/chunking/schemas.py +30 -0
- package/backend/app/services/chunking/semantic_chunk.py +178 -0
- package/backend/app/services/chunking/splitters.py +214 -0
- package/backend/app/services/embeddings/__init__.py +20 -0
- package/backend/app/services/embeddings/build_inputs.py +140 -0
- package/backend/app/services/embeddings/dlq.py +128 -0
- package/backend/app/services/embeddings/gemini_api.py +207 -0
- package/backend/app/services/embeddings/persist.py +74 -0
- package/backend/app/services/embeddings/types.py +32 -0
- package/backend/app/services/embeddings/worker.py +224 -0
- package/backend/app/services/entities/__init__.py +12 -0
- package/backend/app/services/entities/gliner_extract.py +63 -0
- package/backend/app/services/entities/llm_extract.py +94 -0
- package/backend/app/services/entities/pipeline.py +179 -0
- package/backend/app/services/entities/spacy_extract.py +63 -0
- package/backend/app/services/entities/types.py +15 -0
- package/backend/app/services/gemini_chat.py +113 -0
- package/backend/app/services/hooks/__init__.py +3 -0
- package/backend/app/services/hooks/post_ingest.py +186 -0
- package/backend/app/services/ingestion/__init__.py +0 -0
- package/backend/app/services/ingestion/persist.py +188 -0
- package/backend/app/services/integrations_remote.py +91 -0
- package/backend/app/services/link_expansion/__init__.py +3 -0
- package/backend/app/services/link_expansion/canonical_url.py +45 -0
- package/backend/app/services/link_expansion/domain_policy.py +26 -0
- package/backend/app/services/link_expansion/html_extract.py +72 -0
- package/backend/app/services/link_expansion/rate_limit.py +32 -0
- package/backend/app/services/link_expansion/robots.py +46 -0
- package/backend/app/services/link_expansion/schemas.py +67 -0
- package/backend/app/services/link_expansion/worker.py +458 -0
- package/backend/app/services/normalization/__init__.py +7 -0
- package/backend/app/services/normalization/normalizer.py +331 -0
- package/backend/app/services/normalization/persist_normalized.py +67 -0
- package/backend/app/services/playwright_extract/__init__.py +13 -0
- package/backend/app/services/playwright_extract/__main__.py +96 -0
- package/backend/app/services/playwright_extract/extract.py +181 -0
- package/backend/app/services/retrieval_service.py +351 -0
- package/backend/app/sqlite_ext.py +36 -0
- package/backend/app/storage/__init__.py +3 -0
- package/backend/app/storage/blobs.py +30 -0
- package/backend/app/vectorstore/__init__.py +13 -0
- package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
- package/backend/backend.egg-info/PKG-INFO +18 -0
- package/backend/backend.egg-info/SOURCES.txt +93 -0
- package/backend/backend.egg-info/dependency_links.txt +1 -0
- package/backend/backend.egg-info/entry_points.txt +2 -0
- package/backend/backend.egg-info/requires.txt +15 -0
- package/backend/backend.egg-info/top_level.txt +4 -0
- package/backend/package.json +15 -0
- package/backend/pyproject.toml +52 -0
- package/backend/tests/conftest.py +40 -0
- package/backend/tests/test_chat.py +92 -0
- package/backend/tests/test_chunking.py +132 -0
- package/backend/tests/test_entities.py +170 -0
- package/backend/tests/test_gemini_embed.py +224 -0
- package/backend/tests/test_health.py +24 -0
- package/backend/tests/test_ingest_raw.py +123 -0
- package/backend/tests/test_link_expansion.py +241 -0
- package/backend/tests/test_main.py +12 -0
- package/backend/tests/test_normalizer.py +114 -0
- package/backend/tests/test_openapi_gateway.py +40 -0
- package/backend/tests/test_pipeline_hardening.py +285 -0
- package/backend/tests/test_pipeline_status.py +71 -0
- package/backend/tests/test_playwright_extract.py +80 -0
- package/backend/tests/test_post_ingest_hooks.py +162 -0
- package/backend/tests/test_query.py +165 -0
- package/backend/tests/test_thread_expansion.py +72 -0
- package/backend/tests/test_vectors.py +85 -0
- package/backend/uv.lock +1839 -0
- package/bin/business-stack.cjs +412 -0
- package/frontend/web/.env.example +23 -0
- package/frontend/web/AGENTS.md +5 -0
- package/frontend/web/CLAUDE.md +1 -0
- package/frontend/web/README.md +36 -0
- package/frontend/web/components.json +25 -0
- package/frontend/web/next-env.d.ts +6 -0
- package/frontend/web/next.config.ts +30 -0
- package/frontend/web/package.json +65 -0
- package/frontend/web/postcss.config.mjs +7 -0
- package/frontend/web/skills-lock.json +35 -0
- package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
- package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
- package/frontend/web/src/app/chat/page.tsx +725 -0
- package/frontend/web/src/app/favicon.ico +0 -0
- package/frontend/web/src/app/globals.css +563 -0
- package/frontend/web/src/app/layout.tsx +50 -0
- package/frontend/web/src/app/page.tsx +96 -0
- package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
- package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
- package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
- package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
- package/frontend/web/src/components/home-auth-panel.tsx +49 -0
- package/frontend/web/src/components/providers.tsx +50 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
- package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
- package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
- package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
- package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
- package/frontend/web/src/lib/auth-client.ts +23 -0
- package/frontend/web/src/lib/integrations-config.ts +125 -0
- package/frontend/web/src/lib/ui-utills.tsx +90 -0
- package/frontend/web/src/lib/utils.ts +6 -0
- package/frontend/web/tsconfig.json +36 -0
- package/frontend/web/tsconfig.tsbuildinfo +1 -0
- package/frontend/web/vitest.config.ts +14 -0
- package/gateway/.env.example +23 -0
- package/gateway/README.md +13 -0
- package/gateway/package.json +24 -0
- package/gateway/src/auth.ts +49 -0
- package/gateway/src/index.ts +141 -0
- package/gateway/src/integrations/admin.ts +19 -0
- package/gateway/src/integrations/crypto.ts +52 -0
- package/gateway/src/integrations/handlers.ts +124 -0
- package/gateway/src/integrations/keys.ts +12 -0
- package/gateway/src/integrations/store.ts +106 -0
- package/gateway/src/stack-secrets.ts +35 -0
- package/gateway/tsconfig.json +13 -0
- package/package.json +33 -0
- package/turbo.json +27 -0
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import sqlite3
|
|
5
|
+
import uuid
|
|
6
|
+
from datetime import UTC, datetime
|
|
7
|
+
from unittest.mock import AsyncMock, MagicMock, patch
|
|
8
|
+
|
|
9
|
+
import httpx
|
|
10
|
+
from fastapi.testclient import TestClient
|
|
11
|
+
|
|
12
|
+
from app.config import clear_settings_cache, get_settings
|
|
13
|
+
from app.services.embeddings.gemini_api import batch_embed_contents
|
|
14
|
+
from app.services.embeddings.types import InlineDataPart, TextPart
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _vec1536() -> list[float]:
    """Return a fresh 1536-element vector: 0.25 at index 0, zeros elsewhere."""
    return [0.25] + [0.0] * 1535
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_batch_embed_contents_parses_response(monkeypatch) -> None:
    """``batch_embed_contents`` yields one 1536-value vector per content item.

    HTTP traffic is served by an ``httpx.MockTransport`` whose handler always
    answers 200 with two embeddings, so no real network request is made.
    """
    monkeypatch.setenv("VECTOR_EMBEDDING_DIM", "1536")
    # Presumably drops cached settings so the env override above is honoured.
    clear_settings_cache()
    settings = get_settings()

    async def fake_gemini(request: httpx.Request) -> httpx.Response:
        # Canned reply: two embeddings, one for each content item sent below.
        payload = {"embeddings": [{"values": _vec1536()} for _ in range(2)]}
        return httpx.Response(200, json=payload)

    mock_transport = httpx.MockTransport(fake_gemini)

    async def exercise() -> None:
        async with httpx.AsyncClient(transport=mock_transport) as http:
            vectors = await batch_embed_contents(
                api_key="k",
                model="gemini-embedding-001",
                contents=[[TextPart("a")], [TextPart("b"), TextPart("c")]],
                settings=settings,
                client=http,
            )
        assert len(vectors) == 2
        assert len(vectors[0]) == 1536

    asyncio.run(exercise())
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_embed_endpoint_503_without_key(migrated_client: TestClient) -> None:
    """The embed endpoint answers 503 for this request.

    NOTE(review): the test name implies that no Gemini API key is configured;
    this function does not unset one itself, so it relies on the surrounding
    fixtures/environment for that — confirm.
    """
    clear_settings_cache()
    response = migrated_client.post(
        "/ingest/documents/x/embed",
        headers={"x-user-id": "u1"},
        json={"multimodal": False},
    )
    assert response.status_code == 503
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def test_embed_endpoint_accepts_with_key(
    migrated_client: TestClient,
    monkeypatch,
) -> None:
    """With ``GEMINI_API_KEY`` set, the embed endpoint accepts the request.

    Flow: ingest a text document through the ``echo`` connector, build its
    chunks, then POST to the embed endpoint while ``run_embed_document_job``
    is replaced by an ``AsyncMock``. The endpoint must answer 200 with
    ``accepted`` true and must have invoked the (mocked) job exactly once.
    """
    monkeypatch.setenv("GEMINI_API_KEY", "test-key")
    clear_settings_cache()

    h = {"x-user-id": "u1"}
    ing = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=h,
        json={
            "source": "s",
            "timestamp": datetime.now(UTC).isoformat(),
            "content_type": "text",
            "payload": "hello chunk",
            "metadata": {},
        },
    )
    # Fail here with a clear status mismatch rather than a KeyError below
    # if the ingest step itself was rejected.
    assert ing.status_code == 200
    doc_id = ing.json()["document_id"]

    ch = migrated_client.post(
        f"/ingest/documents/{doc_id}/chunks",
        headers=h,
        json={"use_llm_weak_structure": False},
    )
    assert ch.status_code == 200

    mock_job = AsyncMock()
    with patch(
        "app.routers.gemini_embed.run_embed_document_job",
        mock_job,
    ):
        r = migrated_client.post(
            f"/ingest/documents/{doc_id}/embed",
            headers=h,
            json={"multimodal": False},
        )
    assert r.status_code == 200
    assert r.json()["accepted"] is True
    mock_job.assert_called_once()
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def test_embed_document_persists_vectors_and_rows(
    tmp_path,
    monkeypatch,
) -> None:
    """``embed_document_gemini`` writes one ``embeddings`` row for one chunk.

    Steps:
      1. Point ``DATABASE_URL`` at a SQLite file under ``tmp_path`` and run
         the Alembic migrations up to ``head``.
      2. Seed one source, one document and one chunk through an async session.
      3. Call ``embed_document_gemini`` with ``batch_embed_contents`` patched
         to return a single 1536-value vector and a mocked vector store.
      4. Read the ``embeddings`` table with plain ``sqlite3`` and check the
         model name and dimension of the single stored row.

    The engine is disposed and the verification connection is closed even
    when a step or assertion fails, so a failing run does not leak handles.
    """
    db_file = tmp_path / "rag.sqlite"
    monkeypatch.setenv(
        "DATABASE_URL",
        f"sqlite+aiosqlite:///{db_file.as_posix()}",
    )
    monkeypatch.setenv("GEMINI_API_KEY", "k")
    monkeypatch.setenv("VECTOR_EMBEDDING_DIM", "1536")
    clear_settings_cache()

    import os
    from contextlib import closing
    from pathlib import Path

    from alembic.config import Config

    from alembic import command

    # Alembic is run from the backend directory; restore the cwd afterwards.
    backend = Path(__file__).resolve().parents[1]
    prev = os.getcwd()
    os.chdir(str(backend))
    try:
        command.upgrade(Config(str(backend / "alembic.ini")), "head")
    finally:
        os.chdir(prev)
    clear_settings_cache()

    from sqlalchemy import text
    from sqlalchemy.ext.asyncio import (
        AsyncSession,
        async_sessionmaker,
        create_async_engine,
    )

    from app.config import Settings
    from app.services.embeddings.worker import embed_document_gemini

    settings = Settings()
    engine = create_async_engine(settings.sqlalchemy_database_url)
    try:
        factory = async_sessionmaker(
            engine,
            class_=AsyncSession,
            expire_on_commit=False,
        )
        store = MagicMock()
        store.delete_document_for_session = AsyncMock(return_value=0)
        store.upsert_for_session = AsyncMock()
        doc_id = f"d-embed-{uuid.uuid4().hex[:12]}"

        async def _seed() -> None:
            async with factory() as session:
                await session.execute(
                    text("INSERT INTO sources (name, connector_type) VALUES ('s','s')"),
                )
                sid = int(
                    (
                        await session.execute(text("SELECT id FROM sources LIMIT 1"))
                    ).scalar_one(),
                )
                await session.execute(
                    text(
                        "INSERT INTO documents "
                        "(id, source_id, timestamp, content_type, raw_content, "
                        "summary, status) VALUES "
                        "(:id, :sid, :ts, 'text', '{}', 'x', 'partial')",
                    ),
                    {"id": doc_id, "sid": sid, "ts": datetime.now(UTC).isoformat()},
                )
                await session.execute(
                    text(
                        "INSERT INTO document_chunks "
                        "(document_id, ordinal, text, start_block_ordinal, "
                        "end_block_ordinal, meta) "
                        "VALUES (:d, 0, 'alpha beta', 0, 0, NULL)",
                    ),
                    {"d": doc_id},
                )
                await session.commit()

        asyncio.run(_seed())

        async def _run_embed() -> None:
            with patch(
                "app.services.embeddings.worker.batch_embed_contents",
                new=AsyncMock(return_value=[_vec1536()]),
            ):
                async with factory() as session:
                    n = await embed_document_gemini(
                        session,
                        document_id=doc_id,
                        settings=settings,
                        store=store,
                        multimodal=False,
                    )
                    await session.commit()
            assert n == 1

        asyncio.run(_run_embed())

        # ``closing`` guarantees the connection is closed even if the query
        # raises; sqlite3's own context manager only manages transactions.
        with closing(sqlite3.connect(str(db_file.resolve()))) as conn:
            erows = conn.execute(
                "SELECT document_id, model, dim FROM embeddings WHERE document_id = ?",
                (doc_id,),
            ).fetchall()
        assert len(erows) == 1
        assert erows[0][1] == "gemini-embedding-001"
        assert erows[0][2] == 1536
    finally:
        asyncio.run(engine.dispose())
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def test_inline_data_part_roundtrip_in_api_dict() -> None:
    """``_part_to_api_dict`` maps an ``InlineDataPart`` to an ``inlineData`` dict.

    Checks that the MIME type is carried over under ``mimeType`` and that a
    ``data`` key is present.
    """
    from app.services.embeddings.gemini_api import _part_to_api_dict

    png_part = InlineDataPart(mime_type="image/png", data=b"\x89PNG\r\n")
    api_dict = _part_to_api_dict(png_part)
    assert api_dict["inlineData"]["mimeType"] == "image/png"
    assert "data" in api_dict["inlineData"]
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from fastapi.testclient import TestClient
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_healthz_no_gateway_header(client: TestClient) -> None:
    """``GET /healthz`` sent without any headers returns 200 and ``status: ok``.

    The response must also carry a non-empty ``x-request-id`` header.
    """
    resp = client.get("/healthz")
    assert resp.status_code == 200
    assert resp.json() == {"status": "ok"}
    assert resp.headers.get("x-request-id")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_readyz_no_gateway_header(client: TestClient) -> None:
    """``GET /readyz`` sent without any headers returns 200 and ``status: ready``.

    The response must also carry a non-empty ``x-request-id`` header.
    """
    resp = client.get("/readyz")
    assert resp.status_code == 200
    assert resp.json() == {"status": "ready"}
    assert resp.headers.get("x-request-id")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_request_id_echo(client: TestClient) -> None:
    """A caller-supplied ``X-Request-ID`` comes back unchanged on the response."""
    supplied_id = "probe-abc"
    resp = client.get("/healthz", headers={"X-Request-ID": supplied_id})
    assert resp.status_code == 200
    assert resp.headers.get("x-request-id") == supplied_id
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sqlite3
|
|
5
|
+
from datetime import UTC, datetime
|
|
6
|
+
|
|
7
|
+
from fastapi.testclient import TestClient
|
|
8
|
+
|
|
9
|
+
from app.config import get_settings
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_ingest_raw_requires_auth(migrated_client: TestClient) -> None:
    """``POST /ingest/raw`` sent without an ``x-user-id`` header yields 401."""
    envelope = {
        "source": "test",
        "timestamp": datetime.now(UTC).isoformat(),
        "content_type": "text",
        "payload": {"x": 1},
        "metadata": {},
    }
    resp = migrated_client.post("/ingest/raw", json=envelope)
    assert resp.status_code == 401
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_ingest_raw_unknown_connector(migrated_client: TestClient) -> None:
    """Naming a connector key of ``does-not-exist`` is rejected with 400."""
    envelope = {
        "source": "test",
        "timestamp": datetime.now(UTC).isoformat(),
        "content_type": "text",
        "payload": {},
        "metadata": {},
    }
    resp = migrated_client.post(
        "/ingest/raw?connector=does-not-exist",
        headers={"x-user-id": "u1"},
        json=envelope,
    )
    assert resp.status_code == 400
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_ingest_raw_strict_connector_validation(migrated_client: TestClient) -> None:
    """The ``strict`` connector gates ingestion on ``connector_config.ok``.

    The same envelope is posted twice: with ``ok`` false it must be rejected
    (400); with ``ok`` true it must be accepted (200) with status ``partial``
    and ``normalized.strict`` set to true.
    """
    auth = {"x-user-id": "u1"}
    endpoint = "/ingest/raw?connector=strict"
    # Fields shared by both requests; the timestamp is computed only once.
    shared = {
        "source": "src-a",
        "timestamp": datetime.now(UTC).isoformat(),
        "content_type": "text",
        "payload": "hello",
    }

    rejected = migrated_client.post(
        endpoint,
        headers=auth,
        json={**shared, "metadata": {"connector_config": {"ok": False}}},
    )
    assert rejected.status_code == 400

    accepted = migrated_client.post(
        endpoint,
        headers=auth,
        json={**shared, "metadata": {"connector_config": {"ok": True}}},
    )
    assert accepted.status_code == 200
    result = accepted.json()
    assert result["status"] == "partial"
    assert result["normalized"]["strict"] is True
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_ingest_raw_persists_envelope(migrated_client: TestClient) -> None:
    """Ingesting via the ``echo`` connector persists the document and its block.

    First checks the HTTP response (status, connector, metadata keys and the
    single canonical text block), then opens the SQLite file located through
    ``get_settings()`` and verifies the ``documents`` row and the one
    ``content_blocks`` row written for the returned document id.

    The verification connection is wrapped in ``contextlib.closing`` so it is
    released even when one of the assertions on the fetched rows fails.
    """
    from contextlib import closing

    h = {"x-user-id": "u1"}
    ts = datetime(2024, 3, 15, 10, 0, 0, tzinfo=UTC)
    payload = {"body": "hello"}
    r = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=h,
        json={
            "source": "filesystem",
            "timestamp": ts.isoformat(),
            "content_type": "multimodal",
            "payload": payload,
            "metadata": {"path": "/a/b"},
        },
    )
    assert r.status_code == 200
    data = r.json()
    doc_id = data["document_id"]
    assert data["status"] == "partial"
    assert data["connector"] == "echo"
    assert data["normalized"]["metadata_keys"] == ["path"]
    assert data["canonical"]["id"] == doc_id
    assert len(data["canonical"]["content_blocks"]) == 1
    assert data["canonical"]["content_blocks"][0]["type"] == "text"
    assert data["canonical"]["content_blocks"][0]["data"] == "hello"

    settings = get_settings()
    db_path = (settings.data_dir / settings.sqlite_filename).resolve()
    # sqlite3's own context manager only commits/rolls back; ``closing`` is
    # what actually closes the handle on every exit path.
    with closing(sqlite3.connect(str(db_path))) as conn:
        row = conn.execute(
            "SELECT id, status, content_type, raw_content, summary "
            "FROM documents WHERE id = ?",
            (doc_id,),
        ).fetchone()
        assert row is not None
        assert row[1] == "partial"
        assert row[2] == "multimodal"
        stored = json.loads(row[3])
        assert stored["source"] == "filesystem"
        assert stored["payload"] == payload
        assert row[4] == "hello"

        blocks = conn.execute(
            "SELECT ordinal, type, meta FROM content_blocks "
            "WHERE document_id = ? ORDER BY ordinal",
            (doc_id,),
        ).fetchall()
    assert len(blocks) == 1
    assert blocks[0][1] == "text"
    assert json.loads(blocks[0][2])["text"] == "hello"
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_registry_includes_entrypoint_connector() -> None:
    """After ``init_connectors()`` the registry lists generic, strict and echo."""
    from app.connectors.registry import init_connectors, list_connector_keys

    init_connectors()
    registered = list_connector_keys()
    for expected_key in ("generic", "strict", "echo"):
        assert expected_key in registered
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import os
|
|
5
|
+
import sqlite3
|
|
6
|
+
from datetime import UTC, datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from unittest.mock import AsyncMock, patch
|
|
9
|
+
|
|
10
|
+
from alembic.config import Config
|
|
11
|
+
from fastapi.testclient import TestClient
|
|
12
|
+
from sqlalchemy import text
|
|
13
|
+
from sqlalchemy.ext.asyncio import (
|
|
14
|
+
AsyncSession,
|
|
15
|
+
async_sessionmaker,
|
|
16
|
+
create_async_engine,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
from alembic import command
|
|
20
|
+
from app.config import Settings, clear_settings_cache, get_settings
|
|
21
|
+
from app.services.link_expansion.canonical_url import canonicalize_url, host_from_url
|
|
22
|
+
from app.services.link_expansion.domain_policy import host_allowed, parse_host_csv
|
|
23
|
+
from app.services.link_expansion.schemas import ExpandLinksJobResult, ExpandLinksOptions
|
|
24
|
+
from app.services.link_expansion.worker import expand_links_from_document
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_canonicalize_url_strips_fragment_sorts_query() -> None:
    """Canonical form lowercases scheme/host, sorts the query, drops the fragment."""
    canonical = canonicalize_url("HTTPS://Example.COM/path?b=2&a=1#frag")
    assert canonical == "https://example.com/path?a=1&b=2"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_host_allowed_allowlist_denylist() -> None:
    """``host_allowed`` honours both the allowlist and the denylist.

    With allowlist ``good.example`` and denylist ``bad.example``: the allowed
    host and its subdomain pass, the denied host and an unlisted host fail.
    With an empty allowlist, a host that is not denied passes.
    """
    allow = frozenset({"good.example"})
    deny = frozenset({"bad.example"})
    expectations = (
        ("good.example", True),
        ("sub.good.example", True),
        ("bad.example", False),
        ("other.com", False),
    )
    for host, expected in expectations:
        assert host_allowed(host, allowlist=allow, denylist=deny) is expected
    assert host_allowed("open.com", allowlist=frozenset(), denylist=deny) is True
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_parse_host_csv() -> None:
    """A padded, upper-case CSV entry is found under its lower-case form."""
    parsed_hosts = parse_host_csv(" A.COM , b.com ")
    assert "a.com" in parsed_hosts
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class _MockResp:
    """Minimal response stand-in returned by ``_MockAsyncClient``.

    Carries ``content``, ``status_code``, ``headers`` and ``url`` attributes
    and offers ``is_success`` and ``text`` as read-only properties.
    """

    def __init__(
        self,
        *,
        content: bytes = b"",
        status: int = 200,
        headers: dict[str, str] | None = None,
        url: str = "",
    ) -> None:
        self.url = url
        self.status_code = status
        self.content = content
        # A falsy ``headers`` value (None or empty) becomes a fresh dict.
        self.headers = headers if headers else {}

    @property
    def is_success(self) -> bool:
        """Whether the status code falls in the 200-299 range."""
        return not (self.status_code < 200 or self.status_code >= 300)

    @property
    def text(self) -> str:
        """Body decoded as UTF-8, with undecodable bytes replaced."""
        return str(self.content, "utf-8", "replace")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class _MockAsyncClient:
    """Async-context-manager stand-in patched over ``httpx.AsyncClient``.

    ``head`` always reports 200 for the requested URL. ``get`` serves a
    permissive robots.txt for any URL containing ``robots.txt`` and otherwise
    a fixed HTML page that links to ``https://child.example/child``.
    """

    def __init__(self, *args: object, **kwargs: object) -> None:
        """Accept and ignore all constructor arguments."""

    async def __aenter__(self) -> _MockAsyncClient:
        return self

    async def __aexit__(self, *args: object) -> None:
        return None

    async def head(self, url: str, timeout: object = None) -> _MockResp:
        """Return an empty 200 response for ``url``."""
        return _MockResp(status=200, headers={}, url=url)

    async def get(self, url: str, timeout: object = None) -> _MockResp:
        """Return the canned robots.txt or the canned HTML page."""
        if "robots.txt" not in url:
            page = (
                b"<html><body>Hello "
                b'<a href="https://child.example/child">x</a></body></html>'
            )
            # The page reports a fixed URL regardless of what was requested.
            return _MockResp(
                content=page,
                headers={"content-type": "text/html"},
                url="https://example.com/page",
            )
        return _MockResp(content=b"User-agent: *\nDisallow:\n", url=url)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def test_expand_links_creates_child_and_relationship(tmp_path, monkeypatch) -> None:
    """Expanding a seeded link creates a child document and a ``link`` relation.

    Steps:
      1. Point ``DATA_DIR`` at ``tmp_path``, zero the politeness/per-domain
         delays and run the Alembic migrations up to ``head``.
      2. Seed a source, a root document and one ``document_links`` row.
      3. Run ``expand_links_from_document`` with ``httpx.AsyncClient`` in the
         worker module replaced by ``_MockAsyncClient``.
      4. Assert at least one document and one relationship were created and
         that a ``relationships`` row has the root as parent with type
         ``link``.

    The engine is disposed and the verification connection closed on every
    exit path, so a failing run does not leave handles open.
    """
    from contextlib import closing

    monkeypatch.setenv("DATA_DIR", str(tmp_path))
    monkeypatch.setenv("LINK_EXPAND_POLITENESS_DELAY_MS", "0")
    monkeypatch.setenv("LINK_EXPAND_PER_DOMAIN_INTERVAL_MS", "0")
    clear_settings_cache()

    # Alembic is run from the backend directory; restore the cwd afterwards.
    backend = Path(__file__).resolve().parents[1]
    prev = os.getcwd()
    os.chdir(str(backend))
    try:
        cfg = Config(str(backend / "alembic.ini"))
        command.upgrade(cfg, "head")
    finally:
        os.chdir(prev)
    clear_settings_cache()

    settings = Settings()
    engine = create_async_engine(settings.sqlalchemy_database_url)
    try:
        factory = async_sessionmaker(
            engine,
            class_=AsyncSession,
            expire_on_commit=False,
        )

        root_id = "root-doc-1"

        async def _seed() -> None:
            async with factory() as session:
                await session.execute(
                    text("INSERT INTO sources (name, connector_type) VALUES ('t', 't')"),
                )
                res = await session.execute(text("SELECT id FROM sources LIMIT 1"))
                sid = int(res.scalar_one())
                await session.execute(
                    text(
                        "INSERT INTO documents "
                        "(id, source_id, timestamp, content_type, raw_content, "
                        "summary, status) "
                        "VALUES (:id, :sid, :ts, 'text', '{}', 's', 'partial')",
                    ),
                    {
                        "id": root_id,
                        "sid": sid,
                        "ts": datetime.now(UTC).isoformat(),
                    },
                )
                await session.execute(
                    text(
                        "INSERT INTO document_links (document_id, url, ordinal) "
                        "VALUES (:id, :u, 0)",
                    ),
                    {"id": root_id, "u": "https://example.com/page"},
                )
                await session.commit()

        asyncio.run(_seed())

        opts = ExpandLinksOptions(
            max_depth=2,
            allowlist=frozenset(),
            denylist=frozenset(),
            respect_robots=True,
        )

        async def _expand() -> ExpandLinksJobResult:
            with patch(
                "app.services.link_expansion.worker.httpx.AsyncClient",
                _MockAsyncClient,
            ):
                async with factory() as session:
                    out = await expand_links_from_document(
                        session,
                        root_document_id=root_id,
                        options=opts,
                        settings=get_settings(),
                    )
                    await session.commit()
                    return out

        result = asyncio.run(_expand())
        assert result.relationships_created >= 1
        assert result.documents_created >= 1

        db_path = (get_settings().data_dir / get_settings().sqlite_filename).resolve()
        # ``closing`` releases the handle even if the query itself raises.
        with closing(sqlite3.connect(str(db_path))) as conn:
            rels = conn.execute(
                "SELECT parent_document_id, child_document_id, relation_type "
                "FROM relationships",
            ).fetchall()
        assert any(r[0] == root_id and r[2] == "link" for r in rels)
    finally:
        asyncio.run(engine.dispose())
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def test_expand_links_endpoint_404(migrated_client: TestClient) -> None:
    """Requesting expansion for the id ``does-not-exist`` yields 404."""
    resp = migrated_client.post(
        "/ingest/expand-links",
        headers={"x-user-id": "u1"},
        json={"document_id": "does-not-exist"},
    )
    assert resp.status_code == 404
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def test_expand_links_endpoint_accepts(
    migrated_client: TestClient,
    monkeypatch,
) -> None:
    """The expand-links endpoint accepts a request for an ingested document.

    A text document containing a URL is ingested through the ``echo``
    connector; the endpoint is then called with ``run_expand_links_job``
    replaced by an ``AsyncMock``. It must answer 200 with the accepted
    payload and have invoked the (mocked) job exactly once.
    """
    monkeypatch.setenv("LINK_EXPAND_POLITENESS_DELAY_MS", "0")
    monkeypatch.setenv("LINK_EXPAND_PER_DOMAIN_INTERVAL_MS", "0")
    clear_settings_cache()

    auth = {"x-user-id": "u1"}
    envelope = {
        "source": "t",
        "timestamp": datetime.now(UTC).isoformat(),
        "content_type": "text",
        "payload": "see https://example.com/x",
        "metadata": {},
    }
    ingest_resp = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=auth,
        json=envelope,
    )
    assert ingest_resp.status_code == 200
    doc_id = ingest_resp.json()["document_id"]

    job_stub = AsyncMock()
    with patch("app.routers.link_expansion.run_expand_links_job", job_stub):
        expand_resp = migrated_client.post(
            "/ingest/expand-links",
            headers=auth,
            json={"document_id": doc_id},
        )
    assert expand_resp.status_code == 200
    assert expand_resp.json() == {"accepted": True, "document_id": doc_id}
    job_stub.assert_called_once()
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def test_host_from_url() -> None:
    """The host is extracted from the URL and returned in lower case."""
    extracted = host_from_url("https://a.B.com/p")
    assert extracted == "a.b.com"
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from fastapi.testclient import TestClient
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_hello_world_requires_gateway_identity(client: TestClient) -> None:
    """``GET /hello-world`` sent without any headers is answered with 401."""
    resp = client.get("/hello-world")
    assert resp.status_code == 401
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def test_hello_world_with_user_header(client: TestClient) -> None:
    """With an ``x-user-id`` header, ``/hello-world`` returns 200 and the greeting."""
    identity = {"x-user-id": "user-1"}
    resp = client.get("/hello-world", headers=identity)
    assert resp.status_code == 200
    assert resp.json() == {"message": "Hello, World"}
|