business-stack 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.python-version +1 -0
- package/backend/.env.example +65 -0
- package/backend/alembic/env.py +63 -0
- package/backend/alembic/script.py.mako +26 -0
- package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
- package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
- package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
- package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
- package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
- package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
- package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
- package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
- package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
- package/backend/alembic.ini +42 -0
- package/backend/app/__init__.py +0 -0
- package/backend/app/config.py +337 -0
- package/backend/app/connectors/__init__.py +13 -0
- package/backend/app/connectors/base.py +39 -0
- package/backend/app/connectors/builtins.py +51 -0
- package/backend/app/connectors/playwright_session.py +146 -0
- package/backend/app/connectors/registry.py +68 -0
- package/backend/app/connectors/thread_expansion/__init__.py +33 -0
- package/backend/app/connectors/thread_expansion/fakes.py +154 -0
- package/backend/app/connectors/thread_expansion/models.py +113 -0
- package/backend/app/connectors/thread_expansion/reddit.py +53 -0
- package/backend/app/connectors/thread_expansion/twitter.py +49 -0
- package/backend/app/db.py +5 -0
- package/backend/app/dependencies.py +34 -0
- package/backend/app/logging_config.py +35 -0
- package/backend/app/main.py +97 -0
- package/backend/app/middleware/__init__.py +0 -0
- package/backend/app/middleware/gateway_identity.py +17 -0
- package/backend/app/middleware/openapi_gateway.py +71 -0
- package/backend/app/middleware/request_id.py +23 -0
- package/backend/app/openapi_config.py +126 -0
- package/backend/app/routers/__init__.py +0 -0
- package/backend/app/routers/admin_pipeline.py +123 -0
- package/backend/app/routers/chat.py +206 -0
- package/backend/app/routers/chunks.py +36 -0
- package/backend/app/routers/entity_extract.py +31 -0
- package/backend/app/routers/example.py +8 -0
- package/backend/app/routers/gemini_embed.py +58 -0
- package/backend/app/routers/health.py +28 -0
- package/backend/app/routers/ingestion.py +146 -0
- package/backend/app/routers/link_expansion.py +34 -0
- package/backend/app/routers/pipeline_status.py +304 -0
- package/backend/app/routers/query.py +63 -0
- package/backend/app/routers/vectors.py +63 -0
- package/backend/app/schemas/__init__.py +0 -0
- package/backend/app/schemas/canonical.py +44 -0
- package/backend/app/schemas/chat.py +50 -0
- package/backend/app/schemas/ingest.py +29 -0
- package/backend/app/schemas/query.py +153 -0
- package/backend/app/schemas/vectors.py +56 -0
- package/backend/app/services/__init__.py +0 -0
- package/backend/app/services/chat_store.py +152 -0
- package/backend/app/services/chunking/__init__.py +3 -0
- package/backend/app/services/chunking/llm_boundaries.py +63 -0
- package/backend/app/services/chunking/schemas.py +30 -0
- package/backend/app/services/chunking/semantic_chunk.py +178 -0
- package/backend/app/services/chunking/splitters.py +214 -0
- package/backend/app/services/embeddings/__init__.py +20 -0
- package/backend/app/services/embeddings/build_inputs.py +140 -0
- package/backend/app/services/embeddings/dlq.py +128 -0
- package/backend/app/services/embeddings/gemini_api.py +207 -0
- package/backend/app/services/embeddings/persist.py +74 -0
- package/backend/app/services/embeddings/types.py +32 -0
- package/backend/app/services/embeddings/worker.py +224 -0
- package/backend/app/services/entities/__init__.py +12 -0
- package/backend/app/services/entities/gliner_extract.py +63 -0
- package/backend/app/services/entities/llm_extract.py +94 -0
- package/backend/app/services/entities/pipeline.py +179 -0
- package/backend/app/services/entities/spacy_extract.py +63 -0
- package/backend/app/services/entities/types.py +15 -0
- package/backend/app/services/gemini_chat.py +113 -0
- package/backend/app/services/hooks/__init__.py +3 -0
- package/backend/app/services/hooks/post_ingest.py +186 -0
- package/backend/app/services/ingestion/__init__.py +0 -0
- package/backend/app/services/ingestion/persist.py +188 -0
- package/backend/app/services/integrations_remote.py +91 -0
- package/backend/app/services/link_expansion/__init__.py +3 -0
- package/backend/app/services/link_expansion/canonical_url.py +45 -0
- package/backend/app/services/link_expansion/domain_policy.py +26 -0
- package/backend/app/services/link_expansion/html_extract.py +72 -0
- package/backend/app/services/link_expansion/rate_limit.py +32 -0
- package/backend/app/services/link_expansion/robots.py +46 -0
- package/backend/app/services/link_expansion/schemas.py +67 -0
- package/backend/app/services/link_expansion/worker.py +458 -0
- package/backend/app/services/normalization/__init__.py +7 -0
- package/backend/app/services/normalization/normalizer.py +331 -0
- package/backend/app/services/normalization/persist_normalized.py +67 -0
- package/backend/app/services/playwright_extract/__init__.py +13 -0
- package/backend/app/services/playwright_extract/__main__.py +96 -0
- package/backend/app/services/playwright_extract/extract.py +181 -0
- package/backend/app/services/retrieval_service.py +351 -0
- package/backend/app/sqlite_ext.py +36 -0
- package/backend/app/storage/__init__.py +3 -0
- package/backend/app/storage/blobs.py +30 -0
- package/backend/app/vectorstore/__init__.py +13 -0
- package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
- package/backend/backend.egg-info/PKG-INFO +18 -0
- package/backend/backend.egg-info/SOURCES.txt +93 -0
- package/backend/backend.egg-info/dependency_links.txt +1 -0
- package/backend/backend.egg-info/entry_points.txt +2 -0
- package/backend/backend.egg-info/requires.txt +15 -0
- package/backend/backend.egg-info/top_level.txt +4 -0
- package/backend/package.json +15 -0
- package/backend/pyproject.toml +52 -0
- package/backend/tests/conftest.py +40 -0
- package/backend/tests/test_chat.py +92 -0
- package/backend/tests/test_chunking.py +132 -0
- package/backend/tests/test_entities.py +170 -0
- package/backend/tests/test_gemini_embed.py +224 -0
- package/backend/tests/test_health.py +24 -0
- package/backend/tests/test_ingest_raw.py +123 -0
- package/backend/tests/test_link_expansion.py +241 -0
- package/backend/tests/test_main.py +12 -0
- package/backend/tests/test_normalizer.py +114 -0
- package/backend/tests/test_openapi_gateway.py +40 -0
- package/backend/tests/test_pipeline_hardening.py +285 -0
- package/backend/tests/test_pipeline_status.py +71 -0
- package/backend/tests/test_playwright_extract.py +80 -0
- package/backend/tests/test_post_ingest_hooks.py +162 -0
- package/backend/tests/test_query.py +165 -0
- package/backend/tests/test_thread_expansion.py +72 -0
- package/backend/tests/test_vectors.py +85 -0
- package/backend/uv.lock +1839 -0
- package/bin/business-stack.cjs +412 -0
- package/frontend/web/.env.example +23 -0
- package/frontend/web/AGENTS.md +5 -0
- package/frontend/web/CLAUDE.md +1 -0
- package/frontend/web/README.md +36 -0
- package/frontend/web/components.json +25 -0
- package/frontend/web/next-env.d.ts +6 -0
- package/frontend/web/next.config.ts +30 -0
- package/frontend/web/package.json +65 -0
- package/frontend/web/postcss.config.mjs +7 -0
- package/frontend/web/skills-lock.json +35 -0
- package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
- package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
- package/frontend/web/src/app/chat/page.tsx +725 -0
- package/frontend/web/src/app/favicon.ico +0 -0
- package/frontend/web/src/app/globals.css +563 -0
- package/frontend/web/src/app/layout.tsx +50 -0
- package/frontend/web/src/app/page.tsx +96 -0
- package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
- package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
- package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
- package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
- package/frontend/web/src/components/home-auth-panel.tsx +49 -0
- package/frontend/web/src/components/providers.tsx +50 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
- package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
- package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
- package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
- package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
- package/frontend/web/src/lib/auth-client.ts +23 -0
- package/frontend/web/src/lib/integrations-config.ts +125 -0
- package/frontend/web/src/lib/ui-utills.tsx +90 -0
- package/frontend/web/src/lib/utils.ts +6 -0
- package/frontend/web/tsconfig.json +36 -0
- package/frontend/web/tsconfig.tsbuildinfo +1 -0
- package/frontend/web/vitest.config.ts +14 -0
- package/gateway/.env.example +23 -0
- package/gateway/README.md +13 -0
- package/gateway/package.json +24 -0
- package/gateway/src/auth.ts +49 -0
- package/gateway/src/index.ts +141 -0
- package/gateway/src/integrations/admin.ts +19 -0
- package/gateway/src/integrations/crypto.ts +52 -0
- package/gateway/src/integrations/handlers.ts +124 -0
- package/gateway/src/integrations/keys.ts +12 -0
- package/gateway/src/integrations/store.ts +106 -0
- package/gateway/src/stack-secrets.ts +35 -0
- package/gateway/tsconfig.json +13 -0
- package/package.json +33 -0
- package/turbo.json +27 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
from datetime import UTC, datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from unittest.mock import AsyncMock, MagicMock, patch
|
|
9
|
+
|
|
10
|
+
from alembic.config import Config
|
|
11
|
+
from sqlalchemy import text
|
|
12
|
+
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
|
13
|
+
|
|
14
|
+
from alembic import command
|
|
15
|
+
from app.config import Settings, clear_settings_cache, get_settings
|
|
16
|
+
from app.services.hooks.post_ingest import (
|
|
17
|
+
_apply_template,
|
|
18
|
+
_truncate,
|
|
19
|
+
dispatch_post_ingest_hooks,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_apply_template_missing_key_preserved() -> None:
    """Known placeholders are substituted; unknown ones remain in the output."""
    rendered = _apply_template("Hi {name} {missing}", {"name": "x"})
    # "x" is the substituted value; "{missing}" has no mapping entry.
    for fragment in ("x", "{missing}"):
        assert fragment in rendered
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_truncate() -> None:
    """Truncating ``"abc"`` with a limit of 2 yields ``"a…"``."""
    shortened = _truncate("abc", 2)
    assert shortened == "a…"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
async def _seed_ok_doc(session: AsyncSession) -> str:
    """Seed one source, one ``ok`` document and one document link.

    Args:
        session: Open async session bound to the database under test.

    Returns:
        The id of the inserted document (``"hook-doc-1"``).
    """
    insert_source = text(
        "INSERT INTO sources (name, connector_type) VALUES ('src1', 't')",
    )
    await session.execute(insert_source)

    # Read back the id of the first source row.
    source_row = await session.execute(text("SELECT id FROM sources LIMIT 1"))
    source_id = int(source_row.scalar_one())

    document_id = "hook-doc-1"
    insert_document = text(
        "INSERT INTO documents "
        "(id, source_id, timestamp, content_type, raw_content, summary, status, "
        "canonical_url, external_id, dedupe_content_hash, normalization_error) "
        "VALUES (:id, :sid, :ts, 'text', '{}', :sum, 'ok', :cu, NULL, NULL, NULL)",
    )
    document_params = {
        "id": document_id,
        "sid": source_id,
        "ts": datetime.now(UTC).isoformat(),
        "sum": "hello " * 50,
        "cu": "https://example.com/page",
    }
    await session.execute(insert_document, document_params)

    insert_link = text(
        "INSERT INTO document_links (document_id, url, ordinal) "
        "VALUES (:d, 'https://fallback.test/', 0)",
    )
    await session.execute(insert_link, {"d": document_id})

    await session.commit()
    return document_id
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_dispatch_slack_and_discord_no_secret_logs(
    tmp_path,
    monkeypatch,
    caplog,
) -> None:
    """Dispatch to Slack and Discord and check webhook URLs stay out of the logs.

    Migrates a database with ``DATA_DIR`` pointed at ``tmp_path``, seeds one
    ``ok`` document, and runs ``dispatch_post_ingest_hooks`` with
    ``httpx.AsyncClient`` replaced by a mock. Asserts that exactly two POSTs
    are made (one Slack URL, one Discord URL), that the Slack payload text
    contains the document summary and canonical URL, and that the secret URL
    fragments do not appear in the captured log text.
    """
    monkeypatch.setenv("DATA_DIR", str(tmp_path))
    monkeypatch.setenv(
        "POST_INGEST_SLACK_WEBHOOK_URL",
        "https://hooks.slack.com/services/SECRET/SHOULD/NOTLOG",
    )
    monkeypatch.setenv(
        "POST_INGEST_DISCORD_WEBHOOK_URL",
        "https://discord.com/api/webhooks/SECRET/NOTLOG",
    )
    clear_settings_cache()

    # Run the migrations from the backend directory, then restore the cwd.
    backend = Path(__file__).resolve().parents[1]
    prev = os.getcwd()
    os.chdir(str(backend))
    try:
        command.upgrade(Config(str(backend / "alembic.ini")), "head")
    finally:
        os.chdir(prev)
    clear_settings_cache()

    settings = Settings()
    engine = create_async_engine(settings.sqlalchemy_database_url)
    # Dispose the engine even when an assertion below fails, so a failing
    # run does not leave the engine's connections open.
    try:
        factory = async_sessionmaker(
            engine,
            class_=AsyncSession,
            expire_on_commit=False,
        )

        async def _setup() -> str:
            async with factory() as session:
                return await _seed_ok_doc(session)

        doc_id = asyncio.run(_setup())

        caplog.set_level(logging.INFO, logger="app.services.hooks.post_ingest")

        # Stand-in for httpx.AsyncClient used as an async context manager.
        mock_resp = MagicMock()
        mock_resp.raise_for_status = MagicMock()
        client_inst = MagicMock()
        client_inst.post = AsyncMock(return_value=mock_resp)
        client_inst.__aenter__ = AsyncMock(return_value=client_inst)
        client_inst.__aexit__ = AsyncMock(return_value=False)

        with patch(
            "app.services.hooks.post_ingest.httpx.AsyncClient",
            return_value=client_inst,
        ):
            asyncio.run(
                dispatch_post_ingest_hooks(
                    factory,
                    document_id=doc_id,
                    settings=get_settings(),
                ),
            )

        assert client_inst.post.await_count == 2

        def _call_url(c: object) -> str:
            # The URL may be passed positionally or as the ``url`` keyword.
            args, kwargs = c.args, c.kwargs
            if args:
                return str(args[0])
            return str(kwargs.get("url", ""))

        calls = list(client_inst.post.await_args_list)
        urls = [_call_url(c) for c in calls]
        assert any("hooks.slack.com" in u for u in urls)
        assert any("discord.com" in u for u in urls)
        slack_call = next(c for c in calls if "slack" in _call_url(c))
        slack_json = slack_call.kwargs["json"]
        assert "hello" in slack_json["text"]
        assert "example.com/page" in slack_json["text"]

        log_text = caplog.text
        assert "SECRET" not in log_text
        assert "NOTLOG" not in log_text
        assert "hooks.slack.com" not in log_text.lower()
    finally:
        asyncio.run(engine.dispose())
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def test_dispatch_no_webhooks_is_noop(tmp_path, monkeypatch) -> None:
    """With both webhook env vars unset, dispatch never builds an HTTP client."""
    monkeypatch.setenv("DATA_DIR", str(tmp_path))
    for env_name in (
        "POST_INGEST_SLACK_WEBHOOK_URL",
        "POST_INGEST_DISCORD_WEBHOOK_URL",
    ):
        monkeypatch.delenv(env_name, raising=False)
    clear_settings_cache()

    with patch("app.services.hooks.post_ingest.httpx.AsyncClient") as client_cls:
        pending = dispatch_post_ingest_hooks(
            MagicMock(),
            document_id="x",
            settings=get_settings(),
        )
        asyncio.run(pending)
        client_cls.assert_not_called()
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import sqlite3
|
|
4
|
+
from datetime import UTC, datetime
|
|
5
|
+
from unittest.mock import AsyncMock, patch
|
|
6
|
+
|
|
7
|
+
from fastapi.testclient import TestClient
|
|
8
|
+
|
|
9
|
+
from app.config import clear_settings_cache, get_settings
|
|
10
|
+
from app.schemas.query import QueryFiltersPayload
|
|
11
|
+
from app.services.retrieval_service import (
|
|
12
|
+
blended_retrieval_score,
|
|
13
|
+
recency_score_from_ingested_at,
|
|
14
|
+
semantic_score_from_distance,
|
|
15
|
+
source_weight_for_connector,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
_DIM = 1536
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _vec(first: float = 1.0) -> list[float]:
    """Return a ``_DIM``-length vector: ``first`` at index 0, zeros elsewhere."""
    return [first] + [0.0] * (_DIM - 1)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_query_requires_gateway(migrated_client: TestClient) -> None:
    """POST /query sent without any headers is rejected with 401."""
    payload = {"query": "hi", "k": 5}
    response = migrated_client.post("/query", json=payload)
    assert response.status_code == 401
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_semantic_score_decreases_with_distance() -> None:
    """Distance 0.0 scores strictly higher than distance 2.0."""
    near_score = semantic_score_from_distance(0.0)
    far_score = semantic_score_from_distance(2.0)
    assert near_score > far_score
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_recency_newer_higher() -> None:
    """A recent ingestion timestamp scores higher than a much older one."""
    reference = datetime(2026, 6, 15, tzinfo=UTC)
    # Same reference time and half-life for both; only the timestamp differs.
    stale_score, fresh_score = (
        recency_score_from_ingested_at(stamp, now=reference, half_life_days=30.0)
        for stamp in ("2020-01-01T00:00:00Z", "2026-06-10T00:00:00Z")
    )
    assert fresh_score > stale_score
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def test_source_weight_default() -> None:
    """Unknown connectors get the ``default`` weight; known ones get their own."""
    weights = {"default": 0.5, "web": 0.9}
    expectations = (("missing", 0.5), ("web", 0.9))
    for connector, expected in expectations:
        assert source_weight_for_connector(connector, weights) == expected
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def test_blended_score() -> None:
    """All-ones inputs blend to 1.0 (within 1e-6) under a fresh ``Settings()``."""
    from app.config import Settings

    unit_inputs = {"semantic": 1.0, "recency": 1.0, "source_w": 1.0}
    blended = blended_retrieval_score(**unit_inputs, settings=Settings())
    assert abs(blended - 1.0) < 1e-6
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test_filters_newer_than_days_sets_timestamp_min() -> None:
    """``newer_than_days`` alone sets a lower timestamp bound and no upper bound."""
    reference = datetime(2026, 1, 20, 12, 0, 0, tzinfo=UTC)
    payload = QueryFiltersPayload(newer_than_days=10.0)
    vector_filters = payload.to_vector_filters(now=reference)
    assert vector_filters is not None
    assert vector_filters.timestamp_min is not None
    assert vector_filters.timestamp_max is None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def test_vector_filters_merge_timestamp_min() -> None:
    """An explicit ``timestamp_min`` later than the ``newer_than_days`` cutoff wins."""
    reference = datetime(2026, 1, 20, tzinfo=UTC)
    explicit_min = "2026-01-01T00:00:00Z"
    payload = QueryFiltersPayload(
        timestamp_min=explicit_min,
        newer_than_days=30.0,
    )
    vector_filters = payload.to_vector_filters(now=reference)
    assert vector_filters is not None
    # 30 days before the reference is earlier than the explicit minimum,
    # so the later (stricter) explicit bound should be kept.
    assert vector_filters.timestamp_min == "2026-01-01T00:00:00Z"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def test_query_retrieves_chunk(migrated_client: TestClient, monkeypatch) -> None:
    """Seed one chunk and its embedding, then check /query returns it.

    Rows for a source, a document and a chunk are written directly with
    ``sqlite3``; the embedding is stored through ``/vectors/upsert``.
    ``batch_embed_contents`` is patched to return the same vector so the
    query matches the stored chunk. The response must hold exactly one
    candidate with the seeded chunk text, document summary and document id,
    and the combined context text must mention the document id.
    """
    monkeypatch.setenv("GEMINI_API_KEY", "test-key")
    clear_settings_cache()

    h = {"x-user-id": "u1"}
    settings = get_settings()
    db_path = settings.data_dir / settings.sqlite_filename
    conn = sqlite3.connect(str(db_path))
    # Close the connection even if an INSERT fails, so a failing run does
    # not leave the database file open.
    try:
        cur = conn.cursor()
        cur.execute(
            "INSERT INTO sources (name, connector_type) VALUES ('n', 'web')",
        )
        sid = int(cur.lastrowid)
        cur.execute(
            "INSERT INTO documents (id, source_id, timestamp, content_type, raw_content, "
            "summary, status) VALUES ('d1', ?, '2024-01-01T00:00:00Z', 'text', '{}', "
            "'summary one', 'ok')",
            (sid,),
        )
        cur.execute(
            "INSERT INTO document_chunks (document_id, ordinal, text, start_block_ordinal, "
            "end_block_ordinal, meta) VALUES ('d1', 0, 'alpha beta gamma', 0, 0, NULL)",
        )
        cid = int(cur.lastrowid)
        conn.commit()
    finally:
        conn.close()

    ts = "2026-01-10T12:00:00Z"
    up = migrated_client.post(
        "/vectors/upsert",
        headers=h,
        json={
            "embeddings": [_vec(1.0)],
            "metas": [
                {
                    "document_id": "d1",
                    "chunk_id": cid,
                    "source_id": sid,
                    "modality": "text",
                    "ingested_at": ts,
                },
            ],
        },
    )
    assert up.status_code == 200

    async def _fake_batch(*args, **kwargs):
        # Same vector as the stored embedding.
        return [_vec(1.0)]

    with patch(
        "app.services.retrieval_service.batch_embed_contents",
        new=AsyncMock(side_effect=_fake_batch),
    ):
        q = migrated_client.post(
            "/query",
            headers=h,
            json={"query": "alpha beta", "k": 5},
        )
    assert q.status_code == 200
    data = q.json()
    assert len(data["candidates"]) == 1
    c0 = data["candidates"][0]
    assert c0["chunk"]["text"] == "alpha beta gamma"
    assert c0["document"]["summary"] == "summary one"
    assert c0["attribution"]["document_id"] == "d1"
    assert "combined_text" in data["context"]
    assert "d1" in data["context"]["combined_text"]
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def test_query_503_without_gemini_key(migrated_client: TestClient, monkeypatch) -> None:
    """With ``GEMINI_API_KEY`` unset, an identified /query request returns 503."""
    monkeypatch.delenv("GEMINI_API_KEY", raising=False)
    clear_settings_cache()
    identity_headers = {"x-user-id": "u1"}
    response = migrated_client.post(
        "/query",
        headers=identity_headers,
        json={"query": "x", "k": 1},
    )
    assert response.status_code == 503
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from datetime import UTC, datetime
|
|
5
|
+
|
|
6
|
+
import pytest
|
|
7
|
+
|
|
8
|
+
from app.connectors.thread_expansion import (
|
|
9
|
+
FakeRedditThreadExpansionFetcher,
|
|
10
|
+
FakeTwitterThreadExpansionFetcher,
|
|
11
|
+
ThreadExpansionResult,
|
|
12
|
+
ThreadRelationshipEdge,
|
|
13
|
+
)
|
|
14
|
+
from app.connectors.thread_expansion.models import ThreadDocumentNode
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_thread_edge_rejects_self_loop() -> None:
    """An edge whose parent and child ids are equal raises ``ValueError``."""
    self_loop = {
        "parent_document_id": "a",
        "child_document_id": "a",
        "relation_type": "reply",
    }
    with pytest.raises(ValueError, match="must differ"):
        ThreadRelationshipEdge(**self_loop)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_expansion_result_requires_root_in_nodes() -> None:
    """A result whose root id matches no node raises ``ValueError``."""
    with pytest.raises(ValueError, match="root_document_id"):
        # The only node has id "x", while the root id is "missing".
        lone_node = ThreadDocumentNode(
            document_id="x",
            external_id="1",
            text="t",
            created_at=datetime.now(UTC),
        )
        ThreadExpansionResult(
            platform="twitter",
            thread_key="k",
            root_document_id="missing",
            nodes=[lone_node],
            edges=[],
        )
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_fake_twitter_linear_reply_chain() -> None:
    """The fake Twitter fetcher returns a root with two chained reply edges."""
    fetcher = FakeTwitterThreadExpansionFetcher()
    result = asyncio.run(fetcher.fetch_full_thread("1999", config={}))
    assert result.platform == "twitter"
    assert result.root_document_id == "twitter:tw_root"

    # Collect child -> parent for reply edges only.
    reply_count = 0
    parent_of = {}
    for edge in result.edges:
        if edge.relation_type != "reply":
            continue
        reply_count += 1
        parent_of[edge.child_document_id] = edge.parent_document_id

    assert reply_count == 2
    assert parent_of["twitter:tw_r1"] == "twitter:tw_root"
    assert parent_of["twitter:tw_r2"] == "twitter:tw_r1"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_fake_reddit_post_and_nested_comment() -> None:
    """The fake Reddit fetcher returns three nodes and a post-to-comment edge."""
    fetcher = FakeRedditThreadExpansionFetcher()
    result = asyncio.run(fetcher.fetch_post_and_comments("t3_abc123", config={}))
    assert result.platform == "reddit"
    assert len(result.nodes) == 3

    parent_child_pairs = [
        (edge.parent_document_id, edge.child_document_id) for edge in result.edges
    ]
    assert ("reddit:t3_abc123", "reddit:t1_cmnt1") in parent_child_pairs
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_fakes_run_sync_helper() -> None:
    """Check a fake fetcher can be driven with ``asyncio.run`` from sync code."""
    fetcher = FakeTwitterThreadExpansionFetcher()
    outcome = asyncio.run(fetcher.fetch_full_thread("1", config={}))
    assert isinstance(outcome, ThreadExpansionResult)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from fastapi.testclient import TestClient
|
|
4
|
+
|
|
5
|
+
_DIM = 1536
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _vec(first: float = 1.0) -> list[float]:
    """Build a ``_DIM``-length vector with ``first`` in slot 0 and zeros after."""
    trailing_zeros = [0.0] * (_DIM - 1)
    return [first, *trailing_zeros]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def test_vectors_require_gateway(migrated_client: TestClient) -> None:
    """POST /vectors/search sent without any headers is rejected with 401."""
    search_body = {"query_vector": _vec(), "k": 1}
    response = migrated_client.post("/vectors/search", json=search_body)
    assert response.status_code == 401
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_vectors_upsert_search_delete(migrated_client: TestClient) -> None:
    """Round-trip two embeddings through upsert, search, filtered search and delete.

    Upserts vectors for ``doc-a`` and ``doc-b``. A search with ``doc-a``'s
    vector must rank ``doc-a`` first with a strictly smaller distance, a
    search filtered on source id and modality must return both rows, and
    after deleting ``doc-a`` only ``doc-b`` may remain in the results.
    """
    h = {"x-user-id": "u1"}
    ts = "2024-06-01T12:00:00Z"

    r = migrated_client.post(
        "/vectors/upsert",
        json={
            "embeddings": [_vec(1.0), _vec(0.2)],
            "metas": [
                {
                    "document_id": "doc-a",
                    "chunk_id": 0,
                    "source_id": 10,
                    "modality": "text",
                    "ingested_at": ts,
                },
                {
                    "document_id": "doc-b",
                    "chunk_id": 1,
                    "source_id": 10,
                    "modality": "text",
                    "ingested_at": ts,
                },
            ],
        },
        headers=h,
    )
    assert r.status_code == 200

    q = migrated_client.post(
        "/vectors/search",
        json={"query_vector": _vec(1.0), "k": 2},
        headers=h,
    )
    assert q.status_code == 200
    data = q.json()["results"]
    assert len(data) == 2
    assert data[0]["document_id"] == "doc-a"
    assert data[0]["distance"] < data[1]["distance"]

    f = migrated_client.post(
        "/vectors/search",
        json={
            "query_vector": _vec(1.0),
            "k": 5,
            "filters": {"source_id": 10, "modality": "text"},
        },
        headers=h,
    )
    assert f.status_code == 200
    assert len(f.json()["results"]) == 2

    d = migrated_client.delete("/vectors/documents/doc-a", headers=h)
    assert d.status_code == 200
    assert d.json()["deleted"] >= 1

    q2 = migrated_client.post(
        "/vectors/search",
        json={"query_vector": _vec(1.0), "k": 5},
        headers=h,
    )
    # Check the status first so an error response fails here rather than
    # as a KeyError on "results".
    assert q2.status_code == 200
    ids = {row["document_id"] for row in q2.json()["results"]}
    assert "doc-a" not in ids
    assert "doc-b" in ids
|