business-stack 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/.python-version +1 -0
  2. package/backend/.env.example +65 -0
  3. package/backend/alembic/env.py +63 -0
  4. package/backend/alembic/script.py.mako +26 -0
  5. package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
  6. package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
  7. package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
  8. package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
  9. package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
  10. package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
  11. package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
  12. package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
  13. package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
  14. package/backend/alembic.ini +42 -0
  15. package/backend/app/__init__.py +0 -0
  16. package/backend/app/config.py +337 -0
  17. package/backend/app/connectors/__init__.py +13 -0
  18. package/backend/app/connectors/base.py +39 -0
  19. package/backend/app/connectors/builtins.py +51 -0
  20. package/backend/app/connectors/playwright_session.py +146 -0
  21. package/backend/app/connectors/registry.py +68 -0
  22. package/backend/app/connectors/thread_expansion/__init__.py +33 -0
  23. package/backend/app/connectors/thread_expansion/fakes.py +154 -0
  24. package/backend/app/connectors/thread_expansion/models.py +113 -0
  25. package/backend/app/connectors/thread_expansion/reddit.py +53 -0
  26. package/backend/app/connectors/thread_expansion/twitter.py +49 -0
  27. package/backend/app/db.py +5 -0
  28. package/backend/app/dependencies.py +34 -0
  29. package/backend/app/logging_config.py +35 -0
  30. package/backend/app/main.py +97 -0
  31. package/backend/app/middleware/__init__.py +0 -0
  32. package/backend/app/middleware/gateway_identity.py +17 -0
  33. package/backend/app/middleware/openapi_gateway.py +71 -0
  34. package/backend/app/middleware/request_id.py +23 -0
  35. package/backend/app/openapi_config.py +126 -0
  36. package/backend/app/routers/__init__.py +0 -0
  37. package/backend/app/routers/admin_pipeline.py +123 -0
  38. package/backend/app/routers/chat.py +206 -0
  39. package/backend/app/routers/chunks.py +36 -0
  40. package/backend/app/routers/entity_extract.py +31 -0
  41. package/backend/app/routers/example.py +8 -0
  42. package/backend/app/routers/gemini_embed.py +58 -0
  43. package/backend/app/routers/health.py +28 -0
  44. package/backend/app/routers/ingestion.py +146 -0
  45. package/backend/app/routers/link_expansion.py +34 -0
  46. package/backend/app/routers/pipeline_status.py +304 -0
  47. package/backend/app/routers/query.py +63 -0
  48. package/backend/app/routers/vectors.py +63 -0
  49. package/backend/app/schemas/__init__.py +0 -0
  50. package/backend/app/schemas/canonical.py +44 -0
  51. package/backend/app/schemas/chat.py +50 -0
  52. package/backend/app/schemas/ingest.py +29 -0
  53. package/backend/app/schemas/query.py +153 -0
  54. package/backend/app/schemas/vectors.py +56 -0
  55. package/backend/app/services/__init__.py +0 -0
  56. package/backend/app/services/chat_store.py +152 -0
  57. package/backend/app/services/chunking/__init__.py +3 -0
  58. package/backend/app/services/chunking/llm_boundaries.py +63 -0
  59. package/backend/app/services/chunking/schemas.py +30 -0
  60. package/backend/app/services/chunking/semantic_chunk.py +178 -0
  61. package/backend/app/services/chunking/splitters.py +214 -0
  62. package/backend/app/services/embeddings/__init__.py +20 -0
  63. package/backend/app/services/embeddings/build_inputs.py +140 -0
  64. package/backend/app/services/embeddings/dlq.py +128 -0
  65. package/backend/app/services/embeddings/gemini_api.py +207 -0
  66. package/backend/app/services/embeddings/persist.py +74 -0
  67. package/backend/app/services/embeddings/types.py +32 -0
  68. package/backend/app/services/embeddings/worker.py +224 -0
  69. package/backend/app/services/entities/__init__.py +12 -0
  70. package/backend/app/services/entities/gliner_extract.py +63 -0
  71. package/backend/app/services/entities/llm_extract.py +94 -0
  72. package/backend/app/services/entities/pipeline.py +179 -0
  73. package/backend/app/services/entities/spacy_extract.py +63 -0
  74. package/backend/app/services/entities/types.py +15 -0
  75. package/backend/app/services/gemini_chat.py +113 -0
  76. package/backend/app/services/hooks/__init__.py +3 -0
  77. package/backend/app/services/hooks/post_ingest.py +186 -0
  78. package/backend/app/services/ingestion/__init__.py +0 -0
  79. package/backend/app/services/ingestion/persist.py +188 -0
  80. package/backend/app/services/integrations_remote.py +91 -0
  81. package/backend/app/services/link_expansion/__init__.py +3 -0
  82. package/backend/app/services/link_expansion/canonical_url.py +45 -0
  83. package/backend/app/services/link_expansion/domain_policy.py +26 -0
  84. package/backend/app/services/link_expansion/html_extract.py +72 -0
  85. package/backend/app/services/link_expansion/rate_limit.py +32 -0
  86. package/backend/app/services/link_expansion/robots.py +46 -0
  87. package/backend/app/services/link_expansion/schemas.py +67 -0
  88. package/backend/app/services/link_expansion/worker.py +458 -0
  89. package/backend/app/services/normalization/__init__.py +7 -0
  90. package/backend/app/services/normalization/normalizer.py +331 -0
  91. package/backend/app/services/normalization/persist_normalized.py +67 -0
  92. package/backend/app/services/playwright_extract/__init__.py +13 -0
  93. package/backend/app/services/playwright_extract/__main__.py +96 -0
  94. package/backend/app/services/playwright_extract/extract.py +181 -0
  95. package/backend/app/services/retrieval_service.py +351 -0
  96. package/backend/app/sqlite_ext.py +36 -0
  97. package/backend/app/storage/__init__.py +3 -0
  98. package/backend/app/storage/blobs.py +30 -0
  99. package/backend/app/vectorstore/__init__.py +13 -0
  100. package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
  101. package/backend/backend.egg-info/PKG-INFO +18 -0
  102. package/backend/backend.egg-info/SOURCES.txt +93 -0
  103. package/backend/backend.egg-info/dependency_links.txt +1 -0
  104. package/backend/backend.egg-info/entry_points.txt +2 -0
  105. package/backend/backend.egg-info/requires.txt +15 -0
  106. package/backend/backend.egg-info/top_level.txt +4 -0
  107. package/backend/package.json +15 -0
  108. package/backend/pyproject.toml +52 -0
  109. package/backend/tests/conftest.py +40 -0
  110. package/backend/tests/test_chat.py +92 -0
  111. package/backend/tests/test_chunking.py +132 -0
  112. package/backend/tests/test_entities.py +170 -0
  113. package/backend/tests/test_gemini_embed.py +224 -0
  114. package/backend/tests/test_health.py +24 -0
  115. package/backend/tests/test_ingest_raw.py +123 -0
  116. package/backend/tests/test_link_expansion.py +241 -0
  117. package/backend/tests/test_main.py +12 -0
  118. package/backend/tests/test_normalizer.py +114 -0
  119. package/backend/tests/test_openapi_gateway.py +40 -0
  120. package/backend/tests/test_pipeline_hardening.py +285 -0
  121. package/backend/tests/test_pipeline_status.py +71 -0
  122. package/backend/tests/test_playwright_extract.py +80 -0
  123. package/backend/tests/test_post_ingest_hooks.py +162 -0
  124. package/backend/tests/test_query.py +165 -0
  125. package/backend/tests/test_thread_expansion.py +72 -0
  126. package/backend/tests/test_vectors.py +85 -0
  127. package/backend/uv.lock +1839 -0
  128. package/bin/business-stack.cjs +412 -0
  129. package/frontend/web/.env.example +23 -0
  130. package/frontend/web/AGENTS.md +5 -0
  131. package/frontend/web/CLAUDE.md +1 -0
  132. package/frontend/web/README.md +36 -0
  133. package/frontend/web/components.json +25 -0
  134. package/frontend/web/next-env.d.ts +6 -0
  135. package/frontend/web/next.config.ts +30 -0
  136. package/frontend/web/package.json +65 -0
  137. package/frontend/web/postcss.config.mjs +7 -0
  138. package/frontend/web/skills-lock.json +35 -0
  139. package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
  140. package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
  141. package/frontend/web/src/app/chat/page.tsx +725 -0
  142. package/frontend/web/src/app/favicon.ico +0 -0
  143. package/frontend/web/src/app/globals.css +563 -0
  144. package/frontend/web/src/app/layout.tsx +50 -0
  145. package/frontend/web/src/app/page.tsx +96 -0
  146. package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
  147. package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
  148. package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
  149. package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
  150. package/frontend/web/src/components/home-auth-panel.tsx +49 -0
  151. package/frontend/web/src/components/providers.tsx +50 -0
  152. package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
  153. package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
  154. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
  155. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
  156. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
  157. package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
  158. package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
  159. package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
  160. package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
  161. package/frontend/web/src/lib/auth-client.ts +23 -0
  162. package/frontend/web/src/lib/integrations-config.ts +125 -0
  163. package/frontend/web/src/lib/ui-utills.tsx +90 -0
  164. package/frontend/web/src/lib/utils.ts +6 -0
  165. package/frontend/web/tsconfig.json +36 -0
  166. package/frontend/web/tsconfig.tsbuildinfo +1 -0
  167. package/frontend/web/vitest.config.ts +14 -0
  168. package/gateway/.env.example +23 -0
  169. package/gateway/README.md +13 -0
  170. package/gateway/package.json +24 -0
  171. package/gateway/src/auth.ts +49 -0
  172. package/gateway/src/index.ts +141 -0
  173. package/gateway/src/integrations/admin.ts +19 -0
  174. package/gateway/src/integrations/crypto.ts +52 -0
  175. package/gateway/src/integrations/handlers.ts +124 -0
  176. package/gateway/src/integrations/keys.ts +12 -0
  177. package/gateway/src/integrations/store.ts +106 -0
  178. package/gateway/src/stack-secrets.ts +35 -0
  179. package/gateway/tsconfig.json +13 -0
  180. package/package.json +33 -0
  181. package/turbo.json +27 -0
@@ -0,0 +1,93 @@
1
+ pyproject.toml
2
+ app/__init__.py
3
+ app/config.py
4
+ app/db.py
5
+ app/dependencies.py
6
+ app/logging_config.py
7
+ app/main.py
8
+ app/sqlite_ext.py
9
+ app/connectors/__init__.py
10
+ app/connectors/base.py
11
+ app/connectors/builtins.py
12
+ app/connectors/playwright_session.py
13
+ app/connectors/registry.py
14
+ app/connectors/thread_expansion/__init__.py
15
+ app/connectors/thread_expansion/fakes.py
16
+ app/connectors/thread_expansion/models.py
17
+ app/connectors/thread_expansion/reddit.py
18
+ app/connectors/thread_expansion/twitter.py
19
+ app/middleware/__init__.py
20
+ app/middleware/gateway_identity.py
21
+ app/middleware/request_id.py
22
+ app/routers/__init__.py
23
+ app/routers/chunks.py
24
+ app/routers/entity_extract.py
25
+ app/routers/example.py
26
+ app/routers/gemini_embed.py
27
+ app/routers/health.py
28
+ app/routers/ingestion.py
29
+ app/routers/link_expansion.py
30
+ app/routers/query.py
31
+ app/routers/vectors.py
32
+ app/schemas/__init__.py
33
+ app/schemas/canonical.py
34
+ app/schemas/ingest.py
35
+ app/schemas/query.py
36
+ app/schemas/vectors.py
37
+ app/services/__init__.py
38
+ app/services/retrieval_service.py
39
+ app/services/chunking/__init__.py
40
+ app/services/chunking/llm_boundaries.py
41
+ app/services/chunking/schemas.py
42
+ app/services/chunking/semantic_chunk.py
43
+ app/services/chunking/splitters.py
44
+ app/services/embeddings/__init__.py
45
+ app/services/embeddings/build_inputs.py
46
+ app/services/embeddings/gemini_api.py
47
+ app/services/embeddings/persist.py
48
+ app/services/embeddings/types.py
49
+ app/services/embeddings/worker.py
50
+ app/services/entities/__init__.py
51
+ app/services/entities/gliner_extract.py
52
+ app/services/entities/llm_extract.py
53
+ app/services/entities/pipeline.py
54
+ app/services/entities/spacy_extract.py
55
+ app/services/entities/types.py
56
+ app/services/ingestion/__init__.py
57
+ app/services/ingestion/persist.py
58
+ app/services/link_expansion/__init__.py
59
+ app/services/link_expansion/canonical_url.py
60
+ app/services/link_expansion/domain_policy.py
61
+ app/services/link_expansion/html_extract.py
62
+ app/services/link_expansion/rate_limit.py
63
+ app/services/link_expansion/robots.py
64
+ app/services/link_expansion/schemas.py
65
+ app/services/link_expansion/worker.py
66
+ app/services/normalization/__init__.py
67
+ app/services/normalization/normalizer.py
68
+ app/services/normalization/persist_normalized.py
69
+ app/services/playwright_extract/__init__.py
70
+ app/services/playwright_extract/__main__.py
71
+ app/services/playwright_extract/extract.py
72
+ app/storage/__init__.py
73
+ app/storage/blobs.py
74
+ app/vectorstore/__init__.py
75
+ app/vectorstore/sqlite_vec_store.py
76
+ backend.egg-info/PKG-INFO
77
+ backend.egg-info/SOURCES.txt
78
+ backend.egg-info/dependency_links.txt
79
+ backend.egg-info/entry_points.txt
80
+ backend.egg-info/requires.txt
81
+ backend.egg-info/top_level.txt
82
+ tests/test_chunking.py
83
+ tests/test_entities.py
84
+ tests/test_gemini_embed.py
85
+ tests/test_health.py
86
+ tests/test_ingest_raw.py
87
+ tests/test_link_expansion.py
88
+ tests/test_main.py
89
+ tests/test_normalizer.py
90
+ tests/test_playwright_extract.py
91
+ tests/test_query.py
92
+ tests/test_thread_expansion.py
93
+ tests/test_vectors.py
@@ -0,0 +1,2 @@
1
+ [kb.connectors]
2
+ echo = app.connectors.builtins:EchoConnector
@@ -0,0 +1,15 @@
1
+ fastapi[standard]==0.113.0
2
+ pydantic==2.8.0
3
+ pydantic-settings>=2.4.0
4
+ sqlalchemy[asyncio]>=2.0.36
5
+ aiosqlite>=0.20.0
6
+ alembic>=1.14.0
7
+ python-json-logger>=2.0.7
8
+ sqlite-vec>=0.1.9
9
+ httpx>=0.27.0
10
+
11
+ [entities]
12
+ spacy>=3.7.0
13
+
14
+ [playwright]
15
+ playwright>=1.49.0
@@ -0,0 +1,4 @@
1
+ app
2
+ data
3
+ dist
4
+ tmp_migrate_kb
@@ -0,0 +1,15 @@
1
+ {
2
+ "name": "@business-stack/backend",
3
+ "private": true,
4
+ "scripts": {
5
+ "dev": "uv run uvicorn app.main:app --reload --host 127.0.0.1 --port 8000",
6
+ "start": "uv run uvicorn app.main:app --host 127.0.0.1 --port 8000",
7
+ "build": "bun -e \"require('fs').mkdirSync('dist',{recursive:true}); require('fs').writeFileSync('dist/.buildstamp','')\"",
8
+ "lint": "uv run ruff check .",
9
+ "lint:fix": "uv run ruff check --fix . && uv run ruff format .",
10
+ "typecheck": "bun -e \"process.exit(0)\"",
11
+ "test": "uv run pytest",
12
+ "db:migrate": "uv run alembic upgrade head",
13
+ "clean": "bun x rimraf@6 .pytest_cache"
14
+ }
15
+ }
@@ -0,0 +1,52 @@
1
+ [project]
2
+ name = "backend"
3
+ version = "0.1.0"
4
+ description = "FastAPI service (internal API behind Hono gateway)"
5
+ requires-python = ">=3.12"
6
+ dependencies = [
7
+ "fastapi[standard]==0.113.0",
8
+ "pydantic==2.8.0",
9
+ "pydantic-settings>=2.4.0",
10
+ "sqlalchemy[asyncio]>=2.0.36",
11
+ "aiosqlite>=0.20.0",
12
+ "alembic>=1.14.0",
13
+ "python-json-logger>=2.0.7",
14
+ "sqlite-vec>=0.1.9",
15
+ "httpx>=0.27.0",
16
+ ]
17
+
18
+ [project.optional-dependencies]
19
+ entities = [
20
+ "spacy>=3.7.0",
21
+ ]
22
+ playwright = [
23
+ "playwright>=1.49.0",
24
+ ]
25
+
26
+ [dependency-groups]
27
+ dev = [
28
+ "httpx>=0.27.0",
29
+ "pytest>=8.3.0",
30
+ "ruff>=0.8.0",
31
+ ]
32
+
33
+ [project.entry-points."kb.connectors"]
34
+ echo = "app.connectors.builtins:EchoConnector"
35
+
36
+ [build-system]
37
+ requires = ["setuptools>=61"]
38
+ build-backend = "setuptools.build_meta"
39
+
40
+ [tool.setuptools.packages.find]
41
+ where = ["."]
42
+ exclude = ["tests*", "alembic*"]
43
+
44
+ [tool.ruff]
45
+ target-version = "py312"
46
+ line-length = 88
47
+
48
+ [tool.ruff.lint]
49
+ select = ["E", "F", "I", "UP"]
50
+
51
+ [tool.pytest.ini_options]
52
+ testpaths = ["tests"]
@@ -0,0 +1,40 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import pytest
5
+ from alembic.config import Config
6
+ from fastapi.testclient import TestClient
7
+
8
+ from alembic import command
9
+ from app.config import clear_settings_cache
10
+ from app.main import create_app
11
+
12
+ BACKEND_ROOT = Path(__file__).resolve().parents[1]
13
+
14
+
15
def _alembic_upgrade_head() -> None:
    """Run ``alembic upgrade head`` using the backend's ``alembic.ini``.

    The upgrade is executed with ``BACKEND_ROOT`` as the working directory
    (presumably so relative paths in ``alembic.ini`` resolve; confirm against
    that file). The previous working directory is restored afterwards, even
    when the upgrade raises.
    """
    # contextlib.chdir (Python 3.11+) replaces the manual getcwd/chdir/finally
    # dance; the project requires Python >= 3.12.
    from contextlib import chdir

    with chdir(BACKEND_ROOT):
        cfg = Config(str(BACKEND_ROOT / "alembic.ini"))
        command.upgrade(cfg, "head")
23
+
24
+
25
@pytest.fixture
def client(tmp_path, monkeypatch) -> TestClient:
    """Yield a ``TestClient`` for a fresh app with ``DATA_DIR`` set to ``tmp_path``.

    This fixture itself does not run any Alembic migrations.
    """
    data_dir = str(tmp_path)
    monkeypatch.setenv("DATA_DIR", data_dir)
    # Reset cached settings before the app is built so the patched
    # environment is the one in effect (presumably what the cache holds).
    clear_settings_cache()
    app = create_app()
    with TestClient(app) as http_client:
        yield http_client
31
+
32
+
33
@pytest.fixture
def migrated_client(tmp_path, monkeypatch) -> TestClient:
    """Yield a ``TestClient`` after migrating the database to Alembic ``head``.

    ``DATA_DIR`` points at ``tmp_path``; the settings cache is cleared both
    before and after the migration run.
    """
    data_dir = str(tmp_path)
    monkeypatch.setenv("DATA_DIR", data_dir)
    clear_settings_cache()
    _alembic_upgrade_head()
    # Cleared a second time so the app does not reuse anything cached
    # while the migration was running.
    clear_settings_cache()
    app = create_app()
    with TestClient(app) as http_client:
        yield http_client
@@ -0,0 +1,92 @@
1
+ from unittest.mock import patch
2
+
3
+ from fastapi.testclient import TestClient
4
+
5
+ from app.config import clear_settings_cache
6
+
7
+
8
def test_chat_requires_gateway_user(migrated_client: TestClient) -> None:
    """A session listing sent without any user header is answered with 401."""
    unauthenticated = migrated_client.get("/chat/sessions")
    assert unauthenticated.status_code == 401
11
+
12
+
13
def test_chat_session_crud(migrated_client: TestClient) -> None:
    """Create, list, read and delete a chat session for a single user."""
    auth = {"x-user-id": "user-chat-crud"}

    created = migrated_client.post("/chat/sessions", json={}, headers=auth)
    assert created.status_code == 200
    session_id = created.json()["id"]
    session_url = f"/chat/sessions/{session_id}"
    messages_url = f"/chat/sessions/{session_id}/messages"

    listing = migrated_client.get("/chat/sessions", headers=auth)
    assert listing.status_code == 200
    assert session_id in {row["id"] for row in listing.json()}

    # A freshly created session has an empty message history.
    history = migrated_client.get(messages_url, headers=auth)
    assert history.status_code == 200
    assert history.json() == []

    removal = migrated_client.delete(session_url, headers=auth)
    assert removal.status_code == 200

    # Once deleted, the session's messages endpoint reports 404.
    after_delete = migrated_client.get(messages_url, headers=auth)
    assert after_delete.status_code == 404
42
+
43
+
44
def test_chat_complete_persists_messages(
    migrated_client: TestClient,
    monkeypatch,
) -> None:
    """Completing a chat turn stores the user message and the assistant reply.

    Retrieval and the Gemini reply are both patched in ``app.routers.chat``,
    so only the stubbed values defined below are used.
    """
    monkeypatch.setenv("GEMINI_API_KEY", "test-key-for-chat-test")
    clear_settings_cache()
    auth = {"x-user-id": "user-chat-complete"}
    created = migrated_client.post("/chat/sessions", json={}, headers=auth)
    sid = created.json()["id"]

    async def fake_retrieval(*_args, **_kwargs):
        # A fresh dict per call, mirroring what the endpoint would receive.
        stub_context = {
            "combined_text": "stub context",
            "sections": [],
            "media": [],
        }
        return {
            "candidates": [],
            "context": stub_context,
            "embedding_model": "stub-model",
            "vector_candidates_considered": 0,
        }

    async def fake_gemini(*_args, **_kwargs):
        return ("stub assistant reply", "model")

    with patch("app.routers.chat.run_retrieval", side_effect=fake_retrieval):
        with patch("app.routers.chat.gemini_chat_reply", side_effect=fake_gemini):
            complete = migrated_client.post(
                f"/chat/sessions/{sid}/complete",
                json={"message": "hello from test"},
                headers=auth,
            )
    assert complete.status_code == 200
    body = complete.json()
    assert body["reply"] == "stub assistant reply"
    assert body["reply_source"] == "model"

    stored = migrated_client.get(
        f"/chat/sessions/{sid}/messages",
        headers=auth,
    )
    rows = stored.json()
    assert len(rows) == 2
    user_row, assistant_row = rows[0], rows[1]
    assert user_row["role"] == "user"
    assert user_row["content"] == "hello from test"
    assert assistant_row["role"] == "assistant"
    assert assistant_row["content"] == "stub assistant reply"
@@ -0,0 +1,132 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import sqlite3
5
+ from datetime import UTC, datetime
6
+
7
+ from fastapi.testclient import TestClient
8
+
9
+ from app.config import get_settings
10
+ from app.services.chunking.splitters import (
11
+ extract_thread_message_items,
12
+ structure_chunks_for_text_block,
13
+ thread_chunks_from_envelope,
14
+ )
15
+
16
+
17
def test_markdown_headings_produce_multiple_chunks() -> None:
    """Markdown headings split one text block into several chunks."""
    block_ordinal = 2
    markdown = "# Intro\n\nFirst para.\n\n## Details\n\nSecond para."
    sections = structure_chunks_for_text_block(
        markdown,
        block_ordinal,
        use_llm=False,
        llm_sections=None,
    )
    assert len(sections) >= 2
    # Every chunk must point back at the block it was cut from.
    for section in sections:
        assert section.start_block_ordinal == block_ordinal
    first_meta = sections[0].meta
    assert first_meta.get("strategy") == "markdown_section"
28
+
29
+
30
def test_paragraph_split_default() -> None:
    """Heading-free text with blank-line gaps yields one chunk per paragraph."""
    paragraphs = structure_chunks_for_text_block(
        "One block.\n\nTwo block.\n\nThree.",
        0,
        use_llm=False,
        llm_sections=None,
    )
    assert len(paragraphs) == 3
    second = paragraphs[1]
    assert second.meta.get("strategy") == "paragraph"
40
+
41
+
42
def test_weak_structure_single_chunk_without_llm() -> None:
    """An unstructured 500-character block stays one chunk when LLM use is off."""
    unstructured = "x" * 500
    result = structure_chunks_for_text_block(
        unstructured,
        1,
        use_llm=False,
        llm_sections=None,
    )
    assert len(result) == 1
    only_chunk = result[0]
    assert only_chunk.meta.get("strategy") == "fallback_single"
52
+
53
+
54
def test_llm_sections_used_when_provided() -> None:
    """Supplied LLM sections become the chunks when ``use_llm`` is true."""
    supplied_sections = ["part a", "part b"]
    result = structure_chunks_for_text_block(
        "x" * 900,
        0,
        use_llm=True,
        llm_sections=supplied_sections,
    )
    assert len(result) == 2
    first_meta = result[0].meta
    assert first_meta.get("strategy") == "llm_section"
63
+
64
+
65
def test_thread_envelope_metadata() -> None:
    """A list payload yields one draft per item, tagged with thread metadata."""
    messages = [
        {"text": "hello"},
        {"text": "world"},
    ]
    envelope = {"payload": messages, "metadata": {"thread_id": "thr-9"}}
    drafts = thread_chunks_from_envelope(envelope)
    assert len(drafts) == 2
    first_meta = drafts[0].meta
    second_meta = drafts[1].meta
    assert first_meta.get("thread_id") == "thr-9"
    assert second_meta.get("message_index") == 1
77
+
78
+
79
def test_messages_key_in_payload_dict() -> None:
    """Messages nested under ``payload["messages"]`` are found by the extractor."""
    payload = {
        "messages": [{"body": "a"}, {"body": "b"}],
        "thread_id": "abc",
    }
    envelope = {"payload": payload, "metadata": {}}
    extracted = extract_thread_message_items(envelope)
    assert extracted is not None
    assert len(extracted) == 2
90
+
91
+
92
def test_rebuild_chunks_endpoint(migrated_client: TestClient) -> None:
    """Chunk rebuild for an ingested document writes ``document_chunks`` rows.

    Ingests a small markdown document through the ``echo`` connector, calls
    the chunk endpoint with LLM-assisted splitting disabled, then reads the
    SQLite file directly to check the stored strategy metadata.
    """
    from contextlib import closing

    h = {"x-user-id": "u1"}
    r = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=h,
        json={
            "source": "s",
            "timestamp": datetime.now(UTC).isoformat(),
            "content_type": "text",
            "payload": "# Title\n\nAlpha.\n\nBeta.",
            "metadata": {},
        },
    )
    assert r.status_code == 200
    doc_id = r.json()["document_id"]

    ch = migrated_client.post(
        f"/ingest/documents/{doc_id}/chunks",
        headers=h,
        json={"use_llm_weak_structure": False},
    )
    assert ch.status_code == 200
    data = ch.json()
    assert data["chunks_written"] >= 1

    settings = get_settings()
    db_path = (settings.data_dir / settings.sqlite_filename).resolve()
    # closing() releases the connection even when the query raises; a bare
    # connect()/close() pair would leak it on failure.
    with closing(sqlite3.connect(str(db_path))) as conn:
        rows = conn.execute(
            "SELECT ordinal, start_block_ordinal, end_block_ordinal, meta "
            "FROM document_chunks WHERE document_id = ? ORDER BY ordinal",
            (doc_id,),
        ).fetchall()
    assert len(rows) >= 1
    # meta is stored as JSON text and may be NULL.
    meta0 = json.loads(rows[0][3]) if rows[0][3] else {}
    assert meta0.get("strategy") in (
        "markdown_section",
        "paragraph",
        "fallback_single",
    )
@@ -0,0 +1,170 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import os
5
+ import sqlite3
6
+ from datetime import UTC, datetime
7
+ from pathlib import Path
8
+ from unittest.mock import AsyncMock, patch
9
+
10
+ from alembic.config import Config
11
+ from sqlalchemy import text
12
+ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
13
+
14
+ from alembic import command
15
+ from app.config import Settings, clear_settings_cache, get_settings
16
+ from app.services.entities.pipeline import extract_and_store_entities_for_document
17
+ from app.services.entities.types import ExtractedMention
18
+
19
+
20
def test_merge_mentions_keeps_highest_confidence() -> None:
    """Merging keeps the top confidence per (name, type), ignoring name case."""
    from app.services.entities.pipeline import _merge_mentions

    raw_mentions = [
        ExtractedMention("Acme", "company", 0.5),
        ExtractedMention("acme", "company", 0.9),
        ExtractedMention("Acme", "person", 0.8),
    ]
    merged = _merge_mentions(raw_mentions)

    # Index the merged mentions by lower-cased name and type for lookup.
    index = {}
    for mention in merged:
        index[(mention.name.lower(), mention.type)] = mention

    assert index[("acme", "company")].confidence == 0.9
    # The same name under a different type is kept as its own entry.
    assert ("acme", "person") in index
33
+
34
+
35
def test_entity_extract_endpoint_accepts(migrated_client, monkeypatch) -> None:
    """The entities endpoint returns 200 and invokes the extraction job once.

    ``run_entity_extraction_job`` is replaced with an ``AsyncMock`` in
    ``app.routers.entity_extract``, so no real extraction runs.
    """
    monkeypatch.setenv("GEMINI_API_KEY", "x")
    clear_settings_cache()
    h = {"x-user-id": "u1"}
    ing = migrated_client.post(
        "/ingest/raw?connector=echo",
        headers=h,
        json={
            "source": "s",
            "timestamp": datetime.now(UTC).isoformat(),
            "content_type": "text",
            "payload": "Alice at Acme in Boston",
            "metadata": {},
        },
    )
    # Check the ingest succeeded before reading its body, so a failed setup
    # step shows up as a status mismatch instead of a KeyError below.
    assert ing.status_code == 200
    doc_id = ing.json()["document_id"]
    migrated_client.post(
        f"/ingest/documents/{doc_id}/chunks",
        headers=h,
        json={},
    )
    mock_job = AsyncMock()
    with patch(
        "app.routers.entity_extract.run_entity_extraction_job",
        mock_job,
    ):
        r = migrated_client.post(
            f"/ingest/documents/{doc_id}/entities",
            headers=h,
        )
        assert r.status_code == 200
        mock_job.assert_called_once()
67
+
68
+
69
def test_extract_persists_mentions_and_cooccurrence(tmp_path, monkeypatch) -> None:
    """Entity extraction persists mentions and co-occurrence rows for a document.

    The test migrates a SQLite database under ``tmp_path``, seeds one source,
    one document and two chunks, and patches the per-chunk extractor with a
    deterministic fake. It then checks both the counts returned by the
    pipeline and the row counts stored in the database.
    """
    from contextlib import chdir, closing

    monkeypatch.setenv("DATA_DIR", str(tmp_path))
    clear_settings_cache()

    backend = Path(__file__).resolve().parents[1]
    # Alembic is run from the backend root; chdir() restores the previous
    # working directory on exit, including when the upgrade raises.
    with chdir(backend):
        command.upgrade(Config(str(backend / "alembic.ini")), "head")
    clear_settings_cache()

    settings = Settings()
    engine = create_async_engine(settings.sqlalchemy_database_url)
    factory = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
    doc_id = "doc-ent-1"

    async def _seed() -> None:
        """Insert one source, one document and two chunks ('one', 'two')."""
        async with factory() as session:
            await session.execute(
                text("INSERT INTO sources (name, connector_type) VALUES ('s','s')"),
            )
            sid = int(
                (
                    await session.execute(text("SELECT id FROM sources LIMIT 1"))
                ).scalar_one(),
            )
            await session.execute(
                text(
                    "INSERT INTO documents "
                    "(id, source_id, timestamp, content_type, raw_content, "
                    "summary, status) VALUES "
                    "(:id, :sid, :ts, 'text', '{}', 'x', 'ok')",
                ),
                {"id": doc_id, "sid": sid, "ts": datetime.now(UTC).isoformat()},
            )
            await session.execute(
                text(
                    "INSERT INTO document_chunks "
                    "(document_id, ordinal, text, start_block_ordinal, "
                    "end_block_ordinal, meta) VALUES "
                    "(:d, 0, 'one', 0, 0, NULL)",
                ),
                {"d": doc_id},
            )
            await session.execute(
                text(
                    "INSERT INTO document_chunks "
                    "(document_id, ordinal, text, start_block_ordinal, "
                    "end_block_ordinal, meta) VALUES "
                    "(:d, 1, 'two', 0, 0, NULL)",
                ),
                {"d": doc_id},
            )
            await session.commit()

    # NOTE: the ``text`` parameter shadows sqlalchemy's ``text`` inside this
    # helper only; the names are kept because the patched call site may pass
    # arguments by keyword.
    async def _fake_extract(text: str, _settings: Settings):
        """Return fixed mentions chosen by which seeded chunk is passed in."""
        if "one" in text:
            return [
                ExtractedMention("Person A", "person", 0.9),
                ExtractedMention("Acme Corp", "company", 0.85),
            ]
        return [
            ExtractedMention("Person A", "person", 0.88),
            ExtractedMention("Boston", "location", 0.8),
        ]

    async def _run() -> None:
        """Run the pipeline with the fake extractor and check its counts."""
        with patch(
            "app.services.entities.pipeline.extract_mentions_for_chunk_text",
            new=_fake_extract,
        ):
            async with factory() as session:
                nm, nc = await extract_and_store_entities_for_document(
                    session,
                    document_id=doc_id,
                    settings=get_settings(),
                )
                await session.commit()
        assert nm >= 4
        assert nc == 3

    try:
        asyncio.run(_seed())
        asyncio.run(_run())

        current = get_settings()
        db_path = (current.data_dir / current.sqlite_filename).resolve()
        # closing() releases the connection even if a query raises.
        with closing(sqlite3.connect(str(db_path))) as conn:
            mentions = conn.execute(
                "SELECT COUNT(*) FROM entity_mentions WHERE document_id = ?",
                (doc_id,),
            ).fetchone()[0]
            co = conn.execute(
                "SELECT COUNT(*) FROM entity_cooccurrence WHERE document_id = ?",
                (doc_id,),
            ).fetchone()[0]
        assert mentions == 4
        assert co == 3
    finally:
        # Dispose the engine on failure too, so a failing assertion does not
        # leave pooled connections open.
        asyncio.run(engine.dispose())