business-stack 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.python-version +1 -0
- package/backend/.env.example +65 -0
- package/backend/alembic/env.py +63 -0
- package/backend/alembic/script.py.mako +26 -0
- package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
- package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
- package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
- package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
- package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
- package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
- package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
- package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
- package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
- package/backend/alembic.ini +42 -0
- package/backend/app/__init__.py +0 -0
- package/backend/app/config.py +337 -0
- package/backend/app/connectors/__init__.py +13 -0
- package/backend/app/connectors/base.py +39 -0
- package/backend/app/connectors/builtins.py +51 -0
- package/backend/app/connectors/playwright_session.py +146 -0
- package/backend/app/connectors/registry.py +68 -0
- package/backend/app/connectors/thread_expansion/__init__.py +33 -0
- package/backend/app/connectors/thread_expansion/fakes.py +154 -0
- package/backend/app/connectors/thread_expansion/models.py +113 -0
- package/backend/app/connectors/thread_expansion/reddit.py +53 -0
- package/backend/app/connectors/thread_expansion/twitter.py +49 -0
- package/backend/app/db.py +5 -0
- package/backend/app/dependencies.py +34 -0
- package/backend/app/logging_config.py +35 -0
- package/backend/app/main.py +97 -0
- package/backend/app/middleware/__init__.py +0 -0
- package/backend/app/middleware/gateway_identity.py +17 -0
- package/backend/app/middleware/openapi_gateway.py +71 -0
- package/backend/app/middleware/request_id.py +23 -0
- package/backend/app/openapi_config.py +126 -0
- package/backend/app/routers/__init__.py +0 -0
- package/backend/app/routers/admin_pipeline.py +123 -0
- package/backend/app/routers/chat.py +206 -0
- package/backend/app/routers/chunks.py +36 -0
- package/backend/app/routers/entity_extract.py +31 -0
- package/backend/app/routers/example.py +8 -0
- package/backend/app/routers/gemini_embed.py +58 -0
- package/backend/app/routers/health.py +28 -0
- package/backend/app/routers/ingestion.py +146 -0
- package/backend/app/routers/link_expansion.py +34 -0
- package/backend/app/routers/pipeline_status.py +304 -0
- package/backend/app/routers/query.py +63 -0
- package/backend/app/routers/vectors.py +63 -0
- package/backend/app/schemas/__init__.py +0 -0
- package/backend/app/schemas/canonical.py +44 -0
- package/backend/app/schemas/chat.py +50 -0
- package/backend/app/schemas/ingest.py +29 -0
- package/backend/app/schemas/query.py +153 -0
- package/backend/app/schemas/vectors.py +56 -0
- package/backend/app/services/__init__.py +0 -0
- package/backend/app/services/chat_store.py +152 -0
- package/backend/app/services/chunking/__init__.py +3 -0
- package/backend/app/services/chunking/llm_boundaries.py +63 -0
- package/backend/app/services/chunking/schemas.py +30 -0
- package/backend/app/services/chunking/semantic_chunk.py +178 -0
- package/backend/app/services/chunking/splitters.py +214 -0
- package/backend/app/services/embeddings/__init__.py +20 -0
- package/backend/app/services/embeddings/build_inputs.py +140 -0
- package/backend/app/services/embeddings/dlq.py +128 -0
- package/backend/app/services/embeddings/gemini_api.py +207 -0
- package/backend/app/services/embeddings/persist.py +74 -0
- package/backend/app/services/embeddings/types.py +32 -0
- package/backend/app/services/embeddings/worker.py +224 -0
- package/backend/app/services/entities/__init__.py +12 -0
- package/backend/app/services/entities/gliner_extract.py +63 -0
- package/backend/app/services/entities/llm_extract.py +94 -0
- package/backend/app/services/entities/pipeline.py +179 -0
- package/backend/app/services/entities/spacy_extract.py +63 -0
- package/backend/app/services/entities/types.py +15 -0
- package/backend/app/services/gemini_chat.py +113 -0
- package/backend/app/services/hooks/__init__.py +3 -0
- package/backend/app/services/hooks/post_ingest.py +186 -0
- package/backend/app/services/ingestion/__init__.py +0 -0
- package/backend/app/services/ingestion/persist.py +188 -0
- package/backend/app/services/integrations_remote.py +91 -0
- package/backend/app/services/link_expansion/__init__.py +3 -0
- package/backend/app/services/link_expansion/canonical_url.py +45 -0
- package/backend/app/services/link_expansion/domain_policy.py +26 -0
- package/backend/app/services/link_expansion/html_extract.py +72 -0
- package/backend/app/services/link_expansion/rate_limit.py +32 -0
- package/backend/app/services/link_expansion/robots.py +46 -0
- package/backend/app/services/link_expansion/schemas.py +67 -0
- package/backend/app/services/link_expansion/worker.py +458 -0
- package/backend/app/services/normalization/__init__.py +7 -0
- package/backend/app/services/normalization/normalizer.py +331 -0
- package/backend/app/services/normalization/persist_normalized.py +67 -0
- package/backend/app/services/playwright_extract/__init__.py +13 -0
- package/backend/app/services/playwright_extract/__main__.py +96 -0
- package/backend/app/services/playwright_extract/extract.py +181 -0
- package/backend/app/services/retrieval_service.py +351 -0
- package/backend/app/sqlite_ext.py +36 -0
- package/backend/app/storage/__init__.py +3 -0
- package/backend/app/storage/blobs.py +30 -0
- package/backend/app/vectorstore/__init__.py +13 -0
- package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
- package/backend/backend.egg-info/PKG-INFO +18 -0
- package/backend/backend.egg-info/SOURCES.txt +93 -0
- package/backend/backend.egg-info/dependency_links.txt +1 -0
- package/backend/backend.egg-info/entry_points.txt +2 -0
- package/backend/backend.egg-info/requires.txt +15 -0
- package/backend/backend.egg-info/top_level.txt +4 -0
- package/backend/package.json +15 -0
- package/backend/pyproject.toml +52 -0
- package/backend/tests/conftest.py +40 -0
- package/backend/tests/test_chat.py +92 -0
- package/backend/tests/test_chunking.py +132 -0
- package/backend/tests/test_entities.py +170 -0
- package/backend/tests/test_gemini_embed.py +224 -0
- package/backend/tests/test_health.py +24 -0
- package/backend/tests/test_ingest_raw.py +123 -0
- package/backend/tests/test_link_expansion.py +241 -0
- package/backend/tests/test_main.py +12 -0
- package/backend/tests/test_normalizer.py +114 -0
- package/backend/tests/test_openapi_gateway.py +40 -0
- package/backend/tests/test_pipeline_hardening.py +285 -0
- package/backend/tests/test_pipeline_status.py +71 -0
- package/backend/tests/test_playwright_extract.py +80 -0
- package/backend/tests/test_post_ingest_hooks.py +162 -0
- package/backend/tests/test_query.py +165 -0
- package/backend/tests/test_thread_expansion.py +72 -0
- package/backend/tests/test_vectors.py +85 -0
- package/backend/uv.lock +1839 -0
- package/bin/business-stack.cjs +412 -0
- package/frontend/web/.env.example +23 -0
- package/frontend/web/AGENTS.md +5 -0
- package/frontend/web/CLAUDE.md +1 -0
- package/frontend/web/README.md +36 -0
- package/frontend/web/components.json +25 -0
- package/frontend/web/next-env.d.ts +6 -0
- package/frontend/web/next.config.ts +30 -0
- package/frontend/web/package.json +65 -0
- package/frontend/web/postcss.config.mjs +7 -0
- package/frontend/web/skills-lock.json +35 -0
- package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
- package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
- package/frontend/web/src/app/chat/page.tsx +725 -0
- package/frontend/web/src/app/favicon.ico +0 -0
- package/frontend/web/src/app/globals.css +563 -0
- package/frontend/web/src/app/layout.tsx +50 -0
- package/frontend/web/src/app/page.tsx +96 -0
- package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
- package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
- package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
- package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
- package/frontend/web/src/components/home-auth-panel.tsx +49 -0
- package/frontend/web/src/components/providers.tsx +50 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
- package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
- package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
- package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
- package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
- package/frontend/web/src/lib/auth-client.ts +23 -0
- package/frontend/web/src/lib/integrations-config.ts +125 -0
- package/frontend/web/src/lib/ui-utills.tsx +90 -0
- package/frontend/web/src/lib/utils.ts +6 -0
- package/frontend/web/tsconfig.json +36 -0
- package/frontend/web/tsconfig.tsbuildinfo +1 -0
- package/frontend/web/vitest.config.ts +14 -0
- package/gateway/.env.example +23 -0
- package/gateway/README.md +13 -0
- package/gateway/package.json +24 -0
- package/gateway/src/auth.ts +49 -0
- package/gateway/src/index.ts +141 -0
- package/gateway/src/integrations/admin.ts +19 -0
- package/gateway/src/integrations/crypto.ts +52 -0
- package/gateway/src/integrations/handlers.ts +124 -0
- package/gateway/src/integrations/keys.ts +12 -0
- package/gateway/src/integrations/store.ts +106 -0
- package/gateway/src/stack-secrets.ts +35 -0
- package/gateway/tsconfig.json +13 -0
- package/package.json +33 -0
- package/turbo.json +27 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field

from app.vectorstore import VectorSearchFilters


class VectorMetaPayload(BaseModel):
    """Metadata persisted alongside a single embedding row."""

    document_id: str
    chunk_id: int = 0
    # -1 marks "no source"; presumably a sentinel for unlinked rows — TODO confirm
    source_id: int = -1
    modality: str = "text"
    ingested_at: str = Field(
        ...,
        description="ISO-8601 timestamp for filterable range queries on ingested_at",
    )


class VectorUpsertRequest(BaseModel):
    """Bulk upsert payload: embeddings[i] pairs with metas[i]."""

    embeddings: list[list[float]]
    metas: list[VectorMetaPayload]


class VectorSearchFiltersPayload(BaseModel):
    """Optional metadata filters for a vector search; None fields are ignored."""

    document_id: str | None = None
    source_id: int | None = None
    modality: str | None = None
    timestamp_min: str | None = None
    timestamp_max: str | None = None

    def to_filters(self) -> VectorSearchFilters:
        """Convert this API payload into the vectorstore-layer filter object."""
        return VectorSearchFilters(
            document_id=self.document_id,
            source_id=self.source_id,
            modality=self.modality,
            timestamp_min=self.timestamp_min,
            timestamp_max=self.timestamp_max,
        )


class VectorSearchRequest(BaseModel):
    """KNN search request: query vector, result count k (1..500), filters."""

    query_vector: list[float]
    k: int = Field(default=10, ge=1, le=500)
    filters: VectorSearchFiltersPayload | None = None


class VectorSearchHit(BaseModel):
    """One search result row with its distance and stored metadata."""

    rowid: int
    distance: float
    document_id: str
    chunk_id: int
    source_id: int
    modality: str
    ingested_at: str


class VectorSearchResponse(BaseModel):
    """Search response wrapper holding the ranked hits."""

    results: list[VectorSearchHit]
|
|
File without changes
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import uuid
|
|
5
|
+
from datetime import UTC, datetime
|
|
6
|
+
|
|
7
|
+
from sqlalchemy import text
|
|
8
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _utc_now_iso() -> str:
|
|
12
|
+
return datetime.now(UTC).isoformat().replace("+00:00", "Z")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
async def create_chat_session(
    session: AsyncSession,
    *,
    user_id: str,
    title: str | None,
) -> str:
    """Insert a new chat session row and return its generated UUID id."""
    new_id = str(uuid.uuid4())
    stamp = _utc_now_iso()
    stmt = text(
        "INSERT INTO chat_sessions (id, user_id, title, created_at, updated_at) "
        "VALUES (:id, :uid, :title, :ca, :ua)",
    )
    # created_at and updated_at start out identical for a fresh session.
    params = {"id": new_id, "uid": user_id, "title": title, "ca": stamp, "ua": stamp}
    await session.execute(stmt, params)
    return new_id
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
async def list_chat_sessions(
    session: AsyncSession,
    *,
    user_id: str,
    limit: int = 100,
) -> list[dict[str, str | None]]:
    """Return this user's chat sessions, most recently updated first."""
    stmt = text(
        "SELECT id, title, created_at, updated_at FROM chat_sessions "
        "WHERE user_id = :u ORDER BY updated_at DESC LIMIT :lim",
    )
    result = await session.execute(stmt, {"u": user_id, "lim": limit})
    return [dict(mapping) for mapping in result.mappings().all()]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
async def get_session_owned(
    session: AsyncSession,
    *,
    session_id: str,
    user_id: str,
) -> dict[str, str | None] | None:
    """Fetch one chat session only if it belongs to user_id; None otherwise."""
    stmt = text(
        "SELECT id, user_id, title, created_at, updated_at FROM chat_sessions "
        "WHERE id = :id AND user_id = :u LIMIT 1",
    )
    result = await session.execute(stmt, {"id": session_id, "u": user_id})
    found = result.mappings().first()
    if found is None:
        return None
    return dict(found)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
async def touch_session_updated(session: AsyncSession, *, session_id: str) -> None:
    """Bump a session's updated_at to now, keeping recency ordering fresh."""
    params = {"ua": _utc_now_iso(), "id": session_id}
    stmt = text("UPDATE chat_sessions SET updated_at = :ua WHERE id = :id")
    await session.execute(stmt, params)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
async def maybe_set_title_from_first_message(
    session: AsyncSession,
    *,
    session_id: str,
    snippet: str,
) -> None:
    """Derive a title from snippet, but only when the session has none yet."""
    result = await session.execute(
        text("SELECT title FROM chat_sessions WHERE id = :id LIMIT 1"),
        {"id": session_id},
    )
    current = result.first()
    # Nothing to do when the session is missing or already titled.
    if current is None or current[0]:
        return
    # Collapse newlines and cap the title at 80 characters.
    title = snippet.strip().replace("\n", " ")[:80]
    if not title:
        return
    await session.execute(
        text("UPDATE chat_sessions SET title = :t WHERE id = :id"),
        {"t": title, "id": session_id},
    )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
async def append_chat_message(
    session: AsyncSession,
    *,
    session_id: str,
    role: str,
    content: str,
    meta: dict | None = None,
) -> int:
    """Insert one chat message row and return its new integer row id.

    Args:
        session: Async DB session used for both the INSERT and the id lookup.
        session_id: Parent chat session id.
        role: Message author role string, stored as-is.
        content: Message body text.
        meta: Optional metadata dict; JSON-serialized with non-serializable
            values stringified via default=str. Falsy meta (None or {}) is
            stored as SQL NULL rather than "{}".

    Returns:
        The inserted row's id as an int.
    """
    now = _utc_now_iso()
    # Falsy meta ({} or None) becomes NULL in meta_json.
    meta_s = json.dumps(meta, default=str) if meta else None
    await session.execute(
        text(
            "INSERT INTO chat_messages (session_id, role, content, meta_json, created_at) "
            "VALUES (:sid, :role, :content, :meta, :ca)",
        ),
        {
            "sid": session_id,
            "role": role,
            "content": content,
            "meta": meta_s,
            "ca": now,
        },
    )
    # SQLite-specific id retrieval. NOTE(review): assumes this SELECT runs on
    # the same connection as the INSERT above (holds within one AsyncSession
    # transaction); revisit if the database backend ever changes.
    r = await session.execute(text("SELECT last_insert_rowid()"))
    rid = r.scalar_one()
    return int(rid)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
async def list_chat_messages(
    session: AsyncSession,
    *,
    session_id: str,
    limit: int = 200,
) -> list[dict[str, str | int | None]]:
    """Return up to limit messages for a session, in insertion (id) order."""
    stmt = text(
        "SELECT id, role, content, meta_json, created_at FROM chat_messages "
        "WHERE session_id = :sid ORDER BY id ASC LIMIT :lim",
    )
    result = await session.execute(stmt, {"sid": session_id, "lim": limit})
    return [dict(mapping) for mapping in result.mappings().all()]
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
async def delete_chat_session(
    session: AsyncSession,
    *,
    session_id: str,
    user_id: str,
) -> bool:
    """Delete a session owned by user_id; True when a row was actually removed."""
    result = await session.execute(
        text("DELETE FROM chat_sessions WHERE id = :id AND user_id = :u"),
        {"id": session_id, "u": user_id},
    )
    deleted = result.rowcount or 0
    return deleted > 0
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
async def suggest_sections_via_llm(
    text: str,
    *,
    api_key: str,
    model: str,
    timeout_s: float = 60.0,
) -> list[str] | None:
    """Ask the model for semantic section splits; returns section strings or None.

    Args:
        text: Document text to split; truncated to 48,000 characters before
            being sent to the model.
        api_key: OpenAI API bearer token.
        model: Chat-completions model name.
        timeout_s: Whole-request timeout in seconds.

    Returns:
        [] for blank input; a non-empty list of stripped section strings on
        success; None on any failure (HTTP error, bad JSON, missing/empty
        "sections") so callers can fall back to other splitting strategies.
    """
    if not text.strip():
        return []
    prompt = (
        "Split the following document into semantic sections for retrieval. "
        "Prefer coherent topical units. Do not use fixed token sizes. "
        'Return JSON only: {"sections": ["...", "..."]}\n\n---\n' + text[:48_000]
    )
    url = "https://api.openai.com/v1/chat/completions"
    body: dict[str, Any] = {
        "model": model,
        # Low temperature plus json_object response format to keep the
        # reply parseable.
        "temperature": 0.2,
        "response_format": {"type": "json_object"},
        "messages": [
            {
                "role": "system",
                "content": (
                    "You output only valid JSON with a sections array of strings."
                ),
            },
            {"role": "user", "content": prompt},
        ],
    }
    try:
        async with httpx.AsyncClient(timeout=timeout_s) as client:
            r = await client.post(
                url,
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json=body,
            )
            r.raise_for_status()
            data = r.json()
            content = data["choices"][0]["message"]["content"]
            parsed = json.loads(content)
            sections = parsed.get("sections")
            if not isinstance(sections, list):
                return None
            # Drop blank entries; an all-blank result counts as a failure.
            out = [str(s).strip() for s in sections if str(s).strip()]
            return out or None
    # Deliberately broad: this is a best-effort enhancement and any failure
    # (network, HTTP status, JSON shape) should degrade to None, not raise.
    except Exception:
        logger.exception("LLM chunk boundary request failed")
        return None
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SemanticChunkDraft(BaseModel):
    """A chunk candidate produced by a splitter, prior to persistence.

    start/end_block_ordinal record which content blocks (inclusive) the
    chunk text was derived from; meta carries splitter strategy details.
    """

    model_config = ConfigDict(extra="forbid")

    text: str
    start_block_ordinal: int = Field(ge=0)
    end_block_ordinal: int = Field(ge=0)
    meta: dict[str, Any] = Field(default_factory=dict)


class RebuildChunksRequest(BaseModel):
    """Request body for rebuilding a document's semantic chunks."""

    model_config = ConfigDict(extra="forbid")

    use_llm_weak_structure: bool = Field(
        default=False,
        description="If true and structure is weak, call LLM for section boundaries",
    )


class RebuildChunksResponse(BaseModel):
    """Response reporting how many chunks were written for the document."""

    model_config = ConfigDict(extra="forbid")

    document_id: str
    chunks_written: int
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from sqlalchemy import text
|
|
9
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
|
+
|
|
11
|
+
from app.config import Settings
|
|
12
|
+
from app.services.chunking.llm_boundaries import suggest_sections_via_llm
|
|
13
|
+
from app.services.chunking.schemas import SemanticChunkDraft
|
|
14
|
+
from app.services.chunking.splitters import (
|
|
15
|
+
split_paragraphs,
|
|
16
|
+
structure_chunks_for_text_block,
|
|
17
|
+
thread_chunks_from_envelope,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class _BlockRow:
    """One content_blocks row: ordinal, type discriminator, decoded meta JSON."""

    ordinal: int
    type: str
    # Decoded meta JSON; block text, when present, lives under meta["text"].
    meta: dict[str, Any]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
async def _load_blocks(session: AsyncSession, document_id: str) -> list[_BlockRow]:
    """Load this document's content_blocks in ordinal order, decoding meta JSON.

    Rows with missing or invalid meta JSON get an empty meta dict.
    """
    result = await session.execute(
        text(
            "SELECT ordinal, type, meta FROM content_blocks "
            "WHERE document_id = :d ORDER BY ordinal",
        ),
        {"d": document_id},
    )
    blocks: list[_BlockRow] = []
    for ordinal, block_type, raw_meta in result.fetchall():
        meta: dict[str, Any] = {}
        if raw_meta:
            try:
                decoded = json.loads(raw_meta)
            except json.JSONDecodeError:
                decoded = None
            if isinstance(decoded, dict):
                meta = decoded
        blocks.append(
            _BlockRow(ordinal=int(ordinal), type=str(block_type), meta=meta),
        )
    return blocks
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
async def _load_envelope(session: AsyncSession, document_id: str) -> dict[str, Any]:
    """Return the document's raw_content JSON as a dict; {} when absent/invalid."""
    result = await session.execute(
        text("SELECT raw_content FROM documents WHERE id = :id LIMIT 1"),
        {"id": document_id},
    )
    row = result.first()
    if row is None or row[0] is None:
        return {}
    try:
        payload = json.loads(row[0])
    except json.JSONDecodeError:
        return {}
    if isinstance(payload, dict):
        return payload
    return {}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _align_thread_chunks_to_blocks(
    drafts: list[SemanticChunkDraft],
    text_blocks: list[_BlockRow],
) -> list[SemanticChunkDraft]:
    """Attach block ordinals to thread-derived drafts.

    One-to-one mapping when counts match; otherwise every draft spans the
    whole ordinal range (or 0..0 when there are no text blocks at all).
    """
    if not drafts:
        return []

    def _spanned(d: SemanticChunkDraft, start: int, end: int) -> SemanticChunkDraft:
        # Copy a draft with new ordinal bounds; drafts are immutable inputs.
        return d.model_copy(
            update={"start_block_ordinal": start, "end_block_ordinal": end},
        )

    if not text_blocks:
        return [_spanned(d, 0, 0) for d in drafts]
    if len(drafts) == len(text_blocks):
        return [
            _spanned(d, b.ordinal, b.ordinal)
            for d, b in zip(drafts, text_blocks, strict=True)
        ]
    last_ordinal = max(b.ordinal for b in text_blocks)
    return [_spanned(d, 0, last_ordinal) for d in drafts]
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
async def rebuild_semantic_chunks(
    session: AsyncSession,
    *,
    document_id: str,
    use_llm_weak_structure: bool,
    settings: Settings,
) -> int:
    """Replace document_chunks for this document using structure-aware splitting.

    Strategy order: if the raw envelope looks like a chat/thread payload,
    one chunk per message (aligned to text block ordinals); otherwise each
    text block is split by markdown/paragraph structure, optionally asking
    the LLM for boundaries when the block is a single paragraph longer than
    800 chars and both the flag and an OpenAI key are set.

    All existing chunks for the document are deleted and rewritten;
    returns the number of chunks written. Commit is the caller's job
    (no commit/rollback happens here).
    """
    envelope = await _load_envelope(session, document_id)
    blocks = await _load_blocks(session, document_id)
    # Only blocks of type "text" that actually carry non-blank text.
    text_blocks = [b for b in blocks if b.type == "text" and _block_text_from_row(b)]

    thread_drafts = thread_chunks_from_envelope(envelope)
    drafts: list[SemanticChunkDraft] = []

    if thread_drafts:
        # Thread/chat payloads take precedence over structural splitting.
        drafts = _align_thread_chunks_to_blocks(thread_drafts, text_blocks)
        logger.debug(
            "document %s: %s thread chunks",
            document_id,
            len(drafts),
        )
    else:
        for b in blocks:
            if b.type != "text":
                continue
            t = _block_text_from_row(b)
            if not t:
                continue
            llm_sections: list[str] | None = None
            # Call the LLM only for "weak structure": a single paragraph
            # longer than 800 chars, and only when enabled and configured.
            if (
                use_llm_weak_structure
                and settings.openai_api_key
                and len(split_paragraphs(t)) <= 1
                and len(t) > 800
            ):
                llm_sections = await suggest_sections_via_llm(
                    t,
                    api_key=settings.openai_api_key,
                    model=settings.chunk_llm_model,
                )
            drafts.extend(
                structure_chunks_for_text_block(
                    t,
                    b.ordinal,
                    use_llm=use_llm_weak_structure,
                    llm_sections=llm_sections,
                ),
            )

    # Full replace: delete every existing chunk, then insert the new set
    # with fresh sequential ordinals.
    await session.execute(
        text("DELETE FROM document_chunks WHERE document_id = :d"),
        {"d": document_id},
    )
    for ordinal, d in enumerate(drafts):
        await session.execute(
            text(
                "INSERT INTO document_chunks "
                "(document_id, ordinal, text, start_block_ordinal, "
                "end_block_ordinal, meta) "
                "VALUES (:did, :ord, :txt, :sb, :eb, :meta)",
            ),
            {
                "did": document_id,
                "ord": ordinal,
                "txt": d.text,
                "sb": d.start_block_ordinal,
                "eb": d.end_block_ordinal,
                # Empty meta is stored as NULL rather than "{}".
                "meta": json.dumps(d.meta) if d.meta else None,
            },
        )
    return len(drafts)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _block_text_from_row(b: _BlockRow) -> str:
    """Extract the block's text from its meta; '' when missing or non-string."""
    value = b.meta.get("text")
    if isinstance(value, str):
        return value.strip()
    return ""
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from app.services.chunking.schemas import SemanticChunkDraft
|
|
8
|
+
|
|
9
|
+
_HEADING_LINE = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _message_body(item: dict[str, Any]) -> str:
    """Best-effort message text: first non-empty known field, else a JSON dump."""
    for key in ("text", "body", "content", "message", "markdown", "tweet"):
        value = item.get(key)
        if isinstance(value, str) and value.strip():
            return value.strip()
    # No usable text field: fall back to the serialized item, capped.
    return json.dumps(item, ensure_ascii=False)[:16_000]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def extract_thread_message_items(
    envelope: dict[str, Any],
) -> list[dict[str, Any]] | None:
    """Detect chat/thread payloads; return list of {thread_id, item, index}."""
    payload = envelope.get("payload")
    raw_md = envelope.get("metadata")
    md = raw_md if isinstance(raw_md, dict) else {}
    tid = md.get("thread_id") or md.get("thread_ts") or md.get("conversation_id")

    def _rows(messages: list[dict[str, Any]], thread_id: Any) -> list[dict[str, Any]]:
        # Shape each message into the common row format.
        return [
            {"thread_id": thread_id, "item": m, "index": i, "role": m.get("role")}
            for i, m in enumerate(messages)
        ]

    # Case 1: payload is itself a non-empty list of message dicts.
    if isinstance(payload, list) and payload and all(isinstance(x, dict) for x in payload):
        return _rows(payload, tid)

    # Case 2: payload is a dict holding a message list under a known key.
    if isinstance(payload, dict):
        inner_tid = payload.get("thread_id") or tid
        for key in ("messages", "thread", "replies", "posts"):
            msgs = payload.get(key)
            if isinstance(msgs, list) and msgs and all(isinstance(x, dict) for x in msgs):
                return _rows(msgs, inner_tid or payload.get("thread_id"))
    return None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def thread_chunks_from_envelope(
    envelope: dict[str, Any],
    *,
    default_block_ordinal: int = 0,
) -> list[SemanticChunkDraft]:
    """Turn a chat/thread envelope into one draft chunk per message; [] if none."""
    rows = extract_thread_message_items(envelope)
    if not rows:
        return []
    drafts: list[SemanticChunkDraft] = []
    for row in rows:
        item = row["item"]
        if not isinstance(item, dict):
            continue
        body = _message_body(item)
        if not body:
            continue
        meta: dict[str, Any] = {
            "strategy": "thread_message",
            "message_index": row["index"],
            "role": row.get("role"),
        }
        thread_id = row.get("thread_id")
        if thread_id is not None:
            meta["thread_id"] = str(thread_id)
        message_id = item.get("id")
        if message_id is not None:
            meta["message_id"] = str(message_id)
        drafts.append(
            SemanticChunkDraft(
                text=body,
                start_block_ordinal=default_block_ordinal,
                end_block_ordinal=default_block_ordinal,
                meta=meta,
            ),
        )
    return drafts
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def split_markdown_sections(text: str) -> list[tuple[str, dict[str, Any]]]:
    """Split on ATX headings; each chunk includes its heading line.

    Returns a list of (chunk_text, meta) pairs. Text before the first
    heading becomes an unheaded preamble section. Returns [] for blank
    text or text containing no ATX heading.
    """
    if not text.strip():
        return []
    if not _HEADING_LINE.search(text):
        return []

    parts: list[tuple[str, dict[str, Any]]] = []
    # search() succeeded above, so finditer() is guaranteed to yield at
    # least one match (the previous `if not matches` fallback was dead code).
    matches = list(_HEADING_LINE.finditer(text))

    preamble = text[: matches[0].start()].strip()
    if preamble:
        parts.append(
            (
                preamble,
                {
                    "strategy": "markdown_section",
                    "section_index": 0,
                    "heading": None,
                },
            ),
        )

    for i, m in enumerate(matches):
        start = m.start()
        # Each section runs up to the next heading (or end of text).
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        chunk = text[start:end].strip()
        if not chunk:
            continue
        level = len(m.group(1))
        title = m.group(2).strip()
        parts.append(
            (
                chunk,
                {
                    "strategy": "markdown_section",
                    # len(parts) keeps indices sequential across the optional
                    # preamble and any skipped empty sections.
                    "section_index": len(parts),
                    "heading_level": level,
                    "heading": title,
                },
            ),
        )
    return parts
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def split_paragraphs(text: str) -> list[str]:
    """Split text into non-empty paragraphs on blank-line boundaries."""
    stripped = text.strip()
    if not stripped:
        return []
    pieces = (p.strip() for p in re.split(r"\n\s*\n+", stripped))
    return [p for p in pieces if p]
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def structure_chunks_for_text_block(
    text: str,
    block_ordinal: int,
    *,
    use_llm: bool,
    llm_sections: list[str] | None,
) -> list[SemanticChunkDraft]:
    """Prefer markdown sections, then paragraphs; optional LLM replaces weak splits.

    Args:
        text: Raw block text to split (stripped here; '' yields []).
        block_ordinal: Ordinal recorded on every resulting draft (start == end).
        use_llm: Whether LLM-based splitting was requested by the caller.
        llm_sections: Pre-fetched LLM section strings, or None when unavailable.

    Returns:
        Draft chunks tagged with a meta["strategy"] of "markdown_section",
        "llm_section", "fallback_single", or "paragraph".
    """
    text = text.strip()
    if not text:
        return []

    # Strategy 1: markdown headings. Used when there are multiple sections,
    # or a single section that actually starts with a heading.
    md_parts = split_markdown_sections(text)
    if len(md_parts) > 1 or (
        len(md_parts) == 1 and md_parts[0][1].get("heading") is not None
    ):
        return [
            SemanticChunkDraft(
                text=seg,
                start_block_ordinal=block_ordinal,
                end_block_ordinal=block_ordinal,
                meta=meta,
            )
            for seg, meta in md_parts
        ]

    paras = split_paragraphs(text)
    # "Weak structure": one long paragraph with no blank-line breaks.
    weak = len(paras) <= 1 and len(text) > 400

    # Strategy 2: LLM-provided sections, only for weak structure when both
    # requested and actually available.
    if weak and use_llm and llm_sections:
        return [
            SemanticChunkDraft(
                text=s.strip(),
                start_block_ordinal=block_ordinal,
                end_block_ordinal=block_ordinal,
                meta={
                    "strategy": "llm_section",
                    "section_index": i,
                },
            )
            for i, s in enumerate(llm_sections)
            if s.strip()
        ]

    # Strategy 3: single fallback chunk when no paragraph boundaries exist.
    # NOTE(review): when use_llm is True but llm_sections is None (LLM call
    # failed), the reason is labeled "short_text" even for long weak text —
    # confirm whether that label is intentional.
    if len(paras) <= 1:
        return [
            SemanticChunkDraft(
                text=text,
                start_block_ordinal=block_ordinal,
                end_block_ordinal=block_ordinal,
                meta={
                    "strategy": "fallback_single",
                    "reason": "weak_structure_no_llm"
                    if weak and not use_llm
                    else "short_text",
                },
            ),
        ]

    # Strategy 4: one chunk per paragraph.
    return [
        SemanticChunkDraft(
            text=p,
            start_block_ordinal=block_ordinal,
            end_block_ordinal=block_ordinal,
            meta={"strategy": "paragraph", "paragraph_index": i},
        )
        for i, p in enumerate(paras)
    ]
|