business-stack 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. package/.python-version +1 -0
  2. package/backend/.env.example +65 -0
  3. package/backend/alembic/env.py +63 -0
  4. package/backend/alembic/script.py.mako +26 -0
  5. package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
  6. package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
  7. package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
  8. package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
  9. package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
  10. package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
  11. package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
  12. package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
  13. package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
  14. package/backend/alembic.ini +42 -0
  15. package/backend/app/__init__.py +0 -0
  16. package/backend/app/config.py +337 -0
  17. package/backend/app/connectors/__init__.py +13 -0
  18. package/backend/app/connectors/base.py +39 -0
  19. package/backend/app/connectors/builtins.py +51 -0
  20. package/backend/app/connectors/playwright_session.py +146 -0
  21. package/backend/app/connectors/registry.py +68 -0
  22. package/backend/app/connectors/thread_expansion/__init__.py +33 -0
  23. package/backend/app/connectors/thread_expansion/fakes.py +154 -0
  24. package/backend/app/connectors/thread_expansion/models.py +113 -0
  25. package/backend/app/connectors/thread_expansion/reddit.py +53 -0
  26. package/backend/app/connectors/thread_expansion/twitter.py +49 -0
  27. package/backend/app/db.py +5 -0
  28. package/backend/app/dependencies.py +34 -0
  29. package/backend/app/logging_config.py +35 -0
  30. package/backend/app/main.py +97 -0
  31. package/backend/app/middleware/__init__.py +0 -0
  32. package/backend/app/middleware/gateway_identity.py +17 -0
  33. package/backend/app/middleware/openapi_gateway.py +71 -0
  34. package/backend/app/middleware/request_id.py +23 -0
  35. package/backend/app/openapi_config.py +126 -0
  36. package/backend/app/routers/__init__.py +0 -0
  37. package/backend/app/routers/admin_pipeline.py +123 -0
  38. package/backend/app/routers/chat.py +206 -0
  39. package/backend/app/routers/chunks.py +36 -0
  40. package/backend/app/routers/entity_extract.py +31 -0
  41. package/backend/app/routers/example.py +8 -0
  42. package/backend/app/routers/gemini_embed.py +58 -0
  43. package/backend/app/routers/health.py +28 -0
  44. package/backend/app/routers/ingestion.py +146 -0
  45. package/backend/app/routers/link_expansion.py +34 -0
  46. package/backend/app/routers/pipeline_status.py +304 -0
  47. package/backend/app/routers/query.py +63 -0
  48. package/backend/app/routers/vectors.py +63 -0
  49. package/backend/app/schemas/__init__.py +0 -0
  50. package/backend/app/schemas/canonical.py +44 -0
  51. package/backend/app/schemas/chat.py +50 -0
  52. package/backend/app/schemas/ingest.py +29 -0
  53. package/backend/app/schemas/query.py +153 -0
  54. package/backend/app/schemas/vectors.py +56 -0
  55. package/backend/app/services/__init__.py +0 -0
  56. package/backend/app/services/chat_store.py +152 -0
  57. package/backend/app/services/chunking/__init__.py +3 -0
  58. package/backend/app/services/chunking/llm_boundaries.py +63 -0
  59. package/backend/app/services/chunking/schemas.py +30 -0
  60. package/backend/app/services/chunking/semantic_chunk.py +178 -0
  61. package/backend/app/services/chunking/splitters.py +214 -0
  62. package/backend/app/services/embeddings/__init__.py +20 -0
  63. package/backend/app/services/embeddings/build_inputs.py +140 -0
  64. package/backend/app/services/embeddings/dlq.py +128 -0
  65. package/backend/app/services/embeddings/gemini_api.py +207 -0
  66. package/backend/app/services/embeddings/persist.py +74 -0
  67. package/backend/app/services/embeddings/types.py +32 -0
  68. package/backend/app/services/embeddings/worker.py +224 -0
  69. package/backend/app/services/entities/__init__.py +12 -0
  70. package/backend/app/services/entities/gliner_extract.py +63 -0
  71. package/backend/app/services/entities/llm_extract.py +94 -0
  72. package/backend/app/services/entities/pipeline.py +179 -0
  73. package/backend/app/services/entities/spacy_extract.py +63 -0
  74. package/backend/app/services/entities/types.py +15 -0
  75. package/backend/app/services/gemini_chat.py +113 -0
  76. package/backend/app/services/hooks/__init__.py +3 -0
  77. package/backend/app/services/hooks/post_ingest.py +186 -0
  78. package/backend/app/services/ingestion/__init__.py +0 -0
  79. package/backend/app/services/ingestion/persist.py +188 -0
  80. package/backend/app/services/integrations_remote.py +91 -0
  81. package/backend/app/services/link_expansion/__init__.py +3 -0
  82. package/backend/app/services/link_expansion/canonical_url.py +45 -0
  83. package/backend/app/services/link_expansion/domain_policy.py +26 -0
  84. package/backend/app/services/link_expansion/html_extract.py +72 -0
  85. package/backend/app/services/link_expansion/rate_limit.py +32 -0
  86. package/backend/app/services/link_expansion/robots.py +46 -0
  87. package/backend/app/services/link_expansion/schemas.py +67 -0
  88. package/backend/app/services/link_expansion/worker.py +458 -0
  89. package/backend/app/services/normalization/__init__.py +7 -0
  90. package/backend/app/services/normalization/normalizer.py +331 -0
  91. package/backend/app/services/normalization/persist_normalized.py +67 -0
  92. package/backend/app/services/playwright_extract/__init__.py +13 -0
  93. package/backend/app/services/playwright_extract/__main__.py +96 -0
  94. package/backend/app/services/playwright_extract/extract.py +181 -0
  95. package/backend/app/services/retrieval_service.py +351 -0
  96. package/backend/app/sqlite_ext.py +36 -0
  97. package/backend/app/storage/__init__.py +3 -0
  98. package/backend/app/storage/blobs.py +30 -0
  99. package/backend/app/vectorstore/__init__.py +13 -0
  100. package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
  101. package/backend/backend.egg-info/PKG-INFO +18 -0
  102. package/backend/backend.egg-info/SOURCES.txt +93 -0
  103. package/backend/backend.egg-info/dependency_links.txt +1 -0
  104. package/backend/backend.egg-info/entry_points.txt +2 -0
  105. package/backend/backend.egg-info/requires.txt +15 -0
  106. package/backend/backend.egg-info/top_level.txt +4 -0
  107. package/backend/package.json +15 -0
  108. package/backend/pyproject.toml +52 -0
  109. package/backend/tests/conftest.py +40 -0
  110. package/backend/tests/test_chat.py +92 -0
  111. package/backend/tests/test_chunking.py +132 -0
  112. package/backend/tests/test_entities.py +170 -0
  113. package/backend/tests/test_gemini_embed.py +224 -0
  114. package/backend/tests/test_health.py +24 -0
  115. package/backend/tests/test_ingest_raw.py +123 -0
  116. package/backend/tests/test_link_expansion.py +241 -0
  117. package/backend/tests/test_main.py +12 -0
  118. package/backend/tests/test_normalizer.py +114 -0
  119. package/backend/tests/test_openapi_gateway.py +40 -0
  120. package/backend/tests/test_pipeline_hardening.py +285 -0
  121. package/backend/tests/test_pipeline_status.py +71 -0
  122. package/backend/tests/test_playwright_extract.py +80 -0
  123. package/backend/tests/test_post_ingest_hooks.py +162 -0
  124. package/backend/tests/test_query.py +165 -0
  125. package/backend/tests/test_thread_expansion.py +72 -0
  126. package/backend/tests/test_vectors.py +85 -0
  127. package/backend/uv.lock +1839 -0
  128. package/bin/business-stack.cjs +412 -0
  129. package/frontend/web/.env.example +23 -0
  130. package/frontend/web/AGENTS.md +5 -0
  131. package/frontend/web/CLAUDE.md +1 -0
  132. package/frontend/web/README.md +36 -0
  133. package/frontend/web/components.json +25 -0
  134. package/frontend/web/next-env.d.ts +6 -0
  135. package/frontend/web/next.config.ts +30 -0
  136. package/frontend/web/package.json +65 -0
  137. package/frontend/web/postcss.config.mjs +7 -0
  138. package/frontend/web/skills-lock.json +35 -0
  139. package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
  140. package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
  141. package/frontend/web/src/app/chat/page.tsx +725 -0
  142. package/frontend/web/src/app/favicon.ico +0 -0
  143. package/frontend/web/src/app/globals.css +563 -0
  144. package/frontend/web/src/app/layout.tsx +50 -0
  145. package/frontend/web/src/app/page.tsx +96 -0
  146. package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
  147. package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
  148. package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
  149. package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
  150. package/frontend/web/src/components/home-auth-panel.tsx +49 -0
  151. package/frontend/web/src/components/providers.tsx +50 -0
  152. package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
  153. package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
  154. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
  155. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
  156. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
  157. package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
  158. package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
  159. package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
  160. package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
  161. package/frontend/web/src/lib/auth-client.ts +23 -0
  162. package/frontend/web/src/lib/integrations-config.ts +125 -0
  163. package/frontend/web/src/lib/ui-utills.tsx +90 -0
  164. package/frontend/web/src/lib/utils.ts +6 -0
  165. package/frontend/web/tsconfig.json +36 -0
  166. package/frontend/web/tsconfig.tsbuildinfo +1 -0
  167. package/frontend/web/vitest.config.ts +14 -0
  168. package/gateway/.env.example +23 -0
  169. package/gateway/README.md +13 -0
  170. package/gateway/package.json +24 -0
  171. package/gateway/src/auth.ts +49 -0
  172. package/gateway/src/index.ts +141 -0
  173. package/gateway/src/integrations/admin.ts +19 -0
  174. package/gateway/src/integrations/crypto.ts +52 -0
  175. package/gateway/src/integrations/handlers.ts +124 -0
  176. package/gateway/src/integrations/keys.ts +12 -0
  177. package/gateway/src/integrations/store.ts +106 -0
  178. package/gateway/src/stack-secrets.ts +35 -0
  179. package/gateway/tsconfig.json +13 -0
  180. package/package.json +33 -0
  181. package/turbo.json +27 -0
@@ -0,0 +1,56 @@
1
+ from pydantic import BaseModel, Field
2
+
3
+ from app.vectorstore import VectorSearchFilters
4
+
5
+
6
class VectorMetaPayload(BaseModel):
    # Metadata record supplied for one embedding in an upsert request.

    # Only document_id and ingested_at are required; the rest have defaults.
    document_id: str
    chunk_id: int = 0
    # NOTE(review): -1 presumably means "no known source" - confirm with callers.
    source_id: int = -1
    modality: str = "text"
    # Required field (Ellipsis default) carried as a plain string.
    ingested_at: str = Field(
        ...,
        description=(
            "ISO-8601 timestamp for filterable range queries on ingested_at"
        ),
    )
15
+
16
+
17
class VectorUpsertRequest(BaseModel):
    # Request body for a vector upsert: embeddings plus their metadata records.

    # NOTE(review): nothing in this model enforces that both lists have the
    # same length; presumably the consumer pairs them by position - confirm.
    embeddings: list[list[float]]
    metas: list[VectorMetaPayload]
20
+
21
+
22
class VectorSearchFiltersPayload(BaseModel):
    # Optional search filters; every field defaults to None.

    document_id: str | None = None
    source_id: int | None = None
    modality: str | None = None
    timestamp_min: str | None = None
    timestamp_max: str | None = None

    def to_filters(self) -> VectorSearchFilters:
        """Build the vector-store ``VectorSearchFilters`` from this payload.

        Each field is passed through unchanged under the same keyword name.
        """
        criteria = {
            "document_id": self.document_id,
            "source_id": self.source_id,
            "modality": self.modality,
            "timestamp_min": self.timestamp_min,
            "timestamp_max": self.timestamp_max,
        }
        return VectorSearchFilters(**criteria)
37
+
38
+
39
class VectorSearchRequest(BaseModel):
    # Request body for a vector search.

    query_vector: list[float]
    # Number of hits requested: defaults to 10, validated to lie in 1..500.
    k: int = Field(default=10, ge=1, le=500)
    # When omitted, no filter payload is attached to the request.
    filters: VectorSearchFiltersPayload | None = None
43
+
44
+
45
class VectorSearchHit(BaseModel):
    # One search result: a row id and distance plus the stored metadata fields.

    rowid: int
    distance: float

    # Same field names as VectorMetaPayload; all are required here.
    document_id: str
    chunk_id: int
    source_id: int
    modality: str
    ingested_at: str
53
+
54
+
55
class VectorSearchResponse(BaseModel):
    # Response body for a vector search: the list of hits.

    results: list[VectorSearchHit]
File without changes
@@ -0,0 +1,152 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import uuid
5
+ from datetime import UTC, datetime
6
+
7
+ from sqlalchemy import text
8
+ from sqlalchemy.ext.asyncio import AsyncSession
9
+
10
+
11
+ def _utc_now_iso() -> str:
12
+ return datetime.now(UTC).isoformat().replace("+00:00", "Z")
13
+
14
+
15
async def create_chat_session(
    session: AsyncSession,
    *,
    user_id: str,
    title: str | None,
) -> str:
    """Insert a new ``chat_sessions`` row and return its id.

    The id is a freshly generated UUID4 string; ``created_at`` and
    ``updated_at`` are both set to the same current-UTC timestamp.
    This function only executes the INSERT; it does not commit.
    """
    new_id = str(uuid.uuid4())
    stamp = _utc_now_iso()
    insert_stmt = text(
        "INSERT INTO chat_sessions (id, user_id, title, created_at, updated_at) "
        "VALUES (:id, :uid, :title, :ca, :ua)",
    )
    params = {
        "id": new_id,
        "uid": user_id,
        "title": title,
        "ca": stamp,
        "ua": stamp,
    }
    await session.execute(insert_stmt, params)
    return new_id
31
+
32
+
33
async def list_chat_sessions(
    session: AsyncSession,
    *,
    user_id: str,
    limit: int = 100,
) -> list[dict[str, str | None]]:
    """Return up to ``limit`` chat sessions of ``user_id``, latest update first.

    Each entry is a plain dict with the keys ``id``, ``title``,
    ``created_at`` and ``updated_at``.
    """
    query = text(
        "SELECT id, title, created_at, updated_at FROM chat_sessions "
        "WHERE user_id = :u ORDER BY updated_at DESC LIMIT :lim",
    )
    result = await session.execute(query, {"u": user_id, "lim": limit})
    return [dict(mapping) for mapping in result.mappings().all()]
48
+
49
+
50
async def get_session_owned(
    session: AsyncSession,
    *,
    session_id: str,
    user_id: str,
) -> dict[str, str | None] | None:
    """Fetch one chat session, but only if it belongs to ``user_id``.

    Returns the row as a dict (``id``, ``user_id``, ``title``,
    ``created_at``, ``updated_at``), or ``None`` when no row matches both
    the session id and the user id.
    """
    query = text(
        "SELECT id, user_id, title, created_at, updated_at FROM chat_sessions "
        "WHERE id = :id AND user_id = :u LIMIT 1",
    )
    result = await session.execute(query, {"id": session_id, "u": user_id})
    found = result.mappings().first()
    if not found:
        return None
    return dict(found)
65
+
66
+
67
async def touch_session_updated(session: AsyncSession, *, session_id: str) -> None:
    """Set ``updated_at`` of the given chat session to the current UTC time."""
    params = {"ua": _utc_now_iso(), "id": session_id}
    update_stmt = text("UPDATE chat_sessions SET updated_at = :ua WHERE id = :id")
    await session.execute(update_stmt, params)
73
+
74
+
75
async def maybe_set_title_from_first_message(
    session: AsyncSession,
    *,
    session_id: str,
    snippet: str,
) -> None:
    """Give an untitled chat session a title derived from ``snippet``.

    Nothing is written when the session does not exist, when it already has
    a non-empty title, or when the snippet is blank after stripping.
    Otherwise the title becomes the stripped snippet with newlines replaced
    by spaces, cut to at most 80 characters.
    """
    lookup = await session.execute(
        text("SELECT title FROM chat_sessions WHERE id = :id LIMIT 1"),
        {"id": session_id},
    )
    current = lookup.first()
    if not current:
        return
    if current[0]:
        # A title is already set; leave it alone.
        return

    flattened = snippet.strip().replace("\n", " ")
    candidate = flattened[:80]
    if not candidate:
        return

    await session.execute(
        text("UPDATE chat_sessions SET title = :t WHERE id = :id"),
        {"t": candidate, "id": session_id},
    )
95
+
96
+
97
async def append_chat_message(
    session: AsyncSession,
    *,
    session_id: str,
    role: str,
    content: str,
    meta: dict | None = None,
) -> int:
    """Insert a row into ``chat_messages`` and return its row id.

    ``meta`` is serialised with ``json.dumps(..., default=str)``; a missing
    or empty ``meta`` is passed to the database as ``None``. The returned id
    is read back with ``SELECT last_insert_rowid()`` on the same session.
    """
    created_at = _utc_now_iso()
    if meta:
        meta_json = json.dumps(meta, default=str)
    else:
        meta_json = None

    insert_stmt = text(
        "INSERT INTO chat_messages (session_id, role, content, meta_json, created_at) "
        "VALUES (:sid, :role, :content, :meta, :ca)",
    )
    row_values = {
        "sid": session_id,
        "role": role,
        "content": content,
        "meta": meta_json,
        "ca": created_at,
    }
    await session.execute(insert_stmt, row_values)

    id_result = await session.execute(text("SELECT last_insert_rowid()"))
    return int(id_result.scalar_one())
123
+
124
+
125
async def list_chat_messages(
    session: AsyncSession,
    *,
    session_id: str,
    limit: int = 200,
) -> list[dict[str, str | int | None]]:
    """Return up to ``limit`` messages of a chat session in ascending id order.

    Each entry is a plain dict with the keys ``id``, ``role``, ``content``,
    ``meta_json`` and ``created_at``; ``meta_json`` is returned as stored,
    without being decoded.
    """
    query = text(
        "SELECT id, role, content, meta_json, created_at FROM chat_messages "
        "WHERE session_id = :sid ORDER BY id ASC LIMIT :lim",
    )
    result = await session.execute(query, {"sid": session_id, "lim": limit})
    return [dict(mapping) for mapping in result.mappings().all()]
140
+
141
+
142
async def delete_chat_session(
    session: AsyncSession,
    *,
    session_id: str,
    user_id: str,
) -> bool:
    """Delete a chat session owned by ``user_id``.

    Returns ``True`` when the DELETE reports at least one affected row,
    ``False`` otherwise (including when the row count is unavailable).
    """
    delete_stmt = text("DELETE FROM chat_sessions WHERE id = :id AND user_id = :u")
    outcome = await session.execute(delete_stmt, {"id": session_id, "u": user_id})
    affected = outcome.rowcount or 0
    return affected > 0
@@ -0,0 +1,3 @@
1
+ from app.services.chunking.semantic_chunk import rebuild_semantic_chunks
2
+
3
+ __all__ = ["rebuild_semantic_chunks"]
@@ -0,0 +1,63 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ from typing import Any
6
+
7
+ import httpx
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ async def suggest_sections_via_llm(
13
+ text: str,
14
+ *,
15
+ api_key: str,
16
+ model: str,
17
+ timeout_s: float = 60.0,
18
+ ) -> list[str] | None:
19
+ """Ask the model for semantic section splits; returns section strings or None."""
20
+ if not text.strip():
21
+ return []
22
+ prompt = (
23
+ "Split the following document into semantic sections for retrieval. "
24
+ "Prefer coherent topical units. Do not use fixed token sizes. "
25
+ 'Return JSON only: {"sections": ["...", "..."]}\n\n---\n' + text[:48_000]
26
+ )
27
+ url = "https://api.openai.com/v1/chat/completions"
28
+ body: dict[str, Any] = {
29
+ "model": model,
30
+ "temperature": 0.2,
31
+ "response_format": {"type": "json_object"},
32
+ "messages": [
33
+ {
34
+ "role": "system",
35
+ "content": (
36
+ "You output only valid JSON with a sections array of strings."
37
+ ),
38
+ },
39
+ {"role": "user", "content": prompt},
40
+ ],
41
+ }
42
+ try:
43
+ async with httpx.AsyncClient(timeout=timeout_s) as client:
44
+ r = await client.post(
45
+ url,
46
+ headers={
47
+ "Authorization": f"Bearer {api_key}",
48
+ "Content-Type": "application/json",
49
+ },
50
+ json=body,
51
+ )
52
+ r.raise_for_status()
53
+ data = r.json()
54
+ content = data["choices"][0]["message"]["content"]
55
+ parsed = json.loads(content)
56
+ sections = parsed.get("sections")
57
+ if not isinstance(sections, list):
58
+ return None
59
+ out = [str(s).strip() for s in sections if str(s).strip()]
60
+ return out or None
61
+ except Exception:
62
+ logger.exception("LLM chunk boundary request failed")
63
+ return None
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field
6
+
7
+
8
class SemanticChunkDraft(BaseModel):
    # A chunk of text plus the range of block ordinals it is attributed to.

    # Unknown fields are rejected.
    model_config = ConfigDict(extra="forbid")

    text: str
    # Both ordinals are validated to be >= 0; their relative order is not checked.
    start_block_ordinal: int = Field(ge=0)
    end_block_ordinal: int = Field(ge=0)
    # Free-form metadata; each instance gets its own fresh dict by default.
    meta: dict[str, Any] = Field(default_factory=dict)
15
+
16
+
17
class RebuildChunksRequest(BaseModel):
    # Request options for rebuilding a document's chunks.

    # Unknown fields are rejected.
    model_config = ConfigDict(extra="forbid")

    # Off by default.
    use_llm_weak_structure: bool = Field(
        default=False,
        description=(
            "If true and structure is weak, call LLM for section boundaries"
        ),
    )
24
+
25
+
26
class RebuildChunksResponse(BaseModel):
    # Result of a chunk rebuild: the document id and a written-chunk count.

    # Unknown fields are rejected.
    model_config = ConfigDict(extra="forbid")

    document_id: str
    chunks_written: int
@@ -0,0 +1,178 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ from dataclasses import dataclass
6
+ from typing import Any
7
+
8
+ from sqlalchemy import text
9
+ from sqlalchemy.ext.asyncio import AsyncSession
10
+
11
+ from app.config import Settings
12
+ from app.services.chunking.llm_boundaries import suggest_sections_via_llm
13
+ from app.services.chunking.schemas import SemanticChunkDraft
14
+ from app.services.chunking.splitters import (
15
+ split_paragraphs,
16
+ structure_chunks_for_text_block,
17
+ thread_chunks_from_envelope,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ @dataclass
24
+ class _BlockRow:
25
+ ordinal: int
26
+ type: str
27
+ meta: dict[str, Any]
28
+
29
+
30
async def _load_blocks(session: AsyncSession, document_id: str) -> list[_BlockRow]:
    """Load the content blocks of a document, ordered by ordinal.

    The ``meta`` column is decoded as JSON. A falsy value, invalid JSON, or
    JSON that is not an object all result in an empty ``meta`` dict.
    """
    query = text(
        "SELECT ordinal, type, meta FROM content_blocks "
        "WHERE document_id = :d ORDER BY ordinal",
    )
    result = await session.execute(query, {"d": document_id})

    loaded: list[_BlockRow] = []
    for record in result.fetchall():
        raw_meta = record[2]
        decoded: Any = None
        if raw_meta:
            try:
                decoded = json.loads(raw_meta)
            except json.JSONDecodeError:
                # Malformed metadata is tolerated and treated as absent.
                decoded = None
        block_meta: dict[str, Any] = decoded if isinstance(decoded, dict) else {}
        loaded.append(
            _BlockRow(
                ordinal=int(record[0]),
                type=str(record[1]),
                meta=block_meta,
            ),
        )
    return loaded
50
+
51
+
52
async def _load_envelope(session: AsyncSession, document_id: str) -> dict[str, Any]:
    """Load and decode ``documents.raw_content`` for a document.

    Returns the decoded JSON object, or ``{}`` when the document is missing,
    the column is NULL, the content is not valid JSON, or the decoded value
    is not a dict.
    """
    result = await session.execute(
        text("SELECT raw_content FROM documents WHERE id = :id LIMIT 1"),
        {"id": document_id},
    )
    record = result.first()
    if record is None:
        return {}
    raw_content = record[0]
    if raw_content is None:
        return {}

    try:
        decoded = json.loads(raw_content)
    except json.JSONDecodeError:
        return {}
    if isinstance(decoded, dict):
        return decoded
    return {}
65
+
66
+
67
+ def _align_thread_chunks_to_blocks(
68
+ drafts: list[SemanticChunkDraft],
69
+ text_blocks: list[_BlockRow],
70
+ ) -> list[SemanticChunkDraft]:
71
+ if not drafts:
72
+ return []
73
+ if not text_blocks:
74
+ return [
75
+ d.model_copy(
76
+ update={"start_block_ordinal": 0, "end_block_ordinal": 0},
77
+ )
78
+ for d in drafts
79
+ ]
80
+ if len(drafts) == len(text_blocks):
81
+ return [
82
+ d.model_copy(
83
+ update={
84
+ "start_block_ordinal": b.ordinal,
85
+ "end_block_ordinal": b.ordinal,
86
+ },
87
+ )
88
+ for d, b in zip(drafts, text_blocks, strict=True)
89
+ ]
90
+ max_ord = max(b.ordinal for b in text_blocks) if text_blocks else 0
91
+ return [
92
+ d.model_copy(
93
+ update={
94
+ "start_block_ordinal": 0,
95
+ "end_block_ordinal": max_ord,
96
+ },
97
+ )
98
+ for d in drafts
99
+ ]
100
+
101
+
102
async def rebuild_semantic_chunks(
    session: AsyncSession,
    *,
    document_id: str,
    use_llm_weak_structure: bool,
    settings: Settings,
) -> int:
    """Replace ``document_chunks`` for a document using structure-aware splitting.

    If the stored envelope yields thread message chunks, those are used and
    aligned to the document's text blocks. Otherwise every non-empty text
    block is split on its own; for a block with at most one paragraph and
    more than 800 characters the LLM is asked for sections, provided
    ``use_llm_weak_structure`` is set and ``settings.openai_api_key`` is
    truthy.

    Existing chunks of the document are deleted, the new ones are inserted
    with consecutive ordinals starting at 0, and the number of inserted
    chunks is returned. No commit is issued here.
    """
    envelope = await _load_envelope(session, document_id)
    all_blocks = await _load_blocks(session, document_id)
    text_blocks = [
        block
        for block in all_blocks
        if block.type == "text" and _block_text_from_row(block)
    ]

    thread_drafts = thread_chunks_from_envelope(envelope)
    drafts: list[SemanticChunkDraft] = []

    if thread_drafts:
        drafts = _align_thread_chunks_to_blocks(thread_drafts, text_blocks)
        logger.debug(
            "document %s: %s thread chunks",
            document_id,
            len(drafts),
        )
    else:
        # text_blocks already holds exactly the non-empty text blocks, in order.
        for block in text_blocks:
            block_text = _block_text_from_row(block)
            wants_llm = (
                use_llm_weak_structure
                and settings.openai_api_key
                and len(split_paragraphs(block_text)) <= 1
                and len(block_text) > 800
            )
            llm_sections: list[str] | None = None
            if wants_llm:
                llm_sections = await suggest_sections_via_llm(
                    block_text,
                    api_key=settings.openai_api_key,
                    model=settings.chunk_llm_model,
                )
            block_drafts = structure_chunks_for_text_block(
                block_text,
                block.ordinal,
                use_llm=use_llm_weak_structure,
                llm_sections=llm_sections,
            )
            drafts.extend(block_drafts)

    await session.execute(
        text("DELETE FROM document_chunks WHERE document_id = :d"),
        {"d": document_id},
    )

    insert_stmt = text(
        "INSERT INTO document_chunks "
        "(document_id, ordinal, text, start_block_ordinal, "
        "end_block_ordinal, meta) "
        "VALUES (:did, :ord, :txt, :sb, :eb, :meta)",
    )
    for position, draft in enumerate(drafts):
        # An empty meta dict is passed to the database as None.
        meta_json = json.dumps(draft.meta) if draft.meta else None
        await session.execute(
            insert_stmt,
            {
                "did": document_id,
                "ord": position,
                "txt": draft.text,
                "sb": draft.start_block_ordinal,
                "eb": draft.end_block_ordinal,
                "meta": meta_json,
            },
        )
    return len(drafts)
174
+
175
+
176
+ def _block_text_from_row(b: _BlockRow) -> str:
177
+ t = b.meta.get("text")
178
+ return t.strip() if isinstance(t, str) else ""
@@ -0,0 +1,214 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from typing import Any
6
+
7
+ from app.services.chunking.schemas import SemanticChunkDraft
8
+
9
+ _HEADING_LINE = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
10
+
11
+
12
+ def _message_body(item: dict[str, Any]) -> str:
13
+ for k in ("text", "body", "content", "message", "markdown", "tweet"):
14
+ v = item.get(k)
15
+ if isinstance(v, str) and v.strip():
16
+ return v.strip()
17
+ return json.dumps(item, ensure_ascii=False)[:16_000]
18
+
19
+
20
+ def extract_thread_message_items(
21
+ envelope: dict[str, Any],
22
+ ) -> list[dict[str, Any]] | None:
23
+ """Detect chat/thread payloads; return list of {thread_id, item, index}."""
24
+ p = envelope.get("payload")
25
+ md = envelope.get("metadata") if isinstance(envelope.get("metadata"), dict) else {}
26
+ tid = md.get("thread_id") or md.get("thread_ts") or md.get("conversation_id")
27
+
28
+ if isinstance(p, list) and p and all(isinstance(x, dict) for x in p):
29
+ return [
30
+ {"thread_id": tid, "item": x, "index": i, "role": x.get("role")}
31
+ for i, x in enumerate(p)
32
+ ]
33
+
34
+ if isinstance(p, dict):
35
+ inner_tid = p.get("thread_id") or tid
36
+ for key in ("messages", "thread", "replies", "posts"):
37
+ msgs = p.get(key)
38
+ if (
39
+ isinstance(msgs, list)
40
+ and msgs
41
+ and all(isinstance(x, dict) for x in msgs)
42
+ ):
43
+ return [
44
+ {
45
+ "thread_id": inner_tid or p.get("thread_id"),
46
+ "item": x,
47
+ "index": i,
48
+ "role": x.get("role"),
49
+ }
50
+ for i, x in enumerate(msgs)
51
+ ]
52
+ return None
53
+
54
+
55
def thread_chunks_from_envelope(
    envelope: dict[str, Any],
    *,
    default_block_ordinal: int = 0,
) -> list[SemanticChunkDraft]:
    """Turn the thread messages of an envelope into one chunk draft each.

    Returns ``[]`` when the envelope holds no thread messages. Every draft
    uses ``default_block_ordinal`` for both ordinals and carries meta with
    ``strategy``, ``message_index`` and ``role``, plus ``thread_id`` and
    ``message_id`` (both as strings) when those values are present.
    """
    rows = extract_thread_message_items(envelope)
    if not rows:
        return []

    drafts: list[SemanticChunkDraft] = []
    for entry in rows:
        message = entry["item"]
        if not isinstance(message, dict):
            continue
        body = _message_body(message)
        if not body:
            continue

        chunk_meta: dict[str, Any] = {
            "strategy": "thread_message",
            "message_index": entry["index"],
            "role": entry.get("role"),
        }
        thread_id = entry.get("thread_id")
        if thread_id is not None:
            chunk_meta["thread_id"] = str(thread_id)
        message_id = message.get("id")
        if message_id is not None:
            chunk_meta["message_id"] = str(message_id)

        draft = SemanticChunkDraft(
            text=body,
            start_block_ordinal=default_block_ordinal,
            end_block_ordinal=default_block_ordinal,
            meta=chunk_meta,
        )
        drafts.append(draft)
    return drafts
90
+
91
+
92
def split_markdown_sections(text: str) -> list[tuple[str, dict[str, Any]]]:
    """Split on ATX headings; each chunk includes its heading line.

    Returns ``(chunk_text, meta)`` pairs in document order, or ``[]`` when
    ``text`` is blank or contains no heading. Text before the first heading,
    if any, becomes a leading part whose ``heading`` is ``None``. Every
    other part runs from one heading up to the next heading (or the end of
    the text) and records ``heading_level`` and ``heading``.
    ``section_index`` is the position of the part in the returned list.
    """
    if not text.strip():
        return []

    # One scan is enough: an empty match list means there are no headings.
    matches = list(_HEADING_LINE.finditer(text))
    if not matches:
        return []

    parts: list[tuple[str, dict[str, Any]]] = []

    preamble = text[: matches[0].start()].strip()
    if preamble:
        parts.append(
            (
                preamble,
                {
                    "strategy": "markdown_section",
                    "section_index": 0,
                    "heading": None,
                },
            ),
        )

    for i, m in enumerate(matches):
        start = m.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        chunk = text[start:end].strip()
        if not chunk:
            continue
        parts.append(
            (
                chunk,
                {
                    "strategy": "markdown_section",
                    "section_index": len(parts),
                    "heading_level": len(m.group(1)),
                    "heading": m.group(2).strip(),
                },
            ),
        )
    return parts
137
+
138
+
139
def split_paragraphs(text: str) -> list[str]:
    """Split text into paragraphs separated by blank lines.

    A separator is a newline, optional whitespace, then one or more
    newlines. Returned paragraphs are stripped and never empty; blank
    input gives ``[]``.
    """
    body = text.strip()
    if not body:
        return []
    paragraphs: list[str] = []
    for piece in re.split(r"\n\s*\n+", body):
        trimmed = piece.strip()
        if trimmed:
            paragraphs.append(trimmed)
    return paragraphs
145
+
146
+
147
def structure_chunks_for_text_block(
    text: str,
    block_ordinal: int,
    *,
    use_llm: bool,
    llm_sections: list[str] | None,
) -> list[SemanticChunkDraft]:
    """Prefer markdown sections, then paragraphs; optional LLM replaces weak splits.

    Args:
        text: Text of one content block; it is stripped before splitting.
        block_ordinal: Ordinal recorded as both start and end of every draft.
        use_llm: Whether LLM-provided sections may be used for weak structure.
        llm_sections: Sections previously obtained from the LLM, if any.

    Returns:
        ``[]`` for blank text, otherwise at least one draft. In order of
        preference: markdown sections; LLM sections (only for "weak" text,
        i.e. at most one paragraph and more than 400 characters, when
        ``use_llm`` is set and at least one section is non-blank); a single
        ``fallback_single`` draft when there is at most one paragraph;
        one draft per paragraph.
    """
    text = text.strip()
    if not text:
        return []

    def _draft(segment: str, meta: dict[str, Any]) -> SemanticChunkDraft:
        return SemanticChunkDraft(
            text=segment,
            start_block_ordinal=block_ordinal,
            end_block_ordinal=block_ordinal,
            meta=meta,
        )

    md_parts = split_markdown_sections(text)
    if len(md_parts) > 1 or (
        len(md_parts) == 1 and md_parts[0][1].get("heading") is not None
    ):
        return [_draft(seg, meta) for seg, meta in md_parts]

    paras = split_paragraphs(text)
    weak = len(paras) <= 1 and len(text) > 400

    if weak and use_llm and llm_sections:
        # section_index keeps the position in the list as supplied.
        usable = [(i, s.strip()) for i, s in enumerate(llm_sections) if s.strip()]
        # If every section is blank, fall through instead of returning no
        # chunks at all, which would drop the block's text.
        if usable:
            return [
                _draft(s, {"strategy": "llm_section", "section_index": i})
                for i, s in usable
            ]

    if len(paras) <= 1:
        if not weak:
            reason = "short_text"
        elif use_llm:
            # LLM use was requested but produced no usable sections; this
            # text is long, so it must not be labelled "short_text".
            reason = "weak_structure_llm_unavailable"
        else:
            reason = "weak_structure_no_llm"
        return [_draft(text, {"strategy": "fallback_single", "reason": reason})]

    return [
        _draft(p, {"strategy": "paragraph", "paragraph_index": i})
        for i, p in enumerate(paras)
    ]