business-stack 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.python-version +1 -0
- package/backend/.env.example +65 -0
- package/backend/alembic/env.py +63 -0
- package/backend/alembic/script.py.mako +26 -0
- package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
- package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
- package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
- package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
- package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
- package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
- package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
- package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
- package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
- package/backend/alembic.ini +42 -0
- package/backend/app/__init__.py +0 -0
- package/backend/app/config.py +337 -0
- package/backend/app/connectors/__init__.py +13 -0
- package/backend/app/connectors/base.py +39 -0
- package/backend/app/connectors/builtins.py +51 -0
- package/backend/app/connectors/playwright_session.py +146 -0
- package/backend/app/connectors/registry.py +68 -0
- package/backend/app/connectors/thread_expansion/__init__.py +33 -0
- package/backend/app/connectors/thread_expansion/fakes.py +154 -0
- package/backend/app/connectors/thread_expansion/models.py +113 -0
- package/backend/app/connectors/thread_expansion/reddit.py +53 -0
- package/backend/app/connectors/thread_expansion/twitter.py +49 -0
- package/backend/app/db.py +5 -0
- package/backend/app/dependencies.py +34 -0
- package/backend/app/logging_config.py +35 -0
- package/backend/app/main.py +97 -0
- package/backend/app/middleware/__init__.py +0 -0
- package/backend/app/middleware/gateway_identity.py +17 -0
- package/backend/app/middleware/openapi_gateway.py +71 -0
- package/backend/app/middleware/request_id.py +23 -0
- package/backend/app/openapi_config.py +126 -0
- package/backend/app/routers/__init__.py +0 -0
- package/backend/app/routers/admin_pipeline.py +123 -0
- package/backend/app/routers/chat.py +206 -0
- package/backend/app/routers/chunks.py +36 -0
- package/backend/app/routers/entity_extract.py +31 -0
- package/backend/app/routers/example.py +8 -0
- package/backend/app/routers/gemini_embed.py +58 -0
- package/backend/app/routers/health.py +28 -0
- package/backend/app/routers/ingestion.py +146 -0
- package/backend/app/routers/link_expansion.py +34 -0
- package/backend/app/routers/pipeline_status.py +304 -0
- package/backend/app/routers/query.py +63 -0
- package/backend/app/routers/vectors.py +63 -0
- package/backend/app/schemas/__init__.py +0 -0
- package/backend/app/schemas/canonical.py +44 -0
- package/backend/app/schemas/chat.py +50 -0
- package/backend/app/schemas/ingest.py +29 -0
- package/backend/app/schemas/query.py +153 -0
- package/backend/app/schemas/vectors.py +56 -0
- package/backend/app/services/__init__.py +0 -0
- package/backend/app/services/chat_store.py +152 -0
- package/backend/app/services/chunking/__init__.py +3 -0
- package/backend/app/services/chunking/llm_boundaries.py +63 -0
- package/backend/app/services/chunking/schemas.py +30 -0
- package/backend/app/services/chunking/semantic_chunk.py +178 -0
- package/backend/app/services/chunking/splitters.py +214 -0
- package/backend/app/services/embeddings/__init__.py +20 -0
- package/backend/app/services/embeddings/build_inputs.py +140 -0
- package/backend/app/services/embeddings/dlq.py +128 -0
- package/backend/app/services/embeddings/gemini_api.py +207 -0
- package/backend/app/services/embeddings/persist.py +74 -0
- package/backend/app/services/embeddings/types.py +32 -0
- package/backend/app/services/embeddings/worker.py +224 -0
- package/backend/app/services/entities/__init__.py +12 -0
- package/backend/app/services/entities/gliner_extract.py +63 -0
- package/backend/app/services/entities/llm_extract.py +94 -0
- package/backend/app/services/entities/pipeline.py +179 -0
- package/backend/app/services/entities/spacy_extract.py +63 -0
- package/backend/app/services/entities/types.py +15 -0
- package/backend/app/services/gemini_chat.py +113 -0
- package/backend/app/services/hooks/__init__.py +3 -0
- package/backend/app/services/hooks/post_ingest.py +186 -0
- package/backend/app/services/ingestion/__init__.py +0 -0
- package/backend/app/services/ingestion/persist.py +188 -0
- package/backend/app/services/integrations_remote.py +91 -0
- package/backend/app/services/link_expansion/__init__.py +3 -0
- package/backend/app/services/link_expansion/canonical_url.py +45 -0
- package/backend/app/services/link_expansion/domain_policy.py +26 -0
- package/backend/app/services/link_expansion/html_extract.py +72 -0
- package/backend/app/services/link_expansion/rate_limit.py +32 -0
- package/backend/app/services/link_expansion/robots.py +46 -0
- package/backend/app/services/link_expansion/schemas.py +67 -0
- package/backend/app/services/link_expansion/worker.py +458 -0
- package/backend/app/services/normalization/__init__.py +7 -0
- package/backend/app/services/normalization/normalizer.py +331 -0
- package/backend/app/services/normalization/persist_normalized.py +67 -0
- package/backend/app/services/playwright_extract/__init__.py +13 -0
- package/backend/app/services/playwright_extract/__main__.py +96 -0
- package/backend/app/services/playwright_extract/extract.py +181 -0
- package/backend/app/services/retrieval_service.py +351 -0
- package/backend/app/sqlite_ext.py +36 -0
- package/backend/app/storage/__init__.py +3 -0
- package/backend/app/storage/blobs.py +30 -0
- package/backend/app/vectorstore/__init__.py +13 -0
- package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
- package/backend/backend.egg-info/PKG-INFO +18 -0
- package/backend/backend.egg-info/SOURCES.txt +93 -0
- package/backend/backend.egg-info/dependency_links.txt +1 -0
- package/backend/backend.egg-info/entry_points.txt +2 -0
- package/backend/backend.egg-info/requires.txt +15 -0
- package/backend/backend.egg-info/top_level.txt +4 -0
- package/backend/package.json +15 -0
- package/backend/pyproject.toml +52 -0
- package/backend/tests/conftest.py +40 -0
- package/backend/tests/test_chat.py +92 -0
- package/backend/tests/test_chunking.py +132 -0
- package/backend/tests/test_entities.py +170 -0
- package/backend/tests/test_gemini_embed.py +224 -0
- package/backend/tests/test_health.py +24 -0
- package/backend/tests/test_ingest_raw.py +123 -0
- package/backend/tests/test_link_expansion.py +241 -0
- package/backend/tests/test_main.py +12 -0
- package/backend/tests/test_normalizer.py +114 -0
- package/backend/tests/test_openapi_gateway.py +40 -0
- package/backend/tests/test_pipeline_hardening.py +285 -0
- package/backend/tests/test_pipeline_status.py +71 -0
- package/backend/tests/test_playwright_extract.py +80 -0
- package/backend/tests/test_post_ingest_hooks.py +162 -0
- package/backend/tests/test_query.py +165 -0
- package/backend/tests/test_thread_expansion.py +72 -0
- package/backend/tests/test_vectors.py +85 -0
- package/backend/uv.lock +1839 -0
- package/bin/business-stack.cjs +412 -0
- package/frontend/web/.env.example +23 -0
- package/frontend/web/AGENTS.md +5 -0
- package/frontend/web/CLAUDE.md +1 -0
- package/frontend/web/README.md +36 -0
- package/frontend/web/components.json +25 -0
- package/frontend/web/next-env.d.ts +6 -0
- package/frontend/web/next.config.ts +30 -0
- package/frontend/web/package.json +65 -0
- package/frontend/web/postcss.config.mjs +7 -0
- package/frontend/web/skills-lock.json +35 -0
- package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
- package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
- package/frontend/web/src/app/chat/page.tsx +725 -0
- package/frontend/web/src/app/favicon.ico +0 -0
- package/frontend/web/src/app/globals.css +563 -0
- package/frontend/web/src/app/layout.tsx +50 -0
- package/frontend/web/src/app/page.tsx +96 -0
- package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
- package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
- package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
- package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
- package/frontend/web/src/components/home-auth-panel.tsx +49 -0
- package/frontend/web/src/components/providers.tsx +50 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
- package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
- package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
- package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
- package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
- package/frontend/web/src/lib/auth-client.ts +23 -0
- package/frontend/web/src/lib/integrations-config.ts +125 -0
- package/frontend/web/src/lib/ui-utills.tsx +90 -0
- package/frontend/web/src/lib/utils.ts +6 -0
- package/frontend/web/tsconfig.json +36 -0
- package/frontend/web/tsconfig.tsbuildinfo +1 -0
- package/frontend/web/vitest.config.ts +14 -0
- package/gateway/.env.example +23 -0
- package/gateway/README.md +13 -0
- package/gateway/package.json +24 -0
- package/gateway/src/auth.ts +49 -0
- package/gateway/src/index.ts +141 -0
- package/gateway/src/integrations/admin.ts +19 -0
- package/gateway/src/integrations/crypto.ts +52 -0
- package/gateway/src/integrations/handlers.ts +124 -0
- package/gateway/src/integrations/keys.ts +12 -0
- package/gateway/src/integrations/store.ts +106 -0
- package/gateway/src/stack-secrets.ts +35 -0
- package/gateway/tsconfig.json +13 -0
- package/package.json +33 -0
- package/turbo.json +27 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Gemini embedding generation (single- or multi-modal, batched).
|
|
3
|
+
|
|
4
|
+
Public entrypoints:
|
|
5
|
+
|
|
6
|
+
- ``batch_embed_contents`` — low-level API batch (list of interleaved part lists).
|
|
7
|
+
|
|
8
|
+
Import ``run_embed_document_job`` / ``embed_document_gemini`` from
|
|
9
|
+
``app.services.embeddings.worker`` to avoid circular imports with the entity pipeline.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from app.services.embeddings.gemini_api import batch_embed_contents
|
|
13
|
+
from app.services.embeddings.types import EmbeddingPart, InlineDataPart, TextPart
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"EmbeddingPart",
|
|
17
|
+
"InlineDataPart",
|
|
18
|
+
"TextPart",
|
|
19
|
+
"batch_embed_contents",
|
|
20
|
+
]
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from sqlalchemy import text
|
|
8
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
9
|
+
|
|
10
|
+
from app.services.embeddings.types import EmbeddingPart, InlineDataPart, TextPart
|
|
11
|
+
from app.services.normalization.normalizer import parse_blob_sha256
|
|
12
|
+
from app.storage.blobs import BlobStore
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True, slots=True)
class ChunkRow:
    """One row of ``document_chunks``, as loaded by ``load_document_chunks``."""

    # Chunk primary key (used as ``chunk_pk`` in vector-store pointers).
    id: int
    # Ordering key of the chunk within its document.
    ordinal: int
    # Stored chunk text.
    text: str
    # Inclusive span of ``content_blocks`` ordinals this chunk covers.
    start_block_ordinal: int
    end_block_ordinal: int
    # Raw ``meta`` column value (JSON string); None when empty/absent.
    meta_raw: str | None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True, slots=True)
class BlockRow:
    """One row of ``content_blocks`` inside a chunk's ordinal span."""

    # Ordering key of the block within its document.
    ordinal: int
    # Block kind; the embedding code branches on "text" and "image".
    type: str
    # Parsed ``meta`` JSON; empty dict when the column is missing or invalid.
    meta: dict
    # Pointer to the stored blob for binary blocks; None for text-only blocks.
    storage_uri: str | None
    # Content hash used to read the blob; None when not recorded.
    sha256: str | None
    # MIME type of the binary payload, when known.
    mime: str | None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
async def load_document_chunks(
    session: AsyncSession,
    document_id: str,
) -> list[ChunkRow]:
    """Fetch all ``document_chunks`` rows for *document_id*, ordered by ordinal."""
    query = text(
        "SELECT id, ordinal, text, start_block_ordinal, end_block_ordinal, meta "
        "FROM document_chunks WHERE document_id = :d ORDER BY ordinal",
    )
    result = await session.execute(query, {"d": document_id})
    chunks: list[ChunkRow] = []
    for pk, ordinal, body, start_ord, end_ord, meta in result.fetchall():
        chunks.append(
            ChunkRow(
                id=int(pk),
                ordinal=int(ordinal),
                text=str(body),
                start_block_ordinal=int(start_ord),
                end_block_ordinal=int(end_ord),
                # Empty-string meta is treated the same as NULL.
                meta_raw=meta if meta else None,
            )
        )
    return chunks
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
async def load_blocks_span(
    session: AsyncSession,
    document_id: str,
    start_ord: int,
    end_ord: int,
) -> list[BlockRow]:
    """Fetch ``content_blocks`` rows with ordinal in [start_ord, end_ord], ordered."""
    query = text(
        "SELECT ordinal, type, meta, storage_uri, sha256, mime "
        "FROM content_blocks WHERE document_id = :d "
        "AND ordinal >= :s AND ordinal <= :e "
        "ORDER BY ordinal",
    )
    result = await session.execute(
        query,
        {"d": document_id, "s": start_ord, "e": end_ord},
    )
    blocks: list[BlockRow] = []
    for ordinal, kind, meta_raw, storage_uri, sha, mime in result.fetchall():
        # Tolerate missing or corrupt meta JSON: fall back to an empty dict.
        parsed_meta: dict = {}
        if meta_raw:
            try:
                candidate = json.loads(meta_raw)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict):
                parsed_meta = candidate
        blocks.append(
            BlockRow(
                ordinal=int(ordinal),
                type=str(kind),
                meta=parsed_meta,
                # Empty strings collapse to None, matching ChunkRow handling.
                storage_uri=storage_uri if storage_uri else None,
                sha256=sha if sha else None,
                mime=mime if mime else None,
            )
        )
    return blocks
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def chunk_to_text_parts(chunk: ChunkRow) -> list[EmbeddingPart]:
    """Build a single text-only embed request from a stored chunk row.

    A whitespace-only chunk maps to a single-space part so the request body is
    never empty.
    """
    body = chunk.text
    return [TextPart(body)] if body.strip() else [TextPart(" ")]
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
async def chunk_to_multimodal_parts(
    session: AsyncSession,
    document_id: str,
    chunk: ChunkRow,
    blob_store: BlobStore,
) -> list[EmbeddingPart]:
    """Interleaved parts from content_blocks in the chunk's ordinal span.

    Text blocks contribute a stripped ``TextPart``; image blocks contribute an
    ``InlineDataPart`` read from *blob_store*. Blocks of other types, blocks
    with no resolvable sha256, and blocks whose blob cannot be read are
    skipped. Falls back to ``chunk_to_text_parts`` when nothing usable is
    found, so the caller always gets at least one part.
    """
    blocks = await load_blocks_span(
        session,
        document_id,
        chunk.start_block_ordinal,
        chunk.end_block_ordinal,
    )
    parts: list[EmbeddingPart] = []
    for b in blocks:
        if b.type == "text":
            t = b.meta.get("text")
            # Only non-blank strings become parts; anything else is dropped.
            if isinstance(t, str) and t.strip():
                parts.append(TextPart(t.strip()))
            continue
        if b.type == "image" and b.storage_uri:
            # Prefer the stored hash; otherwise derive it from the URI.
            sha = b.sha256 or parse_blob_sha256(b.storage_uri)
            if not sha:
                continue
            try:
                data = blob_store.read_bytes(sha)
            except OSError:
                # Missing/unreadable blob is non-fatal: log and keep going.
                logger.warning("missing blob %s for document %s", sha, document_id)
                continue
            # NOTE(review): assumes PNG when mime is unset — confirm upstream.
            mime = b.mime or "image/png"
            parts.append(InlineDataPart(mime_type=mime, data=data))
    if not parts:
        return chunk_to_text_parts(chunk)
    return parts
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from datetime import UTC, datetime, timedelta
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from sqlalchemy import text
|
|
8
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
9
|
+
|
|
10
|
+
from app.config import Settings
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def merge_ingest_meta(existing: str | None, patch: dict[str, Any]) -> str:
|
|
14
|
+
base: dict[str, Any] = {}
|
|
15
|
+
if existing:
|
|
16
|
+
try:
|
|
17
|
+
parsed = json.loads(existing)
|
|
18
|
+
if isinstance(parsed, dict):
|
|
19
|
+
base = parsed
|
|
20
|
+
except json.JSONDecodeError:
|
|
21
|
+
base = {"_raw_ingest_meta": existing}
|
|
22
|
+
base.update(patch)
|
|
23
|
+
return json.dumps(base, default=str)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def cleaned_ingest_meta_dict(ingest_meta: str | None) -> dict[str, Any]:
|
|
27
|
+
"""Drop embedding failure / DLQ keys so a successful embed can overwrite status."""
|
|
28
|
+
base: dict[str, Any] = {}
|
|
29
|
+
if ingest_meta:
|
|
30
|
+
try:
|
|
31
|
+
parsed = json.loads(ingest_meta)
|
|
32
|
+
if isinstance(parsed, dict):
|
|
33
|
+
base = parsed
|
|
34
|
+
except json.JSONDecodeError:
|
|
35
|
+
base = {}
|
|
36
|
+
for k in ("embedding_error", "embedding_dlq_attempts", "embedding_dlq_state"):
|
|
37
|
+
base.pop(k, None)
|
|
38
|
+
return base
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
async def clear_embedding_dlq(session: AsyncSession, *, document_id: str) -> None:
    """Remove any embedding DLQ entry for *document_id*."""
    stmt = text("DELETE FROM embedding_dlq WHERE document_id = :d")
    await session.execute(stmt, {"d": document_id})
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
async def record_embedding_dlq_failure(
    session: AsyncSession,
    *,
    document_id: str,
    error_message: str,
    multimodal: bool,
    settings: Settings,
    prior_ingest_meta: str | None,
) -> None:
    """
    Increment attempts, set exponential ``next_retry_at``, or mark ``dead`` when
    attempts exceed ``embedding_dlq_max_attempts``.

    Side effects: upserts the ``embedding_dlq`` row for *document_id* and
    updates ``documents.status`` / ``documents.ingest_meta`` to reflect the
    failure ("partial" while retries remain, "failed" once dead).
    """
    # Read the current attempt count (if any) and bump it for this failure.
    row = await session.execute(
        text(
            "SELECT attempt_count FROM embedding_dlq WHERE document_id = :d LIMIT 1",
        ),
        {"d": document_id},
    )
    found = row.first()
    attempts = int(found[0]) + 1 if found is not None else 1
    max_a = settings.embedding_dlq_max_attempts
    base_delay = settings.embedding_dlq_base_delay_s

    if attempts >= max_a:
        # Retries exhausted: park the entry and mark the document failed.
        state = "dead"
        next_retry_at: str | None = None
        doc_status = "failed"
    else:
        # Exponential backoff: base * 2^(attempts-1), capped at the configured
        # maximum. Timestamps are UTC in second-resolution ISO-8601 with "Z".
        state = "pending_retry"
        delay_s = base_delay * (2 ** (attempts - 1))
        cap = settings.embedding_dlq_max_backoff_s
        delay_s = min(delay_s, cap)
        next_at = datetime.now(UTC) + timedelta(seconds=delay_s)
        next_retry_at = next_at.strftime("%Y-%m-%dT%H:%M:%SZ")
        doc_status = "partial"

    now_iso = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
    # Upsert keyed on document_id: one DLQ row per document, always reflecting
    # the latest failure.
    await session.execute(
        text(
            """
            INSERT INTO embedding_dlq (
                document_id, last_error, attempt_count, next_retry_at, state,
                multimodal, updated_at
            ) VALUES (
                :doc, :err, :ac, :nra, :st, :mm, :up
            )
            ON CONFLICT(document_id) DO UPDATE SET
                last_error = excluded.last_error,
                attempt_count = excluded.attempt_count,
                next_retry_at = excluded.next_retry_at,
                state = excluded.state,
                multimodal = excluded.multimodal,
                updated_at = excluded.updated_at
            """,
        ),
        {
            "doc": document_id,
            # Truncated to keep the row bounded; 8000 chars in the DLQ,
            # 2000 in ingest_meta below.
            "err": error_message[:8000],
            "ac": attempts,
            "nra": next_retry_at,
            "st": state,
            "mm": 1 if multimodal else 0,
            "up": now_iso,
        },
    )

    # Mirror the failure into documents.ingest_meta so status endpoints can
    # surface it without joining the DLQ table.
    im = merge_ingest_meta(
        prior_ingest_meta,
        {
            "embedding_error": error_message[:2000],
            "embedding_dlq_attempts": attempts,
            "embedding_dlq_state": state,
        },
    )
    await session.execute(
        text(
            "UPDATE documents SET status = :st, ingest_meta = :im WHERE id = :id",
        ),
        {"st": doc_status, "im": im, "id": document_id},
    )
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import base64
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import httpx
|
|
9
|
+
|
|
10
|
+
from app.config import Settings
|
|
11
|
+
from app.services.embeddings.types import EmbeddingPart, InlineDataPart, TextPart
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
_GEMINI_BASE = "https://generativelanguage.googleapis.com/v1beta"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _part_to_api_dict(part: EmbeddingPart) -> dict[str, Any]:
    """Serialize one embedding part into the REST ``parts`` wire format."""
    if isinstance(part, TextPart):
        return {"text": part.text}
    if isinstance(part, InlineDataPart):
        encoded = base64.standard_b64encode(part.data).decode("ascii")
        return {"inlineData": {"mimeType": part.mime_type, "data": encoded}}
    raise TypeError(f"unknown part type: {type(part)}")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _build_request_item(
    *,
    model: str,
    parts: list[EmbeddingPart],
    task_type: str,
    output_dimensionality: int | None,
) -> dict[str, Any]:
    """Build one entry of the ``batchEmbedContents`` request list."""
    serialized = [_part_to_api_dict(p) for p in parts]
    request: dict[str, Any] = {
        "model": f"models/{model}",
        "content": {"parts": serialized},
        "taskType": task_type,
    }
    # Omit the field entirely when unset so the server default applies.
    if output_dimensionality is not None:
        request["outputDimensionality"] = output_dimensionality
    return request
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _normalize_vector(values: list[float], target_dim: int) -> list[float]:
|
|
51
|
+
if len(values) == target_dim:
|
|
52
|
+
return values
|
|
53
|
+
if len(values) > target_dim:
|
|
54
|
+
return values[:target_dim]
|
|
55
|
+
msg = f"embedding dim {len(values)} < target {target_dim}"
|
|
56
|
+
raise ValueError(msg)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
async def _post_batch_once(
    client: httpx.AsyncClient,
    url: str,
    headers: dict[str, str],
    payload: dict[str, Any],
    target_dim: int,
) -> tuple[list[list[float]] | None, bool]:
    """
    Returns (vectors, reject_output_dimensionality).
    Second flag True if caller should retry without outputDimensionality.

    Raises ``RuntimeError`` on retryable HTTP statuses and on malformed
    response bodies; ``httpx.HTTPStatusError`` on other non-2xx responses.
    """
    r = await client.post(url, headers=headers, json=payload)
    # A 400 that mentions outputDimensionality means the model rejects the
    # field; signal the caller to retry without it rather than failing.
    if r.status_code == 400 and "outputDimensionality" in r.text:
        return None, True
    # Rate limiting / transient server errors: raise so the retry wrapper
    # backs off and tries again.
    if r.status_code in (429, 500, 502, 503):
        msg = f"retryable HTTP {r.status_code}"
        raise RuntimeError(msg)
    r.raise_for_status()
    data = r.json()
    embs = data.get("embeddings")
    if not isinstance(embs, list):
        # Truncate the repr so log/DLQ messages stay bounded.
        msg = f"unexpected response: {data!r}"[:500]
        raise RuntimeError(msg)
    out: list[list[float]] = []
    for emb in embs:
        vals = emb.get("values")
        if not isinstance(vals, list):
            msg = f"bad embedding entry: {emb!r}"[:300]
            raise RuntimeError(msg)
        floats = [float(x) for x in vals]
        # Truncate oversized vectors to the target dimension client-side.
        out.append(_normalize_vector(floats, target_dim))
    return out, False
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
async def _post_batch_with_retry(
    client: httpx.AsyncClient,
    url: str,
    headers: dict[str, str],
    payload: dict[str, Any],
    target_dim: int,
    settings: Settings,
) -> list[list[float]] | None:
    """Post one batch with exponential-backoff retries.

    Returns the vectors on success. Returns ``None`` in two distinct cases the
    caller must disambiguate: the server rejected ``outputDimensionality``
    (retry the same batch without the field), or all retry attempts failed.
    Never raises — failures are logged and collapsed into ``None``.
    """
    delay = settings.gemini_embed_base_delay_s
    attempts = max(1, settings.gemini_embed_max_retries)
    for attempt in range(attempts):
        try:
            vecs, reject_od = await _post_batch_once(
                client,
                url,
                headers,
                payload,
                target_dim,
            )
            if reject_od:
                # Do not burn retry attempts on a deterministic rejection.
                return None
            return vecs
        except (httpx.HTTPError, RuntimeError, ValueError, KeyError, TypeError) as e:
            logger.warning(
                "Gemini embed batch attempt %s/%s: %s",
                attempt + 1,
                attempts,
                e,
            )
            if attempt == attempts - 1:
                # Final attempt: log with traceback, then give up.
                logger.exception("Gemini embed batch failed")
                return None
            # Exponential backoff: delay * 2^attempt.
            await asyncio.sleep(delay * (2**attempt))
    return None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
async def batch_embed_contents(
    *,
    api_key: str,
    model: str,
    contents: list[list[EmbeddingPart]],
    settings: Settings,
    task_type: str = "RETRIEVAL_DOCUMENT",
    client: httpx.AsyncClient | None = None,
) -> list[list[float]]:
    """
    Batch Gemini ``batchEmbedContents`` calls.

    Each inner list is one embed request: a single ``TextPart`` or an
    interleaved list of ``TextPart`` / ``InlineDataPart`` (multimodal).

    Requests are sent in batches of ``settings.gemini_embed_batch_size``. If
    the model rejects ``outputDimensionality`` the batch is retried without
    the field (and it is dropped for all later batches); vectors are then
    truncated client-side to ``settings.vector_embedding_dim``. Raises
    ``RuntimeError`` when a batch still fails after retries. A caller-supplied
    *client* is left open; an internally created one is always closed.
    """
    if not contents:
        return []
    target_dim = settings.vector_embedding_dim
    url = f"{_GEMINI_BASE}/models/{model}:batchEmbedContents"
    headers = {
        "x-goog-api-key": api_key,
        "Content-Type": "application/json",
    }

    def _payload(batch: list[list[EmbeddingPart]], od: int | None) -> dict[str, Any]:
        # Shared by the normal path and the no-outputDimensionality fallback,
        # which previously duplicated this construction inline.
        return {
            "requests": [
                _build_request_item(
                    model=model,
                    parts=parts,
                    task_type=task_type,
                    output_dimensionality=od,
                )
                for parts in batch
            ],
        }

    use_od: int | None = target_dim
    close_client = False
    if client is None:
        client = httpx.AsyncClient(timeout=120.0)
        close_client = True
    try:
        all_out: list[list[float]] = []
        batch_size = max(1, settings.gemini_embed_batch_size)
        for start in range(0, len(contents), batch_size):
            batch = contents[start : start + batch_size]
            vecs = await _post_batch_with_retry(
                client,
                url,
                headers,
                _payload(batch, use_od),
                target_dim,
                settings,
            )
            if vecs is None and use_od is not None:
                # None from the retry wrapper here means either the model
                # rejected outputDimensionality or retries were exhausted;
                # in both cases try once more without the field, and stop
                # sending it for subsequent batches.
                use_od = None
                vecs = await _post_batch_with_retry(
                    client,
                    url,
                    headers,
                    _payload(batch, None),
                    target_dim,
                    settings,
                )
            if vecs is None:
                msg = "Gemini batchEmbedContents failed after retries"
                raise RuntimeError(msg)
            all_out.extend(vecs)
        return all_out
    finally:
        if close_client:
            await client.aclose()
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import UTC, datetime
|
|
4
|
+
|
|
5
|
+
from sqlalchemy import text
|
|
6
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
7
|
+
|
|
8
|
+
from app.config import Settings
|
|
9
|
+
from app.services.embeddings.build_inputs import ChunkRow
|
|
10
|
+
from app.vectorstore import SqliteVecStore, VectorMeta
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def vector_store_pointer(document_id: str, chunk_pk: int) -> str:
    """Opaque reference to a vector row in the ``kb_vec_embeddings`` table."""
    return ":".join(("sqlite-vec", "kb_vec_embeddings", document_id, str(chunk_pk)))
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
async def delete_gemini_embeddings_for_document(
    session: AsyncSession,
    *,
    document_id: str,
    model: str,
) -> None:
    """Drop all ``embeddings`` pointer rows for one document/model pair."""
    stmt = text("DELETE FROM embeddings WHERE document_id = :d AND model = :m")
    await session.execute(stmt, {"d": document_id, "m": model})
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
async def persist_chunk_embeddings(
    session: AsyncSession,
    store: SqliteVecStore,
    *,
    document_id: str,
    source_id: int,
    model: str,
    chunks: list[ChunkRow],
    vectors: list[list[float]],
    modalities: list[str],
    settings: Settings,
) -> None:
    """Write chunk vectors to the vector store and register pointer rows.

    ``chunks``, ``vectors``, and ``modalities`` must be parallel lists; raises
    ``ValueError`` otherwise. One ``embeddings`` pointer row is inserted per
    chunk so SQL joins can locate vectors without loading them.
    """
    if len(chunks) != len(vectors) or len(chunks) != len(modalities):
        msg = "chunks, vectors, and modalities must align"
        raise ValueError(msg)
    ts = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
    dim = settings.vector_embedding_dim

    metas = [
        VectorMeta(
            document_id=document_id,
            chunk_id=ch.id,
            source_id=source_id,
            modality=mod,
            ingested_at=ts,
        )
        for ch, mod in zip(chunks, modalities, strict=True)
    ]

    await store.upsert_for_session(session, vectors, metas)

    # NOTE(review): chunk_id is inserted as NULL while the chunk PK is encoded
    # in the pointer string — confirm this is intentional before changing it.
    rows = [
        {
            "doc": document_id,
            "model": model,
            "dim": dim,
            "ref": vector_store_pointer(document_id, ch.id),
        }
        for ch in chunks
    ]
    if rows:
        # executemany: one statement with a parameter list instead of one
        # round-trip per chunk.
        await session.execute(
            text(
                "INSERT INTO embeddings "
                "(document_id, chunk_id, model, dim, vector_store_ref) "
                "VALUES (:doc, NULL, :model, :dim, :ref)",
            ),
            rows,
        )
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass(frozen=True, slots=True)
|
|
7
|
+
class TextPart:
|
|
8
|
+
text: str
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True, slots=True)
|
|
12
|
+
class InlineDataPart:
|
|
13
|
+
"""Binary payload as base64 in API (image/video/audio snippet)."""
|
|
14
|
+
|
|
15
|
+
mime_type: str
|
|
16
|
+
data: bytes
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
EmbeddingPart = TextPart | InlineDataPart
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def parts_to_modality(parts: list[EmbeddingPart]) -> str:
|
|
23
|
+
if not parts:
|
|
24
|
+
return "text"
|
|
25
|
+
if len(parts) == 1 and isinstance(parts[0], TextPart):
|
|
26
|
+
return "text"
|
|
27
|
+
kinds = {type(p).__name__ for p in parts}
|
|
28
|
+
if "InlineDataPart" in kinds and "TextPart" in kinds:
|
|
29
|
+
return "multimodal"
|
|
30
|
+
if "InlineDataPart" in kinds:
|
|
31
|
+
return "image"
|
|
32
|
+
return "text"
|