business-stack 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.python-version +1 -0
- package/backend/.env.example +65 -0
- package/backend/alembic/env.py +63 -0
- package/backend/alembic/script.py.mako +26 -0
- package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
- package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
- package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
- package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
- package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
- package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
- package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
- package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
- package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
- package/backend/alembic.ini +42 -0
- package/backend/app/__init__.py +0 -0
- package/backend/app/config.py +337 -0
- package/backend/app/connectors/__init__.py +13 -0
- package/backend/app/connectors/base.py +39 -0
- package/backend/app/connectors/builtins.py +51 -0
- package/backend/app/connectors/playwright_session.py +146 -0
- package/backend/app/connectors/registry.py +68 -0
- package/backend/app/connectors/thread_expansion/__init__.py +33 -0
- package/backend/app/connectors/thread_expansion/fakes.py +154 -0
- package/backend/app/connectors/thread_expansion/models.py +113 -0
- package/backend/app/connectors/thread_expansion/reddit.py +53 -0
- package/backend/app/connectors/thread_expansion/twitter.py +49 -0
- package/backend/app/db.py +5 -0
- package/backend/app/dependencies.py +34 -0
- package/backend/app/logging_config.py +35 -0
- package/backend/app/main.py +97 -0
- package/backend/app/middleware/__init__.py +0 -0
- package/backend/app/middleware/gateway_identity.py +17 -0
- package/backend/app/middleware/openapi_gateway.py +71 -0
- package/backend/app/middleware/request_id.py +23 -0
- package/backend/app/openapi_config.py +126 -0
- package/backend/app/routers/__init__.py +0 -0
- package/backend/app/routers/admin_pipeline.py +123 -0
- package/backend/app/routers/chat.py +206 -0
- package/backend/app/routers/chunks.py +36 -0
- package/backend/app/routers/entity_extract.py +31 -0
- package/backend/app/routers/example.py +8 -0
- package/backend/app/routers/gemini_embed.py +58 -0
- package/backend/app/routers/health.py +28 -0
- package/backend/app/routers/ingestion.py +146 -0
- package/backend/app/routers/link_expansion.py +34 -0
- package/backend/app/routers/pipeline_status.py +304 -0
- package/backend/app/routers/query.py +63 -0
- package/backend/app/routers/vectors.py +63 -0
- package/backend/app/schemas/__init__.py +0 -0
- package/backend/app/schemas/canonical.py +44 -0
- package/backend/app/schemas/chat.py +50 -0
- package/backend/app/schemas/ingest.py +29 -0
- package/backend/app/schemas/query.py +153 -0
- package/backend/app/schemas/vectors.py +56 -0
- package/backend/app/services/__init__.py +0 -0
- package/backend/app/services/chat_store.py +152 -0
- package/backend/app/services/chunking/__init__.py +3 -0
- package/backend/app/services/chunking/llm_boundaries.py +63 -0
- package/backend/app/services/chunking/schemas.py +30 -0
- package/backend/app/services/chunking/semantic_chunk.py +178 -0
- package/backend/app/services/chunking/splitters.py +214 -0
- package/backend/app/services/embeddings/__init__.py +20 -0
- package/backend/app/services/embeddings/build_inputs.py +140 -0
- package/backend/app/services/embeddings/dlq.py +128 -0
- package/backend/app/services/embeddings/gemini_api.py +207 -0
- package/backend/app/services/embeddings/persist.py +74 -0
- package/backend/app/services/embeddings/types.py +32 -0
- package/backend/app/services/embeddings/worker.py +224 -0
- package/backend/app/services/entities/__init__.py +12 -0
- package/backend/app/services/entities/gliner_extract.py +63 -0
- package/backend/app/services/entities/llm_extract.py +94 -0
- package/backend/app/services/entities/pipeline.py +179 -0
- package/backend/app/services/entities/spacy_extract.py +63 -0
- package/backend/app/services/entities/types.py +15 -0
- package/backend/app/services/gemini_chat.py +113 -0
- package/backend/app/services/hooks/__init__.py +3 -0
- package/backend/app/services/hooks/post_ingest.py +186 -0
- package/backend/app/services/ingestion/__init__.py +0 -0
- package/backend/app/services/ingestion/persist.py +188 -0
- package/backend/app/services/integrations_remote.py +91 -0
- package/backend/app/services/link_expansion/__init__.py +3 -0
- package/backend/app/services/link_expansion/canonical_url.py +45 -0
- package/backend/app/services/link_expansion/domain_policy.py +26 -0
- package/backend/app/services/link_expansion/html_extract.py +72 -0
- package/backend/app/services/link_expansion/rate_limit.py +32 -0
- package/backend/app/services/link_expansion/robots.py +46 -0
- package/backend/app/services/link_expansion/schemas.py +67 -0
- package/backend/app/services/link_expansion/worker.py +458 -0
- package/backend/app/services/normalization/__init__.py +7 -0
- package/backend/app/services/normalization/normalizer.py +331 -0
- package/backend/app/services/normalization/persist_normalized.py +67 -0
- package/backend/app/services/playwright_extract/__init__.py +13 -0
- package/backend/app/services/playwright_extract/__main__.py +96 -0
- package/backend/app/services/playwright_extract/extract.py +181 -0
- package/backend/app/services/retrieval_service.py +351 -0
- package/backend/app/sqlite_ext.py +36 -0
- package/backend/app/storage/__init__.py +3 -0
- package/backend/app/storage/blobs.py +30 -0
- package/backend/app/vectorstore/__init__.py +13 -0
- package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
- package/backend/backend.egg-info/PKG-INFO +18 -0
- package/backend/backend.egg-info/SOURCES.txt +93 -0
- package/backend/backend.egg-info/dependency_links.txt +1 -0
- package/backend/backend.egg-info/entry_points.txt +2 -0
- package/backend/backend.egg-info/requires.txt +15 -0
- package/backend/backend.egg-info/top_level.txt +4 -0
- package/backend/package.json +15 -0
- package/backend/pyproject.toml +52 -0
- package/backend/tests/conftest.py +40 -0
- package/backend/tests/test_chat.py +92 -0
- package/backend/tests/test_chunking.py +132 -0
- package/backend/tests/test_entities.py +170 -0
- package/backend/tests/test_gemini_embed.py +224 -0
- package/backend/tests/test_health.py +24 -0
- package/backend/tests/test_ingest_raw.py +123 -0
- package/backend/tests/test_link_expansion.py +241 -0
- package/backend/tests/test_main.py +12 -0
- package/backend/tests/test_normalizer.py +114 -0
- package/backend/tests/test_openapi_gateway.py +40 -0
- package/backend/tests/test_pipeline_hardening.py +285 -0
- package/backend/tests/test_pipeline_status.py +71 -0
- package/backend/tests/test_playwright_extract.py +80 -0
- package/backend/tests/test_post_ingest_hooks.py +162 -0
- package/backend/tests/test_query.py +165 -0
- package/backend/tests/test_thread_expansion.py +72 -0
- package/backend/tests/test_vectors.py +85 -0
- package/backend/uv.lock +1839 -0
- package/bin/business-stack.cjs +412 -0
- package/frontend/web/.env.example +23 -0
- package/frontend/web/AGENTS.md +5 -0
- package/frontend/web/CLAUDE.md +1 -0
- package/frontend/web/README.md +36 -0
- package/frontend/web/components.json +25 -0
- package/frontend/web/next-env.d.ts +6 -0
- package/frontend/web/next.config.ts +30 -0
- package/frontend/web/package.json +65 -0
- package/frontend/web/postcss.config.mjs +7 -0
- package/frontend/web/skills-lock.json +35 -0
- package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
- package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
- package/frontend/web/src/app/chat/page.tsx +725 -0
- package/frontend/web/src/app/favicon.ico +0 -0
- package/frontend/web/src/app/globals.css +563 -0
- package/frontend/web/src/app/layout.tsx +50 -0
- package/frontend/web/src/app/page.tsx +96 -0
- package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
- package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
- package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
- package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
- package/frontend/web/src/components/home-auth-panel.tsx +49 -0
- package/frontend/web/src/components/providers.tsx +50 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
- package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
- package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
- package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
- package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
- package/frontend/web/src/lib/auth-client.ts +23 -0
- package/frontend/web/src/lib/integrations-config.ts +125 -0
- package/frontend/web/src/lib/ui-utills.tsx +90 -0
- package/frontend/web/src/lib/utils.ts +6 -0
- package/frontend/web/tsconfig.json +36 -0
- package/frontend/web/tsconfig.tsbuildinfo +1 -0
- package/frontend/web/vitest.config.ts +14 -0
- package/gateway/.env.example +23 -0
- package/gateway/README.md +13 -0
- package/gateway/package.json +24 -0
- package/gateway/src/auth.ts +49 -0
- package/gateway/src/index.ts +141 -0
- package/gateway/src/integrations/admin.ts +19 -0
- package/gateway/src/integrations/crypto.ts +52 -0
- package/gateway/src/integrations/handlers.ts +124 -0
- package/gateway/src/integrations/keys.ts +12 -0
- package/gateway/src/integrations/store.ts +106 -0
- package/gateway/src/stack-secrets.ts +35 -0
- package/gateway/tsconfig.json +13 -0
- package/package.json +33 -0
- package/turbo.json +27 -0
package/.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# Database (defaults: ./data/rag.sqlite)
|
|
3
|
+
# =============================================================================
|
|
4
|
+
DATA_DIR=data
|
|
5
|
+
SQLITE_FILENAME=rag.sqlite
|
|
6
|
+
DATABASE_URL=sqlite+aiosqlite:///./data/rag.sqlite
|
|
7
|
+
|
|
8
|
+
# =============================================================================
|
|
9
|
+
# Gemini — embeddings, /query, /chat/.../complete, POST /ingest/documents/{id}/embed
|
|
10
|
+
# =============================================================================
|
|
11
|
+
# Set GEMINI_API_KEY here OR leave unset and use the gateway integration store
|
|
12
|
+
# (same values as the Next.js frontend: INTEGRATIONS_INTERNAL_SECRET + gateway URL below).
|
|
13
|
+
GEMINI_API_KEY=
|
|
14
|
+
# When GEMINI_API_KEY is empty, fetch geminiApiKey from the Hono gateway:
|
|
15
|
+
# INTEGRATIONS_GATEWAY_URL=http://127.0.0.1:3001
|
|
16
|
+
# INTEGRATIONS_INTERNAL_SECRET=
|
|
17
|
+
# Embeddings: use ListModels entries with embedContent / batchEmbedContents (e.g. gemini-embedding-001).
|
|
18
|
+
GEMINI_EMBEDDING_MODEL=gemini-embedding-001
|
|
19
|
+
GEMINI_EMBED_TASK_TYPE=RETRIEVAL_DOCUMENT
|
|
20
|
+
GEMINI_QUERY_TASK_TYPE=RETRIEVAL_QUERY
|
|
21
|
+
# Chat: use ListModels entries with generateContent (preview ids change over time).
|
|
22
|
+
# You may paste ``models/gemini-3-flash-preview`` — the leading ``models/`` is stripped automatically.
|
|
23
|
+
GEMINI_CHAT_MODEL=gemini-3-flash-preview
|
|
24
|
+
VECTOR_EMBEDDING_DIM=1536
|
|
25
|
+
|
|
26
|
+
# =============================================================================
|
|
27
|
+
# OpenAI — optional LLM-assisted chunking only (semantic_chunk / llm_boundaries)
|
|
28
|
+
# =============================================================================
|
|
29
|
+
# Not used for embeddings or /query. Leave unset if you only use Gemini RAG.
|
|
30
|
+
# OPENAI_API_KEY=
|
|
31
|
+
# CHUNK_LLM_MODEL=gpt-4o-mini
|
|
32
|
+
|
|
33
|
+
# Legacy / unused by current embed path (kept for Settings compatibility):
|
|
34
|
+
# EMBEDDING_API_KEY=
|
|
35
|
+
# EMBEDDING_MODEL=text-embedding-3-small
|
|
36
|
+
|
|
37
|
+
# =============================================================================
|
|
38
|
+
# Embed failures (DLQ) — optional tuning
|
|
39
|
+
# =============================================================================
|
|
40
|
+
# EMBEDDING_DLQ_MAX_ATTEMPTS=5
|
|
41
|
+
# EMBEDDING_DLQ_BASE_DELAY_S=60
|
|
42
|
+
# EMBEDDING_DLQ_MAX_BACKOFF_S=3600
|
|
43
|
+
|
|
44
|
+
# =============================================================================
|
|
45
|
+
# Post-ingest webhooks (after document status = ok) — optional
|
|
46
|
+
# =============================================================================
|
|
47
|
+
# POST_INGEST_SLACK_WEBHOOK_URL=
|
|
48
|
+
# POST_INGEST_DISCORD_WEBHOOK_URL=
|
|
49
|
+
# POST_INGEST_SLACK_TEMPLATE=...
|
|
50
|
+
# POST_INGEST_DISCORD_TEMPLATE=...
|
|
51
|
+
# POST_INGEST_SUMMARY_MAX_CHARS=400
|
|
52
|
+
|
|
53
|
+
# =============================================================================
|
|
54
|
+
# Gateway + OpenAPI (production)
|
|
55
|
+
# =============================================================================
|
|
56
|
+
# Same value on the Hono gateway (BACKEND_GATEWAY_SECRET). When set, /docs, /redoc,
|
|
57
|
+
# and /openapi.json require X-Gateway-Secret — use docs via the gateway only.
|
|
58
|
+
# openssl rand -base64 32
|
|
59
|
+
# BACKEND_GATEWAY_SECRET=
|
|
60
|
+
|
|
61
|
+
# =============================================================================
|
|
62
|
+
# Logging
|
|
63
|
+
# =============================================================================
|
|
64
|
+
# LOG_LEVEL=INFO
|
|
65
|
+
# LOG_JSON=false
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from logging.config import fileConfig
|
|
3
|
+
|
|
4
|
+
from sqlalchemy import pool
|
|
5
|
+
from sqlalchemy.engine import Connection
|
|
6
|
+
from sqlalchemy.ext.asyncio import async_engine_from_config
|
|
7
|
+
|
|
8
|
+
from alembic import context
|
|
9
|
+
from app.config import get_settings
|
|
10
|
+
from app.db import Base
|
|
11
|
+
|
|
12
|
+
# Alembic configuration object exposed by the active migration context.
config = context.config

# Configure Python logging from the ini file, but only when Alembic was
# started with one (config_file_name is None otherwise).
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# Metadata of the application's declarative Base, handed to Alembic as the
# target schema in every context.configure() call below.
target_metadata = Base.metadata
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def run_migrations_offline() -> None:
    """Run migrations in Alembic's offline mode.

    The context is configured from the database URL in the application
    settings rather than from a live connection, with literal binds and the
    ``named`` paramstyle, and the migrations are then run inside
    ``context.begin_transaction()``.
    """
    database_url = get_settings().sqlalchemy_database_url
    offline_options = {
        "url": database_url,
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def do_run_migrations(connection: Connection) -> None:
    """Run the migrations on an already-open synchronous connection.

    Args:
        connection: Connection the Alembic context is bound to. It is
            supplied by ``run_sync`` in ``run_async_migrations``.
    """
    context.configure(
        target_metadata=target_metadata,
        connection=connection,
    )
    with context.begin_transaction():
        context.run_migrations()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
async def run_async_migrations() -> None:
    """Run the migrations through an async engine built from app settings.

    Steps:
      1. Ensure the configured data directory exists before connecting.
      2. Take the ini section for the current Alembic run and override
         ``sqlalchemy.url`` with the URL from the application settings.
      3. Build an async engine with ``NullPool`` and run
         ``do_run_migrations`` on a connection via ``run_sync``.

    The engine is disposed in a ``finally`` clause so it is released even
    when connecting or migrating raises; the exception still propagates.
    """
    settings = get_settings()
    settings.data_dir.mkdir(parents=True, exist_ok=True)

    # get_section() may return None when the section is missing.
    section = config.get_section(config.config_ini_section) or {}
    section["sqlalchemy.url"] = settings.sqlalchemy_database_url

    connectable = async_engine_from_config(
        section,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    try:
        async with connectable.connect() as connection:
            await connection.run_sync(do_run_migrations)
    finally:
        await connectable.dispose()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def run_migrations_online() -> None:
    """Run migrations against a live database.

    Synchronous entry point: drives ``run_async_migrations`` to completion
    with ``asyncio.run``.
    """
    migration_coro = run_async_migrations()
    asyncio.run(migration_coro)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Alembic executes this module on each run; choose the runner for its mode.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""${message}
|
|
2
|
+
|
|
3
|
+
Revision ID: ${up_revision}
|
|
4
|
+
Revises: ${down_revision | comma,n}
|
|
5
|
+
Create Date: ${create_date}
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
from typing import Sequence, Union
|
|
9
|
+
|
|
10
|
+
from alembic import op
|
|
11
|
+
import sqlalchemy as sa
|
|
12
|
+
${imports if imports else ""}
|
|
13
|
+
|
|
14
|
+
# revision identifiers, used by Alembic.
|
|
15
|
+
revision: str = ${repr(up_revision)}
|
|
16
|
+
down_revision: Union[str, None] = ${repr(down_revision)}
|
|
17
|
+
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
|
|
18
|
+
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def upgrade() -> None:
|
|
22
|
+
${upgrades if upgrades else "pass"}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def downgrade() -> None:
|
|
26
|
+
${downgrades if downgrades else "pass"}
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
"""multimodal knowledge base schema
|
|
2
|
+
|
|
3
|
+
Revision ID: 2a9c8f1d0e7b
|
|
4
|
+
Revises: 8f2a1c0d9e3b
|
|
5
|
+
Create Date: 2026-04-05
|
|
6
|
+
|
|
7
|
+
Design notes (SQLite):
|
|
8
|
+
|
|
9
|
+
- chunk_id on embeddings: without a separate chunks table, document-level rows use
|
|
10
|
+
document_id set and chunk_id NULL; block-level rows use chunk_id -> content_blocks.id
|
|
11
|
+
and document_id NULL. A future chunks table can be added via migration if text
|
|
12
|
+
segments need their own rows.
|
|
13
|
+
|
|
14
|
+
- PRAGMA foreign_keys=ON must be enabled per connection (see app.main engine connect
|
|
15
|
+
hook). SQLite does not enforce FKs when this is off.
|
|
16
|
+
|
|
17
|
+
- JSON columns (meta, raw_content when structured) are TEXT; validate in the app.
|
|
18
|
+
Optional: json_valid() CHECKs on newer SQLite if you want DB-level validation.
|
|
19
|
+
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from collections.abc import Sequence
|
|
23
|
+
|
|
24
|
+
import sqlalchemy as sa
|
|
25
|
+
|
|
26
|
+
from alembic import op
|
|
27
|
+
|
|
28
|
+
revision: str = "2a9c8f1d0e7b"
|
|
29
|
+
down_revision: str | None = "8f2a1c0d9e3b"
|
|
30
|
+
branch_labels: str | Sequence[str] | None = None
|
|
31
|
+
depends_on: str | Sequence[str] | None = None
|
|
32
|
+
|
|
33
|
+
_TS_DEFAULT = sa.text("(strftime('%Y-%m-%dT%H:%M:%fZ','now'))")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def upgrade() -> None:
    """Create the base knowledge-base tables and their indexes.

    Tables are created parents-first so each foreign key targets a table
    that already exists: sources, documents, content_blocks, entities,
    document_entities, embeddings, relationships. The indexes are created
    afterwards, in the order listed in ``index_specs``.
    """

    def id_column() -> sa.Column:
        # Built fresh per table so no Column object is shared between tables.
        return sa.Column("id", sa.Integer(), autoincrement=True, nullable=False)

    def created_at_column() -> sa.Column:
        # Text timestamp filled in by the database via _TS_DEFAULT.
        return sa.Column(
            "created_at",
            sa.Text(),
            server_default=_TS_DEFAULT,
            nullable=False,
        )

    def documents_fk(column_name: str, constraint_name: str) -> sa.ForeignKeyConstraint:
        # Foreign key to documents.id that cascades on delete.
        return sa.ForeignKeyConstraint(
            [column_name],
            ["documents.id"],
            name=constraint_name,
            ondelete="CASCADE",
        )

    # Ask for foreign-key enforcement on this connection (see module notes).
    op.execute(sa.text("PRAGMA foreign_keys=ON"))

    # --- sources ---------------------------------------------------------
    op.create_table(
        "sources",
        id_column(),
        sa.Column("name", sa.Text(), nullable=False),
        sa.Column("connector_type", sa.Text(), nullable=False),
        sa.Column("config_ref", sa.Text(), nullable=True),
        created_at_column(),
        sa.PrimaryKeyConstraint("id", name="pk_sources"),
    )

    # --- documents -------------------------------------------------------
    op.create_table(
        "documents",
        sa.Column("id", sa.Text(), nullable=False),
        sa.Column("source_id", sa.Integer(), nullable=False),
        sa.Column("timestamp", sa.Text(), nullable=False),
        sa.Column("content_type", sa.Text(), nullable=False),
        sa.Column("raw_content", sa.Text(), nullable=True),
        sa.Column("summary", sa.Text(), nullable=True),
        sa.Column("status", sa.Text(), nullable=False),
        created_at_column(),
        # RESTRICT: a source cannot be deleted while documents reference it.
        sa.ForeignKeyConstraint(
            ["source_id"],
            ["sources.id"],
            name="fk_documents_source_id_sources",
            ondelete="RESTRICT",
        ),
        sa.PrimaryKeyConstraint("id", name="pk_documents"),
        sa.CheckConstraint(
            "status IN ('ok','partial','failed')",
            name="ck_documents_status",
        ),
    )

    # --- content_blocks --------------------------------------------------
    op.create_table(
        "content_blocks",
        id_column(),
        sa.Column("document_id", sa.Text(), nullable=False),
        sa.Column("ordinal", sa.Integer(), nullable=False),
        sa.Column("type", sa.Text(), nullable=False),
        sa.Column("storage_uri", sa.Text(), nullable=True),
        sa.Column("inline_ref", sa.Text(), nullable=True),
        sa.Column("mime", sa.Text(), nullable=True),
        sa.Column("sha256", sa.Text(), nullable=True),
        sa.Column("meta", sa.Text(), nullable=True),
        documents_fk("document_id", "fk_content_blocks_document_id_documents"),
        sa.PrimaryKeyConstraint("id", name="pk_content_blocks"),
        sa.CheckConstraint(
            "type IN ('text','image','audio','video','document')",
            name="ck_content_blocks_type",
        ),
    )

    # --- entities --------------------------------------------------------
    op.create_table(
        "entities",
        id_column(),
        sa.Column("name", sa.Text(), nullable=False),
        sa.Column("type", sa.Text(), nullable=False),
        sa.Column("meta", sa.Text(), nullable=True),
        sa.PrimaryKeyConstraint("id", name="pk_entities"),
    )

    # --- document_entities (association table, composite PK) --------------
    op.create_table(
        "document_entities",
        sa.Column("document_id", sa.Text(), nullable=False),
        sa.Column("entity_id", sa.Integer(), nullable=False),
        sa.Column("confidence", sa.Float(), nullable=True),
        documents_fk("document_id", "fk_document_entities_document_id_documents"),
        sa.ForeignKeyConstraint(
            ["entity_id"],
            ["entities.id"],
            name="fk_document_entities_entity_id_entities",
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint(
            "document_id",
            "entity_id",
            name="pk_document_entities",
        ),
        sa.CheckConstraint(
            "confidence IS NULL OR (confidence >= 0 AND confidence <= 1)",
            name="ck_document_entities_confidence",
        ),
    )

    # --- embeddings ------------------------------------------------------
    op.create_table(
        "embeddings",
        id_column(),
        sa.Column("document_id", sa.Text(), nullable=True),
        sa.Column("chunk_id", sa.Integer(), nullable=True),
        sa.Column("model", sa.Text(), nullable=False),
        sa.Column("dim", sa.Integer(), nullable=False),
        sa.Column("vector_store_ref", sa.Text(), nullable=False),
        created_at_column(),
        documents_fk("document_id", "fk_embeddings_document_id_documents"),
        sa.ForeignKeyConstraint(
            ["chunk_id"],
            ["content_blocks.id"],
            name="fk_embeddings_chunk_id_content_blocks",
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name="pk_embeddings"),
        # Exactly one of document_id / chunk_id must be set on each row.
        sa.CheckConstraint(
            "(document_id IS NOT NULL AND chunk_id IS NULL) OR "
            "(document_id IS NULL AND chunk_id IS NOT NULL)",
            name="ck_embeddings_document_xor_chunk",
        ),
    )

    # --- relationships ---------------------------------------------------
    op.create_table(
        "relationships",
        id_column(),
        sa.Column("parent_document_id", sa.Text(), nullable=False),
        sa.Column("child_document_id", sa.Text(), nullable=False),
        sa.Column("relation_type", sa.Text(), nullable=False),
        sa.Column("weight", sa.Float(), nullable=True),
        sa.Column("meta", sa.Text(), nullable=True),
        documents_fk(
            "parent_document_id",
            "fk_relationships_parent_document_id_documents",
        ),
        documents_fk(
            "child_document_id",
            "fk_relationships_child_document_id_documents",
        ),
        sa.PrimaryKeyConstraint("id", name="pk_relationships"),
        sa.CheckConstraint(
            "relation_type IN ('link','thread','reply','entity_cooccur')",
            name="ck_relationships_relation_type",
        ),
        sa.CheckConstraint(
            "parent_document_id != child_document_id",
            name="ck_relationships_no_self_loop",
        ),
    )

    # --- indexes: (index name, table, columns, unique) ---------------------
    index_specs = (
        ("ix_documents_source_timestamp", "documents", ["source_id", "timestamp"], False),
        ("uq_content_blocks_document_ordinal", "content_blocks", ["document_id", "ordinal"], True),
        ("ix_content_blocks_document_id", "content_blocks", ["document_id"], False),
        ("ix_embeddings_document_id", "embeddings", ["document_id"], False),
        ("ix_embeddings_chunk_id", "embeddings", ["chunk_id"], False),
        ("ix_document_entities_entity", "document_entities", ["entity_id"], False),
        ("ix_document_entities_document", "document_entities", ["document_id"], False),
        ("ix_relationships_parent", "relationships", ["parent_document_id"], False),
        ("ix_relationships_child", "relationships", ["child_document_id"], False),
        (
            "ix_relationships_parent_type",
            "relationships",
            ["parent_document_id", "relation_type"],
            False,
        ),
    )
    for index_name, table_name, column_names, is_unique in index_specs:
        op.create_index(index_name, table_name, column_names, unique=is_unique)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def downgrade() -> None:
    """Drop every index and table created by ``upgrade``.

    Indexes are dropped first, then the tables, with referencing tables
    dropped before the tables they point at.
    """
    # (index name, owning table), in drop order.
    indexes_to_drop = (
        ("ix_relationships_parent_type", "relationships"),
        ("ix_relationships_child", "relationships"),
        ("ix_relationships_parent", "relationships"),
        ("ix_document_entities_document", "document_entities"),
        ("ix_document_entities_entity", "document_entities"),
        ("ix_embeddings_chunk_id", "embeddings"),
        ("ix_embeddings_document_id", "embeddings"),
        ("ix_content_blocks_document_id", "content_blocks"),
        ("uq_content_blocks_document_ordinal", "content_blocks"),
        ("ix_documents_source_timestamp", "documents"),
    )
    for index_name, owning_table in indexes_to_drop:
        op.drop_index(index_name, table_name=owning_table)

    tables_to_drop = (
        "relationships",
        "embeddings",
        "document_entities",
        "content_blocks",
        "entities",
        "documents",
        "sources",
    )
    for table in tables_to_drop:
        op.drop_table(table)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""sqlite-vec kb_vec_embeddings virtual table
|
|
2
|
+
|
|
3
|
+
Revision ID: 3c1d2e4f5a6b
|
|
4
|
+
Revises: 2a9c8f1d0e7b
|
|
5
|
+
Create Date: 2026-04-05
|
|
6
|
+
|
|
7
|
+
Vector dimension is fixed at DDL time. Must match Settings.vector_embedding_dim
|
|
8
|
+
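(default 1536, matching VECTOR_EMBEDDING_DIM in .env.example; the figure dates from text-embedding-3-small, which .env.example now lists as legacy). Changing dim requires a new vec0 table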
|
|
9
|
+
and migration.
|
|
10
|
+
|
|
11
|
+
Metadata columns use non-NULL sentinel-friendly values at insert time (chunk_id=0
|
|
12
|
+
means document-level; source_id=-1 means unknown) because sqlite-vec KNN filtering
|
|
13
|
+
does not support NULL metadata yet.
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from collections.abc import Sequence
|
|
18
|
+
|
|
19
|
+
import sqlalchemy as sa
|
|
20
|
+
|
|
21
|
+
from alembic import op
|
|
22
|
+
from app.sqlite_ext import load_sqlite_vec_extension
|
|
23
|
+
|
|
24
|
+
revision: str = "3c1d2e4f5a6b"
|
|
25
|
+
down_revision: str | None = "2a9c8f1d0e7b"
|
|
26
|
+
branch_labels: str | Sequence[str] | None = None
|
|
27
|
+
depends_on: str | Sequence[str] | None = None
|
|
28
|
+
|
|
29
|
+
# Keep in sync with app.config.Settings.vector_embedding_dim default
|
|
30
|
+
_VEC_DIM = 1536
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def upgrade() -> None:
    """Create the ``kb_vec_embeddings`` vec0 virtual table.

    ``load_sqlite_vec_extension`` is called on the driver-level connection
    before the DDL runs (presumably so the ``vec0`` module is registered;
    see ``app.sqlite_ext``). The embedding width is fixed to ``_VEC_DIM``
    at creation time.
    """
    driver_connection = op.get_bind().connection.driver_connection
    load_sqlite_vec_extension(driver_connection)

    op.execute(sa.text("PRAGMA foreign_keys=ON"))

    create_vec_table_sql = f"""
        CREATE VIRTUAL TABLE kb_vec_embeddings USING vec0(
            embedding float[{_VEC_DIM}],
            document_id text,
            chunk_id integer,
            source_id integer,
            modality text,
            ingested_at text
        )
        """
    op.execute(sa.text(create_vec_table_sql))
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def downgrade() -> None:
    """Drop the ``kb_vec_embeddings`` virtual table if it exists.

    ``load_sqlite_vec_extension`` is called on the driver-level connection
    first, as in ``upgrade``.
    """
    driver_connection = op.get_bind().connection.driver_connection
    load_sqlite_vec_extension(driver_connection)

    drop_statement = sa.text("DROP TABLE IF EXISTS kb_vec_embeddings")
    op.execute(drop_statement)
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""document_links for extracted URLs
|
|
2
|
+
|
|
3
|
+
Revision ID: 4e8b0c2d1a3f
|
|
4
|
+
Revises: 3c1d2e4f5a6b
|
|
5
|
+
Create Date: 2026-04-05
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from collections.abc import Sequence
|
|
10
|
+
|
|
11
|
+
import sqlalchemy as sa
|
|
12
|
+
from alembic import op
|
|
13
|
+
|
|
14
|
+
revision: str = "4e8b0c2d1a3f"
|
|
15
|
+
down_revision: str | None = "3c1d2e4f5a6b"
|
|
16
|
+
branch_labels: str | Sequence[str] | None = None
|
|
17
|
+
depends_on: str | Sequence[str] | None = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def upgrade() -> None:
    """Create ``document_links`` plus its ``document_id`` lookup index.

    Each row holds one URL for a document at a given ordinal;
    ``(document_id, ordinal)`` is unique, and rows are removed with their
    parent document through the cascading foreign key.
    """
    op.execute(sa.text("PRAGMA foreign_keys=ON"))

    link_columns = [
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("document_id", sa.Text(), nullable=False),
        sa.Column("url", sa.Text(), nullable=False),
        sa.Column("ordinal", sa.Integer(), nullable=False),
    ]
    link_constraints = [
        sa.ForeignKeyConstraint(
            ["document_id"],
            ["documents.id"],
            name="fk_document_links_document_id_documents",
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name="pk_document_links"),
        sa.UniqueConstraint(
            "document_id",
            "ordinal",
            name="uq_document_links_document_ordinal",
        ),
    ]
    op.create_table("document_links", *link_columns, *link_constraints)

    op.create_index(
        "ix_document_links_document",
        "document_links",
        ["document_id"],
    )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def downgrade() -> None:
    """Undo this revision: drop the lookup index, then ``document_links``."""
    links_table = "document_links"
    op.drop_index("ix_document_links_document", table_name=links_table)
    op.drop_table(links_table)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""documents dedupe columns + unique relationship edges
|
|
2
|
+
|
|
3
|
+
Revision ID: 6a0b1c2d3e4f
|
|
4
|
+
Revises: 4e8b0c2d1a3f
|
|
5
|
+
Create Date: 2026-04-05
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from collections.abc import Sequence
|
|
10
|
+
|
|
11
|
+
import sqlalchemy as sa
|
|
12
|
+
from alembic import op
|
|
13
|
+
|
|
14
|
+
revision: str = "6a0b1c2d3e4f"
|
|
15
|
+
down_revision: str | None = "4e8b0c2d1a3f"
|
|
16
|
+
branch_labels: str | Sequence[str] | None = None
|
|
17
|
+
depends_on: str | Sequence[str] | None = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def upgrade() -> None:
    """Add dedupe columns to ``documents`` and unique indexes for dedupe.

    Steps, in order:
      1. add nullable text columns ``canonical_url``, ``content_sha256`` and
         ``ingest_meta`` to ``documents``;
      2. index ``documents.canonical_url``;
      3. create a partial unique index on ``(canonical_url, content_sha256)``
         that only covers rows where both values are non-NULL;
      4. create a unique index on ``relationships`` over
         ``(parent_document_id, child_document_id, relation_type)``.
    """
    op.execute(sa.text("PRAGMA foreign_keys=ON"))

    new_text_columns = ("canonical_url", "content_sha256", "ingest_meta")
    with op.batch_alter_table("documents") as documents_batch:
        for column_name in new_text_columns:
            documents_batch.add_column(
                sa.Column(column_name, sa.Text(), nullable=True)
            )

    op.create_index("ix_documents_canonical_url", "documents", ["canonical_url"])

    # Partial (filtered) index, so it is written as raw SQL.
    documents_unique_sql = sa.text(
        "CREATE UNIQUE INDEX uq_documents_canonical_url_content_hash "
        "ON documents (canonical_url, content_sha256) "
        "WHERE canonical_url IS NOT NULL AND content_sha256 IS NOT NULL",
    )
    op.execute(documents_unique_sql)

    relationships_unique_sql = sa.text(
        "CREATE UNIQUE INDEX uq_relationships_parent_child_type "
        "ON relationships (parent_document_id, child_document_id, relation_type)",
    )
    op.execute(relationships_unique_sql)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def downgrade() -> None:
    """Undo this revision: drop its indexes, then the added columns.

    Indexes are removed before the columns they reference, and the columns
    are dropped in the reverse of the order in which they were added.
    """
    raw_index_drops = (
        "DROP INDEX IF EXISTS uq_relationships_parent_child_type",
        "DROP INDEX IF EXISTS uq_documents_canonical_url_content_hash",
    )
    for statement in raw_index_drops:
        op.execute(sa.text(statement))

    op.drop_index("ix_documents_canonical_url", table_name="documents")

    # NOTE(review): dropping columns in batch mode may rebuild ``documents``
    # on SQLite; confirm foreign-key enforcement on this connection cannot
    # cascade into child tables during that rebuild -- TODO confirm.
    with op.batch_alter_table("documents") as documents_batch:
        for column_name in ("ingest_meta", "content_sha256", "canonical_url"):
            documents_batch.drop_column(column_name)
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""semantic document_chunks linked to documents + block ordinals
|
|
2
|
+
|
|
3
|
+
Revision ID: 7d8e9f0a1b2c
|
|
4
|
+
Revises: 6a0b1c2d3e4f
|
|
5
|
+
Create Date: 2026-04-05
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from collections.abc import Sequence
|
|
10
|
+
|
|
11
|
+
import sqlalchemy as sa
|
|
12
|
+
from alembic import op
|
|
13
|
+
|
|
14
|
+
revision: str = "7d8e9f0a1b2c"
|
|
15
|
+
down_revision: str | None = "6a0b1c2d3e4f"
|
|
16
|
+
branch_labels: str | Sequence[str] | None = None
|
|
17
|
+
depends_on: str | Sequence[str] | None = None
|
|
18
|
+
|
|
19
|
+
# Server-side default for timestamp columns: a SQLite expression evaluated at
# insert time that formats the current time as an ISO-8601-style string with
# fractional seconds and a trailing "Z".
_TS_DEFAULT = sa.text("(strftime('%Y-%m-%dT%H:%M:%fZ','now'))")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def upgrade() -> None:
    """Create the ``document_chunks`` table and its two indexes.

    Each chunk belongs to a document (removed with it via
    ``ON DELETE CASCADE``), has an ``ordinal`` that is unique within that
    document, and records an inclusive-looking block range whose end may not
    precede its start (enforced by a CHECK constraint).
    """
    chunks_table = "document_chunks"

    # Ask SQLite to enforce foreign keys on this connection.
    op.execute(sa.text("PRAGMA foreign_keys=ON"))

    chunk_columns = [
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("document_id", sa.Text(), nullable=False),
        sa.Column("ordinal", sa.Integer(), nullable=False),
        sa.Column("text", sa.Text(), nullable=False),
        sa.Column("start_block_ordinal", sa.Integer(), nullable=False),
        sa.Column("end_block_ordinal", sa.Integer(), nullable=False),
        sa.Column("meta", sa.Text(), nullable=True),
        sa.Column(
            "created_at",
            sa.Text(),
            server_default=_TS_DEFAULT,
            nullable=False,
        ),
    ]
    chunk_constraints = [
        sa.ForeignKeyConstraint(
            ["document_id"],
            ["documents.id"],
            name="fk_document_chunks_document_id_documents",
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name="pk_document_chunks"),
        sa.CheckConstraint(
            "end_block_ordinal >= start_block_ordinal",
            name="ck_document_chunks_block_range",
        ),
    ]
    op.create_table(chunks_table, *chunk_columns, *chunk_constraints)

    # Uniqueness of (document_id, ordinal) is expressed as a unique index.
    op.create_index(
        "uq_document_chunks_document_ordinal",
        chunks_table,
        ["document_id", "ordinal"],
        unique=True,
    )
    # Plain lookup index for fetching all chunks of one document.
    op.create_index("ix_document_chunks_document", chunks_table, ["document_id"])
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def downgrade() -> None:
    """Undo this revision: drop both indexes, then ``document_chunks``."""
    chunks_table = "document_chunks"
    index_names = (
        "ix_document_chunks_document",
        "uq_document_chunks_document_ordinal",
    )
    for index_name in index_names:
        op.drop_index(index_name, table_name=chunks_table)
    op.drop_table(chunks_table)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""initial empty revision
|
|
2
|
+
|
|
3
|
+
Revision ID: 8f2a1c0d9e3b
|
|
4
|
+
Revises:
|
|
5
|
+
Create Date: 2026-04-05
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from collections.abc import Sequence
|
|
10
|
+
|
|
11
|
+
revision: str = "8f2a1c0d9e3b"
|
|
12
|
+
down_revision: str | None = None
|
|
13
|
+
branch_labels: str | Sequence[str] | None = None
|
|
14
|
+
depends_on: str | Sequence[str] | None = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def upgrade() -> None:
    """Do nothing: this initial revision makes no schema changes."""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def downgrade() -> None:
    """Do nothing: this initial revision has no schema changes to revert."""
|