business-stack 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/.python-version +1 -0
  2. package/backend/.env.example +65 -0
  3. package/backend/alembic/env.py +63 -0
  4. package/backend/alembic/script.py.mako +26 -0
  5. package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
  6. package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
  7. package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
  8. package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
  9. package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
  10. package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
  11. package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
  12. package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
  13. package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
  14. package/backend/alembic.ini +42 -0
  15. package/backend/app/__init__.py +0 -0
  16. package/backend/app/config.py +337 -0
  17. package/backend/app/connectors/__init__.py +13 -0
  18. package/backend/app/connectors/base.py +39 -0
  19. package/backend/app/connectors/builtins.py +51 -0
  20. package/backend/app/connectors/playwright_session.py +146 -0
  21. package/backend/app/connectors/registry.py +68 -0
  22. package/backend/app/connectors/thread_expansion/__init__.py +33 -0
  23. package/backend/app/connectors/thread_expansion/fakes.py +154 -0
  24. package/backend/app/connectors/thread_expansion/models.py +113 -0
  25. package/backend/app/connectors/thread_expansion/reddit.py +53 -0
  26. package/backend/app/connectors/thread_expansion/twitter.py +49 -0
  27. package/backend/app/db.py +5 -0
  28. package/backend/app/dependencies.py +34 -0
  29. package/backend/app/logging_config.py +35 -0
  30. package/backend/app/main.py +97 -0
  31. package/backend/app/middleware/__init__.py +0 -0
  32. package/backend/app/middleware/gateway_identity.py +17 -0
  33. package/backend/app/middleware/openapi_gateway.py +71 -0
  34. package/backend/app/middleware/request_id.py +23 -0
  35. package/backend/app/openapi_config.py +126 -0
  36. package/backend/app/routers/__init__.py +0 -0
  37. package/backend/app/routers/admin_pipeline.py +123 -0
  38. package/backend/app/routers/chat.py +206 -0
  39. package/backend/app/routers/chunks.py +36 -0
  40. package/backend/app/routers/entity_extract.py +31 -0
  41. package/backend/app/routers/example.py +8 -0
  42. package/backend/app/routers/gemini_embed.py +58 -0
  43. package/backend/app/routers/health.py +28 -0
  44. package/backend/app/routers/ingestion.py +146 -0
  45. package/backend/app/routers/link_expansion.py +34 -0
  46. package/backend/app/routers/pipeline_status.py +304 -0
  47. package/backend/app/routers/query.py +63 -0
  48. package/backend/app/routers/vectors.py +63 -0
  49. package/backend/app/schemas/__init__.py +0 -0
  50. package/backend/app/schemas/canonical.py +44 -0
  51. package/backend/app/schemas/chat.py +50 -0
  52. package/backend/app/schemas/ingest.py +29 -0
  53. package/backend/app/schemas/query.py +153 -0
  54. package/backend/app/schemas/vectors.py +56 -0
  55. package/backend/app/services/__init__.py +0 -0
  56. package/backend/app/services/chat_store.py +152 -0
  57. package/backend/app/services/chunking/__init__.py +3 -0
  58. package/backend/app/services/chunking/llm_boundaries.py +63 -0
  59. package/backend/app/services/chunking/schemas.py +30 -0
  60. package/backend/app/services/chunking/semantic_chunk.py +178 -0
  61. package/backend/app/services/chunking/splitters.py +214 -0
  62. package/backend/app/services/embeddings/__init__.py +20 -0
  63. package/backend/app/services/embeddings/build_inputs.py +140 -0
  64. package/backend/app/services/embeddings/dlq.py +128 -0
  65. package/backend/app/services/embeddings/gemini_api.py +207 -0
  66. package/backend/app/services/embeddings/persist.py +74 -0
  67. package/backend/app/services/embeddings/types.py +32 -0
  68. package/backend/app/services/embeddings/worker.py +224 -0
  69. package/backend/app/services/entities/__init__.py +12 -0
  70. package/backend/app/services/entities/gliner_extract.py +63 -0
  71. package/backend/app/services/entities/llm_extract.py +94 -0
  72. package/backend/app/services/entities/pipeline.py +179 -0
  73. package/backend/app/services/entities/spacy_extract.py +63 -0
  74. package/backend/app/services/entities/types.py +15 -0
  75. package/backend/app/services/gemini_chat.py +113 -0
  76. package/backend/app/services/hooks/__init__.py +3 -0
  77. package/backend/app/services/hooks/post_ingest.py +186 -0
  78. package/backend/app/services/ingestion/__init__.py +0 -0
  79. package/backend/app/services/ingestion/persist.py +188 -0
  80. package/backend/app/services/integrations_remote.py +91 -0
  81. package/backend/app/services/link_expansion/__init__.py +3 -0
  82. package/backend/app/services/link_expansion/canonical_url.py +45 -0
  83. package/backend/app/services/link_expansion/domain_policy.py +26 -0
  84. package/backend/app/services/link_expansion/html_extract.py +72 -0
  85. package/backend/app/services/link_expansion/rate_limit.py +32 -0
  86. package/backend/app/services/link_expansion/robots.py +46 -0
  87. package/backend/app/services/link_expansion/schemas.py +67 -0
  88. package/backend/app/services/link_expansion/worker.py +458 -0
  89. package/backend/app/services/normalization/__init__.py +7 -0
  90. package/backend/app/services/normalization/normalizer.py +331 -0
  91. package/backend/app/services/normalization/persist_normalized.py +67 -0
  92. package/backend/app/services/playwright_extract/__init__.py +13 -0
  93. package/backend/app/services/playwright_extract/__main__.py +96 -0
  94. package/backend/app/services/playwright_extract/extract.py +181 -0
  95. package/backend/app/services/retrieval_service.py +351 -0
  96. package/backend/app/sqlite_ext.py +36 -0
  97. package/backend/app/storage/__init__.py +3 -0
  98. package/backend/app/storage/blobs.py +30 -0
  99. package/backend/app/vectorstore/__init__.py +13 -0
  100. package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
  101. package/backend/backend.egg-info/PKG-INFO +18 -0
  102. package/backend/backend.egg-info/SOURCES.txt +93 -0
  103. package/backend/backend.egg-info/dependency_links.txt +1 -0
  104. package/backend/backend.egg-info/entry_points.txt +2 -0
  105. package/backend/backend.egg-info/requires.txt +15 -0
  106. package/backend/backend.egg-info/top_level.txt +4 -0
  107. package/backend/package.json +15 -0
  108. package/backend/pyproject.toml +52 -0
  109. package/backend/tests/conftest.py +40 -0
  110. package/backend/tests/test_chat.py +92 -0
  111. package/backend/tests/test_chunking.py +132 -0
  112. package/backend/tests/test_entities.py +170 -0
  113. package/backend/tests/test_gemini_embed.py +224 -0
  114. package/backend/tests/test_health.py +24 -0
  115. package/backend/tests/test_ingest_raw.py +123 -0
  116. package/backend/tests/test_link_expansion.py +241 -0
  117. package/backend/tests/test_main.py +12 -0
  118. package/backend/tests/test_normalizer.py +114 -0
  119. package/backend/tests/test_openapi_gateway.py +40 -0
  120. package/backend/tests/test_pipeline_hardening.py +285 -0
  121. package/backend/tests/test_pipeline_status.py +71 -0
  122. package/backend/tests/test_playwright_extract.py +80 -0
  123. package/backend/tests/test_post_ingest_hooks.py +162 -0
  124. package/backend/tests/test_query.py +165 -0
  125. package/backend/tests/test_thread_expansion.py +72 -0
  126. package/backend/tests/test_vectors.py +85 -0
  127. package/backend/uv.lock +1839 -0
  128. package/bin/business-stack.cjs +412 -0
  129. package/frontend/web/.env.example +23 -0
  130. package/frontend/web/AGENTS.md +5 -0
  131. package/frontend/web/CLAUDE.md +1 -0
  132. package/frontend/web/README.md +36 -0
  133. package/frontend/web/components.json +25 -0
  134. package/frontend/web/next-env.d.ts +6 -0
  135. package/frontend/web/next.config.ts +30 -0
  136. package/frontend/web/package.json +65 -0
  137. package/frontend/web/postcss.config.mjs +7 -0
  138. package/frontend/web/skills-lock.json +35 -0
  139. package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
  140. package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
  141. package/frontend/web/src/app/chat/page.tsx +725 -0
  142. package/frontend/web/src/app/favicon.ico +0 -0
  143. package/frontend/web/src/app/globals.css +563 -0
  144. package/frontend/web/src/app/layout.tsx +50 -0
  145. package/frontend/web/src/app/page.tsx +96 -0
  146. package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
  147. package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
  148. package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
  149. package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
  150. package/frontend/web/src/components/home-auth-panel.tsx +49 -0
  151. package/frontend/web/src/components/providers.tsx +50 -0
  152. package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
  153. package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
  154. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
  155. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
  156. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
  157. package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
  158. package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
  159. package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
  160. package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
  161. package/frontend/web/src/lib/auth-client.ts +23 -0
  162. package/frontend/web/src/lib/integrations-config.ts +125 -0
  163. package/frontend/web/src/lib/ui-utills.tsx +90 -0
  164. package/frontend/web/src/lib/utils.ts +6 -0
  165. package/frontend/web/tsconfig.json +36 -0
  166. package/frontend/web/tsconfig.tsbuildinfo +1 -0
  167. package/frontend/web/vitest.config.ts +14 -0
  168. package/gateway/.env.example +23 -0
  169. package/gateway/README.md +13 -0
  170. package/gateway/package.json +24 -0
  171. package/gateway/src/auth.ts +49 -0
  172. package/gateway/src/index.ts +141 -0
  173. package/gateway/src/integrations/admin.ts +19 -0
  174. package/gateway/src/integrations/crypto.ts +52 -0
  175. package/gateway/src/integrations/handlers.ts +124 -0
  176. package/gateway/src/integrations/keys.ts +12 -0
  177. package/gateway/src/integrations/store.ts +106 -0
  178. package/gateway/src/stack-secrets.ts +35 -0
  179. package/gateway/tsconfig.json +13 -0
  180. package/package.json +33 -0
  181. package/turbo.json +27 -0
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,65 @@
1
+ # =============================================================================
2
+ # Database (defaults: ./data/rag.sqlite)
3
+ # =============================================================================
4
+ DATA_DIR=data
5
+ SQLITE_FILENAME=rag.sqlite
6
+ DATABASE_URL=sqlite+aiosqlite:///./data/rag.sqlite
7
+
8
+ # =============================================================================
9
+ # Gemini — embeddings, /query, /chat/.../complete, POST /ingest/documents/{id}/embed
10
+ # =============================================================================
11
+ # Set GEMINI_API_KEY here OR leave unset and use the gateway integration store
12
+ # (same values as Next: INTEGRATIONS_INTERNAL_SECRET + gateway URL below).
13
+ GEMINI_API_KEY=
14
+ # When GEMINI_API_KEY is empty, fetch geminiApiKey from the Hono gateway:
15
+ # INTEGRATIONS_GATEWAY_URL=http://127.0.0.1:3001
16
+ # INTEGRATIONS_INTERNAL_SECRET=
17
+ # Embeddings: use ListModels entries with embedContent / batchEmbedContents (e.g. gemini-embedding-001).
18
+ GEMINI_EMBEDDING_MODEL=gemini-embedding-001
19
+ GEMINI_EMBED_TASK_TYPE=RETRIEVAL_DOCUMENT
20
+ GEMINI_QUERY_TASK_TYPE=RETRIEVAL_QUERY
21
+ # Chat: use ListModels entries with generateContent (preview ids change over time).
22
+ # You may paste ``models/gemini-3-flash-preview`` — the leading ``models/`` is stripped automatically.
23
+ GEMINI_CHAT_MODEL=gemini-3-flash-preview
24
+ VECTOR_EMBEDDING_DIM=1536
25
+
26
+ # =============================================================================
27
+ # OpenAI — optional LLM-assisted chunking only (semantic_chunk / llm_boundaries)
28
+ # =============================================================================
29
+ # Not used for embeddings or /query. Leave unset if you only use Gemini RAG.
30
+ # OPENAI_API_KEY=
31
+ # CHUNK_LLM_MODEL=gpt-4o-mini
32
+
33
+ # Legacy / unused by current embed path (kept for Settings compatibility):
34
+ # EMBEDDING_API_KEY=
35
+ # EMBEDDING_MODEL=text-embedding-3-small
36
+
37
+ # =============================================================================
38
+ # Embed failures (DLQ) — optional tuning
39
+ # =============================================================================
40
+ # EMBEDDING_DLQ_MAX_ATTEMPTS=5
41
+ # EMBEDDING_DLQ_BASE_DELAY_S=60
42
+ # EMBEDDING_DLQ_MAX_BACKOFF_S=3600
43
+
44
+ # =============================================================================
45
+ # Post-ingest webhooks (after document status = ok) — optional
46
+ # =============================================================================
47
+ # POST_INGEST_SLACK_WEBHOOK_URL=
48
+ # POST_INGEST_DISCORD_WEBHOOK_URL=
49
+ # POST_INGEST_SLACK_TEMPLATE=...
50
+ # POST_INGEST_DISCORD_TEMPLATE=...
51
+ # POST_INGEST_SUMMARY_MAX_CHARS=400
52
+
53
+ # =============================================================================
54
+ # Gateway + OpenAPI (production)
55
+ # =============================================================================
56
+ # Same value on the Hono gateway (BACKEND_GATEWAY_SECRET). When set, /docs, /redoc,
57
+ # and /openapi.json require X-Gateway-Secret — use docs via the gateway only.
58
+ # openssl rand -base64 32
59
+ # BACKEND_GATEWAY_SECRET=
60
+
61
+ # =============================================================================
62
+ # Logging
63
+ # =============================================================================
64
+ # LOG_LEVEL=INFO
65
+ # LOG_JSON=false
@@ -0,0 +1,63 @@
1
+ import asyncio
2
+ from logging.config import fileConfig
3
+
4
+ from sqlalchemy import pool
5
+ from sqlalchemy.engine import Connection
6
+ from sqlalchemy.ext.asyncio import async_engine_from_config
7
+
8
+ from alembic import context
9
+ from app.config import get_settings
10
+ from app.db import Base
11
+
12
+ config = context.config
13
+
14
+ if config.config_file_name is not None:
15
+ fileConfig(config.config_file_name)
16
+
17
+ target_metadata = Base.metadata
18
+
19
+
20
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    The Alembic context is configured with the database URL taken from the
    application settings (no connection object is passed), and the migrations
    are then run inside ``context.begin_transaction()``.
    """
    offline_options = {
        "url": get_settings().sqlalchemy_database_url,
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
31
+
32
+
33
def do_run_migrations(connection: Connection) -> None:
    """Configure Alembic against *connection* and run the migrations.

    ``run_async_migrations`` invokes this through ``run_sync``, which is why it
    is a plain synchronous function taking a ``Connection``.
    """
    configure_kwargs = {
        "connection": connection,
        "target_metadata": target_metadata,
    }
    context.configure(**configure_kwargs)

    with context.begin_transaction():
        context.run_migrations()
37
+
38
+
39
async def run_async_migrations() -> None:
    """Build an async engine from the application settings and run migrations.

    The settings' ``data_dir`` is created up front (presumably so the SQLite
    file can live there), and the URL from the application settings replaces
    whatever ``sqlalchemy.url`` the Alembic ini section holds. The engine is
    disposed even when a migration fails.
    """
    settings = get_settings()
    settings.data_dir.mkdir(parents=True, exist_ok=True)

    # Fall back to an empty mapping when the ini file has no such section.
    section = config.get_section(config.config_ini_section) or {}
    section["sqlalchemy.url"] = settings.sqlalchemy_database_url
    connectable = async_engine_from_config(
        section,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    try:
        async with connectable.connect() as connection:
            await connection.run_sync(do_run_migrations)
    finally:
        # Release the engine whether the migrations succeeded or raised.
        await connectable.dispose()
54
+
55
+
56
def run_migrations_online() -> None:
    """Run migrations in 'online' mode by driving the async runner to completion."""
    migration_coro = run_async_migrations()
    asyncio.run(migration_coro)
58
+
59
+
60
+ if context.is_offline_mode():
61
+ run_migrations_offline()
62
+ else:
63
+ run_migrations_online()
@@ -0,0 +1,26 @@
1
+ """${message}
2
+
3
+ Revision ID: ${up_revision}
4
+ Revises: ${down_revision | comma,n}
5
+ Create Date: ${create_date}
6
+
7
+ """
8
+ from typing import Sequence, Union
9
+
10
+ from alembic import op
11
+ import sqlalchemy as sa
12
+ ${imports if imports else ""}
13
+
14
+ # revision identifiers, used by Alembic.
15
+ revision: str = ${repr(up_revision)}
16
+ down_revision: Union[str, None] = ${repr(down_revision)}
17
+ branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
18
+ depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
19
+
20
+
21
+ def upgrade() -> None:
22
+ ${upgrades if upgrades else "pass"}
23
+
24
+
25
+ def downgrade() -> None:
26
+ ${downgrades if downgrades else "pass"}
@@ -0,0 +1,279 @@
1
+ """multimodal knowledge base schema
2
+
3
+ Revision ID: 2a9c8f1d0e7b
4
+ Revises: 8f2a1c0d9e3b
5
+ Create Date: 2026-04-05
6
+
7
+ Design notes (SQLite):
8
+
9
+ - chunk_id on embeddings: without a separate chunks table, document-level rows use
10
+ document_id set and chunk_id NULL; block-level rows use chunk_id -> content_blocks.id
11
+ and document_id NULL. A future chunks table can be added via migration if text
12
+ segments need their own rows.
13
+
14
+ - PRAGMA foreign_keys=ON must be enabled per connection (see app.main engine connect
15
+ hook). SQLite does not enforce FKs when this is off.
16
+
17
+ - JSON columns (meta, raw_content when structured) are TEXT; validate in the app.
18
+ Optional: json_valid() CHECKs on newer SQLite if you want DB-level validation.
19
+
20
+ """
21
+
22
+ from collections.abc import Sequence
23
+
24
+ import sqlalchemy as sa
25
+
26
+ from alembic import op
27
+
28
+ revision: str = "2a9c8f1d0e7b"
29
+ down_revision: str | None = "8f2a1c0d9e3b"
30
+ branch_labels: str | Sequence[str] | None = None
31
+ depends_on: str | Sequence[str] | None = None
32
+
33
+ _TS_DEFAULT = sa.text("(strftime('%Y-%m-%dT%H:%M:%fZ','now'))")
34
+
35
+
36
def upgrade() -> None:
    """Create the knowledge-base tables and their indexes.

    Tables are created parents-first (``sources``, ``documents``,
    ``content_blocks``, ``entities``, then the tables that reference them) so
    each foreign key points at a table that already exists. All indexes are
    created afterwards, in a fixed order.
    """

    def auto_id_column() -> sa.Column:
        # Integer surrogate key; a fresh Column object is needed for every table.
        return sa.Column("id", sa.Integer(), autoincrement=True, nullable=False)

    def created_at_column() -> sa.Column:
        # Text timestamp filled in by the database via the module-level default.
        return sa.Column(
            "created_at",
            sa.Text(),
            server_default=_TS_DEFAULT,
            nullable=False,
        )

    def cascade_fk(column: str, target: str, name: str) -> sa.ForeignKeyConstraint:
        # Single-column foreign key whose rows are removed with the parent row.
        return sa.ForeignKeyConstraint(
            [column],
            [target],
            name=name,
            ondelete="CASCADE",
        )

    op.execute(sa.text("PRAGMA foreign_keys=ON"))

    op.create_table(
        "sources",
        auto_id_column(),
        sa.Column("name", sa.Text(), nullable=False),
        sa.Column("connector_type", sa.Text(), nullable=False),
        sa.Column("config_ref", sa.Text(), nullable=True),
        created_at_column(),
        sa.PrimaryKeyConstraint("id", name="pk_sources"),
    )

    op.create_table(
        "documents",
        sa.Column("id", sa.Text(), nullable=False),
        sa.Column("source_id", sa.Integer(), nullable=False),
        sa.Column("timestamp", sa.Text(), nullable=False),
        sa.Column("content_type", sa.Text(), nullable=False),
        sa.Column("raw_content", sa.Text(), nullable=True),
        sa.Column("summary", sa.Text(), nullable=True),
        sa.Column("status", sa.Text(), nullable=False),
        created_at_column(),
        # The only non-cascading FK: a source with documents cannot be deleted.
        sa.ForeignKeyConstraint(
            ["source_id"],
            ["sources.id"],
            name="fk_documents_source_id_sources",
            ondelete="RESTRICT",
        ),
        sa.PrimaryKeyConstraint("id", name="pk_documents"),
        sa.CheckConstraint(
            "status IN ('ok','partial','failed')",
            name="ck_documents_status",
        ),
    )

    op.create_table(
        "content_blocks",
        auto_id_column(),
        sa.Column("document_id", sa.Text(), nullable=False),
        sa.Column("ordinal", sa.Integer(), nullable=False),
        sa.Column("type", sa.Text(), nullable=False),
        sa.Column("storage_uri", sa.Text(), nullable=True),
        sa.Column("inline_ref", sa.Text(), nullable=True),
        sa.Column("mime", sa.Text(), nullable=True),
        sa.Column("sha256", sa.Text(), nullable=True),
        sa.Column("meta", sa.Text(), nullable=True),
        cascade_fk(
            "document_id",
            "documents.id",
            "fk_content_blocks_document_id_documents",
        ),
        sa.PrimaryKeyConstraint("id", name="pk_content_blocks"),
        sa.CheckConstraint(
            "type IN ('text','image','audio','video','document')",
            name="ck_content_blocks_type",
        ),
    )

    op.create_table(
        "entities",
        auto_id_column(),
        sa.Column("name", sa.Text(), nullable=False),
        sa.Column("type", sa.Text(), nullable=False),
        sa.Column("meta", sa.Text(), nullable=True),
        sa.PrimaryKeyConstraint("id", name="pk_entities"),
    )

    op.create_table(
        "document_entities",
        sa.Column("document_id", sa.Text(), nullable=False),
        sa.Column("entity_id", sa.Integer(), nullable=False),
        sa.Column("confidence", sa.Float(), nullable=True),
        cascade_fk(
            "document_id",
            "documents.id",
            "fk_document_entities_document_id_documents",
        ),
        cascade_fk(
            "entity_id",
            "entities.id",
            "fk_document_entities_entity_id_entities",
        ),
        sa.PrimaryKeyConstraint(
            "document_id",
            "entity_id",
            name="pk_document_entities",
        ),
        sa.CheckConstraint(
            "confidence IS NULL OR (confidence >= 0 AND confidence <= 1)",
            name="ck_document_entities_confidence",
        ),
    )

    op.create_table(
        "embeddings",
        auto_id_column(),
        sa.Column("document_id", sa.Text(), nullable=True),
        sa.Column("chunk_id", sa.Integer(), nullable=True),
        sa.Column("model", sa.Text(), nullable=False),
        sa.Column("dim", sa.Integer(), nullable=False),
        sa.Column("vector_store_ref", sa.Text(), nullable=False),
        created_at_column(),
        cascade_fk(
            "document_id",
            "documents.id",
            "fk_embeddings_document_id_documents",
        ),
        cascade_fk(
            "chunk_id",
            "content_blocks.id",
            "fk_embeddings_chunk_id_content_blocks",
        ),
        sa.PrimaryKeyConstraint("id", name="pk_embeddings"),
        # Exactly one of document_id / chunk_id must be set on every row.
        sa.CheckConstraint(
            "(document_id IS NOT NULL AND chunk_id IS NULL) OR "
            "(document_id IS NULL AND chunk_id IS NOT NULL)",
            name="ck_embeddings_document_xor_chunk",
        ),
    )

    op.create_table(
        "relationships",
        auto_id_column(),
        sa.Column("parent_document_id", sa.Text(), nullable=False),
        sa.Column("child_document_id", sa.Text(), nullable=False),
        sa.Column("relation_type", sa.Text(), nullable=False),
        sa.Column("weight", sa.Float(), nullable=True),
        sa.Column("meta", sa.Text(), nullable=True),
        cascade_fk(
            "parent_document_id",
            "documents.id",
            "fk_relationships_parent_document_id_documents",
        ),
        cascade_fk(
            "child_document_id",
            "documents.id",
            "fk_relationships_child_document_id_documents",
        ),
        sa.PrimaryKeyConstraint("id", name="pk_relationships"),
        sa.CheckConstraint(
            "relation_type IN ('link','thread','reply','entity_cooccur')",
            name="ck_relationships_relation_type",
        ),
        sa.CheckConstraint(
            "parent_document_id != child_document_id",
            name="ck_relationships_no_self_loop",
        ),
    )

    # (index name, table, columns, unique) in creation order.
    index_specs = (
        ("ix_documents_source_timestamp", "documents", ["source_id", "timestamp"], False),
        (
            "uq_content_blocks_document_ordinal",
            "content_blocks",
            ["document_id", "ordinal"],
            True,
        ),
        ("ix_content_blocks_document_id", "content_blocks", ["document_id"], False),
        ("ix_embeddings_document_id", "embeddings", ["document_id"], False),
        ("ix_embeddings_chunk_id", "embeddings", ["chunk_id"], False),
        ("ix_document_entities_entity", "document_entities", ["entity_id"], False),
        ("ix_document_entities_document", "document_entities", ["document_id"], False),
        ("ix_relationships_parent", "relationships", ["parent_document_id"], False),
        ("ix_relationships_child", "relationships", ["child_document_id"], False),
        (
            "ix_relationships_parent_type",
            "relationships",
            ["parent_document_id", "relation_type"],
            False,
        ),
    )
    for index_name, table_name, columns, is_unique in index_specs:
        op.create_index(index_name, table_name, columns, unique=is_unique)
259
+
260
+
261
def downgrade() -> None:
    """Drop the indexes and tables of the knowledge-base schema.

    Indexes go first, then the tables, with referencing tables dropped before
    the tables they point at.
    """
    indexes_to_drop = (
        ("ix_relationships_parent_type", "relationships"),
        ("ix_relationships_child", "relationships"),
        ("ix_relationships_parent", "relationships"),
        ("ix_document_entities_document", "document_entities"),
        ("ix_document_entities_entity", "document_entities"),
        ("ix_embeddings_chunk_id", "embeddings"),
        ("ix_embeddings_document_id", "embeddings"),
        ("ix_content_blocks_document_id", "content_blocks"),
        ("uq_content_blocks_document_ordinal", "content_blocks"),
        ("ix_documents_source_timestamp", "documents"),
    )
    for index_name, owning_table in indexes_to_drop:
        op.drop_index(index_name, table_name=owning_table)

    tables_to_drop = (
        "relationships",
        "embeddings",
        "document_entities",
        "content_blocks",
        "entities",
        "documents",
        "sources",
    )
    for table in tables_to_drop:
        op.drop_table(table)
@@ -0,0 +1,58 @@
1
+ """sqlite-vec kb_vec_embeddings virtual table
2
+
3
+ Revision ID: 3c1d2e4f5a6b
4
+ Revises: 2a9c8f1d0e7b
5
+ Create Date: 2026-04-05
6
+
7
+ Vector dimension is fixed at DDL time. Must match Settings.vector_embedding_dim
8
+ (default 1536, the VECTOR_EMBEDDING_DIM value in .env.example). Changing dim requires a new vec0 table
9
+ and migration.
10
+
11
+ Metadata columns use non-NULL sentinel-friendly values at insert time (chunk_id=0
12
+ means document-level; source_id=-1 means unknown) because sqlite-vec KNN filtering
13
+ does not support NULL metadata yet.
14
+
15
+ """
16
+
17
+ from collections.abc import Sequence
18
+
19
+ import sqlalchemy as sa
20
+
21
+ from alembic import op
22
+ from app.sqlite_ext import load_sqlite_vec_extension
23
+
24
+ revision: str = "3c1d2e4f5a6b"
25
+ down_revision: str | None = "2a9c8f1d0e7b"
26
+ branch_labels: str | Sequence[str] | None = None
27
+ depends_on: str | Sequence[str] | None = None
28
+
29
+ # Keep in sync with app.config.Settings.vector_embedding_dim default
30
+ _VEC_DIM = 1536
31
+
32
+
33
def upgrade() -> None:
    """Create the ``kb_vec_embeddings`` vec0 virtual table.

    The sqlite-vec extension is loaded on the migration's driver connection
    before the ``CREATE VIRTUAL TABLE ... USING vec0`` statement is issued.
    The embedding width comes from the module-level ``_VEC_DIM``.
    """
    driver_connection = op.get_bind().connection.driver_connection
    load_sqlite_vec_extension(driver_connection)

    op.execute(sa.text("PRAGMA foreign_keys=ON"))

    create_vec_table = sa.text(
        f"""
        CREATE VIRTUAL TABLE kb_vec_embeddings USING vec0(
            embedding float[{_VEC_DIM}],
            document_id text,
            chunk_id integer,
            source_id integer,
            modality text,
            ingested_at text
        )
        """
    )
    op.execute(create_vec_table)
52
+
53
+
54
def downgrade() -> None:
    """Drop ``kb_vec_embeddings`` (if present) after loading sqlite-vec."""
    driver_connection = op.get_bind().connection.driver_connection
    load_sqlite_vec_extension(driver_connection)

    drop_vec_table = sa.text("DROP TABLE IF EXISTS kb_vec_embeddings")
    op.execute(drop_vec_table)
@@ -0,0 +1,50 @@
1
+ """document_links for extracted URLs
2
+
3
+ Revision ID: 4e8b0c2d1a3f
4
+ Revises: 3c1d2e4f5a6b
5
+ Create Date: 2026-04-05
6
+
7
+ """
8
+
9
+ from collections.abc import Sequence
10
+
11
+ import sqlalchemy as sa
12
+ from alembic import op
13
+
14
+ revision: str = "4e8b0c2d1a3f"
15
+ down_revision: str | None = "3c1d2e4f5a6b"
16
+ branch_labels: str | Sequence[str] | None = None
17
+ depends_on: str | Sequence[str] | None = None
18
+
19
+
20
def upgrade() -> None:
    """Create the ``document_links`` table and an index on its ``document_id``."""
    op.execute(sa.text("PRAGMA foreign_keys=ON"))

    link_columns = [
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("document_id", sa.Text(), nullable=False),
        sa.Column("url", sa.Text(), nullable=False),
        sa.Column("ordinal", sa.Integer(), nullable=False),
    ]
    link_constraints = [
        # Links are removed together with their owning document.
        sa.ForeignKeyConstraint(
            ["document_id"],
            ["documents.id"],
            name="fk_document_links_document_id_documents",
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name="pk_document_links"),
        sa.UniqueConstraint(
            "document_id",
            "ordinal",
            name="uq_document_links_document_ordinal",
        ),
    ]
    op.create_table("document_links", *link_columns, *link_constraints)

    op.create_index(
        "ix_document_links_document",
        "document_links",
        ["document_id"],
    )
46
+
47
+
48
def downgrade() -> None:
    """Remove the ``document_links`` index, then the table itself."""
    links_table = "document_links"
    op.drop_index("ix_document_links_document", table_name=links_table)
    op.drop_table(links_table)
@@ -0,0 +1,49 @@
1
+ """documents dedupe columns + unique relationship edges
2
+
3
+ Revision ID: 6a0b1c2d3e4f
4
+ Revises: 4e8b0c2d1a3f
5
+ Create Date: 2026-04-05
6
+
7
+ """
8
+
9
+ from collections.abc import Sequence
10
+
11
+ import sqlalchemy as sa
12
+ from alembic import op
13
+
14
+ revision: str = "6a0b1c2d3e4f"
15
+ down_revision: str | None = "4e8b0c2d1a3f"
16
+ branch_labels: str | Sequence[str] | None = None
17
+ depends_on: str | Sequence[str] | None = None
18
+
19
+
20
def upgrade() -> None:
    """Add dedupe columns to ``documents`` and the unique indexes built on them.

    Three nullable text columns are added in one batch operation, followed by
    a plain index on ``canonical_url``, a partial unique index on
    ``(canonical_url, content_sha256)``, and a unique index over relationship
    edges. The two unique indexes are issued as raw SQL.
    """
    op.execute(sa.text("PRAGMA foreign_keys=ON"))

    with op.batch_alter_table("documents") as documents:
        for column_name in ("canonical_url", "content_sha256", "ingest_meta"):
            documents.add_column(sa.Column(column_name, sa.Text(), nullable=True))

    op.create_index("ix_documents_canonical_url", "documents", ["canonical_url"])

    # Partial index: only rows where both columns are set take part in dedupe.
    documents_dedupe_sql = (
        "CREATE UNIQUE INDEX uq_documents_canonical_url_content_hash "
        "ON documents (canonical_url, content_sha256) "
        "WHERE canonical_url IS NOT NULL AND content_sha256 IS NOT NULL"
    )
    op.execute(sa.text(documents_dedupe_sql))

    relationships_dedupe_sql = (
        "CREATE UNIQUE INDEX uq_relationships_parent_child_type "
        "ON relationships (parent_document_id, child_document_id, relation_type)"
    )
    op.execute(sa.text(relationships_dedupe_sql))
40
+
41
+
42
def downgrade() -> None:
    """Drop the dedupe indexes, then remove the columns added to ``documents``."""
    raw_sql_indexes = (
        "uq_relationships_parent_child_type",
        "uq_documents_canonical_url_content_hash",
    )
    for index_name in raw_sql_indexes:
        op.execute(sa.text(f"DROP INDEX IF EXISTS {index_name}"))

    op.drop_index("ix_documents_canonical_url", table_name="documents")

    with op.batch_alter_table("documents") as documents:
        # Columns are dropped in the reverse of the order they were added.
        for column_name in ("ingest_meta", "content_sha256", "canonical_url"):
            documents.drop_column(column_name)
@@ -0,0 +1,70 @@
1
+ """semantic document_chunks linked to documents + block ordinals
2
+
3
+ Revision ID: 7d8e9f0a1b2c
4
+ Revises: 6a0b1c2d3e4f
5
+ Create Date: 2026-04-05
6
+
7
+ """
8
+
9
+ from collections.abc import Sequence
10
+
11
+ import sqlalchemy as sa
12
+ from alembic import op
13
+
14
+ revision: str = "7d8e9f0a1b2c"
15
+ down_revision: str | None = "6a0b1c2d3e4f"
16
+ branch_labels: str | Sequence[str] | None = None
17
+ depends_on: str | Sequence[str] | None = None
18
+
19
+ _TS_DEFAULT = sa.text("(strftime('%Y-%m-%dT%H:%M:%fZ','now'))")
20
+
21
+
22
def upgrade() -> None:
    """Create the ``document_chunks`` table and its two indexes."""
    op.execute(sa.text("PRAGMA foreign_keys=ON"))

    chunk_columns = [
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("document_id", sa.Text(), nullable=False),
        sa.Column("ordinal", sa.Integer(), nullable=False),
        sa.Column("text", sa.Text(), nullable=False),
        sa.Column("start_block_ordinal", sa.Integer(), nullable=False),
        sa.Column("end_block_ordinal", sa.Integer(), nullable=False),
        sa.Column("meta", sa.Text(), nullable=True),
        sa.Column(
            "created_at",
            sa.Text(),
            server_default=_TS_DEFAULT,
            nullable=False,
        ),
    ]
    chunk_constraints = [
        # Chunks are removed together with their owning document.
        sa.ForeignKeyConstraint(
            ["document_id"],
            ["documents.id"],
            name="fk_document_chunks_document_id_documents",
            ondelete="CASCADE",
        ),
        sa.PrimaryKeyConstraint("id", name="pk_document_chunks"),
        # A chunk's block range may not end before it starts.
        sa.CheckConstraint(
            "end_block_ordinal >= start_block_ordinal",
            name="ck_document_chunks_block_range",
        ),
    ]
    op.create_table("document_chunks", *chunk_columns, *chunk_constraints)

    op.create_index(
        "uq_document_chunks_document_ordinal",
        "document_chunks",
        ["document_id", "ordinal"],
        unique=True,
    )
    op.create_index(
        "ix_document_chunks_document",
        "document_chunks",
        ["document_id"],
    )
62
+
63
+
64
def downgrade() -> None:
    """Drop both ``document_chunks`` indexes, then the table."""
    chunks_table = "document_chunks"
    for index_name in (
        "ix_document_chunks_document",
        "uq_document_chunks_document_ordinal",
    ):
        op.drop_index(index_name, table_name=chunks_table)
    op.drop_table(chunks_table)
@@ -0,0 +1,22 @@
1
+ """initial empty revision
2
+
3
+ Revision ID: 8f2a1c0d9e3b
4
+ Revises:
5
+ Create Date: 2026-04-05
6
+
7
+ """
8
+
9
+ from collections.abc import Sequence
10
+
11
+ revision: str = "8f2a1c0d9e3b"
12
+ down_revision: str | None = None
13
+ branch_labels: str | Sequence[str] | None = None
14
+ depends_on: str | Sequence[str] | None = None
15
+
16
+
17
def upgrade() -> None:
    """Do nothing: this is the initial empty revision."""
19
+
20
+
21
def downgrade() -> None:
    """Do nothing: the initial empty revision has nothing to undo."""