business-stack 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/.python-version +1 -0
  2. package/backend/.env.example +65 -0
  3. package/backend/alembic/env.py +63 -0
  4. package/backend/alembic/script.py.mako +26 -0
  5. package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
  6. package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
  7. package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
  8. package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
  9. package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
  10. package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
  11. package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
  12. package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
  13. package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
  14. package/backend/alembic.ini +42 -0
  15. package/backend/app/__init__.py +0 -0
  16. package/backend/app/config.py +337 -0
  17. package/backend/app/connectors/__init__.py +13 -0
  18. package/backend/app/connectors/base.py +39 -0
  19. package/backend/app/connectors/builtins.py +51 -0
  20. package/backend/app/connectors/playwright_session.py +146 -0
  21. package/backend/app/connectors/registry.py +68 -0
  22. package/backend/app/connectors/thread_expansion/__init__.py +33 -0
  23. package/backend/app/connectors/thread_expansion/fakes.py +154 -0
  24. package/backend/app/connectors/thread_expansion/models.py +113 -0
  25. package/backend/app/connectors/thread_expansion/reddit.py +53 -0
  26. package/backend/app/connectors/thread_expansion/twitter.py +49 -0
  27. package/backend/app/db.py +5 -0
  28. package/backend/app/dependencies.py +34 -0
  29. package/backend/app/logging_config.py +35 -0
  30. package/backend/app/main.py +97 -0
  31. package/backend/app/middleware/__init__.py +0 -0
  32. package/backend/app/middleware/gateway_identity.py +17 -0
  33. package/backend/app/middleware/openapi_gateway.py +71 -0
  34. package/backend/app/middleware/request_id.py +23 -0
  35. package/backend/app/openapi_config.py +126 -0
  36. package/backend/app/routers/__init__.py +0 -0
  37. package/backend/app/routers/admin_pipeline.py +123 -0
  38. package/backend/app/routers/chat.py +206 -0
  39. package/backend/app/routers/chunks.py +36 -0
  40. package/backend/app/routers/entity_extract.py +31 -0
  41. package/backend/app/routers/example.py +8 -0
  42. package/backend/app/routers/gemini_embed.py +58 -0
  43. package/backend/app/routers/health.py +28 -0
  44. package/backend/app/routers/ingestion.py +146 -0
  45. package/backend/app/routers/link_expansion.py +34 -0
  46. package/backend/app/routers/pipeline_status.py +304 -0
  47. package/backend/app/routers/query.py +63 -0
  48. package/backend/app/routers/vectors.py +63 -0
  49. package/backend/app/schemas/__init__.py +0 -0
  50. package/backend/app/schemas/canonical.py +44 -0
  51. package/backend/app/schemas/chat.py +50 -0
  52. package/backend/app/schemas/ingest.py +29 -0
  53. package/backend/app/schemas/query.py +153 -0
  54. package/backend/app/schemas/vectors.py +56 -0
  55. package/backend/app/services/__init__.py +0 -0
  56. package/backend/app/services/chat_store.py +152 -0
  57. package/backend/app/services/chunking/__init__.py +3 -0
  58. package/backend/app/services/chunking/llm_boundaries.py +63 -0
  59. package/backend/app/services/chunking/schemas.py +30 -0
  60. package/backend/app/services/chunking/semantic_chunk.py +178 -0
  61. package/backend/app/services/chunking/splitters.py +214 -0
  62. package/backend/app/services/embeddings/__init__.py +20 -0
  63. package/backend/app/services/embeddings/build_inputs.py +140 -0
  64. package/backend/app/services/embeddings/dlq.py +128 -0
  65. package/backend/app/services/embeddings/gemini_api.py +207 -0
  66. package/backend/app/services/embeddings/persist.py +74 -0
  67. package/backend/app/services/embeddings/types.py +32 -0
  68. package/backend/app/services/embeddings/worker.py +224 -0
  69. package/backend/app/services/entities/__init__.py +12 -0
  70. package/backend/app/services/entities/gliner_extract.py +63 -0
  71. package/backend/app/services/entities/llm_extract.py +94 -0
  72. package/backend/app/services/entities/pipeline.py +179 -0
  73. package/backend/app/services/entities/spacy_extract.py +63 -0
  74. package/backend/app/services/entities/types.py +15 -0
  75. package/backend/app/services/gemini_chat.py +113 -0
  76. package/backend/app/services/hooks/__init__.py +3 -0
  77. package/backend/app/services/hooks/post_ingest.py +186 -0
  78. package/backend/app/services/ingestion/__init__.py +0 -0
  79. package/backend/app/services/ingestion/persist.py +188 -0
  80. package/backend/app/services/integrations_remote.py +91 -0
  81. package/backend/app/services/link_expansion/__init__.py +3 -0
  82. package/backend/app/services/link_expansion/canonical_url.py +45 -0
  83. package/backend/app/services/link_expansion/domain_policy.py +26 -0
  84. package/backend/app/services/link_expansion/html_extract.py +72 -0
  85. package/backend/app/services/link_expansion/rate_limit.py +32 -0
  86. package/backend/app/services/link_expansion/robots.py +46 -0
  87. package/backend/app/services/link_expansion/schemas.py +67 -0
  88. package/backend/app/services/link_expansion/worker.py +458 -0
  89. package/backend/app/services/normalization/__init__.py +7 -0
  90. package/backend/app/services/normalization/normalizer.py +331 -0
  91. package/backend/app/services/normalization/persist_normalized.py +67 -0
  92. package/backend/app/services/playwright_extract/__init__.py +13 -0
  93. package/backend/app/services/playwright_extract/__main__.py +96 -0
  94. package/backend/app/services/playwright_extract/extract.py +181 -0
  95. package/backend/app/services/retrieval_service.py +351 -0
  96. package/backend/app/sqlite_ext.py +36 -0
  97. package/backend/app/storage/__init__.py +3 -0
  98. package/backend/app/storage/blobs.py +30 -0
  99. package/backend/app/vectorstore/__init__.py +13 -0
  100. package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
  101. package/backend/backend.egg-info/PKG-INFO +18 -0
  102. package/backend/backend.egg-info/SOURCES.txt +93 -0
  103. package/backend/backend.egg-info/dependency_links.txt +1 -0
  104. package/backend/backend.egg-info/entry_points.txt +2 -0
  105. package/backend/backend.egg-info/requires.txt +15 -0
  106. package/backend/backend.egg-info/top_level.txt +4 -0
  107. package/backend/package.json +15 -0
  108. package/backend/pyproject.toml +52 -0
  109. package/backend/tests/conftest.py +40 -0
  110. package/backend/tests/test_chat.py +92 -0
  111. package/backend/tests/test_chunking.py +132 -0
  112. package/backend/tests/test_entities.py +170 -0
  113. package/backend/tests/test_gemini_embed.py +224 -0
  114. package/backend/tests/test_health.py +24 -0
  115. package/backend/tests/test_ingest_raw.py +123 -0
  116. package/backend/tests/test_link_expansion.py +241 -0
  117. package/backend/tests/test_main.py +12 -0
  118. package/backend/tests/test_normalizer.py +114 -0
  119. package/backend/tests/test_openapi_gateway.py +40 -0
  120. package/backend/tests/test_pipeline_hardening.py +285 -0
  121. package/backend/tests/test_pipeline_status.py +71 -0
  122. package/backend/tests/test_playwright_extract.py +80 -0
  123. package/backend/tests/test_post_ingest_hooks.py +162 -0
  124. package/backend/tests/test_query.py +165 -0
  125. package/backend/tests/test_thread_expansion.py +72 -0
  126. package/backend/tests/test_vectors.py +85 -0
  127. package/backend/uv.lock +1839 -0
  128. package/bin/business-stack.cjs +412 -0
  129. package/frontend/web/.env.example +23 -0
  130. package/frontend/web/AGENTS.md +5 -0
  131. package/frontend/web/CLAUDE.md +1 -0
  132. package/frontend/web/README.md +36 -0
  133. package/frontend/web/components.json +25 -0
  134. package/frontend/web/next-env.d.ts +6 -0
  135. package/frontend/web/next.config.ts +30 -0
  136. package/frontend/web/package.json +65 -0
  137. package/frontend/web/postcss.config.mjs +7 -0
  138. package/frontend/web/skills-lock.json +35 -0
  139. package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
  140. package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
  141. package/frontend/web/src/app/chat/page.tsx +725 -0
  142. package/frontend/web/src/app/favicon.ico +0 -0
  143. package/frontend/web/src/app/globals.css +563 -0
  144. package/frontend/web/src/app/layout.tsx +50 -0
  145. package/frontend/web/src/app/page.tsx +96 -0
  146. package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
  147. package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
  148. package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
  149. package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
  150. package/frontend/web/src/components/home-auth-panel.tsx +49 -0
  151. package/frontend/web/src/components/providers.tsx +50 -0
  152. package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
  153. package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
  154. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
  155. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
  156. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
  157. package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
  158. package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
  159. package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
  160. package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
  161. package/frontend/web/src/lib/auth-client.ts +23 -0
  162. package/frontend/web/src/lib/integrations-config.ts +125 -0
  163. package/frontend/web/src/lib/ui-utills.tsx +90 -0
  164. package/frontend/web/src/lib/utils.ts +6 -0
  165. package/frontend/web/tsconfig.json +36 -0
  166. package/frontend/web/tsconfig.tsbuildinfo +1 -0
  167. package/frontend/web/vitest.config.ts +14 -0
  168. package/gateway/.env.example +23 -0
  169. package/gateway/README.md +13 -0
  170. package/gateway/package.json +24 -0
  171. package/gateway/src/auth.ts +49 -0
  172. package/gateway/src/index.ts +141 -0
  173. package/gateway/src/integrations/admin.ts +19 -0
  174. package/gateway/src/integrations/crypto.ts +52 -0
  175. package/gateway/src/integrations/handlers.ts +124 -0
  176. package/gateway/src/integrations/keys.ts +12 -0
  177. package/gateway/src/integrations/store.ts +106 -0
  178. package/gateway/src/stack-secrets.ts +35 -0
  179. package/gateway/tsconfig.json +13 -0
  180. package/package.json +33 -0
  181. package/turbo.json +27 -0
@@ -0,0 +1,224 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+
6
+ from sqlalchemy import text
7
+ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
8
+
9
+ from app.config import Settings
10
+ from app.services.embeddings.build_inputs import (
11
+ chunk_to_multimodal_parts,
12
+ chunk_to_text_parts,
13
+ load_document_chunks,
14
+ )
15
+ from app.services.embeddings.dlq import (
16
+ cleaned_ingest_meta_dict,
17
+ clear_embedding_dlq,
18
+ merge_ingest_meta,
19
+ record_embedding_dlq_failure,
20
+ )
21
+ from app.services.embeddings.gemini_api import batch_embed_contents
22
+ from app.services.embeddings.persist import (
23
+ delete_gemini_embeddings_for_document,
24
+ persist_chunk_embeddings,
25
+ )
26
+ from app.services.embeddings.types import EmbeddingPart, parts_to_modality
27
+ from app.services.entities.pipeline import extract_and_store_entities_for_document
28
+ from app.services.hooks.post_ingest import dispatch_post_ingest_hooks
29
+ from app.services.integrations_remote import resolve_gemini_api_key
30
+ from app.storage.blobs import BlobStore
31
+ from app.vectorstore import SqliteVecStore
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
async def embed_document_gemini(
    session: AsyncSession,
    *,
    document_id: str,
    settings: Settings,
    store: SqliteVecStore,
    multimodal: bool = False,
) -> int:
    """
    Idempotent: embed first, then replace vec rows + Gemini ``embeddings`` rows.

    Prior embedding rows for this model are removed only after a successful
    ``batch_embed_contents`` call so failures do not leave the document without
    vectors while a retry is pending.

    Args:
        session: Open ``AsyncSession``; the caller owns commit/rollback.
        document_id: ``documents.id`` to embed; chunks must already exist.
        settings: App settings (embedding model, task type, data dir, ...).
        store: sqlite-vec store holding the per-chunk vector rows.
        multimodal: When True, build parts via ``chunk_to_multimodal_parts``
            (which is given the blob store) instead of text-only parts.

    Returns:
        Number of chunks embedded.

    Raises:
        ValueError: Missing API key, unknown document, or no chunks yet.
        Exception: Entity-extraction errors are re-raised only when
            ``settings.entity_extract_strict`` is set.
    """
    api_key = await resolve_gemini_api_key(settings)
    if not api_key:
        msg = "Gemini API key not configured (GEMINI_API_KEY or gateway store)"
        raise ValueError(msg)

    row = await session.execute(
        text("SELECT source_id, ingest_meta FROM documents WHERE id = :id LIMIT 1"),
        {"id": document_id},
    )
    doc_row = row.first()
    if doc_row is None:
        msg = "document not found"
        raise ValueError(msg)
    source_id = int(doc_row[0])
    prior_meta = doc_row[1]

    chunks = await load_document_chunks(session, document_id)
    if not chunks:
        err = "no document_chunks; run POST .../chunks first"
        raise ValueError(err)

    # Build one parts-list per chunk; the modality tag is derived from the
    # parts so multimodal chunks are labeled correctly at persist time.
    blob_store = BlobStore(settings.data_dir / "blobs")
    contents: list[list[EmbeddingPart]] = []
    modalities: list[str] = []
    for ch in chunks:
        if multimodal:
            parts = await chunk_to_multimodal_parts(
                session,
                document_id,
                ch,
                blob_store,
            )
        else:
            parts = chunk_to_text_parts(ch)
        contents.append(parts)
        modalities.append(parts_to_modality(parts))

    # Remote embed call happens BEFORE any deletes (see docstring).
    vectors = await batch_embed_contents(
        api_key=api_key,
        model=settings.gemini_embedding_model,
        contents=contents,
        settings=settings,
        task_type=settings.gemini_embed_task_type,
    )

    # Replace phase: drop this model's prior rows, then re-insert fresh ones.
    await delete_gemini_embeddings_for_document(
        session,
        document_id=document_id,
        model=settings.gemini_embedding_model,
    )
    # Use session connection: a second pool connection + open txn causes SQLite
    # "database is locked" on kb_vec_embeddings (single-writer).
    await store.delete_document_for_session(session, document_id)

    await persist_chunk_embeddings(
        session,
        store,
        document_id=document_id,
        source_id=source_id,
        model=settings.gemini_embedding_model,
        chunks=chunks,
        vectors=vectors,
        modalities=modalities,
        settings=settings,
    )

    # Embedding succeeded, so any earlier DLQ entry for this document is stale.
    await clear_embedding_dlq(session, document_id=document_id)

    # Rebuild ingest_meta from the prior row (cleaned) and record model + dim.
    clean = cleaned_ingest_meta_dict(prior_meta)
    meta_emb = merge_ingest_meta(
        json.dumps(clean, default=str),
        {
            "embedding_model": settings.gemini_embedding_model,
            "embedding_dim": settings.vector_embedding_dim,
        },
    )

    await session.execute(
        text(
            "UPDATE documents SET status = 'ok', ingest_meta = :im WHERE id = :id",
        ),
        {"id": document_id, "im": meta_emb},
    )

    # Entity extraction piggybacks on the same session. Failures are recorded
    # in ingest_meta; the document only flips to 'failed' in strict mode.
    try:
        n_mentions, n_co = await extract_and_store_entities_for_document(
            session,
            document_id=document_id,
            settings=settings,
        )
        await session.execute(
            text("UPDATE documents SET ingest_meta = :im WHERE id = :id"),
            {
                "id": document_id,
                "im": merge_ingest_meta(
                    meta_emb,
                    {
                        "entity_mentions": n_mentions,
                        "entity_cooccurrence_pairs": n_co,
                    },
                ),
            },
        )
    except Exception as ex:
        logger.exception("Entity extraction failed for %s", document_id)
        err_meta = merge_ingest_meta(
            meta_emb,
            {"entity_extraction_error": str(ex)},
        )
        await session.execute(
            text(
                "UPDATE documents SET ingest_meta = :im WHERE id = :id",
            ),
            {"id": document_id, "im": err_meta},
        )
        if settings.entity_extract_strict:
            await session.execute(
                text("UPDATE documents SET status = 'failed' WHERE id = :id"),
                {"id": document_id},
            )
            raise
    return len(chunks)
173
+
174
+
175
async def run_embed_document_job(
    *,
    document_id: str,
    multimodal: bool,
    session_factory: async_sessionmaker[AsyncSession],
    settings: Settings,
    store: SqliteVecStore,
) -> None:
    """Background wrapper around ``embed_document_gemini``.

    Success path: embed, commit, then dispatch post-ingest hooks (hook errors
    are logged only — the document is already committed as ok). Failure path:
    roll back, then record the error in the embedding DLQ using a *fresh*
    session so the DLQ write survives the failed transaction.
    """
    async with session_factory() as session:
        try:
            await embed_document_gemini(
                session,
                document_id=document_id,
                settings=settings,
                store=store,
                multimodal=multimodal,
            )
            await session.commit()
            # Hooks run after commit; a hook failure must not undo the embed.
            try:
                await dispatch_post_ingest_hooks(
                    session_factory,
                    document_id=document_id,
                    settings=settings,
                )
            except Exception:
                logger.exception(
                    "post-ingest hooks failed (document already ok): %s",
                    document_id,
                )
        except Exception as e:
            logger.exception("Gemini embed failed for %s", document_id)
            await session.rollback()
            # New session + explicit transaction for the DLQ record: the
            # original session was just rolled back and may be unusable.
            async with session_factory() as s2:
                async with s2.begin():
                    row = await s2.execute(
                        text(
                            "SELECT ingest_meta FROM documents WHERE id = :id LIMIT 1",
                        ),
                        {"id": document_id},
                    )
                    r = row.first()
                    im = r[0] if r else None
                    await record_embedding_dlq_failure(
                        s2,
                        document_id=document_id,
                        error_message=str(e),
                        multimodal=multimodal,
                        settings=settings,
                        prior_ingest_meta=im,
                    )
@@ -0,0 +1,12 @@
1
+ from app.services.entities.pipeline import (
2
+ extract_and_store_entities_for_document,
3
+ run_entity_extraction_job,
4
+ )
5
+ from app.services.entities.types import EntityType, ExtractedMention
6
+
7
+ __all__ = [
8
+ "EntityType",
9
+ "ExtractedMention",
10
+ "extract_and_store_entities_for_document",
11
+ "run_entity_extraction_job",
12
+ ]
@@ -0,0 +1,63 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import re
5
+
6
+ from app.services.entities.types import EntityType, ExtractedMention
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
# Label set passed to GLiNER's zero-shot predictor. "organization" is queried
# as its own label but folded into "company" by _LABEL_MAP below.
_GLINER_LABELS = [
    "person",
    "company",
    "organization",
    "product",
    "location",
    "concept",
]

# GLiNER label -> coarse EntityType used throughout the entities pipeline.
_LABEL_MAP: dict[str, EntityType] = {
    "person": "person",
    "company": "company",
    "organization": "company",
    "product": "product",
    "location": "location",
    "concept": "concept",
}
27
+
28
+
29
+ def _normalize_name(s: str) -> str:
30
+ return re.sub(r"\s+", " ", s.strip())
31
+
32
+
33
# Cache of loaded GLiNER checkpoints keyed by model id, so repeated extraction
# calls do not re-read the model from disk on every document/chunk. This
# mirrors the lru_cache used by spacy_extract. NOTE: a failed load is cached
# as None, so it is not retried until process restart.
_GLINER_MODELS: dict[str, object] = {}


def _load_gliner_model(model_id: str):
    """Return a cached GLiNER model, or None when gliner/checkpoint is unavailable."""
    if model_id in _GLINER_MODELS:
        return _GLINER_MODELS[model_id]
    try:
        from gliner import GLiNER  # type: ignore[import-not-found]
    except ImportError:
        logger.debug("gliner not installed; skipping GLiNER entity extraction")
        _GLINER_MODELS[model_id] = None
        return None
    try:
        model = GLiNER.from_pretrained(model_id)
    except Exception:
        logger.warning("GLiNER model %r could not load", model_id, exc_info=True)
        model = None
    _GLINER_MODELS[model_id] = model
    return model


def extract_with_gliner(text: str, *, model_id: str) -> list[ExtractedMention]:
    """Optional GLiNER zero-shot NER (requires ``gliner`` + a checkpoint).

    Returns an empty list for blank text or when the model is unavailable.
    Confidences are clamped to [0.0, 1.0]; mentions shorter than 2 chars and
    labels outside the coarse taxonomy are dropped.
    """
    if not text.strip():
        return []
    model = _load_gliner_model(model_id)
    if model is None:
        return []

    # Truncate long documents; GLiNER context windows are short anyway.
    ents = model.predict_entities(text[:8000], _GLINER_LABELS, threshold=0.35)
    out: list[ExtractedMention] = []
    for e in ents:
        label = str(e.get("label", "")).lower()
        et = _LABEL_MAP.get(label)
        if et is None:
            continue  # label outside our coarse taxonomy
        name = _normalize_name(str(e.get("text", "")))
        if len(name) < 2:
            continue  # single-character mentions are noise
        score = float(e.get("score", 0.6))
        out.append(
            ExtractedMention(name=name, type=et, confidence=min(1.0, max(0.0, score)))
        )
    return out
@@ -0,0 +1,94 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import re
6
+
7
+ import httpx
8
+
9
+ from app.config import Settings
10
+ from app.services.entities.types import ExtractedMention
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
# Entity types accepted back from the LLM; anything else is dropped.
_ALLOWED = frozenset({"person", "company", "product", "concept", "location"})

# System prompt; paired with Ollama's ``format: "json"`` to force parseable output.
_SYSTEM = """Extract named entities from the user text. Return JSON only:
{"entities":[{"name":"string","type":"person|company|product|concept|location","confidence":0.0-1.0}]}
Skip duplicates. Prefer high-precision extractions."""
19
+
20
+
21
+ def _normalize_name(s: str) -> str:
22
+ return re.sub(r"\s+", " ", s.strip())
23
+
24
+
25
async def extract_with_ollama(
    text: str,
    *,
    settings: Settings,
    client: httpx.AsyncClient | None = None,
) -> list[ExtractedMention]:
    """Optional local LLM (Ollama) with JSON-shaped output for harder cases.

    Returns [] for blank text, when the feature is disabled, and on any
    request/parse failure (best-effort: errors are logged, never raised).
    """
    if not text.strip() or not settings.entity_llm_enabled:
        return []

    endpoint = str(settings.ollama_base_url).rstrip("/") + "/api/chat"
    payload = {
        "model": settings.ollama_entity_model,
        "stream": False,
        "format": "json",
        "messages": [
            {"role": "system", "content": _SYSTEM},
            {
                "role": "user",
                "content": text[:16_000],
            },
        ],
    }

    # Own the client only if the caller did not supply one.
    owns_client = client is None
    if owns_client:
        client = httpx.AsyncClient(timeout=120.0)
    try:
        resp = await client.post(endpoint, json=payload)
        resp.raise_for_status()
        raw = resp.json().get("message", {}).get("content", "")
        entities = json.loads(raw).get("entities")
        if not isinstance(entities, list):
            return []
        mentions: list[ExtractedMention] = []
        for item in entities:
            if not isinstance(item, dict):
                continue
            name = item.get("name")
            typ = item.get("type")
            if not (isinstance(name, str) and isinstance(typ, str)):
                continue
            kind = typ.lower().strip()
            if kind not in _ALLOWED:
                continue
            try:
                conf = float(item.get("confidence", 0.75))
            except (TypeError, ValueError):
                conf = 0.75
            cleaned = _normalize_name(name)
            if len(cleaned) < 2:
                continue
            mentions.append(
                ExtractedMention(
                    name=cleaned,
                    type=kind,  # type: ignore[arg-type]
                    confidence=min(1.0, max(0.0, conf)),
                ),
            )
        return mentions
    except Exception:
        logger.exception("Ollama entity extraction failed")
        return []
    finally:
        if owns_client:
            await client.aclose()
@@ -0,0 +1,179 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ from collections.abc import Iterable
6
+
7
+ from sqlalchemy import text
8
+ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
9
+
10
+ from app.config import Settings
11
+ from app.services.embeddings.build_inputs import load_document_chunks
12
+ from app.services.entities.gliner_extract import extract_with_gliner
13
+ from app.services.entities.llm_extract import extract_with_ollama
14
+ from app.services.entities.spacy_extract import extract_with_spacy
15
+ from app.services.entities.types import ExtractedMention
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ def _merge_mentions(mentions: Iterable[ExtractedMention]) -> list[ExtractedMention]:
21
+ """Dedupe by (normalized name, type); keep max confidence."""
22
+ best: dict[tuple[str, str], ExtractedMention] = {}
23
+ for m in mentions:
24
+ key = (m.name.strip().lower(), m.type)
25
+ prev = best.get(key)
26
+ if prev is None or m.confidence > prev.confidence:
27
+ best[key] = m
28
+ return list(best.values())
29
+
30
+
31
def _baseline_sync(text: str, settings: Settings) -> list[ExtractedMention]:
    """Run the enabled non-LLM extractors (spaCy, GLiNER) and merge results."""
    collected: list[ExtractedMention] = []
    if settings.entity_use_spacy:
        collected += extract_with_spacy(text, model_name=settings.spacy_model)
    if settings.entity_use_gliner:
        collected += extract_with_gliner(text, model_id=settings.gliner_model_id)
    return _merge_mentions(collected)
40
+
41
+
42
async def extract_mentions_for_chunk_text(
    text: str,
    settings: Settings,
) -> list[ExtractedMention]:
    """Run baseline NER off-thread, then optionally augment with the local LLM.

    The LLM pass only runs when enabled and either every chunk is opted in or
    the baseline produced fewer than ``entity_llm_min_mentions`` mentions.
    """
    merged = await asyncio.to_thread(_baseline_sync, text, settings)
    if settings.entity_llm_enabled:
        needs_llm = (
            len(merged) < settings.entity_llm_min_mentions
            or settings.entity_llm_on_all_chunks
        )
        if needs_llm:
            llm_mentions = await extract_with_ollama(text, settings=settings)
            merged = _merge_mentions([*merged, *llm_mentions])
    return merged
53
+
54
+
55
async def get_or_create_entity_id(
    session: AsyncSession,
    *,
    name: str,
    entity_type: str,
) -> int:
    """Return the id of the (name, type) entity row, inserting it if missing.

    Lookup is case- and outer-whitespace-insensitive; the stored name keeps
    the original casing (stripped).
    NOTE(review): select-then-insert is racy without a unique index on
    (lower(trim(name)), type) — concurrent writers could create duplicates;
    confirm the schema enforces uniqueness.
    """
    row = await session.execute(
        text(
            "SELECT id FROM entities WHERE lower(trim(name)) = lower(trim(:n)) "
            "AND type = :t LIMIT 1",
        ),
        {"n": name, "t": entity_type},
    )
    found = row.first()
    if found is not None:
        return int(found[0])
    ins = await session.execute(
        text(
            "INSERT INTO entities (name, type, meta) VALUES (:n, :t, NULL) "
            "RETURNING id",
        ),
        {"n": name.strip(), "t": entity_type},
    )
    return int(ins.scalar_one())
79
+
80
+
81
async def extract_and_store_entities_for_document(
    session: AsyncSession,
    *,
    document_id: str,
    settings: Settings,
) -> tuple[int, int]:
    """
    Replace prior mentions/co-occurrence for this document, then insert fresh rows.
    Caller should commit with the same session as embedding writes.
    Returns (mention_rows, cooccurrence_rows).
    """
    chunks = await load_document_chunks(session, document_id)
    if not chunks:
        return 0, 0

    # Idempotent re-run: wipe this document's previous rows before inserting.
    await session.execute(
        text("DELETE FROM entity_mentions WHERE document_id = :d"),
        {"d": document_id},
    )
    await session.execute(
        text("DELETE FROM entity_cooccurrence WHERE document_id = :d"),
        {"d": document_id},
    )

    doc_entity_ids: set[int] = set()
    mention_count = 0

    for ch in chunks:
        mentions = await extract_mentions_for_chunk_text(ch.text, settings)
        chunk_entity_ids: set[int] = set()
        for m in mentions:
            eid = await get_or_create_entity_id(
                session,
                name=m.name,
                entity_type=m.type,
            )
            await session.execute(
                text(
                    "INSERT INTO entity_mentions "
                    "(document_id, entity_id, document_chunk_id, confidence) "
                    "VALUES (:doc, :eid, :cid, :conf)",
                ),
                {
                    "doc": document_id,
                    "eid": eid,
                    "cid": ch.id,
                    "conf": m.confidence,
                },
            )
            mention_count += 1
            doc_entity_ids.add(eid)
            chunk_entity_ids.add(eid)

        logger.debug(
            "chunk %s document %s: %s mentions, %s unique entities",
            ch.id,
            document_id,
            len(mentions),
            len(chunk_entity_ids),
        )

    # Document-level co-occurrence: one weight-1.0 row per unordered pair of
    # entities seen anywhere in this document. Iterating the sorted ids with
    # j > i already guarantees lo < hi, so the original post-sort swap
    # (``if lo > hi: lo, hi = hi, lo``) was dead code and has been removed.
    sorted_ids = sorted(doc_entity_ids)
    co_pairs = 0
    for i, lo in enumerate(sorted_ids):
        for hi in sorted_ids[i + 1:]:
            await session.execute(
                text(
                    "INSERT INTO entity_cooccurrence "
                    "(entity_low_id, entity_high_id, document_id, weight) "
                    "VALUES (:lo, :hi, :doc, 1.0)",
                ),
                {"lo": lo, "hi": hi, "doc": document_id},
            )
            co_pairs += 1

    return mention_count, co_pairs
160
+
161
+
162
async def run_entity_extraction_job(
    *,
    document_id: str,
    session_factory: async_sessionmaker[AsyncSession],
    settings: Settings,
) -> None:
    """Background entrypoint (same pattern as embedding worker).

    Opens its own session, commits on success, and rolls back on any failure.
    Errors are logged and swallowed: this is a best-effort background job.
    """
    async with session_factory() as session:
        try:
            await extract_and_store_entities_for_document(
                session,
                document_id=document_id,
                settings=settings,
            )
            await session.commit()
        except Exception:
            logger.exception("entity extraction failed for %s", document_id)
            await session.rollback()
@@ -0,0 +1,63 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import re
5
+ from functools import lru_cache
6
+
7
+ from app.services.entities.types import EntityType, ExtractedMention
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
# spaCy NER label -> coarse EntityType. Labels not listed here (e.g. DATE,
# MONEY, CARDINAL) are intentionally dropped by extract_with_spacy.
_SPACY_LABEL_TO_TYPE: dict[str, EntityType] = {
    "PERSON": "person",
    "ORG": "company",
    "GPE": "location",
    "LOC": "location",
    "FAC": "location",
    "PRODUCT": "product",
    "EVENT": "concept",
    "WORK_OF_ART": "concept",
    "LAW": "concept",
    "NORP": "concept",
    "LANGUAGE": "concept",
}
24
+
25
+
26
+ def _normalize_name(s: str) -> str:
27
+ t = re.sub(r"\s+", " ", s.strip())
28
+ return t
29
+
30
+
31
@lru_cache(maxsize=1)
def _load_nlp(model_name: str):
    """Load and cache a single spaCy pipeline; returns None when unavailable.

    NOTE: lru_cache memoizes the None failure result too, so a missing model
    is reported once and never retried within this process. maxsize=1 means
    switching model names evicts the previously loaded pipeline.
    """
    try:
        # Imported lazily: spaCy is an optional [entities] dependency.
        import spacy

        return spacy.load(model_name)
    except Exception:
        logger.warning(
            "spaCy model %r not available; install optional [entities] deps",
            model_name,
            exc_info=True,
        )
        return None
44
+
45
+
46
def extract_with_spacy(text: str, *, model_name: str) -> list[ExtractedMention]:
    """Baseline NER: spaCy labels mapped to person/company/product/concept/location."""
    if not text or not text.strip():
        return []
    nlp = _load_nlp(model_name)
    if nlp is None:
        return []
    # Cap input length defensively before handing it to the pipeline.
    doc = nlp(text[:1_000_000])
    mentions: list[ExtractedMention] = []
    for span in doc.ents:
        mapped = _SPACY_LABEL_TO_TYPE.get(span.label_)
        if mapped is None:
            continue
        cleaned = _normalize_name(span.text)
        if len(cleaned) >= 2:
            mentions.append(
                ExtractedMention(name=cleaned, type=mapped, confidence=0.82)
            )
    return mentions
@@ -0,0 +1,15 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Literal
5
+
6
# Coarse entity taxonomy shared by every extractor (spaCy, GLiNER, LLM).
EntityType = Literal["person", "company", "product", "concept", "location"]


@dataclass(frozen=True, slots=True)
class ExtractedMention:
    """One surface-form mention with coarse type and confidence."""

    # Surface form of the mention (extractors collapse internal whitespace).
    name: str
    # One of the EntityType literals above.
    type: EntityType
    # Extractor-assigned confidence; producers clamp this to [0.0, 1.0].
    confidence: float
+ confidence: float