business-stack 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.python-version +1 -0
- package/backend/.env.example +65 -0
- package/backend/alembic/env.py +63 -0
- package/backend/alembic/script.py.mako +26 -0
- package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
- package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
- package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
- package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
- package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
- package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
- package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
- package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
- package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
- package/backend/alembic.ini +42 -0
- package/backend/app/__init__.py +0 -0
- package/backend/app/config.py +337 -0
- package/backend/app/connectors/__init__.py +13 -0
- package/backend/app/connectors/base.py +39 -0
- package/backend/app/connectors/builtins.py +51 -0
- package/backend/app/connectors/playwright_session.py +146 -0
- package/backend/app/connectors/registry.py +68 -0
- package/backend/app/connectors/thread_expansion/__init__.py +33 -0
- package/backend/app/connectors/thread_expansion/fakes.py +154 -0
- package/backend/app/connectors/thread_expansion/models.py +113 -0
- package/backend/app/connectors/thread_expansion/reddit.py +53 -0
- package/backend/app/connectors/thread_expansion/twitter.py +49 -0
- package/backend/app/db.py +5 -0
- package/backend/app/dependencies.py +34 -0
- package/backend/app/logging_config.py +35 -0
- package/backend/app/main.py +97 -0
- package/backend/app/middleware/__init__.py +0 -0
- package/backend/app/middleware/gateway_identity.py +17 -0
- package/backend/app/middleware/openapi_gateway.py +71 -0
- package/backend/app/middleware/request_id.py +23 -0
- package/backend/app/openapi_config.py +126 -0
- package/backend/app/routers/__init__.py +0 -0
- package/backend/app/routers/admin_pipeline.py +123 -0
- package/backend/app/routers/chat.py +206 -0
- package/backend/app/routers/chunks.py +36 -0
- package/backend/app/routers/entity_extract.py +31 -0
- package/backend/app/routers/example.py +8 -0
- package/backend/app/routers/gemini_embed.py +58 -0
- package/backend/app/routers/health.py +28 -0
- package/backend/app/routers/ingestion.py +146 -0
- package/backend/app/routers/link_expansion.py +34 -0
- package/backend/app/routers/pipeline_status.py +304 -0
- package/backend/app/routers/query.py +63 -0
- package/backend/app/routers/vectors.py +63 -0
- package/backend/app/schemas/__init__.py +0 -0
- package/backend/app/schemas/canonical.py +44 -0
- package/backend/app/schemas/chat.py +50 -0
- package/backend/app/schemas/ingest.py +29 -0
- package/backend/app/schemas/query.py +153 -0
- package/backend/app/schemas/vectors.py +56 -0
- package/backend/app/services/__init__.py +0 -0
- package/backend/app/services/chat_store.py +152 -0
- package/backend/app/services/chunking/__init__.py +3 -0
- package/backend/app/services/chunking/llm_boundaries.py +63 -0
- package/backend/app/services/chunking/schemas.py +30 -0
- package/backend/app/services/chunking/semantic_chunk.py +178 -0
- package/backend/app/services/chunking/splitters.py +214 -0
- package/backend/app/services/embeddings/__init__.py +20 -0
- package/backend/app/services/embeddings/build_inputs.py +140 -0
- package/backend/app/services/embeddings/dlq.py +128 -0
- package/backend/app/services/embeddings/gemini_api.py +207 -0
- package/backend/app/services/embeddings/persist.py +74 -0
- package/backend/app/services/embeddings/types.py +32 -0
- package/backend/app/services/embeddings/worker.py +224 -0
- package/backend/app/services/entities/__init__.py +12 -0
- package/backend/app/services/entities/gliner_extract.py +63 -0
- package/backend/app/services/entities/llm_extract.py +94 -0
- package/backend/app/services/entities/pipeline.py +179 -0
- package/backend/app/services/entities/spacy_extract.py +63 -0
- package/backend/app/services/entities/types.py +15 -0
- package/backend/app/services/gemini_chat.py +113 -0
- package/backend/app/services/hooks/__init__.py +3 -0
- package/backend/app/services/hooks/post_ingest.py +186 -0
- package/backend/app/services/ingestion/__init__.py +0 -0
- package/backend/app/services/ingestion/persist.py +188 -0
- package/backend/app/services/integrations_remote.py +91 -0
- package/backend/app/services/link_expansion/__init__.py +3 -0
- package/backend/app/services/link_expansion/canonical_url.py +45 -0
- package/backend/app/services/link_expansion/domain_policy.py +26 -0
- package/backend/app/services/link_expansion/html_extract.py +72 -0
- package/backend/app/services/link_expansion/rate_limit.py +32 -0
- package/backend/app/services/link_expansion/robots.py +46 -0
- package/backend/app/services/link_expansion/schemas.py +67 -0
- package/backend/app/services/link_expansion/worker.py +458 -0
- package/backend/app/services/normalization/__init__.py +7 -0
- package/backend/app/services/normalization/normalizer.py +331 -0
- package/backend/app/services/normalization/persist_normalized.py +67 -0
- package/backend/app/services/playwright_extract/__init__.py +13 -0
- package/backend/app/services/playwright_extract/__main__.py +96 -0
- package/backend/app/services/playwright_extract/extract.py +181 -0
- package/backend/app/services/retrieval_service.py +351 -0
- package/backend/app/sqlite_ext.py +36 -0
- package/backend/app/storage/__init__.py +3 -0
- package/backend/app/storage/blobs.py +30 -0
- package/backend/app/vectorstore/__init__.py +13 -0
- package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
- package/backend/backend.egg-info/PKG-INFO +18 -0
- package/backend/backend.egg-info/SOURCES.txt +93 -0
- package/backend/backend.egg-info/dependency_links.txt +1 -0
- package/backend/backend.egg-info/entry_points.txt +2 -0
- package/backend/backend.egg-info/requires.txt +15 -0
- package/backend/backend.egg-info/top_level.txt +4 -0
- package/backend/package.json +15 -0
- package/backend/pyproject.toml +52 -0
- package/backend/tests/conftest.py +40 -0
- package/backend/tests/test_chat.py +92 -0
- package/backend/tests/test_chunking.py +132 -0
- package/backend/tests/test_entities.py +170 -0
- package/backend/tests/test_gemini_embed.py +224 -0
- package/backend/tests/test_health.py +24 -0
- package/backend/tests/test_ingest_raw.py +123 -0
- package/backend/tests/test_link_expansion.py +241 -0
- package/backend/tests/test_main.py +12 -0
- package/backend/tests/test_normalizer.py +114 -0
- package/backend/tests/test_openapi_gateway.py +40 -0
- package/backend/tests/test_pipeline_hardening.py +285 -0
- package/backend/tests/test_pipeline_status.py +71 -0
- package/backend/tests/test_playwright_extract.py +80 -0
- package/backend/tests/test_post_ingest_hooks.py +162 -0
- package/backend/tests/test_query.py +165 -0
- package/backend/tests/test_thread_expansion.py +72 -0
- package/backend/tests/test_vectors.py +85 -0
- package/backend/uv.lock +1839 -0
- package/bin/business-stack.cjs +412 -0
- package/frontend/web/.env.example +23 -0
- package/frontend/web/AGENTS.md +5 -0
- package/frontend/web/CLAUDE.md +1 -0
- package/frontend/web/README.md +36 -0
- package/frontend/web/components.json +25 -0
- package/frontend/web/next-env.d.ts +6 -0
- package/frontend/web/next.config.ts +30 -0
- package/frontend/web/package.json +65 -0
- package/frontend/web/postcss.config.mjs +7 -0
- package/frontend/web/skills-lock.json +35 -0
- package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
- package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
- package/frontend/web/src/app/chat/page.tsx +725 -0
- package/frontend/web/src/app/favicon.ico +0 -0
- package/frontend/web/src/app/globals.css +563 -0
- package/frontend/web/src/app/layout.tsx +50 -0
- package/frontend/web/src/app/page.tsx +96 -0
- package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
- package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
- package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
- package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
- package/frontend/web/src/components/home-auth-panel.tsx +49 -0
- package/frontend/web/src/components/providers.tsx +50 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
- package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
- package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
- package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
- package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
- package/frontend/web/src/lib/auth-client.ts +23 -0
- package/frontend/web/src/lib/integrations-config.ts +125 -0
- package/frontend/web/src/lib/ui-utills.tsx +90 -0
- package/frontend/web/src/lib/utils.ts +6 -0
- package/frontend/web/tsconfig.json +36 -0
- package/frontend/web/tsconfig.tsbuildinfo +1 -0
- package/frontend/web/vitest.config.ts +14 -0
- package/gateway/.env.example +23 -0
- package/gateway/README.md +13 -0
- package/gateway/package.json +24 -0
- package/gateway/src/auth.ts +49 -0
- package/gateway/src/index.ts +141 -0
- package/gateway/src/integrations/admin.ts +19 -0
- package/gateway/src/integrations/crypto.ts +52 -0
- package/gateway/src/integrations/handlers.ts +124 -0
- package/gateway/src/integrations/keys.ts +12 -0
- package/gateway/src/integrations/store.ts +106 -0
- package/gateway/src/stack-secrets.ts +35 -0
- package/gateway/tsconfig.json +13 -0
- package/package.json +33 -0
- package/turbo.json +27 -0
@@ -0,0 +1,224 @@
+from __future__ import annotations
+
+import json
+import logging
+
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
+
+from app.config import Settings
+from app.services.embeddings.build_inputs import (
+    chunk_to_multimodal_parts,
+    chunk_to_text_parts,
+    load_document_chunks,
+)
+from app.services.embeddings.dlq import (
+    cleaned_ingest_meta_dict,
+    clear_embedding_dlq,
+    merge_ingest_meta,
+    record_embedding_dlq_failure,
+)
+from app.services.embeddings.gemini_api import batch_embed_contents
+from app.services.embeddings.persist import (
+    delete_gemini_embeddings_for_document,
+    persist_chunk_embeddings,
+)
+from app.services.embeddings.types import EmbeddingPart, parts_to_modality
+from app.services.entities.pipeline import extract_and_store_entities_for_document
+from app.services.hooks.post_ingest import dispatch_post_ingest_hooks
+from app.services.integrations_remote import resolve_gemini_api_key
+from app.storage.blobs import BlobStore
+from app.vectorstore import SqliteVecStore
+
+logger = logging.getLogger(__name__)
+
+
+async def embed_document_gemini(
+    session: AsyncSession,
+    *,
+    document_id: str,
+    settings: Settings,
+    store: SqliteVecStore,
+    multimodal: bool = False,
+) -> int:
+    """
+    Idempotent: embed first, then replace vec rows + Gemini ``embeddings`` rows.
+
+    Prior embedding rows for this model are removed only after a successful
+    ``batch_embed_contents`` call so failures do not leave the document without
+    vectors while a retry is pending.
+    """
+    api_key = await resolve_gemini_api_key(settings)
+    if not api_key:
+        msg = "Gemini API key not configured (GEMINI_API_KEY or gateway store)"
+        raise ValueError(msg)
+
+    row = await session.execute(
+        text("SELECT source_id, ingest_meta FROM documents WHERE id = :id LIMIT 1"),
+        {"id": document_id},
+    )
+    doc_row = row.first()
+    if doc_row is None:
+        msg = "document not found"
+        raise ValueError(msg)
+    source_id = int(doc_row[0])
+    prior_meta = doc_row[1]
+
+    chunks = await load_document_chunks(session, document_id)
+    if not chunks:
+        err = "no document_chunks; run POST .../chunks first"
+        raise ValueError(err)
+
+    blob_store = BlobStore(settings.data_dir / "blobs")
+    contents: list[list[EmbeddingPart]] = []
+    modalities: list[str] = []
+    for ch in chunks:
+        if multimodal:
+            parts = await chunk_to_multimodal_parts(
+                session,
+                document_id,
+                ch,
+                blob_store,
+            )
+        else:
+            parts = chunk_to_text_parts(ch)
+        contents.append(parts)
+        modalities.append(parts_to_modality(parts))
+
+    vectors = await batch_embed_contents(
+        api_key=api_key,
+        model=settings.gemini_embedding_model,
+        contents=contents,
+        settings=settings,
+        task_type=settings.gemini_embed_task_type,
+    )
+
+    await delete_gemini_embeddings_for_document(
+        session,
+        document_id=document_id,
+        model=settings.gemini_embedding_model,
+    )
+    # Use session connection: a second pool connection + open txn causes SQLite
+    # "database is locked" on kb_vec_embeddings (single-writer).
+    await store.delete_document_for_session(session, document_id)
+
+    await persist_chunk_embeddings(
+        session,
+        store,
+        document_id=document_id,
+        source_id=source_id,
+        model=settings.gemini_embedding_model,
+        chunks=chunks,
+        vectors=vectors,
+        modalities=modalities,
+        settings=settings,
+    )
+
+    await clear_embedding_dlq(session, document_id=document_id)
+
+    clean = cleaned_ingest_meta_dict(prior_meta)
+    meta_emb = merge_ingest_meta(
+        json.dumps(clean, default=str),
+        {
+            "embedding_model": settings.gemini_embedding_model,
+            "embedding_dim": settings.vector_embedding_dim,
+        },
+    )
+
+    await session.execute(
+        text(
+            "UPDATE documents SET status = 'ok', ingest_meta = :im WHERE id = :id",
+        ),
+        {"id": document_id, "im": meta_emb},
+    )
+
+    try:
+        n_mentions, n_co = await extract_and_store_entities_for_document(
+            session,
+            document_id=document_id,
+            settings=settings,
+        )
+        await session.execute(
+            text("UPDATE documents SET ingest_meta = :im WHERE id = :id"),
+            {
+                "id": document_id,
+                "im": merge_ingest_meta(
+                    meta_emb,
+                    {
+                        "entity_mentions": n_mentions,
+                        "entity_cooccurrence_pairs": n_co,
+                    },
+                ),
+            },
+        )
+    except Exception as ex:
+        logger.exception("Entity extraction failed for %s", document_id)
+        err_meta = merge_ingest_meta(
+            meta_emb,
+            {"entity_extraction_error": str(ex)},
+        )
+        await session.execute(
+            text(
+                "UPDATE documents SET ingest_meta = :im WHERE id = :id",
+            ),
+            {"id": document_id, "im": err_meta},
+        )
+        if settings.entity_extract_strict:
+            await session.execute(
+                text("UPDATE documents SET status = 'failed' WHERE id = :id"),
+                {"id": document_id},
+            )
+            raise
+    return len(chunks)
+
+
+async def run_embed_document_job(
+    *,
+    document_id: str,
+    multimodal: bool,
+    session_factory: async_sessionmaker[AsyncSession],
+    settings: Settings,
+    store: SqliteVecStore,
+) -> None:
+    async with session_factory() as session:
+        try:
+            await embed_document_gemini(
+                session,
+                document_id=document_id,
+                settings=settings,
+                store=store,
+                multimodal=multimodal,
+            )
+            await session.commit()
+            try:
+                await dispatch_post_ingest_hooks(
+                    session_factory,
+                    document_id=document_id,
+                    settings=settings,
+                )
+            except Exception:
+                logger.exception(
+                    "post-ingest hooks failed (document already ok): %s",
+                    document_id,
+                )
+        except Exception as e:
+            logger.exception("Gemini embed failed for %s", document_id)
+            await session.rollback()
+            async with session_factory() as s2:
+                async with s2.begin():
+                    row = await s2.execute(
+                        text(
+                            "SELECT ingest_meta FROM documents WHERE id = :id LIMIT 1",
+                        ),
+                        {"id": document_id},
+                    )
+                    r = row.first()
+                    im = r[0] if r else None
+                    await record_embedding_dlq_failure(
+                        s2,
+                        document_id=document_id,
+                        error_message=str(e),
+                        multimodal=multimodal,
+                        settings=settings,
+                        prior_ingest_meta=im,
+                    )
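The 224-line hunk above lines up with app/services/embeddings/worker.py in the file listing. A minimal sketch of one way a route could queue run_embed_document_job via FastAPI BackgroundTasks follows; it is illustrative only, and the provider stubs (get_session_factory, get_settings, get_vec_store) are hypothetical stand-ins for whatever app/dependencies.py and app/routers/gemini_embed.py actually wire up.

# Illustrative sketch, not part of the package; provider stubs are hypothetical.
from fastapi import APIRouter, BackgroundTasks, Depends
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker

from app.config import Settings
from app.services.embeddings.worker import run_embed_document_job
from app.vectorstore import SqliteVecStore

router = APIRouter()


def get_session_factory() -> async_sessionmaker[AsyncSession]:
    raise NotImplementedError("stand-in; use the app's real provider")


def get_settings() -> Settings:
    raise NotImplementedError("stand-in; use the app's real provider")


def get_vec_store() -> SqliteVecStore:
    raise NotImplementedError("stand-in; use the app's real provider")


@router.post("/documents/{document_id}/embed")
async def queue_embed(
    document_id: str,
    background: BackgroundTasks,
    multimodal: bool = False,
    session_factory: async_sessionmaker[AsyncSession] = Depends(get_session_factory),
    settings: Settings = Depends(get_settings),
    store: SqliteVecStore = Depends(get_vec_store),
) -> dict[str, str]:
    # run_embed_document_job commits on success and records a DLQ row on failure,
    # so the route can return as soon as the task is queued.
    background.add_task(
        run_embed_document_job,
        document_id=document_id,
        multimodal=multimodal,
        session_factory=session_factory,
        settings=settings,
        store=store,
    )
    return {"status": "queued", "document_id": document_id}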
@@ -0,0 +1,12 @@
+from app.services.entities.pipeline import (
+    extract_and_store_entities_for_document,
+    run_entity_extraction_job,
+)
+from app.services.entities.types import EntityType, ExtractedMention
+
+__all__ = [
+    "EntityType",
+    "ExtractedMention",
+    "extract_and_store_entities_for_document",
+    "run_entity_extraction_job",
+]
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+import logging
+import re
+
+from app.services.entities.types import EntityType, ExtractedMention
+
+logger = logging.getLogger(__name__)
+
+_GLINER_LABELS = [
+    "person",
+    "company",
+    "organization",
+    "product",
+    "location",
+    "concept",
+]
+
+_LABEL_MAP: dict[str, EntityType] = {
+    "person": "person",
+    "company": "company",
+    "organization": "company",
+    "product": "product",
+    "location": "location",
+    "concept": "concept",
+}
+
+
+def _normalize_name(s: str) -> str:
+    return re.sub(r"\s+", " ", s.strip())
+
+
+def extract_with_gliner(text: str, *, model_id: str) -> list[ExtractedMention]:
+    """Optional GLiNER zero-shot NER (requires ``gliner`` + a checkpoint)."""
+    if not text.strip():
+        return []
+    try:
+        from gliner import GLiNER  # type: ignore[import-not-found]
+    except ImportError:
+        logger.debug("gliner not installed; skipping GLiNER entity extraction")
+        return []
+
+    try:
+        model = GLiNER.from_pretrained(model_id)
+    except Exception:
+        logger.warning("GLiNER model %r could not load", model_id, exc_info=True)
+        return []
+
+    ents = model.predict_entities(text[:8000], _GLINER_LABELS, threshold=0.35)
+    out: list[ExtractedMention] = []
+    for e in ents:
+        label = str(e.get("label", "")).lower()
+        et = _LABEL_MAP.get(label)
+        if et is None:
+            continue
+        name = _normalize_name(str(e.get("text", "")))
+        if len(name) < 2:
+            continue
+        score = float(e.get("score", 0.6))
+        out.append(
+            ExtractedMention(name=name, type=et, confidence=min(1.0, max(0.0, score)))
+        )
+    return out
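A hypothetical call to the GLiNER extractor above; it needs the optional gliner dependency and a downloadable checkpoint, and the model id shown is only an example (the package's configured default comes from Settings.gliner_model_id).

# Hypothetical usage; requires the optional gliner package.
from app.services.entities.gliner_extract import extract_with_gliner

mentions = extract_with_gliner(
    "Nvidia unveiled new GPUs at GTC in San Jose.",
    model_id="urchade/gliner_medium-v2.1",  # example checkpoint, not the package default
)
for m in mentions:
    print(m.name, m.type, round(m.confidence, 2))
# If gliner is missing or the checkpoint cannot load, the function logs and returns [].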
@@ -0,0 +1,94 @@
+from __future__ import annotations
+
+import json
+import logging
+import re
+
+import httpx
+
+from app.config import Settings
+from app.services.entities.types import ExtractedMention
+
+logger = logging.getLogger(__name__)
+
+_ALLOWED = frozenset({"person", "company", "product", "concept", "location"})
+
+_SYSTEM = """Extract named entities from the user text. Return JSON only:
+{"entities":[{"name":"string","type":"person|company|product|concept|location","confidence":0.0-1.0}]}
+Skip duplicates. Prefer high-precision extractions."""
+
+
+def _normalize_name(s: str) -> str:
+    return re.sub(r"\s+", " ", s.strip())
+
+
+async def extract_with_ollama(
+    text: str,
+    *,
+    settings: Settings,
+    client: httpx.AsyncClient | None = None,
+) -> list[ExtractedMention]:
+    """Optional local LLM (Ollama) with JSON-shaped output for harder cases."""
+    if not text.strip() or not settings.entity_llm_enabled:
+        return []
+    base = str(settings.ollama_base_url).rstrip("/")
+    url = f"{base}/api/chat"
+    body = {
+        "model": settings.ollama_entity_model,
+        "stream": False,
+        "format": "json",
+        "messages": [
+            {"role": "system", "content": _SYSTEM},
+            {
+                "role": "user",
+                "content": text[:16_000],
+            },
+        ],
+    }
+    close = False
+    if client is None:
+        client = httpx.AsyncClient(timeout=120.0)
+        close = True
+    try:
+        r = await client.post(url, json=body)
+        r.raise_for_status()
+        data = r.json()
+        raw = data.get("message", {}).get("content", "")
+        parsed = json.loads(raw)
+        items = parsed.get("entities")
+        if not isinstance(items, list):
+            return []
+        out: list[ExtractedMention] = []
+        for it in items:
+            if not isinstance(it, dict):
+                continue
+            name = it.get("name")
+            typ = it.get("type")
+            conf = it.get("confidence", 0.75)
+            if not isinstance(name, str) or not isinstance(typ, str):
+                continue
+            typ_l = typ.lower().strip()
+            if typ_l not in _ALLOWED:
+                continue
+            try:
+                c = float(conf)
+            except (TypeError, ValueError):
+                c = 0.75
+            c = max(0.0, min(1.0, c))
+            nn = _normalize_name(name)
+            if len(nn) < 2:
+                continue
+            out.append(
+                ExtractedMention(
+                    name=nn,
+                    type=typ_l,  # type: ignore[arg-type]
+                    confidence=c,
+                ),
+            )
+        return out
+    except Exception:
+        logger.exception("Ollama entity extraction failed")
+        return []
+    finally:
+        if close:
+            await client.aclose()
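Because extract_with_ollama accepts an injected httpx.AsyncClient, its JSON handling can be exercised offline. The sketch below uses httpx.MockTransport and a SimpleNamespace standing in for Settings; the stand-in's field values are assumptions based only on the attributes the function reads.

# Offline sketch, not part of the package; fake_settings fields are assumptions.
import asyncio
import json
from types import SimpleNamespace

import httpx

from app.services.entities.llm_extract import extract_with_ollama

fake_settings = SimpleNamespace(
    entity_llm_enabled=True,
    ollama_base_url="http://localhost:11434",
    ollama_entity_model="llama3.1",
)


def handler(request: httpx.Request) -> httpx.Response:
    # Mimic Ollama's /api/chat response shape when format="json" is requested.
    content = json.dumps(
        {"entities": [{"name": "Acme Corp", "type": "company", "confidence": 0.9}]}
    )
    return httpx.Response(200, json={"message": {"content": content}})


async def demo() -> None:
    async with httpx.AsyncClient(transport=httpx.MockTransport(handler)) as client:
        mentions = await extract_with_ollama(
            "Acme Corp shipped a new product.",
            settings=fake_settings,  # type: ignore[arg-type]
            client=client,
        )
    print(mentions)  # [ExtractedMention(name='Acme Corp', type='company', confidence=0.9)]


if __name__ == "__main__":
    asyncio.run(demo())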
@@ -0,0 +1,179 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+from collections.abc import Iterable
+
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
+
+from app.config import Settings
+from app.services.embeddings.build_inputs import load_document_chunks
+from app.services.entities.gliner_extract import extract_with_gliner
+from app.services.entities.llm_extract import extract_with_ollama
+from app.services.entities.spacy_extract import extract_with_spacy
+from app.services.entities.types import ExtractedMention
+
+logger = logging.getLogger(__name__)
+
+
+def _merge_mentions(mentions: Iterable[ExtractedMention]) -> list[ExtractedMention]:
+    """Dedupe by (normalized name, type); keep max confidence."""
+    best: dict[tuple[str, str], ExtractedMention] = {}
+    for m in mentions:
+        key = (m.name.strip().lower(), m.type)
+        prev = best.get(key)
+        if prev is None or m.confidence > prev.confidence:
+            best[key] = m
+    return list(best.values())
+
+
+def _baseline_sync(text: str, settings: Settings) -> list[ExtractedMention]:
+    out: list[ExtractedMention] = []
+    if settings.entity_use_spacy:
+        out.extend(extract_with_spacy(text, model_name=settings.spacy_model))
+    if settings.entity_use_gliner:
+        out.extend(
+            extract_with_gliner(text, model_id=settings.gliner_model_id),
+        )
+    return _merge_mentions(out)
+
+
+async def extract_mentions_for_chunk_text(
+    text: str,
+    settings: Settings,
+) -> list[ExtractedMention]:
+    baseline = await asyncio.to_thread(_baseline_sync, text, settings)
+    merged = baseline
+    sparse = len(merged) < settings.entity_llm_min_mentions
+    if settings.entity_llm_enabled and (sparse or settings.entity_llm_on_all_chunks):
+        llm_ents = await extract_with_ollama(text, settings=settings)
+        merged = _merge_mentions([*merged, *llm_ents])
+    return merged
+
+
+async def get_or_create_entity_id(
+    session: AsyncSession,
+    *,
+    name: str,
+    entity_type: str,
+) -> int:
+    row = await session.execute(
+        text(
+            "SELECT id FROM entities WHERE lower(trim(name)) = lower(trim(:n)) "
+            "AND type = :t LIMIT 1",
+        ),
+        {"n": name, "t": entity_type},
+    )
+    found = row.first()
+    if found is not None:
+        return int(found[0])
+    ins = await session.execute(
+        text(
+            "INSERT INTO entities (name, type, meta) VALUES (:n, :t, NULL) "
+            "RETURNING id",
+        ),
+        {"n": name.strip(), "t": entity_type},
+    )
+    return int(ins.scalar_one())
+
+
+async def extract_and_store_entities_for_document(
+    session: AsyncSession,
+    *,
+    document_id: str,
+    settings: Settings,
+) -> tuple[int, int]:
+    """
+    Replace prior mentions/co-occurrence for this document, then insert fresh rows.
+    Caller should commit with the same session as embedding writes.
+    Returns (mention_rows, cooccurrence_rows).
+    """
+    chunks = await load_document_chunks(session, document_id)
+    if not chunks:
+        return 0, 0
+
+    await session.execute(
+        text("DELETE FROM entity_mentions WHERE document_id = :d"),
+        {"d": document_id},
+    )
+    await session.execute(
+        text("DELETE FROM entity_cooccurrence WHERE document_id = :d"),
+        {"d": document_id},
+    )
+
+    doc_entity_ids: set[int] = set()
+    mention_count = 0
+
+    for ch in chunks:
+        mentions = await extract_mentions_for_chunk_text(ch.text, settings)
+        chunk_entity_ids: set[int] = set()
+        for m in mentions:
+            eid = await get_or_create_entity_id(
+                session,
+                name=m.name,
+                entity_type=m.type,
+            )
+            await session.execute(
+                text(
+                    "INSERT INTO entity_mentions "
+                    "(document_id, entity_id, document_chunk_id, confidence) "
+                    "VALUES (:doc, :eid, :cid, :conf)",
+                ),
+                {
+                    "doc": document_id,
+                    "eid": eid,
+                    "cid": ch.id,
+                    "conf": m.confidence,
+                },
+            )
+            mention_count += 1
+            doc_entity_ids.add(eid)
+            chunk_entity_ids.add(eid)
+
+        logger.debug(
+            "chunk %s document %s: %s mentions, %s unique entities",
+            ch.id,
+            document_id,
+            len(mentions),
+            len(chunk_entity_ids),
+        )
+
+    sorted_ids = sorted(doc_entity_ids)
+    co_pairs = 0
+    for i in range(len(sorted_ids)):
+        for j in range(i + 1, len(sorted_ids)):
+            lo, hi = sorted_ids[i], sorted_ids[j]
+            if lo > hi:
+                lo, hi = hi, lo
+            await session.execute(
+                text(
+                    "INSERT INTO entity_cooccurrence "
+                    "(entity_low_id, entity_high_id, document_id, weight) "
+                    "VALUES (:lo, :hi, :doc, 1.0)",
+                ),
+                {"lo": lo, "hi": hi, "doc": document_id},
+            )
+            co_pairs += 1
+
+    return mention_count, co_pairs
+
+
+async def run_entity_extraction_job(
+    *,
+    document_id: str,
+    session_factory: async_sessionmaker[AsyncSession],
+    settings: Settings,
+) -> None:
+    """Background entrypoint (same pattern as embedding worker)."""
+    async with session_factory() as session:
+        try:
+            await extract_and_store_entities_for_document(
+                session,
+                document_id=document_id,
+                settings=settings,
+            )
+            await session.commit()
+        except Exception:
+            logger.exception("entity extraction failed for %s", document_id)
+            await session.rollback()
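A small illustration of the dedupe rule in _merge_mentions above: mentions are keyed by (lowercased, stripped name, type) and the highest-confidence mention wins. Importing the private helper here is purely for demonstration.

# Demonstration only; imports a private helper from the hunk above.
from app.services.entities.pipeline import _merge_mentions
from app.services.entities.types import ExtractedMention

mentions = [
    ExtractedMention(name="OpenAI", type="company", confidence=0.82),  # spaCy-style
    ExtractedMention(name="openai", type="company", confidence=0.95),  # LLM-style
    ExtractedMention(name="OpenAI", type="concept", confidence=0.40),  # different type
]
merged = _merge_mentions(mentions)
# Two survivors: the ("openai", "company") key keeps the 0.95 mention, and the
# ("openai", "concept") key keeps the 0.40 mention.
for m in merged:
    print(m.name, m.type, m.confidence)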
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+import logging
+import re
+from functools import lru_cache
+
+from app.services.entities.types import EntityType, ExtractedMention
+
+logger = logging.getLogger(__name__)
+
+_SPACY_LABEL_TO_TYPE: dict[str, EntityType] = {
+    "PERSON": "person",
+    "ORG": "company",
+    "GPE": "location",
+    "LOC": "location",
+    "FAC": "location",
+    "PRODUCT": "product",
+    "EVENT": "concept",
+    "WORK_OF_ART": "concept",
+    "LAW": "concept",
+    "NORP": "concept",
+    "LANGUAGE": "concept",
+}
+
+
+def _normalize_name(s: str) -> str:
+    t = re.sub(r"\s+", " ", s.strip())
+    return t
+
+
+@lru_cache(maxsize=1)
+def _load_nlp(model_name: str):
+    try:
+        import spacy
+
+        return spacy.load(model_name)
+    except Exception:
+        logger.warning(
+            "spaCy model %r not available; install optional [entities] deps",
+            model_name,
+            exc_info=True,
+        )
+        return None
+
+
+def extract_with_spacy(text: str, *, model_name: str) -> list[ExtractedMention]:
+    """Baseline NER: spaCy labels mapped to person/company/product/concept/location."""
+    if not text or not text.strip():
+        return []
+    nlp = _load_nlp(model_name)
+    if nlp is None:
+        return []
+    doc = nlp(text[:1_000_000])
+    out: list[ExtractedMention] = []
+    for ent in doc.ents:
+        et = _SPACY_LABEL_TO_TYPE.get(ent.label_)
+        if et is None:
+            continue
+        name = _normalize_name(ent.text)
+        if len(name) < 2:
+            continue
+        out.append(ExtractedMention(name=name, type=et, confidence=0.82))
+    return out
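Hypothetical usage of the spaCy baseline above; it needs the optional spaCy dependency plus an installed model such as en_core_web_sm (the package's actual default lives in Settings.spacy_model).

# Hypothetical usage; en_core_web_sm is an example model name, not the package default.
from app.services.entities.spacy_extract import extract_with_spacy

mentions = extract_with_spacy(
    "Tim Cook announced new Apple products in Cupertino.",
    model_name="en_core_web_sm",
)
for m in mentions:
    print(m.name, m.type, m.confidence)
# With the model installed, typical output maps ORG -> company and GPE -> location,
# each at the fixed 0.82 confidence; without it, the loader logs a warning and the
# function returns [].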
@@ -0,0 +1,15 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Literal
+
+EntityType = Literal["person", "company", "product", "concept", "location"]
+
+
+@dataclass(frozen=True, slots=True)
+class ExtractedMention:
+    """One surface-form mention with coarse type and confidence."""
+
+    name: str
+    type: EntityType
+    confidence: float