business-stack 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.python-version +1 -0
- package/backend/.env.example +65 -0
- package/backend/alembic/env.py +63 -0
- package/backend/alembic/script.py.mako +26 -0
- package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
- package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
- package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
- package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
- package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
- package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
- package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
- package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
- package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
- package/backend/alembic.ini +42 -0
- package/backend/app/__init__.py +0 -0
- package/backend/app/config.py +337 -0
- package/backend/app/connectors/__init__.py +13 -0
- package/backend/app/connectors/base.py +39 -0
- package/backend/app/connectors/builtins.py +51 -0
- package/backend/app/connectors/playwright_session.py +146 -0
- package/backend/app/connectors/registry.py +68 -0
- package/backend/app/connectors/thread_expansion/__init__.py +33 -0
- package/backend/app/connectors/thread_expansion/fakes.py +154 -0
- package/backend/app/connectors/thread_expansion/models.py +113 -0
- package/backend/app/connectors/thread_expansion/reddit.py +53 -0
- package/backend/app/connectors/thread_expansion/twitter.py +49 -0
- package/backend/app/db.py +5 -0
- package/backend/app/dependencies.py +34 -0
- package/backend/app/logging_config.py +35 -0
- package/backend/app/main.py +97 -0
- package/backend/app/middleware/__init__.py +0 -0
- package/backend/app/middleware/gateway_identity.py +17 -0
- package/backend/app/middleware/openapi_gateway.py +71 -0
- package/backend/app/middleware/request_id.py +23 -0
- package/backend/app/openapi_config.py +126 -0
- package/backend/app/routers/__init__.py +0 -0
- package/backend/app/routers/admin_pipeline.py +123 -0
- package/backend/app/routers/chat.py +206 -0
- package/backend/app/routers/chunks.py +36 -0
- package/backend/app/routers/entity_extract.py +31 -0
- package/backend/app/routers/example.py +8 -0
- package/backend/app/routers/gemini_embed.py +58 -0
- package/backend/app/routers/health.py +28 -0
- package/backend/app/routers/ingestion.py +146 -0
- package/backend/app/routers/link_expansion.py +34 -0
- package/backend/app/routers/pipeline_status.py +304 -0
- package/backend/app/routers/query.py +63 -0
- package/backend/app/routers/vectors.py +63 -0
- package/backend/app/schemas/__init__.py +0 -0
- package/backend/app/schemas/canonical.py +44 -0
- package/backend/app/schemas/chat.py +50 -0
- package/backend/app/schemas/ingest.py +29 -0
- package/backend/app/schemas/query.py +153 -0
- package/backend/app/schemas/vectors.py +56 -0
- package/backend/app/services/__init__.py +0 -0
- package/backend/app/services/chat_store.py +152 -0
- package/backend/app/services/chunking/__init__.py +3 -0
- package/backend/app/services/chunking/llm_boundaries.py +63 -0
- package/backend/app/services/chunking/schemas.py +30 -0
- package/backend/app/services/chunking/semantic_chunk.py +178 -0
- package/backend/app/services/chunking/splitters.py +214 -0
- package/backend/app/services/embeddings/__init__.py +20 -0
- package/backend/app/services/embeddings/build_inputs.py +140 -0
- package/backend/app/services/embeddings/dlq.py +128 -0
- package/backend/app/services/embeddings/gemini_api.py +207 -0
- package/backend/app/services/embeddings/persist.py +74 -0
- package/backend/app/services/embeddings/types.py +32 -0
- package/backend/app/services/embeddings/worker.py +224 -0
- package/backend/app/services/entities/__init__.py +12 -0
- package/backend/app/services/entities/gliner_extract.py +63 -0
- package/backend/app/services/entities/llm_extract.py +94 -0
- package/backend/app/services/entities/pipeline.py +179 -0
- package/backend/app/services/entities/spacy_extract.py +63 -0
- package/backend/app/services/entities/types.py +15 -0
- package/backend/app/services/gemini_chat.py +113 -0
- package/backend/app/services/hooks/__init__.py +3 -0
- package/backend/app/services/hooks/post_ingest.py +186 -0
- package/backend/app/services/ingestion/__init__.py +0 -0
- package/backend/app/services/ingestion/persist.py +188 -0
- package/backend/app/services/integrations_remote.py +91 -0
- package/backend/app/services/link_expansion/__init__.py +3 -0
- package/backend/app/services/link_expansion/canonical_url.py +45 -0
- package/backend/app/services/link_expansion/domain_policy.py +26 -0
- package/backend/app/services/link_expansion/html_extract.py +72 -0
- package/backend/app/services/link_expansion/rate_limit.py +32 -0
- package/backend/app/services/link_expansion/robots.py +46 -0
- package/backend/app/services/link_expansion/schemas.py +67 -0
- package/backend/app/services/link_expansion/worker.py +458 -0
- package/backend/app/services/normalization/__init__.py +7 -0
- package/backend/app/services/normalization/normalizer.py +331 -0
- package/backend/app/services/normalization/persist_normalized.py +67 -0
- package/backend/app/services/playwright_extract/__init__.py +13 -0
- package/backend/app/services/playwright_extract/__main__.py +96 -0
- package/backend/app/services/playwright_extract/extract.py +181 -0
- package/backend/app/services/retrieval_service.py +351 -0
- package/backend/app/sqlite_ext.py +36 -0
- package/backend/app/storage/__init__.py +3 -0
- package/backend/app/storage/blobs.py +30 -0
- package/backend/app/vectorstore/__init__.py +13 -0
- package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
- package/backend/backend.egg-info/PKG-INFO +18 -0
- package/backend/backend.egg-info/SOURCES.txt +93 -0
- package/backend/backend.egg-info/dependency_links.txt +1 -0
- package/backend/backend.egg-info/entry_points.txt +2 -0
- package/backend/backend.egg-info/requires.txt +15 -0
- package/backend/backend.egg-info/top_level.txt +4 -0
- package/backend/package.json +15 -0
- package/backend/pyproject.toml +52 -0
- package/backend/tests/conftest.py +40 -0
- package/backend/tests/test_chat.py +92 -0
- package/backend/tests/test_chunking.py +132 -0
- package/backend/tests/test_entities.py +170 -0
- package/backend/tests/test_gemini_embed.py +224 -0
- package/backend/tests/test_health.py +24 -0
- package/backend/tests/test_ingest_raw.py +123 -0
- package/backend/tests/test_link_expansion.py +241 -0
- package/backend/tests/test_main.py +12 -0
- package/backend/tests/test_normalizer.py +114 -0
- package/backend/tests/test_openapi_gateway.py +40 -0
- package/backend/tests/test_pipeline_hardening.py +285 -0
- package/backend/tests/test_pipeline_status.py +71 -0
- package/backend/tests/test_playwright_extract.py +80 -0
- package/backend/tests/test_post_ingest_hooks.py +162 -0
- package/backend/tests/test_query.py +165 -0
- package/backend/tests/test_thread_expansion.py +72 -0
- package/backend/tests/test_vectors.py +85 -0
- package/backend/uv.lock +1839 -0
- package/bin/business-stack.cjs +412 -0
- package/frontend/web/.env.example +23 -0
- package/frontend/web/AGENTS.md +5 -0
- package/frontend/web/CLAUDE.md +1 -0
- package/frontend/web/README.md +36 -0
- package/frontend/web/components.json +25 -0
- package/frontend/web/next-env.d.ts +6 -0
- package/frontend/web/next.config.ts +30 -0
- package/frontend/web/package.json +65 -0
- package/frontend/web/postcss.config.mjs +7 -0
- package/frontend/web/skills-lock.json +35 -0
- package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
- package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
- package/frontend/web/src/app/chat/page.tsx +725 -0
- package/frontend/web/src/app/favicon.ico +0 -0
- package/frontend/web/src/app/globals.css +563 -0
- package/frontend/web/src/app/layout.tsx +50 -0
- package/frontend/web/src/app/page.tsx +96 -0
- package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
- package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
- package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
- package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
- package/frontend/web/src/components/home-auth-panel.tsx +49 -0
- package/frontend/web/src/components/providers.tsx +50 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
- package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
- package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
- package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
- package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
- package/frontend/web/src/lib/auth-client.ts +23 -0
- package/frontend/web/src/lib/integrations-config.ts +125 -0
- package/frontend/web/src/lib/ui-utills.tsx +90 -0
- package/frontend/web/src/lib/utils.ts +6 -0
- package/frontend/web/tsconfig.json +36 -0
- package/frontend/web/tsconfig.tsbuildinfo +1 -0
- package/frontend/web/vitest.config.ts +14 -0
- package/gateway/.env.example +23 -0
- package/gateway/README.md +13 -0
- package/gateway/package.json +24 -0
- package/gateway/src/auth.ts +49 -0
- package/gateway/src/index.ts +141 -0
- package/gateway/src/integrations/admin.ts +19 -0
- package/gateway/src/integrations/crypto.ts +52 -0
- package/gateway/src/integrations/handlers.ts +124 -0
- package/gateway/src/integrations/keys.ts +12 -0
- package/gateway/src/integrations/store.ts +106 -0
- package/gateway/src/stack-secrets.ts +35 -0
- package/gateway/tsconfig.json +13 -0
- package/package.json +33 -0
- package/turbo.json +27 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from collections.abc import Iterator
|
|
5
|
+
from typing import Any, ClassVar
|
|
6
|
+
|
|
7
|
+
from app.schemas.ingest import RawIngestEnvelope
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Connector(ABC):
    """Base class for source plugins.

    A connector can validate its configuration, optionally pre-process an
    envelope before persistence, optionally act as a batch source, and must
    know how to normalize a raw envelope for the downstream pipeline.
    """

    # Registry key for this connector; every concrete subclass must set it.
    name: ClassVar[str]

    def prepare_envelope(
        self,
        envelope: RawIngestEnvelope,
        config: dict[str, Any],
    ) -> RawIngestEnvelope:
        """Return the envelope to persist; the default is the identity.

        Connectors that fetch remote content (e.g. a browser session)
        override this so ``raw_content`` and canonical normalization see
        the extracted text.
        """
        return envelope

    def validate_config(self, config: dict[str, Any]) -> None:
        """Raise ValueError if *config* is invalid for this connector."""

    def fetch_iter(self, config: dict[str, Any]) -> Iterator[Any]:
        """Optional batch source; not used by POST /ingest/raw."""
        message = f"Connector {self.name!r} does not implement fetch_iter()"
        raise NotImplementedError(message)

    @abstractmethod
    def normalize_raw(self, envelope: RawIngestEnvelope) -> dict[str, Any]:
        """Produce a normalized dict for downstream pipeline steps."""
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from app.connectors.base import Connector
|
|
6
|
+
from app.connectors.registry import register_connector
|
|
7
|
+
from app.schemas.ingest import RawIngestEnvelope
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class GenericConnector(Connector):
    """Fallback connector: summarizes the envelope without interpreting it."""

    name = "generic"

    def normalize_raw(self, envelope: RawIngestEnvelope) -> dict[str, Any]:
        # Record only the payload's Python type name, never its contents.
        payload_kind = type(envelope.payload).__name__
        return {
            "source": envelope.source,
            "content_type": envelope.content_type,
            "payload_type": payload_kind,
        }
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class EchoConnector(Connector):
    """Passes through key envelope fields for tests and debugging."""

    name = "echo"

    def normalize_raw(self, envelope: RawIngestEnvelope) -> dict[str, Any]:
        # Sorted keys keep the output deterministic for test assertions.
        key_list = sorted(envelope.metadata)
        return {
            "source": envelope.source,
            "content_type": envelope.content_type,
            "metadata_keys": key_list,
        }
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class StrictConnector(Connector):
    """Validates metadata.connector_config for integration tests."""

    name = "strict"

    def validate_config(self, config: dict[str, Any]) -> None:
        # Only the exact boolean True is accepted; truthy values are rejected.
        ok_flag = config.get("ok")
        if ok_flag is not True:
            raise ValueError("connector_config.ok must be true")

    def normalize_raw(self, envelope: RawIngestEnvelope) -> dict[str, Any]:
        return {"strict": True, "source": envelope.source}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def register_builtin_connectors() -> None:
    """Register the connectors that ship with this package."""
    for connector_cls in (GenericConnector, StrictConnector):
        register_connector(connector_cls)
    # Importing the module runs its @register_connector decorator as a
    # side effect, which registers PlaywrightSessionConnector.
    import app.connectors.playwright_session  # noqa: F401
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Ingest connector: fetch **visible text** via Playwright + persistent user-data dir.
|
|
3
|
+
|
|
4
|
+
Uses the same ``RawIngestEnvelope`` as other connectors; ``prepare_envelope``
|
|
5
|
+
runs the browser **before** the document is persisted so ``raw_content`` and
|
|
6
|
+
canonical normalization include extracted text.
|
|
7
|
+
|
|
8
|
+
**Envelope shape**
|
|
9
|
+
|
|
10
|
+
- **URL**: ``metadata.url`` *or* string ``payload`` *or* ``payload["url"]``.
|
|
11
|
+
- **Profile directory**: ``metadata.playwright_user_data_dir`` *or*
|
|
12
|
+
``connector_config.user_data_dir`` (config wins if both set — see code).
|
|
13
|
+
|
|
14
|
+
**connector_config** (safety / tuning)
|
|
15
|
+
|
|
16
|
+
- ``allowlisted_hosts`` (required): non-empty list of host strings; supports
|
|
17
|
+
``*.example.com`` suffix patterns (see ``host_matches_allowlist``).
|
|
18
|
+
- ``navigation_timeout_ms`` (optional, default 30000).
|
|
19
|
+
- ``max_response_chars`` (optional, default 500000).
|
|
20
|
+
- ``headless`` (optional, default true).
|
|
21
|
+
- ``browser_channel`` (optional): e.g. ``"chrome"`` to use installed Google Chrome.
|
|
22
|
+
|
|
23
|
+
**Credentials / compliance**
|
|
24
|
+
|
|
25
|
+
- This connector does not ship API keys; authenticated pages rely on **cookies and
|
|
26
|
+
session state** already stored in the persistent profile directory.
|
|
27
|
+
- You are responsible for site terms, login policies, and data handling.
|
|
28
|
+
|
|
29
|
+
**Windows / profile locking** (summary)
|
|
30
|
+
|
|
31
|
+
- Use a **dedicated** profile folder for automation; **close** all browser
|
|
32
|
+
windows using that profile before ingest; avoid concurrent runs on the same
|
|
33
|
+
path. Full notes: ``app.services.playwright_extract.extract`` module docstring.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
from __future__ import annotations
|
|
37
|
+
|
|
38
|
+
from typing import Any
|
|
39
|
+
|
|
40
|
+
from app.connectors.base import Connector
|
|
41
|
+
from app.connectors.registry import register_connector
|
|
42
|
+
from app.schemas.ingest import RawIngestEnvelope
|
|
43
|
+
from app.services.playwright_extract.extract import extract_visible_text_sync
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _resolve_target_url(envelope: RawIngestEnvelope) -> str:
|
|
47
|
+
md = envelope.metadata.get("url")
|
|
48
|
+
if isinstance(md, str) and md.strip():
|
|
49
|
+
return md.strip()
|
|
50
|
+
if isinstance(envelope.payload, str) and envelope.payload.strip():
|
|
51
|
+
return envelope.payload.strip()
|
|
52
|
+
if isinstance(envelope.payload, dict):
|
|
53
|
+
u = envelope.payload.get("url")
|
|
54
|
+
if isinstance(u, str) and u.strip():
|
|
55
|
+
return u.strip()
|
|
56
|
+
msg = "Set metadata.url or payload (string or {url: ...}) for playwright_session"
|
|
57
|
+
raise ValueError(msg)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _resolve_user_data_dir(envelope: RawIngestEnvelope, config: dict[str, Any]) -> str:
|
|
61
|
+
cfg_dir = config.get("user_data_dir")
|
|
62
|
+
if isinstance(cfg_dir, str) and cfg_dir.strip():
|
|
63
|
+
return cfg_dir.strip()
|
|
64
|
+
md_dir = envelope.metadata.get("playwright_user_data_dir")
|
|
65
|
+
if isinstance(md_dir, str) and md_dir.strip():
|
|
66
|
+
return md_dir.strip()
|
|
67
|
+
msg = (
|
|
68
|
+
"Set connector_config.user_data_dir or metadata.playwright_user_data_dir "
|
|
69
|
+
"to the persistent Chrome/Chromium user data directory"
|
|
70
|
+
)
|
|
71
|
+
raise ValueError(msg)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@register_connector
class PlaywrightSessionConnector(Connector):
    """Fetches visible page text via a persistent Playwright profile."""

    name = "playwright_session"

    def validate_config(self, config: dict[str, Any]) -> None:
        """Require allowlisted_hosts: a non-empty list of non-empty strings."""
        hosts = config.get("allowlisted_hosts")
        if not (isinstance(hosts, list) and hosts):
            raise ValueError(
                "connector_config.allowlisted_hosts must be a non-empty list",
            )
        for host in hosts:
            if not (isinstance(host, str) and host.strip()):
                raise ValueError("allowlisted_hosts entries must be non-empty strings")

    def prepare_envelope(
        self,
        envelope: RawIngestEnvelope,
        config: dict[str, Any],
    ) -> RawIngestEnvelope:
        """Run the browser before persistence and rewrite the envelope.

        The extracted text becomes the payload so ``raw_content`` and
        canonical normalization operate on it; extraction details go into
        ``metadata.playwright_extraction``.
        """
        target_url = _resolve_target_url(envelope)
        profile_dir = _resolve_user_data_dir(envelope, config)
        timeout_ms = int(config.get("navigation_timeout_ms", 30_000))
        char_cap = int(config.get("max_response_chars", 500_000))
        run_headless = bool(config.get("headless", True))
        raw_channel = config.get("browser_channel")
        channel = (
            str(raw_channel)
            if isinstance(raw_channel, str) and raw_channel.strip()
            else None
        )
        # validate_config guarantees the key exists; drop blank entries anyway.
        allow = [str(h).strip() for h in config["allowlisted_hosts"] if str(h).strip()]

        result = extract_visible_text_sync(
            target_url,
            profile_dir,
            allowlisted_hosts=allow,
            navigation_timeout_ms=timeout_ms,
            max_response_chars=char_cap,
            headless=run_headless,
            browser_channel=channel,
        )

        enriched_meta = dict(envelope.metadata)
        enriched_meta["playwright_extraction"] = {
            "requested_url": target_url,
            "final_url": result.final_url,
            "title": result.title,
            "truncated": result.truncated,
            "profile_meta": result.meta,
        }

        new_payload: dict[str, Any] = {
            "text": result.visible_text,
            "title": result.title,
            "url": result.final_url,
            "source_url": target_url,
        }

        return envelope.model_copy(
            update={
                "content_type": "text",
                "metadata": enriched_meta,
                "payload": new_payload,
            },
        )

    def normalize_raw(self, envelope: RawIngestEnvelope) -> dict[str, Any]:
        """Summarize the extraction for downstream pipeline steps."""
        extraction = envelope.metadata.get("playwright_extraction")
        payload = envelope.payload
        extracted = payload.get("text") if isinstance(payload, dict) else None
        return {
            "connector": self.name,
            "playwright": extraction if isinstance(extraction, dict) else None,
            "extracted_text_chars": len(extracted) if isinstance(extracted, str) else 0,
        }
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from importlib.metadata import entry_points
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from app.connectors.base import Connector
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
_CONNECTORS: dict[str, type[Connector]] = {}
|
|
13
|
+
_INITIALIZED = False
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def register_connector(cls: type[Connector]) -> type[Connector]:
    """Register *cls* under its ``name``; usable as a class decorator."""
    connector_key = cls.name
    # Last registration wins; warn so silent overrides are visible in logs.
    if connector_key in _CONNECTORS:
        logger.warning("Overwriting connector registration for key %r", connector_key)
    _CONNECTORS[connector_key] = cls
    return cls
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_connector_class(key: str) -> type[Connector]:
    """Look up a registered connector class by its registry key.

    Raises:
        KeyError: if *key* is not registered. The message lists the known
            keys, which makes a missing ``init_connectors()`` call obvious.
    """
    try:
        return _CONNECTORS[key]
    except KeyError:
        known = ", ".join(sorted(_CONNECTORS)) or "<none registered>"
        raise KeyError(
            f"Unknown connector {key!r}; known connectors: {known}"
        ) from None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def list_connector_keys() -> list[str]:
    """Return all registered connector keys in sorted order."""
    return sorted(_CONNECTORS)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _load_entrypoint_connectors() -> None:
    """Discover and register Connector subclasses advertised via kb.connectors."""
    # Imported here rather than at module scope; TYPE_CHECKING covers annotations.
    from app.connectors.base import Connector as ConnectorBase

    try:
        eps = entry_points(group="kb.connectors")
    except TypeError:
        # Older importlib.metadata API without the group keyword.
        eps = entry_points().select(group="kb.connectors")

    for ep in eps:
        try:
            candidate = ep.load()
        except Exception:
            # One broken plugin must not block the rest.
            logger.exception("Failed to load connector entry point %r", ep.name)
            continue
        if not (isinstance(candidate, type) and issubclass(candidate, ConnectorBase)):
            logger.warning(
                "Entry point %r did not resolve to a Connector subclass",
                ep.name,
            )
            continue
        register_connector(candidate)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _load_builtin_connectors() -> None:
    """Register the connectors shipped in ``app.connectors.builtins``."""
    # Imported lazily (presumably to avoid an import cycle with the
    # registry at module load time — confirm).
    from app.connectors.builtins import register_builtin_connectors

    register_builtin_connectors()
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def init_connectors() -> None:
    """Idempotent: register built-ins and kb.connectors entry points."""
    global _INITIALIZED
    if not _INITIALIZED:
        _load_builtin_connectors()
        _load_entrypoint_connectors()
        # Set only after both loaders succeed, so a failed init can retry.
        _INITIALIZED = True
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Thread expansion: social threads via **official APIs** (when implemented).
|
|
3
|
+
|
|
4
|
+
Models: ``ThreadDocumentNode``, ``ThreadRelationshipEdge``, ``ThreadExpansionResult``
|
|
5
|
+
for a document + ``relationships`` tree (``reply`` / ``thread`` edges).
|
|
6
|
+
|
|
7
|
+
Abstract fetchers: ``TwitterThreadExpansionFetcher``, ``RedditThreadExpansionFetcher``
|
|
8
|
+
(compliance notes in their docstrings). Fakes: deterministic tests / local use.
|
|
9
|
+
|
|
10
|
+
Production: implement with documented X and Reddit APIs; do not use scraping.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from app.connectors.thread_expansion.fakes import (
|
|
14
|
+
FakeRedditThreadExpansionFetcher,
|
|
15
|
+
FakeTwitterThreadExpansionFetcher,
|
|
16
|
+
)
|
|
17
|
+
from app.connectors.thread_expansion.models import (
|
|
18
|
+
ThreadDocumentNode,
|
|
19
|
+
ThreadExpansionResult,
|
|
20
|
+
ThreadRelationshipEdge,
|
|
21
|
+
)
|
|
22
|
+
from app.connectors.thread_expansion.reddit import RedditThreadExpansionFetcher
|
|
23
|
+
from app.connectors.thread_expansion.twitter import TwitterThreadExpansionFetcher
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"FakeRedditThreadExpansionFetcher",
|
|
27
|
+
"FakeTwitterThreadExpansionFetcher",
|
|
28
|
+
"RedditThreadExpansionFetcher",
|
|
29
|
+
"ThreadDocumentNode",
|
|
30
|
+
"ThreadExpansionResult",
|
|
31
|
+
"ThreadRelationshipEdge",
|
|
32
|
+
"TwitterThreadExpansionFetcher",
|
|
33
|
+
]
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Deterministic fake thread fetchers for unit tests and local development.
|
|
3
|
+
|
|
4
|
+
No network calls; no credentials required.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from datetime import UTC, datetime
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from app.connectors.thread_expansion.models import (
|
|
13
|
+
ThreadDocumentNode,
|
|
14
|
+
ThreadExpansionResult,
|
|
15
|
+
ThreadRelationshipEdge,
|
|
16
|
+
)
|
|
17
|
+
from app.connectors.thread_expansion.reddit import RedditThreadExpansionFetcher
|
|
18
|
+
from app.connectors.thread_expansion.twitter import TwitterThreadExpansionFetcher
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class FakeTwitterThreadExpansionFetcher(TwitterThreadExpansionFetcher):
    """
    Returns a small linear 3-tweet thread (root → reply → reply).

    ``conversation_id`` is echoed into node ``meta`` only; fetch always returns
    the same structure for testing.
    """

    async def fetch_full_thread(
        self,
        conversation_id: str,
        *,
        config: dict[str, Any],
    ) -> ThreadExpansionResult:
        # Fake fetcher: config is intentionally ignored.
        _ = config
        # Fixed timestamp keeps fixtures deterministic across runs.
        t0 = datetime(2026, 1, 1, 12, 0, 0, tzinfo=UTC)
        nodes = [
            ThreadDocumentNode(
                document_id="twitter:tw_root",
                external_id="1999999999999999999",
                text="Root post about the project launch.",
                author_handle="example_user",
                created_at=t0,
                # conversation_id is echoed only on the root node's meta.
                meta={"conversation_id": conversation_id, "kind": "tweet"},
            ),
            ThreadDocumentNode(
                document_id="twitter:tw_r1",
                external_id="1999999999999999998",
                text="Reply: congratulations!",
                author_handle="fan_one",
                created_at=t0,
                meta={"kind": "tweet"},
            ),
            ThreadDocumentNode(
                document_id="twitter:tw_r2",
                external_id="1999999999999999997",
                text="Reply: when is v2 shipping?",
                author_handle="fan_two",
                created_at=t0,
                meta={"kind": "tweet"},
            ),
        ]
        # Two "reply" edges form the authoritative tree; the extra "thread"
        # edge is a deliberately weak root → leaf link with a low weight.
        edges = [
            ThreadRelationshipEdge(
                parent_document_id="twitter:tw_root",
                child_document_id="twitter:tw_r1",
                relation_type="reply",
                meta={"platform": "twitter"},
            ),
            ThreadRelationshipEdge(
                parent_document_id="twitter:tw_r1",
                child_document_id="twitter:tw_r2",
                relation_type="reply",
                meta={"platform": "twitter"},
            ),
            ThreadRelationshipEdge(
                parent_document_id="twitter:tw_root",
                child_document_id="twitter:tw_r2",
                relation_type="thread",
                weight=0.3,
                meta={"platform": "twitter", "note": "optional weak thread link"},
            ),
        ]
        return ThreadExpansionResult(
            platform="twitter",
            thread_key=conversation_id,
            root_document_id="twitter:tw_root",
            nodes=nodes,
            edges=edges,
        )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class FakeRedditThreadExpansionFetcher(RedditThreadExpansionFetcher):
    """
    Returns a post plus two nested comments (linear reply chain).

    Ignores ``post_id`` except to set ``thread_key`` for stable tests.
    """

    async def fetch_post_and_comments(
        self,
        post_id: str,
        *,
        config: dict[str, Any],
    ) -> ThreadExpansionResult:
        # Fake fetcher: config is intentionally ignored.
        _ = config
        # Fixed timestamp keeps fixtures deterministic across runs.
        t0 = datetime(2026, 2, 1, 9, 30, 0, tzinfo=UTC)
        nodes = [
            # Submission (t3_) followed by two comments (t1_) at depths 0 and 1.
            ThreadDocumentNode(
                document_id="reddit:t3_abc123",
                external_id="t3_abc123",
                text="Ask HN: Best patterns for thread-safe queues?",
                author_handle="op_dev",
                created_at=t0,
                meta={"subreddit": "test", "score": 42, "permalink": "/r/test/abc"},
            ),
            ThreadDocumentNode(
                document_id="reddit:t1_cmnt1",
                external_id="t1_cmnt1",
                text="We use channels and bounded queues.",
                author_handle="commenter_a",
                created_at=t0,
                meta={"depth": 0},
            ),
            ThreadDocumentNode(
                document_id="reddit:t1_cmnt2",
                external_id="t1_cmnt2",
                text="Plus backpressure on the producer side.",
                author_handle="commenter_b",
                created_at=t0,
                meta={"depth": 1},
            ),
        ]
        # Linear reply chain: post → comment 1 → comment 2.
        edges = [
            ThreadRelationshipEdge(
                parent_document_id="reddit:t3_abc123",
                child_document_id="reddit:t1_cmnt1",
                relation_type="reply",
                meta={"platform": "reddit"},
            ),
            ThreadRelationshipEdge(
                parent_document_id="reddit:t1_cmnt1",
                child_document_id="reddit:t1_cmnt2",
                relation_type="reply",
                meta={"platform": "reddit"},
            ),
        ]
        return ThreadExpansionResult(
            platform="reddit",
            # Fall back to the fixture's root fullname when post_id is empty.
            thread_key=post_id or "t3_abc123",
            root_document_id="reddit:t3_abc123",
            nodes=nodes,
            edges=edges,
        )
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Structured thread data for persisting into ``documents`` + ``relationships``.
|
|
3
|
+
|
|
4
|
+
Use ``relation_type`` values that match the DB check constraint:
|
|
5
|
+
``'link'``, ``'thread'``, ``'reply'``, ``'entity_cooccur'``.
|
|
6
|
+
|
|
7
|
+
For social threads, model the tree with:
|
|
8
|
+
|
|
9
|
+
- ``reply`` — direct parent → child reply edge (authoritative tree shape).
|
|
10
|
+
- ``thread`` — optional weak edge (e.g. root → descendant) for “same thread”
|
|
11
|
+
navigation; omit if you only need the reply tree.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from datetime import datetime
|
|
17
|
+
from typing import Any, Literal
|
|
18
|
+
|
|
19
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
20
|
+
|
|
21
|
+
PlatformLiteral = Literal["twitter", "reddit"]
|
|
22
|
+
ThreadRelationLiteral = Literal["reply", "thread"]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ThreadDocumentNode(BaseModel):
    """
    One logical post (tweet, Reddit post/comment) mapped to a KB ``documents`` row.

    ``document_id`` should be stable across fetches (e.g. ``twitter:{tweet_id}``,
    ``reddit:{fullname}``) so upserts and ``relationships`` stay consistent.
    """

    # Reject unexpected keys so fetcher typos fail fast at validation time.
    model_config = ConfigDict(extra="forbid")

    # Stable, platform-prefixed KB id used as the upsert key.
    document_id: str = Field(..., min_length=1)
    external_id: str = Field(
        ...,
        min_length=1,
        description="Platform-native id (tweet id, Reddit fullname t1_/t3_)",
    )
    text: str = Field(default="", description="Primary text for summary / chunking")
    # Optional display handle of the author; None when unavailable.
    author_handle: str | None = None
    created_at: datetime
    content_type: Literal["text", "multimodal"] = "text"
    raw_content: dict[str, Any] | None = Field(
        default=None,
        description="Optional structured payload stored as JSON in ingest paths",
    )
    meta: dict[str, Any] = Field(
        default_factory=dict,
        description="Extra attribution: urls, like counts, etc. (JSON-serializable)",
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class ThreadRelationshipEdge(BaseModel):
    """
    One row for the ``relationships`` table:
    ``(parent_document_id, child_document_id, relation_type)``.

    ``reply`` edges carry the authoritative reply tree; emit ``thread`` edges
    only when non-reply links (e.g. root → leaf) are actually needed by the UI
    or by graph queries.
    """

    # Unknown keys are a bug in the caller, not data to silently keep.
    model_config = ConfigDict(extra="forbid")

    parent_document_id: str = Field(..., min_length=1)
    child_document_id: str = Field(..., min_length=1)
    relation_type: ThreadRelationLiteral
    weight: float | None = None
    meta: dict[str, Any] = Field(default_factory=dict)

    @model_validator(mode="after")
    def _no_self_loop(self) -> ThreadRelationshipEdge:
        # An edge from a document to itself can never belong to a valid tree.
        if self.child_document_id != self.parent_document_id:
            return self
        raise ValueError("parent_document_id and child_document_id must differ")
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class ThreadExpansionResult(BaseModel):
    """
    A complete fetched thread: one node per document plus the relationship
    edges that preserve the tree shape.

    To persist: upsert one document per node, then insert the edges with
    ``INSERT OR IGNORE`` (or your store's equivalent) so the unique constraint
    on ``(parent_document_id, child_document_id, relation_type)`` is honored.
    """

    model_config = ConfigDict(extra="forbid")

    platform: PlatformLiteral
    thread_key: str = Field(..., min_length=1, description="Stable thread id: X conversation_id, Reddit post fullname, etc.")
    root_document_id: str = Field(..., min_length=1)
    nodes: list[ThreadDocumentNode]
    edges: list[ThreadRelationshipEdge]

    @model_validator(mode="after")
    def _graph_integrity(self) -> ThreadExpansionResult:
        # The root and every edge endpoint must resolve to a known node id,
        # otherwise persistence would create dangling relationship rows.
        known = {node.document_id for node in self.nodes}
        if self.root_document_id not in known:
            raise ValueError("root_document_id must match a node.document_id")
        for edge in self.edges:
            if edge.parent_document_id not in known:
                raise ValueError(f"edge parent not in nodes: {edge.parent_document_id!r}")
            if edge.child_document_id not in known:
                raise ValueError(f"edge child not in nodes: {edge.child_document_id!r}")
        return self
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Reddit thread expansion — **official Reddit Data API only**.
|
|
3
|
+
|
|
4
|
+
Use reddit.com/dev/api and OAuth-based access. Do not bypass or automate the
|
|
5
|
+
site in ways that conflict with Reddit’s Terms or API rules.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from app.connectors.thread_expansion.models import ThreadExpansionResult
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RedditThreadExpansionFetcher(ABC):
    """
    Abstract fetcher for a Reddit post plus its comment tree, using the
    **Reddit Data API** (reddit.com/dev/api) exclusively.

    Compliance & credentials (operational checklist — not legal advice):

    - **API access**: register an app with Reddit to obtain a client id and
      secret, and authenticate via OAuth as the endpoints require; keep all
      credentials in server-side configuration only.
    - **User-Agent**: send the descriptive ``User-Agent`` string Reddit
      expects (typically app name plus contact), per Reddit's API guidelines.
    - **Policies**: stay within Reddit's API Terms, rate limits, and content
      policies; do not use the API for disallowed bulk or commercial purposes
      unless permitted.
    - **Privacy**: respect moderator and user visibility (removed/deleted
      content) exactly as the API represents it.

    Implementations MUST NOT scrape HTML threads as a substitute for the API.
    """

    @abstractmethod
    async def fetch_post_and_comments(
        self,
        post_id: str,
        *,
        config: dict[str, Any],
    ) -> ThreadExpansionResult:
        """
        Fetch the submission and all reachable comments.

        The result contains one node per post/comment and ``reply`` edges
        mirroring Reddit's parent/child comment structure.

        ``post_id`` is the API-facing id — base36 id or fullname ``t3_xxx``;
        implementations should normalize consistently.

        ``config`` carries OAuth tokens, user-agent suffix, sort, depth
        limits, etc. **Never** log access tokens or refresh tokens.
        """
        raise NotImplementedError
|