business-stack 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/.python-version +1 -0
  2. package/backend/.env.example +65 -0
  3. package/backend/alembic/env.py +63 -0
  4. package/backend/alembic/script.py.mako +26 -0
  5. package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
  6. package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
  7. package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
  8. package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
  9. package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
  10. package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
  11. package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
  12. package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
  13. package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
  14. package/backend/alembic.ini +42 -0
  15. package/backend/app/__init__.py +0 -0
  16. package/backend/app/config.py +337 -0
  17. package/backend/app/connectors/__init__.py +13 -0
  18. package/backend/app/connectors/base.py +39 -0
  19. package/backend/app/connectors/builtins.py +51 -0
  20. package/backend/app/connectors/playwright_session.py +146 -0
  21. package/backend/app/connectors/registry.py +68 -0
  22. package/backend/app/connectors/thread_expansion/__init__.py +33 -0
  23. package/backend/app/connectors/thread_expansion/fakes.py +154 -0
  24. package/backend/app/connectors/thread_expansion/models.py +113 -0
  25. package/backend/app/connectors/thread_expansion/reddit.py +53 -0
  26. package/backend/app/connectors/thread_expansion/twitter.py +49 -0
  27. package/backend/app/db.py +5 -0
  28. package/backend/app/dependencies.py +34 -0
  29. package/backend/app/logging_config.py +35 -0
  30. package/backend/app/main.py +97 -0
  31. package/backend/app/middleware/__init__.py +0 -0
  32. package/backend/app/middleware/gateway_identity.py +17 -0
  33. package/backend/app/middleware/openapi_gateway.py +71 -0
  34. package/backend/app/middleware/request_id.py +23 -0
  35. package/backend/app/openapi_config.py +126 -0
  36. package/backend/app/routers/__init__.py +0 -0
  37. package/backend/app/routers/admin_pipeline.py +123 -0
  38. package/backend/app/routers/chat.py +206 -0
  39. package/backend/app/routers/chunks.py +36 -0
  40. package/backend/app/routers/entity_extract.py +31 -0
  41. package/backend/app/routers/example.py +8 -0
  42. package/backend/app/routers/gemini_embed.py +58 -0
  43. package/backend/app/routers/health.py +28 -0
  44. package/backend/app/routers/ingestion.py +146 -0
  45. package/backend/app/routers/link_expansion.py +34 -0
  46. package/backend/app/routers/pipeline_status.py +304 -0
  47. package/backend/app/routers/query.py +63 -0
  48. package/backend/app/routers/vectors.py +63 -0
  49. package/backend/app/schemas/__init__.py +0 -0
  50. package/backend/app/schemas/canonical.py +44 -0
  51. package/backend/app/schemas/chat.py +50 -0
  52. package/backend/app/schemas/ingest.py +29 -0
  53. package/backend/app/schemas/query.py +153 -0
  54. package/backend/app/schemas/vectors.py +56 -0
  55. package/backend/app/services/__init__.py +0 -0
  56. package/backend/app/services/chat_store.py +152 -0
  57. package/backend/app/services/chunking/__init__.py +3 -0
  58. package/backend/app/services/chunking/llm_boundaries.py +63 -0
  59. package/backend/app/services/chunking/schemas.py +30 -0
  60. package/backend/app/services/chunking/semantic_chunk.py +178 -0
  61. package/backend/app/services/chunking/splitters.py +214 -0
  62. package/backend/app/services/embeddings/__init__.py +20 -0
  63. package/backend/app/services/embeddings/build_inputs.py +140 -0
  64. package/backend/app/services/embeddings/dlq.py +128 -0
  65. package/backend/app/services/embeddings/gemini_api.py +207 -0
  66. package/backend/app/services/embeddings/persist.py +74 -0
  67. package/backend/app/services/embeddings/types.py +32 -0
  68. package/backend/app/services/embeddings/worker.py +224 -0
  69. package/backend/app/services/entities/__init__.py +12 -0
  70. package/backend/app/services/entities/gliner_extract.py +63 -0
  71. package/backend/app/services/entities/llm_extract.py +94 -0
  72. package/backend/app/services/entities/pipeline.py +179 -0
  73. package/backend/app/services/entities/spacy_extract.py +63 -0
  74. package/backend/app/services/entities/types.py +15 -0
  75. package/backend/app/services/gemini_chat.py +113 -0
  76. package/backend/app/services/hooks/__init__.py +3 -0
  77. package/backend/app/services/hooks/post_ingest.py +186 -0
  78. package/backend/app/services/ingestion/__init__.py +0 -0
  79. package/backend/app/services/ingestion/persist.py +188 -0
  80. package/backend/app/services/integrations_remote.py +91 -0
  81. package/backend/app/services/link_expansion/__init__.py +3 -0
  82. package/backend/app/services/link_expansion/canonical_url.py +45 -0
  83. package/backend/app/services/link_expansion/domain_policy.py +26 -0
  84. package/backend/app/services/link_expansion/html_extract.py +72 -0
  85. package/backend/app/services/link_expansion/rate_limit.py +32 -0
  86. package/backend/app/services/link_expansion/robots.py +46 -0
  87. package/backend/app/services/link_expansion/schemas.py +67 -0
  88. package/backend/app/services/link_expansion/worker.py +458 -0
  89. package/backend/app/services/normalization/__init__.py +7 -0
  90. package/backend/app/services/normalization/normalizer.py +331 -0
  91. package/backend/app/services/normalization/persist_normalized.py +67 -0
  92. package/backend/app/services/playwright_extract/__init__.py +13 -0
  93. package/backend/app/services/playwright_extract/__main__.py +96 -0
  94. package/backend/app/services/playwright_extract/extract.py +181 -0
  95. package/backend/app/services/retrieval_service.py +351 -0
  96. package/backend/app/sqlite_ext.py +36 -0
  97. package/backend/app/storage/__init__.py +3 -0
  98. package/backend/app/storage/blobs.py +30 -0
  99. package/backend/app/vectorstore/__init__.py +13 -0
  100. package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
  101. package/backend/backend.egg-info/PKG-INFO +18 -0
  102. package/backend/backend.egg-info/SOURCES.txt +93 -0
  103. package/backend/backend.egg-info/dependency_links.txt +1 -0
  104. package/backend/backend.egg-info/entry_points.txt +2 -0
  105. package/backend/backend.egg-info/requires.txt +15 -0
  106. package/backend/backend.egg-info/top_level.txt +4 -0
  107. package/backend/package.json +15 -0
  108. package/backend/pyproject.toml +52 -0
  109. package/backend/tests/conftest.py +40 -0
  110. package/backend/tests/test_chat.py +92 -0
  111. package/backend/tests/test_chunking.py +132 -0
  112. package/backend/tests/test_entities.py +170 -0
  113. package/backend/tests/test_gemini_embed.py +224 -0
  114. package/backend/tests/test_health.py +24 -0
  115. package/backend/tests/test_ingest_raw.py +123 -0
  116. package/backend/tests/test_link_expansion.py +241 -0
  117. package/backend/tests/test_main.py +12 -0
  118. package/backend/tests/test_normalizer.py +114 -0
  119. package/backend/tests/test_openapi_gateway.py +40 -0
  120. package/backend/tests/test_pipeline_hardening.py +285 -0
  121. package/backend/tests/test_pipeline_status.py +71 -0
  122. package/backend/tests/test_playwright_extract.py +80 -0
  123. package/backend/tests/test_post_ingest_hooks.py +162 -0
  124. package/backend/tests/test_query.py +165 -0
  125. package/backend/tests/test_thread_expansion.py +72 -0
  126. package/backend/tests/test_vectors.py +85 -0
  127. package/backend/uv.lock +1839 -0
  128. package/bin/business-stack.cjs +412 -0
  129. package/frontend/web/.env.example +23 -0
  130. package/frontend/web/AGENTS.md +5 -0
  131. package/frontend/web/CLAUDE.md +1 -0
  132. package/frontend/web/README.md +36 -0
  133. package/frontend/web/components.json +25 -0
  134. package/frontend/web/next-env.d.ts +6 -0
  135. package/frontend/web/next.config.ts +30 -0
  136. package/frontend/web/package.json +65 -0
  137. package/frontend/web/postcss.config.mjs +7 -0
  138. package/frontend/web/skills-lock.json +35 -0
  139. package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
  140. package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
  141. package/frontend/web/src/app/chat/page.tsx +725 -0
  142. package/frontend/web/src/app/favicon.ico +0 -0
  143. package/frontend/web/src/app/globals.css +563 -0
  144. package/frontend/web/src/app/layout.tsx +50 -0
  145. package/frontend/web/src/app/page.tsx +96 -0
  146. package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
  147. package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
  148. package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
  149. package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
  150. package/frontend/web/src/components/home-auth-panel.tsx +49 -0
  151. package/frontend/web/src/components/providers.tsx +50 -0
  152. package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
  153. package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
  154. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
  155. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
  156. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
  157. package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
  158. package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
  159. package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
  160. package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
  161. package/frontend/web/src/lib/auth-client.ts +23 -0
  162. package/frontend/web/src/lib/integrations-config.ts +125 -0
  163. package/frontend/web/src/lib/ui-utills.tsx +90 -0
  164. package/frontend/web/src/lib/utils.ts +6 -0
  165. package/frontend/web/tsconfig.json +36 -0
  166. package/frontend/web/tsconfig.tsbuildinfo +1 -0
  167. package/frontend/web/vitest.config.ts +14 -0
  168. package/gateway/.env.example +23 -0
  169. package/gateway/README.md +13 -0
  170. package/gateway/package.json +24 -0
  171. package/gateway/src/auth.ts +49 -0
  172. package/gateway/src/index.ts +141 -0
  173. package/gateway/src/integrations/admin.ts +19 -0
  174. package/gateway/src/integrations/crypto.ts +52 -0
  175. package/gateway/src/integrations/handlers.ts +124 -0
  176. package/gateway/src/integrations/keys.ts +12 -0
  177. package/gateway/src/integrations/store.ts +106 -0
  178. package/gateway/src/stack-secrets.ts +35 -0
  179. package/gateway/tsconfig.json +13 -0
  180. package/package.json +33 -0
  181. package/turbo.json +27 -0
@@ -0,0 +1,39 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from collections.abc import Iterator
5
+ from typing import Any, ClassVar
6
+
7
+ from app.schemas.ingest import RawIngestEnvelope
8
+
9
+
10
class Connector(ABC):
    """Source plugin: validate config, optional fetch_iter, normalize raw envelopes.

    Subclasses assign the class attribute ``name`` and must implement
    :meth:`normalize_raw`; every other method has a usable default.
    """

    # Declared for type checkers only; concrete subclasses assign the value.
    name: ClassVar[str]

    def prepare_envelope(
        self,
        envelope: RawIngestEnvelope,
        config: dict[str, Any],
    ) -> RawIngestEnvelope:
        """
        Mutate or replace the envelope before persistence (default: identity).

        Used for connectors that fetch remote content (e.g. browser session)
        so ``raw_content`` and canonical normalization see extracted text.
        """
        return envelope

    def validate_config(self, config: dict[str, Any]) -> None:
        """Raise ValueError if *config* is invalid for this connector."""

    def fetch_iter(self, config: dict[str, Any]) -> Iterator[Any]:
        """Optional batch source; not used by POST /ingest/raw.

        Raises:
            NotImplementedError: always, unless a subclass overrides this.
        """
        # ``name`` is only annotated on the base class, so a subclass that
        # forgot to assign it must not turn this into an AttributeError.
        label = getattr(self, "name", type(self).__name__)
        raise NotImplementedError(
            f"Connector {label!r} does not implement fetch_iter()",
        )

    @abstractmethod
    def normalize_raw(self, envelope: RawIngestEnvelope) -> dict[str, Any]:
        """Produce a normalized dict for downstream pipeline steps."""
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from app.connectors.base import Connector
6
+ from app.connectors.registry import register_connector
7
+ from app.schemas.ingest import RawIngestEnvelope
8
+
9
+
10
class GenericConnector(Connector):
    """Minimal connector (key ``"generic"``) that only summarises the envelope."""

    name = "generic"

    def normalize_raw(self, envelope: RawIngestEnvelope) -> dict[str, Any]:
        """Return the envelope's source, content type and payload type name."""
        summary: dict[str, Any] = {
            "source": envelope.source,
            "content_type": envelope.content_type,
        }
        # Only the Python type name of the payload is reported, never its value.
        summary["payload_type"] = type(envelope.payload).__name__
        return summary
19
+
20
+
21
class EchoConnector(Connector):
    """Passes through key envelope fields for tests and debugging."""

    name = "echo"

    def normalize_raw(self, envelope: RawIngestEnvelope) -> dict[str, Any]:
        """Return source, content type and the sorted metadata key names."""
        echoed: dict[str, Any] = {
            "source": envelope.source,
            "content_type": envelope.content_type,
        }
        # Keys only: metadata values are deliberately left out of the echo.
        key_names = list(envelope.metadata.keys())
        key_names.sort()
        echoed["metadata_keys"] = key_names
        return echoed
32
+
33
+
34
class StrictConnector(Connector):
    """Validates metadata.connector_config for integration tests."""

    name = "strict"

    def validate_config(self, config: dict[str, Any]) -> None:
        """Accept *config* only when ``config["ok"]`` is the literal ``True``.

        Raises:
            ValueError: if ``ok`` is missing or is anything other than ``True``
                (truthy values such as ``1`` or ``"yes"`` are rejected too).
        """
        if config.get("ok") is True:
            return
        raise ValueError("connector_config.ok must be true")

    def normalize_raw(self, envelope: RawIngestEnvelope) -> dict[str, Any]:
        """Return a fixed marker plus the envelope's source."""
        normalized: dict[str, Any] = {"strict": True}
        normalized["source"] = envelope.source
        return normalized
45
+
46
+
47
def register_builtin_connectors() -> None:
    """Register the connectors defined in-tree with the connector registry."""
    # NOTE(review): EchoConnector is defined in this module but is not
    # registered here — confirm whether that omission is intentional.
    for connector_cls in (GenericConnector, StrictConnector):
        register_connector(connector_cls)
    # Imported purely for its side effect: PlaywrightSessionConnector is
    # decorated with @register_connector, so importing the module registers it.
    import app.connectors.playwright_session  # noqa: F401
@@ -0,0 +1,146 @@
1
+ """
2
+ Ingest connector: fetch **visible text** via Playwright + persistent user-data dir.
3
+
4
+ Uses the same ``RawIngestEnvelope`` as other connectors; ``prepare_envelope``
5
+ runs the browser **before** the document is persisted so ``raw_content`` and
6
+ canonical normalization include extracted text.
7
+
8
+ **Envelope shape**
9
+
10
+ - **URL**: ``metadata.url`` *or* string ``payload`` *or* ``payload["url"]``.
11
+ - **Profile directory**: ``metadata.playwright_user_data_dir`` *or*
12
+ ``connector_config.user_data_dir`` (config wins if both set — see code).
13
+
14
+ **connector_config** (safety / tuning)
15
+
16
+ - ``allowlisted_hosts`` (required): non-empty list of host strings; supports
17
+ ``*.example.com`` suffix patterns (see ``host_matches_allowlist``).
18
+ - ``navigation_timeout_ms`` (optional, default 30000).
19
+ - ``max_response_chars`` (optional, default 500000).
20
+ - ``headless`` (optional, default true).
21
+ - ``browser_channel`` (optional): e.g. ``"chrome"`` to use installed Google Chrome.
22
+
23
+ **Credentials / compliance**
24
+
25
+ - This connector does not ship API keys; authenticated pages rely on **cookies and
26
+ session state** already stored in the persistent profile directory.
27
+ - You are responsible for site terms, login policies, and data handling.
28
+
29
+ **Windows / profile locking** (summary)
30
+
31
+ - Use a **dedicated** profile folder for automation; **close** all browser
32
+ windows using that profile before ingest; avoid concurrent runs on the same
33
+ path. Full notes: ``app.services.playwright_extract.extract`` module docstring.
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ from typing import Any
39
+
40
+ from app.connectors.base import Connector
41
+ from app.connectors.registry import register_connector
42
+ from app.schemas.ingest import RawIngestEnvelope
43
+ from app.services.playwright_extract.extract import extract_visible_text_sync
44
+
45
+
46
+ def _resolve_target_url(envelope: RawIngestEnvelope) -> str:
47
+ md = envelope.metadata.get("url")
48
+ if isinstance(md, str) and md.strip():
49
+ return md.strip()
50
+ if isinstance(envelope.payload, str) and envelope.payload.strip():
51
+ return envelope.payload.strip()
52
+ if isinstance(envelope.payload, dict):
53
+ u = envelope.payload.get("url")
54
+ if isinstance(u, str) and u.strip():
55
+ return u.strip()
56
+ msg = "Set metadata.url or payload (string or {url: ...}) for playwright_session"
57
+ raise ValueError(msg)
58
+
59
+
60
+ def _resolve_user_data_dir(envelope: RawIngestEnvelope, config: dict[str, Any]) -> str:
61
+ cfg_dir = config.get("user_data_dir")
62
+ if isinstance(cfg_dir, str) and cfg_dir.strip():
63
+ return cfg_dir.strip()
64
+ md_dir = envelope.metadata.get("playwright_user_data_dir")
65
+ if isinstance(md_dir, str) and md_dir.strip():
66
+ return md_dir.strip()
67
+ msg = (
68
+ "Set connector_config.user_data_dir or metadata.playwright_user_data_dir "
69
+ "to the persistent Chrome/Chromium user data directory"
70
+ )
71
+ raise ValueError(msg)
72
+
73
+
74
@register_connector
class PlaywrightSessionConnector(Connector):
    """Connector that swaps the envelope payload for text extracted by Playwright.

    ``prepare_envelope`` runs ``extract_visible_text_sync`` against the
    resolved URL and profile directory; ``normalize_raw`` then reports the
    extraction metadata and the extracted text length.
    """

    name = "playwright_session"

    def validate_config(self, config: dict[str, Any]) -> None:
        """Require ``allowlisted_hosts`` to be a non-empty list of non-blank strings.

        Raises:
            ValueError: if the key is missing, not a list, empty, or contains
                an entry that is not a non-blank string.
        """
        hosts = config.get("allowlisted_hosts")
        if not isinstance(hosts, list) or not hosts:
            raise ValueError(
                "connector_config.allowlisted_hosts must be a non-empty list",
            )
        if not all(isinstance(h, str) and h.strip() for h in hosts):
            raise ValueError("allowlisted_hosts entries must be non-empty strings")

    def prepare_envelope(
        self,
        envelope: RawIngestEnvelope,
        config: dict[str, Any],
    ) -> RawIngestEnvelope:
        """Fetch the target page and return a copy of *envelope* carrying its text.

        The returned copy has ``content_type`` set to ``"text"``, a payload of
        ``text`` / ``title`` / ``url`` / ``source_url``, and the original
        metadata plus a ``playwright_extraction`` entry.

        Raises:
            ValueError: if no URL or profile directory can be resolved, if a
                numeric option cannot be parsed, or if ``allowlisted_hosts``
                is invalid.
        """
        url = _resolve_target_url(envelope)
        user_data_dir = _resolve_user_data_dir(envelope, config)
        navigation_timeout_ms = int(config.get("navigation_timeout_ms", 30_000))
        max_response_chars = int(config.get("max_response_chars", 500_000))

        # bool("false") is True, so string values need explicit parsing;
        # non-string values keep plain truthiness.
        raw_headless = config.get("headless", True)
        if isinstance(raw_headless, str):
            headless = raw_headless.strip().lower() not in {
                "",
                "0",
                "false",
                "no",
                "off",
            }
        else:
            headless = bool(raw_headless)

        # Strip like every other string option so " chrome " is usable.
        ch = config.get("browser_channel")
        browser_channel = ch.strip() if isinstance(ch, str) and ch.strip() else None

        # Re-validate here so a caller that skipped validate_config() gets the
        # documented ValueError instead of a KeyError on the lookup below.
        self.validate_config(config)
        hosts = [h.strip() for h in config["allowlisted_hosts"]]

        result = extract_visible_text_sync(
            url,
            user_data_dir,
            allowlisted_hosts=hosts,
            navigation_timeout_ms=navigation_timeout_ms,
            max_response_chars=max_response_chars,
            headless=headless,
            browser_channel=browser_channel,
        )

        # Copy so the caller's metadata dict is not mutated.
        meta = dict(envelope.metadata)
        meta["playwright_extraction"] = {
            "requested_url": url,
            "final_url": result.final_url,
            "title": result.title,
            "truncated": result.truncated,
            "profile_meta": result.meta,
        }

        payload: dict[str, Any] = {
            "text": result.visible_text,
            "title": result.title,
            "url": result.final_url,
            "source_url": url,
        }

        return envelope.model_copy(
            update={
                "content_type": "text",
                "metadata": meta,
                "payload": payload,
            },
        )

    def normalize_raw(self, envelope: RawIngestEnvelope) -> dict[str, Any]:
        """Summarise a prepared envelope.

        Returns the connector name, the ``playwright_extraction`` metadata
        (``None`` unless it is a dict) and the length of ``payload["text"]``
        (``0`` unless the payload is a dict holding a string ``text``).
        """
        raw_playwright = envelope.metadata.get("playwright_extraction")
        text_len = 0
        if isinstance(envelope.payload, dict):
            t = envelope.payload.get("text")
            if isinstance(t, str):
                text_len = len(t)
        return {
            "connector": self.name,
            "playwright": raw_playwright if isinstance(raw_playwright, dict) else None,
            "extracted_text_chars": text_len,
        }
@@ -0,0 +1,68 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from importlib.metadata import entry_points
5
+ from typing import TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from app.connectors.base import Connector
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ _CONNECTORS: dict[str, type[Connector]] = {}
13
+ _INITIALIZED = False
14
+
15
+
16
def register_connector(cls: type[Connector]) -> type[Connector]:
    """Store *cls* in the registry under ``cls.name`` and return it unchanged.

    Returning the class lets this function double as a class decorator. An
    existing registration under the same key is replaced, with a warning.
    """
    key = cls.name
    is_replacement = key in _CONNECTORS
    if is_replacement:
        logger.warning("Overwriting connector registration for key %r", key)
    _CONNECTORS[key] = cls
    return cls
22
+
23
+
24
def get_connector_class(key: str) -> type[Connector]:
    """Return the connector class registered under *key*.

    Raises:
        KeyError: if no connector is registered under *key*; the message
            lists the keys that are currently registered.
    """
    try:
        return _CONNECTORS[key]
    except KeyError:
        known = ", ".join(sorted(_CONNECTORS)) or "<none registered>"
        # ``from None`` hides the uninformative inner KeyError(key).
        raise KeyError(
            f"Unknown connector {key!r}; registered connectors: {known}",
        ) from None
26
+
27
+
28
def list_connector_keys() -> list[str]:
    """Return the currently registered connector keys in sorted order."""
    keys = list(_CONNECTORS)
    keys.sort()
    return keys
30
+
31
+
32
def _load_entrypoint_connectors() -> None:
    """Register Connector subclasses advertised in the ``kb.connectors`` group.

    A broken plugin never aborts loading: an entry point that fails to import,
    does not resolve to a ``Connector`` subclass, or lacks a string ``name``
    is logged and skipped.
    """
    # Imported lazily; at module level Connector is only a TYPE_CHECKING import.
    from app.connectors.base import Connector as ConnectorBase

    group = "kb.connectors"
    try:
        eps = entry_points(group=group)
    except TypeError:
        # Only Python < 3.10 rejects the keyword argument, and there
        # entry_points() returns a plain dict keyed by group name, which has
        # no .select(); look the group up directly instead.
        eps = entry_points().get(group, ())

    for ep in eps:
        try:
            loaded = ep.load()
        except Exception:
            logger.exception("Failed to load connector entry point %r", ep.name)
            continue
        if not (isinstance(loaded, type) and issubclass(loaded, ConnectorBase)):
            logger.warning(
                "Entry point %r did not resolve to a Connector subclass",
                ep.name,
            )
            continue
        # register_connector() reads cls.name; without this guard a plugin
        # that never set it would raise AttributeError out of this loop.
        if not isinstance(getattr(loaded, "name", None), str):
            logger.warning(
                "Entry point %r resolved to a Connector without a string name",
                ep.name,
            )
            continue
        register_connector(loaded)
53
+
54
+
55
def _load_builtin_connectors() -> None:
    """Import the built-in connector module and run its registration hook."""
    # Deferred import: app.connectors.builtins itself imports this module.
    import app.connectors.builtins as builtin_connectors

    builtin_connectors.register_builtin_connectors()
59
+
60
+
61
def init_connectors() -> None:
    """Idempotent: register built-ins and kb.connectors entry points."""
    global _INITIALIZED
    if not _INITIALIZED:
        # Built-ins load first, entry points second.
        for loader in (_load_builtin_connectors, _load_entrypoint_connectors):
            loader()
        # Set only after both loaders return, so a failed run is retried.
        _INITIALIZED = True
@@ -0,0 +1,33 @@
1
+ """
2
+ Thread expansion: social threads via **official APIs** (when implemented).
3
+
4
+ Models: ``ThreadDocumentNode``, ``ThreadRelationshipEdge``, ``ThreadExpansionResult``
5
+ for a document + ``relationships`` tree (``reply`` / ``thread`` edges).
6
+
7
+ Abstract fetchers: ``TwitterThreadExpansionFetcher``, ``RedditThreadExpansionFetcher``
8
+ (compliance notes in their docstrings). Fakes: deterministic tests / local use.
9
+
10
+ Production: implement with documented X and Reddit APIs; do not use scraping.
11
+ """
12
+
13
+ from app.connectors.thread_expansion.fakes import (
14
+ FakeRedditThreadExpansionFetcher,
15
+ FakeTwitterThreadExpansionFetcher,
16
+ )
17
+ from app.connectors.thread_expansion.models import (
18
+ ThreadDocumentNode,
19
+ ThreadExpansionResult,
20
+ ThreadRelationshipEdge,
21
+ )
22
+ from app.connectors.thread_expansion.reddit import RedditThreadExpansionFetcher
23
+ from app.connectors.thread_expansion.twitter import TwitterThreadExpansionFetcher
24
+
25
+ __all__ = [
26
+ "FakeRedditThreadExpansionFetcher",
27
+ "FakeTwitterThreadExpansionFetcher",
28
+ "RedditThreadExpansionFetcher",
29
+ "ThreadDocumentNode",
30
+ "ThreadExpansionResult",
31
+ "ThreadRelationshipEdge",
32
+ "TwitterThreadExpansionFetcher",
33
+ ]
@@ -0,0 +1,154 @@
1
+ """
2
+ Deterministic fake thread fetchers for unit tests and local development.
3
+
4
+ No network calls; no credentials required.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from datetime import UTC, datetime
10
+ from typing import Any
11
+
12
+ from app.connectors.thread_expansion.models import (
13
+ ThreadDocumentNode,
14
+ ThreadExpansionResult,
15
+ ThreadRelationshipEdge,
16
+ )
17
+ from app.connectors.thread_expansion.reddit import RedditThreadExpansionFetcher
18
+ from app.connectors.thread_expansion.twitter import TwitterThreadExpansionFetcher
19
+
20
+
21
class FakeTwitterThreadExpansionFetcher(TwitterThreadExpansionFetcher):
    """
    Returns a small linear 3-tweet thread (root → reply → reply).

    ``conversation_id`` only sets ``thread_key`` and is echoed into the root
    node's ``meta``; the nodes and edges are otherwise fixed. An empty
    ``conversation_id`` falls back to the root tweet's id so the result still
    satisfies ``thread_key``'s ``min_length=1`` rule, mirroring the fallback
    in ``FakeRedditThreadExpansionFetcher``.
    """

    async def fetch_full_thread(
        self,
        conversation_id: str,
        *,
        config: dict[str, Any],
    ) -> ThreadExpansionResult:
        """Build the canned thread; *config* is accepted but ignored."""
        _ = config
        root_external_id = "1999999999999999999"
        # Without the fallback an empty id would fail ThreadExpansionResult
        # validation instead of yielding the canned thread.
        thread_key = conversation_id or root_external_id
        t0 = datetime(2026, 1, 1, 12, 0, 0, tzinfo=UTC)
        nodes = [
            ThreadDocumentNode(
                document_id="twitter:tw_root",
                external_id=root_external_id,
                text="Root post about the project launch.",
                author_handle="example_user",
                created_at=t0,
                meta={"conversation_id": thread_key, "kind": "tweet"},
            ),
            ThreadDocumentNode(
                document_id="twitter:tw_r1",
                external_id="1999999999999999998",
                text="Reply: congratulations!",
                author_handle="fan_one",
                created_at=t0,
                meta={"kind": "tweet"},
            ),
            ThreadDocumentNode(
                document_id="twitter:tw_r2",
                external_id="1999999999999999997",
                text="Reply: when is v2 shipping?",
                author_handle="fan_two",
                created_at=t0,
                meta={"kind": "tweet"},
            ),
        ]
        edges = [
            # The two ``reply`` edges form the linear chain root → r1 → r2.
            ThreadRelationshipEdge(
                parent_document_id="twitter:tw_root",
                child_document_id="twitter:tw_r1",
                relation_type="reply",
                meta={"platform": "twitter"},
            ),
            ThreadRelationshipEdge(
                parent_document_id="twitter:tw_r1",
                child_document_id="twitter:tw_r2",
                relation_type="reply",
                meta={"platform": "twitter"},
            ),
            # Extra weighted ``thread`` edge from the root straight to the leaf.
            ThreadRelationshipEdge(
                parent_document_id="twitter:tw_root",
                child_document_id="twitter:tw_r2",
                relation_type="thread",
                weight=0.3,
                meta={"platform": "twitter", "note": "optional weak thread link"},
            ),
        ]
        return ThreadExpansionResult(
            platform="twitter",
            thread_key=thread_key,
            root_document_id="twitter:tw_root",
            nodes=nodes,
            edges=edges,
        )
91
+
92
+
93
class FakeRedditThreadExpansionFetcher(RedditThreadExpansionFetcher):
    """
    Canned Reddit thread: one submission followed by two comments, each
    replying to the node before it.

    ``post_id`` only sets ``thread_key`` (with a fixed fallback when empty);
    the nodes and edges never change.
    """

    async def fetch_post_and_comments(
        self,
        post_id: str,
        *,
        config: dict[str, Any],
    ) -> ThreadExpansionResult:
        """Build the canned thread; *config* is accepted but ignored."""
        _ = config
        stamp = datetime(2026, 2, 1, 9, 30, 0, tzinfo=UTC)

        # (fullname, text, author, meta), listed in reply order.
        specs = (
            (
                "t3_abc123",
                "Ask HN: Best patterns for thread-safe queues?",
                "op_dev",
                {"subreddit": "test", "score": 42, "permalink": "/r/test/abc"},
            ),
            (
                "t1_cmnt1",
                "We use channels and bounded queues.",
                "commenter_a",
                {"depth": 0},
            ),
            (
                "t1_cmnt2",
                "Plus backpressure on the producer side.",
                "commenter_b",
                {"depth": 1},
            ),
        )
        nodes = [
            ThreadDocumentNode(
                document_id=f"reddit:{fullname}",
                external_id=fullname,
                text=body,
                author_handle=author,
                created_at=stamp,
                meta=extra,
            )
            for fullname, body, author, extra in specs
        ]

        # Consecutive nodes are linked parent → child with a ``reply`` edge.
        chain = [node.document_id for node in nodes]
        edges = [
            ThreadRelationshipEdge(
                parent_document_id=parent,
                child_document_id=child,
                relation_type="reply",
                meta={"platform": "reddit"},
            )
            for parent, child in zip(chain, chain[1:])
        ]

        return ThreadExpansionResult(
            platform="reddit",
            thread_key=post_id or "t3_abc123",
            root_document_id=chain[0],
            nodes=nodes,
            edges=edges,
        )
@@ -0,0 +1,113 @@
1
+ """
2
+ Structured thread data for persisting into ``documents`` + ``relationships``.
3
+
4
+ Use ``relation_type`` values that match the DB check constraint:
5
+ ``'link'``, ``'thread'``, ``'reply'``, ``'entity_cooccur'``.
6
+
7
+ For social threads, model the tree with:
8
+
9
+ - ``reply`` — direct parent → child reply edge (authoritative tree shape).
10
+ - ``thread`` — optional weak edge (e.g. root → descendant) for “same thread”
11
+ navigation; omit if you only need the reply tree.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from datetime import datetime
17
+ from typing import Any, Literal
18
+
19
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
20
+
21
+ PlatformLiteral = Literal["twitter", "reddit"]
22
+ ThreadRelationLiteral = Literal["reply", "thread"]
23
+
24
+
25
class ThreadDocumentNode(BaseModel):
    """
    A single post in a thread (tweet, Reddit post or comment), mapped to one
    KB ``documents`` row.

    Keep ``document_id`` stable between fetches (e.g. ``twitter:{tweet_id}``,
    ``reddit:{fullname}``) so upserts and ``relationships`` rows stay
    consistent.
    """

    # Unknown fields are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    # --- identity ---
    document_id: str = Field(..., min_length=1)
    external_id: str = Field(
        ...,
        description="Platform-native id (tweet id, Reddit fullname t1_/t3_)",
        min_length=1,
    )

    # --- content ---
    text: str = Field(
        default="",
        description="Primary text for summary / chunking",
    )
    author_handle: str | None = None
    created_at: datetime
    content_type: Literal["text", "multimodal"] = "text"

    # --- optional extras ---
    raw_content: dict[str, Any] | None = Field(
        description="Optional structured payload stored as JSON in ingest paths",
        default=None,
    )
    meta: dict[str, Any] = Field(
        description="Extra attribution: urls, like counts, etc. (JSON-serializable)",
        default_factory=dict,
    )
53
+
54
+
55
class ThreadRelationshipEdge(BaseModel):
    """
    One directed edge, mapped to a ``relationships`` row
    (parent_document_id, child_document_id, relation_type).

    ``reply`` edges carry the reply tree. ``thread`` edges are for the
    occasional non-reply link (e.g. root → leaf) that a UI or graph query
    needs; use them sparingly.
    """

    # Unknown fields are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    parent_document_id: str = Field(..., min_length=1)
    child_document_id: str = Field(..., min_length=1)
    relation_type: ThreadRelationLiteral
    weight: float | None = None
    meta: dict[str, Any] = Field(default_factory=dict)

    @model_validator(mode="after")
    def _no_self_loop(self) -> ThreadRelationshipEdge:
        """Reject an edge whose parent and child are the same document."""
        if self.parent_document_id != self.child_document_id:
            return self
        raise ValueError("parent_document_id and child_document_id must differ")
77
+
78
+
79
class ThreadExpansionResult(BaseModel):
    """
    Result of a full thread fetch: the nodes (documents) plus the edges
    (relationships) that preserve the tree shape.

    To persist, insert or update one document per node, then insert the edges
    with ``INSERT OR IGNORE`` (or equivalent) so the unique constraint on
    ``(parent_document_id, child_document_id, relation_type)`` is respected.
    """

    # Unknown fields are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    platform: PlatformLiteral
    thread_key: str = Field(
        ...,
        description="Stable thread id: X conversation_id, Reddit post fullname, etc.",
        min_length=1,
    )
    root_document_id: str = Field(..., min_length=1)
    nodes: list[ThreadDocumentNode]
    edges: list[ThreadRelationshipEdge]

    @model_validator(mode="after")
    def _graph_integrity(self) -> ThreadExpansionResult:
        """Check that the root and every edge endpoint refer to a listed node."""
        known_ids = {node.document_id for node in self.nodes}
        if self.root_document_id not in known_ids:
            raise ValueError("root_document_id must match a node.document_id")
        for edge in self.edges:
            # Parent is checked before child, so the first failure reported
            # for an edge is always its parent.
            endpoints = (
                ("parent", edge.parent_document_id),
                ("child", edge.child_document_id),
            )
            for role, endpoint in endpoints:
                if endpoint not in known_ids:
                    raise ValueError(f"edge {role} not in nodes: {endpoint!r}")
        return self
@@ -0,0 +1,53 @@
1
+ """
2
+ Reddit thread expansion — **official Reddit Data API only**.
3
+
4
+ Use reddit.com/dev/api and OAuth-based access. Do not bypass or automate the
5
+ site in ways that conflict with Reddit’s Terms or API rules.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from abc import ABC, abstractmethod
11
+ from typing import Any
12
+
13
+ from app.connectors.thread_expansion.models import ThreadExpansionResult
14
+
15
+
16
class RedditThreadExpansionFetcher(ABC):
    """
    Interface for fetching a post and its comment tree through the
    **Reddit Data API**.

    Compliance & credentials (operational checklist — not legal advice):

    - **API access**: register an app with Reddit to obtain a client
      id/secret and use OAuth as the called endpoints require. Keep
      credentials in server-side configuration only.
    - **User-Agent**: Reddit expects a descriptive ``User-Agent`` string
      (typically app name and contact). Follow Reddit’s API guidelines.
    - **Policies**: comply with Reddit’s API Terms, rate limits, and content
      policies. Do not use the API for disallowed bulk or commercial uses
      unless permitted.
    - **Privacy**: honor moderator and user visibility (removed/deleted
      content) as the API represents it.

    Implementations MUST NOT scrape HTML threads as a substitute for the API.
    """

    @abstractmethod
    async def fetch_post_and_comments(
        self,
        post_id: str,
        *,
        config: dict[str, Any],
    ) -> ThreadExpansionResult:
        """
        Return the submission and its reachable comments as nodes, joined by
        ``reply`` edges that mirror Reddit’s parent/child comment
        relationships.

        ``post_id`` should be the API-facing id (a base36 id or a fullname
        such as ``t3_xxx``); implementations should normalize it consistently.

        ``config`` holds OAuth tokens, user-agent suffix, sort, depth limits,
        etc. **Do not** log access tokens or refresh tokens.
        """
        raise NotImplementedError