business-stack 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/.python-version +1 -0
  2. package/backend/.env.example +65 -0
  3. package/backend/alembic/env.py +63 -0
  4. package/backend/alembic/script.py.mako +26 -0
  5. package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
  6. package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
  7. package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
  8. package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
  9. package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
  10. package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
  11. package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
  12. package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
  13. package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
  14. package/backend/alembic.ini +42 -0
  15. package/backend/app/__init__.py +0 -0
  16. package/backend/app/config.py +337 -0
  17. package/backend/app/connectors/__init__.py +13 -0
  18. package/backend/app/connectors/base.py +39 -0
  19. package/backend/app/connectors/builtins.py +51 -0
  20. package/backend/app/connectors/playwright_session.py +146 -0
  21. package/backend/app/connectors/registry.py +68 -0
  22. package/backend/app/connectors/thread_expansion/__init__.py +33 -0
  23. package/backend/app/connectors/thread_expansion/fakes.py +154 -0
  24. package/backend/app/connectors/thread_expansion/models.py +113 -0
  25. package/backend/app/connectors/thread_expansion/reddit.py +53 -0
  26. package/backend/app/connectors/thread_expansion/twitter.py +49 -0
  27. package/backend/app/db.py +5 -0
  28. package/backend/app/dependencies.py +34 -0
  29. package/backend/app/logging_config.py +35 -0
  30. package/backend/app/main.py +97 -0
  31. package/backend/app/middleware/__init__.py +0 -0
  32. package/backend/app/middleware/gateway_identity.py +17 -0
  33. package/backend/app/middleware/openapi_gateway.py +71 -0
  34. package/backend/app/middleware/request_id.py +23 -0
  35. package/backend/app/openapi_config.py +126 -0
  36. package/backend/app/routers/__init__.py +0 -0
  37. package/backend/app/routers/admin_pipeline.py +123 -0
  38. package/backend/app/routers/chat.py +206 -0
  39. package/backend/app/routers/chunks.py +36 -0
  40. package/backend/app/routers/entity_extract.py +31 -0
  41. package/backend/app/routers/example.py +8 -0
  42. package/backend/app/routers/gemini_embed.py +58 -0
  43. package/backend/app/routers/health.py +28 -0
  44. package/backend/app/routers/ingestion.py +146 -0
  45. package/backend/app/routers/link_expansion.py +34 -0
  46. package/backend/app/routers/pipeline_status.py +304 -0
  47. package/backend/app/routers/query.py +63 -0
  48. package/backend/app/routers/vectors.py +63 -0
  49. package/backend/app/schemas/__init__.py +0 -0
  50. package/backend/app/schemas/canonical.py +44 -0
  51. package/backend/app/schemas/chat.py +50 -0
  52. package/backend/app/schemas/ingest.py +29 -0
  53. package/backend/app/schemas/query.py +153 -0
  54. package/backend/app/schemas/vectors.py +56 -0
  55. package/backend/app/services/__init__.py +0 -0
  56. package/backend/app/services/chat_store.py +152 -0
  57. package/backend/app/services/chunking/__init__.py +3 -0
  58. package/backend/app/services/chunking/llm_boundaries.py +63 -0
  59. package/backend/app/services/chunking/schemas.py +30 -0
  60. package/backend/app/services/chunking/semantic_chunk.py +178 -0
  61. package/backend/app/services/chunking/splitters.py +214 -0
  62. package/backend/app/services/embeddings/__init__.py +20 -0
  63. package/backend/app/services/embeddings/build_inputs.py +140 -0
  64. package/backend/app/services/embeddings/dlq.py +128 -0
  65. package/backend/app/services/embeddings/gemini_api.py +207 -0
  66. package/backend/app/services/embeddings/persist.py +74 -0
  67. package/backend/app/services/embeddings/types.py +32 -0
  68. package/backend/app/services/embeddings/worker.py +224 -0
  69. package/backend/app/services/entities/__init__.py +12 -0
  70. package/backend/app/services/entities/gliner_extract.py +63 -0
  71. package/backend/app/services/entities/llm_extract.py +94 -0
  72. package/backend/app/services/entities/pipeline.py +179 -0
  73. package/backend/app/services/entities/spacy_extract.py +63 -0
  74. package/backend/app/services/entities/types.py +15 -0
  75. package/backend/app/services/gemini_chat.py +113 -0
  76. package/backend/app/services/hooks/__init__.py +3 -0
  77. package/backend/app/services/hooks/post_ingest.py +186 -0
  78. package/backend/app/services/ingestion/__init__.py +0 -0
  79. package/backend/app/services/ingestion/persist.py +188 -0
  80. package/backend/app/services/integrations_remote.py +91 -0
  81. package/backend/app/services/link_expansion/__init__.py +3 -0
  82. package/backend/app/services/link_expansion/canonical_url.py +45 -0
  83. package/backend/app/services/link_expansion/domain_policy.py +26 -0
  84. package/backend/app/services/link_expansion/html_extract.py +72 -0
  85. package/backend/app/services/link_expansion/rate_limit.py +32 -0
  86. package/backend/app/services/link_expansion/robots.py +46 -0
  87. package/backend/app/services/link_expansion/schemas.py +67 -0
  88. package/backend/app/services/link_expansion/worker.py +458 -0
  89. package/backend/app/services/normalization/__init__.py +7 -0
  90. package/backend/app/services/normalization/normalizer.py +331 -0
  91. package/backend/app/services/normalization/persist_normalized.py +67 -0
  92. package/backend/app/services/playwright_extract/__init__.py +13 -0
  93. package/backend/app/services/playwright_extract/__main__.py +96 -0
  94. package/backend/app/services/playwright_extract/extract.py +181 -0
  95. package/backend/app/services/retrieval_service.py +351 -0
  96. package/backend/app/sqlite_ext.py +36 -0
  97. package/backend/app/storage/__init__.py +3 -0
  98. package/backend/app/storage/blobs.py +30 -0
  99. package/backend/app/vectorstore/__init__.py +13 -0
  100. package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
  101. package/backend/backend.egg-info/PKG-INFO +18 -0
  102. package/backend/backend.egg-info/SOURCES.txt +93 -0
  103. package/backend/backend.egg-info/dependency_links.txt +1 -0
  104. package/backend/backend.egg-info/entry_points.txt +2 -0
  105. package/backend/backend.egg-info/requires.txt +15 -0
  106. package/backend/backend.egg-info/top_level.txt +4 -0
  107. package/backend/package.json +15 -0
  108. package/backend/pyproject.toml +52 -0
  109. package/backend/tests/conftest.py +40 -0
  110. package/backend/tests/test_chat.py +92 -0
  111. package/backend/tests/test_chunking.py +132 -0
  112. package/backend/tests/test_entities.py +170 -0
  113. package/backend/tests/test_gemini_embed.py +224 -0
  114. package/backend/tests/test_health.py +24 -0
  115. package/backend/tests/test_ingest_raw.py +123 -0
  116. package/backend/tests/test_link_expansion.py +241 -0
  117. package/backend/tests/test_main.py +12 -0
  118. package/backend/tests/test_normalizer.py +114 -0
  119. package/backend/tests/test_openapi_gateway.py +40 -0
  120. package/backend/tests/test_pipeline_hardening.py +285 -0
  121. package/backend/tests/test_pipeline_status.py +71 -0
  122. package/backend/tests/test_playwright_extract.py +80 -0
  123. package/backend/tests/test_post_ingest_hooks.py +162 -0
  124. package/backend/tests/test_query.py +165 -0
  125. package/backend/tests/test_thread_expansion.py +72 -0
  126. package/backend/tests/test_vectors.py +85 -0
  127. package/backend/uv.lock +1839 -0
  128. package/bin/business-stack.cjs +412 -0
  129. package/frontend/web/.env.example +23 -0
  130. package/frontend/web/AGENTS.md +5 -0
  131. package/frontend/web/CLAUDE.md +1 -0
  132. package/frontend/web/README.md +36 -0
  133. package/frontend/web/components.json +25 -0
  134. package/frontend/web/next-env.d.ts +6 -0
  135. package/frontend/web/next.config.ts +30 -0
  136. package/frontend/web/package.json +65 -0
  137. package/frontend/web/postcss.config.mjs +7 -0
  138. package/frontend/web/skills-lock.json +35 -0
  139. package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
  140. package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
  141. package/frontend/web/src/app/chat/page.tsx +725 -0
  142. package/frontend/web/src/app/favicon.ico +0 -0
  143. package/frontend/web/src/app/globals.css +563 -0
  144. package/frontend/web/src/app/layout.tsx +50 -0
  145. package/frontend/web/src/app/page.tsx +96 -0
  146. package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
  147. package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
  148. package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
  149. package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
  150. package/frontend/web/src/components/home-auth-panel.tsx +49 -0
  151. package/frontend/web/src/components/providers.tsx +50 -0
  152. package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
  153. package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
  154. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
  155. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
  156. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
  157. package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
  158. package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
  159. package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
  160. package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
  161. package/frontend/web/src/lib/auth-client.ts +23 -0
  162. package/frontend/web/src/lib/integrations-config.ts +125 -0
  163. package/frontend/web/src/lib/ui-utills.tsx +90 -0
  164. package/frontend/web/src/lib/utils.ts +6 -0
  165. package/frontend/web/tsconfig.json +36 -0
  166. package/frontend/web/tsconfig.tsbuildinfo +1 -0
  167. package/frontend/web/vitest.config.ts +14 -0
  168. package/gateway/.env.example +23 -0
  169. package/gateway/README.md +13 -0
  170. package/gateway/package.json +24 -0
  171. package/gateway/src/auth.ts +49 -0
  172. package/gateway/src/index.ts +141 -0
  173. package/gateway/src/integrations/admin.ts +19 -0
  174. package/gateway/src/integrations/crypto.ts +52 -0
  175. package/gateway/src/integrations/handlers.ts +124 -0
  176. package/gateway/src/integrations/keys.ts +12 -0
  177. package/gateway/src/integrations/store.ts +106 -0
  178. package/gateway/src/stack-secrets.ts +35 -0
  179. package/gateway/tsconfig.json +13 -0
  180. package/package.json +33 -0
  181. package/turbo.json +27 -0
@@ -0,0 +1,72 @@
1
+ from __future__ import annotations
2
+
3
+ from html.parser import HTMLParser
4
+ from urllib.parse import urljoin
5
+
6
+
7
+ class _LinkCollector(HTMLParser):
8
+ def __init__(self, base_url: str) -> None:
9
+ super().__init__(convert_charrefs=True)
10
+ self.base_url = base_url
11
+ self.links: list[str] = []
12
+
13
+ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
14
+ if tag.lower() != "a":
15
+ return
16
+ href = None
17
+ for k, v in attrs:
18
+ if k.lower() == "href" and v:
19
+ href = v
20
+ break
21
+ if not href:
22
+ return
23
+ h = href.strip()
24
+ if h.startswith(("#", "mailto:", "javascript:", "tel:")):
25
+ return
26
+ self.links.append(urljoin(self.base_url, h))
27
+
28
+
29
+ class _TextCollector(HTMLParser):
30
+ def __init__(self) -> None:
31
+ super().__init__(convert_charrefs=True)
32
+ self._skip = 0
33
+ self.parts: list[str] = []
34
+
35
+ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
36
+ t = tag.lower()
37
+ if t in ("script", "style", "noscript"):
38
+ self._skip += 1
39
+
40
+ def handle_endtag(self, tag: str) -> None:
41
+ t = tag.lower()
42
+ if t in ("script", "style", "noscript") and self._skip > 0:
43
+ self._skip -= 1
44
+
45
+ def handle_data(self, data: str) -> None:
46
+ if self._skip == 0 and data.strip():
47
+ self.parts.append(data)
48
+
49
+
50
def extract_links_from_html(html: str, base_url: str) -> list[str]:
    """Return de-duplicated absolute link targets found in *html*.

    Parsing is best-effort: if the parser raises, whatever links were
    collected before the failure are still returned. First-appearance
    order is preserved.
    """
    collector = _LinkCollector(base_url)
    try:
        collector.feed(html)
        collector.close()
    except Exception:
        # Malformed markup — fall through with the partial result.
        pass
    return list(dict.fromkeys(collector.links))
58
+
59
+
60
def extract_text_from_html(html: str, max_chars: int = 200_000) -> str:
    """Return whitespace-normalized visible text from *html*, capped at *max_chars*.

    On a parser error the text gathered so far is returned truncated but
    without whitespace normalization (best-effort extraction).
    """
    collector = _TextCollector()
    try:
        collector.feed(html)
        collector.close()
    except Exception:
        # Best-effort: join what we have, skip normalization.
        return " ".join(collector.parts)[:max_chars]
    normalized = " ".join(" ".join(collector.parts).split())
    return normalized[:max_chars] if len(normalized) > max_chars else normalized
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import time
5
+ from collections import defaultdict
6
+
7
+
8
class PolitenessGate:
    """Per-host spacing; each wait is max(politeness floor, domain interval gap)."""

    def __init__(
        self,
        *,
        politeness_delay_s: float,
        per_domain_interval_s: float,
    ) -> None:
        # Unconditional minimum pause before any fetch.
        self._politeness = politeness_delay_s
        # Target spacing between two fetches against the same host.
        self._interval = per_domain_interval_s
        # One lock per host so concurrent callers serialize their waits.
        self._locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
        # Monotonic timestamp of the last fetch per (lowercased) host.
        self._last_fetch_mono: dict[str, float] = {}

    async def before_request(self, host: str) -> None:
        """Sleep as needed so the upcoming request to *host* stays polite."""
        key = host.lower()
        async with self._locks[key]:
            elapsed = time.monotonic() - self._last_fetch_mono.get(key, 0.0)
            # Never sleep a negative amount; honor both constraints.
            pause = max(self._politeness, self._interval - elapsed, 0.0)
            if pause > 0:
                await asyncio.sleep(pause)
            self._last_fetch_mono[key] = time.monotonic()
@@ -0,0 +1,46 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from urllib.parse import urlparse
5
+ from urllib.robotparser import RobotFileParser
6
+
7
+ import httpx
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
async def fetch_robots_parser(
    client: httpx.AsyncClient,
    page_url: str,
    cache: dict[str, RobotFileParser | None],
    *,
    timeout_s: float,
) -> RobotFileParser | None:
    """Fetch and parse robots.txt for the host of *page_url*, with caching.

    Returns a parsed ``RobotFileParser`` on success, or ``None`` when the
    URL has no usable host, the fetch fails, or robots.txt is missing or
    empty. Both outcomes are cached per host (a cached ``None`` means
    "no robots restrictions known").
    """
    parsed = urlparse(page_url)
    if not (parsed.scheme and parsed.netloc):
        return None
    host = parsed.netloc.lower()
    try:
        return cache[host]
    except KeyError:
        pass
    try:
        resp = await client.get(f"{parsed.scheme}://{host}/robots.txt", timeout=timeout_s)
        content = resp.text if resp.status_code == 200 else ""
        if content:
            parser = RobotFileParser()
            parser.parse(content.splitlines())
            cache[host] = parser
            return parser
    except Exception:
        # Treat an unreachable robots.txt the same as an absent one.
        logger.debug("robots.txt fetch failed for %s", host, exc_info=True)
    cache[host] = None
    return None
37
+
38
+
39
+ def robots_can_fetch(
40
+ rp: RobotFileParser | None,
41
+ user_agent: str,
42
+ url: str,
43
+ ) -> bool:
44
+ if rp is None:
45
+ return True
46
+ return rp.can_fetch(user_agent, url)
@@ -0,0 +1,67 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any
5
+
6
+ from pydantic import BaseModel, ConfigDict, Field
7
+
8
+ from app.config import Settings
9
+ from app.services.link_expansion.domain_policy import parse_host_csv
10
+
11
+
12
@dataclass(frozen=True)
class ExpandLinksOptions:
    """Immutable, fully-resolved options for one link-expansion job.

    Built by ExpandLinksRequest.resolve(), which merges per-request
    overrides with application-wide settings.
    """

    # Maximum link depth to follow from the root document (API-supplied
    # values are validated to the 1..32 range).
    max_depth: int
    # Hostnames permitted for fetching; request-supplied entries are
    # lowercased with any ":port" suffix stripped.
    allowlist: frozenset[str]
    # Hostnames to reject; same normalization as `allowlist`.
    denylist: frozenset[str]
    # When True, robots.txt is consulted before each fetch.
    respect_robots: bool
18
+
19
+
20
class ExpandLinksRequest(BaseModel):
    """API payload asking to expand the outbound links of a stored document.

    Every field except ``document_id`` is optional; unset fields fall back
    to the application settings in :meth:`resolve`.
    """

    model_config = ConfigDict(extra="forbid")

    document_id: str = Field(..., min_length=1)
    max_depth: int | None = Field(default=None, ge=1, le=32)
    domain_allowlist: list[str] | None = None
    domain_denylist: list[str] | None = None
    respect_robots: bool | None = None

    def resolve(self, settings: Settings) -> ExpandLinksOptions:
        """Merge explicit request values with configured defaults."""

        def normalize(hosts: list[str]) -> frozenset[str]:
            # Lowercase each entry and strip any ":port" suffix.
            return frozenset(h.strip().lower().split(":")[0] for h in hosts)

        if self.domain_allowlist is not None:
            allow = normalize(self.domain_allowlist)
        else:
            allow = parse_host_csv(settings.link_expand_domain_allowlist)

        if self.domain_denylist is not None:
            deny = normalize(self.domain_denylist)
        else:
            deny = parse_host_csv(settings.link_expand_domain_denylist)

        depth = (
            settings.link_expand_max_depth
            if self.max_depth is None
            else self.max_depth
        )
        robots = (
            settings.link_expand_respect_robots
            if self.respect_robots is None
            else self.respect_robots
        )
        return ExpandLinksOptions(
            max_depth=depth,
            allowlist=allow,
            denylist=deny,
            respect_robots=robots,
        )
56
+
57
+
58
class ExpandLinksJobResult(BaseModel):
    """Counters and error details accumulated by one link-expansion run."""

    model_config = ConfigDict(extra="forbid")

    # Document whose outbound links were expanded.
    root_document_id: str
    # New child documents inserted (fetched pages and failure stubs alike).
    documents_created: int = 0
    # Parent->child 'link' relationship rows newly written.
    relationships_created: int = 0
    # URLs skipped by scheme/host allow/deny policy.
    urls_skipped_policy: int = 0
    # URLs skipped because robots.txt disallowed them.
    urls_skipped_robots: int = 0
    # Fetches that failed or exceeded the response-size cap.
    fetch_errors: int = 0
    # Structured per-URL error records (e.g. {"url": ..., "error": ...}).
    errors: list[dict[str, Any]] = Field(default_factory=list)
@@ -0,0 +1,458 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import json
5
+ import logging
6
+ from collections import deque
7
+ from datetime import UTC, datetime
8
+ from typing import Any
9
+ from urllib.robotparser import RobotFileParser
10
+ from uuid import uuid4
11
+
12
+ import httpx
13
+ from sqlalchemy import text
14
+ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
15
+
16
+ from app.config import Settings
17
+ from app.services.ingestion.persist import get_or_create_source_id
18
+ from app.services.link_expansion.canonical_url import (
19
+ canonicalize_url,
20
+ host_from_url,
21
+ is_http_url,
22
+ )
23
+ from app.services.link_expansion.domain_policy import host_allowed
24
+ from app.services.link_expansion.html_extract import (
25
+ extract_links_from_html,
26
+ extract_text_from_html,
27
+ )
28
+ from app.services.link_expansion.rate_limit import PolitenessGate
29
+ from app.services.link_expansion.robots import fetch_robots_parser, robots_can_fetch
30
+ from app.services.link_expansion.schemas import ExpandLinksJobResult, ExpandLinksOptions
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ _MAX_SUMMARY = 500
35
+ _TEXT_BLOCK_CAP = 200_000
36
+
37
+
38
async def _document_exists(session: AsyncSession, doc_id: str) -> bool:
    """Return True iff a ``documents`` row with primary key *doc_id* exists."""
    found = await session.execute(
        text("SELECT 1 FROM documents WHERE id = :id LIMIT 1"),
        {"id": doc_id},
    )
    return found.first() is not None
44
+
45
+
46
async def _load_outbound_links(session: AsyncSession, doc_id: str) -> list[str]:
    """Return the stored outbound URLs of *doc_id*, ordered by ordinal."""
    rows = await session.execute(
        text(
            "SELECT url FROM document_links WHERE document_id = :d ORDER BY ordinal",
        ),
        {"d": doc_id},
    )
    return [url for (url,) in rows.fetchall()]
54
+
55
+
56
async def _find_deduped_document(
    session: AsyncSession,
    *,
    canonical_url: str,
    content_sha256: str,
) -> str | None:
    """Return the id of an existing document with the same canonical URL and
    content hash, or ``None`` when no duplicate is stored."""
    match = (
        await session.execute(
            text(
                "SELECT id FROM documents WHERE canonical_url = :u "
                "AND content_sha256 = :h LIMIT 1",
            ),
            {"u": canonical_url, "h": content_sha256},
        )
    ).first()
    if match is None:
        return None
    return str(match[0])
71
+
72
+
73
async def _insert_relationship(
    session: AsyncSession,
    *,
    parent_id: str,
    child_id: str,
    meta: dict[str, Any],
) -> bool:
    """Write a parent->child 'link' relationship row.

    Returns True when a new row was inserted, False when an identical
    relationship already existed (INSERT OR IGNORE hit a conflict).
    """
    outcome = await session.execute(
        text(
            "INSERT OR IGNORE INTO relationships "
            "(parent_document_id, child_document_id, relation_type, weight, meta) "
            "VALUES (:p, :c, 'link', NULL, :m)",
        ),
        {"p": parent_id, "c": child_id, "m": json.dumps(meta)},
    )
    # rowcount may be None for some drivers; treat that as "nothing written".
    affected = outcome.rowcount or 0
    return affected > 0
89
+
90
+
91
async def _insert_partial_error_document(
    session: AsyncSession,
    *,
    source_id: int,
    canonical_url: str,
    requested_url: str,
    error: str,
    ingest_meta: dict[str, Any],
) -> str:
    """Persist a placeholder document for a URL whose fetch failed.

    The row is stored with status 'partial' and a NULL content hash, plus a
    single text content block describing the failure. Returns the new
    document id.
    """
    new_id = str(uuid4())
    now_iso = datetime.now(UTC).isoformat()
    raw_payload = json.dumps(
        {
            "kind": "link_fetch_error",
            "requested_url": requested_url,
            "canonical_url": canonical_url,
            "error": error,
        },
    )
    merged_meta = {**ingest_meta, "fetch_failed": True, "error": error}
    # Keep the stored summary short; the full error stays in raw/meta.
    failure_summary = f"Fetch failed: {error[:200]}"
    await session.execute(
        text(
            "INSERT INTO documents "
            "(id, source_id, timestamp, content_type, raw_content, summary, "
            "status, canonical_url, content_sha256, ingest_meta) "
            "VALUES (:id, :sid, :ts, 'text', :raw, :sum, 'partial', :cu, NULL, :im)",
        ),
        {
            "id": new_id,
            "sid": source_id,
            "ts": now_iso,
            "raw": raw_payload,
            "sum": failure_summary,
            "cu": canonical_url,
            "im": json.dumps(merged_meta),
        },
    )
    block_meta = json.dumps({"text": failure_summary, "link_expand_error": True})
    await session.execute(
        text(
            "INSERT INTO content_blocks "
            "(document_id, ordinal, type, storage_uri, inline_ref, mime, sha256, meta) "
            "VALUES (:did, 0, 'text', NULL, 'link_expand:error', "
            "'text/plain', NULL, :meta)",
        ),
        {"did": new_id, "meta": block_meta},
    )
    return new_id
140
+
141
+
142
async def _insert_fetched_document(
    session: AsyncSession,
    *,
    source_id: int,
    canonical_url: str,
    content_sha256: str,
    final_url: str,
    requested_url: str,
    body: bytes,
    content_type_header: str | None,
    outbound_urls: list[str],
) -> tuple[str, bool]:
    """Store a fetched page as a document, deduplicating by (URL, hash).

    Returns ``(document_id, created)``: the id of an existing duplicate with
    ``created=False``, or a freshly inserted document id with ``created=True``.
    Outbound links are recorded in ``document_links`` for later expansion.
    """
    duplicate = await _find_deduped_document(
        session,
        canonical_url=canonical_url,
        content_sha256=content_sha256,
    )
    if duplicate:
        return duplicate, False

    new_id = str(uuid4())
    now_iso = datetime.now(UTC).isoformat()
    decoded = body.decode("utf-8", errors="replace")
    plain = extract_text_from_html(decoded, max_chars=_TEXT_BLOCK_CAP)
    # The summary is the truncated plain text, with an ellipsis when clipped.
    summary = plain[:_MAX_SUMMARY] + ("…" if len(plain) > _MAX_SUMMARY else "")
    raw_payload = json.dumps(
        {
            "kind": "link_fetch",
            "requested_url": requested_url,
            "final_url": final_url,
            "content_type": content_type_header,
            "bytes": len(body),
        },
    )
    await session.execute(
        text(
            "INSERT INTO documents "
            "(id, source_id, timestamp, content_type, raw_content, summary, status, "
            "canonical_url, content_sha256, ingest_meta) "
            "VALUES (:id, :sid, :ts, 'text', :raw, :sum, 'partial', :cu, :h, NULL)",
        ),
        {
            "id": new_id,
            "sid": source_id,
            "ts": now_iso,
            "raw": raw_payload,
            "sum": summary,
            "cu": canonical_url,
            "h": content_sha256,
        },
    )
    block_meta = json.dumps({"text": plain[:_TEXT_BLOCK_CAP]})
    await session.execute(
        text(
            "INSERT INTO content_blocks "
            "(document_id, ordinal, type, storage_uri, inline_ref, mime, sha256, meta) "
            "VALUES (:did, 0, 'text', NULL, 'link_expand:html', "
            "'text/plain', NULL, :meta)",
        ),
        {"did": new_id, "meta": block_meta},
    )
    for position, link_url in enumerate(outbound_urls):
        await session.execute(
            text(
                "INSERT INTO document_links (document_id, url, ordinal) "
                "VALUES (:d, :u, :o)",
            ),
            {"d": new_id, "u": link_url, "o": position},
        )
    return new_id, True
212
+
213
+
214
async def expand_links_from_document(
    session: AsyncSession,
    *,
    root_document_id: str,
    options: ExpandLinksOptions,
    settings: Settings,
) -> ExpandLinksJobResult:
    """Breadth-first fetch of the outbound links recorded for a document.

    Starting from the ``document_links`` rows of *root_document_id*, fetch
    each URL (subject to depth, host allow/deny policy, and optionally
    robots.txt), persist each fetched page as a child document, and record
    parent->child 'link' relationships. Failed fetches become 'partial'
    error-stub documents so the relationship graph stays complete.

    Does NOT commit; the caller owns the transaction (see
    run_expand_links_job).
    """
    result = ExpandLinksJobResult(root_document_id=root_document_id)
    if not await _document_exists(session, root_document_id):
        result.errors.append(
            {"code": "not_found", "detail": "document id does not exist"},
        )
        return result

    # All documents created by this pipeline share one synthetic source.
    source_id = await get_or_create_source_id(
        session,
        name="link_expansion",
        connector_type="link_expansion",
    )

    # BFS queue of (parent_document_id, url, depth); root links are depth 1.
    root_links = await _load_outbound_links(session, root_document_id)
    q: deque[tuple[str, str, int]] = deque()
    for u in root_links:
        if is_http_url(u):
            q.append((root_document_id, u, 1))

    # canonical requested URL -> (final canonical URL, sha256 hex, child doc id);
    # lets repeated links reuse the earlier fetch instead of re-downloading.
    fetch_cache: dict[str, tuple[str, str, str]] = {}
    # Child documents whose own outbound links were already enqueued.
    expanded_children: set[str] = set()

    gate = PolitenessGate(
        politeness_delay_s=settings.link_expand_politeness_delay_ms / 1000.0,
        per_domain_interval_s=settings.link_expand_per_domain_interval_ms / 1000.0,
    )
    robots_cache: dict[str, RobotFileParser | None] = {}
    timeout = settings.link_expand_fetch_timeout_s
    max_bytes = settings.link_expand_max_response_bytes
    ua = settings.link_expand_user_agent

    async with httpx.AsyncClient(
        headers={"User-Agent": ua},
        follow_redirects=True,
    ) as client:
        while q:
            parent_id, url, link_depth = q.popleft()
            if link_depth > options.max_depth:
                continue
            if not is_http_url(url):
                result.urls_skipped_policy += 1
                continue

            req_canon = canonicalize_url(url)
            host = host_from_url(url)
            if not host:
                result.urls_skipped_policy += 1
                continue
            if not host_allowed(
                host,
                allowlist=options.allowlist,
                denylist=options.denylist,
            ):
                result.urls_skipped_policy += 1
                continue

            # Cache hit: reuse the earlier fetch's child document, only adding
            # a new relationship edge (and enqueueing its links once).
            if req_canon in fetch_cache:
                final_c, sha_hex, child_id = fetch_cache[req_canon]
                meta = {
                    "requested_url": url,
                    "final_url": final_c,
                    "depth": link_depth,
                    "cache_hit": True,
                }
                if await _insert_relationship(
                    session,
                    parent_id=parent_id,
                    child_id=child_id,
                    meta=meta,
                ):
                    result.relationships_created += 1
                if link_depth < options.max_depth and child_id not in expanded_children:
                    expanded_children.add(child_id)
                    for nxt in await _load_outbound_links(session, child_id):
                        if is_http_url(nxt):
                            q.append((child_id, nxt, link_depth + 1))
                continue

            # Rate-limit per host before touching the network.
            await gate.before_request(host)

            if options.respect_robots:
                rp = await fetch_robots_parser(
                    client,
                    url,
                    robots_cache,
                    # robots.txt gets its own (shorter) timeout ceiling.
                    timeout_s=min(timeout, 15.0),
                )
                if not robots_can_fetch(rp, ua, url):
                    result.urls_skipped_robots += 1
                    continue

            try:
                # Best-effort HEAD first: if content-length already exceeds the
                # cap, abort before downloading the body. HEAD failures are
                # ignored (many servers reject HEAD).
                head = None
                try:
                    head = await client.head(url, timeout=timeout)
                except httpx.HTTPError:
                    pass
                if head is not None and head.is_success:
                    cl = head.headers.get("content-length")
                    if cl is not None:
                        try:
                            if int(cl) > max_bytes:
                                # OSError escapes the inner ValueError handler
                                # and is caught by the outer except below.
                                raise OSError(f"content-length {cl} exceeds cap")
                        except ValueError:
                            # Non-numeric content-length header: ignore it.
                            pass
                response = await client.get(url, timeout=timeout)
            except Exception as e:
                # Any fetch failure becomes an error-stub child document so the
                # link graph records the attempt; the URL is also cached so it
                # is not retried within this job.
                logger.info("link fetch failed %s: %s", url, e)
                result.fetch_errors += 1
                result.errors.append({"url": url, "error": str(e)})
                canon_for_err = canonicalize_url(str(url))
                child_id = await _insert_partial_error_document(
                    session,
                    source_id=source_id,
                    canonical_url=canon_for_err,
                    requested_url=url,
                    error=str(e),
                    ingest_meta={"requested_url": url},
                )
                result.documents_created += 1
                fetch_cache[req_canon] = (canon_for_err, "", child_id)
                meta = {
                    "requested_url": url,
                    "final_url": canon_for_err,
                    "depth": link_depth,
                    "fetch_error": True,
                }
                if await _insert_relationship(
                    session,
                    parent_id=parent_id,
                    child_id=child_id,
                    meta=meta,
                ):
                    result.relationships_created += 1
                continue

            body = response.content
            # Body-size cap enforced again after download (HEAD can lie or be
            # absent); oversized responses also become error stubs.
            if len(body) > max_bytes:
                err = f"response body {len(body)} exceeds max_bytes={max_bytes}"
                result.fetch_errors += 1
                result.errors.append({"url": url, "error": err})
                final_u = canonicalize_url(str(response.url))
                child_id = await _insert_partial_error_document(
                    session,
                    source_id=source_id,
                    canonical_url=final_u,
                    requested_url=url,
                    error=err,
                    ingest_meta={"requested_url": url},
                )
                result.documents_created += 1
                fetch_cache[req_canon] = (final_u, "", child_id)
                meta = {
                    "requested_url": url,
                    "final_url": final_u,
                    "depth": link_depth,
                    "fetch_error": True,
                }
                if await _insert_relationship(
                    session,
                    parent_id=parent_id,
                    child_id=child_id,
                    meta=meta,
                ):
                    result.relationships_created += 1
                continue

            # Canonical URL and hash of the *final* (post-redirect) location.
            final_canon = canonicalize_url(str(response.url))
            sha_hex = hashlib.sha256(body).hexdigest()

            ct = (response.headers.get("content-type") or "").split(";")[0].strip()
            outbound: list[str] = []
            # Extract links when the server says HTML, or the body looks like
            # markup ("<" or a UTF-8 BOM byte) despite a missing/wrong header.
            if "html" in ct.lower() or body.lstrip()[:1] in (b"<", b"\xef"):
                outbound = extract_links_from_html(
                    body.decode("utf-8", errors="replace"),
                    str(response.url),
                )

            child_id, inserted_new = await _insert_fetched_document(
                session,
                source_id=source_id,
                canonical_url=final_canon,
                content_sha256=sha_hex,
                final_url=str(response.url),
                requested_url=url,
                body=body,
                content_type_header=ct or None,
                outbound_urls=outbound,
            )
            if inserted_new:
                result.documents_created += 1

            fetch_cache[req_canon] = (final_canon, sha_hex, child_id)

            meta = {
                "requested_url": url,
                "final_url": str(response.url),
                "depth": link_depth,
                "content_sha256": sha_hex,
            }
            if await _insert_relationship(
                session,
                parent_id=parent_id,
                child_id=child_id,
                meta=meta,
            ):
                result.relationships_created += 1

            # Enqueue the child's own outbound links exactly once, while the
            # next depth is still within bounds.
            if link_depth < options.max_depth and child_id not in expanded_children:
                expanded_children.add(child_id)
                to_follow = await _load_outbound_links(session, child_id)
                for nxt in to_follow:
                    if is_http_url(nxt):
                        q.append((child_id, nxt, link_depth + 1))

    return result
437
+
438
+
439
async def run_expand_links_job(
    *,
    root_document_id: str,
    options: ExpandLinksOptions,
    session_factory: async_sessionmaker[AsyncSession],
    settings: Settings,
) -> None:
    """Background entrypoint: own session + commit.

    Opens a fresh session, runs expand_links_from_document, and commits.
    Any failure (including a failed commit) is logged and rolled back —
    this runs detached from a request, so errors must not propagate.
    """
    async with session_factory() as session:
        try:
            await expand_links_from_document(
                session,
                root_document_id=root_document_id,
                options=options,
                settings=settings,
            )
            await session.commit()
        except Exception:
            logger.exception("link expansion job failed for %s", root_document_id)
            await session.rollback()
@@ -0,0 +1,7 @@
1
+ from app.services.normalization.normalizer import normalize_envelope_to_canonical
2
+ from app.services.normalization.persist_normalized import persist_normalized_document
3
+
4
+ __all__ = [
5
+ "normalize_envelope_to_canonical",
6
+ "persist_normalized_document",
7
+ ]