business-stack 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.python-version +1 -0
- package/backend/.env.example +65 -0
- package/backend/alembic/env.py +63 -0
- package/backend/alembic/script.py.mako +26 -0
- package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
- package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
- package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
- package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
- package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
- package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
- package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
- package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
- package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
- package/backend/alembic.ini +42 -0
- package/backend/app/__init__.py +0 -0
- package/backend/app/config.py +337 -0
- package/backend/app/connectors/__init__.py +13 -0
- package/backend/app/connectors/base.py +39 -0
- package/backend/app/connectors/builtins.py +51 -0
- package/backend/app/connectors/playwright_session.py +146 -0
- package/backend/app/connectors/registry.py +68 -0
- package/backend/app/connectors/thread_expansion/__init__.py +33 -0
- package/backend/app/connectors/thread_expansion/fakes.py +154 -0
- package/backend/app/connectors/thread_expansion/models.py +113 -0
- package/backend/app/connectors/thread_expansion/reddit.py +53 -0
- package/backend/app/connectors/thread_expansion/twitter.py +49 -0
- package/backend/app/db.py +5 -0
- package/backend/app/dependencies.py +34 -0
- package/backend/app/logging_config.py +35 -0
- package/backend/app/main.py +97 -0
- package/backend/app/middleware/__init__.py +0 -0
- package/backend/app/middleware/gateway_identity.py +17 -0
- package/backend/app/middleware/openapi_gateway.py +71 -0
- package/backend/app/middleware/request_id.py +23 -0
- package/backend/app/openapi_config.py +126 -0
- package/backend/app/routers/__init__.py +0 -0
- package/backend/app/routers/admin_pipeline.py +123 -0
- package/backend/app/routers/chat.py +206 -0
- package/backend/app/routers/chunks.py +36 -0
- package/backend/app/routers/entity_extract.py +31 -0
- package/backend/app/routers/example.py +8 -0
- package/backend/app/routers/gemini_embed.py +58 -0
- package/backend/app/routers/health.py +28 -0
- package/backend/app/routers/ingestion.py +146 -0
- package/backend/app/routers/link_expansion.py +34 -0
- package/backend/app/routers/pipeline_status.py +304 -0
- package/backend/app/routers/query.py +63 -0
- package/backend/app/routers/vectors.py +63 -0
- package/backend/app/schemas/__init__.py +0 -0
- package/backend/app/schemas/canonical.py +44 -0
- package/backend/app/schemas/chat.py +50 -0
- package/backend/app/schemas/ingest.py +29 -0
- package/backend/app/schemas/query.py +153 -0
- package/backend/app/schemas/vectors.py +56 -0
- package/backend/app/services/__init__.py +0 -0
- package/backend/app/services/chat_store.py +152 -0
- package/backend/app/services/chunking/__init__.py +3 -0
- package/backend/app/services/chunking/llm_boundaries.py +63 -0
- package/backend/app/services/chunking/schemas.py +30 -0
- package/backend/app/services/chunking/semantic_chunk.py +178 -0
- package/backend/app/services/chunking/splitters.py +214 -0
- package/backend/app/services/embeddings/__init__.py +20 -0
- package/backend/app/services/embeddings/build_inputs.py +140 -0
- package/backend/app/services/embeddings/dlq.py +128 -0
- package/backend/app/services/embeddings/gemini_api.py +207 -0
- package/backend/app/services/embeddings/persist.py +74 -0
- package/backend/app/services/embeddings/types.py +32 -0
- package/backend/app/services/embeddings/worker.py +224 -0
- package/backend/app/services/entities/__init__.py +12 -0
- package/backend/app/services/entities/gliner_extract.py +63 -0
- package/backend/app/services/entities/llm_extract.py +94 -0
- package/backend/app/services/entities/pipeline.py +179 -0
- package/backend/app/services/entities/spacy_extract.py +63 -0
- package/backend/app/services/entities/types.py +15 -0
- package/backend/app/services/gemini_chat.py +113 -0
- package/backend/app/services/hooks/__init__.py +3 -0
- package/backend/app/services/hooks/post_ingest.py +186 -0
- package/backend/app/services/ingestion/__init__.py +0 -0
- package/backend/app/services/ingestion/persist.py +188 -0
- package/backend/app/services/integrations_remote.py +91 -0
- package/backend/app/services/link_expansion/__init__.py +3 -0
- package/backend/app/services/link_expansion/canonical_url.py +45 -0
- package/backend/app/services/link_expansion/domain_policy.py +26 -0
- package/backend/app/services/link_expansion/html_extract.py +72 -0
- package/backend/app/services/link_expansion/rate_limit.py +32 -0
- package/backend/app/services/link_expansion/robots.py +46 -0
- package/backend/app/services/link_expansion/schemas.py +67 -0
- package/backend/app/services/link_expansion/worker.py +458 -0
- package/backend/app/services/normalization/__init__.py +7 -0
- package/backend/app/services/normalization/normalizer.py +331 -0
- package/backend/app/services/normalization/persist_normalized.py +67 -0
- package/backend/app/services/playwright_extract/__init__.py +13 -0
- package/backend/app/services/playwright_extract/__main__.py +96 -0
- package/backend/app/services/playwright_extract/extract.py +181 -0
- package/backend/app/services/retrieval_service.py +351 -0
- package/backend/app/sqlite_ext.py +36 -0
- package/backend/app/storage/__init__.py +3 -0
- package/backend/app/storage/blobs.py +30 -0
- package/backend/app/vectorstore/__init__.py +13 -0
- package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
- package/backend/backend.egg-info/PKG-INFO +18 -0
- package/backend/backend.egg-info/SOURCES.txt +93 -0
- package/backend/backend.egg-info/dependency_links.txt +1 -0
- package/backend/backend.egg-info/entry_points.txt +2 -0
- package/backend/backend.egg-info/requires.txt +15 -0
- package/backend/backend.egg-info/top_level.txt +4 -0
- package/backend/package.json +15 -0
- package/backend/pyproject.toml +52 -0
- package/backend/tests/conftest.py +40 -0
- package/backend/tests/test_chat.py +92 -0
- package/backend/tests/test_chunking.py +132 -0
- package/backend/tests/test_entities.py +170 -0
- package/backend/tests/test_gemini_embed.py +224 -0
- package/backend/tests/test_health.py +24 -0
- package/backend/tests/test_ingest_raw.py +123 -0
- package/backend/tests/test_link_expansion.py +241 -0
- package/backend/tests/test_main.py +12 -0
- package/backend/tests/test_normalizer.py +114 -0
- package/backend/tests/test_openapi_gateway.py +40 -0
- package/backend/tests/test_pipeline_hardening.py +285 -0
- package/backend/tests/test_pipeline_status.py +71 -0
- package/backend/tests/test_playwright_extract.py +80 -0
- package/backend/tests/test_post_ingest_hooks.py +162 -0
- package/backend/tests/test_query.py +165 -0
- package/backend/tests/test_thread_expansion.py +72 -0
- package/backend/tests/test_vectors.py +85 -0
- package/backend/uv.lock +1839 -0
- package/bin/business-stack.cjs +412 -0
- package/frontend/web/.env.example +23 -0
- package/frontend/web/AGENTS.md +5 -0
- package/frontend/web/CLAUDE.md +1 -0
- package/frontend/web/README.md +36 -0
- package/frontend/web/components.json +25 -0
- package/frontend/web/next-env.d.ts +6 -0
- package/frontend/web/next.config.ts +30 -0
- package/frontend/web/package.json +65 -0
- package/frontend/web/postcss.config.mjs +7 -0
- package/frontend/web/skills-lock.json +35 -0
- package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
- package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
- package/frontend/web/src/app/chat/page.tsx +725 -0
- package/frontend/web/src/app/favicon.ico +0 -0
- package/frontend/web/src/app/globals.css +563 -0
- package/frontend/web/src/app/layout.tsx +50 -0
- package/frontend/web/src/app/page.tsx +96 -0
- package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
- package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
- package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
- package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
- package/frontend/web/src/components/home-auth-panel.tsx +49 -0
- package/frontend/web/src/components/providers.tsx +50 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
- package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
- package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
- package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
- package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
- package/frontend/web/src/lib/auth-client.ts +23 -0
- package/frontend/web/src/lib/integrations-config.ts +125 -0
- package/frontend/web/src/lib/ui-utills.tsx +90 -0
- package/frontend/web/src/lib/utils.ts +6 -0
- package/frontend/web/tsconfig.json +36 -0
- package/frontend/web/tsconfig.tsbuildinfo +1 -0
- package/frontend/web/vitest.config.ts +14 -0
- package/gateway/.env.example +23 -0
- package/gateway/README.md +13 -0
- package/gateway/package.json +24 -0
- package/gateway/src/auth.ts +49 -0
- package/gateway/src/index.ts +141 -0
- package/gateway/src/integrations/admin.ts +19 -0
- package/gateway/src/integrations/crypto.ts +52 -0
- package/gateway/src/integrations/handlers.ts +124 -0
- package/gateway/src/integrations/keys.ts +12 -0
- package/gateway/src/integrations/store.ts +106 -0
- package/gateway/src/stack-secrets.ts +35 -0
- package/gateway/tsconfig.json +13 -0
- package/package.json +33 -0
- package/turbo.json +27 -0
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from html.parser import HTMLParser
|
|
4
|
+
from urllib.parse import urljoin
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class _LinkCollector(HTMLParser):
|
|
8
|
+
def __init__(self, base_url: str) -> None:
|
|
9
|
+
super().__init__(convert_charrefs=True)
|
|
10
|
+
self.base_url = base_url
|
|
11
|
+
self.links: list[str] = []
|
|
12
|
+
|
|
13
|
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
|
14
|
+
if tag.lower() != "a":
|
|
15
|
+
return
|
|
16
|
+
href = None
|
|
17
|
+
for k, v in attrs:
|
|
18
|
+
if k.lower() == "href" and v:
|
|
19
|
+
href = v
|
|
20
|
+
break
|
|
21
|
+
if not href:
|
|
22
|
+
return
|
|
23
|
+
h = href.strip()
|
|
24
|
+
if h.startswith(("#", "mailto:", "javascript:", "tel:")):
|
|
25
|
+
return
|
|
26
|
+
self.links.append(urljoin(self.base_url, h))
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class _TextCollector(HTMLParser):
|
|
30
|
+
def __init__(self) -> None:
|
|
31
|
+
super().__init__(convert_charrefs=True)
|
|
32
|
+
self._skip = 0
|
|
33
|
+
self.parts: list[str] = []
|
|
34
|
+
|
|
35
|
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
|
36
|
+
t = tag.lower()
|
|
37
|
+
if t in ("script", "style", "noscript"):
|
|
38
|
+
self._skip += 1
|
|
39
|
+
|
|
40
|
+
def handle_endtag(self, tag: str) -> None:
|
|
41
|
+
t = tag.lower()
|
|
42
|
+
if t in ("script", "style", "noscript") and self._skip > 0:
|
|
43
|
+
self._skip -= 1
|
|
44
|
+
|
|
45
|
+
def handle_data(self, data: str) -> None:
|
|
46
|
+
if self._skip == 0 and data.strip():
|
|
47
|
+
self.parts.append(data)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def extract_links_from_html(html: str, base_url: str) -> list[str]:
    """Return the document's ``<a href>`` targets as absolute URLs.

    Order of first appearance is preserved and duplicates are dropped.
    A parser failure is tolerated: whatever was collected before the
    error is still returned.
    """
    collector = _LinkCollector(base_url)
    try:
        collector.feed(html)
        collector.close()
    except Exception:
        pass  # best-effort: keep links gathered before the parser choked
    return list(dict.fromkeys(collector.links))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def extract_text_from_html(html: str, max_chars: int = 200_000) -> str:
    """Return whitespace-normalized visible text from *html*, capped at *max_chars*.

    If the parser raises partway through, the text collected so far is
    returned (truncated, but without whitespace normalization).
    """
    collector = _TextCollector()
    try:
        collector.feed(html)
        collector.close()
    except Exception:
        # Partial result on parse failure; skip normalization here, matching
        # the minimal best-effort contract.
        return " ".join(collector.parts)[:max_chars]
    normalized = " ".join(" ".join(collector.parts).split())
    return normalized[:max_chars]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import time
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PolitenessGate:
    """Per-host spacing; each wait is max(politeness floor, domain interval gap)."""

    def __init__(
        self,
        *,
        politeness_delay_s: float,
        per_domain_interval_s: float,
    ) -> None:
        self._politeness = politeness_delay_s
        self._interval = per_domain_interval_s
        # One lock per host so waits for different hosts don't serialize.
        self._locks: dict[str, asyncio.Lock] = defaultdict(asyncio.Lock)
        # Monotonic timestamp of the most recent fetch, keyed by lowercase host.
        self._last_fetch_mono: dict[str, float] = {}

    async def before_request(self, host: str) -> None:
        """Sleep until *host* may be fetched again, then record the fetch time."""
        normalized = host.lower()
        async with self._locks[normalized]:
            elapsed = time.monotonic() - self._last_fetch_mono.get(normalized, 0.0)
            remaining_gap = max(0.0, self._interval - elapsed)
            delay = max(self._politeness, remaining_gap)
            if delay > 0:
                await asyncio.sleep(delay)
            self._last_fetch_mono[normalized] = time.monotonic()
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from urllib.parse import urlparse
|
|
5
|
+
from urllib.robotparser import RobotFileParser
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
async def fetch_robots_parser(
    client: httpx.AsyncClient,
    page_url: str,
    cache: dict[str, RobotFileParser | None],
    *,
    timeout_s: float,
) -> RobotFileParser | None:
    """Fetch and parse robots.txt for *page_url*'s host, memoized in *cache*.

    Returns the cached value on a repeat host. Caches and returns None when
    the URL has no scheme/host, the response is non-200 or empty, or the
    fetch raises.
    """
    parts = urlparse(page_url)
    if not (parts.scheme and parts.netloc):
        return None
    host = parts.netloc.lower()
    if host in cache:
        return cache[host]
    try:
        resp = await client.get(f"{parts.scheme}://{host}/robots.txt", timeout=timeout_s)
        if resp.status_code == 200 and resp.text:
            parser = RobotFileParser()
            parser.parse(resp.text.splitlines())
            cache[host] = parser
            return parser
    except Exception:
        logger.debug("robots.txt fetch failed for %s", host, exc_info=True)
    # Negative-cache the host so failures aren't retried on every URL.
    cache[host] = None
    return None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def robots_can_fetch(
    rp: RobotFileParser | None,
    user_agent: str,
    url: str,
) -> bool:
    """Return whether *url* may be fetched; no parser means everything is allowed."""
    return True if rp is None else rp.can_fetch(user_agent, url)
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
7
|
+
|
|
8
|
+
from app.config import Settings
|
|
9
|
+
from app.services.link_expansion.domain_policy import parse_host_csv
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
class ExpandLinksOptions:
    """Resolved, immutable settings for a single link-expansion job."""

    # Maximum traversal depth from the root document; links queued at
    # depth > max_depth are not fetched.
    max_depth: int
    # Hostnames (lowercased, port stripped) permitted for fetching.
    # Presumably an empty set means "no allowlist restriction" — confirm
    # against domain_policy.host_allowed().
    allowlist: frozenset[str]
    # Hostnames that must never be fetched.
    denylist: frozenset[str]
    # Whether robots.txt is consulted before each fetch.
    respect_robots: bool
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ExpandLinksRequest(BaseModel):
    """API payload for a link-expansion job.

    Every field except ``document_id`` is optional; ``None`` means "use the
    configured default from settings".
    """

    model_config = ConfigDict(extra="forbid")

    document_id: str = Field(..., min_length=1)
    max_depth: int | None = Field(default=None, ge=1, le=32)
    domain_allowlist: list[str] | None = None
    domain_denylist: list[str] | None = None
    respect_robots: bool | None = None

    def resolve(self, settings: Settings) -> ExpandLinksOptions:
        """Merge request overrides with *settings* defaults into ExpandLinksOptions."""

        def hosts(values: list[str] | None, fallback_csv: str) -> frozenset[str]:
            # Request-supplied hosts are normalized (trimmed, lowercased,
            # port stripped); otherwise the configured CSV is parsed.
            if values is None:
                return parse_host_csv(fallback_csv)
            return frozenset(v.strip().lower().split(":")[0] for v in values)

        return ExpandLinksOptions(
            max_depth=(
                self.max_depth
                if self.max_depth is not None
                else settings.link_expand_max_depth
            ),
            allowlist=hosts(self.domain_allowlist, settings.link_expand_domain_allowlist),
            denylist=hosts(self.domain_denylist, settings.link_expand_domain_denylist),
            respect_robots=(
                self.respect_robots
                if self.respect_robots is not None
                else settings.link_expand_respect_robots
            ),
        )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class ExpandLinksJobResult(BaseModel):
    """Outcome counters and error details for one link-expansion job."""

    model_config = ConfigDict(extra="forbid")

    # Id of the document whose stored links were expanded.
    root_document_id: str
    # Newly inserted documents; fetches deduplicated against an existing
    # (canonical_url, content_sha256) row are not counted.
    documents_created: int = 0
    relationships_created: int = 0
    # URLs skipped by scheme/host policy (non-HTTP, missing host, allow/deny list).
    urls_skipped_policy: int = 0
    # URLs skipped because robots.txt disallowed the configured user agent.
    urls_skipped_robots: int = 0
    fetch_errors: int = 0
    # One {"url": ..., "error": ...} (or {"code": ..., "detail": ...}) per failure.
    errors: list[dict[str, Any]] = Field(default_factory=list)
|
|
@@ -0,0 +1,458 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
from collections import deque
|
|
7
|
+
from datetime import UTC, datetime
|
|
8
|
+
from typing import Any
|
|
9
|
+
from urllib.robotparser import RobotFileParser
|
|
10
|
+
from uuid import uuid4
|
|
11
|
+
|
|
12
|
+
import httpx
|
|
13
|
+
from sqlalchemy import text
|
|
14
|
+
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
|
|
15
|
+
|
|
16
|
+
from app.config import Settings
|
|
17
|
+
from app.services.ingestion.persist import get_or_create_source_id
|
|
18
|
+
from app.services.link_expansion.canonical_url import (
|
|
19
|
+
canonicalize_url,
|
|
20
|
+
host_from_url,
|
|
21
|
+
is_http_url,
|
|
22
|
+
)
|
|
23
|
+
from app.services.link_expansion.domain_policy import host_allowed
|
|
24
|
+
from app.services.link_expansion.html_extract import (
|
|
25
|
+
extract_links_from_html,
|
|
26
|
+
extract_text_from_html,
|
|
27
|
+
)
|
|
28
|
+
from app.services.link_expansion.rate_limit import PolitenessGate
|
|
29
|
+
from app.services.link_expansion.robots import fetch_robots_parser, robots_can_fetch
|
|
30
|
+
from app.services.link_expansion.schemas import ExpandLinksJobResult, ExpandLinksOptions
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
_MAX_SUMMARY = 500
|
|
35
|
+
_TEXT_BLOCK_CAP = 200_000
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
async def _document_exists(session: AsyncSession, doc_id: str) -> bool:
    """Return True when a row with *doc_id* exists in ``documents``."""
    lookup = await session.execute(
        text("SELECT 1 FROM documents WHERE id = :id LIMIT 1"),
        {"id": doc_id},
    )
    return lookup.first() is not None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
async def _load_outbound_links(session: AsyncSession, doc_id: str) -> list[str]:
    """Return the document's stored outbound link URLs, in ordinal order."""
    rows = await session.execute(
        text(
            "SELECT url FROM document_links WHERE document_id = :d ORDER BY ordinal",
        ),
        {"d": doc_id},
    )
    return [url for (url,) in rows.fetchall()]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
async def _find_deduped_document(
    session: AsyncSession,
    *,
    canonical_url: str,
    content_sha256: str,
) -> str | None:
    """Return the id of an existing document with the same canonical URL and
    content hash, or None when no duplicate is stored."""
    rows = await session.execute(
        text(
            "SELECT id FROM documents WHERE canonical_url = :u "
            "AND content_sha256 = :h LIMIT 1",
        ),
        {"u": canonical_url, "h": content_sha256},
    )
    match = rows.first()
    return str(match[0]) if match else None
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
async def _insert_relationship(
    session: AsyncSession,
    *,
    parent_id: str,
    child_id: str,
    meta: dict[str, Any],
) -> bool:
    """Insert a parent->child 'link' relationship row.

    Returns True only when a new row was written; INSERT OR IGNORE makes
    a duplicate pair a silent no-op (rowcount 0).
    """
    outcome = await session.execute(
        text(
            "INSERT OR IGNORE INTO relationships "
            "(parent_document_id, child_document_id, relation_type, weight, meta) "
            "VALUES (:p, :c, 'link', NULL, :m)",
        ),
        {"p": parent_id, "c": child_id, "m": json.dumps(meta)},
    )
    return (outcome.rowcount or 0) > 0
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
async def _insert_partial_error_document(
    session: AsyncSession,
    *,
    source_id: int,
    canonical_url: str,
    requested_url: str,
    error: str,
    ingest_meta: dict[str, Any],
) -> str:
    """Persist a placeholder document recording a failed link fetch.

    The documents row carries status 'partial' and a NULL content hash;
    a single text content block holds the truncated error summary.
    Returns the new document id.
    """
    new_id = str(uuid4())
    created_at = datetime.now(UTC).isoformat()
    raw_payload = json.dumps(
        {
            "kind": "link_fetch_error",
            "requested_url": requested_url,
            "canonical_url": canonical_url,
            "error": error,
        },
    )
    summary = f"Fetch failed: {error[:200]}"
    meta_payload = json.dumps({**ingest_meta, "fetch_failed": True, "error": error})
    await session.execute(
        text(
            "INSERT INTO documents "
            "(id, source_id, timestamp, content_type, raw_content, summary, "
            "status, canonical_url, content_sha256, ingest_meta) "
            "VALUES (:id, :sid, :ts, 'text', :raw, :sum, 'partial', :cu, NULL, :im)",
        ),
        {
            "id": new_id,
            "sid": source_id,
            "ts": created_at,
            "raw": raw_payload,
            "sum": summary,
            "cu": canonical_url,
            "im": meta_payload,
        },
    )
    await session.execute(
        text(
            "INSERT INTO content_blocks "
            "(document_id, ordinal, type, storage_uri, inline_ref, mime, sha256, meta) "
            "VALUES (:did, 0, 'text', NULL, 'link_expand:error', "
            "'text/plain', NULL, :meta)",
        ),
        {"did": new_id, "meta": json.dumps({"text": summary, "link_expand_error": True})},
    )
    return new_id
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
async def _insert_fetched_document(
    session: AsyncSession,
    *,
    source_id: int,
    canonical_url: str,
    content_sha256: str,
    final_url: str,
    requested_url: str,
    body: bytes,
    content_type_header: str | None,
    outbound_urls: list[str],
) -> tuple[str, bool]:
    """Store a fetched page as a document, deduplicating on
    (canonical_url, content_sha256).

    Writes the extracted plain text as content block 0 and each outbound
    URL into document_links. Returns (document_id, created); created is
    False when an existing duplicate row was reused.
    """
    duplicate = await _find_deduped_document(
        session,
        canonical_url=canonical_url,
        content_sha256=content_sha256,
    )
    if duplicate:
        return duplicate, False

    new_id = str(uuid4())
    created_at = datetime.now(UTC).isoformat()
    html = body.decode("utf-8", errors="replace")
    plain = extract_text_from_html(html, max_chars=_TEXT_BLOCK_CAP)
    summary = plain[:_MAX_SUMMARY] + ("…" if len(plain) > _MAX_SUMMARY else "")
    raw_payload = json.dumps(
        {
            "kind": "link_fetch",
            "requested_url": requested_url,
            "final_url": final_url,
            "content_type": content_type_header,
            "bytes": len(body),
        },
    )
    await session.execute(
        text(
            "INSERT INTO documents "
            "(id, source_id, timestamp, content_type, raw_content, summary, status, "
            "canonical_url, content_sha256, ingest_meta) "
            "VALUES (:id, :sid, :ts, 'text', :raw, :sum, 'partial', :cu, :h, NULL)",
        ),
        {
            "id": new_id,
            "sid": source_id,
            "ts": created_at,
            "raw": raw_payload,
            "sum": summary,
            "cu": canonical_url,
            "h": content_sha256,
        },
    )
    await session.execute(
        text(
            "INSERT INTO content_blocks "
            "(document_id, ordinal, type, storage_uri, inline_ref, mime, sha256, meta) "
            "VALUES (:did, 0, 'text', NULL, 'link_expand:html', "
            "'text/plain', NULL, :meta)",
        ),
        {"did": new_id, "meta": json.dumps({"text": plain[:_TEXT_BLOCK_CAP]})},
    )
    for ordinal, link_url in enumerate(outbound_urls):
        await session.execute(
            text(
                "INSERT INTO document_links (document_id, url, ordinal) "
                "VALUES (:d, :u, :o)",
            ),
            {"d": new_id, "u": link_url, "o": ordinal},
        )
    return new_id, True
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
async def expand_links_from_document(
    session: AsyncSession,
    *,
    root_document_id: str,
    options: ExpandLinksOptions,
    settings: Settings,
) -> ExpandLinksJobResult:
    """Breadth-first fetch of the root document's stored outbound links.

    Starting from the URLs in document_links for *root_document_id*, fetches
    each allowed URL up to options.max_depth, persists each fetch as a new
    document (or a 'partial' error document on failure), and records a
    parent->child 'link' relationship for every traversed edge. Counters
    and per-URL errors are accumulated in the returned result; this
    function does not commit — the caller owns the transaction.
    """
    result = ExpandLinksJobResult(root_document_id=root_document_id)
    if not await _document_exists(session, root_document_id):
        result.errors.append(
            {"code": "not_found", "detail": "document id does not exist"},
        )
        return result

    source_id = await get_or_create_source_id(
        session,
        name="link_expansion",
        connector_type="link_expansion",
    )

    # BFS queue of (parent document id, URL to fetch, depth of that edge).
    root_links = await _load_outbound_links(session, root_document_id)
    q: deque[tuple[str, str, int]] = deque()
    for u in root_links:
        if is_http_url(u):
            q.append((root_document_id, u, 1))

    # Maps canonical requested URL -> (final canonical URL, sha256 hex, doc id)
    # so a URL seen again in this run only creates a relationship, not a fetch.
    fetch_cache: dict[str, tuple[str, str, str]] = {}
    # Child documents whose outbound links have already been queued.
    expanded_children: set[str] = set()

    gate = PolitenessGate(
        politeness_delay_s=settings.link_expand_politeness_delay_ms / 1000.0,
        per_domain_interval_s=settings.link_expand_per_domain_interval_ms / 1000.0,
    )
    robots_cache: dict[str, RobotFileParser | None] = {}
    timeout = settings.link_expand_fetch_timeout_s
    max_bytes = settings.link_expand_max_response_bytes
    ua = settings.link_expand_user_agent

    async with httpx.AsyncClient(
        headers={"User-Agent": ua},
        follow_redirects=True,
    ) as client:
        while q:
            parent_id, url, link_depth = q.popleft()
            if link_depth > options.max_depth:
                continue
            if not is_http_url(url):
                result.urls_skipped_policy += 1
                continue

            req_canon = canonicalize_url(url)
            host = host_from_url(url)
            if not host:
                result.urls_skipped_policy += 1
                continue
            if not host_allowed(
                host,
                allowlist=options.allowlist,
                denylist=options.denylist,
            ):
                result.urls_skipped_policy += 1
                continue

            # Already fetched this canonical URL in this run: link to the
            # existing child and queue its outbound links without refetching.
            if req_canon in fetch_cache:
                final_c, sha_hex, child_id = fetch_cache[req_canon]
                meta = {
                    "requested_url": url,
                    "final_url": final_c,
                    "depth": link_depth,
                    "cache_hit": True,
                }
                if await _insert_relationship(
                    session,
                    parent_id=parent_id,
                    child_id=child_id,
                    meta=meta,
                ):
                    result.relationships_created += 1
                if link_depth < options.max_depth and child_id not in expanded_children:
                    expanded_children.add(child_id)
                    for nxt in await _load_outbound_links(session, child_id):
                        if is_http_url(nxt):
                            q.append((child_id, nxt, link_depth + 1))
                continue

            # Per-host politeness delay before any network traffic.
            await gate.before_request(host)

            if options.respect_robots:
                # robots.txt fetch uses its own (capped) timeout.
                rp = await fetch_robots_parser(
                    client,
                    url,
                    robots_cache,
                    timeout_s=min(timeout, 15.0),
                )
                if not robots_can_fetch(rp, ua, url):
                    result.urls_skipped_robots += 1
                    continue

            try:
                # Best-effort HEAD to reject oversized bodies before GET;
                # the raised OSError is deliberately routed into the outer
                # except so the size rejection reuses the fetch-error path.
                head = None
                try:
                    head = await client.head(url, timeout=timeout)
                except httpx.HTTPError:
                    pass
                if head is not None and head.is_success:
                    cl = head.headers.get("content-length")
                    if cl is not None:
                        try:
                            if int(cl) > max_bytes:
                                raise OSError(f"content-length {cl} exceeds cap")
                        except ValueError:
                            pass  # unparseable content-length: fall through to GET
                response = await client.get(url, timeout=timeout)
            except Exception as e:
                # Fetch failed: persist a 'partial' error document so the
                # failure is visible and the URL is not retried this run.
                logger.info("link fetch failed %s: %s", url, e)
                result.fetch_errors += 1
                result.errors.append({"url": url, "error": str(e)})
                canon_for_err = canonicalize_url(str(url))
                child_id = await _insert_partial_error_document(
                    session,
                    source_id=source_id,
                    canonical_url=canon_for_err,
                    requested_url=url,
                    error=str(e),
                    ingest_meta={"requested_url": url},
                )
                result.documents_created += 1
                fetch_cache[req_canon] = (canon_for_err, "", child_id)
                meta = {
                    "requested_url": url,
                    "final_url": canon_for_err,
                    "depth": link_depth,
                    "fetch_error": True,
                }
                if await _insert_relationship(
                    session,
                    parent_id=parent_id,
                    child_id=child_id,
                    meta=meta,
                ):
                    result.relationships_created += 1
                continue

            body = response.content
            if len(body) > max_bytes:
                # Body exceeded the cap despite the HEAD check (e.g. no
                # content-length header): same error-document treatment.
                err = f"response body {len(body)} exceeds max_bytes={max_bytes}"
                result.fetch_errors += 1
                result.errors.append({"url": url, "error": err})
                final_u = canonicalize_url(str(response.url))
                child_id = await _insert_partial_error_document(
                    session,
                    source_id=source_id,
                    canonical_url=final_u,
                    requested_url=url,
                    error=err,
                    ingest_meta={"requested_url": url},
                )
                result.documents_created += 1
                fetch_cache[req_canon] = (final_u, "", child_id)
                meta = {
                    "requested_url": url,
                    "final_url": final_u,
                    "depth": link_depth,
                    "fetch_error": True,
                }
                if await _insert_relationship(
                    session,
                    parent_id=parent_id,
                    child_id=child_id,
                    meta=meta,
                ):
                    result.relationships_created += 1
                continue

            final_canon = canonicalize_url(str(response.url))
            sha_hex = hashlib.sha256(body).hexdigest()

            # Extract outbound links only from (likely) HTML: declared
            # content type, a '<' first byte, or 0xEF (UTF-8 BOM lead byte).
            ct = (response.headers.get("content-type") or "").split(";")[0].strip()
            outbound: list[str] = []
            if "html" in ct.lower() or body.lstrip()[:1] in (b"<", b"\xef"):
                outbound = extract_links_from_html(
                    body.decode("utf-8", errors="replace"),
                    str(response.url),
                )

            child_id, inserted_new = await _insert_fetched_document(
                session,
                source_id=source_id,
                canonical_url=final_canon,
                content_sha256=sha_hex,
                final_url=str(response.url),
                requested_url=url,
                body=body,
                content_type_header=ct or None,
                outbound_urls=outbound,
            )
            if inserted_new:
                result.documents_created += 1

            fetch_cache[req_canon] = (final_canon, sha_hex, child_id)

            meta = {
                "requested_url": url,
                "final_url": str(response.url),
                "depth": link_depth,
                "content_sha256": sha_hex,
            }
            if await _insert_relationship(
                session,
                parent_id=parent_id,
                child_id=child_id,
                meta=meta,
            ):
                result.relationships_created += 1

            # Queue the child's own links while depth budget remains.
            if link_depth < options.max_depth and child_id not in expanded_children:
                expanded_children.add(child_id)
                to_follow = await _load_outbound_links(session, child_id)
                for nxt in to_follow:
                    if is_http_url(nxt):
                        q.append((child_id, nxt, link_depth + 1))

    return result
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
async def run_expand_links_job(
    *,
    root_document_id: str,
    options: ExpandLinksOptions,
    session_factory: async_sessionmaker[AsyncSession],
    settings: Settings,
) -> None:
    """Background entrypoint: own session + commit.

    Opens a fresh session from *session_factory*, runs the expansion, and
    commits on success. Any failure (including a failed commit) is logged
    and rolled back; exceptions are never propagated to the caller.
    """
    async with session_factory() as session:
        try:
            await expand_links_from_document(
                session,
                root_document_id=root_document_id,
                options=options,
                settings=settings,
            )
            # Commit stays inside the try so a commit failure also rolls back.
            await session.commit()
        except Exception:
            logger.exception("link expansion job failed for %s", root_document_id)
            await session.rollback()
|