business-stack 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.python-version +1 -0
- package/backend/.env.example +65 -0
- package/backend/alembic/env.py +63 -0
- package/backend/alembic/script.py.mako +26 -0
- package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
- package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
- package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
- package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
- package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
- package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
- package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
- package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
- package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
- package/backend/alembic.ini +42 -0
- package/backend/app/__init__.py +0 -0
- package/backend/app/config.py +337 -0
- package/backend/app/connectors/__init__.py +13 -0
- package/backend/app/connectors/base.py +39 -0
- package/backend/app/connectors/builtins.py +51 -0
- package/backend/app/connectors/playwright_session.py +146 -0
- package/backend/app/connectors/registry.py +68 -0
- package/backend/app/connectors/thread_expansion/__init__.py +33 -0
- package/backend/app/connectors/thread_expansion/fakes.py +154 -0
- package/backend/app/connectors/thread_expansion/models.py +113 -0
- package/backend/app/connectors/thread_expansion/reddit.py +53 -0
- package/backend/app/connectors/thread_expansion/twitter.py +49 -0
- package/backend/app/db.py +5 -0
- package/backend/app/dependencies.py +34 -0
- package/backend/app/logging_config.py +35 -0
- package/backend/app/main.py +97 -0
- package/backend/app/middleware/__init__.py +0 -0
- package/backend/app/middleware/gateway_identity.py +17 -0
- package/backend/app/middleware/openapi_gateway.py +71 -0
- package/backend/app/middleware/request_id.py +23 -0
- package/backend/app/openapi_config.py +126 -0
- package/backend/app/routers/__init__.py +0 -0
- package/backend/app/routers/admin_pipeline.py +123 -0
- package/backend/app/routers/chat.py +206 -0
- package/backend/app/routers/chunks.py +36 -0
- package/backend/app/routers/entity_extract.py +31 -0
- package/backend/app/routers/example.py +8 -0
- package/backend/app/routers/gemini_embed.py +58 -0
- package/backend/app/routers/health.py +28 -0
- package/backend/app/routers/ingestion.py +146 -0
- package/backend/app/routers/link_expansion.py +34 -0
- package/backend/app/routers/pipeline_status.py +304 -0
- package/backend/app/routers/query.py +63 -0
- package/backend/app/routers/vectors.py +63 -0
- package/backend/app/schemas/__init__.py +0 -0
- package/backend/app/schemas/canonical.py +44 -0
- package/backend/app/schemas/chat.py +50 -0
- package/backend/app/schemas/ingest.py +29 -0
- package/backend/app/schemas/query.py +153 -0
- package/backend/app/schemas/vectors.py +56 -0
- package/backend/app/services/__init__.py +0 -0
- package/backend/app/services/chat_store.py +152 -0
- package/backend/app/services/chunking/__init__.py +3 -0
- package/backend/app/services/chunking/llm_boundaries.py +63 -0
- package/backend/app/services/chunking/schemas.py +30 -0
- package/backend/app/services/chunking/semantic_chunk.py +178 -0
- package/backend/app/services/chunking/splitters.py +214 -0
- package/backend/app/services/embeddings/__init__.py +20 -0
- package/backend/app/services/embeddings/build_inputs.py +140 -0
- package/backend/app/services/embeddings/dlq.py +128 -0
- package/backend/app/services/embeddings/gemini_api.py +207 -0
- package/backend/app/services/embeddings/persist.py +74 -0
- package/backend/app/services/embeddings/types.py +32 -0
- package/backend/app/services/embeddings/worker.py +224 -0
- package/backend/app/services/entities/__init__.py +12 -0
- package/backend/app/services/entities/gliner_extract.py +63 -0
- package/backend/app/services/entities/llm_extract.py +94 -0
- package/backend/app/services/entities/pipeline.py +179 -0
- package/backend/app/services/entities/spacy_extract.py +63 -0
- package/backend/app/services/entities/types.py +15 -0
- package/backend/app/services/gemini_chat.py +113 -0
- package/backend/app/services/hooks/__init__.py +3 -0
- package/backend/app/services/hooks/post_ingest.py +186 -0
- package/backend/app/services/ingestion/__init__.py +0 -0
- package/backend/app/services/ingestion/persist.py +188 -0
- package/backend/app/services/integrations_remote.py +91 -0
- package/backend/app/services/link_expansion/__init__.py +3 -0
- package/backend/app/services/link_expansion/canonical_url.py +45 -0
- package/backend/app/services/link_expansion/domain_policy.py +26 -0
- package/backend/app/services/link_expansion/html_extract.py +72 -0
- package/backend/app/services/link_expansion/rate_limit.py +32 -0
- package/backend/app/services/link_expansion/robots.py +46 -0
- package/backend/app/services/link_expansion/schemas.py +67 -0
- package/backend/app/services/link_expansion/worker.py +458 -0
- package/backend/app/services/normalization/__init__.py +7 -0
- package/backend/app/services/normalization/normalizer.py +331 -0
- package/backend/app/services/normalization/persist_normalized.py +67 -0
- package/backend/app/services/playwright_extract/__init__.py +13 -0
- package/backend/app/services/playwright_extract/__main__.py +96 -0
- package/backend/app/services/playwright_extract/extract.py +181 -0
- package/backend/app/services/retrieval_service.py +351 -0
- package/backend/app/sqlite_ext.py +36 -0
- package/backend/app/storage/__init__.py +3 -0
- package/backend/app/storage/blobs.py +30 -0
- package/backend/app/vectorstore/__init__.py +13 -0
- package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
- package/backend/backend.egg-info/PKG-INFO +18 -0
- package/backend/backend.egg-info/SOURCES.txt +93 -0
- package/backend/backend.egg-info/dependency_links.txt +1 -0
- package/backend/backend.egg-info/entry_points.txt +2 -0
- package/backend/backend.egg-info/requires.txt +15 -0
- package/backend/backend.egg-info/top_level.txt +4 -0
- package/backend/package.json +15 -0
- package/backend/pyproject.toml +52 -0
- package/backend/tests/conftest.py +40 -0
- package/backend/tests/test_chat.py +92 -0
- package/backend/tests/test_chunking.py +132 -0
- package/backend/tests/test_entities.py +170 -0
- package/backend/tests/test_gemini_embed.py +224 -0
- package/backend/tests/test_health.py +24 -0
- package/backend/tests/test_ingest_raw.py +123 -0
- package/backend/tests/test_link_expansion.py +241 -0
- package/backend/tests/test_main.py +12 -0
- package/backend/tests/test_normalizer.py +114 -0
- package/backend/tests/test_openapi_gateway.py +40 -0
- package/backend/tests/test_pipeline_hardening.py +285 -0
- package/backend/tests/test_pipeline_status.py +71 -0
- package/backend/tests/test_playwright_extract.py +80 -0
- package/backend/tests/test_post_ingest_hooks.py +162 -0
- package/backend/tests/test_query.py +165 -0
- package/backend/tests/test_thread_expansion.py +72 -0
- package/backend/tests/test_vectors.py +85 -0
- package/backend/uv.lock +1839 -0
- package/bin/business-stack.cjs +412 -0
- package/frontend/web/.env.example +23 -0
- package/frontend/web/AGENTS.md +5 -0
- package/frontend/web/CLAUDE.md +1 -0
- package/frontend/web/README.md +36 -0
- package/frontend/web/components.json +25 -0
- package/frontend/web/next-env.d.ts +6 -0
- package/frontend/web/next.config.ts +30 -0
- package/frontend/web/package.json +65 -0
- package/frontend/web/postcss.config.mjs +7 -0
- package/frontend/web/skills-lock.json +35 -0
- package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
- package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
- package/frontend/web/src/app/chat/page.tsx +725 -0
- package/frontend/web/src/app/favicon.ico +0 -0
- package/frontend/web/src/app/globals.css +563 -0
- package/frontend/web/src/app/layout.tsx +50 -0
- package/frontend/web/src/app/page.tsx +96 -0
- package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
- package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
- package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
- package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
- package/frontend/web/src/components/home-auth-panel.tsx +49 -0
- package/frontend/web/src/components/providers.tsx +50 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
- package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
- package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
- package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
- package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
- package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
- package/frontend/web/src/lib/auth-client.ts +23 -0
- package/frontend/web/src/lib/integrations-config.ts +125 -0
- package/frontend/web/src/lib/ui-utills.tsx +90 -0
- package/frontend/web/src/lib/utils.ts +6 -0
- package/frontend/web/tsconfig.json +36 -0
- package/frontend/web/tsconfig.tsbuildinfo +1 -0
- package/frontend/web/vitest.config.ts +14 -0
- package/gateway/.env.example +23 -0
- package/gateway/README.md +13 -0
- package/gateway/package.json +24 -0
- package/gateway/src/auth.ts +49 -0
- package/gateway/src/index.ts +141 -0
- package/gateway/src/integrations/admin.ts +19 -0
- package/gateway/src/integrations/crypto.ts +52 -0
- package/gateway/src/integrations/handlers.ts +124 -0
- package/gateway/src/integrations/keys.ts +12 -0
- package/gateway/src/integrations/store.ts +106 -0
- package/gateway/src/stack-secrets.ts +35 -0
- package/gateway/tsconfig.json +13 -0
- package/package.json +33 -0
- package/turbo.json +27 -0
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import binascii
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from app.schemas.canonical import BlockType, CanonicalContentBlock, CanonicalDocument
|
|
11
|
+
from app.schemas.ingest import RawIngestEnvelope
|
|
12
|
+
from app.storage.blobs import BlobStore
|
|
13
|
+
|
|
14
|
+
# Finds http/https URLs inside free text. A match ends at whitespace or at any
# of the delimiter characters excluded by the character class below.
URL_PATTERN = re.compile(r"https?://[^\s<>\"'{}|\\^`\[\]]+", re.IGNORECASE)

# Keys of a dict payload that are read as text, in the order they are visited.
MULTIMODAL_TEXT_KEYS: tuple[str, ...] = (
    "text", "body", "caption", "tweet", "message", "title",
)
# Keys of a dict payload that are read as images, in the order they are visited.
MULTIMODAL_IMAGE_KEYS: tuple[str, ...] = (
    "image", "image_base64", "thumbnail", "photo",
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class _PendingBlock:
    """Content block gathered from a payload before it becomes a canonical block."""

    # Kind of block; the builders in this module use "text", "image" and the
    # envelope content type for audio/video/document payloads.
    block_type: BlockType
    # Stripped text for text blocks; left as None for media blocks.
    text: str | None = None
    # Raw media bytes for non-text blocks; left as None for text blocks.
    media: bytes | None = None
    # MIME type recorded together with ``media``.
    media_mime: str | None = None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class _Draft:
    """Accumulator filled while a raw payload is being taken apart."""

    # Blocks in the order they were discovered in the payload.
    pending: list[_PendingBlock] = field(default_factory=list)
    # Text fragments that are later scanned for URLs.
    text_segments_for_urls: list[str] = field(default_factory=list)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def extract_urls_from_text(text: str) -> list[str]:
    """Return every ``URL_PATTERN`` match in *text*, in order of appearance.

    A falsy *text* (``None`` or ``""``) yields an empty list.
    """
    haystack = text if text else ""
    return [match.group(0) for match in URL_PATTERN.finditer(haystack)]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _extract_metadata_urls(metadata: dict[str, Any]) -> list[str]:
    """Collect URLs from the ``url`` and ``urls`` metadata entries.

    ``url`` is used when it is a non-blank string; ``urls`` is used when it is
    a list, keeping only its non-blank string items. Every value is stripped.
    The single ``url`` comes first, followed by the ``urls`` items in order.
    """
    single = metadata.get("url")
    collected: list[str] = []
    if isinstance(single, str) and single.strip():
        collected.append(single.strip())

    many = metadata.get("urls")
    if not isinstance(many, list):
        return collected

    for candidate in many:
        if isinstance(candidate, str) and candidate.strip():
            collected.append(candidate.strip())
    return collected
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def dedupe_urls(urls: list[str]) -> list[str]:
    """Return *urls* without repeats, keeping the first occurrence of each."""
    # dict keys are unique and keep insertion order, which gives an
    # order-preserving de-duplication in one pass.
    return list(dict.fromkeys(urls))
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _decode_base64_image(data: str) -> tuple[bytes | None, str | None]:
    """Strictly decode *data* as base64.

    Returns ``(decoded_bytes, "application/octet-stream")`` on success and
    ``(None, None)`` when *data* is not valid base64.
    """
    try:
        decoded = base64.b64decode(data, validate=True)
    except (binascii.Error, ValueError):
        outcome: tuple[bytes | None, str | None] = (None, None)
    else:
        outcome = (decoded, "application/octet-stream")
    return outcome
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _tags_from_metadata(metadata: dict[str, Any]) -> list[str]:
    """Return the ``tags`` metadata entry as a list of strings.

    Anything other than a list yields ``[]``. ``None`` items are skipped and
    the remaining items are converted with ``str``.
    """
    raw_tags = metadata.get("tags")
    if isinstance(raw_tags, list):
        return [str(tag) for tag in raw_tags if tag is not None]
    return []
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _draft_from_plain_text(text: str) -> _Draft:
    """Build a draft holding one text block for *text*.

    The text is stripped first; blank text produces an empty draft. The
    stripped text is also queued for URL scanning.
    """
    draft = _Draft()
    body = text.strip()
    if not body:
        return draft
    draft.pending.append(_PendingBlock(block_type="text", text=body))
    draft.text_segments_for_urls.append(body)
    return draft
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _append_image_from_value(
    d: _Draft,
    value: Any,
    default_mime: str | None,
) -> None:
    """Append an image block to *d* when *value* carries image data.

    - ``bytes`` / ``bytearray``: stored as-is; the MIME type is *default_mime*
      or ``application/octet-stream``.
    - ``str``: decoded as base64; nothing is appended when decoding fails or
      yields no bytes. The MIME type is *default_mime* or ``image/png``.
    - anything else, including ``None``: ignored.
    """
    if isinstance(value, (bytes, bytearray)):
        media = bytes(value)
        mime = default_mime or "application/octet-stream"
    elif isinstance(value, str):
        media, _decoded_mime = _decode_base64_image(value)
        if not media:
            return
        mime = default_mime or "image/png"
    else:
        return

    d.pending.append(
        _PendingBlock(block_type="image", media=media, media_mime=mime),
    )
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _append_image_entry(d: _Draft, entry: Any) -> None:
    """Append one image entry given as bytes, a base64 string, or a dict wrapper.

    A dict wrapper is read as ``{"base64" | "data": <str>, "mime" |
    "content_type": <mime>}``; it is ignored when it has no string payload.
    Every other value is passed to ``_append_image_from_value`` with the
    ``image/png`` default.
    """
    if isinstance(entry, dict):
        b64 = entry.get("base64") or entry.get("data")
        mime = entry.get("mime") or entry.get("content_type")
        if isinstance(b64, str):
            _append_image_from_value(d, b64, str(mime) if mime else "image/png")
        return
    _append_image_from_value(d, entry, "image/png")


def _draft_from_dict_payload(payload: dict[str, Any]) -> _Draft:
    """Build a draft from a dict payload carrying text and/or images.

    Text is read from each key in ``MULTIMODAL_TEXT_KEYS``; a value may be a
    string or a list whose string items are used. Blank strings are skipped
    and every accepted string is also queued for URL scanning.

    Images are read from each key in ``MULTIMODAL_IMAGE_KEYS`` (single value
    or list) and from the ``images`` key (list only). Each entry may be raw
    bytes, a base64 string, or a dict wrapper, and all three forms are
    accepted in every position. Previously a dict inside a list under an
    image key, and bytes inside ``images``, were silently dropped.
    """
    d = _Draft()

    for key in MULTIMODAL_TEXT_KEYS:
        val = payload.get(key)
        # Treat a lone value as a one-item list so both shapes share one path;
        # None and non-string items fail the isinstance test below.
        for item in val if isinstance(val, list) else [val]:
            if isinstance(item, str) and item.strip():
                t = item.strip()
                d.pending.append(_PendingBlock(block_type="text", text=t))
                d.text_segments_for_urls.append(t)

    for key in MULTIMODAL_IMAGE_KEYS:
        if key not in payload:
            continue
        val = payload[key]
        for entry in val if isinstance(val, list) else [val]:
            _append_image_entry(d, entry)

    images = payload.get("images")
    if isinstance(images, list):
        for entry in images:
            _append_image_entry(d, entry)

    return d
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _build_draft(envelope: RawIngestEnvelope) -> _Draft:
    """Turn the envelope payload into a draft according to its content type.

    - ``image``: a base64 string, a ``{"base64"|"data", "mime"}`` dict, or raw
      bytes becomes a single image block.
    - ``audio`` / ``video`` / ``document``: a dict may supply base64 data
      and/or a ``bytes`` entry (each becomes a block); raw bytes become one
      block. Other payload shapes give an empty draft.
    - everything else, including ``text`` and ``multimodal``: a dict goes
      through ``_draft_from_dict_payload``, a string is used as plain text,
      and any other value is JSON-encoded and used as plain text.
    """
    body = envelope.payload
    kind = envelope.content_type
    is_binary = isinstance(body, (bytes, bytearray))

    if kind == "image":
        draft = _Draft()
        if isinstance(body, str):
            _append_image_from_value(draft, body, "image/png")
        elif isinstance(body, dict):
            encoded = body.get("base64") or body.get("data")
            if isinstance(encoded, str):
                declared = body.get("mime") or "image/png"
                _append_image_from_value(draft, encoded, str(declared))
        elif is_binary:
            draft.pending.append(
                _PendingBlock(
                    block_type="image",
                    media=bytes(body),
                    media_mime="application/octet-stream",
                ),
            )
        return draft

    if kind in ("audio", "video", "document"):
        draft = _Draft()
        if isinstance(body, dict):
            # Per-kind default, used only when the payload names no MIME type.
            if kind == "audio":
                fallback_mime = "audio/mpeg"
            elif kind == "video":
                fallback_mime = "video/mp4"
            else:
                fallback_mime = "application/octet-stream"
            mime = str(body.get("mime") or fallback_mime)

            encoded = body.get("base64") or body.get("data")
            if isinstance(encoded, str):
                try:
                    decoded = base64.b64decode(encoded, validate=True)
                except (binascii.Error, ValueError):
                    decoded = b""
                if decoded:
                    draft.pending.append(
                        _PendingBlock(block_type=kind, media=decoded, media_mime=mime),
                    )

            blob = body.get("bytes")
            if isinstance(blob, (bytes, bytearray)):
                draft.pending.append(
                    _PendingBlock(block_type=kind, media=bytes(blob), media_mime=mime),
                )
        elif is_binary:
            draft.pending.append(
                _PendingBlock(
                    block_type=kind,
                    media=bytes(body),
                    media_mime="application/octet-stream",
                ),
            )
        return draft

    # "text", "multimodal" and unrecognised content types all resolve the
    # payload the same way: dict -> structured, str -> plain, other -> JSON.
    if isinstance(body, dict):
        return _draft_from_dict_payload(body)
    rendered = body if isinstance(body, str) else json.dumps(body, default=str)
    return _draft_from_plain_text(rendered)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _collect_links(envelope: RawIngestEnvelope, draft: _Draft) -> list[str]:
    """Return the de-duplicated URLs of a document.

    URLs found in the draft's text segments come first, followed by the URLs
    listed in the envelope metadata.
    """
    from_text = [
        url
        for segment in draft.text_segments_for_urls
        for url in extract_urls_from_text(segment)
    ]
    from_metadata = _extract_metadata_urls(envelope.metadata)
    return dedupe_urls(from_text + from_metadata)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _summary_from_pending(pending: list[_PendingBlock]) -> str:
    """Join the text of all pending blocks into a summary of at most 500 chars.

    Blocks without text are skipped. A longer result is cut to 499 characters
    and finished with an ellipsis character.
    """
    combined = " ".join(block.text for block in pending if block.text).strip()
    return combined if len(combined) <= 500 else combined[:499] + "…"
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def normalize_envelope_to_canonical(
    *,
    document_id: str,
    envelope: RawIngestEnvelope,
    blob_store: BlobStore,
) -> CanonicalDocument:
    """Convert a raw ingest envelope into a ``CanonicalDocument``.

    Text blocks keep their text inline with MIME ``text/plain``. Blocks with
    non-empty media are written through ``blob_store.write`` and reference the
    returned URI in ``raw_input``. Blocks with neither are dropped. Links,
    tags and the summary are derived from the envelope and its draft, and
    ``entities`` is always empty.
    """
    draft = _build_draft(envelope)
    # Resolved before any blob is written, so a failure here leaves the store
    # untouched.
    links = _collect_links(envelope, draft)
    tags = _tags_from_metadata(envelope.metadata)

    content_blocks: list[CanonicalContentBlock] = []
    for pending in draft.pending:
        if pending.block_type == "text" and pending.text is not None:
            canonical_block = CanonicalContentBlock(
                type="text",
                data=pending.text,
                raw_input=None,
                mime="text/plain",
            )
        elif pending.media:
            # write() returns a pair; only the second item, the URI, is kept.
            _first, blob_uri = blob_store.write(pending.media)
            canonical_block = CanonicalContentBlock(
                type=pending.block_type,
                data=None,
                raw_input=blob_uri,
                mime=pending.media_mime,
            )
        else:
            continue
        content_blocks.append(canonical_block)

    return CanonicalDocument(
        id=document_id,
        source=envelope.source,
        timestamp=envelope.timestamp,
        content_blocks=content_blocks,
        raw_content=envelope.payload,
        entities=[],
        links=links,
        tags=tags,
        summary=_summary_from_pending(draft.pending),
    )
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def parse_blob_sha256(raw_input: str | None) -> str | None:
    """Return the part of a ``blob://`` URI that follows the scheme.

    Returns ``None`` when *raw_input* is empty, ``None``, or does not start
    with ``blob://``.
    """
    scheme = "blob://"
    if not raw_input or not raw_input.startswith(scheme):
        return None
    return raw_input[len(scheme):]
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
from sqlalchemy import text
|
|
6
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
7
|
+
|
|
8
|
+
from app.schemas.canonical import CanonicalDocument
|
|
9
|
+
from app.services.normalization.normalizer import parse_blob_sha256
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
async def persist_normalized_document(
    session: AsyncSession,
    *,
    canonical: CanonicalDocument,
) -> None:
    """Store the normalized form of a document.

    Existing ``content_blocks`` and ``document_links`` rows of the document
    are deleted and re-inserted from *canonical*, numbered by position, and
    ``documents.summary`` is updated. All statements are executed on
    *session*; nothing is committed here.
    """
    document_id = canonical.id

    # Remove the previous rows so the inserts below fully replace them.
    for delete_sql in (
        "DELETE FROM content_blocks WHERE document_id = :d",
        "DELETE FROM document_links WHERE document_id = :d",
    ):
        await session.execute(text(delete_sql), {"d": document_id})

    insert_block = text(
        "INSERT INTO content_blocks "
        "(document_id, ordinal, type, storage_uri, inline_ref, mime, "
        "sha256, meta) "
        "VALUES (:did, :ord, :ty, :su, :ir, :mime, :sha, :meta)",
    )
    for position, block in enumerate(canonical.content_blocks):
        uri = block.raw_input or None
        # Inline text travels in the JSON ``meta`` column under "text".
        inline_meta = {"text": block.data} if block.data is not None else {}
        await session.execute(
            insert_block,
            {
                "did": document_id,
                "ord": position,
                "ty": block.type,
                "su": uri,
                "ir": "canonical:inline" if block.data and not uri else None,
                "mime": block.mime,
                "sha": parse_blob_sha256(block.raw_input),
                "meta": json.dumps(inline_meta) if inline_meta else None,
            },
        )

    insert_link = text(
        "INSERT INTO document_links (document_id, url, ordinal) "
        "VALUES (:d, :u, :o)",
    )
    for position, link in enumerate(canonical.links):
        await session.execute(
            insert_link,
            {"d": document_id, "u": link, "o": position},
        )

    await session.execute(
        text("UPDATE documents SET summary = :s WHERE id = :id"),
        {"s": canonical.summary, "id": document_id},
    )
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from app.services.playwright_extract.extract import (
|
|
2
|
+
PlaywrightExtractResult,
|
|
3
|
+
assert_url_allowed,
|
|
4
|
+
extract_visible_text_sync,
|
|
5
|
+
host_matches_allowlist,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
# Public names of the package, all re-exported from the ``extract`` submodule.
__all__ = [
    "PlaywrightExtractResult",
    "assert_url_allowed",
    "extract_visible_text_sync",
    "host_matches_allowlist",
]
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI: extract visible text for a URL using a persistent Chrome/Chromium profile.
|
|
3
|
+
|
|
4
|
+
Example (Windows, after installing optional deps and browsers)::
|
|
5
|
+
|
|
6
|
+
uv run python -m app.services.playwright_extract ^
|
|
7
|
+
https://example.com/path ^
|
|
8
|
+
--profile "C:\\\\Users\\\\you\\\\pw-profile" ^
|
|
9
|
+
--allow-hosts example.com
|
|
10
|
+
|
|
11
|
+
See ``app.services.playwright_extract.extract`` module docstring for profile
|
|
12
|
+
locking and path guidance.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import sys
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the visible-text extraction CLI.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) makes argparse read ``sys.argv``.

    Returns:
        ``0`` on success, with the result printed as JSON on stdout. ``2``
        when the ``--config`` file cannot be read, is not valid JSON, is not
        a JSON object, or holds non-numeric limits. ``1`` when the extraction
        is rejected or cannot start (``ValueError`` / ``RuntimeError``).
    """
    p = argparse.ArgumentParser(
        description="Playwright visible-text extraction (persistent user data dir)",
    )
    p.add_argument("url", help="https? URL to open")
    p.add_argument(
        "--profile",
        required=True,
        type=Path,
        help="Persistent browser user data directory (dedicated folder recommended)",
    )
    p.add_argument(
        "--allow-hosts",
        required=True,
        help="Comma-separated allowlist (supports *.suffix.com patterns)",
    )
    p.add_argument(
        "--config",
        type=Path,
        help="Optional JSON file with keys: navigation_timeout_ms, max_response_chars, "
        "headless, browser_channel",
    )
    p.add_argument(
        "--headed",
        action="store_true",
        help="Run non-headless (overrides config headless)",
    )
    args = p.parse_args(argv)

    allowlisted = [h.strip() for h in args.allow_hosts.split(",") if h.strip()]
    navigation_timeout_ms = 30_000
    max_response_chars = 500_000
    headless = not args.headed
    browser_channel: str | None = None

    if args.config is not None:
        try:
            data = json.loads(args.config.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            print(f"could not load config file {args.config}: {e}", file=sys.stderr)
            return 2
        if not isinstance(data, dict):
            print("config file must be a JSON object", file=sys.stderr)
            return 2
        try:
            navigation_timeout_ms = int(
                data.get("navigation_timeout_ms", navigation_timeout_ms),
            )
            max_response_chars = int(
                data.get("max_response_chars", max_response_chars),
            )
        except (TypeError, ValueError) as e:
            print(f"config file has a non-numeric limit: {e}", file=sys.stderr)
            return 2
        # --headed always wins over the config file.
        headless = bool(data.get("headless", headless)) if not args.headed else False
        ch = data.get("browser_channel")
        browser_channel = str(ch) if ch else None

    # Imported late so argument and config errors are reported even when the
    # extraction module cannot be imported.
    from app.services.playwright_extract.extract import extract_visible_text_sync

    try:
        result = extract_visible_text_sync(
            args.url,
            str(args.profile.expanduser().resolve()),
            allowlisted_hosts=allowlisted,
            navigation_timeout_ms=navigation_timeout_ms,
            max_response_chars=max_response_chars,
            headless=headless,
            browser_channel=browser_channel,
        )
    except (ValueError, RuntimeError) as e:
        # Expected rejections (URL not allowlisted, playwright missing) get a
        # one-line message instead of a traceback; the exit status stays 1.
        print(f"extraction failed: {e}", file=sys.stderr)
        return 1

    out = {
        "visible_text": result.visible_text,
        "title": result.title,
        "final_url": result.final_url,
        "truncated": result.truncated,
        "meta": result.meta,
    }
    print(json.dumps(out, indent=2, ensure_ascii=False))
    return 0
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Playwright-based extraction of visible page text using a **persistent browser profile**.
|
|
3
|
+
|
|
4
|
+
Operational notes (especially **Windows**)
|
|
5
|
+
------------------------------------------
|
|
6
|
+
|
|
7
|
+
**Profile path**
|
|
8
|
+
|
|
9
|
+
- Use a **dedicated** directory for automation (e.g.
|
|
10
|
+
``C:\\Users\\you\\AppData\\Local\\PlaywrightIngestProfile``), not your daily
|
|
11
|
+
Chrome profile, to reduce lock conflicts and accidental data loss.
|
|
12
|
+
- Paths may be passed as normal Python strings; raw strings (``r"C:\\..."``) help
|
|
13
|
+
avoid escape issues in scripts.
|
|
14
|
+
- Prefer resolving with ``pathlib.Path`` and ``.expanduser()`` / ``.resolve()``.
|
|
15
|
+
|
|
16
|
+
**Profile locking ("SingletonLock", "profile in use")**
|
|
17
|
+
|
|
18
|
+
- The Chromium/Chrome user-data directory must not be open in another browser
|
|
19
|
+
instance. **Fully quit** regular Chrome (or use a copy of the profile) before
|
|
20
|
+
``launch_persistent_context`` runs.
|
|
21
|
+
- Only **one** Playwright persistent context should use a given directory at a
|
|
22
|
+
time; concurrent ingest requests with the same path will race and may fail.
|
|
23
|
+
- If you need parallel jobs, use **separate** ``user_data_dir`` roots per worker.
|
|
24
|
+
|
|
25
|
+
**Install**
|
|
26
|
+
|
|
27
|
+
- Install the optional dependency group (see ``pyproject.toml``), then run
|
|
28
|
+
``playwright install chromium`` (or the channel you select). Using
|
|
29
|
+
``channel="chrome"`` requires a local Google Chrome install.
|
|
30
|
+
|
|
31
|
+
This module performs **no** scraping guidance beyond driving a real browser
|
|
32
|
+
session; target sites' terms and robots rules still apply.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
import logging
|
|
38
|
+
from dataclasses import dataclass, field
|
|
39
|
+
from typing import Any
|
|
40
|
+
from urllib.parse import urlparse
|
|
41
|
+
|
|
42
|
+
logger = logging.getLogger(__name__)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(frozen=True, slots=True)
class PlaywrightExtractResult:
    """Immutable result of one visible-text extraction."""

    # Body text of the page, stripped, and cut to the caller's character limit.
    visible_text: str
    # Page title, or None when the page reports an empty title.
    title: str | None
    # URL the page reported after navigation.
    final_url: str
    # True when the text was longer than the limit and had to be cut.
    truncated: bool
    # Extra details about the run (limits used, profile directory, final URL).
    meta: dict[str, Any] = field(default_factory=dict)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _scheme_host(url: str) -> tuple[str, str]:
    """Split *url* into ``(scheme, lowercase host)``.

    Surrounding whitespace is ignored.

    Raises:
        ValueError: If the scheme is not ``http``/``https`` or the URL has no
            host.
    """
    parts = urlparse(url.strip())
    scheme = parts.scheme
    if scheme != "http" and scheme != "https":
        raise ValueError(f"URL scheme must be http or https, got {scheme!r}")

    hostname = parts.hostname
    host = hostname.lower() if hostname else ""
    if host == "":
        raise ValueError("URL must include a host")
    return scheme, host
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def host_matches_allowlist(hostname: str, allowlisted_hosts: list[str]) -> bool:
    """
    Report whether *hostname* is covered by any allowlist entry.

    Comparison is case-insensitive and ignores trailing dots on *hostname*.
    Blank entries are skipped.

    - ``example.com`` — matches that exact host only.
    - ``*.example.com`` — matches ``example.com`` itself and any subdomain
      (``sub.example.com``), but not ``evil-example.com``.
    """
    candidate = hostname.lower().rstrip(".")

    def _covers(entry: str) -> bool:
        pattern = entry.strip().lower()
        if not pattern:
            return False
        if not pattern.startswith("*."):
            return candidate == pattern
        suffix = pattern[2:].lstrip(".")
        # The leading dot keeps look-alike hosts such as "evil-example.com" out.
        return candidate == suffix or candidate.endswith("." + suffix)

    return any(_covers(entry) for entry in allowlisted_hosts)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def assert_url_allowed(url: str, allowlisted_hosts: list[str]) -> None:
    """Validate that *url* is an http(s) URL whose host is on the allowlist.

    Raises:
        ValueError: If the allowlist is empty, the URL is rejected by
            ``_scheme_host``, or the host matches no allowlist entry.
    """
    if not allowlisted_hosts:
        raise ValueError("allowlisted_hosts must be a non-empty list")

    host = _scheme_host(url)[1]
    if host_matches_allowlist(host, allowlisted_hosts):
        return
    raise ValueError(f"Host {host!r} is not in allowlisted_hosts")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def extract_visible_text_sync(
    url: str,
    user_data_dir: str,
    *,
    allowlisted_hosts: list[str],
    navigation_timeout_ms: int = 30_000,
    max_response_chars: int = 500_000,
    headless: bool = True,
    browser_channel: str | None = None,
) -> PlaywrightExtractResult:
    """
    Launch a persistent context, navigate to ``url``, return visible text + metadata.

    Runs synchronously (intended for ``asyncio.to_thread`` from async routes or
    direct use from the sync CLI). Requires the ``playwright`` package and
    installed browser binaries.

    Args:
        url: http(s) URL to open; its host must be on the allowlist.
        user_data_dir: Browser profile directory for the persistent context.
        allowlisted_hosts: Host patterns accepted by ``host_matches_allowlist``.
        navigation_timeout_ms: Timeout for the navigation; it also caps the
            body-text read, which never waits longer than 15 seconds.
        max_response_chars: Maximum number of characters of body text kept.
        headless: Whether the browser runs without a window.
        browser_channel: Optional browser channel passed to Playwright.

    Returns:
        The stripped (and possibly truncated) body text together with the
        page title, the final URL and run metadata.

    Raises:
        ValueError: If ``url`` is not allowed, or if the page ends up on a URL
            that is not allowed (for example after a redirect).
        RuntimeError: If the ``playwright`` package is not installed.
    """
    assert_url_allowed(url, allowlisted_hosts)

    try:
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        msg = (
            "playwright is not installed. Install optional deps, e.g. "
            "`uv sync --extra playwright`, then `playwright install chromium`."
        )
        raise RuntimeError(msg) from e

    meta: dict[str, Any] = {
        "navigation_timeout_ms": navigation_timeout_ms,
        "max_response_chars": max_response_chars,
        "user_data_dir": user_data_dir,
    }

    with sync_playwright() as pw:
        ctx_kwargs: dict[str, Any] = {
            "user_data_dir": user_data_dir,
            "headless": headless,
            "viewport": {"width": 1280, "height": 720},
        }
        if browser_channel:
            ctx_kwargs["channel"] = browser_channel

        context = pw.chromium.launch_persistent_context(**ctx_kwargs)
        try:
            # Reuse the first page of the context when one exists.
            page = context.pages[0] if context.pages else context.new_page()
            page.goto(
                url,
                wait_until="domcontentloaded",
                timeout=navigation_timeout_ms,
            )
            final_url = page.url

            # The allowlist was checked against the requested URL only. A
            # redirect can leave the page on a different host, so validate the
            # URL actually loaded before reading any of its content.
            try:
                assert_url_allowed(final_url, allowlisted_hosts)
            except ValueError as e:
                msg = (
                    f"Navigation to {url!r} ended on a URL outside the "
                    f"allowlist: {final_url!r}"
                )
                raise ValueError(msg) from e

            title = page.title() or None
            try:
                raw_text = page.inner_text(
                    "body",
                    timeout=min(15_000, navigation_timeout_ms),
                )
            except Exception:
                # Deliberate best effort: any failure of the locator-based
                # read falls back to reading innerText through page script.
                logger.debug(
                    "inner_text(body) failed; falling back to evaluate",
                    exc_info=True,
                )
                raw_text = (
                    page.evaluate(
                        "() => document.body ? document.body.innerText : ''",
                    )
                    or ""
                )

            truncated = False
            if len(raw_text) > max_response_chars:
                raw_text = raw_text[:max_response_chars]
                truncated = True
            meta["truncated"] = truncated
            meta["final_url"] = final_url
            return PlaywrightExtractResult(
                visible_text=raw_text.strip(),
                title=title,
                final_url=final_url,
                truncated=truncated,
                meta=meta,
            )
        finally:
            # Always release the profile directory, also on errors.
            context.close()
|