business-stack 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/.python-version +1 -0
  2. package/backend/.env.example +65 -0
  3. package/backend/alembic/env.py +63 -0
  4. package/backend/alembic/script.py.mako +26 -0
  5. package/backend/alembic/versions/2a9c8f1d0e7b_multimodal_kb_schema.py +279 -0
  6. package/backend/alembic/versions/3c1d2e4f5a6b_sqlite_vec_embeddings.py +58 -0
  7. package/backend/alembic/versions/4e8b0c2d1a3f_document_links.py +50 -0
  8. package/backend/alembic/versions/6a0b1c2d3e4f_link_expansion_dedupe_columns.py +49 -0
  9. package/backend/alembic/versions/7d8e9f0a1b2c_document_chunks.py +70 -0
  10. package/backend/alembic/versions/8f2a1c0d9e3b_initial_empty_revision.py +22 -0
  11. package/backend/alembic/versions/9f0a1b2c3d4e_entity_mentions_cooccurrence.py +123 -0
  12. package/backend/alembic/versions/b1c2d3e4f5a6_pipeline_dedupe_dlq.py +99 -0
  13. package/backend/alembic/versions/c2d3e4f5061a_chat_sessions_messages.py +59 -0
  14. package/backend/alembic.ini +42 -0
  15. package/backend/app/__init__.py +0 -0
  16. package/backend/app/config.py +337 -0
  17. package/backend/app/connectors/__init__.py +13 -0
  18. package/backend/app/connectors/base.py +39 -0
  19. package/backend/app/connectors/builtins.py +51 -0
  20. package/backend/app/connectors/playwright_session.py +146 -0
  21. package/backend/app/connectors/registry.py +68 -0
  22. package/backend/app/connectors/thread_expansion/__init__.py +33 -0
  23. package/backend/app/connectors/thread_expansion/fakes.py +154 -0
  24. package/backend/app/connectors/thread_expansion/models.py +113 -0
  25. package/backend/app/connectors/thread_expansion/reddit.py +53 -0
  26. package/backend/app/connectors/thread_expansion/twitter.py +49 -0
  27. package/backend/app/db.py +5 -0
  28. package/backend/app/dependencies.py +34 -0
  29. package/backend/app/logging_config.py +35 -0
  30. package/backend/app/main.py +97 -0
  31. package/backend/app/middleware/__init__.py +0 -0
  32. package/backend/app/middleware/gateway_identity.py +17 -0
  33. package/backend/app/middleware/openapi_gateway.py +71 -0
  34. package/backend/app/middleware/request_id.py +23 -0
  35. package/backend/app/openapi_config.py +126 -0
  36. package/backend/app/routers/__init__.py +0 -0
  37. package/backend/app/routers/admin_pipeline.py +123 -0
  38. package/backend/app/routers/chat.py +206 -0
  39. package/backend/app/routers/chunks.py +36 -0
  40. package/backend/app/routers/entity_extract.py +31 -0
  41. package/backend/app/routers/example.py +8 -0
  42. package/backend/app/routers/gemini_embed.py +58 -0
  43. package/backend/app/routers/health.py +28 -0
  44. package/backend/app/routers/ingestion.py +146 -0
  45. package/backend/app/routers/link_expansion.py +34 -0
  46. package/backend/app/routers/pipeline_status.py +304 -0
  47. package/backend/app/routers/query.py +63 -0
  48. package/backend/app/routers/vectors.py +63 -0
  49. package/backend/app/schemas/__init__.py +0 -0
  50. package/backend/app/schemas/canonical.py +44 -0
  51. package/backend/app/schemas/chat.py +50 -0
  52. package/backend/app/schemas/ingest.py +29 -0
  53. package/backend/app/schemas/query.py +153 -0
  54. package/backend/app/schemas/vectors.py +56 -0
  55. package/backend/app/services/__init__.py +0 -0
  56. package/backend/app/services/chat_store.py +152 -0
  57. package/backend/app/services/chunking/__init__.py +3 -0
  58. package/backend/app/services/chunking/llm_boundaries.py +63 -0
  59. package/backend/app/services/chunking/schemas.py +30 -0
  60. package/backend/app/services/chunking/semantic_chunk.py +178 -0
  61. package/backend/app/services/chunking/splitters.py +214 -0
  62. package/backend/app/services/embeddings/__init__.py +20 -0
  63. package/backend/app/services/embeddings/build_inputs.py +140 -0
  64. package/backend/app/services/embeddings/dlq.py +128 -0
  65. package/backend/app/services/embeddings/gemini_api.py +207 -0
  66. package/backend/app/services/embeddings/persist.py +74 -0
  67. package/backend/app/services/embeddings/types.py +32 -0
  68. package/backend/app/services/embeddings/worker.py +224 -0
  69. package/backend/app/services/entities/__init__.py +12 -0
  70. package/backend/app/services/entities/gliner_extract.py +63 -0
  71. package/backend/app/services/entities/llm_extract.py +94 -0
  72. package/backend/app/services/entities/pipeline.py +179 -0
  73. package/backend/app/services/entities/spacy_extract.py +63 -0
  74. package/backend/app/services/entities/types.py +15 -0
  75. package/backend/app/services/gemini_chat.py +113 -0
  76. package/backend/app/services/hooks/__init__.py +3 -0
  77. package/backend/app/services/hooks/post_ingest.py +186 -0
  78. package/backend/app/services/ingestion/__init__.py +0 -0
  79. package/backend/app/services/ingestion/persist.py +188 -0
  80. package/backend/app/services/integrations_remote.py +91 -0
  81. package/backend/app/services/link_expansion/__init__.py +3 -0
  82. package/backend/app/services/link_expansion/canonical_url.py +45 -0
  83. package/backend/app/services/link_expansion/domain_policy.py +26 -0
  84. package/backend/app/services/link_expansion/html_extract.py +72 -0
  85. package/backend/app/services/link_expansion/rate_limit.py +32 -0
  86. package/backend/app/services/link_expansion/robots.py +46 -0
  87. package/backend/app/services/link_expansion/schemas.py +67 -0
  88. package/backend/app/services/link_expansion/worker.py +458 -0
  89. package/backend/app/services/normalization/__init__.py +7 -0
  90. package/backend/app/services/normalization/normalizer.py +331 -0
  91. package/backend/app/services/normalization/persist_normalized.py +67 -0
  92. package/backend/app/services/playwright_extract/__init__.py +13 -0
  93. package/backend/app/services/playwright_extract/__main__.py +96 -0
  94. package/backend/app/services/playwright_extract/extract.py +181 -0
  95. package/backend/app/services/retrieval_service.py +351 -0
  96. package/backend/app/sqlite_ext.py +36 -0
  97. package/backend/app/storage/__init__.py +3 -0
  98. package/backend/app/storage/blobs.py +30 -0
  99. package/backend/app/vectorstore/__init__.py +13 -0
  100. package/backend/app/vectorstore/sqlite_vec_store.py +242 -0
  101. package/backend/backend.egg-info/PKG-INFO +18 -0
  102. package/backend/backend.egg-info/SOURCES.txt +93 -0
  103. package/backend/backend.egg-info/dependency_links.txt +1 -0
  104. package/backend/backend.egg-info/entry_points.txt +2 -0
  105. package/backend/backend.egg-info/requires.txt +15 -0
  106. package/backend/backend.egg-info/top_level.txt +4 -0
  107. package/backend/package.json +15 -0
  108. package/backend/pyproject.toml +52 -0
  109. package/backend/tests/conftest.py +40 -0
  110. package/backend/tests/test_chat.py +92 -0
  111. package/backend/tests/test_chunking.py +132 -0
  112. package/backend/tests/test_entities.py +170 -0
  113. package/backend/tests/test_gemini_embed.py +224 -0
  114. package/backend/tests/test_health.py +24 -0
  115. package/backend/tests/test_ingest_raw.py +123 -0
  116. package/backend/tests/test_link_expansion.py +241 -0
  117. package/backend/tests/test_main.py +12 -0
  118. package/backend/tests/test_normalizer.py +114 -0
  119. package/backend/tests/test_openapi_gateway.py +40 -0
  120. package/backend/tests/test_pipeline_hardening.py +285 -0
  121. package/backend/tests/test_pipeline_status.py +71 -0
  122. package/backend/tests/test_playwright_extract.py +80 -0
  123. package/backend/tests/test_post_ingest_hooks.py +162 -0
  124. package/backend/tests/test_query.py +165 -0
  125. package/backend/tests/test_thread_expansion.py +72 -0
  126. package/backend/tests/test_vectors.py +85 -0
  127. package/backend/uv.lock +1839 -0
  128. package/bin/business-stack.cjs +412 -0
  129. package/frontend/web/.env.example +23 -0
  130. package/frontend/web/AGENTS.md +5 -0
  131. package/frontend/web/CLAUDE.md +1 -0
  132. package/frontend/web/README.md +36 -0
  133. package/frontend/web/components.json +25 -0
  134. package/frontend/web/next-env.d.ts +6 -0
  135. package/frontend/web/next.config.ts +30 -0
  136. package/frontend/web/package.json +65 -0
  137. package/frontend/web/postcss.config.mjs +7 -0
  138. package/frontend/web/skills-lock.json +35 -0
  139. package/frontend/web/src/app/account/[[...path]]/page.tsx +19 -0
  140. package/frontend/web/src/app/auth/[[...path]]/page.tsx +14 -0
  141. package/frontend/web/src/app/chat/page.tsx +725 -0
  142. package/frontend/web/src/app/favicon.ico +0 -0
  143. package/frontend/web/src/app/globals.css +563 -0
  144. package/frontend/web/src/app/layout.tsx +50 -0
  145. package/frontend/web/src/app/page.tsx +96 -0
  146. package/frontend/web/src/app/settings/integrations/actions.ts +74 -0
  147. package/frontend/web/src/app/settings/integrations/integrations-settings-form.tsx +330 -0
  148. package/frontend/web/src/app/settings/integrations/page.tsx +41 -0
  149. package/frontend/web/src/app/webhooks/alpha-alerts/route.ts +84 -0
  150. package/frontend/web/src/components/home-auth-panel.tsx +49 -0
  151. package/frontend/web/src/components/providers.tsx +50 -0
  152. package/frontend/web/src/lib/alpha-webhook/connectors/registry.ts +35 -0
  153. package/frontend/web/src/lib/alpha-webhook/connectors/types.ts +8 -0
  154. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.test.ts +40 -0
  155. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge-delivery.ts +78 -0
  156. package/frontend/web/src/lib/alpha-webhook/connectors/wabridge.ts +30 -0
  157. package/frontend/web/src/lib/alpha-webhook/handler.ts +12 -0
  158. package/frontend/web/src/lib/alpha-webhook/signature.test.ts +33 -0
  159. package/frontend/web/src/lib/alpha-webhook/signature.ts +21 -0
  160. package/frontend/web/src/lib/alpha-webhook/types.ts +23 -0
  161. package/frontend/web/src/lib/auth-client.ts +23 -0
  162. package/frontend/web/src/lib/integrations-config.ts +125 -0
  163. package/frontend/web/src/lib/ui-utills.tsx +90 -0
  164. package/frontend/web/src/lib/utils.ts +6 -0
  165. package/frontend/web/tsconfig.json +36 -0
  166. package/frontend/web/tsconfig.tsbuildinfo +1 -0
  167. package/frontend/web/vitest.config.ts +14 -0
  168. package/gateway/.env.example +23 -0
  169. package/gateway/README.md +13 -0
  170. package/gateway/package.json +24 -0
  171. package/gateway/src/auth.ts +49 -0
  172. package/gateway/src/index.ts +141 -0
  173. package/gateway/src/integrations/admin.ts +19 -0
  174. package/gateway/src/integrations/crypto.ts +52 -0
  175. package/gateway/src/integrations/handlers.ts +124 -0
  176. package/gateway/src/integrations/keys.ts +12 -0
  177. package/gateway/src/integrations/store.ts +106 -0
  178. package/gateway/src/stack-secrets.ts +35 -0
  179. package/gateway/tsconfig.json +13 -0
  180. package/package.json +33 -0
  181. package/turbo.json +27 -0
@@ -0,0 +1,331 @@
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import binascii
5
+ import json
6
+ import re
7
+ from dataclasses import dataclass, field
8
+ from typing import Any
9
+
10
+ from app.schemas.canonical import BlockType, CanonicalContentBlock, CanonicalDocument
11
+ from app.schemas.ingest import RawIngestEnvelope
12
+ from app.storage.blobs import BlobStore
13
+
14
# Matches http/https URLs (scheme is case-insensitive). A match runs until the
# first whitespace character or one of the excluded delimiter characters.
URL_PATTERN = re.compile(r"https?://[^\s<>\"'{}|\\^`\[\]]+", re.IGNORECASE)

# Payload keys inspected, in this order, for text content.
MULTIMODAL_TEXT_KEYS: tuple[str, ...] = (
    "text", "body", "caption", "tweet", "message", "title",
)

# Payload keys inspected, in this order, for image content.
MULTIMODAL_IMAGE_KEYS: tuple[str, ...] = (
    "image", "image_base64", "thumbnail", "photo",
)
33
+
34
+
35
@dataclass
class _PendingBlock:
    """A content block staged during normalization, before blobs are written.

    Text blocks carry ``text``; media blocks carry the raw ``media`` bytes and
    the MIME type to record for them.
    """

    # Kind of block ("text", "image", ...).
    block_type: BlockType
    # Inline text content, when the block is textual.
    text: str | None = None
    # Raw media bytes, when the block is binary.
    media: bytes | None = None
    # MIME type recorded alongside ``media``.
    media_mime: str | None = None
41
+
42
+
43
@dataclass
class _Draft:
    """Accumulator for the blocks and text segments found in one envelope."""

    # Blocks in the order they were discovered.
    pending: list[_PendingBlock] = field(default_factory=list)
    # Text segments that are later scanned for URLs.
    text_segments_for_urls: list[str] = field(default_factory=list)
47
+
48
+
49
def extract_urls_from_text(text: str | None) -> list[str]:
    """Return the http(s) URLs found in ``text``, in order of appearance.

    ``None`` or an empty string yields an empty list. Sentence punctuation
    directly after a URL (``.,;:!?``) is removed, so that prose such as
    ``"see https://example.com/a."`` yields ``https://example.com/a`` rather
    than a link with a stray trailing dot.
    """
    urls: list[str] = []
    for candidate in URL_PATTERN.findall(text or ""):
        # The pattern is greedy and cannot tell a trailing full stop or comma
        # from part of the URL; trim it here.
        trimmed = candidate.rstrip(".,;:!?")
        if trimmed:
            urls.append(trimmed)
    return urls
51
+
52
+
53
def _extract_metadata_urls(metadata: dict[str, Any]) -> list[str]:
    """Collect URLs declared in metadata under ``url`` and ``urls``.

    The single ``url`` value comes first, followed by the entries of ``urls``
    when that value is a list. Only non-blank strings are kept, stripped of
    surrounding whitespace.
    """
    candidates: list[Any] = [metadata.get("url")]
    listed = metadata.get("urls")
    if isinstance(listed, list):
        candidates.extend(listed)
    return [c.strip() for c in candidates if isinstance(c, str) and c.strip()]
62
+
63
+
64
def dedupe_urls(urls: list[str]) -> list[str]:
    """Return ``urls`` without duplicates, keeping first-occurrence order."""
    # dict keys are unique and preserve insertion order.
    return list(dict.fromkeys(urls))
72
+
73
+
74
def _decode_base64_image(data: str) -> tuple[bytes | None, str | None]:
    """Decode a base64 string, optionally wrapped in a ``data:`` URI.

    Accepts plain base64 as well as ``data:<mime>;base64,<payload>``.
    Surrounding whitespace is ignored.

    Returns ``(raw_bytes, mime)``. ``mime`` is the media type named in the
    data URI when there is one, otherwise ``"application/octet-stream"``.
    Returns ``(None, None)`` when the input is not valid base64 or is a data
    URI that is not base64-encoded.
    """
    payload = data.strip()
    mime = "application/octet-stream"

    if payload[:5].lower() == "data:":
        header, comma, body = payload[5:].partition(",")
        params = [part.strip().lower() for part in header.split(";")]
        # Only base64-encoded data URIs can be decoded here.
        if not comma or "base64" not in params:
            return None, None
        media_type = params[0]
        if media_type and media_type != "base64":
            mime = media_type
        payload = body.strip()

    try:
        # validate=True rejects characters outside the base64 alphabet instead
        # of silently discarding them.
        raw = base64.b64decode(payload, validate=True)
    except (binascii.Error, ValueError):
        return None, None
    return raw, mime
80
+
81
+
82
def _tags_from_metadata(metadata: dict[str, Any]) -> list[str]:
    """Return ``metadata["tags"]`` as strings, or ``[]`` if it is not a list.

    ``None`` entries are skipped; every other entry is converted with ``str``.
    """
    raw_tags = metadata.get("tags")
    if isinstance(raw_tags, list):
        return [str(tag) for tag in raw_tags if tag is not None]
    return []
87
+
88
+
89
def _draft_from_plain_text(text: str) -> _Draft:
    """Build a draft holding one text block for ``text``.

    The text is stripped first; a blank result produces an empty draft. The
    stripped text is also registered as a segment to scan for URLs.
    """
    body = text.strip()
    if not body:
        return _Draft()
    return _Draft(
        pending=[_PendingBlock(block_type="text", text=body)],
        text_segments_for_urls=[body],
    )
96
+
97
+
98
def _append_image_from_value(
    d: _Draft,
    value: Any,
    default_mime: str | None,
) -> None:
    """Append an image block to ``d`` for a bytes-like or base64 string value.

    - bytes / bytearray: stored as given; the MIME type falls back to
      ``application/octet-stream`` when ``default_mime`` is falsy.
    - str: decoded as base64; the value is ignored if decoding fails or
      yields no bytes; the MIME type falls back to ``image/png``.
    - anything else, including ``None``: ignored.
    """
    if isinstance(value, (bytes, bytearray)):
        media = bytes(value)
        mime = default_mime or "application/octet-stream"
    elif isinstance(value, str):
        # The MIME type reported by the decoder is not used here.
        media, _decoder_mime = _decode_base64_image(value)
        if not media:
            return
        mime = default_mime or "image/png"
    else:
        return

    d.pending.append(
        _PendingBlock(block_type="image", media=media, media_mime=mime),
    )
124
+
125
+
126
def _append_image_entry(d: _Draft, entry: Any) -> None:
    """Stage one image entry: base64 text, raw bytes, or a descriptor dict.

    A descriptor dict supplies the base64 text under ``base64`` or ``data``
    and may name the MIME type under ``mime`` or ``content_type``. When no
    MIME type is given, ``image/png`` is used.
    """
    if isinstance(entry, dict):
        b64 = entry.get("base64") or entry.get("data")
        mime = entry.get("mime") or entry.get("content_type")
        if isinstance(b64, str):
            _append_image_from_value(d, b64, str(mime) if mime else "image/png")
        return
    _append_image_from_value(d, entry, "image/png")


def _draft_from_dict_payload(payload: dict[str, Any]) -> _Draft:
    """Build a draft from a dict payload that may mix text and images.

    Text is read from each key in ``MULTIMODAL_TEXT_KEYS``, in that order.
    A value may be a string or a list of strings; blank and non-string
    entries are skipped. Every kept text is also registered for URL scanning.

    Images are read from each key in ``MULTIMODAL_IMAGE_KEYS`` and then from
    ``images`` (only when that value is a list). Every image entry, whether a
    single value or a list item, is handled the same way by
    ``_append_image_entry``. Descriptor dicts inside a list and raw bytes
    inside ``images`` are therefore no longer silently skipped.
    """
    d = _Draft()

    for key in MULTIMODAL_TEXT_KEYS:
        val = payload.get(key)
        # Treat a single value as a one-item list so both shapes share a path.
        for item in val if isinstance(val, list) else [val]:
            if isinstance(item, str) and item.strip():
                t = item.strip()
                d.pending.append(_PendingBlock(block_type="text", text=t))
                d.text_segments_for_urls.append(t)

    for key in MULTIMODAL_IMAGE_KEYS:
        if key not in payload:
            continue
        val = payload[key]
        for entry in val if isinstance(val, list) else [val]:
            _append_image_entry(d, entry)

    images = payload.get("images")
    if isinstance(images, list):
        for entry in images:
            _append_image_entry(d, entry)

    return d
178
+
179
+
180
def _build_draft(envelope: RawIngestEnvelope) -> _Draft:
    """Turn an envelope's payload into a draft, based on its content type.

    - ``image``: a base64 string, a dict with ``base64``/``data`` (optional
      ``mime``), or raw bytes.
    - ``audio`` / ``video`` / ``document``: a dict with base64 under
      ``base64``/``data`` and/or raw bytes under ``bytes``, or raw bytes
      directly. A dict payload may produce two blocks if it has both.
    - anything else (including ``text`` and ``multimodal``): dict payloads go
      through the dict path, strings become plain text, and any other value
      is JSON-serialized and treated as plain text.
    """
    payload = envelope.payload
    kind = envelope.content_type

    if kind == "image":
        draft = _Draft()
        if isinstance(payload, str):
            _append_image_from_value(draft, payload, "image/png")
        elif isinstance(payload, dict):
            encoded = payload.get("base64") or payload.get("data")
            declared_mime = payload.get("mime") or "image/png"
            if isinstance(encoded, str):
                _append_image_from_value(draft, encoded, str(declared_mime))
        elif isinstance(payload, (bytes, bytearray)):
            draft.pending.append(
                _PendingBlock(
                    block_type="image",
                    media=bytes(payload),
                    media_mime="application/octet-stream",
                ),
            )
        return draft

    if kind in ("audio", "video", "document"):
        draft = _Draft()
        if isinstance(payload, dict):
            encoded = payload.get("base64") or payload.get("data")
            # Per-kind default used when the payload names no MIME type.
            fallback_mime = {"audio": "audio/mpeg", "video": "video/mp4"}.get(
                kind,
                "application/octet-stream",
            )
            mime = str(payload.get("mime") or fallback_mime)
            if isinstance(encoded, str):
                try:
                    decoded = base64.b64decode(encoded, validate=True)
                except (binascii.Error, ValueError):
                    decoded = None
                if decoded:
                    draft.pending.append(
                        _PendingBlock(block_type=kind, media=decoded, media_mime=mime),
                    )
            raw_bytes = payload.get("bytes")
            if isinstance(raw_bytes, (bytes, bytearray)):
                draft.pending.append(
                    _PendingBlock(
                        block_type=kind,
                        media=bytes(raw_bytes),
                        media_mime=mime,
                    ),
                )
        elif isinstance(payload, (bytes, bytearray)):
            draft.pending.append(
                _PendingBlock(
                    block_type=kind,
                    media=bytes(payload),
                    media_mime="application/octet-stream",
                ),
            )
        return draft

    # "text", "multimodal" and unrecognized content types all dispatch purely
    # on the payload's shape.
    if isinstance(payload, dict):
        return _draft_from_dict_payload(payload)
    if isinstance(payload, str):
        return _draft_from_plain_text(payload)
    return _draft_from_plain_text(json.dumps(payload, default=str))
261
+
262
+
263
def _collect_links(envelope: RawIngestEnvelope, draft: _Draft) -> list[str]:
    """Gather the document's links: URLs in the text first, then metadata URLs.

    Duplicates are removed while keeping the first occurrence.
    """
    from_text = [
        url
        for segment in draft.text_segments_for_urls
        for url in extract_urls_from_text(segment)
    ]
    from_metadata = _extract_metadata_urls(envelope.metadata)
    return dedupe_urls(from_text + from_metadata)
269
+
270
+
271
def _summary_from_pending(pending: list[_PendingBlock]) -> str:
    """Join the non-empty block texts into a summary of at most 500 characters.

    Texts are joined with single spaces and the result is stripped. Longer
    results are cut to 499 characters followed by an ellipsis character.
    """
    combined = " ".join(item.text for item in pending if item.text).strip()
    return combined if len(combined) <= 500 else f"{combined[:499]}…"
277
+
278
+
279
def normalize_envelope_to_canonical(
    *,
    document_id: str,
    envelope: RawIngestEnvelope,
    blob_store: BlobStore,
) -> CanonicalDocument:
    """Map raw ingest envelope to canonical document; write media to blob store.

    Text blocks are kept inline (``data``) with MIME ``text/plain``. Blocks
    with non-empty media are written through ``blob_store.write`` and refer
    to the returned URI via ``raw_input``. Blocks with neither are dropped.
    """
    draft = _build_draft(envelope)
    links = _collect_links(envelope, draft)
    tags = _tags_from_metadata(envelope.metadata)

    blocks: list[CanonicalContentBlock] = []
    for pending in draft.pending:
        if pending.block_type == "text" and pending.text is not None:
            blocks.append(
                CanonicalContentBlock(
                    type="text",
                    data=pending.text,
                    raw_input=None,
                    mime="text/plain",
                ),
            )
            continue
        if not pending.media:
            # Nothing to store: no media bytes, or an empty buffer.
            continue
        # Only the URI from the write result is used.
        _digest, uri = blob_store.write(pending.media)
        blocks.append(
            CanonicalContentBlock(
                type=pending.block_type,
                data=None,
                raw_input=uri,
                mime=pending.media_mime,
            ),
        )

    return CanonicalDocument(
        id=document_id,
        source=envelope.source,
        timestamp=envelope.timestamp,
        content_blocks=blocks,
        raw_content=envelope.payload,
        entities=[],
        links=links,
        tags=tags,
        summary=_summary_from_pending(draft.pending),
    )
326
+
327
+
328
def parse_blob_sha256(raw_input: str | None) -> str | None:
    """Return the part after ``blob://`` in ``raw_input``.

    Returns ``None`` when ``raw_input`` is empty, ``None``, or does not start
    with that scheme.
    """
    scheme = "blob://"
    if not raw_input or not raw_input.startswith(scheme):
        return None
    return raw_input[len(scheme):]
@@ -0,0 +1,67 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ from sqlalchemy import text
6
+ from sqlalchemy.ext.asyncio import AsyncSession
7
+
8
+ from app.schemas.canonical import CanonicalDocument
9
+ from app.services.normalization.normalizer import parse_blob_sha256
10
+
11
+
12
async def persist_normalized_document(
    session: AsyncSession,
    *,
    canonical: CanonicalDocument,
) -> None:
    """Replace content_blocks and document_links; set documents.summary.

    Existing rows for the document are deleted first. Then one row is
    inserted per content block and per link, using the list position as
    ``ordinal``. Finally the summary is written to the ``documents`` row.
    Only ``session.execute`` is called; no commit is issued here.
    """
    doc_id = canonical.id

    delete_blocks = text("DELETE FROM content_blocks WHERE document_id = :d")
    delete_links = text("DELETE FROM document_links WHERE document_id = :d")
    await session.execute(delete_blocks, {"d": doc_id})
    await session.execute(delete_links, {"d": doc_id})

    insert_block = text(
        "INSERT INTO content_blocks "
        "(document_id, ordinal, type, storage_uri, inline_ref, mime, "
        "sha256, meta) "
        "VALUES (:did, :ord, :ty, :su, :ir, :mime, :sha, :meta)",
    )
    for position, block in enumerate(canonical.content_blocks):
        storage_uri = block.raw_input or None
        # Inline text is kept in the JSON ``meta`` column under "text".
        meta_json = None if block.data is None else json.dumps({"text": block.data})
        # Mark the block as inline only when it has data and no storage URI.
        inline_ref = "canonical:inline" if block.data and storage_uri is None else None
        row = {
            "did": doc_id,
            "ord": position,
            "ty": block.type,
            "su": storage_uri,
            "ir": inline_ref,
            "mime": block.mime,
            "sha": parse_blob_sha256(block.raw_input),
            "meta": meta_json,
        }
        await session.execute(insert_block, row)

    insert_link = text(
        "INSERT INTO document_links (document_id, url, ordinal) "
        "VALUES (:d, :u, :o)",
    )
    for position, url in enumerate(canonical.links):
        await session.execute(insert_link, {"d": doc_id, "u": url, "o": position})

    await session.execute(
        text("UPDATE documents SET summary = :s WHERE id = :id"),
        {"s": canonical.summary, "id": doc_id},
    )
@@ -0,0 +1,13 @@
1
+ from app.services.playwright_extract.extract import (
2
+ PlaywrightExtractResult,
3
+ assert_url_allowed,
4
+ extract_visible_text_sync,
5
+ host_matches_allowlist,
6
+ )
7
+
8
+ __all__ = [
9
+ "PlaywrightExtractResult",
10
+ "assert_url_allowed",
11
+ "extract_visible_text_sync",
12
+ "host_matches_allowlist",
13
+ ]
@@ -0,0 +1,96 @@
1
+ """
2
+ CLI: extract visible text for a URL using a persistent Chrome/Chromium profile.
3
+
4
+ Example (Windows, after installing optional deps and browsers)::
5
+
6
+ uv run python -m app.services.playwright_extract ^
7
+ https://example.com/path ^
8
+ --profile "C:\\Users\\you\\pw-profile" ^
9
+ --allow-hosts example.com
10
+
11
+ See ``app.services.playwright_extract.extract`` module docstring for profile
12
+ locking and path guidance.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import json
19
+ import sys
20
+ from pathlib import Path
21
+
22
+
23
def main() -> int:
    """Run the extraction CLI and print the result as JSON on stdout.

    Returns the process exit status:

    - ``0``: extraction succeeded.
    - ``1``: extraction was refused or could not start (for example the host
      is not allowlisted, or playwright is not installed).
    - ``2``: the ``--config`` file is unreadable, is not valid JSON, is not a
      JSON object, or holds non-integer timeout/size values.

    Errors are reported on stderr as a single line instead of a traceback.
    """
    p = argparse.ArgumentParser(
        description="Playwright visible-text extraction (persistent user data dir)",
    )
    p.add_argument("url", help="https? URL to open")
    p.add_argument(
        "--profile",
        required=True,
        type=Path,
        help="Persistent browser user data directory (dedicated folder recommended)",
    )
    p.add_argument(
        "--allow-hosts",
        required=True,
        help="Comma-separated allowlist (supports *.suffix.com patterns)",
    )
    p.add_argument(
        "--config",
        type=Path,
        help="Optional JSON file with keys: navigation_timeout_ms, max_response_chars, "
        "headless, browser_channel",
    )
    p.add_argument(
        "--headed",
        action="store_true",
        help="Run non-headless (overrides config headless)",
    )
    args = p.parse_args()

    allowlisted = [h.strip() for h in args.allow_hosts.split(",") if h.strip()]
    navigation_timeout_ms = 30_000
    max_response_chars = 500_000
    headless = not args.headed
    browser_channel: str | None = None

    if args.config is not None:
        try:
            data = json.loads(args.config.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both JSON syntax errors and bad UTF-8.
            print(f"could not read config file {args.config}: {exc}", file=sys.stderr)
            return 2
        if not isinstance(data, dict):
            print("config file must be a JSON object", file=sys.stderr)
            return 2
        try:
            navigation_timeout_ms = int(
                data.get("navigation_timeout_ms", navigation_timeout_ms),
            )
            max_response_chars = int(
                data.get("max_response_chars", max_response_chars),
            )
        except (TypeError, ValueError) as exc:
            print(
                "config navigation_timeout_ms/max_response_chars must be "
                f"integers: {exc}",
                file=sys.stderr,
            )
            return 2
        # --headed always wins over the config file.
        headless = bool(data.get("headless", headless)) if not args.headed else False
        ch = data.get("browser_channel")
        browser_channel = str(ch) if ch else None

    # Imported lazily so argument and config errors are reported without
    # loading the extraction module.
    from app.services.playwright_extract.extract import extract_visible_text_sync

    try:
        result = extract_visible_text_sync(
            args.url,
            str(args.profile.expanduser().resolve()),
            allowlisted_hosts=allowlisted,
            navigation_timeout_ms=navigation_timeout_ms,
            max_response_chars=max_response_chars,
            headless=headless,
            browser_channel=browser_channel,
        )
    except (ValueError, RuntimeError) as exc:
        print(f"extraction failed: {exc}", file=sys.stderr)
        return 1

    out = {
        "visible_text": result.visible_text,
        "title": result.title,
        "final_url": result.final_url,
        "truncated": result.truncated,
        "meta": result.meta,
    }
    print(json.dumps(out, indent=2, ensure_ascii=False))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,181 @@
1
+ """
2
+ Playwright-based extraction of visible page text using a **persistent browser profile**.
3
+
4
+ Operational notes (especially **Windows**)
5
+ ------------------------------------------
6
+
7
+ **Profile path**
8
+
9
+ - Use a **dedicated** directory for automation (e.g.
10
+ ``C:\\Users\\you\\AppData\\Local\\PlaywrightIngestProfile``), not your daily
11
+ Chrome profile, to reduce lock conflicts and accidental data loss.
12
+ - Paths may be passed as normal Python strings; raw strings (``r"C:\\..."``) help
13
+ avoid escape issues in scripts.
14
+ - Prefer resolving with ``pathlib.Path`` and ``.expanduser()`` / ``.resolve()``.
15
+
16
+ **Profile locking ("SingletonLock", "profile in use")**
17
+
18
+ - The Chromium/Chrome user-data directory must not be open in another browser
19
+ instance. **Fully quit** regular Chrome (or use a copy of the profile) before
20
+ ``launch_persistent_context`` runs.
21
+ - Only **one** Playwright persistent context should use a given directory at a
22
+ time; concurrent ingest requests with the same path will race and may fail.
23
+ - If you need parallel jobs, use **separate** ``user_data_dir`` roots per worker.
24
+
25
+ **Install**
26
+
27
+ - Install the optional dependency group (see ``pyproject.toml``), then run
28
+ ``playwright install chromium`` (or the channel you select). Using
29
+ ``channel="chrome"`` requires a local Google Chrome install.
30
+
31
+ This module performs **no** scraping guidance beyond driving a real browser
32
+ session; target sites' terms and robots rules still apply.
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ import logging
38
+ from dataclasses import dataclass, field
39
+ from typing import Any
40
+ from urllib.parse import urlparse
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
+ @dataclass(frozen=True, slots=True)
46
+ class PlaywrightExtractResult:
47
+ visible_text: str
48
+ title: str | None
49
+ final_url: str
50
+ truncated: bool
51
+ meta: dict[str, Any] = field(default_factory=dict)
52
+
53
+
54
def _scheme_host(url: str) -> tuple[str, str]:
    """Split ``url`` into ``(scheme, lowercased_host)``.

    Surrounding whitespace in ``url`` is ignored.

    Raises:
        ValueError: if the scheme is not http/https or the URL has no host.
    """
    parts = urlparse(url.strip())
    scheme = parts.scheme
    if scheme != "http" and scheme != "https":
        raise ValueError(f"URL scheme must be http or https, got {scheme!r}")

    hostname = parts.hostname
    host = hostname.lower() if hostname else ""
    if not host:
        raise ValueError("URL must include a host")
    return scheme, host
64
+
65
+
66
def host_matches_allowlist(hostname: str, allowlisted_hosts: list[str]) -> bool:
    """
    Match hostname against entries.

    - ``example.com`` — exact host match (case-insensitive).
    - ``*.example.com`` — matches ``example.com`` itself and any subdomain
      (``sub.example.com``), but not ``evil-example.com``.

    Case, surrounding whitespace, and a trailing dot (fully-qualified form)
    are ignored on both the hostname and the entries, so ``example.com.`` in
    the allowlist matches ``example.com``. Blank entries and wildcards with
    an empty root (``*.``) never match.
    """
    host = hostname.lower().rstrip(".")
    for raw in allowlisted_hosts:
        # Normalize entries the same way as the hostname.
        pattern = raw.strip().lower().rstrip(".")
        if not pattern:
            continue
        if pattern.startswith("*."):
            root = pattern[2:].strip(".")
            # The leading "." in the suffix test stops "evil-example.com"
            # from matching "*.example.com".
            if root and (host == root or host.endswith("." + root)):
                return True
        elif host == pattern:
            return True
    return False
85
+
86
+
87
def assert_url_allowed(url: str, allowlisted_hosts: list[str]) -> None:
    """Validate that ``url`` is http(s) and its host is allowlisted.

    Raises:
        ValueError: if the allowlist is empty, the URL is not a valid
            http/https URL with a host, or the host matches no entry.
    """
    if not allowlisted_hosts:
        raise ValueError("allowlisted_hosts must be a non-empty list")

    host = _scheme_host(url)[1]
    if host_matches_allowlist(host, allowlisted_hosts):
        return
    raise ValueError(f"Host {host!r} is not in allowlisted_hosts")
96
+
97
def extract_visible_text_sync(
    url: str,
    user_data_dir: str,
    *,
    allowlisted_hosts: list[str],
    navigation_timeout_ms: int = 30_000,
    max_response_chars: int = 500_000,
    headless: bool = True,
    browser_channel: str | None = None,
) -> PlaywrightExtractResult:
    """
    Launch a persistent context, navigate to ``url``, return visible text + metadata.

    Runs synchronously (intended for ``asyncio.to_thread`` from async routes or
    direct use from the sync CLI). Requires the ``playwright`` package and
    installed browser binaries.

    The text is cut to ``max_response_chars`` before it is stripped. The
    browser context is always closed, even when navigation fails.

    Raises:
        ValueError: if ``url`` is not allowed (see ``assert_url_allowed``).
        RuntimeError: if the ``playwright`` package cannot be imported.
    """
    # NOTE(review): the allowlist is checked against the requested URL only;
    # ``final_url`` after redirects is not re-checked in this function.
    assert_url_allowed(url, allowlisted_hosts)

    try:
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        raise RuntimeError(
            "playwright is not installed. Install optional deps, e.g. "
            "`uv sync --extra playwright`, then `playwright install chromium`."
        ) from e

    meta: dict[str, Any] = {
        "navigation_timeout_ms": navigation_timeout_ms,
        "max_response_chars": max_response_chars,
        "user_data_dir": user_data_dir,
    }

    launch_options: dict[str, Any] = {
        "user_data_dir": user_data_dir,
        "headless": headless,
        "viewport": {"width": 1280, "height": 720},
    }
    if browser_channel:
        launch_options["channel"] = browser_channel

    with sync_playwright() as pw:
        context = pw.chromium.launch_persistent_context(**launch_options)
        try:
            open_pages = context.pages
            # Reuse the first already-open page when there is one.
            page = open_pages[0] if open_pages else context.new_page()
            page.goto(
                url,
                wait_until="domcontentloaded",
                timeout=navigation_timeout_ms,
            )
            final_url = page.url
            title = page.title() or None

            try:
                raw_text = page.inner_text(
                    "body",
                    timeout=min(15_000, navigation_timeout_ms),
                )
            except Exception:
                # Best effort: fall back to reading innerText via script.
                logger.debug(
                    "inner_text(body) failed; falling back to evaluate",
                    exc_info=True,
                )
                evaluated = page.evaluate(
                    "() => document.body ? document.body.innerText : ''",
                )
                raw_text = evaluated or ""

            truncated = len(raw_text) > max_response_chars
            if truncated:
                raw_text = raw_text[:max_response_chars]

            meta["truncated"] = truncated
            meta["final_url"] = final_url
            return PlaywrightExtractResult(
                visible_text=raw_text.strip(),
                title=title,
                final_url=final_url,
                truncated=truncated,
                meta=meta,
            )
        finally:
            context.close()