dgk-lab-runtime 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,570 @@
|
|
|
1
|
+
"""dgk-lab-runtime — Lab notebook runtime utilities for Marimo notebooks in Obsidian vaults."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"is_pyodide_runtime",
|
|
7
|
+
"lab_runtime_context",
|
|
8
|
+
"require_local_runtime",
|
|
9
|
+
"normalize_dataset_path",
|
|
10
|
+
"dataset_candidate_paths",
|
|
11
|
+
"read_lab_json",
|
|
12
|
+
"load_lab_manifest",
|
|
13
|
+
"get_lab_dataset",
|
|
14
|
+
"read_lab_dataset",
|
|
15
|
+
"local_vault_path",
|
|
16
|
+
"write_local_json_snapshot",
|
|
17
|
+
"write_local_dataframe_snapshot",
|
|
18
|
+
"write_local_markdown_note",
|
|
19
|
+
"get_local_secret",
|
|
20
|
+
"clean_lab_text",
|
|
21
|
+
"fingerprint_data",
|
|
22
|
+
"with_data_provenance",
|
|
23
|
+
"read_local_text_file",
|
|
24
|
+
"read_local_bytes_file",
|
|
25
|
+
"parse_feed_xml",
|
|
26
|
+
"fetch_local_feed",
|
|
27
|
+
"fetch_local_url_text",
|
|
28
|
+
"scrape_local_page_text",
|
|
29
|
+
"extract_local_image_text",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def is_pyodide_runtime() -> bool:
    """Report whether the notebook is running packaged under Pyodide/WASM.

    The check is a probe import: the answer is True exactly when the
    ``pyodide`` module can be imported in the current interpreter.
    """
    try:
        import pyodide  # type: ignore  # noqa: F401
    except ImportError:
        packaged = False
    else:
        packaged = True
    return packaged
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def lab_runtime_context(notebooks_path: str = "lab"):
    """Describe the current notebook runtime for cells with a local fallback.

    Args:
        notebooks_path: Notebooks path segment used when the
            ``VAULT_NOTEBOOKS_PATH`` environment variable is not set.

    Returns:
        A dict with the runtime name (``"pyodide"`` or ``"local"``), the
        ``isPackaged``/``isLocal``/``canRunLocalEtl`` flags, a
        ``capabilities`` dict whose flags are all True only in local mode,
        the resolved ``notebooksPath`` and the ``cwd`` (empty string when
        packaged).
    """
    import os as _os

    is_local = not is_pyodide_runtime()
    capability_names = (
        "filesystem",
        "secrets",
        "subprocess",
        "headlessBrowser",
        "ocr",
        "binaryFormats",
    )
    context = {
        "runtime": "local" if is_local else "pyodide",
        "isPackaged": not is_local,
        "isLocal": is_local,
        "canRunLocalEtl": is_local,
        # Every capability is available locally and unavailable when packaged.
        "capabilities": dict.fromkeys(capability_names, is_local),
        "notebooksPath": _os.environ.get("VAULT_NOTEBOOKS_PATH", notebooks_path),
        "cwd": _os.getcwd() if is_local else "",
    }
    return context
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def require_local_runtime(operation: str = "esta operação"):
    """Block operations that must not run inside the packaged HTML notebook.

    Args:
        operation: Human-readable name of the operation, used as the start of
            the error message.

    Returns:
        The runtime context dict produced by ``lab_runtime_context()``.

    Raises:
        RuntimeError: If the runtime context reports a non-local runtime.
    """
    context = lab_runtime_context()
    if context["isLocal"]:
        return context
    message = f"{operation} só pode rodar no modo local do notebook, antes do export."
    raise RuntimeError(message)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def normalize_dataset_path(path_or_url: str) -> str:
    """Normalize a Lab asset path to its canonical relative form.

    - ``http://`` / ``https://`` URLs are returned unchanged
    - Windows backslashes become forward slashes
    - surrounding whitespace is stripped
    - every leading ``./``, ``/`` and ``assets/`` prefix is removed, in any
      combination and any number of times

    The result never starts with ``/``. That matters because the local reader
    joins it onto a base directory with ``os.path.join``, which discards the
    base when a later component is absolute.

    Args:
        path_or_url: Dataset location as written in a manifest or notebook.

    Returns:
        The unchanged URL, or the canonical relative path (possibly empty).
    """
    if path_or_url.startswith(("http://", "https://")):
        return path_or_url

    value = path_or_url.replace("\\", "/").strip()
    # Peel prefixes until none is left: removing one kind can expose another
    # (e.g. "assets//x" -> "/x" -> "x").
    while True:
        if value.startswith("./"):
            value = value[2:]
        elif value.startswith("/"):
            value = value.lstrip("/")
        elif value.startswith("assets/"):
            value = value[len("assets/"):]
        else:
            return value
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def dataset_candidate_paths(path_or_url: str):
    """List the locations to try, in order, for a dataset path or URL.

    A URL yields a single-element list. Any other input yields the normalized
    path (when non-empty) followed by the same path under ``assets/`` (unless
    the normalized path already starts with ``assets/``).
    """
    canonical = normalize_dataset_path(path_or_url)
    if canonical.startswith(("http://", "https://")):
        return [canonical]

    bare = [canonical] if canonical else []
    prefixed = [] if canonical.startswith("assets/") else [f"assets/{canonical}"]
    return bare + prefixed
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _read_lab_json_runtime(candidates):
    """Load JSON through Pyodide's ``open_url``, trying candidates in order.

    Args:
        candidates: Iterable of locations passed one by one to ``open_url``.

    Returns:
        The parsed JSON of the first candidate that loads and parses.

    Raises:
        ImportError: If ``pyodide.http`` cannot be imported.
        RuntimeError: If no candidate could be loaded; chained to the last
            failure when there was one.
    """
    import json as _json

    from pyodide.http import open_url  # type: ignore

    failure = None
    for location in candidates:
        try:
            raw = open_url(location).read()
            return _json.loads(raw)
        except Exception as exc:
            # Remember the failure and move on to the next candidate.
            failure = exc

    error = RuntimeError("Não foi possível carregar o recurso de datasets.")
    if failure:
        raise error from failure
    raise error
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _read_lab_json_local(candidates, notebooks_path: str):
    """Load JSON from ``<cwd>/public/<notebooks path>/<candidate>``.

    The notebooks path comes from the ``VAULT_NOTEBOOKS_PATH`` environment
    variable when set, otherwise from ``notebooks_path``.

    Args:
        candidates: Iterable of relative paths to try in order.
        notebooks_path: Fallback notebooks path segment.

    Returns:
        The parsed JSON of the first candidate that opens and parses.

    Raises:
        Exception: The last failure seen when every candidate failed.
        RuntimeError: If there were no candidates to try.
    """
    import json as _json
    import os as _os

    segment = _os.environ.get("VAULT_NOTEBOOKS_PATH", notebooks_path)
    failure = None
    for relative in candidates:
        location = _os.path.join(_os.getcwd(), "public", segment, relative)
        try:
            with open(location, encoding="utf-8") as handle:
                parsed = _json.load(handle)
        except Exception as exc:
            failure = exc
        else:
            return parsed

    if failure:
        raise failure
    raise RuntimeError("Não foi possível carregar o recurso de datasets.")
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def read_lab_json(path_or_url: str, notebooks_path: str = "lab"):
    """Load dataset JSON in either the Pyodide or the local runtime.

    Absolute ``http(s)`` URLs are fetched with ``urllib``. Relative paths are
    first tried through Pyodide's ``open_url``; when that fails for any
    reason the function falls back to reading
    ``public/<VAULT_NOTEBOOKS_PATH>/...`` from the local filesystem.

    Args:
        path_or_url: Dataset path or URL.
        notebooks_path: Notebooks path segment used by the local fallback
            when ``VAULT_NOTEBOOKS_PATH`` is not set.

    Returns:
        The parsed JSON document.

    Raises:
        RuntimeError: If the path normalizes to an empty string.
        Exception: Whatever the URL fetch or the local fallback raises.
    """
    normalized = normalize_dataset_path(path_or_url)
    if not normalized:
        raise RuntimeError("Não foi possível carregar o recurso de datasets.")

    if normalized.startswith(("http://", "https://")):
        import json as _json
        from urllib.request import urlopen

        # Close the HTTP response deterministically instead of leaving the
        # socket to the garbage collector.
        with urlopen(normalized, timeout=15) as response:
            return _json.loads(response.read())

    candidates = dataset_candidate_paths(normalized)
    try:
        return _read_lab_json_runtime(candidates)
    except Exception:
        # Deliberate best-effort: any failure on the Pyodide path (including
        # the module being unavailable) falls through to the local
        # filesystem reader, whose error is the one reported to the caller.
        pass

    return _read_lab_json_local(candidates, notebooks_path)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def load_lab_manifest(notebooks_path: str = "lab"):
    """Load the Lab dataset manifest (``datasets/manifest.json``).

    Args:
        notebooks_path: Notebooks path segment forwarded to ``read_lab_json``.

    Returns:
        The parsed manifest JSON.
    """
    manifest_location = "datasets/manifest.json"
    return read_lab_json(manifest_location, notebooks_path)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def get_lab_dataset(dataset_id: str, manifest=None, notebooks_path: str = "lab"):
    """Look up a manifest entry by its ``id``.

    Args:
        dataset_id: Identifier to search for.
        manifest: Already-loaded manifest dict. Only ``None`` triggers a load
            via ``load_lab_manifest``; an explicitly supplied empty manifest
            is searched as-is.
        notebooks_path: Notebooks path segment used when the manifest has to
            be loaded.

    Returns:
        The first entry of ``manifest["datasets"]`` whose ``id`` matches.

    Raises:
        KeyError: If no entry declares ``dataset_id``.
    """
    if manifest is None:
        manifest = load_lab_manifest(notebooks_path)

    # `or []` also covers a manifest that carries "datasets": null.
    for dataset in manifest.get("datasets") or []:
        if dataset.get("id") == dataset_id:
            return dataset
    raise KeyError(f"Dataset não declarado no manifesto do Lab: {dataset_id}")
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def read_lab_dataset(dataset_or_id, manifest=None, notebooks_path: str = "lab"):
    """Read a dataset declared in the manifest, locally or in published HTML.

    Args:
        dataset_or_id: A dataset id (``str``), resolved through
            ``get_lab_dataset``, or an already-resolved manifest entry.
        manifest: Optional manifest forwarded to ``get_lab_dataset``.
        notebooks_path: Notebooks path segment forwarded to the readers.

    Returns:
        The parsed JSON found at the entry's ``assetPath``, ``path`` or
        ``url`` (first truthy one, in that order).

    Raises:
        RuntimeError: If the entry has none of those three fields set.
    """
    if isinstance(dataset_or_id, str):
        dataset = get_lab_dataset(dataset_or_id, manifest, notebooks_path)
    else:
        dataset = dataset_or_id

    location = None
    for field in ("assetPath", "path", "url"):
        location = dataset.get(field)
        if location:
            break

    if location:
        return read_lab_json(location, notebooks_path)
    raise RuntimeError(
        f"Dataset {dataset.get('id', '<sem id>')} não possui assetPath, path ou url."
    )
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _safe_relative_path(relative_path: str) -> str:
    """Normalize a vault-relative path and reject empty or escaping ones.

    Backslashes become forward slashes, surrounding whitespace and leading
    slashes are dropped, and the result is collapsed with
    ``os.path.normpath``.

    Raises:
        RuntimeError: If the path is empty, resolves to ``.``, or resolves to
            ``..`` or to something starting with ``../``.
    """
    import os as _os

    raw = str(relative_path or "").replace("\\", "/").strip().lstrip("/")
    collapsed = _os.path.normpath(raw).replace("\\", "/")
    escapes = collapsed == ".." or collapsed.startswith("../")
    if not raw or collapsed == "." or escapes:
        raise RuntimeError("Caminho de snapshot local inválido.")
    return collapsed
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def local_vault_path(relative_path: str):
    """Resolve a safe absolute path inside the local vault repository.

    The vault root is the ``cwd`` reported by the local runtime context.

    Args:
        relative_path: Path relative to the vault root.

    Returns:
        The absolute path of the target inside the vault.

    Raises:
        RuntimeError: If not running locally, if the relative path is
            invalid, or if the resolved target lies outside the vault root.
    """
    import os as _os

    context = require_local_runtime("resolver caminho local do vault")
    safe_relative = _safe_relative_path(relative_path)
    vault_root = _os.path.abspath(context["cwd"])
    resolved = _os.path.abspath(_os.path.join(vault_root, safe_relative))
    inside_vault = _os.path.commonpath([vault_root, resolved]) == vault_root
    if not inside_vault:
        raise RuntimeError("Caminho de snapshot local sai do vault.")
    return resolved
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _local_write_result(relative_path: str, target: str):
    """Build the summary dict returned by the local write helpers.

    Returns:
        A dict with the absolute ``path``, the normalized ``relativePath``
        and the file size in ``bytes`` as reported by ``os.path.getsize``.
    """
    import os as _os

    normalized = _safe_relative_path(relative_path)
    size = _os.path.getsize(target)
    return {"path": target, "relativePath": normalized, "bytes": size}
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def write_local_json_snapshot(relative_path: str, payload, *, indent: int = 2):
    """Write a version-controllable JSON snapshot into the local vault.

    Intended for Extract steps that need the filesystem, binaries, a browser,
    authenticated network access or other resources unavailable in the
    published HTML/WASM notebook.

    Args:
        relative_path: Destination path relative to the vault root.
        payload: JSON-serializable data.
        indent: Indentation passed to ``json.dump``.

    Returns:
        The write summary dict (``path``, ``relativePath``, ``bytes``).
    """
    import json as _json
    import os as _os

    target = local_vault_path(relative_path)
    parent_dir = _os.path.dirname(target)
    _os.makedirs(parent_dir, exist_ok=True)

    with open(target, "w", encoding="utf-8") as handle:
        _json.dump(payload, handle, ensure_ascii=False, indent=indent)
        # Finish the file with a newline after the JSON document.
        handle.write("\n")

    return _local_write_result(relative_path, target)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def write_local_dataframe_snapshot(dataframe, relative_path: str, *, format: str = None):
    """Write a DataFrame into the local vault as CSV, JSON or Parquet.

    Parquet is optional: it only works when ``pyarrow`` or a compatible
    engine is installed locally. The published HTML should consume snapshots
    that were already generated rather than try to write files.

    Args:
        dataframe: Object exposing ``to_csv``, ``to_json`` and ``to_parquet``.
        relative_path: Destination path relative to the vault root.
        format: ``"csv"``, ``"json"`` or ``"parquet"`` (case-insensitive);
            when falsy, the target's file extension is used.

    Returns:
        The write summary dict (``path``, ``relativePath``, ``bytes``).

    Raises:
        RuntimeError: If the resolved format is not one of the three
            supported ones.
    """
    import os as _os

    target = local_vault_path(relative_path)
    _os.makedirs(_os.path.dirname(target), exist_ok=True)

    if format:
        chosen = format
    else:
        extension = _os.path.splitext(target)[1]
        chosen = extension.lstrip(".")
    chosen = chosen.lower()

    if chosen not in ("csv", "json", "parquet"):
        raise RuntimeError("Formato de snapshot tabular suportado: csv, json ou parquet.")

    if chosen == "csv":
        dataframe.to_csv(target, index=False)
    elif chosen == "parquet":
        dataframe.to_parquet(target, index=False)
    else:
        dataframe.to_json(target, orient="records", force_ascii=False, indent=2)
        # Append a newline after the JSON written by the DataFrame.
        with open(target, "a", encoding="utf-8") as handle:
            handle.write("\n")

    return _local_write_result(relative_path, target)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def write_local_markdown_note(relative_path: str, body: str, *, frontmatter=None):
    """Write a local Markdown note for Obsidian, Bases and Dataview.

    Use it when a Lab analysis should become a curated artifact in the vault.
    The published HTML never writes notes; it only consumes snapshots and
    notes that are already versioned.

    The frontmatter is emitted with keys sorted; ``lab_generated: true`` and
    ``status: rascunho`` are added when absent. Lists and tuples become YAML
    block sequences; everything else is written as a single-line scalar.
    String scalars that would not survive as a YAML plain scalar (containing
    ``": "`` or ``" #"``, ending in ``:``, starting with an indicator such as
    ``#``, ``|``, ``>``, ``*``, ``&``, ``!``, ``%``, ``@`` or a backtick, or
    carrying leading/trailing whitespace) are written double-quoted. Strings
    that already start with a quote, ``[`` or ``{`` are passed through
    unchanged so caller-authored YAML keeps working.

    Args:
        relative_path: Destination path relative to the vault root; must end
            in ``.md``.
        body: Markdown body; trailing whitespace is stripped.
        frontmatter: Optional mapping of frontmatter properties.

    Returns:
        The write summary dict (``path``, ``relativePath``, ``bytes``).

    Raises:
        RuntimeError: If the resolved target does not end in ``.md``.
    """
    import json as _json
    import os as _os

    target = local_vault_path(relative_path)
    if not target.endswith(".md"):
        raise RuntimeError("Notas geradas pelo Lab devem usar extensão .md.")

    _os.makedirs(_os.path.dirname(target), exist_ok=True)
    metadata = dict(frontmatter or {})
    metadata.setdefault("lab_generated", True)
    metadata.setdefault("status", "rascunho")

    def _needs_quoting(text):
        # Empty values and caller-authored YAML (pre-quoted scalars, flow
        # collections) are emitted exactly as before.
        if not text or text[0] in "\"'[{":
            return False
        if text != text.strip():
            return True
        if text[0] in "#&*!|>%@`]},":
            return True
        if text in {"-", "?"} or text.startswith(("- ", "? ")):
            return True
        return ": " in text or " #" in text or text.endswith(":")

    def _yaml_scalar(value):
        if isinstance(value, bool):
            return "true" if value else "false"
        if value is None:
            return "null"
        text = str(value).replace("\n", " ")
        if isinstance(value, str) and _needs_quoting(text):
            # A JSON string literal is a valid YAML double-quoted scalar.
            return _json.dumps(text, ensure_ascii=False)
        return text

    lines = ["---"]
    for key in sorted(metadata):
        value = metadata[key]
        if isinstance(value, (list, tuple)):
            lines.append(f"{key}:")
            lines.extend(f" - {_yaml_scalar(item)}" for item in value)
        else:
            lines.append(f"{key}: {_yaml_scalar(value)}")
    lines.extend(["---", "", str(body).rstrip(), ""])

    with open(target, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    return _local_write_result(relative_path, target)
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def get_local_secret(name: str, default=None, *, required: bool = False):
    """Read a secret from the local environment, never from published HTML.

    Args:
        name: Environment variable name.
        default: Value returned when the variable is not set.
        required: When True, a missing or empty value is an error.

    Returns:
        The environment value, or ``default``.

    Raises:
        RuntimeError: If not running locally, or if ``required`` is set and
            the resolved value is falsy.
    """
    import os as _os

    require_local_runtime(f"ler segredo local {name}")
    secret = _os.environ.get(name, default)
    missing = not secret
    if required and missing:
        raise RuntimeError(f"Segredo local ausente: {name}")
    return secret
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def clean_lab_text(text, *, lower: bool = False) -> str:
    """Normalize raw text coming from scraping, OCR, files or APIs.

    Line breaks and form feeds become spaces, every whitespace run collapses
    to a single space, and the ends are trimmed. Falsy input yields ``""``.

    Args:
        text: Any value; converted with ``str`` unless falsy.
        lower: When True, the result is lower-cased.
    """
    import re as _re

    raw = str(text or "")
    without_breaks = _re.sub(r"[\n\x0c\r]+", " ", raw)
    collapsed = _re.sub(r"\s+", " ", without_breaks).strip()
    if lower:
        return collapsed.lower()
    return collapsed
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def fingerprint_data(payload) -> str:
    """Compute a stable SHA-256 fingerprint of a JSON-serializable payload.

    The payload is serialized with sorted keys and without ASCII escaping,
    encoded as UTF-8 and hashed.

    Returns:
        The hexadecimal digest.
    """
    import hashlib as _hashlib
    import json as _json

    canonical = _json.dumps(payload, ensure_ascii=False, sort_keys=True)
    digest = _hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def with_data_provenance(
    payload,
    *,
    source: str,
    license: str = "verificar",
    privacy: str = "private-until-published",
    collected_at: str = None,
):
    """Wrap a payload with minimal provenance metadata for a Lab snapshot.

    Args:
        payload: The data to wrap (stored under ``data``).
        source: Where the data came from.
        license: License label.
        privacy: Privacy label.
        collected_at: Collection timestamp; when falsy, the current UTC time
            in ISO format with a ``Z`` suffix is used.

    Returns:
        A dict with ``schemaVersion``, ``source``, ``collectedAt``,
        ``license``, ``privacy``, ``data`` and ``sha256``, where ``sha256``
        is the fingerprint of all the other fields.
    """
    from datetime import datetime as _datetime
    from datetime import timezone as _timezone

    if collected_at:
        collected = collected_at
    else:
        now_utc = _datetime.now(_timezone.utc)
        collected = now_utc.isoformat().replace("+00:00", "Z")

    envelope = {
        "schemaVersion": 1,
        "source": source,
        "collectedAt": collected,
        "license": license,
        "privacy": privacy,
        "data": payload,
    }
    # The fingerprint covers the envelope before the hash itself is added.
    checksum = fingerprint_data(envelope)
    return {**envelope, "sha256": checksum}
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def read_local_text_file(relative_path: str, *, encoding: str = "utf-8"):
    """Read a text file located inside the local vault.

    Args:
        relative_path: Path relative to the vault root.
        encoding: Text encoding used to open the file.

    Returns:
        The full file content as a string.
    """
    target = local_vault_path(relative_path)
    with open(target, encoding=encoding) as handle:
        content = handle.read()
    return content
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def read_local_bytes_file(relative_path: str):
    """Read a binary file located inside the local vault.

    Args:
        relative_path: Path relative to the vault root.

    Returns:
        The full file content as bytes.
    """
    target = local_vault_path(relative_path)
    with open(target, "rb") as handle:
        content = handle.read()
    return content
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def _xml_child_text(element, names):
    """Return the cleaned text of the first child tag that has text.

    Args:
        element: Element exposing ``find``.
        names: Tag names to try in order.

    Returns:
        The cleaned text of the first matching child with non-empty text, or
        None when no such child exists.
    """
    for tag in names:
        node = element.find(tag)
        if node is None or not node.text:
            continue
        return clean_lab_text(node.text)
    return None
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def _xml_atom_link(element):
    """Pick the best ``href`` among an element's direct Atom ``link`` children.

    The first link with an ``href`` whose ``rel`` is ``alternate``, empty or
    absent wins. Otherwise the ``href`` of the first link is returned, which
    may be None; None is also returned when there is no link at all.
    """
    link_tag = "{http://www.w3.org/2005/Atom}link"
    links = element.findall(link_tag)

    for node in links:
        target = node.attrib.get("href")
        if not target:
            continue
        if node.attrib.get("rel", "alternate") in ("alternate", ""):
            return target

    if not links:
        return None
    return links[0].attrib.get("href")
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def parse_feed_xml(xml_text: str, *, source_url: str = None, limit: int = 50):
    """Convert an RSS or Atom document into small, version-friendly records.

    A root with a ``channel`` child is treated as RSS; anything else is read
    as Atom.

    Args:
        xml_text: The feed document.
        source_url: Stored verbatim under ``source`` in the result.
        limit: Maximum number of items/entries kept.

    Returns:
        A dict with ``schemaVersion``, ``kind`` (``"feed"``), ``format``
        (``"rss"`` or ``"atom"``), ``source``, ``title``, ``itemCount`` and
        ``items``; each item has ``title``, ``url``, ``published``,
        ``updated``, ``summary`` and ``guid``.
    """
    import xml.etree.ElementTree as _ET

    # NOTE(review): the text is parsed with the standard-library ElementTree
    # parser; confirm that is acceptable for feeds from untrusted sources.
    root = _ET.fromstring(xml_text)

    def _envelope(feed_format, title, records):
        return {
            "schemaVersion": 1,
            "kind": "feed",
            "format": feed_format,
            "source": source_url,
            "title": title,
            "itemCount": len(records),
            "items": records,
        }

    channel = root.find("channel")
    if channel is not None:
        rss_title = _xml_child_text(channel, ["title"])
        rss_records = [
            {
                "title": _xml_child_text(item, ["title"]),
                "url": _xml_child_text(item, ["link"]),
                "published": _xml_child_text(item, ["pubDate", "date"]),
                "updated": _xml_child_text(item, ["updated"]),
                "summary": _xml_child_text(item, ["description", "summary"]),
                "guid": _xml_child_text(item, ["guid", "id"]),
            }
            for item in channel.findall("item")[:limit]
        ]
        return _envelope("rss", rss_title, rss_records)

    atom = "{http://www.w3.org/2005/Atom}"
    atom_title = _xml_child_text(root, [f"{atom}title", "title"])
    atom_records = [
        {
            "title": _xml_child_text(entry, [f"{atom}title", "title"]),
            "url": _xml_atom_link(entry),
            "published": _xml_child_text(entry, [f"{atom}published", "published"]),
            "updated": _xml_child_text(entry, [f"{atom}updated", "updated"]),
            "summary": _xml_child_text(entry, [f"{atom}summary", f"{atom}content", "summary"]),
            "guid": _xml_child_text(entry, [f"{atom}id", "id"]),
        }
        for entry in root.findall(f"{atom}entry")[:limit]
    ]
    return _envelope("atom", atom_title, atom_records)
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def fetch_local_feed(url: str, *, timeout: int = 20, user_agent: str = "vault-seed-lab/1.0", limit: int = 50):
    """Download and normalize an RSS/Atom feed in the local runtime.

    Args:
        url: Feed URL.
        timeout: Timeout passed to ``urlopen``.
        user_agent: Value of the ``User-Agent`` request header.
        limit: Maximum number of items kept by ``parse_feed_xml``.

    Returns:
        The dict produced by ``parse_feed_xml`` with ``source`` set to ``url``.

    Raises:
        RuntimeError: If not running in the local runtime.
    """
    from urllib.request import Request as _Request
    from urllib.request import urlopen as _urlopen

    require_local_runtime("coletar feed RSS/Atom localmente")

    request = _Request(url, headers={"User-Agent": user_agent})
    with _urlopen(request, timeout=timeout) as response:
        payload = response.read()
        # Use the charset declared by the server, falling back to UTF-8.
        charset = response.headers.get_content_charset() or "utf-8"
        xml_text = payload.decode(charset, "replace")

    return parse_feed_xml(xml_text, source_url=url, limit=limit)
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def fetch_local_url_text(url: str, *, timeout: int = 20, user_agent: str = "vault-seed-lab/1.0"):
    """Fetch a URL locally and extract its text using only the stdlib.

    Text inside ``script``, ``style`` and ``noscript`` elements is skipped.
    Title text is collected separately and is also part of the page text.

    Args:
        url: Page URL.
        timeout: Timeout passed to ``urlopen``.
        user_agent: Value of the ``User-Agent`` request header.

    Returns:
        A dict with ``url``, ``title`` (None when empty), ``text``,
        ``textPreview`` (first 500 characters) and ``links`` (up to 50
        ``href`` values found in the raw HTML).

    Raises:
        RuntimeError: If not running in the local runtime.
    """
    import re as _re
    from html.parser import HTMLParser as _HTMLParser
    from urllib.request import Request as _Request
    from urllib.request import urlopen as _urlopen

    require_local_runtime("extrair página web localmente")

    skipped_tags = {"script", "style", "noscript"}

    class _VisibleTextCollector(_HTMLParser):
        """Collect title text and visible text while skipping some tags."""

        def __init__(self):
            super().__init__()
            self._title_parts = []
            self._text_parts = []
            self._inside_title = False
            self._skip_depth = 0  # number of currently open skipped tags

        def handle_starttag(self, tag, attrs):
            if tag in skipped_tags:
                self._skip_depth += 1
            if tag == "title":
                self._inside_title = True

        def handle_endtag(self, tag):
            if self._skip_depth and tag in skipped_tags:
                self._skip_depth -= 1
            if tag == "title":
                self._inside_title = False

        def handle_data(self, data):
            if self._skip_depth:
                return
            if self._inside_title:
                self._title_parts.append(data)
            self._text_parts.append(data)

    request = _Request(url, headers={"User-Agent": user_agent})
    with _urlopen(request, timeout=timeout) as response:
        payload = response.read()
        charset = response.headers.get_content_charset() or "utf-8"
        html = payload.decode(charset, "replace")

    collector = _VisibleTextCollector()
    collector.feed(html)
    text = clean_lab_text(" ".join(collector._text_parts))
    title = clean_lab_text(" ".join(collector._title_parts)) or None
    links = _re.findall(r"href=[\"']([^\"']+)", html)[:50]
    return {
        "url": url,
        "title": title,
        "text": text,
        "textPreview": text[:500],
        "links": links,
    }
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
async def scrape_local_page_text(url: str, *, wait_until: str = "networkidle"):
    """Extract a dynamic page locally with Playwright, when it is installed.

    Args:
        url: Page URL.
        wait_until: Forwarded to ``page.goto``.

    Returns:
        A dict with ``url``, ``title``, ``text`` (cleaned body text) and
        ``textPreview`` (first 500 characters).

    Raises:
        RuntimeError: If not running in the local runtime, or if Playwright
            cannot be imported.
    """
    require_local_runtime("extrair página dinâmica com Playwright")
    try:
        from playwright.async_api import async_playwright as _async_playwright
    except ImportError as exc:
        raise RuntimeError(
            "Playwright não está instalado. Instale apenas no ambiente local quando precisar de scraping dinâmico."
        ) from exc

    async with _async_playwright() as driver:
        browser = await driver.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, wait_until=wait_until)
        page_title = await page.title()
        body_text = await page.inner_text("body")
        await browser.close()

    cleaned = clean_lab_text(body_text)
    result = {"url": url, "title": page_title, "text": cleaned}
    result["textPreview"] = cleaned[:500]
    return result
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
def extract_local_image_text(image_input, *, languages: str = "por+eng"):
    """Run local OCR on a path, bytes, an image object or an image URL.

    Args:
        image_input: An ``http(s)`` URL, a vault-relative path, raw image
            bytes, or an object passed to ``pytesseract`` as-is.
        languages: Language specification passed to ``pytesseract`` as
            ``lang``.

    Returns:
        The recognized text, cleaned with ``clean_lab_text``.

    Raises:
        RuntimeError: If not running in the local runtime, or if ``pillow``
            or ``pytesseract`` cannot be imported.
    """
    from io import BytesIO as _BytesIO
    from urllib.request import urlopen as _urlopen

    require_local_runtime("executar OCR local")
    try:
        import pytesseract as _pytesseract
        from PIL import Image as _Image
    except ImportError as exc:
        raise RuntimeError(
            "OCR local requer pillow e pytesseract instalados, além do binário tesseract."
        ) from exc

    is_text = isinstance(image_input, str)
    if is_text and image_input.startswith(("http://", "https://")):
        with _urlopen(image_input, timeout=20) as response:
            image = _Image.open(_BytesIO(response.read()))
    elif is_text:
        # Any other string is treated as a path inside the vault.
        image = _Image.open(local_vault_path(image_input))
    elif isinstance(image_input, bytes):
        image = _Image.open(_BytesIO(image_input))
    else:
        image = image_input

    recognized = _pytesseract.image_to_string(image, lang=languages)
    return clean_lab_text(recognized)
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dgk-lab-runtime
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Digital Gardening Kit — Lab notebook runtime utilities for Marimo notebooks in Obsidian vaults
|
|
5
|
+
Project-URL: Repository, https://github.com/aretw0/vault-seed
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/aretw0/vault-seed/issues
|
|
7
|
+
License: GPL-3.0-only
|
|
8
|
+
Keywords: digital-garden,marimo,notebook,obsidian,vault
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Python: >=3.11
|
|
17
|
+
Provides-Extra: ocr
|
|
18
|
+
Requires-Dist: pillow>=10.0; extra == 'ocr'
|
|
19
|
+
Requires-Dist: pytesseract>=0.3; extra == 'ocr'
|
|
20
|
+
Provides-Extra: scraping
|
|
21
|
+
Requires-Dist: playwright>=1.40; extra == 'scraping'
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# dgk-lab-runtime
|
|
25
|
+
|
|
26
|
+
Lab notebook runtime utilities from the [Digital Gardening Kit](https://github.com/aretw0/vault-seed).
|
|
27
|
+
|
|
28
|
+
Designed for Marimo notebooks that live alongside an Obsidian vault — handles the runtime boundary between local ETL (filesystem, secrets, network) and the published HTML/WASM notebook. Works with any vault layout; the defaults follow the `vault-seed` conventions (`lab/` as the notebooks path, `public/lab/` as the dataset directory) which you can override via environment variables or function arguments.
|
|
29
|
+
|
|
30
|
+
## Install
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install dgk-lab-runtime
|
|
34
|
+
# with scraping support (Playwright)
|
|
35
|
+
pip install "dgk-lab-runtime[scraping]"
|
|
36
|
+
# with OCR support (Tesseract)
|
|
37
|
+
pip install "dgk-lab-runtime[ocr]"
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Usage
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from dgk_lab_runtime import (
|
|
44
|
+
lab_runtime_context,
|
|
45
|
+
read_lab_json,
|
|
46
|
+
load_lab_manifest,
|
|
47
|
+
read_lab_dataset,
|
|
48
|
+
write_local_json_snapshot,
|
|
49
|
+
fetch_local_feed,
|
|
50
|
+
fingerprint_data,
|
|
51
|
+
with_data_provenance,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
ctx = lab_runtime_context()
|
|
55
|
+
# {"runtime": "local", "isPackaged": False, "capabilities": {...}, ...}
|
|
56
|
+
|
|
57
|
+
# Read a dataset from the Lab manifest (local or Pyodide/WASM)
|
|
58
|
+
data = read_lab_dataset("my-dataset")
|
|
59
|
+
|
|
60
|
+
# Write a versioned JSON snapshot to the vault
|
|
61
|
+
write_local_json_snapshot("40 - Recursos/data/snapshot.json", data)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Configuration
|
|
65
|
+
|
|
66
|
+
| Environment variable | Default | Description |
|
|
67
|
+
|---|---|---|
|
|
68
|
+
| `VAULT_NOTEBOOKS_PATH` | `lab` | Notebooks path segment: the URL segment where notebooks are published and the `public/<path>/` directory that local dataset reads use |
|
|
69
|
+
|
|
70
|
+
## Runtime boundary
|
|
71
|
+
|
|
72
|
+
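Every function that needs local-only resources — the functions with `local` in their name, which touch the vault filesystem, read secrets, fetch feeds or pages, drive a browser, or run OCR — calls `require_local_runtime()` and raises `RuntimeError` when running inside a packaged HTML/WASM notebook. The dataset readers (`read_lab_json`, `load_lab_manifest`, `read_lab_dataset`) are the exception: they work in both runtimes. This boundary is intentional: ETL logic runs locally before export; the published notebook only reads pre-generated snapshots.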
|
|
73
|
+
|
|
74
|
+
## Vault-seed compatibility
|
|
75
|
+
|
|
76
|
+
If you use [vault-seed](https://github.com/aretw0/vault-seed), the `_lab_notebook_runtime.py` shim in `99 - Meta e Anexos/Notebooks/` imports this package transparently when installed, so existing notebooks work unchanged. Without installation the shim uses an inline fallback — same API, no external dependency.
|
|
77
|
+
|
|
78
|
+
## License
|
|
79
|
+
|
|
80
|
+
GPL-3.0-only — see `LICENSE.md` in the root of the [vault-seed repository](https://github.com/aretw0/vault-seed).
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
dgk_lab_runtime/__init__.py,sha256=8DE5jzBRC-fMWQrbDe_LOYU3atEpwJEpABWoB1O_7qA,19992
|
|
2
|
+
dgk_lab_runtime-0.1.0.dist-info/METADATA,sha256=drFpoGNP93gUQLKzklzB0yq3ytjf3VKIJK-phNlWCs0,3219
|
|
3
|
+
dgk_lab_runtime-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
4
|
+
dgk_lab_runtime-0.1.0.dist-info/RECORD,,
|