omnius 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4959 -0
- package/dist/index.d.ts +6 -0
- package/dist/index.js +630665 -0
- package/dist/launcher.cjs +78 -0
- package/dist/postinstall-daemon.cjs +776 -0
- package/dist/preinstall.cjs +92 -0
- package/dist/scripts/autoresearch-prepare.py +459 -0
- package/dist/scripts/autoresearch-train.py +661 -0
- package/dist/scripts/crawlee-scraper.py +358 -0
- package/dist/scripts/live-nemotron.py +478 -0
- package/dist/scripts/live-whisper.py +242 -0
- package/dist/scripts/ocr-advanced.py +571 -0
- package/dist/scripts/start-moondream.py +112 -0
- package/dist/scripts/tor/UPSTREAM-README.md +148 -0
- package/dist/scripts/tor/destroy_tor.sh +29 -0
- package/dist/scripts/tor/tor_setup.sh +163 -0
- package/dist/scripts/transcribe-file.py +63 -0
- package/dist/scripts/web_scrape.py +1295 -0
- package/npm-shrinkwrap.json +7412 -0
- package/package.json +142 -0
- package/prompts/agentic/system-large.md +569 -0
- package/prompts/agentic/system-medium.md +211 -0
- package/prompts/agentic/system-small.md +114 -0
- package/prompts/compaction/context-compaction.md +44 -0
- package/prompts/personality/level-1-minimal.md +3 -0
- package/prompts/personality/level-2-concise.md +3 -0
- package/prompts/personality/level-4-explanatory.md +3 -0
- package/prompts/personality/level-5-thorough.md +3 -0
- package/prompts/personality/level-autist.md +3 -0
- package/prompts/personality/level-stark.md +3 -0
- package/prompts/runners/dispatcher.md +24 -0
- package/prompts/runners/editor.md +44 -0
- package/prompts/runners/evaluator.md +30 -0
- package/prompts/runners/merge-summary.md +9 -0
- package/prompts/runners/normalizer.md +23 -0
- package/prompts/runners/planner.md +33 -0
- package/prompts/runners/scout.md +39 -0
- package/prompts/runners/verifier.md +36 -0
- package/prompts/skill-builder/seed-analysis.md +30 -0
- package/prompts/skill-builder/skill-expansion.md +76 -0
- package/prompts/skill-builder/skill-validation.md +31 -0
- package/prompts/templates/analysis.md +14 -0
- package/prompts/templates/code-review.md +16 -0
- package/prompts/templates/code.md +13 -0
- package/prompts/templates/document.md +13 -0
- package/prompts/templates/error-diagnosis.md +14 -0
- package/prompts/templates/general.md +9 -0
- package/prompts/templates/plan.md +15 -0
- package/prompts/templates/system.md +16 -0
- package/prompts/tui/dmn-gather.md +128 -0
- package/prompts/tui/dream-consolidate.md +48 -0
- package/prompts/tui/dream-lucid-eval.md +17 -0
- package/prompts/tui/dream-lucid-implement.md +14 -0
- package/prompts/tui/dream-stages.md +19 -0
- package/prompts/tui/emotion-behavioral.md +2 -0
- package/prompts/tui/emotion-center.md +12 -0
- package/voices/personaplex/OverBarn.pt +0 -0
- package/voices/personaplex/clone-voice.py +384 -0
- package/voices/personaplex/dequant-loader.py +174 -0
- package/voices/personaplex/quantize-weights.py +167 -0
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
crawlee-scraper.py — Self-contained web scraping script using Crawlee.
|
|
4
|
+
|
|
5
|
+
Supports two crawling strategies:
|
|
6
|
+
- beautifulsoup (default): Fast HTTP-based scraping via httpx + BeautifulSoup.
|
|
7
|
+
Best for static content, documentation pages, and API docs.
|
|
8
|
+
- playwright: Headless browser scraping for JS-rendered pages, SPAs, and
|
|
9
|
+
sites that require JavaScript execution to display content.
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python3 crawlee-scraper.py --url URL [--strategy beautifulsoup|playwright]
|
|
13
|
+
[--max-pages N] [--max-depth N] [--output json|text] [--extract links|text|all]
|
|
14
|
+
|
|
15
|
+
Auto-installs crawlee and playwright if not present.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import asyncio
|
|
20
|
+
import json
|
|
21
|
+
import subprocess
|
|
22
|
+
import sys
|
|
23
|
+
import importlib
|
|
24
|
+
from typing import Any
|
|
25
|
+
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
# Auto-install crawlee if missing
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
def ensure_crawlee():
    """Ensure the ``crawlee`` package is importable, installing it via pip if needed.

    The full ``crawlee[all]`` distribution is attempted first; if that pip run
    fails, the smaller ``crawlee[beautifulsoup]`` extra is tried instead.

    Returns:
        True if crawlee was already importable or one of the pip installs
        succeeded, False if both install attempts failed.
    """
    try:
        importlib.import_module("crawlee")
        return True
    except ImportError:
        pass

    print("[crawlee-scraper] Installing crawlee...", file=sys.stderr)

    first_error = None
    for requirement in ("crawlee[all]", "crawlee[beautifulsoup]"):
        try:
            # subprocess.run() drains the stderr pipe while waiting, so a chatty
            # pip cannot block on a full pipe buffer (check_call + PIPE can).
            subprocess.run(
                [sys.executable, "-m", "pip", "install", requirement],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
                check=True,
            )
        except subprocess.CalledProcessError as e:
            if first_error is None:
                first_error = e
        else:
            # The package appeared after interpreter start-up; make sure the
            # import system's finder caches notice it.
            importlib.invalidate_caches()
            return True

    print(f"[crawlee-scraper] Failed to install crawlee: {first_error}", file=sys.stderr)
    # Surface pip's own diagnostics instead of silently discarding them.
    pip_output = (first_error.stderr or b"").decode("utf-8", errors="replace").strip()
    if pip_output:
        print(pip_output, file=sys.stderr)
    return False
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def ensure_playwright():
    """Ensure playwright and its Chromium browser are installed.

    If the ``playwright`` package imports, only the Chromium download is run.
    If the import or that download fails, the package is (re)installed with
    pip and the Chromium download is attempted again.

    Returns:
        True when a Chromium install command completed successfully,
        False when the pip install or the retried browser install failed.
    """
    install_browser = [sys.executable, "-m", "playwright", "install", "chromium"]
    install_package = [sys.executable, "-m", "pip", "install", "playwright"]

    def run_quiet(command):
        # subprocess.run() reads the stderr pipe while waiting, so the child
        # cannot stall on a full pipe buffer (check_call + PIPE can).
        subprocess.run(
            command,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            check=True,
        )

    try:
        importlib.import_module("playwright")
        run_quiet(install_browser)
        return True
    except (ImportError, subprocess.CalledProcessError):
        pass  # Fall through to installing the package itself.

    try:
        run_quiet(install_package)
        importlib.invalidate_caches()
        run_quiet(install_browser)
        return True
    except subprocess.CalledProcessError as e:
        print(f"[crawlee-scraper] Failed to install playwright: {e}", file=sys.stderr)
        # Surface the failing command's own diagnostics instead of dropping them.
        command_output = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        if command_output:
            print(command_output, file=sys.stderr)
        return False
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
# BeautifulSoup crawler
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
|
|
92
|
+
async def crawl_beautifulsoup(
    start_url: str,
    max_pages: int = 5,
    max_depth: int = 1,
    extract: str = "all",
) -> list[dict[str, Any]]:
    """Crawl using BeautifulSoup (HTTP-based, fast, no JS rendering).

    Args:
        start_url: URL the crawl starts from.
        max_pages: Upper bound on processed pages; passed to the crawler as
            ``max_requests_per_crawl`` and re-checked inside the handler.
        max_depth: Same-domain link following is enabled when this is > 0.
            NOTE(review): the handler does not track per-request depth, so
            this behaves as an on/off switch rather than a true depth limit.
        extract: ``"text"``, ``"links"`` or ``"all"``; selects which optional
            fields are collected for each page.

    Returns:
        One dict per processed page with ``"url"``, ``"status"`` and
        ``"title"``, plus ``"text"`` and/or ``"links"`` depending on
        ``extract``, and ``"meta"`` when named meta tags with content exist.
    """
    # timedelta lives in datetime; the asyncio module has no such attribute.
    from datetime import timedelta

    try:
        from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
    except ImportError:
        # Newer crawlee releases appear to expose crawlers under
        # crawlee.crawlers instead (see upgrade guide) - fall back to it.
        from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
    from crawlee import ConcurrencySettings

    results: list[dict[str, Any]] = []
    pages_crawled = 0

    crawler = BeautifulSoupCrawler(
        max_request_retries=2,
        request_handler_timeout=timedelta(seconds=30),
        max_requests_per_crawl=max_pages,
        concurrency_settings=ConcurrencySettings(
            max_concurrency=3,
        ),
    )

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        nonlocal pages_crawled
        # Second guard on top of max_requests_per_crawl.
        if pages_crawled >= max_pages:
            return
        pages_crawled += 1

        soup = context.soup
        page_data: dict[str, Any] = {
            "url": context.request.url,
            # Recorded as a constant; the real response status is not read here.
            "status": 200,
        }

        # Extract title
        title_tag = soup.find("title")
        page_data["title"] = title_tag.get_text(strip=True) if title_tag else ""

        # Extract text content
        if extract in ("text", "all"):
            # Drop non-content elements. This mutates the shared soup, so with
            # extract == "all" the link extraction below no longer sees anchors
            # that lived inside nav/footer/header.
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()
            text = soup.get_text(separator="\n", strip=True)
            # Collapse whitespace: trim every line and drop the empty ones.
            lines = [line.strip() for line in text.splitlines() if line.strip()]
            page_data["text"] = "\n".join(lines)

        # Extract links (absolute http/https only)
        if extract in ("links", "all"):
            links = []
            for a_tag in soup.find_all("a", href=True):
                href = str(a_tag["href"])
                link_text = a_tag.get_text(strip=True)
                if href.startswith(("http://", "https://")):
                    links.append({"url": href, "text": link_text})
            page_data["links"] = links[:50]  # Cap at 50 links per page

        # Extract metadata, keyed by the tag's name (or property) attribute
        meta_tags = {}
        for meta in soup.find_all("meta"):
            name = meta.get("name", meta.get("property", ""))
            content = meta.get("content", "")
            if name and content:
                meta_tags[str(name)] = str(content)
        if meta_tags:
            page_data["meta"] = meta_tags

        results.append(page_data)

        # Enqueue same-domain links for further crawling if enabled
        if max_depth > 0:
            await context.enqueue_links(strategy="same-domain")

    await crawler.run([start_url])
    return results
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# ---------------------------------------------------------------------------
|
|
172
|
+
# Playwright crawler
|
|
173
|
+
# ---------------------------------------------------------------------------
|
|
174
|
+
|
|
175
|
+
async def crawl_playwright(
    start_url: str,
    max_pages: int = 5,
    max_depth: int = 1,
    extract: str = "all",
) -> list[dict[str, Any]]:
    """Crawl using Playwright (headless browser, JS rendering).

    Args:
        start_url: URL the crawl starts from.
        max_pages: Upper bound on processed pages; passed to the crawler as
            ``max_requests_per_crawl`` and re-checked inside the handler.
        max_depth: Same-domain link following is enabled when this is > 0.
            NOTE(review): the handler does not track per-request depth, so
            this behaves as an on/off switch rather than a true depth limit.
        extract: ``"text"``, ``"links"`` or ``"all"``; selects which optional
            fields are collected for each page.

    Returns:
        One dict per processed page with ``"url"``, ``"status"`` and
        ``"title"``, plus ``"text"`` and/or ``"links"`` depending on
        ``extract``, and ``"meta"`` when named meta tags with content exist.
    """
    # timedelta lives in datetime; the asyncio module has no such attribute.
    from datetime import timedelta

    try:
        from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
    except ImportError:
        # Newer crawlee releases appear to expose crawlers under
        # crawlee.crawlers instead (see upgrade guide) - fall back to it.
        from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
    from crawlee import ConcurrencySettings

    results: list[dict[str, Any]] = []
    pages_crawled = 0

    crawler = PlaywrightCrawler(
        max_request_retries=2,
        request_handler_timeout=timedelta(seconds=60),
        max_requests_per_crawl=max_pages,
        headless=True,
        browser_type="chromium",
        concurrency_settings=ConcurrencySettings(
            max_concurrency=2,
        ),
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        nonlocal pages_crawled
        # Second guard on top of max_requests_per_crawl.
        if pages_crawled >= max_pages:
            return
        pages_crawled += 1

        page = context.page
        page_data: dict[str, Any] = {
            "url": context.request.url,
            # Recorded as a constant; the real response status is not read here.
            "status": 200,
        }

        # Wait for content to load; if the network never goes idle in time,
        # settle for the DOM being parsed.
        try:
            await page.wait_for_load_state("networkidle", timeout=15000)
        except Exception:
            await page.wait_for_load_state("domcontentloaded", timeout=10000)

        # Extract title
        page_data["title"] = await page.title()

        # Extract text content
        if extract in ("text", "all"):
            # Remove non-content elements via JS. This mutates the live DOM, so
            # the link extraction below no longer sees anchors that lived
            # inside the removed elements.
            text = await page.evaluate("""() => {
                const remove = document.querySelectorAll('script, style, nav, footer, header, [role="navigation"]');
                remove.forEach(el => el.remove());
                return document.body ? document.body.innerText : '';
            }""")
            lines = [line.strip() for line in str(text).splitlines() if line.strip()]
            page_data["text"] = "\n".join(lines)

        # Extract links: filter to http(s) first, then cap at 50, so the cap
        # counts usable links (same order as the BeautifulSoup crawler).
        if extract in ("links", "all"):
            links = await page.evaluate("""() => {
                const anchors = document.querySelectorAll('a[href]');
                return Array.from(anchors).map(a => ({
                    url: a.href,
                    text: a.innerText.trim()
                })).filter(l => l.url.startsWith('http')).slice(0, 50);
            }""")
            page_data["links"] = links

        # Extract meta tags, keyed by the tag's name (or property) attribute
        meta_tags = await page.evaluate("""() => {
            const metas = document.querySelectorAll('meta[name], meta[property]');
            const result = {};
            metas.forEach(m => {
                const key = m.getAttribute('name') || m.getAttribute('property');
                const val = m.getAttribute('content');
                if (key && val) result[key] = val;
            });
            return result;
        }""")
        if meta_tags:
            page_data["meta"] = meta_tags

        results.append(page_data)

        # Enqueue same-domain links for further crawling if enabled
        if max_depth > 0:
            await context.enqueue_links(strategy="same-domain")

    await crawler.run([start_url])
    return results
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
# ---------------------------------------------------------------------------
|
|
268
|
+
# Main
|
|
269
|
+
# ---------------------------------------------------------------------------
|
|
270
|
+
|
|
271
|
+
async def main():
    """Parse the command line, make sure dependencies exist, crawl, and print.

    Failures (missing dependencies or a crawl exception) are reported as a
    JSON object with an "error" key on stdout, followed by exit status 1.
    On success the pages are printed either as a JSON document or as a
    plain-text summary, depending on --output.
    """
    cli = argparse.ArgumentParser(description="Crawlee-based web scraper")
    cli.add_argument("--url", required=True, help="Starting URL to crawl")
    cli.add_argument(
        "--strategy",
        choices=["beautifulsoup", "playwright"],
        default="beautifulsoup",
        help="Crawling strategy (default: beautifulsoup)",
    )
    cli.add_argument("--max-pages", type=int, default=5, help="Max pages to crawl (default: 5)")
    cli.add_argument("--max-depth", type=int, default=1, help="Max crawl depth (default: 1)")
    cli.add_argument(
        "--output",
        choices=["json", "text"],
        default="json",
        help="Output format (default: json)",
    )
    cli.add_argument(
        "--extract",
        choices=["links", "text", "all"],
        default="all",
        help="What to extract (default: all)",
    )
    opts = cli.parse_args()

    def abort(message):
        # Emit the error as JSON on stdout and stop with a failing status.
        print(json.dumps({"error": message}))
        sys.exit(1)

    # crawlee is required by both strategies.
    if not ensure_crawlee():
        abort("Failed to install crawlee. Install manually: pip install 'crawlee[all]'")

    # Browsers are only needed for the playwright strategy.
    use_browser = opts.strategy == "playwright"
    if use_browser and not ensure_playwright():
        abort("Failed to install playwright browsers. Install manually: playwright install chromium")

    # Run the selected crawler.
    try:
        crawl = crawl_playwright if use_browser else crawl_beautifulsoup
        pages = await crawl(
            opts.url,
            max_pages=opts.max_pages,
            max_depth=opts.max_depth,
            extract=opts.extract,
        )
    except Exception as exc:
        abort(f"Crawl failed: {str(exc)}")

    # JSON output: one document describing the whole crawl.
    if opts.output != "text":
        report = {
            "success": True,
            "strategy": opts.strategy,
            "pages_crawled": len(pages),
            "results": pages,
        }
        print(json.dumps(report, ensure_ascii=False, indent=2))
        return

    # Text output: a truncated, human-readable section per page.
    for entry in pages:
        print(f"=== {entry.get('title', 'Untitled')} ===")
        print(f"URL: {entry.get('url', '')}")
        if "text" in entry:
            print(entry["text"][:5000])
        if "links" in entry:
            found = entry["links"]
            print(f"\nLinks ({len(found)}):")
            for link in found[:20]:
                print(f" - {link.get('text', '')}: {link.get('url', '')}")
        print()
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
# Script entry point: run the async main() to completion when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
|