omnius 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/README.md +4959 -0
  2. package/dist/index.d.ts +6 -0
  3. package/dist/index.js +630665 -0
  4. package/dist/launcher.cjs +78 -0
  5. package/dist/postinstall-daemon.cjs +776 -0
  6. package/dist/preinstall.cjs +92 -0
  7. package/dist/scripts/autoresearch-prepare.py +459 -0
  8. package/dist/scripts/autoresearch-train.py +661 -0
  9. package/dist/scripts/crawlee-scraper.py +358 -0
  10. package/dist/scripts/live-nemotron.py +478 -0
  11. package/dist/scripts/live-whisper.py +242 -0
  12. package/dist/scripts/ocr-advanced.py +571 -0
  13. package/dist/scripts/start-moondream.py +112 -0
  14. package/dist/scripts/tor/UPSTREAM-README.md +148 -0
  15. package/dist/scripts/tor/destroy_tor.sh +29 -0
  16. package/dist/scripts/tor/tor_setup.sh +163 -0
  17. package/dist/scripts/transcribe-file.py +63 -0
  18. package/dist/scripts/web_scrape.py +1295 -0
  19. package/npm-shrinkwrap.json +7412 -0
  20. package/package.json +142 -0
  21. package/prompts/agentic/system-large.md +569 -0
  22. package/prompts/agentic/system-medium.md +211 -0
  23. package/prompts/agentic/system-small.md +114 -0
  24. package/prompts/compaction/context-compaction.md +44 -0
  25. package/prompts/personality/level-1-minimal.md +3 -0
  26. package/prompts/personality/level-2-concise.md +3 -0
  27. package/prompts/personality/level-4-explanatory.md +3 -0
  28. package/prompts/personality/level-5-thorough.md +3 -0
  29. package/prompts/personality/level-autist.md +3 -0
  30. package/prompts/personality/level-stark.md +3 -0
  31. package/prompts/runners/dispatcher.md +24 -0
  32. package/prompts/runners/editor.md +44 -0
  33. package/prompts/runners/evaluator.md +30 -0
  34. package/prompts/runners/merge-summary.md +9 -0
  35. package/prompts/runners/normalizer.md +23 -0
  36. package/prompts/runners/planner.md +33 -0
  37. package/prompts/runners/scout.md +39 -0
  38. package/prompts/runners/verifier.md +36 -0
  39. package/prompts/skill-builder/seed-analysis.md +30 -0
  40. package/prompts/skill-builder/skill-expansion.md +76 -0
  41. package/prompts/skill-builder/skill-validation.md +31 -0
  42. package/prompts/templates/analysis.md +14 -0
  43. package/prompts/templates/code-review.md +16 -0
  44. package/prompts/templates/code.md +13 -0
  45. package/prompts/templates/document.md +13 -0
  46. package/prompts/templates/error-diagnosis.md +14 -0
  47. package/prompts/templates/general.md +9 -0
  48. package/prompts/templates/plan.md +15 -0
  49. package/prompts/templates/system.md +16 -0
  50. package/prompts/tui/dmn-gather.md +128 -0
  51. package/prompts/tui/dream-consolidate.md +48 -0
  52. package/prompts/tui/dream-lucid-eval.md +17 -0
  53. package/prompts/tui/dream-lucid-implement.md +14 -0
  54. package/prompts/tui/dream-stages.md +19 -0
  55. package/prompts/tui/emotion-behavioral.md +2 -0
  56. package/prompts/tui/emotion-center.md +12 -0
  57. package/voices/personaplex/OverBarn.pt +0 -0
  58. package/voices/personaplex/clone-voice.py +384 -0
  59. package/voices/personaplex/dequant-loader.py +174 -0
  60. package/voices/personaplex/quantize-weights.py +167 -0
@@ -0,0 +1,358 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ crawlee-scraper.py — Self-contained web scraping script using Crawlee.
4
+
5
+ Supports two crawling strategies:
6
+ - beautifulsoup (default): Fast HTTP-based scraping via httpx + BeautifulSoup.
7
+ Best for static content, documentation pages, and API docs.
8
+ - playwright: Headless browser scraping for JS-rendered pages, SPAs, and
9
+ sites that require JavaScript execution to display content.
10
+
11
+ Usage:
12
+ python3 crawlee-scraper.py --url URL [--strategy beautifulsoup|playwright]
13
+ [--max-pages N] [--max-depth N] [--output json|text] [--extract links|text|all]
14
+
15
+ Auto-installs crawlee and playwright if not present.
16
+ """
17
+
18
+ import argparse
19
+ import asyncio
20
+ import json
21
+ import subprocess
22
+ import sys
23
+ import importlib
24
+ from typing import Any
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Auto-install crawlee if missing
28
+ # ---------------------------------------------------------------------------
29
+
30
def ensure_crawlee():
    """Ensure the ``crawlee`` package is importable, installing it if needed.

    If ``crawlee`` cannot be imported, pip is invoked through the current
    interpreter (``sys.executable -m pip``). The full ``crawlee[all]`` extra is
    tried first, then the smaller ``crawlee[beautifulsoup]`` extra.

    Returns:
        True if crawlee was already importable or one of the installs
        succeeded, False if both install attempts failed.
    """
    try:
        importlib.import_module("crawlee")
        return True
    except ImportError:
        pass

    print("[crawlee-scraper] Installing crawlee...", file=sys.stderr)

    first_error = None
    # Try the full install first, then the minimal (beautifulsoup-only) one.
    for requirement in ("crawlee[all]", "crawlee[beautifulsoup]"):
        try:
            # Both streams are discarded: stdout must stay clean for the JSON
            # result, and an undrained stderr=PIPE with check_call can block
            # the child once the OS pipe buffer fills up.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", requirement],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except subprocess.CalledProcessError as e:
            if first_error is None:
                first_error = e
            continue
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()
        return True

    # Report the failure of the first (full) install attempt.
    print(f"[crawlee-scraper] Failed to install crawlee: {first_error}", file=sys.stderr)
    return False
58
+
59
+
60
def ensure_playwright():
    """Ensure playwright and its Chromium browser are installed.

    First tries to import ``playwright`` and run ``playwright install
    chromium``. If the import or that command fails, installs the
    ``playwright`` package with pip and retries the browser install.

    Returns:
        True if the browser install command succeeded, False if the pip
        install or the retried browser install failed.
    """
    install_browser = [sys.executable, "-m", "playwright", "install", "chromium"]
    # Output is discarded rather than piped: check_call never reads a PIPE, so
    # a verbose child could block once the OS pipe buffer fills up.
    quiet = {"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL}

    try:
        importlib.import_module("playwright")
        subprocess.check_call(install_browser, **quiet)
        return True
    except (ImportError, subprocess.CalledProcessError):
        pass  # fall through to a fresh pip install below

    try:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "playwright"],
            **quiet,
        )
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()
        subprocess.check_call(install_browser, **quiet)
        return True
    except subprocess.CalledProcessError as e:
        print(f"[crawlee-scraper] Failed to install playwright: {e}", file=sys.stderr)
        return False
86
+
87
+
88
+ # ---------------------------------------------------------------------------
89
+ # BeautifulSoup crawler
90
+ # ---------------------------------------------------------------------------
91
+
92
async def crawl_beautifulsoup(
    start_url: str,
    max_pages: int = 5,
    max_depth: int = 1,
    extract: str = "all",
) -> list[dict[str, Any]]:
    """Crawl using BeautifulSoup (HTTP-based, fast, no JS rendering).

    Args:
        start_url: URL the crawl starts from.
        max_pages: Upper bound on the number of pages recorded; also passed to
            the crawler as ``max_requests_per_crawl``.
        max_depth: How many link hops from ``start_url`` may be followed.
            ``0`` means only the start page is processed.
        extract: ``"text"``, ``"links"`` or ``"all"``; selects which optional
            fields are added to each page record.

    Returns:
        One dict per processed page with ``url``, ``status`` and ``title``,
        plus ``text``, ``links`` (absolute http/https only, at most 50) and
        ``meta`` when requested/available.
    """
    # asyncio has no ``timedelta``; the timeout must come from ``datetime``.
    from datetime import timedelta

    from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
    from crawlee import ConcurrencySettings

    results: list[dict[str, Any]] = []
    pages_crawled = 0

    crawler = BeautifulSoupCrawler(
        max_request_retries=2,
        request_handler_timeout=timedelta(seconds=30),
        max_requests_per_crawl=max_pages,
        concurrency_settings=ConcurrencySettings(
            max_concurrency=3,
        ),
    )

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        nonlocal pages_crawled
        # Extra guard on top of max_requests_per_crawl.
        if pages_crawled >= max_pages:
            return
        pages_crawled += 1

        soup = context.soup
        page_data: dict[str, Any] = {
            "url": context.request.url,
            "status": 200,
        }

        # Extract title
        title_tag = soup.find("title")
        page_data["title"] = title_tag.get_text(strip=True) if title_tag else ""

        # Extract text content
        if extract in ("text", "all"):
            # Remove script/style and page-chrome elements before reading text.
            # NOTE: this mutates the soup, so links inside nav/footer/header
            # are no longer present for the link extraction below.
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()
            text = soup.get_text(separator="\n", strip=True)
            # Collapse whitespace: drop blank lines, trim the rest.
            lines = [line.strip() for line in text.splitlines() if line.strip()]
            page_data["text"] = "\n".join(lines)

        # Extract links
        if extract in ("links", "all"):
            links = []
            for a_tag in soup.find_all("a", href=True):
                href = str(a_tag["href"])
                link_text = a_tag.get_text(strip=True)
                if href.startswith(("http://", "https://")):
                    links.append({"url": href, "text": link_text})
            page_data["links"] = links[:50]  # Cap at 50 links per page

        # Extract metadata (name= or property= attributes with content)
        meta_tags = {}
        for meta in soup.find_all("meta"):
            name = meta.get("name", meta.get("property", ""))
            content = meta.get("content", "")
            if name and content:
                meta_tags[str(name)] = str(content)
        if meta_tags:
            page_data["meta"] = meta_tags

        results.append(page_data)

        # Enqueue links only while this page is shallower than max_depth.
        # crawl_depth is read defensively: if the installed crawlee Request
        # does not expose it, depth is treated as 0 and the check reduces to
        # the plain ``max_depth > 0`` test.
        current_depth = getattr(context.request, "crawl_depth", 0) or 0
        if max_depth > 0 and current_depth < max_depth:
            await context.enqueue_links(strategy="same-domain")

    await crawler.run([start_url])
    return results
169
+
170
+
171
+ # ---------------------------------------------------------------------------
172
+ # Playwright crawler
173
+ # ---------------------------------------------------------------------------
174
+
175
async def crawl_playwright(
    start_url: str,
    max_pages: int = 5,
    max_depth: int = 1,
    extract: str = "all",
) -> list[dict[str, Any]]:
    """Crawl using Playwright (headless browser, JS rendering).

    Args:
        start_url: URL the crawl starts from.
        max_pages: Upper bound on the number of pages recorded; also passed to
            the crawler as ``max_requests_per_crawl``.
        max_depth: How many link hops from ``start_url`` may be followed.
            ``0`` means only the start page is processed.
        extract: ``"text"``, ``"links"`` or ``"all"``; selects which optional
            fields are added to each page record.

    Returns:
        One dict per processed page with ``url``, ``status`` and ``title``,
        plus ``text``, ``links`` and ``meta`` when requested/available.
    """
    # asyncio has no ``timedelta``; the timeout must come from ``datetime``.
    from datetime import timedelta

    from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
    from crawlee import ConcurrencySettings

    results: list[dict[str, Any]] = []
    pages_crawled = 0

    crawler = PlaywrightCrawler(
        max_request_retries=2,
        request_handler_timeout=timedelta(seconds=60),
        max_requests_per_crawl=max_pages,
        headless=True,
        browser_type="chromium",
        concurrency_settings=ConcurrencySettings(
            max_concurrency=2,
        ),
    )

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        nonlocal pages_crawled
        # Extra guard on top of max_requests_per_crawl.
        if pages_crawled >= max_pages:
            return
        pages_crawled += 1

        page = context.page
        page_data: dict[str, Any] = {
            "url": context.request.url,
            "status": 200,
        }

        # Wait for content to load; fall back to the weaker DOM-ready state
        # when the network never goes idle within the timeout.
        try:
            await page.wait_for_load_state("networkidle", timeout=15000)
        except Exception:
            await page.wait_for_load_state("domcontentloaded", timeout=10000)

        # Extract title
        page_data["title"] = await page.title()

        # Extract text content
        if extract in ("text", "all"):
            # Remove non-content elements via JS. NOTE: this mutates the live
            # DOM, so anchors inside removed elements are gone for the link
            # extraction below.
            text = await page.evaluate("""() => {
                const remove = document.querySelectorAll('script, style, nav, footer, header, [role="navigation"]');
                remove.forEach(el => el.remove());
                return document.body ? document.body.innerText : '';
            }""")
            lines = [line.strip() for line in str(text).splitlines() if line.strip()]
            page_data["text"] = "\n".join(lines)

        # Extract links (first 50 anchors, then keep only http(s) URLs)
        if extract in ("links", "all"):
            links = await page.evaluate("""() => {
                const anchors = document.querySelectorAll('a[href]');
                return Array.from(anchors).slice(0, 50).map(a => ({
                    url: a.href,
                    text: a.innerText.trim()
                })).filter(l => l.url.startsWith('http'));
            }""")
            page_data["links"] = links

        # Extract meta tags
        meta_tags = await page.evaluate("""() => {
            const metas = document.querySelectorAll('meta[name], meta[property]');
            const result = {};
            metas.forEach(m => {
                const key = m.getAttribute('name') || m.getAttribute('property');
                const val = m.getAttribute('content');
                if (key && val) result[key] = val;
            });
            return result;
        }""")
        if meta_tags:
            page_data["meta"] = meta_tags

        results.append(page_data)

        # Enqueue links only while this page is shallower than max_depth.
        # crawl_depth is read defensively: if the installed crawlee Request
        # does not expose it, depth is treated as 0 and the check reduces to
        # the plain ``max_depth > 0`` test.
        current_depth = getattr(context.request, "crawl_depth", 0) or 0
        if max_depth > 0 and current_depth < max_depth:
            await context.enqueue_links(strategy="same-domain")

    await crawler.run([start_url])
    return results
265
+
266
+
267
+ # ---------------------------------------------------------------------------
268
+ # Main
269
+ # ---------------------------------------------------------------------------
270
+
271
async def main():
    """Parse CLI arguments, run the selected crawler and print the results.

    Failures (dependency install or crawl errors) are reported as a JSON
    object with an ``error`` key on stdout, followed by exit status 1.
    On success the pages are printed either as plain text or as a JSON
    document, depending on ``--output``.
    """
    cli = argparse.ArgumentParser(description="Crawlee-based web scraper")
    cli.add_argument("--url", required=True, help="Starting URL to crawl")
    cli.add_argument(
        "--strategy",
        choices=["beautifulsoup", "playwright"],
        default="beautifulsoup",
        help="Crawling strategy (default: beautifulsoup)",
    )
    cli.add_argument("--max-pages", type=int, default=5, help="Max pages to crawl (default: 5)")
    cli.add_argument("--max-depth", type=int, default=1, help="Max crawl depth (default: 1)")
    cli.add_argument(
        "--output",
        choices=["json", "text"],
        default="json",
        help="Output format (default: json)",
    )
    cli.add_argument(
        "--extract",
        choices=["links", "text", "all"],
        default="all",
        help="What to extract (default: all)",
    )
    opts = cli.parse_args()

    def abort(message):
        # Emit the error as JSON on stdout and terminate with status 1.
        print(json.dumps({"error": message}))
        sys.exit(1)

    # Dependencies: crawlee always, playwright browsers only when requested.
    if not ensure_crawlee():
        abort("Failed to install crawlee. Install manually: pip install 'crawlee[all]'")

    wants_browser = opts.strategy == "playwright"
    if wants_browser and not ensure_playwright():
        abort("Failed to install playwright browsers. Install manually: playwright install chromium")

    # Run the crawler matching the chosen strategy.
    try:
        crawl = crawl_playwright if wants_browser else crawl_beautifulsoup
        pages = await crawl(
            opts.url,
            max_pages=opts.max_pages,
            max_depth=opts.max_depth,
            extract=opts.extract,
        )
    except Exception as exc:
        abort(f"Crawl failed: {str(exc)}")

    # JSON output (the default)
    if opts.output != "text":
        summary = {
            "success": True,
            "strategy": opts.strategy,
            "pages_crawled": len(pages),
            "results": pages,
        }
        print(json.dumps(summary, ensure_ascii=False, indent=2))
        return

    # Plain-text output: text truncated to 5000 chars, at most 20 links shown.
    for entry in pages:
        print(f"=== {entry.get('title', 'Untitled')} ===")
        print(f"URL: {entry.get('url', '')}")
        if "text" in entry:
            print(entry["text"][:5000])
        if "links" in entry:
            print(f"\nLinks ({len(entry['links'])}):")
            for item in entry["links"][:20]:
                print(f" - {item.get('text', '')}: {item.get('url', '')}")
        print()
355
+
356
+
357
+ # Script entry point: when executed directly, run the async main() via asyncio.run.
+ if __name__ == "__main__":
358
+ asyncio.run(main())