pythonclaw 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. pythonclaw/__init__.py +17 -0
  2. pythonclaw/__main__.py +6 -0
  3. pythonclaw/channels/discord_bot.py +231 -0
  4. pythonclaw/channels/telegram_bot.py +236 -0
  5. pythonclaw/config.py +190 -0
  6. pythonclaw/core/__init__.py +25 -0
  7. pythonclaw/core/agent.py +773 -0
  8. pythonclaw/core/compaction.py +220 -0
  9. pythonclaw/core/knowledge/rag.py +93 -0
  10. pythonclaw/core/llm/anthropic_client.py +107 -0
  11. pythonclaw/core/llm/base.py +26 -0
  12. pythonclaw/core/llm/gemini_client.py +139 -0
  13. pythonclaw/core/llm/openai_compatible.py +39 -0
  14. pythonclaw/core/llm/response.py +57 -0
  15. pythonclaw/core/memory/manager.py +120 -0
  16. pythonclaw/core/memory/storage.py +164 -0
  17. pythonclaw/core/persistent_agent.py +103 -0
  18. pythonclaw/core/retrieval/__init__.py +6 -0
  19. pythonclaw/core/retrieval/chunker.py +78 -0
  20. pythonclaw/core/retrieval/dense.py +152 -0
  21. pythonclaw/core/retrieval/fusion.py +51 -0
  22. pythonclaw/core/retrieval/reranker.py +112 -0
  23. pythonclaw/core/retrieval/retriever.py +166 -0
  24. pythonclaw/core/retrieval/sparse.py +69 -0
  25. pythonclaw/core/session_store.py +269 -0
  26. pythonclaw/core/skill_loader.py +322 -0
  27. pythonclaw/core/skillhub.py +290 -0
  28. pythonclaw/core/tools.py +622 -0
  29. pythonclaw/core/utils.py +64 -0
  30. pythonclaw/daemon.py +221 -0
  31. pythonclaw/init.py +61 -0
  32. pythonclaw/main.py +489 -0
  33. pythonclaw/onboard.py +290 -0
  34. pythonclaw/scheduler/cron.py +310 -0
  35. pythonclaw/scheduler/heartbeat.py +178 -0
  36. pythonclaw/server.py +145 -0
  37. pythonclaw/session_manager.py +104 -0
  38. pythonclaw/templates/persona/demo_persona.md +2 -0
  39. pythonclaw/templates/skills/communication/CATEGORY.md +4 -0
  40. pythonclaw/templates/skills/communication/email/SKILL.md +54 -0
  41. pythonclaw/templates/skills/communication/email/__pycache__/send_email.cpython-311.pyc +0 -0
  42. pythonclaw/templates/skills/communication/email/send_email.py +88 -0
  43. pythonclaw/templates/skills/data/CATEGORY.md +4 -0
  44. pythonclaw/templates/skills/data/csv_analyzer/SKILL.md +51 -0
  45. pythonclaw/templates/skills/data/csv_analyzer/__pycache__/analyze.cpython-311.pyc +0 -0
  46. pythonclaw/templates/skills/data/csv_analyzer/analyze.py +138 -0
  47. pythonclaw/templates/skills/data/finance/SKILL.md +41 -0
  48. pythonclaw/templates/skills/data/finance/__pycache__/fetch_quote.cpython-311.pyc +0 -0
  49. pythonclaw/templates/skills/data/finance/fetch_quote.py +118 -0
  50. pythonclaw/templates/skills/data/news/SKILL.md +39 -0
  51. pythonclaw/templates/skills/data/news/__pycache__/search_news.cpython-311.pyc +0 -0
  52. pythonclaw/templates/skills/data/news/search_news.py +57 -0
  53. pythonclaw/templates/skills/data/pdf_reader/SKILL.md +40 -0
  54. pythonclaw/templates/skills/data/pdf_reader/__pycache__/read_pdf.cpython-311.pyc +0 -0
  55. pythonclaw/templates/skills/data/pdf_reader/read_pdf.py +113 -0
  56. pythonclaw/templates/skills/data/scraper/SKILL.md +39 -0
  57. pythonclaw/templates/skills/data/scraper/__pycache__/scrape.cpython-311.pyc +0 -0
  58. pythonclaw/templates/skills/data/scraper/scrape.py +92 -0
  59. pythonclaw/templates/skills/data/weather/SKILL.md +42 -0
  60. pythonclaw/templates/skills/data/weather/__pycache__/weather.cpython-311.pyc +0 -0
  61. pythonclaw/templates/skills/data/weather/weather.py +142 -0
  62. pythonclaw/templates/skills/data/youtube/SKILL.md +43 -0
  63. pythonclaw/templates/skills/data/youtube/__pycache__/youtube_info.cpython-311.pyc +0 -0
  64. pythonclaw/templates/skills/data/youtube/youtube_info.py +167 -0
  65. pythonclaw/templates/skills/dev/CATEGORY.md +4 -0
  66. pythonclaw/templates/skills/dev/code_runner/SKILL.md +46 -0
  67. pythonclaw/templates/skills/dev/code_runner/__pycache__/run_code.cpython-311.pyc +0 -0
  68. pythonclaw/templates/skills/dev/code_runner/run_code.py +117 -0
  69. pythonclaw/templates/skills/dev/github/SKILL.md +52 -0
  70. pythonclaw/templates/skills/dev/github/__pycache__/gh.cpython-311.pyc +0 -0
  71. pythonclaw/templates/skills/dev/github/gh.py +165 -0
  72. pythonclaw/templates/skills/dev/http_request/SKILL.md +40 -0
  73. pythonclaw/templates/skills/dev/http_request/__pycache__/request.cpython-311.pyc +0 -0
  74. pythonclaw/templates/skills/dev/http_request/request.py +90 -0
  75. pythonclaw/templates/skills/google/CATEGORY.md +4 -0
  76. pythonclaw/templates/skills/google/workspace/SKILL.md +98 -0
  77. pythonclaw/templates/skills/google/workspace/check_setup.sh +52 -0
  78. pythonclaw/templates/skills/meta/CATEGORY.md +4 -0
  79. pythonclaw/templates/skills/meta/skill_creator/SKILL.md +151 -0
  80. pythonclaw/templates/skills/system/CATEGORY.md +4 -0
  81. pythonclaw/templates/skills/system/change_persona/SKILL.md +41 -0
  82. pythonclaw/templates/skills/system/change_setting/SKILL.md +65 -0
  83. pythonclaw/templates/skills/system/change_setting/__pycache__/update_config.cpython-311.pyc +0 -0
  84. pythonclaw/templates/skills/system/change_setting/update_config.py +129 -0
  85. pythonclaw/templates/skills/system/change_soul/SKILL.md +41 -0
  86. pythonclaw/templates/skills/system/onboarding/SKILL.md +63 -0
  87. pythonclaw/templates/skills/system/onboarding/__pycache__/write_identity.cpython-311.pyc +0 -0
  88. pythonclaw/templates/skills/system/onboarding/write_identity.py +218 -0
  89. pythonclaw/templates/skills/system/random/SKILL.md +33 -0
  90. pythonclaw/templates/skills/system/random/__pycache__/random_util.cpython-311.pyc +0 -0
  91. pythonclaw/templates/skills/system/random/random_util.py +45 -0
  92. pythonclaw/templates/skills/system/time/SKILL.md +33 -0
  93. pythonclaw/templates/skills/system/time/__pycache__/time_util.cpython-311.pyc +0 -0
  94. pythonclaw/templates/skills/system/time/time_util.py +81 -0
  95. pythonclaw/templates/skills/text/CATEGORY.md +4 -0
  96. pythonclaw/templates/skills/text/translator/SKILL.md +47 -0
  97. pythonclaw/templates/skills/text/translator/__pycache__/translate.cpython-311.pyc +0 -0
  98. pythonclaw/templates/skills/text/translator/translate.py +66 -0
  99. pythonclaw/templates/skills/web/CATEGORY.md +4 -0
  100. pythonclaw/templates/skills/web/tavily/SKILL.md +61 -0
  101. pythonclaw/templates/soul/SOUL.md +54 -0
  102. pythonclaw/web/__init__.py +1 -0
  103. pythonclaw/web/app.py +585 -0
  104. pythonclaw/web/static/favicon.png +0 -0
  105. pythonclaw/web/static/index.html +1318 -0
  106. pythonclaw/web/static/logo.png +0 -0
  107. pythonclaw-0.2.0.dist-info/METADATA +410 -0
  108. pythonclaw-0.2.0.dist-info/RECORD +112 -0
  109. pythonclaw-0.2.0.dist-info/WHEEL +5 -0
  110. pythonclaw-0.2.0.dist-info/entry_points.txt +2 -0
  111. pythonclaw-0.2.0.dist-info/licenses/LICENSE +21 -0
  112. pythonclaw-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,138 @@
1
+ #!/usr/bin/env python3
2
+ """Analyze CSV/Excel files with pandas."""
3
+
4
+ import argparse
5
+ import json
6
+ import sys
7
+
8
+ try:
9
+ import pandas as pd
10
+ except ImportError:
11
+ print("Error: pandas not installed. Run: pip install pandas openpyxl",
12
+ file=sys.stderr)
13
+ sys.exit(1)
14
+
15
+
16
+ def load_data(path: str, columns: str | None = None) -> pd.DataFrame:
17
+ ext = path.rsplit(".", 1)[-1].lower()
18
+ if ext in ("xls", "xlsx"):
19
+ df = pd.read_excel(path)
20
+ elif ext == "tsv":
21
+ df = pd.read_csv(path, sep="\t")
22
+ else:
23
+ df = pd.read_csv(path)
24
+ if columns:
25
+ cols = [c.strip() for c in columns.split(",")]
26
+ df = df[cols]
27
+ return df
28
+
29
+
30
def cmd_info(df: pd.DataFrame, as_json: bool) -> None:
    """Print shape, per-column dtype/missing counts, and memory usage."""
    columns = []
    for name in df.columns:
        series = df[name]
        columns.append({
            "name": name,
            "dtype": str(series.dtype),
            "missing": int(series.isna().sum()),
        })
    info = {
        "shape": list(df.shape),
        "columns": columns,
        "memoryMB": round(df.memory_usage(deep=True).sum() / 1e6, 2),
    }
    if as_json:
        print(json.dumps(info, indent=2))
        return
    n_rows, n_cols = df.shape
    print(f"Shape: {n_rows} rows x {n_cols} columns")
    print(f"Memory: {info['memoryMB']} MB\n")
    print(f"{'Column':<30} {'Type':<15} {'Missing'}")
    print("-" * 55)
    for entry in columns:
        print(f"{entry['name']:<30} {entry['dtype']:<15} {entry['missing']}")
48
+
49
+
50
def cmd_head(df: pd.DataFrame, rows: int, as_json: bool) -> None:
    """Print the first *rows* rows as JSON records or aligned text."""
    top = df.head(rows)
    if as_json:
        rendered = top.to_json(orient="records", indent=2, force_ascii=False)
    else:
        rendered = top.to_string(index=False)
    print(rendered)
56
+
57
+
58
def cmd_stats(df: pd.DataFrame, as_json: bool) -> None:
    """Print describe() statistics for the numeric columns, if any."""
    numeric = df.select_dtypes(include="number")
    if numeric.empty:
        print("No numeric columns found.")
        return
    summary = numeric.describe()
    print(summary.to_json(indent=2) if as_json else summary.to_string())
68
+
69
+
70
def cmd_query(df: pd.DataFrame, expr: str, rows: int, as_json: bool) -> None:
    """Filter rows with DataFrame.query and print the first *rows* matches.

    Args:
        df: Source data.
        expr: pandas query expression (e.g. "age > 30").
        rows: Maximum number of matched rows to display.
        as_json: Emit JSON records instead of aligned text.
    """
    result = df.query(expr)
    subset = result.head(rows)
    summary = f"Matched {len(result)} rows (showing first {min(rows, len(result))}):\n"
    if as_json:
        # Fix: keep stdout machine-readable. Previously the human summary
        # was printed to stdout ahead of the JSON payload, so piping the
        # output to a JSON parser failed; route the summary to stderr.
        print(summary, file=sys.stderr)
        print(subset.to_json(orient="records", indent=2, force_ascii=False))
    else:
        print(summary)
        print(subset.to_string(index=False))
78
+
79
+
80
def cmd_groupby(df: pd.DataFrame, col: str, agg: str, as_json: bool) -> None:
    """Group by *col* and aggregate every other numeric column with *agg*.

    Prints the aggregated table as JSON records or aligned text. Reports a
    readable error instead of a traceback when *col* does not exist or
    there is nothing numeric to aggregate.

    Args:
        df: Source data.
        col: Column to group on (excluded from the aggregated columns).
        agg: Aggregation name understood by DataFrame.agg ("mean", "sum", ...).
        as_json: Emit JSON records instead of aligned text.
    """
    if col not in df.columns:
        # Fix: previously an unknown column surfaced as a raw KeyError
        # traceback from groupby; fail with a friendly message instead.
        print(f"Error: column '{col}' not found.", file=sys.stderr)
        return
    numeric = df.select_dtypes(include="number").columns.tolist()
    if col in numeric:
        numeric.remove(col)
    if not numeric:
        print("No numeric columns to aggregate.")
        return
    result = df.groupby(col)[numeric].agg(agg).reset_index()
    if as_json:
        print(result.to_json(orient="records", indent=2, force_ascii=False))
    else:
        print(result.to_string(index=False))
92
+
93
+
94
def main():
    """CLI entry point: parse arguments, load the file, dispatch a command."""
    parser = argparse.ArgumentParser(description="Analyze CSV/Excel files.")
    parser.add_argument("path", help="Data file path (.csv, .tsv, .xlsx)")
    parser.add_argument("command", nargs="?", default="info",
                        choices=["info", "head", "stats", "query", "groupby", "columns"])
    parser.add_argument("--rows", type=int, default=10)
    parser.add_argument("--query", dest="expr", default=None)
    parser.add_argument("--groupby", default=None)
    parser.add_argument("--agg", default="mean",
                        choices=["mean", "sum", "count", "min", "max"])
    parser.add_argument("--columns", default=None)
    parser.add_argument("--format", choices=["text", "json"], default="text")
    args = parser.parse_args()

    try:
        df = load_data(args.path, args.columns)
    except Exception as exc:
        print(f"Error loading {args.path}: {exc}", file=sys.stderr)
        sys.exit(1)

    as_json = args.format == "json"
    command = args.command

    # Validate command-specific required options up front; messages and
    # exit codes match the per-branch checks they replace.
    if command == "query" and not args.expr:
        print("Error: --query expression required.", file=sys.stderr)
        sys.exit(1)
    if command == "groupby" and not args.groupby:
        print("Error: --groupby column required.", file=sys.stderr)
        sys.exit(1)

    if command == "info":
        cmd_info(df, as_json)
    elif command == "head":
        cmd_head(df, args.rows, as_json)
    elif command == "stats":
        cmd_stats(df, as_json)
    elif command == "query":
        cmd_query(df, args.expr, args.rows, as_json)
    elif command == "groupby":
        cmd_groupby(df, args.groupby, args.agg, as_json)
    elif command == "columns":
        for name in df.columns:
            print(f"  {name} ({df[name].dtype})")


if __name__ == "__main__":
    main()
@@ -0,0 +1,41 @@
1
+ ---
2
+ name: finance
3
+ description: >
4
+ Fetch stock quotes, crypto prices, forex rates, and financial data.
5
+ Use when the user asks about any stock price, market data, company
6
+ financials, or cryptocurrency price.
7
+ ---
8
+
9
+ ## Instructions
10
+
11
+ Fetch real-time financial data for stocks, crypto, and forex using
12
+ Yahoo Finance (via the `yfinance` library).
13
+
14
+ ### Prerequisites
15
+
16
+ Install the dependency: `pip install yfinance`
17
+
18
+ No API key needed — Yahoo Finance is free.
19
+
20
+ ### Usage
21
+
22
+ ```bash
23
+ python {skill_path}/fetch_quote.py SYMBOL [SYMBOL2 ...]
24
+ ```
25
+
26
+ Options:
27
+ - `--format json` — output as JSON (default: human-readable text)
28
+ - `--history 5d` — include price history (1d, 5d, 1mo, 3mo, 6mo, 1y, 5y, max)
29
+
30
+ ### Examples
31
+
32
+ - "What's Tesla's stock price?" → `python {skill_path}/fetch_quote.py TSLA`
33
+ - "Compare AAPL and MSFT" → `python {skill_path}/fetch_quote.py AAPL MSFT`
34
+ - "Show Bitcoin price" → `python {skill_path}/fetch_quote.py BTC-USD`
35
+ - "EUR/USD exchange rate" → `python {skill_path}/fetch_quote.py EURUSD=X`
36
+
37
+ ## Resources
38
+
39
+ | File | Description |
40
+ |------|-------------|
41
+ | `fetch_quote.py` | Multi-symbol financial data fetcher |
@@ -0,0 +1,118 @@
1
+ #!/usr/bin/env python3
2
+ """Fetch financial quotes from Yahoo Finance."""
3
+
4
+ import argparse
5
+ import json
6
+ import sys
7
+
8
+ try:
9
+ import yfinance as yf
10
+ except ImportError:
11
+ print("Error: yfinance not installed. Run: pip install yfinance", file=sys.stderr)
12
+ sys.exit(1)
13
+
14
+
15
def fetch_quote(symbol: str, history: str | None = None) -> dict:
    """Fetch a snapshot quote (and optional price history) for one symbol.

    Args:
        symbol: Ticker symbol understood by Yahoo Finance (e.g. "TSLA",
            "BTC-USD", "EURUSD=X").
        history: Optional period string ("1d", "5d", "1mo", ...); when set,
            a "history" list of daily close/volume records is attached.

    Returns:
        Dict of quote fields; fields missing from the upstream data are None.
    """
    ticker = yf.Ticker(symbol)
    info = ticker.info

    quote = {
        "symbol": symbol.upper(),
        "name": info.get("shortName") or info.get("longName", symbol),
        "price": info.get("currentPrice") or info.get("regularMarketPrice"),
        "currency": info.get("currency", "USD"),
    }
    # The remaining output fields map 1:1 onto yfinance info keys.
    field_map = {
        "change": "regularMarketChange",
        "changePercent": "regularMarketChangePercent",
        "dayHigh": "dayHigh",
        "dayLow": "dayLow",
        "volume": "volume",
        "marketCap": "marketCap",
        "fiftyTwoWeekHigh": "fiftyTwoWeekHigh",
        "fiftyTwoWeekLow": "fiftyTwoWeekLow",
    }
    for out_key, info_key in field_map.items():
        quote[out_key] = info.get(info_key)

    if history:
        frame = ticker.history(period=history)
        if not frame.empty:
            quote["history"] = [
                {
                    "date": ts.strftime("%Y-%m-%d"),
                    "close": round(day["Close"], 2),
                    "volume": int(day["Volume"]),
                }
                for ts, day in frame.iterrows()
            ]

    return quote
47
+
48
+
49
def format_text(data: dict) -> str:
    """Render a quote dict from fetch_quote as human-readable lines.

    Args:
        data: Quote fields; missing/None fields are skipped gracefully.

    Returns:
        Multi-line summary string (name/price, volume, market cap, day and
        52-week ranges, and up to 5 trailing history points).
    """
    lines = [f"{data['name']} ({data['symbol']})"]

    price = data.get("price")
    if price is not None:
        ccy = data.get("currency", "")
        change = data.get("change")
        pct = data.get("changePercent")
        change_str = ""
        if change is not None and pct is not None:
            sign = "+" if change >= 0 else ""
            change_str = f" {sign}{change:.2f} ({sign}{pct:.2f}%)"
        lines.append(f"  Price: {ccy} {price:.2f}{change_str}")

    # Fix: the original table carried dead ("Day Range", None) and
    # ("52-Week", None) entries that the `if key` guard always skipped —
    # those ranges are rendered explicitly further below. Only the two
    # live entries remain. Large counts get T/B/M suffixes.
    for label, key in [("Volume", "volume"), ("Market Cap", "marketCap")]:
        val = data.get(key)
        if val is None:
            continue
        if val >= 1e12:
            lines.append(f"  {label}: {val/1e12:.2f}T")
        elif val >= 1e9:
            lines.append(f"  {label}: {val/1e9:.2f}B")
        elif val >= 1e6:
            lines.append(f"  {label}: {val/1e6:.2f}M")
        else:
            lines.append(f"  {label}: {val:,.0f}")

    lo, hi = data.get("dayLow"), data.get("dayHigh")
    if lo and hi:
        lines.append(f"  Day Range: {lo:.2f} - {hi:.2f}")

    lo52, hi52 = data.get("fiftyTwoWeekLow"), data.get("fiftyTwoWeekHigh")
    if lo52 and hi52:
        lines.append(f"  52-Week: {lo52:.2f} - {hi52:.2f}")

    if "history" in data:
        lines.append(f"  History ({len(data['history'])} points):")
        for h in data["history"][-5:]:
            lines.append(f"    {h['date']}: {h['close']}")

    return "\n".join(lines)
89
+
90
+
91
def main():
    """CLI entry point: fetch one or more symbols and print them."""
    parser = argparse.ArgumentParser(description="Fetch financial quotes.")
    parser.add_argument("symbols", nargs="+", help="Ticker symbols (e.g. TSLA AAPL BTC-USD)")
    parser.add_argument("--format", choices=["text", "json"], default="text")
    parser.add_argument("--history", default=None, help="Price history period (1d,5d,1mo,3mo,6mo,1y,5y,max)")
    args = parser.parse_args()

    results = []
    for sym in args.symbols:
        try:
            results.append(fetch_quote(sym.strip(), history=args.history))
        except Exception as exc:
            # One bad symbol should not abort the rest of the batch.
            results.append({"symbol": sym, "error": str(exc)})

    if args.format == "json":
        print(json.dumps(results, indent=2))
        return

    for data in results:
        if "error" in data:
            print(f"{data['symbol']}: Error — {data['error']}")
        else:
            print(format_text(data))
        print()


if __name__ == "__main__":
    main()
@@ -0,0 +1,39 @@
1
+ ---
2
+ name: news
3
+ description: >
4
+ Search and summarise news on any topic using web search.
5
+ Use when the user asks about recent news, current events, or wants
6
+ a news briefing on any subject.
7
+ ---
8
+
9
+ ## Instructions
10
+
11
+ Search for recent news on any topic. This skill uses the built-in
12
+ `web_search` tool (Tavily) or falls back to a script that uses
13
+ DuckDuckGo if Tavily is not configured.
14
+
15
+ ### Usage
16
+
17
+ **Option A — use the `web_search` tool directly** (preferred when Tavily is configured):
18
+
19
+ ```
20
+ web_search(query="latest news about <topic>", topic="news", max_results=10)
21
+ ```
22
+
23
+ **Option B — use the bundled script** (works without Tavily):
24
+
25
+ ```bash
26
+ python {skill_path}/search_news.py "topic" [--max 10]
27
+ ```
28
+
29
+ ### Examples
30
+
31
+ - "What's happening in the tech industry today?"
32
+ - "Give me the latest AI news"
33
+ - "News about the 2026 World Cup"
34
+
35
+ ## Resources
36
+
37
+ | File | Description |
38
+ |------|-------------|
39
+ | `search_news.py` | Fallback news search via DuckDuckGo |
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env python3
2
+ """Search for recent news using DuckDuckGo (no API key required)."""
3
+
4
+ import argparse
5
+ import json
6
+ import sys
7
+
8
+ try:
9
+ from duckduckgo_search import DDGS
10
+ except ImportError:
11
+ print("Error: duckduckgo-search not installed. Run: pip install duckduckgo-search", file=sys.stderr)
12
+ sys.exit(1)
13
+
14
+
15
def search_news(query: str, max_results: int = 10) -> list[dict]:
    """Run a DuckDuckGo news search and normalize the result records.

    Args:
        query: Free-text news query.
        max_results: Upper bound on returned items.

    Returns:
        List of dicts with title/url/source/date and a body snippet
        truncated to 300 characters; absent fields become "".
    """
    with DDGS() as ddgs:
        raw = list(ddgs.news(query, max_results=max_results))
    normalized = []
    for item in raw:
        normalized.append({
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "source": item.get("source", ""),
            "date": item.get("date", ""),
            "body": item.get("body", "")[:300],
        })
    return normalized
28
+
29
+
30
def main():
    """CLI entry point: search news and print text or JSON results."""
    parser = argparse.ArgumentParser(description="Search news on any topic.")
    parser.add_argument("query", help="News search query")
    parser.add_argument("--max", type=int, default=10, help="Max results (default: 10)")
    parser.add_argument("--format", choices=["text", "json"], default="text")
    args = parser.parse_args()

    results = search_news(args.query, max_results=args.max)

    if args.format == "json":
        print(json.dumps(results, indent=2, ensure_ascii=False))
        return
    if not results:
        print("No news found.")
        return
    for i, r in enumerate(results, 1):
        print(f"{i}. {r['title']}")
        if r["source"]:
            print(f"   Source: {r['source']}   Date: {r['date']}")
        if r["url"]:
            print(f"   {r['url']}")
        if r["body"]:
            print(f"   {r['body']}")
        print()


if __name__ == "__main__":
    main()
@@ -0,0 +1,40 @@
1
+ ---
2
+ name: pdf_reader
3
+ description: >
4
+ Extract text content from PDF files. Supports multi-page PDFs,
5
+ page-level extraction, and metadata reading. Use when the user asks
6
+ to read, extract, or analyze content from a PDF document.
7
+ ---
8
+
9
+ ## Instructions
10
+
11
+ Extract text and metadata from PDF files.
12
+
13
+ ### Prerequisites
14
+
15
+ Install dependency: `pip install PyPDF2`
16
+
17
+ ### Usage
18
+
19
+ ```bash
20
+ python {skill_path}/read_pdf.py PATH_TO_PDF [options]
21
+ ```
22
+
23
+ Options:
24
+ - `--pages 1-5` — extract only specific pages (1-indexed, supports ranges)
25
+ - `--metadata` — include PDF metadata (author, title, creation date)
26
+ - `--format json` — output as JSON
27
+ - `--summary` — show page count and character count overview only
28
+
29
+ ### Examples
30
+
31
+ - "Read this PDF" → `python {skill_path}/read_pdf.py document.pdf`
32
+ - "Extract pages 2-4 from report.pdf" → `python {skill_path}/read_pdf.py report.pdf --pages 2-4`
33
+ - "What's in this PDF?" → `python {skill_path}/read_pdf.py file.pdf --summary`
34
+ - "Get PDF metadata" → `python {skill_path}/read_pdf.py file.pdf --metadata`
35
+
36
+ ## Resources
37
+
38
+ | File | Description |
39
+ |------|-------------|
40
+ | `read_pdf.py` | PDF text extractor |
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/env python3
2
+ """Extract text and metadata from PDF files."""
3
+
4
+ import argparse
5
+ import json
6
+ import sys
7
+
8
+ try:
9
+ from PyPDF2 import PdfReader
10
+ except ImportError:
11
+ print("Error: PyPDF2 not installed. Run: pip install PyPDF2", file=sys.stderr)
12
+ sys.exit(1)
13
+
14
+
15
def parse_page_range(spec: str, total: int) -> list[int]:
    """Parse a page spec like '1-5' or '2,4,6' into sorted 0-based indices.

    Out-of-range pages are clamped (for ranges) or dropped (for single
    pages) rather than raising.

    Args:
        spec: Comma-separated 1-indexed page numbers and/or dash ranges.
        total: Total number of pages in the document.

    Returns:
        Sorted list of unique 0-based page indices.
    """
    selected: set[int] = set()
    for token in (part.strip() for part in spec.split(",")):
        if "-" in token:
            lo_s, hi_s = token.split("-", 1)
            lo = max(1, int(lo_s))
            hi = min(total, int(hi_s))
            selected.update(range(lo - 1, hi))
        else:
            page = int(token) - 1
            if 0 <= page < total:
                selected.add(page)
    return sorted(selected)
30
+
31
+
32
def extract_text(path: str, pages: list[int] | None = None) -> dict:
    """Extract text and metadata from a PDF.

    Args:
        path: Path to the PDF file.
        pages: Optional 0-based page indices; defaults to all pages.
            Out-of-range indices are silently skipped.

    Returns:
        Dict with path, totalPages, extractedPages, metadata, and a
        "pages" list of {"page": 1-based number, "text": str} records.
    """
    reader = PdfReader(path)
    total = len(reader.pages)

    if pages is None:
        pages = list(range(total))

    extracted = []
    for i in pages:
        if 0 <= i < total:
            # extract_text() can return None (e.g. image-only pages).
            text = reader.pages[i].extract_text() or ""
            extracted.append({"page": i + 1, "text": text})

    meta_raw = reader.metadata
    metadata = {}
    if meta_raw:
        for key in ("title", "author", "subject", "creator", "producer"):
            val = getattr(meta_raw, key, None)
            if val:
                metadata[key] = str(val)
        # Fix: the creation_date property parses the raw /CreationDate
        # string and can raise on malformed PDFs; the original accessed it
        # unguarded while every other field was read defensively.
        try:
            created = meta_raw.creation_date
        except Exception:
            created = None
        if created:
            metadata["created"] = str(created)

    return {
        "path": path,
        "totalPages": total,
        "extractedPages": len(extracted),
        "metadata": metadata,
        "pages": extracted,
    }
62
+
63
+
64
def main():
    """CLI entry point: extract PDF text/metadata and print it."""
    parser = argparse.ArgumentParser(description="Extract text from PDF files.")
    parser.add_argument("path", help="Path to the PDF file")
    parser.add_argument("--pages", default=None, help="Page range (e.g. '1-5' or '2,4,6')")
    parser.add_argument("--metadata", action="store_true", help="Show metadata only")
    parser.add_argument("--summary", action="store_true", help="Show summary only")
    parser.add_argument("--format", choices=["text", "json"], default="text")
    args = parser.parse_args()

    # First open validates the file and yields the page count needed to
    # resolve a --pages spec before the extraction pass.
    try:
        total = len(PdfReader(args.path).pages)
    except Exception as exc:
        print(f"Error opening PDF: {exc}", file=sys.stderr)
        sys.exit(1)

    page_indices = parse_page_range(args.pages, total) if args.pages else None
    data = extract_text(args.path, page_indices)

    if args.format == "json":
        print(json.dumps(data, indent=2, ensure_ascii=False))
        return

    if args.metadata:
        print(f"File: {args.path} ({total} pages)")
        for k, v in data["metadata"].items():
            print(f"  {k}: {v}")
        return

    if args.summary:
        total_chars = sum(len(p["text"]) for p in data["pages"])
        print(f"File: {args.path}")
        print(f"  Pages: {total}")
        print(f"  Characters: {total_chars:,}")
        for p in data["pages"]:
            print(f"  Page {p['page']}: {len(p['text']):,} chars")
        return

    print(f"File: {args.path} ({data['extractedPages']}/{total} pages)\n")
    for p in data["pages"]:
        print(f"--- Page {p['page']} ---")
        # Slicing covers both the long and short case identically.
        print(p["text"][:5000])
        print()


if __name__ == "__main__":
    main()
@@ -0,0 +1,39 @@
1
+ ---
2
+ name: web_scraper
3
+ description: >
4
+ Scrape and extract content from web pages. Supports extracting text,
5
+ links, headings, and structured data. Use when the user asks to read
6
+ a web page, extract information from a URL, or scrape website content.
7
+ ---
8
+
9
+ ## Instructions
10
+
11
+ Scrape and extract readable content from any web page.
12
+
13
+ ### Prerequisites
14
+
15
+ Install dependencies: `pip install requests beautifulsoup4`
16
+
17
+ ### Usage
18
+
19
+ ```bash
20
+ python {skill_path}/scrape.py URL [--format text|json|links|headings]
21
+ ```
22
+
23
+ Formats:
24
+ - `text` (default) — cleaned readable text
25
+ - `json` — structured JSON with title, text, links, headings
26
+ - `links` — all links on the page
27
+ - `headings` — all headings (h1–h6)
28
+
29
+ ### Examples
30
+
31
+ - "Read the content of https://example.com"
32
+ - "Extract all links from https://news.ycombinator.com"
33
+ - "What does this page say? https://some-article.com/post"
34
+
35
+ ## Resources
36
+
37
+ | File | Description |
38
+ |------|-------------|
39
+ | `scrape.py` | Generic web page scraper |
@@ -0,0 +1,92 @@
1
+ #!/usr/bin/env python3
2
+ """Scrape and extract content from a web page."""
3
+
4
+ import argparse
5
+ import json
6
+ import sys
7
+
8
+ try:
9
+ import requests
10
+ from bs4 import BeautifulSoup
11
+ except ImportError:
12
+ print(
13
+ "Error: requests and/or beautifulsoup4 not installed.\n"
14
+ "Run: pip install requests beautifulsoup4",
15
+ file=sys.stderr,
16
+ )
17
+ sys.exit(1)
18
+
19
+
20
# Request headers sent with every fetch. A desktop-Chrome User-Agent is
# presented instead of requests' default — presumably so sites that vary or
# block responses by UA serve the normal page; TODO confirm which targets
# actually require this.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
}
26
+
27
+
28
def scrape(url: str) -> dict:
    """Fetch *url* and extract title, readable text, links, and headings.

    Args:
        url: Page URL to download.

    Returns:
        Dict with url, title, text (capped at 10,000 chars), links (first
        100 absolute links as {"text", "url"}), and headings (h1-h6 as
        {"level", "text"}).

    Raises:
        requests.HTTPError: On non-2xx responses.
    """
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Drop boilerplate elements so get_text() yields mostly page copy.
    for noise in soup(["script", "style", "nav", "footer", "header"]):
        noise.decompose()

    if soup.title and soup.title.string:
        title = soup.title.string.strip()
    else:
        title = ""
    body_text = soup.get_text(separator="\n", strip=True)

    links = [
        {"text": a.get_text(strip=True), "url": a["href"]}
        for a in soup.find_all("a", href=True)
        if a["href"].startswith("http")
    ]

    headings = [
        {"level": level, "text": h.get_text(strip=True)}
        for level in range(1, 7)
        for h in soup.find_all(f"h{level}")
    ]

    return {
        "url": url,
        "title": title,
        "text": body_text[:10000],
        "links": links[:100],
        "headings": headings,
    }
58
+
59
+
60
def main():
    """CLI entry point: scrape one URL and print it in the chosen format."""
    parser = argparse.ArgumentParser(description="Scrape a web page.")
    parser.add_argument("url", help="URL to scrape")
    parser.add_argument(
        "--format",
        choices=["text", "json", "links", "headings"],
        default="text",
        help="Output format",
    )
    args = parser.parse_args()

    try:
        data = scrape(args.url)
    except Exception as exc:
        print(f"Error scraping {args.url}: {exc}", file=sys.stderr)
        sys.exit(1)

    fmt = args.format
    if fmt == "json":
        print(json.dumps(data, indent=2, ensure_ascii=False))
    elif fmt == "links":
        for link in data["links"]:
            print(f"  {link['text']} -> {link['url']}")
    elif fmt == "headings":
        for h in data["headings"]:
            pad = "  " * (h["level"] - 1)
            print(f"{pad}h{h['level']}: {h['text']}")
    else:
        print(f"Title: {data['title']}\n")
        print(data["text"][:5000])


if __name__ == "__main__":
    main()