rawfy 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rawfy/__init__.py +26 -0
- rawfy/__main__.py +5 -0
- rawfy/client.py +291 -0
- rawfy-0.1.0.dist-info/METADATA +98 -0
- rawfy-0.1.0.dist-info/RECORD +8 -0
- rawfy-0.1.0.dist-info/WHEEL +5 -0
- rawfy-0.1.0.dist-info/entry_points.txt +2 -0
- rawfy-0.1.0.dist-info/top_level.txt +1 -0
rawfy/__init__.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Rawfy — Python wrapper for the Rawfy AI agent skill.
|
|
3
|
+
|
|
4
|
+
Provides a Pythonic interface to the Rawfy CLI for fetching and
|
|
5
|
+
processing web pages into agent-readable content.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
from rawfy import fetch, metadata
|
|
9
|
+
|
|
10
|
+
# Full page fetch
|
|
11
|
+
result = fetch("https://example.com")
|
|
12
|
+
print(result)
|
|
13
|
+
|
|
14
|
+
# JSON format for structured data
|
|
15
|
+
data = fetch("https://example.com", format="json")
|
|
16
|
+
import json
|
|
17
|
+
parsed = json.loads(data)
|
|
18
|
+
|
|
19
|
+
# Metadata only (lightweight)
|
|
20
|
+
meta = metadata("https://example.com")
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
# Re-export the public API of rawfy.client at package level. `fetch_json`
# and `check_installation` are included so that the imports shown in the
# package README (`from rawfy import fetch_json`) resolve.
from rawfy.client import (
    RawfyError,
    check_installation,
    fetch,
    fetch_json,
    metadata,
)

__version__ = "0.1.0"
__all__ = [
    "fetch",
    "fetch_json",
    "metadata",
    "check_installation",
    "RawfyError",
    "__version__",
]
|
rawfy/__main__.py
ADDED
rawfy/client.py
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Rawfy Python client — subprocess wrapper around the Rawfy CLI.
|
|
3
|
+
|
|
4
|
+
This module shells out to the `rawfy` Node.js CLI and parses
|
|
5
|
+
the output. It requires Node.js >= 18 and `rawfy` to be installed
|
|
6
|
+
globally or available via npx.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import shutil
|
|
13
|
+
import subprocess
|
|
14
|
+
import sys
|
|
15
|
+
from typing import Any, Literal
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class RawfyError(Exception):
    """
    Error type raised by this module for every Rawfy failure.

    Attributes are plain instance fields:
        code: Short machine-readable error identifier, or None.
        url: The URL being processed when the error occurred, or None.
    """

    def __init__(self, message: str, code: str | None = None, url: str | None = None):
        super().__init__(message)
        self.code = code
        self.url = url

    def __repr__(self) -> str:
        # Only truthy code/url values are shown, so None and "" are omitted.
        text = f"RawfyError({self.args[0]!r}"
        if self.code:
            text += f", code={self.code!r}"
        if self.url:
            text += f", url={self.url!r}"
        return text + ")"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Values accepted for the `format` argument of fetch(); the chosen value is
# forwarded verbatim to the CLI's `--format` option.
OutputFormat = Literal["markdown", "json", "text"]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _find_rawfy_cli() -> list[str]:
    """
    Locate the rawfy CLI and return the command prefix used to invoke it.

    Search order:
    1. `rawfy` in PATH (global npm install)
    2. `npx` in PATH, invoked as `npx -y rawfy`
    3. `node` in PATH together with an existing `node_modules/.bin/rawfy`
       relative to the current working directory (project-local)

    Executables are returned as the full paths reported by
    ``shutil.which`` rather than as bare names. The subprocess
    documentation recommends fully qualified paths for reliability; this
    matters in particular on Windows, where npm installs `.cmd` shims that
    ``shutil.which`` can resolve via PATHEXT.

    Returns:
        The command prefix as a list of strings. Callers append the
        subcommand and its arguments.

    Raises:
        RawfyError: With code "CLI_NOT_FOUND" when no candidate is usable.
    """
    import os

    # 1. Globally installed rawfy executable
    rawfy_path = shutil.which("rawfy")
    if rawfy_path:
        return [rawfy_path]

    # 2. Let npx resolve the package
    npx_path = shutil.which("npx")
    if npx_path:
        return [npx_path, "-y", "rawfy"]

    # 3. Project-local install: only usable when the script really exists,
    #    otherwise the failure would surface later as an opaque node error.
    node_path = shutil.which("node")
    local_script = "node_modules/.bin/rawfy"
    if node_path and os.path.isfile(local_script):
        return [node_path, local_script]

    raise RawfyError(
        "Could not find rawfy CLI. Install it with: npm install -g rawfy",
        code="CLI_NOT_FOUND",
    )
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def fetch(
    url: str,
    *,
    format: OutputFormat = "markdown",
    vision: bool = False,
    no_playwright: bool = False,
    max_tokens: int = 50_000,
    timeout: int = 30,
) -> str:
    """
    Run `rawfy fetch` for a URL and return the CLI's standard output.

    Args:
        url: The URL to fetch and process. Must be a non-empty string that
            does not start with "-" (such a value would be parsed by the
            CLI as an option instead of a URL).
        format: Value passed to `--format` — "markdown" (default), "json",
            or "text".
        vision: When true, pass `--vision` to the CLI.
        no_playwright: When true, pass `--no-playwright` to the CLI.
        max_tokens: Value passed to `--max-tokens` (default: 50000).
        timeout: Subprocess timeout in seconds (default: 30).

    Returns:
        The CLI's stdout as a string, unmodified.

    Raises:
        RawfyError: With code "INVALID_URL" for an unusable `url`,
            "CLI_NOT_FOUND" when no CLI can be located, "NODE_NOT_FOUND"
            when the executable cannot be started, "TIMEOUT" when the
            subprocess exceeds `timeout`, or "FETCH_FAILED" when the CLI
            exits with a non-zero status.
    """
    if not isinstance(url, str) or not url or url.startswith("-"):
        raise RawfyError(
            f"Invalid URL: {url!r}",
            code="INVALID_URL",
            url=url if isinstance(url, str) else None,
        )

    cmd = [
        *_find_rawfy_cli(),
        "fetch",
        url,
        "--format",
        format,
        "--max-tokens",
        str(max_tokens),
    ]
    if vision:
        cmd.append("--vision")
    if no_playwright:
        cmd.append("--no-playwright")

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Decode explicitly instead of using the locale encoding; the
            # Node.js CLI is expected to emit UTF-8 (assumption — confirm
            # against the CLI). Undecodable bytes are replaced rather than
            # raising UnicodeDecodeError.
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except FileNotFoundError as exc:
        raise RawfyError(
            "Node.js not found. Rawfy requires Node.js >= 18.",
            code="NODE_NOT_FOUND",
        ) from exc
    except subprocess.TimeoutExpired as exc:
        raise RawfyError(
            f"Rawfy timed out after {timeout}s fetching {url}",
            code="TIMEOUT",
            url=url,
        ) from exc

    if result.returncode != 0:
        stderr = result.stderr.strip()
        raise RawfyError(
            stderr or f"rawfy fetch failed with exit code {result.returncode}",
            code="FETCH_FAILED",
            url=url,
        )

    return result.stdout
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def fetch_json(
    url: str,
    *,
    vision: bool = False,
    no_playwright: bool = False,
    max_tokens: int = 50_000,
    timeout: int = 30,
) -> dict[str, Any]:
    """
    Fetch a URL and return the CLI's JSON output as a Python dict.

    Convenience wrapper around fetch() with format="json".

    Args:
        url: The URL to fetch and process.
        vision: Forwarded to fetch().
        no_playwright: Forwarded to fetch().
        max_tokens: Forwarded to fetch() (default: 50000).
        timeout: Subprocess timeout in seconds (default: 30).

    Returns:
        The parsed JSON object. The CLI is expected to provide the keys
        metadata, content, media, interactive_elements and fetch_stats;
        this function does not validate them.

    Raises:
        RawfyError: If the fetch fails, or with code "JSON_PARSE_ERROR" if
            the output is not valid JSON or is not a JSON object.
    """
    raw = fetch(
        url,
        format="json",
        vision=vision,
        no_playwright=no_playwright,
        max_tokens=max_tokens,
        timeout=timeout,
    )

    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as e:
        # Chain the decode error so the offending position stays visible.
        raise RawfyError(
            f"Failed to parse rawfy JSON output: {e}",
            code="JSON_PARSE_ERROR",
            url=url,
        ) from e

    # Valid JSON may still be a list/string/number; callers rely on a dict.
    if not isinstance(parsed, dict):
        raise RawfyError(
            f"Expected a JSON object from rawfy, got {type(parsed).__name__}",
            code="JSON_PARSE_ERROR",
            url=url,
        )

    return parsed
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def metadata(
    url: str,
    *,
    no_playwright: bool = False,
    timeout: int = 15,
) -> dict[str, Any]:
    """
    Fetch a URL and return only the "metadata" part of the JSON result.

    This performs a regular JSON fetch via fetch_json() with the output
    capped at 10,000 tokens and then extracts the "metadata" entry; the
    page is still fetched and processed by the CLI.

    Args:
        url: The URL to fetch metadata for.
        no_playwright: Forwarded to fetch_json().
        timeout: Subprocess timeout in seconds (default: 15).

    Returns:
        The metadata dict (expected keys include url, title, description,
        type, lang, word_count, reading_time_minutes — not validated
        here). An empty dict is returned when the result has no
        "metadata" entry or the entry is not a JSON object.

    Raises:
        RawfyError: If the fetch fails.
    """
    data = fetch_json(
        url,
        no_playwright=no_playwright,
        max_tokens=10_000,
        timeout=timeout,
    )
    meta = data.get("metadata")
    # Guard against `"metadata": null` (or another non-object value) so the
    # declared dict return type always holds.
    return meta if isinstance(meta, dict) else {}
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def check_installation() -> dict[str, Any]:
    """
    Report which of Rawfy's runtime dependencies can be found.

    Every probe is best-effort: a probe that cannot run, times out, or
    exits non-zero leaves its entry at the default value instead of
    raising.

    Returns:
        Dict with installation status:
        {
            "node": True/False,        # `node` found in PATH
            "rawfy_cli": True/False,   # `<rawfy> version` exited with 0
            "playwright": True/False,  # `npx playwright --version` exited with 0
            "version": str or None     # stripped stdout of `<rawfy> version`
        }
    """
    status: dict[str, Any] = {
        "node": False,
        "rawfy_cli": False,
        "playwright": False,
        "version": None,
    }

    # Check Node.js
    if shutil.which("node"):
        status["node"] = True

    # Check rawfy CLI
    # NOTE(review): when _find_rawfy_cli() falls back to `npx -y rawfy`,
    # this probe may download the package — confirm that is acceptable.
    try:
        cmd = _find_rawfy_cli()
        result = subprocess.run(
            [*cmd, "version"],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=5,
        )
        if result.returncode == 0:
            status["rawfy_cli"] = True
            status["version"] = result.stdout.strip()
    except (RawfyError, subprocess.TimeoutExpired, OSError):
        # OSError covers FileNotFoundError as well as e.g. PermissionError.
        pass

    # Check Playwright
    npx_path = shutil.which("npx")
    if npx_path:
        try:
            result = subprocess.run(
                # --no-install keeps this a pure check: npx must not prompt
                # for, or start, a download of playwright.
                [npx_path, "--no-install", "playwright", "--version"],
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace",
                timeout=5,
            )
            if result.returncode == 0:
                status["playwright"] = True
        except (subprocess.TimeoutExpired, OSError):
            pass

    return status
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def main() -> None:
    """
    Command-line entry point: fetch one URL and print the result.

    Usage: python -m rawfy <url> [--format markdown|json|text]

    The `--format` option may appear before or after the URL. The process
    exits with status 1 when no URL is given, when `--format` has no value
    or an unsupported value, or when the fetch raises RawfyError.
    """
    usage = "Usage: python -m rawfy <url> [--format markdown|json|text]"
    # Must stay in sync with the OutputFormat literal.
    allowed_formats = ("markdown", "json", "text")

    args = sys.argv[1:]
    fmt: OutputFormat = "markdown"

    if "--format" in args:
        idx = args.index("--format")
        if idx + 1 >= len(args):
            print("Error: --format requires a value", file=sys.stderr)
            print(usage, file=sys.stderr)
            sys.exit(1)
        fmt = args[idx + 1]  # type: ignore[assignment]
        # Remove the option and its value so they are not taken as the URL.
        del args[idx : idx + 2]

    if fmt not in allowed_formats:
        print(
            f"Error: unsupported format {fmt!r}; expected one of: "
            + ", ".join(allowed_formats),
            file=sys.stderr,
        )
        sys.exit(1)

    if not args:
        print(usage)
        sys.exit(1)

    url = args[0]

    try:
        output = fetch(url, format=fmt)
        print(output)
    except RawfyError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
# Runs main() when this module itself is executed as a script (for example
# `python -m rawfy.client`). `python -m rawfy` executes the package's
# separate rawfy/__main__.py instead — presumably that file calls main();
# its content is not visible here.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rawfy
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python wrapper for the Rawfy AI agent skill — converts any URL into structured, agent-readable content
|
|
5
|
+
Author: rishiiicreates
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/rishiiicreates/rawfy
|
|
8
|
+
Project-URL: Repository, https://github.com/rishiiicreates/rawfy
|
|
9
|
+
Project-URL: Issues, https://github.com/rishiiicreates/rawfy/issues
|
|
10
|
+
Keywords: ai,agent,mcp,web,scraper,markdown,rawfy
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# Rawfy Python Wrapper
|
|
26
|
+
|
|
27
|
+
Python bindings for the [Rawfy](https://github.com/rishiiicreates/rawfy) AI agent skill.
|
|
28
|
+
|
|
29
|
+
## Prerequisites
|
|
30
|
+
|
|
31
|
+
- **Node.js >= 18** — Rawfy is a Node.js tool
|
|
32
|
+
- **Rawfy CLI** — `npm install -g rawfy`
|
|
33
|
+
|
|
34
|
+
## Install
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install rawfy
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Quick Start
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from rawfy import fetch, metadata
|
|
44
|
+
|
|
45
|
+
# Full page fetch (returns WSM markdown)
|
|
46
|
+
content = fetch("https://example.com")
|
|
47
|
+
|
|
48
|
+
# Get structured JSON data
|
|
49
|
+
import json
|
|
50
|
+
data = fetch("https://example.com", format="json")
|
|
51
|
+
parsed = json.loads(data)
|
|
52
|
+
|
|
53
|
+
# Or use the convenience wrapper
|
|
54
|
+
from rawfy.client import fetch_json
|
|
55
|
+
data = fetch_json("https://example.com")
|
|
56
|
+
print(data["metadata"]["title"])
|
|
57
|
+
|
|
58
|
+
# Lightweight metadata only
|
|
59
|
+
meta = metadata("https://docs.python.org")
|
|
60
|
+
print(f"Title: {meta['title']}, Words: {meta['word_count']}")
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## API
|
|
64
|
+
|
|
65
|
+
### `fetch(url, *, format="markdown", vision=False, no_playwright=False, max_tokens=50000, timeout=30)`
|
|
66
|
+
|
|
67
|
+
Fetch and process a URL. Returns the content as a string.
|
|
68
|
+
|
|
69
|
+
### `fetch_json(url, *, vision=False, no_playwright=False, max_tokens=50000, timeout=30)`
|
|
70
|
+
|
|
71
|
+
Same as `fetch()` with `format="json"`, returns a parsed dict.
|
|
72
|
+
|
|
73
|
+
### `metadata(url, *, no_playwright=False, timeout=15)`
|
|
74
|
+
|
|
75
|
+
Fetch a page and return only its metadata as a dict. Internally this runs a JSON fetch capped at 10,000 output tokens and extracts the `metadata` field.
|
|
76
|
+
|
|
77
|
+
### `check_installation()`
|
|
78
|
+
|
|
79
|
+
Check if Rawfy and dependencies are installed. Returns a status dict.
|
|
80
|
+
|
|
81
|
+
## Error Handling
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from rawfy import fetch, RawfyError
|
|
85
|
+
|
|
86
|
+
try:
|
|
87
|
+
content = fetch("https://example.com")
|
|
88
|
+
except RawfyError as e:
|
|
89
|
+
print(f"Error [{e.code}]: {e}")
|
|
90
|
+
print(f"URL: {e.url}")
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## CLI
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
python -m rawfy https://example.com
|
|
97
|
+
python -m rawfy https://example.com --format json
|
|
98
|
+
```
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
rawfy/__init__.py,sha256=gYdBY673D1zXuMXOhozl-hkMFYxT3ejp5Cm3iYEriL0,663
|
|
2
|
+
rawfy/__main__.py,sha256=laMAOyCTi0_1BQuZEuo2V6MxTIaHAHgtqncDhW3EB54,111
|
|
3
|
+
rawfy/client.py,sha256=iYOYZ1UMtCueuuGa9bbv3UOS1J_UeXytPOgMnrDbUMY,7638
|
|
4
|
+
rawfy-0.1.0.dist-info/METADATA,sha256=wa0N_aMfE6Y5p2t0MdramckQbKw4EEA8oDWvAMP87Pg,2669
|
|
5
|
+
rawfy-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
6
|
+
rawfy-0.1.0.dist-info/entry_points.txt,sha256=CqH6BEYVhmDf1Q_i0CMs0_oXMG1D2TDUKzFfgKAhX1U,47
|
|
7
|
+
rawfy-0.1.0.dist-info/top_level.txt,sha256=r_YJojnUlG1HYJ9XaiWjTD2AdNiXy62AWjnXEWiwR3w,6
|
|
8
|
+
rawfy-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
rawfy
|