yapit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
yapit-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,68 @@
1
+ Metadata-Version: 2.4
2
+ Name: yapit
3
+ Version: 0.1.0
4
+ Summary: CLI for yapit.md — fetch clean markdown from URLs and documents
5
+ Keywords: cli,tts,markdown,document-extraction,yapit
6
+ Author: Maximilian Wolf
7
+ Author-email: Maximilian Wolf <max@mwolf.dev>
8
+ License-Expression: AGPL-3.0-only
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
13
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
14
+ Requires-Dist: tyro>=1.0.9
15
+ Requires-Dist: httpx>=0.28.1
16
+ Requires-Python: >=3.14
17
+ Project-URL: Repository, https://github.com/yapit-tts/yapit-cli
18
+ Project-URL: Issues, https://github.com/yapit-tts/yapit-cli/issues
19
+ Project-URL: Homepage, https://yapit.md
20
+ Description-Content-Type: text/markdown
21
+
22
+ # yapit
23
+
24
+ CLI for [yapit.md](https://yapit.md) — fetch clean markdown from URLs and documents.
25
+
26
+ ## Install
27
+
28
+ ```bash
29
+ uv tool install yapit
30
+ ```
31
+
32
+ ## Usage
33
+
34
+ ```bash
35
+ # Fetch markdown of a shared document
36
+ yapit https://yapit.md/listen/<doc-id>
37
+
38
+ # Create document from URL and print markdown
39
+ yapit https://example.com/article
40
+
41
+ # With TTS annotations
42
+ yapit <doc-id> --annotated
43
+
44
+ # Archive locally with images (for Obsidian, etc.)
45
+ yapit https://arxiv.org/abs/2301.00001 --archive
46
+ ```
47
+
48
+ ## Auth
49
+
50
+ Creating documents or accessing private docs requires authentication:
51
+
52
+ ```bash
53
+ export YAPIT_EMAIL=you@example.com
54
+ export YAPIT_PASSWORD=...
55
+ ```
56
+
57
+ ## Archive mode
58
+
59
+ `--archive` saves to `~/Documents/archive/papers/<slug>/`:
60
+
61
+ ```
62
+ <slug>/
63
+ <slug>.md # clean markdown
64
+ TTS.md # annotated version
65
+ *.png # extracted images
66
+ ```
67
+
68
+ Override the base directory with `YAPIT_ARCHIVE_DIR` or `--archive-dir`.
yapit-0.1.0/README.md ADDED
@@ -0,0 +1,47 @@
1
+ # yapit
2
+
3
+ CLI for [yapit.md](https://yapit.md) — fetch clean markdown from URLs and documents.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ uv tool install yapit
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```bash
14
+ # Fetch markdown of a shared document
15
+ yapit https://yapit.md/listen/<doc-id>
16
+
17
+ # Create document from URL and print markdown
18
+ yapit https://example.com/article
19
+
20
+ # With TTS annotations
21
+ yapit <doc-id> --annotated
22
+
23
+ # Archive locally with images (for Obsidian, etc.)
24
+ yapit https://arxiv.org/abs/2301.00001 --archive
25
+ ```
26
+
27
+ ## Auth
28
+
29
+ Creating documents or accessing private docs requires authentication:
30
+
31
+ ```bash
32
+ export YAPIT_EMAIL=you@example.com
33
+ export YAPIT_PASSWORD=...
34
+ ```
35
+
36
+ ## Archive mode
37
+
38
+ `--archive` saves to `~/Documents/archive/papers/<slug>/`:
39
+
40
+ ```
41
+ <slug>/
42
+ <slug>.md # clean markdown
43
+ TTS.md # annotated version
44
+ *.png # extracted images
45
+ ```
46
+
47
+ Override the base directory with `YAPIT_ARCHIVE_DIR` or `--archive-dir`.
yapit-0.1.0/pyproject.toml ADDED
@@ -0,0 +1,44 @@
1
+ [project]
2
+ name = "yapit"
3
+ version = "0.1.0"
4
+ description = "CLI for yapit.md — fetch clean markdown from URLs and documents"
5
+ readme = "README.md"
6
+ requires-python = ">=3.14"
7
+ license = "AGPL-3.0-only"
8
+ authors = [{ name = "Maximilian Wolf", email = "max@mwolf.dev" }]
9
+ keywords = ["cli", "tts", "markdown", "document-extraction", "yapit"]
10
+ classifiers = [
11
+ "Development Status :: 3 - Alpha",
12
+ "Intended Audience :: Developers",
13
+ "Programming Language :: Python :: 3",
14
+ "Topic :: Text Processing :: Markup :: Markdown",
15
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
16
+ ]
17
+ dependencies = [
18
+ "tyro>=1.0.9",
19
+ "httpx>=0.28.1",
20
+ ]
21
+
22
+ [project.urls]
23
+ Repository = "https://github.com/yapit-tts/yapit-cli"
24
+ Issues = "https://github.com/yapit-tts/yapit-cli/issues"
25
+ Homepage = "https://yapit.md"
26
+
27
+ [project.scripts]
28
+ yapit = "yapit:main"
29
+
30
+ [build-system]
31
+ requires = ["uv_build>=0.10.10,<0.11.0"]
32
+ build-backend = "uv_build"
33
+
34
+ [dependency-groups]
35
+ dev = [
36
+ "ruff>=0.15.6",
37
+ ]
38
+
39
+ [tool.ruff]
40
+ line-length = 120
41
+
42
+ [tool.ruff.lint]
43
+ select = ["E", "W", "F", "I", "B", "C4", "UP", "SIM"]
44
+ ignore = ["E501", "SIM108"]
yapit-0.1.0/src/yapit/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from yapit.cli import main
2
+
3
+ __all__ = ["main"]
yapit-0.1.0/src/yapit/__main__.py ADDED
@@ -0,0 +1,3 @@
1
+ from yapit.cli import main
2
+
3
+ main()
yapit-0.1.0/src/yapit/cli.py ADDED
@@ -0,0 +1,370 @@
1
+ """Fetch clean markdown from yapit.md documents and URLs.
2
+
3
+ Create documents from URLs and download their markdown, optionally with
4
+ TTS annotations. Archive locally with images for Obsidian integration.
5
+
6
+ Examples::
7
+
8
+ yapit https://example.com/article
9
+ yapit https://arxiv.org/abs/2301.00001
10
+ yapit https://yapit.md/listen/550e8400-e29b-41d4-a716-446655440000
11
+ yapit 550e8400-e29b-41d4-a716-446655440000 --annotated
12
+ yapit https://example.com/article --archive
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import os
18
+ import re
19
+ import sys
20
+ import time
21
+ from dataclasses import dataclass
22
+ from pathlib import Path
23
+ from typing import Annotated, Literal
24
+ from urllib.parse import urlparse
25
+
26
+ import httpx
27
+ import tyro
28
+
29
+ # Stack Auth public credentials (baked into the frontend bundle)
30
+ _STACK_PROJECT_ID = "6038930b-72c1-407f-9e38-f1287a4d1ede"
31
+ _STACK_CLIENT_KEY = "pck_m04c3bgjsmstpk4khbhtma5161b694zcrk94v6dcavpbr"
32
+
33
+ _UUID_RE = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.IGNORECASE)
34
+ _YAPIT_LISTEN_RE = re.compile(r"yapit\.md/listen/([0-9a-f-]{36})", re.IGNORECASE)
35
+ _IMAGE_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
36
+
37
+
38
+ def _err(msg: str) -> None:
39
+ print(msg, file=sys.stderr)
40
+
41
+
42
+ def _die(msg: str) -> None:
43
+ _err(f"error: {msg}")
44
+ sys.exit(1)
45
+
46
+
47
+ # --- Input resolution ---
48
+
49
+
50
+ def resolve_input(url_or_id: str) -> tuple[Literal["uuid", "external"], str]:
51
+ """Detect whether input is a yapit document UUID or an external URL."""
52
+ if _UUID_RE.match(url_or_id):
53
+ return "uuid", url_or_id
54
+
55
+ m = _YAPIT_LISTEN_RE.search(url_or_id)
56
+ if m:
57
+ return "uuid", m.group(1)
58
+
59
+ parsed = urlparse(url_or_id)
60
+ if not parsed.scheme:
61
+ url_or_id = f"https://{url_or_id}"
62
+
63
+ return "external", url_or_id
64
+
65
+
66
+ # --- Auth ---
67
+
68
+
69
+ def authenticate(base_url: str, email: str, password: str) -> str:
70
+ """Sign in via Stack Auth and return an access token."""
71
+ resp = httpx.post(
72
+ f"{base_url}/auth/api/v1/auth/password/sign-in",
73
+ headers={
74
+ "Content-Type": "application/json",
75
+ "X-Stack-Access-Type": "client",
76
+ "X-Stack-Project-Id": _STACK_PROJECT_ID,
77
+ "X-Stack-Publishable-Client-Key": _STACK_CLIENT_KEY,
78
+ },
79
+ json={"email": email, "password": password},
80
+ timeout=15,
81
+ )
82
+ if resp.status_code == 400:
83
+ _die("authentication failed — check email/password")
84
+ resp.raise_for_status()
85
+ return resp.json()["access_token"]
86
+
87
+
88
+ # --- Document creation ---
89
+
90
+
91
+ def create_document(client: httpx.Client, url: str, ai: bool) -> tuple[str, str | None]:
92
+ """Create a document from an external URL. Returns (doc_id, title)."""
93
+ # Step 1: prepare
94
+ resp = client.post("/v1/documents/prepare", json={"url": url}, timeout=30)
95
+ resp.raise_for_status()
96
+ prep = resp.json()
97
+
98
+ doc_hash = prep["hash"]
99
+ endpoint = prep["endpoint"]
100
+ title = prep["metadata"].get("title")
101
+ content_hash = prep["content_hash"]
102
+
103
+ _err(f"Creating document from {endpoint}...")
104
+
105
+ # Step 2: create
106
+ if endpoint == "website":
107
+ resp = client.post("/v1/documents/website", json={"hash": doc_hash}, timeout=60)
108
+ resp.raise_for_status()
109
+ data = resp.json()
110
+ return data["id"], data.get("title") or title
111
+
112
+ if endpoint == "document":
113
+ resp = client.post(
114
+ "/v1/documents/document",
115
+ json={"hash": doc_hash, "ai_transform": ai, "batch_mode": False},
116
+ timeout=60,
117
+ )
118
+ resp.raise_for_status()
119
+
120
+ if resp.status_code == 201:
121
+ data = resp.json()
122
+ return data["id"], data.get("title") or title
123
+
124
+ # 202 — async extraction, need to poll
125
+ extraction = resp.json()
126
+ extraction_id = extraction.get("extraction_id")
127
+ total_pages = extraction["total_pages"]
128
+ pages = list(range(total_pages))
129
+
130
+ return _poll_extraction(client, extraction_id, content_hash, ai, pages, title)
131
+
132
+ if endpoint == "text":
133
+ _die("text endpoint not supported for URL creation")
134
+
135
+ _die(f"unexpected endpoint type: {endpoint}")
136
+ raise AssertionError # unreachable
137
+
138
+
139
+ def _poll_extraction(
140
+ client: httpx.Client,
141
+ extraction_id: str | None,
142
+ content_hash: str,
143
+ ai_transform: bool,
144
+ pages: list[int],
145
+ title: str | None,
146
+ ) -> tuple[str, str | None]:
147
+ """Poll extraction status until complete. Returns (doc_id, title)."""
148
+ interval = 0.5
149
+ max_interval = 3.0
150
+
151
+ while True:
152
+ resp = client.post(
153
+ "/v1/documents/extraction/status",
154
+ json={
155
+ "extraction_id": extraction_id,
156
+ "content_hash": content_hash,
157
+ "ai_transform": ai_transform,
158
+ "pages": pages,
159
+ },
160
+ timeout=15,
161
+ )
162
+ resp.raise_for_status()
163
+ status = resp.json()
164
+
165
+ completed = len(status.get("completed_pages", []))
166
+ total = status["total_pages"]
167
+
168
+ if status["status"] == "complete":
169
+ if status.get("error"):
170
+ _die(f"extraction failed: {status['error']}")
171
+ doc_id = status.get("document_id")
172
+ if not doc_id:
173
+ _die("extraction completed but no document_id returned")
174
+ _err(f"Extracted {total}/{total} pages")
175
+ return doc_id, title
176
+
177
+ _err(f"Extracting... {completed}/{total} pages")
178
+ time.sleep(interval)
179
+ interval = min(interval * 1.5, max_interval)
180
+
181
+
182
+ # --- Markdown fetching ---
183
+
184
+
185
+ def fetch_markdown(base_url: str, doc_id: str, annotated: bool, token: str | None) -> str:
186
+ """Fetch markdown for a document. Auth optional — shared docs work without it."""
187
+ suffix = "md-annotated" if annotated else "md"
188
+ url = f"{base_url}/api/v1/documents/{doc_id}/{suffix}"
189
+ headers = {"Authorization": f"Bearer {token}"} if token else {}
190
+
191
+ resp = httpx.get(url, headers=headers, timeout=30)
192
+ if resp.status_code == 200:
193
+ return resp.text
194
+ if resp.status_code == 404:
195
+ hint = "" if token else " (private doc? set YAPIT_EMAIL/YAPIT_PASSWORD)"
196
+ _die(f"document {doc_id} not found{hint}")
197
+ resp.raise_for_status()
198
+ raise AssertionError # unreachable
199
+
200
+
201
+ def fetch_title(base_url: str, doc_id: str, token: str | None) -> str | None:
202
+ """Fetch document title from the API."""
203
+ # /public endpoint always works for shared docs, no auth needed
204
+ resp = httpx.get(f"{base_url}/api/v1/documents/{doc_id}/public", timeout=15)
205
+ if resp.status_code == 200:
206
+ return resp.json().get("title")
207
+ # Private doc — need auth
208
+ if token:
209
+ resp = httpx.get(
210
+ f"{base_url}/api/v1/documents/{doc_id}",
211
+ headers={"Authorization": f"Bearer {token}"},
212
+ timeout=15,
213
+ )
214
+ if resp.status_code == 200:
215
+ return resp.json().get("title")
216
+ return None
217
+
218
+
219
+ # --- Archive ---
220
+
221
+
222
+ def archive_document(
223
+ markdown: str,
224
+ annotated_md: str | None,
225
+ title: str | None,
226
+ base_url: str,
227
+ archive_dir: Path,
228
+ ) -> Path:
229
+ """Save markdown and images to archive directory. Returns the archive path."""
230
+ slug = _slugify(title or "untitled")
231
+ doc_dir = archive_dir / slug
232
+ doc_dir.mkdir(parents=True, exist_ok=True)
233
+
234
+ # Download images and rewrite paths
235
+ markdown = _download_images(markdown, base_url, doc_dir)
236
+ if annotated_md:
237
+ annotated_md = _download_images(annotated_md, base_url, doc_dir)
238
+
239
+ (doc_dir / f"{slug}.md").write_text(markdown, encoding="utf-8")
240
+ if annotated_md:
241
+ (doc_dir / "TTS.md").write_text(annotated_md, encoding="utf-8")
242
+
243
+ return doc_dir
244
+
245
+
246
+ def _slugify(title: str) -> str:
247
+ slug = re.sub(r"[^a-zA-Z0-9]+", "-", title).strip("-").lower()
248
+ return slug[:100] or "untitled"
249
+
250
+
251
+ def _download_images(markdown: str, base_url: str, doc_dir: Path) -> str:
252
+ """Download images referenced in markdown, rewrite URLs to relative paths."""
253
+ seen: dict[str, str] = {}
254
+
255
+ def replace_image(match: re.Match) -> str:
256
+ alt, url = match.group(1), match.group(2)
257
+
258
+ if url in seen:
259
+ return f"![{alt}]({seen[url]})"
260
+
261
+ # Skip data URIs
262
+ if url.startswith("data:"):
263
+ return match.group(0)
264
+
265
+ # Resolve relative URLs (e.g. /images/hash/file.png)
266
+ if url.startswith("/"):
267
+ full_url = f"{base_url}{url}"
268
+ elif not url.startswith(("http://", "https://")):
269
+ return match.group(0)
270
+ else:
271
+ full_url = url
272
+
273
+ # Strip query params for filename
274
+ parsed = urlparse(full_url)
275
+ filename = Path(parsed.path).name
276
+ if not filename:
277
+ return match.group(0)
278
+
279
+ try:
280
+ resp = httpx.get(full_url, timeout=15, follow_redirects=True)
281
+ resp.raise_for_status()
282
+ (doc_dir / filename).write_bytes(resp.content)
283
+ relative = f"./{filename}"
284
+ seen[url] = relative
285
+ return f"![{alt}]({relative})"
286
+ except httpx.HTTPError:
287
+ _err(f"warning: failed to download image {full_url}")
288
+ return match.group(0)
289
+
290
+ return _IMAGE_RE.sub(replace_image, markdown)
291
+
292
+
293
+ # --- CLI ---
294
+
295
+
296
+ @dataclass
297
+ class Args:
298
+ """Fetch clean markdown from yapit.md documents and URLs."""
299
+
300
+ url: Annotated[str, tyro.conf.Positional]
301
+ """URL, yapit document UUID, or yapit.md/listen/... link."""
302
+
303
+ annotated: bool = False
304
+ """Include TTS annotations (yap-speak, yap-show, yap-cap tags)."""
305
+
306
+ archive: bool = False
307
+ """Save to archive directory with images instead of printing to stdout."""
308
+
309
+ archive_dir: str = ""
310
+ """Base directory for archived documents. Default: ~/Documents/archive/papers. Env: YAPIT_ARCHIVE_DIR."""
311
+
312
+ ai: bool = False
313
+ """Use AI extraction for PDFs (uses quota, better quality for complex layouts)."""
314
+
315
+ base_url: str = ""
316
+ """Yapit instance URL. Default: https://yapit.md. Env: YAPIT_BASE_URL."""
317
+
318
+ email: str = ""
319
+ """Auth email. Env: YAPIT_EMAIL."""
320
+
321
+ password: str = ""
322
+ """Auth password. Env: YAPIT_PASSWORD."""
323
+
324
+
325
+ def main() -> None:
326
+ args = tyro.cli(Args, description=__doc__)
327
+
328
+ base_url = (args.base_url or os.environ.get("YAPIT_BASE_URL", "https://yapit.md")).rstrip("/")
329
+ email = args.email or os.environ.get("YAPIT_EMAIL", "")
330
+ password = args.password or os.environ.get("YAPIT_PASSWORD", "")
331
+ archive_dir = Path(
332
+ args.archive_dir or os.environ.get("YAPIT_ARCHIVE_DIR", "~/Documents/archive/papers")
333
+ ).expanduser()
334
+
335
+ input_type, value = resolve_input(args.url)
336
+ token: str | None = None
337
+
338
+ # Authenticate if needed
339
+ needs_auth = input_type == "external" or (email and password)
340
+ if needs_auth:
341
+ if not email or not password:
342
+ _die("authentication required — set YAPIT_EMAIL and YAPIT_PASSWORD")
343
+ token = authenticate(base_url, email, password)
344
+
345
+ # Create document if external URL
346
+ doc_id: str
347
+ title: str | None = None
348
+ if input_type == "external":
349
+ assert token is not None
350
+ client = httpx.Client(
351
+ base_url=f"{base_url}/api",
352
+ headers={"Authorization": f"Bearer {token}"},
353
+ timeout=30,
354
+ )
355
+ doc_id, title = create_document(client, value, ai=args.ai)
356
+ _err(f"Document created: {base_url}/listen/{doc_id}")
357
+ else:
358
+ doc_id = value
359
+
360
+ # Fetch markdown
361
+ if args.archive:
362
+ if not title:
363
+ title = fetch_title(base_url, doc_id, token)
364
+ md = fetch_markdown(base_url, doc_id, annotated=False, token=token)
365
+ annotated_md = fetch_markdown(base_url, doc_id, annotated=True, token=token)
366
+ doc_dir = archive_document(md, annotated_md, title, base_url, archive_dir)
367
+ print(doc_dir)
368
+ else:
369
+ md = fetch_markdown(base_url, doc_id, annotated=args.annotated, token=token)
370
+ print(md)