anything2md 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anything2md-0.1.0/.github/workflows/ci.yml +60 -0
- anything2md-0.1.0/.github/workflows/publish.yml +35 -0
- anything2md-0.1.0/.gitignore +8 -0
- anything2md-0.1.0/PKG-INFO +88 -0
- anything2md-0.1.0/README.md +74 -0
- anything2md-0.1.0/pyproject.toml +32 -0
- anything2md-0.1.0/src/anything2md/__init__.py +50 -0
- anything2md-0.1.0/src/anything2md/__main__.py +5 -0
- anything2md-0.1.0/src/anything2md/cli.py +110 -0
- anything2md-0.1.0/src/anything2md/client.py +186 -0
- anything2md-0.1.0/src/anything2md/config.py +23 -0
- anything2md-0.1.0/src/anything2md/converter.py +332 -0
- anything2md-0.1.0/src/anything2md/errors.py +45 -0
- anything2md-0.1.0/src/anything2md/formats.py +100 -0
- anything2md-0.1.0/src/anything2md/models.py +58 -0
- anything2md-0.1.0/tests/conftest.py +9 -0
- anything2md-0.1.0/tests/test_cli.py +9 -0
- anything2md-0.1.0/tests/test_client.py +223 -0
- anything2md-0.1.0/tests/test_converter.py +420 -0
- anything2md-0.1.0/tests/test_entrypoint.py +57 -0
- anything2md-0.1.0/tests/test_formats.py +68 -0
- anything2md-0.1.0/uv.lock +230 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- master
|
|
7
|
+
- main
|
|
8
|
+
pull_request:
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
test:
|
|
12
|
+
name: Tests (Python ${{ matrix.python-version }})
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
strategy:
|
|
15
|
+
fail-fast: false
|
|
16
|
+
matrix:
|
|
17
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
18
|
+
steps:
|
|
19
|
+
- name: Checkout
|
|
20
|
+
uses: actions/checkout@v4
|
|
21
|
+
|
|
22
|
+
- name: Set up Python
|
|
23
|
+
uses: actions/setup-python@v5
|
|
24
|
+
with:
|
|
25
|
+
python-version: ${{ matrix.python-version }}
|
|
26
|
+
|
|
27
|
+
- name: Set up uv
|
|
28
|
+
uses: astral-sh/setup-uv@v6
|
|
29
|
+
with:
|
|
30
|
+
enable-cache: true
|
|
31
|
+
|
|
32
|
+
- name: Sync dependencies
|
|
33
|
+
run: uv sync --extra test --frozen
|
|
34
|
+
|
|
35
|
+
- name: Run tests
|
|
36
|
+
run: uv run pytest -q
|
|
37
|
+
|
|
38
|
+
build:
|
|
39
|
+
name: Build distribution
|
|
40
|
+
runs-on: ubuntu-latest
|
|
41
|
+
needs: test
|
|
42
|
+
steps:
|
|
43
|
+
- name: Checkout
|
|
44
|
+
uses: actions/checkout@v4
|
|
45
|
+
|
|
46
|
+
- name: Set up Python
|
|
47
|
+
uses: actions/setup-python@v5
|
|
48
|
+
with:
|
|
49
|
+
python-version: "3.11"
|
|
50
|
+
|
|
51
|
+
- name: Set up uv
|
|
52
|
+
uses: astral-sh/setup-uv@v6
|
|
53
|
+
with:
|
|
54
|
+
enable-cache: true
|
|
55
|
+
|
|
56
|
+
- name: Build package
|
|
57
|
+
run: uv build
|
|
58
|
+
|
|
59
|
+
- name: Check metadata
|
|
60
|
+
run: uv run --with twine twine check dist/*
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
publish:
|
|
10
|
+
name: Publish package to PyPI
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
environment:
|
|
13
|
+
name: pypi
|
|
14
|
+
url: https://pypi.org/p/anything2md
|
|
15
|
+
permissions:
|
|
16
|
+
id-token: write
|
|
17
|
+
steps:
|
|
18
|
+
- name: Checkout
|
|
19
|
+
uses: actions/checkout@v4
|
|
20
|
+
|
|
21
|
+
- name: Set up Python
|
|
22
|
+
uses: actions/setup-python@v5
|
|
23
|
+
with:
|
|
24
|
+
python-version: "3.11"
|
|
25
|
+
|
|
26
|
+
- name: Set up uv
|
|
27
|
+
uses: astral-sh/setup-uv@v6
|
|
28
|
+
with:
|
|
29
|
+
enable-cache: true
|
|
30
|
+
|
|
31
|
+
- name: Build distributions
|
|
32
|
+
run: uv build
|
|
33
|
+
|
|
34
|
+
- name: Publish distributions to PyPI
|
|
35
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: anything2md
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert documents to Markdown using Cloudflare Workers AI toMarkdown.
|
|
5
|
+
Project-URL: Homepage, https://github.com/herrkaefer/anything2md
|
|
6
|
+
Project-URL: Repository, https://github.com/herrkaefer/anything2md
|
|
7
|
+
Author: Anything2MD Contributors
|
|
8
|
+
License: MIT
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Requires-Dist: httpx<1,>=0.27
|
|
11
|
+
Provides-Extra: test
|
|
12
|
+
Requires-Dist: pytest<9,>=8; extra == 'test'
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# anything2md
|
|
16
|
+
[](https://github.com/herrkaefer/anything2md/actions/workflows/ci.yml)
|
|
17
|
+
[](https://pypi.org/project/anything2md/)
|
|
18
|
+
[](https://pypi.org/project/anything2md/)
|
|
19
|
+
[](https://opensource.org/licenses/MIT)
|
|
20
|
+
|
|
21
|
+
Python package and CLI for converting URLs or local documents into Markdown using Cloudflare Workers AI `toMarkdown()`.
|
|
22
|
+
|
|
23
|
+
## Install
|
|
24
|
+
|
|
25
|
+
From GitHub:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install "git+https://github.com/herrkaefer/anything2md.git"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### Cloudflare Token Setup
|
|
32
|
+
|
|
33
|
+
Create a Cloudflare API Token for the target account and include these permissions:
|
|
34
|
+
|
|
35
|
+
- `Workers AI`
|
|
36
|
+
- `Browser Rendering - Edit`
|
|
37
|
+
|
|
38
|
+
## Library Usage
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import anything2md
|
|
42
|
+
|
|
43
|
+
mdconverter = anything2md(account_id="xxx", api_token="xxx")
|
|
44
|
+
result = mdconverter.convert("https://example.com")
|
|
45
|
+
print(result.markdown)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Supported Formats
|
|
49
|
+
|
|
50
|
+
Based on Cloudflare docs, current supported extensions include:
|
|
51
|
+
|
|
52
|
+
`pdf`, `jpeg/jpg`, `png`, `webp`, `svg`, `html/htm`, `xml`, `csv`, `docx`, `xlsx`, `xlsm`, `xlsb`, `xls`, `et`, `ods`, `odt`, `numbers`
|
|
53
|
+
|
|
54
|
+
Runtime check via API:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
uv run python -c "from anything2md import MarkdownConverter; c=MarkdownConverter(account_id='<id>', api_token='<token>'); print([f.extension for f in c.supported_formats()])"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Local Usage
|
|
61
|
+
|
|
62
|
+
Install dependencies:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
uv sync
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
export CLOUDFLARE_ACCOUNT_ID="your_account_id"
|
|
70
|
+
export CLOUDFLARE_API_TOKEN="your_api_token"
|
|
71
|
+
|
|
72
|
+
uv run anything2md https://pub-979cb28270cc461d94bc8a169d8f389d.r2.dev/somatosensory.pdf
|
|
73
|
+
uv run anything2md https://pub-979cb28270cc461d94bc8a169d8f389d.r2.dev/cat.jpeg -o output.md
|
|
74
|
+
uv run anything2md https://example.com
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## References
|
|
78
|
+
|
|
79
|
+
Cloudflare docs:
|
|
80
|
+
- Markdown Conversion overview: https://developers.cloudflare.com/workers-ai/features/markdown-conversion/
|
|
81
|
+
- API reference (`toMarkdown`): https://developers.cloudflare.com/api/resources/ai/methods/run/#to-markdown-conversion-to-markdown
|
|
82
|
+
- API reference (`supported formats`): https://developers.cloudflare.com/api/resources/ai/methods/run/#to-markdown-conversion-supported-formats
|
|
83
|
+
- Browser Rendering Markdown endpoint (URL input): https://developers.cloudflare.com/browser-rendering/rest-api/markdown-endpoint/
|
|
84
|
+
- Markdown for Agents (`Accept: text/markdown`): https://developers.cloudflare.com/fundamentals/reference/markdown-for-agents/
|
|
85
|
+
|
|
86
|
+
## License
|
|
87
|
+
|
|
88
|
+
MIT
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# anything2md
|
|
2
|
+
[CI status](https://github.com/herrkaefer/anything2md/actions/workflows/ci.yml)
|
|
3
|
+
[PyPI version](https://pypi.org/project/anything2md/)
|
|
4
|
+
[Supported Python versions](https://pypi.org/project/anything2md/)
|
|
5
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
Python package and CLI for converting URLs or local documents into Markdown using Cloudflare Workers AI `toMarkdown()`.
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
From GitHub:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install "git+https://github.com/herrkaefer/anything2md.git"
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
### Cloudflare Token Setup
|
|
18
|
+
|
|
19
|
+
Create a Cloudflare API Token for the target account and include these permissions:
|
|
20
|
+
|
|
21
|
+
- `Workers AI`
|
|
22
|
+
- `Browser Rendering - Edit`
|
|
23
|
+
|
|
24
|
+
## Library Usage
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import anything2md
|
|
28
|
+
|
|
29
|
+
mdconverter = anything2md(account_id="xxx", api_token="xxx")
|
|
30
|
+
result = mdconverter.convert("https://example.com")
|
|
31
|
+
print(result.markdown)
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Supported Formats
|
|
35
|
+
|
|
36
|
+
Based on Cloudflare docs, current supported extensions include:
|
|
37
|
+
|
|
38
|
+
`pdf`, `jpeg/jpg`, `png`, `webp`, `svg`, `html/htm`, `xml`, `csv`, `docx`, `xlsx`, `xlsm`, `xlsb`, `xls`, `et`, `ods`, `odt`, `numbers`
|
|
39
|
+
|
|
40
|
+
Runtime check via API:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
uv run python -c "from anything2md import MarkdownConverter; c=MarkdownConverter(account_id='<id>', api_token='<token>'); print([f.extension for f in c.supported_formats()])"
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Local Usage
|
|
47
|
+
|
|
48
|
+
Install dependencies:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
uv sync
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
export CLOUDFLARE_ACCOUNT_ID="your_account_id"
|
|
56
|
+
export CLOUDFLARE_API_TOKEN="your_api_token"
|
|
57
|
+
|
|
58
|
+
uv run anything2md https://pub-979cb28270cc461d94bc8a169d8f389d.r2.dev/somatosensory.pdf
|
|
59
|
+
uv run anything2md https://pub-979cb28270cc461d94bc8a169d8f389d.r2.dev/cat.jpeg -o output.md
|
|
60
|
+
uv run anything2md https://example.com
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## References
|
|
64
|
+
|
|
65
|
+
Cloudflare docs:
|
|
66
|
+
- Markdown Conversion overview: https://developers.cloudflare.com/workers-ai/features/markdown-conversion/
|
|
67
|
+
- API reference (`toMarkdown`): https://developers.cloudflare.com/api/resources/ai/methods/run/#to-markdown-conversion-to-markdown
|
|
68
|
+
- API reference (`supported formats`): https://developers.cloudflare.com/api/resources/ai/methods/run/#to-markdown-conversion-supported-formats
|
|
69
|
+
- Browser Rendering Markdown endpoint (URL input): https://developers.cloudflare.com/browser-rendering/rest-api/markdown-endpoint/
|
|
70
|
+
- Markdown for Agents (`Accept: text/markdown`): https://developers.cloudflare.com/fundamentals/reference/markdown-for-agents/
|
|
71
|
+
|
|
72
|
+
## License
|
|
73
|
+
|
|
74
|
+
MIT
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.24.0"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "anything2md"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Convert documents to Markdown using Cloudflare Workers AI toMarkdown."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Anything2MD Contributors"}
|
|
14
|
+
]
|
|
15
|
+
dependencies = [
|
|
16
|
+
"httpx>=0.27,<1"
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.urls]
|
|
20
|
+
Homepage = "https://github.com/herrkaefer/anything2md"
|
|
21
|
+
Repository = "https://github.com/herrkaefer/anything2md"
|
|
22
|
+
|
|
23
|
+
[project.optional-dependencies]
|
|
24
|
+
test = [
|
|
25
|
+
"pytest>=8,<9"
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.scripts]
|
|
29
|
+
anything2md = "anything2md.cli:main"
|
|
30
|
+
|
|
31
|
+
[tool.pytest.ini_options]
|
|
32
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""anything2md package."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from types import ModuleType
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from .config import CloudflareCredentials, ConvertOptions
|
|
8
|
+
from .converter import MarkdownConverter
|
|
9
|
+
from .errors import (
|
|
10
|
+
APIError,
|
|
11
|
+
Anything2MDError,
|
|
12
|
+
FileReadError,
|
|
13
|
+
HTTPError,
|
|
14
|
+
InvalidResponseError,
|
|
15
|
+
NetworkError,
|
|
16
|
+
UnsupportedFormatError,
|
|
17
|
+
)
|
|
18
|
+
from .models import ConversionResult, SupportedFormatInfo
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def anything2md(*args: Any, **kwargs: Any) -> MarkdownConverter:
    """Build a ``MarkdownConverter``, forwarding every argument unchanged.

    Positional and keyword arguments are passed straight through to the
    ``MarkdownConverter`` constructor; the new instance is returned.
    """
    converter = MarkdownConverter(*args, **kwargs)
    return converter
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class _Anything2MDModule(ModuleType):
    """Module subclass whose instances can be called like a function.

    Calling the module object constructs a ``MarkdownConverter`` with the
    given arguments.
    """

    def __call__(self, *args: Any, **kwargs: Any) -> MarkdownConverter:
        """Return ``MarkdownConverter(*args, **kwargs)``."""
        instance = MarkdownConverter(*args, **kwargs)
        return instance


# Re-class the already-imported package object so that
# ``import anything2md; anything2md(...)`` resolves to ``__call__`` above.
sys.modules[__name__].__class__ = _Anything2MDModule
|
|
32
|
+
|
|
33
|
+
SwiftToMarkdownConverter = MarkdownConverter
|
|
34
|
+
|
|
35
|
+
__all__ = [
|
|
36
|
+
"anything2md",
|
|
37
|
+
"Anything2MDError",
|
|
38
|
+
"UnsupportedFormatError",
|
|
39
|
+
"NetworkError",
|
|
40
|
+
"FileReadError",
|
|
41
|
+
"HTTPError",
|
|
42
|
+
"APIError",
|
|
43
|
+
"InvalidResponseError",
|
|
44
|
+
"CloudflareCredentials",
|
|
45
|
+
"ConvertOptions",
|
|
46
|
+
"ConversionResult",
|
|
47
|
+
"SupportedFormatInfo",
|
|
48
|
+
"MarkdownConverter",
|
|
49
|
+
"SwiftToMarkdownConverter",
|
|
50
|
+
]
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
|
|
9
|
+
from .config import ConvertOptions
|
|
10
|
+
from .converter import MarkdownConverter
|
|
11
|
+
from .errors import Anything2MDError
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def parse_args() -> argparse.Namespace:
|
|
15
|
+
parser = argparse.ArgumentParser(
|
|
16
|
+
prog="anything2md",
|
|
17
|
+
description="Convert a URL or local file into Markdown using Cloudflare Workers AI.",
|
|
18
|
+
)
|
|
19
|
+
parser.add_argument("input", help="Input URL (http/https) or local file path.")
|
|
20
|
+
parser.add_argument("--account-id", help="Cloudflare account ID. Falls back to CLOUDFLARE_ACCOUNT_ID.")
|
|
21
|
+
parser.add_argument("--api-token", help="Cloudflare API token. Falls back to CLOUDFLARE_API_TOKEN.")
|
|
22
|
+
parser.add_argument("--timeout", type=float, default=60.0, help="Request timeout in seconds. Default: 60.")
|
|
23
|
+
parser.add_argument(
|
|
24
|
+
"--max-retry-count",
|
|
25
|
+
type=int,
|
|
26
|
+
default=2,
|
|
27
|
+
help="Retry attempts for retryable API/network failures. Default: 2.",
|
|
28
|
+
)
|
|
29
|
+
parser.add_argument(
|
|
30
|
+
"--retry-base-delay",
|
|
31
|
+
type=float,
|
|
32
|
+
default=1.0,
|
|
33
|
+
help="Base retry delay in seconds (exponential backoff). Default: 1.",
|
|
34
|
+
)
|
|
35
|
+
parser.add_argument(
|
|
36
|
+
"-v",
|
|
37
|
+
"--verbose",
|
|
38
|
+
action="store_true",
|
|
39
|
+
help="Print progress messages to stderr.",
|
|
40
|
+
)
|
|
41
|
+
parser.add_argument(
|
|
42
|
+
"--url-strategy",
|
|
43
|
+
choices=["auto", "download", "browser"],
|
|
44
|
+
default="auto",
|
|
45
|
+
help="URL conversion strategy. Web URL path tries Accept:text/markdown first, then browser-rendering fallback.",
|
|
46
|
+
)
|
|
47
|
+
parser.add_argument("-o", "--output", help="Output markdown file path. Defaults to stdout.")
|
|
48
|
+
return parser.parse_args()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _resolve_credential(primary: str | None, env_name: str) -> str:
|
|
52
|
+
value = (primary or os.getenv(env_name, "")).strip()
|
|
53
|
+
if not value:
|
|
54
|
+
raise SystemExit(f"Missing credential: use flag or set {env_name}.")
|
|
55
|
+
return value
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _is_remote_url(value: str) -> bool:
    """Tell whether ``value`` is an http(s) URL with a network location.

    Args:
        value: Raw CLI input, either a URL or a filesystem path.

    Returns:
        ``True`` only when the scheme is ``http`` or ``https`` and a netloc
        is present. Input that ``urlparse`` rejects is reported as ``False``.
    """
    try:
        parsed = urlparse(value)
    except ValueError:
        # urlparse raises on malformed netlocs (e.g. an unbalanced "[");
        # such input cannot be a usable URL.
        return False
    return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _emit_progress(message: str) -> None:
    """Write one ``[anything2md]``-prefixed progress line to stderr and flush."""
    print("[anything2md]", message, file=sys.stderr, flush=True)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def main() -> None:
    """Entry point of the ``anything2md`` command.

    Parses the command line, resolves the Cloudflare credentials, converts
    the given URL or local file and writes the Markdown either to the
    ``--output`` file (printing its path) or to stdout.

    Raises:
        SystemExit: On missing credentials, on any ``Anything2MDError``
            raised by the conversion (status 1, message on stderr), when the
            converter returns a list instead of a single result, or when the
            output file cannot be written (status 1).
    """
    args = parse_args()
    account_id = _resolve_credential(args.account_id, "CLOUDFLARE_ACCOUNT_ID")
    api_token = _resolve_credential(args.api_token, "CLOUDFLARE_API_TOKEN")
    options = ConvertOptions(
        timeout=args.timeout,
        max_retry_count=args.max_retry_count,
        retry_base_delay=args.retry_base_delay,
    )
    progress_callback = _emit_progress if args.verbose else None

    # URLs are passed through as strings; anything else is a local path.
    local_or_remote_input: str | Path
    if _is_remote_url(args.input):
        local_or_remote_input = args.input
    else:
        local_or_remote_input = Path(args.input).expanduser()

    converter = MarkdownConverter(account_id=account_id, api_token=api_token, options=options)
    try:
        result = converter.convert(
            local_or_remote_input,
            url_strategy=args.url_strategy,
            progress_callback=progress_callback,
        )
        if isinstance(result, list):
            raise SystemExit("CLI currently supports single-item conversion only.")
    except Anything2MDError as exc:
        print(str(exc), file=sys.stderr)
        raise SystemExit(1) from exc
    finally:
        # Runs on success, on conversion errors and on SystemExit alike.
        converter.close()

    if args.output:
        output_path = Path(args.output).expanduser()
        try:
            output_path.write_text(result.markdown, encoding="utf-8")
        except OSError as exc:
            # Report an unwritable destination like any other CLI failure
            # instead of letting a traceback escape.
            print(f"Cannot write output file {output_path}: {exc}", file=sys.stderr)
            raise SystemExit(1) from exc
        print(str(output_path))
        return

    # Terminate stdout output with exactly one newline.
    print(result.markdown, end="" if result.markdown.endswith("\n") else "\n")
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
if __name__ == "__main__":
|
|
110
|
+
main()
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from typing import Any
|
|
5
|
+
from typing import Sequence
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
from .config import CloudflareCredentials, ConvertOptions
|
|
10
|
+
from .errors import APIError, HTTPError, InvalidResponseError, NetworkError, UnsupportedFormatError
|
|
11
|
+
from .formats import from_filename
|
|
12
|
+
from .models import ConversionResult, SupportedFormatInfo
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
_RETRYABLE_NETWORK_EXCEPTIONS = (
|
|
16
|
+
httpx.ConnectError,
|
|
17
|
+
httpx.ReadError,
|
|
18
|
+
httpx.WriteError,
|
|
19
|
+
httpx.ConnectTimeout,
|
|
20
|
+
httpx.ReadTimeout,
|
|
21
|
+
httpx.WriteTimeout,
|
|
22
|
+
httpx.PoolTimeout,
|
|
23
|
+
httpx.RemoteProtocolError,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class CloudflareClient:
    """HTTP client for the Cloudflare account endpoints used by this package.

    Every call goes through ``_request_with_retry``, which attaches the
    bearer token from ``credentials`` and retries according to ``options``.
    The client can be used as a context manager; leaving the ``with`` block
    calls ``close()``.
    """

    _API_ROOT = "https://api.cloudflare.com/client/v4/accounts"

    def __init__(
        self,
        credentials: CloudflareCredentials,
        options: ConvertOptions = ConvertOptions(),
        session: httpx.Client | None = None,
    ) -> None:
        """Store the configuration and create an ``httpx.Client`` if none is given.

        Args:
            credentials: Account ID and API token used for every request.
            options: Timeout and retry settings.
            session: Existing client to send requests with; when falsy a new
                ``httpx.Client`` with ``options.timeout`` is created.
        """
        self.credentials = credentials
        self.options = options
        self._session = session or httpx.Client(timeout=options.timeout)

    def __enter__(self) -> CloudflareClient:
        """Return the client itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type: Any, exc: Any, tb: Any) -> None:
        """Close the underlying session when the ``with`` block ends."""
        self.close()

    def to_markdown(self, files: Sequence[tuple[bytes, str]]) -> list[ConversionResult]:
        """Upload ``(data, filename)`` pairs to ``ai/tomarkdown`` and parse the results.

        Args:
            files: Pairs of raw file content and filename. The filename is
                looked up with ``from_filename`` to obtain the MIME type.

        Returns:
            One ``ConversionResult`` per item of the response's ``result``
            list; an empty list, without any request, when ``files`` is empty.

        Raises:
            UnsupportedFormatError: If ``from_filename`` returns ``None`` for
                a filename.
            InvalidResponseError: If the response body is not the expected shape.
            APIError: If the response payload is not marked successful.
            HTTPError: On a non-2xx status after retries.
            NetworkError: On a transport failure after retries.
        """
        if not files:
            return []

        multipart: list[tuple[str, tuple[str, bytes, str]]] = []
        for data, filename in files:
            fmt = from_filename(filename)
            if fmt is None:
                raise UnsupportedFormatError(filename)
            multipart.append(("files", (filename, data, fmt.mime_type)))

        response = self._request_with_retry(
            method="POST",
            endpoint=self._account_endpoint("ai/tomarkdown"),
            files=multipart,
        )
        return self._parse_result_list(response, ConversionResult.from_api_item)

    def supported_formats(self) -> list[SupportedFormatInfo]:
        """Fetch ``ai/tomarkdown/supported`` and parse each entry.

        Returns:
            One ``SupportedFormatInfo`` per item of the response's ``result`` list.

        Raises:
            InvalidResponseError: If the response body is not the expected shape.
            APIError: If the response payload is not marked successful.
            HTTPError: On a non-2xx status after retries.
            NetworkError: On a transport failure after retries.
        """
        response = self._request_with_retry(
            method="GET",
            endpoint=self._account_endpoint("ai/tomarkdown/supported"),
        )
        return self._parse_result_list(response, SupportedFormatInfo.from_api_item)

    def markdown_from_url(self, url: str, **options: Any) -> str:
        """POST ``url`` to ``browser-rendering/markdown`` and return the Markdown.

        Args:
            url: Address sent as the ``url`` field of the JSON body.
            **options: Extra fields merged into the JSON body; a ``url`` key
                here overrides the positional ``url``.

        Returns:
            The ``result`` string of the response payload.

        Raises:
            InvalidResponseError: If ``result`` is missing or not a string,
                or the body is not a JSON object.
            APIError: If the response payload is not marked successful.
            HTTPError: On a non-2xx status after retries.
            NetworkError: On a transport failure after retries.
        """
        payload: dict[str, Any] = {"url": url}
        payload.update(options)

        response = self._request_with_retry(
            method="POST",
            endpoint=self._account_endpoint("browser-rendering/markdown"),
            json=payload,
        )

        decoded = self._decode_success_payload(response)
        result_payload = decoded.get("result")
        if not isinstance(result_payload, str):
            raise InvalidResponseError()
        return result_payload

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self._session.close()

    def _account_endpoint(self, path: str) -> str:
        """Return the account-scoped API URL for ``path``."""
        return f"{self._API_ROOT}/{self.credentials.account_id}/{path}"

    def _parse_result_list(self, response: httpx.Response, build_item: Any) -> list[Any]:
        """Decode ``response`` and map ``build_item`` over its ``result`` list.

        A missing ``result`` key is treated as an empty list.

        Raises:
            InvalidResponseError: If ``result`` is not a list or ``build_item``
                raises ``TypeError``, ``ValueError`` or ``KeyError``.
        """
        payload = self._decode_success_payload(response)
        result_payload = payload.get("result", [])
        if not isinstance(result_payload, list):
            raise InvalidResponseError()

        try:
            return [build_item(item) for item in result_payload]
        except (TypeError, ValueError, KeyError) as exc:
            raise InvalidResponseError() from exc

    def _request_with_retry(self, method: str, endpoint: str, **kwargs: Any) -> httpx.Response:
        """Send an authorised request, retrying retryable failures.

        Retries happen while fewer than ``options.max_retry_count`` attempts
        have been retried, on the exceptions in
        ``_RETRYABLE_NETWORK_EXCEPTIONS`` and on status 429 or >= 500.

        Args:
            method: HTTP method name.
            endpoint: Full request URL.
            **kwargs: Forwarded to ``session.request``. ``headers`` is copied
                before the ``Authorization`` header is added.

        Returns:
            The first response with a 2xx status.

        Raises:
            NetworkError: On a transport error that is not retried.
            HTTPError: On a non-2xx status that is not retried.
        """
        # Copy so the bearer token is never written into a caller-owned dict.
        headers = dict(kwargs.pop("headers", None) or {})
        headers["Authorization"] = f"Bearer {self.credentials.api_token}"

        attempt = 0
        while True:
            try:
                response = self._session.request(
                    method=method,
                    url=endpoint,
                    headers=headers,
                    timeout=self.options.timeout,
                    **kwargs,
                )
            except _RETRYABLE_NETWORK_EXCEPTIONS as exc:
                if self._should_retry_network(attempt):
                    self._sleep_before_retry(attempt)
                    attempt += 1
                    continue
                raise NetworkError(exc) from exc
            except httpx.HTTPError as exc:
                # Any other httpx error is reported immediately, without retry.
                raise NetworkError(exc) from exc

            if not 200 <= response.status_code < 300:
                if self._should_retry_status(response.status_code, attempt):
                    self._sleep_before_retry(attempt)
                    attempt += 1
                    continue
                raise HTTPError(response.status_code, response.text)
            return response

    def _decode_success_payload(self, response: httpx.Response) -> dict[str, Any]:
        """Return the JSON object of ``response`` after checking ``success``.

        Raises:
            InvalidResponseError: If the body is not JSON or not a JSON object.
            APIError: If ``success`` is missing or falsy; carries the messages
                collected by ``_extract_messages``.
        """
        try:
            payload = response.json()
        except ValueError as exc:
            raise InvalidResponseError() from exc

        if not isinstance(payload, dict):
            raise InvalidResponseError()

        success = bool(payload.get("success", False))
        if not success:
            messages = self._extract_messages(payload)
            raise APIError(messages)
        return payload

    def _should_retry_status(self, status_code: int, attempt: int) -> bool:
        """Return whether a 429 or >= 500 status may be retried at ``attempt``."""
        return attempt < self.options.max_retry_count and (
            status_code == 429 or status_code >= 500
        )

    def _should_retry_network(self, attempt: int) -> bool:
        """Return whether another attempt is allowed after a network error."""
        return attempt < self.options.max_retry_count

    def _sleep_before_retry(self, attempt: int) -> None:
        """Sleep ``retry_base_delay * 2**attempt`` seconds; skip if the base is <= 0."""
        base = max(0.0, self.options.retry_base_delay)
        if base <= 0:
            return
        time.sleep(base * (2**attempt))

    @staticmethod
    def _extract_messages(payload: dict) -> list[str]:
        """Collect message strings from the ``errors`` and ``messages`` lists.

        Entries may be plain strings or dicts with a string ``message`` key;
        anything else is ignored.
        """
        values: list[str] = []
        for key in ("errors", "messages"):
            entries = payload.get(key, [])
            if not isinstance(entries, list):
                continue
            for item in entries:
                if isinstance(item, str):
                    values.append(item)
                elif isinstance(item, dict):
                    message = item.get("message")
                    if isinstance(message, str):
                        values.append(message)
        return values
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass(frozen=True)
class CloudflareCredentials:
    """Cloudflare credentials required for Workers AI requests.

    The instance is immutable. Its ``repr`` masks ``api_token`` so the secret
    does not end up in logs or tracebacks; equality and hashing still use
    both fields.
    """

    # Cloudflare account identifier.
    account_id: str
    # API token for the account.
    api_token: str

    def __repr__(self) -> str:
        """Return a representation with the API token masked."""
        # An explicitly defined __repr__ is kept by @dataclass.
        return f"{type(self).__name__}(account_id={self.account_id!r}, api_token='***')"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class ConvertOptions:
    """Immutable timeout and retry settings for conversion requests."""

    # Request timeout in seconds.
    timeout: float = 60.0
    # Retry attempt limit; negative values are raised to 0 on construction.
    max_retry_count: int = 2
    # Base delay in seconds between retries.
    retry_base_delay: float = 1.0

    def __post_init__(self) -> None:
        """Clamp ``max_retry_count`` so that it is never negative."""
        clamped = max(0, self.max_retry_count)
        # The dataclass is frozen, so go through object.__setattr__.
        object.__setattr__(self, "max_retry_count", clamped)
|