toa 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toa/__init__.py +18 -0
- toa/__main__.py +104 -0
- toa/__mcp__.py +134 -0
- toa/__version__.py +1 -0
- toa/_base_converter.py +87 -0
- toa/_error.py +10 -0
- toa/_toa.py +510 -0
- toa/converters/__init__.py +33 -0
- toa/converters/_audio.py +86 -0
- toa/converters/_csv.py +63 -0
- toa/converters/_docx.py +74 -0
- toa/converters/_eml.py +160 -0
- toa/converters/_epub.py +55 -0
- toa/converters/_json.py +109 -0
- toa/converters/_msg.py +122 -0
- toa/converters/_pdf.py +175 -0
- toa/converters/_pptx.py +277 -0
- toa/converters/_rss.py +85 -0
- toa/converters/_text.py +56 -0
- toa/converters/_video.py +412 -0
- toa/converters/_xls.py +120 -0
- toa/converters/_xlsx.py +98 -0
- toa/converters/_xml.py +67 -0
- toa/engines/__init__.py +30 -0
- toa/engines/_code.py +35 -0
- toa/engines/_epub.py +448 -0
- toa/engines/_ftp.py +249 -0
- toa/engines/_html.py +32 -0
- toa/engines/_http.py +79 -0
- toa/engines/_msg.py +499 -0
- toa/engines/_ocr.py +141 -0
- toa/engines/_pdf.py +725 -0
- toa/engines/_vosk.py +321 -0
- toa/engines/_websocket.py +270 -0
- toa/engines/_xml.py +283 -0
- toa/templates/__init__.py +17 -0
- toa/templates/_default.py +145 -0
- toa/utils/__init__.py +13 -0
- toa/utils/_dataclass.py +33 -0
- toa/utils/_stream.py +228 -0
- toa/utils/_uri.py +180 -0
- toa-0.1.0.dist-info/METADATA +485 -0
- toa-0.1.0.dist-info/RECORD +46 -0
- toa-0.1.0.dist-info/WHEEL +4 -0
- toa-0.1.0.dist-info/entry_points.txt +3 -0
- toa-0.1.0.dist-info/licenses/LICENSE +201 -0
toa/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from .__version__ import __version__
|
|
2
|
+
from ._base_converter import BaseConverter, ConversionResult, ConverterInfo
|
|
3
|
+
from ._error import ConversionFailedError, ConverterNotFoundError, UnsupportedError
|
|
4
|
+
from ._toa import Toa
|
|
5
|
+
from .utils import InputInfo, OutputInfo
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"__version__",
|
|
9
|
+
"Toa",
|
|
10
|
+
"BaseConverter",
|
|
11
|
+
"ConversionResult",
|
|
12
|
+
"ConverterInfo",
|
|
13
|
+
"ConversionFailedError",
|
|
14
|
+
"ConverterNotFoundError",
|
|
15
|
+
"UnsupportedError",
|
|
16
|
+
"InputInfo",
|
|
17
|
+
"OutputInfo",
|
|
18
|
+
]
|
toa/__main__.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from ._toa import Toa
|
|
7
|
+
from .utils import InputInfo, OutputInfo
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main() -> None:
    """Command-line entry point: convert one input and emit the result.

    The positional ``file`` argument is handed to ``Toa.convert``; when it is
    omitted or given as ``-`` the bytes of standard input are converted
    instead. The converted content is written to ``--output`` when supplied,
    otherwise printed to standard output. ``--list-converters`` prints one
    line per registered converter and exits with status 0 without converting
    anything.
    """
    parser = argparse.ArgumentParser(
        prog="toa",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# basic
%(prog)s document.pdf

# output content to filename
%(prog)s document.pdf -o pdf.html

# fetch remote resource
%(prog)s https://example.org/html

# Enable advanced features
%(prog)s readme.md --mermaid --formula --highlight

# Output to standard output
cat pdf.pdf | %(prog)s
""",
    )

    parser.add_argument('file', nargs="?", help="Input source")

    # Input options: hints describing the incoming stream.
    input_group = parser.add_argument_group(title="Input options")
    input_group.add_argument(
        "-e", "--extension",
        help="File extension (e.g., .pdf, .mp3)",
    )
    input_group.add_argument(
        "-m", "--mime-type",
        help="MIME type of the input file (e.g., application/pdf, audio/mpeg). If the `--extension` is already specified, the MIME type does not need to be specified.",
    )
    input_group.add_argument(
        "--is-text", action="store_true",
        help="The file is a text file",
    )
    input_group.add_argument(
        "--encoding",
        help="Only text files are supported, character set for non-text files is meaningless (default: utf-8)",
    )

    # Output options: how the generated content is rendered and where it goes.
    output_group = parser.add_argument_group(title="Output options")
    output_group.add_argument(
        "-o", "--output",
        help="Output file (default: stdout)",
    )
    output_group.add_argument("--engine", help="Engine to use")
    output_group.add_argument(
        "--wrapper",
        help="Method name used to wrap HTML content (a built-in wrapper named `default`)",
    )
    output_group.add_argument(
        "--mermaid", action="store_true",
        help="Enable Mermaid diagram rendering",
    )
    output_group.add_argument(
        "--highlight", action="store_true",
        help="Enable highlight",
    )
    output_group.add_argument(
        "--formula", action="store_true",
        help="Enable mathematical formula rendering",
    )
    output_group.add_argument(
        "--source-url",
        help="Source file address. An address accessible on a webpage. This parameter is required for some file types, such as PDF, MP3, etc.",
    )
    output_group.add_argument(
        "--list-converters", action="store_true",
        help="List converter description information",
    )

    args = parser.parse_args()

    toa = Toa()

    if args.list_converters:
        for item in toa.list_converters():
            print(f" - {item.name}: {item.description}")
        sys.exit(0)

    # When neither an extension nor a MIME type was supplied, the text flags
    # are cleared as well: a character set only means something for text
    # input, and without a declared type the stream description is left
    # empty. Note that this also discards an explicitly passed --encoding.
    if args.extension is None and args.mime_type is None:
        args.is_text = None
        args.encoding = None

    input_info = InputInfo(
        extension=args.extension,
        mime_type=args.mime_type,
        is_text=args.is_text,
        encoding=args.encoding,
    )
    output_info = OutputInfo(
        wrapper=args.wrapper,
        mermaid=args.mermaid,
        highlight=args.highlight,
        formula=args.formula,
        engine=args.engine,
        filename=args.output,
        source_url=args.source_url,
    )

    # Hand over no hint at all rather than an empty one.
    input_hint = input_info if input_info and not input_info.is_empty else None

    if args.file is None or args.file == "-":
        # No path given (or the conventional "-"): read the raw bytes of stdin.
        result = toa.convert_stdin_stream(
            sys.stdin.buffer,
            input_info=input_hint,
            output_info=output_info,
        )
    else:
        result = toa.convert(
            args.file,
            input_info=input_hint,
            output_info=output_info,
        )

    if args.output:
        # NOTE(review): the output file is written with the *input* --encoding
        # (falling back to utf-8) — confirm this coupling is intended.
        Path(args.output).write_text(result.content, encoding=args.encoding or "utf-8")
    else:
        print(result.content)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
if __name__ == "__main__":
|
|
104
|
+
main()
|
toa/__mcp__.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import argparse
|
|
3
|
+
import contextlib
|
|
4
|
+
from collections.abc import AsyncIterator
|
|
5
|
+
|
|
6
|
+
import uvicorn
|
|
7
|
+
from mcp.server import FastMCP
|
|
8
|
+
from mcp.server.sse import SseServerTransport
|
|
9
|
+
from mcp.server.streamable_http_manager import StreamableHTTPSessionManager
|
|
10
|
+
from mcp.server.websocket import websocket_server
|
|
11
|
+
from starlette.applications import Starlette
|
|
12
|
+
from starlette.requests import Request
|
|
13
|
+
from starlette.responses import Response
|
|
14
|
+
from starlette.routing import Mount, Route, WebSocketRoute
|
|
15
|
+
from starlette.websockets import WebSocket
|
|
16
|
+
|
|
17
|
+
from toa import Toa
|
|
18
|
+
|
|
19
|
+
_converter = Toa()
|
|
20
|
+
mcp = FastMCP("toa-mcp")
|
|
21
|
+
|
|
22
|
+
@mcp.tool()
async def convert_to_html(uri: str) -> str:
    """Convert the resource identified by `uri` to HTML and return the HTML text.

    Args:
        uri: Location of the input to convert, passed unchanged to the
            converter.
    """
    # The docstring above doubles as the tool description advertised to MCP
    # clients, so it is kept short and free of implementation detail.
    # NOTE(review): the conversion call is synchronous and runs inside this
    # coroutine; a slow conversion presumably stalls other requests — confirm.
    # NOTE(review): `uri` comes straight from the client; verify which
    # locations Toa.convert is willing to read before exposing this server.
    return _converter.convert(uri).content
|
|
25
|
+
|
|
26
|
+
@mcp.tool()
async def convert_html2md(html_content: str) -> str:
    """Convert an HTML string to Markdown and return the Markdown text.

    Args:
        html_content: The HTML markup to convert.
    """
    # The docstring above doubles as the tool description advertised to MCP
    # clients. The work itself is delegated to Toa.html2md, called on the
    # class rather than on the shared converter instance.
    return Toa.html2md(html_content)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def main() -> None:
    """Start the MCP server over HTTP, SSE, WebSocket or STDIO.

    With ``--transport`` set to ``http``, ``sse`` or ``ws`` a single Starlette
    application exposing all three network endpoints (plus ``/ping``) is
    served by uvicorn on ``--host``/``--port``. With the default ``stdio``
    transport the FastMCP instance is run directly.
    """
    parser = argparse.ArgumentParser(
        prog="toa-mcp",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# web
%(prog)s --transport http

# stdio
%(prog)s
""",
    )

    parser.add_argument(
        "--transport",
        default="stdio",
        choices=("http", "sse", "ws", "stdio"),
        help="The transport to use to run the service",
    )
    # NOTE(review): binding to 0.0.0.0 by default makes the server reachable
    # from every network interface — confirm this is the intended default.
    parser.add_argument(
        "--host",
        default="0.0.0.0",
        help="Which host to bind to - default is 0.0.0.0",
    )
    # type=int: without it a port given on the command line arrives as a
    # string, while the default is an int; uvicorn and get_origin both expect
    # an integer.
    parser.add_argument("--port", type=int, default=56156, help="Port to listen on")

    args = parser.parse_args()

    if str(args.transport).lower() in ("http", "sse", "ws"):
        # Low-level server wrapped by FastMCP; the transports below drive it
        # directly.
        mcp_server = mcp._mcp_server

        sse = SseServerTransport("/messages/")

        async def handle_sse(request: Request):
            """Serve one SSE connection until the MCP session ends."""
            async with sse.connect_sse(
                request.scope, request.receive, request._send
            ) as (read_stream, write_stream):
                await mcp_server.run(
                    read_stream, write_stream, mcp_server.create_initialization_options()
                )
            # Starlette expects the endpoint to return a response object.
            return Response()

        async def handle_ws(websocket: WebSocket):
            """Serve one WebSocket connection until the MCP session ends."""
            async with websocket_server(
                websocket.scope, websocket.receive, websocket.send
            ) as (read_stream, write_stream):
                await mcp_server.run(
                    read_stream, write_stream, mcp_server.create_initialization_options()
                )

        session_manager = StreamableHTTPSessionManager(
            app=mcp_server,
            event_store=None,
            json_response=True,
            stateless=True,
        )

        @contextlib.asynccontextmanager
        async def lifespan(app: Starlette) -> AsyncIterator[None]:
            """Manage the lifecycle of the session manager; each request has a corresponding session."""
            async with session_manager.run():
                # Print a banner listing every endpoint this process serves.
                origin = get_origin(args.host, args.port)
                print(f"{'':=^60}")
                print(" MCP Server is running")
                print()
                print(f" - Streamable HTTP endpoint: {origin}/mcp")
                print(f" - Message endpoint: {origin}/messages/")
                print(f" - SSE endpoint: {origin}/sse")
                print(f" - WebSocket endpoint: {origin}/ws")
                print(f"{'':=^60}")
                yield
                print("MCP Server is closing...")

        app = Starlette(
            routes=[
                Route("/sse", endpoint=handle_sse, methods=["GET", "POST"]),
                Mount("/messages/", app=sse.handle_post_message),
                Mount("/mcp", app=session_manager.handle_request),
                WebSocketRoute("/ws", endpoint=handle_ws),
                Route(
                    "/ping",
                    endpoint=lambda request: Response("pong"),
                    methods=["GET", "POST"],
                ),
            ],
            lifespan=lifespan,
        )

        uvicorn.run(app, host=args.host, port=args.port)
    else:
        mcp.run()
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def get_origin(host: str, port: int) -> str:
    """Return the ``scheme://host:port`` origin for a bind host and port.

    Args:
        host: A bare host name or IP address (``"0.0.0.0"``, ``"::1"``), a
            ``host:port`` pair, or a full URL containing ``://``. Any port
            embedded in ``host`` is discarded in favour of ``port``.
        port: Port to place in the result. A numeric string is accepted too.

    Returns:
        The origin string. The scheme is taken from ``host`` when it is a
        URL; otherwise it is ``https`` for port 443 and ``http`` for every
        other port. IPv6 literals are wrapped in square brackets.
    """
    from urllib.parse import urlparse

    # Only trust urlparse when the value really is a URL: a bare
    # "localhost:8000" would otherwise be read as scheme "localhost".
    parsed = urlparse(host) if "://" in host else None

    if parsed is not None and parsed.scheme:
        scheme = parsed.scheme
        # .hostname drops any port and the brackets of an IPv6 literal.
        hostname = parsed.hostname or ""
    else:
        # Compare as text so a port that arrived as a string still matches.
        scheme = "https" if str(port) == "443" else "http"
        hostname = host
        if hostname.startswith("["):
            # "[::1]" or "[::1]:8000" -> "::1"
            hostname = hostname[1:].partition("]")[0]
        elif hostname.count(":") == 1:
            # "name:port" -> "name"; several colons mean a bare IPv6 literal.
            hostname = hostname.partition(":")[0]

    if ":" in hostname:
        # An IPv6 literal needs brackets so its colons are not read as the
        # port separator.
        hostname = f"[{hostname}]"

    return f"{scheme}://{hostname}:{port}"
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
if __name__ == "__main__":
|
|
134
|
+
main()
|
toa/__version__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
toa/_base_converter.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import BinaryIO
|
|
4
|
+
|
|
5
|
+
from .utils import InputInfo, OutputInfo
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ConversionResult:
    """Container for the output of a conversion.

    Attributes are set directly from the constructor argument; no validation
    or copying is performed.
    """

    def __init__(self, content: str) -> None:
        # The converted text, stored exactly as supplied by the converter.
        self.content = content
|
|
18
|
+
|
|
19
|
+
@dataclass(kw_only=True, frozen=True)
|
|
20
|
+
class ConverterInfo:
|
|
21
|
+
name: str
|
|
22
|
+
description: str | None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class BaseConverter(ABC):
    """Abstract base class that every converter implements.

    Subclasses provide ``check_dependencies``, ``suffixes`` and ``mime_types``
    as properties and implement ``convert``. Two converters compare equal, and
    hash alike, when they are of exactly the same class and report equal
    ``suffixes``.
    """

    def __init__(self) -> None:
        super().__init__()

    @property
    @abstractmethod
    def check_dependencies(self) -> bool:
        """
        Whether the converter can run with the dependencies currently available.

        Returns True if there are no unmet external dependencies; otherwise
        the conversion is expected to raise an error.
        """
        raise NotImplementedError(
            "Subclasses must implement the property to check dependencies."
        )

    @property
    @abstractmethod
    def suffixes(self) -> list[str]:
        """File extensions supported for conversion (format: ['.txt', '.md'])."""
        raise NotImplementedError("file extensions are supported for conversion")

    @property
    @abstractmethod
    def mime_types(self) -> list[str]:
        """
        MIME types supported for conversion.

        The format is ['text/markdown', 'application/pdf'].
        """
        raise NotImplementedError("MIME types are supported for conversion")

    @abstractmethod
    def convert(
        self,
        input_stream: BinaryIO,
        input_info: InputInfo,
        output_info: OutputInfo | None = None,
        **kwargs,
    ) -> ConversionResult:
        """
        Run the conversion; the core logic must be overridden by the subclass.

        Args:
            input_stream: Binary stream holding the input. Paths, file streams
                and remote addresses are turned into such a stream before
                this method is called.
            input_info: stream information
            output_info: output information
            **kwargs: Extra converter-specific options.

        Returns:
            The conversion result.
        """
        raise NotImplementedError("Subclasses must implement the convert method.")

    def __repr__(self):
        return f"<{self.__class__.__name__} suffixes={self.suffixes}>"

    def __eq__(self, other: object):
        if self is other:
            return True
        if self.__class__ != other.__class__:
            # Let Python try the reflected comparison instead of declaring
            # inequality outright; the final answer for unrelated types is
            # still False.
            return NotImplemented
        return self.suffixes == other.suffixes

    def __hash__(self):
        # Kept consistent with __eq__: same class and same suffixes.
        return hash((self.__class__, tuple(self.suffixes)))

    def __getitem__(self, item: str):
        # Mapping-style access to attributes: converter["suffixes"].
        # A missing name raises AttributeError, not KeyError.
        return getattr(self, item)
|