toa 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
toa/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ from .__version__ import __version__
2
+ from ._base_converter import BaseConverter, ConversionResult, ConverterInfo
3
+ from ._error import ConversionFailedError, ConverterNotFoundError, UnsupportedError
4
+ from ._toa import Toa
5
+ from .utils import InputInfo, OutputInfo
6
+
7
+ __all__ = [
8
+ "__version__",
9
+ "Toa",
10
+ "BaseConverter",
11
+ "ConversionResult",
12
+ "ConverterInfo",
13
+ "ConversionFailedError",
14
+ "ConverterNotFoundError",
15
+ "UnsupportedError",
16
+ "InputInfo",
17
+ "OutputInfo",
18
+ ]
toa/__main__.py ADDED
@@ -0,0 +1,104 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ from ._toa import Toa
7
+ from .utils import InputInfo, OutputInfo
8
+
9
+
10
def main() -> None:
    """Command-line entry point: convert one input and emit the result.

    Parses the command line, builds the ``InputInfo`` / ``OutputInfo``
    descriptors from the flags, converts either the positional source or
    standard input, and writes the converted content to ``--output`` or to
    standard output.

    With ``--list-converters`` the converters reported by
    ``Toa.list_converters()`` are printed and the process exits with status 0
    without converting anything.
    """
    parser = argparse.ArgumentParser(
        prog="toa",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # basic
  %(prog)s document.pdf

  # output content to filename
  %(prog)s document.pdf -o pdf.html

  # fetch remote resource
  %(prog)s https://example.org/html

  # Enable advanced features
  %(prog)s readme.md --mermaid --formula --highlight

  # Output to standard output
  cat pdf.pdf | %(prog)s
""",
    )

    parser.add_argument('file', nargs="?", help="Input source")
    # input options
    input_group = parser.add_argument_group(title="Input options")
    input_group.add_argument("-e", "--extension", help="File extension (e.g., .pdf, .mp3)")
    input_group.add_argument("-m", "--mime-type", help="MIME type of the input file (e.g., application/pdf, audio/mpeg). If the `--extension` is already specified, the MIME type does not need to be specified.")
    input_group.add_argument("--is-text", action="store_true", help="The file is a text file")
    input_group.add_argument("--encoding", help="Only text files are supported, character set for non-text files is meaningless (default: utf-8)")
    # output options
    output_group = parser.add_argument_group(title="Output options")
    output_group.add_argument("-o", "--output", help="Output file (default: stdout)")
    output_group.add_argument("--engine", help="Engine to use")
    output_group.add_argument("--wrapper", help="Method name used to wrap HTML content (a built-in wrapper named `default`)")
    output_group.add_argument("--mermaid", action="store_true", help="Enable Mermaid diagram rendering")
    output_group.add_argument("--highlight", action="store_true", help="Enable highlight")
    output_group.add_argument("--formula", action="store_true", help="Enable mathematical formula rendering")
    output_group.add_argument("--source-url", help="Source file address. An address accessible on a webpage. This parameter is required for some file types, such as PDF, MP3, etc.")
    output_group.add_argument("--list-converters", action="store_true", help="List converter description information")

    args = parser.parse_args()

    toa = Toa()

    if args.list_converters:
        # Iterate the converters directly; no intermediate list is needed.
        for item in toa.list_converters():
            print(f" - {item.name}: {item.description}")
        sys.exit(0)

    # When neither an extension nor a MIME type identifies the stream, the
    # text flag and character set are meaningless on their own, so clear them
    # as well; the descriptor built below then carries no information.
    if args.extension is None and args.mime_type is None:
        args.is_text = None
        args.encoding = None

    input_info = InputInfo(
        extension=args.extension,
        mime_type=args.mime_type,
        is_text=args.is_text,
        encoding=args.encoding,
    )
    output_info = OutputInfo(
        wrapper=args.wrapper,
        mermaid=args.mermaid,
        highlight=args.highlight,
        formula=args.formula,
        engine=args.engine,
        filename=args.output,
        source_url=args.source_url,
    )

    # Pass None instead of a descriptor that reports itself as empty.
    effective_input = input_info if input_info and not input_info.is_empty else None

    if args.file is None or args.file == "-":
        # No positional source (or an explicit "-"): read the bytes from stdin.
        result = toa.convert_stdin_stream(
            sys.stdin.buffer,
            input_info=effective_input,
            output_info=output_info,
        )
    else:
        result = toa.convert(
            args.file,
            input_info=effective_input,
            output_info=output_info,
        )

    if args.output:
        # The output file reuses --encoding when one is in effect, else UTF-8.
        Path(args.output).write_text(result.content, encoding=args.encoding or "utf-8")
    else:
        print(result.content)
101
+
102
+
103
+ if __name__ == "__main__":
104
+ main()
toa/__mcp__.py ADDED
@@ -0,0 +1,134 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import contextlib
4
+ from collections.abc import AsyncIterator
5
+
6
+ import uvicorn
7
+ from mcp.server import FastMCP
8
+ from mcp.server.sse import SseServerTransport
9
+ from mcp.server.streamable_http_manager import StreamableHTTPSessionManager
10
+ from mcp.server.websocket import websocket_server
11
+ from starlette.applications import Starlette
12
+ from starlette.requests import Request
13
+ from starlette.responses import Response
14
+ from starlette.routing import Mount, Route, WebSocketRoute
15
+ from starlette.websockets import WebSocket
16
+
17
+ from toa import Toa
18
+
19
+ _converter = Toa()
20
+ mcp = FastMCP("toa-mcp")
21
+
22
@mcp.tool()
async def convert_to_html(uri: str) -> str:
    """Convert the resource at ``uri`` and return the converted content.

    The conversion is delegated to the module-level ``Toa`` instance; the
    ``content`` of its result is returned as-is.
    """
    # NOTE(review): ``convert`` is called synchronously inside this coroutine.
    result = _converter.convert(uri)
    return result.content
25
+
26
@mcp.tool()
async def convert_html2md(html_content: str) -> str:
    """Pass ``html_content`` to ``Toa.html2md`` and return what it produces."""
    markdown = Toa.html2md(html_content)
    return markdown
29
+
30
+
31
def main() -> None:
    """Start the MCP server over HTTP/SSE/WebSocket or STDIO.

    With ``--transport`` set to ``http``, ``sse`` or ``ws`` a single Starlette
    application exposing all web endpoints (``/mcp``, ``/sse``, ``/messages/``,
    ``/ws`` and ``/ping``) is served by uvicorn on ``--host``/``--port``.
    With the default ``stdio`` transport the FastMCP server's own ``run()`` is
    used instead.
    """
    parser = argparse.ArgumentParser(
        prog="toa-mcp",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # web
  %(prog)s --transport http

  # stdio
  %(prog)s
""",
    )

    parser.add_argument("--transport", default="stdio", choices=("http", "sse", "ws", "stdio"), help="The transport to use to run the service")
    parser.add_argument("--host", default="0.0.0.0", help="Which host to bind to - default is 0.0.0.0")
    # type=int: without it a port given on the command line stays a string,
    # which is then handed to uvicorn and compared against the integer 443.
    parser.add_argument("--port", type=int, default=56156, help="Port to listen on")

    args = parser.parse_args()

    if str(args.transport).lower() in ("http", "sse", "ws"):
        mcp_server = mcp._mcp_server

        sse = SseServerTransport("/messages/")

        async def handle_sse(request: Request):
            """Run the MCP server over one SSE connection."""
            async with sse.connect_sse(
                request.scope, request.receive, request._send
            ) as (read_stream, write_stream):
                await mcp_server.run(
                    read_stream, write_stream, mcp_server.create_initialization_options()
                )
            return Response()

        async def handle_ws(websocket: WebSocket):
            """Run the MCP server over one WebSocket connection."""
            async with websocket_server(
                websocket.scope, websocket.receive, websocket.send
            ) as (read_stream, write_stream):
                await mcp_server.run(
                    read_stream, write_stream, mcp_server.create_initialization_options()
                )

        session_manager = StreamableHTTPSessionManager(
            app=mcp_server,
            event_store=None,
            json_response=True,
            stateless=True,
        )

        @contextlib.asynccontextmanager
        async def lifespan(app: Starlette) -> AsyncIterator[None]:
            """Manage the lifecycle of the session manager; each request has a corresponding session."""
            async with session_manager.run():
                origin = get_origin(args.host, args.port)
                print(f"{'':=^60}")
                print(" MCP Server is running")
                print()
                print(f" - Streamable HTTP endpoint: {origin}/mcp")
                print(f" - Message endpoint: {origin}/messages/")
                print(f" - SSE endpoint: {origin}/sse")
                print(f" - WebSocket endpoint: {origin}/ws")
                print(f"{'':=^60}")
                yield
                print("MCP Server is closing...")

        app = Starlette(
            routes=[
                Route("/sse", endpoint=handle_sse, methods=["GET", "POST"]),
                Mount("/messages/", app=sse.handle_post_message),
                Mount("/mcp", app=session_manager.handle_request),
                WebSocketRoute("/ws", endpoint=handle_ws),
                Route(
                    "/ping",
                    endpoint=lambda request: Response("pong"),
                    methods=["GET", "POST"],
                ),
            ],
            lifespan=lifespan,
        )

        uvicorn.run(app, host=args.host, port=args.port)
    else:
        mcp.run()
114
+
115
+
116
def get_origin(host: str, port: int) -> str:
    """Build the ``scheme://host:port`` origin for the given host and port.

    Args:
        host: A bare host name or IP address (``"0.0.0.0"``, ``"::1"``),
            optionally followed by ``:port``, or a full ``scheme://...`` URL
            whose scheme is then reused.
        port: Port to place in the origin; it replaces any port embedded in
            ``host``.  A numeric string is accepted as well.

    Returns:
        The origin string.  For a bare host the scheme is ``https`` when the
        port is 443 and ``http`` otherwise.  IPv6 literals are wrapped in
        square brackets.
    """
    from urllib.parse import urlparse, urlunparse

    # Normalise first so the 443 comparison also works for a string port.
    port = int(port)

    # Only treat the value as a URL when it really has "scheme://netloc";
    # urlparse() would otherwise read "localhost:9000" as scheme "localhost".
    parsed = urlparse(host) if "://" in host else None
    if parsed is not None and parsed.scheme and parsed.netloc:
        scheme = parsed.scheme
        netloc = parsed.netloc
    else:
        scheme = "https" if port == 443 else "http"
        netloc = host

    # Drop an embedded port without mangling IPv6 literals.
    if netloc.startswith("["):
        # "[::1]" or "[::1]:9000" -> "::1"
        hostname = netloc[1:].partition("]")[0]
    elif netloc.count(":") == 1:
        # "name:9000" -> "name"
        hostname = netloc.partition(":")[0]
    else:
        # Plain name/IPv4, or an unbracketed IPv6 literal such as "::1".
        hostname = netloc

    if ":" in hostname:
        hostname = f"[{hostname}]"

    return urlunparse((scheme, f"{hostname}:{port}", "", "", "", ""))
131
+
132
+
133
+ if __name__ == "__main__":
134
+ main()
toa/__version__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
toa/_base_converter.py ADDED
@@ -0,0 +1,87 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass
3
+ from typing import BinaryIO
4
+
5
+ from .utils import InputInfo, OutputInfo
6
+
7
+
8
class ConversionResult:
    """Container for the output of a conversion."""

    def __init__(self, content: str) -> None:
        # Stored unchanged and exposed as the ``content`` attribute.
        self.content = content
18
+
19
+ @dataclass(kw_only=True, frozen=True)
20
+ class ConverterInfo:
21
+ name: str
22
+ description: str | None
23
+
24
+
25
@dataclass
class BaseConverter(ABC):
    """Abstract interface implemented by every converter.

    Concrete subclasses provide ``check_dependencies``, ``suffixes``,
    ``mime_types`` and ``convert``.  Equality and hashing are based on the
    concrete class together with its ``suffixes``; ``converter["name"]`` is a
    shorthand for attribute access.
    """

    def __init__(self) -> None:
        super().__init__()

    @property
    @abstractmethod
    def check_dependencies(self) -> bool:
        """
        Whether the converter's external dependencies are in order.

        ``True`` means nothing external is missing; otherwise the conversion
        is expected to raise an error.
        """
        raise NotImplementedError("Subclasses must implement the property to check dependencies.")

    @property
    @abstractmethod
    def suffixes(self) -> list[str]:
        """Supported file extensions, e.g. ``['.txt', '.md']``."""
        raise NotImplementedError("file extensions are supported for conversion")

    @property
    @abstractmethod
    def mime_types(self) -> list[str]:
        """Supported MIME types, e.g. ``['text/markdown', 'application/pdf']``."""
        raise NotImplementedError("MIME types are supported for conversion")

    @abstractmethod
    def convert(
        self,
        input_stream: BinaryIO,
        input_info: InputInfo,
        output_info: OutputInfo | None = None,
        **kwargs,
    ) -> ConversionResult:
        """
        Perform the conversion; subclasses must override this.

        Args:
            input_stream: Binary stream to read the input from (paths, file
                streams and remote addresses are turned into such a stream).
            input_info: Information about the input stream.
            output_info: Information about the desired output.
        """
        raise NotImplementedError("Subclasses must implement the convert method.")

    def __repr__(self):
        class_name = self.__class__.__name__
        return f"<{class_name} suffixes={self.suffixes}>"

    def __eq__(self, other):
        # Identity short-circuits; otherwise both the concrete class and the
        # supported suffixes have to match.
        if self is other:
            return True
        return self.__class__ == other.__class__ and self.suffixes == other.suffixes

    def __hash__(self):
        # Hash the same pair that __eq__ compares.
        identity = (self.__class__, tuple(self.suffixes))
        return hash(identity)

    def __getitem__(self, item: str):
        # Subscript access is forwarded to attribute lookup.
        return getattr(self, item)
toa/_error.py ADDED
@@ -0,0 +1,10 @@
1
class ConverterNotFoundError(Exception):
    """Exception type for the "converter not found" condition; adds nothing to ``Exception``."""
3
+
4
+
5
class ConversionFailedError(Exception):
    """Exception type for the "conversion failed" condition; adds nothing to ``Exception``."""
7
+
8
+
9
class UnsupportedError(Exception):
    """Exception type for the "unsupported" condition; adds nothing to ``Exception``."""