markmaton 0.1.4__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
markmaton/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ """markmaton package."""
2
+
3
+ from .engine import convert_html, discover_engine
4
+ from .models import ConvertOptions, ConvertRequest, ConvertResponse
5
+
6
+ __all__ = [
7
+ "__version__",
8
+ "ConvertOptions",
9
+ "ConvertRequest",
10
+ "ConvertResponse",
11
+ "convert_html",
12
+ "discover_engine",
13
+ ]
14
+
15
# Resolve the version from the installed distribution metadata so that it
# cannot drift from the released package version (the hard-coded value here
# used to say "0.1.0" while the distribution metadata said 0.1.4).
from importlib.metadata import PackageNotFoundError as _PackageNotFoundError
from importlib.metadata import version as _distribution_version

try:
    __version__ = _distribution_version("markmaton")
except _PackageNotFoundError:
    # No installed distribution metadata is available (for example when the
    # package directory is imported straight from a checkout).
    __version__ = "0.1.4"
Binary file
markmaton/cli.py ADDED
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from .engine import convert_html
9
+ from .models import ConvertOptions, ConvertRequest
10
+
11
+
12
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``markmaton`` program.

    The parser exposes a single, required sub-command named ``convert``
    (stored on the namespace as ``command``) together with its options.

    Returns:
        The fully configured top-level ``argparse.ArgumentParser``.
    """
    root = argparse.ArgumentParser(prog="markmaton")
    commands = root.add_subparsers(dest="command", required=True)
    convert = commands.add_parser("convert", help="Convert HTML into Markdown")

    # Input location and parsing-context hints.
    convert.add_argument("--html-file", type=Path, help="Path to an HTML file")
    for flag, description in (
        ("--url", "Source URL used as parsing context"),
        ("--final-url", "Final URL after redirects"),
        ("--content-type", "Optional content type hint"),
    ):
        convert.add_argument(flag, help=description)

    # Output shape and cleaning mode.
    convert.add_argument(
        "--output-format",
        choices=("json", "markdown"),
        default="json",
        help="Choose between full JSON output or markdown only",
    )
    convert.add_argument(
        "--full-content",
        action="store_true",
        help="Disable main-content-only cleaning",
    )

    # Repeatable selector options; each one gets its own fresh default list.
    for flag, description in (
        ("--include-selector", "CSS selector to force-include before conversion"),
        ("--exclude-selector", "CSS selector to remove before conversion"),
    ):
        convert.add_argument(flag, action="append", default=[], help=description)

    return root
46
+
47
+
48
def main(argv: list[str] | None = None) -> int:
    """Run the ``markmaton`` command-line interface.

    Parses the arguments, reads the HTML input, converts it through
    ``convert_html`` and writes the result to standard output, either as the
    Markdown text alone or as a single JSON document.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` is passed
            through unchanged.

    Returns:
        ``0`` once the output has been written.
    """
    cli = build_parser()
    ns = cli.parse_args(argv)

    if ns.command != "convert":
        cli.error("unknown command")

    source_html = _read_html(ns.html_file)
    options = ConvertOptions(
        only_main_content=not ns.full_content,
        include_selectors=list(ns.include_selector),
        exclude_selectors=list(ns.exclude_selector),
    )
    result = convert_html(
        ConvertRequest(
            html=source_html,
            url=ns.url,
            final_url=ns.final_url,
            content_type=ns.content_type,
            options=options,
        )
    )

    if ns.output_format != "markdown":
        # Full JSON document, always terminated by a newline.
        document = {
            "markdown": result.markdown,
            "html_clean": result.html_clean,
            "metadata": result.metadata.__dict__,
            "links": result.links,
            "images": result.images,
            "quality": result.quality.__dict__,
        }
        sys.stdout.write(json.dumps(document, ensure_ascii=False))
        sys.stdout.write("\n")
        return 0

    # Markdown only: add a trailing newline unless the text is empty or
    # already ends with one.
    sys.stdout.write(result.markdown)
    needs_newline = bool(result.markdown) and not result.markdown.endswith("\n")
    if needs_newline:
        sys.stdout.write("\n")
    return 0
90
+
91
+
92
+ def _read_html(path: Path | None) -> str:
93
+ if path is None:
94
+ return sys.stdin.read()
95
+ return path.read_text(encoding="utf-8")
96
+
97
+
98
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    sys.exit(main())
markmaton/engine.py ADDED
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import platform
6
+ import shutil
7
+ import subprocess
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+ from .models import ConvertRequest, ConvertResponse
12
+
13
+
14
class EngineNotFoundError(RuntimeError):
    """Raised when no candidate location holds the markmaton engine binary."""
16
+
17
+
18
def convert_html(request: ConvertRequest, binary_path: Optional[str] = None) -> ConvertResponse:
    """Convert HTML by running the markmaton engine binary as a subprocess.

    The request payload is serialised to JSON and written to the engine's
    standard input; the engine's standard output is parsed as JSON and turned
    into a ``ConvertResponse``.

    Args:
        request: The conversion request to send to the engine.
        binary_path: Optional explicit engine location, forwarded to
            ``discover_engine``.

    Returns:
        The response built from the engine's JSON output.

    Raises:
        EngineNotFoundError: Propagated from ``discover_engine`` when no
            engine binary is found.
        RuntimeError: If the engine exits with a non-zero status. The message
            is the engine's stderr, else its stdout, else a generic text.
        json.JSONDecodeError: If the engine's stdout is not valid JSON.
    """
    engine = discover_engine(binary_path)
    completed = subprocess.run(
        [str(engine)],
        input=json.dumps(request.to_payload()),
        capture_output=True,
        text=True,
        # Exchange JSON as UTF-8 explicitly. Without this, text mode decodes
        # the engine's output with the locale encoding, which on Windows is
        # typically a legacy code page and corrupts (or fails on) non-ASCII
        # Markdown and metadata.
        encoding="utf-8",
        check=False,
    )
    if completed.returncode != 0:
        message = completed.stderr.strip() or completed.stdout.strip() or "markmaton engine failed"
        raise RuntimeError(message)

    payload = json.loads(completed.stdout)
    return ConvertResponse.from_dict(payload)
33
+
34
+
35
def discover_engine(explicit_path: Optional[str] = None) -> Path:
    """Locate the markmaton engine binary.

    Candidates are checked in this order and the first one that is an
    existing file wins:

    1. ``explicit_path``, when given and non-empty;
    2. the ``MARKMATON_ENGINE`` environment variable, when set and non-empty;
    3. ``bin/<binary>`` inside this package's directory;
    4. ``bin/<binary>`` inside the parent of this package's directory;
    5. whatever ``shutil.which`` finds for the binary name.

    Args:
        explicit_path: Optional path to try before every other candidate.

    Returns:
        The path of the first candidate that is a file.

    Raises:
        EngineNotFoundError: If none of the candidates is a file.
    """
    executable = _binary_name()
    package_dir = Path(__file__).resolve().parent

    search_order = []
    if explicit_path:
        search_order.append(Path(explicit_path))

    from_env = os.environ.get("MARKMATON_ENGINE")
    if from_env:
        search_order.append(Path(from_env))

    search_order.append(package_dir / "bin" / executable)
    search_order.append(package_dir.parent / "bin" / executable)

    on_path = shutil.which(executable)
    if on_path:
        search_order.append(Path(on_path))

    found = next((location for location in search_order if location.is_file()), None)
    if found is None:
        raise EngineNotFoundError(
            "Could not find markmaton-engine. Set MARKMATON_ENGINE or place the binary in markmaton/bin or ./bin."
        )
    return found
59
+
60
+
61
+ def _binary_name() -> str:
62
+ return "markmaton-engine.exe" if platform.system().lower().startswith("win") else "markmaton-engine"
markmaton/models.py ADDED
@@ -0,0 +1,113 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Dict, List, Mapping, Optional
5
+
6
+
7
@dataclass
class ConvertOptions:
    """Options that accompany a conversion request."""

    only_main_content: bool = True
    include_selectors: List[str] = field(default_factory=list)
    exclude_selectors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the options as a plain dict with copies of both selector lists."""
        return dict(
            only_main_content=self.only_main_content,
            include_selectors=[*self.include_selectors],
            exclude_selectors=[*self.exclude_selectors],
        )
19
+
20
+
21
@dataclass
class ConvertRequest:
    """A conversion request: the HTML plus optional context hints and options."""

    html: str
    url: Optional[str] = None
    final_url: Optional[str] = None
    content_type: Optional[str] = None
    options: ConvertOptions = field(default_factory=ConvertOptions)

    def to_payload(self) -> Dict[str, Any]:
        """Build the dict payload for this request.

        ``html`` and ``options`` are always present; ``url``, ``final_url``
        and ``content_type`` are added only when they are truthy.
        """
        payload: Dict[str, Any] = {"html": self.html, "options": self.options.to_dict()}
        optional_entries = (
            ("url", self.url),
            ("final_url", self.final_url),
            ("content_type", self.content_type),
        )
        for key, entry in optional_entries:
            if entry:
                payload[key] = entry
        return payload
41
+
42
+
43
@dataclass
class Metadata:
    """Page metadata fields carried by a conversion response."""

    title: str = ""
    description: str = ""
    canonical_url: str = ""
    language: str = ""
    author: str = ""
    og_title: str = ""
    og_description: str = ""
    extras: Dict[str, str] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, value: Mapping[str, Any]) -> "Metadata":
        """Build a ``Metadata`` from a mapping.

        Missing or falsy text entries become ``""`` and other values are
        passed through ``str``; ``extras`` is copied into a new dict (missing
        or falsy becomes an empty dict).
        """

        def text(key: str) -> str:
            return str(value.get(key, "") or "")

        extra_entries = value.get("extras", {}) or {}
        return cls(
            title=text("title"),
            description=text("description"),
            canonical_url=text("canonical_url"),
            language=text("language"),
            author=text("author"),
            og_title=text("og_title"),
            og_description=text("og_description"),
            extras=dict(extra_entries),
        )
66
+
67
+
68
@dataclass
class Quality:
    """Numeric and boolean quality fields carried by a conversion response."""

    text_length: int = 0
    paragraph_count: int = 0
    link_count: int = 0
    image_count: int = 0
    title_present: bool = False
    link_density: float = 0.0
    quality_score: float = 0.0
    used_main_content: bool = True
    fallback_used: bool = False

    @classmethod
    def from_dict(cls, value: Mapping[str, Any]) -> "Quality":
        """Build a ``Quality`` from a mapping.

        Counts go through ``int`` and ratios through ``float``, with missing
        or falsy entries becoming zero. Flags go through ``bool``, where only
        a missing key falls back to the field default.
        """

        def count(key: str) -> int:
            return int(value.get(key, 0) or 0)

        def ratio(key: str) -> float:
            return float(value.get(key, 0.0) or 0.0)

        def flag(key: str, fallback: bool) -> bool:
            return bool(value.get(key, fallback))

        return cls(
            text_length=count("text_length"),
            paragraph_count=count("paragraph_count"),
            link_count=count("link_count"),
            image_count=count("image_count"),
            title_present=flag("title_present", False),
            link_density=ratio("link_density"),
            quality_score=ratio("quality_score"),
            used_main_content=flag("used_main_content", True),
            fallback_used=flag("fallback_used", False),
        )
93
+
94
+
95
@dataclass
class ConvertResponse:
    """The result of a conversion: Markdown, cleaned HTML and related data."""

    markdown: str
    html_clean: str
    metadata: Metadata
    links: List[str]
    images: List[str]
    quality: Quality

    @classmethod
    def from_dict(cls, value: Mapping[str, Any]) -> "ConvertResponse":
        """Build a ``ConvertResponse`` from a mapping.

        Missing or falsy entries fall back to empty values: ``""`` for the
        two text fields, empty lists for ``links`` and ``images``, and an
        empty mapping handed to ``Metadata.from_dict`` / ``Quality.from_dict``.
        """

        def text(key: str) -> str:
            return str(value.get(key, "") or "")

        def items(key: str) -> List[str]:
            return list(value.get(key, []) or [])

        def section(key: str) -> Mapping[str, Any]:
            return value.get(key, {}) or {}

        return cls(
            markdown=text("markdown"),
            html_clean=text("html_clean"),
            metadata=Metadata.from_dict(section("metadata")),
            links=items("links"),
            images=items("images"),
            quality=Quality.from_dict(section("quality")),
        )
@@ -0,0 +1,71 @@
1
+ Metadata-Version: 2.4
2
+ Name: markmaton
3
+ Version: 0.1.4
4
+ Summary: Lightweight HTML-to-Markdown tooling for agent workflows.
5
+ Project-URL: Homepage, https://github.com/appautomaton/markmaton
6
+ Project-URL: Repository, https://github.com/appautomaton/markmaton
7
+ Project-URL: Issues, https://github.com/appautomaton/markmaton/issues
8
+ Author: appautomaton
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 2 - Pre-Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Software Development :: Libraries
18
+ Requires-Python: >=3.9
19
+ Description-Content-Type: text/markdown
20
+
21
+ # markmaton
22
+
23
+ Lightweight HTML-to-Markdown tooling for agent workflows.
24
+
25
+ ## Status
26
+
27
+ This repository is intentionally starting small.
28
+
29
+ The current goal is to build a clean, fast parser core that can:
30
+
31
+ - take normalized page HTML from tools like Playwright, `fetch`, or no-driver
32
+ - clean the page structure
33
+ - return robust Markdown and page metadata
34
+
35
+ ## Direction
36
+
37
+ - parser core: Go
38
+ - distribution: Python packaging / PyPI
39
+ - first focus: library and CLI for local agent use
40
+ - release track: GitHub Actions + Trusted Publishing
41
+
42
+ ## Current shape
43
+
44
+ - Go engine: `cmd/markmaton-engine`
45
+ - Python wrapper: `markmaton/`
46
+ - Architecture docs: `docs/`
47
+ - Plans and issue CSVs: `plan/` and `issues/`
48
+
49
+ ## Testing policy
50
+
51
+ - automated tests should be unit-test-first
52
+ - parser module tests should use local fixtures and golden files
53
+ - Python wrapper tests should mock the engine boundary
54
+ - real engine checks stay manual unless there is a strong reason to automate them
55
+
56
+ ## Testing layout
57
+
58
+ - Go package unit tests live beside each package under `internal/*`.
59
+ - Shared Go fixture/golden helpers live in `internal/testutil/`.
60
+ - Stable parser fixtures live under `testdata/fixtures/core/`.
61
+ - Real-world regression fixtures live under `testdata/fixtures/regression/`.
62
+ - Golden markdown outputs for stable core fixtures live under `testdata/golden/core/`.
63
+ - Python wrapper tests live under `tests/unit/`.
64
+
65
+ ## Local smoke
66
+
67
+ See:
68
+
69
+ - `docs/local-smoke.md`
70
+ - `docs/packaging-layout.md`
71
+ - `docs/pypi-release.md`
@@ -0,0 +1,10 @@
1
+ markmaton/__init__.py,sha256=NrKC0RUaN1d6RZQG5IabKjY0BPKP7uvEuOmHOoiOeMs,326
2
+ markmaton/cli.py,sha256=QNki9al0uM3GHg1vQTOQsqssV30ev_NFZcEqBwX0NIU,3080
3
+ markmaton/engine.py,sha256=wOJFku1WW4Xro983gHzVJJq3GZuzLBvK_6b4tFp4iYc,1847
4
+ markmaton/models.py,sha256=tfHX69BLEkREY7as9x37abD0sD1nmHJ2afHEEfWDUJQ,3852
5
+ markmaton/bin/markmaton-engine.exe,sha256=FdDsko-7ZLRwbJUa7iHQUZ_EvQqbNV8HjcG0t93nWgI,7063040
6
+ markmaton-0.1.4.dist-info/METADATA,sha256=8L9bAUF83_7JUVZmie6yUXnHVbuR7yr4DygSS_2_YlU,2272
7
+ markmaton-0.1.4.dist-info/WHEEL,sha256=OKr2XcpSNWrtUe-CU6RMYrBnsfqGnZ3jZM4vKnozTRA,94
8
+ markmaton-0.1.4.dist-info/entry_points.txt,sha256=9dr-Ibs3HAimb7LLYZqp8srYc3ZUhVNSSmmKH2oYDtg,49
9
+ markmaton-0.1.4.dist-info/licenses/LICENSE,sha256=jT17UDsaOIhnm4G8-3nl-R0q0ynrVfIFkjio0-U8pAw,1090
10
+ markmaton-0.1.4.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: false
4
+ Tag: py3-none-win_amd64
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ markmaton = markmaton.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 appautomaton
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.