markmaton 0.1.4__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markmaton/__init__.py +15 -0
- markmaton/bin/markmaton-engine.exe +0 -0
- markmaton/cli.py +99 -0
- markmaton/engine.py +62 -0
- markmaton/models.py +113 -0
- markmaton-0.1.4.dist-info/METADATA +71 -0
- markmaton-0.1.4.dist-info/RECORD +10 -0
- markmaton-0.1.4.dist-info/WHEEL +4 -0
- markmaton-0.1.4.dist-info/entry_points.txt +2 -0
- markmaton-0.1.4.dist-info/licenses/LICENSE +21 -0
markmaton/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""markmaton package."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import PackageNotFoundError
from importlib.metadata import version as _distribution_version

from .engine import convert_html, discover_engine
from .models import ConvertOptions, ConvertRequest, ConvertResponse

# Names re-exported as the package's public API.
__all__ = [
    "__version__",
    "ConvertOptions",
    "ConvertRequest",
    "ConvertResponse",
    "convert_html",
    "discover_engine",
]

try:
    # Read the version from the installed distribution metadata so it cannot
    # drift from the published release (the literal used to say "0.1.0" while
    # the wheel was released as 0.1.4).
    __version__ = _distribution_version("markmaton")
except PackageNotFoundError:
    # The package is being imported without being installed (e.g. straight
    # from a source tree), so no distribution metadata is available.
    __version__ = "0.1.4"
|
|
Binary file
|
markmaton/cli.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from .engine import convert_html
|
|
9
|
+
from .models import ConvertOptions, ConvertRequest
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the ``markmaton`` argument parser with its ``convert`` sub-command.

    Returns:
        A parser whose namespace carries ``command`` plus the ``convert``
        options (``html_file``, ``url``, ``final_url``, ``content_type``,
        ``output_format``, ``full_content``, ``include_selector`` and
        ``exclude_selector``).
    """
    root = argparse.ArgumentParser(prog="markmaton")
    commands = root.add_subparsers(dest="command", required=True)
    convert = commands.add_parser("convert", help="Convert HTML into Markdown")

    # Input source; when omitted the caller falls back to standard input.
    convert.add_argument("--html-file", type=Path, help="Path to an HTML file")

    # Plain string hints that are passed through unchanged.
    for flag, description in (
        ("--url", "Source URL used as parsing context"),
        ("--final-url", "Final URL after redirects"),
        ("--content-type", "Optional content type hint"),
    ):
        convert.add_argument(flag, help=description)

    convert.add_argument(
        "--output-format",
        choices=("json", "markdown"),
        default="json",
        help="Choose between full JSON output or markdown only",
    )
    convert.add_argument(
        "--full-content",
        action="store_true",
        help="Disable main-content-only cleaning",
    )

    # Repeatable selector options; each occurrence appends to a list.
    for flag, description in (
        ("--include-selector", "CSS selector to force-include before conversion"),
        ("--exclude-selector", "CSS selector to remove before conversion"),
    ):
        convert.add_argument(flag, action="append", default=[], help=description)

    return root
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the ``markmaton`` command-line interface.

    Parses the arguments, reads the HTML input, runs the conversion and
    writes the result to standard output, either as Markdown only or as a
    single-line JSON document.

    Args:
        argv: Arguments to parse; ``None`` lets argparse use ``sys.argv``.

    Returns:
        ``0`` after the output has been written. Errors raised while reading
        the input or running the conversion are not caught here.
    """
    cli = build_parser()
    args = cli.parse_args(argv)

    # Defensive guard: "convert" is the only sub-command registered.
    if args.command != "convert":
        cli.error("unknown command")

    source_html = _read_html(args.html_file)
    options = ConvertOptions(
        only_main_content=not args.full_content,
        include_selectors=list(args.include_selector),
        exclude_selectors=list(args.exclude_selector),
    )
    request = ConvertRequest(
        html=source_html,
        url=args.url,
        final_url=args.final_url,
        content_type=args.content_type,
        options=options,
    )
    response = convert_html(request)

    if args.output_format == "markdown":
        rendered = response.markdown
    else:
        document = {
            "markdown": response.markdown,
            "html_clean": response.html_clean,
            "metadata": response.metadata.__dict__,
            "links": response.links,
            "images": response.images,
            "quality": response.quality.__dict__,
        }
        rendered = json.dumps(document, ensure_ascii=False)

    sys.stdout.write(rendered)
    # Terminate non-empty output with a newline. Serialised JSON is never
    # empty and never ends in "\n", so it always receives one; empty Markdown
    # produces no output at all.
    if rendered and not rendered.endswith("\n"):
        sys.stdout.write("\n")
    return 0
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _read_html(path: Path | None) -> str:
    """Return the HTML document to convert, decoded as UTF-8.

    Args:
        path: File to read, or ``None`` to read standard input instead.

    Returns:
        The complete document text.

    Raises:
        OSError: If ``path`` cannot be opened or read.
        UnicodeDecodeError: If the input bytes are not valid UTF-8.
    """
    if path is not None:
        return path.read_text(encoding="utf-8")

    byte_stream = getattr(sys.stdin, "buffer", None)
    if byte_stream is None:
        # A replaced stdin (e.g. io.StringIO) has no byte layer and already
        # yields text, so there is nothing to decode.
        return sys.stdin.read()

    import io

    # Decode piped input as UTF-8, matching the file branch above. The
    # default stdin wrapper uses the locale encoding, which on Windows is
    # usually a legacy code page and would mis-decode UTF-8 HTML.
    reader = io.TextIOWrapper(byte_stream, encoding="utf-8")
    try:
        return reader.read()
    finally:
        # Detach so that discarding the temporary wrapper does not close
        # the process's real standard input.
        reader.detach()
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# Allow running the module directly; the CLI's return value becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
markmaton/engine.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import platform
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from .models import ConvertRequest, ConvertResponse
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class EngineNotFoundError(RuntimeError):
    """Raised when no ``markmaton-engine`` executable can be located."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def convert_html(request: ConvertRequest, binary_path: Optional[str] = None) -> ConvertResponse:
    """Convert HTML by running the external engine executable.

    The request is serialised to JSON and piped to the engine's standard
    input; the engine's standard output is parsed as the JSON response.

    Args:
        request: The conversion request to send.
        binary_path: Optional explicit engine location, forwarded to
            ``discover_engine``.

    Returns:
        The response built from the engine's JSON output.

    Raises:
        EngineNotFoundError: If ``discover_engine`` finds no executable.
        RuntimeError: If the engine exits with a non-zero status; the message
            is its stderr, else its stdout, else a generic text.
        json.JSONDecodeError: If the engine's stdout is not valid JSON.
    """
    engine = discover_engine(binary_path)
    completed = subprocess.run(
        [str(engine)],
        input=json.dumps(request.to_payload()),
        capture_output=True,
        text=True,
        # Pin the pipe encoding. Without it, text mode decodes the engine's
        # output with the locale encoding (a legacy code page on Windows),
        # which corrupts or rejects non-ASCII characters. The engine's JSON
        # is presumably UTF-8 -- NOTE(review): confirm against the engine.
        encoding="utf-8",
        check=False,
    )
    if completed.returncode != 0:
        message = completed.stderr.strip() or completed.stdout.strip() or "markmaton engine failed"
        raise RuntimeError(message)

    payload = json.loads(completed.stdout)
    return ConvertResponse.from_dict(payload)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def discover_engine(explicit_path: Optional[str] = None) -> Path:
    """Locate the engine executable.

    Candidates are checked in this order and the first one that is an
    existing regular file wins:

    1. ``explicit_path``, when given;
    2. the ``MARKMATON_ENGINE`` environment variable, when set;
    3. ``bin/`` inside this package directory;
    4. ``bin/`` one directory above the package;
    5. the executable found on ``PATH``, if any.

    Args:
        explicit_path: Optional location to try before all others.

    Returns:
        Path of the first candidate that exists as a file.

    Raises:
        EngineNotFoundError: If none of the candidates is a file.
    """
    binary = _binary_name()
    package_dir = Path(__file__).resolve().parent

    # Caller- and environment-supplied locations come first; empty values
    # are skipped.
    search_order = [
        Path(location)
        for location in (explicit_path, os.environ.get("MARKMATON_ENGINE"))
        if location
    ]
    search_order.append(package_dir / "bin" / binary)
    search_order.append(package_dir.parent / "bin" / binary)

    on_path = shutil.which(binary)
    if on_path:
        search_order.append(Path(on_path))

    found = next((entry for entry in search_order if entry.is_file()), None)
    if found is None:
        raise EngineNotFoundError(
            "Could not find markmaton-engine. Set MARKMATON_ENGINE or place the binary in markmaton/bin or ./bin."
        )
    return found
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _binary_name() -> str:
    """Return the engine executable's file name for the current platform."""
    on_windows = platform.system().lower().startswith("win")
    if on_windows:
        return "markmaton-engine.exe"
    return "markmaton-engine"
|
markmaton/models.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any, Dict, List, Mapping, Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class ConvertOptions:
    """Options that accompany a conversion request."""

    # When True, only the main content is kept (the CLI's --full-content
    # flag turns this off).
    only_main_content: bool = True
    # CSS selectors to force-include before conversion.
    include_selectors: List[str] = field(default_factory=list)
    # CSS selectors to remove before conversion.
    exclude_selectors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the options as a plain dict with copied selector lists."""
        included = [*self.include_selectors]
        excluded = [*self.exclude_selectors]
        return dict(
            only_main_content=self.only_main_content,
            include_selectors=included,
            exclude_selectors=excluded,
        )
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ConvertRequest:
    """A conversion request: the HTML plus optional context and options."""

    html: str
    url: Optional[str] = None
    final_url: Optional[str] = None
    content_type: Optional[str] = None
    options: ConvertOptions = field(default_factory=ConvertOptions)

    def to_payload(self) -> Dict[str, Any]:
        """Build the JSON-ready request body.

        ``html`` and ``options`` are always present; ``url``, ``final_url``
        and ``content_type`` are added only when they hold a non-empty value.
        """
        payload: Dict[str, Any] = {"html": self.html, "options": self.options.to_dict()}
        for key in ("url", "final_url", "content_type"):
            supplied = getattr(self, key)
            if supplied:
                payload[key] = supplied
        return payload
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class Metadata:
    """Page metadata carried in a conversion response."""

    title: str = ""
    description: str = ""
    canonical_url: str = ""
    language: str = ""
    author: str = ""
    og_title: str = ""
    og_description: str = ""
    extras: Dict[str, str] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, value: Mapping[str, Any]) -> "Metadata":
        """Build an instance from a mapping, tolerating absent or falsy entries.

        Text fields fall back to ``""`` and are passed through ``str``;
        ``extras`` falls back to an empty dict and is always copied.
        """
        text_fields = (
            "title",
            "description",
            "canonical_url",
            "language",
            "author",
            "og_title",
            "og_description",
        )
        kwargs: Dict[str, Any] = {
            name: str(value.get(name, "") or "") for name in text_fields
        }
        kwargs["extras"] = dict(value.get("extras", {}) or {})
        return cls(**kwargs)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class Quality:
    """Quality indicators carried in a conversion response."""

    text_length: int = 0
    paragraph_count: int = 0
    link_count: int = 0
    image_count: int = 0
    title_present: bool = False
    link_density: float = 0.0
    quality_score: float = 0.0
    used_main_content: bool = True
    fallback_used: bool = False

    @classmethod
    def from_dict(cls, value: Mapping[str, Any]) -> "Quality":
        """Build an instance from a mapping, tolerating absent or null entries.

        Numeric fields fall back to zero when the entry is absent or falsy.
        Boolean fields fall back to the field default when the entry is
        absent or ``None``, so an explicit null is treated like a missing
        key for every field.
        """

        def flag(key: str, default: bool) -> bool:
            # Treat an explicit None like a missing key; a plain bool(None)
            # would turn used_main_content (default True) into False.
            raw = value.get(key)
            return default if raw is None else bool(raw)

        return cls(
            text_length=int(value.get("text_length", 0) or 0),
            paragraph_count=int(value.get("paragraph_count", 0) or 0),
            link_count=int(value.get("link_count", 0) or 0),
            image_count=int(value.get("image_count", 0) or 0),
            title_present=flag("title_present", False),
            link_density=float(value.get("link_density", 0.0) or 0.0),
            quality_score=float(value.get("quality_score", 0.0) or 0.0),
            used_main_content=flag("used_main_content", True),
            fallback_used=flag("fallback_used", False),
        )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@dataclass
class ConvertResponse:
    """Result of a conversion, as parsed from the engine's JSON output."""

    markdown: str
    html_clean: str
    metadata: Metadata
    links: List[str]
    images: List[str]
    quality: Quality

    @classmethod
    def from_dict(cls, value: Mapping[str, Any]) -> "ConvertResponse":
        """Build a response from a mapping, tolerating absent or falsy entries.

        Strings fall back to ``""``, lists to ``[]`` (and are copied), and
        the nested ``metadata`` / ``quality`` mappings to ``{}`` before being
        handed to their own ``from_dict`` constructors.
        """
        markdown = str(value.get("markdown", "") or "")
        html_clean = str(value.get("html_clean", "") or "")
        metadata = Metadata.from_dict(value.get("metadata", {}) or {})
        links = [*(value.get("links", []) or [])]
        images = [*(value.get("images", []) or [])]
        quality = Quality.from_dict(value.get("quality", {}) or {})
        return cls(
            markdown=markdown,
            html_clean=html_clean,
            metadata=metadata,
            links=links,
            images=images,
            quality=quality,
        )
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: markmaton
|
|
3
|
+
Version: 0.1.4
|
|
4
|
+
Summary: Lightweight HTML-to-Markdown tooling for agent workflows.
|
|
5
|
+
Project-URL: Homepage, https://github.com/appautomaton/markmaton
|
|
6
|
+
Project-URL: Repository, https://github.com/appautomaton/markmaton
|
|
7
|
+
Project-URL: Issues, https://github.com/appautomaton/markmaton/issues
|
|
8
|
+
Author: appautomaton
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# markmaton
|
|
22
|
+
|
|
23
|
+
Lightweight HTML-to-Markdown tooling for agent workflows.
|
|
24
|
+
|
|
25
|
+
## Status
|
|
26
|
+
|
|
27
|
+
This repository is intentionally starting small.
|
|
28
|
+
|
|
29
|
+
The current goal is to build a clean, fast parser core that can:
|
|
30
|
+
|
|
31
|
+
- take normalized page HTML from tools like Playwright, `fetch`, or no-driver
|
|
32
|
+
- clean the page structure
|
|
33
|
+
- return robust Markdown and page metadata
|
|
34
|
+
|
|
35
|
+
## Direction
|
|
36
|
+
|
|
37
|
+
- parser core: Go
|
|
38
|
+
- distribution: Python packaging / PyPI
|
|
39
|
+
- first focus: library and CLI for local agent use
|
|
40
|
+
- release track: GitHub Actions + Trusted Publishing
|
|
41
|
+
|
|
42
|
+
## Current shape
|
|
43
|
+
|
|
44
|
+
- Go engine: `cmd/markmaton-engine`
|
|
45
|
+
- Python wrapper: `markmaton/`
|
|
46
|
+
- Architecture docs: `docs/`
|
|
47
|
+
- Plans and issue CSVs: `plan/` and `issues/`
|
|
48
|
+
|
|
49
|
+
## Testing policy
|
|
50
|
+
|
|
51
|
+
- automated tests should be unit-test-first
|
|
52
|
+
- parser module tests should use local fixtures and golden files
|
|
53
|
+
- Python wrapper tests should mock the engine boundary
|
|
54
|
+
- real engine checks stay manual unless there is a strong reason to automate them
|
|
55
|
+
|
|
56
|
+
## Testing layout
|
|
57
|
+
|
|
58
|
+
- Go package unit tests live beside each package under `internal/*`.
|
|
59
|
+
- Shared Go fixture/golden helpers live in `internal/testutil/`.
|
|
60
|
+
- Stable parser fixtures live under `testdata/fixtures/core/`.
|
|
61
|
+
- Real-world regression fixtures live under `testdata/fixtures/regression/`.
|
|
62
|
+
- Golden markdown outputs for stable core fixtures live under `testdata/golden/core/`.
|
|
63
|
+
- Python wrapper tests live under `tests/unit/`.
|
|
64
|
+
|
|
65
|
+
## Local smoke
|
|
66
|
+
|
|
67
|
+
See:
|
|
68
|
+
|
|
69
|
+
- `docs/local-smoke.md`
|
|
70
|
+
- `docs/packaging-layout.md`
|
|
71
|
+
- `docs/pypi-release.md`
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
markmaton/__init__.py,sha256=NrKC0RUaN1d6RZQG5IabKjY0BPKP7uvEuOmHOoiOeMs,326
|
|
2
|
+
markmaton/cli.py,sha256=QNki9al0uM3GHg1vQTOQsqssV30ev_NFZcEqBwX0NIU,3080
|
|
3
|
+
markmaton/engine.py,sha256=wOJFku1WW4Xro983gHzVJJq3GZuzLBvK_6b4tFp4iYc,1847
|
|
4
|
+
markmaton/models.py,sha256=tfHX69BLEkREY7as9x37abD0sD1nmHJ2afHEEfWDUJQ,3852
|
|
5
|
+
markmaton/bin/markmaton-engine.exe,sha256=FdDsko-7ZLRwbJUa7iHQUZ_EvQqbNV8HjcG0t93nWgI,7063040
|
|
6
|
+
markmaton-0.1.4.dist-info/METADATA,sha256=8L9bAUF83_7JUVZmie6yUXnHVbuR7yr4DygSS_2_YlU,2272
|
|
7
|
+
markmaton-0.1.4.dist-info/WHEEL,sha256=OKr2XcpSNWrtUe-CU6RMYrBnsfqGnZ3jZM4vKnozTRA,94
|
|
8
|
+
markmaton-0.1.4.dist-info/entry_points.txt,sha256=9dr-Ibs3HAimb7LLYZqp8srYc3ZUhVNSSmmKH2oYDtg,49
|
|
9
|
+
markmaton-0.1.4.dist-info/licenses/LICENSE,sha256=jT17UDsaOIhnm4G8-3nl-R0q0ynrVfIFkjio0-U8pAw,1090
|
|
10
|
+
markmaton-0.1.4.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 appautomaton
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|