markmaton 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markmaton-0.1.4/.gitignore +31 -0
- markmaton-0.1.4/LICENSE +21 -0
- markmaton-0.1.4/PKG-INFO +71 -0
- markmaton-0.1.4/README.md +51 -0
- markmaton-0.1.4/cmd/markmaton-engine/main.go +58 -0
- markmaton-0.1.4/docs/architecture-brief.md +279 -0
- markmaton-0.1.4/docs/benchmark-matrix.md +139 -0
- markmaton-0.1.4/docs/benchmark-workflow.md +219 -0
- markmaton-0.1.4/docs/firecrawl-reference-audit.md +253 -0
- markmaton-0.1.4/docs/firecrawl-scrape-traceback.md +461 -0
- markmaton-0.1.4/docs/implementation-handoff.md +175 -0
- markmaton-0.1.4/docs/local-smoke.md +98 -0
- markmaton-0.1.4/docs/packaging-layout.md +99 -0
- markmaton-0.1.4/docs/pypi-release.md +120 -0
- markmaton-0.1.4/go.mod +14 -0
- markmaton-0.1.4/go.sum +85 -0
- markmaton-0.1.4/hatch_build.py +112 -0
- markmaton-0.1.4/internal/cleanhtml/clean.go +196 -0
- markmaton-0.1.4/internal/cleanhtml/clean_test.go +173 -0
- markmaton-0.1.4/internal/convert/builder.go +142 -0
- markmaton-0.1.4/internal/convert/builder_test.go +66 -0
- markmaton-0.1.4/internal/convert/convert.go +14 -0
- markmaton-0.1.4/internal/convert/convert_test.go +41 -0
- markmaton-0.1.4/internal/convert/hooks.go +436 -0
- markmaton-0.1.4/internal/convert/hooks_test.go +216 -0
- markmaton-0.1.4/internal/convert/plugins.go +20 -0
- markmaton-0.1.4/internal/engine/process.go +91 -0
- markmaton-0.1.4/internal/engine/process_test.go +317 -0
- markmaton-0.1.4/internal/images/extract.go +34 -0
- markmaton-0.1.4/internal/images/extract_test.go +14 -0
- markmaton-0.1.4/internal/links/extract.go +34 -0
- markmaton-0.1.4/internal/links/extract_test.go +14 -0
- markmaton-0.1.4/internal/metadata/extract.go +50 -0
- markmaton-0.1.4/internal/metadata/extract_test.go +37 -0
- markmaton-0.1.4/internal/model/types.go +78 -0
- markmaton-0.1.4/internal/postprocess/postprocess.go +127 -0
- markmaton-0.1.4/internal/postprocess/postprocess_test.go +80 -0
- markmaton-0.1.4/internal/quality/quality.go +204 -0
- markmaton-0.1.4/internal/quality/quality_test.go +95 -0
- markmaton-0.1.4/internal/resolve/resolve.go +138 -0
- markmaton-0.1.4/internal/resolve/resolve_test.go +33 -0
- markmaton-0.1.4/internal/testutil/testdata.go +41 -0
- markmaton-0.1.4/markmaton/__init__.py +15 -0
- markmaton-0.1.4/markmaton/cli.py +99 -0
- markmaton-0.1.4/markmaton/engine.py +62 -0
- markmaton-0.1.4/markmaton/models.py +113 -0
- markmaton-0.1.4/pyproject.toml +51 -0
- markmaton-0.1.4/testdata/fixtures/core/article.html +30 -0
- markmaton-0.1.4/testdata/fixtures/core/docs.html +23 -0
- markmaton-0.1.4/testdata/fixtures/core/news.html +23 -0
- markmaton-0.1.4/testdata/fixtures/regression/card_grid.html +37 -0
- markmaton-0.1.4/testdata/fixtures/regression/careers_landing.html +25 -0
- markmaton-0.1.4/testdata/fixtures/regression/github_issue_timeline.html +1769 -0
- markmaton-0.1.4/testdata/fixtures/regression/github_repo_shell.html +38 -0
- markmaton-0.1.4/testdata/fixtures/regression/job_detail.html +22 -0
- markmaton-0.1.4/testdata/fixtures/regression/openai_blog_shell.html +43 -0
- markmaton-0.1.4/testdata/fixtures/regression/stackoverflow_question_thread.html +7161 -0
- markmaton-0.1.4/testdata/golden/core/article.md +12 -0
- markmaton-0.1.4/testdata/golden/core/docs.md +10 -0
- markmaton-0.1.4/testdata/golden/core/news.md +9 -0
- markmaton-0.1.4/tests/__init__.py +1 -0
- markmaton-0.1.4/tests/smoke/installed_cli_smoke.py +41 -0
- markmaton-0.1.4/tests/unit/__init__.py +1 -0
- markmaton-0.1.4/tests/unit/test_cli.py +58 -0
- markmaton-0.1.4/tests/unit/test_engine.py +52 -0
- markmaton-0.1.4/tests/unit/test_hatch_build.py +43 -0
- markmaton-0.1.4/tests/unit/test_models.py +36 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# macOS
|
|
2
|
+
.DS_Store
|
|
3
|
+
|
|
4
|
+
# Python
|
|
5
|
+
__pycache__/
|
|
6
|
+
.pytest_cache/
|
|
7
|
+
.mypy_cache/
|
|
8
|
+
.ruff_cache/
|
|
9
|
+
.venv/
|
|
10
|
+
venv/
|
|
11
|
+
build/
|
|
12
|
+
dist/
|
|
13
|
+
*.egg-info/
|
|
14
|
+
.hatch-build/
|
|
15
|
+
|
|
16
|
+
# Coverage
|
|
17
|
+
.coverage
|
|
18
|
+
coverage.xml
|
|
19
|
+
htmlcov/
|
|
20
|
+
|
|
21
|
+
# Go
|
|
22
|
+
bin/
|
|
23
|
+
markmaton/bin/markmaton-engine*
|
|
24
|
+
markmaton/bin/markmaton-engine.exe
|
|
25
|
+
*.test
|
|
26
|
+
coverage.out
|
|
27
|
+
tmp/
|
|
28
|
+
|
|
29
|
+
# Editors
|
|
30
|
+
.idea/
|
|
31
|
+
.vscode/
|
markmaton-0.1.4/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 appautomaton
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
markmaton-0.1.4/PKG-INFO
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: markmaton
|
|
3
|
+
Version: 0.1.4
|
|
4
|
+
Summary: Lightweight HTML-to-Markdown tooling for agent workflows.
|
|
5
|
+
Project-URL: Homepage, https://github.com/appautomaton/markmaton
|
|
6
|
+
Project-URL: Repository, https://github.com/appautomaton/markmaton
|
|
7
|
+
Project-URL: Issues, https://github.com/appautomaton/markmaton/issues
|
|
8
|
+
Author: appautomaton
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# markmaton
|
|
22
|
+
|
|
23
|
+
Lightweight HTML-to-Markdown tooling for agent workflows.
|
|
24
|
+
|
|
25
|
+
## Status
|
|
26
|
+
|
|
27
|
+
This repository is intentionally starting small.
|
|
28
|
+
|
|
29
|
+
The current goal is to build a clean, fast parser core that can:
|
|
30
|
+
|
|
31
|
+
- take normalized page HTML from tools like Playwright, `fetch`, or no-driver
|
|
32
|
+
- clean the page structure
|
|
33
|
+
- return robust Markdown and page metadata
|
|
34
|
+
|
|
35
|
+
## Direction
|
|
36
|
+
|
|
37
|
+
- parser core: Go
|
|
38
|
+
- distribution: Python packaging / PyPI
|
|
39
|
+
- first focus: library and CLI for local agent use
|
|
40
|
+
- release track: GitHub Actions + Trusted Publishing
|
|
41
|
+
|
|
42
|
+
## Current shape
|
|
43
|
+
|
|
44
|
+
- Go engine: `cmd/markmaton-engine`
|
|
45
|
+
- Python wrapper: `markmaton/`
|
|
46
|
+
- Architecture docs: `docs/`
|
|
47
|
+
- Plans and issue CSVs: `plan/` and `issues/`
|
|
48
|
+
|
|
49
|
+
## Testing policy
|
|
50
|
+
|
|
51
|
+
- automated tests should be unit-test-first
|
|
52
|
+
- parser module tests should use local fixtures and golden files
|
|
53
|
+
- Python wrapper tests should mock the engine boundary
|
|
54
|
+
- real engine checks stay manual unless there is a strong reason to automate them
|
|
55
|
+
|
|
56
|
+
## Testing layout
|
|
57
|
+
|
|
58
|
+
- Go package unit tests live beside each package under `internal/*`.
|
|
59
|
+
- Shared Go fixture/golden helpers live in `internal/testutil/`.
|
|
60
|
+
- Stable parser fixtures live under `testdata/fixtures/core/`.
|
|
61
|
+
- Real-world regression fixtures live under `testdata/fixtures/regression/`.
|
|
62
|
+
- Golden markdown outputs for stable core fixtures live under `testdata/golden/core/`.
|
|
63
|
+
- Python wrapper tests live under `tests/unit/`.
|
|
64
|
+
|
|
65
|
+
## Local smoke
|
|
66
|
+
|
|
67
|
+
See:
|
|
68
|
+
|
|
69
|
+
- `docs/local-smoke.md`
|
|
70
|
+
- `docs/packaging-layout.md`
|
|
71
|
+
- `docs/pypi-release.md`
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# markmaton
|
|
2
|
+
|
|
3
|
+
Lightweight HTML-to-Markdown tooling for agent workflows.
|
|
4
|
+
|
|
5
|
+
## Status
|
|
6
|
+
|
|
7
|
+
This repository is intentionally starting small.
|
|
8
|
+
|
|
9
|
+
The current goal is to build a clean, fast parser core that can:
|
|
10
|
+
|
|
11
|
+
- take normalized page HTML from tools like Playwright, `fetch`, or no-driver
|
|
12
|
+
- clean the page structure
|
|
13
|
+
- return robust Markdown and page metadata
|
|
14
|
+
|
|
15
|
+
## Direction
|
|
16
|
+
|
|
17
|
+
- parser core: Go
|
|
18
|
+
- distribution: Python packaging / PyPI
|
|
19
|
+
- first focus: library and CLI for local agent use
|
|
20
|
+
- release track: GitHub Actions + Trusted Publishing
|
|
21
|
+
|
|
22
|
+
## Current shape
|
|
23
|
+
|
|
24
|
+
- Go engine: `cmd/markmaton-engine`
|
|
25
|
+
- Python wrapper: `markmaton/`
|
|
26
|
+
- Architecture docs: `docs/`
|
|
27
|
+
- Plans and issue CSVs: `plan/` and `issues/`
|
|
28
|
+
|
|
29
|
+
## Testing policy
|
|
30
|
+
|
|
31
|
+
- automated tests should be unit-test-first
|
|
32
|
+
- parser module tests should use local fixtures and golden files
|
|
33
|
+
- Python wrapper tests should mock the engine boundary
|
|
34
|
+
- real engine checks stay manual unless there is a strong reason to automate them
|
|
35
|
+
|
|
36
|
+
## Testing layout
|
|
37
|
+
|
|
38
|
+
- Go package unit tests live beside each package under `internal/*`.
|
|
39
|
+
- Shared Go fixture/golden helpers live in `internal/testutil/`.
|
|
40
|
+
- Stable parser fixtures live under `testdata/fixtures/core/`.
|
|
41
|
+
- Real-world regression fixtures live under `testdata/fixtures/regression/`.
|
|
42
|
+
- Golden markdown outputs for stable core fixtures live under `testdata/golden/core/`.
|
|
43
|
+
- Python wrapper tests live under `tests/unit/`.
|
|
44
|
+
|
|
45
|
+
## Local smoke
|
|
46
|
+
|
|
47
|
+
See:
|
|
48
|
+
|
|
49
|
+
- `docs/local-smoke.md`
|
|
50
|
+
- `docs/packaging-layout.md`
|
|
51
|
+
- `docs/pypi-release.md`
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
package main
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"encoding/json"
|
|
5
|
+
"errors"
|
|
6
|
+
"fmt"
|
|
7
|
+
"io"
|
|
8
|
+
"os"
|
|
9
|
+
|
|
10
|
+
"github.com/appautomaton/markmaton/internal/engine"
|
|
11
|
+
"github.com/appautomaton/markmaton/internal/model"
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
// main is the engine's command-line entry point: it reads one request from
// standard input, runs it through engine.Process, and writes the response to
// standard output as JSON. Every failure is handed to fail.
func main() {
|
|
15
|
+
request, err := readRequest(os.Stdin)
|
|
16
|
+
if err != nil {
|
|
17
|
+
// fail exits the process, so nothing below runs after an error.
fail(err)
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
response, err := engine.Process(request)
|
|
21
|
+
if err != nil {
|
|
22
|
+
fail(err)
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
encoder := json.NewEncoder(os.Stdout)
|
|
26
|
+
// Turn off the encoder's HTML escaping so <, >, and & in the response are
// written literally instead of as \u003c-style escapes.
encoder.SetEscapeHTML(false)
|
|
27
|
+
if err := encoder.Encode(response); err != nil {
|
|
28
|
+
fail(err)
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
// readRequest reads all of r, decodes it as JSON into a model.Request, applies
// the request's defaults, and validates it.
//
// It returns an error when reading fails ("read request: ..."), when the input
// is zero bytes long ("request body is empty"), when JSON decoding fails
// ("decode request: ..."), or when Validate rejects the request (that error is
// returned unwrapped). The request returned alongside an error is whatever had
// been populated up to that point.
func readRequest(r io.Reader) (model.Request, error) {
|
|
33
|
+
var request model.Request
|
|
34
|
+
|
|
35
|
+
payload, err := io.ReadAll(r)
|
|
36
|
+
if err != nil {
|
|
37
|
+
return request, fmt.Errorf("read request: %w", err)
|
|
38
|
+
}
|
|
39
|
+
// Only a zero-length body is rejected here; whitespace-only input falls
// through to json.Unmarshal below.
if len(payload) == 0 {
|
|
40
|
+
return request, errors.New("request body is empty")
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
if err := json.Unmarshal(payload, &request); err != nil {
|
|
44
|
+
return request, fmt.Errorf("decode request: %w", err)
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
// Defaults are applied before validation, so Validate sees the defaulted
// request rather than the raw decoded one.
request.ApplyDefaults()
|
|
48
|
+
if err := request.Validate(); err != nil {
|
|
49
|
+
return request, err
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
return request, nil
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
// fail writes err, followed by a newline, to standard error and terminates the
// process with exit status 1.
func fail(err error) {
|
|
56
|
+
// The result of the write is deliberately discarded: the process is about to
// exit with a failure status either way.
_, _ = fmt.Fprintf(os.Stderr, "%s\n", err)
|
|
57
|
+
os.Exit(1)
|
|
58
|
+
}
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
# Markmaton Architecture Brief
|
|
2
|
+
|
|
3
|
+
## What markmaton is
|
|
4
|
+
|
|
5
|
+
`markmaton` 是一个很轻的 parser core。
|
|
6
|
+
|
|
7
|
+
它只做:
|
|
8
|
+
|
|
9
|
+
- 输入一份已经拿到的 HTML
|
|
10
|
+
- 清洗页面结构
|
|
11
|
+
- 转成干净 Markdown
|
|
12
|
+
- 顺手给出 metadata / links / images / quality signals
|
|
13
|
+
|
|
14
|
+
它不做:
|
|
15
|
+
|
|
16
|
+
- 网页抓取
|
|
17
|
+
- 浏览器控制
|
|
18
|
+
- 队列
|
|
19
|
+
- LLM 抽取
|
|
20
|
+
- summary / query / extract
|
|
21
|
+
|
|
22
|
+
## Design principles
|
|
23
|
+
|
|
24
|
+
1. **先把边界钉死**
|
|
25
|
+
- `markmaton` 只吃 HTML
|
|
26
|
+
- 抓取器永远在外面
|
|
27
|
+
|
|
28
|
+
2. **Go 做内核,Python 做壳**
|
|
29
|
+
- Go:解析、转换、后处理
|
|
30
|
+
- Python:CLI、包分发、调用体验
|
|
31
|
+
|
|
32
|
+
3. **Python 和 Go 只通过 JSON 通信**
|
|
33
|
+
- 不走 FFI
|
|
34
|
+
- 不走 Python extension
|
|
35
|
+
- 不把逻辑写两份
|
|
36
|
+
|
|
37
|
+
4. **一条主链先做稳**
|
|
38
|
+
- 先不要三套 fallback
|
|
39
|
+
- 先有一条可靠主路径
|
|
40
|
+
|
|
41
|
+
5. **用真实页面打磨,不靠玩具输入自我感动**
|
|
42
|
+
|
|
43
|
+
## Runtime split
|
|
44
|
+
|
|
45
|
+
### Go responsibilities
|
|
46
|
+
|
|
47
|
+
- clean HTML
|
|
48
|
+
- resolve URLs
|
|
49
|
+
- choose image source
|
|
50
|
+
- convert HTML to Markdown
|
|
51
|
+
- post-process Markdown
|
|
52
|
+
- extract metadata
|
|
53
|
+
- extract links
|
|
54
|
+
- extract images
|
|
55
|
+
- compute lightweight quality signals
|
|
56
|
+
|
|
57
|
+
### Python responsibilities
|
|
58
|
+
|
|
59
|
+
- 调 Go binary
|
|
60
|
+
- 提供 Python API
|
|
61
|
+
- 提供 CLI
|
|
62
|
+
- 处理打包和分发
|
|
63
|
+
- 把结果映射成清楚的数据结构
|
|
64
|
+
|
|
65
|
+
## Contract
|
|
66
|
+
|
|
67
|
+
### Input
|
|
68
|
+
|
|
69
|
+
最小输入:
|
|
70
|
+
|
|
71
|
+
```json
|
|
72
|
+
{
|
|
73
|
+
"url": "https://example.com/article",
|
|
74
|
+
"html": "<html>...</html>"
|
|
75
|
+
}
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
可选输入:
|
|
79
|
+
|
|
80
|
+
```json
|
|
81
|
+
{
|
|
82
|
+
"url": "https://example.com/article",
|
|
83
|
+
"html": "<html>...</html>",
|
|
84
|
+
"final_url": "https://www.example.com/article",
|
|
85
|
+
"content_type": "text/html",
|
|
86
|
+
"options": {
|
|
87
|
+
"only_main_content": true,
|
|
88
|
+
"include_selectors": [],
|
|
89
|
+
"exclude_selectors": []
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Output
|
|
95
|
+
|
|
96
|
+
```json
|
|
97
|
+
{
|
|
98
|
+
"markdown": "...",
|
|
99
|
+
"html_clean": "...",
|
|
100
|
+
"metadata": {
|
|
101
|
+
"title": "...",
|
|
102
|
+
"description": "...",
|
|
103
|
+
"canonical_url": "..."
|
|
104
|
+
},
|
|
105
|
+
"links": ["..."],
|
|
106
|
+
"images": ["..."],
|
|
107
|
+
"quality": {
|
|
108
|
+
"text_length": 12345,
|
|
109
|
+
"link_count": 42,
|
|
110
|
+
"image_count": 7,
|
|
111
|
+
"used_main_content": true,
|
|
112
|
+
"fallback_used": false,
|
|
113
|
+
"quality_score": 0.91
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Module layout
|
|
119
|
+
|
|
120
|
+
推荐的 Go 目录是:
|
|
121
|
+
|
|
122
|
+
```text
|
|
123
|
+
cmd/markmaton-engine/
|
|
124
|
+
internal/cleanhtml/
|
|
125
|
+
internal/resolve/
|
|
126
|
+
internal/convert/
|
|
127
|
+
internal/postprocess/
|
|
128
|
+
internal/metadata/
|
|
129
|
+
internal/links/
|
|
130
|
+
internal/images/
|
|
131
|
+
internal/quality/
|
|
132
|
+
internal/model/
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
每一层职责如下。
|
|
136
|
+
|
|
137
|
+
### `internal/model`
|
|
138
|
+
|
|
139
|
+
- 请求结构
|
|
140
|
+
- 响应结构
|
|
141
|
+
- 中间文档对象
|
|
142
|
+
|
|
143
|
+
### `internal/cleanhtml`
|
|
144
|
+
|
|
145
|
+
- 删除无关节点
|
|
146
|
+
- `only_main_content`
|
|
147
|
+
- include / exclude selectors
|
|
148
|
+
- 保证后续转换拿到的是合理 HTML
|
|
149
|
+
|
|
150
|
+
### `internal/resolve`
|
|
151
|
+
|
|
152
|
+
- `base href`
|
|
153
|
+
- 相对链接转绝对
|
|
154
|
+
- 图片 `src` / `srcset`
|
|
155
|
+
|
|
156
|
+
### `internal/convert`
|
|
157
|
+
|
|
158
|
+
- clean HTML -> Markdown
|
|
159
|
+
- 只负责转换,不做业务判断
|
|
160
|
+
|
|
161
|
+
### `internal/postprocess`
|
|
162
|
+
|
|
163
|
+
- 空行
|
|
164
|
+
- trailing spaces
|
|
165
|
+
- 链接和代码块边角修整
|
|
166
|
+
- Markdown 语义修补
|
|
167
|
+
|
|
168
|
+
### `internal/metadata`
|
|
169
|
+
|
|
170
|
+
- title
|
|
171
|
+
- description
|
|
172
|
+
- canonical
|
|
173
|
+
- OG / Twitter / author / language 这类字段
|
|
174
|
+
|
|
175
|
+
### `internal/links`
|
|
176
|
+
|
|
177
|
+
- 页面链接提取
|
|
178
|
+
|
|
179
|
+
### `internal/images`
|
|
180
|
+
|
|
181
|
+
- 图片提取
|
|
182
|
+
|
|
183
|
+
### `internal/quality`
|
|
184
|
+
|
|
185
|
+
- 判断结果是不是太空
|
|
186
|
+
- 判断是否值得从 main-content 回退到 full-content
|
|
187
|
+
- 给调用方一个可观测的质量信号
|
|
188
|
+
|
|
189
|
+
## Recommended repo shape
|
|
190
|
+
|
|
191
|
+
```text
|
|
192
|
+
markmaton/
|
|
193
|
+
pyproject.toml
|
|
194
|
+
README.md
|
|
195
|
+
go.mod
|
|
196
|
+
go.sum
|
|
197
|
+
|
|
198
|
+
cmd/
|
|
199
|
+
markmaton-engine/
|
|
200
|
+
main.go
|
|
201
|
+
|
|
202
|
+
internal/
|
|
203
|
+
...
|
|
204
|
+
|
|
205
|
+
markmaton/
|
|
206
|
+
__init__.py
|
|
207
|
+
cli.py
|
|
208
|
+
engine.py
|
|
209
|
+
models.py
|
|
210
|
+
|
|
211
|
+
docs/
|
|
212
|
+
tests/
|
|
213
|
+
testdata/
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
## Why CLI + JSON
|
|
217
|
+
|
|
218
|
+
因为这是最稳的组合。
|
|
219
|
+
|
|
220
|
+
优点:
|
|
221
|
+
|
|
222
|
+
- Python 和 Go 边界清楚
|
|
223
|
+
- 不需要维护 FFI
|
|
224
|
+
- 本地开发简单
|
|
225
|
+
- CLI 天然可测试
|
|
226
|
+
- agent 也容易调
|
|
227
|
+
- 以后就算加别的 wrapper,也还是同一个 engine
|
|
228
|
+
|
|
229
|
+
坏处:
|
|
230
|
+
|
|
231
|
+
- 需要管理二进制分发
|
|
232
|
+
|
|
233
|
+
但这个坏处比 FFI 的复杂度轻得多。
|
|
234
|
+
|
|
235
|
+
## Distribution
|
|
236
|
+
|
|
237
|
+
主路径:
|
|
238
|
+
|
|
239
|
+
- Go engine 作为真正执行体
|
|
240
|
+
- Python 包作为分发壳
|
|
241
|
+
- 未来通过 PyPI / `uv tool` 提供安装体验
|
|
242
|
+
|
|
243
|
+
不要做的事:
|
|
244
|
+
|
|
245
|
+
- 不让 Python 侧复制一份转换逻辑
|
|
246
|
+
- 不做 npm-first
|
|
247
|
+
- 不做 HTTP service-first
|
|
248
|
+
|
|
249
|
+
## Quality policy
|
|
250
|
+
|
|
251
|
+
`markmaton` 不该只看“结果是不是空字符串”。
|
|
252
|
+
|
|
253
|
+
更好的质量信号至少包括:
|
|
254
|
+
|
|
255
|
+
- `text_length`
|
|
256
|
+
- `paragraph_count`
|
|
257
|
+
- `link_density`
|
|
258
|
+
- `image_count`
|
|
259
|
+
- `title_present`
|
|
260
|
+
- `quality_score`
|
|
261
|
+
- `used_main_content`
|
|
262
|
+
- `fallback_used`
|
|
263
|
+
|
|
264
|
+
这样外层调用方才知道这次结果是:
|
|
265
|
+
|
|
266
|
+
- 真抓好了
|
|
267
|
+
- 还是只是勉强有输出
|
|
268
|
+
|
|
269
|
+
## Non-goals for v1
|
|
270
|
+
|
|
271
|
+
- 不做站点级花哨规则系统
|
|
272
|
+
- 不做复杂插件市场
|
|
273
|
+
- 不做内置 fetch / Playwright
|
|
274
|
+
- 不做 LLM 结构化抽取
|
|
275
|
+
- 不做“全能 web platform”
|
|
276
|
+
|
|
277
|
+
v1 只求一件事:
|
|
278
|
+
|
|
279
|
+
**把一份 HTML 稳稳地变成值得喂给人和模型的 Markdown。**
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# Benchmark Matrix
|
|
2
|
+
|
|
3
|
+
## Purpose
|
|
4
|
+
|
|
5
|
+
This matrix is the working baseline for parser refinement.
|
|
6
|
+
|
|
7
|
+
It records:
|
|
8
|
+
|
|
9
|
+
- which page classes we care about
|
|
10
|
+
- how we should acquire HTML for comparison
|
|
11
|
+
- how Firecrawl behaves today
|
|
12
|
+
- how `markmaton` behaves today
|
|
13
|
+
- where the likely parser gap lives
|
|
14
|
+
- whether the page should stay a local benchmark or become a regression fixture
|
|
15
|
+
|
|
16
|
+
## Initial benchmark set
|
|
17
|
+
|
|
18
|
+
First-pass benchmark artifacts now exist under:
|
|
19
|
+
|
|
20
|
+
```text
|
|
21
|
+
tmp/benchmarks/markmaton-benchmark-driven-refinement/
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
For this first pass, every benchmark row has:
|
|
25
|
+
|
|
26
|
+
- a local Firecrawl `/v2/scrape` snapshot
|
|
27
|
+
- a cached HTML artifact derived from Firecrawl `rawHtml`
|
|
28
|
+
- a local `markmaton` JSON output snapshot
|
|
29
|
+
|
|
30
|
+
This means the benchmark set is now grounded in cached artifacts rather than in-memory observations.
|
|
31
|
+
For rendered-first rows, we can still add an independent Playwright-rendered capture later if a remaining gap is ambiguous.
|
|
32
|
+
|
|
33
|
+
| ID | Page class | URL | HTML mode | Firecrawl `/v2/scrape` snapshot | `markmaton` current status | Likely gap layer | Fixture decision |
|
|
34
|
+
| --- | --- | --- | --- | --- | --- | --- | --- |
|
|
35
|
+
| BM-01 | Card/list grid | [OpenAI Engineering](https://openai.com/news/engineering/) | rendered | Good coverage, but still opens with site shell and packs card metadata densely | Strong content retention; opens directly on `Engineering`; generic list controls are gone, but card blocks still feel dense | `postprocess` first, then `convert/core` if repeated card-block awkwardness remains | Promote / already represented by `regression/card_grid.html` |
|
|
36
|
+
| BM-02 | Careers landing | [OpenAI Careers](https://openai.com/careers/) | rendered | Strong on hero and feature sections, though it still opens with brand shell | Stable baseline; hero copy and CTA survive well; only a light `Company` shell line remains | baseline protection only; no new parser work driven by this row | Keep as regression baseline |
|
|
37
|
+
| BM-03 | Job detail / application | [Ashby application page](https://jobs.ashbyhq.com/openai/49a16d46-bf3e-4806-a8af-a0e48c26336c/application) | rendered | Strong on job detail and compensation sections | Stable baseline; preserves section headings, location, compensation, and benefits; still carries a top wordmark block | baseline protection only; no new parser work driven by this row | Keep as regression baseline |
|
|
38
|
+
| BM-04 | Jobs landing | [Thermo Fisher jobs landing](https://jobs.thermofisher.com/global/en) | rendered | Good on recruiting message and search-entry framing | Main headline survives, but output is image-led and tile labels are thin; weaker than OpenAI careers | `convert/core` first, with possible `postprocess` cleanup later | Keep local cache first |
|
|
39
|
+
| BM-05 | Application form | [Thermo Fisher apply](https://jobs.thermofisher.com/global/en/apply?jobSeqNo=TFSCGLOBALR01347309EXTERNALENGLOBAL) | rendered | Weak: almost empty markdown, banner-heavy, weak title metadata | Still weak: skip-link noise is gone and quality is now scored more honestly, but the page still falls back to header chrome, duplicate logos, loading text, and minimal useful form content | `cleanhtml` first, then `quality`; likely revisit HTML acquisition before deeper parser work | Keep local cache first |
|
|
40
|
+
| BM-06 | Discussion thread | [Hacker News thread](https://news.ycombinator.com/item?id=40508445) | fetched-first | Weak: table-heavy chrome and comment structure leak heavily into Markdown | Still weak: opening is table markup and nav chrome, not a readable discussion thread | `convert/core` after shell cleanup; thread/timeline layout is not salvageable with light postprocess alone | Candidate for promotion as a thread/timeline fixture |
|
|
41
|
+
| BM-07 | News article with heavy shell | [Yahoo News article](https://www.yahoo.com/news/articles/officials-surprised-impact-customer-habits-210000558.html?guccounter=1&guce_referrer=aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS8&guce_referrer_sig=AQAAAISaELtFiuTfiMrkD6MQ4LMAWD72pO_pLYCdVgO7G1NnfSbfJHJt__y0io-X2d_cZzBXMRIHurNfGZqDLB5Dk6okL3W27RpOS15soJx80eaYBv0avIJmiWlcTfSV1Z5Z6wpNOZTh5EKvQv-iIAr36nt-bD9AH8ddWNjrvRlh5ug1) | rendered-first | Weak opening: ads, homepage return link, top stories, promos | Improved materially: article content now surfaces near the top instead of opening on `Top Stories`, but heavy-shell article quality still scores too optimistically and article presentation still begins with image-heavy promo framing | `cleanhtml` first, then `quality` | Promote if we want a real heavy-shell article regression fixture |
|
|
42
|
+
| BM-08 | Wiki | [Wikipedia LLM page](https://en.wikipedia.org/wiki/Large_language_model) | fetched | Usable but retains wiki chrome, tabs, and maintenance notices | Article body is present, and quality now reflects the shell leakage better, but the page still opens with `Birthday mode`, article/talk tabs, and maintenance-table chrome | `cleanhtml` first; `postprocess` only for small cleanup if needed | Keep local cache first |
|
|
43
|
+
| BM-09 | Docs | [MDN HTTP Overview](https://developer.mozilla.org/en-US/docs/Web/HTTP/Overview) | fetched | Good body capture, but still keeps skip links and minor docs chrome | Strong result: opens directly on `# Overview of HTTP` and reads like real docs Markdown | baseline protection only; useful as a clean docs benchmark | Keep local cache first |
|
|
44
|
+
| BM-10 | Repo/app shell | [zellij repo page](https://github.com/zellij-org/zellij) | rendered-first | Good coverage, but still noisy and structurally dense | Better than the old shell-heavy state, but still opens with repo chrome and a very dense file table before the README body | `convert/core` first; `postprocess` can only shave small edges here | Keep as local rich benchmark; existing simplified repo shell fixture remains regression |
|
|
45
|
+
| BM-11 | Issue timeline | [VS Code issue #286040](https://github.com/microsoft/vscode/issues/286040) | rendered-first | Weak opening: skip links, stale-session alerts, repo chrome, auth chrome | Improved materially in the converter epic: it now opens on the issue title with a single status line, and duplicate issue actions/title echo are gone; assignee/label metadata is still dense before the main body | `convert/core` next for timeline metadata grouping; only light cleanup remains in outer layers | Promote for this converter-layer epic |
|
|
46
|
+
| BM-12 | Product/commerce | [Apple iPhone page](https://www.apple.com/iphone-16-pro/) | rendered-first | Strong page access, but output opens with dense promo/commerce blocks | Main product sections survive, but the page still opens with compressed promo text and commerce CTA density; title/canonical also skew to the broader iPhone hub | `cleanhtml` and `postprocess` first; escalate to `convert/core` only if hero/commerce blocks remain structurally awkward | Keep local cache first |
|
|
47
|
+
|
|
48
|
+
## Current takeaways
|
|
49
|
+
|
|
50
|
+
### Firecrawl is not uniformly clean
|
|
51
|
+
|
|
52
|
+
Current samples show:
|
|
53
|
+
|
|
54
|
+
- good results on careers landing and some docs pages
|
|
55
|
+
- middling results on heavy article shells and wiki chrome; the application form (`BM-05`) is weaker still, with almost empty markdown
|
|
56
|
+
- weak results on thread/timeline pages and GitHub-style app pages
|
|
57
|
+
- good page access does not automatically mean good article or product-page prioritization
|
|
58
|
+
|
|
59
|
+
This reinforces the right comparison stance:
|
|
60
|
+
|
|
61
|
+
- use Firecrawl as a mature behavioral reference
|
|
62
|
+
- do not chase exact output parity
|
|
63
|
+
|
|
64
|
+
### `markmaton` is already competitive on several classes
|
|
65
|
+
|
|
66
|
+
Current local observations show:
|
|
67
|
+
|
|
68
|
+
- careers landing pages are already strong
|
|
69
|
+
- job detail / application pages (`BM-03`) are already strong; the standalone application form (`BM-05`) is still weak
|
|
70
|
+
- docs pages are already close to “good enough” for the current parser shape
|
|
71
|
+
- shell-heavy pages have improved materially after `cleanhtml` hardening
|
|
72
|
+
- heavy-shell article pages now respond to better root selection, which is a sign that this layer is still worth improving
|
|
73
|
+
- card/list pages are improving, but still need a more mature organization layer
|
|
74
|
+
- thread/timeline and repo/app pages are now clearly the strongest argument for a later `convert/core` customization layer
|
|
75
|
+
|
|
76
|
+
## Promotion candidates for the next parser slice
|
|
77
|
+
|
|
78
|
+
Highest-value candidates for curated regression fixtures after local capture:
|
|
79
|
+
|
|
80
|
+
1. a richer heavy-shell article page
|
|
81
|
+
2. a discussion/timeline page
|
|
82
|
+
3. one richer app-shell page beyond the simplified repo fixture
|
|
83
|
+
4. optionally, a failed or degraded application flow page if we want one “bad form page” benchmark
|
|
84
|
+
|
|
85
|
+
These are the places where synthetic fixtures are most likely to miss the real parser failure mode.
|
|
86
|
+
|
|
87
|
+
## Harder second-tier benchmark set
|
|
88
|
+
|
|
89
|
+
These rows deliberately raise the difficulty level.
|
|
90
|
+
They are still evaluated as **general parser patterns**, not as site-specific targets.
|
|
91
|
+
|
|
92
|
+
| ID | Page class | URL | HTML mode | Firecrawl `/v2/scrape` snapshot | `markmaton` current status | Likely gap layer | Fixture decision |
|
|
93
|
+
| --- | --- | --- | --- | --- | --- | --- | --- |
|
|
94
|
+
| BM-13 | Q&A / answer thread | [Stack Overflow question](https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags) | fetched | Very noisy opening with Collectives shell, promo copy, and community chrome before the question body | Improved materially in the converter epic: the Markdown now opens directly on the question body and strips the most obvious interaction controls, but answer-score/metadata lines are still dense and quality remains too optimistic | `quality` first; later `convert/core` if answer hierarchy remains hard to read | Promote for this converter-layer epic |
|
|
95
|
+
| BM-14 | PR diff / files changed | [zellij PR files changed](https://github.com/zellij-org/zellij/pull/5012/files) | rendered | Opens with skip links, stale-session chrome, auth chrome, and repo shell before the PR body | Still opens with repo chrome, notifications, and repo stats before the PR title; diff/files structure is not yet expressed cleanly | `cleanhtml` first, then `convert/core` | Strong candidate for a richer timeline/app-shell benchmark |
|
|
96
|
+
| BM-15 | API docs with tabs and code samples | [Stripe API docs](https://docs.stripe.com/api/payment_intents/create) | rendered | Noisy docs shell with search and docs navigation before endpoint content | Good body prioritization: opens directly on the endpoint section, but docs affordances like `Was this section helpful?YesNo` still leak through | `cleanhtml` first, then `postprocess` | Keep local cache first |
|
|
97
|
+
| BM-16 | Product comparison / commerce | [Apple iPhone compare](https://www.apple.com/iphone/compare/) | rendered | Accessible, but very long and commerce-heavy; opens on promo/trade-in copy before comparison content | Similar behavior: comparison title survives, but the page still opens with promo/trade-in copy and dense commerce scaffolding | `postprocess` first, then `convert/core` only if comparison blocks remain structurally awkward | Keep local cache first |
|
|
98
|
+
|
|
99
|
+
## What the harder tier changes
|
|
100
|
+
|
|
101
|
+
The second-tier set sharpens the next-phase picture:
|
|
102
|
+
|
|
103
|
+
- `BM-13` shows that a general parser can beat Firecrawl on opening priority while still being weak on thread/answer structure.
|
|
104
|
+
- `BM-14` is a stronger signal than the plain repo page that app-shell and timeline-like views are now a converter-layer problem.
|
|
105
|
+
- `BM-15` confirms that docs pages can already be strong without site-specific rules, as long as shell cleanup keeps improving.
|
|
106
|
+
- `BM-16` shows that long commerce/comparison pages are not only shell-heavy; they also challenge block organization and ranking of what should appear first.
|
|
107
|
+
|
|
108
|
+
## Promotion decision for the current converter epic
|
|
109
|
+
|
|
110
|
+
Promote these rows into repo-backed regression fixtures now:
|
|
111
|
+
|
|
112
|
+
1. `BM-13` Stack Overflow question
|
|
113
|
+
2. `BM-11` GitHub issue timeline
|
|
114
|
+
|
|
115
|
+
Keep these rows as local-only hard benchmarks for now:
|
|
116
|
+
|
|
117
|
+
1. `BM-14` GitHub PR files changed
|
|
118
|
+
2. `BM-15` Stripe API docs
|
|
119
|
+
3. `BM-16` Apple iPhone compare
|
|
120
|
+
|
|
121
|
+
This keeps the first converter epic focused on:
|
|
122
|
+
|
|
123
|
+
- one discussion/Q&A structure
|
|
124
|
+
- one app-shell/timeline structure
|
|
125
|
+
|
|
126
|
+
without pulling diff-table handling and commerce-comparison ranking into the first converter pass.
|
|
127
|
+
|
|
128
|
+
## Converter-layer epic update
|
|
129
|
+
|
|
130
|
+
The first converter customization pass has now landed for the two promoted rows:
|
|
131
|
+
|
|
132
|
+
- `BM-13` now opens directly on the question body and no longer carries the worst control-line clutter at the top.
|
|
133
|
+
- `BM-11` now opens directly on the issue title, with issue-action noise and the redundant linked title echo removed.
|
|
134
|
+
|
|
135
|
+
What remains after this pass is more clearly structural than shell-related:
|
|
136
|
+
|
|
137
|
+
- answer/timeline metadata density
|
|
138
|
+
- grouping of assignees/labels/status blocks
|
|
139
|
+
- quality scoring that is still too optimistic on structure-heavy pages
|