markmaton 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. markmaton-0.1.4/.gitignore +31 -0
  2. markmaton-0.1.4/LICENSE +21 -0
  3. markmaton-0.1.4/PKG-INFO +71 -0
  4. markmaton-0.1.4/README.md +51 -0
  5. markmaton-0.1.4/cmd/markmaton-engine/main.go +58 -0
  6. markmaton-0.1.4/docs/architecture-brief.md +279 -0
  7. markmaton-0.1.4/docs/benchmark-matrix.md +139 -0
  8. markmaton-0.1.4/docs/benchmark-workflow.md +219 -0
  9. markmaton-0.1.4/docs/firecrawl-reference-audit.md +253 -0
  10. markmaton-0.1.4/docs/firecrawl-scrape-traceback.md +461 -0
  11. markmaton-0.1.4/docs/implementation-handoff.md +175 -0
  12. markmaton-0.1.4/docs/local-smoke.md +98 -0
  13. markmaton-0.1.4/docs/packaging-layout.md +99 -0
  14. markmaton-0.1.4/docs/pypi-release.md +120 -0
  15. markmaton-0.1.4/go.mod +14 -0
  16. markmaton-0.1.4/go.sum +85 -0
  17. markmaton-0.1.4/hatch_build.py +112 -0
  18. markmaton-0.1.4/internal/cleanhtml/clean.go +196 -0
  19. markmaton-0.1.4/internal/cleanhtml/clean_test.go +173 -0
  20. markmaton-0.1.4/internal/convert/builder.go +142 -0
  21. markmaton-0.1.4/internal/convert/builder_test.go +66 -0
  22. markmaton-0.1.4/internal/convert/convert.go +14 -0
  23. markmaton-0.1.4/internal/convert/convert_test.go +41 -0
  24. markmaton-0.1.4/internal/convert/hooks.go +436 -0
  25. markmaton-0.1.4/internal/convert/hooks_test.go +216 -0
  26. markmaton-0.1.4/internal/convert/plugins.go +20 -0
  27. markmaton-0.1.4/internal/engine/process.go +91 -0
  28. markmaton-0.1.4/internal/engine/process_test.go +317 -0
  29. markmaton-0.1.4/internal/images/extract.go +34 -0
  30. markmaton-0.1.4/internal/images/extract_test.go +14 -0
  31. markmaton-0.1.4/internal/links/extract.go +34 -0
  32. markmaton-0.1.4/internal/links/extract_test.go +14 -0
  33. markmaton-0.1.4/internal/metadata/extract.go +50 -0
  34. markmaton-0.1.4/internal/metadata/extract_test.go +37 -0
  35. markmaton-0.1.4/internal/model/types.go +78 -0
  36. markmaton-0.1.4/internal/postprocess/postprocess.go +127 -0
  37. markmaton-0.1.4/internal/postprocess/postprocess_test.go +80 -0
  38. markmaton-0.1.4/internal/quality/quality.go +204 -0
  39. markmaton-0.1.4/internal/quality/quality_test.go +95 -0
  40. markmaton-0.1.4/internal/resolve/resolve.go +138 -0
  41. markmaton-0.1.4/internal/resolve/resolve_test.go +33 -0
  42. markmaton-0.1.4/internal/testutil/testdata.go +41 -0
  43. markmaton-0.1.4/markmaton/__init__.py +15 -0
  44. markmaton-0.1.4/markmaton/cli.py +99 -0
  45. markmaton-0.1.4/markmaton/engine.py +62 -0
  46. markmaton-0.1.4/markmaton/models.py +113 -0
  47. markmaton-0.1.4/pyproject.toml +51 -0
  48. markmaton-0.1.4/testdata/fixtures/core/article.html +30 -0
  49. markmaton-0.1.4/testdata/fixtures/core/docs.html +23 -0
  50. markmaton-0.1.4/testdata/fixtures/core/news.html +23 -0
  51. markmaton-0.1.4/testdata/fixtures/regression/card_grid.html +37 -0
  52. markmaton-0.1.4/testdata/fixtures/regression/careers_landing.html +25 -0
  53. markmaton-0.1.4/testdata/fixtures/regression/github_issue_timeline.html +1769 -0
  54. markmaton-0.1.4/testdata/fixtures/regression/github_repo_shell.html +38 -0
  55. markmaton-0.1.4/testdata/fixtures/regression/job_detail.html +22 -0
  56. markmaton-0.1.4/testdata/fixtures/regression/openai_blog_shell.html +43 -0
  57. markmaton-0.1.4/testdata/fixtures/regression/stackoverflow_question_thread.html +7161 -0
  58. markmaton-0.1.4/testdata/golden/core/article.md +12 -0
  59. markmaton-0.1.4/testdata/golden/core/docs.md +10 -0
  60. markmaton-0.1.4/testdata/golden/core/news.md +9 -0
  61. markmaton-0.1.4/tests/__init__.py +1 -0
  62. markmaton-0.1.4/tests/smoke/installed_cli_smoke.py +41 -0
  63. markmaton-0.1.4/tests/unit/__init__.py +1 -0
  64. markmaton-0.1.4/tests/unit/test_cli.py +58 -0
  65. markmaton-0.1.4/tests/unit/test_engine.py +52 -0
  66. markmaton-0.1.4/tests/unit/test_hatch_build.py +43 -0
  67. markmaton-0.1.4/tests/unit/test_models.py +36 -0
@@ -0,0 +1,31 @@
1
+ # macOS
2
+ .DS_Store
3
+
4
+ # Python
5
+ __pycache__/
6
+ .pytest_cache/
7
+ .mypy_cache/
8
+ .ruff_cache/
9
+ .venv/
10
+ venv/
11
+ build/
12
+ dist/
13
+ *.egg-info/
14
+ .hatch-build/
15
+
16
+ # Coverage
17
+ .coverage
18
+ coverage.xml
19
+ htmlcov/
20
+
21
+ # Go
22
+ bin/
23
+ markmaton/bin/markmaton-engine*
24
+ markmaton/bin/markmaton-engine.exe
25
+ *.test
26
+ coverage.out
27
+ tmp/
28
+
29
+ # Editors
30
+ .idea/
31
+ .vscode/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 appautomaton
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,71 @@
1
+ Metadata-Version: 2.4
2
+ Name: markmaton
3
+ Version: 0.1.4
4
+ Summary: Lightweight HTML-to-Markdown tooling for agent workflows.
5
+ Project-URL: Homepage, https://github.com/appautomaton/markmaton
6
+ Project-URL: Repository, https://github.com/appautomaton/markmaton
7
+ Project-URL: Issues, https://github.com/appautomaton/markmaton/issues
8
+ Author: appautomaton
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 2 - Pre-Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Software Development :: Libraries
18
+ Requires-Python: >=3.9
19
+ Description-Content-Type: text/markdown
20
+
21
+ # markmaton
22
+
23
+ Lightweight HTML-to-Markdown tooling for agent workflows.
24
+
25
+ ## Status
26
+
27
+ This repository is intentionally starting small.
28
+
29
+ The current goal is to build a clean, fast parser core that can:
30
+
31
+ - take normalized page HTML from tools like Playwright, `fetch`, or no-driver
32
+ - clean the page structure
33
+ - return robust Markdown and page metadata
34
+
35
+ ## Direction
36
+
37
+ - parser core: Go
38
+ - distribution: Python packaging / PyPI
39
+ - first focus: library and CLI for local agent use
40
+ - release track: GitHub Actions + Trusted Publishing
41
+
42
+ ## Current shape
43
+
44
+ - Go engine: `cmd/markmaton-engine`
45
+ - Python wrapper: `markmaton/`
46
+ - Architecture docs: `docs/`
47
+ - Plans and issue CSVs: `plan/` and `issues/`
48
+
49
+ ## Testing policy
50
+
51
+ - automated tests should be unit-test-first
52
+ - parser module tests should use local fixtures and golden files
53
+ - Python wrapper tests should mock the engine boundary
54
+ - real engine checks stay manual unless there is a strong reason to automate them
55
+
56
+ ## Testing layout
57
+
58
+ - Go package unit tests live beside each package under `internal/*`.
59
+ - Shared Go fixture/golden helpers live in `internal/testutil/`.
60
+ - Stable parser fixtures live under `testdata/fixtures/core/`.
61
+ - Real-world regression fixtures live under `testdata/fixtures/regression/`.
62
+ - Golden markdown outputs for stable core fixtures live under `testdata/golden/core/`.
63
+ - Python wrapper tests live under `tests/unit/`.
64
+
65
+ ## Local smoke
66
+
67
+ See:
68
+
69
+ - `docs/local-smoke.md`
70
+ - `docs/packaging-layout.md`
71
+ - `docs/pypi-release.md`
@@ -0,0 +1,51 @@
1
+ # markmaton
2
+
3
+ Lightweight HTML-to-Markdown tooling for agent workflows.
4
+
5
+ ## Status
6
+
7
+ This repository is intentionally starting small.
8
+
9
+ The current goal is to build a clean, fast parser core that can:
10
+
11
+ - take normalized page HTML from tools like Playwright, `fetch`, or no-driver
12
+ - clean the page structure
13
+ - return robust Markdown and page metadata
14
+
15
+ ## Direction
16
+
17
+ - parser core: Go
18
+ - distribution: Python packaging / PyPI
19
+ - first focus: library and CLI for local agent use
20
+ - release track: GitHub Actions + Trusted Publishing
21
+
22
+ ## Current shape
23
+
24
+ - Go engine: `cmd/markmaton-engine`
25
+ - Python wrapper: `markmaton/`
26
+ - Architecture docs: `docs/`
27
+ - Plans and issue CSVs: `plan/` and `issues/`
28
+
29
+ ## Testing policy
30
+
31
+ - automated tests should be unit-test-first
32
+ - parser module tests should use local fixtures and golden files
33
+ - Python wrapper tests should mock the engine boundary
34
+ - real engine checks stay manual unless there is a strong reason to automate them
35
+
36
+ ## Testing layout
37
+
38
+ - Go package unit tests live beside each package under `internal/*`.
39
+ - Shared Go fixture/golden helpers live in `internal/testutil/`.
40
+ - Stable parser fixtures live under `testdata/fixtures/core/`.
41
+ - Real-world regression fixtures live under `testdata/fixtures/regression/`.
42
+ - Golden markdown outputs for stable core fixtures live under `testdata/golden/core/`.
43
+ - Python wrapper tests live under `tests/unit/`.
44
+
45
+ ## Local smoke
46
+
47
+ See:
48
+
49
+ - `docs/local-smoke.md`
50
+ - `docs/packaging-layout.md`
51
+ - `docs/pypi-release.md`
@@ -0,0 +1,58 @@
1
+ package main
2
+
3
+ import (
4
+ "encoding/json"
5
+ "errors"
6
+ "fmt"
7
+ "io"
8
+ "os"
9
+
10
+ "github.com/appautomaton/markmaton/internal/engine"
11
+ "github.com/appautomaton/markmaton/internal/model"
12
+ )
13
+
14
+ func main() {
15
+ request, err := readRequest(os.Stdin)
16
+ if err != nil {
17
+ fail(err)
18
+ }
19
+
20
+ response, err := engine.Process(request)
21
+ if err != nil {
22
+ fail(err)
23
+ }
24
+
25
+ encoder := json.NewEncoder(os.Stdout)
26
+ encoder.SetEscapeHTML(false)
27
+ if err := encoder.Encode(response); err != nil {
28
+ fail(err)
29
+ }
30
+ }
31
+
32
+ func readRequest(r io.Reader) (model.Request, error) {
33
+ var request model.Request
34
+
35
+ payload, err := io.ReadAll(r)
36
+ if err != nil {
37
+ return request, fmt.Errorf("read request: %w", err)
38
+ }
39
+ if len(payload) == 0 {
40
+ return request, errors.New("request body is empty")
41
+ }
42
+
43
+ if err := json.Unmarshal(payload, &request); err != nil {
44
+ return request, fmt.Errorf("decode request: %w", err)
45
+ }
46
+
47
+ request.ApplyDefaults()
48
+ if err := request.Validate(); err != nil {
49
+ return request, err
50
+ }
51
+
52
+ return request, nil
53
+ }
54
+
55
// fail writes err followed by a newline to stderr and terminates the process
// with exit status 1. It never returns. A failure to write the message is
// deliberately ignored, since the process is exiting anyway.
func fail(err error) {
	const exitStatus = 1

	message := fmt.Sprintf("%s\n", err)
	_, _ = os.Stderr.WriteString(message)

	os.Exit(exitStatus)
}
@@ -0,0 +1,279 @@
1
+ # Markmaton Architecture Brief
2
+
3
+ ## What markmaton is
4
+
5
+ `markmaton` 是一个很轻的 parser core。
6
+
7
+ 它只做:
8
+
9
+ - 输入一份已经拿到的 HTML
10
+ - 清洗页面结构
11
+ - 转成干净 Markdown
12
+ - 顺手给出 metadata / links / images / quality signals
13
+
14
+ 它不做:
15
+
16
+ - 网页抓取
17
+ - 浏览器控制
18
+ - 队列
19
+ - LLM 抽取
20
+ - summary / query / extract
21
+
22
+ ## Design principles
23
+
24
+ 1. **先把边界钉死**
25
+ - `markmaton` 只吃 HTML
26
+ - 抓取器永远在外面
27
+
28
+ 2. **Go 做内核,Python 做壳**
29
+ - Go:解析、转换、后处理
30
+ - Python:CLI、包分发、调用体验
31
+
32
+ 3. **Python 和 Go 只通过 JSON 通信**
33
+ - 不走 FFI
34
+ - 不走 Python extension
35
+ - 不把逻辑写两份
36
+
37
+ 4. **一条主链先做稳**
38
+ - 先不要三套 fallback
39
+ - 先有一条可靠主路径
40
+
41
+ 5. **用真实页面打磨,不靠玩具输入自我感动**
42
+
43
+ ## Runtime split
44
+
45
+ ### Go responsibilities
46
+
47
+ - clean HTML
48
+ - resolve URLs
49
+ - choose image source
50
+ - convert HTML to Markdown
51
+ - post-process Markdown
52
+ - extract metadata
53
+ - extract links
54
+ - extract images
55
+ - compute lightweight quality signals
56
+
57
+ ### Python responsibilities
58
+
59
+ - 调 Go binary
60
+ - 提供 Python API
61
+ - 提供 CLI
62
+ - 处理打包和分发
63
+ - 把结果映射成清楚的数据结构
64
+
65
+ ## Contract
66
+
67
+ ### Input
68
+
69
+ 最小输入:
70
+
71
+ ```json
72
+ {
73
+ "url": "https://example.com/article",
74
+ "html": "<html>...</html>"
75
+ }
76
+ ```
77
+
78
+ 可选输入:
79
+
80
+ ```json
81
+ {
82
+ "url": "https://example.com/article",
83
+ "html": "<html>...</html>",
84
+ "final_url": "https://www.example.com/article",
85
+ "content_type": "text/html",
86
+ "options": {
87
+ "only_main_content": true,
88
+ "include_selectors": [],
89
+ "exclude_selectors": []
90
+ }
91
+ }
92
+ ```
93
+
94
+ ### Output
95
+
96
+ ```json
97
+ {
98
+ "markdown": "...",
99
+ "html_clean": "...",
100
+ "metadata": {
101
+ "title": "...",
102
+ "description": "...",
103
+ "canonical_url": "..."
104
+ },
105
+ "links": ["..."],
106
+ "images": ["..."],
107
+ "quality": {
108
+ "text_length": 12345,
109
+ "link_count": 42,
110
+ "image_count": 7,
111
+ "used_main_content": true,
112
+ "fallback_used": false,
113
+ "quality_score": 0.91
114
+ }
115
+ }
116
+ ```
117
+
118
+ ## Module layout
119
+
120
+ 推荐的 Go 目录是:
121
+
122
+ ```text
123
+ cmd/markmaton-engine/
124
+ internal/cleanhtml/
125
+ internal/resolve/
126
+ internal/convert/
127
+ internal/postprocess/
128
+ internal/metadata/
129
+ internal/links/
130
+ internal/images/
131
+ internal/quality/
132
+ internal/model/
133
+ ```
134
+
135
+ 每一层职责如下。
136
+
137
+ ### `internal/model`
138
+
139
+ - 请求结构
140
+ - 响应结构
141
+ - 中间文档对象
142
+
143
+ ### `internal/cleanhtml`
144
+
145
+ - 删除无关节点
146
+ - `only_main_content`
147
+ - include / exclude selectors
148
+ - 保证后续转换拿到的是合理 HTML
149
+
150
+ ### `internal/resolve`
151
+
152
+ - `base href`
153
+ - 相对链接转绝对
154
+ - 图片 `src` / `srcset`
155
+
156
+ ### `internal/convert`
157
+
158
+ - clean HTML -> Markdown
159
+ - 只负责转换,不做业务判断
160
+
161
+ ### `internal/postprocess`
162
+
163
+ - 空行
164
+ - trailing spaces
165
+ - 链接和代码块边角修整
166
+ - Markdown 语义修补
167
+
168
+ ### `internal/metadata`
169
+
170
+ - title
171
+ - description
172
+ - canonical
173
+ - OG / Twitter / author / language 这类字段
174
+
175
+ ### `internal/links`
176
+
177
+ - 页面链接提取
178
+
179
+ ### `internal/images`
180
+
181
+ - 图片提取
182
+
183
+ ### `internal/quality`
184
+
185
+ - 判断结果是不是太空
186
+ - 判断是否值得从 main-content 回退到 full-content
187
+ - 给调用方一个可观测的质量信号
188
+
189
+ ## Recommended repo shape
190
+
191
+ ```text
192
+ markmaton/
193
+ pyproject.toml
194
+ README.md
195
+ go.mod
196
+ go.sum
197
+
198
+ cmd/
199
+ markmaton-engine/
200
+ main.go
201
+
202
+ internal/
203
+ ...
204
+
205
+ markmaton/
206
+ __init__.py
207
+ cli.py
208
+ engine.py
209
+ models.py
210
+
211
+ docs/
212
+ tests/
213
+ testdata/
214
+ ```
215
+
216
+ ## Why CLI + JSON
217
+
218
+ 因为这是最稳的组合。
219
+
220
+ 优点:
221
+
222
+ - Python 和 Go 边界清楚
223
+ - 不需要维护 FFI
224
+ - 本地开发简单
225
+ - CLI 天然可测试
226
+ - agent 也容易调
227
+ - 以后就算加别的 wrapper,也还是同一个 engine
228
+
229
+ 坏处:
230
+
231
+ - 需要管理二进制分发
232
+
233
+ 但这个坏处比 FFI 的复杂度轻得多。
234
+
235
+ ## Distribution
236
+
237
+ 主路径:
238
+
239
+ - Go engine 作为真正执行体
240
+ - Python 包作为分发壳
241
+ - 未来通过 PyPI / `uv tool` 提供安装体验
242
+
243
+ 不要做的事:
244
+
245
+ - 不让 Python 侧复制一份转换逻辑
246
+ - 不做 npm-first
247
+ - 不做 HTTP service-first
248
+
249
+ ## Quality policy
250
+
251
+ `markmaton` 不该只看“结果是不是空字符串”。
252
+
253
+ 更好的质量信号至少包括:
254
+
255
+ - `text_length`
256
+ - `paragraph_count`
257
+ - `link_density`
258
+ - `image_count`
259
+ - `title_present`
260
+ - `quality_score`
261
+ - `used_main_content`
262
+ - `fallback_used`
263
+
264
+ 这样外层调用方才知道这次结果是:
265
+
266
+ - 真抓好了
267
+ - 还是只是勉强有输出
268
+
269
+ ## Non-goals for v1
270
+
271
+ - 不做站点级花哨规则系统
272
+ - 不做复杂插件市场
273
+ - 不做内置 fetch / Playwright
274
+ - 不做 LLM 结构化抽取
275
+ - 不做“全能 web platform”
276
+
277
+ v1 只求一件事:
278
+
279
+ **把一份 HTML 稳稳地变成值得喂给人和模型的 Markdown。**
@@ -0,0 +1,139 @@
1
+ # Benchmark Matrix
2
+
3
+ ## Purpose
4
+
5
+ This matrix is the working baseline for parser refinement.
6
+
7
+ It records:
8
+
9
+ - which page classes we care about
10
+ - how we should acquire HTML for comparison
11
+ - how Firecrawl behaves today
12
+ - how `markmaton` behaves today
13
+ - where the likely parser gap lives
14
+ - whether the page should stay a local benchmark or become a regression fixture
15
+
16
+ ## Initial benchmark set
17
+
18
+ First-pass benchmark artifacts now exist under:
19
+
20
+ ```text
21
+ tmp/benchmarks/markmaton-benchmark-driven-refinement/
22
+ ```
23
+
24
+ For this first pass, every benchmark row has:
25
+
26
+ - a local Firecrawl `/v2/scrape` snapshot
27
+ - a cached HTML artifact derived from Firecrawl `rawHtml`
28
+ - a local `markmaton` JSON output snapshot
29
+
30
+ This means the benchmark set is now grounded in cached artifacts rather than in-memory observations.
31
+ For rendered-first rows, we can still add an independent Playwright-rendered capture later if a remaining gap is ambiguous.
32
+
33
+ | ID | Page class | URL | HTML mode | Firecrawl `/v2/scrape` snapshot | `markmaton` current status | Likely gap layer | Fixture decision |
34
+ | --- | --- | --- | --- | --- | --- | --- | --- |
35
+ | BM-01 | Card/list grid | [OpenAI Engineering](https://openai.com/news/engineering/) | rendered | Good coverage, but still opens with site shell and packs card metadata densely | Strong content retention; opens directly on `Engineering`; generic list controls are gone, but card blocks still feel dense | `postprocess` first, then `convert/core` if repeated card-block awkwardness remains | Promote / already represented by `regression/card_grid.html` |
36
+ | BM-02 | Careers landing | [OpenAI Careers](https://openai.com/careers/) | rendered | Strong on hero and feature sections, though it still opens with brand shell | Stable baseline; hero copy and CTA survive well; only a light `Company` shell line remains | baseline protection only; no new parser work driven by this row | Keep as regression baseline |
37
+ | BM-03 | Job detail / application | [Ashby application page](https://jobs.ashbyhq.com/openai/49a16d46-bf3e-4806-a8af-a0e48c26336c/application) | rendered | Strong on job detail and compensation sections | Stable baseline; preserves section headings, location, compensation, and benefits; still carries a top wordmark block | baseline protection only; no new parser work driven by this row | Keep as regression baseline |
38
+ | BM-04 | Jobs landing | [Thermo Fisher jobs landing](https://jobs.thermofisher.com/global/en) | rendered | Good on recruiting message and search-entry framing | Main headline survives, but output is image-led and tile labels are thin; weaker than OpenAI careers | `convert/core` first, with possible `postprocess` cleanup later | Keep local cache first |
39
+ | BM-05 | Application form | [Thermo Fisher apply](https://jobs.thermofisher.com/global/en/apply?jobSeqNo=TFSCGLOBALR01347309EXTERNALENGLOBAL) | rendered | Weak: almost empty markdown, banner-heavy, weak title metadata | Still weak: skip-link noise is gone and quality is now scored more honestly, but the page still falls back to header chrome, duplicate logos, loading text, and minimal useful form content | `cleanhtml` first, then `quality`; likely revisit HTML acquisition before deeper parser work | Keep local cache first |
40
+ | BM-06 | Discussion thread | [Hacker News thread](https://news.ycombinator.com/item?id=40508445) | fetched-first | Weak: table-heavy chrome and comment structure leak heavily into Markdown | Still weak: opening is table markup and nav chrome, not a readable discussion thread | `convert/core` after shell cleanup; thread/timeline layout is not salvageable with light postprocess alone | Candidate for promotion as a thread/timeline fixture |
41
+ | BM-07 | News article with heavy shell | [Yahoo News article](https://www.yahoo.com/news/articles/officials-surprised-impact-customer-habits-210000558.html?guccounter=1&guce_referrer=aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS8&guce_referrer_sig=AQAAAISaELtFiuTfiMrkD6MQ4LMAWD72pO_pLYCdVgO7G1NnfSbfJHJt__y0io-X2d_cZzBXMRIHurNfGZqDLB5Dk6okL3W27RpOS15soJx80eaYBv0avIJmiWlcTfSV1Z5Z6wpNOZTh5EKvQv-iIAr36nt-bD9AH8ddWNjrvRlh5ug1) | rendered-first | Weak opening: ads, homepage return link, top stories, promos | Improved materially: article content now surfaces near the top instead of opening on `Top Stories`, but heavy-shell article quality still scores too optimistically and article presentation still begins with image-heavy promo framing | `cleanhtml` first, then `quality` | Promote if we want a real heavy-shell article regression fixture |
42
+ | BM-08 | Wiki | [Wikipedia LLM page](https://en.wikipedia.org/wiki/Large_language_model) | fetched | Usable but retains wiki chrome, tabs, and maintenance notices | Article body is present, and quality now reflects the shell leakage better, but the page still opens with `Birthday mode`, article/talk tabs, and maintenance-table chrome | `cleanhtml` first; `postprocess` only for small cleanup if needed | Keep local cache first |
43
+ | BM-09 | Docs | [MDN HTTP Overview](https://developer.mozilla.org/en-US/docs/Web/HTTP/Overview) | fetched | Good body capture, but still keeps skip links and minor docs chrome | Strong result: opens directly on `# Overview of HTTP` and reads like real docs Markdown | baseline protection only; useful as a clean docs benchmark | Keep local cache first |
44
+ | BM-10 | Repo/app shell | [zellij repo page](https://github.com/zellij-org/zellij) | rendered-first | Good coverage, but still noisy and structurally dense | Better than the old shell-heavy state, but still opens with repo chrome and a very dense file table before the README body | `convert/core` first; `postprocess` can only shave small edges here | Keep as local rich benchmark; existing simplified repo shell fixture remains regression |
45
+ | BM-11 | Issue timeline | [VS Code issue #286040](https://github.com/microsoft/vscode/issues/286040) | rendered-first | Weak opening: skip links, stale-session alerts, repo chrome, auth chrome | Improved materially in the converter epic: it now opens on the issue title with a single status line, and duplicate issue actions/title echo are gone; assignee/label metadata is still dense before the main body | `convert/core` next for timeline metadata grouping; only light cleanup remains in outer layers | Promote for this converter-layer epic |
46
+ | BM-12 | Product/commerce | [Apple iPhone page](https://www.apple.com/iphone-16-pro/) | rendered-first | Strong page access, but output opens with dense promo/commerce blocks | Main product sections survive, but the page still opens with compressed promo text and commerce CTA density; title/canonical also skew to the broader iPhone hub | `cleanhtml` and `postprocess` first; escalate to `convert/core` only if hero/commerce blocks remain structurally awkward | Keep local cache first |
47
+
48
+ ## Current takeaways
49
+
50
+ ### Firecrawl is not uniformly clean
51
+
52
+ Current samples show:
53
+
54
+ - good results on careers landing and some docs pages
55
+ - middling results on heavy article shells, wiki chrome, and forms
56
+ - weak results on thread/timeline pages and GitHub-style app pages
57
+ - good page access does not automatically mean good article or product-page prioritization
58
+
59
+ This reinforces the right comparison stance:
60
+
61
+ - use Firecrawl as a mature behavioral reference
62
+ - do not chase exact output parity
63
+
64
+ ### `markmaton` is already competitive on several classes
65
+
66
+ Current local observations show:
67
+
68
+ - careers landing pages are already strong
69
+ - job detail / application pages (e.g. BM-03) are already strong, although degraded application-form pages (BM-05) remain weak
70
+ - docs pages are already close to “good enough” for the current parser shape
71
+ - shell-heavy pages have improved materially after cleanhtml hardening
72
+ - heavy-shell article pages now respond to better root selection, which is a sign that this layer is still worth improving
73
+ - card/list pages are improving, but still need a more mature organization layer
74
+ - thread/timeline and repo/app pages are now clearly the strongest argument for a later `convert/core` customization layer
75
+
76
+ ## Promotion candidates for the next parser slice
77
+
78
+ Highest-value candidates for curated regression fixtures after local capture:
79
+
80
+ 1. a richer heavy-shell article page
81
+ 2. a discussion/timeline page
82
+ 3. one richer app-shell page beyond the simplified repo fixture
83
+ 4. optionally, a failed or degraded application flow page if we want one “bad form page” benchmark
84
+
85
+ These are the places where synthetic fixtures are most likely to miss the real parser failure mode.
86
+
87
+ ## Harder second-tier benchmark set
88
+
89
+ These rows deliberately raise the difficulty level.
90
+ They are still evaluated as **general parser patterns**, not as site-specific targets.
91
+
92
+ | ID | Page class | URL | HTML mode | Firecrawl `/v2/scrape` snapshot | `markmaton` current status | Likely gap layer | Fixture decision |
93
+ | --- | --- | --- | --- | --- | --- | --- | --- |
94
+ | BM-13 | Q&A / answer thread | [Stack Overflow question](https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags) | fetched | Very noisy opening with Collectives shell, promo copy, and community chrome before the question body | Improved materially in the converter epic: the Markdown now opens directly on the question body and strips the most obvious interaction controls, but answer-score/metadata lines are still dense and quality remains too optimistic | `quality` first; later `convert/core` if answer hierarchy remains hard to read | Promote for this converter-layer epic |
95
+ | BM-14 | PR diff / files changed | [zellij PR files changed](https://github.com/zellij-org/zellij/pull/5012/files) | rendered | Opens with skip links, stale-session chrome, auth chrome, and repo shell before the PR body | Still opens with repo chrome, notifications, and repo stats before the PR title; diff/files structure is not yet expressed cleanly | `cleanhtml` first, then `convert/core` | Strong candidate for a richer timeline/app-shell benchmark |
96
+ | BM-15 | API docs with tabs and code samples | [Stripe API docs](https://docs.stripe.com/api/payment_intents/create) | rendered | Noisy docs shell with search and docs navigation before endpoint content | Good body prioritization: opens directly on the endpoint section, but docs affordances like `Was this section helpful?YesNo` still leak through | `cleanhtml` first, then `postprocess` | Keep local cache first |
97
+ | BM-16 | Product comparison / commerce | [Apple iPhone compare](https://www.apple.com/iphone/compare/) | rendered | Accessible, but very long and commerce-heavy; opens on promo/trade-in copy before comparison content | Similar behavior: comparison title survives, but the page still opens with promo/trade-in copy and dense commerce scaffolding | `postprocess` first, then `convert/core` only if comparison blocks remain structurally awkward | Keep local cache first |
98
+
99
+ ## What the harder tier changes
100
+
101
+ The second-tier set sharpens the next-phase picture:
102
+
103
+ - `BM-13` shows that a general parser can beat Firecrawl on opening priority while still being weak on thread/answer structure.
104
+ - `BM-14` is a stronger signal than the plain repo page that app-shell and timeline-like views are now a converter-layer problem.
105
+ - `BM-15` confirms that docs pages can already be strong without site-specific rules, as long as shell cleanup keeps improving.
106
+ - `BM-16` shows that long commerce/comparison pages are not only shell-heavy; they also challenge block organization and ranking of what should appear first.
107
+
108
+ ## Promotion decision for the current converter epic
109
+
110
+ Promote these rows into repo-backed regression fixtures now:
111
+
112
+ 1. `BM-13` Stack Overflow question
113
+ 2. `BM-11` GitHub issue timeline
114
+
115
+ Keep these rows as local-only hard benchmarks for now:
116
+
117
+ 1. `BM-14` GitHub PR files changed
118
+ 2. `BM-15` Stripe API docs
119
+ 3. `BM-16` Apple iPhone compare
120
+
121
+ This keeps the first converter epic focused on:
122
+
123
+ - one discussion/Q&A structure
124
+ - one app-shell/timeline structure
125
+
126
+ without pulling diff-table handling and commerce-comparison ranking into the first converter pass.
127
+
128
+ ## Converter-layer epic update
129
+
130
+ The first converter customization pass has now landed for the two promoted rows:
131
+
132
+ - `BM-13` now opens directly on the question body and no longer carries the worst control-line clutter at the top.
133
+ - `BM-11` now opens directly on the issue title, with issue-action noise and the redundant linked title echo removed.
134
+
135
+ What remains after this pass is more clearly structural than shell-related:
136
+
137
+ - answer/timeline metadata density
138
+ - grouping of assignees/labels/status blocks
139
+ - quality scoring that is still too optimistic on structure-heavy pages