llm-html 0.1.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. llm_html-0.1.2/.gitignore +14 -0
  2. llm_html-0.1.2/PKG-INFO +133 -0
  3. llm_html-0.1.2/README.md +103 -0
  4. llm_html-0.1.2/pyproject.toml +83 -0
  5. llm_html-0.1.2/src/llm_html/__init__.py +213 -0
  6. llm_html-0.1.2/src/llm_html/cleaner/__init__.py +137 -0
  7. llm_html-0.1.2/src/llm_html/cleaner/aggressive.py +156 -0
  8. llm_html-0.1.2/src/llm_html/cleaner/classifiers/__init__.py +26 -0
  9. llm_html-0.1.2/src/llm_html/cleaner/classifiers/patterns.py +314 -0
  10. llm_html-0.1.2/src/llm_html/cleaner/classifiers/scorer.py +405 -0
  11. llm_html-0.1.2/src/llm_html/cleaner/cleaner.py +433 -0
  12. llm_html-0.1.2/src/llm_html/cleaner/config.py +318 -0
  13. llm_html-0.1.2/src/llm_html/cleaner/core.py +549 -0
  14. llm_html-0.1.2/src/llm_html/cleaner/extractors/__init__.py +32 -0
  15. llm_html-0.1.2/src/llm_html/cleaner/extractors/context.py +515 -0
  16. llm_html-0.1.2/src/llm_html/cleaner/extractors/hydration.py +607 -0
  17. llm_html-0.1.2/src/llm_html/cleaner/focused.py +229 -0
  18. llm_html-0.1.2/src/llm_html/cleaner/models.py +300 -0
  19. llm_html-0.1.2/src/llm_html/cleaner/outputs/__init__.py +31 -0
  20. llm_html-0.1.2/src/llm_html/cleaner/outputs/aom_yaml.py +420 -0
  21. llm_html-0.1.2/src/llm_html/cleaner/outputs/markdown.py +511 -0
  22. llm_html-0.1.2/src/llm_html/cleaner/outputs/xtree.py +415 -0
  23. llm_html-0.1.2/src/llm_html/cleaner/pipeline.py +430 -0
  24. llm_html-0.1.2/src/llm_html/cleaner/scripts.py +189 -0
  25. llm_html-0.1.2/src/llm_html/cleaner/transformers/__init__.py +47 -0
  26. llm_html-0.1.2/src/llm_html/cleaner/transformers/chunker.py +489 -0
  27. llm_html-0.1.2/src/llm_html/cleaner/transformers/downsampler.py +603 -0
  28. llm_html-0.1.2/src/llm_html/cleaner/transformers/shadow_dom.py +303 -0
  29. llm_html-0.1.2/src/llm_html/helpers/__init__.py +13 -0
  30. llm_html-0.1.2/src/llm_html/helpers/formatting.py +15 -0
  31. llm_html-0.1.2/src/llm_html/helpers/html.py +104 -0
  32. llm_html-0.1.2/src/llm_html/helpers/json_cleaner.py +53 -0
@@ -0,0 +1,14 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .pytest_cache/
8
+ .mypy_cache/
9
+ .ruff_cache/
10
+ .venv/
11
+ venv/
12
+ *.db
13
+ *.log
14
+ .DS_Store
@@ -0,0 +1,133 @@
1
+ Metadata-Version: 2.4
2
+ Name: llm-html
3
+ Version: 0.1.2
4
+ Summary: LLM-optimized HTML cleaning: hydration extraction, token budgets, multiple output formats
5
+ Project-URL: Homepage, https://cmdop.com
6
+ Project-URL: Repository, https://github.com/commandoperator/llm-html
7
+ Author-email: CMDOP Team <team@cmdop.com>
8
+ License-Expression: MIT
9
+ Keywords: beautifulsoup,cleaner,cmdop,html,hydration,llm,markdown,token-budget,web-scraping
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Text Processing :: Markup :: HTML
18
+ Classifier: Typing :: Typed
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: beautifulsoup4>=4.12.0
21
+ Requires-Dist: lxml>=4.9.0
22
+ Requires-Dist: pydantic>=2.0.0
23
+ Requires-Dist: python-toon>=0.1.3
24
+ Provides-Extra: dev
25
+ Requires-Dist: mypy>=1.0.0; extra == 'dev'
26
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
27
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
28
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
29
+ Description-Content-Type: text/markdown
30
+
31
+ # llm-html
32
+
33
+ LLM-optimized HTML cleaning: hydration extraction, token budgets, multiple output formats.
34
+
35
+ ## Install
36
+
37
+ ```bash
38
+ pip install llm-html
39
+ ```
40
+
41
+ ## Quick Start
42
+
43
+ ```python
44
+ from llm_html import HTMLCleaner, CleanerConfig, OutputFormat
45
+
46
+ # Basic cleaning
47
+ cleaner = HTMLCleaner()
48
+ result = cleaner.clean(html)
49
+ print(f"Reduction: {result.stats.reduction_percent}%")
50
+
51
+ # Hydration-first (extracts SSR data from Next.js, Nuxt, etc.)
52
+ if result.hydration_data:
53
+ data = result.hydration_data
54
+ else:
55
+ cleaned = result.html
56
+ ```
57
+
58
+ ## Convenience Functions
59
+
60
+ ```python
61
+ from llm_html import clean, clean_to_json, clean_html, clean_for_llm
62
+
63
+ # Quick clean
64
+ result = clean(html)
65
+
66
+ # Get JSON if SSR data available, otherwise cleaned HTML
67
+ data = clean_to_json(html)
68
+
69
+ # Pipeline with full control
70
+ result = clean_html(html, max_tokens=5000)
71
+ result = clean_for_llm(html, output_format="markdown")
72
+ ```
73
+
74
+ ## Output Formats
75
+
76
+ ```python
77
+ from llm_html import to_markdown, to_aom_yaml, to_xtree
78
+
79
+ md = to_markdown(html)
80
+ aom = to_aom_yaml(html)
81
+ xtree = to_xtree(html)
82
+ ```
83
+
84
+ ## Downsampling
85
+
86
+ Token-budget targeting with D2Snap algorithm:
87
+
88
+ ```python
89
+ from llm_html import downsample_html, estimate_tokens
90
+
91
+ tokens = estimate_tokens(html)
92
+ if tokens > 10000:
93
+ html = downsample_html(html, target_tokens=8000)
94
+ ```
95
+
96
+ ## Semantic Chunking
97
+
98
+ Split large pages into LLM-sized chunks:
99
+
100
+ ```python
101
+ from llm_html import SemanticChunker, ChunkConfig
102
+
103
+ config = ChunkConfig(max_tokens=8000, max_items=20)
104
+ chunker = SemanticChunker(config)
105
+ result = chunker.chunk(soup)
106
+ for chunk in result.chunks:
107
+ process(chunk.html)
108
+ ```
109
+
110
+ ## Shadow DOM
111
+
112
+ Flatten Web Components for LLM visibility:
113
+
114
+ ```python
115
+ from llm_html import flatten_shadow_dom
116
+
117
+ flat = flatten_shadow_dom(html)
118
+ ```
119
+
120
+ ## Helpers
121
+
122
+ ```python
123
+ from llm_html import html_to_text, extract_links, extract_images, json_to_toon
124
+
125
+ text = html_to_text(html)
126
+ links = extract_links(html, base_url="https://example.com")
127
+ images = extract_images(html)
128
+ toon = json_to_toon({"key": "value"})
129
+ ```
130
+
131
+ ## License
132
+
133
+ MIT
@@ -0,0 +1,103 @@
1
+ # llm-html
2
+
3
+ LLM-optimized HTML cleaning: hydration extraction, token budgets, multiple output formats.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install llm-html
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```python
14
+ from llm_html import HTMLCleaner, CleanerConfig, OutputFormat
15
+
16
+ # Basic cleaning
17
+ cleaner = HTMLCleaner()
18
+ result = cleaner.clean(html)
19
+ print(f"Reduction: {result.stats.reduction_percent}%")
20
+
21
+ # Hydration-first (extracts SSR data from Next.js, Nuxt, etc.)
22
+ if result.hydration_data:
23
+ data = result.hydration_data
24
+ else:
25
+ cleaned = result.html
26
+ ```
27
+
28
+ ## Convenience Functions
29
+
30
+ ```python
31
+ from llm_html import clean, clean_to_json, clean_html, clean_for_llm
32
+
33
+ # Quick clean
34
+ result = clean(html)
35
+
36
+ # Get JSON if SSR data available, otherwise cleaned HTML
37
+ data = clean_to_json(html)
38
+
39
+ # Pipeline with full control
40
+ result = clean_html(html, max_tokens=5000)
41
+ result = clean_for_llm(html, output_format="markdown")
42
+ ```
43
+
44
+ ## Output Formats
45
+
46
+ ```python
47
+ from llm_html import to_markdown, to_aom_yaml, to_xtree
48
+
49
+ md = to_markdown(html)
50
+ aom = to_aom_yaml(html)
51
+ xtree = to_xtree(html)
52
+ ```
53
+
54
+ ## Downsampling
55
+
56
+ Token-budget targeting with D2Snap algorithm:
57
+
58
+ ```python
59
+ from llm_html import downsample_html, estimate_tokens
60
+
61
+ tokens = estimate_tokens(html)
62
+ if tokens > 10000:
63
+ html = downsample_html(html, target_tokens=8000)
64
+ ```
65
+
66
+ ## Semantic Chunking
67
+
68
+ Split large pages into LLM-sized chunks:
69
+
70
+ ```python
71
+ from llm_html import SemanticChunker, ChunkConfig
72
+
73
+ config = ChunkConfig(max_tokens=8000, max_items=20)
74
+ chunker = SemanticChunker(config)
75
+ result = chunker.chunk(soup)
76
+ for chunk in result.chunks:
77
+ process(chunk.html)
78
+ ```
79
+
80
+ ## Shadow DOM
81
+
82
+ Flatten Web Components for LLM visibility:
83
+
84
+ ```python
85
+ from llm_html import flatten_shadow_dom
86
+
87
+ flat = flatten_shadow_dom(html)
88
+ ```
89
+
90
+ ## Helpers
91
+
92
+ ```python
93
+ from llm_html import html_to_text, extract_links, extract_images, json_to_toon
94
+
95
+ text = html_to_text(html)
96
+ links = extract_links(html, base_url="https://example.com")
97
+ images = extract_images(html)
98
+ toon = json_to_toon({"key": "value"})
99
+ ```
100
+
101
+ ## License
102
+
103
+ MIT
@@ -0,0 +1,83 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "llm-html"
7
+ version = "0.1.2"
8
+ description = "LLM-optimized HTML cleaning: hydration extraction, token budgets, multiple output formats"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ { name = "CMDOP Team", email = "team@cmdop.com" }
14
+ ]
15
+ keywords = [
16
+ "html",
17
+ "llm",
18
+ "cleaner",
19
+ "beautifulsoup",
20
+ "markdown",
21
+ "hydration",
22
+ "token-budget",
23
+ "web-scraping",
24
+ "cmdop",
25
+ ]
26
+ classifiers = [
27
+ "Development Status :: 4 - Beta",
28
+ "Intended Audience :: Developers",
29
+ "License :: OSI Approved :: MIT License",
30
+ "Programming Language :: Python :: 3",
31
+ "Programming Language :: Python :: 3.10",
32
+ "Programming Language :: Python :: 3.11",
33
+ "Programming Language :: Python :: 3.12",
34
+ "Topic :: Text Processing :: Markup :: HTML",
35
+ "Typing :: Typed",
36
+ ]
37
+
38
+ dependencies = [
39
+ "beautifulsoup4>=4.12.0",
40
+ "lxml>=4.9.0",
41
+ "pydantic>=2.0.0",
42
+ "python-toon>=0.1.3",
43
+ ]
44
+
45
+ [project.optional-dependencies]
46
+ dev = [
47
+ "pytest>=7.0.0",
48
+ "pytest-cov>=4.0.0",
49
+ "ruff>=0.1.0",
50
+ "mypy>=1.0.0",
51
+ ]
52
+
53
+ [project.urls]
54
+ Homepage = "https://cmdop.com"
55
+ Repository = "https://github.com/commandoperator/llm-html"
56
+
57
+ [tool.hatch.build.targets.wheel]
58
+ packages = ["src/llm_html"]
59
+
60
+ [tool.hatch.build.targets.sdist]
61
+ include = [
62
+ "src/",
63
+ "README.md",
64
+ "pyproject.toml",
65
+ "LICENSE",
66
+ ]
67
+
68
+ [tool.ruff]
69
+ target-version = "py310"
70
+ line-length = 100
71
+
72
+ [tool.ruff.lint]
73
+ select = ["E", "F", "I", "N", "W", "UP", "B", "C4", "SIM"]
74
+ ignore = ["E501"]
75
+
76
+ [tool.mypy]
77
+ python_version = "3.10"
78
+ strict = true
79
+ warn_return_any = true
80
+ warn_unused_ignores = true
81
+
82
+ [tool.pytest.ini_options]
83
+ testpaths = ["tests"]
@@ -0,0 +1,213 @@
1
+ """HTML Cleaner module for LLM-optimized HTML processing.
2
+
3
+ Provides intelligent HTML cleaning optimized for Large Language Models:
4
+
5
+ - **Hydration-First**: Extract SSR data (Next.js, Nuxt, etc.) before DOM parsing
6
+ - **Token Budget**: Target specific token limits with adaptive downsampling
7
+ - **Multiple Outputs**: HTML, Markdown, AOM YAML, XTree formats
8
+ - **Detailed Statistics**: Track reduction, timing, element counts
9
+
10
+ Example usage:
11
+ from llm_html import HTMLCleaner, CleanerConfig, OutputFormat
12
+
13
+ # Basic usage
14
+ cleaner = HTMLCleaner()
15
+ result = cleaner.clean(html)
16
+
17
+ # Check statistics
18
+ print(f"Size: {result.stats.original_size} → {result.stats.cleaned_size}")
19
+ print(f"Reduction: {result.stats.reduction_percent}%")
20
+ print(f"Scripts removed: {result.stats.scripts_removed}")
21
+
22
+ # Use hydration data if available (most efficient)
23
+ if result.hydration_data:
24
+ products = result.hydration_data.get("products", [])
25
+ else:
26
+ cleaned = result.html
27
+
28
+ # Custom configuration
29
+ config = CleanerConfig(
30
+ max_tokens=5000,
31
+ output_format=OutputFormat.MARKDOWN,
32
+ filter_classes=True,
33
+ )
34
+ cleaner = HTMLCleaner(config)
35
+ result = cleaner.clean(html)
36
+
37
+ Convenience functions:
38
+ from llm_html import clean, clean_to_json
39
+
40
+ # Quick clean with default settings
41
+ result = clean(html)
42
+
43
+ # Get JSON if SSR data available, otherwise cleaned HTML
44
+ data = clean_to_json(html)
45
+ """
46
+ from __future__ import annotations
47
+
48
+ from typing import Any, Dict, Union
49
+
50
+ # =============================================================================
51
+ # Cleaner subpackage (HTML cleaning pipeline)
52
+ # =============================================================================
53
+
54
+ # Models
55
+ from .cleaner.models import (
56
+ OutputFormat,
57
+ CleanerConfig,
58
+ CleanerStats,
59
+ ChunkInfo,
60
+ CleanerResult,
61
+ )
62
+
63
+ # Main cleaner class
64
+ from .cleaner.cleaner import (
65
+ HTMLCleaner,
66
+ clean,
67
+ clean_to_json,
68
+ )
69
+
70
+ # Extractors
71
+ from .cleaner.extractors import (
72
+ # Hydration extraction
73
+ HydrationExtractor,
74
+ HydrationData,
75
+ Framework,
76
+ extract_hydration,
77
+ detect_framework,
78
+ # Context Window
79
+ ContextExtractor,
80
+ ContextWindow,
81
+ ContextConfig,
82
+ extract_context,
83
+ find_stable_anchor,
84
+ generate_selector,
85
+ )
86
+
87
+ # Transformers
88
+ from .cleaner.transformers import (
89
+ # Shadow DOM
90
+ ShadowDOMFlattener,
91
+ flatten_shadow_dom,
92
+ # D2Snap Downsampling
93
+ D2SnapDownsampler,
94
+ D2SnapConfig,
95
+ downsample_html,
96
+ estimate_tokens,
97
+ # Semantic Chunking
98
+ SemanticChunker,
99
+ ChunkConfig,
100
+ ChunkResult,
101
+ )
102
+
103
+ # Classifiers
104
+ from .cleaner.classifiers import (
105
+ ClassSemanticScorer,
106
+ score_class,
107
+ filter_classes,
108
+ clean_classes,
109
+ detect_css_framework,
110
+ )
111
+
112
+ # Output Formats
113
+ from .cleaner.outputs import (
114
+ # AOM YAML (Playwright-style Aria Snapshot)
115
+ AOMYAMLExporter,
116
+ AOMConfig,
117
+ to_aom_yaml,
118
+ # Markdown
119
+ MarkdownExporter,
120
+ MarkdownConfig,
121
+ to_markdown,
122
+ # XTree
123
+ XTreeExporter,
124
+ XTreeConfig,
125
+ to_xtree,
126
+ )
127
+
128
+ # Pipeline
129
+ from .cleaner.pipeline import (
130
+ CleaningPipeline,
131
+ PipelineConfig,
132
+ PipelineResult,
133
+ clean_html,
134
+ clean_for_llm,
135
+ )
136
+
137
+ # =============================================================================
138
+ # Helpers (parsing utilities)
139
+ # =============================================================================
140
+
141
+ from .helpers import (
142
+ json_to_toon,
143
+ JsonCleaner,
144
+ html_to_text,
145
+ extract_links,
146
+ extract_images,
147
+ )
148
+
149
+
150
+ __all__ = [
151
+ # Primary API
152
+ "HTMLCleaner",
153
+ "CleanerConfig",
154
+ "CleanerResult",
155
+ "CleanerStats",
156
+ "ChunkInfo",
157
+ "OutputFormat",
158
+ "clean",
159
+ "clean_to_json",
160
+ # Extractors - Hydration
161
+ "HydrationExtractor",
162
+ "HydrationData",
163
+ "Framework",
164
+ "extract_hydration",
165
+ "detect_framework",
166
+ # Extractors - Context Window
167
+ "ContextExtractor",
168
+ "ContextWindow",
169
+ "ContextConfig",
170
+ "extract_context",
171
+ "find_stable_anchor",
172
+ "generate_selector",
173
+ # Transformers - Shadow DOM
174
+ "ShadowDOMFlattener",
175
+ "flatten_shadow_dom",
176
+ # Transformers - D2Snap
177
+ "D2SnapDownsampler",
178
+ "D2SnapConfig",
179
+ "downsample_html",
180
+ "estimate_tokens",
181
+ # Transformers - Chunking
182
+ "SemanticChunker",
183
+ "ChunkConfig",
184
+ "ChunkResult",
185
+ # Classifiers
186
+ "ClassSemanticScorer",
187
+ "score_class",
188
+ "filter_classes",
189
+ "clean_classes",
190
+ "detect_css_framework",
191
+ # Outputs
192
+ "AOMYAMLExporter",
193
+ "AOMConfig",
194
+ "to_aom_yaml",
195
+ "MarkdownExporter",
196
+ "MarkdownConfig",
197
+ "to_markdown",
198
+ "XTreeExporter",
199
+ "XTreeConfig",
200
+ "to_xtree",
201
+ # Pipeline
202
+ "CleaningPipeline",
203
+ "PipelineConfig",
204
+ "PipelineResult",
205
+ "clean_html",
206
+ "clean_for_llm",
207
+ # Helpers
208
+ "json_to_toon",
209
+ "JsonCleaner",
210
+ "html_to_text",
211
+ "extract_links",
212
+ "extract_images",
213
+ ]
@@ -0,0 +1,137 @@
1
+ """HTML Cleaner subpackage — aggressive & focused cleaning, SSR hydration, output formats."""
2
+
3
+ from __future__ import annotations
4
+
5
+ # Models
6
+ from .models import (
7
+ OutputFormat,
8
+ CleanerConfig,
9
+ CleanerStats,
10
+ ChunkInfo,
11
+ CleanerResult,
12
+ )
13
+
14
+ # Main cleaner class
15
+ from .cleaner import (
16
+ HTMLCleaner,
17
+ clean,
18
+ clean_to_json,
19
+ )
20
+
21
+ # Extractors
22
+ from .extractors import (
23
+ HydrationExtractor,
24
+ HydrationData,
25
+ Framework,
26
+ extract_hydration,
27
+ detect_framework,
28
+ ContextExtractor,
29
+ ContextWindow,
30
+ ContextConfig,
31
+ extract_context,
32
+ find_stable_anchor,
33
+ generate_selector,
34
+ )
35
+
36
+ # Transformers
37
+ from .transformers import (
38
+ ShadowDOMFlattener,
39
+ flatten_shadow_dom,
40
+ D2SnapDownsampler,
41
+ D2SnapConfig,
42
+ downsample_html,
43
+ estimate_tokens,
44
+ SemanticChunker,
45
+ ChunkConfig,
46
+ ChunkResult,
47
+ )
48
+
49
+ # Classifiers
50
+ from .classifiers import (
51
+ ClassSemanticScorer,
52
+ score_class,
53
+ filter_classes,
54
+ clean_classes,
55
+ detect_css_framework,
56
+ )
57
+
58
+ # Output Formats
59
+ from .outputs import (
60
+ AOMYAMLExporter,
61
+ AOMConfig,
62
+ to_aom_yaml,
63
+ MarkdownExporter,
64
+ MarkdownConfig,
65
+ to_markdown,
66
+ XTreeExporter,
67
+ XTreeConfig,
68
+ to_xtree,
69
+ )
70
+
71
+ # Pipeline
72
+ from .pipeline import (
73
+ CleaningPipeline,
74
+ PipelineConfig,
75
+ PipelineResult,
76
+ clean_html,
77
+ clean_for_llm,
78
+ )
79
+
80
+ __all__ = [
81
+ # Primary API
82
+ "HTMLCleaner",
83
+ "CleanerConfig",
84
+ "CleanerResult",
85
+ "CleanerStats",
86
+ "ChunkInfo",
87
+ "OutputFormat",
88
+ "clean",
89
+ "clean_to_json",
90
+ # Extractors - Hydration
91
+ "HydrationExtractor",
92
+ "HydrationData",
93
+ "Framework",
94
+ "extract_hydration",
95
+ "detect_framework",
96
+ # Extractors - Context Window
97
+ "ContextExtractor",
98
+ "ContextWindow",
99
+ "ContextConfig",
100
+ "extract_context",
101
+ "find_stable_anchor",
102
+ "generate_selector",
103
+ # Transformers - Shadow DOM
104
+ "ShadowDOMFlattener",
105
+ "flatten_shadow_dom",
106
+ # Transformers - D2Snap
107
+ "D2SnapDownsampler",
108
+ "D2SnapConfig",
109
+ "downsample_html",
110
+ "estimate_tokens",
111
+ # Transformers - Chunking
112
+ "SemanticChunker",
113
+ "ChunkConfig",
114
+ "ChunkResult",
115
+ # Classifiers
116
+ "ClassSemanticScorer",
117
+ "score_class",
118
+ "filter_classes",
119
+ "clean_classes",
120
+ "detect_css_framework",
121
+ # Outputs
122
+ "AOMYAMLExporter",
123
+ "AOMConfig",
124
+ "to_aom_yaml",
125
+ "MarkdownExporter",
126
+ "MarkdownConfig",
127
+ "to_markdown",
128
+ "XTreeExporter",
129
+ "XTreeConfig",
130
+ "to_xtree",
131
+ # Pipeline
132
+ "CleaningPipeline",
133
+ "PipelineConfig",
134
+ "PipelineResult",
135
+ "clean_html",
136
+ "clean_for_llm",
137
+ ]