llm-html 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_html-0.1.2/.gitignore +14 -0
- llm_html-0.1.2/PKG-INFO +133 -0
- llm_html-0.1.2/README.md +103 -0
- llm_html-0.1.2/pyproject.toml +83 -0
- llm_html-0.1.2/src/llm_html/__init__.py +213 -0
- llm_html-0.1.2/src/llm_html/cleaner/__init__.py +137 -0
- llm_html-0.1.2/src/llm_html/cleaner/aggressive.py +156 -0
- llm_html-0.1.2/src/llm_html/cleaner/classifiers/__init__.py +26 -0
- llm_html-0.1.2/src/llm_html/cleaner/classifiers/patterns.py +314 -0
- llm_html-0.1.2/src/llm_html/cleaner/classifiers/scorer.py +405 -0
- llm_html-0.1.2/src/llm_html/cleaner/cleaner.py +433 -0
- llm_html-0.1.2/src/llm_html/cleaner/config.py +318 -0
- llm_html-0.1.2/src/llm_html/cleaner/core.py +549 -0
- llm_html-0.1.2/src/llm_html/cleaner/extractors/__init__.py +32 -0
- llm_html-0.1.2/src/llm_html/cleaner/extractors/context.py +515 -0
- llm_html-0.1.2/src/llm_html/cleaner/extractors/hydration.py +607 -0
- llm_html-0.1.2/src/llm_html/cleaner/focused.py +229 -0
- llm_html-0.1.2/src/llm_html/cleaner/models.py +300 -0
- llm_html-0.1.2/src/llm_html/cleaner/outputs/__init__.py +31 -0
- llm_html-0.1.2/src/llm_html/cleaner/outputs/aom_yaml.py +420 -0
- llm_html-0.1.2/src/llm_html/cleaner/outputs/markdown.py +511 -0
- llm_html-0.1.2/src/llm_html/cleaner/outputs/xtree.py +415 -0
- llm_html-0.1.2/src/llm_html/cleaner/pipeline.py +430 -0
- llm_html-0.1.2/src/llm_html/cleaner/scripts.py +189 -0
- llm_html-0.1.2/src/llm_html/cleaner/transformers/__init__.py +47 -0
- llm_html-0.1.2/src/llm_html/cleaner/transformers/chunker.py +489 -0
- llm_html-0.1.2/src/llm_html/cleaner/transformers/downsampler.py +603 -0
- llm_html-0.1.2/src/llm_html/cleaner/transformers/shadow_dom.py +303 -0
- llm_html-0.1.2/src/llm_html/helpers/__init__.py +13 -0
- llm_html-0.1.2/src/llm_html/helpers/formatting.py +15 -0
- llm_html-0.1.2/src/llm_html/helpers/html.py +104 -0
- llm_html-0.1.2/src/llm_html/helpers/json_cleaner.py +53 -0
llm_html-0.1.2/PKG-INFO
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-html
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: LLM-optimized HTML cleaning: hydration extraction, token budgets, multiple output formats
|
|
5
|
+
Project-URL: Homepage, https://cmdop.com
|
|
6
|
+
Project-URL: Repository, https://github.com/commandoperator/llm-html
|
|
7
|
+
Author-email: CMDOP Team <team@cmdop.com>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: beautifulsoup,cleaner,cmdop,html,hydration,llm,markdown,token-budget,web-scraping
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
18
|
+
Classifier: Typing :: Typed
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
21
|
+
Requires-Dist: lxml>=4.9.0
|
|
22
|
+
Requires-Dist: pydantic>=2.0.0
|
|
23
|
+
Requires-Dist: python-toon>=0.1.3
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
# llm-html
|
|
32
|
+
|
|
33
|
+
LLM-optimized HTML cleaning: hydration extraction, token budgets, multiple output formats.
|
|
34
|
+
|
|
35
|
+
## Install
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install llm-html
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from llm_html import HTMLCleaner, CleanerConfig, OutputFormat
|
|
45
|
+
|
|
46
|
+
# Basic cleaning
|
|
47
|
+
cleaner = HTMLCleaner()
|
|
48
|
+
result = cleaner.clean(html)
|
|
49
|
+
print(f"Reduction: {result.stats.reduction_percent}%")
|
|
50
|
+
|
|
51
|
+
# Hydration-first (extracts SSR data from Next.js, Nuxt, etc.)
|
|
52
|
+
if result.hydration_data:
|
|
53
|
+
    data = result.hydration_data
|
|
54
|
+
else:
|
|
55
|
+
    cleaned = result.html
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Convenience Functions
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from llm_html import clean, clean_to_json, clean_html, clean_for_llm
|
|
62
|
+
|
|
63
|
+
# Quick clean
|
|
64
|
+
result = clean(html)
|
|
65
|
+
|
|
66
|
+
# Get JSON if SSR data available, otherwise cleaned HTML
|
|
67
|
+
data = clean_to_json(html)
|
|
68
|
+
|
|
69
|
+
# Pipeline with full control
|
|
70
|
+
result = clean_html(html, max_tokens=5000)
|
|
71
|
+
result = clean_for_llm(html, output_format="markdown")
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Output Formats
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from llm_html import to_markdown, to_aom_yaml, to_xtree
|
|
78
|
+
|
|
79
|
+
md = to_markdown(html)
|
|
80
|
+
aom = to_aom_yaml(html)
|
|
81
|
+
xtree = to_xtree(html)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Downsampling
|
|
85
|
+
|
|
86
|
+
Token-budget targeting with the D2Snap algorithm:
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from llm_html import downsample_html, estimate_tokens
|
|
90
|
+
|
|
91
|
+
tokens = estimate_tokens(html)
|
|
92
|
+
if tokens > 10000:
|
|
93
|
+
    html = downsample_html(html, target_tokens=8000)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Semantic Chunking
|
|
97
|
+
|
|
98
|
+
Split large pages into LLM-sized chunks:
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from llm_html import SemanticChunker, ChunkConfig
|
|
102
|
+
|
|
103
|
+
config = ChunkConfig(max_tokens=8000, max_items=20)
|
|
104
|
+
chunker = SemanticChunker(config)
|
|
105
|
+
result = chunker.chunk(soup)
|
|
106
|
+
for chunk in result.chunks:
|
|
107
|
+
    process(chunk.html)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Shadow DOM
|
|
111
|
+
|
|
112
|
+
Flatten Web Components for LLM visibility:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
from llm_html import flatten_shadow_dom
|
|
116
|
+
|
|
117
|
+
flat = flatten_shadow_dom(html)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Helpers
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from llm_html import html_to_text, extract_links, extract_images, json_to_toon
|
|
124
|
+
|
|
125
|
+
text = html_to_text(html)
|
|
126
|
+
links = extract_links(html, base_url="https://example.com")
|
|
127
|
+
images = extract_images(html)
|
|
128
|
+
toon = json_to_toon({"key": "value"})
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
MIT
|
llm_html-0.1.2/README.md
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# llm-html
|
|
2
|
+
|
|
3
|
+
LLM-optimized HTML cleaning: hydration extraction, token budgets, multiple output formats.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install llm-html
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from llm_html import HTMLCleaner, CleanerConfig, OutputFormat
|
|
15
|
+
|
|
16
|
+
# Basic cleaning
|
|
17
|
+
cleaner = HTMLCleaner()
|
|
18
|
+
result = cleaner.clean(html)
|
|
19
|
+
print(f"Reduction: {result.stats.reduction_percent}%")
|
|
20
|
+
|
|
21
|
+
# Hydration-first (extracts SSR data from Next.js, Nuxt, etc.)
|
|
22
|
+
if result.hydration_data:
|
|
23
|
+
    data = result.hydration_data
|
|
24
|
+
else:
|
|
25
|
+
    cleaned = result.html
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Convenience Functions
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
from llm_html import clean, clean_to_json, clean_html, clean_for_llm
|
|
32
|
+
|
|
33
|
+
# Quick clean
|
|
34
|
+
result = clean(html)
|
|
35
|
+
|
|
36
|
+
# Get JSON if SSR data available, otherwise cleaned HTML
|
|
37
|
+
data = clean_to_json(html)
|
|
38
|
+
|
|
39
|
+
# Pipeline with full control
|
|
40
|
+
result = clean_html(html, max_tokens=5000)
|
|
41
|
+
result = clean_for_llm(html, output_format="markdown")
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Output Formats
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from llm_html import to_markdown, to_aom_yaml, to_xtree
|
|
48
|
+
|
|
49
|
+
md = to_markdown(html)
|
|
50
|
+
aom = to_aom_yaml(html)
|
|
51
|
+
xtree = to_xtree(html)
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Downsampling
|
|
55
|
+
|
|
56
|
+
Token-budget targeting with the D2Snap algorithm:
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from llm_html import downsample_html, estimate_tokens
|
|
60
|
+
|
|
61
|
+
tokens = estimate_tokens(html)
|
|
62
|
+
if tokens > 10000:
|
|
63
|
+
    html = downsample_html(html, target_tokens=8000)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Semantic Chunking
|
|
67
|
+
|
|
68
|
+
Split large pages into LLM-sized chunks:
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from llm_html import SemanticChunker, ChunkConfig
|
|
72
|
+
|
|
73
|
+
config = ChunkConfig(max_tokens=8000, max_items=20)
|
|
74
|
+
chunker = SemanticChunker(config)
|
|
75
|
+
result = chunker.chunk(soup)
|
|
76
|
+
for chunk in result.chunks:
|
|
77
|
+
    process(chunk.html)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Shadow DOM
|
|
81
|
+
|
|
82
|
+
Flatten Web Components for LLM visibility:
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from llm_html import flatten_shadow_dom
|
|
86
|
+
|
|
87
|
+
flat = flatten_shadow_dom(html)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Helpers
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from llm_html import html_to_text, extract_links, extract_images, json_to_toon
|
|
94
|
+
|
|
95
|
+
text = html_to_text(html)
|
|
96
|
+
links = extract_links(html, base_url="https://example.com")
|
|
97
|
+
images = extract_images(html)
|
|
98
|
+
toon = json_to_toon({"key": "value"})
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## License
|
|
102
|
+
|
|
103
|
+
MIT
|
llm_html-0.1.2/pyproject.toml
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "llm-html"
|
|
7
|
+
version = "0.1.2"
|
|
8
|
+
description = "LLM-optimized HTML cleaning: hydration extraction, token budgets, multiple output formats"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "CMDOP Team", email = "team@cmdop.com" }
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"html",
|
|
17
|
+
"llm",
|
|
18
|
+
"cleaner",
|
|
19
|
+
"beautifulsoup",
|
|
20
|
+
"markdown",
|
|
21
|
+
"hydration",
|
|
22
|
+
"token-budget",
|
|
23
|
+
"web-scraping",
|
|
24
|
+
"cmdop",
|
|
25
|
+
]
|
|
26
|
+
classifiers = [
|
|
27
|
+
"Development Status :: 4 - Beta",
|
|
28
|
+
"Intended Audience :: Developers",
|
|
29
|
+
"License :: OSI Approved :: MIT License",
|
|
30
|
+
"Programming Language :: Python :: 3",
|
|
31
|
+
"Programming Language :: Python :: 3.10",
|
|
32
|
+
"Programming Language :: Python :: 3.11",
|
|
33
|
+
"Programming Language :: Python :: 3.12",
|
|
34
|
+
"Topic :: Text Processing :: Markup :: HTML",
|
|
35
|
+
"Typing :: Typed",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
dependencies = [
|
|
39
|
+
"beautifulsoup4>=4.12.0",
|
|
40
|
+
"lxml>=4.9.0",
|
|
41
|
+
"pydantic>=2.0.0",
|
|
42
|
+
"python-toon>=0.1.3",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[project.optional-dependencies]
|
|
46
|
+
dev = [
|
|
47
|
+
"pytest>=7.0.0",
|
|
48
|
+
"pytest-cov>=4.0.0",
|
|
49
|
+
"ruff>=0.1.0",
|
|
50
|
+
"mypy>=1.0.0",
|
|
51
|
+
]
|
|
52
|
+
|
|
53
|
+
[project.urls]
|
|
54
|
+
Homepage = "https://cmdop.com"
|
|
55
|
+
Repository = "https://github.com/commandoperator/llm-html"
|
|
56
|
+
|
|
57
|
+
[tool.hatch.build.targets.wheel]
|
|
58
|
+
packages = ["src/llm_html"]
|
|
59
|
+
|
|
60
|
+
[tool.hatch.build.targets.sdist]
|
|
61
|
+
include = [
|
|
62
|
+
"src/",
|
|
63
|
+
"README.md",
|
|
64
|
+
"pyproject.toml",
|
|
65
|
+
"LICENSE",
|
|
66
|
+
]
|
|
67
|
+
|
|
68
|
+
[tool.ruff]
|
|
69
|
+
target-version = "py310"
|
|
70
|
+
line-length = 100
|
|
71
|
+
|
|
72
|
+
[tool.ruff.lint]
|
|
73
|
+
select = ["E", "F", "I", "N", "W", "UP", "B", "C4", "SIM"]
|
|
74
|
+
ignore = ["E501"]
|
|
75
|
+
|
|
76
|
+
[tool.mypy]
|
|
77
|
+
python_version = "3.10"
|
|
78
|
+
strict = true
|
|
79
|
+
warn_return_any = true
|
|
80
|
+
warn_unused_ignores = true
|
|
81
|
+
|
|
82
|
+
[tool.pytest.ini_options]
|
|
83
|
+
testpaths = ["tests"]
|
llm_html-0.1.2/src/llm_html/__init__.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
"""HTML Cleaner module for LLM-optimized HTML processing.
|
|
2
|
+
|
|
3
|
+
Provides intelligent HTML cleaning optimized for Large Language Models:
|
|
4
|
+
|
|
5
|
+
- **Hydration-First**: Extract SSR data (Next.js, Nuxt, etc.) before DOM parsing
|
|
6
|
+
- **Token Budget**: Target specific token limits with adaptive downsampling
|
|
7
|
+
- **Multiple Outputs**: HTML, Markdown, AOM YAML, XTree formats
|
|
8
|
+
- **Detailed Statistics**: Track reduction, timing, element counts
|
|
9
|
+
|
|
10
|
+
Example usage:
|
|
11
|
+
from llm_html import HTMLCleaner, CleanerConfig, OutputFormat
|
|
12
|
+
|
|
13
|
+
# Basic usage
|
|
14
|
+
cleaner = HTMLCleaner()
|
|
15
|
+
result = cleaner.clean(html)
|
|
16
|
+
|
|
17
|
+
# Check statistics
|
|
18
|
+
print(f"Size: {result.stats.original_size} → {result.stats.cleaned_size}")
|
|
19
|
+
print(f"Reduction: {result.stats.reduction_percent}%")
|
|
20
|
+
print(f"Scripts removed: {result.stats.scripts_removed}")
|
|
21
|
+
|
|
22
|
+
# Use hydration data if available (most efficient)
|
|
23
|
+
if result.hydration_data:
|
|
24
|
+
products = result.hydration_data.get("products", [])
|
|
25
|
+
else:
|
|
26
|
+
cleaned = result.html
|
|
27
|
+
|
|
28
|
+
# Custom configuration
|
|
29
|
+
config = CleanerConfig(
|
|
30
|
+
max_tokens=5000,
|
|
31
|
+
output_format=OutputFormat.MARKDOWN,
|
|
32
|
+
filter_classes=True,
|
|
33
|
+
)
|
|
34
|
+
cleaner = HTMLCleaner(config)
|
|
35
|
+
result = cleaner.clean(html)
|
|
36
|
+
|
|
37
|
+
Convenience functions:
|
|
38
|
+
from llm_html import clean, clean_to_json
|
|
39
|
+
|
|
40
|
+
# Quick clean with default settings
|
|
41
|
+
result = clean(html)
|
|
42
|
+
|
|
43
|
+
# Get JSON if SSR data available, otherwise cleaned HTML
|
|
44
|
+
data = clean_to_json(html)
|
|
45
|
+
"""
|
|
46
|
+
from __future__ import annotations
|
|
47
|
+
|
|
48
|
+
from typing import Any, Dict, Union
|
|
49
|
+
|
|
50
|
+
# =============================================================================
|
|
51
|
+
# Cleaner subpackage (HTML cleaning pipeline)
|
|
52
|
+
# =============================================================================
|
|
53
|
+
|
|
54
|
+
# Models
|
|
55
|
+
from .cleaner.models import (
|
|
56
|
+
OutputFormat,
|
|
57
|
+
CleanerConfig,
|
|
58
|
+
CleanerStats,
|
|
59
|
+
ChunkInfo,
|
|
60
|
+
CleanerResult,
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
# Main cleaner class
|
|
64
|
+
from .cleaner.cleaner import (
|
|
65
|
+
HTMLCleaner,
|
|
66
|
+
clean,
|
|
67
|
+
clean_to_json,
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# Extractors
|
|
71
|
+
from .cleaner.extractors import (
|
|
72
|
+
# Hydration extraction
|
|
73
|
+
HydrationExtractor,
|
|
74
|
+
HydrationData,
|
|
75
|
+
Framework,
|
|
76
|
+
extract_hydration,
|
|
77
|
+
detect_framework,
|
|
78
|
+
# Context Window
|
|
79
|
+
ContextExtractor,
|
|
80
|
+
ContextWindow,
|
|
81
|
+
ContextConfig,
|
|
82
|
+
extract_context,
|
|
83
|
+
find_stable_anchor,
|
|
84
|
+
generate_selector,
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
# Transformers
|
|
88
|
+
from .cleaner.transformers import (
|
|
89
|
+
# Shadow DOM
|
|
90
|
+
ShadowDOMFlattener,
|
|
91
|
+
flatten_shadow_dom,
|
|
92
|
+
# D2Snap Downsampling
|
|
93
|
+
D2SnapDownsampler,
|
|
94
|
+
D2SnapConfig,
|
|
95
|
+
downsample_html,
|
|
96
|
+
estimate_tokens,
|
|
97
|
+
# Semantic Chunking
|
|
98
|
+
SemanticChunker,
|
|
99
|
+
ChunkConfig,
|
|
100
|
+
ChunkResult,
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
# Classifiers
|
|
104
|
+
from .cleaner.classifiers import (
|
|
105
|
+
ClassSemanticScorer,
|
|
106
|
+
score_class,
|
|
107
|
+
filter_classes,
|
|
108
|
+
clean_classes,
|
|
109
|
+
detect_css_framework,
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
# Output Formats
|
|
113
|
+
from .cleaner.outputs import (
|
|
114
|
+
# AOM YAML (Playwright-style Aria Snapshot)
|
|
115
|
+
AOMYAMLExporter,
|
|
116
|
+
AOMConfig,
|
|
117
|
+
to_aom_yaml,
|
|
118
|
+
# Markdown
|
|
119
|
+
MarkdownExporter,
|
|
120
|
+
MarkdownConfig,
|
|
121
|
+
to_markdown,
|
|
122
|
+
# XTree
|
|
123
|
+
XTreeExporter,
|
|
124
|
+
XTreeConfig,
|
|
125
|
+
to_xtree,
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
# Pipeline
|
|
129
|
+
from .cleaner.pipeline import (
|
|
130
|
+
CleaningPipeline,
|
|
131
|
+
PipelineConfig,
|
|
132
|
+
PipelineResult,
|
|
133
|
+
clean_html,
|
|
134
|
+
clean_for_llm,
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
# =============================================================================
|
|
138
|
+
# Helpers (parsing utilities)
|
|
139
|
+
# =============================================================================
|
|
140
|
+
|
|
141
|
+
from .helpers import (
|
|
142
|
+
json_to_toon,
|
|
143
|
+
JsonCleaner,
|
|
144
|
+
html_to_text,
|
|
145
|
+
extract_links,
|
|
146
|
+
extract_images,
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
__all__ = [
|
|
151
|
+
# Primary API
|
|
152
|
+
"HTMLCleaner",
|
|
153
|
+
"CleanerConfig",
|
|
154
|
+
"CleanerResult",
|
|
155
|
+
"CleanerStats",
|
|
156
|
+
"ChunkInfo",
|
|
157
|
+
"OutputFormat",
|
|
158
|
+
"clean",
|
|
159
|
+
"clean_to_json",
|
|
160
|
+
# Extractors - Hydration
|
|
161
|
+
"HydrationExtractor",
|
|
162
|
+
"HydrationData",
|
|
163
|
+
"Framework",
|
|
164
|
+
"extract_hydration",
|
|
165
|
+
"detect_framework",
|
|
166
|
+
# Extractors - Context Window
|
|
167
|
+
"ContextExtractor",
|
|
168
|
+
"ContextWindow",
|
|
169
|
+
"ContextConfig",
|
|
170
|
+
"extract_context",
|
|
171
|
+
"find_stable_anchor",
|
|
172
|
+
"generate_selector",
|
|
173
|
+
# Transformers - Shadow DOM
|
|
174
|
+
"ShadowDOMFlattener",
|
|
175
|
+
"flatten_shadow_dom",
|
|
176
|
+
# Transformers - D2Snap
|
|
177
|
+
"D2SnapDownsampler",
|
|
178
|
+
"D2SnapConfig",
|
|
179
|
+
"downsample_html",
|
|
180
|
+
"estimate_tokens",
|
|
181
|
+
# Transformers - Chunking
|
|
182
|
+
"SemanticChunker",
|
|
183
|
+
"ChunkConfig",
|
|
184
|
+
"ChunkResult",
|
|
185
|
+
# Classifiers
|
|
186
|
+
"ClassSemanticScorer",
|
|
187
|
+
"score_class",
|
|
188
|
+
"filter_classes",
|
|
189
|
+
"clean_classes",
|
|
190
|
+
"detect_css_framework",
|
|
191
|
+
# Outputs
|
|
192
|
+
"AOMYAMLExporter",
|
|
193
|
+
"AOMConfig",
|
|
194
|
+
"to_aom_yaml",
|
|
195
|
+
"MarkdownExporter",
|
|
196
|
+
"MarkdownConfig",
|
|
197
|
+
"to_markdown",
|
|
198
|
+
"XTreeExporter",
|
|
199
|
+
"XTreeConfig",
|
|
200
|
+
"to_xtree",
|
|
201
|
+
# Pipeline
|
|
202
|
+
"CleaningPipeline",
|
|
203
|
+
"PipelineConfig",
|
|
204
|
+
"PipelineResult",
|
|
205
|
+
"clean_html",
|
|
206
|
+
"clean_for_llm",
|
|
207
|
+
# Helpers
|
|
208
|
+
"json_to_toon",
|
|
209
|
+
"JsonCleaner",
|
|
210
|
+
"html_to_text",
|
|
211
|
+
"extract_links",
|
|
212
|
+
"extract_images",
|
|
213
|
+
]
|
llm_html-0.1.2/src/llm_html/cleaner/__init__.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""HTML Cleaner subpackage — aggressive & focused cleaning, SSR hydration, output formats."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
# Models
|
|
6
|
+
from .models import (
|
|
7
|
+
OutputFormat,
|
|
8
|
+
CleanerConfig,
|
|
9
|
+
CleanerStats,
|
|
10
|
+
ChunkInfo,
|
|
11
|
+
CleanerResult,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
# Main cleaner class
|
|
15
|
+
from .cleaner import (
|
|
16
|
+
HTMLCleaner,
|
|
17
|
+
clean,
|
|
18
|
+
clean_to_json,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# Extractors
|
|
22
|
+
from .extractors import (
|
|
23
|
+
HydrationExtractor,
|
|
24
|
+
HydrationData,
|
|
25
|
+
Framework,
|
|
26
|
+
extract_hydration,
|
|
27
|
+
detect_framework,
|
|
28
|
+
ContextExtractor,
|
|
29
|
+
ContextWindow,
|
|
30
|
+
ContextConfig,
|
|
31
|
+
extract_context,
|
|
32
|
+
find_stable_anchor,
|
|
33
|
+
generate_selector,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
# Transformers
|
|
37
|
+
from .transformers import (
|
|
38
|
+
ShadowDOMFlattener,
|
|
39
|
+
flatten_shadow_dom,
|
|
40
|
+
D2SnapDownsampler,
|
|
41
|
+
D2SnapConfig,
|
|
42
|
+
downsample_html,
|
|
43
|
+
estimate_tokens,
|
|
44
|
+
SemanticChunker,
|
|
45
|
+
ChunkConfig,
|
|
46
|
+
ChunkResult,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
# Classifiers
|
|
50
|
+
from .classifiers import (
|
|
51
|
+
ClassSemanticScorer,
|
|
52
|
+
score_class,
|
|
53
|
+
filter_classes,
|
|
54
|
+
clean_classes,
|
|
55
|
+
detect_css_framework,
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
# Output Formats
|
|
59
|
+
from .outputs import (
|
|
60
|
+
AOMYAMLExporter,
|
|
61
|
+
AOMConfig,
|
|
62
|
+
to_aom_yaml,
|
|
63
|
+
MarkdownExporter,
|
|
64
|
+
MarkdownConfig,
|
|
65
|
+
to_markdown,
|
|
66
|
+
XTreeExporter,
|
|
67
|
+
XTreeConfig,
|
|
68
|
+
to_xtree,
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
# Pipeline
|
|
72
|
+
from .pipeline import (
|
|
73
|
+
CleaningPipeline,
|
|
74
|
+
PipelineConfig,
|
|
75
|
+
PipelineResult,
|
|
76
|
+
clean_html,
|
|
77
|
+
clean_for_llm,
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
__all__ = [
|
|
81
|
+
# Primary API
|
|
82
|
+
"HTMLCleaner",
|
|
83
|
+
"CleanerConfig",
|
|
84
|
+
"CleanerResult",
|
|
85
|
+
"CleanerStats",
|
|
86
|
+
"ChunkInfo",
|
|
87
|
+
"OutputFormat",
|
|
88
|
+
"clean",
|
|
89
|
+
"clean_to_json",
|
|
90
|
+
# Extractors - Hydration
|
|
91
|
+
"HydrationExtractor",
|
|
92
|
+
"HydrationData",
|
|
93
|
+
"Framework",
|
|
94
|
+
"extract_hydration",
|
|
95
|
+
"detect_framework",
|
|
96
|
+
# Extractors - Context Window
|
|
97
|
+
"ContextExtractor",
|
|
98
|
+
"ContextWindow",
|
|
99
|
+
"ContextConfig",
|
|
100
|
+
"extract_context",
|
|
101
|
+
"find_stable_anchor",
|
|
102
|
+
"generate_selector",
|
|
103
|
+
# Transformers - Shadow DOM
|
|
104
|
+
"ShadowDOMFlattener",
|
|
105
|
+
"flatten_shadow_dom",
|
|
106
|
+
# Transformers - D2Snap
|
|
107
|
+
"D2SnapDownsampler",
|
|
108
|
+
"D2SnapConfig",
|
|
109
|
+
"downsample_html",
|
|
110
|
+
"estimate_tokens",
|
|
111
|
+
# Transformers - Chunking
|
|
112
|
+
"SemanticChunker",
|
|
113
|
+
"ChunkConfig",
|
|
114
|
+
"ChunkResult",
|
|
115
|
+
# Classifiers
|
|
116
|
+
"ClassSemanticScorer",
|
|
117
|
+
"score_class",
|
|
118
|
+
"filter_classes",
|
|
119
|
+
"clean_classes",
|
|
120
|
+
"detect_css_framework",
|
|
121
|
+
# Outputs
|
|
122
|
+
"AOMYAMLExporter",
|
|
123
|
+
"AOMConfig",
|
|
124
|
+
"to_aom_yaml",
|
|
125
|
+
"MarkdownExporter",
|
|
126
|
+
"MarkdownConfig",
|
|
127
|
+
"to_markdown",
|
|
128
|
+
"XTreeExporter",
|
|
129
|
+
"XTreeConfig",
|
|
130
|
+
"to_xtree",
|
|
131
|
+
# Pipeline
|
|
132
|
+
"CleaningPipeline",
|
|
133
|
+
"PipelineConfig",
|
|
134
|
+
"PipelineResult",
|
|
135
|
+
"clean_html",
|
|
136
|
+
"clean_for_llm",
|
|
137
|
+
]
|