betterhtmlchunking 0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Carlos A. Planchón
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,153 @@
1
+ Metadata-Version: 2.2
2
+ Name: betterhtmlchunking
3
+ Version: 0.9
4
+ Summary: A Python library for intelligent HTML segmentation and ROI extraction. It builds a DOM tree from raw HTML and extracts content-rich regions for efficient web scraping and analysis.
5
+ Author-email: "Carlos A. Planchón" <carlosandresplanchonprestes@gmail.com>
6
+ License: MIT License
7
+ Project-URL: repository, https://github.com/carlosplanchon/betterhtmlchunking.git
8
+ Keywords: html,chunking,scraping,dom,roi,content extraction,web-scraping
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Topic :: Software Development :: Libraries
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: attrs
16
+ Requires-Dist: attrs-strict
17
+ Requires-Dist: beautifulsoup4
18
+ Requires-Dist: lxml
19
+ Requires-Dist: treelib
20
+ Requires-Dist: parsel_text
21
+
22
+
23
+ # betterhtmlchunking
24
+
25
+ A Python library for intelligently chunking HTML documents into structured, size-limited segments based on DOM tree analysis.
26
+
27
+ ## Overview
28
+
29
+ This library processes HTML content to split it into semantically coherent chunks while respecting specified size constraints. It analyzes the DOM structure to identify optimal split points, preserving contextual information and document hierarchy.
30
+
31
+ ## Key Features
32
+
33
+ - Custom DOM tree representation.
34
+ - Configurable chunk size limits (counting by text or HTML length).
35
+ - Intelligent region-of-interest detection.
36
+ - Dual output formats: HTML and plain text chunks.
37
+ - Preservation of structure relationships.
38
+ - Customizable tag filtering.
39
+
40
+ ## Installation
41
+
42
+ ```bash
43
+ pip install betterhtmlchunking
44
+ ```
45
+
46
+ ### Dependencies
47
+ - Python 3.12+
48
+ - attrs
49
+ - treelib
50
+ - beautifulsoup4
51
+ - parsel-text
52
+ - lxml
53
+ - attrs-strict
54
+
55
+ ## Usage
56
+
57
+ ### Basic Example
58
+
59
+ ```python
60
+ from betterhtmlchunking import DomRepresentation
61
+
62
+ html_content = """
63
+ <html>
64
+ <body>
65
+ <div id="content">
66
+ <h1>Document Title</h1>
67
+ <p>First paragraph...</p>
68
+ <p>Second paragraph...</p>
69
+ </div>
70
+ </body>
71
+ </html>
72
+ """
73
+
74
+ # Create document representation with 500 character chunks
75
+ doc = DomRepresentation(
76
+ MAX_NODE_REPR_LENGTH=500,
77
+ website_code=html_content,
78
+ repr_length_compared_by="html_length"
79
+ )
+ doc.start()  # runs the pipeline; render_system is only populated after this call
80
+
81
+ # Access HTML chunks
82
+ html_chunks = doc.render_system.html_render_roi
83
+ for chunk_id, chunk in html_chunks.items():
84
+ print(f"Chunk {chunk_id}:\n{chunk}\n{'='*50}")
85
+
86
+ # Access text chunks
87
+ text_chunks = doc.render_system.text_render_roi
88
+ for chunk_id, chunk in text_chunks.items():
89
+ print(f"Chunk {chunk_id}:\n{chunk}\n{'='*50}")
90
+ ```
91
+
92
+ ## Configuration
93
+
94
+ ### Key Parameters
95
+ - `MAX_NODE_REPR_LENGTH`: Maximum allowed length for each chunk (in characters)
96
+ - `repr_length_compared_by`: Length calculation method:
97
+ - `html_length`: HTML source length
98
+ - `text_length`: Rendered text length
99
+ - `website_code`: Input HTML content
100
+
101
+ ### Advanced Features
102
+ ```python
103
+ # Access the DOM tree structure
104
+ tree = doc.tree_representation.tree
105
+
106
+ # Get node metadata
107
+ for node in tree.all_nodes():
108
+ print(f"XPath: {node.identifier}")
109
+ print(f"Text length: {node.data.text_length}")
110
+ print(f"HTML length: {node.data.html_length}")
111
+
112
+ # Custom tag filtering (before processing)
113
+ from betterhtmlchunking.tree_representation import DOMTreeRepresentation
114
+ from betterhtmlchunking.utils import remove_unwanted_tags
115
+
116
+ # Example usage of remove_unwanted_tags:
117
+ tree_rep = DOMTreeRepresentation(website_code=html_content)
118
+ filtered_rep = remove_unwanted_tags(tree_rep)
119
+ ```
120
+
121
+ ## How It Works
122
+
123
+ 1. **DOM Parsing**
124
+ - Builds a tree representation of the HTML document.
125
+ - Calculates metadata (text length, HTML length) for each node.
126
+
127
+ 2. **Region Detection**
128
+ - Uses **Breadth First Search (BFS)** to traverse the DOM tree in a level-order fashion, ensuring that each node is processed systematically.
129
+ - Combines nodes until the specified size limit is reached.
130
+ - Preserves parent-child relationships to maintain contextual integrity.
131
+
132
+ 3. **Chunk Generation**
133
+ - Creates HTML chunks with original markup.
134
+ - Generates parallel text-only chunks.
135
+ - Maintains chunk order based on document structure.
136
+
137
+ ## Comparison to popular Chunking Techniques
138
+
139
+ The prevailing practice (as of Feb. 2025) is to use **plain-text** or **token-based** chunking strategies, primarily aimed at keeping prompts within certain token limits for large language models. This approach is ideal for quick semantic retrieval or QA tasks on *unstructured* text.
140
+
141
+ By contrast, **betterhtmlchunking** preserves the **HTML DOM structure**, calculating chunk boundaries based on each node’s text or HTML length. This approach is especially useful when you want to:
142
+ - Retain or leverage the **hierarchical relationships** in the HTML (e.g., headings, nested divs)
143
+ - Filter out undesired tags or sections (like `<script>` or `<style>`)
144
+ - Pinpoint exactly where each chunk originated in the document (via positional XPaths)
145
+
146
+ You can even combine the two techniques if you need both **structured extraction** (via betterhtmlchunking) and **LLM-friendly text chunking** (via LangChain) for advanced tasks such as summarization, semantic search, or large-scale QA pipelines.
147
+
148
+ ## License
149
+
150
+ MIT License
151
+
152
+ ## Contributing
153
+ Feel free to open issues or submit pull requests if you have suggestions or improvements.
@@ -0,0 +1,132 @@
1
+
2
+ # betterhtmlchunking
3
+
4
+ A Python library for intelligently chunking HTML documents into structured, size-limited segments based on DOM tree analysis.
5
+
6
+ ## Overview
7
+
8
+ This library processes HTML content to split it into semantically coherent chunks while respecting specified size constraints. It analyzes the DOM structure to identify optimal split points, preserving contextual information and document hierarchy.
9
+
10
+ ## Key Features
11
+
12
+ - Custom DOM tree representation.
13
+ - Configurable chunk size limits (counting by text or HTML length).
14
+ - Intelligent region-of-interest detection.
15
+ - Dual output formats: HTML and plain text chunks.
16
+ - Preservation of structure relationships.
17
+ - Customizable tag filtering.
18
+
19
+ ## Installation
20
+
21
+ ```bash
22
+ pip install betterhtmlchunking
23
+ ```
24
+
25
+ ### Dependencies
26
+ - Python 3.12+
27
+ - attrs
28
+ - treelib
29
+ - beautifulsoup4
30
+ - parsel-text
31
+ - lxml
32
+ - attrs-strict
33
+
34
+ ## Usage
35
+
36
+ ### Basic Example
37
+
38
+ ```python
39
+ from betterhtmlchunking import DomRepresentation
40
+
41
+ html_content = """
42
+ <html>
43
+ <body>
44
+ <div id="content">
45
+ <h1>Document Title</h1>
46
+ <p>First paragraph...</p>
47
+ <p>Second paragraph...</p>
48
+ </div>
49
+ </body>
50
+ </html>
51
+ """
52
+
53
+ # Create document representation with 500 character chunks
54
+ doc = DomRepresentation(
55
+ MAX_NODE_REPR_LENGTH=500,
56
+ website_code=html_content,
57
+ repr_length_compared_by="html_length"
58
+ )
+ doc.start()  # runs the pipeline; render_system is only populated after this call
59
+
60
+ # Access HTML chunks
61
+ html_chunks = doc.render_system.html_render_roi
62
+ for chunk_id, chunk in html_chunks.items():
63
+ print(f"Chunk {chunk_id}:\n{chunk}\n{'='*50}")
64
+
65
+ # Access text chunks
66
+ text_chunks = doc.render_system.text_render_roi
67
+ for chunk_id, chunk in text_chunks.items():
68
+ print(f"Chunk {chunk_id}:\n{chunk}\n{'='*50}")
69
+ ```
70
+
71
+ ## Configuration
72
+
73
+ ### Key Parameters
74
+ - `MAX_NODE_REPR_LENGTH`: Maximum allowed length for each chunk (in characters)
75
+ - `repr_length_compared_by`: Length calculation method:
76
+ - `html_length`: HTML source length
77
+ - `text_length`: Rendered text length
78
+ - `website_code`: Input HTML content
79
+
80
+ ### Advanced Features
81
+ ```python
82
+ # Access the DOM tree structure
83
+ tree = doc.tree_representation.tree
84
+
85
+ # Get node metadata
86
+ for node in tree.all_nodes():
87
+ print(f"XPath: {node.identifier}")
88
+ print(f"Text length: {node.data.text_length}")
89
+ print(f"HTML length: {node.data.html_length}")
90
+
91
+ # Custom tag filtering (before processing)
92
+ from betterhtmlchunking.tree_representation import DOMTreeRepresentation
93
+ from betterhtmlchunking.utils import remove_unwanted_tags
94
+
95
+ # Example usage of remove_unwanted_tags:
96
+ tree_rep = DOMTreeRepresentation(website_code=html_content)
97
+ filtered_rep = remove_unwanted_tags(tree_rep)
98
+ ```
99
+
100
+ ## How It Works
101
+
102
+ 1. **DOM Parsing**
103
+ - Builds a tree representation of the HTML document.
104
+ - Calculates metadata (text length, HTML length) for each node.
105
+
106
+ 2. **Region Detection**
107
+ - Uses **Breadth First Search (BFS)** to traverse the DOM tree in a level-order fashion, ensuring that each node is processed systematically.
108
+ - Combines nodes until the specified size limit is reached.
109
+ - Preserves parent-child relationships to maintain contextual integrity.
110
+
111
+ 3. **Chunk Generation**
112
+ - Creates HTML chunks with original markup.
113
+ - Generates parallel text-only chunks.
114
+ - Maintains chunk order based on document structure.
115
+
116
+ ## Comparison to popular Chunking Techniques
117
+
118
+ The prevailing practice (as of Feb. 2025) is to use **plain-text** or **token-based** chunking strategies, primarily aimed at keeping prompts within certain token limits for large language models. This approach is ideal for quick semantic retrieval or QA tasks on *unstructured* text.
119
+
120
+ By contrast, **betterhtmlchunking** preserves the **HTML DOM structure**, calculating chunk boundaries based on each node’s text or HTML length. This approach is especially useful when you want to:
121
+ - Retain or leverage the **hierarchical relationships** in the HTML (e.g., headings, nested divs)
122
+ - Filter out undesired tags or sections (like `<script>` or `<style>`)
123
+ - Pinpoint exactly where each chunk originated in the document (via positional XPaths)
124
+
125
+ You can even combine the two techniques if you need both **structured extraction** (via betterhtmlchunking) and **LLM-friendly text chunking** (via LangChain) for advanced tasks such as summarization, semantic search, or large-scale QA pipelines.
126
+
127
+ ## License
128
+
129
+ MIT License
130
+
131
+ ## Contributing
132
+ Feel free to open issues or submit pull requests if you have suggestions or improvements.
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from betterhtmlchunking.main import DomRepresentation
@@ -0,0 +1,107 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import attrs
4
+
5
+ from attrs_strict import type_validator
6
+
7
+ from betterhtmlchunking.utils import remove_unwanted_tags
8
+
9
+ from betterhtmlchunking.tree_representation import\
10
+ DOMTreeRepresentation
11
+
12
+ from betterhtmlchunking.tree_regions_system import\
13
+ TreeRegionsSystem
14
+ from betterhtmlchunking.tree_regions_system import\
15
+ ReprLengthComparisionBy
16
+
17
+ from betterhtmlchunking.render_system import\
18
+ RenderSystem
19
+
20
+
21
# Default tag filters. DomRepresentation falls back to this list when the
# caller does not supply one, and forwards it to remove_unwanted_tags().
# NOTE(review): entries look like XPath tail segments ("/script" etc.);
# confirm the exact matching rule against remove_unwanted_tags().
tag_list_to_filter_out: list[str] = [
    "/head",
    "/select",
    # "/form",
    "/footer",
    "/svg",
    "/defs",
    "/g",
    "/header",
    "/script",
    "/style",
]
34
+
35
+
36
@attrs.define()
class DomRepresentation:
    """Drive the HTML chunking pipeline for a single document.

    Construction only stores and validates the inputs. Nothing is computed
    until :meth:`start` is called (or the three ``compute_*`` steps are
    called in order); after that the results are available on
    ``tree_representation``, ``tree_regions_system`` and ``render_system``.
    Reading a result attribute before it has been computed fails, because
    those fields are ``init=False`` and have no default.
    """

    # Input:
    # Size limit, forwarded to TreeRegionsSystem as ``max_node_repr_length``.
    MAX_NODE_REPR_LENGTH: int = attrs.field(
        validator=type_validator()
    )
    # Raw HTML to process; excluded from repr() because it can be large.
    website_code: str = attrs.field(
        validator=type_validator(),
        repr=False
    )
    # Length metric selector, forwarded unchanged to TreeRegionsSystem.
    repr_length_compared_by: ReprLengthComparisionBy = attrs.field(
        validator=type_validator()
    )

    # Optional inputs:
    # Tags handed to remove_unwanted_tags(). The default is a fresh copy of
    # the module-level ``tag_list_to_filter_out`` per instance, so instances
    # never share (and cannot mutate) the global list.
    # NOTE(review): the previous ``default=None`` does not match the
    # ``list[str]`` annotation that type_validator() checks, which is
    # expected to reject the default at construction - confirm against
    # attrs-strict.
    tag_list_to_filter_out: list[str] = attrs.field(
        validator=type_validator(),
        factory=lambda: list(tag_list_to_filter_out)
    )

    # Result:
    tree_representation: DOMTreeRepresentation = attrs.field(
        validator=type_validator(),
        init=False,
        repr=False
    )
    tree_regions_system: TreeRegionsSystem = attrs.field(
        validator=type_validator(),
        init=False,
        repr=False
    )
    render_system: RenderSystem = attrs.field(
        validator=type_validator(),
        init=False,
        repr=False
    )

    def __attrs_post_init__(self) -> None:
        """Fall back to the module-level tag list if none was provided."""
        # Kept for backward compatibility in case an explicit None reaches
        # this point; copy so the module-level list is never aliased.
        if self.tag_list_to_filter_out is None:
            self.tag_list_to_filter_out = list(tag_list_to_filter_out)

    def compute_tree_representation(self) -> None:
        """Build the DOM tree, drop the filtered tags, then recompute it."""
        self.tree_representation = DOMTreeRepresentation(
            website_code=self.website_code,
        )
        self.tree_representation = remove_unwanted_tags(
            tree_representation=self.tree_representation,
            tag_list_to_filter_out=self.tag_list_to_filter_out
        )
        # Refresh the representation after the tags have been removed.
        self.tree_representation.recompute_representation()

    def compute_tree_regions_system(self) -> None:
        """Build the TreeRegionsSystem from the tree representation.

        Requires :meth:`compute_tree_representation` to have run first.
        """
        self.tree_regions_system = TreeRegionsSystem(
            tree_representation=self.tree_representation,
            max_node_repr_length=self.MAX_NODE_REPR_LENGTH,
            repr_length_compared_by=self.repr_length_compared_by
        )

    def compute_render_system(self) -> None:
        """Build the RenderSystem from the regions system and the tree.

        Requires the two previous ``compute_*`` steps to have run first.
        """
        self.render_system = RenderSystem(
            tree_regions_system=self.tree_regions_system,
            tree_representation=self.tree_representation
        )

    def start(self) -> None:
        """Run the three pipeline steps in order, printing progress."""
        print("--- DOM REPRESENTATION ---")
        print(" > Computing tree representation:")
        self.compute_tree_representation()
        print(" > Computing tree regions system:")
        self.compute_tree_regions_system()
        print(" > Computing render:")
        self.compute_render_system()
@@ -0,0 +1,115 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import attrs
4
+
5
+ from attrs_strict import type_validator
6
+
7
+ import parsel_text
8
+
9
+ from betterhtmlchunking.tree_representation import\
10
+ DOMTreeRepresentation
11
+
12
+ from betterhtmlchunking.tree_regions_system import\
13
+ TreeRegionsSystem
14
+
15
+
16
+ RegionOfInterestRenderT = dict[int, str]
17
+
18
+
19
@attrs.define()
class RenderSystem:
    """Render every region of interest (ROI) as HTML and as plain text.

    Rendering runs once at construction time: ``__attrs_post_init__`` calls
    :meth:`render`, which fills the four result dictionaries below.
    """

    tree_regions_system: TreeRegionsSystem = attrs.field(
        validator=type_validator()
    )
    # NOTE(review): render() does not read this field; it goes through
    # ``tree_regions_system.tree_representation`` instead. Confirm both
    # always refer to the same object.
    tree_representation: DOMTreeRepresentation = attrs.field(
        validator=type_validator()
    )

    # Per-ROI renders, keyed first by ROI index and then by pos_xpath.
    # NOTE(review): RegionOfInterestRenderT is dict[int, str], but the inner
    # keys are the pos_xpath values from ``roi.pos_xpath_list`` - verify the
    # key type of that alias.
    html_render_with_pos_xpath: dict[int, RegionOfInterestRenderT] =\
        attrs.field(
            validator=type_validator(),
            init=False
        )
    text_render_with_pos_xpath: dict[int, RegionOfInterestRenderT] =\
        attrs.field(
            validator=type_validator(),
            init=False
        )

    # Render of the regions of interest, each one of them full:
    html_render_roi: dict[int, str] = attrs.field(
        validator=type_validator(),
        init=False
    )
    text_render_roi: dict[int, str] = attrs.field(
        validator=type_validator(),
        init=False
    )

    def get_roi_text_render_with_pos_xpath(self, roi_idx: int) -> str:
        """Return the text renders of ROI ``roi_idx`` joined by newlines."""
        return "\n".join(
            self.text_render_with_pos_xpath[roi_idx].values()
        )

    def get_roi_html_render_with_pos_xpath(self, roi_idx: int) -> str:
        """Return the HTML renders of ROI ``roi_idx`` joined by newlines."""
        return "\n".join(
            self.html_render_with_pos_xpath[roi_idx].values()
        )

    def render(self) -> None:
        """Populate the per-xpath and per-ROI HTML/text render dictionaries.

        Any previous results are discarded and rebuilt from
        ``tree_regions_system.sorted_roi_by_pos_xpath``.
        """
        # Assign the empty containers first and fill them in place: attrs
        # validators see the value at assignment time only.
        self.html_render_with_pos_xpath = {}
        self.text_render_with_pos_xpath = {}

        self.html_render_roi = {}
        self.text_render_roi = {}

        xpaths_metadata =\
            self.tree_regions_system.tree_representation.xpaths_metadata

        for roi_idx, roi in\
                self.tree_regions_system.sorted_roi_by_pos_xpath.items():
            html_by_xpath = self.html_render_with_pos_xpath[roi_idx] = {}
            text_by_xpath = self.text_render_with_pos_xpath[roi_idx] = {}

            for pos_xpath in roi.pos_xpath_list:
                # Look the element up once; both renders use the same node.
                bs4_elem = xpaths_metadata[pos_xpath].bs4_elem

                # HTML render:
                html_by_xpath[pos_xpath] = bs4_elem.prettify(
                    formatter="minimal"
                )

                # Text render:
                text_by_xpath[pos_xpath] = parsel_text.get_bs4_soup_text(
                    bs4_soup=bs4_elem
                )

            # Full render of this ROI: its per-xpath pieces, newline-joined.
            self.html_render_roi[roi_idx] =\
                self.get_roi_html_render_with_pos_xpath(
                    roi_idx=roi_idx
                )
            self.text_render_roi[roi_idx] =\
                self.get_roi_text_render_with_pos_xpath(
                    roi_idx=roi_idx
                )

    def __attrs_post_init__(self) -> None:
        """Render immediately after the attrs-generated ``__init__``."""
        self.render()