betterhtmlchunking 0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from betterhtmlchunking.main import DomRepresentation
@@ -0,0 +1,107 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import attrs
4
+
5
+ from attrs_strict import type_validator
6
+
7
+ from betterhtmlchunking.utils import remove_unwanted_tags
8
+
9
+ from betterhtmlchunking.tree_representation import\
10
+ DOMTreeRepresentation
11
+
12
+ from betterhtmlchunking.tree_regions_system import\
13
+ TreeRegionsSystem
14
+ from betterhtmlchunking.tree_regions_system import\
15
+ ReprLengthComparisionBy
16
+
17
+ from betterhtmlchunking.render_system import\
18
+ RenderSystem
19
+
20
+
21
# Default XPath fragments whose subtrees are stripped before chunking.
# Matching is done by plain substring containment (see utils.wanted_xpath),
# so "/g" also matches indexed variants such as "/g[2]".
# Fix: "/footer" appeared twice in the original list; deduplicated
# (membership semantics are unchanged).
tag_list_to_filter_out: list[str] = [
    "/head",
    "/select",
    # "/form",
    "/footer",
    "/svg",
    "/defs",
    "/g",
    "/header",
    "/script",
    "/style"
]
34
+
35
+
36
@attrs.define()
class DomRepresentation:
    """End-to-end DOM chunking pipeline.

    Builds a DOM tree from ``website_code``, filters unwanted tags,
    splits the tree into regions of interest bounded by
    ``MAX_NODE_REPR_LENGTH``, and renders each region as HTML and text.
    Call :meth:`start` to run all three stages in order.
    """

    # Input:
    # Upper bound (in characters) for one region's representation.
    MAX_NODE_REPR_LENGTH: int = attrs.field(
        validator=type_validator()
    )
    # Raw HTML source to process.
    website_code: str = attrs.field(
        validator=type_validator(),
        repr=False
    )
    # Measure region size by text length or by HTML length.
    repr_length_compared_by: ReprLengthComparisionBy = attrs.field(
        validator=type_validator()
    )

    # Optional inputs:
    # XPath fragments to strip before chunking; falls back to the
    # module-level ``tag_list_to_filter_out`` when left as None.
    # Fix: annotated as optional so the strict type validator accepts
    # the None default (the previous ``list[str]`` annotation did not).
    tag_list_to_filter_out: list[str] | None = attrs.field(
        validator=type_validator(),
        default=None
    )

    # Result (populated by the compute_* methods):
    tree_representation: DOMTreeRepresentation = attrs.field(
        validator=type_validator(),
        init=False,
        repr=False
    )
    tree_regions_system: TreeRegionsSystem = attrs.field(
        validator=type_validator(),
        init=False,
        repr=False
    )
    render_system: RenderSystem = attrs.field(
        validator=type_validator(),
        init=False,
        repr=False
    )

    def __attrs_post_init__(self):
        # Fall back to the module-level default filter list.
        if self.tag_list_to_filter_out is None:
            self.tag_list_to_filter_out = tag_list_to_filter_out

    def compute_tree_representation(self):
        """Parse the HTML and build the filtered DOM tree."""
        self.tree_representation = DOMTreeRepresentation(
            website_code=self.website_code,
        )
        self.tree_representation = remove_unwanted_tags(
            tree_representation=self.tree_representation,
            tag_list_to_filter_out=self.tag_list_to_filter_out
        )
        # Node deletion invalidates cached xpaths; rebuild them.
        self.tree_representation.recompute_representation()

    def compute_tree_regions_system(self):
        """Split the DOM tree into size-bounded regions of interest."""
        self.tree_regions_system = TreeRegionsSystem(
            tree_representation=self.tree_representation,
            max_node_repr_length=self.MAX_NODE_REPR_LENGTH,
            repr_length_compared_by=self.repr_length_compared_by
        )

    def compute_render_system(self):
        """Render each region of interest as HTML and plain text."""
        self.render_system = RenderSystem(
            tree_regions_system=self.tree_regions_system,
            tree_representation=self.tree_representation
        )

    def start(self):
        """Run the full pipeline: parse, detect regions, render."""
        print("--- DOM REPRESENTATION ---")
        print(" > Computing tree representation:")
        self.compute_tree_representation()
        print(" > Computing tree regions system:")
        self.compute_tree_regions_system()
        print(" > Computing render:")
        self.compute_render_system()
@@ -0,0 +1,115 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import attrs
4
+
5
+ from attrs_strict import type_validator
6
+
7
+ import parsel_text
8
+
9
+ from betterhtmlchunking.tree_representation import\
10
+ DOMTreeRepresentation
11
+
12
+ from betterhtmlchunking.tree_regions_system import\
13
+ TreeRegionsSystem
14
+
15
+
16
# One region of interest rendered piecewise:
# positional-XPath key -> rendered fragment (HTML or text).
RegionOfInterestRenderT = dict[int, str]
17
+
18
+
19
@attrs.define()
class RenderSystem:
    """Renders every region of interest as HTML and as plain text.

    Rendering happens eagerly in ``__attrs_post_init__``; afterwards the
    four result dicts are fully populated, keyed by region index.
    """

    tree_regions_system: TreeRegionsSystem = attrs.field(
        validator=type_validator()
    )
    tree_representation: DOMTreeRepresentation = attrs.field(
        validator=type_validator()
    )

    # roi index -> {positional xpath -> rendered fragment}
    html_render_with_pos_xpath: dict[int, RegionOfInterestRenderT] =\
        attrs.field(
            validator=type_validator(),
            init=False
        )
    text_render_with_pos_xpath: dict[int, RegionOfInterestRenderT] =\
        attrs.field(
            validator=type_validator(),
            init=False
        )

    # Render of the regions of interest, each one of them full:
    html_render_roi: dict[int, str] = attrs.field(
        validator=type_validator(),
        init=False
    )
    text_render_roi: dict[int, str] = attrs.field(
        validator=type_validator(),
        init=False
    )

    def get_roi_text_render_with_pos_xpath(self, roi_idx: int) -> str:
        """Join the text fragments of one region into a single string."""
        return "\n".join(
            self.text_render_with_pos_xpath[roi_idx].values()
        )

    def get_roi_html_render_with_pos_xpath(self, roi_idx: int) -> str:
        """Join the HTML fragments of one region into a single string."""
        return "\n".join(
            self.html_render_with_pos_xpath[roi_idx].values()
        )

    def render(self) -> None:
        """Render all regions of interest, fragment by fragment.

        Fixes versus the original: the unused ``region_of_interest_idx``
        counter was removed (dead code), and the loop-invariant
        ``xpaths_metadata`` lookup is hoisted out of the loops.
        """
        self.html_render_with_pos_xpath = {}
        self.text_render_with_pos_xpath = {}

        self.html_render_roi = {}
        self.text_render_roi = {}

        # Invariant across both loops; hoisted for clarity and speed.
        xpaths_metadata =\
            self.tree_regions_system.tree_representation.xpaths_metadata

        for roi_idx, roi in\
                self.tree_regions_system.sorted_roi_by_pos_xpath.items():
            self.html_render_with_pos_xpath[roi_idx] = {}
            self.text_render_with_pos_xpath[roi_idx] = {}

            for pos_xpath in roi.pos_xpath_list:
                bs4_elem = xpaths_metadata[pos_xpath].bs4_elem

                # HTML render:
                prettified_pos_xpath_html: str = bs4_elem.prettify(
                    formatter="minimal"
                )

                # Text render:
                pos_xpath_text: str = parsel_text.get_bs4_soup_text(
                    bs4_soup=bs4_elem
                )

                self.html_render_with_pos_xpath[
                    roi_idx][pos_xpath] = prettified_pos_xpath_html
                self.text_render_with_pos_xpath[
                    roi_idx][pos_xpath] = pos_xpath_text

            self.html_render_roi[roi_idx] =\
                self.get_roi_html_render_with_pos_xpath(
                    roi_idx=roi_idx
                )
            self.text_render_roi[roi_idx] =\
                self.get_roi_text_render_with_pos_xpath(
                    roi_idx=roi_idx
                )

    def __attrs_post_init__(self):
        self.render()
@@ -0,0 +1,379 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import attrs
4
+
5
+ from attrs_strict import type_validator
6
+
7
+ import queue
8
+
9
+ import treelib
10
+
11
+ from betterhtmlchunking.tree_representation import\
12
+ DOMTreeRepresentation
13
+ from betterhtmlchunking.tree_representation import\
14
+ get_xpath_depth
15
+
16
+ from enum import StrEnum
17
+
18
+ from typing import Iterator
19
+ from typing import Any
20
+
21
+ # import prettyprinter
22
+
23
+
24
+ #################################
25
+ # #
26
+ # --- TreeRegionsSystem --- #
27
+ # #
28
+ #################################
29
+
30
+ class ROIParsingState(StrEnum):
31
+ SEEK_END: str = "seek_end"
32
+ REGION_READY: str = "region_ready"
33
+ EOF: str = "EOF"
34
+
35
+
36
@attrs.define()
class RegionOfInterest:
    """A contiguous group of subtrees forming one output chunk."""

    # Positional XPaths of the nodes covered by this region.
    pos_xpath_list: list[str] = attrs.field(
        validator=type_validator(),
        init=False
    )
    # Accumulated representation length (text or HTML chars) of the region.
    repr_length: int = attrs.field(
        validator=type_validator(),
        init=False
    )
    # True when the region is a whole node rather than a group of children.
    node_is_roi: bool = attrs.field(
        validator=type_validator(),
        init=False
    )

    def __attrs_post_init__(self):
        # Start empty; ROIMaker fills the region incrementally.
        self.pos_xpath_list: list[str] = []
        self.repr_length: int = 0
        self.node_is_roi: bool = False
55
+
56
+
57
+ class ReprLengthComparisionBy(StrEnum):
58
+ TEXT_LENGTH: str = "text_length"
59
+ HTML_LENGTH: str = "html_length"
60
+
61
+
62
+ @attrs.define()
63
+ class ROIMaker:
64
+ node_xpath: str = attrs.field(
65
+ validator=type_validator()
66
+ )
67
+ children_tags: list[str] = attrs.field(
68
+ validator=type_validator()
69
+ )
70
+ tree_representation: DOMTreeRepresentation = attrs.field(
71
+ validator=type_validator()
72
+ )
73
+ max_node_repr_length: int = attrs.field(
74
+ validator=type_validator()
75
+ )
76
+ repr_length_compared_by: ReprLengthComparisionBy = attrs.field(
77
+ validator=type_validator(),
78
+ )
79
+
80
+ PARSING_STATE: ROIParsingState = attrs.field(
81
+ validator=type_validator(),
82
+ default=ROIParsingState.SEEK_END
83
+ )
84
+ children_tags_iter: Iterator[str] = attrs.field(
85
+ validator=type_validator(),
86
+ init=False
87
+ )
88
+ actual_region_of_interest: Any = attrs.field(
89
+ validator=type_validator(),
90
+ init=False
91
+ )
92
+ regions_of_interest_list: list[RegionOfInterest] = attrs.field(
93
+ validator=type_validator(),
94
+ init=False
95
+ )
96
+ children_to_enqueue: list[str] = attrs.field(
97
+ validator=type_validator(),
98
+ init=False
99
+ )
100
+
101
+ def __attrs_post_init__(self) -> None:
102
+ # print(f"> ROIMaker: Node XPATH: {self.node_xpath}")
103
+
104
+ self.regions_of_interest_list: list[RegionOfInterest] = []
105
+ self.children_to_enqueue: list[str] = []
106
+ self.actual_region_of_interest = RegionOfInterest()
107
+
108
+ # Explore for ROIs on children.
109
+ self.children_tags_iter = iter(self.children_tags)
110
+
111
+ self.actual_region_of_interest.node_is_roi = False
112
+
113
+ self.step()
114
+ while self.PARSING_STATE != ROIParsingState.EOF:
115
+ self.step()
116
+
117
+ node_is_roi: bool = False
118
+ if len(self.children_tags) == 0:
119
+ node_is_roi = True
120
+ elif len(self.regions_of_interest_list) == 1:
121
+ roi = self.regions_of_interest_list[0]
122
+ if len(roi.pos_xpath_list) == len(self.children_tags):
123
+ node_is_roi = True
124
+
125
+ # Node itself is ROI.
126
+ if node_is_roi is True:
127
+ # print(f"> Node itself is ROI: {self.node_xpath}")
128
+ node: treelib.Node =\
129
+ self.tree_representation.tree.get_node(
130
+ nid=self.node_xpath
131
+ )
132
+
133
+ node_repr_length: int =\
134
+ self.get_node_repr_length(node=node)
135
+
136
+ # prettyprinter.cpprint(node_repr_length)
137
+
138
+ # if node_repr_length > self.max_node_repr_length:
139
+ self.actual_region_of_interest.repr_length =\
140
+ node_repr_length
141
+ self.actual_region_of_interest.pos_xpath_list.append(
142
+ self.node_xpath
143
+ )
144
+ self.actual_region_of_interest.node_is_roi = True
145
+
146
+ self.regions_of_interest_list = []
147
+
148
+ self.regions_of_interest_list.append(
149
+ self.actual_region_of_interest
150
+ )
151
+
152
+ return None
153
+
154
+ def get_node_repr_length(self, node: treelib.Node) -> int:
155
+ match self.repr_length_compared_by:
156
+ case ReprLengthComparisionBy.TEXT_LENGTH:
157
+ node_repr_length: int = node.data.text_length
158
+ case ReprLengthComparisionBy.HTML_LENGTH:
159
+ node_repr_length: int = node.data.html_length
160
+
161
+ return node_repr_length
162
+
163
+ # Based on XMLStreamer.
164
+ def step(self):
165
+ match self.PARSING_STATE:
166
+ case ROIParsingState.SEEK_END:
167
+ # print("> SEEK END:")
168
+ try:
169
+ children_tag: str = next(self.children_tags_iter)
170
+ # print(f"children_tag: {children_tag}")
171
+ node: treelib.Node =\
172
+ self.tree_representation.tree.get_node(
173
+ nid=children_tag
174
+ )
175
+
176
+ node_repr_length: int = self.get_node_repr_length(
177
+ node=node
178
+ )
179
+ # print(f"node_repr_length: {node_repr_length}")
180
+
181
+ if node_repr_length >= self.max_node_repr_length:
182
+ self.PARSING_STATE = ROIParsingState.REGION_READY
183
+ self.children_to_enqueue.append(children_tag)
184
+ else:
185
+ proposed_repr_length: int = node_repr_length +\
186
+ self.actual_region_of_interest.repr_length
187
+
188
+ # print(f"proposed_repr_length: {proposed_repr_length}")
189
+
190
+ self.actual_region_of_interest.pos_xpath_list.append(
191
+ node.identifier
192
+ )
193
+ self.actual_region_of_interest.repr_length =\
194
+ proposed_repr_length
195
+
196
+ if proposed_repr_length >= self.max_node_repr_length:
197
+ self.PARSING_STATE = ROIParsingState.REGION_READY
198
+
199
+ except StopIteration:
200
+ self.PARSING_STATE = ROIParsingState.EOF
201
+ # print("StopIteration.")
202
+ # prettyprinter.cpprint(self.actual_region_of_interest)
203
+ if self.actual_region_of_interest.repr_length > 0 and\
204
+ len(self.regions_of_interest_list) > 0:
205
+ # print("Hanging xpaths.")
206
+ self.regions_of_interest_list[-1].repr_length +=\
207
+ self.actual_region_of_interest.repr_length
208
+ self.regions_of_interest_list[-1].pos_xpath_list +=\
209
+ self.actual_region_of_interest.pos_xpath_list
210
+ self.actual_region_of_interest = RegionOfInterest()
211
+
212
+ case ROIParsingState.REGION_READY:
213
+ # print("> REGION READY:")
214
+ self.regions_of_interest_list.append(
215
+ self.actual_region_of_interest
216
+ )
217
+
218
+ self.PARSING_STATE = ROIParsingState.SEEK_END
219
+ self.actual_region_of_interest = RegionOfInterest()
220
+
221
+
222
def order_regions_of_interest_by_pos_xpath(
    region_of_interest_list: list[RegionOfInterest],
    pos_xpaths_list: list[str]
) -> list[RegionOfInterest]:
    """Sort regions into document order.

    A region is ranked by the position of its first xpath within
    *pos_xpaths_list*; regions whose first xpath is not listed sort last.
    """
    xpath_order = {
        xpath: index for index, xpath in enumerate(pos_xpaths_list)
    }

    def document_rank(region):
        # Unknown xpaths rank after every known one.
        return xpath_order.get(region.pos_xpath_list[0], float("inf"))

    return sorted(region_of_interest_list, key=document_rank)
242
+
243
+
244
+ @attrs.define()
245
+ class TreeRegionsSystem:
246
+ tree_representation: DOMTreeRepresentation = attrs.field(
247
+ validator=type_validator()
248
+ )
249
+ max_node_repr_length: int = attrs.field(
250
+ validator=type_validator()
251
+ )
252
+ regions_of_interest_list: list[RegionOfInterest] = attrs.field(
253
+ validator=type_validator(),
254
+ init=False
255
+ )
256
+ sorted_roi_by_pos_xpath: dict[int, RegionOfInterest] = attrs.field(
257
+ validator=type_validator(),
258
+ init=False
259
+ )
260
+ repr_length_compared_by: ReprLengthComparisionBy = attrs.field(
261
+ validator=type_validator(),
262
+ default=ReprLengthComparisionBy.HTML_LENGTH
263
+ )
264
+
265
+ def __attrs_post_init__(self):
266
+ self.start()
267
+
268
+ def print_tree_node_states(self):
269
+ print("--- PRINT TREE NODE STATES ---")
270
+ for pos_xpath in self.tree_representation.pos_xpaths_list:
271
+ pad: str = get_xpath_depth(xpath=pos_xpath) * " " * 4
272
+ node = self.tree_representation.tree.get_node(pos_xpath)
273
+ print(f"{pad}|")
274
+ print(f"{pad}| {pos_xpath}")
275
+ print(f"{pad}| Text length: {node.data.text_length}")
276
+ print(f"{pad}| HTML length: {node.data.html_length}")
277
+
278
+ def get_node_repr_length(self, node: treelib.Node) -> int:
279
+ match self.repr_length_compared_by:
280
+ case ReprLengthComparisionBy.TEXT_LENGTH:
281
+ node_repr_length: int = node.data.text_length
282
+ case ReprLengthComparisionBy.HTML_LENGTH:
283
+ node_repr_length: int = node.data.html_length
284
+
285
+ return node_repr_length
286
+
287
+ def start(self):
288
+ self.regions_of_interest_list: list[RegionOfInterest] = []
289
+
290
+ subtrees_queue = queue.Queue()
291
+
292
+ subtrees_queue.put("/html")
293
+
294
+ while subtrees_queue.empty() is False:
295
+ # print("#" * 100)
296
+ node_xpath: str = subtrees_queue.get()
297
+ # print("--- NODE XPATH ---")
298
+ # print(node_xpath)
299
+
300
+ node: treelib.Node = self.tree_representation.tree.get_node(
301
+ node_xpath
302
+ )
303
+ # print(node.data.text_length)
304
+
305
+ children_tags: list[str] =\
306
+ self.tree_representation.get_children_tag_list(
307
+ xpath=node_xpath
308
+ )
309
+
310
+ region_of_interest_maker = ROIMaker(
311
+ node_xpath=node_xpath,
312
+ children_tags=children_tags,
313
+ tree_representation=self.tree_representation,
314
+ max_node_repr_length=self.max_node_repr_length,
315
+ repr_length_compared_by=self.repr_length_compared_by
316
+ )
317
+
318
+ # print("--- REGIONS OF INTEREST LIST ---")
319
+ # prettyprinter.cpprint(region_of_interest_maker.regions_of_interest_list)
320
+
321
+ for roi in region_of_interest_maker.regions_of_interest_list:
322
+ # If we are based on text_length,
323
+ # tags like img (text_length == 0) are ignored.
324
+ # For that reason we base ROI on pos_xpath_list.
325
+ # if roi.text_length > 0:
326
+ if roi.pos_xpath_list != []:
327
+ self.regions_of_interest_list.append(roi)
328
+
329
+ # print("--- CHILDREN TO ENQUEUE ---")
330
+ # prettyprinter.cpprint(region_of_interest_maker.children_to_enqueue)
331
+
332
+ """
333
+ Try to make ROIs under.
334
+ If ROI occupy all children, ROI contains node itself.
335
+
336
+ Those elements who are not ROI, are put into queue.
337
+ Elements who are ROI, are put into a separate dict.
338
+ """
339
+
340
+ for child_tag in region_of_interest_maker.children_to_enqueue:
341
+ subtrees_queue.put(child_tag)
342
+
343
+ """
344
+ for child_tag in children_tags:
345
+ subtrees_queue.put(child_tag)
346
+ """
347
+
348
+ sorted_regions: list[RegionOfInterest] =\
349
+ order_regions_of_interest_by_pos_xpath(
350
+ region_of_interest_list=\
351
+ self.regions_of_interest_list,
352
+ pos_xpaths_list=\
353
+ self.tree_representation.pos_xpaths_list
354
+ )
355
+
356
+ # This happen when there are no nodes to detect as RegionOfInterest
357
+ # or when max_node_repr_length is greater than total repr_length in
358
+ # the document.
359
+ if sorted_regions == [] and\
360
+ len(self.tree_representation.pos_xpaths_list) > 0:
361
+ node_xpath: str = self.tree_representation.pos_xpaths_list[0]
362
+
363
+ node: treelib.Node = self.tree_representation.tree.get_node(
364
+ node_xpath
365
+ )
366
+
367
+ node_repr_length: int = self.get_node_repr_length(
368
+ node=node
369
+ )
370
+ # print(node_repr_length)
371
+
372
+ roi = RegionOfInterest()
373
+ roi.pos_xpath_list = [node_xpath]
374
+ roi.repr_length = node_repr_length
375
+ roi.node_is_roi = True
376
+
377
+ sorted_regions = [roi]
378
+
379
+ self.sorted_roi_by_pos_xpath = dict(enumerate(sorted_regions))
@@ -0,0 +1,227 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import attrs
4
+ from attrs_strict import type_validator
5
+
6
+ import parsel_text
7
+
8
+ import treelib
9
+
10
+ import bs4
11
+
12
+ from typing import Any
13
+
14
+ # import prettyprinter
15
+
16
+ # prettyprinter.install_extras()
17
+
18
+
19
def get_parent_xpath(xpath: str) -> str:
    """Return the positional XPath of *xpath*'s parent.

    Top-level paths (exactly one "/") are parented to the synthetic
    "root" node used by the treelib tree.
    """
    if xpath.count("/") == 1:
        return "root"
    parent, _, _ = xpath.rpartition("/")
    return parent
23
+
24
+
25
def get_xpath_depth(xpath: str) -> int:
    """Depth of a positional XPath: number of "/" separators, ignoring a
    trailing slash."""
    return xpath.rstrip("/").count("/")
28
+
29
+
30
def get_children_tags(node):
    """Return the ``tag`` attribute of every direct child of *node*, in order."""
    return [child_node.tag for child_node in node]
33
+
34
+
35
def get_pos_xpath_from_bs4_elem(element) -> str:
    """Build the positional XPath (e.g. "/html/body/div[2]") of a bs4 element.

    Non-Tag nodes (e.g. NavigableString) are resolved via their parent
    tag.  When several siblings share a tag name, the component carries
    a 1-based positional index; a unique tag gets no index.
    """
    components = []
    current = element if isinstance(element, bs4.Tag) else element.parent

    for parent in current.parents:
        same_tag_siblings = parent.find_all(current.name, recursive=False)

        if len(same_tag_siblings) == 1:
            components.append(current.name)
        else:
            # Identity check: find this exact element among its siblings.
            position = next(
                i for i, sibling in enumerate(same_tag_siblings, 1)
                if sibling is current
            )
            components.append(f"{current.name}[{position}]")

        current = parent

    return "/" + "/".join(reversed(components))
55
+
56
+
57
@attrs.define()
class NodeMetadata:
    """Per-node cache: representation sizes plus a handle into the soup."""

    # Insertion index of the node, in document order.
    idx: int = attrs.field(
        validator=type_validator(),
        init=False
    )
    # Length of the node's extracted plain text.
    text_length: int = attrs.field(
        validator=type_validator(),
        init=False
    )
    # Length of the node's prettified HTML.
    html_length: int = attrs.field(
        validator=type_validator(),
        init=False
    )
    # The bs4 element backing this node.
    bs4_elem: Any = attrs.field(
        validator=type_validator(),
        init=False
    )
    # Free-form slot for callers to attach extra data.
    extra_metadata: Any = attrs.field(
        validator=type_validator(),
        default=None
    )
79
+
80
+
81
@attrs.define()
class DOMTreeRepresentation:
    """DOM wrapper: a bs4 soup mirrored into a treelib tree keyed by
    positional XPath, with per-node size metadata."""

    # Raw HTML source.
    website_code: str = attrs.field(
        validator=type_validator()
    )
    soup: bs4.BeautifulSoup = attrs.field(
        validator=type_validator(),
        init=False
    )

    tree: treelib.Tree = attrs.field(
        validator=type_validator(),
        init=False
    )

    # Positional XPath -> NodeMetadata (lengths + bs4 element).
    xpaths_metadata: dict[str, NodeMetadata] = attrs.field(
        validator=type_validator(),
        init=False
    )

    # XPaths in document order.
    pos_xpaths_list: list[str] = attrs.field(
        validator=type_validator(),
        init=False
    )
    # XPaths sorted deepest-first.
    pos_sorted_xpaths: list[str] = attrs.field(
        validator=type_validator(),
        init=False
    )

    def __attrs_post_init__(self):
        self.start()

    def make_html_soup(self):
        """Parse the raw HTML with the lxml backend."""
        self.soup = bs4.BeautifulSoup(
            self.website_code,
            features="lxml"
        )

    def compute_xpaths_data(self):
        """Walk every tag, recording its xpath, sizes and bs4 element."""
        children = self.soup.find_all(
            name=True,
            recursive=True
        )

        self.xpaths_metadata: dict[str, Any] = {}

        for child in children:
            pos_xpath: str = get_pos_xpath_from_bs4_elem(
                element=child
            )

            child_text: str = parsel_text.get_bs4_soup_text(
                bs4_soup=child
            )
            child_html: str = child.prettify(
                formatter="minimal"
            )

            node_metadata = NodeMetadata()
            node_metadata.text_length = len(child_text)
            node_metadata.html_length = len(child_html)
            node_metadata.bs4_elem = child

            self.xpaths_metadata[pos_xpath] = node_metadata

    def make_tree_representation(self):
        """Mirror xpaths_metadata into a treelib tree rooted at "root"."""
        self.tree = treelib.Tree()

        # Synthetic root so top-level nodes have a parent.
        self.tree.create_node(
            tag="root",
            identifier="root"
        )

        for i, (pos_xpath, node_metadata) in enumerate(
                self.xpaths_metadata.items()):
            node_metadata.idx = i

            self.tree.create_node(
                tag=pos_xpath,
                identifier=pos_xpath,
                parent=get_parent_xpath(xpath=pos_xpath),
                data=node_metadata
            )

    def define_pos_xpaths_list(self):
        """Refresh the document-order xpath list from the metadata keys."""
        self.pos_xpaths_list: list[str] = list(
            self.xpaths_metadata.keys()
        )

    def sort_pos_xpaths(self):
        """Refresh the deepest-first xpath list."""
        self.pos_sorted_xpaths: list[str] = sorted(
            self.pos_xpaths_list,
            key=get_xpath_depth,
            reverse=True
        )

    def get_children_tag_list(self, xpath: str) -> list[str]:
        """Positional XPaths of the direct children of *xpath*."""
        return get_children_tags(
            self.tree.children(xpath)
        )

    def delete_node(self, pos_xpath: str) -> None:
        """Remove a node and its subtree from tree, soup and metadata.

        After operating with node deletion you need to call
        recompute_representation().
        """
        # Delete on treelib.Tree:
        self.tree.remove_node(pos_xpath)

        # Delete on soup:
        node = self.xpaths_metadata[pos_xpath].bs4_elem
        node.decompose()

        # Delete the node and its descendants from the metadata.
        # Fix: match the xpath itself or a "/"-delimited prefix; a bare
        # startswith(pos_xpath) also matched look-alike siblings
        # (e.g. deleting "div[1]" wrongly removed "div[10]").
        keys_to_remove: list[str] = [
            xpath for xpath in self.pos_xpaths_list
            if xpath == pos_xpath or xpath.startswith(pos_xpath + "/")
        ]

        for xpath in keys_to_remove:
            del self.xpaths_metadata[xpath]

        self.define_pos_xpaths_list()
        self.sort_pos_xpaths()

    def recompute_representation(self):
        """Rebuild metadata, tree and xpath lists from the current soup."""
        self.compute_xpaths_data()
        self.make_tree_representation()
        self.define_pos_xpaths_list()
        self.sort_pos_xpaths()

    def start(self):
        self.make_html_soup()
        self.recompute_representation()
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import treelib
4
+
5
+ from betterhtmlchunking.tree_representation import DOMTreeRepresentation
6
+
7
+
8
def wanted_xpath(
    xpath: str,
    tag_list_to_filter_out: list[str]
) -> bool:
    """Return True when *xpath* contains none of the unwanted tag fragments.

    NOTE(review): matching is plain substring containment, so a fragment
    like "/g" also rejects e.g. "/html/body/gallery" — confirm this is
    intended before tightening.
    """
    for unwanted_tag in tag_list_to_filter_out:
        if unwanted_tag in xpath:
            return False
    return True
14
+
15
+
16
def remove_unwanted_tags(
    tree_representation: DOMTreeRepresentation,
    tag_list_to_filter_out: list[str]
):
    """Delete every node whose xpath matches an unwanted tag fragment.

    Nodes already removed as part of an ancestor's subtree raise
    NodeIDAbsentError, which is deliberately ignored.  Returns the
    (mutated) tree representation for convenience.
    """
    for pos_xpath in tree_representation.pos_xpaths_list:
        if wanted_xpath(
            xpath=pos_xpath,
            tag_list_to_filter_out=tag_list_to_filter_out
        ):
            continue
        try:
            tree_representation.delete_node(pos_xpath=pos_xpath)
        except treelib.exceptions.NodeIDAbsentError:
            pass
    return tree_representation
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Carlos A. Planchón
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,153 @@
1
+ Metadata-Version: 2.2
2
+ Name: betterhtmlchunking
3
+ Version: 0.9
4
+ Summary: A Python library for intelligent HTML segmentation and ROI extraction. It builds a DOM tree from raw HTML and extracts content-rich regions for efficient web scraping and analysis.
5
+ Author-email: "Carlos A. Planchón" <carlosandresplanchonprestes@gmail.com>
6
+ License: MIT License
7
+ Project-URL: repository, https://github.com/carlosplanchon/betterhtmlchunking.git
8
+ Keywords: html,chunking,scraping,dom,roi,content extraction,web-scraping
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Topic :: Software Development :: Libraries
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: attrs
16
+ Requires-Dist: attrs-strict
17
+ Requires-Dist: beautifulsoup4
18
+ Requires-Dist: lxml
19
+ Requires-Dist: treelib
20
+ Requires-Dist: parsel_text
21
+
22
+
23
+ # betterhtmlchunking
24
+
25
+ A Python library for intelligently chunking HTML documents into structured, size-limited segments based on DOM tree analysis.
26
+
27
+ ## Overview
28
+
29
+ This library processes HTML content to split it into semantically coherent chunks while respecting specified size constraints. It analyzes the DOM structure to identify optimal split points, preserving contextual information and document hierarchy.
30
+
31
+ ## Key Features
32
+
33
+ - Custom DOM tree representation.
34
+ - Configurable chunk size limits (counting by text or HTML length).
35
+ - Intelligent region-of-interest detection.
36
+ - Dual output formats: HTML and plain text chunks.
37
+ - Preservation of structure relationships.
38
+ - Customizable tag filtering.
39
+
40
+ ## Installation
41
+
42
+ ```bash
43
+ pip install betterhtmlchunking
44
+ ```
45
+
46
+ ### Dependencies
47
+ - Python 3.12+
48
+ - attrs
49
+ - treelib
50
+ - beautifulsoup4
51
+ - parsel-text
52
+ - lxml
53
+ - attrs-strict
54
+
55
+ ## Usage
56
+
57
+ ### Basic Example
58
+
59
+ ```python
60
+ from betterhtmlchunking import DomRepresentation
61
+
62
+ html_content = """
63
+ <html>
64
+ <body>
65
+ <div id="content">
66
+ <h1>Document Title</h1>
67
+ <p>First paragraph...</p>
68
+ <p>Second paragraph...</p>
69
+ </div>
70
+ </body>
71
+ </html>
72
+ """
73
+
74
+ # Create document representation with 500 character chunks
75
+ doc = DomRepresentation(
76
+ MAX_NODE_REPR_LENGTH=500,
77
+ website_code=html_content,
78
+ repr_length_compared_by="html_length"
79
+ )
80
+
81
+ # Access HTML chunks
82
+ html_chunks = doc.render_system.html_render_roi
83
+ for chunk_id, chunk in html_chunks.items():
84
+ print(f"Chunk {chunk_id}:\n{chunk}\n{'='*50}")
85
+
86
+ # Access text chunks
87
+ text_chunks = doc.render_system.text_render_roi
88
+ for chunk_id, chunk in text_chunks.items():
89
+ print(f"Chunk {chunk_id}:\n{chunk}\n{'='*50}")
90
+ ```
91
+
92
+ ## Configuration
93
+
94
+ ### Key Parameters
95
+ - `MAX_NODE_REPR_LENGTH`: Maximum allowed length for each chunk (in characters)
96
+ - `repr_length_compared_by`: Length calculation method:
97
+ - `html_length`: HTML source length
98
+ - `text_length`: Rendered text length
99
+ - `website_code`: Input HTML content
100
+
101
+ ### Advanced Features
102
+ ```python
103
+ # Access the DOM tree structure
104
+ tree = doc.tree_representation.tree
105
+
106
+ # Get node metadata
107
+ for node in tree.all_nodes():
108
+ print(f"XPath: {node.identifier}")
109
+ print(f"Text length: {node.data.text_length}")
110
+ print(f"HTML length: {node.data.html_length}")
111
+
112
+ # Custom tag filtering (before processing)
113
+ from betterhtmlchunking.tree_representation import DOMTreeRepresentation
114
+ from betterhtmlchunking.utils import remove_unwanted_tags
115
+
116
+ # Example usage of remove_unwanted_tags:
117
+ tree_rep = DOMTreeRepresentation(website_code=html_content)
118
+ filtered_rep = remove_unwanted_tags(tree_rep, tag_list_to_filter_out=["/script", "/style"])
119
+ ```
120
+
121
+ ## How It Works
122
+
123
+ 1. **DOM Parsing**
124
+ - Builds a tree representation of the HTML document.
125
+ - Calculates metadata (text length, HTML length) for each node.
126
+
127
+ 2. **Region Detection**
128
+ - Uses **Breadth First Search (BFS)** to traverse the DOM tree in a level-order fashion, ensuring that each node is processed systematically.
129
+ - Combines nodes until the specified size limit is reached.
130
+ - Preserves parent-child relationships to maintain contextual integrity.
131
+
132
+ 3. **Chunk Generation**
133
+ - Creates HTML chunks with original markup.
134
+ - Generates parallel text-only chunks.
135
+ - Maintains chunk order based on document structure.
136
+
137
+ ## Comparison to popular Chunking Techniques
138
+
139
+ The actual practice (Feb. 2025) is to use **plain-text** or **token-based** chunking strategies, primarily aimed at keeping prompts within certain token limits for large language models. This approach is ideal for quick semantic retrieval or QA tasks on *unstructured* text.
140
+
141
+ By contrast, **betterhtmlchunking** preserves the **HTML DOM structure**, calculating chunk boundaries based on each node’s text or HTML length. This approach is especially useful when you want to:
142
+ - Retain or leverage the **hierarchical relationships** in the HTML (e.g., headings, nested divs)
143
+ - Filter out undesired tags or sections (like `<script>` or `<style>`)
144
+ - Pinpoint exactly where each chunk originated in the document (via positional XPaths)
145
+
146
+ You can even combine the two techniques if you need both **structured extraction** (via betterhtmlchunking) and **LLM-friendly text chunking** (via LangChain) for advanced tasks such as summarization, semantic search, or large-scale QA pipelines.
147
+
148
+ ## License
149
+
150
+ MIT License
151
+
152
+ ## Contributing
153
+ Feel free to open issues or submit pull requests if you have suggestions or improvements.
@@ -0,0 +1,11 @@
1
+ betterhtmlchunking/__init__.py,sha256=BBFsi9mQz1HccWlKF2jASNZxUK-z41f4gXPZY2bzfgE,78
2
+ betterhtmlchunking/main.py,sha256=EDHE_C15FnXpEXKjTc2hCl4Uoqmm46NnMg_ZhISFIjg,2921
3
+ betterhtmlchunking/render_system.py,sha256=CqL_WLNVqYV5UFqS0Q9pyV0Hb2pLwQBdUTxhUHqw5YU,3557
4
+ betterhtmlchunking/tree_regions_system.py,sha256=wcKbmWD340Ipz2sRtQiZx1DEmGOPMMGen1v1FMM4lpg,12861
5
+ betterhtmlchunking/tree_representation.py,sha256=qrcvVlLismL43csq4NDmnOz_rzLxn2Z5qJCk_C9mHxQ,5652
6
+ betterhtmlchunking/utils.py,sha256=FB_QiTKQFdpt5CvhuXcPcJr5Mq-CV__tcEsnYD-QiAI,861
7
+ betterhtmlchunking-0.9.dist-info/LICENSE,sha256=ng5atLTidZ1vzNYQp0LINoRma4D9cyw9ltbKTI8rvtE,1076
8
+ betterhtmlchunking-0.9.dist-info/METADATA,sha256=AGs8EnwBJvecSk9pSjIHKzfSSooV27R2PBGKqgCI35s,5311
9
+ betterhtmlchunking-0.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
10
+ betterhtmlchunking-0.9.dist-info/top_level.txt,sha256=tz4R-XL9TGFuEx5508yeFyoIS6m8l13O_EbYi91s5sM,19
11
+ betterhtmlchunking-0.9.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.8.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ betterhtmlchunking