sec2md 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sec2md might be problematic; see the package's registry page for more details.
- sec2md/__init__.py +24 -0
- sec2md/absolute_table_parser.py +622 -0
- sec2md/chunker/__init__.py +0 -0
- sec2md/chunker/markdown_blocks.py +116 -0
- sec2md/chunker/markdown_chunk.py +76 -0
- sec2md/chunker/markdown_chunker.py +234 -0
- sec2md/chunking.py +66 -0
- sec2md/core.py +93 -0
- sec2md/models.py +153 -0
- sec2md/parser.py +586 -0
- sec2md/section_extractor.py +316 -0
- sec2md/sections.py +104 -0
- sec2md/table_parser.py +386 -0
- sec2md/utils.py +109 -0
- sec2md-0.1.0.dist-info/METADATA +217 -0
- sec2md-0.1.0.dist-info/RECORD +19 -0
- sec2md-0.1.0.dist-info/WHEEL +5 -0
- sec2md-0.1.0.dist-info/licenses/LICENSE +21 -0
- sec2md-0.1.0.dist-info/top_level.txt +1 -0
sec2md/parser.py
ADDED
|
@@ -0,0 +1,586 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import logging
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from typing import List, Dict, Union, Optional
|
|
7
|
+
from bs4 import BeautifulSoup
|
|
8
|
+
from bs4.element import NavigableString, Tag
|
|
9
|
+
|
|
10
|
+
from sec2md.absolute_table_parser import AbsolutelyPositionedTableParser, median
|
|
11
|
+
from sec2md.table_parser import TableParser
|
|
12
|
+
from sec2md.models import Page
|
|
13
|
+
|
|
14
|
+
BLOCK_TAGS = {"div", "p", "h1", "h2", "h3", "h4", "h5", "h6", "table", "br", "hr", "ul", "ol", "li"}
|
|
15
|
+
BOLD_TAGS = {"b", "strong"}
|
|
16
|
+
ITALIC_TAGS = {"i", "em"}
|
|
17
|
+
|
|
18
|
+
_ws = re.compile(r"\s+")
|
|
19
|
+
_css_decl = re.compile(r"^[a-zA-Z\-]+\s*:\s*[^;]+;\s*$")
|
|
20
|
+
ITEM_HEADER_CELL_RE = re.compile(r"^\s*Item\s+([0-9IVX]+)\.\s*$", re.I)
|
|
21
|
+
PART_HEADER_CELL_RE = re.compile(r"^\s*Part\s+([IVX]+)\s*$", re.I)
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Parser:
    """Document parser with support for regular tables and pseudo-tables."""

    def __init__(self, content: str):
        # lxml backend: fast and lenient with real-world (SEC filing) HTML.
        self.soup = BeautifulSoup(content, "lxml")
        # Flipped to True the first time a table is rendered during parsing.
        self.includes_table = False
        # Page number -> list of markdown fragments accumulated for that page.
        self.pages: Dict[int, List[str]] = defaultdict(list)
        # Raw input text length; used by the content-loss watchdog in get_pages().
        self.input_char_count = len(self.soup.get_text())
|
|
34
|
+
|
|
35
|
+
@staticmethod
def _is_bold(el: Tag) -> bool:
    """Return True when *el* renders bold via tag name or inline style.

    Fix: normalize spaces out of the style string (as the other
    style-sniffing helpers in this class already do) so declarations
    written as ``font-weight: bold`` / ``font-weight: 700`` are also
    recognized, not only the space-free forms.
    """
    if not isinstance(el, Tag):
        return False
    style = (el.get("style") or "").lower().replace(" ", "")
    return (
        "font-weight:700" in style
        or "font-weight:bold" in style
        or el.name in BOLD_TAGS
    )
|
|
45
|
+
|
|
46
|
+
@staticmethod
def _is_italic(el: Tag) -> bool:
    """Return True when *el* renders italic via tag name or inline style.

    Fix: normalize spaces out of the style string (consistent with the
    other style-sniffing helpers) so ``font-style: italic`` with a space
    after the colon is also detected.
    """
    if not isinstance(el, Tag):
        return False
    style = (el.get("style") or "").lower().replace(" ", "")
    return (
        "font-style:italic" in style
        or el.name in ITALIC_TAGS
    )
|
|
55
|
+
|
|
56
|
+
@staticmethod
def _is_block(el: Tag) -> bool:
    """True when *el* is a Tag whose name is one of the block-level tags."""
    if not isinstance(el, Tag):
        return False
    return el.name in BLOCK_TAGS
|
|
59
|
+
|
|
60
|
+
@staticmethod
def _is_absolutely_positioned(el: Tag) -> bool:
    """True when *el* carries an inline ``position:absolute`` style."""
    if isinstance(el, Tag):
        css = (el.get("style") or "").lower().replace(" ", "")
        return "position:absolute" in css
    return False
|
|
67
|
+
|
|
68
|
+
@staticmethod
def _is_inline_display(el: Tag) -> bool:
    """Check if element has an inline display mode (inline / inline-block).

    Fix: the previous test for plain ``display:inline`` required a
    trailing semicolon, so a style where it was the last declaration
    (``style="display:inline"``) was missed.  A bare substring test
    covers both ``inline`` and ``inline-block`` (and other ``inline-*``
    variants, which are also inline-level).
    """
    if not isinstance(el, Tag):
        return False
    style = (el.get("style") or "").lower().replace(" ", "")
    return "display:inline" in style
|
|
75
|
+
|
|
76
|
+
@staticmethod
def _has_break_before(el: Tag) -> bool:
    """True when *el*'s inline style forces a page break before it."""
    if not isinstance(el, Tag):
        return False
    css = (el.get("style") or "").lower().replace(" ", "")
    # Legacy and modern CSS spellings of "start a new page here".
    markers = ("page-break-before:always", "break-before:page", "break-before:always")
    return any(m in css for m in markers)
|
|
86
|
+
|
|
87
|
+
@staticmethod
def _has_break_after(el: Tag) -> bool:
    """True when *el*'s inline style forces a page break after it."""
    if not isinstance(el, Tag):
        return False
    css = (el.get("style") or "").lower().replace(" ", "")
    # Legacy and modern CSS spellings of "start a new page after this".
    markers = ("page-break-after:always", "break-after:page", "break-after:always")
    return any(m in css for m in markers)
|
|
97
|
+
|
|
98
|
+
@staticmethod
def _is_hidden(el: Tag) -> bool:
    """True when the element is removed from layout via ``display:none``."""
    if not isinstance(el, Tag):
        return False
    return "display:none" in (el.get("style") or "").lower().replace(" ", "")
|
|
105
|
+
|
|
106
|
+
@staticmethod
|
|
107
|
+
def _clean_text(text: str) -> str:
|
|
108
|
+
# Remove zero-width spaces, BOM, normalize NBSP
|
|
109
|
+
text = text.replace("\u200b", "").replace("\ufeff", "").replace("\xa0", " ")
|
|
110
|
+
return _ws.sub(" ", text).strip()
|
|
111
|
+
|
|
112
|
+
@staticmethod
def _wrap_markdown(el: Tag) -> str:
    """Return the markdown emphasis marker for this element:
    '***' for bold+italic, '**' for bold, '*' for italic, '' otherwise."""
    marker = ""
    if Parser._is_bold(el):
        marker += "**"
    if Parser._is_italic(el):
        marker += "*"
    return marker
|
|
124
|
+
|
|
125
|
+
def _append(self, page_num: int, s: str) -> None:
|
|
126
|
+
if s:
|
|
127
|
+
self.pages[page_num].append(s)
|
|
128
|
+
|
|
129
|
+
def _blankline_before(self, page_num: int) -> None:
|
|
130
|
+
"""Ensure exactly one blank line before the next block."""
|
|
131
|
+
buf = self.pages[page_num]
|
|
132
|
+
if not buf:
|
|
133
|
+
return
|
|
134
|
+
if not buf[-1].endswith("\n"):
|
|
135
|
+
buf.append("\n")
|
|
136
|
+
if len(buf) >= 2 and buf[-1] == "\n" and buf[-2] == "\n":
|
|
137
|
+
return
|
|
138
|
+
buf.append("\n")
|
|
139
|
+
|
|
140
|
+
def _blankline_after(self, page_num: int) -> None:
|
|
141
|
+
"""Mirror `_blankline_before` for symmetry; same rule."""
|
|
142
|
+
self._blankline_before(page_num)
|
|
143
|
+
|
|
144
|
+
def _process_text_node(self, node: NavigableString) -> str:
|
|
145
|
+
text = self._clean_text(str(node))
|
|
146
|
+
if text and _css_decl.match(text):
|
|
147
|
+
return ""
|
|
148
|
+
return text
|
|
149
|
+
|
|
150
|
+
def _process_element(self, element: Union[Tag, NavigableString]) -> str:
    """Recursively render a DOM node to markdown text.

    Tables are delegated to TableParser (single-row "banner" tables are
    flattened to plain text instead), ul/ol become markdown lists, and
    bold/italic styling is wrapped in markdown emphasis markers.
    """
    if isinstance(element, NavigableString):
        return self._process_text_node(element)

    if element.name == "table":
        # Use effective (non-empty) rows for the decision
        eff_rows = self._effective_rows(element)
        if len(eff_rows) <= 1:
            # Flatten single-row "header tables" like Item/Part banners
            cells = eff_rows[0] if eff_rows else []
            text = self._one_row_table_to_text(cells)
            return text

        self.includes_table = True
        return TableParser(element).md().strip()

    if element.name in {"ul", "ol"}:
        items = []
        # Only direct <li> children; nested lists are handled by recursion.
        for li in element.find_all("li", recursive=False):
            item_text = self._process_element(li).strip()
            if item_text:
                # Strip literal bullet glyphs; markdown supplies its own.
                item_text = item_text.lstrip("•·∙◦▪▫-").strip()
                items.append(item_text)
        if not items:
            return ""
        if element.name == "ol":
            return "\n".join(f"{i + 1}. {t}" for i, t in enumerate(items))
        else:
            return "\n".join(f"- {t}" for t in items)

    if element.name == "li":
        parts = [self._process_element(c) for c in element.children]
        return " ".join(p for p in parts if p).strip()

    # Generic element: render children and join with single spaces.
    parts: List[str] = []
    for child in element.children:
        if isinstance(child, NavigableString):
            t = self._process_text_node(child)
            if t:
                parts.append(t)
        else:
            t = self._process_element(child)
            if t:
                parts.append(t)

    text = " ".join(p for p in parts if p).strip()
    if not text:
        return ""

    # Wrap in bold/italic markers derived from the element's tag/style.
    wrap = self._wrap_markdown(element)
    return f"{wrap}{text}{wrap}" if wrap else text
|
|
201
|
+
|
|
202
|
+
def _extract_absolutely_positioned_children(self, container: Tag) -> List[Tag]:
    """Collect direct children that use ``position:absolute`` and carry text.

    Returns:
        List of absolutely positioned child elements (styling-only
        children with no text content are skipped).
    """
    return [
        child
        for child in container.children
        if isinstance(child, Tag)
        and self._is_absolutely_positioned(child)
        and child.get_text(strip=True)
    ]
|
|
216
|
+
|
|
217
|
+
def _compute_line_gaps(self, elements: List[Tag]) -> List[float]:
|
|
218
|
+
"""
|
|
219
|
+
Compute gaps between consecutive Y positions (line gaps).
|
|
220
|
+
|
|
221
|
+
Returns:
|
|
222
|
+
List of gap sizes in pixels
|
|
223
|
+
"""
|
|
224
|
+
y_positions = []
|
|
225
|
+
for el in elements:
|
|
226
|
+
style = el.get("style", "")
|
|
227
|
+
top_match = re.search(r'top:\s*(\d+(?:\.\d+)?)px', style)
|
|
228
|
+
if top_match:
|
|
229
|
+
y_positions.append(float(top_match.group(1)))
|
|
230
|
+
|
|
231
|
+
if len(y_positions) < 2:
|
|
232
|
+
return []
|
|
233
|
+
|
|
234
|
+
y_positions.sort()
|
|
235
|
+
gaps = [y_positions[i + 1] - y_positions[i] for i in range(len(y_positions) - 1)]
|
|
236
|
+
# Filter out very small gaps (same line) and very large gaps (section breaks)
|
|
237
|
+
gaps = [g for g in gaps if 5 < g < 100]
|
|
238
|
+
return gaps
|
|
239
|
+
|
|
240
|
+
def _split_positioned_groups(self, elements: List[Tag], gap_threshold: Optional[float] = None) -> List[List[Tag]]:
    """
    Split positioned elements into separate groups.
    Uses ADAPTIVE gap threshold based on document characteristics.

    Args:
        elements: List of absolutely positioned elements
        gap_threshold: Optional threshold in pixels (if None, computed adaptively)

    Returns:
        List of element groups
    """
    if not elements:
        return []

    # ADAPTIVE THRESHOLD: Learn from the document
    if gap_threshold is None:
        line_gaps = self._compute_line_gaps(elements)
        if line_gaps:
            median_gap = median(line_gaps)
            # Use 1.2x median line gap, capped at 30px
            gap_threshold = min(1.2 * median_gap, 30.0)
            logger.debug(f"Adaptive gap threshold: {gap_threshold:.1f}px (median line gap: {median_gap:.1f}px)")
        else:
            gap_threshold = 30.0  # Fallback

    # Extract Y coordinates
    element_positions = []
    for el in elements:
        style = el.get("style", "")
        top_match = re.search(r'top:\s*(\d+(?:\.\d+)?)px', style)
        if top_match:
            top = float(top_match.group(1))
            element_positions.append((top, el))

    if not element_positions:
        # No element exposes a parseable `top:` — treat all as one group.
        return [elements]

    # Sort by Y position
    element_positions.sort(key=lambda x: x[0])

    # Group by gaps
    groups = []
    current_group = [element_positions[0][1]]
    last_y = element_positions[0][0]

    for y, el in element_positions[1:]:
        gap = y - last_y
        if gap > gap_threshold:
            # Large gap - start new group
            if current_group:
                groups.append(current_group)
            current_group = [el]
        else:
            current_group.append(el)
        # last_y always advances, so gaps are measured between neighbors.
        last_y = y

    if current_group:
        groups.append(current_group)

    # Post-process: split groups that transition from multi-column to single-column
    final_groups = []
    for group in groups:
        split_groups = self._split_by_column_transition(group)
        final_groups.extend(split_groups)

    logger.debug(
        f"Split {len(elements)} elements into {len(final_groups)} groups (threshold: {gap_threshold:.1f}px)")
    return final_groups
|
|
309
|
+
|
|
310
|
+
def _split_by_column_transition(self, elements: List[Tag]) -> List[List[Tag]]:
    """
    Split a group if it transitions from multi-column (table) to single-column (prose).

    This handles cases where a table is followed immediately by paragraph text
    without a large Y-gap between them.

    Args:
        elements: List of elements in a group

    Returns:
        List of split groups (or original group if no transition found)
    """
    # Too few elements to establish a table-then-prose pattern reliably.
    if len(elements) < 6:
        return [elements]

    # Extract X, Y positions for all elements
    element_data = []
    for el in elements:
        style = el.get("style", "")
        left_match = re.search(r'left:\s*(\d+(?:\.\d+)?)px', style)
        top_match = re.search(r'top:\s*(\d+(?:\.\d+)?)px', style)
        if left_match and top_match:
            left = float(left_match.group(1))
            top = float(top_match.group(1))
            element_data.append((left, top, el))

    if not element_data:
        return [elements]

    # Sort by Y position
    element_data.sort(key=lambda x: x[1])

    # Group into rows by Y position (15px tolerance)
    rows = []
    current_row = [element_data[0]]
    last_y = element_data[0][1]

    for left, top, el in element_data[1:]:
        if abs(top - last_y) <= 15:
            # NOTE: last_y is deliberately NOT updated here, so each row is
            # anchored at its first element's Y and rows cannot drift downward.
            current_row.append((left, top, el))
        else:
            rows.append(current_row)
            current_row = [(left, top, el)]
            last_y = top

    if current_row:
        rows.append(current_row)

    # Count unique X positions per row
    def count_columns(row):
        x_positions = set(left for left, _, _ in row)
        return len(x_positions)

    # Find transition point from multi-column to single-column
    split_point = None
    for i in range(len(rows) - 3):  # Need at least 3 rows after split
        current_cols = count_columns(rows[i])
        next_cols = count_columns(rows[i + 1])

        # Transition from 2+ columns to 1 column
        if current_cols >= 2 and next_cols == 1:
            # Check if next 2-3 rows are also single-column (confirms prose pattern)
            following_single = sum(1 for j in range(i + 1, min(i + 4, len(rows)))
                                   if count_columns(rows[j]) == 1)
            if following_single >= 2:
                split_point = i + 1
                logger.debug(f"Column transition detected at row {i + 1} ({current_cols} cols -> {next_cols} col)")
                break

    if split_point is None:
        return [elements]

    # Split at the transition point
    split_y = rows[split_point][0][1]  # Y coordinate of first element in transition row

    # Partition the original elements by Y, not by row membership, so ties
    # on the boundary Y always land in the second group.
    group1 = [el for left, top, el in element_data if top < split_y]
    group2 = [el for left, top, el in element_data if top >= split_y]

    result = []
    if group1:
        result.append(group1)
    if group2:
        result.append(group2)

    return result if result else [elements]
|
|
396
|
+
|
|
397
|
+
def _process_absolutely_positioned_container(self, container: Tag, page_num: int) -> int:
    """
    Handle containers with absolutely positioned children.

    Step 1: Extract absolutely positioned elements
    Step 2: Split into separate groups by Y-coordinate gaps AND column transitions
    Step 3: Process each group independently (table or text)

    Args:
        container: The container element
        page_num: Current page number

    Returns:
        Updated page number
    """
    # Extract positioned children
    positioned_children = self._extract_absolutely_positioned_children(container)

    if not positioned_children:
        # No positioned children, process normally
        current = page_num
        for child in container.children:
            current = self._stream_pages(child, current)
        return current

    # Split into separate groups (adaptive threshold + column transition detection)
    groups = self._split_positioned_groups(positioned_children)

    # Process each group independently
    for i, group in enumerate(groups):
        table_parser = AbsolutelyPositionedTableParser(group)

        if table_parser.is_table_like():
            # It's a table! Render as markdown table
            self.includes_table = True
            markdown_table = table_parser.to_markdown()
            if markdown_table:
                self._append(page_num, markdown_table)
                self._blankline_after(page_num)
        else:
            # Not a table - group by visual lines and render as text
            text = table_parser.to_text()
            if text:
                if i > 0:
                    # Separate this text group from the previous group's output.
                    self._blankline_before(page_num)
                self._append(page_num, text)

    # NOTE(review): positioned groups never advance page_num here — the code
    # appears to assume an absolutely positioned layout stays on one page.
    return page_num
|
|
445
|
+
|
|
446
|
+
def _stream_pages(self, root: Union[Tag, NavigableString], page_num: int = 1) -> int:
    """Walk the DOM once; split only on CSS break styles.

    Appends rendered fragments to ``self.pages`` and returns the page
    number in effect after processing *root* (incremented by break-before
    and break-after styles encountered along the way).
    """
    if isinstance(root, Tag) and self._has_break_before(root):
        page_num += 1

    if isinstance(root, NavigableString):
        t = self._process_text_node(root)
        if t:
            # Trailing space keeps adjacent inline fragments word-separated.
            self._append(page_num, t + " ")
        return page_num

    if not isinstance(root, Tag):
        return page_num

    if self._is_hidden(root):
        # display:none subtrees contribute nothing.
        return page_num

    # Check if this is a container with absolutely positioned children
    is_absolutely_positioned = self._is_absolutely_positioned(root)
    has_positioned_children = not is_absolutely_positioned and any(
        isinstance(child, Tag) and self._is_absolutely_positioned(child)
        for child in root.children
    )

    if has_positioned_children and root.name == "div":
        # Special handling for absolutely positioned layouts
        current = self._process_absolutely_positioned_container(root, page_num)
        if self._has_break_after(root):
            current += 1
        return current

    # Inline-display elements should not trigger blocks
    is_inline_display = self._is_inline_display(root)
    # br/hr are in BLOCK_TAGS but never open a block of their own.
    is_block = self._is_block(root) and root.name not in {"br",
                                                          "hr"} and not is_inline_display and not is_absolutely_positioned

    if is_block:
        self._blankline_before(page_num)

    # Handle tables and lists atomically
    if root.name in {"table", "ul", "ol"}:
        t = self._process_element(root)
        if t:
            self._append(page_num, t)
            self._blankline_after(page_num)
        if self._has_break_after(root):
            page_num += 1
        return page_num

    # For inline wrappers (bold/italic), render atomically
    wrap = self._wrap_markdown(root)
    if wrap and not is_block:
        t = self._process_element(root)
        if t:
            self._append(page_num, t + " ")
        if self._has_break_after(root):
            page_num += 1
        return page_num

    # Stream children for block elements
    current = page_num
    for child in root.children:
        current = self._stream_pages(child, current)

    if is_block:
        self._blankline_after(current)

    if self._has_break_after(root):
        current += 1

    return current
|
|
517
|
+
|
|
518
|
+
def get_pages(self) -> List[Page]:
    """Parse the document and return it as a list of Page objects.

    Resets parser state, streams the DOM into per-page markdown buffers,
    normalizes whitespace, and runs a content-loss watchdog that compares
    output size against the raw input text.
    """
    self.pages = defaultdict(list)
    self.includes_table = False
    root = self.soup.body if self.soup.body else self.soup
    self._stream_pages(root, page_num=1)

    result: List[Page] = []
    for page_num in sorted(self.pages.keys()):
        raw = "".join(self.pages[page_num])

        # Collapse excessive newlines
        raw = re.sub(r"\n{3,}", "\n\n", raw)

        # Strip per-line whitespace; allow at most one blank line in a row.
        lines: List[str] = []
        for line in raw.split("\n"):
            line = line.strip()
            if line or (lines and lines[-1]):
                lines.append(line)
        content = "\n".join(lines).strip()

        result.append(Page(number=page_num, content=content))

    # CONTENT-LOSS WATCHDOG
    # Fix: the warnings below were commented out and replaced with `pass`,
    # which silently swallowed the very signal the watchdog exists to emit.
    total_output_chars = sum(len(p.content) for p in result)
    if self.input_char_count > 0:
        retention_ratio = total_output_chars / self.input_char_count
        if retention_ratio < 0.95:
            # Lazy %-style args so formatting only happens when emitted.
            logger.warning(
                "Content loss detected: %.1f%% of input lost (input: %d chars, output: %d chars)",
                100 * (1 - retention_ratio),
                self.input_char_count,
                total_output_chars,
            )
        else:
            logger.debug(f"✓ Content retention: {100 * retention_ratio:.1f}%")

    return result
|
|
553
|
+
|
|
554
|
+
def _effective_rows(self, table: Tag) -> list[list[Tag]]:
    """Return rows that have at least one non-empty td/th."""
    kept: list[list[Tag]] = []
    for tr in table.find_all('tr', recursive=True):
        # Prefer direct cells; fall back to nested ones for odd markup.
        cells = tr.find_all(['td', 'th'], recursive=False) or tr.find_all(['td', 'th'], recursive=True)
        # Keep the row only when some cell has visible text after cleanup.
        if any(self._clean_text(c.get_text(" ", strip=True)) for c in cells):
            kept.append(cells)
    return kept
|
|
563
|
+
|
|
564
|
+
def _one_row_table_to_text(self, cells: list[Tag]) -> str:
    """Flatten a 1-row table to plain text; upgrade to header when possible."""
    texts = [self._clean_text(c.get_text(" ", strip=True)) for c in cells]
    if not texts:
        return ""

    first = texts[0]

    item_match = ITEM_HEADER_CELL_RE.match(first)
    if item_match:
        # "Item 7." banner: promote to an "ITEM 7. <title>" heading,
        # pulling the title from the first non-empty remaining cell.
        title = next((t for t in texts[1:] if t), "")
        return f"ITEM {item_match.group(1).upper()}. {title}".strip()

    part_match = PART_HEADER_CELL_RE.match(first)
    if part_match:
        return f"PART {part_match.group(1).upper()}"

    # generic flatten (avoid markdown pipes which might be misread later)
    return " ".join(t for t in texts if t).strip()
|
|
582
|
+
|
|
583
|
+
def markdown(self) -> str:
    """Get full document as markdown string."""
    rendered = [page.content for page in self.get_pages() if page.content]
    return "\n\n".join(rendered)
|