sec2md 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sec2md might be problematic. Click here for more details.
- sec2md/__init__.py +36 -0
- sec2md/absolute_table_parser.py +622 -0
- sec2md/chunker/__init__.py +0 -0
- sec2md/chunker/markdown_blocks.py +135 -0
- sec2md/chunker/markdown_chunk.py +133 -0
- sec2md/chunker/markdown_chunker.py +270 -0
- sec2md/chunking.py +179 -0
- sec2md/core.py +93 -0
- sec2md/models.py +400 -0
- sec2md/parser.py +1217 -0
- sec2md/section_extractor.py +623 -0
- sec2md/sections.py +84 -0
- sec2md/table_parser.py +386 -0
- sec2md/utils.py +109 -0
- sec2md-0.1.5.dist-info/METADATA +216 -0
- sec2md-0.1.5.dist-info/RECORD +19 -0
- sec2md-0.1.5.dist-info/WHEEL +5 -0
- sec2md-0.1.5.dist-info/licenses/LICENSE +21 -0
- sec2md-0.1.5.dist-info/top_level.txt +1 -0
sec2md/parser.py
ADDED
|
@@ -0,0 +1,1217 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import logging
|
|
5
|
+
import hashlib
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import List, Dict, Union, Optional, Tuple
|
|
9
|
+
from bs4 import BeautifulSoup
|
|
10
|
+
from bs4.element import NavigableString, Tag
|
|
11
|
+
|
|
12
|
+
from sec2md.absolute_table_parser import AbsolutelyPositionedTableParser, median
|
|
13
|
+
from sec2md.table_parser import TableParser
|
|
14
|
+
from sec2md.models import Page, Element
|
|
15
|
+
|
|
16
|
+
# HTML tags treated as block-level during the DOM walk (trigger blank-line separation).
BLOCK_TAGS = {"div", "p", "h1", "h2", "h3", "h4", "h5", "h6", "table", "br", "hr", "ul", "ol", "li"}
# Tags that imply bold / italic emphasis in addition to inline CSS.
BOLD_TAGS = {"b", "strong"}
ITALIC_TAGS = {"i", "em"}

# Collapses any whitespace run to a single space.
_ws = re.compile(r"\s+")
# Matches a bare CSS declaration (e.g. "font-size: 10px;") that leaked into text content.
_css_decl = re.compile(r"^[a-zA-Z\-]+\s*:\s*[^;]+;\s*$")
# Single-cell "Item 7." / "Part II" banner patterns used to flatten 1-row header tables.
ITEM_HEADER_CELL_RE = re.compile(r"^\s*Item\s+([0-9IVX]+)\.\s*$", re.I)
PART_HEADER_CELL_RE = re.compile(r"^\s*Part\s+([IVX]+)\s*$", re.I)

logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class TextBlockInfo:
    """Tracks XBRL TextBlock context during parsing.

    Carried alongside emitted page segments so downstream consumers can
    attribute markdown output to the XBRL note it came from.
    """
    name: str  # XBRL concept name, e.g. "us-gaap:DebtDisclosureTextBlock"
    title: Optional[str] = None  # human-readable note title, e.g. "Note 9 – Debt"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Parser:
|
|
36
|
+
"""Document parser with support for regular tables and pseudo-tables."""
|
|
37
|
+
|
|
38
|
+
def __init__(self, content: str):
    """Build the parser over raw HTML *content* (parsed with lxml).

    Args:
        content: Full HTML document text (e.g. an SEC filing).
    """
    self.soup = BeautifulSoup(content, "lxml")
    self.includes_table = False
    # Markdown fragments accumulated per page number.
    self.pages: Dict[int, List[str]] = defaultdict(list)
    # Track DOM provenance and TextBlock: (content, source_node, text_block_info)
    self.page_segments: Dict[int, List[Tuple[str, Optional[Tag], Optional[TextBlockInfo]]]] = defaultdict(list)
    # Baseline character count used by the content-loss watchdog in get_pages().
    self.input_char_count = len(self.soup.get_text())

    # Track current TextBlock context
    self.current_text_block: Optional[TextBlockInfo] = None
    # Map continuation IDs to TextBlock context
    self.continuation_map: Dict[str, TextBlockInfo] = {}
|
|
50
|
+
|
|
51
|
+
@staticmethod
def _is_text_block_tag(el: Tag) -> bool:
    """Check if element is an ix:nonnumeric with a note-level TextBlock name.

    Only tracks financial notes, not document metadata.
    Tracks: us-gaap:*, cyd:* (notes and disclosures)
    Ignores: dei:* (document metadata)
    """
    if not isinstance(el, Tag) or el.name not in ('ix:nonnumeric', 'nonnumeric'):
        return False
    name = el.get('name', '')
    if 'TextBlock' not in name:
        return False
    # Note-level concepts only (us-gaap, cyd); dei:* is document metadata.
    return name.startswith(('us-gaap:', 'cyd:'))
|
|
70
|
+
|
|
71
|
+
@staticmethod
def _find_text_block_tag_in_children(el: Tag) -> Optional[Tag]:
    """Find TextBlock tag in *el* or its descendants, at most 2 levels deep.

    The shallow search handles markup such as:
        <div>
          <span><ix:nonnumeric>Title</ix:nonnumeric></span>
          <div>Content</div>
        </div>

    Returns the matching tag, or None when nothing matches.
    """
    if not isinstance(el, Tag):
        return None
    # The element itself may be the TextBlock tag.
    if Parser._is_text_block_tag(el):
        return el
    for child in el.children:
        if not isinstance(child, Tag):
            continue
        if Parser._is_text_block_tag(child):
            return child
        # One more level: grandchildren.
        for grandchild in child.children:
            if isinstance(grandchild, Tag) and Parser._is_text_block_tag(grandchild):
                return grandchild
    return None
|
|
99
|
+
|
|
100
|
+
@staticmethod
def _extract_text_block_info(el: Tag) -> Optional[TextBlockInfo]:
    """Extract TextBlock name and title from an ix:nonnumeric tag.

    Since we only track outermost TextBlocks, these should have short titles
    (e.g., "Segment Information and Geographic Data") inside the tag.

    Args:
        el: Candidate ix:nonnumeric element.

    Returns:
        TextBlockInfo with the XBRL concept name and a human-readable title,
        or None when *el* is not a TextBlock tag.
    """
    if not isinstance(el, Tag):
        return None
    name = el.get('name', '')
    if not name or 'TextBlock' not in name:
        return None

    # Get text content from tag
    tag_text = el.get_text(strip=True) or ''

    # Use tag content if reasonable length (<200 chars is a note title);
    # otherwise derive a title from the XBRL concept name.
    if tag_text and len(tag_text) < 200:
        title = tag_text
    else:
        # Fallback: derive from the XBRL name, e.g.
        # "SegmentReportingDisclosureTextBlock" -> "Segment Reporting Disclosure".
        # Uses the module-level `re`; the previous function-local import was redundant.
        name_part = name.split(':')[-1].replace('TextBlock', '')
        # Insert spaces before capitals
        title = re.sub(r'([A-Z])', r' \1', name_part).strip()
        # Clean up double spaces
        title = re.sub(r'\s+', ' ', title)

    return TextBlockInfo(name=name, title=title)
|
|
131
|
+
|
|
132
|
+
@staticmethod
def _is_continuation_tag(el: Tag) -> bool:
    """Check if element is an ix:continuation tag."""
    return isinstance(el, Tag) and el.name in ('ix:continuation', 'continuation')
|
|
138
|
+
|
|
139
|
+
@staticmethod
def _is_bold(el: Tag) -> bool:
    """True when *el* renders bold (inline font-weight style or a bold tag).

    Spaces are stripped from the style string before matching, consistent
    with the other style predicates — otherwise "font-weight: bold" (with a
    space after the colon, the common form) would be missed.
    """
    if not isinstance(el, Tag):
        return False
    style = (el.get("style") or "").lower().replace(" ", "")
    return (
        "font-weight:700" in style
        or "font-weight:bold" in style
        or el.name in BOLD_TAGS
    )
|
|
149
|
+
|
|
150
|
+
@staticmethod
def _is_italic(el: Tag) -> bool:
    """True when *el* renders italic (inline font-style or an italic tag).

    Spaces are stripped from the style string before matching, consistent
    with the other style predicates — otherwise "font-style: italic" (with a
    space after the colon) would be missed.
    """
    if not isinstance(el, Tag):
        return False
    style = (el.get("style") or "").lower().replace(" ", "")
    return (
        "font-style:italic" in style
        or el.name in ITALIC_TAGS
    )
|
|
159
|
+
|
|
160
|
+
@staticmethod
def _is_block(el: Tag) -> bool:
    """True when *el* is a Tag whose name is one of BLOCK_TAGS."""
    if not isinstance(el, Tag):
        return False
    return el.name in BLOCK_TAGS
|
|
163
|
+
|
|
164
|
+
@staticmethod
def _is_absolutely_positioned(el: Tag) -> bool:
    """Check if element has position:absolute in its inline style."""
    if not isinstance(el, Tag):
        return False
    css = (el.get("style") or "").lower().replace(" ", "")
    return "position:absolute" in css
|
|
171
|
+
|
|
172
|
+
@staticmethod
def _is_inline_display(el: Tag) -> bool:
    """Check if element has display:inline or display:inline-block.

    Compares whole declarations (split on ';') rather than raw substrings,
    so a final "display:inline" with no trailing semicolon is also detected
    — the old `"display:inline;" in style` check missed that case.
    """
    if not isinstance(el, Tag):
        return False
    style = (el.get("style") or "").lower().replace(" ", "")
    decls = {d for d in style.split(";") if d}
    return "display:inline-block" in decls or "display:inline" in decls
|
|
179
|
+
|
|
180
|
+
@staticmethod
def _has_break_before(el: Tag) -> bool:
    """True when *el*'s inline style forces a page break before it."""
    if not isinstance(el, Tag):
        return False
    css = (el.get("style") or "").lower().replace(" ", "")
    markers = ("page-break-before:always", "break-before:page", "break-before:always")
    return any(m in css for m in markers)
|
|
190
|
+
|
|
191
|
+
@staticmethod
def _has_break_after(el: Tag) -> bool:
    """True when *el*'s inline style forces a page break after it."""
    if not isinstance(el, Tag):
        return False
    css = (el.get("style") or "").lower().replace(" ", "")
    markers = ("page-break-after:always", "break-after:page", "break-after:always")
    return any(m in css for m in markers)
|
|
201
|
+
|
|
202
|
+
@staticmethod
def _is_hidden(el: Tag) -> bool:
    """Check if element has display:none in its inline style."""
    if not isinstance(el, Tag):
        return False
    css = (el.get("style") or "").lower().replace(" ", "")
    return "display:none" in css
|
|
209
|
+
|
|
210
|
+
@staticmethod
def _clean_text(text: str) -> str:
    """Strip invisible characters and collapse whitespace runs to one space."""
    # Zero-width space and BOM are dropped; NBSP becomes a regular space.
    for src, dst in (("\u200b", ""), ("\ufeff", ""), ("\xa0", " ")):
        text = text.replace(src, dst)
    return _ws.sub(" ", text).strip()
|
|
215
|
+
|
|
216
|
+
@staticmethod
def _wrap_markdown(el: Tag) -> str:
    """Return the markdown emphasis marker ("***", "**", "*", or "") for *el*."""
    bold = Parser._is_bold(el)
    italic = Parser._is_italic(el)
    if bold:
        # Bold+italic -> "***"; bold only -> "**".
        return "***" if italic else "**"
    return "*" if italic else ""
|
|
228
|
+
|
|
229
|
+
def _append(self, page_num: int, s: str, source_node: Optional[Tag] = None, text_block: Optional[TextBlockInfo] = None) -> None:
    """Append markdown fragment *s* to a page, recording provenance.

    Empty fragments are ignored. When *text_block* is not given, the
    currently active TextBlock context is recorded instead.
    """
    if not s:
        return
    self.pages[page_num].append(s)
    tb = self.current_text_block if text_block is None else text_block
    self.page_segments[page_num].append((s, source_node, tb))
|
|
235
|
+
|
|
236
|
+
def _blankline_before(self, page_num: int) -> None:
    """Ensure exactly one blank line before the next block on *page_num*."""
    buf = self.pages[page_num]
    if not buf:
        # Nothing emitted yet; a leading blank line is unnecessary.
        return
    segs = self.page_segments[page_num]
    # First terminate the current line if it is still open.
    if not buf[-1].endswith("\n"):
        buf.append("\n")
        segs.append(("\n", None, self.current_text_block))
    # Already two trailing newlines -> blank line present, nothing to do.
    if buf[-1] == "\n" and len(buf) >= 2 and buf[-2] == "\n":
        return
    buf.append("\n")
    segs.append(("\n", None, self.current_text_block))
|
|
249
|
+
|
|
250
|
+
def _blankline_after(self, page_num: int) -> None:
    """Alias of `_blankline_before`; same rule, kept for call-site readability."""
    self._blankline_before(page_num)
|
|
253
|
+
|
|
254
|
+
def _process_text_node(self, node: NavigableString) -> str:
    """Clean a raw text node; drop stray CSS declarations that leak into text."""
    cleaned = self._clean_text(str(node))
    if cleaned and _css_decl.match(cleaned):
        # Looks like "prop: value;" — style residue, not document text.
        return ""
    return cleaned
|
|
259
|
+
|
|
260
|
+
def _process_element(self, element: Union[Tag, NavigableString]) -> str:
    """Recursively render *element* to a markdown string.

    Dispatch order: text nodes -> tables -> lists -> list items -> generic
    containers (children joined with spaces, then wrapped in emphasis markers
    when the element is bold/italic). Returns "" when nothing renders.
    """
    if isinstance(element, NavigableString):
        return self._process_text_node(element)

    if element.name == "table":
        # Use effective (non-empty) rows for the decision
        eff_rows = self._effective_rows(element)
        if len(eff_rows) <= 1:
            # Flatten single-row "header tables" like Item/Part banners
            cells = eff_rows[0] if eff_rows else []
            text = self._one_row_table_to_text(cells)
            return text

        self.includes_table = True
        return TableParser(element).md().strip()

    if element.name in {"ul", "ol"}:
        items = []
        for li in element.find_all("li", recursive=False):
            item_text = self._process_element(li).strip()
            if item_text:
                # Strip literal bullet glyphs so markdown markers aren't doubled.
                item_text = item_text.lstrip("•·∙◦▪▫-").strip()
                items.append(item_text)
        if not items:
            return ""
        if element.name == "ol":
            return "\n".join(f"{i + 1}. {t}" for i, t in enumerate(items))
        else:
            return "\n".join(f"- {t}" for t in items)

    if element.name == "li":
        parts = [self._process_element(c) for c in element.children]
        return " ".join(p for p in parts if p).strip()

    # Generic container: render children in order.
    parts: List[str] = []
    for child in element.children:
        if isinstance(child, NavigableString):
            t = self._process_text_node(child)
            if t:
                parts.append(t)
        else:
            t = self._process_element(child)
            if t:
                parts.append(t)

    text = " ".join(p for p in parts if p).strip()
    if not text:
        return ""

    # Wrap in bold/italic markers when the element carries emphasis styling.
    wrap = self._wrap_markdown(element)
    return f"{wrap}{text}{wrap}" if wrap else text
|
|
311
|
+
|
|
312
|
+
def _extract_absolutely_positioned_children(self, container: Tag) -> List[Tag]:
    """
    Extract all absolutely positioned children from a container.

    Children that carry no text (pure styling boxes) are skipped.

    Returns:
        List of absolutely positioned child elements
    """
    return [
        child
        for child in container.children
        if isinstance(child, Tag)
        and self._is_absolutely_positioned(child)
        and child.get_text(strip=True)
    ]
|
|
326
|
+
|
|
327
|
+
def _compute_line_gaps(self, elements: List[Tag]) -> List[float]:
    """
    Compute gaps between consecutive Y positions (line gaps).

    Returns:
        List of gap sizes in pixels, keeping only 5px < gap < 100px
        (drops same-line jitter and section breaks).
    """
    tops: List[float] = []
    for el in elements:
        m = re.search(r'top:\s*(\d+(?:\.\d+)?)px', el.get("style", ""))
        if m:
            tops.append(float(m.group(1)))

    if len(tops) < 2:
        return []

    tops.sort()
    # Pairwise differences between sorted Y positions.
    return [b - a for a, b in zip(tops, tops[1:]) if 5 < b - a < 100]
|
|
349
|
+
|
|
350
|
+
def _split_positioned_groups(self, elements: List[Tag], gap_threshold: Optional[float] = None) -> List[List[Tag]]:
    """
    Split positioned elements into separate groups.
    Uses ADAPTIVE gap threshold based on document characteristics.

    Args:
        elements: List of absolutely positioned elements
        gap_threshold: Optional threshold in pixels (if None, computed adaptively
            as 1.2x the median line gap, capped at 30px)

    Returns:
        List of element groups, ordered top-to-bottom. Elements whose style
        carries no parseable `top:` value fall back to a single group.
    """
    if not elements:
        return []

    # ADAPTIVE THRESHOLD: Learn from the document
    if gap_threshold is None:
        line_gaps = self._compute_line_gaps(elements)
        if line_gaps:
            median_gap = median(line_gaps)
            # Use 1.2x median line gap, capped at 30px
            gap_threshold = min(1.2 * median_gap, 30.0)
            logger.debug(f"Adaptive gap threshold: {gap_threshold:.1f}px (median line gap: {median_gap:.1f}px)")
        else:
            gap_threshold = 30.0  # Fallback

    # Extract Y coordinates
    element_positions = []
    for el in elements:
        style = el.get("style", "")
        top_match = re.search(r'top:\s*(\d+(?:\.\d+)?)px', style)
        if top_match:
            top = float(top_match.group(1))
            element_positions.append((top, el))

    if not element_positions:
        # No Y data at all: cannot split, treat everything as one group.
        return [elements]

    # Sort by Y position
    element_positions.sort(key=lambda x: x[0])

    # Group by gaps
    groups = []
    current_group = [element_positions[0][1]]
    last_y = element_positions[0][0]

    for y, el in element_positions[1:]:
        gap = y - last_y
        if gap > gap_threshold:
            # Large gap - start new group
            if current_group:
                groups.append(current_group)
            current_group = [el]
        else:
            current_group.append(el)
        last_y = y

    if current_group:
        groups.append(current_group)

    # Post-process: split groups that transition from multi-column to single-column
    final_groups = []
    for group in groups:
        split_groups = self._split_by_column_transition(group)
        final_groups.extend(split_groups)

    logger.debug(
        f"Split {len(elements)} elements into {len(final_groups)} groups (threshold: {gap_threshold:.1f}px)")
    return final_groups
|
|
419
|
+
|
|
420
|
+
def _split_by_column_transition(self, elements: List[Tag]) -> List[List[Tag]]:
    """
    Split a group if it transitions from multi-column (table) to single-column (prose).

    This handles cases where a table is followed immediately by paragraph text
    without a large Y-gap between them.

    Args:
        elements: List of elements in a group

    Returns:
        List of split groups (or original group if no transition found).
        Groups of fewer than 6 elements are returned unchanged — too small
        to establish a reliable column pattern.
    """
    if len(elements) < 6:
        return [elements]

    # Extract X, Y positions for all elements
    element_data = []
    for el in elements:
        style = el.get("style", "")
        left_match = re.search(r'left:\s*(\d+(?:\.\d+)?)px', style)
        top_match = re.search(r'top:\s*(\d+(?:\.\d+)?)px', style)
        if left_match and top_match:
            left = float(left_match.group(1))
            top = float(top_match.group(1))
            element_data.append((left, top, el))

    if not element_data:
        return [elements]

    # Sort by Y position
    element_data.sort(key=lambda x: x[1])

    # Group into rows by Y position (15px tolerance)
    rows = []
    current_row = [element_data[0]]
    last_y = element_data[0][1]

    for left, top, el in element_data[1:]:
        if abs(top - last_y) <= 15:
            current_row.append((left, top, el))
        else:
            rows.append(current_row)
            current_row = [(left, top, el)]
        last_y = top

    if current_row:
        rows.append(current_row)

    # Count unique X positions per row
    def count_columns(row):
        x_positions = set(left for left, _, _ in row)
        return len(x_positions)

    # Find transition point from multi-column to single-column
    split_point = None
    for i in range(len(rows) - 3):  # Need at least 3 rows after split
        current_cols = count_columns(rows[i])
        next_cols = count_columns(rows[i + 1])

        # Transition from 2+ columns to 1 column
        if current_cols >= 2 and next_cols == 1:
            # Check if next 2-3 rows are also single-column (confirms prose pattern)
            following_single = sum(1 for j in range(i + 1, min(i + 4, len(rows)))
                                   if count_columns(rows[j]) == 1)
            if following_single >= 2:
                split_point = i + 1
                logger.debug(f"Column transition detected at row {i + 1} ({current_cols} cols -> {next_cols} col)")
                break

    if split_point is None:
        return [elements]

    # Split at the transition point
    split_y = rows[split_point][0][1]  # Y coordinate of first element in transition row

    group1 = [el for left, top, el in element_data if top < split_y]
    group2 = [el for left, top, el in element_data if top >= split_y]

    result = []
    if group1:
        result.append(group1)
    if group2:
        result.append(group2)

    return result if result else [elements]
|
|
506
|
+
|
|
507
|
+
def _process_absolutely_positioned_container(self, container: Tag, page_num: int) -> int:
    """
    Handle containers with absolutely positioned children.

    Step 1: Extract absolutely positioned elements
    Step 2: Split into separate groups by Y-coordinate gaps AND column transitions
    Step 3: Process each group independently (table or text)

    Args:
        container: The container element
        page_num: Current page number

    Returns:
        Updated page number (advances only via the fallback normal-stream path;
        the positioned path always stays on *page_num*)
    """
    # Extract positioned children
    positioned_children = self._extract_absolutely_positioned_children(container)

    if not positioned_children:
        # No positioned children, process normally
        current = page_num
        for child in container.children:
            current = self._stream_pages(child, current)
        return current

    # Split into separate groups (adaptive threshold + column transition detection)
    groups = self._split_positioned_groups(positioned_children)

    # Process each group independently
    for i, group in enumerate(groups):
        table_parser = AbsolutelyPositionedTableParser(group)

        if table_parser.is_table_like():
            # It's a table! Render as markdown table
            self.includes_table = True
            markdown_table = table_parser.to_markdown()
            if markdown_table:
                # Use first element of group as source node
                self._append(page_num, markdown_table, source_node=group[0] if group else None)
                self._blankline_after(page_num)
        else:
            # Not a table - group by visual lines and render as text
            text = table_parser.to_text()
            if text:
                if i > 0:
                    self._blankline_before(page_num)
                # Use first element of group as source node
                self._append(page_num, text, source_node=group[0] if group else None)

    return page_num
|
|
557
|
+
|
|
558
|
+
def _stream_pages(self, root: Union[Tag, NavigableString], page_num: int = 1) -> int:
    """Walk the DOM once; split only on CSS break styles.

    Recursively emits markdown fragments into self.pages / self.page_segments,
    advancing the page counter on page-break-before/after styles. Also
    maintains self.current_text_block across ix:nonnumeric / ix:continuation
    tags, restoring (or clearing) it on every return path.

    Args:
        root: Current DOM node (tag or text node).
        page_num: Page the node starts on.

    Returns:
        The page number after processing this subtree.
    """
    if isinstance(root, Tag) and self._has_break_before(root):
        page_num += 1

    if isinstance(root, NavigableString):
        t = self._process_text_node(root)
        if t:
            # For text nodes, use parent as source
            parent = root.parent if isinstance(root.parent, Tag) else None
            self._append(page_num, t + " ", source_node=parent)
        return page_num

    if not isinstance(root, Tag):
        return page_num

    if self._is_hidden(root):
        # display:none subtrees contribute nothing.
        return page_num

    # Track XBRL TextBlock context (will be set later after determining if block)
    text_block_started = False
    text_block_has_continuation = False
    previous_text_block = self.current_text_block

    # Check if this is a continuation tag
    continuation_ends_text_block = False
    if self._is_continuation_tag(root):
        cont_id = root.get('id')
        if cont_id and cont_id in self.continuation_map:
            self.current_text_block = self.continuation_map[cont_id]
            text_block_started = True
            # Check if continues further
            continuedat = root.get('continuedat')
            if continuedat:
                text_block_has_continuation = True
                self.continuation_map[continuedat] = self.current_text_block
            else:
                # No continuedat: this continuation tag ENDS the TextBlock
                # We need to clear the context after processing this tag
                continuation_ends_text_block = True

    # Check if this is a container with absolutely positioned children
    is_absolutely_positioned = self._is_absolutely_positioned(root)
    has_positioned_children = not is_absolutely_positioned and any(
        isinstance(child, Tag) and self._is_absolutely_positioned(child)
        for child in root.children
    )

    if has_positioned_children and root.name == "div":
        # Special handling for absolutely positioned layouts
        current = self._process_absolutely_positioned_container(root, page_num)
        if self._has_break_after(root):
            current += 1

        # Restore TextBlock context before early return
        if text_block_started and not text_block_has_continuation:
            if continuation_ends_text_block:
                self.current_text_block = None
            else:
                self.current_text_block = previous_text_block

        return current

    # Inline-display elements should not trigger blocks
    is_inline_display = self._is_inline_display(root)
    is_block = self._is_block(root) and root.name not in {"br",
                                                          "hr"} and not is_inline_display and not is_absolutely_positioned

    # Check if this block element contains a TextBlock tag in its children
    # ALWAYS check block elements for new TextBlocks (not just when current_text_block is None)
    # This allows new notes to replace old ones across pages
    if is_block:
        tb_tag = self._find_text_block_tag_in_children(root)
        if tb_tag:
            tb_info = self._extract_text_block_info(tb_tag)
            if tb_info:
                # Only set if it's a DIFFERENT TextBlock (ignore nested duplicates)
                is_new_text_block = (
                    self.current_text_block is None or
                    self.current_text_block.name != tb_info.name
                )
                if is_new_text_block:
                    self.current_text_block = tb_info
                    text_block_started = True
                    # Check for continuedat attribute ON THE TAG ITSELF
                    continuedat = tb_tag.get('continuedat')
                    if continuedat:
                        text_block_has_continuation = True
                        self.continuation_map[continuedat] = tb_info

    if is_block:
        self._blankline_before(page_num)

    # Handle tables and lists atomically
    if root.name in {"table", "ul", "ol"}:
        t = self._process_element(root)
        if t:
            self._append(page_num, t, source_node=root)
            self._blankline_after(page_num)
        if self._has_break_after(root):
            page_num += 1

        # Restore TextBlock context before early return
        if text_block_started and not text_block_has_continuation:
            if continuation_ends_text_block:
                self.current_text_block = None
            else:
                self.current_text_block = previous_text_block

        return page_num

    # For inline wrappers (bold/italic), render atomically
    wrap = self._wrap_markdown(root)
    if wrap and not is_block:
        t = self._process_element(root)
        if t:
            self._append(page_num, t + " ", source_node=root)
        if self._has_break_after(root):
            page_num += 1

        # Restore TextBlock context before early return
        if text_block_started and not text_block_has_continuation:
            if continuation_ends_text_block:
                self.current_text_block = None
            else:
                self.current_text_block = previous_text_block

        return page_num

    # Stream children for block elements
    current = page_num
    for child in root.children:
        current = self._stream_pages(child, current)

    if is_block:
        self._blankline_after(current)

    if self._has_break_after(root):
        current += 1

    # Restore previous TextBlock context if we started a new one
    # This applies to:
    # 1. Block elements with new TextBlock tags (restore to previous)
    # 2. Continuation tags that END a TextBlock (clear to None)
    # (unless the TextBlock tag has continuedat, meaning it continues elsewhere)
    if text_block_started and not text_block_has_continuation:
        if continuation_ends_text_block:
            # Continuation tag with no continuedat ENDS the TextBlock
            self.current_text_block = None
        else:
            # New TextBlock tag restores to previous context
            self.current_text_block = previous_text_block

    return current
|
|
712
|
+
|
|
713
|
+
def get_pages(self, include_elements: bool = False) -> List[Page]:
    """Get parsed pages as Page objects.

    Args:
        include_elements: If True, include structured Element objects with each page

    Returns:
        List of Page objects, optionally with elements
    """
    # Parse content using existing stream parser (reset state so the method
    # is idempotent across repeated calls).
    self.pages = defaultdict(list)
    self.page_segments = defaultdict(list)
    self.includes_table = False
    root = self.soup.body if self.soup.body else self.soup
    self._stream_pages(root, page_num=1)

    result: List[Page] = []
    for page_num in sorted(self.pages.keys()):
        raw = "".join(self.pages[page_num])

        # Collapse excessive newlines
        raw = re.sub(r"\n{3,}", "\n\n", raw)

        # Strip per-line whitespace, allowing at most one consecutive blank line.
        lines: List[str] = []
        for line in raw.split("\n"):
            line = line.strip()
            if line or (lines and lines[-1]):
                lines.append(line)
        content = "\n".join(lines).strip()

        result.append(Page(number=page_num, content=content, elements=None))

    # CONTENT-LOSS WATCHDOG: warn when markdown output retains <95% of the
    # input text. Previously these warnings were commented out, making the
    # watchdog a silent no-op; losses went unnoticed.
    total_output_chars = sum(len(p.content) for p in result)
    if self.input_char_count > 0:
        retention_ratio = total_output_chars / self.input_char_count
        if retention_ratio < 0.95:
            logger.warning("Content loss detected: %.1f%% of input lost!",
                           100 * (1 - retention_ratio))
            logger.warning("Input: %d chars, Output: %d chars",
                           self.input_char_count, total_output_chars)
        else:
            logger.debug(f"✓ Content retention: {100 * retention_ratio:.1f}%")

    # If elements requested, process further
    if include_elements:
        result = self._add_elements_to_pages(result)

    return result
|
|
761
|
+
|
|
762
|
+
def _effective_rows(self, table: Tag) -> list[list[Tag]]:
    """Collect table rows that contain at least one cell with visible text."""
    kept: list[list[Tag]] = []
    for tr in table.find_all('tr', recursive=True):
        # Prefer direct children; fall back to any nested td/th if none exist.
        cells = tr.find_all(['td', 'th'], recursive=False)
        if not cells:
            cells = tr.find_all(['td', 'th'], recursive=True)
        # Keep the row only if some cell yields non-empty cleaned text.
        if any(self._clean_text(c.get_text(" ", strip=True)) for c in cells):
            kept.append(cells)
    return kept
def _one_row_table_to_text(self, cells: list[Tag]) -> str:
    """Collapse a single-row table into plain text, promoting ITEM/PART rows to headers."""
    texts = [self._clean_text(c.get_text(" ", strip=True)) for c in cells]
    if not texts:
        return ""

    lead = texts[0]

    # "Item N." style first cell: rebuild as an ITEM header with the first
    # non-empty remaining cell as the title.
    item_match = ITEM_HEADER_CELL_RE.match(lead)
    if item_match:
        title = next((t for t in texts[1:] if t), "")
        return f"ITEM {item_match.group(1).upper()}. {title}".strip()

    # "Part <roman>" style first cell: rebuild as a PART header.
    part_match = PART_HEADER_CELL_RE.match(lead)
    if part_match:
        return f"PART {part_match.group(1).upper()}"

    # Generic flatten: join non-empty cells with spaces (no markdown pipes,
    # which could be misread downstream).
    return " ".join(t for t in texts if t).strip()
def _add_elements_to_pages(self, pages: List[Page]) -> List[Page]:
    """Add structured elements and TextBlocks to pages.

    Steps:
        1. Group segments into blocks (split on double newlines)
        2. Collect source nodes and TextBlock info for each block
        3. Generate IDs and augment HTML
        4. Create Element and TextBlock objects
        5. Attach to pages

    Args:
        pages: Plain Page objects (content only) from get_pages().

    Returns:
        New Page objects with ``elements`` / ``text_blocks`` populated
        (``None`` when a page produced none).
    """
    # Imported here rather than at module level; presumably to avoid a
    # circular import with sec2md.models — TODO confirm.
    from sec2md.models import TextBlock

    # Build elements for each page, tracking their nodes and TextBlocks.
    # block_nodes_map spans ALL pages (keyed by element.id) so the HTML
    # augmentation pass below can find every element's DOM nodes.
    page_elements: Dict[int, List[Element]] = {}
    page_text_blocks: Dict[int, List[TextBlock]] = {}
    block_nodes_map: Dict[str, List[Tag]] = {}

    for page in pages:
        page_num = page.number
        segments = self.page_segments.get(page_num, [])

        if not segments:
            # Page had no captured segments: record empty results so the
            # final attachment loop still finds entries for it.
            page_elements[page_num] = []
            page_text_blocks[page_num] = []
            continue

        # Group segments into blocks (returns (Element, nodes, text_block) tuples)
        blocks_with_nodes = self._group_segments_into_blocks(segments, page_num)

        # Merge small blocks into larger semantic units (min 500 chars)
        merged_blocks = self._merge_small_blocks(blocks_with_nodes, page_num, min_chars=500)

        # Separate elements, nodes, and group element IDs by TextBlock name
        elements = []
        text_block_map: Dict[str, List[str]] = {}  # TextBlock name -> element IDs

        for element, nodes, text_block_info in merged_blocks:
            elements.append(element)
            block_nodes_map[element.id] = nodes

            # Track which TextBlock this element belongs to
            if text_block_info:
                tb_name = text_block_info.name
                if tb_name not in text_block_map:
                    text_block_map[tb_name] = []
                text_block_map[tb_name].append(element.id)

        page_elements[page_num] = elements

        # Create TextBlock objects with actual Element objects
        text_blocks = []
        # Group by name to get unique TextBlocks with their titles; first
        # occurrence wins, preserving document order.
        seen_names = {}
        for element, nodes, text_block_info in merged_blocks:
            if text_block_info and text_block_info.name not in seen_names:
                seen_names[text_block_info.name] = text_block_info

        # Build element ID -> Element map for resolving IDs back to objects
        element_map = {elem.id: elem for elem in elements}

        for tb_name, tb_info in seen_names.items():
            element_ids = text_block_map.get(tb_name, [])
            if element_ids:
                # Resolve IDs to actual Element objects (skip any stale IDs)
                tb_elements = [element_map[eid] for eid in element_ids if eid in element_map]
                text_blocks.append(TextBlock(
                    name=tb_name,
                    title=tb_info.title,
                    elements=tb_elements
                ))

        page_text_blocks[page_num] = text_blocks

    # Augment the parsed HTML in-place with anchor IDs / data attributes
    self._augment_html_with_ids(page_elements, block_nodes_map)

    # Attach elements and TextBlocks to fresh Page objects (originals are
    # not mutated); empty lists are normalized to None.
    result = []
    for page in pages:
        elements = page_elements.get(page.number, [])
        text_blocks = page_text_blocks.get(page.number, [])
        result.append(Page(
            number=page.number,
            content=page.content,
            elements=elements if elements else None,
            text_blocks=text_blocks if text_blocks else None
        ))

    return result
def _is_bold_header(self, element: Element) -> bool:
|
|
882
|
+
"""Check if element is a bold header (main section marker).
|
|
883
|
+
|
|
884
|
+
Bold headers start with ** and contain only a short title (< 50 chars typically).
|
|
885
|
+
Example: **Services**, **Competition**, **Markets and Distribution**
|
|
886
|
+
"""
|
|
887
|
+
content = element.content.strip()
|
|
888
|
+
|
|
889
|
+
# Check if content starts and ends with **
|
|
890
|
+
if not (content.startswith('**') and '**' in content[2:]):
|
|
891
|
+
return False
|
|
892
|
+
|
|
893
|
+
# Extract the bold part
|
|
894
|
+
first_line = content.split('\n')[0].strip()
|
|
895
|
+
|
|
896
|
+
# Bold headers are typically short and standalone
|
|
897
|
+
# If the first line is entirely bold and short, it's likely a header
|
|
898
|
+
if first_line.startswith('**') and first_line.endswith('**'):
|
|
899
|
+
bold_text = first_line[2:-2].strip()
|
|
900
|
+
# Bold headers are typically < 50 chars and don't contain much punctuation
|
|
901
|
+
if len(bold_text) < 50 and bold_text.count('.') <= 1:
|
|
902
|
+
return True
|
|
903
|
+
|
|
904
|
+
return False
|
|
905
|
+
|
|
906
|
+
def _merge_small_blocks(
    self,
    blocks_with_nodes: List[Tuple[Element, List[Tag], Optional[TextBlockInfo]]],
    page_num: int,
    min_chars: int = 500
) -> List[Tuple[Element, List[Tag], Optional[TextBlockInfo]]]:
    """Merge consecutive small blocks into larger semantic units.

    Rules:
        - Tables always stay separate (but may absorb a small preceding caption)
        - Bold headers (**text**) start new sections
        - TextBlock boundaries always flush
        - Merge consecutive blocks until min_chars threshold
        - Regenerate IDs for merged blocks

    Args:
        blocks_with_nodes: List of (Element, nodes, text_block) tuples
        page_num: Page number for ID generation
        min_chars: Minimum characters per block (default: 500)

    Returns:
        List of merged (Element, nodes, text_block) tuples
    """
    if not blocks_with_nodes:
        return []

    # Accumulator state for the block currently being built.
    merged = []
    current_elements = []
    current_nodes = []
    current_chars = 0
    current_text_block = None

    def flush(block_idx: int):
        # Emit the accumulated elements as a single merged Element and reset
        # the accumulators. Reads current_text_block without nonlocal (read-only).
        nonlocal current_elements, current_nodes, current_chars
        if not current_elements:
            return

        # Merge content from all elements, blank-line separated
        merged_content = '\n\n'.join(e.content for e in current_elements)

        # Infer kind: any table dominates; a header inside makes it a
        # "section"; otherwise inherit the first element's kind.
        kinds = [e.kind for e in current_elements]
        if 'table' in kinds:
            kind = 'table'
        elif 'header' in kinds:
            kind = 'section'
        else:
            kind = current_elements[0].kind

        # Generate new stable ID for the merged content
        block_id = self._generate_block_id(page_num, block_idx, merged_content, kind)

        merged_element = Element(
            id=block_id,
            content=merged_content,
            kind=kind,
            page_start=page_num,
            page_end=page_num
        )

        # Copy current_nodes so later mutation cannot affect emitted tuples
        merged.append((merged_element, list(current_nodes), current_text_block))
        current_elements = []
        current_nodes = []
        current_chars = 0

    for i, (element, nodes, text_block) in enumerate(blocks_with_nodes):
        # Check if TextBlock changed - this is a hard boundary, always flush
        text_block_changed = False
        if current_text_block is not None or text_block is not None:
            # Compare TextBlock names (None vs non-None always counts as a change)
            if current_text_block is None and text_block is not None:
                text_block_changed = True
            elif current_text_block is not None and text_block is None:
                text_block_changed = True
            elif current_text_block is not None and text_block is not None:
                text_block_changed = current_text_block.name != text_block.name

        if text_block_changed and current_elements:
            # Flush happens BEFORE updating current_text_block, so the emitted
            # block keeps the TextBlock it was accumulated under.
            flush(len(merged))

        # Update current TextBlock
        current_text_block = text_block

        # Tables are hard boundaries
        if element.kind == 'table':
            # If we have accumulated text AND it's small (< min_chars), merge it
            # with the table — this keeps captions/headers attached to their table.
            if current_elements and current_chars < min_chars:
                current_elements.append(element)
                current_nodes.extend([n for n in nodes if n not in current_nodes])
                flush(len(merged))
            else:
                # Flush current text, then emit the table as its own block
                flush(len(merged))
                merged.append((element, nodes, text_block))
            continue

        # Check if this is a bold header (main section break)
        is_bold_header = self._is_bold_header(element)

        # If we hit a bold header and have content, flush before starting new section
        # BUT: never flush if current content is ONLY headers (headers need content)
        if is_bold_header and current_elements:
            # Check if ALL current elements are headers (no actual content)
            all_headers = all(self._is_bold_header(e) for e in current_elements)
            is_current_only_headers = all_headers and current_chars < 200

            if not is_current_only_headers:
                flush(len(merged))

        # Add to current merge group (dedupe nodes to avoid double-tagging)
        current_elements.append(element)
        current_nodes.extend([n for n in nodes if n not in current_nodes])
        current_chars += element.char_count

    # Decide whether to flush the accumulator now.
        # IMPORTANT: Never flush if current block is only headers - headers need content
        should_flush = False

        # Flush if we've hit min_chars (paragraph is big enough)
        if current_chars >= min_chars:
            should_flush = True

        # OR flush if next element is a bold header (section boundary)
        if i + 1 < len(blocks_with_nodes):
            next_element, _, _ = blocks_with_nodes[i + 1]
            if self._is_bold_header(next_element):
                should_flush = True

        # OR flush if we're at the end
        if i == len(blocks_with_nodes) - 1:
            should_flush = True

        if should_flush:
            # Check if ALL current elements are headers (no actual content)
            all_headers = all(self._is_bold_header(e) for e in current_elements)
            is_only_headers = all_headers and current_chars < 200

            if not is_only_headers:
                flush(len(merged))

    # Flush remaining (e.g. a trailing headers-only group that was held back)
    if current_elements:
        flush(len(merged))

    return merged
def _group_segments_into_blocks(self, segments: List[Tuple[str, Optional[Tag], Optional[TextBlockInfo]]], page_num: int) -> List[Tuple[Element, List[Tag], Optional[TextBlockInfo]]]:
    """Group sequential segments into semantic blocks.

    Segments are (text, source_node, text_block) tuples; a run of two
    consecutive "\\n" segments marks a block boundary.

    Returns:
        List of (Element, nodes, text_block) tuples
    """
    blocks = []
    current_block_segments = []
    current_block_nodes = []
    current_text_block = None
    block_idx = 0

    for content, node, text_block in segments:
        # Check if this is a block boundary (double newline)
        if content == "\n":
            # A newline following another newline => double newline boundary
            if current_block_segments and current_block_segments[-1] == "\n":
                # Double newline - flush current block
                if len(current_block_segments) > 1:  # Has content beyond the trailing newline
                    block = self._create_block(
                        current_block_segments[:-1],  # Exclude trailing newline
                        current_block_nodes,
                        page_num,
                        block_idx
                    )
                    if block:
                        blocks.append((block, list(current_block_nodes), current_text_block))
                        block_idx += 1
                # Reset accumulators even when nothing was emitted
                current_block_segments = []
                current_block_nodes = []
                current_text_block = None
                continue
            # A single "\n" falls through and is appended like normal content,
            # so the NEXT "\n" can detect the double-newline boundary.

        current_block_segments.append(content)
        # Dedupe nodes while preserving first-seen order
        if node is not None and node not in current_block_nodes:
            current_block_nodes.append(node)
        # Track TextBlock (use last non-None value)
        if text_block is not None:
            current_text_block = text_block

    # Flush remaining block
    if current_block_segments:
        # Remove trailing newlines so the block content ends cleanly
        while current_block_segments and current_block_segments[-1] == "\n":
            current_block_segments.pop()

        if current_block_segments:
            block = self._create_block(
                current_block_segments,
                current_block_nodes,
                page_num,
                block_idx
            )
            if block:
                blocks.append((block, list(current_block_nodes), current_text_block))

    return blocks
def _create_block(
    self,
    segments: List[str],
    nodes: List[Tag],
    page_num: int,
    block_idx: int
) -> Optional[Element]:
    """Build an Element from accumulated segments, or None if they are empty."""
    joined = "".join(segments).strip()
    if not joined:
        return None

    # Kind comes from the DOM nodes; the ID is content-addressed so it is
    # stable across parses of the same document.
    block_kind = self._infer_kind_from_nodes(nodes)
    return Element(
        id=self._generate_block_id(page_num, block_idx, joined, block_kind),
        content=joined,
        kind=block_kind,
        page_start=page_num,
        page_end=page_num,
    )
def _infer_kind_from_nodes(self, nodes: List[Tag]) -> str:
|
|
1139
|
+
"""Infer block kind from DOM nodes."""
|
|
1140
|
+
if not nodes:
|
|
1141
|
+
return "text"
|
|
1142
|
+
|
|
1143
|
+
# Check first meaningful node
|
|
1144
|
+
for node in nodes:
|
|
1145
|
+
if node.name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
|
|
1146
|
+
return "header"
|
|
1147
|
+
elif node.name == "table":
|
|
1148
|
+
return "table"
|
|
1149
|
+
elif node.name in {"ul", "ol"}:
|
|
1150
|
+
return "list"
|
|
1151
|
+
elif node.name == "p":
|
|
1152
|
+
return "paragraph"
|
|
1153
|
+
|
|
1154
|
+
return "text"
|
|
1155
|
+
|
|
1156
|
+
def _generate_block_id(self, page: int, idx: int, content: str, kind: str) -> str:
|
|
1157
|
+
"""Generate stable block ID using normalized content hash."""
|
|
1158
|
+
# Normalize: collapse whitespace for stable hashing
|
|
1159
|
+
normalized = re.sub(r'\s+', ' ', content.strip()).lower()
|
|
1160
|
+
hash_part = hashlib.sha1(normalized.encode('utf-8')).hexdigest()[:8]
|
|
1161
|
+
kind_prefix = kind[0] if kind else "b"
|
|
1162
|
+
return f"sec2md-p{page}-{kind_prefix}{idx}-{hash_part}"
|
|
1163
|
+
|
|
1164
|
+
def _augment_html_with_ids(self, page_elements: Dict[int, List[Element]], block_nodes_map: Dict[str, List[Tag]]) -> None:
    """Add id attributes and data-sec2md-block to DOM nodes.

    Mutates the parsed soup in place: each element's first node gets an
    ``id`` anchor (page anchor takes precedence on the first block of a
    page), and every node gets a ``data-sec2md-block`` attribute for
    client-side highlighting.

    Args:
        page_elements: Map of page_num -> List[Element]
        block_nodes_map: Map of element.id -> List[Tag] (the nodes for that element)
    """
    # Pages that already received their page-N anchor
    seen_pages = set()

    # Augment HTML, walking pages in order
    for page_num in sorted(page_elements.keys()):
        elements = page_elements[page_num]

        for i, element in enumerate(elements):
            nodes = block_nodes_map.get(element.id, [])
            if not nodes:
                # Element has no backing DOM nodes (nothing to tag)
                continue

            first_node = nodes[0]

            # Add page ID to first block on this page
            if page_num not in seen_pages:
                # Add page-N as an additional ID on the first node
                if 'id' in first_node.attrs:
                    # Node already has an ID, add page ID as a class instead
                    # (bs4 may return class as str or list; normalize to list)
                    existing_classes = first_node.get('class', [])
                    if isinstance(existing_classes, str):
                        existing_classes = existing_classes.split()
                    existing_classes.append(f"page-{page_num}")
                    first_node['class'] = existing_classes
                else:
                    first_node['id'] = f"page-{page_num}"
                seen_pages.add(page_num)

            # Add block ID to first node (if it doesn't already have page ID
            # or a pre-existing id — existing ids are never overwritten, so a
            # page's first element keeps only the page anchor)
            if 'id' not in first_node.attrs:
                first_node['id'] = element.id

            # Add data attribute to ALL of the element's nodes for highlighting
            for node in nodes:
                node['data-sec2md-block'] = element.id
def markdown(self) -> str:
    """Render the whole document as one markdown string (pages joined by blank lines)."""
    non_empty = [page.content for page in self.get_pages() if page.content]
    return "\n\n".join(non_empty)
def html(self) -> str:
    """Get the HTML with augmented anchors and data attributes.

    Note: Call get_pages(include_elements=True) first to augment the HTML.
    If called before get_pages(), returns the original unmodified HTML.
    """
    rendered = str(self.soup)
    return rendered