sec2md-0.1.5-py3-none-any.whl

This diff shows the content of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


sec2md/parser.py ADDED
@@ -0,0 +1,1217 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import logging
5
+ import hashlib
6
+ from collections import defaultdict
7
+ from dataclasses import dataclass
8
+ from typing import List, Dict, Union, Optional, Tuple
9
+ from bs4 import BeautifulSoup
10
+ from bs4.element import NavigableString, Tag
11
+
12
+ from sec2md.absolute_table_parser import AbsolutelyPositionedTableParser, median
13
+ from sec2md.table_parser import TableParser
14
+ from sec2md.models import Page, Element
15
+
16
+ BLOCK_TAGS = {"div", "p", "h1", "h2", "h3", "h4", "h5", "h6", "table", "br", "hr", "ul", "ol", "li"}
17
+ BOLD_TAGS = {"b", "strong"}
18
+ ITALIC_TAGS = {"i", "em"}
19
+
20
+ _ws = re.compile(r"\s+")
21
+ _css_decl = re.compile(r"^[a-zA-Z\-]+\s*:\s*[^;]+;\s*$")
22
+ ITEM_HEADER_CELL_RE = re.compile(r"^\s*Item\s+([0-9IVX]+)\.\s*$", re.I)
23
+ PART_HEADER_CELL_RE = re.compile(r"^\s*Part\s+([IVX]+)\s*$", re.I)
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ @dataclass
29
+ class TextBlockInfo:
30
+ """Tracks XBRL TextBlock context during parsing."""
31
+ name: str # e.g., "us-gaap:DebtDisclosureTextBlock"
32
+ title: Optional[str] = None # e.g., "Note 9 – Debt"
33
+
34
+
35
+ class Parser:
36
+ """Document parser with support for regular tables and pseudo-tables."""
37
+
38
+ def __init__(self, content: str):
39
+ self.soup = BeautifulSoup(content, "lxml")
40
+ self.includes_table = False
41
+ self.pages: Dict[int, List[str]] = defaultdict(list)
42
+ # Track DOM provenance and TextBlock: (content, source_node, text_block_info)
43
+ self.page_segments: Dict[int, List[Tuple[str, Optional[Tag], Optional[TextBlockInfo]]]] = defaultdict(list)
44
+ self.input_char_count = len(self.soup.get_text())
45
+
46
+ # Track current TextBlock context
47
+ self.current_text_block: Optional[TextBlockInfo] = None
48
+ # Map continuation IDs to TextBlock context
49
+ self.continuation_map: Dict[str, TextBlockInfo] = {}
50
+
51
+ @staticmethod
52
+ def _is_text_block_tag(el: Tag) -> bool:
53
+ """Check if element is an ix:nonnumeric with a note-level TextBlock name.
54
+
55
+ Only tracks financial notes, not document metadata.
56
+ Tracks: us-gaap:*, cyd:* (notes and disclosures)
57
+ Ignores: dei:* (document metadata)
58
+ """
59
+ if not isinstance(el, Tag):
60
+ return False
61
+ if el.name not in ('ix:nonnumeric', 'nonnumeric'):
62
+ return False
63
+ name = el.get('name', '')
64
+ if 'TextBlock' not in name:
65
+ return False
66
+
67
+ # Only track note-level TextBlocks (us-gaap, cyd)
68
+ # Ignore document metadata (dei)
69
+ return name.startswith('us-gaap:') or name.startswith('cyd:')
70
+
71
+ @staticmethod
72
+ def _find_text_block_tag_in_children(el: Tag) -> Optional[Tag]:
73
+ """Find TextBlock tag in children (search 2 levels deep).
74
+
75
+ Searches children and grandchildren to handle:
76
+ <div>
77
+ <span><ix:nonnumeric>Title</ix:nonnumeric></span>
78
+ <div>Content</div>
79
+ </div>
80
+ """
81
+ if not isinstance(el, Tag):
82
+ return None
83
+
84
+ # Check if current element is the TextBlock tag
85
+ if Parser._is_text_block_tag(el):
86
+ return el
87
+
88
+ # Check direct children
89
+ for child in el.children:
90
+ if isinstance(child, Tag):
91
+ if Parser._is_text_block_tag(child):
92
+ return child
93
+ # Check grandchildren (one more level)
94
+ for grandchild in child.children:
95
+ if isinstance(grandchild, Tag) and Parser._is_text_block_tag(grandchild):
96
+ return grandchild
97
+
98
+ return None
99
+
100
+ @staticmethod
101
+ def _extract_text_block_info(el: Tag) -> Optional[TextBlockInfo]:
102
+ """Extract TextBlock name and title from ix:nonnumeric tag.
103
+
104
+ Since we only track outermost TextBlocks, these should have short titles
105
+ (e.g., "Segment Information and Geographic Data") inside the tag.
106
+ """
107
+ if not isinstance(el, Tag):
108
+ return None
109
+ name = el.get('name', '')
110
+ if not name or 'TextBlock' not in name:
111
+ return None
112
+
113
+ # Get text content from tag
114
+ tag_text = el.get_text(strip=True) or ''
115
+
116
+ # Use the tag's text if it is reasonably short (< 200 chars suggests a note-level title)
117
+ # Otherwise derive from XBRL name
118
+ if tag_text and len(tag_text) < 200:
119
+ title = tag_text
120
+ else:
121
+ # Fallback: Derive from XBRL name
122
+ # "SegmentReportingDisclosureTextBlock" -> "Segment Reporting Disclosure"
123
+ import re
124
+ name_part = name.split(':')[-1].replace('TextBlock', '')
125
+ # Insert spaces before capitals
126
+ title = re.sub(r'([A-Z])', r' \1', name_part).strip()
127
+ # Clean up double spaces
128
+ title = re.sub(r'\s+', ' ', title)
129
+
130
+ return TextBlockInfo(name=name, title=title)
131
+
132
+ @staticmethod
133
+ def _is_continuation_tag(el: Tag) -> bool:
134
+ """Check if element is an ix:continuation tag."""
135
+ if not isinstance(el, Tag):
136
+ return False
137
+ return el.name in ('ix:continuation', 'continuation')
138
+
139
+ @staticmethod
140
+ def _is_bold(el: Tag) -> bool:
141
+ if not isinstance(el, Tag):
142
+ return False
143
+ style = (el.get("style") or "").lower()
144
+ return (
145
+ "font-weight:700" in style
146
+ or "font-weight:bold" in style
147
+ or el.name in BOLD_TAGS
148
+ )
149
+
150
+ @staticmethod
151
+ def _is_italic(el: Tag) -> bool:
152
+ if not isinstance(el, Tag):
153
+ return False
154
+ style = (el.get("style") or "").lower()
155
+ return (
156
+ "font-style:italic" in style
157
+ or el.name in ITALIC_TAGS
158
+ )
159
+
160
+ @staticmethod
161
+ def _is_block(el: Tag) -> bool:
162
+ return isinstance(el, Tag) and el.name in BLOCK_TAGS
163
+
164
+ @staticmethod
165
+ def _is_absolutely_positioned(el: Tag) -> bool:
166
+ """Check if element has position:absolute"""
167
+ if not isinstance(el, Tag):
168
+ return False
169
+ style = (el.get("style") or "").lower().replace(" ", "")
170
+ return "position:absolute" in style
171
+
172
+ @staticmethod
173
+ def _is_inline_display(el: Tag) -> bool:
174
+ """Check if element has display:inline or display:inline-block"""
175
+ if not isinstance(el, Tag):
176
+ return False
177
+ style = (el.get("style") or "").lower().replace(" ", "")
178
+ return "display:inline-block" in style or "display:inline;" in style
179
+
180
+ @staticmethod
181
+ def _has_break_before(el: Tag) -> bool:
182
+ if not isinstance(el, Tag):
183
+ return False
184
+ style = (el.get("style") or "").lower().replace(" ", "")
185
+ return (
186
+ "page-break-before:always" in style
187
+ or "break-before:page" in style
188
+ or "break-before:always" in style
189
+ )
190
+
191
+ @staticmethod
192
+ def _has_break_after(el: Tag) -> bool:
193
+ if not isinstance(el, Tag):
194
+ return False
195
+ style = (el.get("style") or "").lower().replace(" ", "")
196
+ return (
197
+ "page-break-after:always" in style
198
+ or "break-after:page" in style
199
+ or "break-after:always" in style
200
+ )
201
+
202
+ @staticmethod
203
+ def _is_hidden(el: Tag) -> bool:
204
+ """Check if element has display:none"""
205
+ if not isinstance(el, Tag):
206
+ return False
207
+ style = (el.get("style") or "").lower().replace(" ", "")
208
+ return "display:none" in style
209
+
210
+ @staticmethod
211
+ def _clean_text(text: str) -> str:
212
+ # Remove zero-width spaces, BOM, normalize NBSP
213
+ text = text.replace("\u200b", "").replace("\ufeff", "").replace("\xa0", " ")
214
+ return _ws.sub(" ", text).strip()
215
+
216
+ @staticmethod
217
+ def _wrap_markdown(el: Tag) -> str:
218
+ """Return the prefix/suffix markdown wrapper for this element."""
219
+ bold = Parser._is_bold(el)
220
+ italic = Parser._is_italic(el)
221
+ if bold and italic:
222
+ return "***"
223
+ if bold:
224
+ return "**"
225
+ if italic:
226
+ return "*"
227
+ return ""
228
+
229
+ def _append(self, page_num: int, s: str, source_node: Optional[Tag] = None, text_block: Optional[TextBlockInfo] = None) -> None:
230
+ if s:
231
+ self.pages[page_num].append(s)
232
+ # Use current_text_block if not explicitly provided
233
+ tb = text_block if text_block is not None else self.current_text_block
234
+ self.page_segments[page_num].append((s, source_node, tb))
235
+
236
+ def _blankline_before(self, page_num: int) -> None:
237
+ """Ensure exactly one blank line before the next block."""
238
+ buf = self.pages[page_num]
239
+ seg_buf = self.page_segments[page_num]
240
+ if not buf:
241
+ return
242
+ if not buf[-1].endswith("\n"):
243
+ buf.append("\n")
244
+ seg_buf.append(("\n", None, self.current_text_block))
245
+ if len(buf) >= 2 and buf[-1] == "\n" and buf[-2] == "\n":
246
+ return
247
+ buf.append("\n")
248
+ seg_buf.append(("\n", None, self.current_text_block))
249
+
250
+ def _blankline_after(self, page_num: int) -> None:
251
+ """Mirror `_blankline_before` for symmetry; same rule."""
252
+ self._blankline_before(page_num)
253
+
254
+ def _process_text_node(self, node: NavigableString) -> str:
255
+ text = self._clean_text(str(node))
256
+ if text and _css_decl.match(text):
257
+ return ""
258
+ return text
259
+
260
+ def _process_element(self, element: Union[Tag, NavigableString]) -> str:
261
+ if isinstance(element, NavigableString):
262
+ return self._process_text_node(element)
263
+
264
+ if element.name == "table":
265
+ # Use effective (non-empty) rows for the decision
266
+ eff_rows = self._effective_rows(element)
267
+ if len(eff_rows) <= 1:
268
+ # Flatten single-row "header tables" like Item/Part banners
269
+ cells = eff_rows[0] if eff_rows else []
270
+ text = self._one_row_table_to_text(cells)
271
+ return text
272
+
273
+ self.includes_table = True
274
+ return TableParser(element).md().strip()
275
+
276
+ if element.name in {"ul", "ol"}:
277
+ items = []
278
+ for li in element.find_all("li", recursive=False):
279
+ item_text = self._process_element(li).strip()
280
+ if item_text:
281
+ item_text = item_text.lstrip("•·∙◦▪▫-").strip()
282
+ items.append(item_text)
283
+ if not items:
284
+ return ""
285
+ if element.name == "ol":
286
+ return "\n".join(f"{i + 1}. {t}" for i, t in enumerate(items))
287
+ else:
288
+ return "\n".join(f"- {t}" for t in items)
289
+
290
+ if element.name == "li":
291
+ parts = [self._process_element(c) for c in element.children]
292
+ return " ".join(p for p in parts if p).strip()
293
+
294
+ parts: List[str] = []
295
+ for child in element.children:
296
+ if isinstance(child, NavigableString):
297
+ t = self._process_text_node(child)
298
+ if t:
299
+ parts.append(t)
300
+ else:
301
+ t = self._process_element(child)
302
+ if t:
303
+ parts.append(t)
304
+
305
+ text = " ".join(p for p in parts if p).strip()
306
+ if not text:
307
+ return ""
308
+
309
+ wrap = self._wrap_markdown(element)
310
+ return f"{wrap}{text}{wrap}" if wrap else text
311
+
312
+ def _extract_absolutely_positioned_children(self, container: Tag) -> List[Tag]:
313
+ """
314
+ Extract all absolutely positioned children from a container.
315
+
316
+ Returns:
317
+ List of absolutely positioned child elements
318
+ """
319
+ positioned_children = []
320
+ for child in container.children:
321
+ if isinstance(child, Tag) and self._is_absolutely_positioned(child):
322
+ # Skip elements that are just styling (no text content)
323
+ if child.get_text(strip=True):
324
+ positioned_children.append(child)
325
+ return positioned_children
326
+
327
+ def _compute_line_gaps(self, elements: List[Tag]) -> List[float]:
328
+ """
329
+ Compute gaps between consecutive Y positions (line gaps).
330
+
331
+ Returns:
332
+ List of gap sizes in pixels
333
+ """
334
+ y_positions = []
335
+ for el in elements:
336
+ style = el.get("style", "")
337
+ top_match = re.search(r'top:\s*(\d+(?:\.\d+)?)px', style)
338
+ if top_match:
339
+ y_positions.append(float(top_match.group(1)))
340
+
341
+ if len(y_positions) < 2:
342
+ return []
343
+
344
+ y_positions.sort()
345
+ gaps = [y_positions[i + 1] - y_positions[i] for i in range(len(y_positions) - 1)]
346
+ # Filter out very small gaps (same line) and very large gaps (section breaks)
347
+ gaps = [g for g in gaps if 5 < g < 100]
348
+ return gaps
349
+
350
+ def _split_positioned_groups(self, elements: List[Tag], gap_threshold: Optional[float] = None) -> List[List[Tag]]:
351
+ """
352
+ Split positioned elements into separate groups.
353
+ Uses ADAPTIVE gap threshold based on document characteristics.
354
+
355
+ Args:
356
+ elements: List of absolutely positioned elements
357
+ gap_threshold: Optional threshold in pixels (if None, computed adaptively)
358
+
359
+ Returns:
360
+ List of element groups
361
+ """
362
+ if not elements:
363
+ return []
364
+
365
+ # ADAPTIVE THRESHOLD: Learn from the document
366
+ if gap_threshold is None:
367
+ line_gaps = self._compute_line_gaps(elements)
368
+ if line_gaps:
369
+ median_gap = median(line_gaps)
370
+ # Use 1.2x median line gap, capped at 30px
371
+ gap_threshold = min(1.2 * median_gap, 30.0)
372
+ logger.debug(f"Adaptive gap threshold: {gap_threshold:.1f}px (median line gap: {median_gap:.1f}px)")
373
+ else:
374
+ gap_threshold = 30.0 # Fallback
375
+
376
+ # Extract Y coordinates
377
+ element_positions = []
378
+ for el in elements:
379
+ style = el.get("style", "")
380
+ top_match = re.search(r'top:\s*(\d+(?:\.\d+)?)px', style)
381
+ if top_match:
382
+ top = float(top_match.group(1))
383
+ element_positions.append((top, el))
384
+
385
+ if not element_positions:
386
+ return [elements]
387
+
388
+ # Sort by Y position
389
+ element_positions.sort(key=lambda x: x[0])
390
+
391
+ # Group by gaps
392
+ groups = []
393
+ current_group = [element_positions[0][1]]
394
+ last_y = element_positions[0][0]
395
+
396
+ for y, el in element_positions[1:]:
397
+ gap = y - last_y
398
+ if gap > gap_threshold:
399
+ # Large gap - start new group
400
+ if current_group:
401
+ groups.append(current_group)
402
+ current_group = [el]
403
+ else:
404
+ current_group.append(el)
405
+ last_y = y
406
+
407
+ if current_group:
408
+ groups.append(current_group)
409
+
410
+ # Post-process: split groups that transition from multi-column to single-column
411
+ final_groups = []
412
+ for group in groups:
413
+ split_groups = self._split_by_column_transition(group)
414
+ final_groups.extend(split_groups)
415
+
416
+ logger.debug(
417
+ f"Split {len(elements)} elements into {len(final_groups)} groups (threshold: {gap_threshold:.1f}px)")
418
+ return final_groups
419
+
420
+ def _split_by_column_transition(self, elements: List[Tag]) -> List[List[Tag]]:
421
+ """
422
+ Split a group if it transitions from multi-column (table) to single-column (prose).
423
+
424
+ This handles cases where a table is followed immediately by paragraph text
425
+ without a large Y-gap between them.
426
+
427
+ Args:
428
+ elements: List of elements in a group
429
+
430
+ Returns:
431
+ List of split groups (or original group if no transition found)
432
+ """
433
+ if len(elements) < 6:
434
+ return [elements]
435
+
436
+ # Extract X, Y positions for all elements
437
+ element_data = []
438
+ for el in elements:
439
+ style = el.get("style", "")
440
+ left_match = re.search(r'left:\s*(\d+(?:\.\d+)?)px', style)
441
+ top_match = re.search(r'top:\s*(\d+(?:\.\d+)?)px', style)
442
+ if left_match and top_match:
443
+ left = float(left_match.group(1))
444
+ top = float(top_match.group(1))
445
+ element_data.append((left, top, el))
446
+
447
+ if not element_data:
448
+ return [elements]
449
+
450
+ # Sort by Y position
451
+ element_data.sort(key=lambda x: x[1])
452
+
453
+ # Group into rows by Y position (15px tolerance)
454
+ rows = []
455
+ current_row = [element_data[0]]
456
+ last_y = element_data[0][1]
457
+
458
+ for left, top, el in element_data[1:]:
459
+ if abs(top - last_y) <= 15:
460
+ current_row.append((left, top, el))
461
+ else:
462
+ rows.append(current_row)
463
+ current_row = [(left, top, el)]
464
+ last_y = top
465
+
466
+ if current_row:
467
+ rows.append(current_row)
468
+
469
+ # Count unique X positions per row
470
+ def count_columns(row):
471
+ x_positions = set(left for left, _, _ in row)
472
+ return len(x_positions)
473
+
474
+ # Find transition point from multi-column to single-column
475
+ split_point = None
476
+ for i in range(len(rows) - 3): # Need at least 3 rows after split
477
+ current_cols = count_columns(rows[i])
478
+ next_cols = count_columns(rows[i + 1])
479
+
480
+ # Transition from 2+ columns to 1 column
481
+ if current_cols >= 2 and next_cols == 1:
482
+ # Check if next 2-3 rows are also single-column (confirms prose pattern)
483
+ following_single = sum(1 for j in range(i + 1, min(i + 4, len(rows)))
484
+ if count_columns(rows[j]) == 1)
485
+ if following_single >= 2:
486
+ split_point = i + 1
487
+ logger.debug(f"Column transition detected at row {i + 1} ({current_cols} cols -> {next_cols} col)")
488
+ break
489
+
490
+ if split_point is None:
491
+ return [elements]
492
+
493
+ # Split at the transition point
494
+ split_y = rows[split_point][0][1] # Y coordinate of first element in transition row
495
+
496
+ group1 = [el for left, top, el in element_data if top < split_y]
497
+ group2 = [el for left, top, el in element_data if top >= split_y]
498
+
499
+ result = []
500
+ if group1:
501
+ result.append(group1)
502
+ if group2:
503
+ result.append(group2)
504
+
505
+ return result if result else [elements]
506
+
507
+ def _process_absolutely_positioned_container(self, container: Tag, page_num: int) -> int:
508
+ """
509
+ Handle containers with absolutely positioned children.
510
+
511
+ Step 1: Extract absolutely positioned elements
512
+ Step 2: Split into separate groups by Y-coordinate gaps AND column transitions
513
+ Step 3: Process each group independently (table or text)
514
+
515
+ Args:
516
+ container: The container element
517
+ page_num: Current page number
518
+
519
+ Returns:
520
+ Updated page number
521
+ """
522
+ # Extract positioned children
523
+ positioned_children = self._extract_absolutely_positioned_children(container)
524
+
525
+ if not positioned_children:
526
+ # No positioned children, process normally
527
+ current = page_num
528
+ for child in container.children:
529
+ current = self._stream_pages(child, current)
530
+ return current
531
+
532
+ # Split into separate groups (adaptive threshold + column transition detection)
533
+ groups = self._split_positioned_groups(positioned_children)
534
+
535
+ # Process each group independently
536
+ for i, group in enumerate(groups):
537
+ table_parser = AbsolutelyPositionedTableParser(group)
538
+
539
+ if table_parser.is_table_like():
540
+ # It's a table! Render as markdown table
541
+ self.includes_table = True
542
+ markdown_table = table_parser.to_markdown()
543
+ if markdown_table:
544
+ # Use first element of group as source node
545
+ self._append(page_num, markdown_table, source_node=group[0] if group else None)
546
+ self._blankline_after(page_num)
547
+ else:
548
+ # Not a table - group by visual lines and render as text
549
+ text = table_parser.to_text()
550
+ if text:
551
+ if i > 0:
552
+ self._blankline_before(page_num)
553
+ # Use first element of group as source node
554
+ self._append(page_num, text, source_node=group[0] if group else None)
555
+
556
+ return page_num
557
+
558
+ def _stream_pages(self, root: Union[Tag, NavigableString], page_num: int = 1) -> int:
559
+ """Walk the DOM once; split only on CSS break styles."""
560
+ if isinstance(root, Tag) and self._has_break_before(root):
561
+ page_num += 1
562
+
563
+ if isinstance(root, NavigableString):
564
+ t = self._process_text_node(root)
565
+ if t:
566
+ # For text nodes, use parent as source
567
+ parent = root.parent if isinstance(root.parent, Tag) else None
568
+ self._append(page_num, t + " ", source_node=parent)
569
+ return page_num
570
+
571
+ if not isinstance(root, Tag):
572
+ return page_num
573
+
574
+ if self._is_hidden(root):
575
+ return page_num
576
+
577
+ # Track XBRL TextBlock context (will be set later after determining if block)
578
+ text_block_started = False
579
+ text_block_has_continuation = False
580
+ previous_text_block = self.current_text_block
581
+
582
+ # Check if this is a continuation tag
583
+ continuation_ends_text_block = False
584
+ if self._is_continuation_tag(root):
585
+ cont_id = root.get('id')
586
+ if cont_id and cont_id in self.continuation_map:
587
+ self.current_text_block = self.continuation_map[cont_id]
588
+ text_block_started = True
589
+ # Check if continues further
590
+ continuedat = root.get('continuedat')
591
+ if continuedat:
592
+ text_block_has_continuation = True
593
+ self.continuation_map[continuedat] = self.current_text_block
594
+ else:
595
+ # No continuedat: this continuation tag ENDS the TextBlock
596
+ # We need to clear the context after processing this tag
597
+ continuation_ends_text_block = True
598
+
599
+ # Check if this is a container with absolutely positioned children
600
+ is_absolutely_positioned = self._is_absolutely_positioned(root)
601
+ has_positioned_children = not is_absolutely_positioned and any(
602
+ isinstance(child, Tag) and self._is_absolutely_positioned(child)
603
+ for child in root.children
604
+ )
605
+
606
+ if has_positioned_children and root.name == "div":
607
+ # Special handling for absolutely positioned layouts
608
+ current = self._process_absolutely_positioned_container(root, page_num)
609
+ if self._has_break_after(root):
610
+ current += 1
611
+
612
+ # Restore TextBlock context before early return
613
+ if text_block_started and not text_block_has_continuation:
614
+ if continuation_ends_text_block:
615
+ self.current_text_block = None
616
+ else:
617
+ self.current_text_block = previous_text_block
618
+
619
+ return current
620
+
621
+ # Inline-display elements should not trigger blocks
622
+ is_inline_display = self._is_inline_display(root)
623
+ is_block = self._is_block(root) and root.name not in {"br",
624
+ "hr"} and not is_inline_display and not is_absolutely_positioned
625
+
626
+ # Check if this block element contains a TextBlock tag in its children
627
+ # ALWAYS check block elements for new TextBlocks (not just when current_text_block is None)
628
+ # This allows new notes to replace old ones across pages
629
+ if is_block:
630
+ tb_tag = self._find_text_block_tag_in_children(root)
631
+ if tb_tag:
632
+ tb_info = self._extract_text_block_info(tb_tag)
633
+ if tb_info:
634
+ # Only set if it's a DIFFERENT TextBlock (ignore nested duplicates)
635
+ is_new_text_block = (
636
+ self.current_text_block is None or
637
+ self.current_text_block.name != tb_info.name
638
+ )
639
+ if is_new_text_block:
640
+ self.current_text_block = tb_info
641
+ text_block_started = True
642
+ # Check for continuedat attribute ON THE TAG ITSELF
643
+ continuedat = tb_tag.get('continuedat')
644
+ if continuedat:
645
+ text_block_has_continuation = True
646
+ self.continuation_map[continuedat] = tb_info
647
+
648
+ if is_block:
649
+ self._blankline_before(page_num)
650
+
651
+ # Handle tables and lists atomically
652
+ if root.name in {"table", "ul", "ol"}:
653
+ t = self._process_element(root)
654
+ if t:
655
+ self._append(page_num, t, source_node=root)
656
+ self._blankline_after(page_num)
657
+ if self._has_break_after(root):
658
+ page_num += 1
659
+
660
+ # Restore TextBlock context before early return
661
+ if text_block_started and not text_block_has_continuation:
662
+ if continuation_ends_text_block:
663
+ self.current_text_block = None
664
+ else:
665
+ self.current_text_block = previous_text_block
666
+
667
+ return page_num
668
+
669
+ # For inline wrappers (bold/italic), render atomically
670
+ wrap = self._wrap_markdown(root)
671
+ if wrap and not is_block:
672
+ t = self._process_element(root)
673
+ if t:
674
+ self._append(page_num, t + " ", source_node=root)
675
+ if self._has_break_after(root):
676
+ page_num += 1
677
+
678
+ # Restore TextBlock context before early return
679
+ if text_block_started and not text_block_has_continuation:
680
+ if continuation_ends_text_block:
681
+ self.current_text_block = None
682
+ else:
683
+ self.current_text_block = previous_text_block
684
+
685
+ return page_num
686
+
687
+ # Stream children for block elements
688
+ current = page_num
689
+ for child in root.children:
690
+ current = self._stream_pages(child, current)
691
+
692
+ if is_block:
693
+ self._blankline_after(current)
694
+
695
+ if self._has_break_after(root):
696
+ current += 1
697
+
698
+ # Restore previous TextBlock context if we started a new one
699
+ # This applies to:
700
+ # 1. Block elements with new TextBlock tags (restore to previous)
701
+ # 2. Continuation tags that END a TextBlock (clear to None)
702
+ # (unless the TextBlock tag has continuedat, meaning it continues elsewhere)
703
+ if text_block_started and not text_block_has_continuation:
704
+ if continuation_ends_text_block:
705
+ # Continuation tag with no continuedat ENDS the TextBlock
706
+ self.current_text_block = None
707
+ else:
708
+ # New TextBlock tag restores to previous context
709
+ self.current_text_block = previous_text_block
710
+
711
+ return current
712
+
713
+ def get_pages(self, include_elements: bool = False) -> List[Page]:
714
+ """Get parsed pages as Page objects.
715
+
716
+ Args:
717
+ include_elements: If True, include structured Element objects with each page
718
+
719
+ Returns:
720
+ List of Page objects, optionally with elements
721
+ """
722
+ # Parse content using existing stream parser
723
+ self.pages = defaultdict(list)
724
+ self.page_segments = defaultdict(list)
725
+ self.includes_table = False
726
+ root = self.soup.body if self.soup.body else self.soup
727
+ self._stream_pages(root, page_num=1)
728
+
729
+ result: List[Page] = []
730
+ for page_num in sorted(self.pages.keys()):
731
+ raw = "".join(self.pages[page_num])
732
+
733
+ # Collapse excessive newlines
734
+ raw = re.sub(r"\n{3,}", "\n\n", raw)
735
+
736
+ lines: List[str] = []
737
+ for line in raw.split("\n"):
738
+ line = line.strip()
739
+ if line or (lines and lines[-1]):
740
+ lines.append(line)
741
+ content = "\n".join(lines).strip()
742
+
743
+ result.append(Page(number=page_num, content=content, elements=None))
744
+
745
+ # CONTENT-LOSS WATCHDOG
746
+ total_output_chars = sum(len(p.content) for p in result)
747
+ if self.input_char_count > 0:
748
+ retention_ratio = total_output_chars / self.input_char_count
749
+ if retention_ratio < 0.95:
750
+ # logger.warning(f"⚠️ Content loss detected: {100 * (1 - retention_ratio):.1f}% of input lost!")
751
+ # logger.warning(f" Input: {self.input_char_count} chars, Output: {total_output_chars} chars")
752
+ pass
753
+ else:
754
+ logger.debug(f"✓ Content retention: {100 * retention_ratio:.1f}%")
755
+
756
+ # If elements requested, process further
757
+ if include_elements:
758
+ result = self._add_elements_to_pages(result)
759
+
760
+ return result
761
+
762
+ def _effective_rows(self, table: Tag) -> list[list[Tag]]:
763
+ """Return rows that have at least one non-empty td/th."""
764
+ rows = []
765
+ for tr in table.find_all('tr', recursive=True):
766
+ cells = tr.find_all(['td', 'th'], recursive=False) or tr.find_all(['td', 'th'], recursive=True)
767
+ texts = [self._clean_text(c.get_text(" ", strip=True)) for c in cells]
768
+ if any(texts):
769
+ rows.append(cells)
770
+ return rows
771
+
772
+ def _one_row_table_to_text(self, cells: list[Tag]) -> str:
773
+ """Flatten a 1-row table to plain text; upgrade to header when possible."""
774
+ texts = [self._clean_text(c.get_text(" ", strip=True)) for c in cells]
775
+ if not texts:
776
+ return ""
777
+
778
+ first = texts[0]
779
+ if (m := ITEM_HEADER_CELL_RE.match(first)):
780
+ num = m.group(1).upper()
781
+ title = next((t for t in texts[1:] if t), "")
782
+ return f"ITEM {num}. {title}".strip()
783
+
784
+ if (m := PART_HEADER_CELL_RE.match(first)):
785
+ roman = m.group(1).upper()
786
+ return f"PART {roman}"
787
+
788
+ # generic flatten (avoid markdown pipes which might be misread later)
789
+ return " ".join(t for t in texts if t).strip()
790
+
791
+ def _add_elements_to_pages(self, pages: List[Page]) -> List[Page]:
792
+ """Add structured elements and TextBlocks to pages.
793
+
794
+ Steps:
795
+ 1. Group segments into blocks (split on double newlines)
796
+ 2. Collect source nodes and TextBlock info for each block
797
+ 3. Generate IDs and augment HTML
798
+ 4. Create Element and TextBlock objects
799
+ 5. Attach to pages
800
+ """
801
+ from sec2md.models import TextBlock
802
+
803
+ # Build elements for each page, tracking their nodes and TextBlocks
804
+ page_elements: Dict[int, List[Element]] = {}
805
+ page_text_blocks: Dict[int, List[TextBlock]] = {}
806
+ block_nodes_map: Dict[str, List[Tag]] = {}
807
+
808
+ for page in pages:
809
+ page_num = page.number
810
+ segments = self.page_segments.get(page_num, [])
811
+
812
+ if not segments:
813
+ page_elements[page_num] = []
814
+ page_text_blocks[page_num] = []
815
+ continue
816
+
817
+ # Group segments into blocks (returns (Element, nodes, text_block) tuples)
818
+ blocks_with_nodes = self._group_segments_into_blocks(segments, page_num)
819
+
820
+ # Merge small blocks into larger semantic units
821
+ merged_blocks = self._merge_small_blocks(blocks_with_nodes, page_num, min_chars=500)
822
+
823
+ # Separate elements, nodes, and group by TextBlock
824
+ elements = []
825
+ text_block_map: Dict[str, List[str]] = {} # TextBlock name -> element IDs
826
+
827
+ for element, nodes, text_block_info in merged_blocks:
828
+ elements.append(element)
829
+ block_nodes_map[element.id] = nodes
830
+
831
+ # Track which TextBlock this element belongs to
832
+ if text_block_info:
833
+ tb_name = text_block_info.name
834
+ if tb_name not in text_block_map:
835
+ text_block_map[tb_name] = []
836
+ text_block_map[tb_name].append(element.id)
837
+
838
+ page_elements[page_num] = elements
839
+
840
+ # Create TextBlock objects with actual Element objects
841
+ text_blocks = []
842
+ # Group by name to get unique TextBlocks with their titles
843
+ seen_names = {}
844
+ for element, nodes, text_block_info in merged_blocks:
845
+ if text_block_info and text_block_info.name not in seen_names:
846
+ seen_names[text_block_info.name] = text_block_info
847
+
848
+ # Build element ID to Element map
849
+ element_map = {elem.id: elem for elem in elements}
850
+
851
+ for tb_name, tb_info in seen_names.items():
852
+ element_ids = text_block_map.get(tb_name, [])
853
+ if element_ids:
854
+ # Get actual Element objects
855
+ tb_elements = [element_map[eid] for eid in element_ids if eid in element_map]
856
+ text_blocks.append(TextBlock(
857
+ name=tb_name,
858
+ title=tb_info.title,
859
+ elements=tb_elements
860
+ ))
861
+
862
+ page_text_blocks[page_num] = text_blocks
863
+
864
+ # Augment HTML with IDs
865
+ self._augment_html_with_ids(page_elements, block_nodes_map)
866
+
867
+ # Attach elements and TextBlocks to pages
868
+ result = []
869
+ for page in pages:
870
+ elements = page_elements.get(page.number, [])
871
+ text_blocks = page_text_blocks.get(page.number, [])
872
+ result.append(Page(
873
+ number=page.number,
874
+ content=page.content,
875
+ elements=elements if elements else None,
876
+ text_blocks=text_blocks if text_blocks else None
877
+ ))
878
+
879
+ return result
880
+
881
+ def _is_bold_header(self, element: Element) -> bool:
882
+ """Check if element is a bold header (main section marker).
883
+
884
+ Bold headers start with ** and contain only a short title (< 50 chars typically).
885
+ Example: **Services**, **Competition**, **Markets and Distribution**
886
+ """
887
+ content = element.content.strip()
888
+
889
+ # Check if content starts and ends with **
890
+ if not (content.startswith('**') and '**' in content[2:]):
891
+ return False
892
+
893
+ # Extract the bold part
894
+ first_line = content.split('\n')[0].strip()
895
+
896
+ # Bold headers are typically short and standalone
897
+ # If the first line is entirely bold and short, it's likely a header
898
+ if first_line.startswith('**') and first_line.endswith('**'):
899
+ bold_text = first_line[2:-2].strip()
900
+ # Bold headers are typically < 50 chars and don't contain much punctuation
901
+ if len(bold_text) < 50 and bold_text.count('.') <= 1:
902
+ return True
903
+
904
+ return False
905
+
906
+ def _merge_small_blocks(
907
+ self,
908
+ blocks_with_nodes: List[Tuple[Element, List[Tag], Optional[TextBlockInfo]]],
909
+ page_num: int,
910
+ min_chars: int = 500
911
+ ) -> List[Tuple[Element, List[Tag], Optional[TextBlockInfo]]]:
912
+ """Merge consecutive small blocks into larger semantic units.
913
+
914
+ Rules:
915
+ - Tables always stay separate
916
+ - Bold headers (**text**) start new sections
917
+ - TextBlock boundaries always flush
918
+ - Merge consecutive blocks until min_chars threshold
919
+ - Regenerate IDs for merged blocks
920
+
921
+ Args:
922
+ blocks_with_nodes: List of (Element, nodes, text_block) tuples
923
+ page_num: Page number for ID generation
924
+ min_chars: Minimum characters per block (default: 500)
925
+
926
+ Returns:
927
+ List of merged (Element, nodes, text_block) tuples
928
+ """
929
+ if not blocks_with_nodes:
930
+ return []
931
+
932
+ merged = []
933
+ current_elements = []
934
+ current_nodes = []
935
+ current_chars = 0
936
+ current_text_block = None
937
+
938
+ def flush(block_idx: int):
939
+ nonlocal current_elements, current_nodes, current_chars
940
+ if not current_elements:
941
+ return
942
+
943
+ # Merge content from all elements
944
+ merged_content = '\n\n'.join(e.content for e in current_elements)
945
+
946
+ # Infer kind from merged elements
947
+ kinds = [e.kind for e in current_elements]
948
+ if 'table' in kinds:
949
+ kind = 'table'
950
+ elif 'header' in kinds:
951
+ kind = 'section'
952
+ else:
953
+ kind = current_elements[0].kind
954
+
955
+ # Generate new ID
956
+ block_id = self._generate_block_id(page_num, block_idx, merged_content, kind)
957
+
958
+ merged_element = Element(
959
+ id=block_id,
960
+ content=merged_content,
961
+ kind=kind,
962
+ page_start=page_num,
963
+ page_end=page_num
964
+ )
965
+
966
+ merged.append((merged_element, list(current_nodes), current_text_block))
967
+ current_elements = []
968
+ current_nodes = []
969
+ current_chars = 0
970
+
971
+ for i, (element, nodes, text_block) in enumerate(blocks_with_nodes):
972
+ # Check if TextBlock changed - this is a hard boundary, always flush
973
+ text_block_changed = False
974
+ if current_text_block is not None or text_block is not None:
975
+ # Compare TextBlock names (None != anything)
976
+ if current_text_block is None and text_block is not None:
977
+ text_block_changed = True
978
+ elif current_text_block is not None and text_block is None:
979
+ text_block_changed = True
980
+ elif current_text_block is not None and text_block is not None:
981
+ text_block_changed = current_text_block.name != text_block.name
982
+
983
+ if text_block_changed and current_elements:
984
+ flush(len(merged))
985
+
986
+ # Update current TextBlock
987
+ current_text_block = text_block
988
+
989
+ # Check if this is a table
990
+ if element.kind == 'table':
991
+ # If we have accumulated text AND it's small (< min_chars), merge it with the table
992
+ # This handles captions/headers before tables
993
+ if current_elements and current_chars < min_chars:
994
+ # Merge caption with table
995
+ current_elements.append(element)
996
+ current_nodes.extend([n for n in nodes if n not in current_nodes])
997
+ flush(len(merged))
998
+ else:
999
+ # Flush current, then add table separately
1000
+ flush(len(merged))
1001
+ merged.append((element, nodes, text_block))
1002
+ continue
1003
+
1004
+ # Check if this is a bold header (main section break)
1005
+ is_bold_header = self._is_bold_header(element)
1006
+
1007
+ # If we hit a bold header and have content, flush before starting new section
1008
+ # BUT: never flush if current content is ONLY headers (headers need content)
1009
+ if is_bold_header and current_elements:
1010
+ # Check if ALL current elements are headers (no actual content)
1011
+ all_headers = all(self._is_bold_header(e) for e in current_elements)
1012
+ is_current_only_headers = all_headers and current_chars < 200
1013
+
1014
+ if not is_current_only_headers:
1015
+ flush(len(merged))
1016
+
1017
+ # Add to current merge group
1018
+ current_elements.append(element)
1019
+ current_nodes.extend([n for n in nodes if n not in current_nodes])
1020
+ current_chars += element.char_count
1021
+
1022
+ # Decide whether to flush
1023
+ # IMPORTANT: Never flush if current block is only headers - headers need content
1024
+ should_flush = False
1025
+
1026
+ # Flush if we've hit min_chars (paragraph is big enough)
1027
+ if current_chars >= min_chars:
1028
+ should_flush = True
1029
+
1030
+ # OR flush if next element is a bold header (section boundary)
1031
+ if i + 1 < len(blocks_with_nodes):
1032
+ next_element, _, _ = blocks_with_nodes[i + 1]
1033
+ if self._is_bold_header(next_element):
1034
+ should_flush = True
1035
+
1036
+ # OR flush if we're at the end
1037
+ if i == len(blocks_with_nodes) - 1:
1038
+ should_flush = True
1039
+
1040
+ if should_flush:
1041
+ # Check if ALL current elements are headers (no actual content)
1042
+ all_headers = all(self._is_bold_header(e) for e in current_elements)
1043
+ is_only_headers = all_headers and current_chars < 200
1044
+
1045
+ if not is_only_headers:
1046
+ flush(len(merged))
1047
+
1048
+ # Flush remaining
1049
+ if current_elements:
1050
+ flush(len(merged))
1051
+
1052
+ return merged
1053
+
1054
+ def _group_segments_into_blocks(self, segments: List[Tuple[str, Optional[Tag], Optional[TextBlockInfo]]], page_num: int) -> List[Tuple[Element, List[Tag], Optional[TextBlockInfo]]]:
1055
+ """Group sequential segments into semantic blocks.
1056
+
1057
+ Returns:
1058
+ List of (Element, nodes, text_block) tuples
1059
+ """
1060
+ blocks = []
1061
+ current_block_segments = []
1062
+ current_block_nodes = []
1063
+ current_text_block = None
1064
+ block_idx = 0
1065
+
1066
+ for content, node, text_block in segments:
1067
+ # Check if this is a block boundary (double newline)
1068
+ if content == "\n":
1069
+ # Check if previous segment was also a newline
1070
+ if current_block_segments and current_block_segments[-1] == "\n":
1071
+ # Double newline - flush current block
1072
+ if len(current_block_segments) > 1: # Has content beyond the trailing newline
1073
+ block = self._create_block(
1074
+ current_block_segments[:-1], # Exclude trailing newline
1075
+ current_block_nodes,
1076
+ page_num,
1077
+ block_idx
1078
+ )
1079
+ if block:
1080
+ blocks.append((block, list(current_block_nodes), current_text_block))
1081
+ block_idx += 1
1082
+ current_block_segments = []
1083
+ current_block_nodes = []
1084
+ current_text_block = None
1085
+ continue
1086
+
1087
+ current_block_segments.append(content)
1088
+ if node is not None and node not in current_block_nodes:
1089
+ current_block_nodes.append(node)
1090
+ # Track TextBlock (use last non-None value)
1091
+ if text_block is not None:
1092
+ current_text_block = text_block
1093
+
1094
+ # Flush remaining block
1095
+ if current_block_segments:
1096
+ # Remove trailing newlines
1097
+ while current_block_segments and current_block_segments[-1] == "\n":
1098
+ current_block_segments.pop()
1099
+
1100
+ if current_block_segments:
1101
+ block = self._create_block(
1102
+ current_block_segments,
1103
+ current_block_nodes,
1104
+ page_num,
1105
+ block_idx
1106
+ )
1107
+ if block:
1108
+ blocks.append((block, list(current_block_nodes), current_text_block))
1109
+
1110
+ return blocks
1111
+
1112
+ def _create_block(
1113
+ self,
1114
+ segments: List[str],
1115
+ nodes: List[Tag],
1116
+ page_num: int,
1117
+ block_idx: int
1118
+ ) -> Optional[Element]:
1119
+ """Create an Element from segments and nodes."""
1120
+ content = "".join(segments).strip()
1121
+ if not content:
1122
+ return None
1123
+
1124
+ # Infer block kind from nodes
1125
+ kind = self._infer_kind_from_nodes(nodes)
1126
+
1127
+ # Generate stable ID
1128
+ block_id = self._generate_block_id(page_num, block_idx, content, kind)
1129
+
1130
+ return Element(
1131
+ id=block_id,
1132
+ content=content,
1133
+ kind=kind,
1134
+ page_start=page_num,
1135
+ page_end=page_num
1136
+ )
1137
+
1138
+ def _infer_kind_from_nodes(self, nodes: List[Tag]) -> str:
1139
+ """Infer block kind from DOM nodes."""
1140
+ if not nodes:
1141
+ return "text"
1142
+
1143
+ # Check first meaningful node
1144
+ for node in nodes:
1145
+ if node.name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
1146
+ return "header"
1147
+ elif node.name == "table":
1148
+ return "table"
1149
+ elif node.name in {"ul", "ol"}:
1150
+ return "list"
1151
+ elif node.name == "p":
1152
+ return "paragraph"
1153
+
1154
+ return "text"
1155
+
1156
+ def _generate_block_id(self, page: int, idx: int, content: str, kind: str) -> str:
1157
+ """Generate stable block ID using normalized content hash."""
1158
+ # Normalize: collapse whitespace for stable hashing
1159
+ normalized = re.sub(r'\s+', ' ', content.strip()).lower()
1160
+ hash_part = hashlib.sha1(normalized.encode('utf-8')).hexdigest()[:8]
1161
+ kind_prefix = kind[0] if kind else "b"
1162
+ return f"sec2md-p{page}-{kind_prefix}{idx}-{hash_part}"
1163
+
1164
+ def _augment_html_with_ids(self, page_elements: Dict[int, List[Element]], block_nodes_map: Dict[str, List[Tag]]) -> None:
1165
+ """Add id attributes and data-sec2md-block to DOM nodes.
1166
+
1167
+ Args:
1168
+ page_elements: Map of page_num -> List[Element]
1169
+ block_nodes_map: Map of element.id -> List[Tag] (the nodes for that element)
1170
+ """
1171
+ seen_pages = set()
1172
+
1173
+ # Augment HTML
1174
+ for page_num in sorted(page_elements.keys()):
1175
+ elements = page_elements[page_num]
1176
+
1177
+ for i, element in enumerate(elements):
1178
+ nodes = block_nodes_map.get(element.id, [])
1179
+ if not nodes:
1180
+ continue
1181
+
1182
+ first_node = nodes[0]
1183
+
1184
+ # Add page ID to first block on this page
1185
+ if page_num not in seen_pages:
1186
+ # Add page-N as an additional ID on the first node
1187
+ if 'id' in first_node.attrs:
1188
+ # Node already has an ID, add page ID as a class instead
1189
+ existing_classes = first_node.get('class', [])
1190
+ if isinstance(existing_classes, str):
1191
+ existing_classes = existing_classes.split()
1192
+ existing_classes.append(f"page-{page_num}")
1193
+ first_node['class'] = existing_classes
1194
+ else:
1195
+ first_node['id'] = f"page-{page_num}"
1196
+ seen_pages.add(page_num)
1197
+
1198
+ # Add block ID to first node (if it doesn't already have page ID)
1199
+ if 'id' not in first_node.attrs:
1200
+ first_node['id'] = element.id
1201
+
1202
+ # Add data attribute to all nodes for highlighting
1203
+ for node in nodes:
1204
+ node['data-sec2md-block'] = element.id
1205
+
1206
+ def markdown(self) -> str:
1207
+ """Get full document as markdown string."""
1208
+ pages = self.get_pages()
1209
+ return "\n\n".join(page.content for page in pages if page.content)
1210
+
1211
+ def html(self) -> str:
1212
+ """Get the HTML with augmented anchors and data attributes.
1213
+
1214
+ Note: Call get_pages(include_elements=True) first to augment the HTML.
1215
+ If called before get_pages(), returns the original unmodified HTML.
1216
+ """
1217
+ return str(self.soup)
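
Below is a minimal usage sketch of the Parser class added in this file. It is not part of the package: the import path simply follows the sec2md/parser.py location shown above, and the sample HTML, variable names, and printed fields are illustrative assumptions rather than package code or tests.

from sec2md.parser import Parser

sample_html = """
<html><body>
  <div style="font-weight:700">Item 1. Business</div>
  <p>The company designs and markets consumer electronics.</p>
  <div style="page-break-after:always"></div>
  <p>Risk factors continue on the next page.</p>
</body></html>
"""

parser = Parser(sample_html)

# Pages with structured Element (and TextBlock) metadata attached.
pages = parser.get_pages(include_elements=True)
for page in pages:
    print(page.number, len(page.elements or []))

# Entire document rendered as Markdown (re-runs the stream parser internally).
print(parser.markdown())

# Source HTML with the id / data-sec2md-block attributes added by get_pages().
augmented_html = parser.html()

Note that markdown() calls get_pages() again, which only resets the page buffers; the id and data-sec2md-block attributes written onto the soup by get_pages(include_elements=True) persist, so html() still returns the augmented markup afterwards.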