sec2md 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sec2md might be problematic. Click here for more details.

sec2md/parser.py ADDED
@@ -0,0 +1,586 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import logging
5
+ from collections import defaultdict
6
+ from typing import List, Dict, Union, Optional
7
+ from bs4 import BeautifulSoup
8
+ from bs4.element import NavigableString, Tag
9
+
10
+ from sec2md.absolute_table_parser import AbsolutelyPositionedTableParser, median
11
+ from sec2md.table_parser import TableParser
12
+ from sec2md.models import Page
13
+
14
# Tag names that start a new block (paragraph-level) in the Markdown output.
BLOCK_TAGS = {"div", "p", "h1", "h2", "h3", "h4", "h5", "h6", "table", "br", "hr", "ul", "ol", "li"}
# Tag names rendered with Markdown bold / italic markers.
BOLD_TAGS = {"b", "strong"}
ITALIC_TAGS = {"i", "em"}

# Any run of whitespace; used to collapse text to single spaces.
_ws = re.compile(r"\s+")
# A text node consisting of exactly one `property: value;` declaration.
_css_decl = re.compile(r"^[a-zA-Z\-]+\s*:\s*[^;]+;\s*$")
# A table cell holding only an item label such as "Item 1.", "Item 1A." or
# "Item IV.". The optional trailing letter admits lettered items ("1A", "7A",
# "9B"), which the digits/roman-only class would otherwise reject.
ITEM_HEADER_CELL_RE = re.compile(r"^\s*Item\s+([0-9IVX]+[A-Z]?)\.\s*$", re.I)
# A table cell holding only a part label such as "Part II".
PART_HEADER_CELL_RE = re.compile(r"^\s*Part\s+([IVX]+)\s*$", re.I)

logger = logging.getLogger(__name__)
24
+
25
+
26
class Parser:
    """Convert an HTML document into per-page Markdown text.

    The DOM is walked once and text is accumulated per page; a new page is
    started only where an element's inline style requests a CSS page break.
    ``<table>`` elements with more than one non-empty row are rendered by
    ``TableParser``; ``<div>`` containers whose direct children are absolutely
    positioned are rendered through ``AbsolutelyPositionedTableParser``.
    """

    # Pixel offsets read from inline styles. The look-behind stops longhands
    # such as ``margin-top`` or ``padding-left`` from being mistaken for the
    # positioning properties themselves.
    _TOP_PX_RE = re.compile(r"(?<![\w-])top\s*:\s*(\d+(?:\.\d+)?)px", re.I)
    _LEFT_PX_RE = re.compile(r"(?<![\w-])left\s*:\s*(\d+(?:\.\d+)?)px", re.I)
    # Applied to a compacted style (lower-case, spaces removed). Accepts
    # ``display:inline`` and ``display:inline-block`` with or without a
    # trailing semicolon, but not ``inline-flex`` / ``inline-table``.
    _INLINE_DISPLAY_RE = re.compile(r"(?<![\w-])display:inline(?:-block)?(?![\w-])")

    def __init__(self, content: str):
        """Parse ``content`` (HTML markup) with the lxml-backed BeautifulSoup builder."""
        self.soup = BeautifulSoup(content, "lxml")
        # Set to True once any table has been rendered during a walk.
        self.includes_table = False
        # page number -> list of output fragments, concatenated in get_pages().
        self.pages: Dict[int, List[str]] = defaultdict(list)
        # Baseline for the content-retention check in get_pages().
        self.input_char_count = len(self.soup.get_text())

    # ------------------------------------------------------------------
    # Inline-style helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _compact_style(el: "Tag") -> str:
        """Return the element's inline style lower-cased with spaces removed."""
        return (el.get("style") or "").lower().replace(" ", "")

    @staticmethod
    def _px(pattern, el: "Tag") -> Optional[float]:
        """Return the pixel value ``pattern`` captures in the element's style, or None."""
        found = pattern.search(el.get("style") or "")
        return float(found.group(1)) if found else None

    @staticmethod
    def _is_bold(el: "Tag") -> bool:
        """Return True for bold tags or an inline ``font-weight`` of 700/bold."""
        if not isinstance(el, Tag):
            return False
        if el.name in BOLD_TAGS:
            return True
        # Compacting the style makes "font-weight: bold" match as well,
        # consistent with the other style checks in this class.
        style = Parser._compact_style(el)
        return "font-weight:700" in style or "font-weight:bold" in style

    @staticmethod
    def _is_italic(el: "Tag") -> bool:
        """Return True for italic tags or an inline ``font-style:italic``."""
        if not isinstance(el, Tag):
            return False
        return el.name in ITALIC_TAGS or "font-style:italic" in Parser._compact_style(el)

    @staticmethod
    def _is_block(el: "Tag") -> bool:
        """Return True when ``el`` is a tag listed in ``BLOCK_TAGS``."""
        return isinstance(el, Tag) and el.name in BLOCK_TAGS

    @staticmethod
    def _is_absolutely_positioned(el: "Tag") -> bool:
        """Check if element has position:absolute"""
        return isinstance(el, Tag) and "position:absolute" in Parser._compact_style(el)

    @staticmethod
    def _is_inline_display(el: "Tag") -> bool:
        """Check if element has display:inline or display:inline-block"""
        if not isinstance(el, Tag):
            return False
        return Parser._INLINE_DISPLAY_RE.search(Parser._compact_style(el)) is not None

    @staticmethod
    def _has_break_before(el: "Tag") -> bool:
        """Return True when the inline style forces a page break before ``el``."""
        if not isinstance(el, Tag):
            return False
        style = Parser._compact_style(el)
        return any(
            marker in style
            for marker in ("page-break-before:always", "break-before:page", "break-before:always")
        )

    @staticmethod
    def _has_break_after(el: "Tag") -> bool:
        """Return True when the inline style forces a page break after ``el``."""
        if not isinstance(el, Tag):
            return False
        style = Parser._compact_style(el)
        return any(
            marker in style
            for marker in ("page-break-after:always", "break-after:page", "break-after:always")
        )

    @staticmethod
    def _is_hidden(el: "Tag") -> bool:
        """Check if element has display:none"""
        return isinstance(el, Tag) and "display:none" in Parser._compact_style(el)

    # ------------------------------------------------------------------
    # Text helpers and page buffers
    # ------------------------------------------------------------------

    @staticmethod
    def _clean_text(text: str) -> str:
        """Drop zero-width space/BOM, turn NBSP into a space, collapse whitespace."""
        text = text.replace("\u200b", "").replace("\ufeff", "").replace("\xa0", " ")
        return _ws.sub(" ", text).strip()

    @staticmethod
    def _wrap_markdown(el: "Tag") -> str:
        """Return the prefix/suffix markdown wrapper for this element."""
        bold = Parser._is_bold(el)
        italic = Parser._is_italic(el)
        if bold and italic:
            return "***"
        if bold:
            return "**"
        return "*" if italic else ""

    def _append(self, page_num: int, s: str) -> None:
        """Append a non-empty fragment to the buffer of ``page_num``."""
        if s:
            self.pages[page_num].append(s)

    def _blankline_before(self, page_num: int) -> None:
        """Ensure the page buffer ends with a blank line; no-op on an empty page."""
        buf = self.pages[page_num]
        if not buf:
            return
        if not buf[-1].endswith("\n"):
            buf.append("\n")
        # Two trailing newline fragments already form the blank line.
        if buf[-2:] == ["\n", "\n"]:
            return
        buf.append("\n")

    def _blankline_after(self, page_num: int) -> None:
        """Mirror `_blankline_before` for symmetry; same rule."""
        self._blankline_before(page_num)

    def _process_text_node(self, node: "NavigableString") -> str:
        """Return the cleaned text of ``node``, or "" if it is a lone CSS declaration."""
        # NOTE(review): subclasses of NavigableString (presumably including HTML
        # comments) take this path too and are emitted as text - confirm intent.
        text = self._clean_text(str(node))
        if text and _css_decl.match(text):
            return ""
        return text

    # ------------------------------------------------------------------
    # Atomic rendering of an element subtree
    # ------------------------------------------------------------------

    def _process_element(self, element: Union["Tag", "NavigableString"]) -> str:
        """Render ``element`` and its whole subtree to a single string.

        Tables with at most one non-empty row are flattened to text; larger
        tables go through ``TableParser``. ``ul``/``ol`` become Markdown
        lists. Any other tag is the space-joined rendering of its children,
        wrapped in bold/italic markers when the tag calls for them.
        """
        if isinstance(element, NavigableString):
            return self._process_text_node(element)

        name = element.name

        if name == "table":
            # Decide on effective (non-empty) rows only.
            eff_rows = self._effective_rows(element)
            if len(eff_rows) <= 1:
                # Single-row "header tables" such as Item/Part banners.
                return self._one_row_table_to_text(eff_rows[0] if eff_rows else [])
            self.includes_table = True
            return TableParser(element).md().strip()

        if name in {"ul", "ol"}:
            items = []
            for li in element.find_all("li", recursive=False):
                item_text = self._process_element(li).strip()
                if item_text:
                    # Remove literal bullet glyphs carried in the item text.
                    items.append(item_text.lstrip("•·∙◦▪▫-").strip())
            if not items:
                return ""
            if name == "ol":
                return "\n".join(f"{i}. {t}" for i, t in enumerate(items, start=1))
            return "\n".join(f"- {t}" for t in items)

        parts = [self._process_element(child) for child in element.children]
        text = " ".join(p for p in parts if p).strip()
        # List items are returned unwrapped; the enclosing list adds the marker.
        if name == "li" or not text:
            return text

        wrap = self._wrap_markdown(element)
        return f"{wrap}{text}{wrap}" if wrap else text

    # ------------------------------------------------------------------
    # Absolutely positioned layouts
    # ------------------------------------------------------------------

    def _extract_absolutely_positioned_children(self, container: "Tag") -> List["Tag"]:
        """
        Extract all absolutely positioned children from a container.

        Children without any text content are skipped.

        Returns:
            List of absolutely positioned child elements
        """
        return [
            child
            for child in container.children
            if isinstance(child, Tag)
            and self._is_absolutely_positioned(child)
            and child.get_text(strip=True)
        ]

    def _compute_line_gaps(self, elements: List["Tag"]) -> List[float]:
        """
        Compute gaps between consecutive Y positions (line gaps).

        Elements without a parseable ``top`` offset are ignored.

        Returns:
            List of gap sizes in pixels, restricted to 5 < gap < 100
        """
        tops = sorted(
            y for y in (self._px(self._TOP_PX_RE, el) for el in elements) if y is not None
        )
        if len(tops) < 2:
            return []
        # Drop very small gaps (same line) and very large gaps (section breaks).
        return [b - a for a, b in zip(tops, tops[1:]) if 5 < b - a < 100]

    def _split_positioned_groups(self, elements: List["Tag"], gap_threshold: Optional[float] = None) -> List[List["Tag"]]:
        """
        Split positioned elements into separate groups.
        Uses ADAPTIVE gap threshold based on document characteristics.

        Args:
            elements: List of absolutely positioned elements
            gap_threshold: Optional threshold in pixels (if None, computed adaptively)

        Returns:
            List of element groups
        """
        if not elements:
            return []

        if gap_threshold is None:
            line_gaps = self._compute_line_gaps(elements)
            if line_gaps:
                median_gap = median(line_gaps)
                # 1.2x the median line gap, capped at 30px.
                gap_threshold = min(1.2 * median_gap, 30.0)
                logger.debug(
                    "Adaptive gap threshold: %.1fpx (median line gap: %.1fpx)",
                    gap_threshold, median_gap,
                )
            else:
                gap_threshold = 30.0  # Fallback

        tops = [self._px(self._TOP_PX_RE, el) for el in elements]
        known = [y for y in tops if y is not None]
        if not known:
            return [elements]

        # An element with no parseable `top` inherits the offset of the nearest
        # preceding element (or of the first positioned one) so that it stays
        # in the output instead of being dropped.
        carry = known[0]
        placed = []
        for y, el in zip(tops, elements):
            if y is not None:
                carry = y
            placed.append((carry, el))
        placed.sort(key=lambda pair: pair[0])

        # Start a new group whenever the vertical gap exceeds the threshold.
        groups = [[placed[0][1]]]
        last_y = placed[0][0]
        for y, el in placed[1:]:
            if y - last_y > gap_threshold:
                groups.append([el])
            else:
                groups[-1].append(el)
            last_y = y

        # Post-process: split groups that go from multi-column to single-column.
        final_groups = []
        for group in groups:
            final_groups.extend(self._split_by_column_transition(group))

        logger.debug(
            "Split %d elements into %d groups (threshold: %.1fpx)",
            len(elements), len(final_groups), gap_threshold,
        )
        return final_groups

    def _split_by_column_transition(self, elements: List["Tag"]) -> List[List["Tag"]]:
        """
        Split a group if it transitions from multi-column (table) to single-column (prose).

        This handles cases where a table is followed immediately by paragraph text
        without a large Y-gap between them. Groups of fewer than 6 elements are
        never split.

        Args:
            elements: List of elements in a group

        Returns:
            List of split groups (or original group if no transition found)
        """
        if len(elements) < 6:
            return [elements]

        located = []  # (left, top, el) for elements with both offsets
        keyed = []    # (effective top, el) for every element
        carry = float("-inf")
        for el in elements:
            left = self._px(self._LEFT_PX_RE, el)
            top = self._px(self._TOP_PX_RE, el)
            if top is not None:
                carry = top
            # Elements lacking `top` ride along with the preceding element.
            keyed.append((carry, el))
            if left is not None and top is not None:
                located.append((left, top, el))

        if not located:
            return [elements]

        located.sort(key=lambda item: item[1])

        # Group into rows by Y position (15px tolerance against the previous element).
        rows = [[located[0]]]
        last_y = located[0][1]
        for item in located[1:]:
            if abs(item[1] - last_y) <= 15:
                rows[-1].append(item)
            else:
                rows.append([item])
            last_y = item[1]

        # Number of distinct X positions per row.
        col_counts = [len({left for left, _, _ in row}) for row in rows]

        split_point = None
        for i in range(len(rows) - 3):  # Need at least 3 rows after split
            if col_counts[i] >= 2 and col_counts[i + 1] == 1:
                # Confirm the prose pattern: at least 2 of the next (up to) 3
                # rows must be single-column.
                following_single = sum(1 for c in col_counts[i + 1:i + 4] if c == 1)
                if following_single >= 2:
                    split_point = i + 1
                    logger.debug(
                        "Column transition detected at row %d (%d cols -> %d col)",
                        i + 1, col_counts[i], col_counts[i + 1],
                    )
                    break

        if split_point is None:
            return [elements]

        # Y coordinate of the first element in the transition row.
        split_y = rows[split_point][0][1]

        # Partition every element, including those without a `left` offset,
        # so a split never loses content.
        keyed.sort(key=lambda pair: pair[0])
        group1 = [el for y, el in keyed if y < split_y]
        group2 = [el for y, el in keyed if y >= split_y]

        result = [g for g in (group1, group2) if g]
        return result if result else [elements]

    def _process_absolutely_positioned_container(self, container: "Tag", page_num: int) -> int:
        """
        Handle containers with absolutely positioned children.

        Step 1: Extract absolutely positioned elements
        Step 2: Split into separate groups by Y-coordinate gaps AND column transitions
        Step 3: Process each group independently (table or text)

        Args:
            container: The container element
            page_num: Current page number

        Returns:
            Updated page number
        """
        positioned_children = self._extract_absolutely_positioned_children(container)

        if not positioned_children:
            # Nothing positioned with text: stream the children normally.
            current = page_num
            for child in container.children:
                current = self._stream_pages(child, current)
            return current

        for group in self._split_positioned_groups(positioned_children):
            table_parser = AbsolutelyPositionedTableParser(group)

            if table_parser.is_table_like():
                self.includes_table = True
                rendered = table_parser.to_markdown()
            else:
                rendered = table_parser.to_text()

            if not rendered:
                continue

            # Each group is its own block: a Markdown table glued to preceding
            # text is not recognised as a table, and text left unterminated
            # would run into whatever follows the container.
            self._blankline_before(page_num)
            self._append(page_num, rendered)
            self._blankline_after(page_num)

        return page_num

    # ------------------------------------------------------------------
    # DOM walk
    # ------------------------------------------------------------------

    def _stream_pages(self, root: Union["Tag", "NavigableString"], page_num: int = 1) -> int:
        """Walk the DOM once; split only on CSS break styles.

        Returns the page number in effect after ``root`` has been processed.
        """
        if isinstance(root, NavigableString):
            t = self._process_text_node(root)
            if t:
                self._append(page_num, t + " ")
            return page_num

        if not isinstance(root, Tag):
            return page_num

        # The break is honoured before the hidden check, so a hidden element
        # carrying a break style still advances the page.
        if self._has_break_before(root):
            page_num += 1

        if self._is_hidden(root):
            return page_num

        is_absolutely_positioned = self._is_absolutely_positioned(root)
        has_positioned_children = not is_absolutely_positioned and any(
            isinstance(child, Tag) and self._is_absolutely_positioned(child)
            for child in root.children
        )

        if has_positioned_children and root.name == "div":
            # Special handling for absolutely positioned layouts.
            current = self._process_absolutely_positioned_container(root, page_num)
            if self._has_break_after(root):
                current += 1
            return current

        # Inline-display and absolutely positioned elements do not open blocks;
        # neither do br/hr.
        is_block = (
            self._is_block(root)
            and root.name not in {"br", "hr"}
            and not self._is_inline_display(root)
            and not is_absolutely_positioned
        )

        if is_block:
            self._blankline_before(page_num)

        # Tables and lists are rendered atomically.
        if root.name in {"table", "ul", "ol"}:
            t = self._process_element(root)
            if t:
                self._append(page_num, t)
                self._blankline_after(page_num)
            if self._has_break_after(root):
                page_num += 1
            return page_num

        # Inline bold/italic wrappers are rendered atomically as well.
        if not is_block and self._wrap_markdown(root):
            t = self._process_element(root)
            if t:
                self._append(page_num, t + " ")
            if self._has_break_after(root):
                page_num += 1
            return page_num

        # Everything else: recurse, letting children move the page forward.
        current = page_num
        for child in root.children:
            current = self._stream_pages(child, current)

        if is_block:
            self._blankline_after(current)

        if self._has_break_after(root):
            current += 1

        return current

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get_pages(self) -> List["Page"]:
        """Get parsed pages as Page objects.

        Each call resets the page buffers and ``includes_table`` and walks the
        document again from ``<body>`` (or from the soup root if there is none).
        """
        self.pages = defaultdict(list)
        self.includes_table = False
        root = self.soup.body if self.soup.body else self.soup
        self._stream_pages(root, page_num=1)

        result: List["Page"] = []
        for page_num in sorted(self.pages):
            # Collapse excessive newlines.
            raw = re.sub(r"\n{3,}", "\n\n", "".join(self.pages[page_num]))

            # Trim each line and keep at most one consecutive empty line.
            lines: List[str] = []
            for line in raw.split("\n"):
                line = line.strip()
                if line or (lines and lines[-1]):
                    lines.append(line)

            result.append(Page(number=page_num, content="\n".join(lines).strip()))

        # Content-loss watchdog: compare output size with the input text size.
        if self.input_char_count > 0:
            total_output_chars = sum(len(p.content) for p in result)
            retention_ratio = total_output_chars / self.input_char_count
            if retention_ratio < 0.95:
                # Reported at debug level only, so the low-retention case is
                # visible when diagnosing without adding noise by default.
                logger.debug(
                    "Content retention below 95%%: %.1f%% (input: %d chars, output: %d chars)",
                    100 * retention_ratio, self.input_char_count, total_output_chars,
                )
            else:
                logger.debug("✓ Content retention: %.1f%%", 100 * retention_ratio)

        return result

    def _effective_rows(self, table: "Tag") -> List[List["Tag"]]:
        """Return rows that have at least one non-empty td/th."""
        rows = []
        for tr in table.find_all("tr", recursive=True):
            # Prefer direct cells; fall back to any descendant cells.
            cells = tr.find_all(["td", "th"], recursive=False) or tr.find_all(["td", "th"], recursive=True)
            if any(self._clean_text(c.get_text(" ", strip=True)) for c in cells):
                rows.append(cells)
        return rows

    def _one_row_table_to_text(self, cells: List["Tag"]) -> str:
        """Flatten a 1-row table to plain text; upgrade to header when possible."""
        texts = [self._clean_text(c.get_text(" ", strip=True)) for c in cells]
        # Ignore empty spacer cells so a leading blank cell does not hide an
        # Item/Part label sitting in the next cell.
        filled = [t for t in texts if t]
        if not filled:
            return ""

        first = filled[0]
        if (m := ITEM_HEADER_CELL_RE.match(first)):
            num = m.group(1).upper()
            title = filled[1] if len(filled) > 1 else ""
            return f"ITEM {num}. {title}".strip()

        if (m := PART_HEADER_CELL_RE.match(first)):
            roman = m.group(1).upper()
            return f"PART {roman}"

        # Generic flatten (avoids markdown pipes which might be misread later).
        return " ".join(filled).strip()

    def markdown(self) -> str:
        """Get full document as markdown string."""
        return "\n\n".join(page.content for page in self.get_pages() if page.content)