sec2md 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sec2md might be problematic. Click here for more details.

sec2md/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ """sec2md: Convert SEC filings to high-quality Markdown."""
2
+
3
+ from sec2md.core import convert_to_markdown
4
+ from sec2md.utils import flatten_note
5
+ from sec2md.sections import extract_sections, get_section
6
+ from sec2md.chunking import chunk_pages, chunk_section
7
+ from sec2md.models import Page, Section, Item10K, Item10Q, FilingType
8
+ from sec2md.chunker.markdown_chunk import MarkdownChunk
9
+
10
# Package version string.
__version__ = "0.1.0"

# Names re-exported as the public API of the package, grouped by the
# submodule each one is imported from at the top of this file.
__all__ = [
    # sec2md.core
    "convert_to_markdown",
    # sec2md.utils
    "flatten_note",
    # sec2md.sections
    "extract_sections",
    "get_section",
    # sec2md.chunking
    "chunk_pages",
    "chunk_section",
    # sec2md.models
    "Page",
    "Section",
    "Item10K",
    "Item10Q",
    "FilingType",
    # sec2md.chunker.markdown_chunk
    "MarkdownChunk",
]
@@ -0,0 +1,622 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from bs4 import Tag
5
+ from collections import defaultdict
6
+ from typing import List, Optional, Tuple, Dict
7
+
8
# Matches a string that is, in its entirety, one numeric token such as
# "1,234.56", "(12.5%)", "$ 1,000" or "-5". The pattern is anchored with
# ^ and $, so text that merely contains a number does not match.
NUMERIC_RE = re.compile(
    r"""
    ^\s*
    [\(\[]?                 # optional opening paren/bracket
    [\-—–]?\s*              # optional dash
    [$€£¥]?\s*              # optional currency
    \d+(?:[.,]\d{3})*       # integer part (with or without thousands)
    (?:[.,]\d+)?            # decimals
    \s*%?                   # optional percent
    [\)\]]?\s*$             # optional closing paren/bracket
    """,
    re.VERBOSE,
)
18
+
19
+
20
def median(values: List[float]) -> float:
    """
    Return the median of ``values``.

    An empty list yields ``0.0``. For an odd number of items the middle
    element of the sorted values is returned as-is; for an even number the
    two middle elements are averaged.
    """
    if not values:
        return 0.0

    ordered = sorted(values)
    middle, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[middle]
    return (ordered[middle - 1] + ordered[middle]) / 2.0
29
+
30
+
31
class AbsolutelyPositionedTableParser:
    """
    Parser for pseudo-tables constructed from absolutely positioned div elements.

    These appear in some SEC filings where tables are rendered using position:absolute
    divs instead of proper HTML table elements.

    Elements are stored as ``(left, top, element)`` tuples. ``is_table_like``
    decides whether they form a grid; ``to_markdown`` renders that grid and
    ``to_text`` is the plain-text fallback.
    """

    def __init__(self, elements: List[Tag]):
        """
        Initialize with a list of absolutely positioned elements.

        Args:
            elements: List of Tag elements that are absolutely positioned
        """
        self.elements = elements
        self.positioned_elements = self._extract_positions()

    def _get_position(self, el: Tag) -> Optional[Tuple[float, float]]:
        """
        Extract the (left, top) pixel position from an element's inline style.

        Returns None when ``el`` is not a Tag or when either ``left`` or ``top``
        is missing from the style.
        """
        if not isinstance(el, Tag):
            return None
        style = el.get("style") or ""
        # The lookbehind keeps "margin-left:" / "padding-top:" and similar
        # hyphenated properties from being mistaken for the offsets.
        left_match = re.search(r'(?<![\w-])left:\s*(\d+(?:\.\d+)?)px', style)
        top_match = re.search(r'(?<![\w-])top:\s*(\d+(?:\.\d+)?)px', style)
        if left_match and top_match:
            return (float(left_match.group(1)), float(top_match.group(1)))
        return None

    def _clean_text(self, element: Tag) -> str:
        """Extract an element's text, drop zero-width characters and collapse whitespace."""
        text = element.get_text(separator=" ", strip=True)
        text = text.replace("\u200b", "").replace("\ufeff", "").replace("\xa0", " ")
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def _is_bold(self, el: Tag) -> bool:
        """
        Check if the element's inline style declares ``font-weight`` 700 or bold.

        Whitespace in the style is ignored, so ``font-weight: bold`` and
        ``font-weight:bold`` are treated the same.
        """
        style = re.sub(r'\s+', '', (el.get("style") or "").lower())
        return "font-weight:700" in style or "font-weight:bold" in style

    def _is_spacer(self, el: Tag) -> bool:
        """
        Detect inline-block spacer boxes that should be treated as spaces.

        These are common in PDF->HTML conversions: <div style="display:inline-block;width:5px">&nbsp;</div>

        An element counts as a spacer when it is ``display:inline-block``, is
        empty or contains a non-breaking space, and declares a width below 30px.
        """
        if not isinstance(el, Tag):
            return False

        style = re.sub(r'\s+', '', (el.get("style") or "").lower())
        markup = str(el)
        text = el.get_text(strip=True)
        has_nbsp = '\xa0' in markup or '&nbsp;' in markup
        # The lookbehind excludes "max-width:" / "border-width:"; fractional
        # pixel widths are accepted as well.
        width_match = re.search(r'(?<![\w-])width:(\d+(?:\.\d+)?)px', style)

        is_inline_block = 'display:inline-block' in style
        is_empty_or_nbsp = (not text) or has_nbsp
        is_narrow = width_match is not None and float(width_match.group(1)) < 30

        return is_inline_block and is_empty_or_nbsp and is_narrow

    def _contains_number(self, text: str) -> bool:
        """
        Check whether ``text`` is a numeric value.

        NUMERIC_RE is anchored at both ends, so the whole string must be a
        single number (optionally with sign, currency, percent or brackets).
        """
        return bool(NUMERIC_RE.search(text))

    def _merge_cell_text(self, items: List[Tuple[float, float, Tag]]) -> str:
        """
        Concatenate the text of ``items`` in the given order.

        Spacer boxes become a single space, but only once some text has been
        emitted. Bold elements are wrapped in ``**``. Returns "" when no
        element contributes text.
        """
        parts: List[str] = []
        for _, _, element in items:
            if self._is_spacer(element):
                if parts:
                    parts.append(" ")
                continue
            text = self._clean_text(element)
            if text:
                parts.append(f"**{text}**" if self._is_bold(element) else text)
        return "".join(parts)

    def _extract_positions(self) -> List[Tuple[float, float, Tag]]:
        """
        Collect ``(left, top, element)`` for every usable element.

        Spacer boxes are kept whenever they have a position; every other
        element is kept only if it has both a position and non-empty text.
        """
        positioned = []
        for el in self.elements:
            pos = self._get_position(el)

            if self._is_spacer(el):
                # Spacers are stored as-is; consumers call _is_spacer again
                # to recognise them and emit a space.
                if pos:
                    positioned.append((pos[0], pos[1], el))
                continue

            if pos and self._clean_text(el):
                positioned.append((pos[0], pos[1], el))
        return positioned

    def _filter_table_content(self, elements: List[Tuple[float, float, Tag]]) -> List[Tuple[float, float, Tag]]:
        """
        Filter out title/caption text that appears before the actual table.

        Tables often have introductory text like "The following table sets forth..."
        This should be excluded from table detection and rendering.

        The table is assumed to start at the first row (15px Y clustering)
        holding at least three elements; anything more than 30px above that
        row is dropped. The input is returned unchanged when it has fewer than
        10 elements, when no such row exists, or when fewer than 6 elements
        would remain.
        """
        if len(elements) < 10:
            return elements  # Too small to have significant leading text

        y_clusters = self._cluster_by_eps([top for _, top, _ in elements], eps=15)

        rows = defaultdict(list)
        for left, top, el in elements:
            rows[y_clusters[top]].append((left, top, el))

        # Visit rows top to bottom and stop at the first one that looks tabular.
        table_start_y = None
        for _, row_elements in sorted(rows.items(), key=lambda x: min(t for _, t, _ in x[1])):
            if len(row_elements) >= 3:
                table_start_y = min(top for _, top, _ in row_elements)
                break

        if table_start_y is None:
            return elements  # Couldn't identify table start, return all

        filtered = [(l, t, e) for l, t, e in elements if t >= table_start_y - 30]

        return filtered if len(filtered) >= 6 else elements  # Sanity check

    def _cluster_by_eps(self, values: List[float], eps: float) -> Dict[float, int]:
        """
        Cluster positions within epsilon tolerance.

        Values are visited in ascending order; a new cluster starts whenever a
        value lies more than ``eps`` above the first value (anchor) of the
        current cluster, so jittered coordinates such as 100.0, 100.5, 101.2
        share a cluster.

        Args:
            values: List of coordinate values
            eps: Epsilon tolerance (pixels)

        Returns:
            Dictionary mapping value -> cluster_id (ids are 0, 1, 2, ... in
            ascending coordinate order)
        """
        if not values:
            return {}

        sorted_vals = sorted(set(values))
        cluster_id = 0
        clusters = {}
        anchor = sorted_vals[0]

        for val in sorted_vals:
            if val - anchor > eps:
                cluster_id += 1
                anchor = val
            clusters[val] = cluster_id

        return clusters

    def is_table_like(self) -> bool:
        """
        Determine if the positioned elements form a table-like structure.

        This uses multiple heuristics to distinguish actual data tables from
        normal paragraph text that happens to be absolutely positioned: a
        minimum element count, at least 2 rows and 2 columns, a minimum share
        of numeric cells, short average cell text, few sentence-like cells,
        sufficient grid density, at least 2 elements per row on average, and
        at least one column that is mostly numeric.

        Returns:
            True if elements appear to form a table, False otherwise
        """
        if len(self.positioned_elements) < 6:  # Need at least a 2x3 table
            return False

        # Filter out caption/title text
        filtered_elements = self._filter_table_content(self.positioned_elements)
        total = len(filtered_elements)
        if total < 6:
            return False

        # Cluster with epsilon tolerance (12px for rows, 50px for columns)
        y_clusters = self._cluster_by_eps([top for _, top, _ in filtered_elements], eps=12)
        x_clusters = self._cluster_by_eps([left for left, _, _ in filtered_elements], eps=50)

        n_rows = len(set(y_clusters.values()))
        n_cols = len(set(x_clusters.values()))
        if n_rows < 2 or n_cols < 2:
            return False

        # Clean each element's text once and reuse it for every heuristic.
        cell_texts = [self._clean_text(el) for _, _, el in filtered_elements]
        numeric_flags = [
            (not self._is_spacer(el)) and self._contains_number(text)
            for (_, _, el), text in zip(filtered_elements, cell_texts)
        ]

        # At least 20% of cells should contain numbers
        if sum(numeric_flags) / total < 0.20:
            return False

        # If average cell is > 50 characters, probably paragraph text, not a table
        if sum(len(text) for text in cell_texts) / total > 50:
            return False

        # If >40% of cells have periods and long text, probably prose
        prose_like = sum(1 for text in cell_texts if '.' in text and len(text) > 20)
        if prose_like / total > 0.40:
            return False

        # Less than 25% of the grid filled = probably not a table
        if total / (n_rows * n_cols) < 0.25:
            return False

        # Rows should hold at least 2 elements on average
        row_counts = defaultdict(int)
        for _, top, _ in filtered_elements:
            row_counts[y_clusters[top]] += 1
        if not row_counts or sum(row_counts.values()) / len(row_counts) < 2:
            return False

        # At least one column with 2+ elements must be more than half numeric
        col_flags = defaultdict(list)
        for (left, _, _), flag in zip(filtered_elements, numeric_flags):
            col_flags[x_clusters[left]].append(flag)

        return any(
            len(flags) >= 2 and sum(flags) / len(flags) > 0.5
            for flags in col_flags.values()
        )

    def to_grid(self) -> Optional[List[List[List[Tuple[float, float, Tag]]]]]:
        """
        Convert positioned elements to a 2D grid structure.

        Returns:
            2D grid where each cell contains a list of (left, top, element) tuples
            sorted by ``left``, or None if structure is not table-like
        """
        if not self.is_table_like():
            return None

        # Same filtering and clustering parameters as is_table_like.
        filtered_elements = self._filter_table_content(self.positioned_elements)
        y_clusters = self._cluster_by_eps([top for _, top, _ in filtered_elements], eps=12)
        x_clusters = self._cluster_by_eps([left for left, _, _ in filtered_elements], eps=50)

        # Map cluster ids to 0-based grid indices once, not once per element.
        row_index = {cid: i for i, cid in enumerate(sorted(set(y_clusters.values())))}
        col_index = {cid: i for i, cid in enumerate(sorted(set(x_clusters.values())))}

        grid = [[[] for _ in range(len(col_index))] for _ in range(len(row_index))]
        for left, top, element in filtered_elements:
            row = row_index[y_clusters[top]]
            col = col_index[x_clusters[left]]
            grid[row][col].append((left, top, element))

        # Order the elements inside each cell left to right.
        for grid_row in grid:
            for cell_elements in grid_row:
                cell_elements.sort(key=lambda x: x[0])

        return grid

    def to_markdown(self) -> str:
        """
        Convert to markdown table format.

        The first grid row is used as the header row. Pipe characters inside
        cells are escaped, and the result is passed through
        ``_clean_markdown_table``.

        Returns:
            Markdown table string, or empty string if not table-like
        """
        grid = self.to_grid()
        if grid is None:
            return ""

        text_grid = [
            [self._merge_cell_text(cell_elements) for cell_elements in row]
            for row in grid
        ]
        if not text_grid:
            return ""

        n_cols = len(text_grid[0])

        lines = []
        for i, row in enumerate(text_grid):
            escaped_row = [cell.replace("|", "\\|") for cell in row]
            lines.append("| " + " | ".join(escaped_row) + " |")

            # Add separator after first row (header)
            if i == 0:
                lines.append("| " + " | ".join(["---"] * n_cols) + " |")

        return self._clean_markdown_table("\n".join(lines))

    def _clean_markdown_table(self, markdown: str) -> str:
        """
        Clean up markdown table by removing junk rows and empty columns.

        Rows after the separator are dropped when they are empty, hold a
        single short (<5 chars) or single very long (>100 chars) cell, or
        start with a footnote marker such as "(a)". Columns that are empty in
        every remaining row are removed. The input is returned unchanged when
        it has fewer than 3 lines, no separator row, fewer than 3 rows after
        cleaning, or no column with content.

        Args:
            markdown: Raw markdown table string

        Returns:
            Cleaned markdown table string
        """
        if not markdown:
            return ""

        lines = markdown.strip().split('\n')
        if len(lines) < 3:  # Need at least header, separator, one data row
            return markdown

        # Split on unescaped pipes only, so "\|" inside a cell stays in that
        # cell; the first and last pieces are the text outside the outer pipes.
        rows = [
            [cell.strip() for cell in re.split(r'(?<!\\)\|', line)[1:-1]]
            for line in lines
        ]

        # The separator is the first row made of '---' cells. Requiring a
        # real '---' keeps an all-empty data row from being taken for it.
        separator_idx = next(
            (
                i for i, cells in enumerate(rows)
                if '---' in cells and all(c in ('---', '') for c in cells)
            ),
            -1,
        )
        if separator_idx < 0:
            return markdown

        def is_junk_row(row, row_idx):
            if row_idx <= separator_idx:  # Keep header rows
                return False

            non_empty = [c for c in row if c and c != '---']
            if not non_empty:
                return True
            if len(non_empty) == 1 and len(non_empty[0]) < 5:  # Single short cell (like page number)
                return True

            # Footnote rows start with (a), (b), etc.
            first_non_empty = next((c for c in row if c), "")
            if re.match(r'^\([a-z]\)', first_non_empty):
                return True

            # One very long cell (footnote text) and rest empty
            return len(non_empty) == 1 and len(non_empty[0]) > 100

        cleaned_rows = [row for i, row in enumerate(rows) if not is_junk_row(row, i)]
        if len(cleaned_rows) < 3:
            return markdown

        # Junk rows are only removed below the separator, so separator_idx
        # still points at the separator inside cleaned_rows.
        n_cols = len(cleaned_rows[0])
        col_has_content = [False] * n_cols
        for row_idx, row in enumerate(cleaned_rows):
            if row_idx == separator_idx:
                continue
            for col_idx, cell in enumerate(row[:n_cols]):
                if cell and cell != '---':
                    col_has_content[col_idx] = True

        cols_to_keep = [i for i in range(n_cols) if col_has_content[i]]
        if not cols_to_keep:
            return markdown

        return "\n".join(
            "| " + " | ".join(row[i] if i < len(row) else "" for i in cols_to_keep) + " |"
            for row in cleaned_rows
        )

    def _join_lines(self, prev: str, current: str, gap: float, median_gap: float) -> Tuple[str, bool]:
        """
        Smart line joining with hyphenation handling.

        Args:
            prev: Previous line text
            current: Current line text
            gap: Vertical gap between lines (pixels)
            median_gap: Median line gap in document

        Returns:
            Tuple of (joined_text, should_add_newline). When the flag is True
            ``joined_text`` is ``prev`` unchanged and the caller is expected
            to emit ``current`` as a separate line.
        """
        # Hyphenated word continuation
        if prev.endswith('-'):
            if current and current[0].islower():
                # Likely a word split across lines: drop the hyphen and join directly
                return (prev[:-1] + current, False)
            # Keep hyphen but join with space (e.g., "end-of-year Statement")
            return (prev + " " + current, False)

        # No terminal punctuation means the previous line probably continues
        ends_with_continuation = not prev.rstrip().endswith(('.', '!', '?', ':', ';', ')', ']'))

        # Small gap + continuation = join with space
        if ends_with_continuation and gap < 1.4 * median_gap:
            return (prev + " " + current, False)

        # Otherwise, separate with newline
        return (prev, True)

    def to_text(self) -> str:
        """
        Convert to plain text format (fallback if not table-like).
        Preserves bold formatting and handles hyphenation.

        Elements are grouped into rows (5px tolerance between consecutive
        elements), each row is rendered left to right, and consecutive rows
        are merged through ``_join_lines``. Short all-bold rows are treated as
        headers and preceded by a blank line.

        Returns:
            Text representation with elements sorted by position and formatting preserved
        """
        # Sort by vertical then horizontal position
        sorted_elements = sorted(self.positioned_elements, key=lambda x: (x[1], x[0]))
        if not sorted_elements:
            return ""

        # Median vertical step between consecutive elements, ignoring steps
        # of 1px or less. Fall back to 15px when no usable step exists so the
        # joining threshold never collapses to zero.
        y_coords = [top for _, top, _ in sorted_elements]
        line_gaps = [b - a for a, b in zip(y_coords, y_coords[1:]) if b - a > 1]
        median_line_gap = median(line_gaps) if line_gaps else 15.0

        rows = []
        current_row = []
        last_top = None
        for left, top, element in sorted_elements:
            if last_top is not None and abs(top - last_top) > 5:
                # More than 5px below the previous element: start a new row
                rows.append(current_row)
                current_row = []
            current_row.append((left, top, element))
            last_top = top
        rows.append(current_row)

        lines: List[str] = []
        for i, row in enumerate(rows):
            # Sort by horizontal position within row
            row.sort(key=lambda x: x[0])
            line = self._merge_cell_text(row)
            if not line:
                continue

            if not lines:
                # First visible line, even if earlier rows held only spacers
                lines.append(line)
                continue

            gap = abs(row[0][1] - rows[i - 1][0][1])
            prev_line = lines[-1]

            # A header is a short row in which every text element is bold
            content = [el for _, _, el in row if not self._is_spacer(el)]
            is_header = (
                any(self._is_bold(el) for el in content)
                and all(self._is_bold(el) for el in content if self._clean_text(el))
                and len(line) < 80
            )

            if is_header and not prev_line.endswith('-'):
                # Add blank line before header
                lines.append("")
                lines.append(line)
                continue

            joined_text, needs_newline = self._join_lines(prev_line, line, gap, median_line_gap)
            lines[-1] = joined_text
            if needs_newline:
                lines.append(line)

        return "\n".join(lines)
File without changes