sec2md 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sec2md might be problematic. Click here for more details.

sec2md/table_parser.py ADDED
@@ -0,0 +1,386 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import logging
5
+ from bs4 import Tag
6
+ from dataclasses import dataclass
7
+ from typing import List, Optional
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ BULLETS = {"•", "●", "◦", "–", "-", "—", "·", ""}
12
+
13
# Verbose-mode pattern recognising a whole cell as a single numeric value,
# in the forms SEC filings commonly use.
# Examples that match: $1,234.56, (1,234), -1234, 12.5%, etc.
NUMERIC_RE = re.compile(r"""
    ^\s*
    [\(\[]? # optional opening paren/bracket
    [\-—–]?\s* # optional dash
    [$€£¥]?\s* # optional currency
    \d+(?:[.,]\d{3})* # integer part (with or without thousands)
    (?:[.,]\d+)? # decimals
    \s*%? # optional percent
    [\)\]]?\s*$ # optional closing paren/bracket
    """, re.X)
25
+
26
+
27
@dataclass
class Cell:
    """One source table cell: its text and the spans it declares.

    Attributes are plain data; the dataclass-generated ``__repr__`` already
    renders ``Cell(text=..., rowspan=..., colspan=...)``, so no custom one is
    defined.
    """

    # Visible text of the cell
    text: str
    # Number of rows the cell covers (1 when unspecified)
    rowspan: int = 1
    # Number of columns the cell covers (1 when unspecified)
    colspan: int = 1

    def __bool__(self) -> bool:
        """A cell is truthy only when it holds non-whitespace text."""
        return bool(self.text.strip())
39
+
40
+
41
class GridCell:
    """One slot of the laid-out grid.

    A slot either holds the originating cell itself or is a placeholder
    marking a position covered by that cell's row/column span.
    """

    def __init__(self, cell: Cell, is_spanning: bool = False):
        self.cell = cell
        self.is_spanning = is_spanning

    @property
    def text(self) -> str:
        """Text of the underlying cell; span placeholders report ``""``."""
        if self.is_spanning:
            return ""
        return self.cell.text

    def __bool__(self) -> bool:
        """Truthy only when the slot exposes non-whitespace text."""
        visible = self.text.strip()
        return bool(visible)

    def __repr__(self) -> str:
        return f"GridCell(cell={self.cell!r}, is_spanning={self.is_spanning})"
57
+
58
+
59
class TableParser:
    """Turn one BeautifulSoup ``<table>`` tag into a text grid and a Markdown table.

    On construction the tag is read into ``self.cells`` (one list of ``Cell``
    per ``<tr>``) and laid out into ``self.grid``: rows of ``GridCell``/``None``
    in which spans are expanded, text-less rows/columns are dropped, and
    helper columns (blanks, ``$``, ``%``, footnote markers) are folded into
    their left neighbour.
    """

    # Upper bounds for span attributes (the HTML Standard clamps colspan to
    # 1000 and rowspan to 65534). Without a cap, one hostile attribute value
    # could make the grid arbitrarily large.
    _MAX_COLSPAN = 1000
    _MAX_ROWSPAN = 65534

    # Leading (optionally "+"-signed) run of ASCII digits of an attribute value.
    _LEADING_INT = re.compile(r"\s*\+?(\d+)", re.ASCII)
    # A run of line breaks together with the whitespace around it.
    _LINE_BREAKS = re.compile(r"\s*[\r\n]+\s*")

    def __init__(self, table_element: Tag):
        """
        Initialize table from a BS4 table tag

        Args:
            table_element: The specific table BS4 tag

        Raises:
            ValueError: If ``table_element`` is not a ``<table>`` tag
        """
        if not isinstance(table_element, Tag) or table_element.name != 'table':
            raise ValueError("table_element must be a table tag")

        self.table_element = table_element

        self.cells = self._extract_cells()
        self.grid = self._create_grid()

    def _extract_cells(self) -> List[List[Cell]]:
        """Read every ``<tr>`` into a list of ``Cell`` objects.

        Rows without any ``<td>``/``<th>`` are skipped. If nothing is found a
        single row holding one empty cell is returned, so callers never see
        an empty list.
        """
        # NOTE(review): both find_all calls are recursive, so rows and cells
        # of tables nested inside this one are collected as well (and the
        # enclosing cell's text already contains theirs). Left as-is because
        # callers may rely on it -- confirm before restricting to own rows.
        rows: List[List[Cell]] = []
        for tr in self.table_element.find_all('tr'):
            row: List[Cell] = []
            for td in tr.find_all(['td', 'th']):
                text = td.get_text(separator=" ", strip=True).replace('\xa0', ' ')
                if not text and td.find('img'):
                    # Text-less cell holding an image: represent it with a
                    # bullet glyph that is a member of BULLETS.
                    text = '●'
                rowspan = min(self._safe_parse_int(td.get('rowspan')), self._MAX_ROWSPAN)
                colspan = min(self._safe_parse_int(td.get('colspan')), self._MAX_COLSPAN)
                row.append(Cell(text=text, rowspan=rowspan, colspan=colspan))
            if row:
                rows.append(row)
        return rows or [[Cell(text="")]]

    @staticmethod
    def _safe_parse_int(value: Optional[str], default: int = 1) -> int:
        """Parse a span attribute, returning ``default`` when it is unusable.

        Only the leading run of digits is read (``"2"`` -> 2, ``"1.5"`` -> 1,
        ``" 3px"`` -> 3). Missing values, non-strings, values that do not
        start with a digit, and values below 1 all yield ``default``; a span
        of zero would otherwise produce a cell that occupies no column.
        """
        if not isinstance(value, str):
            return default
        match = TableParser._LEADING_INT.match(value)
        if match is None:
            return default
        try:
            parsed = int(match.group(1))
        except ValueError:
            # e.g. a digit string longer than the interpreter's int limit
            return default
        return parsed if parsed >= 1 else default

    def _create_grid(self) -> List[List[GridCell]]:
        """Lay ``self.cells`` out on a rectangular grid, expanding spans.

        Each cell goes into the first free column of its row. Every other
        position covered by its rowspan/colspan receives a placeholder
        ``GridCell(cell, is_spanning=True)``. The grid is as wide as the
        right-most occupied column, so a cell pushed right by a rowspan from
        an earlier row is kept (rows ``[A rowspan=2]`` / ``[B, C]`` give
        three columns). Positions nothing covers stay ``None``.

        The laid-out grid is then passed through ``_clean_grid`` and
        ``_merge_grid``.
        """
        if not self.cells:
            return []

        n_rows = len(self.cells)
        occupied = {}  # (row, col) -> GridCell
        width = 0

        for i, row in enumerate(self.cells):
            col = 0
            for cell in row:
                # Skip positions already taken, e.g. by a rowspan from above
                while (i, col) in occupied:
                    col += 1

                occupied[(i, col)] = GridCell(cell)

                # Rowspans reaching past the last row are cut off
                for r in range(min(cell.rowspan, n_rows - i)):
                    for c in range(cell.colspan):
                        if r == 0 and c == 0:  # the main cell itself
                            continue
                        occupied.setdefault(
                            (i + r, col + c), GridCell(cell, is_spanning=True)
                        )

                col += cell.colspan
                width = max(width, col)

        grid = [[occupied.get((i, j)) for j in range(width)] for i in range(n_rows)]

        grid = self._clean_grid(grid)
        grid = self._merge_grid(grid)

        return grid

    def _should_merge_cells(self, val1: Optional[GridCell], val2: Optional[GridCell]) -> bool:
        """Decide whether two horizontally adjacent cells may be fused.

        Fusing is allowed when either side is missing/blank, when the right
        cell is a footnote marker such as ``[1]``, when the left cell is a
        lone ``$``, or when the right cell is a lone ``%``.
        """
        if not val1 or not val2:
            return True

        s1 = val1.text.strip()
        s2 = val2.text.strip()

        if not s1 or not s2:
            return True

        return self.is_footnote(s2) or s1 == '$' or s2 == '%'

    @staticmethod
    def is_footnote(text: str) -> bool:
        """Check if string is a number or letter within square brackets and nothing else, e.g., [1], [b]"""
        pattern = r'^\[[a-zA-Z0-9]+\]$'
        return bool(re.match(pattern, text))

    @staticmethod
    def _clean_grid(grid: List[List[GridCell]]) -> List[List[GridCell]]:
        """Drop rows and columns in which no cell exposes any text."""
        if not grid:
            return grid

        def has_text(cell: Optional[GridCell]) -> bool:
            return cell is not None and bool(cell.text.strip())

        rows_to_keep = [i for i, row in enumerate(grid) if any(has_text(c) for c in row)]
        columns_to_keep = [
            j for j in range(len(grid[0]))
            if any(has_text(row[j]) for row in grid)
        ]

        return [[grid[i][j] for j in columns_to_keep] for i in rows_to_keep]

    @staticmethod
    def _merge_pair(left: Optional[GridCell], right: Optional[GridCell]) -> Optional[GridCell]:
        """Fuse two adjacent cells: keep the non-blank one, or join both texts."""
        if not left:
            return right
        if not right:
            return left
        text = f"{left.text} {right.text}".strip()
        return GridCell(Cell(text=text))

    def _merge_grid(self, grid: List[List[GridCell]]) -> List[List[GridCell]]:
        """Fold helper columns into their left neighbour in a single pass.

        Walking left to right, a column is fused into the running column when
        every pair of cells below the first row satisfies
        ``_should_merge_cells``. The first row is treated as the header and
        is not part of that test.

        A grid with only one row is returned unchanged: with no data rows the
        test is vacuously true, and merging would collapse the whole row into
        its first cell.
        """
        if not grid or not grid[0]:
            return grid
        if len(grid) < 2:
            return grid

        result = []
        current_col = None

        for col_idx in range(len(grid[0])):
            col = [row[col_idx] for row in grid]

            if current_col is None:
                current_col = col
                continue

            cell_pairs = list(zip(current_col[1:], col[1:]))
            if all(self._should_merge_cells(c1, c2) for c1, c2 in cell_pairs):
                # Keep the running header; fall back to the incoming one only
                # when the running header is blank, so its text is not lost.
                # NOTE(review): when both headers carry text the incoming one
                # is still discarded, as before.
                header = current_col[0]
                if not header and col[0]:
                    header = col[0]
                current_col = [header] + [self._merge_pair(c1, c2) for c1, c2 in cell_pairs]
            else:
                result.append(current_col)
                current_col = col

        if current_col is not None:
            result.append(current_col)

        # result is column-major; transpose back to rows
        return list(map(list, zip(*result)))

    def to_matrix(self) -> List[List[str]]:
        """Convert grid to text matrix (``""`` for blank or missing slots)."""
        return [[cell.text if cell else "" for cell in row] for row in self.grid]

    def _normalize_text(self, text: str) -> str:
        """Return ``text`` as a stripped string with NBSP turned into spaces; ``None`` becomes ``""``."""
        if text is None:
            return ""
        return str(text).replace("\xa0", " ").strip()

    def _process_headers(self, matrix: List[List[str]]) -> tuple[List[str], List[List[str]]]:
        """
        Split a text matrix into a header row and data rows.

        The first row is the header. When the first row has many blanks
        (at least ``max(2, ncols // 2)``) and the second row has at least
        that many non-blank cells, the two rows are fused column by column
        (``"top — bottom"`` when both have text) and data starts at the
        third row.

        Returns:
            Tuple of (headers, data_rows)
        """
        if not matrix:
            return [], []

        if len(matrix) < 2:
            # Single row - treat as header with no data
            return [self._normalize_text(v) for v in matrix[0]], []

        ncols = len(matrix[0])
        threshold = max(2, ncols // 2)

        row0 = [self._normalize_text(v) for v in matrix[0]]
        row1 = [self._normalize_text(v) for v in matrix[1]]

        blanks_in_row0 = sum(1 for v in row0 if v == "")
        filled_in_row1 = sum(1 for v in row1 if v)

        if not (filled_in_row1 >= threshold and blanks_in_row0 >= threshold):
            # Use row0 as header, rest as data
            return row0, matrix[1:]

        fused = []
        for j in range(ncols):
            top = row0[j] if j < len(row0) else ""
            bot = row1[j] if j < len(row1) else ""
            if top and bot:
                fused.append(f"{top} — {bot}")
            else:
                fused.append(top or bot)
        return fused, matrix[2:]

    def _clean_empty_rows_and_cols(self, headers: List[str], data: List[List[str]]) -> tuple[List[str], List[List[str]]]:
        """Remove data rows with no text and columns with no text in any data row.

        Column emptiness is judged from the data rows only, so a column whose
        only text is its header is removed together with that header. When
        ``data`` is empty the inputs are returned untouched.
        """
        if not data:
            return headers, data

        ncols = len(headers)

        cleaned_data = [row for row in data if any(self._normalize_text(cell) for cell in row)]
        if not cleaned_data:
            return headers, []

        cols_with_content = {
            j
            for row in cleaned_data
            for j, cell in enumerate(row)
            if j < ncols and self._normalize_text(cell)
        }
        if not cols_with_content:
            return [], []

        cols_to_keep = sorted(cols_with_content)
        new_headers = [headers[j] for j in cols_to_keep if j < len(headers)]
        new_data = [[row[j] if j < len(row) else "" for j in cols_to_keep] for row in cleaned_data]

        return new_headers, new_data

    def _looks_like_list_table(self) -> bool:
        """Special case - some quirky files format lists as tables.

        True for a one-row table in which some cell is exactly a BULLETS
        glyph and some cell after the first has text.
        """
        if len(self.cells) != 1:
            return False
        texts = [c.text.strip() for c in self.cells[0]]
        has_bullet = any(t in BULLETS for t in texts)
        has_payload = any(t for t in texts[1:])
        return has_bullet and has_payload

    @staticmethod
    def _render_markdown(headers: List[str], data: List[List[str]]) -> str:
        """Render header and data rows as a pipe table.

        Each cell has ``|`` escaped and any line break (with the whitespace
        around it) replaced by one space, because a raw newline would end the
        table row. Rows are padded with ``""`` or truncated to the header
        length. The caller's lists are not modified.
        """
        def fmt(value) -> str:
            flat = TableParser._LINE_BREAKS.sub(" ", str(value))
            return flat.replace("|", "\\|")

        width = len(headers)
        lines = []

        if headers:
            lines.append("| " + " | ".join(fmt(h) for h in headers) + " |")
            lines.append("| " + " | ".join(["---"] * width) + " |")

        for row in data:
            padded = list(row[:width]) + [""] * (width - len(row))
            lines.append("| " + " | ".join(fmt(cell) for cell in padded) + " |")

        return "\n".join(lines)

    def to_markdown(self) -> str:
        """
        Convert table to markdown format.

        A one-row "list table" (see ``_looks_like_list_table``) becomes a
        single ``- item`` line using the last non-bullet text in the row.
        Anything else becomes a pipe table, or ``""`` when nothing remains
        after header processing and cleaning.

        Returns:
            Markdown table string
        """
        if self._looks_like_list_table():
            stripped = (c.text.strip() for c in reversed(self.cells[0]))
            payload = next((t for t in stripped if t and t not in BULLETS), "")
            return f"- {payload}" if payload else ""

        matrix = self.to_matrix()
        if not matrix:
            return ""

        headers, data = self._process_headers(matrix)
        headers, data = self._clean_empty_rows_and_cols(headers, data)

        if not headers and not data:
            return ""

        return self._render_markdown(headers, data)

    def md(self) -> str:
        """Alias for to_markdown() for backwards compatibility"""
        return self.to_markdown()
sec2md/utils.py ADDED
@@ -0,0 +1,109 @@
1
+ """Utility functions for fetching HTML."""
2
+
3
+ import requests
4
+ from typing import Optional
5
+ from urllib.parse import urlparse
6
+ from bs4 import BeautifulSoup
7
+
8
+
9
def is_url(source: str) -> bool:
    """
    Tell whether ``source`` parses as an absolute http(s) URL.

    Args:
        source: String to check

    Returns:
        True when ``urlparse`` finds both a scheme and a network location
        and the scheme is ``http`` or ``https``; False otherwise, including
        when parsing raises.
    """
    try:
        parts = urlparse(source)
        if not parts.scheme or not parts.netloc:
            return False
        return parts.scheme in ('http', 'https')
    except Exception:
        return False
24
+
25
+
26
def is_edgar_url(url: str) -> bool:
    """Check if URL points at sec.gov or one of its subdomains.

    The decision is made on the parsed hostname, so ``sec.gov`` appearing
    only in a path, query string, or inside another domain name
    (``notsec.gov.example.com``) does not count. Scheme-less input such as
    ``www.sec.gov/Archives/...`` is accepted.

    Args:
        url: URL (or scheme-less host/path) to inspect

    Returns:
        True when the host is ``sec.gov`` or ends with ``.sec.gov``
    """
    candidate = url.strip()
    if "://" not in candidate:
        # Without a scheme urlparse puts everything in the path; a leading
        # "//" makes it read the first segment as the network location.
        candidate = "//" + candidate.lstrip("/")
    try:
        host = urlparse(candidate).hostname or ""
    except ValueError:
        # Raised for malformed network locations (e.g. a bad IPv6 literal)
        return False
    host = host.rstrip(".").lower()
    return host == "sec.gov" or host.endswith(".sec.gov")
29
+
30
+
31
def fetch(url: str, user_agent: Optional[str] = None, *, timeout: float = 30) -> str:
    """
    Fetch HTML content from a URL.

    Args:
        url: The URL to fetch
        user_agent: User agent string (required for EDGAR URLs)
        timeout: Seconds passed to ``requests.get`` as its timeout
            (default 30, the value previously hard-coded)

    Returns:
        HTML content as string

    Raises:
        ValueError: If EDGAR URL is accessed without user_agent
        requests.RequestException: If request fails
    """
    # Fail fast with an actionable message before any request is sent
    if is_edgar_url(url) and not user_agent:
        raise ValueError(
            "SEC EDGAR requires a User-Agent header. "
            "Pass user_agent='YourName your@email.com'"
        )

    headers = {"User-Agent": user_agent} if user_agent else {}

    response = requests.get(url, headers=headers, timeout=timeout)
    # Turn 4xx/5xx responses into exceptions instead of returning error pages
    response.raise_for_status()
    return response.text
59
+
60
+
61
def flatten_note(content: str) -> Optional[str]:
    """
    Flatten note structure by removing the outer table wrapper.

    Notes to financial statements are often wrapped in an unnecessary outer table
    with a single cell containing the actual content. This function unwraps that
    outer table to reveal the properly structured content inside.

    Only the first table in the body is unwrapped, and only its own rows are
    used: rows that are direct children of the table or of its direct
    ``<thead>``/``<tbody>``/``<tfoot>`` sections, in document order. Rows of
    tables nested inside the cells are left alone. Each top-level cell is
    serialized with its own ``<td>``/``<th>`` tag and the results are
    concatenated.

    Args:
        content: HTML string containing the note content

    Returns:
        Flattened HTML string, or None if flattening fails (no body, no
        table, or no cells in the table's own rows)

    Example:
        >>> from edgar import Company, set_identity
        >>> import sec2md
        >>>
        >>> set_identity("Your Name <you@example.com>")
        >>> company = Company('AAPL')
        >>> filing = company.get_filings(form="10-K").latest()
        >>> notes = filing.reports.get_by_category("Notes")
        >>> note = notes.get_by_short_name("Revenue")
        >>>
        >>> # Flatten the note wrapper
        >>> flattened_html = sec2md.flatten_note(note.content)
        >>> md = sec2md.convert_to_markdown(flattened_html)
    """
    soup = BeautifulSoup(content, 'lxml')

    body = soup.find("body")
    if not body:
        return None

    table = body.find("table")
    if table is None:
        return None

    # Collect the table's own rows. An explicit <tbody>/<thead>/<tfoot> puts
    # one extra level between the table and its rows, so look inside those
    # sections too; recursive=False keeps nested tables' rows out.
    own_rows = []
    for child in table.find_all(['tr', 'thead', 'tbody', 'tfoot'], recursive=False):
        if child.name == 'tr':
            own_rows.append(child)
        else:
            own_rows.extend(child.find_all('tr', recursive=False))

    elements = [
        cell
        for row in own_rows
        for cell in row.find_all(['th', 'td'], recursive=False)
    ]

    if not elements:
        return None

    return ''.join(str(element) for element in elements)