sec2md 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sec2md might be problematic. Click here for more details.

@@ -0,0 +1,316 @@
1
from __future__ import annotations

import re
from typing import List, Dict, Optional, Literal

# Optional markdown bold markers ("**" / "__") and/or inline HTML tags that may
# precede a heading in the converted markdown.
LEAD_WRAP = r'(?:\*\*|__)?\s*(?:</?[^>]+>\s*)*'

# "PART I" / "PART IV" style roman-numeral headings at the start of a line.
PART_PATTERN = re.compile(
    rf'^\s*{LEAD_WRAP}(PART\s+[IVXLC]+)\b(?:\s*$|\s+)',
    re.IGNORECASE | re.MULTILINE
)
# "ITEM 1A." style headings: group(1)="ITEM", group(2)=number with optional
# letter suffix, group(3)=any trailing title text after an optional separator.
ITEM_PATTERN = re.compile(
    rf'^\s*{LEAD_WRAP}(ITEM)\s+(\d{{1,2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)',
    re.IGNORECASE | re.MULTILINE
)

# Running header/footer lines such as "Apple Inc. | 2023 Form 10-K | 12".
HEADER_FOOTER_RE = re.compile(
    r'^\s*(?:[A-Z][A-Za-z0-9 .,&\-]+)?\s*\|\s*\d{4}\s+Form\s+10-[KQ]\s*\|\s*\d+\s*$'
)
# "Page 3" / "Page 3 of 120" lines, or lines containing only a bare number.
PAGE_NUM_RE = re.compile(r'^\s*Page\s+\d+\s*(?:of\s+\d+)?\s*$|^\s*\d+\s*$', re.IGNORECASE)
# Bold markers at the very start or end of a line (stripped during cleaning).
MD_EDGE = re.compile(r'^\s*(?:\*\*|__)\s*|\s*(?:\*\*|__)\s*$')

# Unicode whitespace variants normalized away before parsing.
NBSP, NARROW_NBSP, ZWSP = '\u00A0', '\u202F', '\u200B'

DOT_LEAD_RE = re.compile(r'^.*\.{3,}\s*\d{1,4}\s*$', re.M)  # "... 123"
ITEM_ROWS_RE = re.compile(r'^\s*ITEM\s+\d{1,2}[A-Z]?\.?\b', re.I | re.M)

# Canonical PART -> [ITEM, ...] layout per filing type; used by
# SectionExtractor to validate and infer section placement.
FILING_STRUCTURES = {
    "10-K": {
        "PART I": ["ITEM 1", "ITEM 1A", "ITEM 1B", "ITEM 1C", "ITEM 2", "ITEM 3", "ITEM 4"],
        "PART II": ["ITEM 5", "ITEM 6", "ITEM 7", "ITEM 7A", "ITEM 8", "ITEM 9", "ITEM 9A", "ITEM 9B", "ITEM 9C"],
        "PART III": ["ITEM 10", "ITEM 11", "ITEM 12", "ITEM 13", "ITEM 14"],
        "PART IV": ["ITEM 15", "ITEM 16"]
    },
    "10-Q": {
        "PART I": ["ITEM 1", "ITEM 2", "ITEM 3", "ITEM 4"],
        "PART II": ["ITEM 1", "ITEM 1A", "ITEM 2", "ITEM 3", "ITEM 4", "ITEM 5", "ITEM 6"]
    },
    "20-F": {
        "PART I": [
            "ITEM 1", "ITEM 2", "ITEM 3", "ITEM 4", "ITEM 5", "ITEM 6",
            "ITEM 7", "ITEM 8", "ITEM 9", "ITEM 10", "ITEM 11", "ITEM 12", "ITEM 12D"
        ],
        "PART II": [
            "ITEM 13", "ITEM 14", "ITEM 15",
            # include all 16X variants explicitly so validation stays strict
            "ITEM 16", "ITEM 16A", "ITEM 16B", "ITEM 16C", "ITEM 16D", "ITEM 16E", "ITEM 16F", "ITEM 16G", "ITEM 16H",
            "ITEM 16I"
        ],
        "PART III": ["ITEM 17", "ITEM 18", "ITEM 19"]
    }
}
53
+
54
+
55
class SectionExtractor:
    """Splits SEC filing pages (markdown text) into PART/ITEM sections."""

    def __init__(self, pages: List[Dict], filing_type: Optional[Literal["10-K", "10-Q", "20-F"]] = None, debug: bool = False):
        """
        Args:
            pages: List of {"page": int, "content": str} dicts in page order.
            filing_type: When given, extracted sections are validated against
                the canonical layout in FILING_STRUCTURES.
            debug: Print DEBUG trace messages while extracting.
        """
        self.pages = pages
        self.filing_type = filing_type
        # Canonical PART -> [ITEM, ...] layout, or None to skip validation.
        self.structure = FILING_STRUCTURES.get(filing_type) if filing_type else None
        self.debug = debug

        # Checked by _is_toc but never set True in this class — looks like a
        # vestigial hook; TODO confirm intent.
        self._toc_locked = False
63
+
64
+ def _log(self, msg: str):
65
+ if self.debug:
66
+ print(msg)
67
+
68
+ @staticmethod
69
+ def _normalize_section_key(part: Optional[str], item_num: Optional[str]) -> tuple[Optional[str], Optional[str]]:
70
+ part_key = re.sub(r'\s+', ' ', part.upper().strip()) if part else None
71
+ item_key = f"ITEM {item_num.upper()}" if item_num else None
72
+ return part_key, item_key
73
+
74
+ @staticmethod
75
+ def _normalize_section(text: str) -> str:
76
+ return re.sub(r'\s+', ' ', text.upper().strip())
77
+
78
+ def _clean_lines(self, content: str) -> List[str]:
79
+ content = content.replace(NBSP, ' ').replace(NARROW_NBSP, ' ').replace(ZWSP, '')
80
+ lines = [ln.rstrip() for ln in content.split('\n')]
81
+ out = []
82
+ for ln in lines:
83
+ if HEADER_FOOTER_RE.match(ln) or PAGE_NUM_RE.match(ln):
84
+ continue
85
+ ln = MD_EDGE.sub('', ln)
86
+ out.append(ln)
87
+ return out
88
+
89
+ def _infer_part_for_item(self, filing_type: str, item_key: str) -> Optional[str]:
90
+ m = re.match(r'ITEM\s+(\d{1,2})', item_key)
91
+ if not m:
92
+ return None
93
+ num = int(m.group(1))
94
+ if filing_type == "10-K":
95
+ if 1 <= num <= 4:
96
+ return "PART I"
97
+ elif 5 <= num <= 9:
98
+ return "PART II"
99
+ elif 10 <= num <= 14:
100
+ return "PART III"
101
+ elif 15 <= num <= 16:
102
+ return "PART IV"
103
+ elif filing_type == "10-Q":
104
+ if 1 <= num <= 4:
105
+ return "PART I"
106
+ else:
107
+ return "PART II"
108
+ return None
109
+
110
+ @staticmethod
111
+ def _clean_item_title(title: str) -> str:
112
+ title = re.sub(r'^\s*[:.\-–—]\s*', '', title)
113
+ title = re.sub(r'\s+', ' ', title).strip()
114
+ return title
115
+
116
    def _is_toc(self, content: str, page_num: int = 1) -> bool:
        """Heuristically decide whether *content* is a table-of-contents page.

        Only pages 1-5 are ever considered; beyond that the answer is
        always False.
        """
        # Simple rule: within first 5 pages, if we see multiple matches, treat as TOC.
        # “Multiple” = ≥3 ITEM rows OR ≥3 dotted-leader lines.
        # NOTE(review): _toc_locked is initialized False in __init__ and never
        # set True anywhere in this class — confirm whether the guard is still needed.
        if self._toc_locked or page_num > 5:
            return False

        item_hits = len(ITEM_ROWS_RE.findall(content))
        leader_hits = len(DOT_LEAD_RE.findall(content))

        return (item_hits >= 3) or (leader_hits >= 3)
126
    def get_sections(self) -> List[Dict]:
        """Scan the filing pages and split them into PART/ITEM sections.

        Returns:
            A list of dicts with keys "part", "item", "item_title",
            "page_start", and "pages" (each page is {"page": int,
            "content": str}). TOC pages are skipped, tiny PART stubs are
            dropped, and when a filing_type was supplied the result is
            validated against FILING_STRUCTURES.
        """
        sections = []
        current_part = None
        current_item = None
        current_item_title = None
        current_pages: List[Dict] = []

        def flush_section():
            # Close out the section accumulated so far (no-op when empty).
            nonlocal sections, current_part, current_item, current_item_title, current_pages
            if current_pages:
                sections.append({
                    "part": current_part,
                    "item": current_item,
                    "item_title": current_item_title,
                    "page_start": current_pages[0]["page"],
                    "pages": current_pages
                })
                current_pages = []

        for page_dict in self.pages:
            page_num = page_dict["page"]
            content = page_dict["content"]

            # Skip table-of-contents pages near the front of the document.
            if self._is_toc(content, page_num):
                self._log(f"DEBUG: Page {page_num} detected as TOC, skipping")
                continue

            lines = self._clean_lines(content)
            joined = "\n".join(lines)

            if not joined.strip():
                self._log(f"DEBUG: Page {page_num} is empty after cleaning")
                continue

            # Locate the earliest PART or ITEM heading on this page (if any).
            part_m = None
            item_m = None
            first_idx = None
            first_kind = None

            for m in PART_PATTERN.finditer(joined):
                part_m = m
                first_idx = m.start()
                first_kind = 'part'
                self._log(f"DEBUG: Page {page_num} found PART at position {first_idx}: {m.group(1)}")
                break

            for m in ITEM_PATTERN.finditer(joined):
                # The first ITEM heading wins only if it precedes the PART heading.
                if first_idx is None or m.start() < first_idx:
                    item_m = m
                    first_idx = m.start()
                    first_kind = 'item'
                    self._log(f"DEBUG: Page {page_num} found ITEM at position {first_idx}: ITEM {m.group(2)}")
                break

            if first_kind is None:
                # No heading: the whole page continues the current section (if any).
                self._log(f"DEBUG: Page {page_num} - no header found. In section: {current_part or current_item}")
                if current_part or current_item:
                    if joined.strip():
                        current_pages.append({"page": page_num, "content": joined})
                continue

            before = joined[:first_idx].strip()
            after = joined[first_idx:].strip()

            # Text before the heading still belongs to the previous section.
            if (current_part or current_item) and before:
                current_pages.append({"page": page_num, "content": before})

            flush_section()

            if first_kind == 'part' and part_m:
                part_text = part_m.group(1)
                current_part, _ = self._normalize_section_key(part_text, None)
                current_item = None
                current_item_title = None
            elif first_kind == 'item' and item_m:
                item_num = item_m.group(2)
                title = (item_m.group(3) or "").strip()
                current_item_title = self._clean_item_title(title) if title else None
                if current_part is None and self.filing_type:
                    # ITEM seen before any PART heading: infer the PART from the number.
                    inferred = self._infer_part_for_item(self.filing_type, f"ITEM {item_num.upper()}")
                    if inferred:
                        current_part = inferred
                        self._log(f"DEBUG: Inferred {inferred} at detection time for ITEM {item_num}")
                _, current_item = self._normalize_section_key(current_part, item_num)

            if after:
                current_pages.append({"page": page_num, "content": after})

                if first_kind == 'part' and part_m:
                    # A PART heading is often immediately followed by its first ITEM
                    # on the same page; promote the section to that ITEM.
                    item_after = None
                    for m in ITEM_PATTERN.finditer(after):
                        item_after = m
                        break
                    if item_after:
                        start = item_after.start()
                        current_pages[-1]["content"] = after[start:]
                        item_num = item_after.group(2)
                        title = (item_after.group(3) or "").strip()
                        current_item_title = self._clean_item_title(title) if title else None
                        _, current_item = self._normalize_section_key(current_part, item_num)
                        self._log(f"DEBUG: Page {page_num} - promoted PART to ITEM {item_num} (intra-page)")
                        after = current_pages[-1]["content"]

                # Handle further PART/ITEM transitions lower on the same page.
                tail = after
                while True:
                    next_kind, next_idx, next_part_m, next_item_m = None, None, None, None

                    for m in PART_PATTERN.finditer(tail):
                        if m.start() > 0:
                            next_kind, next_idx, next_part_m = 'part', m.start(), m
                            break
                    for m in ITEM_PATTERN.finditer(tail):
                        if m.start() > 0 and (next_idx is None or m.start() < next_idx):
                            next_kind, next_idx, next_item_m = 'item', m.start(), m
                            break

                    if next_idx is None:
                        break

                    before_seg = tail[:next_idx].strip()
                    after_seg = tail[next_idx:].strip()

                    if before_seg:
                        current_pages[-1]["content"] = before_seg
                    flush_section()

                    if next_kind == 'part' and next_part_m:
                        current_part, _ = self._normalize_section_key(next_part_m.group(1), None)
                        current_item = None
                        current_item_title = None
                        self._log(f"DEBUG: Page {page_num} - intra-page PART transition to {current_part}")
                    elif next_kind == 'item' and next_item_m:
                        item_num = next_item_m.group(2)
                        title = (next_item_m.group(3) or "").strip()
                        current_item_title = self._clean_item_title(title) if title else None
                        if current_part is None and self.filing_type:
                            inferred = self._infer_part_for_item(self.filing_type, f"ITEM {item_num.upper()}")
                            if inferred:
                                current_part = inferred
                                self._log(f"DEBUG: Inferred {inferred} at detection time for ITEM {item_num}")
                        _, current_item = self._normalize_section_key(current_part, item_num)
                        self._log(f"DEBUG: Page {page_num} - intra-page ITEM transition to {current_item}")

                    current_pages.append({"page": page_num, "content": after_seg})
                    tail = after_seg

        flush_section()

        self._log(f"DEBUG: Total sections before validation: {len(sections)}")
        for s in sections:
            self._log(f" - Part: {s['part']}, Item: {s['item']}, Pages: {len(s['pages'])}, Start: {s['page_start']}")

        def _section_text_len(s):
            # Total stripped character count across a section's pages.
            return sum(len(p["content"].strip()) for p in s["pages"])

        # Drop PART headings that accumulated almost no body text (bare stubs).
        sections = [s for s in sections if s["item"] is not None or _section_text_len(s) > 80]
        self._log(f"DEBUG: Sections after dropping empty PART stubs: {len(sections)}")

        if self.structure and sections:
            # Validate against the canonical PART/ITEM layout for this filing type.
            self._log(f"DEBUG: Validating against structure: {self.filing_type}")
            fixed = []
            for s in sections:
                part = s["part"]
                item = s["item"]

                if part is None and item and self.filing_type:
                    inferred = self._infer_part_for_item(self.filing_type, item)
                    if inferred:
                        self._log(f"DEBUG: Inferred {inferred} from {item}")
                        s = {**s, "part": inferred}
                        part = inferred

                if (part in self.structure) and (item is None or item in self.structure.get(part, [])):
                    fixed.append(s)
                else:
                    self._log(f"DEBUG: Dropped section - Part: {part}, Item: {item}")

            sections = fixed
            self._log(f"DEBUG: Sections after validation: {len(sections)}")

        return sections
306
+
307
+ def get_section(self, part: str, item: Optional[str] = None) -> Optional[Dict]:
308
+ part_normalized = self._normalize_section(part)
309
+ item_normalized = self._normalize_section(item) if item else None
310
+ sections = self.get_sections()
311
+
312
+ for section in sections:
313
+ if section["part"] == part_normalized:
314
+ if item_normalized is None or section["item"] == item_normalized:
315
+ return section
316
+ return None
sec2md/sections.py ADDED
@@ -0,0 +1,104 @@
1
+ """Section extraction utilities for SEC filings."""
2
+
3
+ from typing import List, Optional, Union
4
+ from sec2md.models import Page, Section, FilingType, Item10K, Item10Q, ITEM_10K_MAPPING, ITEM_10Q_MAPPING
5
+ from sec2md.section_extractor import SectionExtractor
6
+
7
+
8
def extract_sections(
    pages: List[Page],
    filing_type: FilingType,
    debug: bool = False
) -> List[Section]:
    """
    Extract sections from filing pages.

    Args:
        pages: List of Page objects from convert_to_markdown(return_pages=True)
        filing_type: Type of filing ("10-K" or "10-Q")
        debug: Enable debug logging

    Returns:
        List of Section objects, each containing pages for that section

    Example:
        >>> pages = sec2md.convert_to_markdown(html, return_pages=True)
        >>> sections = sec2md.extract_sections(pages, filing_type="10-K")
        >>> for section in sections:
        ...     print(f"{section.item}: {section.item_title}")
    """
    # SectionExtractor works on plain dicts, so convert the Page objects first.
    extractor = SectionExtractor(
        pages=[{"page": p.number, "content": p.content} for p in pages],
        filing_type=filing_type,
        debug=debug,
    )

    # Re-wrap the extractor's dict output into Section/Page model objects.
    return [
        Section(
            part=data["part"],
            item=data["item"],
            item_title=data["item_title"],
            pages=[
                Page(number=pg["page"], content=pg["content"])
                for pg in data["pages"]
            ],
        )
        for data in extractor.get_sections()
    ]
58
+
59
+
60
def get_section(
    sections: List[Section],
    item: Union[Item10K, Item10Q, str],
    filing_type: FilingType = "10-K"
) -> Optional[Section]:
    """
    Get a specific section by item enum or string.

    Args:
        sections: List of sections from extract_sections()
        item: Item enum (Item10K.RISK_FACTORS) or string ("ITEM 1A" or "1A")
        filing_type: Type of filing ("10-K" or "10-Q")

    Returns:
        Section object if found, None otherwise

    Raises:
        ValueError: If the enum type does not match filing_type.

    Example:
        >>> sections = sec2md.extract_sections(pages, filing_type="10-K")
        >>> risk = sec2md.get_section(sections, Item10K.RISK_FACTORS)
        >>> print(risk.markdown())
    """
    # Map enum to (part, item) tuple
    if isinstance(item, Item10K):
        if filing_type != "10-K":
            raise ValueError(f"Item10K enum requires filing_type='10-K', got '{filing_type}'")
        target_part, target_item = ITEM_10K_MAPPING[item]
    elif isinstance(item, Item10Q):
        if filing_type != "10-Q":
            raise ValueError(f"Item10Q enum requires filing_type='10-Q', got '{filing_type}'")
        target_part, target_item = ITEM_10Q_MAPPING[item]
    else:
        # String format - uppercase and collapse internal whitespace so inputs
        # like "item  1a" match the extractor's normalized "ITEM 1A" keys
        # (the previous .strip()-only form missed doubled internal spaces).
        item_str = " ".join(str(item).upper().split())
        if not item_str.startswith("ITEM"):
            item_str = f"ITEM {item_str}"
        target_item = item_str
        target_part = None  # Match any part

    # Find matching section
    for section in sections:
        if section.item == target_item:
            if target_part is None or section.part == target_part:
                return section

    return None