sec2md 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sec2md might be problematic; consult the registry's advisory page for more details.

@@ -0,0 +1,623 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import List, Dict, Optional, Literal, Union, Any
5
+
6
# Optional markdown bold/underline marker and/or inline HTML tags that may
# precede a PART/ITEM heading in converted filing text.
LEAD_WRAP = r'(?:\*\*|__)?\s*(?:</?[^>]+>\s*)*'

# "PART I" .. "PART IV" heading at the start of a line (Roman numerals).
PART_PATTERN = re.compile(
    rf'^\s*{LEAD_WRAP}(PART\s+[IVXLC]+)\b(?:\s*$|\s+)',
    re.IGNORECASE | re.MULTILINE
)
# "ITEM 7A." style heading: captures the literal "ITEM", the item number
# (1-2 digits plus optional letter suffix), and any inline title that follows
# an optional punctuation separator.
ITEM_PATTERN = re.compile(
    rf'^\s*{LEAD_WRAP}(ITEM)\s+(\d{{1,2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)',
    re.IGNORECASE | re.MULTILINE
)

# Running header/footer line like "Apple Inc. | 2023 Form 10-K | 12".
HEADER_FOOTER_RE = re.compile(
    r'^\s*(?:[A-Z][A-Za-z0-9 .,&\-]+)?\s*\|\s*\d{4}\s+Form\s+10-[KQ]\s*\|\s*\d+\s*$'
)
# "Page 3 of 10" lines, or a line that is only a bare page number.
PAGE_NUM_RE = re.compile(r'^\s*Page\s+\d+\s*(?:of\s+\d+)?\s*$|^\s*\d+\s*$', re.IGNORECASE)
# Leading/trailing markdown bold (**) or underline (__) wrappers on a line.
MD_EDGE = re.compile(r'^\s*(?:\*\*|__)\s*|\s*(?:\*\*|__)\s*$')

# Unicode whitespace variants normalized away before any matching.
NBSP, NARROW_NBSP, ZWSP = '\u00A0', '\u202F', '\u200B'

DOT_LEAD_RE = re.compile(r'^.*\.{3,}\s*\d{1,4}\s*$', re.M)  # "... 123" TOC dotted-leader rows
ITEM_ROWS_RE = re.compile(r'^\s*ITEM\s+\d{1,2}[A-Z]?\.?\b', re.I | re.M)

# Canonical PART -> [ITEM ...] layout per filing type; used to validate and
# repair detected sections (sections outside this structure are dropped).
FILING_STRUCTURES = {
    "10-K": {
        "PART I": ["ITEM 1", "ITEM 1A", "ITEM 1B", "ITEM 1C", "ITEM 2", "ITEM 3", "ITEM 4"],
        "PART II": ["ITEM 5", "ITEM 6", "ITEM 7", "ITEM 7A", "ITEM 8", "ITEM 9", "ITEM 9A", "ITEM 9B", "ITEM 9C"],
        "PART III": ["ITEM 10", "ITEM 11", "ITEM 12", "ITEM 13", "ITEM 14"],
        "PART IV": ["ITEM 15", "ITEM 16"]
    },
    "10-Q": {
        "PART I": ["ITEM 1", "ITEM 2", "ITEM 3", "ITEM 4"],
        "PART II": ["ITEM 1", "ITEM 1A", "ITEM 2", "ITEM 3", "ITEM 4", "ITEM 5", "ITEM 6"]
    },
    "20-F": {
        "PART I": [
            "ITEM 1", "ITEM 2", "ITEM 3", "ITEM 4", "ITEM 5", "ITEM 6",
            "ITEM 7", "ITEM 8", "ITEM 9", "ITEM 10", "ITEM 11", "ITEM 12", "ITEM 12D"
        ],
        "PART II": [
            "ITEM 13", "ITEM 14", "ITEM 15",
            # include all 16X variants explicitly so validation stays strict
            "ITEM 16", "ITEM 16A", "ITEM 16B", "ITEM 16C", "ITEM 16D", "ITEM 16E", "ITEM 16F", "ITEM 16G", "ITEM 16H",
            "ITEM 16I"
        ],
        "PART III": ["ITEM 17", "ITEM 18", "ITEM 19"]
    }
}
53
+
54
+
55
class SectionExtractor:
    """Extracts PART/ITEM sections from SEC filing pages (10-K, 10-Q, 20-F, 8-K)."""

    def __init__(self, pages: List[Any], filing_type: Optional[Literal["10-K", "10-Q", "20-F", "8-K"]] = None,
                 desired_items: Optional[set] = None, debug: bool = False):
        """Initialize SectionExtractor.

        Args:
            pages: List of Page objects
            filing_type: Type of filing ("10-K", "10-Q", "20-F", or "8-K")
            desired_items: For 8-K only: set of item numbers to extract (e.g., {"2.02", "9.01"})
            debug: Enable debug logging
        """
        # NOTE: the previous version imported sec2md.models.Page here but never
        # used it; only .number/.content attribute access is needed.

        # Store original Page objects to preserve elements
        self._original_pages = {p.number: p for p in pages}

        # Convert to dict format for internal processing
        self.pages = [{"page": p.number, "content": p.content} for p in pages]
        self.filing_type = filing_type
        # Expected PART -> [ITEM ...] layout for this filing type (None when the
        # filing type is unknown or has no fixed structure, e.g. 8-K).
        self.structure = FILING_STRUCTURES.get(filing_type) if filing_type else None
        self.desired_items = desired_items
        self.debug = debug

        self._toc_locked = False
79
+
80
+ def _log(self, msg: str):
81
+ if self.debug:
82
+ print(msg)
83
+
84
+ @staticmethod
85
+ def _normalize_section_key(part: Optional[str], item_num: Optional[str]) -> tuple[Optional[str], Optional[str]]:
86
+ part_key = re.sub(r'\s+', ' ', part.upper().strip()) if part else None
87
+ item_key = f"ITEM {item_num.upper()}" if item_num else None
88
+ return part_key, item_key
89
+
90
+ @staticmethod
91
+ def _normalize_section(text: str) -> str:
92
+ return re.sub(r'\s+', ' ', text.upper().strip())
93
+
94
+ def _clean_lines(self, content: str) -> List[str]:
95
+ content = content.replace(NBSP, ' ').replace(NARROW_NBSP, ' ').replace(ZWSP, '')
96
+ lines = [ln.rstrip() for ln in content.split('\n')]
97
+ out = []
98
+ for ln in lines:
99
+ if HEADER_FOOTER_RE.match(ln) or PAGE_NUM_RE.match(ln):
100
+ continue
101
+ ln = MD_EDGE.sub('', ln)
102
+ out.append(ln)
103
+ return out
104
+
105
+ def _infer_part_for_item(self, filing_type: str, item_key: str) -> Optional[str]:
106
+ m = re.match(r'ITEM\s+(\d{1,2})', item_key)
107
+ if not m:
108
+ return None
109
+ num = int(m.group(1))
110
+ if filing_type == "10-K":
111
+ if 1 <= num <= 4:
112
+ return "PART I"
113
+ elif 5 <= num <= 9:
114
+ return "PART II"
115
+ elif 10 <= num <= 14:
116
+ return "PART III"
117
+ elif 15 <= num <= 16:
118
+ return "PART IV"
119
+ elif filing_type == "10-Q":
120
+ if 1 <= num <= 4:
121
+ return "PART I"
122
+ else:
123
+ return "PART II"
124
+ return None
125
+
126
+ @staticmethod
127
+ def _clean_item_title(title: str) -> str:
128
+ title = re.sub(r'^\s*[:.\-–—]\s*', '', title)
129
+ title = re.sub(r'\s+', ' ', title).strip()
130
+ return title
131
+
132
    def _is_toc(self, content: str, page_num: int = 1) -> bool:
        """Heuristically decide whether a page is a table of contents.

        Simple rule: within first 5 pages, if we see multiple matches, treat as TOC.
        "Multiple" = >=3 ITEM rows OR >=3 dotted-leader lines.
        """
        # NOTE(review): _toc_locked is only ever assigned False in __init__ and
        # nothing in this class sets it True, so this guard appears inert —
        # confirm whether a subclass/caller is expected to set it.
        if self._toc_locked or page_num > 5:
            return False

        item_hits = len(ITEM_ROWS_RE.findall(content))
        leader_hits = len(DOT_LEAD_RE.findall(content))

        return (item_hits >= 3) or (leader_hits >= 3)
142
+
143
    # ========== 8-K Specific Methods ==========

    # 8-K item header regex: ITEM 1.01 / 7.01 / 9.01 (captures the literal
    # "ITEM", the dotted code, and any inline title after the separator).
    _ITEM_8K_RE = re.compile(
        rf'^\s*{LEAD_WRAP}(ITEM)\s+([1-9]\.\d{{2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)',
        re.IGNORECASE | re.MULTILINE
    )

    # 8-K hard stops (SIGNATURES, EXHIBIT INDEX) — body slicing never crosses these.
    _HARD_STOP_8K_RE = re.compile(r'^\s*(SIGNATURES|EXHIBIT\s+INDEX)\b', re.IGNORECASE | re.MULTILINE)

    # Promote inline "Item x.xx" to its own line so _ITEM_8K_RE can anchor on it.
    _PROMOTE_ITEM_8K_RE = re.compile(r'(?<!\n)(\s)(ITEM\s+[1-9]\.\d{2}[A-Z]?\s*[.:–—-])', re.IGNORECASE)

    # Exhibits table parsing: markdown pipe rows, space-aligned two-column
    # rows, and minimal <tr><td>..</td><td>..</td></tr> HTML rows.
    _PIPE_ROW_RE = re.compile(r'^\s*\|?\s*([0-9]{1,4}(?:\.[0-9A-Za-z]+)?)\s*\|\s*(.+?)\s*\|?\s*$', re.MULTILINE)
    _SPACE_ROW_RE = re.compile(r'^\s*([0-9]{1,4}(?:\.[0-9A-Za-z]+)?)\s{2,}(.+?)\s*$', re.MULTILINE)
    _HTML_ROW_RE = re.compile(
        r'<tr[^>]*>\s*<t[dh][^>]*>\s*([^<]+?)\s*</t[dh]>\s*<t[dh][^>]*>\s*([^<]+?)\s*</t[dh]>\s*</tr>',
        re.IGNORECASE | re.DOTALL
    )
164
+
165
+ @staticmethod
166
+ def _normalize_8k_item_code(code: str) -> str:
167
+ """Normalize '5.2' -> '5.02', keep suffix 'A' if present."""
168
+ code = code.upper().strip()
169
+ m = re.match(r'^([1-9])\.(\d{1,2})([A-Z]?)$', code)
170
+ if not m:
171
+ return code
172
+ major, minor, suffix = m.groups()
173
+ minor = f"{int(minor):02d}"
174
+ return f"{major}.{minor}{suffix}"
175
+
176
    def _clean_8k_text(self, text: str) -> str:
        """Clean 8-K text: remove headers/footers, normalize whitespace, promote inline items."""
        # Normalize unicode whitespace variants first so regexes see plain spaces.
        text = text.replace(NBSP, " ").replace(NARROW_NBSP, " ").replace(ZWSP, "")

        # Promote inline item headings to their own line
        text = self._PROMOTE_ITEM_8K_RE.sub(r'\n\2', text)

        # Remove Form 8-K headers/footers
        # NOTE(review): compiled on every call — could be hoisted to a class
        # attribute like the other 8-K patterns.
        header_footer_8k = re.compile(
            r'^\s*(Form\s+8\-K|Page\s+\d+(?:\s+of\s+\d+)?|UNITED\s+STATES\s+SECURITIES\s+AND\s+EXCHANGE\s+COMMISSION)\b',
            re.IGNORECASE
        )

        lines: List[str] = []
        for ln in text.splitlines():
            t = ln.strip()
            if header_footer_8k.match(t):
                continue
            t = MD_EDGE.sub("", t)  # strip leading/trailing **/__ wrappers
            # Drop trivial table header separators like | --- | --- |
            if re.fullmatch(r'\|\s*-{3,}\s*\|\s*-{3,}\s*\|?', t):
                continue
            lines.append(t)

        # Collapse multiple blank lines into one
        out: List[str] = []
        prev_blank = False
        for ln in lines:
            blank = (ln == "")
            if blank and prev_blank:
                continue
            out.append(ln)
            prev_blank = blank

        return "\n".join(out).strip()
211
+
212
+ def _parse_exhibits(self, block: str) -> List[Any]:
213
+ """Parse exhibit table from 9.01 section."""
214
+ from sec2md.models import Exhibit
215
+
216
+ rows: List[Exhibit] = []
217
+
218
+ # Try pipe table rows first
219
+ for m in self._PIPE_ROW_RE.finditer(block):
220
+ left, right = m.group(1).strip(), m.group(2).strip()
221
+ if not re.match(r'^\d', left):
222
+ continue # skip headers like "EXHIBIT NO."
223
+ if left.startswith('---') or right.startswith('---'):
224
+ continue # skip separators
225
+ rows.append(Exhibit(exhibit_no=left, description=right))
226
+ if rows:
227
+ return rows
228
+
229
+ # Fallback: space-aligned two columns
230
+ for m in self._SPACE_ROW_RE.finditer(block):
231
+ left, right = m.group(1).strip(), m.group(2).strip()
232
+ if not re.match(r'^\d', left):
233
+ continue
234
+ rows.append(Exhibit(exhibit_no=left, description=right))
235
+ if rows:
236
+ return rows
237
+
238
+ # Fallback: basic HTML table
239
+ for m in self._HTML_ROW_RE.finditer(block):
240
+ left, right = m.group(1).strip(), m.group(2).strip()
241
+ if not re.match(r'^\d', left):
242
+ continue
243
+ rows.append(Exhibit(exhibit_no=left, description=right))
244
+
245
+ return rows
246
+
247
+ def _slice_8k_body(self, doc: str, start_after: int, next_item_start: int) -> str:
248
+ """Slice body text from start_after up to earliest hard stop or next_item_start."""
249
+ mstop = self._HARD_STOP_8K_RE.search(doc, pos=start_after, endpos=next_item_start)
250
+ end = mstop.start() if mstop else next_item_start
251
+ return doc[start_after:end].strip()
252
+
253
    def _get_8k_sections(self) -> List[Any]:
        """Extract 8-K sections (items only, no PART divisions).

        Concatenates all pages, cleans the combined text, finds ITEM x.xx
        headers, slices each item's body up to the next header (or a hard stop
        such as SIGNATURES), and maps the body back to Page objects. For item
        9.01 the exhibit table is parsed as well.
        """
        from sec2md.models import Section, Page, ITEM_8K_TITLES

        # Concatenate all pages into one doc
        full_content = "\n\n".join(p["content"] for p in self.pages)
        doc = self._clean_8k_text(full_content)

        if not doc:
            self._log("DEBUG: No content after cleaning")
            return []

        # Find all item headers
        headers: List[Dict] = []
        for m in self._ITEM_8K_RE.finditer(doc):
            code = self._normalize_8k_item_code(m.group(2))
            title_inline = (m.group(3) or "").strip()
            # Clean markdown artifacts from title
            title_inline = MD_EDGE.sub("", title_inline)
            # Fall back to the canonical 8-K item title when no inline title.
            title = title_inline if title_inline else ITEM_8K_TITLES.get(code)
            headers.append({"start": m.start(), "end": m.end(), "no": code, "title": title})
            self._log(f"DEBUG: Found ITEM {code} at position {m.start()}")

        if not headers:
            self._log("DEBUG: No item headers found")
            return []

        self._log(f"DEBUG: Total headers found: {len(headers)}")

        # Extract sections
        results: List[Section] = []
        for i, h in enumerate(headers):
            code = h["no"]
            next_start = headers[i + 1]["start"] if i + 1 < len(headers) else len(doc)
            body = self._slice_8k_body(doc, h["end"], next_start)

            # Filter by desired_items if provided
            if self.desired_items and code not in self.desired_items:
                self._log(f"DEBUG: Skipping ITEM {code} (not in desired_items)")
                continue

            # For 9.01, parse exhibits
            exhibits = []
            if code.startswith("9.01"):
                # Skip past the "(d) Exhibits" sub-heading when present.
                md = re.search(r'^\s*\(?d\)?\s*Exhibits\b.*$', body, re.IGNORECASE | re.MULTILINE)
                ex_block = body[md.end():].strip() if md else body
                exhibits = self._parse_exhibits(ex_block)
                self._log(f"DEBUG: Found {len(exhibits)} exhibits in 9.01")

            # Map back to Page objects (approximate page boundaries from original content)
            # Since 8-K sections can span pages, we need to find which pages contain this content
            section_pages = self._map_8k_content_to_pages(body)

            # Create Section with exhibits (now part of the model)
            section = Section(
                part=None,  # 8-K has no PART divisions
                item=f"ITEM {code}",
                item_title=h["title"],
                pages=section_pages,
                exhibits=exhibits if exhibits else None
            )

            results.append(section)
            self._log(f"DEBUG: Extracted ITEM {code} with {len(section_pages)} pages")

        self._log(f"DEBUG: Total sections extracted: {len(results)}")
        return results
320
+
321
    def _map_8k_content_to_pages(self, section_content: str) -> List[Any]:
        """Map extracted section content back to Page objects.

        Heuristic: a page is considered part of the section when the section's
        first 500 characters appear in the page, or the whole cleaned page
        appears inside the section content.
        """
        from sec2md.models import Page

        # Try to find which original pages contain this content
        # This is heuristic-based: match by content overlap
        # NOTE(review): a page that contains only a *partial* tail of the
        # section (section spanning a page boundary) may not satisfy either
        # substring test and would be missed — confirm against real filings.
        matched_pages = []
        section_preview = section_content[:500]  # Use first 500 chars for matching

        for page_dict in self.pages:
            page_num = page_dict["page"]
            page_content = self._clean_8k_text(page_dict["content"])

            # Check if this page contains part of the section
            if section_preview in page_content or page_content in section_content:
                original_page = self._original_pages.get(page_num)
                matched_pages.append(
                    Page(
                        number=page_num,
                        content=page_content,
                        elements=original_page.elements if original_page else None,
                        text_blocks=original_page.text_blocks if original_page else None
                    )
                )

        # If no matches found (shouldn't happen), create a synthetic page
        # NOTE(review): synthetic fallback hardcodes page number 1 even when
        # the real pages start elsewhere.
        if not matched_pages:
            matched_pages.append(
                Page(
                    number=1,
                    content=section_content,
                    elements=None,
                    text_blocks=None
                )
            )

        return matched_pages
358
+
359
+ # ========== End 8-K Methods ==========
360
+
361
+ def get_sections(self) -> List[Any]:
362
+ """Get sections from the filing.
363
+
364
+ Routes to appropriate handler based on filing_type:
365
+ - 8-K: Uses _get_8k_sections() (flat item structure)
366
+ - 10-K/10-Q/20-F: Uses _get_standard_sections() (PART + ITEM structure)
367
+ """
368
+ if self.filing_type == "8-K":
369
+ return self._get_8k_sections()
370
+ else:
371
+ return self._get_standard_sections()
372
+
373
    def _get_standard_sections(self) -> List[Any]:
        """Extract 10-K/10-Q/20-F sections (PART + ITEM structure).

        Walks pages in order, detecting PART/ITEM headings (including multiple
        headings within one page), accumulates page content per section,
        drops empty PART stubs, validates against FILING_STRUCTURES, and
        finally converts the accumulated dicts into Section/Page objects.
        """
        sections = []
        current_part = None
        current_item = None
        current_item_title = None
        current_pages: List[Dict] = []

        def flush_section():
            # Close out the currently accumulating section, if any.
            nonlocal sections, current_part, current_item, current_item_title, current_pages
            if current_pages:
                sections.append({
                    "part": current_part,
                    "item": current_item,
                    "item_title": current_item_title,
                    "page_start": current_pages[0]["page"],
                    "pages": current_pages
                })
                current_pages = []

        for page_dict in self.pages:
            page_num = page_dict["page"]
            content = page_dict["content"]

            if self._is_toc(content, page_num):
                self._log(f"DEBUG: Page {page_num} detected as TOC, skipping")
                continue

            lines = self._clean_lines(content)
            joined = "\n".join(lines)

            if not joined.strip():
                self._log(f"DEBUG: Page {page_num} is empty after cleaning")
                continue

            # Locate the earliest PART or ITEM heading on the page.
            part_m = None
            item_m = None
            first_idx = None
            first_kind = None

            for m in PART_PATTERN.finditer(joined):
                part_m = m
                first_idx = m.start()
                first_kind = 'part'
                self._log(f"DEBUG: Page {page_num} found PART at position {first_idx}: {m.group(1)}")
                break

            for m in ITEM_PATTERN.finditer(joined):
                # An ITEM earlier on the page takes precedence over the PART.
                if first_idx is None or m.start() < first_idx:
                    item_m = m
                    first_idx = m.start()
                    first_kind = 'item'
                    self._log(f"DEBUG: Page {page_num} found ITEM at position {first_idx}: ITEM {m.group(2)}")
                break

            if first_kind is None:
                # No heading: page is a continuation of the current section.
                self._log(f"DEBUG: Page {page_num} - no header found. In section: {current_part or current_item}")
                if current_part or current_item:
                    if joined.strip():
                        current_pages.append({"page": page_num, "content": joined})
                continue

            before = joined[:first_idx].strip()
            after = joined[first_idx:].strip()

            # Text before the heading belongs to the previous section.
            if (current_part or current_item) and before:
                current_pages.append({"page": page_num, "content": before})

            flush_section()

            if first_kind == 'part' and part_m:
                part_text = part_m.group(1)
                current_part, _ = self._normalize_section_key(part_text, None)
                current_item = None
                current_item_title = None
            elif first_kind == 'item' and item_m:
                item_num = item_m.group(2)
                title = (item_m.group(3) or "").strip()
                current_item_title = self._clean_item_title(title) if title else None
                if current_part is None and self.filing_type:
                    inferred = self._infer_part_for_item(self.filing_type, f"ITEM {item_num.upper()}")
                    if inferred:
                        current_part = inferred
                        self._log(f"DEBUG: Inferred {inferred} at detection time for ITEM {item_num}")
                _, current_item = self._normalize_section_key(current_part, item_num)

            if after:
                current_pages.append({"page": page_num, "content": after})

                # A PART heading immediately followed by its first ITEM on the
                # same page: promote the section to that ITEM.
                if first_kind == 'part' and part_m:
                    item_after = None
                    for m in ITEM_PATTERN.finditer(after):
                        item_after = m
                        break
                    if item_after:
                        start = item_after.start()
                        current_pages[-1]["content"] = after[start:]
                        item_num = item_after.group(2)
                        title = (item_after.group(3) or "").strip()
                        current_item_title = self._clean_item_title(title) if title else None
                        _, current_item = self._normalize_section_key(current_part, item_num)
                        self._log(f"DEBUG: Page {page_num} - promoted PART to ITEM {item_num} (intra-page)")
                        after = current_pages[-1]["content"]

                # Handle any further headings within the same page.
                tail = after
                while True:
                    next_kind, next_idx, next_part_m, next_item_m = None, None, None, None

                    for m in PART_PATTERN.finditer(tail):
                        if m.start() > 0:
                            next_kind, next_idx, next_part_m = 'part', m.start(), m
                            break
                    for m in ITEM_PATTERN.finditer(tail):
                        if m.start() > 0 and (next_idx is None or m.start() < next_idx):
                            next_kind, next_idx, next_item_m = 'item', m.start(), m

                    if next_idx is None:
                        break

                    before_seg = tail[:next_idx].strip()
                    after_seg = tail[next_idx:].strip()

                    if before_seg:
                        current_pages[-1]["content"] = before_seg
                    flush_section()

                    if next_kind == 'part' and next_part_m:
                        current_part, _ = self._normalize_section_key(next_part_m.group(1), None)
                        current_item = None
                        current_item_title = None
                        self._log(f"DEBUG: Page {page_num} - intra-page PART transition to {current_part}")
                    elif next_kind == 'item' and next_item_m:
                        item_num = next_item_m.group(2)
                        title = (next_item_m.group(3) or "").strip()
                        current_item_title = self._clean_item_title(title) if title else None
                        if current_part is None and self.filing_type:
                            inferred = self._infer_part_for_item(self.filing_type, f"ITEM {item_num.upper()}")
                            if inferred:
                                current_part = inferred
                                self._log(f"DEBUG: Inferred {inferred} at detection time for ITEM {item_num}")
                        _, current_item = self._normalize_section_key(current_part, item_num)
                        self._log(f"DEBUG: Page {page_num} - intra-page ITEM transition to {current_item}")

                    current_pages.append({"page": page_num, "content": after_seg})
                    tail = after_seg

        flush_section()

        self._log(f"DEBUG: Total sections before validation: {len(sections)}")
        for s in sections:
            self._log(f" - Part: {s['part']}, Item: {s['item']}, Pages: {len(s['pages'])}, Start: {s['page_start']}")

        def _section_text_len(s):
            # Total stripped text length across a section's pages.
            return sum(len(p["content"].strip()) for p in s["pages"])

        # Drop PART headings that accumulated no real content (<= 80 chars).
        sections = [s for s in sections if s["item"] is not None or _section_text_len(s) > 80]
        self._log(f"DEBUG: Sections after dropping empty PART stubs: {len(sections)}")

        if self.structure and sections:
            self._log(f"DEBUG: Validating against structure: {self.filing_type}")
            fixed = []
            for s in sections:
                part = s["part"]
                item = s["item"]

                # Late inference: repair a missing PART from the item number.
                if part is None and item and self.filing_type:
                    inferred = self._infer_part_for_item(self.filing_type, item)
                    if inferred:
                        self._log(f"DEBUG: Inferred {inferred} from {item}")
                        s = {**s, "part": inferred}
                        part = inferred

                if (part in self.structure) and (item is None or item in self.structure.get(part, [])):
                    fixed.append(s)
                else:
                    self._log(f"DEBUG: Dropped section - Part: {part}, Item: {item}")

            sections = fixed
            self._log(f"DEBUG: Sections after validation: {len(sections)}")

        # Convert to Section objects with Page objects (preserving elements)
        from sec2md.models import Section, Page

        section_objects = []
        for section_data in sections:
            # Build Page objects for this section, preserving elements from originals
            section_pages = []
            for page_dict in section_data["pages"]:
                page_num = page_dict["page"]
                original_page = self._original_pages.get(page_num)

                # Filter text_blocks to only include ones relevant to this section's content
                filtered_text_blocks = None
                if original_page and original_page.text_blocks:
                    section_content = page_dict["content"]
                    filtered_text_blocks = []
                    for tb in original_page.text_blocks:
                        # Include TextBlock if:
                        # 1. Its title appears in section content, OR
                        # 2. Any of its element content appears in section (for short titles)
                        title_match = tb.title and tb.title in section_content
                        content_match = any(
                            # Check if element content (or significant portion) is in section
                            elem.content[:200] in section_content or section_content in elem.content
                            for elem in tb.elements
                        )
                        if title_match or content_match:
                            filtered_text_blocks.append(tb)
                    filtered_text_blocks = filtered_text_blocks if filtered_text_blocks else None

                section_pages.append(
                    Page(
                        number=page_num,
                        content=page_dict["content"],
                        elements=original_page.elements if original_page else None,
                        text_blocks=filtered_text_blocks
                    )
                )

            section_objects.append(
                Section(
                    part=section_data["part"],
                    item=section_data["item"],
                    item_title=section_data["item_title"],
                    pages=section_pages
                )
            )

        return section_objects
602
+
603
+ def get_section(self, part: str, item: Optional[str] = None):
604
+ """Get a specific section by part and item.
605
+
606
+ Args:
607
+ part: Part name (e.g., "PART I")
608
+ item: Optional item name (e.g., "ITEM 1A")
609
+
610
+ Returns:
611
+ Section object if found, None otherwise
612
+ """
613
+ from sec2md.models import Section
614
+
615
+ part_normalized = self._normalize_section(part)
616
+ item_normalized = self._normalize_section(item) if item else None
617
+ sections = self.get_sections()
618
+
619
+ for section in sections:
620
+ if section.part == part_normalized:
621
+ if item_normalized is None or section.item == item_normalized:
622
+ return section
623
+ return None