sec2md-0.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sec2md might be problematic.
- sec2md/__init__.py +36 -0
- sec2md/absolute_table_parser.py +622 -0
- sec2md/chunker/__init__.py +0 -0
- sec2md/chunker/markdown_blocks.py +135 -0
- sec2md/chunker/markdown_chunk.py +133 -0
- sec2md/chunker/markdown_chunker.py +270 -0
- sec2md/chunking.py +179 -0
- sec2md/core.py +93 -0
- sec2md/models.py +400 -0
- sec2md/parser.py +1217 -0
- sec2md/section_extractor.py +623 -0
- sec2md/sections.py +84 -0
- sec2md/table_parser.py +386 -0
- sec2md/utils.py +109 -0
- sec2md-0.1.5.dist-info/METADATA +216 -0
- sec2md-0.1.5.dist-info/RECORD +19 -0
- sec2md-0.1.5.dist-info/WHEEL +5 -0
- sec2md-0.1.5.dist-info/licenses/LICENSE +21 -0
- sec2md-0.1.5.dist-info/top_level.txt +1 -0
sec2md/section_extractor.py
@@ -0,0 +1,623 @@
from __future__ import annotations

import re
from typing import List, Dict, Optional, Literal, Union, Any

LEAD_WRAP = r'(?:\*\*|__)?\s*(?:</?[^>]+>\s*)*'

PART_PATTERN = re.compile(
    rf'^\s*{LEAD_WRAP}(PART\s+[IVXLC]+)\b(?:\s*$|\s+)',
    re.IGNORECASE | re.MULTILINE
)
ITEM_PATTERN = re.compile(
    rf'^\s*{LEAD_WRAP}(ITEM)\s+(\d{{1,2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)',
    re.IGNORECASE | re.MULTILINE
)

HEADER_FOOTER_RE = re.compile(
    r'^\s*(?:[A-Z][A-Za-z0-9 .,&\-]+)?\s*\|\s*\d{4}\s+Form\s+10-[KQ]\s*\|\s*\d+\s*$'
)
PAGE_NUM_RE = re.compile(r'^\s*Page\s+\d+\s*(?:of\s+\d+)?\s*$|^\s*\d+\s*$', re.IGNORECASE)
MD_EDGE = re.compile(r'^\s*(?:\*\*|__)\s*|\s*(?:\*\*|__)\s*$')

NBSP, NARROW_NBSP, ZWSP = '\u00A0', '\u202F', '\u200B'

DOT_LEAD_RE = re.compile(r'^.*\.{3,}\s*\d{1,4}\s*$', re.M)  # "... 123"
ITEM_ROWS_RE = re.compile(r'^\s*ITEM\s+\d{1,2}[A-Z]?\.?\b', re.I | re.M)

FILING_STRUCTURES = {
    "10-K": {
        "PART I": ["ITEM 1", "ITEM 1A", "ITEM 1B", "ITEM 1C", "ITEM 2", "ITEM 3", "ITEM 4"],
        "PART II": ["ITEM 5", "ITEM 6", "ITEM 7", "ITEM 7A", "ITEM 8", "ITEM 9", "ITEM 9A", "ITEM 9B", "ITEM 9C"],
        "PART III": ["ITEM 10", "ITEM 11", "ITEM 12", "ITEM 13", "ITEM 14"],
        "PART IV": ["ITEM 15", "ITEM 16"]
    },
    "10-Q": {
        "PART I": ["ITEM 1", "ITEM 2", "ITEM 3", "ITEM 4"],
        "PART II": ["ITEM 1", "ITEM 1A", "ITEM 2", "ITEM 3", "ITEM 4", "ITEM 5", "ITEM 6"]
    },
    "20-F": {
        "PART I": [
            "ITEM 1", "ITEM 2", "ITEM 3", "ITEM 4", "ITEM 5", "ITEM 6",
            "ITEM 7", "ITEM 8", "ITEM 9", "ITEM 10", "ITEM 11", "ITEM 12", "ITEM 12D"
        ],
        "PART II": [
            "ITEM 13", "ITEM 14", "ITEM 15",
            # include all 16X variants explicitly so validation stays strict
            "ITEM 16", "ITEM 16A", "ITEM 16B", "ITEM 16C", "ITEM 16D", "ITEM 16E", "ITEM 16F", "ITEM 16G", "ITEM 16H",
            "ITEM 16I"
        ],
        "PART III": ["ITEM 17", "ITEM 18", "ITEM 19"]
    }
}


class SectionExtractor:
    def __init__(self, pages: List[Any], filing_type: Optional[Literal["10-K", "10-Q", "20-F", "8-K"]] = None,
                 desired_items: Optional[set] = None, debug: bool = False):
        """Initialize SectionExtractor.

        Args:
            pages: List of Page objects
            filing_type: Type of filing ("10-K", "10-Q", "20-F", or "8-K")
            desired_items: For 8-K only: set of item numbers to extract (e.g., {"2.02", "9.01"})
            debug: Enable debug logging
        """
        from sec2md.models import Page

        # Store original Page objects to preserve elements
        self._original_pages = {p.number: p for p in pages}

        # Convert to dict format for internal processing
        self.pages = [{"page": p.number, "content": p.content} for p in pages]
        self.filing_type = filing_type
        self.structure = FILING_STRUCTURES.get(filing_type) if filing_type else None
        self.desired_items = desired_items
        self.debug = debug

        self._toc_locked = False

    def _log(self, msg: str):
        if self.debug:
            print(msg)

    @staticmethod
    def _normalize_section_key(part: Optional[str], item_num: Optional[str]) -> tuple[Optional[str], Optional[str]]:
        part_key = re.sub(r'\s+', ' ', part.upper().strip()) if part else None
        item_key = f"ITEM {item_num.upper()}" if item_num else None
        return part_key, item_key

    @staticmethod
    def _normalize_section(text: str) -> str:
        return re.sub(r'\s+', ' ', text.upper().strip())

    def _clean_lines(self, content: str) -> List[str]:
        content = content.replace(NBSP, ' ').replace(NARROW_NBSP, ' ').replace(ZWSP, '')
        lines = [ln.rstrip() for ln in content.split('\n')]
        out = []
        for ln in lines:
            if HEADER_FOOTER_RE.match(ln) or PAGE_NUM_RE.match(ln):
                continue
            ln = MD_EDGE.sub('', ln)
            out.append(ln)
        return out

    def _infer_part_for_item(self, filing_type: str, item_key: str) -> Optional[str]:
        m = re.match(r'ITEM\s+(\d{1,2})', item_key)
        if not m:
            return None
        num = int(m.group(1))
        if filing_type == "10-K":
            if 1 <= num <= 4:
                return "PART I"
            elif 5 <= num <= 9:
                return "PART II"
            elif 10 <= num <= 14:
                return "PART III"
            elif 15 <= num <= 16:
                return "PART IV"
        elif filing_type == "10-Q":
            if 1 <= num <= 4:
                return "PART I"
            else:
                return "PART II"
        return None

    @staticmethod
    def _clean_item_title(title: str) -> str:
        title = re.sub(r'^\s*[:.\-–—]\s*', '', title)
        title = re.sub(r'\s+', ' ', title).strip()
        return title

    def _is_toc(self, content: str, page_num: int = 1) -> bool:
        # Simple rule: within first 5 pages, if we see multiple matches, treat as TOC.
        # “Multiple” = ≥3 ITEM rows OR ≥3 dotted-leader lines.
        if self._toc_locked or page_num > 5:
            return False

        item_hits = len(ITEM_ROWS_RE.findall(content))
        leader_hits = len(DOT_LEAD_RE.findall(content))

        return (item_hits >= 3) or (leader_hits >= 3)

    # ========== 8-K Specific Methods ==========

    # 8-K item header regex: ITEM 1.01 / 7.01 / 9.01
    _ITEM_8K_RE = re.compile(
        rf'^\s*{LEAD_WRAP}(ITEM)\s+([1-9]\.\d{{2}}[A-Z]?)\.?\s*(?:[:.\-–—]\s*)?(.*)',
        re.IGNORECASE | re.MULTILINE
    )

    # 8-K hard stops (SIGNATURES, EXHIBIT INDEX)
    _HARD_STOP_8K_RE = re.compile(r'^\s*(SIGNATURES|EXHIBIT\s+INDEX)\b', re.IGNORECASE | re.MULTILINE)

    # Promote inline "Item x.xx" to its own line
    _PROMOTE_ITEM_8K_RE = re.compile(r'(?<!\n)(\s)(ITEM\s+[1-9]\.\d{2}[A-Z]?\s*[.:–—-])', re.IGNORECASE)

    # Exhibits table parsing
    _PIPE_ROW_RE = re.compile(r'^\s*\|?\s*([0-9]{1,4}(?:\.[0-9A-Za-z]+)?)\s*\|\s*(.+?)\s*\|?\s*$', re.MULTILINE)
    _SPACE_ROW_RE = re.compile(r'^\s*([0-9]{1,4}(?:\.[0-9A-Za-z]+)?)\s{2,}(.+?)\s*$', re.MULTILINE)
    _HTML_ROW_RE = re.compile(
        r'<tr[^>]*>\s*<t[dh][^>]*>\s*([^<]+?)\s*</t[dh]>\s*<t[dh][^>]*>\s*([^<]+?)\s*</t[dh]>\s*</tr>',
        re.IGNORECASE | re.DOTALL
    )

    @staticmethod
    def _normalize_8k_item_code(code: str) -> str:
        """Normalize '5.2' -> '5.02', keep suffix 'A' if present."""
        code = code.upper().strip()
        m = re.match(r'^([1-9])\.(\d{1,2})([A-Z]?)$', code)
        if not m:
            return code
        major, minor, suffix = m.groups()
        minor = f"{int(minor):02d}"
        return f"{major}.{minor}{suffix}"

    def _clean_8k_text(self, text: str) -> str:
        """Clean 8-K text: remove headers/footers, normalize whitespace, promote inline items."""
        text = text.replace(NBSP, " ").replace(NARROW_NBSP, " ").replace(ZWSP, "")

        # Promote inline item headings to their own line
        text = self._PROMOTE_ITEM_8K_RE.sub(r'\n\2', text)

        # Remove Form 8-K headers/footers
        header_footer_8k = re.compile(
            r'^\s*(Form\s+8\-K|Page\s+\d+(?:\s+of\s+\d+)?|UNITED\s+STATES\s+SECURITIES\s+AND\s+EXCHANGE\s+COMMISSION)\b',
            re.IGNORECASE
        )

        lines: List[str] = []
        for ln in text.splitlines():
            t = ln.strip()
            if header_footer_8k.match(t):
                continue
            t = MD_EDGE.sub("", t)  # strip leading/trailing **/__ wrappers
            # Drop trivial table header separators like | --- | --- |
            if re.fullmatch(r'\|\s*-{3,}\s*\|\s*-{3,}\s*\|?', t):
                continue
            lines.append(t)

        # Collapse multiple blank lines into one
        out: List[str] = []
        prev_blank = False
        for ln in lines:
            blank = (ln == "")
            if blank and prev_blank:
                continue
            out.append(ln)
            prev_blank = blank

        return "\n".join(out).strip()

    def _parse_exhibits(self, block: str) -> List[Any]:
        """Parse exhibit table from 9.01 section."""
        from sec2md.models import Exhibit

        rows: List[Exhibit] = []

        # Try pipe table rows first
        for m in self._PIPE_ROW_RE.finditer(block):
            left, right = m.group(1).strip(), m.group(2).strip()
            if not re.match(r'^\d', left):
                continue  # skip headers like "EXHIBIT NO."
            if left.startswith('---') or right.startswith('---'):
                continue  # skip separators
            rows.append(Exhibit(exhibit_no=left, description=right))
        if rows:
            return rows

        # Fallback: space-aligned two columns
        for m in self._SPACE_ROW_RE.finditer(block):
            left, right = m.group(1).strip(), m.group(2).strip()
            if not re.match(r'^\d', left):
                continue
            rows.append(Exhibit(exhibit_no=left, description=right))
        if rows:
            return rows

        # Fallback: basic HTML table
        for m in self._HTML_ROW_RE.finditer(block):
            left, right = m.group(1).strip(), m.group(2).strip()
            if not re.match(r'^\d', left):
                continue
            rows.append(Exhibit(exhibit_no=left, description=right))

        return rows

    def _slice_8k_body(self, doc: str, start_after: int, next_item_start: int) -> str:
        """Slice body text from start_after up to earliest hard stop or next_item_start."""
        mstop = self._HARD_STOP_8K_RE.search(doc, pos=start_after, endpos=next_item_start)
        end = mstop.start() if mstop else next_item_start
        return doc[start_after:end].strip()

    def _get_8k_sections(self) -> List[Any]:
        """Extract 8-K sections (items only, no PART divisions)."""
        from sec2md.models import Section, Page, ITEM_8K_TITLES

        # Concatenate all pages into one doc
        full_content = "\n\n".join(p["content"] for p in self.pages)
        doc = self._clean_8k_text(full_content)

        if not doc:
            self._log("DEBUG: No content after cleaning")
            return []

        # Find all item headers
        headers: List[Dict] = []
        for m in self._ITEM_8K_RE.finditer(doc):
            code = self._normalize_8k_item_code(m.group(2))
            title_inline = (m.group(3) or "").strip()
            # Clean markdown artifacts from title
            title_inline = MD_EDGE.sub("", title_inline)
            title = title_inline if title_inline else ITEM_8K_TITLES.get(code)
            headers.append({"start": m.start(), "end": m.end(), "no": code, "title": title})
            self._log(f"DEBUG: Found ITEM {code} at position {m.start()}")

        if not headers:
            self._log("DEBUG: No item headers found")
            return []

        self._log(f"DEBUG: Total headers found: {len(headers)}")

        # Extract sections
        results: List[Section] = []
        for i, h in enumerate(headers):
            code = h["no"]
            next_start = headers[i + 1]["start"] if i + 1 < len(headers) else len(doc)
            body = self._slice_8k_body(doc, h["end"], next_start)

            # Filter by desired_items if provided
            if self.desired_items and code not in self.desired_items:
                self._log(f"DEBUG: Skipping ITEM {code} (not in desired_items)")
                continue

            # For 9.01, parse exhibits
            exhibits = []
            if code.startswith("9.01"):
                md = re.search(r'^\s*\(?d\)?\s*Exhibits\b.*$', body, re.IGNORECASE | re.MULTILINE)
                ex_block = body[md.end():].strip() if md else body
                exhibits = self._parse_exhibits(ex_block)
                self._log(f"DEBUG: Found {len(exhibits)} exhibits in 9.01")

            # Map back to Page objects (approximate page boundaries from original content)
            # Since 8-K sections can span pages, we need to find which pages contain this content
            section_pages = self._map_8k_content_to_pages(body)

            # Create Section with exhibits (now part of the model)
            section = Section(
                part=None,  # 8-K has no PART divisions
                item=f"ITEM {code}",
                item_title=h["title"],
                pages=section_pages,
                exhibits=exhibits if exhibits else None
            )

            results.append(section)
            self._log(f"DEBUG: Extracted ITEM {code} with {len(section_pages)} pages")

        self._log(f"DEBUG: Total sections extracted: {len(results)}")
        return results

    def _map_8k_content_to_pages(self, section_content: str) -> List[Any]:
        """Map extracted section content back to Page objects."""
        from sec2md.models import Page

        # Try to find which original pages contain this content
        # This is heuristic-based: match by content overlap
        matched_pages = []
        section_preview = section_content[:500]  # Use first 500 chars for matching

        for page_dict in self.pages:
            page_num = page_dict["page"]
            page_content = self._clean_8k_text(page_dict["content"])

            # Check if this page contains part of the section
            if section_preview in page_content or page_content in section_content:
                original_page = self._original_pages.get(page_num)
                matched_pages.append(
                    Page(
                        number=page_num,
                        content=page_content,
                        elements=original_page.elements if original_page else None,
                        text_blocks=original_page.text_blocks if original_page else None
                    )
                )

        # If no matches found (shouldn't happen), create a synthetic page
        if not matched_pages:
            matched_pages.append(
                Page(
                    number=1,
                    content=section_content,
                    elements=None,
                    text_blocks=None
                )
            )

        return matched_pages

    # ========== End 8-K Methods ==========

    def get_sections(self) -> List[Any]:
        """Get sections from the filing.

        Routes to appropriate handler based on filing_type:
        - 8-K: Uses _get_8k_sections() (flat item structure)
        - 10-K/10-Q/20-F: Uses _get_standard_sections() (PART + ITEM structure)
        """
        if self.filing_type == "8-K":
            return self._get_8k_sections()
        else:
            return self._get_standard_sections()

    def _get_standard_sections(self) -> List[Any]:
        """Extract 10-K/10-Q/20-F sections (PART + ITEM structure)."""
        sections = []
        current_part = None
        current_item = None
        current_item_title = None
        current_pages: List[Dict] = []

        def flush_section():
            nonlocal sections, current_part, current_item, current_item_title, current_pages
            if current_pages:
                sections.append({
                    "part": current_part,
                    "item": current_item,
                    "item_title": current_item_title,
                    "page_start": current_pages[0]["page"],
                    "pages": current_pages
                })
            current_pages = []

        for page_dict in self.pages:
            page_num = page_dict["page"]
            content = page_dict["content"]

            if self._is_toc(content, page_num):
                self._log(f"DEBUG: Page {page_num} detected as TOC, skipping")
                continue

            lines = self._clean_lines(content)
            joined = "\n".join(lines)

            if not joined.strip():
                self._log(f"DEBUG: Page {page_num} is empty after cleaning")
                continue

            part_m = None
            item_m = None
            first_idx = None
            first_kind = None

            for m in PART_PATTERN.finditer(joined):
                part_m = m
                first_idx = m.start()
                first_kind = 'part'
                self._log(f"DEBUG: Page {page_num} found PART at position {first_idx}: {m.group(1)}")
                break

            for m in ITEM_PATTERN.finditer(joined):
                if first_idx is None or m.start() < first_idx:
                    item_m = m
                    first_idx = m.start()
                    first_kind = 'item'
                    self._log(f"DEBUG: Page {page_num} found ITEM at position {first_idx}: ITEM {m.group(2)}")
                break

            if first_kind is None:
                self._log(f"DEBUG: Page {page_num} - no header found. In section: {current_part or current_item}")
                if current_part or current_item:
                    if joined.strip():
                        current_pages.append({"page": page_num, "content": joined})
                continue

            before = joined[:first_idx].strip()
            after = joined[first_idx:].strip()

            if (current_part or current_item) and before:
                current_pages.append({"page": page_num, "content": before})

            flush_section()

            if first_kind == 'part' and part_m:
                part_text = part_m.group(1)
                current_part, _ = self._normalize_section_key(part_text, None)
                current_item = None
                current_item_title = None
            elif first_kind == 'item' and item_m:
                item_num = item_m.group(2)
                title = (item_m.group(3) or "").strip()
                current_item_title = self._clean_item_title(title) if title else None
                if current_part is None and self.filing_type:
                    inferred = self._infer_part_for_item(self.filing_type, f"ITEM {item_num.upper()}")
                    if inferred:
                        current_part = inferred
                        self._log(f"DEBUG: Inferred {inferred} at detection time for ITEM {item_num}")
                _, current_item = self._normalize_section_key(current_part, item_num)

            if after:
                current_pages.append({"page": page_num, "content": after})

                if first_kind == 'part' and part_m:
                    item_after = None
                    for m in ITEM_PATTERN.finditer(after):
                        item_after = m
                        break
                    if item_after:
                        start = item_after.start()
                        current_pages[-1]["content"] = after[start:]
                        item_num = item_after.group(2)
                        title = (item_after.group(3) or "").strip()
                        current_item_title = self._clean_item_title(title) if title else None
                        _, current_item = self._normalize_section_key(current_part, item_num)
                        self._log(f"DEBUG: Page {page_num} - promoted PART to ITEM {item_num} (intra-page)")
                        after = current_pages[-1]["content"]

                tail = after
                while True:
                    next_kind, next_idx, next_part_m, next_item_m = None, None, None, None

                    for m in PART_PATTERN.finditer(tail):
                        if m.start() > 0:
                            next_kind, next_idx, next_part_m = 'part', m.start(), m
                            break
                    for m in ITEM_PATTERN.finditer(tail):
                        if m.start() > 0 and (next_idx is None or m.start() < next_idx):
                            next_kind, next_idx, next_item_m = 'item', m.start(), m

                    if next_idx is None:
                        break

                    before_seg = tail[:next_idx].strip()
                    after_seg = tail[next_idx:].strip()

                    if before_seg:
                        current_pages[-1]["content"] = before_seg
                    flush_section()

                    if next_kind == 'part' and next_part_m:
                        current_part, _ = self._normalize_section_key(next_part_m.group(1), None)
                        current_item = None
                        current_item_title = None
                        self._log(f"DEBUG: Page {page_num} - intra-page PART transition to {current_part}")
                    elif next_kind == 'item' and next_item_m:
                        item_num = next_item_m.group(2)
                        title = (next_item_m.group(3) or "").strip()
                        current_item_title = self._clean_item_title(title) if title else None
                        if current_part is None and self.filing_type:
                            inferred = self._infer_part_for_item(self.filing_type, f"ITEM {item_num.upper()}")
                            if inferred:
                                current_part = inferred
                                self._log(f"DEBUG: Inferred {inferred} at detection time for ITEM {item_num}")
                        _, current_item = self._normalize_section_key(current_part, item_num)
                        self._log(f"DEBUG: Page {page_num} - intra-page ITEM transition to {current_item}")

                    current_pages.append({"page": page_num, "content": after_seg})
                    tail = after_seg

        flush_section()

        self._log(f"DEBUG: Total sections before validation: {len(sections)}")
        for s in sections:
            self._log(f"  - Part: {s['part']}, Item: {s['item']}, Pages: {len(s['pages'])}, Start: {s['page_start']}")

        def _section_text_len(s):
            return sum(len(p["content"].strip()) for p in s["pages"])

        sections = [s for s in sections if s["item"] is not None or _section_text_len(s) > 80]
        self._log(f"DEBUG: Sections after dropping empty PART stubs: {len(sections)}")

        if self.structure and sections:
            self._log(f"DEBUG: Validating against structure: {self.filing_type}")
            fixed = []
            for s in sections:
                part = s["part"]
                item = s["item"]

                if part is None and item and self.filing_type:
                    inferred = self._infer_part_for_item(self.filing_type, item)
                    if inferred:
                        self._log(f"DEBUG: Inferred {inferred} from {item}")
                        s = {**s, "part": inferred}
                        part = inferred

                if (part in self.structure) and (item is None or item in self.structure.get(part, [])):
                    fixed.append(s)
                else:
                    self._log(f"DEBUG: Dropped section - Part: {part}, Item: {item}")

            sections = fixed
            self._log(f"DEBUG: Sections after validation: {len(sections)}")

        # Convert to Section objects with Page objects (preserving elements)
        from sec2md.models import Section, Page

        section_objects = []
        for section_data in sections:
            # Build Page objects for this section, preserving elements from originals
            section_pages = []
            for page_dict in section_data["pages"]:
                page_num = page_dict["page"]
                original_page = self._original_pages.get(page_num)

                # Filter text_blocks to only include ones relevant to this section's content
                filtered_text_blocks = None
                if original_page and original_page.text_blocks:
                    section_content = page_dict["content"]
                    filtered_text_blocks = []
                    for tb in original_page.text_blocks:
                        # Include TextBlock if:
                        # 1. Its title appears in section content, OR
                        # 2. Any of its element content appears in section (for short titles)
                        title_match = tb.title and tb.title in section_content
                        content_match = any(
                            # Check if element content (or significant portion) is in section
                            elem.content[:200] in section_content or section_content in elem.content
                            for elem in tb.elements
                        )
                        if title_match or content_match:
                            filtered_text_blocks.append(tb)
                    filtered_text_blocks = filtered_text_blocks if filtered_text_blocks else None

                section_pages.append(
                    Page(
                        number=page_num,
                        content=page_dict["content"],
                        elements=original_page.elements if original_page else None,
                        text_blocks=filtered_text_blocks
                    )
                )

            section_objects.append(
                Section(
                    part=section_data["part"],
                    item=section_data["item"],
                    item_title=section_data["item_title"],
                    pages=section_pages
                )
            )

        return section_objects

    def get_section(self, part: str, item: Optional[str] = None):
        """Get a specific section by part and item.

        Args:
            part: Part name (e.g., "PART I")
            item: Optional item name (e.g., "ITEM 1A")

        Returns:
            Section object if found, None otherwise
        """
        from sec2md.models import Section

        part_normalized = self._normalize_section(part)
        item_normalized = self._normalize_section(item) if item else None
        sections = self.get_sections()

        for section in sections:
            if section.part == part_normalized:
                if item_normalized is None or section.item == item_normalized:
                    return section
        return None
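
For orientation, a minimal usage sketch of the file above (not taken from the package's docs). It builds toy Page objects using the same keyword arguments this file itself passes to Page, drives SectionExtractor exactly as defined here, and assumes Section exposes its constructor fields as attributes, which get_section's comparisons imply. The filing text is invented sample data; real pages would normally come from sec2md's own parser rather than being built by hand.

# Minimal sketch with toy data; Page kwargs mirror the calls in section_extractor.py.
from sec2md.models import Page
from sec2md.section_extractor import SectionExtractor

pages = [
    Page(number=1,
         content="PART I\n\nITEM 1. Business\n\nWe design widgets for industrial use.",
         elements=None, text_blocks=None),
    Page(number=2,
         content="ITEM 1A. Risk Factors\n\nDemand for widgets may decline.",
         elements=None, text_blocks=None),
]

extractor = SectionExtractor(pages, filing_type="10-K", debug=True)

for section in extractor.get_sections():
    # Fields are set in Section(...) calls in the file above.
    print(section.part, section.item, section.item_title)

# Look up one section; get_section() normalizes case and whitespace first.
risk_factors = extractor.get_section("Part I", "Item 1A")

# For an 8-K there are no PARTs; desired_items filters by item code, e.g.:
# SectionExtractor(pages_8k, filing_type="8-K", desired_items={"2.02", "9.01"})

Note how PART I on page 1 is immediately "promoted" to ITEM 1 by the intra-page scan, so the first extracted section starts at the item header rather than at the bare PART heading.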