sec2md 0.1.5 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


sec2md/chunking.py ADDED
@@ -0,0 +1,179 @@
+"""Chunking utilities for page-aware markdown splitting."""
+
+from typing import List, Optional
+from collections import defaultdict
+from sec2md.models import Page, Section, TextBlock
+from sec2md.chunker.markdown_chunker import MarkdownChunker
+from sec2md.chunker.markdown_chunk import MarkdownChunk
+
+
+def chunk_pages(
+    pages: List[Page],
+    chunk_size: int = 512,
+    chunk_overlap: int = 128,
+    header: Optional[str] = None
+) -> List[MarkdownChunk]:
+    """
+    Chunk pages into overlapping markdown chunks.
+
+    Args:
+        pages: List of Page objects (with optional elements)
+        chunk_size: Target chunk size in tokens (estimated as chars/4)
+        chunk_overlap: Overlap between chunks in tokens
+        header: Optional header to prepend to each chunk's embedding_text
+
+    Returns:
+        List of MarkdownChunk objects with page tracking and elements
+
+    Example:
+        >>> pages = sec2md.convert_to_markdown(html, return_pages=True, include_elements=True)
+        >>> chunks = sec2md.chunk_pages(pages, chunk_size=512)
+        >>> for chunk in chunks:
+        ...     print(f"Page {chunk.page}: {chunk.content[:100]}...")
+        ...     print(f"Elements: {chunk.elements}")
+    """
+    chunker = MarkdownChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    return chunker.split(pages=pages, header=header)
+
+
+def chunk_section(
+    section: Section,
+    chunk_size: int = 512,
+    chunk_overlap: int = 128,
+    header: Optional[str] = None
+) -> List[MarkdownChunk]:
+    """
+    Chunk a filing section into overlapping markdown chunks.
+
+    Args:
+        section: Section object from extract_sections()
+        chunk_size: Target chunk size in tokens (estimated as chars/4)
+        chunk_overlap: Overlap between chunks in tokens
+        header: Optional header to prepend to each chunk's embedding_text
+
+    Returns:
+        List of MarkdownChunk objects
+
+    Example:
+        >>> sections = sec2md.extract_sections(pages, filing_type="10-K")
+        >>> risk = sec2md.get_section(sections, Item10K.RISK_FACTORS)
+        >>> chunks = sec2md.chunk_section(risk, chunk_size=512)
+    """
+    return chunk_pages(
+        pages=section.pages,
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        header=header
+    )
+
+
+def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
+    """
+    Merge multi-page TextBlocks into single TextBlock objects.
+
+    When a financial note (e.g., Debt Disclosure) spans multiple pages,
+    this merges all elements and page references into one TextBlock.
+
+    Args:
+        pages: List of Page objects with text_blocks populated
+
+    Returns:
+        List of merged TextBlock objects with page metadata:
+        - page_start: First page the note appears on
+        - page_end: Last page the note appears on
+        - source_pages: All pages the note spans
+        - elements: All elements from all pages
+
+    Example:
+        >>> pages = parser.get_pages(include_elements=True)
+        >>> merged = merge_text_blocks(pages)
+        >>> for tb in merged:
+        ...     print(f"{tb.title}: pages {tb.page_start}-{tb.page_end}")
+        Debt Disclosure: pages 45-46
+        Segment Reporting: pages 49-50
+    """
+    # Group by TextBlock name
+    tb_map = defaultdict(lambda: {
+        "name": None,
+        "title": None,
+        "elements": [],
+        "page_start": float('inf'),
+        "page_end": -1,
+        "pages": set()
+    })
+
+    for page in pages:
+        if page.text_blocks:
+            for tb in page.text_blocks:
+                tb_map[tb.name]["name"] = tb.name
+                tb_map[tb.name]["title"] = tb.title
+                tb_map[tb.name]["elements"].extend(tb.elements)
+                tb_map[tb.name]["page_start"] = min(tb_map[tb.name]["page_start"], page.number)
+                tb_map[tb.name]["page_end"] = max(tb_map[tb.name]["page_end"], page.number)
+                tb_map[tb.name]["pages"].add(page.number)
+
+    # Create merged TextBlock objects
+    merged = []
+    for tb_data in tb_map.values():
+        tb = TextBlock(
+            name=tb_data["name"],
+            title=tb_data["title"],
+            elements=tb_data["elements"],
+            page_start=tb_data["page_start"],
+            page_end=tb_data["page_end"],
+            source_pages=sorted(tb_data["pages"])
+        )
+        merged.append(tb)
+
+    return merged
+
+
+def chunk_text_block(
+    text_block: TextBlock,
+    chunk_size: int = 512,
+    chunk_overlap: int = 128,
+    include_title_as_header: bool = True
+) -> List[MarkdownChunk]:
+    """
+    Chunk a single TextBlock (financial note).
+
+    Args:
+        text_block: TextBlock object (possibly spanning multiple pages)
+        chunk_size: Target chunk size in tokens (estimated as chars/4)
+        chunk_overlap: Overlap between chunks in tokens
+        include_title_as_header: Prepend note title to chunks for embedding
+
+    Returns:
+        List of MarkdownChunk objects with elements preserved
+
+    Example:
+        >>> merged = merge_text_blocks(pages)
+        >>> debt_note = [tb for tb in merged if "Debt" in tb.title][0]
+        >>> chunks = chunk_text_block(debt_note, chunk_size=512)
+        >>> print(f"Chunked {debt_note.title} into {len(chunks)} chunks")
+        >>> print(f"Note spans pages {debt_note.page_start}-{debt_note.page_end}")
+    """
+    # Group elements by page
+    elements_by_page = defaultdict(list)
+    for elem in text_block.elements:
+        # Use page_start for grouping (elements are always on single pages in practice)
+        elements_by_page[elem.page_start].append(elem)
+
+    # Create one Page per page the TextBlock spans, with only elements from that page
+    pages = []
+    for page_num in sorted(elements_by_page.keys()):
+        elems = elements_by_page[page_num]
+        # Join content from elements on this page
+        content = "\n\n".join(e.content for e in elems)
+
+        pages.append(Page(
+            number=page_num,  # Real page number
+            content=content,  # Only content from this page
+            elements=elems  # Only elements from this page
+        ))
+
+    # Chunk normally across all pages
+    header = f"Note: {text_block.title}" if include_title_as_header and text_block.title else None
+    chunker = MarkdownChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+
+    return chunker.split(pages=pages, header=header)
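
Taken together, these functions support a note-level chunking pipeline: merge a financial note that spans several pages, then chunk it with its title as the embedding header. A minimal sketch assembled only from the docstring examples above; the `pages` variable is assumed to come from a parser call with elements included, as those examples show:

from sec2md.chunking import merge_text_blocks, chunk_text_block

# `pages` is assumed to be a List[Page] with text_blocks populated,
# e.g. from parser.get_pages(include_elements=True) as in the docstrings above.
merged = merge_text_blocks(pages)

for note in merged:
    # Each merged TextBlock carries page_start/page_end/source_pages metadata.
    chunks = chunk_text_block(note, chunk_size=512, chunk_overlap=128)
    print(f"{note.title}: pages {note.page_start}-{note.page_end}, {len(chunks)} chunks")
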
sec2md/core.py ADDED
@@ -0,0 +1,93 @@
+"""Core conversion functionality."""
+
+from typing import overload, List
+from sec2md.utils import is_url, fetch
+from sec2md.parser import Parser
+from sec2md.models import Page
+
+
+@overload
+def convert_to_markdown(
+    source: str | bytes,
+    *,
+    user_agent: str | None = None,
+    return_pages: bool = False,
+) -> str: ...
+
+
+@overload
+def convert_to_markdown(
+    source: str | bytes,
+    *,
+    user_agent: str | None = None,
+    return_pages: bool = True,
+) -> List[Page]: ...
+
+
+def convert_to_markdown(
+    source: str | bytes,
+    *,
+    user_agent: str | None = None,
+    return_pages: bool = False,
+) -> str | List[Page]:
+    """
+    Convert SEC filing HTML to Markdown.
+
+    Args:
+        source: URL or HTML string/bytes
+        user_agent: User agent for EDGAR requests (required for sec.gov URLs)
+        return_pages: If True, returns List[Page] instead of markdown string
+
+    Returns:
+        Markdown string (default) or List[Page] if return_pages=True
+
+    Raises:
+        ValueError: If source appears to be PDF content or other non-HTML format
+
+    Examples:
+        >>> # From URL - get markdown
+        >>> md = convert_to_markdown(
+        ...     "https://www.sec.gov/Archives/edgar/data/.../10k.htm",
+        ...     user_agent="Lucas Astorian <lucas@intellifin.ai>"
+        ... )
+
+        >>> # Get pages for section extraction
+        >>> pages = convert_to_markdown(filing.html(), return_pages=True)
+
+        >>> # With edgartools
+        >>> from edgar import Company, set_identity
+        >>> set_identity("Lucas Astorian <lucas@intellifin.ai>")
+        >>> company = Company('AAPL')
+        >>> filing = company.get_filings(form="10-K").latest()
+        >>> md = convert_to_markdown(filing.html())
+    """
+    # Handle bytes input
+    if isinstance(source, bytes):
+        # Check if it's PDF
+        if source.startswith(b'%PDF'):
+            raise ValueError(
+                "PDF content detected. This library only supports HTML input. "
+                "Please extract HTML from the filing first."
+            )
+        source = source.decode('utf-8', errors='ignore')
+
+    # Check for PDF in string
+    if isinstance(source, str) and source.strip().startswith('%PDF'):
+        raise ValueError(
+            "PDF content detected. This library only supports HTML input. "
+            "Please extract HTML from the filing first."
+        )
+
+    # Fetch from URL if needed
+    if is_url(source):
+        html = fetch(source, user_agent=user_agent)
+    else:
+        html = source
+
+    # Parse and convert
+    parser = Parser(html)
+
+    if return_pages:
+        return parser.get_pages()
+    else:
+        return parser.markdown()
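
core.py and chunking.py compose into the end-to-end flow the docstrings describe: convert a filing to pages, then chunk those pages for embedding. A minimal sketch, assuming a real EDGAR filing URL and a descriptive contact string are substituted for the placeholders:

from sec2md.core import convert_to_markdown
from sec2md.chunking import chunk_pages

# Placeholder URL (elided path kept as in the docstring); sec.gov requires a
# descriptive User-Agent with contact details.
url = "https://www.sec.gov/Archives/edgar/data/.../10k.htm"
pages = convert_to_markdown(url, user_agent="Name <email@example.com>", return_pages=True)

# 512-token chunks with 128-token overlap, matching the defaults above.
chunks = chunk_pages(pages, chunk_size=512, chunk_overlap=128)
print(f"{len(pages)} pages -> {len(chunks)} chunks")
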
sec2md/models.py ADDED
@@ -0,0 +1,400 @@
+"""Data models for SEC filing parsing."""
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import List, Optional, Literal, Tuple
+from pydantic import BaseModel, Field, field_validator, computed_field
+
+try:
+    import tiktoken
+    TIKTOKEN_AVAILABLE = True
+except ImportError:
+    TIKTOKEN_AVAILABLE = False
+
+try:
+    from IPython.display import display, Markdown as IPythonMarkdown
+    IPYTHON_AVAILABLE = True
+except ImportError:
+    IPYTHON_AVAILABLE = False
+
+
+def _count_tokens(text: str) -> int:
+    """Count tokens in text using tiktoken if available, else char/4 heuristic."""
+    if TIKTOKEN_AVAILABLE:
+        encoding = tiktoken.get_encoding("cl100k_base")
+        return len(encoding.encode(text))
+    else:
+        return max(1, len(text) // 4)
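
This helper is what the "estimated as chars/4" wording in the chunking docstrings refers to. A small illustrative sketch of its two paths (it is a private helper, so calling it directly is for demonstration only; the sample sentence is invented):

from sec2md.models import _count_tokens

sample = "Total revenue increased 8% year over year."
# With tiktoken installed: length of the cl100k_base encoding.
# Without tiktoken: max(1, len(sample) // 4), i.e. 42 chars -> 10.
print(_count_tokens(sample))
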
+
+
+# Type alias for filing types
+FilingType = Literal["10-K", "10-Q"]
+
+
+class Item10K(str, Enum):
+    """10-K Filing Items - human readable names mapped to item numbers."""
+
+    # Part I
+    BUSINESS = "1"
+    RISK_FACTORS = "1A"
+    UNRESOLVED_STAFF_COMMENTS = "1B"
+    CYBERSECURITY = "1C"
+    PROPERTIES = "2"
+    LEGAL_PROCEEDINGS = "3"
+    MINE_SAFETY = "4"
+
+    # Part II
+    MARKET_FOR_STOCK = "5"
+    SELECTED_FINANCIAL_DATA = "6"  # Removed in recent years
+    MD_AND_A = "7"
+    MARKET_RISK = "7A"
+    FINANCIAL_STATEMENTS = "8"
+    CHANGES_IN_ACCOUNTING = "9"
+    CONTROLS_AND_PROCEDURES = "9A"
+    OTHER_INFORMATION = "9B"
+    CYBERSECURITY_DISCLOSURES = "9C"
+
+    # Part III
+    DIRECTORS_AND_OFFICERS = "10"
+    EXECUTIVE_COMPENSATION = "11"
+    SECURITY_OWNERSHIP = "12"
+    CERTAIN_RELATIONSHIPS = "13"
+    PRINCIPAL_ACCOUNTANT = "14"
+
+    # Part IV
+    EXHIBITS = "15"
+    FORM_10K_SUMMARY = "16"
+
+
+class Item10Q(str, Enum):
+    """10-Q Filing Items - human readable names with part disambiguation."""
+
+    # Part I
+    FINANCIAL_STATEMENTS_P1 = "1.P1"
+    MD_AND_A_P1 = "2.P1"
+    MARKET_RISK_P1 = "3.P1"
+    CONTROLS_AND_PROCEDURES_P1 = "4.P1"
+
+    # Part II
+    LEGAL_PROCEEDINGS_P2 = "1.P2"
+    RISK_FACTORS_P2 = "1A.P2"
+    UNREGISTERED_SALES_P2 = "2.P2"
+    DEFAULTS_P2 = "3.P2"
+    MINE_SAFETY_P2 = "4.P2"
+    OTHER_INFORMATION_P2 = "5.P2"
+    EXHIBITS_P2 = "6.P2"
+
+
+class Item8K(str, Enum):
+    """8-K Filing Items - event-driven disclosure items."""
+
+    # Section 1 – Registrant's Business and Operations
+    MATERIAL_AGREEMENT = "1.01"
+    TERMINATION_OF_AGREEMENT = "1.02"
+    BANKRUPTCY = "1.03"
+    MINE_SAFETY = "1.04"
+    CYBERSECURITY_INCIDENT = "1.05"
+
+    # Section 2 – Financial Information
+    ACQUISITION_DISPOSITION = "2.01"
+    RESULTS_OF_OPERATIONS = "2.02"
+    DIRECT_FINANCIAL_OBLIGATION = "2.03"
+    TRIGGERING_EVENTS = "2.04"
+    EXIT_DISPOSAL_COSTS = "2.05"
+    MATERIAL_IMPAIRMENTS = "2.06"
+
+    # Section 3 – Securities and Trading Markets
+    DELISTING_NOTICE = "3.01"
+    UNREGISTERED_SALES = "3.02"
+    SECURITY_RIGHTS_MODIFICATION = "3.03"
+
+    # Section 4 – Matters Related to Accountants and Financial Statements
+    ACCOUNTANT_CHANGE = "4.01"
+    NON_RELIANCE = "4.02"
+
+    # Section 5 – Corporate Governance and Management
+    CONTROL_CHANGE = "5.01"
+    DIRECTOR_OFFICER_CHANGE = "5.02"
+    AMENDMENTS_TO_ARTICLES = "5.03"
+    TRADING_SUSPENSION = "5.04"
+    CODE_OF_ETHICS = "5.05"
+    SHELL_COMPANY_STATUS = "5.06"
+    SHAREHOLDER_VOTE = "5.07"
+    SHAREHOLDER_NOMINATIONS = "5.08"
+
+    # Section 6 – Asset-Backed Securities
+    ABS_INFORMATIONAL = "6.01"
+    SERVICER_TRUSTEE_CHANGE = "6.02"
+    CREDIT_ENHANCEMENT_CHANGE = "6.03"
+    DISTRIBUTION_FAILURE = "6.04"
+    SECURITIES_ACT_UPDATING = "6.05"
+    STATIC_POOL = "6.06"
+
+    # Section 7 – Regulation FD
+    REGULATION_FD = "7.01"
+
+    # Section 8 – Other Events
+    OTHER_EVENTS = "8.01"
+
+    # Section 9 – Financial Statements and Exhibits
+    FINANCIAL_STATEMENTS_EXHIBITS = "9.01"
+
+
+# Internal mappings from enum to (part, item) tuples
+ITEM_10K_MAPPING: dict[Item10K, Tuple[str, str]] = {
+    # Part I
+    Item10K.BUSINESS: ("PART I", "ITEM 1"),
+    Item10K.RISK_FACTORS: ("PART I", "ITEM 1A"),
+    Item10K.UNRESOLVED_STAFF_COMMENTS: ("PART I", "ITEM 1B"),
+    Item10K.CYBERSECURITY: ("PART I", "ITEM 1C"),
+    Item10K.PROPERTIES: ("PART I", "ITEM 2"),
+    Item10K.LEGAL_PROCEEDINGS: ("PART I", "ITEM 3"),
+    Item10K.MINE_SAFETY: ("PART I", "ITEM 4"),
+
+    # Part II
+    Item10K.MARKET_FOR_STOCK: ("PART II", "ITEM 5"),
+    Item10K.SELECTED_FINANCIAL_DATA: ("PART II", "ITEM 6"),
+    Item10K.MD_AND_A: ("PART II", "ITEM 7"),
+    Item10K.MARKET_RISK: ("PART II", "ITEM 7A"),
+    Item10K.FINANCIAL_STATEMENTS: ("PART II", "ITEM 8"),
+    Item10K.CHANGES_IN_ACCOUNTING: ("PART II", "ITEM 9"),
+    Item10K.CONTROLS_AND_PROCEDURES: ("PART II", "ITEM 9A"),
+    Item10K.OTHER_INFORMATION: ("PART II", "ITEM 9B"),
+    Item10K.CYBERSECURITY_DISCLOSURES: ("PART II", "ITEM 9C"),
+
+    # Part III
+    Item10K.DIRECTORS_AND_OFFICERS: ("PART III", "ITEM 10"),
+    Item10K.EXECUTIVE_COMPENSATION: ("PART III", "ITEM 11"),
+    Item10K.SECURITY_OWNERSHIP: ("PART III", "ITEM 12"),
+    Item10K.CERTAIN_RELATIONSHIPS: ("PART III", "ITEM 13"),
+    Item10K.PRINCIPAL_ACCOUNTANT: ("PART III", "ITEM 14"),
+
+    # Part IV
+    Item10K.EXHIBITS: ("PART IV", "ITEM 15"),
+    Item10K.FORM_10K_SUMMARY: ("PART IV", "ITEM 16"),
+}
+
+
+ITEM_10Q_MAPPING: dict[Item10Q, Tuple[str, str]] = {
+    # Part I
+    Item10Q.FINANCIAL_STATEMENTS_P1: ("PART I", "ITEM 1"),
+    Item10Q.MD_AND_A_P1: ("PART I", "ITEM 2"),
+    Item10Q.MARKET_RISK_P1: ("PART I", "ITEM 3"),
+    Item10Q.CONTROLS_AND_PROCEDURES_P1: ("PART I", "ITEM 4"),
+
+    # Part II
+    Item10Q.LEGAL_PROCEEDINGS_P2: ("PART II", "ITEM 1"),
+    Item10Q.RISK_FACTORS_P2: ("PART II", "ITEM 1A"),
+    Item10Q.UNREGISTERED_SALES_P2: ("PART II", "ITEM 2"),
+    Item10Q.DEFAULTS_P2: ("PART II", "ITEM 3"),
+    Item10Q.MINE_SAFETY_P2: ("PART II", "ITEM 4"),
+    Item10Q.OTHER_INFORMATION_P2: ("PART II", "ITEM 5"),
+    Item10Q.EXHIBITS_P2: ("PART II", "ITEM 6"),
+}
+
+
+# 8-K items don't have PART divisions
+ITEM_8K_TITLES: dict[str, str] = {
+    "1.01": "Entry into a Material Definitive Agreement",
+    "1.02": "Termination of a Material Definitive Agreement",
+    "1.03": "Bankruptcy or Receivership",
+    "1.04": "Mine Safety – Reporting of Shutdowns and Patterns of Violations",
+    "1.05": "Material Cybersecurity Incidents",
+    "2.01": "Completion of Acquisition or Disposition of Assets",
+    "2.02": "Results of Operations and Financial Condition",
+    "2.03": "Creation of a Direct Financial Obligation or an Obligation under an Off-Balance Sheet Arrangement of a Registrant",
+    "2.04": "Triggering Events That Accelerate or Increase a Direct Financial Obligation or an Obligation under an Off-Balance Sheet Arrangement",
+    "2.05": "Costs Associated with Exit or Disposal Activities",
+    "2.06": "Material Impairments",
+    "3.01": "Notice of Delisting or Failure to Satisfy a Continued Listing Rule or Standard; Transfer of Listing",
+    "3.02": "Unregistered Sales of Equity Securities",
+    "3.03": "Material Modification to Rights of Security Holders",
+    "4.01": "Changes in Registrant's Certifying Accountant",
+    "4.02": "Non-Reliance on Previously Issued Financial Statements or a Related Audit Report or Completed Interim Review",
+    "5.01": "Changes in Control of Registrant",
+    "5.02": "Departure of Directors or Certain Officers; Election of Directors; Appointment of Certain Officers; Compensatory Arrangements of Certain Officers",
+    "5.03": "Amendments to Articles of Incorporation or Bylaws; Change in Fiscal Year",
+    "5.04": "Temporary Suspension of Trading Under Registrant's Employee Benefit Plans",
+    "5.05": "Amendments to the Registrant's Code of Ethics, or Waiver of a Provision of the Code of Ethics",
+    "5.06": "Change in Shell Company Status",
+    "5.07": "Submission of Matters to a Vote of Security Holders",
+    "5.08": "Shareholder Director Nominations",
+    "6.01": "ABS Informational and Computational Material",
+    "6.02": "Change of Servicer or Trustee",
+    "6.03": "Change in Credit Enhancement or Other External Support",
+    "6.04": "Failure to Make a Required Distribution",
+    "6.05": "Securities Act Updating Disclosure",
+    "6.06": "Static Pool",
+    "7.01": "Regulation FD Disclosure",
+    "8.01": "Other Events",
+    "9.01": "Financial Statements and Exhibits",
+}
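
A short sketch of how the enums and lookup tables above fit together; the printed values follow directly from the dictionaries defined here:

from sec2md.models import Item10K, Item8K, ITEM_10K_MAPPING, ITEM_8K_TITLES

# 10-K and 10-Q items resolve to (part, item) tuples.
part, item = ITEM_10K_MAPPING[Item10K.RISK_FACTORS]
print(part, item)  # PART I ITEM 1A

# 8-K items have no PART; their numbers key a title table instead.
print(ITEM_8K_TITLES[Item8K.RESULTS_OF_OPERATIONS.value])  # Results of Operations and Financial Condition
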
+
+
+class Exhibit(BaseModel):
+    """8-K exhibit entry."""
+    exhibit_no: str = Field(..., description="Exhibit number (e.g., '99.1', '104')")
+    description: str = Field(..., description="Exhibit description")
+
+    model_config = {"frozen": False}
+
+
+class TextBlock(BaseModel):
+    """XBRL TextBlock (e.g., financial statement note)."""
+
+    name: str = Field(..., description="XBRL tag name (e.g., 'us-gaap:DebtDisclosureTextBlock')")
+    title: Optional[str] = Field(None, description="Human-readable title (e.g., 'Note 9 – Debt')")
+    elements: List['Element'] = Field(default_factory=list, description="Element objects in this TextBlock")
+
+    # Optional: Set by merge_text_blocks() for multi-page notes
+    page_start: Optional[int] = Field(None, description="First page this TextBlock appears on")
+    page_end: Optional[int] = Field(None, description="Last page this TextBlock appears on")
+    source_pages: Optional[List[int]] = Field(None, description="All pages this TextBlock spans")
+
+    model_config = {"frozen": False, "arbitrary_types_allowed": True}
+
+    @computed_field
+    @property
+    def element_ids(self) -> List[str]:
+        """Get list of element IDs."""
+        return [e.id for e in self.elements]
+
+    def __repr__(self) -> str:
+        pages_info = f", pages={self.page_start}-{self.page_end}" if self.page_start else ""
+        return f"TextBlock(name='{self.name}', title='{self.title}', elements={len(self.elements)}{pages_info})"
+
+
+class Element(BaseModel):
+    """Citable semantic block of content."""
+
+    id: str = Field(..., description="Unique element ID for citation")
+    content: str = Field(..., description="Element text content")
+    kind: str = Field(..., description="Element type (e.g., 'paragraph', 'table', 'heading')")
+    page_start: int = Field(..., description="First page this element appears on")
+    page_end: int = Field(..., description="Last page this element appears on")
+
+    model_config = {"frozen": False}
+
+    @computed_field
+    @property
+    def char_count(self) -> int:
+        """Character count of this element."""
+        return len(self.content)
+
+    @computed_field
+    @property
+    def tokens(self) -> int:
+        """Token count of this element."""
+        return _count_tokens(self.content)
+
+    def __repr__(self) -> str:
+        preview = self.content[:80].replace('\n', ' ')
+        pages = f"p{self.page_start}" if self.page_start == self.page_end else f"p{self.page_start}-{self.page_end}"
+        return f"Element(id='{self.id}', kind='{self.kind}', {pages}, chars={len(self.content)}, preview='{preview}...')"
+
+
+class Page(BaseModel):
+    """Represents a single page of markdown content."""
+
+    number: int = Field(..., description="Page number in the filing")
+    content: str = Field(..., description="Markdown content of the page")
+    elements: Optional[List[Element]] = Field(None, description="Citable elements on this page")
+    text_blocks: Optional[List[TextBlock]] = Field(None, description="XBRL TextBlocks on this page")
+
+    model_config = {"frozen": False, "arbitrary_types_allowed": True}
+
+    @computed_field
+    @property
+    def tokens(self) -> int:
+        """Total number of tokens on this page."""
+        return _count_tokens(self.content)
+
+    def preview(self) -> None:
+        """
+        Preview the full page content.
+
+        Renders as Markdown in Jupyter/IPython, plain text in console.
+        """
+        if IPYTHON_AVAILABLE:
+            display(IPythonMarkdown(self.content))
+        else:
+            print(f"=== Page {self.number} ({self.tokens} tokens) ===")
+            print(self.content)
+
+    def __str__(self) -> str:
+        return self.content
+
+    def __repr__(self) -> str:
+        preview = self.content[:100].replace('\n', ' ')
+        elem_info = f", elements={len(self.elements)}" if self.elements else ""
+        tb_info = f", text_blocks={len(self.text_blocks)}" if self.text_blocks else ""
+        return f"Page(number={self.number}, tokens={self.tokens}{elem_info}{tb_info}, preview='{preview}...')"
+
+
+class Section(BaseModel):
+    """Represents a filing section (e.g., ITEM 1A - Risk Factors)."""
+
+    part: Optional[str] = Field(None, description="Part name (e.g., 'PART I', None for 8-K)")
+    item: Optional[str] = Field(None, description="Item identifier (e.g., 'ITEM 1A', 'ITEM 2.02')")
+    item_title: Optional[str] = Field(None, description="Item title")
+    pages: List[Page] = Field(default_factory=list, description="Pages in this section")
+    exhibits: Optional[List[Exhibit]] = Field(None, description="8-K exhibits (Item 9.01 only)")
+
+    model_config = {"frozen": False, "arbitrary_types_allowed": True}
+
+    @field_validator('pages')
+    @classmethod
+    def validate_pages_not_empty(cls, v: List[Page]) -> List[Page]:
+        """Ensure section has at least one page."""
+        if not v:
+            raise ValueError("Section must contain at least one page")
+        return v
+
+    @computed_field
+    @property
+    def page_range(self) -> Tuple[int, int]:
+        """Get the start and end page numbers for this section."""
+        if not self.pages:
+            return 0, 0
+        return self.pages[0].number, self.pages[-1].number
+
+    @computed_field
+    @property
+    def tokens(self) -> int:
+        """Total number of tokens in this section."""
+        return sum(p.tokens for p in self.pages)
+
+    @property
+    def content(self) -> str:
+        """Get section content with page delimiters."""
+        return "\n\n---\n\n".join(p.content for p in self.pages)
+
+    def markdown(self) -> str:
+        """Get section content as single markdown string."""
+        return "\n\n".join(p.content for p in self.pages)
+
+    def preview(self) -> None:
+        """
+        Preview the full section content.
+
+        Renders as Markdown in Jupyter/IPython, plain text in console.
+        """
+        content = self.markdown()
+
+        if IPYTHON_AVAILABLE:
+            display(IPythonMarkdown(content))
+        else:
+            header = f"{self.item}: {self.item_title}"
+            print(f"=== {header} ({self.tokens} tokens, pages {self.page_range[0]}-{self.page_range[1]}) ===")
+            print(content)
+
+    def __str__(self) -> str:
+        return self.markdown()
+
+    def __repr__(self) -> str:
+        page_range = self.page_range
+        return (
+            f"Section(item='{self.item}', title='{self.item_title}', "
+            f"pages={page_range[0]}-{page_range[1]}, tokens={self.tokens})"
+        )
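
To show how these models compose, here is a minimal hand-built sketch; the ID, page number, and text are invented for illustration, and the token counts come from the _count_tokens heuristic above:

from sec2md.models import Element, Page, Section

elem = Element(id="e1", content="Apple designs, manufactures and markets smartphones.",
               kind="paragraph", page_start=3, page_end=3)
page = Page(number=3, content=elem.content, elements=[elem])
section = Section(part="PART I", item="ITEM 1", item_title="Business", pages=[page])

print(elem.tokens, page.tokens)  # token estimates via _count_tokens
print(section.page_range)        # (3, 3)
print(repr(section))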