sec2md 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sec2md might be problematic. Click here for more details.
- sec2md/__init__.py +36 -0
- sec2md/absolute_table_parser.py +622 -0
- sec2md/chunker/__init__.py +0 -0
- sec2md/chunker/markdown_blocks.py +135 -0
- sec2md/chunker/markdown_chunk.py +133 -0
- sec2md/chunker/markdown_chunker.py +270 -0
- sec2md/chunking.py +179 -0
- sec2md/core.py +93 -0
- sec2md/models.py +400 -0
- sec2md/parser.py +1217 -0
- sec2md/section_extractor.py +623 -0
- sec2md/sections.py +84 -0
- sec2md/table_parser.py +386 -0
- sec2md/utils.py +109 -0
- sec2md-0.1.5.dist-info/METADATA +216 -0
- sec2md-0.1.5.dist-info/RECORD +19 -0
- sec2md-0.1.5.dist-info/WHEEL +5 -0
- sec2md-0.1.5.dist-info/licenses/LICENSE +21 -0
- sec2md-0.1.5.dist-info/top_level.txt +1 -0
sec2md/chunking.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Chunking utilities for page-aware markdown splitting."""
|
|
2
|
+
|
|
3
|
+
from typing import List, Optional
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from sec2md.models import Page, Section, TextBlock
|
|
6
|
+
from sec2md.chunker.markdown_chunker import MarkdownChunker
|
|
7
|
+
from sec2md.chunker.markdown_chunk import MarkdownChunk
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def chunk_pages(
    pages: List[Page],
    chunk_size: int = 512,
    chunk_overlap: int = 128,
    header: Optional[str] = None
) -> List[MarkdownChunk]:
    """
    Chunk pages into overlapping markdown chunks.

    Builds a MarkdownChunker with the given size/overlap and delegates to
    its ``split`` method.

    Args:
        pages: List of Page objects (with optional elements)
        chunk_size: Target chunk size in tokens (estimated as chars/4)
        chunk_overlap: Overlap between chunks in tokens
        header: Optional header to prepend to each chunk's embedding_text

    Returns:
        List of MarkdownChunk objects with page tracking and elements

    Example:
        >>> # convert_to_markdown() only accepts user_agent/return_pages as
        >>> # keyword arguments, so no other keywords are passed here.
        >>> pages = sec2md.convert_to_markdown(html, return_pages=True)
        >>> chunks = sec2md.chunk_pages(pages, chunk_size=512)
        >>> for chunk in chunks:
        ...     print(f"Page {chunk.page}: {chunk.content[:100]}...")
        ...     print(f"Elements: {chunk.elements}")
    """
    chunker = MarkdownChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return chunker.split(pages=pages, header=header)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def chunk_section(
    section: Section,
    chunk_size: int = 512,
    chunk_overlap: int = 128,
    header: Optional[str] = None
) -> List[MarkdownChunk]:
    """
    Split one filing section into overlapping markdown chunks.

    Thin convenience wrapper: forwards the section's pages to chunk_pages().

    Args:
        section: Section object from extract_sections()
        chunk_size: Target chunk size in tokens (estimated as chars/4)
        chunk_overlap: Overlap between chunks in tokens
        header: Optional header to prepend to each chunk's embedding_text

    Returns:
        List of MarkdownChunk objects

    Example:
        >>> sections = sec2md.extract_sections(pages, filing_type="10-K")
        >>> risk = sec2md.get_section(sections, Item10K.RISK_FACTORS)
        >>> chunks = sec2md.chunk_section(risk, chunk_size=512)
    """
    section_pages = section.pages
    return chunk_pages(section_pages, chunk_size, chunk_overlap, header)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def merge_text_blocks(pages: List[Page]) -> List[TextBlock]:
    """
    Merge multi-page TextBlocks into single TextBlock objects.

    When a financial note (e.g., Debt Disclosure) spans multiple pages,
    this merges all elements and page references into one TextBlock.
    Fragments are grouped by ``TextBlock.name``; the output order is the
    order in which each name is first encountered.

    Args:
        pages: List of Page objects with text_blocks populated. Pages whose
            ``text_blocks`` is None or empty are skipped.

    Returns:
        List of merged TextBlock objects with page metadata:
        - page_start: First (lowest-numbered) page the note appears on
        - page_end: Last (highest-numbered) page the note appears on
        - source_pages: All pages the note spans, sorted ascending
        - elements: All elements from all pages, in encounter order
        - title: The most recent non-None title seen for that name

    Example:
        >>> pages = parser.get_pages(include_elements=True)
        >>> merged = merge_text_blocks(pages)
        >>> for tb in merged:
        ...     print(f"{tb.title}: pages {tb.page_start}-{tb.page_end}")
        Debt Disclosure: pages 45-46
        Segment Reporting: pages 49-50
    """
    # Per-name accumulator. A plain dict keeps first-seen order.
    grouped: dict = {}

    for page in pages:
        for fragment in page.text_blocks or ():
            entry = grouped.get(fragment.name)
            if entry is None:
                entry = grouped[fragment.name] = {
                    "title": None,
                    "elements": [],
                    "pages": set(),
                }
            # A fragment without a title must not erase a title that an
            # earlier fragment of the same note already supplied.
            if fragment.title is not None:
                entry["title"] = fragment.title
            entry["elements"].extend(fragment.elements)
            entry["pages"].add(page.number)

    merged = []
    for name, entry in grouped.items():
        # Every entry has at least one page, so the bounds come straight
        # from the sorted page list (no inf / -1 sentinels required).
        source_pages = sorted(entry["pages"])
        merged.append(
            TextBlock(
                name=name,
                title=entry["title"],
                elements=entry["elements"],
                page_start=source_pages[0],
                page_end=source_pages[-1],
                source_pages=source_pages,
            )
        )

    return merged
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def chunk_text_block(
    text_block: TextBlock,
    chunk_size: int = 512,
    chunk_overlap: int = 128,
    include_title_as_header: bool = True
) -> List[MarkdownChunk]:
    """
    Chunk a single TextBlock (financial note).

    The block's elements are regrouped by their ``page_start``, one
    synthetic Page is built per page number (ascending), and those pages
    are handed to a MarkdownChunker.

    Args:
        text_block: TextBlock object (possibly spanning multiple pages)
        chunk_size: Target chunk size in tokens (estimated as chars/4)
        chunk_overlap: Overlap between chunks in tokens
        include_title_as_header: Prepend note title to chunks for embedding
            (only applied when the block actually has a non-empty title)

    Returns:
        List of MarkdownChunk objects with elements preserved

    Example:
        >>> merged = merge_text_blocks(pages)
        >>> debt_note = [tb for tb in merged if tb.title and "Debt" in tb.title][0]
        >>> chunks = chunk_text_block(debt_note, chunk_size=512)
        >>> print(f"Chunked {debt_note.title} into {len(chunks)} chunks")
        >>> print(f"Note spans pages {debt_note.page_start}-{debt_note.page_end}")
    """
    # Bucket elements by the page they start on.
    per_page = {}
    for element in text_block.elements:
        per_page.setdefault(element.page_start, []).append(element)

    # One Page per real page number, carrying only that page's elements
    # and their content joined by blank lines.
    pages = [
        Page(
            number=page_num,
            content="\n\n".join(item.content for item in per_page[page_num]),
            elements=per_page[page_num],
        )
        for page_num in sorted(per_page)
    ]

    header = None
    if include_title_as_header and text_block.title:
        header = f"Note: {text_block.title}"

    splitter = MarkdownChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split(pages=pages, header=header)
|
sec2md/core.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Core conversion functionality."""
|
|
2
|
+
|
|
3
|
+
from typing import List, Literal, overload
|
|
4
|
+
from sec2md.utils import is_url, fetch
|
|
5
|
+
from sec2md.parser import Parser
|
|
6
|
+
from sec2md.models import Page
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@overload
def convert_to_markdown(
    source: str | bytes,
    *,
    user_agent: str | None = None,
    return_pages: Literal[False] = False,
) -> str: ...


@overload
def convert_to_markdown(
    source: str | bytes,
    *,
    user_agent: str | None = None,
    return_pages: Literal[True],
) -> List[Page]: ...


@overload
def convert_to_markdown(
    source: str | bytes,
    *,
    user_agent: str | None = None,
    return_pages: bool,
) -> str | List[Page]: ...


def convert_to_markdown(
    source: str | bytes,
    *,
    user_agent: str | None = None,
    return_pages: bool = False,
) -> str | List[Page]:
    """
    Convert SEC filing HTML to Markdown.

    The overloads above are keyed on ``Literal`` values of ``return_pages``
    so type checkers can pick ``str`` vs ``List[Page]``; a plain ``bool``
    falls back to the union.

    Args:
        source: URL or HTML string/bytes. Bytes are decoded as UTF-8 with
            undecodable bytes dropped.
        user_agent: User agent for EDGAR requests (required for sec.gov URLs)
        return_pages: If True, returns List[Page] instead of markdown string

    Returns:
        Markdown string (default) or List[Page] if return_pages=True

    Raises:
        ValueError: If source appears to be PDF content (starts with ``%PDF``)

    Examples:
        >>> # From URL - get markdown
        >>> md = convert_to_markdown(
        ...     "https://www.sec.gov/Archives/edgar/data/.../10k.htm",
        ...     user_agent="Lucas Astorian <lucas@intellifin.ai>"
        ... )

        >>> # Get pages for section extraction
        >>> pages = convert_to_markdown(filing.html(), return_pages=True)

        >>> # With edgartools
        >>> from edgar import Company, set_identity
        >>> set_identity("Lucas Astorian <lucas@intellifin.ai>")
        >>> company = Company('AAPL')
        >>> filing = company.get_filings(form="10-K").latest()
        >>> md = convert_to_markdown(filing.html())
    """
    pdf_error = (
        "PDF content detected. This library only supports HTML input. "
        "Please extract HTML from the filing first."
    )

    # Handle bytes input: reject raw PDF bytes, otherwise decode to text.
    if isinstance(source, bytes):
        if source.startswith(b'%PDF'):
            raise ValueError(pdf_error)
        source = source.decode('utf-8', errors='ignore')

    # Check for PDF in string form (also catches decoded bytes that had
    # leading whitespace). Only leading whitespace matters for the prefix
    # test, so lstrip() avoids scanning the tail of a large document.
    if isinstance(source, str) and source.lstrip().startswith('%PDF'):
        raise ValueError(pdf_error)

    # Fetch from URL if needed; otherwise treat the input as HTML.
    html = fetch(source, user_agent=user_agent) if is_url(source) else source

    # Parse and convert
    parser = Parser(html)
    if return_pages:
        return parser.get_pages()
    return parser.markdown()
|
sec2md/models.py
ADDED
|
@@ -0,0 +1,400 @@
|
|
|
1
|
+
"""Data models for SEC filing parsing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from typing import List, Optional, Literal, Tuple
|
|
7
|
+
from pydantic import BaseModel, Field, field_validator, computed_field
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
import tiktoken
|
|
11
|
+
TIKTOKEN_AVAILABLE = True
|
|
12
|
+
except ImportError:
|
|
13
|
+
TIKTOKEN_AVAILABLE = False
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
from IPython.display import display, Markdown as IPythonMarkdown
|
|
17
|
+
IPYTHON_AVAILABLE = True
|
|
18
|
+
except ImportError:
|
|
19
|
+
IPYTHON_AVAILABLE = False
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _count_tokens(text: str) -> int:
    """Return the token count of *text*.

    Uses tiktoken's ``cl100k_base`` encoding when tiktoken was importable;
    otherwise falls back to a ``len(text) // 4`` estimate, floored at 1.
    """
    if not TIKTOKEN_AVAILABLE:
        return max(1, len(text) // 4)
    return len(tiktoken.get_encoding("cl100k_base").encode(text))
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Type alias for filing types
|
|
32
|
+
FilingType = Literal["10-K", "10-Q"]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Item10K(str, Enum):
    """10-K Filing Items - human readable names mapped to item numbers.

    Each member's value is the bare item number as a string (e.g. ``"1A"``).
    Because the class mixes in ``str``, members compare equal to that string.
    """

    # Part I
    BUSINESS = "1"
    RISK_FACTORS = "1A"
    UNRESOLVED_STAFF_COMMENTS = "1B"
    CYBERSECURITY = "1C"
    PROPERTIES = "2"
    LEGAL_PROCEEDINGS = "3"
    MINE_SAFETY = "4"

    # Part II
    MARKET_FOR_STOCK = "5"
    SELECTED_FINANCIAL_DATA = "6"  # Removed in recent years
    MD_AND_A = "7"
    MARKET_RISK = "7A"
    FINANCIAL_STATEMENTS = "8"
    CHANGES_IN_ACCOUNTING = "9"
    CONTROLS_AND_PROCEDURES = "9A"
    OTHER_INFORMATION = "9B"
    # NOTE: on Form 10-K, Item 9C is "Disclosure Regarding Foreign
    # Jurisdictions that Prevent Inspections"; cybersecurity is Item 1C.
    # The original member name is kept as the canonical member for
    # backward compatibility, and a correctly named alias is provided.
    CYBERSECURITY_DISCLOSURES = "9C"
    FOREIGN_JURISDICTIONS = "9C"  # alias of CYBERSECURITY_DISCLOSURES

    # Part III
    DIRECTORS_AND_OFFICERS = "10"
    EXECUTIVE_COMPENSATION = "11"
    SECURITY_OWNERSHIP = "12"
    CERTAIN_RELATIONSHIPS = "13"
    PRINCIPAL_ACCOUNTANT = "14"

    # Part IV
    EXHIBITS = "15"
    FORM_10K_SUMMARY = "16"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class Item10Q(str, Enum):
    """10-Q Filing Items - human readable names with part disambiguation.

    Item numbers repeat across the two parts of a 10-Q, so each value is
    ``"<item>.P<part>"`` (e.g. ``"1A.P2"`` is Item 1A of Part II).
    """

    # ---- Part I ----
    FINANCIAL_STATEMENTS_P1 = "1.P1"
    MD_AND_A_P1 = "2.P1"
    MARKET_RISK_P1 = "3.P1"
    CONTROLS_AND_PROCEDURES_P1 = "4.P1"

    # ---- Part II ----
    LEGAL_PROCEEDINGS_P2 = "1.P2"
    RISK_FACTORS_P2 = "1A.P2"
    UNREGISTERED_SALES_P2 = "2.P2"
    DEFAULTS_P2 = "3.P2"
    MINE_SAFETY_P2 = "4.P2"
    OTHER_INFORMATION_P2 = "5.P2"
    EXHIBITS_P2 = "6.P2"
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class Item8K(str, Enum):
    """8-K Filing Items - event-driven disclosure items.

    Values are the dotted item numbers (``"<section>.<nn>"``) as strings.
    """

    # Section 1: Registrant's Business and Operations
    MATERIAL_AGREEMENT = "1.01"
    TERMINATION_OF_AGREEMENT = "1.02"
    BANKRUPTCY = "1.03"
    MINE_SAFETY = "1.04"
    CYBERSECURITY_INCIDENT = "1.05"

    # Section 2: Financial Information
    ACQUISITION_DISPOSITION = "2.01"
    RESULTS_OF_OPERATIONS = "2.02"
    DIRECT_FINANCIAL_OBLIGATION = "2.03"
    TRIGGERING_EVENTS = "2.04"
    EXIT_DISPOSAL_COSTS = "2.05"
    MATERIAL_IMPAIRMENTS = "2.06"

    # Section 3: Securities and Trading Markets
    DELISTING_NOTICE = "3.01"
    UNREGISTERED_SALES = "3.02"
    SECURITY_RIGHTS_MODIFICATION = "3.03"

    # Section 4: Matters Related to Accountants and Financial Statements
    ACCOUNTANT_CHANGE = "4.01"
    NON_RELIANCE = "4.02"

    # Section 5: Corporate Governance and Management
    CONTROL_CHANGE = "5.01"
    DIRECTOR_OFFICER_CHANGE = "5.02"
    AMENDMENTS_TO_ARTICLES = "5.03"
    TRADING_SUSPENSION = "5.04"
    CODE_OF_ETHICS = "5.05"
    SHELL_COMPANY_STATUS = "5.06"
    SHAREHOLDER_VOTE = "5.07"
    SHAREHOLDER_NOMINATIONS = "5.08"

    # Section 6: Asset-Backed Securities
    ABS_INFORMATIONAL = "6.01"
    SERVICER_TRUSTEE_CHANGE = "6.02"
    CREDIT_ENHANCEMENT_CHANGE = "6.03"
    DISTRIBUTION_FAILURE = "6.04"
    SECURITIES_ACT_UPDATING = "6.05"
    STATIC_POOL = "6.06"

    # Section 7: Regulation FD
    REGULATION_FD = "7.01"

    # Section 8: Other Events
    OTHER_EVENTS = "8.01"

    # Section 9: Financial Statements and Exhibits
    FINANCIAL_STATEMENTS_EXHIBITS = "9.01"
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# Internal mappings from enum to (part, item) tuples
|
|
145
|
+
# Each 10-K item maps to (part heading, "ITEM <value>"). The item label is
# derived from the enum value; the part is given by the grouping below.
# Groups and members are listed in filing order so the dict's insertion
# order matches the hand-written table this replaces.
ITEM_10K_MAPPING: dict[Item10K, Tuple[str, str]] = {
    member: (part, f"ITEM {member.value}")
    for part, members in (
        (
            "PART I",
            (
                Item10K.BUSINESS,
                Item10K.RISK_FACTORS,
                Item10K.UNRESOLVED_STAFF_COMMENTS,
                Item10K.CYBERSECURITY,
                Item10K.PROPERTIES,
                Item10K.LEGAL_PROCEEDINGS,
                Item10K.MINE_SAFETY,
            ),
        ),
        (
            "PART II",
            (
                Item10K.MARKET_FOR_STOCK,
                Item10K.SELECTED_FINANCIAL_DATA,
                Item10K.MD_AND_A,
                Item10K.MARKET_RISK,
                Item10K.FINANCIAL_STATEMENTS,
                Item10K.CHANGES_IN_ACCOUNTING,
                Item10K.CONTROLS_AND_PROCEDURES,
                Item10K.OTHER_INFORMATION,
                Item10K.CYBERSECURITY_DISCLOSURES,
            ),
        ),
        (
            "PART III",
            (
                Item10K.DIRECTORS_AND_OFFICERS,
                Item10K.EXECUTIVE_COMPENSATION,
                Item10K.SECURITY_OWNERSHIP,
                Item10K.CERTAIN_RELATIONSHIPS,
                Item10K.PRINCIPAL_ACCOUNTANT,
            ),
        ),
        (
            "PART IV",
            (
                Item10K.EXHIBITS,
                Item10K.FORM_10K_SUMMARY,
            ),
        ),
    )
    for member in members
}
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
# Each 10-Q item maps to (part heading, "ITEM <number>"). The item number
# is the portion of the enum value before the ".P<part>" suffix; the part
# heading comes from the grouping below, listed in filing order.
ITEM_10Q_MAPPING: dict[Item10Q, Tuple[str, str]] = {
    member: (part, "ITEM " + member.value.split(".")[0])
    for part, members in (
        (
            "PART I",
            (
                Item10Q.FINANCIAL_STATEMENTS_P1,
                Item10Q.MD_AND_A_P1,
                Item10Q.MARKET_RISK_P1,
                Item10Q.CONTROLS_AND_PROCEDURES_P1,
            ),
        ),
        (
            "PART II",
            (
                Item10Q.LEGAL_PROCEEDINGS_P2,
                Item10Q.RISK_FACTORS_P2,
                Item10Q.UNREGISTERED_SALES_P2,
                Item10Q.DEFAULTS_P2,
                Item10Q.MINE_SAFETY_P2,
                Item10Q.OTHER_INFORMATION_P2,
                Item10Q.EXHIBITS_P2,
            ),
        ),
    )
    for member in members
}
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# 8-K items don't have PART divisions
|
|
198
|
+
# Lookup table: 8-K item number -> item title, grouped by 8-K section.
ITEM_8K_TITLES: dict[str, str] = {
    # Section 1
    "1.01": "Entry into a Material Definitive Agreement",
    "1.02": "Termination of a Material Definitive Agreement",
    "1.03": "Bankruptcy or Receivership",
    "1.04": "Mine Safety – Reporting of Shutdowns and Patterns of Violations",
    "1.05": "Material Cybersecurity Incidents",
    # Section 2
    "2.01": "Completion of Acquisition or Disposition of Assets",
    "2.02": "Results of Operations and Financial Condition",
    "2.03": "Creation of a Direct Financial Obligation or an Obligation under an Off-Balance Sheet Arrangement of a Registrant",
    "2.04": "Triggering Events That Accelerate or Increase a Direct Financial Obligation or an Obligation under an Off-Balance Sheet Arrangement",
    "2.05": "Costs Associated with Exit or Disposal Activities",
    "2.06": "Material Impairments",
    # Section 3
    "3.01": "Notice of Delisting or Failure to Satisfy a Continued Listing Rule or Standard; Transfer of Listing",
    "3.02": "Unregistered Sales of Equity Securities",
    "3.03": "Material Modification to Rights of Security Holders",
    # Section 4
    "4.01": "Changes in Registrant's Certifying Accountant",
    "4.02": "Non-Reliance on Previously Issued Financial Statements or a Related Audit Report or Completed Interim Review",
    # Section 5
    "5.01": "Changes in Control of Registrant",
    "5.02": "Departure of Directors or Certain Officers; Election of Directors; Appointment of Certain Officers; Compensatory Arrangements of Certain Officers",
    "5.03": "Amendments to Articles of Incorporation or Bylaws; Change in Fiscal Year",
    "5.04": "Temporary Suspension of Trading Under Registrant's Employee Benefit Plans",
    "5.05": "Amendments to the Registrant's Code of Ethics, or Waiver of a Provision of the Code of Ethics",
    "5.06": "Change in Shell Company Status",
    "5.07": "Submission of Matters to a Vote of Security Holders",
    "5.08": "Shareholder Director Nominations",
    # Section 6
    "6.01": "ABS Informational and Computational Material",
    "6.02": "Change of Servicer or Trustee",
    "6.03": "Change in Credit Enhancement or Other External Support",
    "6.04": "Failure to Make a Required Distribution",
    "6.05": "Securities Act Updating Disclosure",
    "6.06": "Static Pool",
    # Sections 7-9
    "7.01": "Regulation FD Disclosure",
    "8.01": "Other Events",
    "9.01": "Financial Statements and Exhibits",
}
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
class Exhibit(BaseModel):
    """One row of an 8-K exhibit list: exhibit number plus description."""

    # Instances stay mutable after construction.
    model_config = {"frozen": False}

    exhibit_no: str = Field(
        ...,
        description="Exhibit number (e.g., '99.1', '104')",
    )
    description: str = Field(
        ...,
        description="Exhibit description",
    )
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
class TextBlock(BaseModel):
    """XBRL TextBlock (e.g., financial statement note).

    The page fields are optional and default to None; per the field comment
    in this module they are filled in by merge_text_blocks() for notes that
    span several pages.
    """

    name: str = Field(..., description="XBRL tag name (e.g., 'us-gaap:DebtDisclosureTextBlock')")
    title: Optional[str] = Field(None, description="Human-readable title (e.g., 'Note 9 – Debt')")
    elements: List['Element'] = Field(default_factory=list, description="Element objects in this TextBlock")

    # Optional: Set by merge_text_blocks() for multi-page notes
    page_start: Optional[int] = Field(None, description="First page this TextBlock appears on")
    page_end: Optional[int] = Field(None, description="Last page this TextBlock appears on")
    source_pages: Optional[List[int]] = Field(None, description="All pages this TextBlock spans")

    model_config = {"frozen": False, "arbitrary_types_allowed": True}

    @computed_field
    @property
    def element_ids(self) -> List[str]:
        """Get list of element IDs, in element order."""
        return [e.id for e in self.elements]

    def __repr__(self) -> str:
        # Test against None rather than truthiness so a legitimate start
        # page of 0 is still reported.
        pages_info = ""
        if self.page_start is not None:
            pages_info = f", pages={self.page_start}-{self.page_end}"
        return f"TextBlock(name='{self.name}', title='{self.title}', elements={len(self.elements)}{pages_info})"
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
class Element(BaseModel):
    """A citable unit of filing content (paragraph, table, heading, ...)."""

    model_config = {"frozen": False}

    id: str = Field(..., description="Unique element ID for citation")
    content: str = Field(..., description="Element text content")
    kind: str = Field(..., description="Element type (e.g., 'paragraph', 'table', 'heading')")
    page_start: int = Field(..., description="First page this element appears on")
    page_end: int = Field(..., description="Last page this element appears on")

    @computed_field
    @property
    def char_count(self) -> int:
        """Length of ``content`` in characters."""
        text = self.content
        return len(text)

    @computed_field
    @property
    def tokens(self) -> int:
        """Token count of ``content`` as reported by _count_tokens()."""
        text = self.content
        return _count_tokens(text)

    def __repr__(self) -> str:
        # Single page -> "p12"; a span -> "p12-13".
        if self.page_start == self.page_end:
            pages = f"p{self.page_start}"
        else:
            pages = f"p{self.page_start}-{self.page_end}"
        # First 80 characters on one line; the ellipsis is always appended.
        preview = self.content[:80].replace('\n', ' ')
        return (
            f"Element(id='{self.id}', kind='{self.kind}', {pages}, "
            f"chars={len(self.content)}, preview='{preview}...')"
        )
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
class Page(BaseModel):
    """Represents a single page of markdown content."""

    number: int = Field(..., description="Page number in the filing")
    content: str = Field(..., description="Markdown content of the page")
    elements: Optional[List[Element]] = Field(None, description="Citable elements on this page")
    text_blocks: Optional[List[TextBlock]] = Field(None, description="XBRL TextBlocks on this page")

    model_config = {"frozen": False, "arbitrary_types_allowed": True}

    @computed_field
    @property
    def tokens(self) -> int:
        """Total number of tokens on this page (via _count_tokens)."""
        return _count_tokens(self.content)

    def preview(self) -> None:
        """
        Preview the full page content.

        Renders as Markdown when running inside an IPython/Jupyter session,
        plain text otherwise.
        """
        # IPython being installed is not enough: outside an active IPython
        # shell, display() just prints the Markdown object's repr rather
        # than the content. Only use the rich path when a shell is running.
        in_session = False
        if IPYTHON_AVAILABLE:
            from IPython import get_ipython
            in_session = get_ipython() is not None

        if in_session:
            display(IPythonMarkdown(self.content))
        else:
            print(f"=== Page {self.number} ({self.tokens} tokens) ===")
            print(self.content)

    def __str__(self) -> str:
        return self.content

    def __repr__(self) -> str:
        # First 100 characters of the content, flattened onto one line.
        preview = self.content[:100].replace('\n', ' ')
        elem_info = f", elements={len(self.elements)}" if self.elements else ""
        tb_info = f", text_blocks={len(self.text_blocks)}" if self.text_blocks else ""
        return f"Page(number={self.number}, tokens={self.tokens}{elem_info}{tb_info}, preview='{preview}...')"
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
class Section(BaseModel):
    """Represents a filing section (e.g., ITEM 1A - Risk Factors)."""

    part: Optional[str] = Field(None, description="Part name (e.g., 'PART I', None for 8-K)")
    item: Optional[str] = Field(None, description="Item identifier (e.g., 'ITEM 1A', 'ITEM 2.02')")
    item_title: Optional[str] = Field(None, description="Item title")
    pages: List[Page] = Field(default_factory=list, description="Pages in this section")
    exhibits: Optional[List[Exhibit]] = Field(None, description="8-K exhibits (Item 9.01 only)")

    model_config = {"frozen": False, "arbitrary_types_allowed": True}

    @field_validator('pages')
    @classmethod
    def validate_pages_not_empty(cls, v: List[Page]) -> List[Page]:
        """Ensure section has at least one page.

        Raises:
            ValueError: If the supplied page list is empty.
        """
        if not v:
            raise ValueError("Section must contain at least one page")
        return v

    @computed_field
    @property
    def page_range(self) -> Tuple[int, int]:
        """Get the start and end page numbers for this section.

        Taken from the first and last entries of ``pages``; returns
        ``(0, 0)`` if there are no pages.
        """
        if not self.pages:
            return 0, 0
        return self.pages[0].number, self.pages[-1].number

    @computed_field
    @property
    def tokens(self) -> int:
        """Total number of tokens in this section (sum over pages)."""
        return sum(p.tokens for p in self.pages)

    @property
    def content(self) -> str:
        """Get section content with page delimiters (``---`` between pages)."""
        return "\n\n---\n\n".join(p.content for p in self.pages)

    def markdown(self) -> str:
        """Get section content as single markdown string (blank line between pages)."""
        return "\n\n".join(p.content for p in self.pages)

    def preview(self) -> None:
        """
        Preview the full section content.

        Renders as Markdown when running inside an IPython/Jupyter session,
        plain text otherwise.
        """
        content = self.markdown()

        # IPython being installed is not enough: outside an active IPython
        # shell, display() just prints the Markdown object's repr rather
        # than the content. Only use the rich path when a shell is running.
        in_session = False
        if IPYTHON_AVAILABLE:
            from IPython import get_ipython
            in_session = get_ipython() is not None

        if in_session:
            display(IPythonMarkdown(content))
        else:
            header = f"{self.item}: {self.item_title}"
            print(f"=== {header} ({self.tokens} tokens, pages {self.page_range[0]}-{self.page_range[1]}) ===")
            print(content)

    def __str__(self) -> str:
        return self.markdown()

    def __repr__(self) -> str:
        page_range = self.page_range
        return (
            f"Section(item='{self.item}', title='{self.item_title}', "
            f"pages={page_range[0]}-{page_range[1]}, tokens={self.tokens})"
        )
|