sec2md 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sec2md might be problematic. Click here for more details.
- sec2md/__init__.py +24 -0
- sec2md/absolute_table_parser.py +622 -0
- sec2md/chunker/__init__.py +0 -0
- sec2md/chunker/markdown_blocks.py +116 -0
- sec2md/chunker/markdown_chunk.py +76 -0
- sec2md/chunker/markdown_chunker.py +234 -0
- sec2md/chunking.py +66 -0
- sec2md/core.py +93 -0
- sec2md/models.py +153 -0
- sec2md/parser.py +586 -0
- sec2md/section_extractor.py +316 -0
- sec2md/sections.py +104 -0
- sec2md/table_parser.py +386 -0
- sec2md/utils.py +109 -0
- sec2md-0.1.0.dist-info/METADATA +217 -0
- sec2md-0.1.0.dist-info/RECORD +19 -0
- sec2md-0.1.0.dist-info/WHEEL +5 -0
- sec2md-0.1.0.dist-info/licenses/LICENSE +21 -0
- sec2md-0.1.0.dist-info/top_level.txt +1 -0
sec2md/models.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""Data models for SEC filing parsing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from typing import List, Optional, Literal, Tuple
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Closed set of filing form names accepted wherever a FilingType is annotated.
FilingType = Literal["10-K", "10-Q"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Item10K(str, Enum):
    """Readable names for the items of a 10-K filing.

    Each member's value is the bare item number as a string (``"1A"``,
    ``"7"``, ...). Because the enum mixes in ``str``, a member compares equal
    to its value, e.g. ``Item10K.RISK_FACTORS == "1A"``.
    """

    # Part I
    BUSINESS = "1"
    RISK_FACTORS = "1A"
    UNRESOLVED_STAFF_COMMENTS = "1B"
    CYBERSECURITY = "1C"
    PROPERTIES = "2"
    LEGAL_PROCEEDINGS = "3"
    MINE_SAFETY = "4"

    # Part II
    MARKET_FOR_STOCK = "5"
    SELECTED_FINANCIAL_DATA = "6"  # Removed in recent years
    MD_AND_A = "7"
    MARKET_RISK = "7A"
    FINANCIAL_STATEMENTS = "8"
    CHANGES_IN_ACCOUNTING = "9"
    CONTROLS_AND_PROCEDURES = "9A"
    OTHER_INFORMATION = "9B"
    # NOTE: Item 9C of Form 10-K is "Disclosure Regarding Foreign
    # Jurisdictions that Prevent Inspections"; cybersecurity is Item 1C
    # (CYBERSECURITY above). The historical name is kept as the canonical
    # member so that lookups by value, ``.name`` and iteration are unchanged.
    CYBERSECURITY_DISCLOSURES = "9C"
    # Correctly named alias: same value, therefore the very same member.
    FOREIGN_JURISDICTION_INSPECTIONS = "9C"

    # Part III
    DIRECTORS_AND_OFFICERS = "10"
    EXECUTIVE_COMPENSATION = "11"
    SECURITY_OWNERSHIP = "12"
    CERTAIN_RELATIONSHIPS = "13"
    PRINCIPAL_ACCOUNTANT = "14"

    # Part IV
    EXHIBITS = "15"
    FORM_10K_SUMMARY = "16"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class Item10Q(str, Enum):
    """Readable names for the items of a 10-Q filing.

    A 10-Q reuses item numbers across its two parts, so each value carries a
    part suffix to stay unique: ``"<item>.P1"`` for Part I and
    ``"<item>.P2"`` for Part II. Members also behave as plain strings
    because the enum mixes in ``str``.
    """

    # Part I
    FINANCIAL_STATEMENTS_P1 = "1.P1"
    MD_AND_A_P1 = "2.P1"
    MARKET_RISK_P1 = "3.P1"
    CONTROLS_AND_PROCEDURES_P1 = "4.P1"

    # Part II
    LEGAL_PROCEEDINGS_P2 = "1.P2"
    RISK_FACTORS_P2 = "1A.P2"
    UNREGISTERED_SALES_P2 = "2.P2"
    DEFAULTS_P2 = "3.P2"
    MINE_SAFETY_P2 = "4.P2"
    OTHER_INFORMATION_P2 = "5.P2"
    EXHIBITS_P2 = "6.P2"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# Internal mappings from enum to (part, item) tuples
|
|
69
|
+
# Lookup table: Item10K member -> (part heading, item heading).
# The item heading is always "ITEM " followed by the member's value, so the
# table is generated from the members grouped under their part heading.
# Insertion order follows the filing order (Part I through Part IV).
ITEM_10K_MAPPING: dict[Item10K, Tuple[str, str]] = {
    entry: (part_label, f"ITEM {entry.value}")
    for part_label, entries in (
        (
            "PART I",
            (
                Item10K.BUSINESS,
                Item10K.RISK_FACTORS,
                Item10K.UNRESOLVED_STAFF_COMMENTS,
                Item10K.CYBERSECURITY,
                Item10K.PROPERTIES,
                Item10K.LEGAL_PROCEEDINGS,
                Item10K.MINE_SAFETY,
            ),
        ),
        (
            "PART II",
            (
                Item10K.MARKET_FOR_STOCK,
                Item10K.SELECTED_FINANCIAL_DATA,
                Item10K.MD_AND_A,
                Item10K.MARKET_RISK,
                Item10K.FINANCIAL_STATEMENTS,
                Item10K.CHANGES_IN_ACCOUNTING,
                Item10K.CONTROLS_AND_PROCEDURES,
                Item10K.OTHER_INFORMATION,
                Item10K.CYBERSECURITY_DISCLOSURES,
            ),
        ),
        (
            "PART III",
            (
                Item10K.DIRECTORS_AND_OFFICERS,
                Item10K.EXECUTIVE_COMPENSATION,
                Item10K.SECURITY_OWNERSHIP,
                Item10K.CERTAIN_RELATIONSHIPS,
                Item10K.PRINCIPAL_ACCOUNTANT,
            ),
        ),
        (
            "PART IV",
            (
                Item10K.EXHIBITS,
                Item10K.FORM_10K_SUMMARY,
            ),
        ),
    )
    for entry in entries
}
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# Lookup table: Item10Q member -> (part heading, item heading).
# Member values look like "<item>.P1" / "<item>.P2"; the text before the dot
# is the item number used in the heading, and the part heading comes from the
# group the member is listed under. Insertion order follows the filing order.
ITEM_10Q_MAPPING: dict[Item10Q, Tuple[str, str]] = {
    entry: (part_label, "ITEM " + entry.value.partition(".")[0])
    for part_label, entries in (
        (
            "PART I",
            (
                Item10Q.FINANCIAL_STATEMENTS_P1,
                Item10Q.MD_AND_A_P1,
                Item10Q.MARKET_RISK_P1,
                Item10Q.CONTROLS_AND_PROCEDURES_P1,
            ),
        ),
        (
            "PART II",
            (
                Item10Q.LEGAL_PROCEEDINGS_P2,
                Item10Q.RISK_FACTORS_P2,
                Item10Q.UNREGISTERED_SALES_P2,
                Item10Q.DEFAULTS_P2,
                Item10Q.MINE_SAFETY_P2,
                Item10Q.OTHER_INFORMATION_P2,
                Item10Q.EXHIBITS_P2,
            ),
        ),
    )
    for entry in entries
}
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@dataclass
class Page:
    """One page of a filing, held as markdown text.

    Attributes:
        number: Page number of this page.
        content: Markdown text of the page.
    """

    number: int
    content: str

    def __str__(self) -> str:
        """Return the page's markdown content unchanged."""
        return self.content
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@dataclass
class Section:
    """A filing section (e.g., ITEM 1A - Risk Factors) and the pages it spans.

    Attributes:
        part: Part label of the section, or None.
        item: Item label of the section, or None.
        item_title: Title of the item, or None.
        pages: Pages that make up the section.
    """

    part: Optional[str]
    item: Optional[str]
    item_title: Optional[str]
    pages: List[Page]

    def markdown(self) -> str:
        """Return the content of all pages joined by blank lines.

        Pages are joined in list order; an empty section yields ``""``.
        """
        return "\n\n".join(p.content for p in self.pages)

    def __str__(self) -> str:
        """Return the same text as :meth:`markdown`."""
        return self.markdown()

    @property
    def page_range(self) -> Tuple[int, int]:
        """Return ``(lowest, highest)`` page number covered by this section.

        Returns ``(0, 0)`` when the section has no pages.
        """
        if not self.pages:
            return (0, 0)
        # Use min/max rather than first/last so the range stays correct
        # (start <= end) even if the pages were not appended in page order.
        numbers = [page.number for page in self.pages]
        return (min(numbers), max(numbers))
|