sec-engine 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sec_engine-0.1.0/.claude/settings.local.json +7 -0
- sec_engine-0.1.0/.gitignore +24 -0
- sec_engine-0.1.0/CLAUDE.md +32 -0
- sec_engine-0.1.0/Dockerfile +8 -0
- sec_engine-0.1.0/LICENSE +21 -0
- sec_engine-0.1.0/PKG-INFO +12 -0
- sec_engine-0.1.0/README.md +83 -0
- sec_engine-0.1.0/compose.yml +6 -0
- sec_engine-0.1.0/pyproject.toml +22 -0
- sec_engine-0.1.0/sec_engine/__init__.py +1 -0
- sec_engine-0.1.0/sec_engine/__main__.py +4 -0
- sec_engine-0.1.0/sec_engine/edgar.py +265 -0
- sec_engine-0.1.0/sec_engine/server.py +374 -0
- sec_engine-0.1.0/uv.lock +811 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
*.egg
|
|
6
|
+
dist/
|
|
7
|
+
build/
|
|
8
|
+
|
|
9
|
+
# Virtual environments
|
|
10
|
+
.venv/
|
|
11
|
+
venv/
|
|
12
|
+
|
|
13
|
+
# IDE
|
|
14
|
+
.idea/
|
|
15
|
+
.vscode/
|
|
16
|
+
*.swp
|
|
17
|
+
*.swo
|
|
18
|
+
|
|
19
|
+
# OS
|
|
20
|
+
.DS_Store
|
|
21
|
+
Thumbs.db
|
|
22
|
+
|
|
23
|
+
# Environment
|
|
24
|
+
.env
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
## Project
|
|
4
|
+
|
|
5
|
+
sec-engine is an MCP server that exposes SEC EDGAR financial data to LLMs. It runs over stdio and provides 6 tools for company lookup, filing navigation, XBRL metrics, and narrative text extraction.
|
|
6
|
+
|
|
7
|
+
## Structure
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
sec_engine/
|
|
11
|
+
server.py — FastMCP server, defines all 6 tools
|
|
12
|
+
edgar.py — SEC EDGAR API client (async httpx, rate-limited)
|
|
13
|
+
__main__.py — CLI entry point
|
|
14
|
+
__init__.py
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Build & Run
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
uv pip install -e . # install in dev mode
|
|
21
|
+
sec-engine # run the MCP server (stdio)
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Build system is hatchling. Dependencies: mcp, httpx, beautifulsoup4, lxml.
|
|
25
|
+
|
|
26
|
+
## Key Details
|
|
27
|
+
|
|
28
|
+
- Python >= 3.11 required
|
|
29
|
+
- The SEC API enforces a 10 req/sec rate limit; `EdgarClient` guards against it by capping itself at 10 concurrent requests
|
|
30
|
+
- SEC requires a descriptive User-Agent header (configurable via `SEC_EDGAR_USER_AGENT` env var)
|
|
31
|
+
- No tests yet
|
|
32
|
+
- No auth needed — all SEC APIs are public
|
sec_engine-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 taika-st
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sec-engine
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LLM-optimized SEC EDGAR MCP server
|
|
5
|
+
Author-email: taika-st <taika-st@users.noreply.github.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
10
|
+
Requires-Dist: httpx>=0.27.0
|
|
11
|
+
Requires-Dist: lxml>=5.0.0
|
|
12
|
+
Requires-Dist: mcp>=1.0.0
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# sec-engine
|
|
2
|
+
|
|
3
|
+
LLM-optimized SEC EDGAR MCP server. Provides six tools that wrap the SEC's public JSON APIs for financial data extraction and analysis — no authentication required.
|
|
4
|
+
|
|
5
|
+
## Tools
|
|
6
|
+
|
|
7
|
+
| Tool | Purpose |
|
|
8
|
+
|------|---------|
|
|
9
|
+
| `resolve` | Ticker/CIK to company identity (name, SIC, exchanges, fiscal year end) |
|
|
10
|
+
| `filings` | List SEC filings with form type and date filtering |
|
|
11
|
+
| `snapshot` | Latest key metrics (~20) + full XBRL concept index |
|
|
12
|
+
| `concept_history` | Time series for one XBRL metric across all filings |
|
|
13
|
+
| `cross_company` | Compare one metric across ALL filers for peer/sector screening |
|
|
14
|
+
| `filing_section` | Extract narrative text from 10-K/10-Q sections (MD&A, Risk Factors, etc.) |
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install sec-engine
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Or with [uv](https://docs.astral.sh/uv/):
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
uv pip install sec-engine
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
### Claude Desktop
|
|
31
|
+
|
|
32
|
+
Add to your Claude Desktop config (`claude_desktop_config.json`):
|
|
33
|
+
|
|
34
|
+
```json
|
|
35
|
+
{
|
|
36
|
+
"mcpServers": {
|
|
37
|
+
"sec-engine": {
|
|
38
|
+
"command": "sec-engine"
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Docker Compose
|
|
45
|
+
|
|
46
|
+
Edit `SEC_EDGAR_USER_AGENT` in `compose.yml` with your name and email, then:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
docker compose up --build
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Docker
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
docker build -t sec-engine .
|
|
56
|
+
docker run -i sec-engine
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Direct
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
sec-engine
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
The server communicates over stdio using the [Model Context Protocol](https://modelcontextprotocol.io/).
|
|
66
|
+
|
|
67
|
+
## Configuration
|
|
68
|
+
|
|
69
|
+
| Environment Variable | Default | Description |
|
|
70
|
+
|---------------------|---------|-------------|
|
|
71
|
+
| `SEC_EDGAR_USER_AGENT` | `SEC-Engine/0.1 (sec-engine-mcp)` | User-Agent string sent to SEC.gov (SEC requires a descriptive UA) |
|
|
72
|
+
|
|
73
|
+
## Requirements
|
|
74
|
+
|
|
75
|
+
- Python >= 3.11
|
|
76
|
+
|
|
77
|
+
## Disclaimer
|
|
78
|
+
|
|
79
|
+
This software retrieves publicly available data from the SEC EDGAR system. It is provided as-is for informational purposes only and does not constitute financial, legal, or investment advice. The authors are not responsible for how you use this data or any consequences arising from its use. You are solely responsible for complying with all applicable SEC policies, regulations, and terms of service.
|
|
80
|
+
|
|
81
|
+
## License
|
|
82
|
+
|
|
83
|
+
MIT
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "sec-engine"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "LLM-optimized SEC EDGAR MCP server"
|
|
5
|
+
license = "MIT"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "taika-st", email = "taika-st@users.noreply.github.com" },
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"mcp>=1.0.0",
|
|
12
|
+
"httpx>=0.27.0",
|
|
13
|
+
"beautifulsoup4>=4.12.0",
|
|
14
|
+
"lxml>=5.0.0",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.scripts]
|
|
18
|
+
sec-engine = "sec_engine.server:main"
|
|
19
|
+
|
|
20
|
+
[build-system]
|
|
21
|
+
requires = ["hatchling"]
|
|
22
|
+
build-backend = "hatchling.build"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""SEC Engine - LLM-optimized SEC EDGAR MCP server."""
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
"""SEC EDGAR public API client.
|
|
2
|
+
|
|
3
|
+
Thin wrapper around data.sec.gov JSON APIs. No auth required — just a User-Agent string.
|
|
4
|
+
SEC policy allows at most 10 req/sec; the client keeps at most 10 requests in flight at once.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import re
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
import httpx
|
|
14
|
+
|
|
15
|
+
# Base hosts: structured JSON APIs live on data.sec.gov, while the ticker
# file and the filing archives are served from www.sec.gov.
SEC_DATA = "https://data.sec.gov"
SEC_WWW = "https://www.sec.gov"
TICKERS_URL = SEC_WWW + "/files/company_tickers.json"

# us-gaap concepts reported by the snapshot tool. Order is preserved in the
# snapshot output, so keep related concepts next to each other.
KEY_CONCEPTS = [
    # Income statement
    "Revenues",
    "RevenueFromContractWithCustomerExcludingAssessedTax",
    "NetIncomeLoss",
    "OperatingIncomeLoss",
    "GrossProfit",
    "CostOfGoodsAndServicesSold",
    # Balance sheet
    "Assets",
    "AssetsCurrent",
    "Liabilities",
    "LiabilitiesCurrent",
    "StockholdersEquity",
    "CashAndCashEquivalentsAtCarryingValue",
    # Per-share data and share count
    "EarningsPerShareBasic",
    "EarningsPerShareDiluted",
    "CommonStockSharesOutstanding",
    # Debt
    "LongTermDebt",
    "LongTermDebtNoncurrent",
    # Cash flow and other
    "NetCashProvidedByOperatingActivities",
    "PaymentsOfDividends",
    "ResearchAndDevelopmentExpense",
]

# Friendly section names (lower-case) -> Item number, used by the
# filing_section tool. Several spellings may map to the same Item.
SECTION_ALIASES: dict[str, str] = {
    # Item 1 family
    "business": "1",
    "risk factors": "1A",
    "risk": "1A",
    "cybersecurity": "1C",
    # Items 2 and 3
    "properties": "2",
    "legal": "3",
    "legal proceedings": "3",
    # Item 7 family
    "mda": "7",
    "md&a": "7",
    "management discussion": "7",
    "management's discussion": "7",
    "market risk": "7A",
    "quantitative": "7A",
    # Items 8 and 9A
    "financial statements": "8",
    "controls": "9A",
    "controls and procedures": "9A",
}

# Matches an "ITEM n" / "Item n" header at the start of a line.
#   group(1): the item number with an optional A-C suffix (either case)
#   group(2): the header title, up to the end of that line
# Dots, colons, whitespace and one optional dash (hyphen, em or en dash)
# may separate the number from the title.
ITEM_HEADER = re.compile(
    r"^[\s]*(?:ITEM|Item)\s+(\d+[A-Ca-c]?)[\.\s:]+[-\u2014\u2013]?\s*(.+?)$",
    flags=re.MULTILINE,
)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class EdgarClient:
|
|
71
|
+
"""Async client for SEC EDGAR public APIs."""
|
|
72
|
+
|
|
73
|
+
def __init__(self, user_agent: str):
|
|
74
|
+
self.user_agent = user_agent
|
|
75
|
+
self._headers = {
|
|
76
|
+
"User-Agent": user_agent,
|
|
77
|
+
"Accept-Encoding": "gzip, deflate",
|
|
78
|
+
}
|
|
79
|
+
self._semaphore = asyncio.Semaphore(10)
|
|
80
|
+
self._ticker_map: dict[str, int] | None = None
|
|
81
|
+
self._http: httpx.AsyncClient | None = None
|
|
82
|
+
|
|
83
|
+
def _client(self) -> httpx.AsyncClient:
|
|
84
|
+
if self._http is None or self._http.is_closed:
|
|
85
|
+
self._http = httpx.AsyncClient(
|
|
86
|
+
headers=self._headers,
|
|
87
|
+
timeout=30.0,
|
|
88
|
+
follow_redirects=True,
|
|
89
|
+
)
|
|
90
|
+
return self._http
|
|
91
|
+
|
|
92
|
+
async def _get(self, url: str) -> httpx.Response:
|
|
93
|
+
async with self._semaphore:
|
|
94
|
+
r = await self._client().get(url)
|
|
95
|
+
r.raise_for_status()
|
|
96
|
+
return r
|
|
97
|
+
|
|
98
|
+
async def _json(self, url: str) -> dict:
|
|
99
|
+
return (await self._get(url)).json()
|
|
100
|
+
|
|
101
|
+
async def _text(self, url: str) -> str:
|
|
102
|
+
return (await self._get(url)).text
|
|
103
|
+
|
|
104
|
+
# ── Identity ──────────────────────────────────────────────────────
|
|
105
|
+
|
|
106
|
+
async def _load_tickers(self) -> dict[str, int]:
|
|
107
|
+
if self._ticker_map is None:
|
|
108
|
+
raw = await self._json(TICKERS_URL)
|
|
109
|
+
self._ticker_map = {
|
|
110
|
+
v["ticker"].upper(): int(v["cik_str"]) for v in raw.values()
|
|
111
|
+
}
|
|
112
|
+
return self._ticker_map
|
|
113
|
+
|
|
114
|
+
async def resolve_cik(self, identifier: str) -> int:
|
|
115
|
+
"""Ticker or CIK string -> integer CIK."""
|
|
116
|
+
clean = identifier.strip().upper()
|
|
117
|
+
try:
|
|
118
|
+
return int(clean)
|
|
119
|
+
except ValueError:
|
|
120
|
+
tmap = await self._load_tickers()
|
|
121
|
+
if clean in tmap:
|
|
122
|
+
return tmap[clean]
|
|
123
|
+
raise ValueError(f"Unknown ticker: {identifier}")
|
|
124
|
+
|
|
125
|
+
@staticmethod
|
|
126
|
+
def pad_cik(cik: int) -> str:
|
|
127
|
+
return str(cik).zfill(10)
|
|
128
|
+
|
|
129
|
+
# ── SEC Data APIs ─────────────────────────────────────────────────
|
|
130
|
+
|
|
131
|
+
async def submissions(self, cik: int) -> dict:
|
|
132
|
+
"""Filing history and company metadata."""
|
|
133
|
+
return await self._json(
|
|
134
|
+
f"{SEC_DATA}/submissions/CIK{self.pad_cik(cik)}.json"
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
async def company_facts(self, cik: int) -> dict:
|
|
138
|
+
"""All XBRL financial facts for a company."""
|
|
139
|
+
return await self._json(
|
|
140
|
+
f"{SEC_DATA}/api/xbrl/companyfacts/CIK{self.pad_cik(cik)}.json"
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
async def company_concept(
|
|
144
|
+
self, cik: int, concept: str, taxonomy: str = "us-gaap"
|
|
145
|
+
) -> dict:
|
|
146
|
+
"""Time series for one XBRL concept."""
|
|
147
|
+
return await self._json(
|
|
148
|
+
f"{SEC_DATA}/api/xbrl/companyconcept/"
|
|
149
|
+
f"CIK{self.pad_cik(cik)}/{taxonomy}/{concept}.json"
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
async def frames(
|
|
153
|
+
self,
|
|
154
|
+
concept: str,
|
|
155
|
+
period: str,
|
|
156
|
+
currency: str = "USD",
|
|
157
|
+
taxonomy: str = "us-gaap",
|
|
158
|
+
) -> dict:
|
|
159
|
+
"""One concept across all companies for a period."""
|
|
160
|
+
return await self._json(
|
|
161
|
+
f"{SEC_DATA}/api/xbrl/frames/{taxonomy}/{concept}/{currency}/{period}.json"
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
async def filing_document(
|
|
165
|
+
self, cik: int, accession: str, filename: str
|
|
166
|
+
) -> str:
|
|
167
|
+
"""Fetch a specific document from a filing archive."""
|
|
168
|
+
acc_clean = accession.replace("-", "")
|
|
169
|
+
url = f"{SEC_WWW}/Archives/edgar/data/{cik}/{acc_clean}/{filename}"
|
|
170
|
+
return await self._text(url)
|
|
171
|
+
|
|
172
|
+
# ── Helpers ────────────────────────────────────────────────────────
|
|
173
|
+
|
|
174
|
+
async def find_primary_doc(self, cik: int, accession: str) -> str | None:
|
|
175
|
+
"""Look up the primary document filename from submissions."""
|
|
176
|
+
subs = await self.submissions(cik)
|
|
177
|
+
recent = subs.get("filings", {}).get("recent", {})
|
|
178
|
+
accs = recent.get("accessionNumber", [])
|
|
179
|
+
docs = recent.get("primaryDocument", [])
|
|
180
|
+
for i, acc in enumerate(accs):
|
|
181
|
+
if acc == accession and i < len(docs):
|
|
182
|
+
return docs[i]
|
|
183
|
+
return None
|
|
184
|
+
|
|
185
|
+
def extract_snapshot(self, facts: dict) -> dict[str, Any]:
|
|
186
|
+
"""Extract key financial metrics from companyfacts."""
|
|
187
|
+
us_gaap = facts.get("facts", {}).get("us-gaap", {})
|
|
188
|
+
metrics: dict[str, Any] = {}
|
|
189
|
+
for concept in KEY_CONCEPTS:
|
|
190
|
+
if concept not in us_gaap:
|
|
191
|
+
continue
|
|
192
|
+
units = us_gaap[concept].get("units", {})
|
|
193
|
+
for unit_name, datapoints in units.items():
|
|
194
|
+
if not datapoints:
|
|
195
|
+
continue
|
|
196
|
+
latest = max(datapoints, key=lambda d: d.get("end", ""))
|
|
197
|
+
metrics[concept] = {
|
|
198
|
+
"value": latest["val"],
|
|
199
|
+
"end": latest.get("end"),
|
|
200
|
+
"fiscal_year": latest.get("fy"),
|
|
201
|
+
"period": latest.get("fp"),
|
|
202
|
+
"form": latest.get("form"),
|
|
203
|
+
"filed": latest.get("filed"),
|
|
204
|
+
"unit": unit_name,
|
|
205
|
+
"label": us_gaap[concept].get("label", concept),
|
|
206
|
+
}
|
|
207
|
+
break # first unit type only
|
|
208
|
+
return metrics
|
|
209
|
+
|
|
210
|
+
def extract_concept_index(self, facts: dict) -> dict[str, Any]:
|
|
211
|
+
"""Build a compact index of all available concepts."""
|
|
212
|
+
result: dict[str, Any] = {}
|
|
213
|
+
for taxonomy, concepts in facts.get("facts", {}).items():
|
|
214
|
+
names = sorted(concepts.keys())
|
|
215
|
+
result[taxonomy] = {
|
|
216
|
+
"count": len(names),
|
|
217
|
+
"concepts": names,
|
|
218
|
+
}
|
|
219
|
+
return result
|
|
220
|
+
|
|
221
|
+
def parse_filing_sections(self, html: str) -> dict[str, dict[str, str]]:
|
|
222
|
+
"""Parse 10-K/10-Q HTML into sections by Item number."""
|
|
223
|
+
import warnings
|
|
224
|
+
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
|
|
225
|
+
|
|
226
|
+
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
|
|
227
|
+
soup = BeautifulSoup(html, "lxml")
|
|
228
|
+
for tag in soup(["script", "style"]):
|
|
229
|
+
tag.decompose()
|
|
230
|
+
|
|
231
|
+
text = soup.get_text(separator="\n")
|
|
232
|
+
text = text.replace("\xa0", " ")
|
|
233
|
+
text = re.sub(r"\n{3,}", "\n\n", text)
|
|
234
|
+
|
|
235
|
+
matches = list(ITEM_HEADER.finditer(text))
|
|
236
|
+
if not matches:
|
|
237
|
+
return {}
|
|
238
|
+
|
|
239
|
+
sections: dict[str, dict[str, str]] = {}
|
|
240
|
+
for i, m in enumerate(matches):
|
|
241
|
+
item_num = m.group(1).upper()
|
|
242
|
+
title = m.group(2).strip()
|
|
243
|
+
start = m.end()
|
|
244
|
+
end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
|
|
245
|
+
content = text[start:end].strip()
|
|
246
|
+
|
|
247
|
+
# Skip ToC entries — real sections have substantial content
|
|
248
|
+
if len(content) < 500:
|
|
249
|
+
continue
|
|
250
|
+
|
|
251
|
+
# Keep longest version (handles ToC reference + actual content)
|
|
252
|
+
if item_num not in sections or len(content) > len(
|
|
253
|
+
sections[item_num]["content"]
|
|
254
|
+
):
|
|
255
|
+
sections[item_num] = {"title": title, "content": content}
|
|
256
|
+
|
|
257
|
+
return sections
|
|
258
|
+
|
|
259
|
+
@staticmethod
|
|
260
|
+
def resolve_section_key(section: str) -> str:
|
|
261
|
+
"""Resolve section name/alias to Item number."""
|
|
262
|
+
clean = section.strip().lower()
|
|
263
|
+
if clean in SECTION_ALIASES:
|
|
264
|
+
return SECTION_ALIASES[clean]
|
|
265
|
+
return section.strip().upper()
|