sec-analyzer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sec_analyzer-0.1.0/.github/workflows/publish.yml +32 -0
- sec_analyzer-0.1.0/.gitignore +6 -0
- sec_analyzer-0.1.0/LICENSE +21 -0
- sec_analyzer-0.1.0/PKG-INFO +271 -0
- sec_analyzer-0.1.0/README.md +250 -0
- sec_analyzer-0.1.0/pyproject.toml +32 -0
- sec_analyzer-0.1.0/src/sec_analyzer/__init__.py +5 -0
- sec_analyzer-0.1.0/src/sec_analyzer/cli.py +61 -0
- sec_analyzer-0.1.0/src/sec_analyzer/engine.py +430 -0
- sec_analyzer-0.1.0/src/sec_analyzer/presets/__init__.py +5 -0
- sec_analyzer-0.1.0/src/sec_analyzer/presets/supply_chain.py +145 -0
- sec_analyzer-0.1.0/tests/benchmark.py +276 -0
- sec_analyzer-0.1.0/tests/results/comparison/AAPL_comparison.json +146 -0
- sec_analyzer-0.1.0/tests/results/comparison/ASML_comparison.json +176 -0
- sec_analyzer-0.1.0/tests/results/comparison/CAT_comparison.json +167 -0
- sec_analyzer-0.1.0/tests/results/comparison/CRWD_comparison.json +103 -0
- sec_analyzer-0.1.0/tests/results/comparison/F_comparison.json +143 -0
- sec_analyzer-0.1.0/tests/results/comparison/JPM_comparison.json +129 -0
- sec_analyzer-0.1.0/tests/results/comparison/NVDA_comparison.json +161 -0
- sec_analyzer-0.1.0/tests/results/comparison/PLTR_comparison.json +102 -0
- sec_analyzer-0.1.0/tests/results/comparison/SMCI_comparison.json +184 -0
- sec_analyzer-0.1.0/tests/results/comparison/TSM_comparison.json +104 -0
- sec_analyzer-0.1.0/tests/results/comparison_summary.json +1417 -0
- sec_analyzer-0.1.0/tests/results/consistency/AAPL_run1.json +141 -0
- sec_analyzer-0.1.0/tests/results/consistency/AAPL_run2.json +141 -0
- sec_analyzer-0.1.0/tests/results/consistency/AAPL_run3.json +141 -0
- sec_analyzer-0.1.0/tests/results/consistency/ASML_run1.json +153 -0
- sec_analyzer-0.1.0/tests/results/consistency/ASML_run2.json +182 -0
- sec_analyzer-0.1.0/tests/results/consistency/ASML_run3.json +160 -0
- sec_analyzer-0.1.0/tests/results/consistency/CAT_run1.json +173 -0
- sec_analyzer-0.1.0/tests/results/consistency/CAT_run2.json +176 -0
- sec_analyzer-0.1.0/tests/results/consistency/CAT_run3.json +176 -0
- sec_analyzer-0.1.0/tests/results/consistency/CRWD_run1.json +131 -0
- sec_analyzer-0.1.0/tests/results/consistency/CRWD_run2.json +142 -0
- sec_analyzer-0.1.0/tests/results/consistency/CRWD_run3.json +149 -0
- sec_analyzer-0.1.0/tests/results/consistency/F_run1.json +183 -0
- sec_analyzer-0.1.0/tests/results/consistency/F_run2.json +163 -0
- sec_analyzer-0.1.0/tests/results/consistency/F_run3.json +173 -0
- sec_analyzer-0.1.0/tests/results/consistency/JPM_run1.json +165 -0
- sec_analyzer-0.1.0/tests/results/consistency/JPM_run2.json +145 -0
- sec_analyzer-0.1.0/tests/results/consistency/JPM_run3.json +144 -0
- sec_analyzer-0.1.0/tests/results/consistency/NVDA_run1.json +213 -0
- sec_analyzer-0.1.0/tests/results/consistency/NVDA_run2.json +187 -0
- sec_analyzer-0.1.0/tests/results/consistency/NVDA_run3.json +193 -0
- sec_analyzer-0.1.0/tests/results/consistency/PLTR_run1.json +182 -0
- sec_analyzer-0.1.0/tests/results/consistency/PLTR_run2.json +170 -0
- sec_analyzer-0.1.0/tests/results/consistency/PLTR_run3.json +160 -0
- sec_analyzer-0.1.0/tests/results/consistency/SMCI_run1.json +226 -0
- sec_analyzer-0.1.0/tests/results/consistency/SMCI_run2.json +217 -0
- sec_analyzer-0.1.0/tests/results/consistency/SMCI_run3.json +224 -0
- sec_analyzer-0.1.0/tests/results/consistency/TSM_run1.json +227 -0
- sec_analyzer-0.1.0/tests/results/consistency/TSM_run2.json +239 -0
- sec_analyzer-0.1.0/tests/results/consistency/TSM_run3.json +3 -0
- sec_analyzer-0.1.0/tests/results/consistency_stats.json +1331 -0
- sec_analyzer-0.1.0/tests/results/nvda_run1.json +204 -0
- sec_analyzer-0.1.0/tests/results/nvda_run2.json +187 -0
- sec_analyzer-0.1.0/tests/results/nvda_run3.json +193 -0
- sec_analyzer-0.1.0/tests/results/report.json +1587 -0
- sec_analyzer-0.1.0/tests/results/xbrl/AAPL_xbrl.json +84 -0
- sec_analyzer-0.1.0/tests/results/xbrl/ASML_xbrl.json +104 -0
- sec_analyzer-0.1.0/tests/results/xbrl/CAT_xbrl.json +75 -0
- sec_analyzer-0.1.0/tests/results/xbrl/CRWD_xbrl.json +49 -0
- sec_analyzer-0.1.0/tests/results/xbrl/F_xbrl.json +70 -0
- sec_analyzer-0.1.0/tests/results/xbrl/JPM_xbrl.json +56 -0
- sec_analyzer-0.1.0/tests/results/xbrl/NVDA_xbrl.json +83 -0
- sec_analyzer-0.1.0/tests/results/xbrl/PLTR_xbrl.json +34 -0
- sec_analyzer-0.1.0/tests/results/xbrl/SMCI_xbrl.json +87 -0
- sec_analyzer-0.1.0/tests/results/xbrl/TSM_xbrl.json +10 -0
- sec_analyzer-0.1.0/tests/results/xbrl_availability.json +72 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
build-and-publish:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
permissions:
|
|
13
|
+
contents: read
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- name: Set up Python
|
|
19
|
+
uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: "3.11"
|
|
22
|
+
|
|
23
|
+
- name: Install build tools
|
|
24
|
+
run: pip install hatch build
|
|
25
|
+
|
|
26
|
+
- name: Build package
|
|
27
|
+
run: python -m build
|
|
28
|
+
|
|
29
|
+
- name: Publish to PyPI
|
|
30
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
31
|
+
with:
|
|
32
|
+
password: ${{ secrets.PYPI_API_TOKEN }}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Seongjin
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sec-analyzer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extract structured data from SEC filings using LLM + Pydantic presets
|
|
5
|
+
Project-URL: Homepage, https://github.com/tjdwls101010/SEC-Analyzer
|
|
6
|
+
Author: Seongjin
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Keywords: edgar,finance,llm,pydantic,sec,structured-data
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Office/Business :: Financial
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Requires-Dist: edgartools>=3.0
|
|
17
|
+
Requires-Dist: google-genai>=1.0
|
|
18
|
+
Requires-Dist: pydantic>=2.0
|
|
19
|
+
Requires-Dist: python-dotenv>=1.0
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
<div align="center">
|
|
23
|
+
|
|
24
|
+
<img src="https://i.namu.wiki/i/HbVpHEsWi0aG30L2PEWRL9FEA0P7Vf-iLYm0QPbH1iOGJabk3vYcDQz1Uxo1DX3OaujOJWX62rs6QgqXFOybLw.svg" width="120" alt="SEC">
|
|
25
|
+
|
|
26
|
+
# SEC-Analyzer
|
|
27
|
+
|
|
28
|
+
**Extract structured data from SEC filings using LLM + Pydantic presets.**
|
|
29
|
+
|
|
30
|
+
Turn any SEC filing (10-K, 10-Q, 20-F, DEF 14A, ...) into structured JSON — define a Pydantic model, and the library does the rest.
|
|
31
|
+
|
|
32
|
+
[](#)
|
|
33
|
+
[](#)
|
|
34
|
+
|
|
35
|
+
[Installation](#installation) · [Quick Start](#quick-start) · [Custom Presets](#custom-presets) · [API Reference](#api-reference) · [CLI](#cli)
|
|
36
|
+
|
|
37
|
+
</div>
|
|
38
|
+
|
|
39
|
+

|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Why This Library?
|
|
44
|
+
|
|
45
|
+
SEC filings contain invaluable data — supply chains, revenue concentration, executive compensation, risk factors — but every filing has a different format. Traditional parsing breaks constantly.
|
|
46
|
+
|
|
47
|
+
This library uses **LLM structured output** (Gemini) to extract exactly the data you define in a **Pydantic model**. The LLM reads the filing and fills in your schema. No regex, no HTML parsing, no breakage.
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from sec_analyzer import extract
|
|
51
|
+
from sec_analyzer.presets import SupplyChain
|
|
52
|
+
|
|
53
|
+
result = extract("NVDA", preset=SupplyChain)
|
|
54
|
+
print(result["data"]["suppliers"])
|
|
55
|
+
# [{'entity': 'Taiwan Semiconductor Manufacturing Company Limited',
|
|
56
|
+
# 'relationship': 'foundry for semiconductor wafers',
|
|
57
|
+
# 'context': 'We utilize foundries, such as TSMC and Samsung...'}, ...]
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Installation
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install sec-analyzer
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Requires Python 3.10+ and a [Google AI API key](https://ai.google.dev/).
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## Quick Start
|
|
73
|
+
|
|
74
|
+
### 1. Set your API key and EDGAR identity
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
export GOOGLE_API_KEY="your-key-here"
|
|
78
|
+
export EDGAR_IDENTITY="YourApp/1.0 your@email.com"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Or create a `.env` file:
|
|
82
|
+
```
|
|
83
|
+
GOOGLE_API_KEY=your-key-here
|
|
84
|
+
EDGAR_IDENTITY=YourApp/1.0 your@email.com
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### 2. Extract data
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from sec_analyzer import extract
|
|
91
|
+
from sec_analyzer.presets import SupplyChain
|
|
92
|
+
|
|
93
|
+
# Latest 10-K
|
|
94
|
+
result = extract("NVDA", preset=SupplyChain)
|
|
95
|
+
|
|
96
|
+
# Specific form
|
|
97
|
+
result = extract("TSM", preset=SupplyChain, form="20-F")
|
|
98
|
+
|
|
99
|
+
# Specific filing date
|
|
100
|
+
result = extract("AAPL", preset=SupplyChain, filing_date="2025-10-30")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### 3. Use the result
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
filing = result["filing"]
|
|
107
|
+
# {'form': '10-K', 'filing_date': '2026-02-25', 'accession_number': '...', 'filing_url': '...'}
|
|
108
|
+
|
|
109
|
+
data = result["data"]
|
|
110
|
+
print(f"Suppliers: {len(data['suppliers'])}")
|
|
111
|
+
print(f"Customers: {len(data['customers'])}")
|
|
112
|
+
print(f"Single-source deps: {len(data['single_source_dependencies'])}")
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
## Custom Presets
|
|
118
|
+
|
|
119
|
+
The real power: **define your own Pydantic model** to extract anything.
|
|
120
|
+
|
|
121
|
+
### Basic custom preset
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from pydantic import BaseModel, Field
|
|
125
|
+
from sec_analyzer import extract
|
|
126
|
+
|
|
127
|
+
class RiskFactors(BaseModel):
|
|
128
|
+
regulatory_risks: list[dict] = Field(
|
|
129
|
+
default_factory=list,
|
|
130
|
+
description="Government regulations that could impact the business"
|
|
131
|
+
)
|
|
132
|
+
litigation: list[dict] = Field(
|
|
133
|
+
default_factory=list,
|
|
134
|
+
description="Pending lawsuits and legal proceedings"
|
|
135
|
+
)
|
|
136
|
+
cybersecurity_risks: list[dict] = Field(
|
|
137
|
+
default_factory=list,
|
|
138
|
+
description="Data breach and cybersecurity threats"
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
result = extract("META", preset=RiskFactors)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
When no `__prompt__` is defined, the library auto-generates a prompt from your field descriptions.
|
|
145
|
+
|
|
146
|
+
### Advanced: custom prompt
|
|
147
|
+
|
|
148
|
+
For expert-level control, add a `__prompt__` class variable:
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from typing import ClassVar
|
|
152
|
+
from pydantic import BaseModel, Field
|
|
153
|
+
|
|
154
|
+
class ExecutiveComp(BaseModel):
|
|
155
|
+
__prompt__: ClassVar[str] = """\
|
|
156
|
+
You are analyzing a DEF 14A proxy statement for {company_name}.
|
|
157
|
+
Extract executive compensation data from the Summary Compensation Table
|
|
158
|
+
and related disclosure sections.
|
|
159
|
+
|
|
160
|
+
Rules:
|
|
161
|
+
1. Include only Named Executive Officers (NEOs)
|
|
162
|
+
2. All dollar amounts in exact figures from the filing
|
|
163
|
+
3. Include stock awards, option awards, and non-equity incentive plan separately
|
|
164
|
+
|
|
165
|
+
Filing text:
|
|
166
|
+
{filing_text}
|
|
167
|
+
"""
|
|
168
|
+
|
|
169
|
+
executives: list[dict] = Field(description="NEO compensation details")
|
|
170
|
+
equity_awards: list[dict] = Field(description="Stock and option grant details")
|
|
171
|
+
|
|
172
|
+
result = extract("AAPL", preset=ExecutiveComp, form="DEF 14A")
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
The `{company_name}` and `{filing_text}` placeholders are filled automatically.
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
## Built-in Presets
|
|
180
|
+
|
|
181
|
+
### `SupplyChain`
|
|
182
|
+
|
|
183
|
+
Extracts 11 categories of supply chain intelligence from 10-K/10-Q/20-F filings:
|
|
184
|
+
|
|
185
|
+
| Category | Description |
|
|
186
|
+
|----------|-------------|
|
|
187
|
+
| `suppliers` | Companies supplying products/materials/services |
|
|
188
|
+
| `customers` | Companies purchasing products/services |
|
|
189
|
+
| `single_source_dependencies` | Components with sole-source suppliers |
|
|
190
|
+
| `geographic_concentration` | Manufacturing/sourcing location concentration |
|
|
191
|
+
| `capacity_constraints` | Production limitations and lead times |
|
|
192
|
+
| `supply_chain_risks` | Disruption risks (tariffs, shortages, geopolitical) |
|
|
193
|
+
| `revenue_concentration` | Customer/segment revenue % from Notes |
|
|
194
|
+
| `geographic_revenue` | Revenue by country/region from Notes |
|
|
195
|
+
| `purchase_obligations` | Commitments and take-or-pay contracts |
|
|
196
|
+
| `market_risk_disclosures` | Commodity/FX/interest rate exposures (Item 7A) |
|
|
197
|
+
| `inventory_composition` | Raw materials/WIP/finished goods breakdown |
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## API Reference
|
|
202
|
+
|
|
203
|
+
### `extract(symbol, preset, form="10-K", filing_date=None, max_chars=2_000_000, api_key=None, model=None)`
|
|
204
|
+
|
|
205
|
+
| Parameter | Type | Description |
|
|
206
|
+
|-----------|------|-------------|
|
|
207
|
+
| `symbol` | str | Ticker symbol (e.g., "NVDA") |
|
|
208
|
+
| `preset` | BaseModel class | Pydantic model defining extraction schema |
|
|
209
|
+
| `form` | str | Filing type. Automatically falls back from 10-K to 20-F |
|
|
210
|
+
| `filing_date` | str | Specific date (YYYY-MM-DD). None = latest |
|
|
211
|
+
| `max_chars` | int | Maximum length of the filing markdown, in characters |
|
|
212
|
+
| `api_key` | str | Google API key (fallback: `GOOGLE_API_KEY` env) |
|
|
213
|
+
| `model` | str | Gemini model (fallback: `GOOGLE_MODEL` env, default: `gemini-2.5-flash`) |
|
|
214
|
+
|
|
215
|
+
**Returns** `{"filing": {...}, "data": {...}}`
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## CLI
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
# Supply chain extraction (default)
|
|
223
|
+
sec-analyzer NVDA
|
|
224
|
+
|
|
225
|
+
# Specific form
|
|
226
|
+
sec-analyzer TSM --form 20-F
|
|
227
|
+
|
|
228
|
+
# Compact JSON
|
|
229
|
+
sec-analyzer NVDA --json
|
|
230
|
+
|
|
231
|
+
# Specific filing date
|
|
232
|
+
sec-analyzer AAPL --filing-date 2025-10-30
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
---
|
|
236
|
+
|
|
237
|
+
## How It Works
|
|
238
|
+
|
|
239
|
+
```
|
|
240
|
+
1. edgartools finds the filing on SEC EDGAR
|
|
241
|
+
2. Filing converted to markdown (tables preserved)
|
|
242
|
+
3. Full markdown + Pydantic schema sent to Gemini
|
|
243
|
+
4. Gemini returns structured JSON matching the schema
|
|
244
|
+
5. Pydantic validates and returns typed data
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
The key insight: Gemini's **structured output** mode forces the response to match your Pydantic schema exactly. No post-processing, no regex, no parsing.
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## Environment Variables
|
|
252
|
+
|
|
253
|
+
| Variable | Required | Default | Description |
|
|
254
|
+
|----------|----------|---------|-------------|
|
|
255
|
+
| `GOOGLE_API_KEY` | Yes | - | Google AI API key |
|
|
256
|
+
| `EDGAR_IDENTITY` | No | `SECAnalyzer/1.0 user@example.com` | SEC EDGAR User-Agent |
|
|
257
|
+
| `GOOGLE_MODEL` | No | `gemini-2.5-flash` | Gemini model ID |
|
|
258
|
+
|
|
259
|
+
---
|
|
260
|
+
|
|
261
|
+
## Disclaimer
|
|
262
|
+
|
|
263
|
+
This project is **not affiliated with the SEC, EDGAR, or Google**. Filing data comes from SEC EDGAR (public). LLM extraction may contain errors — always verify critical data against the original filing.
|
|
264
|
+
|
|
265
|
+
This tool is for **research and educational purposes only**. It is not financial advice.
|
|
266
|
+
|
|
267
|
+
---
|
|
268
|
+
|
|
269
|
+
## License
|
|
270
|
+
|
|
271
|
+
MIT
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
<img src="https://i.namu.wiki/i/HbVpHEsWi0aG30L2PEWRL9FEA0P7Vf-iLYm0QPbH1iOGJabk3vYcDQz1Uxo1DX3OaujOJWX62rs6QgqXFOybLw.svg" width="120" alt="SEC">
|
|
4
|
+
|
|
5
|
+
# SEC-Analyzer
|
|
6
|
+
|
|
7
|
+
**Extract structured data from SEC filings using LLM + Pydantic presets.**
|
|
8
|
+
|
|
9
|
+
Turn any SEC filing (10-K, 10-Q, 20-F, DEF 14A, ...) into structured JSON — define a Pydantic model, and the library does the rest.
|
|
10
|
+
|
|
11
|
+
[](#)
|
|
12
|
+
[](#)
|
|
13
|
+
|
|
14
|
+
[Installation](#installation) · [Quick Start](#quick-start) · [Custom Presets](#custom-presets) · [API Reference](#api-reference) · [CLI](#cli)
|
|
15
|
+
|
|
16
|
+
</div>
|
|
17
|
+
|
|
18
|
+

|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Why This Library?
|
|
23
|
+
|
|
24
|
+
SEC filings contain invaluable data — supply chains, revenue concentration, executive compensation, risk factors — but every filing has a different format. Traditional parsing breaks constantly.
|
|
25
|
+
|
|
26
|
+
This library uses **LLM structured output** (Gemini) to extract exactly the data you define in a **Pydantic model**. The LLM reads the filing and fills in your schema. No regex, no HTML parsing, no breakage.
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from sec_analyzer import extract
|
|
30
|
+
from sec_analyzer.presets import SupplyChain
|
|
31
|
+
|
|
32
|
+
result = extract("NVDA", preset=SupplyChain)
|
|
33
|
+
print(result["data"]["suppliers"])
|
|
34
|
+
# [{'entity': 'Taiwan Semiconductor Manufacturing Company Limited',
|
|
35
|
+
# 'relationship': 'foundry for semiconductor wafers',
|
|
36
|
+
# 'context': 'We utilize foundries, such as TSMC and Samsung...'}, ...]
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Installation
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install sec-analyzer
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Requires Python 3.10+ and a [Google AI API key](https://ai.google.dev/).
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## Quick Start
|
|
52
|
+
|
|
53
|
+
### 1. Set your API key and EDGAR identity
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
export GOOGLE_API_KEY="your-key-here"
|
|
57
|
+
export EDGAR_IDENTITY="YourApp/1.0 your@email.com"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Or create a `.env` file:
|
|
61
|
+
```
|
|
62
|
+
GOOGLE_API_KEY=your-key-here
|
|
63
|
+
EDGAR_IDENTITY=YourApp/1.0 your@email.com
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### 2. Extract data
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from sec_analyzer import extract
|
|
70
|
+
from sec_analyzer.presets import SupplyChain
|
|
71
|
+
|
|
72
|
+
# Latest 10-K
|
|
73
|
+
result = extract("NVDA", preset=SupplyChain)
|
|
74
|
+
|
|
75
|
+
# Specific form
|
|
76
|
+
result = extract("TSM", preset=SupplyChain, form="20-F")
|
|
77
|
+
|
|
78
|
+
# Specific filing date
|
|
79
|
+
result = extract("AAPL", preset=SupplyChain, filing_date="2025-10-30")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### 3. Use the result
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
filing = result["filing"]
|
|
86
|
+
# {'form': '10-K', 'filing_date': '2026-02-25', 'accession_number': '...', 'filing_url': '...'}
|
|
87
|
+
|
|
88
|
+
data = result["data"]
|
|
89
|
+
print(f"Suppliers: {len(data['suppliers'])}")
|
|
90
|
+
print(f"Customers: {len(data['customers'])}")
|
|
91
|
+
print(f"Single-source deps: {len(data['single_source_dependencies'])}")
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Custom Presets
|
|
97
|
+
|
|
98
|
+
The real power: **define your own Pydantic model** to extract anything.
|
|
99
|
+
|
|
100
|
+
### Basic custom preset
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
from pydantic import BaseModel, Field
|
|
104
|
+
from sec_analyzer import extract
|
|
105
|
+
|
|
106
|
+
class RiskFactors(BaseModel):
|
|
107
|
+
regulatory_risks: list[dict] = Field(
|
|
108
|
+
default_factory=list,
|
|
109
|
+
description="Government regulations that could impact the business"
|
|
110
|
+
)
|
|
111
|
+
litigation: list[dict] = Field(
|
|
112
|
+
default_factory=list,
|
|
113
|
+
description="Pending lawsuits and legal proceedings"
|
|
114
|
+
)
|
|
115
|
+
cybersecurity_risks: list[dict] = Field(
|
|
116
|
+
default_factory=list,
|
|
117
|
+
description="Data breach and cybersecurity threats"
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
result = extract("META", preset=RiskFactors)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
When no `__prompt__` is defined, the library auto-generates a prompt from your field descriptions.
|
|
124
|
+
|
|
125
|
+
### Advanced: custom prompt
|
|
126
|
+
|
|
127
|
+
For expert-level control, add a `__prompt__` class variable:
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from typing import ClassVar
|
|
131
|
+
from pydantic import BaseModel, Field
|
|
132
|
+
|
|
133
|
+
class ExecutiveComp(BaseModel):
|
|
134
|
+
__prompt__: ClassVar[str] = """\
|
|
135
|
+
You are analyzing a DEF 14A proxy statement for {company_name}.
|
|
136
|
+
Extract executive compensation data from the Summary Compensation Table
|
|
137
|
+
and related disclosure sections.
|
|
138
|
+
|
|
139
|
+
Rules:
|
|
140
|
+
1. Include only Named Executive Officers (NEOs)
|
|
141
|
+
2. All dollar amounts in exact figures from the filing
|
|
142
|
+
3. Include stock awards, option awards, and non-equity incentive plan separately
|
|
143
|
+
|
|
144
|
+
Filing text:
|
|
145
|
+
{filing_text}
|
|
146
|
+
"""
|
|
147
|
+
|
|
148
|
+
executives: list[dict] = Field(description="NEO compensation details")
|
|
149
|
+
equity_awards: list[dict] = Field(description="Stock and option grant details")
|
|
150
|
+
|
|
151
|
+
result = extract("AAPL", preset=ExecutiveComp, form="DEF 14A")
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
The `{company_name}` and `{filing_text}` placeholders are filled automatically.
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Built-in Presets
|
|
159
|
+
|
|
160
|
+
### `SupplyChain`
|
|
161
|
+
|
|
162
|
+
Extracts 11 categories of supply chain intelligence from 10-K/10-Q/20-F filings:
|
|
163
|
+
|
|
164
|
+
| Category | Description |
|
|
165
|
+
|----------|-------------|
|
|
166
|
+
| `suppliers` | Companies supplying products/materials/services |
|
|
167
|
+
| `customers` | Companies purchasing products/services |
|
|
168
|
+
| `single_source_dependencies` | Components with sole-source suppliers |
|
|
169
|
+
| `geographic_concentration` | Manufacturing/sourcing location concentration |
|
|
170
|
+
| `capacity_constraints` | Production limitations and lead times |
|
|
171
|
+
| `supply_chain_risks` | Disruption risks (tariffs, shortages, geopolitical) |
|
|
172
|
+
| `revenue_concentration` | Customer/segment revenue % from Notes |
|
|
173
|
+
| `geographic_revenue` | Revenue by country/region from Notes |
|
|
174
|
+
| `purchase_obligations` | Commitments and take-or-pay contracts |
|
|
175
|
+
| `market_risk_disclosures` | Commodity/FX/interest rate exposures (Item 7A) |
|
|
176
|
+
| `inventory_composition` | Raw materials/WIP/finished goods breakdown |
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## API Reference
|
|
181
|
+
|
|
182
|
+
### `extract(symbol, preset, form="10-K", filing_date=None, max_chars=2_000_000, api_key=None, model=None)`
|
|
183
|
+
|
|
184
|
+
| Parameter | Type | Description |
|
|
185
|
+
|-----------|------|-------------|
|
|
186
|
+
| `symbol` | str | Ticker symbol (e.g., "NVDA") |
|
|
187
|
+
| `preset` | BaseModel class | Pydantic model defining extraction schema |
|
|
188
|
+
| `form` | str | Filing type. Automatically falls back from 10-K to 20-F |
|
|
189
|
+
| `filing_date` | str | Specific date (YYYY-MM-DD). None = latest |
|
|
190
|
+
| `max_chars` | int | Maximum length of the filing markdown, in characters |
|
|
191
|
+
| `api_key` | str | Google API key (fallback: `GOOGLE_API_KEY` env) |
|
|
192
|
+
| `model` | str | Gemini model (fallback: `GOOGLE_MODEL` env, default: `gemini-2.5-flash`) |
|
|
193
|
+
|
|
194
|
+
**Returns** `{"filing": {...}, "data": {...}}`
|
|
195
|
+
|
|
196
|
+
---
|
|
197
|
+
|
|
198
|
+
## CLI
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
# Supply chain extraction (default)
|
|
202
|
+
sec-analyzer NVDA
|
|
203
|
+
|
|
204
|
+
# Specific form
|
|
205
|
+
sec-analyzer TSM --form 20-F
|
|
206
|
+
|
|
207
|
+
# Compact JSON
|
|
208
|
+
sec-analyzer NVDA --json
|
|
209
|
+
|
|
210
|
+
# Specific filing date
|
|
211
|
+
sec-analyzer AAPL --filing-date 2025-10-30
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
## How It Works
|
|
217
|
+
|
|
218
|
+
```
|
|
219
|
+
1. edgartools finds the filing on SEC EDGAR
|
|
220
|
+
2. Filing converted to markdown (tables preserved)
|
|
221
|
+
3. Full markdown + Pydantic schema sent to Gemini
|
|
222
|
+
4. Gemini returns structured JSON matching the schema
|
|
223
|
+
5. Pydantic validates and returns typed data
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
The key insight: Gemini's **structured output** mode forces the response to match your Pydantic schema exactly. No post-processing, no regex, no parsing.
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## Environment Variables
|
|
231
|
+
|
|
232
|
+
| Variable | Required | Default | Description |
|
|
233
|
+
|----------|----------|---------|-------------|
|
|
234
|
+
| `GOOGLE_API_KEY` | Yes | - | Google AI API key |
|
|
235
|
+
| `EDGAR_IDENTITY` | No | `SECAnalyzer/1.0 user@example.com` | SEC EDGAR User-Agent |
|
|
236
|
+
| `GOOGLE_MODEL` | No | `gemini-2.5-flash` | Gemini model ID |
|
|
237
|
+
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
## Disclaimer
|
|
241
|
+
|
|
242
|
+
This project is **not affiliated with the SEC, EDGAR, or Google**. Filing data comes from SEC EDGAR (public). LLM extraction may contain errors — always verify critical data against the original filing.
|
|
243
|
+
|
|
244
|
+
This tool is for **research and educational purposes only**. It is not financial advice.
|
|
245
|
+
|
|
246
|
+
---
|
|
247
|
+
|
|
248
|
+
## License
|
|
249
|
+
|
|
250
|
+
MIT
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sec-analyzer"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Extract structured data from SEC filings using LLM + Pydantic presets"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [{ name = "Seongjin" }]
|
|
13
|
+
keywords = ["sec", "edgar", "llm", "pydantic", "structured-data", "finance"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Financial and Insurance Industry",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Topic :: Office/Business :: Financial",
|
|
20
|
+
]
|
|
21
|
+
dependencies = [
|
|
22
|
+
"edgartools>=3.0",
|
|
23
|
+
"google-genai>=1.0",
|
|
24
|
+
"pydantic>=2.0",
|
|
25
|
+
"python-dotenv>=1.0",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.scripts]
|
|
29
|
+
sec-analyzer = "sec_analyzer.cli:main"
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Homepage = "https://github.com/tjdwls101010/SEC-Analyzer"
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""CLI entry point for sec-analyzer."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Registry of preset names accepted by the --preset option. Each value is a
# "module.path:ClassName" string; _load_preset splits it on the last ":" and
# imports the module only when that preset is requested.
_PRESET_MAP = {
    "supply-chain": "sec_analyzer.presets.supply_chain:SupplyChain",
}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _load_preset(name: str):
    """Return the preset class registered under ``name``.

    The name is looked up in ``_PRESET_MAP``, whose values have the form
    ``"module.path:ClassName"``. The module is imported and the named
    attribute is returned.

    If ``name`` is not registered, an error message and the list of
    available presets are written to stderr and the process exits with
    status 1.
    """
    try:
        target = _PRESET_MAP[name]
    except KeyError:
        print(f"Unknown preset: {name}", file=sys.stderr)
        print(f"Available presets: {', '.join(_PRESET_MAP)}", file=sys.stderr)
        sys.exit(1)

    # Split on the last ":" only, so the module path itself is left intact.
    module_path, class_name = target.rsplit(":", 1)

    from importlib import import_module

    return getattr(import_module(module_path), class_name)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def main():
    """Run the sec-analyzer command-line interface.

    Parses the command line, resolves the requested preset, runs the
    extraction and prints the result to stdout as JSON (indented by two
    spaces unless ``--json`` is given, in which case it is compact).

    If the extraction raises, a JSON object ``{"error": "<message>"}`` is
    written to stderr and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(
        description="Extract structured data from SEC filings"
    )
    cli.add_argument("symbol", help="Ticker symbol (e.g., NVDA, AAPL, TSM)")
    cli.add_argument(
        "--preset",
        default="supply-chain",
        help=f"Extraction preset ({', '.join(_PRESET_MAP)})",
    )
    cli.add_argument("--form", default="10-K", help="Filing form type (default: 10-K)")
    cli.add_argument("--filing-date", default=None, help="Specific filing date (YYYY-MM-DD)")
    cli.add_argument("--json", action="store_true", dest="compact", help="Compact JSON output")

    opts = cli.parse_args()

    # Resolve the preset first: an unknown name exits before the engine
    # module is imported.
    preset_cls = _load_preset(opts.preset)

    from .engine import extract

    try:
        result = extract(
            symbol=opts.symbol,
            preset=preset_cls,
            form=opts.form,
            filing_date=opts.filing_date,
        )
    except Exception as exc:
        # Top-level boundary: report any failure as JSON on stderr.
        print(json.dumps({"error": str(exc)}), file=sys.stderr)
        sys.exit(1)

    if opts.compact:
        rendered = json.dumps(result, indent=None, default=str)
    else:
        rendered = json.dumps(result, indent=2, default=str)
    print(rendered)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Run the CLI when this module is executed as a script.
if __name__ == "__main__":
    main()
|