sec-analyzer 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. sec_analyzer-0.1.0/.github/workflows/publish.yml +32 -0
  2. sec_analyzer-0.1.0/.gitignore +6 -0
  3. sec_analyzer-0.1.0/LICENSE +21 -0
  4. sec_analyzer-0.1.0/PKG-INFO +271 -0
  5. sec_analyzer-0.1.0/README.md +250 -0
  6. sec_analyzer-0.1.0/pyproject.toml +32 -0
  7. sec_analyzer-0.1.0/src/sec_analyzer/__init__.py +5 -0
  8. sec_analyzer-0.1.0/src/sec_analyzer/cli.py +61 -0
  9. sec_analyzer-0.1.0/src/sec_analyzer/engine.py +430 -0
  10. sec_analyzer-0.1.0/src/sec_analyzer/presets/__init__.py +5 -0
  11. sec_analyzer-0.1.0/src/sec_analyzer/presets/supply_chain.py +145 -0
  12. sec_analyzer-0.1.0/tests/benchmark.py +276 -0
  13. sec_analyzer-0.1.0/tests/results/comparison/AAPL_comparison.json +146 -0
  14. sec_analyzer-0.1.0/tests/results/comparison/ASML_comparison.json +176 -0
  15. sec_analyzer-0.1.0/tests/results/comparison/CAT_comparison.json +167 -0
  16. sec_analyzer-0.1.0/tests/results/comparison/CRWD_comparison.json +103 -0
  17. sec_analyzer-0.1.0/tests/results/comparison/F_comparison.json +143 -0
  18. sec_analyzer-0.1.0/tests/results/comparison/JPM_comparison.json +129 -0
  19. sec_analyzer-0.1.0/tests/results/comparison/NVDA_comparison.json +161 -0
  20. sec_analyzer-0.1.0/tests/results/comparison/PLTR_comparison.json +102 -0
  21. sec_analyzer-0.1.0/tests/results/comparison/SMCI_comparison.json +184 -0
  22. sec_analyzer-0.1.0/tests/results/comparison/TSM_comparison.json +104 -0
  23. sec_analyzer-0.1.0/tests/results/comparison_summary.json +1417 -0
  24. sec_analyzer-0.1.0/tests/results/consistency/AAPL_run1.json +141 -0
  25. sec_analyzer-0.1.0/tests/results/consistency/AAPL_run2.json +141 -0
  26. sec_analyzer-0.1.0/tests/results/consistency/AAPL_run3.json +141 -0
  27. sec_analyzer-0.1.0/tests/results/consistency/ASML_run1.json +153 -0
  28. sec_analyzer-0.1.0/tests/results/consistency/ASML_run2.json +182 -0
  29. sec_analyzer-0.1.0/tests/results/consistency/ASML_run3.json +160 -0
  30. sec_analyzer-0.1.0/tests/results/consistency/CAT_run1.json +173 -0
  31. sec_analyzer-0.1.0/tests/results/consistency/CAT_run2.json +176 -0
  32. sec_analyzer-0.1.0/tests/results/consistency/CAT_run3.json +176 -0
  33. sec_analyzer-0.1.0/tests/results/consistency/CRWD_run1.json +131 -0
  34. sec_analyzer-0.1.0/tests/results/consistency/CRWD_run2.json +142 -0
  35. sec_analyzer-0.1.0/tests/results/consistency/CRWD_run3.json +149 -0
  36. sec_analyzer-0.1.0/tests/results/consistency/F_run1.json +183 -0
  37. sec_analyzer-0.1.0/tests/results/consistency/F_run2.json +163 -0
  38. sec_analyzer-0.1.0/tests/results/consistency/F_run3.json +173 -0
  39. sec_analyzer-0.1.0/tests/results/consistency/JPM_run1.json +165 -0
  40. sec_analyzer-0.1.0/tests/results/consistency/JPM_run2.json +145 -0
  41. sec_analyzer-0.1.0/tests/results/consistency/JPM_run3.json +144 -0
  42. sec_analyzer-0.1.0/tests/results/consistency/NVDA_run1.json +213 -0
  43. sec_analyzer-0.1.0/tests/results/consistency/NVDA_run2.json +187 -0
  44. sec_analyzer-0.1.0/tests/results/consistency/NVDA_run3.json +193 -0
  45. sec_analyzer-0.1.0/tests/results/consistency/PLTR_run1.json +182 -0
  46. sec_analyzer-0.1.0/tests/results/consistency/PLTR_run2.json +170 -0
  47. sec_analyzer-0.1.0/tests/results/consistency/PLTR_run3.json +160 -0
  48. sec_analyzer-0.1.0/tests/results/consistency/SMCI_run1.json +226 -0
  49. sec_analyzer-0.1.0/tests/results/consistency/SMCI_run2.json +217 -0
  50. sec_analyzer-0.1.0/tests/results/consistency/SMCI_run3.json +224 -0
  51. sec_analyzer-0.1.0/tests/results/consistency/TSM_run1.json +227 -0
  52. sec_analyzer-0.1.0/tests/results/consistency/TSM_run2.json +239 -0
  53. sec_analyzer-0.1.0/tests/results/consistency/TSM_run3.json +3 -0
  54. sec_analyzer-0.1.0/tests/results/consistency_stats.json +1331 -0
  55. sec_analyzer-0.1.0/tests/results/nvda_run1.json +204 -0
  56. sec_analyzer-0.1.0/tests/results/nvda_run2.json +187 -0
  57. sec_analyzer-0.1.0/tests/results/nvda_run3.json +193 -0
  58. sec_analyzer-0.1.0/tests/results/report.json +1587 -0
  59. sec_analyzer-0.1.0/tests/results/xbrl/AAPL_xbrl.json +84 -0
  60. sec_analyzer-0.1.0/tests/results/xbrl/ASML_xbrl.json +104 -0
  61. sec_analyzer-0.1.0/tests/results/xbrl/CAT_xbrl.json +75 -0
  62. sec_analyzer-0.1.0/tests/results/xbrl/CRWD_xbrl.json +49 -0
  63. sec_analyzer-0.1.0/tests/results/xbrl/F_xbrl.json +70 -0
  64. sec_analyzer-0.1.0/tests/results/xbrl/JPM_xbrl.json +56 -0
  65. sec_analyzer-0.1.0/tests/results/xbrl/NVDA_xbrl.json +83 -0
  66. sec_analyzer-0.1.0/tests/results/xbrl/PLTR_xbrl.json +34 -0
  67. sec_analyzer-0.1.0/tests/results/xbrl/SMCI_xbrl.json +87 -0
  68. sec_analyzer-0.1.0/tests/results/xbrl/TSM_xbrl.json +10 -0
  69. sec_analyzer-0.1.0/tests/results/xbrl_availability.json +72 -0
@@ -0,0 +1,32 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ build-and-publish:
11
+ runs-on: ubuntu-latest
12
+ permissions:
13
+ contents: read
14
+
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Set up Python
19
+ uses: actions/setup-python@v5
20
+ with:
21
+ python-version: "3.11"
22
+
23
+ - name: Install build tools
24
+ run: pip install hatch build
25
+
26
+ - name: Build package
27
+ run: python -m build
28
+
29
+ - name: Publish to PyPI
30
+ uses: pypa/gh-action-pypi-publish@release/v1
31
+ with:
32
+ password: ${{ secrets.PYPI_API_TOKEN }}
@@ -0,0 +1,6 @@
1
+ .env
2
+ __pycache__/
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .venv/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Seongjin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,271 @@
1
+ Metadata-Version: 2.4
2
+ Name: sec-analyzer
3
+ Version: 0.1.0
4
+ Summary: Extract structured data from SEC filings using LLM + Pydantic presets
5
+ Project-URL: Homepage, https://github.com/tjdwls101010/SEC-Analyzer
6
+ Author: Seongjin
7
+ License-Expression: MIT
8
+ License-File: LICENSE
9
+ Keywords: edgar,finance,llm,pydantic,sec,structured-data
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Financial and Insurance Industry
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Office/Business :: Financial
15
+ Requires-Python: >=3.10
16
+ Requires-Dist: edgartools>=3.0
17
+ Requires-Dist: google-genai>=1.0
18
+ Requires-Dist: pydantic>=2.0
19
+ Requires-Dist: python-dotenv>=1.0
20
+ Description-Content-Type: text/markdown
21
+
22
+ <div align="center">
23
+
24
+ <img src="https://i.namu.wiki/i/HbVpHEsWi0aG30L2PEWRL9FEA0P7Vf-iLYm0QPbH1iOGJabk3vYcDQz1Uxo1DX3OaujOJWX62rs6QgqXFOybLw.svg" width="120" alt="SEC">
25
+
26
+ # SEC-Analyzer
27
+
28
+ **Extract structured data from SEC filings using LLM + Pydantic presets.**
29
+
30
+ Turn any SEC filing (10-K, 10-Q, 20-F, DEF 14A, ...) into structured JSON — define a Pydantic model, and the library does the rest.
31
+
32
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue)](#)
33
+ [![License: MIT](https://img.shields.io/badge/license-MIT-lightgrey)](#)
34
+
35
+ [Installation](#installation) · [Quick Start](#quick-start) · [Custom Presets](#custom-presets) · [API Reference](#api-reference) · [CLI](#cli)
36
+
37
+ </div>
38
+
39
+ ![](https://github.com/tjdwls101010/DUMOK/blob/main/Images/gemini-3-pro-1774265890176ioxhdiv1w.png?raw=true)
40
+
41
+ ---
42
+
43
+ ## Why This Library?
44
+
45
+ SEC filings contain invaluable data — supply chains, revenue concentration, executive compensation, risk factors — but every filing has a different format. Traditional parsing breaks constantly.
46
+
47
+ This library uses **LLM structured output** (Gemini) to extract exactly the data you define in a **Pydantic model**. The LLM reads the filing and fills in your schema. No regex, no HTML parsing, no breakage.
48
+
49
+ ```python
50
+ from sec_analyzer import extract
51
+ from sec_analyzer.presets import SupplyChain
52
+
53
+ result = extract("NVDA", preset=SupplyChain)
54
+ print(result["data"]["suppliers"])
55
+ # [{'entity': 'Taiwan Semiconductor Manufacturing Company Limited',
56
+ # 'relationship': 'foundry for semiconductor wafers',
57
+ # 'context': 'We utilize foundries, such as TSMC and Samsung...'}, ...]
58
+ ```
59
+
60
+ ---
61
+
62
+ ## Installation
63
+
64
+ ```bash
65
+ pip install sec-analyzer
66
+ ```
67
+
68
+ Requires Python 3.10+ and a [Google AI API key](https://ai.google.dev/).
69
+
70
+ ---
71
+
72
+ ## Quick Start
73
+
74
+ ### 1. Set your API key and EDGAR identity
75
+
76
+ ```bash
77
+ export GOOGLE_API_KEY="your-key-here"
78
+ export EDGAR_IDENTITY="YourApp/1.0 your@email.com"
79
+ ```
80
+
81
+ Or create a `.env` file:
82
+ ```
83
+ GOOGLE_API_KEY=your-key-here
84
+ EDGAR_IDENTITY=YourApp/1.0 your@email.com
85
+ ```
86
+
87
+ ### 2. Extract data
88
+
89
+ ```python
90
+ from sec_analyzer import extract
91
+ from sec_analyzer.presets import SupplyChain
92
+
93
+ # Latest 10-K
94
+ result = extract("NVDA", preset=SupplyChain)
95
+
96
+ # Specific form
97
+ result = extract("TSM", preset=SupplyChain, form="20-F")
98
+
99
+ # Specific filing date
100
+ result = extract("AAPL", preset=SupplyChain, filing_date="2025-10-30")
101
+ ```
102
+
103
+ ### 3. Use the result
104
+
105
+ ```python
106
+ filing = result["filing"]
107
+ # {'form': '10-K', 'filing_date': '2026-02-25', 'accession_number': '...', 'filing_url': '...'}
108
+
109
+ data = result["data"]
110
+ print(f"Suppliers: {len(data['suppliers'])}")
111
+ print(f"Customers: {len(data['customers'])}")
112
+ print(f"Single-source deps: {len(data['single_source_dependencies'])}")
113
+ ```
114
+
115
+ ---
116
+
117
+ ## Custom Presets
118
+
119
+ The real power: **define your own Pydantic model** to extract anything.
120
+
121
+ ### Basic custom preset
122
+
123
+ ```python
124
+ from pydantic import BaseModel, Field
125
+ from sec_analyzer import extract
126
+
127
+ class RiskFactors(BaseModel):
128
+ regulatory_risks: list[dict] = Field(
129
+ default_factory=list,
130
+ description="Government regulations that could impact the business"
131
+ )
132
+ litigation: list[dict] = Field(
133
+ default_factory=list,
134
+ description="Pending lawsuits and legal proceedings"
135
+ )
136
+ cybersecurity_risks: list[dict] = Field(
137
+ default_factory=list,
138
+ description="Data breach and cybersecurity threats"
139
+ )
140
+
141
+ result = extract("META", preset=RiskFactors)
142
+ ```
143
+
144
+ When no `__prompt__` is defined, the library auto-generates a prompt from your field descriptions.
145
+
146
+ ### Advanced: custom prompt
147
+
148
+ For expert-level control, add a `__prompt__` class variable:
149
+
150
+ ```python
151
+ from typing import ClassVar
152
+ from pydantic import BaseModel, Field
153
+
154
+ class ExecutiveComp(BaseModel):
155
+ __prompt__: ClassVar[str] = """\
156
+ You are analyzing a DEF 14A proxy statement for {company_name}.
157
+ Extract executive compensation data from the Summary Compensation Table
158
+ and related disclosure sections.
159
+
160
+ Rules:
161
+ 1. Include only Named Executive Officers (NEOs)
162
+ 2. All dollar amounts in exact figures from the filing
163
+ 3. Include stock awards, option awards, and non-equity incentive plan separately
164
+
165
+ Filing text:
166
+ {filing_text}
167
+ """
168
+
169
+ executives: list[dict] = Field(description="NEO compensation details")
170
+ equity_awards: list[dict] = Field(description="Stock and option grant details")
171
+
172
+ result = extract("AAPL", preset=ExecutiveComp, form="DEF 14A")
173
+ ```
174
+
175
+ The `{company_name}` and `{filing_text}` placeholders are filled automatically.
176
+
177
+ ---
178
+
179
+ ## Built-in Presets
180
+
181
+ ### `SupplyChain`
182
+
183
+ Extracts 11 categories of supply chain intelligence from 10-K/10-Q/20-F filings:
184
+
185
+ | Category | Description |
186
+ |----------|-------------|
187
+ | `suppliers` | Companies supplying products/materials/services |
188
+ | `customers` | Companies purchasing products/services |
189
+ | `single_source_dependencies` | Components with sole-source suppliers |
190
+ | `geographic_concentration` | Manufacturing/sourcing location concentration |
191
+ | `capacity_constraints` | Production limitations and lead times |
192
+ | `supply_chain_risks` | Disruption risks (tariffs, shortages, geopolitical) |
193
+ | `revenue_concentration` | Customer/segment revenue % from Notes |
194
+ | `geographic_revenue` | Revenue by country/region from Notes |
195
+ | `purchase_obligations` | Commitments and take-or-pay contracts |
196
+ | `market_risk_disclosures` | Commodity/FX/interest rate exposures (Item 7A) |
197
+ | `inventory_composition` | Raw materials/WIP/finished goods breakdown |
198
+
199
+ ---
200
+
201
+ ## API Reference
202
+
203
+ ### `extract(symbol, preset, form="10-K", filing_date=None, max_chars=2_000_000, api_key=None, model=None)`
204
+
205
+ | Parameter | Type | Description |
206
+ |-----------|------|-------------|
207
+ | `symbol` | str | Ticker symbol (e.g., "NVDA") |
208
+ | `preset` | BaseModel class | Pydantic model defining extraction schema |
209
+ | `form` | str | Filing form type (default: `10-K`). Automatically falls back from 10-K to 20-F |
210
+ | `filing_date` | str | Specific date (YYYY-MM-DD). None = latest |
211
+ | `max_chars` | int | Max filing markdown length |
212
+ | `api_key` | str | Google API key (fallback: `GOOGLE_API_KEY` env) |
213
+ | `model` | str | Gemini model (fallback: `GOOGLE_MODEL` env, default: `gemini-2.5-flash`) |
214
+
215
+ **Returns** `{"filing": {...}, "data": {...}}`
216
+
217
+ ---
218
+
219
+ ## CLI
220
+
221
+ ```bash
222
+ # Supply chain extraction (default)
223
+ sec-analyzer NVDA
224
+
225
+ # Specific form
226
+ sec-analyzer TSM --form 20-F
227
+
228
+ # Compact JSON
229
+ sec-analyzer NVDA --json
230
+
231
+ # Specific filing date
232
+ sec-analyzer AAPL --filing-date 2025-10-30
233
+ ```
234
+
235
+ ---
236
+
237
+ ## How It Works
238
+
239
+ ```
240
+ 1. edgartools finds the filing on SEC EDGAR
241
+ 2. Filing converted to markdown (tables preserved)
242
+ 3. Filing markdown (up to `max_chars` characters) + Pydantic schema sent to Gemini
243
+ 4. Gemini returns structured JSON matching the schema
244
+ 5. Pydantic validates and returns typed data
245
+ ```
246
+
247
+ The key insight: Gemini's **structured output** mode forces the response to match your Pydantic schema exactly. No post-processing, no regex, no parsing.
248
+
249
+ ---
250
+
251
+ ## Environment Variables
252
+
253
+ | Variable | Required | Default | Description |
254
+ |----------|----------|---------|-------------|
255
+ | `GOOGLE_API_KEY` | Yes | - | Google AI API key |
256
+ | `EDGAR_IDENTITY` | No | `SECAnalyzer/1.0 user@example.com` | SEC EDGAR User-Agent |
257
+ | `GOOGLE_MODEL` | No | `gemini-2.5-flash` | Gemini model ID |
258
+
259
+ ---
260
+
261
+ ## Disclaimer
262
+
263
+ This project is **not affiliated with the SEC, EDGAR, or Google**. Filing data comes from SEC EDGAR (public). LLM extraction may contain errors — always verify critical data against the original filing.
264
+
265
+ This tool is for **research and educational purposes only**. It is not financial advice.
266
+
267
+ ---
268
+
269
+ ## License
270
+
271
+ MIT
@@ -0,0 +1,250 @@
1
+ <div align="center">
2
+
3
+ <img src="https://i.namu.wiki/i/HbVpHEsWi0aG30L2PEWRL9FEA0P7Vf-iLYm0QPbH1iOGJabk3vYcDQz1Uxo1DX3OaujOJWX62rs6QgqXFOybLw.svg" width="120" alt="SEC">
4
+
5
+ # SEC-Analyzer
6
+
7
+ **Extract structured data from SEC filings using LLM + Pydantic presets.**
8
+
9
+ Turn any SEC filing (10-K, 10-Q, 20-F, DEF 14A, ...) into structured JSON — define a Pydantic model, and the library does the rest.
10
+
11
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue)](#)
12
+ [![License: MIT](https://img.shields.io/badge/license-MIT-lightgrey)](#)
13
+
14
+ [Installation](#installation) · [Quick Start](#quick-start) · [Custom Presets](#custom-presets) · [API Reference](#api-reference) · [CLI](#cli)
15
+
16
+ </div>
17
+
18
+ ![](https://github.com/tjdwls101010/DUMOK/blob/main/Images/gemini-3-pro-1774265890176ioxhdiv1w.png?raw=true)
19
+
20
+ ---
21
+
22
+ ## Why This Library?
23
+
24
+ SEC filings contain invaluable data — supply chains, revenue concentration, executive compensation, risk factors — but every filing has a different format. Traditional parsing breaks constantly.
25
+
26
+ This library uses **LLM structured output** (Gemini) to extract exactly the data you define in a **Pydantic model**. The LLM reads the filing and fills in your schema. No regex, no HTML parsing, no breakage.
27
+
28
+ ```python
29
+ from sec_analyzer import extract
30
+ from sec_analyzer.presets import SupplyChain
31
+
32
+ result = extract("NVDA", preset=SupplyChain)
33
+ print(result["data"]["suppliers"])
34
+ # [{'entity': 'Taiwan Semiconductor Manufacturing Company Limited',
35
+ # 'relationship': 'foundry for semiconductor wafers',
36
+ # 'context': 'We utilize foundries, such as TSMC and Samsung...'}, ...]
37
+ ```
38
+
39
+ ---
40
+
41
+ ## Installation
42
+
43
+ ```bash
44
+ pip install sec-analyzer
45
+ ```
46
+
47
+ Requires Python 3.10+ and a [Google AI API key](https://ai.google.dev/).
48
+
49
+ ---
50
+
51
+ ## Quick Start
52
+
53
+ ### 1. Set your API key and EDGAR identity
54
+
55
+ ```bash
56
+ export GOOGLE_API_KEY="your-key-here"
57
+ export EDGAR_IDENTITY="YourApp/1.0 your@email.com"
58
+ ```
59
+
60
+ Or create a `.env` file:
61
+ ```
62
+ GOOGLE_API_KEY=your-key-here
63
+ EDGAR_IDENTITY=YourApp/1.0 your@email.com
64
+ ```
65
+
66
+ ### 2. Extract data
67
+
68
+ ```python
69
+ from sec_analyzer import extract
70
+ from sec_analyzer.presets import SupplyChain
71
+
72
+ # Latest 10-K
73
+ result = extract("NVDA", preset=SupplyChain)
74
+
75
+ # Specific form
76
+ result = extract("TSM", preset=SupplyChain, form="20-F")
77
+
78
+ # Specific filing date
79
+ result = extract("AAPL", preset=SupplyChain, filing_date="2025-10-30")
80
+ ```
81
+
82
+ ### 3. Use the result
83
+
84
+ ```python
85
+ filing = result["filing"]
86
+ # {'form': '10-K', 'filing_date': '2026-02-25', 'accession_number': '...', 'filing_url': '...'}
87
+
88
+ data = result["data"]
89
+ print(f"Suppliers: {len(data['suppliers'])}")
90
+ print(f"Customers: {len(data['customers'])}")
91
+ print(f"Single-source deps: {len(data['single_source_dependencies'])}")
92
+ ```
93
+
94
+ ---
95
+
96
+ ## Custom Presets
97
+
98
+ The real power: **define your own Pydantic model** to extract anything.
99
+
100
+ ### Basic custom preset
101
+
102
+ ```python
103
+ from pydantic import BaseModel, Field
104
+ from sec_analyzer import extract
105
+
106
+ class RiskFactors(BaseModel):
107
+ regulatory_risks: list[dict] = Field(
108
+ default_factory=list,
109
+ description="Government regulations that could impact the business"
110
+ )
111
+ litigation: list[dict] = Field(
112
+ default_factory=list,
113
+ description="Pending lawsuits and legal proceedings"
114
+ )
115
+ cybersecurity_risks: list[dict] = Field(
116
+ default_factory=list,
117
+ description="Data breach and cybersecurity threats"
118
+ )
119
+
120
+ result = extract("META", preset=RiskFactors)
121
+ ```
122
+
123
+ When no `__prompt__` is defined, the library auto-generates a prompt from your field descriptions.
124
+
125
+ ### Advanced: custom prompt
126
+
127
+ For expert-level control, add a `__prompt__` class variable:
128
+
129
+ ```python
130
+ from typing import ClassVar
131
+ from pydantic import BaseModel, Field
132
+
133
+ class ExecutiveComp(BaseModel):
134
+ __prompt__: ClassVar[str] = """\
135
+ You are analyzing a DEF 14A proxy statement for {company_name}.
136
+ Extract executive compensation data from the Summary Compensation Table
137
+ and related disclosure sections.
138
+
139
+ Rules:
140
+ 1. Include only Named Executive Officers (NEOs)
141
+ 2. All dollar amounts in exact figures from the filing
142
+ 3. Include stock awards, option awards, and non-equity incentive plan separately
143
+
144
+ Filing text:
145
+ {filing_text}
146
+ """
147
+
148
+ executives: list[dict] = Field(description="NEO compensation details")
149
+ equity_awards: list[dict] = Field(description="Stock and option grant details")
150
+
151
+ result = extract("AAPL", preset=ExecutiveComp, form="DEF 14A")
152
+ ```
153
+
154
+ The `{company_name}` and `{filing_text}` placeholders are filled automatically.
155
+
156
+ ---
157
+
158
+ ## Built-in Presets
159
+
160
+ ### `SupplyChain`
161
+
162
+ Extracts 11 categories of supply chain intelligence from 10-K/10-Q/20-F filings:
163
+
164
+ | Category | Description |
165
+ |----------|-------------|
166
+ | `suppliers` | Companies supplying products/materials/services |
167
+ | `customers` | Companies purchasing products/services |
168
+ | `single_source_dependencies` | Components with sole-source suppliers |
169
+ | `geographic_concentration` | Manufacturing/sourcing location concentration |
170
+ | `capacity_constraints` | Production limitations and lead times |
171
+ | `supply_chain_risks` | Disruption risks (tariffs, shortages, geopolitical) |
172
+ | `revenue_concentration` | Customer/segment revenue % from Notes |
173
+ | `geographic_revenue` | Revenue by country/region from Notes |
174
+ | `purchase_obligations` | Commitments and take-or-pay contracts |
175
+ | `market_risk_disclosures` | Commodity/FX/interest rate exposures (Item 7A) |
176
+ | `inventory_composition` | Raw materials/WIP/finished goods breakdown |
177
+
178
+ ---
179
+
180
+ ## API Reference
181
+
182
+ ### `extract(symbol, preset, form="10-K", filing_date=None, max_chars=2_000_000, api_key=None, model=None)`
183
+
184
+ | Parameter | Type | Description |
185
+ |-----------|------|-------------|
186
+ | `symbol` | str | Ticker symbol (e.g., "NVDA") |
187
+ | `preset` | BaseModel class | Pydantic model defining extraction schema |
188
+ | `form` | str | Filing form type (default: `10-K`). Automatically falls back from 10-K to 20-F |
189
+ | `filing_date` | str | Specific date (YYYY-MM-DD). None = latest |
190
+ | `max_chars` | int | Max filing markdown length |
191
+ | `api_key` | str | Google API key (fallback: `GOOGLE_API_KEY` env) |
192
+ | `model` | str | Gemini model (fallback: `GOOGLE_MODEL` env, default: `gemini-2.5-flash`) |
193
+
194
+ **Returns** `{"filing": {...}, "data": {...}}`
195
+
196
+ ---
197
+
198
+ ## CLI
199
+
200
+ ```bash
201
+ # Supply chain extraction (default)
202
+ sec-analyzer NVDA
203
+
204
+ # Specific form
205
+ sec-analyzer TSM --form 20-F
206
+
207
+ # Compact JSON
208
+ sec-analyzer NVDA --json
209
+
210
+ # Specific filing date
211
+ sec-analyzer AAPL --filing-date 2025-10-30
212
+ ```
213
+
214
+ ---
215
+
216
+ ## How It Works
217
+
218
+ ```
219
+ 1. edgartools finds the filing on SEC EDGAR
220
+ 2. Filing converted to markdown (tables preserved)
221
+ 3. Filing markdown (up to `max_chars` characters) + Pydantic schema sent to Gemini
222
+ 4. Gemini returns structured JSON matching the schema
223
+ 5. Pydantic validates and returns typed data
224
+ ```
225
+
226
+ The key insight: Gemini's **structured output** mode forces the response to match your Pydantic schema exactly. No post-processing, no regex, no parsing.
227
+
228
+ ---
229
+
230
+ ## Environment Variables
231
+
232
+ | Variable | Required | Default | Description |
233
+ |----------|----------|---------|-------------|
234
+ | `GOOGLE_API_KEY` | Yes | - | Google AI API key |
235
+ | `EDGAR_IDENTITY` | No | `SECAnalyzer/1.0 user@example.com` | SEC EDGAR User-Agent |
236
+ | `GOOGLE_MODEL` | No | `gemini-2.5-flash` | Gemini model ID |
237
+
238
+ ---
239
+
240
+ ## Disclaimer
241
+
242
+ This project is **not affiliated with the SEC, EDGAR, or Google**. Filing data comes from SEC EDGAR (public). LLM extraction may contain errors — always verify critical data against the original filing.
243
+
244
+ This tool is for **research and educational purposes only**. It is not financial advice.
245
+
246
+ ---
247
+
248
+ ## License
249
+
250
+ MIT
@@ -0,0 +1,32 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "sec-analyzer"
7
+ version = "0.1.0"
8
+ description = "Extract structured data from SEC filings using LLM + Pydantic presets"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.10"
12
+ authors = [{ name = "Seongjin" }]
13
+ keywords = ["sec", "edgar", "llm", "pydantic", "structured-data", "finance"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Financial and Insurance Industry",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Topic :: Office/Business :: Financial",
20
+ ]
21
+ dependencies = [
22
+ "edgartools>=3.0",
23
+ "google-genai>=1.0",
24
+ "pydantic>=2.0",
25
+ "python-dotenv>=1.0",
26
+ ]
27
+
28
+ [project.scripts]
29
+ sec-analyzer = "sec_analyzer.cli:main"
30
+
31
+ [project.urls]
32
+ Homepage = "https://github.com/tjdwls101010/SEC-Analyzer"
@@ -0,0 +1,5 @@
1
+ """SEC-Analyzer: Extract structured data from SEC filings using LLM + Pydantic presets."""
2
+
3
+ from .engine import extract, extract_xbrl
4
+
5
+ __all__ = ["extract", "extract_xbrl"]
@@ -0,0 +1,61 @@
1
+ """CLI entry point for sec-analyzer."""
2
+
3
+ import argparse
4
+ import json
5
+ import sys
6
+
7
+
8
+ _PRESET_MAP = {
9
+ "supply-chain": "sec_analyzer.presets.supply_chain:SupplyChain",
10
+ }
11
+
12
+
13
def _load_preset(name: str):
    """Resolve a preset name to the class it is registered under.

    ``_PRESET_MAP`` maps each CLI preset name to a ``"module.path:ClassName"``
    string; the module is imported on demand and the named attribute returned.

    Args:
        name: Preset name as given on the command line (e.g. ``"supply-chain"``).

    Returns:
        The attribute named by the registry entry, looked up on the imported
        module.

    Raises:
        SystemExit: With status 1, after printing the unknown name and the
            list of available presets to stderr, when ``name`` is not registered.
    """
    target = _PRESET_MAP.get(name)
    if target is None:
        # Unknown name: report what was asked for and what exists, then stop.
        print(f"Unknown preset: {name}", file=sys.stderr)
        print(f"Available presets: {', '.join(_PRESET_MAP)}", file=sys.stderr)
        sys.exit(1)

    # Import only once the name is known to be valid.
    from importlib import import_module

    # Split on the last colon: everything before it is the module path.
    module_path, class_name = target.rsplit(":", 1)
    return getattr(import_module(module_path), class_name)
24
+
25
+
26
def main(argv: list[str] | None = None) -> None:
    """Run the ``sec-analyzer`` command-line interface.

    Parses the command line, resolves the requested preset, runs the
    extraction and prints the result to stdout as JSON.

    Args:
        argv: Arguments to parse, without the program name. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, so the console
            script entry point and the ``__main__`` guard, which call
            ``main()`` with no arguments, behave as before. Passing a list
            lets the CLI be driven programmatically.

    Raises:
        SystemExit: With status 1 when the preset is unknown or the
            extraction raises. argparse itself also exits on invalid
            arguments and on ``--help``.
    """
    parser = argparse.ArgumentParser(
        description="Extract structured data from SEC filings"
    )
    parser.add_argument("symbol", help="Ticker symbol (e.g., NVDA, AAPL, TSM)")
    parser.add_argument(
        "--preset", default="supply-chain",
        help=f"Extraction preset ({', '.join(_PRESET_MAP)})",
    )
    parser.add_argument("--form", default="10-K", help="Filing form type (default: 10-K)")
    parser.add_argument("--filing-date", default=None, help="Specific filing date (YYYY-MM-DD)")
    # The flag is spelled --json but stored as ``compact``: it only switches
    # off indentation, the output is JSON either way.
    parser.add_argument("--json", action="store_true", dest="compact", help="Compact JSON output")

    args = parser.parse_args(argv)

    # Exits with status 1 on an unknown preset name.
    preset_cls = _load_preset(args.preset)

    # Imported here rather than at module top, so argument parsing and preset
    # resolution complete before the engine module is loaded.
    from .engine import extract

    try:
        result = extract(
            symbol=args.symbol,
            preset=preset_cls,
            form=args.form,
            filing_date=args.filing_date,
        )
    except Exception as e:
        # Top-level boundary: report any extraction failure as a JSON object
        # on stderr and signal failure through the exit status.
        print(json.dumps({"error": str(e)}), file=sys.stderr)
        sys.exit(1)

    indent = None if args.compact else 2
    # default=str stringifies any value json cannot serialize natively.
    print(json.dumps(result, indent=indent, default=str))
58
+
59
+
60
+ if __name__ == "__main__":
61
+ main()