sec-analyzer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sec_analyzer/__init__.py +5 -0
- sec_analyzer/cli.py +61 -0
- sec_analyzer/engine.py +430 -0
- sec_analyzer/presets/__init__.py +5 -0
- sec_analyzer/presets/supply_chain.py +145 -0
- sec_analyzer-0.1.0.dist-info/METADATA +271 -0
- sec_analyzer-0.1.0.dist-info/RECORD +10 -0
- sec_analyzer-0.1.0.dist-info/WHEEL +4 -0
- sec_analyzer-0.1.0.dist-info/entry_points.txt +2 -0
- sec_analyzer-0.1.0.dist-info/licenses/LICENSE +21 -0
sec_analyzer/__init__.py
ADDED
sec_analyzer/cli.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""CLI entry point for sec-analyzer."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Registry of built-in presets: CLI name -> "module.path:ClassName".
_PRESET_MAP = {
    "supply-chain": "sec_analyzer.presets.supply_chain:SupplyChain",
}


def _load_preset(name: str):
    """Resolve a preset name to the class it is registered under.

    Args:
        name: Key into ``_PRESET_MAP`` (e.g. ``"supply-chain"``).

    Returns:
        The attribute named after the ``:`` in the registry entry, looked up
        on the module named before it.

    Exits:
        Prints the unknown name and the available presets to stderr and
        calls ``sys.exit(1)`` when ``name`` is not registered.
    """
    import importlib

    target = _PRESET_MAP.get(name)
    if target is None:
        print(f"Unknown preset: {name}", file=sys.stderr)
        print(f"Available presets: {', '.join(_PRESET_MAP)}", file=sys.stderr)
        sys.exit(1)

    # Split on the last ":" only, so the module part is kept intact.
    module_path, class_name = target.rsplit(":", 1)
    preset_module = importlib.import_module(module_path)
    return getattr(preset_module, class_name)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def main(argv: list[str] | None = None):
    """Run the sec-analyzer command-line interface.

    Parses the arguments, resolves the preset, runs the extraction and prints
    the result as JSON on stdout.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, which is what the console entry point and
            the ``__main__`` guard rely on; passing a list allows the CLI to
            be driven programmatically.

    Exits:
        Status 1 if the preset is unknown or the extraction raises; in the
        latter case a ``{"error": ...}`` JSON object is written to stderr.
    """
    parser = argparse.ArgumentParser(
        description="Extract structured data from SEC filings"
    )
    parser.add_argument("symbol", help="Ticker symbol (e.g., NVDA, AAPL, TSM)")
    parser.add_argument(
        "--preset", default="supply-chain",
        help=f"Extraction preset ({', '.join(_PRESET_MAP)})",
    )
    parser.add_argument("--form", default="10-K", help="Filing form type (default: 10-K)")
    parser.add_argument("--filing-date", default=None, help="Specific filing date (YYYY-MM-DD)")
    parser.add_argument("--json", action="store_true", dest="compact", help="Compact JSON output")

    args = parser.parse_args(argv)

    preset_cls = _load_preset(args.preset)

    # Imported here rather than at module top, so argument parsing and preset
    # lookup run before the engine module is loaded.
    from .engine import extract

    try:
        result = extract(
            symbol=args.symbol,
            preset=preset_cls,
            form=args.form,
            filing_date=args.filing_date,
        )
    except Exception as e:
        # Top-level boundary: report any failure as machine-readable JSON.
        print(json.dumps({"error": str(e)}), file=sys.stderr)
        sys.exit(1)

    # --json selects single-line output; the default is pretty-printed.
    indent = None if args.compact else 2
    print(json.dumps(result, indent=indent, default=str))


if __name__ == "__main__":
    main()
|
sec_analyzer/engine.py
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
"""Core extraction engine: edgartools filing load + Gemini structured output."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import sys
|
|
8
|
+
import time
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from pydantic import BaseModel
|
|
13
|
+
|
|
14
|
+
_MAX_MARKDOWN_CHARS = 2_000_000
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _init_edgar(identity: str | None = None):
    """Hand an identity string to edgartools' ``set_identity``.

    Args:
        identity: Identity to register. When falsy, the ``EDGAR_IDENTITY``
            environment variable is used, and if that is unset the built-in
            placeholder ``"SECAnalyzer/1.0 user@example.com"``.
    """
    from edgar import set_identity

    if not identity:
        identity = os.environ.get(
            "EDGAR_IDENTITY", "SECAnalyzer/1.0 user@example.com"
        )
    set_identity(identity)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _get_filing(symbol: str, form: str = "10-K", filing_date: str | None = None):
    """Look up a filing for ``symbol`` via edgartools, retrying on failure.

    When ``form`` is ``"10-K"`` and the company has none, 20-F filings are
    tried instead.

    Args:
        symbol: Ticker symbol passed to ``edgar.Company``.
        form: Filing form type to request.
        filing_date: Exact filing date (``YYYY-MM-DD``) to select. When no
            filing has that date, the first filing in the result is used and
            a warning is written to stderr; the returned metadata carries the
            date actually selected.

    Returns:
        tuple: (filing, metadata_dict, company_name)

    Raises:
        ValueError: No filing of the requested form (or the 20-F fallback)
            exists for the company. Not retried.
        RuntimeError: Every attempt failed with another exception; the last
            one is attached as ``__cause__``.
    """
    from edgar import Company

    _init_edgar()
    company = Company(symbol)

    retries = 3
    last_error: Exception | None = None
    for attempt in range(1, retries + 1):
        try:
            # Kept separate from `form` so the error message can still name
            # what the caller asked for after the fallback.
            resolved_form = form
            filings = company.get_filings(form=resolved_form)
            if len(filings) == 0 and form == "10-K":
                resolved_form = "20-F"
                filings = company.get_filings(form=resolved_form)
            if len(filings) == 0:
                tried = form if resolved_form == form else f"{form} or {resolved_form}"
                raise ValueError(f"No {tried} filing found for {symbol}")

            filing = filings[0]
            if filing_date:
                match = next(
                    (f for f in filings if str(f.filing_date) == filing_date), None
                )
                if match is not None:
                    filing = match
                else:
                    # Keep the original best-effort fallback, but make the
                    # substitution visible instead of silent.
                    print(
                        f"[sec-analyzer] No {resolved_form} filing dated {filing_date} "
                        f"for {symbol}; using {filing.filing_date} instead",
                        file=sys.stderr,
                    )

            metadata = {
                "form": resolved_form,
                "filing_date": str(filing.filing_date),
                "accession_number": filing.accession_number,
                "filing_url": filing.filing_url,
            }
            return filing, metadata, company.name
        except ValueError:
            # "Nothing found" is a definitive answer, not a transient fault.
            raise
        except Exception as e:
            last_error = e
            if attempt < retries:
                time.sleep(2**attempt)  # exponential backoff: 2s, 4s

    raise RuntimeError(
        f"edgartools failed after {retries} attempts: {last_error}"
    ) from last_error
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _get_markdown(filing, max_chars: int = _MAX_MARKDOWN_CHARS) -> str:
    """Render a filing as markdown, capped at ``max_chars`` characters.

    Args:
        filing: Object exposing a ``markdown()`` method.
        max_chars: Length above which the text is cut to its first
            ``max_chars`` characters.

    Returns:
        The markdown text, truncated by plain slicing when it is too long.
    """
    text = filing.markdown()
    return text[:max_chars] if len(text) > max_chars else text
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _build_default_prompt(preset_cls: type[BaseModel], company_name: str) -> str:
    """Build a default extraction prompt template from a preset's JSON schema.

    Each top-level schema property becomes one ``- name: description`` line;
    the property name stands in when no description is present.

    Args:
        preset_cls: Class exposing ``model_json_schema()``.
        company_name: Name of the filing company to embed in the prompt.

    Returns:
        A template containing exactly one placeholder, ``{filing_text}``,
        meant to be completed with ``str.format(filing_text=...)``.
    """

    def _escape(text: str) -> str:
        # The result is fed to str.format by the caller, so braces coming
        # from data must be doubled or they are parsed as placeholders.
        return str(text).replace("{", "{{").replace("}", "}}")

    schema = preset_cls.model_json_schema()
    fields_desc = "\n".join(
        f"- {name}: {prop.get('description', name)}"
        for name, prop in schema.get("properties", {}).items()
    )

    return f"""\
You are a financial analyst extracting structured data from SEC filings.
Extract entities and data exactly as stated in the filing text.

Filing company: {_escape(company_name)}

Extract the following fields:
{_escape(fields_desc)}

Rules:
1. Use exact names and figures from the filing — do not paraphrase or invent.
2. For context fields, copy 1-3 relevant sentences verbatim from the filing.
3. If a field has no relevant data, return an empty list or null.

Extract from this SEC filing text:

{{filing_text}}
"""
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _extract_with_llm(
    filing_text: str,
    preset_cls: type[BaseModel],
    company_name: str = "",
    api_key: str | None = None,
    model: str | None = None,
) -> BaseModel | None:
    """Ask Gemini for JSON matching ``preset_cls`` and validate the reply.

    Args:
        filing_text: Filing content substituted into the prompt.
        preset_cls: Schema class. Its ``__prompt__`` attribute, when truthy,
            is used as the prompt template (``{company_name}`` and
            ``{filing_text}`` placeholders); otherwise a default prompt is
            generated from the schema.
        company_name: Filing company; ``"Unknown"`` is used when empty.
        api_key: Google API key; falls back to ``GOOGLE_API_KEY``.
        model: Model ID; falls back to ``GOOGLE_MODEL``, then
            ``"gemini-2.5-flash"``.

    Returns:
        A validated ``preset_cls`` instance, or None once all three attempts
        have failed (each failure is logged to stderr).

    Raises:
        ValueError: No API key was supplied or found in the environment.
    """
    from google import genai

    api_key = api_key or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError(
            "GOOGLE_API_KEY not set. Pass api_key parameter or set the environment variable."
        )

    model_id = model or os.environ.get("GOOGLE_MODEL", "gemini-2.5-flash")
    issuer = company_name or "Unknown"

    # A preset-supplied template wins over the schema-derived default.
    preset_template = getattr(preset_cls, "__prompt__", None)
    if preset_template:
        prompt = preset_template.format(company_name=issuer, filing_text=filing_text)
    else:
        prompt = _build_default_prompt(preset_cls, issuer).format(
            filing_text=filing_text
        )

    gen_config = {
        "response_mime_type": "application/json",
        "response_json_schema": preset_cls.model_json_schema(),
        "temperature": 0.1,
    }

    # NOTE(review): thinking_level is sent for whichever model is selected,
    # including the default one - confirm every target model accepts it.
    level = os.environ.get("GOOGLE_THINKING_LEVEL", "low")
    if level and level.lower() in ("low", "medium", "high", "minimal"):
        from google.genai import types

        gen_config["thinking_config"] = types.ThinkingConfig(
            thinking_level=level.lower()
        )

    client = genai.Client(api_key=api_key)
    retries = 3

    for attempt in range(1, retries + 1):
        try:
            reply = client.models.generate_content(
                model=model_id,
                contents=prompt,
                config=gen_config,
            )
            return preset_cls.model_validate_json(reply.text)
        except Exception as e:
            # API errors and validation errors are both logged and retried.
            print(
                f"[sec-analyzer] LLM attempt {attempt}/{retries} failed: {e}",
                file=sys.stderr,
            )
            if attempt < retries:
                time.sleep(2**attempt)

    return None
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def extract(
    symbol: str,
    preset: type[BaseModel],
    form: str = "10-K",
    filing_date: str | None = None,
    max_chars: int = _MAX_MARKDOWN_CHARS,
    api_key: str | None = None,
    model: str | None = None,
) -> dict:
    """Run the full pipeline: find a filing, render it, extract with the LLM.

    Args:
        symbol: Ticker symbol (e.g., "NVDA", "AAPL", "TSM").
        preset: Pydantic BaseModel class defining the extraction schema. It
            may carry a `__prompt__` class variable holding a custom prompt
            with {company_name} and {filing_text} placeholders.
        form: Filing form type ("10-K", "10-Q", "20-F", "DEF 14A", etc.).
            A 10-K request falls back to 20-F when no 10-K exists.
        filing_date: Specific filing date (YYYY-MM-DD). None for latest.
        max_chars: Maximum filing markdown length.
        api_key: Google API key. Falls back to GOOGLE_API_KEY env var.
        model: Gemini model ID. Falls back to GOOGLE_MODEL env var.

    Returns:
        dict with "filing" (metadata) and "data" (extracted fields).

    Raises:
        RuntimeError: The LLM step produced no result.
    """
    from dotenv import load_dotenv

    # Load a local .env first so the environment lookups below can see it.
    load_dotenv()

    filing, metadata, company_name = _get_filing(symbol, form, filing_date)
    filing_markdown = _get_markdown(filing, max_chars)

    extracted = _extract_with_llm(
        filing_text=filing_markdown,
        preset_cls=preset,
        company_name=company_name,
        api_key=api_key,
        model=model,
    )
    if extracted is None:
        raise RuntimeError(f"LLM extraction failed for {symbol} ({metadata['form']})")

    return {"filing": metadata, "data": extracted.model_dump()}
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
# ---------------------------------------------------------------------------
|
|
231
|
+
# XBRL structured data extraction
|
|
232
|
+
# ---------------------------------------------------------------------------
|
|
233
|
+
|
|
234
|
+
# Lower-cased keywords marking a "customer" axis member as a geography.
_XBRL_GEO_KEYWORDS = (
    "based end customers", "region", "country", "americas", "europe",
    "asia", "pacific", "united states", "china", "japan",
)

# Concept-name fragments that disqualify a fact from being a revenue amount.
_XBRL_NON_REVENUE = "TextBlock|Policy|Description|Percentage|Cost"

_XBRL_COUNTRY_NAMES = {
    "US": "United States", "CN": "China", "JP": "Japan",
    "TW": "Taiwan", "KR": "South Korea", "DE": "Germany",
    "GB": "United Kingdom", "IN": "India",
}

# Concept local name -> reported inventory category.
_XBRL_INVENTORY_CONCEPTS = {
    "InventoryRawMaterialsAndSupplies": "raw_materials",
    "InventoryRawMaterials": "raw_materials",
    "InventoryWorkInProcess": "work_in_progress",
    "InventoryFinishedGoods": "finished_goods",
    "InventoryFinishedGoodsAndWorkInProcess": "finished_goods",
}

# Concept-name fragment -> timeframe label, checked in this order.
_XBRL_PO_TIMEFRAMES = (
    ("BalanceSheetAmount", "total"),
    ("FirstAnniversary", "year 1"), ("SecondAnniversary", "year 2"),
    ("ThirdAnniversary", "year 3"), ("FourthAnniversary", "year 4"),
    ("FifthAnniversary", "year 5"), ("AfterFiveYears", "after year 5"),
)


def _xbrl_to_float(value) -> float | None:
    """Parse a fact value; return None when it is unparseable or NaN."""
    try:
        number = float(value)
    except (ValueError, TypeError):
        return None
    # NaN is the only float unequal to itself; it must not reach the output.
    return None if number != number else number


def _xbrl_format_usd(amount: float) -> str:
    """Format an amount as '$1.2B' (>= 1e9) or '$345M' (anything smaller)."""
    return f"${amount/1e9:.1f}B" if amount >= 1e9 else f"${amount/1e6:.0f}M"


def _xbrl_member_label(member) -> str:
    """Turn an axis member like 'x:CustomerAMember' into 'Customer A'."""
    label = str(member).split(":")[-1].replace("Member", "")
    return re.sub(r"([a-z])([A-Z])", r"\1 \2", label)


def _xbrl_latest_value(df, local_name: str) -> float | None:
    """Return the latest-period value of the concept named exactly ``local_name``.

    The match is anchored to the whole local name (after an optional
    ``prefix:`` or ``prefix_``), so longer concept names that merely start
    with ``local_name`` are not picked up.
    """
    pattern = rf"(?:^|[:_]){re.escape(local_name)}$"
    rows = df[df["concept"].astype(str).str.contains(pattern, case=False, na=False)]
    if len(rows) == 0:
        return None
    if "end_date" in rows.columns:
        dates = rows["end_date"].dropna()
        if len(dates) > 0:
            rows = rows[rows["end_date"] == dates.max()]
    for value in rows["value"]:
        number = _xbrl_to_float(value)
        if number is not None:
            return number
    return None


def _xbrl_revenue_concentration(df) -> list[dict]:
    """Collect per-customer revenue percentages for the latest period."""
    conc = df[df["concept"].astype(str).str.contains(
        "ConcentrationRiskPercentage", case=False, na=False)]
    if len(conc) == 0:
        return []

    benchmark_col = "us-gaap:ConcentrationRiskByBenchmarkAxis"
    if benchmark_col in conc.columns:
        conc = conc[conc[benchmark_col].astype(str).str.contains(
            "Revenue", case=False, na=False)]
    if "end_date" in conc.columns and len(conc) > 0:
        conc = conc[conc["end_date"] == conc["end_date"].max()]

    entries = []
    seen = set()
    for _, row in conc.iterrows():
        customer = str(row.get("srt:MajorCustomersAxis", ""))
        if not customer or customer in ("nan", "None"):
            continue
        name = _xbrl_member_label(customer)
        lowered = name.lower()
        # Geographic members on the customer axis are not customers.
        if any(kw in lowered for kw in _XBRL_GEO_KEYWORDS):
            continue
        if name in seen:
            continue
        seen.add(name)
        fraction = _xbrl_to_float(row["value"])
        # The fact value is multiplied by 100 to express it as a percentage.
        pct = round(fraction * 100, 2) if fraction is not None else None
        entries.append({"entity": name, "revenue_pct": pct,
                        "source": "xbrl", "end_date": str(row.get("end_date", ""))})
    return entries


def _xbrl_geographic_revenue(df) -> list[dict]:
    """Collect revenue by region for the latest period, with share of total."""
    import pandas as pd

    geo_col = "srt:StatementGeographicalAxis"
    if geo_col not in df.columns:
        return []

    concept = df["concept"].astype(str)
    not_revenue = concept.str.contains(_XBRL_NON_REVENUE, case=False, na=False)
    geo = df[df[geo_col].notna()
             & concept.str.contains("Revenue", case=False, na=False)
             & ~not_revenue]
    if len(geo) == 0:
        return []

    # Total revenue = largest latest-period fact without a geographic member.
    total_rev = None
    for pat in ("RevenueFromContractWithCustomer", r"^us-gaap:Revenues$"):
        mask = (concept.str.contains(pat, case=False, na=False)
                & ~not_revenue & df[geo_col].isna())
        if "period_type" in df.columns:
            mask &= df["period_type"] == "duration"
        rows = df[mask].copy()
        if len(rows) == 0:
            continue
        try:
            rows["_val"] = pd.to_numeric(rows["value"], errors="coerce")
            rows = rows.dropna(subset=["_val"])
            if len(rows) > 0:
                latest = rows[rows["end_date"] == rows["end_date"].max()]
                total_rev = float(latest["_val"].max())
                break
        except Exception:
            # Best effort: without a total, percentages are reported as None.
            continue

    entries = []
    seen = set()
    for _, row in geo.iterrows():
        region = _xbrl_member_label(row[geo_col])
        region = _XBRL_COUNTRY_NAMES.get(region, region)
        amount = _xbrl_to_float(row["value"])
        if amount is None:
            continue
        end_date = str(row.get("end_date", ""))
        key = f"{region}_{end_date}"
        if key in seen:
            continue
        seen.add(key)
        pct = round(amount / total_rev * 100, 2) if total_rev else None
        entries.append({"region": region, "revenue_pct": pct,
                        "revenue_amount": _xbrl_format_usd(amount),
                        "source": "xbrl", "end_date": end_date})
    if not entries:
        return []

    # Keep only the most recent period, preserving fact order within it.
    latest_period = max(e["end_date"] for e in entries)
    return [e for e in entries if e["end_date"] == latest_period]


def _xbrl_inventory_composition(df) -> list[dict]:
    """Collect inventory components and their share of total inventory."""
    inv_total = _xbrl_latest_value(df, "InventoryNet")

    components = []
    for local_name, category in _XBRL_INVENTORY_CONCEPTS.items():
        amount = _xbrl_latest_value(df, local_name)
        if amount is not None:
            components.append((category, amount))
    if not components:
        return []

    comp_total = sum(amount for _, amount in components)
    # Trust the reported total unless the components overshoot it by > 5%.
    denom = inv_total if inv_total and comp_total <= inv_total * 1.05 else comp_total
    return [
        {"category": category, "amount": _xbrl_format_usd(amount), "source": "xbrl",
         "pct_of_total": round(amount / denom * 100, 2) if denom else None}
        for category, amount in components
    ]


def _xbrl_purchase_obligations(df) -> list[dict]:
    """Collect unconditional purchase obligation amounts with timeframes."""
    concept = df["concept"].astype(str)
    po_rows = df[
        concept.str.contains(
            "UnrecordedUnconditionalPurchaseObligation", case=False, na=False)
        & ~concept.str.contains("TextBlock|Policy", case=False, na=False)
    ]

    entries = []
    for _, row in po_rows.iterrows():
        amount = _xbrl_to_float(row["value"])
        if amount is None:
            continue
        concept_name = str(row["concept"])
        timeframe = next(
            (tf for label, tf in _XBRL_PO_TIMEFRAMES if label in concept_name), "")
        entries.append({"obligation_type": "unconditional purchase obligation",
                        "amount": _xbrl_format_usd(amount),
                        "timeframe": timeframe, "source": "xbrl"})
    return entries


def extract_xbrl(symbol: str, form: str = "10-K") -> dict:
    """Extract structured quantitative data from a filing's XBRL facts.

    Up to four categories are produced:
    - revenue_concentration: Customer revenue % (ConcentrationRiskPercentage)
    - geographic_revenue: Revenue by country/region
    - inventory_composition: Raw materials / WIP / finished goods
    - purchase_obligations: Unconditional purchase commitments

    Args:
        symbol: Ticker symbol.
        form: Filing form type.

    Returns:
        dict with "filing" metadata, "data" holding the categories that
        produced at least one entry, and "xbrl_available". When the XBRL
        facts cannot be loaded, "data" is empty and "xbrl_available" is
        False.
    """
    from dotenv import load_dotenv
    load_dotenv()

    filing, metadata, _ = _get_filing(symbol, form)

    try:
        xbrl = filing.xbrl()
        # xbrl may be None; the AttributeError is then caught below.
        df = xbrl.instance.facts.reset_index()
    except Exception:
        # Deliberate best effort: any failure to load facts is reported as
        # "no XBRL available" rather than raised.
        return {"filing": metadata, "data": {}, "xbrl_available": False}

    extractors = {
        "revenue_concentration": _xbrl_revenue_concentration,
        "geographic_revenue": _xbrl_geographic_revenue,
        "inventory_composition": _xbrl_inventory_composition,
        "purchase_obligations": _xbrl_purchase_obligations,
    }
    supplements = {}
    for category, collect in extractors.items():
        entries = collect(df)
        if entries:  # empty categories are omitted
            supplements[category] = entries

    return {
        "filing": metadata,
        "data": supplements,
        "xbrl_available": True,
    }
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Supply chain intelligence preset for SEC filings.
|
|
2
|
+
|
|
3
|
+
Extracts suppliers, customers, single-source dependencies, geographic
|
|
4
|
+
concentration, capacity constraints, supply chain risks, revenue concentration,
|
|
5
|
+
geographic revenue, purchase obligations, market risk disclosures, and
|
|
6
|
+
inventory composition from 10-K/10-Q/20-F filings.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import ClassVar, Literal
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, Field
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Element type of SupplyChain.suppliers. Only `entity` is required; the
# other fields default to "". Field descriptions are emitted in the model's
# JSON schema.
class SupplierEntry(BaseModel):
    entity: str = Field(description="Name of the supplier company. Use exact name from the filing.")
    relationship: str = Field(default="", description="Nature of the supply relationship (e.g., 'sole source supplier', 'key component vendor').")
    context: str = Field(default="", description="Brief relevant excerpt (1-3 sentences) from the filing that supports this supplier relationship.")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Element type of SupplyChain.customers. Only `entity` is required; the
# other fields default to "".
class CustomerEntry(BaseModel):
    entity: str = Field(description="Name of the customer company. Use exact name from the filing.")
    relationship: str = Field(default="", description="Nature of the customer relationship (e.g., 'major customer', 'accounted for 35% of revenue').")
    context: str = Field(default="", description="Brief relevant excerpt (1-3 sentences) from the filing that supports this customer relationship.")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Element type of SupplyChain.single_source_dependencies. `supplier` is the
# only required field, although `component` is declared first.
class SingleSourceEntry(BaseModel):
    component: str = Field(default="", description="Component or material with single-source dependency (e.g., 'DRAM memory chips').")
    supplier: str = Field(description="Name of the sole-source or single-source supplier.")
    context: str = Field(default="", description="Brief relevant excerpt (1-3 sentences) from the filing describing this dependency.")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Element type of SupplyChain.geographic_concentration. Only `location` is
# required.
class GeographicEntry(BaseModel):
    location: str = Field(description="Country or region name (e.g., 'Taiwan', 'South Korea').")
    activity: str = Field(default="", description="Type of activity at this location (e.g., 'manufacturing', 'assembly').")
    context: str = Field(default="", description="Brief relevant excerpt (1-3 sentences) from the filing describing geographic concentration.")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Element type of SupplyChain.capacity_constraints. `constraint` is required.
class CapacityConstraintEntry(BaseModel):
    constraint: str = Field(description="Type of capacity constraint (e.g., 'extended lead times', 'production capacity limitation').")
    context: str = Field(default="", description="Brief relevant excerpt (1-3 sentences) from the filing describing the constraint.")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# Element type of SupplyChain.supply_chain_risks. `risk` is required.
class SupplyChainRiskEntry(BaseModel):
    risk: str = Field(description="Type of supply chain risk (e.g., 'tariff impact', 'raw material shortage').")
    context: str = Field(default="", description="Brief relevant excerpt (1-3 sentences) from the filing describing this risk.")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# Element type of SupplyChain.revenue_concentration. `revenue_pct` is a
# percentage figure (the description's example is 35.2) and may be None;
# `revenue_amount` is free text, not a number.
class RevenueConcentrationEntry(BaseModel):
    entity: str = Field(description="Customer or segment name from filing.")
    revenue_pct: float | None = Field(default=None, description="% of total revenue (e.g., 35.2).")
    revenue_amount: str = Field(default="", description="Amount if disclosed (e.g., '$5.2 billion').")
    context: str = Field(default="", description="1-3 sentences from Notes.")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Element type of SupplyChain.geographic_revenue. Only `region` is required;
# `revenue_pct` may be None and `revenue_amount` is free text.
class GeographicRevenueEntry(BaseModel):
    region: str = Field(description="Country or region (e.g., 'United States', 'China').")
    revenue_pct: float | None = Field(default=None, description="% of total revenue.")
    revenue_amount: str = Field(default="", description="Amount if disclosed.")
    context: str = Field(default="", description="1-3 sentences from Notes.")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# Element type of SupplyChain.purchase_obligations. `obligation_type` is the
# only required field; `amount` and `timeframe` are free text.
class PurchaseObligationEntry(BaseModel):
    counterparty: str = Field(default="", description="Supplier name if disclosed.")
    obligation_type: str = Field(description="Type (e.g., 'inventory commitment', 'capacity reservation').")
    amount: str = Field(default="", description="Dollar amount (e.g., '$2.5 billion').")
    timeframe: str = Field(default="", description="Duration (e.g., 'through fiscal 2027').")
    context: str = Field(default="", description="1-3 sentences from Notes.")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# Element type of SupplyChain.market_risk_disclosures. `risk_type` is
# required and restricted to the three literal values below.
class MarketRiskEntry(BaseModel):
    risk_type: Literal["commodity", "fx", "interest_rate"] = Field(description="Type: 'commodity', 'fx', or 'interest_rate'.")
    exposure: str = Field(default="", description="Specific exposure (e.g., 'gold price', 'EUR/USD').")
    sensitivity: str = Field(default="", description="Quantitative impact if disclosed (e.g., '10% increase = $50M COGS impact').")
    hedging: str = Field(default="", description="Hedging strategy if disclosed.")
    context: str = Field(default="", description="1-3 sentences from filing.")
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Element type of SupplyChain.inventory_composition. `category` is required
# and restricted to the three literal values below; `pct_of_total` may be
# None.
class InventoryCompositionEntry(BaseModel):
    category: Literal["raw_materials", "work_in_progress", "finished_goods"] = Field(description="Category: 'raw_materials', 'work_in_progress', or 'finished_goods'.")
    amount: str = Field(default="", description="Dollar amount if disclosed (e.g., '$1.2 billion').")
    pct_of_total: float | None = Field(default=None, description="% of total inventory.")
    context: str = Field(default="", description="1-3 sentences from Notes (valuation, aging, obsolescence).")
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class SupplyChain(BaseModel):
    """Supply chain intelligence extraction from SEC filings."""

    # Prompt template with two str.format placeholders, {company_name} and
    # {filing_text}. It is a ClassVar, so it is not a model field. Any other
    # literal brace added to this text would have to be doubled.
    __prompt__: ClassVar[str] = """\
You are a financial analyst extracting supply chain intelligence from SEC filings.
Extract entities and relationships exactly as stated in the filing text.

The filing text below is the complete SEC filing in markdown format.
Identify relevant sections by their natural headings:
- Item 1 (Business): suppliers, customers, single-source dependencies
- Item 1A (Risk Factors): supply chain risks, geographic concentration
- Item 7 (MD&A): capacity constraints, operating discussion
- Item 7A (Market Risk): commodity/FX/interest rate exposures
- Item 8 Notes: revenue segments, inventory, commitments, concentration
For 20-F filings, look for equivalent items (Item 4, 3D, 5, 11, 18/19).

Rules:
1. Use exact company names from the filing — do not paraphrase or invent.
2. For context fields, copy 1-3 relevant sentences verbatim from the filing.
3. If a category has no relevant data, return an empty list.
4. Do NOT extract the filing company itself as its own supplier or customer.
5. Focus on factual supply chain relationships — skip generic boilerplate.
6. De facto single-source: If a supplier is described as the PRIMARY or ONLY provider
for a critical component category and NO alternative supplier is mentioned for that
same category, classify it as single_source_dependencies.
7. Customer extraction: Look for revenue concentration disclosures, named buyers,
and "accounted for X% of revenue" language.
8. Relationship specificity: Use precise descriptions such as "sole foundry for
leading-edge GPUs", "memory supplier (HBM)", "anchor customer >10% revenue".
9. Only infer relationships where the filing text provides contextual support.
10. Revenue concentration: From Notes, extract customers/segments with specific % of
revenue. Include exact percentage as revenue_pct.
11. Geographic revenue: From Notes, extract revenue by country/region with exact
percentages. Use standardized country names.
12. Purchase obligations: From Notes (Commitments and Contingencies), extract purchase
commitments, capacity reservations, take-or-pay contracts.
13. Market risk disclosures: From Item 7A, extract commodity/FX/interest rate exposures.
Classify risk_type as "commodity", "fx", or "interest_rate".
14. Inventory composition: From Notes, extract raw materials, work-in-progress, and
finished goods amounts and percentages.

Filing company: {company_name}

Extract all supply chain entities from this SEC filing text:

{filing_text}
"""

    # Every category defaults to an empty list, so a reply that omits one
    # still validates.
    suppliers: list[SupplierEntry] = Field(default_factory=list, description="Companies that supply products, materials, or services to the filing company.")
    customers: list[CustomerEntry] = Field(default_factory=list, description="Companies that purchase products or services from the filing company.")
    single_source_dependencies: list[SingleSourceEntry] = Field(default_factory=list, description="Components with sole-source or single-source supplier dependencies.")
    geographic_concentration: list[GeographicEntry] = Field(default_factory=list, description="Locations where manufacturing, production, or sourcing is concentrated.")
    capacity_constraints: list[CapacityConstraintEntry] = Field(default_factory=list, description="Production capacity limitations, extended lead times, or backlogs.")
    supply_chain_risks: list[SupplyChainRiskEntry] = Field(default_factory=list, description="Supply disruption risks including tariffs, shortages, geopolitical risks.")
    revenue_concentration: list[RevenueConcentrationEntry] = Field(default_factory=list, description="Customer/segment revenue concentration from Notes.")
    geographic_revenue: list[GeographicRevenueEntry] = Field(default_factory=list, description="Revenue breakdown by country/region from Notes.")
    purchase_obligations: list[PurchaseObligationEntry] = Field(default_factory=list, description="Purchase commitments, capacity reservations from Notes.")
    market_risk_disclosures: list[MarketRiskEntry] = Field(default_factory=list, description="Market risk exposures from Item 7A.")
    inventory_composition: list[InventoryCompositionEntry] = Field(default_factory=list, description="Inventory breakdown from Notes.")
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sec-analyzer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extract structured data from SEC filings using LLM + Pydantic presets
|
|
5
|
+
Project-URL: Homepage, https://github.com/tjdwls101010/SEC-Analyzer
|
|
6
|
+
Author: Seongjin
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Keywords: edgar,finance,llm,pydantic,sec,structured-data
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Office/Business :: Financial
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Requires-Dist: edgartools>=3.0
|
|
17
|
+
Requires-Dist: google-genai>=1.0
|
|
18
|
+
Requires-Dist: pydantic>=2.0
|
|
19
|
+
Requires-Dist: python-dotenv>=1.0
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
<div align="center">
|
|
23
|
+
|
|
24
|
+
<img src="https://i.namu.wiki/i/HbVpHEsWi0aG30L2PEWRL9FEA0P7Vf-iLYm0QPbH1iOGJabk3vYcDQz1Uxo1DX3OaujOJWX62rs6QgqXFOybLw.svg" width="120" alt="SEC">
|
|
25
|
+
|
|
26
|
+
# SEC-Analyzer
|
|
27
|
+
|
|
28
|
+
**Extract structured data from SEC filings using LLM + Pydantic presets.**
|
|
29
|
+
|
|
30
|
+
Turn any SEC filing (10-K, 10-Q, 20-F, DEF 14A, ...) into structured JSON — define a Pydantic model, and the library does the rest.
|
|
31
|
+
|
|
32
|
+
[](#)
|
|
33
|
+
[](#)
|
|
34
|
+
|
|
35
|
+
[Installation](#installation) · [Quick Start](#quick-start) · [Custom Presets](#custom-presets) · [API Reference](#api-reference) · [CLI](#cli)
|
|
36
|
+
|
|
37
|
+
</div>
|
|
38
|
+
|
|
39
|
+

|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Why This Library?
|
|
44
|
+
|
|
45
|
+
SEC filings contain invaluable data — supply chains, revenue concentration, executive compensation, risk factors — but every filing has a different format. Traditional parsing breaks constantly.
|
|
46
|
+
|
|
47
|
+
This library uses **LLM structured output** (Gemini) to extract exactly the data you define in a **Pydantic model**. The LLM reads the filing and fills in your schema. No regex, no HTML parsing, no breakage.
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from sec_analyzer import extract
|
|
51
|
+
from sec_analyzer.presets import SupplyChain
|
|
52
|
+
|
|
53
|
+
result = extract("NVDA", preset=SupplyChain)
|
|
54
|
+
print(result["data"]["suppliers"])
|
|
55
|
+
# [{'entity': 'Taiwan Semiconductor Manufacturing Company Limited',
|
|
56
|
+
# 'relationship': 'foundry for semiconductor wafers',
|
|
57
|
+
# 'context': 'We utilize foundries, such as TSMC and Samsung...'}, ...]
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Installation
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install sec-analyzer
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Requires Python 3.10+ and a [Google AI API key](https://ai.google.dev/).
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## Quick Start
|
|
73
|
+
|
|
74
|
+
### 1. Set your API key
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
export GOOGLE_API_KEY="your-key-here"
|
|
78
|
+
export EDGAR_IDENTITY="YourApp/1.0 your@email.com"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Or create a `.env` file:
|
|
82
|
+
```
|
|
83
|
+
GOOGLE_API_KEY=your-key-here
|
|
84
|
+
EDGAR_IDENTITY=YourApp/1.0 your@email.com
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### 2. Extract data
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from sec_analyzer import extract
|
|
91
|
+
from sec_analyzer.presets import SupplyChain
|
|
92
|
+
|
|
93
|
+
# Latest 10-K
|
|
94
|
+
result = extract("NVDA", preset=SupplyChain)
|
|
95
|
+
|
|
96
|
+
# Specific form
|
|
97
|
+
result = extract("TSM", preset=SupplyChain, form="20-F")
|
|
98
|
+
|
|
99
|
+
# Specific filing date
|
|
100
|
+
result = extract("AAPL", preset=SupplyChain, filing_date="2025-10-30")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### 3. Use the result
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
filing = result["filing"]
|
|
107
|
+
# {'form': '10-K', 'filing_date': '2026-02-25', 'accession_number': '...', 'filing_url': '...'}
|
|
108
|
+
|
|
109
|
+
data = result["data"]
|
|
110
|
+
print(f"Suppliers: {len(data['suppliers'])}")
|
|
111
|
+
print(f"Customers: {len(data['customers'])}")
|
|
112
|
+
print(f"Single-source deps: {len(data['single_source_dependencies'])}")
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
## Custom Presets
|
|
118
|
+
|
|
119
|
+
The real power: **define your own Pydantic model** to extract anything.
|
|
120
|
+
|
|
121
|
+
### Basic custom preset
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from pydantic import BaseModel, Field
|
|
125
|
+
from sec_analyzer import extract
|
|
126
|
+
|
|
127
|
+
class RiskFactors(BaseModel):
|
|
128
|
+
regulatory_risks: list[dict] = Field(
|
|
129
|
+
default_factory=list,
|
|
130
|
+
description="Government regulations that could impact the business"
|
|
131
|
+
)
|
|
132
|
+
litigation: list[dict] = Field(
|
|
133
|
+
default_factory=list,
|
|
134
|
+
description="Pending lawsuits and legal proceedings"
|
|
135
|
+
)
|
|
136
|
+
cybersecurity_risks: list[dict] = Field(
|
|
137
|
+
default_factory=list,
|
|
138
|
+
description="Data breach and cybersecurity threats"
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
result = extract("META", preset=RiskFactors)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
When no `__prompt__` is defined, the library auto-generates a prompt from your field descriptions.
|
|
145
|
+
|
|
146
|
+
### Advanced: custom prompt
|
|
147
|
+
|
|
148
|
+
For expert-level control, add a `__prompt__` class variable:
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from typing import ClassVar
|
|
152
|
+
from pydantic import BaseModel, Field
from sec_analyzer import extract
|
|
153
|
+
|
|
154
|
+
class ExecutiveComp(BaseModel):
|
|
155
|
+
__prompt__: ClassVar[str] = """\
|
|
156
|
+
You are analyzing a DEF 14A proxy statement for {company_name}.
|
|
157
|
+
Extract executive compensation data from the Summary Compensation Table
|
|
158
|
+
and related disclosure sections.
|
|
159
|
+
|
|
160
|
+
Rules:
|
|
161
|
+
1. Include only Named Executive Officers (NEOs)
|
|
162
|
+
2. All dollar amounts in exact figures from the filing
|
|
163
|
+
3. Include stock awards, option awards, and non-equity incentive plan separately
|
|
164
|
+
|
|
165
|
+
Filing text:
|
|
166
|
+
{filing_text}
|
|
167
|
+
"""
|
|
168
|
+
|
|
169
|
+
executives: list[dict] = Field(description="NEO compensation details")
|
|
170
|
+
equity_awards: list[dict] = Field(description="Stock and option grant details")
|
|
171
|
+
|
|
172
|
+
result = extract("AAPL", preset=ExecutiveComp, form="DEF 14A")
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
The `{company_name}` and `{filing_text}` placeholders are filled automatically.
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
## Built-in Presets
|
|
180
|
+
|
|
181
|
+
### `SupplyChain`
|
|
182
|
+
|
|
183
|
+
Extracts 11 categories of supply chain intelligence from 10-K/10-Q/20-F filings:
|
|
184
|
+
|
|
185
|
+
| Category | Description |
|
|
186
|
+
|----------|-------------|
|
|
187
|
+
| `suppliers` | Companies supplying products/materials/services |
|
|
188
|
+
| `customers` | Companies purchasing products/services |
|
|
189
|
+
| `single_source_dependencies` | Components with sole-source suppliers |
|
|
190
|
+
| `geographic_concentration` | Manufacturing/sourcing location concentration |
|
|
191
|
+
| `capacity_constraints` | Production limitations and lead times |
|
|
192
|
+
| `supply_chain_risks` | Disruption risks (tariffs, shortages, geopolitical) |
|
|
193
|
+
| `revenue_concentration` | Customer/segment revenue % from Notes |
|
|
194
|
+
| `geographic_revenue` | Revenue by country/region from Notes |
|
|
195
|
+
| `purchase_obligations` | Commitments and take-or-pay contracts |
|
|
196
|
+
| `market_risk_disclosures` | Commodity/FX/interest rate exposures (Item 7A) |
|
|
197
|
+
| `inventory_composition` | Raw materials/WIP/finished goods breakdown |
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## API Reference
|
|
202
|
+
|
|
203
|
+
### `extract(symbol, preset, form="10-K", filing_date=None, max_chars=2_000_000, api_key=None, model=None)`
|
|
204
|
+
|
|
205
|
+
| Parameter | Type | Description |
|
|
206
|
+
|-----------|------|-------------|
|
|
207
|
+
| `symbol` | str | Ticker symbol (e.g., "NVDA") |
|
|
208
|
+
| `preset` | BaseModel class | Pydantic model defining extraction schema |
|
|
209
|
+
| `form` | str | Filing type (default `"10-K"`). Automatically falls back from 10-K to 20-F |
|
|
210
|
+
| `filing_date` | str | Specific filing date (`YYYY-MM-DD`). `None` selects the latest filing |
|
|
211
|
+
| `max_chars` | int | Maximum length of the filing markdown, in characters |
|
|
212
|
+
| `api_key` | str | Google API key (fallback: `GOOGLE_API_KEY` env) |
|
|
213
|
+
| `model` | str | Gemini model (fallback: `GOOGLE_MODEL` env, default: `gemini-2.5-flash`) |
|
|
214
|
+
|
|
215
|
+
**Returns** `{"filing": {...}, "data": {...}}`
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## CLI
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
# Supply chain extraction (default)
|
|
223
|
+
sec-analyzer NVDA
|
|
224
|
+
|
|
225
|
+
# Specific form
|
|
226
|
+
sec-analyzer TSM --form 20-F
|
|
227
|
+
|
|
228
|
+
# Compact JSON
|
|
229
|
+
sec-analyzer NVDA --json
|
|
230
|
+
|
|
231
|
+
# Specific filing date
|
|
232
|
+
sec-analyzer AAPL --filing-date 2025-10-30
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
---
|
|
236
|
+
|
|
237
|
+
## How It Works
|
|
238
|
+
|
|
239
|
+
```
|
|
240
|
+
1. edgartools finds the filing on SEC EDGAR
|
|
241
|
+
2. Filing converted to markdown (tables preserved)
|
|
242
|
+
3. Full markdown + Pydantic schema sent to Gemini
|
|
243
|
+
4. Gemini returns structured JSON matching the schema
|
|
244
|
+
5. Pydantic validates and returns typed data
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
The key insight: Gemini's **structured output** mode forces the response to match your Pydantic schema exactly. No post-processing, no regex, no parsing.
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## Environment Variables
|
|
252
|
+
|
|
253
|
+
| Variable | Required | Default | Description |
|
|
254
|
+
|----------|----------|---------|-------------|
|
|
255
|
+
| `GOOGLE_API_KEY` | Yes (unless `api_key` is passed to `extract`) | - | Google AI API key |
|
|
256
|
+
| `EDGAR_IDENTITY` | No | `SECAnalyzer/1.0 user@example.com` | SEC EDGAR User-Agent |
|
|
257
|
+
| `GOOGLE_MODEL` | No | `gemini-2.5-flash` | Gemini model ID |
|
|
258
|
+
|
|
259
|
+
---
|
|
260
|
+
|
|
261
|
+
## Disclaimer
|
|
262
|
+
|
|
263
|
+
This project is **not affiliated with the SEC, EDGAR, or Google**. Filing data comes from SEC EDGAR (public). LLM extraction may contain errors — always verify critical data against the original filing.
|
|
264
|
+
|
|
265
|
+
This tool is for **research and educational purposes only**. It is not financial advice.
|
|
266
|
+
|
|
267
|
+
---
|
|
268
|
+
|
|
269
|
+
## License
|
|
270
|
+
|
|
271
|
+
MIT
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
sec_analyzer/__init__.py,sha256=eO_j8ok9YZ3FTrmkGkMFTZlMNRhuCzXv4YBEpJKVCoo,173
|
|
2
|
+
sec_analyzer/cli.py,sha256=XcWhp-onokVCmdTUFdWLThMA1UOBShpDlChuEFmUOI4,1758
|
|
3
|
+
sec_analyzer/engine.py,sha256=5LTbvKkvKI2NqKP-8Mj1sEd1qZCTFBeIMhoXUgR7v9w,16094
|
|
4
|
+
sec_analyzer/presets/__init__.py,sha256=EXY1wa66qYKonFEUx1xtGThS5jFpUF3IPN45AIbKPpY,101
|
|
5
|
+
sec_analyzer/presets/supply_chain.py,sha256=oiBg0LKJBBZtjUrNQ2_qmc4_repQQAbBvl_CwUAsK1g,9392
|
|
6
|
+
sec_analyzer-0.1.0.dist-info/METADATA,sha256=ceFL6fQHdK-CT8fLIuNKLGxQJHDOsyvOR97qHP3vUE0,8216
|
|
7
|
+
sec_analyzer-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
8
|
+
sec_analyzer-0.1.0.dist-info/entry_points.txt,sha256=2iYSZHJ9pQCKz2oaMyAWlAZu1k3gveR4G6tZrqo5La8,55
|
|
9
|
+
sec_analyzer-0.1.0.dist-info/licenses/LICENSE,sha256=TcoeGh2YiCuQikmHXW5evYQ74WkVQzZAgNzGodYHhIA,1065
|
|
10
|
+
sec_analyzer-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Seongjin
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|