sec-analyzer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ """SEC-Analyzer: Extract structured data from SEC filings using LLM + Pydantic presets."""
2
+
3
+ from .engine import extract, extract_xbrl
4
+
5
+ __all__ = ["extract", "extract_xbrl"]
sec_analyzer/cli.py ADDED
@@ -0,0 +1,61 @@
1
+ """CLI entry point for sec-analyzer."""
2
+
3
+ import argparse
4
+ import json
5
+ import sys
6
+
7
+
8
# Maps each CLI preset name to a "module.path:ClassName" target that
# _load_preset imports on demand.
_PRESET_MAP = {
    "supply-chain": "sec_analyzer.presets.supply_chain:SupplyChain",
}
11
+
12
+
13
def _load_preset(name: str):
    """Resolve a preset name to the class it refers to.

    ``name`` is looked up in ``_PRESET_MAP``. An unknown name is reported on
    stderr together with the list of known presets, and the process exits
    with status 1. Otherwise the mapped ``"module:Class"`` target is imported
    and the named attribute of that module is returned.
    """
    import importlib

    spec = _PRESET_MAP.get(name)
    if spec is None:
        print(f"Unknown preset: {name}", file=sys.stderr)
        print(f"Available presets: {', '.join(_PRESET_MAP)}", file=sys.stderr)
        sys.exit(1)

    module_name, attr_name = spec.rsplit(":", 1)
    return getattr(importlib.import_module(module_name), attr_name)
24
+
25
+
26
def main():
    """Run the command-line interface.

    Parses the ticker symbol and options, resolves the preset class, runs the
    extraction and prints the result as JSON on stdout (indented by 2 unless
    ``--json`` requests compact output). If extraction raises, a JSON object
    with an "error" key is written to stderr and the process exits with
    status 1.
    """
    cli = argparse.ArgumentParser(
        description="Extract structured data from SEC filings"
    )
    cli.add_argument("symbol", help="Ticker symbol (e.g., NVDA, AAPL, TSM)")
    cli.add_argument(
        "--preset",
        default="supply-chain",
        help=f"Extraction preset ({', '.join(_PRESET_MAP)})",
    )
    cli.add_argument(
        "--form", default="10-K", help="Filing form type (default: 10-K)"
    )
    cli.add_argument(
        "--filing-date", default=None, help="Specific filing date (YYYY-MM-DD)"
    )
    cli.add_argument(
        "--json", action="store_true", dest="compact", help="Compact JSON output"
    )
    opts = cli.parse_args()

    preset_cls = _load_preset(opts.preset)

    # The engine is imported only after argument parsing and preset resolution.
    from .engine import extract

    try:
        payload = extract(
            symbol=opts.symbol,
            preset=preset_cls,
            form=opts.form,
            filing_date=opts.filing_date,
        )
    except Exception as exc:
        print(json.dumps({"error": str(exc)}), file=sys.stderr)
        sys.exit(1)

    # default=str stringifies any value json cannot serialise natively.
    print(json.dumps(payload, indent=None if opts.compact else 2, default=str))
58
+
59
+
60
# Run the CLI when this module is executed directly as a script.
if __name__ == "__main__":
    main()
sec_analyzer/engine.py ADDED
@@ -0,0 +1,430 @@
1
+ """Core extraction engine: edgartools filing load + Gemini structured output."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ import sys
8
+ import time
9
+ from typing import TYPE_CHECKING
10
+
11
+ if TYPE_CHECKING:
12
+ from pydantic import BaseModel
13
+
14
# Default cap on how many characters of filing markdown are kept.
_MAX_MARKDOWN_CHARS = 2_000_000
15
+
16
+
17
def _init_edgar(identity: str | None = None):
    """Register the SEC identity string with edgartools.

    The identity comes from the ``identity`` argument when it is truthy,
    otherwise from the ``EDGAR_IDENTITY`` environment variable, otherwise
    from a built-in placeholder value.
    """
    from edgar import set_identity

    if not identity:
        identity = os.environ.get(
            "EDGAR_IDENTITY", "SECAnalyzer/1.0 user@example.com"
        )
    set_identity(identity)
25
+
26
+
27
def _get_filing(symbol: str, form: str = "10-K", filing_date: str | None = None):
    """Look up a filing for ``symbol`` via edgartools, retrying on errors.

    If no "10-K" filings exist, the lookup is repeated with "20-F" and the
    reported form changes accordingly. When ``filing_date`` is given, the
    filing whose ``filing_date`` stringifies to that value is chosen; if none
    matches, the first filing in the list is used and a warning is written to
    stderr so the substitution is not silent. Up to three attempts are made,
    sleeping ``2**attempt`` seconds between failed attempts.

    Args:
        symbol: Ticker symbol passed to ``edgar.Company``.
        form: Filing form type to search for.
        filing_date: Desired filing date as a string, or None for the first
            filing in the list (presumably the most recent; confirm against
            edgartools ordering).

    Returns:
        tuple: (filing, metadata_dict, company_name)

    Raises:
        ValueError: No filing of the requested (or fallback) form was found.
            Any ValueError raised during an attempt propagates without retry.
        RuntimeError: Every attempt failed with another exception; the last
            one is attached as ``__cause__``.
    """
    from edgar import Company

    _init_edgar()
    company = Company(symbol)

    retries = 3
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            filings = company.get_filings(form=form)
            if len(filings) == 0 and form == "10-K":
                filings = company.get_filings(form="20-F")
                form = "20-F"
            if len(filings) == 0:
                raise ValueError(f"No {form} filing found for {symbol}")

            filing = None
            if filing_date:
                filing = next(
                    (f for f in filings if str(f.filing_date) == filing_date),
                    None,
                )
            if filing is None:
                filing = filings[0]
                if filing_date:
                    # Keep the historical fallback, but tell the user that a
                    # different filing than the requested one is being used.
                    print(
                        f"[sec-analyzer] No {form} filing dated {filing_date} "
                        f"for {symbol}; using {filing.filing_date} instead",
                        file=sys.stderr,
                    )

            metadata = {
                "form": form,
                "filing_date": str(filing.filing_date),
                "accession_number": filing.accession_number,
                "filing_url": filing.filing_url,
            }
            return filing, metadata, company.name
        except ValueError:
            raise
        except Exception as e:
            last_error = e
            if attempt < retries:
                time.sleep(2**attempt)

    # Chain the last failure so its traceback is preserved for debugging.
    raise RuntimeError(
        f"edgartools failed after {retries} attempts: {last_error}"
    ) from last_error
74
+
75
+
76
def _get_markdown(filing, max_chars: int = _MAX_MARKDOWN_CHARS) -> str:
    """Render ``filing`` as markdown, cut to ``max_chars`` characters if longer."""
    text = filing.markdown()
    return text[:max_chars] if len(text) > max_chars else text
82
+
83
+
84
def _build_default_prompt(preset_cls: type[BaseModel], company_name: str) -> str:
    """Build a default extraction prompt template from a preset's JSON schema.

    One ``- name: description`` line is produced per top-level schema
    property; a property without a description uses its own name.

    The result is a ``str.format`` template, not a finished prompt: it keeps
    a literal ``{filing_text}`` placeholder for the caller to fill. Curly
    braces in the company name and in the field lines are doubled so that
    ``.format`` restores them instead of treating them as placeholders.

    Args:
        preset_cls: Class exposing ``model_json_schema()``.
        company_name: Name inserted on the "Filing company" line.

    Returns:
        The prompt template containing a single ``{filing_text}`` placeholder.
    """

    def _escape(text) -> str:
        # Double the braces so a later str.format() emits them literally.
        return str(text).replace("{", "{{").replace("}", "}}")

    properties = preset_cls.model_json_schema().get("properties", {})
    field_lines = "\n".join(
        f"- {_escape(name)}: {_escape(prop.get('description', name))}"
        for name, prop in properties.items()
    )
    company = _escape(company_name)

    return f"""\
You are a financial analyst extracting structured data from SEC filings.
Extract entities and data exactly as stated in the filing text.

Filing company: {company}

Extract the following fields:
{field_lines}

Rules:
1. Use exact names and figures from the filing — do not paraphrase or invent.
2. For context fields, copy 1-3 relevant sentences verbatim from the filing.
3. If a field has no relevant data, return an empty list or null.

Extract from this SEC filing text:

{{filing_text}}
"""
110
+
111
+
112
def _extract_with_llm(
    filing_text: str,
    preset_cls: type[BaseModel],
    company_name: str = "",
    api_key: str | None = None,
    model: str | None = None,
) -> BaseModel | None:
    """Ask Gemini for JSON matching the preset schema and validate it.

    The prompt is the preset's ``__prompt__`` attribute when it has a truthy
    one, otherwise the template from ``_build_default_prompt``; both are
    filled in with ``str.format``. The request is attempted up to three
    times; every failure is logged to stderr and followed by a
    ``2**attempt`` second sleep, except after the final attempt.

    Args:
        filing_text: Filing content inserted into the prompt.
        preset_cls: Pydantic model class supplying the schema and validation.
        company_name: Company name for the prompt; "Unknown" when empty.
        api_key: API key; falls back to the GOOGLE_API_KEY env var.
        model: Model ID; falls back to GOOGLE_MODEL, then "gemini-2.5-flash".

    Returns:
        Pydantic model instance, or None when every attempt failed.

    Raises:
        ValueError: No API key was passed or found in the environment.
    """
    from google import genai

    key = api_key or os.environ.get("GOOGLE_API_KEY")
    if not key:
        raise ValueError(
            "GOOGLE_API_KEY not set. Pass api_key parameter or set the environment variable."
        )

    model_id = model if model else os.environ.get("GOOGLE_MODEL", "gemini-2.5-flash")
    display_name = company_name or "Unknown"

    # A preset-specific prompt takes precedence over the generated default.
    preset_prompt = getattr(preset_cls, "__prompt__", None)
    if preset_prompt:
        prompt = preset_prompt.format(
            company_name=display_name,
            filing_text=filing_text,
        )
    else:
        prompt = _build_default_prompt(preset_cls, display_name).format(
            filing_text=filing_text
        )

    gen_config = {
        "response_mime_type": "application/json",
        "response_json_schema": preset_cls.model_json_schema(),
        "temperature": 0.1,
    }

    # A thinking config is attached only for one of the recognised levels.
    level = os.environ.get("GOOGLE_THINKING_LEVEL", "low")
    if level and level.lower() in ("low", "medium", "high", "minimal"):
        from google.genai import types

        gen_config["thinking_config"] = types.ThinkingConfig(
            thinking_level=level.lower()
        )

    client = genai.Client(api_key=key)
    max_attempts = 3
    attempt = 0
    while attempt < max_attempts:
        attempt += 1
        try:
            response = client.models.generate_content(
                model=model_id,
                contents=prompt,
                config=gen_config,
            )
            return preset_cls.model_validate_json(response.text)
        except Exception as exc:
            print(
                f"[sec-analyzer] LLM attempt {attempt}/{max_attempts} failed: {exc}",
                file=sys.stderr,
            )
            if attempt < max_attempts:
                time.sleep(2**attempt)
    return None
178
+
179
+
180
def extract(
    symbol: str,
    preset: type[BaseModel],
    form: str = "10-K",
    filing_date: str | None = None,
    max_chars: int = _MAX_MARKDOWN_CHARS,
    api_key: str | None = None,
    model: str | None = None,
) -> dict:
    """Run a Pydantic preset against an SEC filing and return the result.

    Loads ``.env`` values, locates the filing, converts it to (possibly
    truncated) markdown and sends it to the LLM together with the preset.

    Args:
        symbol: Ticker symbol (e.g., "NVDA", "AAPL", "TSM").
        preset: Pydantic BaseModel class defining the extraction schema. It
            may define a `__prompt__` class variable holding a custom prompt
            with {company_name} and {filing_text} placeholders.
        form: Filing form type ("10-K", "10-Q", "20-F", "DEF 14A", etc.).
            A "10-K" request falls back to "20-F" when no 10-K exists.
        filing_date: Specific filing date (YYYY-MM-DD). None for latest.
        max_chars: Maximum filing markdown length.
        api_key: Google API key. Falls back to GOOGLE_API_KEY env var.
        model: Gemini model ID. Falls back to GOOGLE_MODEL env var.

    Returns:
        dict with "filing" (metadata) and "data" (extracted fields).

    Raises:
        RuntimeError: The LLM step produced no result.
    """
    from dotenv import load_dotenv

    load_dotenv()

    filing, metadata, company_name = _get_filing(symbol, form, filing_date)
    parsed = _extract_with_llm(
        filing_text=_get_markdown(filing, max_chars),
        preset_cls=preset,
        company_name=company_name,
        api_key=api_key,
        model=model,
    )
    if parsed is None:
        raise RuntimeError(f"LLM extraction failed for {symbol} ({metadata['form']})")

    return {"filing": metadata, "data": parsed.model_dump()}
228
+
229
+
230
+ # ---------------------------------------------------------------------------
231
+ # XBRL structured data extraction
232
+ # ---------------------------------------------------------------------------
233
+
234
def _xbrl_to_float(value):
    """Convert a fact value to float; return None if it is not a usable number."""
    try:
        number = float(value)
    except (ValueError, TypeError):
        return None
    # NaN is the only float unequal to itself; treat it as missing so it
    # never surfaces as "$nanM" or as a non-standard JSON NaN.
    return None if number != number else number


def _xbrl_fmt_usd(amount: float) -> str:
    """Format a dollar amount as billions (>= 1e9) or whole millions."""
    return f"${amount/1e9:.1f}B" if amount >= 1e9 else f"${amount/1e6:.0f}M"


def _xbrl_member_label(raw) -> str:
    """Turn an axis member such as ``prefix:FooBarMember`` into ``Foo Bar``."""
    label = str(raw).split(":")[-1].replace("Member", "")
    return re.sub(r"([a-z])([A-Z])", r"\1 \2", label)


def _xbrl_concept_mask(df, pattern):
    """Boolean mask of rows whose concept matches ``pattern`` (regex, case-insensitive)."""
    return df["concept"].astype(str).str.contains(pattern, case=False, na=False)


def _xbrl_revenue_concentration(df) -> list:
    """Collect per-customer revenue percentages from ConcentrationRiskPercentage facts."""
    conc = df[_xbrl_concept_mask(df, "ConcentrationRiskPercentage")]
    if len(conc) == 0:
        return []

    benchmark_col = "us-gaap:ConcentrationRiskByBenchmarkAxis"
    if benchmark_col in conc.columns:
        conc = conc[conc[benchmark_col].astype(str).str.contains(
            "Revenue", case=False, na=False)]
    if "end_date" in conc.columns and len(conc) > 0:
        # Keep only facts for the most recent period end.
        conc = conc[conc["end_date"] == conc["end_date"].max()]

    # Members whose label looks like a geography are not customers.
    geo_keywords = tuple(kw.lower() for kw in (
        "Based End Customers", "Region", "Country", "Americas",
        "Europe", "Asia", "Pacific", "United States", "China", "Japan"))

    entries = []
    seen = set()
    for _, row in conc.iterrows():
        customer = str(row.get("srt:MajorCustomersAxis", ""))
        # Missing axis values stringify as "nan", "None" or "<NA>".
        if customer in ("", "nan", "None", "<NA>"):
            continue
        name = _xbrl_member_label(customer)
        lowered = name.lower()
        if any(kw in lowered for kw in geo_keywords):
            continue
        if name in seen:
            continue
        seen.add(name)
        fraction = _xbrl_to_float(row["value"])
        # The fact value is multiplied by 100 to report a percentage.
        pct = None if fraction is None else round(fraction * 100, 2)
        entries.append({"entity": name, "revenue_pct": pct,
                        "source": "xbrl", "end_date": str(row.get("end_date", ""))})
    return entries


def _xbrl_geographic_revenue(df) -> list:
    """Collect revenue by geographic member for the most recent period."""
    import pandas as pd

    geo_col = "srt:StatementGeographicalAxis"
    if geo_col not in df.columns:
        return []
    geo = df[df[geo_col].notna() & _xbrl_concept_mask(df, "Revenue")]
    if len(geo) == 0:
        return []

    # Total revenue: the largest non-geographic duration fact of the latest
    # period, taken from the first concept pattern that yields a number.
    total_rev = None
    excluded = _xbrl_concept_mask(
        df, "TextBlock|Policy|Description|Percentage|Cost")
    for pat in ("RevenueFromContractWithCustomer", r"^us-gaap:Revenues$"):
        rows = df[
            _xbrl_concept_mask(df, pat) & ~excluded &
            df[geo_col].isna() & (df["period_type"] == "duration")
        ].copy()
        if len(rows) == 0:
            continue
        try:
            rows["_val"] = pd.to_numeric(rows["value"], errors="coerce")
            rows = rows.dropna(subset=["_val"])
            if len(rows) > 0:
                latest = rows[rows["end_date"] == rows["end_date"].max()]
                total_rev = float(latest["_val"].max())
                break
        except Exception:
            continue

    country_names = {"US": "United States", "CN": "China", "JP": "Japan",
                     "TW": "Taiwan", "KR": "South Korea", "DE": "Germany",
                     "GB": "United Kingdom", "IN": "India"}
    entries = []
    seen = set()
    for _, row in geo.iterrows():
        region = _xbrl_member_label(row[geo_col])
        region = country_names.get(region, region)
        amount = _xbrl_to_float(row["value"])
        if amount is None:
            continue
        end_date = str(row.get("end_date", ""))
        key = f"{region}_{end_date}"
        if key in seen:
            continue
        seen.add(key)
        pct = round(amount / total_rev * 100, 2) if total_rev else None
        entries.append({"region": region, "revenue_pct": pct,
                        "revenue_amount": _xbrl_fmt_usd(amount),
                        "source": "xbrl", "end_date": end_date})
    if not entries:
        return []

    # Report only the most recent period (end_date strings sorted descending).
    entries.sort(key=lambda e: e["end_date"], reverse=True)
    newest = entries[0]["end_date"]
    return [e for e in entries if e["end_date"] == newest]


def _xbrl_inventory_composition(df) -> list:
    """Collect raw-material / WIP / finished-goods amounts and their shares."""
    # Order matters: the more specific concept of each category comes first
    # where both are listed for one category.
    inv_concepts = {
        "InventoryRawMaterialsAndSupplies": "raw_materials",
        "InventoryRawMaterials": "raw_materials",
        "InventoryWorkInProcess": "work_in_progress",
        "InventoryFinishedGoods": "finished_goods",
        "InventoryFinishedGoodsAndWorkInProcess": "finished_goods",
    }
    inv_total = None
    total_rows = df[_xbrl_concept_mask(df, r"^us-gaap:InventoryNet$")]
    if len(total_rows) > 0:
        inv_total = _xbrl_to_float(total_rows.iloc[0]["value"])

    entries = []
    raw_amounts = []
    done = set()
    for suffix, category in inv_concepts.items():
        # The patterns are substring matches, so a shorter pattern also hits
        # the longer concept of the same category. Emitting one entry per
        # category prevents the same fact from being counted twice.
        if category in done:
            continue
        rows = df[_xbrl_concept_mask(df, suffix)]
        if len(rows) == 0:
            continue
        amount = _xbrl_to_float(rows.iloc[0]["value"])
        if amount is None:
            continue
        done.add(category)
        raw_amounts.append(amount)
        entries.append({"category": category, "amount": _xbrl_fmt_usd(amount),
                        "source": "xbrl"})
    if not entries:
        return []

    comp_total = sum(raw_amounts, 0.0)
    # Use the reported total unless the components exceed it by more than 5%.
    denom = inv_total if inv_total and comp_total <= inv_total * 1.05 else comp_total
    for entry, raw in zip(entries, raw_amounts):
        entry["pct_of_total"] = round(raw / denom * 100, 2) if denom else None
    return entries


def _xbrl_purchase_obligations(df) -> list:
    """Collect unrecorded unconditional purchase obligation amounts."""
    po_rows = df[_xbrl_concept_mask(df, "UnrecordedUnconditionalPurchaseObligation")]
    po_rows = po_rows[~po_rows["concept"].astype(str).str.contains(
        "TextBlock|Policy", case=False, na=False)]

    # Concept-name fragments mapped to the timeframe they describe.
    timeframes = (("BalanceSheetAmount", "total"),
                  ("FirstAnniversary", "year 1"), ("SecondAnniversary", "year 2"),
                  ("ThirdAnniversary", "year 3"), ("FourthAnniversary", "year 4"),
                  ("FifthAnniversary", "year 5"), ("AfterFiveYears", "after year 5"))
    entries = []
    for _, row in po_rows.iterrows():
        concept = str(row["concept"])
        amount = _xbrl_to_float(row["value"])
        if amount is None:
            continue
        timeframe = next((tf for label, tf in timeframes if label in concept), "")
        entries.append({"obligation_type": "unconditional purchase obligation",
                        "amount": _xbrl_fmt_usd(amount), "timeframe": timeframe,
                        "source": "xbrl"})
    return entries


def extract_xbrl(symbol: str, form: str = "10-K") -> dict:
    """Extract structured quantitative data from XBRL tags.

    Extracts 4 categories using standardized US-GAAP XBRL tags:
    - revenue_concentration: Customer/segment revenue % (ConcentrationRiskPercentage)
    - geographic_revenue: Revenue by country/region
    - inventory_composition: Raw materials / WIP / finished goods
    - purchase_obligations: Unconditional purchase commitments

    Values that cannot be parsed as a number (including NaN) are skipped, or
    reported as None for revenue concentration percentages.

    Args:
        symbol: Ticker symbol.
        form: Filing form type.

    Returns:
        dict with "filing" metadata, "data" containing the available
        categories (empty categories are omitted) and "xbrl_available".
        When the XBRL facts cannot be loaded, "data" is empty and
        "xbrl_available" is False.
    """
    from dotenv import load_dotenv
    load_dotenv()

    filing, metadata, _ = _get_filing(symbol, form)

    try:
        xbrl = filing.xbrl()
        if xbrl is None:
            return {"filing": metadata, "data": {}, "xbrl_available": False}
        # A missing pandas install is reported as "XBRL unavailable".
        import pandas  # noqa: F401
        df = xbrl.instance.facts.reset_index()
    except Exception:
        return {"filing": metadata, "data": {}, "xbrl_available": False}

    builders = (
        ("revenue_concentration", _xbrl_revenue_concentration),
        ("geographic_revenue", _xbrl_geographic_revenue),
        ("inventory_composition", _xbrl_inventory_composition),
        ("purchase_obligations", _xbrl_purchase_obligations),
    )
    supplements = {}
    for key, build in builders:
        entries = build(df)
        if entries:
            supplements[key] = entries

    return {
        "filing": metadata,
        "data": supplements,
        "xbrl_available": True,
    }
@@ -0,0 +1,5 @@
1
+ """Built-in extraction presets."""
2
+
3
+ from .supply_chain import SupplyChain
4
+
5
+ __all__ = ["SupplyChain"]
@@ -0,0 +1,145 @@
1
+ """Supply chain intelligence preset for SEC filings.
2
+
3
+ Extracts suppliers, customers, single-source dependencies, geographic
4
+ concentration, capacity constraints, supply chain risks, revenue concentration,
5
+ geographic revenue, purchase obligations, market risk disclosures, and
6
+ inventory composition from 10-K/10-Q/20-F filings.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import ClassVar, Literal
12
+
13
+ from pydantic import BaseModel, Field
14
+
15
+
16
# One supplier of the filing company (element type of SupplyChain.suppliers).
# Only `entity` is required; the other fields default to "".
class SupplierEntry(BaseModel):
    entity: str = Field(description="Name of the supplier company. Use exact name from the filing.")
    relationship: str = Field(default="", description="Nature of the supply relationship (e.g., 'sole source supplier', 'key component vendor').")
    context: str = Field(default="", description="Brief relevant excerpt (1-3 sentences) from the filing that supports this supplier relationship.")
20
+
21
+
22
# One customer of the filing company (element type of SupplyChain.customers).
# Only `entity` is required; the other fields default to "".
class CustomerEntry(BaseModel):
    entity: str = Field(description="Name of the customer company. Use exact name from the filing.")
    relationship: str = Field(default="", description="Nature of the customer relationship (e.g., 'major customer', 'accounted for 35% of revenue').")
    context: str = Field(default="", description="Brief relevant excerpt (1-3 sentences) from the filing that supports this customer relationship.")
26
+
27
+
28
# One single-source dependency (element type of
# SupplyChain.single_source_dependencies). Only `supplier` is required.
class SingleSourceEntry(BaseModel):
    component: str = Field(default="", description="Component or material with single-source dependency (e.g., 'DRAM memory chips').")
    supplier: str = Field(description="Name of the sole-source or single-source supplier.")
    context: str = Field(default="", description="Brief relevant excerpt (1-3 sentences) from the filing describing this dependency.")
32
+
33
+
34
# One location of concentrated activity (element type of
# SupplyChain.geographic_concentration). Only `location` is required.
class GeographicEntry(BaseModel):
    location: str = Field(description="Country or region name (e.g., 'Taiwan', 'South Korea').")
    activity: str = Field(default="", description="Type of activity at this location (e.g., 'manufacturing', 'assembly').")
    context: str = Field(default="", description="Brief relevant excerpt (1-3 sentences) from the filing describing geographic concentration.")
38
+
39
+
40
# One capacity constraint (element type of SupplyChain.capacity_constraints).
# `constraint` is required; `context` defaults to "".
class CapacityConstraintEntry(BaseModel):
    constraint: str = Field(description="Type of capacity constraint (e.g., 'extended lead times', 'production capacity limitation').")
    context: str = Field(default="", description="Brief relevant excerpt (1-3 sentences) from the filing describing the constraint.")
43
+
44
+
45
# One supply chain risk (element type of SupplyChain.supply_chain_risks).
# `risk` is required; `context` defaults to "".
class SupplyChainRiskEntry(BaseModel):
    risk: str = Field(description="Type of supply chain risk (e.g., 'tariff impact', 'raw material shortage').")
    context: str = Field(default="", description="Brief relevant excerpt (1-3 sentences) from the filing describing this risk.")
48
+
49
+
50
# Revenue share of one customer or segment (element type of
# SupplyChain.revenue_concentration). `revenue_pct` is None when not given.
class RevenueConcentrationEntry(BaseModel):
    entity: str = Field(description="Customer or segment name from filing.")
    revenue_pct: float | None = Field(default=None, description="% of total revenue (e.g., 35.2).")
    revenue_amount: str = Field(default="", description="Amount if disclosed (e.g., '$5.2 billion').")
    context: str = Field(default="", description="1-3 sentences from Notes.")
55
+
56
+
57
# Revenue attributed to one country or region (element type of
# SupplyChain.geographic_revenue). `revenue_pct` is None when not given.
class GeographicRevenueEntry(BaseModel):
    region: str = Field(description="Country or region (e.g., 'United States', 'China').")
    revenue_pct: float | None = Field(default=None, description="% of total revenue.")
    revenue_amount: str = Field(default="", description="Amount if disclosed.")
    context: str = Field(default="", description="1-3 sentences from Notes.")
62
+
63
+
64
# One purchase commitment (element type of SupplyChain.purchase_obligations).
# Only `obligation_type` is required; the other fields default to "".
class PurchaseObligationEntry(BaseModel):
    counterparty: str = Field(default="", description="Supplier name if disclosed.")
    obligation_type: str = Field(description="Type (e.g., 'inventory commitment', 'capacity reservation').")
    amount: str = Field(default="", description="Dollar amount (e.g., '$2.5 billion').")
    timeframe: str = Field(default="", description="Duration (e.g., 'through fiscal 2027').")
    context: str = Field(default="", description="1-3 sentences from Notes.")
70
+
71
+
72
# One market risk exposure (element type of
# SupplyChain.market_risk_disclosures). `risk_type` is required and limited
# to the three literal values below.
class MarketRiskEntry(BaseModel):
    risk_type: Literal["commodity", "fx", "interest_rate"] = Field(description="Type: 'commodity', 'fx', or 'interest_rate'.")
    exposure: str = Field(default="", description="Specific exposure (e.g., 'gold price', 'EUR/USD').")
    sensitivity: str = Field(default="", description="Quantitative impact if disclosed (e.g., '10% increase = $50M COGS impact').")
    hedging: str = Field(default="", description="Hedging strategy if disclosed.")
    context: str = Field(default="", description="1-3 sentences from filing.")
78
+
79
+
80
# One inventory category (element type of SupplyChain.inventory_composition).
# `category` is required and limited to the three literal values below.
class InventoryCompositionEntry(BaseModel):
    category: Literal["raw_materials", "work_in_progress", "finished_goods"] = Field(description="Category: 'raw_materials', 'work_in_progress', or 'finished_goods'.")
    amount: str = Field(default="", description="Dollar amount if disclosed (e.g., '$1.2 billion').")
    pct_of_total: float | None = Field(default=None, description="% of total inventory.")
    context: str = Field(default="", description="1-3 sentences from Notes (valuation, aging, obsolescence).")
85
+
86
+
87
class SupplyChain(BaseModel):
    """Supply chain intelligence extraction from SEC filings."""

    # Custom prompt template for this preset. It is a str.format template
    # with {company_name} and {filing_text} placeholders; ClassVar keeps it
    # out of the model's fields.
    __prompt__: ClassVar[str] = """\
You are a financial analyst extracting supply chain intelligence from SEC filings.
Extract entities and relationships exactly as stated in the filing text.

The filing text below is the complete SEC filing in markdown format.
Identify relevant sections by their natural headings:
- Item 1 (Business): suppliers, customers, single-source dependencies
- Item 1A (Risk Factors): supply chain risks, geographic concentration
- Item 7 (MD&A): capacity constraints, operating discussion
- Item 7A (Market Risk): commodity/FX/interest rate exposures
- Item 8 Notes: revenue segments, inventory, commitments, concentration
For 20-F filings, look for equivalent items (Item 4, 3D, 5, 11, 18/19).

Rules:
1. Use exact company names from the filing — do not paraphrase or invent.
2. For context fields, copy 1-3 relevant sentences verbatim from the filing.
3. If a category has no relevant data, return an empty list.
4. Do NOT extract the filing company itself as its own supplier or customer.
5. Focus on factual supply chain relationships — skip generic boilerplate.
6. De facto single-source: If a supplier is described as the PRIMARY or ONLY provider
for a critical component category and NO alternative supplier is mentioned for that
same category, classify it as single_source_dependencies.
7. Customer extraction: Look for revenue concentration disclosures, named buyers,
and "accounted for X% of revenue" language.
8. Relationship specificity: Use precise descriptions such as "sole foundry for
leading-edge GPUs", "memory supplier (HBM)", "anchor customer >10% revenue".
9. Only infer relationships where the filing text provides contextual support.
10. Revenue concentration: From Notes, extract customers/segments with specific % of
revenue. Include exact percentage as revenue_pct.
11. Geographic revenue: From Notes, extract revenue by country/region with exact
percentages. Use standardized country names.
12. Purchase obligations: From Notes (Commitments and Contingencies), extract purchase
commitments, capacity reservations, take-or-pay contracts.
13. Market risk disclosures: From Item 7A, extract commodity/FX/interest rate exposures.
Classify risk_type as "commodity", "fx", or "interest_rate".
14. Inventory composition: From Notes, extract raw materials, work-in-progress, and
finished goods amounts and percentages.

Filing company: {company_name}

Extract all supply chain entities from this SEC filing text:

{filing_text}
"""

    # Every category is a list that defaults to empty, so a filing with no
    # data for a category still validates.
    suppliers: list[SupplierEntry] = Field(default_factory=list, description="Companies that supply products, materials, or services to the filing company.")
    customers: list[CustomerEntry] = Field(default_factory=list, description="Companies that purchase products or services from the filing company.")
    single_source_dependencies: list[SingleSourceEntry] = Field(default_factory=list, description="Components with sole-source or single-source supplier dependencies.")
    geographic_concentration: list[GeographicEntry] = Field(default_factory=list, description="Locations where manufacturing, production, or sourcing is concentrated.")
    capacity_constraints: list[CapacityConstraintEntry] = Field(default_factory=list, description="Production capacity limitations, extended lead times, or backlogs.")
    supply_chain_risks: list[SupplyChainRiskEntry] = Field(default_factory=list, description="Supply disruption risks including tariffs, shortages, geopolitical risks.")
    revenue_concentration: list[RevenueConcentrationEntry] = Field(default_factory=list, description="Customer/segment revenue concentration from Notes.")
    geographic_revenue: list[GeographicRevenueEntry] = Field(default_factory=list, description="Revenue breakdown by country/region from Notes.")
    purchase_obligations: list[PurchaseObligationEntry] = Field(default_factory=list, description="Purchase commitments, capacity reservations from Notes.")
    market_risk_disclosures: list[MarketRiskEntry] = Field(default_factory=list, description="Market risk exposures from Item 7A.")
    inventory_composition: list[InventoryCompositionEntry] = Field(default_factory=list, description="Inventory breakdown from Notes.")
@@ -0,0 +1,271 @@
1
+ Metadata-Version: 2.4
2
+ Name: sec-analyzer
3
+ Version: 0.1.0
4
+ Summary: Extract structured data from SEC filings using LLM + Pydantic presets
5
+ Project-URL: Homepage, https://github.com/tjdwls101010/SEC-Analyzer
6
+ Author: Seongjin
7
+ License-Expression: MIT
8
+ License-File: LICENSE
9
+ Keywords: edgar,finance,llm,pydantic,sec,structured-data
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Financial and Insurance Industry
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Office/Business :: Financial
15
+ Requires-Python: >=3.10
16
+ Requires-Dist: edgartools>=3.0
17
+ Requires-Dist: google-genai>=1.0
18
+ Requires-Dist: pydantic>=2.0
19
+ Requires-Dist: python-dotenv>=1.0
20
+ Description-Content-Type: text/markdown
21
+
22
+ <div align="center">
23
+
24
+ <img src="https://i.namu.wiki/i/HbVpHEsWi0aG30L2PEWRL9FEA0P7Vf-iLYm0QPbH1iOGJabk3vYcDQz1Uxo1DX3OaujOJWX62rs6QgqXFOybLw.svg" width="120" alt="SEC">
25
+
26
+ # SEC-Analyzer
27
+
28
+ **Extract structured data from SEC filings using LLM + Pydantic presets.**
29
+
30
+ Turn any SEC filing (10-K, 10-Q, 20-F, DEF 14A, ...) into structured JSON — define a Pydantic model, and the library does the rest.
31
+
32
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue)](#)
33
+ [![License: MIT](https://img.shields.io/badge/license-MIT-lightgrey)](#)
34
+
35
+ [Installation](#installation) · [Quick Start](#quick-start) · [Custom Presets](#custom-presets) · [API Reference](#api-reference) · [CLI](#cli)
36
+
37
+ </div>
38
+
39
+ ![](https://github.com/tjdwls101010/DUMOK/blob/main/Images/gemini-3-pro-1774265890176ioxhdiv1w.png?raw=true)
40
+
41
+ ---
42
+
43
+ ## Why This Library?
44
+
45
+ SEC filings contain invaluable data — supply chains, revenue concentration, executive compensation, risk factors — but every filing has a different format. Traditional parsing breaks constantly.
46
+
47
+ This library uses **LLM structured output** (Gemini) to extract exactly the data you define in a **Pydantic model**. The LLM reads the filing and fills in your schema. No regex, no HTML parsing, no breakage.
48
+
49
+ ```python
50
+ from sec_analyzer import extract
51
+ from sec_analyzer.presets import SupplyChain
52
+
53
+ result = extract("NVDA", preset=SupplyChain)
54
+ print(result["data"]["suppliers"])
55
+ # [{'entity': 'Taiwan Semiconductor Manufacturing Company Limited',
56
+ # 'relationship': 'foundry for semiconductor wafers',
57
+ # 'context': 'We utilize foundries, such as TSMC and Samsung...'}, ...]
58
+ ```
59
+
60
+ ---
61
+
62
+ ## Installation
63
+
64
+ ```bash
65
+ pip install sec-analyzer
66
+ ```
67
+
68
+ Requires Python 3.10+ and a [Google AI API key](https://ai.google.dev/).
69
+
70
+ ---
71
+
72
+ ## Quick Start
73
+
74
+ ### 1. Set your API key and EDGAR identity
75
+
76
+ ```bash
77
+ export GOOGLE_API_KEY="your-key-here"
78
+ export EDGAR_IDENTITY="YourApp/1.0 your@email.com"
79
+ ```
80
+
81
+ Or create a `.env` file:
82
+ ```
83
+ GOOGLE_API_KEY=your-key-here
84
+ EDGAR_IDENTITY=YourApp/1.0 your@email.com
85
+ ```
86
+
87
+ ### 2. Extract data
88
+
89
+ ```python
90
+ from sec_analyzer import extract
91
+ from sec_analyzer.presets import SupplyChain
92
+
93
+ # Latest 10-K
94
+ result = extract("NVDA", preset=SupplyChain)
95
+
96
+ # Specific form
97
+ result = extract("TSM", preset=SupplyChain, form="20-F")
98
+
99
+ # Specific filing date
100
+ result = extract("AAPL", preset=SupplyChain, filing_date="2025-10-30")
101
+ ```
102
+
103
+ ### 3. Use the result
104
+
105
+ ```python
106
+ filing = result["filing"]
107
+ # {'form': '10-K', 'filing_date': '2026-02-25', 'accession_number': '...', 'filing_url': '...'}
108
+
109
+ data = result["data"]
110
+ print(f"Suppliers: {len(data['suppliers'])}")
111
+ print(f"Customers: {len(data['customers'])}")
112
+ print(f"Single-source deps: {len(data['single_source_dependencies'])}")
113
+ ```
114
+
115
+ ---
116
+
117
+ ## Custom Presets
118
+
119
+ The real power: **define your own Pydantic model** to extract anything.
120
+
121
+ ### Basic custom preset
122
+
123
+ ```python
124
+ from pydantic import BaseModel, Field
125
+ from sec_analyzer import extract
126
+
127
+ class RiskFactors(BaseModel):
128
+ regulatory_risks: list[dict] = Field(
129
+ default_factory=list,
130
+ description="Government regulations that could impact the business"
131
+ )
132
+ litigation: list[dict] = Field(
133
+ default_factory=list,
134
+ description="Pending lawsuits and legal proceedings"
135
+ )
136
+ cybersecurity_risks: list[dict] = Field(
137
+ default_factory=list,
138
+ description="Data breach and cybersecurity threats"
139
+ )
140
+
141
+ result = extract("META", preset=RiskFactors)
142
+ ```
143
+
144
+ When no `__prompt__` is defined, the library auto-generates a prompt from your field descriptions.
145
+
146
+ ### Advanced: custom prompt
147
+
148
+ For expert-level control, add a `__prompt__` class variable:
149
+
150
+ ```python
151
+ from typing import ClassVar
152
+ from pydantic import BaseModel, Field
153
+
154
+ class ExecutiveComp(BaseModel):
155
+ __prompt__: ClassVar[str] = """\
156
+ You are analyzing a DEF 14A proxy statement for {company_name}.
157
+ Extract executive compensation data from the Summary Compensation Table
158
+ and related disclosure sections.
159
+
160
+ Rules:
161
+ 1. Include only Named Executive Officers (NEOs)
162
+ 2. All dollar amounts in exact figures from the filing
163
+ 3. Include stock awards, option awards, and non-equity incentive plan separately
164
+
165
+ Filing text:
166
+ {filing_text}
167
+ """
168
+
169
+ executives: list[dict] = Field(description="NEO compensation details")
170
+ equity_awards: list[dict] = Field(description="Stock and option grant details")
171
+
172
+ result = extract("AAPL", preset=ExecutiveComp, form="DEF 14A")
173
+ ```
174
+
175
+ The `{company_name}` and `{filing_text}` placeholders are filled automatically.
176
+
177
+ ---
178
+
179
+ ## Built-in Presets
180
+
181
+ ### `SupplyChain`
182
+
183
+ Extracts 11 categories of supply chain intelligence from 10-K/10-Q/20-F filings:
184
+
185
+ | Category | Description |
186
+ |----------|-------------|
187
+ | `suppliers` | Companies supplying products/materials/services |
188
+ | `customers` | Companies purchasing products/services |
189
+ | `single_source_dependencies` | Components with sole-source suppliers |
190
+ | `geographic_concentration` | Manufacturing/sourcing location concentration |
191
+ | `capacity_constraints` | Production limitations and lead times |
192
+ | `supply_chain_risks` | Disruption risks (tariffs, shortages, geopolitical) |
193
+ | `revenue_concentration` | Customer/segment revenue % from Notes |
194
+ | `geographic_revenue` | Revenue by country/region from Notes |
195
+ | `purchase_obligations` | Commitments and take-or-pay contracts |
196
+ | `market_risk_disclosures` | Commodity/FX/interest rate exposures (Item 7A) |
197
+ | `inventory_composition` | Raw materials/WIP/finished goods breakdown |
198
+
199
+ ---
200
+
201
+ ## API Reference
202
+
203
+ ### `extract(symbol, preset, form="10-K", filing_date=None, max_chars=2_000_000, api_key=None, model=None)`
204
+
205
+ | Parameter | Type | Description |
206
+ |-----------|------|-------------|
207
+ | `symbol` | str | Ticker symbol (e.g., "NVDA") |
208
+ | `preset` | BaseModel class | Pydantic model defining extraction schema |
209
+ | `form` | str | Filing type (default: `"10-K"`). Automatically falls back from 10-K to 20-F |
210
+ | `filing_date` | str | Specific filing date (YYYY-MM-DD). `None` (default) selects the latest filing |
211
+ | `max_chars` | int | Maximum length of the filing markdown, in characters (default: 2,000,000) |
212
+ | `api_key` | str | Google API key (fallback: `GOOGLE_API_KEY` env) |
213
+ | `model` | str | Gemini model (fallback: `GOOGLE_MODEL` env, default: `gemini-2.5-flash`) |
214
+
215
+ **Returns** `{"filing": {...}, "data": {...}}`
216
+
217
+ ---
218
+
219
+ ## CLI
220
+
221
+ ```bash
222
+ # Supply chain extraction (default)
223
+ sec-analyzer NVDA
224
+
225
+ # Specific form
226
+ sec-analyzer TSM --form 20-F
227
+
228
+ # Compact JSON
229
+ sec-analyzer NVDA --json
230
+
231
+ # Specific filing date
232
+ sec-analyzer AAPL --filing-date 2025-10-30
233
+ ```
234
+
235
+ ---
236
+
237
+ ## How It Works
238
+
239
+ ```
240
+ 1. edgartools finds the filing on SEC EDGAR
241
+ 2. Filing converted to markdown (tables preserved)
242
+ 3. Full markdown + Pydantic schema sent to Gemini
243
+ 4. Gemini returns structured JSON matching the schema
244
+ 5. Pydantic validates and returns typed data
245
+ ```
246
+
247
+ The key insight: Gemini's **structured output** mode forces the response to match your Pydantic schema exactly. No post-processing, no regex, no parsing.
248
+
249
+ ---
250
+
251
+ ## Environment Variables
252
+
253
+ | Variable | Required | Default | Description |
254
+ |----------|----------|---------|-------------|
255
+ | `GOOGLE_API_KEY` | Yes | - | Google AI API key |
256
+ | `EDGAR_IDENTITY` | No | `SECAnalyzer/1.0 user@example.com` | SEC EDGAR User-Agent |
257
+ | `GOOGLE_MODEL` | No | `gemini-2.5-flash` | Gemini model ID |
258
+
259
+ ---
260
+
261
+ ## Disclaimer
262
+
263
+ This project is **not affiliated with the SEC, EDGAR, or Google**. Filing data comes from SEC EDGAR (public). LLM extraction may contain errors — always verify critical data against the original filing.
264
+
265
+ This tool is for **research and educational purposes only**. It is not financial advice.
266
+
267
+ ---
268
+
269
+ ## License
270
+
271
+ MIT
@@ -0,0 +1,10 @@
1
+ sec_analyzer/__init__.py,sha256=eO_j8ok9YZ3FTrmkGkMFTZlMNRhuCzXv4YBEpJKVCoo,173
2
+ sec_analyzer/cli.py,sha256=XcWhp-onokVCmdTUFdWLThMA1UOBShpDlChuEFmUOI4,1758
3
+ sec_analyzer/engine.py,sha256=5LTbvKkvKI2NqKP-8Mj1sEd1qZCTFBeIMhoXUgR7v9w,16094
4
+ sec_analyzer/presets/__init__.py,sha256=EXY1wa66qYKonFEUx1xtGThS5jFpUF3IPN45AIbKPpY,101
5
+ sec_analyzer/presets/supply_chain.py,sha256=oiBg0LKJBBZtjUrNQ2_qmc4_repQQAbBvl_CwUAsK1g,9392
6
+ sec_analyzer-0.1.0.dist-info/METADATA,sha256=ceFL6fQHdK-CT8fLIuNKLGxQJHDOsyvOR97qHP3vUE0,8216
7
+ sec_analyzer-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
8
+ sec_analyzer-0.1.0.dist-info/entry_points.txt,sha256=2iYSZHJ9pQCKz2oaMyAWlAZu1k3gveR4G6tZrqo5La8,55
9
+ sec_analyzer-0.1.0.dist-info/licenses/LICENSE,sha256=TcoeGh2YiCuQikmHXW5evYQ74WkVQzZAgNzGodYHhIA,1065
10
+ sec_analyzer-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ sec-analyzer = sec_analyzer.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Seongjin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.