finance_data_llm 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. finance_data_llm-0.1.0/PKG-INFO +235 -0
  2. finance_data_llm-0.1.0/README.md +220 -0
  3. finance_data_llm-0.1.0/dataloader/__init__.py +23 -0
  4. finance_data_llm-0.1.0/dataloader/pipeline.py +132 -0
  5. finance_data_llm-0.1.0/dataloader/repl_env.py +68 -0
  6. finance_data_llm-0.1.0/dataloader/text_splitter.py +163 -0
  7. finance_data_llm-0.1.0/dataloader/vector_store.py +400 -0
  8. finance_data_llm-0.1.0/earnings_transcripts/__init__.py +0 -0
  9. finance_data_llm-0.1.0/earnings_transcripts/transcripts.py +346 -0
  10. finance_data_llm-0.1.0/filings/__init__.py +15 -0
  11. finance_data_llm-0.1.0/filings/sec_data.py +204 -0
  12. finance_data_llm-0.1.0/filings/utils.py +277 -0
  13. finance_data_llm-0.1.0/finance_data/__init__.py +27 -0
  14. finance_data_llm-0.1.0/finance_data/api.py +87 -0
  15. finance_data_llm-0.1.0/finance_data/app.py +11 -0
  16. finance_data_llm-0.1.0/finance_data/cli.py +12 -0
  17. finance_data_llm-0.1.0/finance_data_llm.egg-info/PKG-INFO +235 -0
  18. finance_data_llm-0.1.0/finance_data_llm.egg-info/SOURCES.txt +27 -0
  19. finance_data_llm-0.1.0/finance_data_llm.egg-info/dependency_links.txt +1 -0
  20. finance_data_llm-0.1.0/finance_data_llm.egg-info/entry_points.txt +2 -0
  21. finance_data_llm-0.1.0/finance_data_llm.egg-info/requires.txt +8 -0
  22. finance_data_llm-0.1.0/finance_data_llm.egg-info/top_level.txt +9 -0
  23. finance_data_llm-0.1.0/mcp_server.py +360 -0
  24. finance_data_llm-0.1.0/ocr/__init__.py +0 -0
  25. finance_data_llm-0.1.0/ocr/olmocr_pipeline.py +1501 -0
  26. finance_data_llm-0.1.0/pyproject.toml +50 -0
  27. finance_data_llm-0.1.0/server.py +292 -0
  28. finance_data_llm-0.1.0/settings.py +31 -0
  29. finance_data_llm-0.1.0/setup.cfg +4 -0
@@ -0,0 +1,235 @@
1
+ Metadata-Version: 2.4
2
+ Name: finance_data_llm
3
+ Version: 0.1.0
4
+ Summary: Fetch SEC filings and earnings call transcripts, OCR them to markdown, and index them for LLM retrieval
5
+ Requires-Python: <3.14,>=3.13
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: fastapi>=0.115.0
8
+ Requires-Dist: pydantic-settings>=2.6.0
9
+ Requires-Dist: uvicorn[standard]>=0.32.0
10
+ Requires-Dist: loguru>=0.7.3
11
+ Requires-Dist: ratelimit>=2.2.1
12
+ Requires-Dist: playwright>=1.49
13
+ Requires-Dist: yfinance>=1.2.0
14
+ Requires-Dist: numpy>=1.26
15
+
16
+ # SEC-filings-Markdown
17
+
18
+ ## Configuration
19
+
20
+ Settings are loaded via Pydantic Settings from environment variables or a `.env` file:
21
+
22
+ | Variable | Description | Default |
23
+ |----------|-------------|---------|
24
+ | `SEC_API_ORGANIZATION` | Organization name for SEC API User-Agent | `Your-Organization` |
25
+ | `SEC_API_EMAIL` | Contact email for SEC API User-Agent | `your-email@example.com` |
26
+ | `OLMOCR_SERVER` | vLLM server URL for olmOCR | `http://localhost:8000/v1` |
27
+ | `OLMOCR_MODEL` | Model name for olmOCR | `allenai/olmOCR-2-7B-1025-FP8` |
28
+ | `OLMOCR_WORKSPACE` | Workspace directory for OCR output | `./localworkspace` |
29
+ | `EARNINGS_TRANSCRIPTS_DIR` | Directory for fetched transcript JSONL files | `earnings_transcripts_data` |
30
+ | `EMBEDDING_SERVER` | OpenAI-compatible embedding API (e.g. vLLM pooling) | `http://127.0.0.1:8888/v1` |
31
+ | `EMBEDDING_MODEL` | Model id passed to the embedding server | `Qwen/Qwen3-Embedding-0.6B` |
32
+ | `CHROMA_PERSIST_DIR` | ChromaDB persistence directory | `./chroma_db` |
33
+
34
+
35
+ ## MCP server
36
+
37
+ This repository includes an MCP server at `mcp_server.py` that exposes the same operational functions as `server.py` (SEC fetch, OCR, embedding, and search), plus file exploration tools for PDFs, JSONL, markdown, and other artifacts under configured data roots.
38
+
39
+ Run it with the MCP dependency group:
40
+
41
+ ```bash
42
+ uv run --group mcp python mcp_server.py
43
+ ```
44
+
45
+ Key exploration tools exposed to MCP clients:
46
+
47
+ - `list_data_roots_tool`: shows root directories available for browsing.
48
+ - `list_data_files_tool`: glob file listing (for example `**/*.pdf`, `**/*.jsonl`).
49
+ - `read_data_file_tool`: reads text-based files directly and provides metadata/preview for binary files.
50
+
51
+ ## Docker
52
+
53
+ ### Build
54
+
55
+ ```bash
56
+ docker build -t sec-filings-md .
57
+ ```
58
+
59
+ The image defaults to the CUDA runtime base for a smaller footprint, while still preinstalling Playwright Chromium for scraping.
60
+ If you want to skip Playwright browser installation (to reduce image size further), build with:
61
+
62
+ ```bash
63
+ docker build --build-arg INSTALL_PLAYWRIGHT_BROWSER=0 -t sec-filings-md .
64
+ ```
65
+
66
+ Or via Makefile:
67
+
68
+ ```bash
69
+ make docker-build
70
+ ```
71
+
72
+ ### Run
73
+
74
+ ```bash
75
+ GPU_DEVICE=${GPU_DEVICE:-3}
76
+ docker run --gpus device=${GPU_DEVICE} \
77
+ -e SEC_API_ORGANIZATION="Your-Organization" \
78
+ -e SEC_API_EMAIL="your-email@example.com" \
79
+ -v ./sec_data:/app/sec_data \
80
+ -v ./localworkspace:/app/localworkspace \
81
+ -p 8081:8081 \
82
+ sec-filings-md
83
+ ```
84
+
85
+ Or via Makefile (build + run in one step):
86
+
87
+ ```bash
88
+ make docker-start
89
+ ```
90
+
91
+ Makefile overrides:
92
+
93
+ | Variable | Description | Default |
94
+ |----------|-------------|---------|
95
+ | `IMAGE_NAME` | Docker image name | `sec-filings-md` |
96
+ | `GPU_DEVICE` | GPU device index | `0` |
97
+ | `API_PORT` | Host port for API | `8081` |
98
+ | `SEC_API_ORGANIZATION` | SEC API User-Agent org | `Your-Organization` |
99
+ | `SEC_API_EMAIL` | SEC API contact email | `your-email@example.com` |
100
+
101
+ Example with overrides:
102
+
103
+ ```bash
104
+ make docker-start GPU_DEVICE=3 SEC_API_EMAIL="you@example.com"
105
+ ```
106
+
107
+ The two volumes persist data across container restarts:
108
+
109
+ | Volume | Container path | Purpose |
110
+ |--------|---------------|---------|
111
+ | `sec_data` | `/app/sec_data` | Downloaded SEC filing PDFs |
112
+ | `localworkspace` | `/app/localworkspace` | OCR workspace and output markdown |
113
+
114
+ Override the workspace path at runtime with `-e OLMOCR_WORKSPACE=/custom/path`.
115
+
116
+ ## Installation
117
+
118
+ ```bash
119
+ uv sync
120
+ playwright install chromium
121
+ ```
122
+
123
+ Install OCR/markdown + embedding stack dependencies when you need those pipelines:
124
+
125
+ ```bash
126
+ uv sync --group ocr-md
127
+ ```
128
+
129
+ Package install (for publishing/consuming from PyPI):
130
+
131
+ ```bash
132
+ pip install finance_data_llm
133
+ ```
134
+
135
+ Use package functions directly from Python (no server process required):
136
+
137
+ ```python
138
+ from finance_data import (
139
+ company_name_to_ticker,
140
+ fetch_sec_filings_sync,
141
+ )
142
+
143
+ ticker = company_name_to_ticker("Amazon")
144
+ filings = fetch_sec_filings_sync(ticker=ticker or "AMZN", year="2025")
145
+ ```
146
+
147
+ If you do want to run the API, use the packaged console script:
148
+
149
+ ```bash
150
+ finance-data-llm-server
151
+ ```
152
+
153
+ ## Usage
154
+
155
+ Start vLLM server:
156
+ ```bash
157
+ make vllm-olmocr-serve
158
+ ```
159
+
160
+ Benchmark vLLM with guidellm (start the vLLM server first, then in another terminal):
161
+ ```bash
162
+ make guidellm-benchmark
163
+ ```
164
+
165
+ Fetch SEC filings:
166
+ ```bash
167
+ uv run python -m filings.sec_data --ticker AMZN --year 2025
168
+ ```
169
+
170
+ Run OCR pipeline:
171
+ ```bash
172
+ uv run python ocr/olmocr_pipeline.py --pdf-dir sec_data/AMZN-2025
173
+ ```
174
+
175
+ ## Earnings call transcripts
176
+
177
+ Transcripts are scraped from [discountingcashflows.com](https://discountingcashflows.com) (Playwright + Chromium). Each quarter is saved as one JSONL file under `{EARNINGS_TRANSCRIPTS_DIR}/{TICKER}/{year}/Q{n}.jsonl`.
178
+
179
+ ### 1. Fetch transcripts
180
+
181
+ **CLI** (writes files under `earnings_transcripts_data` by default):
182
+
183
+ ```bash
184
+ uv run python -m earnings_transcripts.transcripts AMZN 2025
185
+ ```
186
+
187
+ Optional: `--max-concurrency` (default `4`) to limit parallel quarter fetches.
188
+
189
+ **HTTP** (same fetch + persist, with the API running):
190
+
191
+ ```bash
192
+ curl -s -X POST "http://127.0.0.1:8081/earnings_transcripts/for_year" \
193
+ -H "Content-Type: application/json" \
194
+ -d '{"ticker":"AMZN","year":2025}'
195
+ ```
196
+
197
+ Response body is a JSON array of transcript objects (`ticker`, `year`, `quarter_num`, `date`, `speaker_texts`, …).
198
+
199
+ ### 2. Start embedding server and API
200
+
201
+ Transcript chunks are embedded with the same OpenAI-compatible embedding endpoint as SEC filings (`EMBEDDING_SERVER` / `EMBEDDING_MODEL`). In one terminal:
202
+
203
+ ```bash
204
+ make vllm-embd-serve
205
+ ```
206
+
207
+ In another:
208
+
209
+ ```bash
210
+ make start-server
211
+ ```
212
+
213
+ (Adjust `API_PORT` / `EMBD_PORT` in the `Makefile` or your environment if needed.)
214
+
215
+ ### 3. Index transcripts in Chroma
216
+
217
+ ```bash
218
+ curl -s -X POST "http://127.0.0.1:8081/vector_store/embed_transcripts" \
219
+ -H "Content-Type: application/json" \
220
+ -d '{"ticker":"AMZN","year":"2025","force":false}'
221
+ ```
222
+
223
+ Use `"force": true` to replace existing vectors for those quarters. Filing types in the index appear as `Q1`–`Q4`.
224
+
225
+ ### 4. Search across indexed quarters
226
+
227
+ Search merges hits from all transcript quarters present for that ticker/year:
228
+
229
+ ```bash
230
+ curl -s -X POST "http://127.0.0.1:8081/vector_store/search_transcripts" \
231
+ -H "Content-Type: application/json" \
232
+ -d '{"ticker":"AMZN","year":"2025","query":"AWS revenue growth","top_k":5}'
233
+ ```
234
+
235
+ Each result includes `filing_type` (`Q1`, …) so you can see which call the chunk came from.
@@ -0,0 +1,220 @@
1
+ # SEC-filings-Markdown
2
+
3
+ ## Configuration
4
+
5
+ Settings are loaded via Pydantic Settings from environment variables or a `.env` file:
6
+
7
+ | Variable | Description | Default |
8
+ |----------|-------------|---------|
9
+ | `SEC_API_ORGANIZATION` | Organization name for SEC API User-Agent | `Your-Organization` |
10
+ | `SEC_API_EMAIL` | Contact email for SEC API User-Agent | `your-email@example.com` |
11
+ | `OLMOCR_SERVER` | vLLM server URL for olmOCR | `http://localhost:8000/v1` |
12
+ | `OLMOCR_MODEL` | Model name for olmOCR | `allenai/olmOCR-2-7B-1025-FP8` |
13
+ | `OLMOCR_WORKSPACE` | Workspace directory for OCR output | `./localworkspace` |
14
+ | `EARNINGS_TRANSCRIPTS_DIR` | Directory for fetched transcript JSONL files | `earnings_transcripts_data` |
15
+ | `EMBEDDING_SERVER` | OpenAI-compatible embedding API (e.g. vLLM pooling) | `http://127.0.0.1:8888/v1` |
16
+ | `EMBEDDING_MODEL` | Model id passed to the embedding server | `Qwen/Qwen3-Embedding-0.6B` |
17
+ | `CHROMA_PERSIST_DIR` | ChromaDB persistence directory | `./chroma_db` |
18
+
19
+
20
+ ## MCP server
21
+
22
+ This repository includes an MCP server at `mcp_server.py` that exposes the same operational functions as `server.py` (SEC fetch, OCR, embedding, and search), plus file exploration tools for PDFs, JSONL, markdown, and other artifacts under configured data roots.
23
+
24
+ Run it with the MCP dependency group:
25
+
26
+ ```bash
27
+ uv run --group mcp python mcp_server.py
28
+ ```
29
+
30
+ Key exploration tools exposed to MCP clients:
31
+
32
+ - `list_data_roots_tool`: shows root directories available for browsing.
33
+ - `list_data_files_tool`: glob file listing (for example `**/*.pdf`, `**/*.jsonl`).
34
+ - `read_data_file_tool`: reads text-based files directly and provides metadata/preview for binary files.
35
+
36
+ ## Docker
37
+
38
+ ### Build
39
+
40
+ ```bash
41
+ docker build -t sec-filings-md .
42
+ ```
43
+
44
+ The image defaults to the CUDA runtime base for a smaller footprint, while still preinstalling Playwright Chromium for scraping.
45
+ If you want to skip Playwright browser installation (to reduce image size further), build with:
46
+
47
+ ```bash
48
+ docker build --build-arg INSTALL_PLAYWRIGHT_BROWSER=0 -t sec-filings-md .
49
+ ```
50
+
51
+ Or via Makefile:
52
+
53
+ ```bash
54
+ make docker-build
55
+ ```
56
+
57
+ ### Run
58
+
59
+ ```bash
60
+ GPU_DEVICE=${GPU_DEVICE:-3}
61
+ docker run --gpus device=${GPU_DEVICE} \
62
+ -e SEC_API_ORGANIZATION="Your-Organization" \
63
+ -e SEC_API_EMAIL="your-email@example.com" \
64
+ -v ./sec_data:/app/sec_data \
65
+ -v ./localworkspace:/app/localworkspace \
66
+ -p 8081:8081 \
67
+ sec-filings-md
68
+ ```
69
+
70
+ Or via Makefile (build + run in one step):
71
+
72
+ ```bash
73
+ make docker-start
74
+ ```
75
+
76
+ Makefile overrides:
77
+
78
+ | Variable | Description | Default |
79
+ |----------|-------------|---------|
80
+ | `IMAGE_NAME` | Docker image name | `sec-filings-md` |
81
+ | `GPU_DEVICE` | GPU device index | `0` |
82
+ | `API_PORT` | Host port for API | `8081` |
83
+ | `SEC_API_ORGANIZATION` | SEC API User-Agent org | `Your-Organization` |
84
+ | `SEC_API_EMAIL` | SEC API contact email | `your-email@example.com` |
85
+
86
+ Example with overrides:
87
+
88
+ ```bash
89
+ make docker-start GPU_DEVICE=3 SEC_API_EMAIL="you@example.com"
90
+ ```
91
+
92
+ The two volumes persist data across container restarts:
93
+
94
+ | Volume | Container path | Purpose |
95
+ |--------|---------------|---------|
96
+ | `sec_data` | `/app/sec_data` | Downloaded SEC filing PDFs |
97
+ | `localworkspace` | `/app/localworkspace` | OCR workspace and output markdown |
98
+
99
+ Override the workspace path at runtime with `-e OLMOCR_WORKSPACE=/custom/path`.
100
+
101
+ ## Installation
102
+
103
+ ```bash
104
+ uv sync
105
+ playwright install chromium
106
+ ```
107
+
108
+ Install OCR/markdown + embedding stack dependencies when you need those pipelines:
109
+
110
+ ```bash
111
+ uv sync --group ocr-md
112
+ ```
113
+
114
+ Package install (for publishing/consuming from PyPI):
115
+
116
+ ```bash
117
+ pip install finance_data_llm
118
+ ```
119
+
120
+ Use package functions directly from Python (no server process required):
121
+
122
+ ```python
123
+ from finance_data import (
124
+ company_name_to_ticker,
125
+ fetch_sec_filings_sync,
126
+ )
127
+
128
+ ticker = company_name_to_ticker("Amazon")
129
+ filings = fetch_sec_filings_sync(ticker=ticker or "AMZN", year="2025")
130
+ ```
131
+
132
+ If you do want to run the API, use the packaged console script:
133
+
134
+ ```bash
135
+ finance-data-llm-server
136
+ ```
137
+
138
+ ## Usage
139
+
140
+ Start vLLM server:
141
+ ```bash
142
+ make vllm-olmocr-serve
143
+ ```
144
+
145
+ Benchmark vLLM with guidellm (start the vLLM server first, then in another terminal):
146
+ ```bash
147
+ make guidellm-benchmark
148
+ ```
149
+
150
+ Fetch SEC filings:
151
+ ```bash
152
+ uv run python -m filings.sec_data --ticker AMZN --year 2025
153
+ ```
154
+
155
+ Run OCR pipeline:
156
+ ```bash
157
+ uv run python ocr/olmocr_pipeline.py --pdf-dir sec_data/AMZN-2025
158
+ ```
159
+
160
+ ## Earnings call transcripts
161
+
162
+ Transcripts are scraped from [discountingcashflows.com](https://discountingcashflows.com) (Playwright + Chromium). Each quarter is saved as one JSONL file under `{EARNINGS_TRANSCRIPTS_DIR}/{TICKER}/{year}/Q{n}.jsonl`.
163
+
164
+ ### 1. Fetch transcripts
165
+
166
+ **CLI** (writes files under `earnings_transcripts_data` by default):
167
+
168
+ ```bash
169
+ uv run python -m earnings_transcripts.transcripts AMZN 2025
170
+ ```
171
+
172
+ Optional: `--max-concurrency` (default `4`) to limit parallel quarter fetches.
173
+
174
+ **HTTP** (same fetch + persist, with the API running):
175
+
176
+ ```bash
177
+ curl -s -X POST "http://127.0.0.1:8081/earnings_transcripts/for_year" \
178
+ -H "Content-Type: application/json" \
179
+ -d '{"ticker":"AMZN","year":2025}'
180
+ ```
181
+
182
+ Response body is a JSON array of transcript objects (`ticker`, `year`, `quarter_num`, `date`, `speaker_texts`, …).
183
+
184
+ ### 2. Start embedding server and API
185
+
186
+ Transcript chunks are embedded with the same OpenAI-compatible embedding endpoint as SEC filings (`EMBEDDING_SERVER` / `EMBEDDING_MODEL`). In one terminal:
187
+
188
+ ```bash
189
+ make vllm-embd-serve
190
+ ```
191
+
192
+ In another:
193
+
194
+ ```bash
195
+ make start-server
196
+ ```
197
+
198
+ (Adjust `API_PORT` / `EMBD_PORT` in the `Makefile` or your environment if needed.)
199
+
200
+ ### 3. Index transcripts in Chroma
201
+
202
+ ```bash
203
+ curl -s -X POST "http://127.0.0.1:8081/vector_store/embed_transcripts" \
204
+ -H "Content-Type: application/json" \
205
+ -d '{"ticker":"AMZN","year":"2025","force":false}'
206
+ ```
207
+
208
+ Use `"force": true` to replace existing vectors for those quarters. Filing types in the index appear as `Q1`–`Q4`.
209
+
210
+ ### 4. Search across indexed quarters
211
+
212
+ Search merges hits from all transcript quarters present for that ticker/year:
213
+
214
+ ```bash
215
+ curl -s -X POST "http://127.0.0.1:8081/vector_store/search_transcripts" \
216
+ -H "Content-Type: application/json" \
217
+ -d '{"ticker":"AMZN","year":"2025","query":"AWS revenue growth","top_k":5}'
218
+ ```
219
+
220
+ Each result includes `filing_type` (`Q1`, …) so you can see which call the chunk came from.
@@ -0,0 +1,23 @@
1
+ """Dataloader for SEC filings: fetch, OCR, embed, and vector search."""
2
+
3
+ from filings.utils import company_to_ticker
4
+
5
+ from .pipeline import ensure_sec_data, prepare_sec_filing_envs
6
+ from .repl_env import MarkdownReplEnvironment, markdown_to_repl_env
7
+ from .text_splitter import Chunk, chunk_markdown
8
+ from .vector_store import (
9
+ FaissVectorIndex,
10
+ embed_chunks,
11
+ )
12
+
13
+ __all__ = [
14
+ "company_to_ticker",
15
+ "ensure_sec_data",
16
+ "prepare_sec_filing_envs",
17
+ "MarkdownReplEnvironment",
18
+ "markdown_to_repl_env",
19
+ "Chunk",
20
+ "FaissVectorIndex",
21
+ "chunk_markdown",
22
+ "embed_chunks",
23
+ ]
@@ -0,0 +1,132 @@
1
+ """Pipeline to fetch SEC filings, run OCR, and prepare REPL environments."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from filings.sec_data import (
8
+ SecResults,
9
+ get_sec_results,
10
+ save_sec_results_as_pdfs,
11
+ )
12
+ from ocr.olmocr_pipeline import get_markdown_path, run_olmo_ocr
13
+ from settings import sec_settings
14
+
15
+ from .repl_env import MarkdownReplEnvironment, markdown_to_repl_env
16
+
17
+
18
def _matches_filing_type(sec_result: SecResults, filing_type: str) -> bool:
    """Return True if *sec_result*'s form name matches the requested type.

    "10-Q" matches by prefix (quarterly forms carry suffixes such as
    10-Q1..10-Q4); any other requested type must match the form name exactly.
    """
    form_name = sec_result.form_name
    if filing_type == "10-Q":
        return form_name.startswith("10-Q")
    # Exact match covers "10-K" and every other filing type alike.
    return form_name == filing_type
25
+
26
+
27
async def ensure_sec_data(
    ticker: str,
    year: str,
    filing_types: list[str],
    include_amends: bool = True,
) -> tuple[list[SecResults], list[Path]]:
    """
    Ensure SEC filing PDFs exist locally, downloading only what is missing.

    PDFs live under sec_data/{ticker}-{year}/ (layout set by filings.sec_data).

    Args:
        ticker: Stock ticker symbol.
        year: Filing year as a string.
        filing_types: Filing types to keep (e.g. ["10-K"]).
        include_amends: Include amended filings in the SEC query.

    Returns:
        (sec_results matching filing_types, paths to all matching PDFs)
    """
    sec_results = get_sec_results(
        ticker=ticker,
        year=year,
        filing_types=filing_types,
        include_amends=include_amends,
    )
    pdf_dir = Path("sec_data") / f"{ticker}-{year}"

    # Keep only the results whose form matches one of the requested types.
    matching = [
        result
        for result in sec_results
        if any(_matches_filing_type(result, ft) for ft in filing_types)
    ]

    # Partition into PDFs already on disk and results that still need fetching.
    cached: list[Path] = []
    to_download: list[SecResults] = []
    for result in matching:
        candidate = pdf_dir / f"{result.form_name}.pdf"
        if candidate.exists():
            cached.append(candidate)
        else:
            to_download.append(result)

    if not to_download:
        return matching, cached

    fetched = await save_sec_results_as_pdfs(
        sec_results=to_download,
        ticker=ticker,
        year=year,
    )
    return matching, cached + fetched
74
+
75
+
76
async def prepare_sec_filing_envs(
    ticker: str,
    year: str,
    filing_type: str,
    include_amends: bool = True,
    workspace: str | Path | None = None,
) -> list[MarkdownReplEnvironment]:
    """
    Fetch SEC filings (if needed), run OCR, and return REPL environments.

    Args:
        ticker: Stock ticker symbol (e.g. "GOOG", "AAPL").
        year: Filing year (e.g. "2025").
        filing_type: One of "10-K" or "10-Q".
        include_amends: Include amended filings.
        workspace: olmOCR workspace (default from settings). Markdown is
            written to workspace/markdown/sec_data/{ticker}-{year}/...

    Returns:
        List of MarkdownReplEnvironment, one per filing with OCR output
        (e.g. 10-K or 10-Q1..10-Q4).
    """
    workspace_str = str(workspace or sec_settings.olmocr_workspace)
    # Relative PDF directory; doubles as the source-file prefix for OCR output.
    pdf_dir_str = f"sec_data/{ticker}-{year}"

    sec_results, _ = await ensure_sec_data(
        ticker=ticker,
        year=year,
        filing_types=[filing_type],
        include_amends=include_amends,
    )
    if not sec_results:
        return []

    await run_olmo_ocr(
        pdf_dir=pdf_dir_str,
        workspace=workspace_str,
    )

    envs: list[MarkdownReplEnvironment] = []
    for sr in sec_results:
        markdown_path = Path(
            get_markdown_path(workspace_str, f"{pdf_dir_str}/{sr.form_name}.pdf")
        )
        # OCR may not have produced markdown for every filing; skip gaps.
        if not markdown_path.exists():
            continue
        envs.append(
            markdown_to_repl_env(
                markdown_path=markdown_path,
                ticker=ticker,
                year=year,
                sec_result=sr,
            )
        )
    return envs
@@ -0,0 +1,68 @@
1
+ from __future__ import annotations
2
+ from code import InteractiveConsole
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ import functools
6
+
7
+ from filings.sec_data import SecResults
8
+
9
+
10
@dataclass
class MarkdownReplEnvironment:
    """A REPL environment wrapping one SEC filing's OCR'd markdown.

    Bundles the filing's identity, the markdown text, and an
    InteractiveConsole whose namespace pre-binds the fields below.
    """

    # Stock ticker symbol, e.g. "AMZN".
    ticker: str
    # Filing year as a string, e.g. "2025".
    year: str
    # Derived from the markdown file's stem, e.g. "10-K" or "10-Q1".
    filing_type: str
    # Resolved path to the markdown file.
    markdown_path: Path
    # Full markdown contents read from markdown_path.
    markdown_text: str
    # Namespace shared with `console`; mutated by code run in the console.
    namespace: dict[str, object]
    # InteractiveConsole whose locals are `namespace`.
    console: InteractiveConsole
    # The SEC result this environment was built from.
    sec_result: SecResults
20
+
21
+
22
@functools.lru_cache(maxsize=128)
def _read_markdown(resolved_path: Path) -> str:
    """Read markdown text, cached by resolved path so repeat reads are free."""
    return resolved_path.read_text(encoding="utf-8")


def markdown_to_repl_env(
    markdown_path: Path,
    ticker: str,
    year: str,
    sec_result: SecResults,
) -> MarkdownReplEnvironment:
    """Build a REPL environment around a filing's markdown file.

    The filing type is derived from the markdown file's stem.

    Note: previously the whole environment was lru_cache'd, which made every
    caller share one mutable namespace/InteractiveConsole (REPL state leaked
    between callers) and keyed the cache on the unresolved path. Now only the
    file read is cached (keyed by the resolved path); each call returns a
    fresh namespace and console.

    Args:
        markdown_path: Path to the OCR'd markdown file.
        ticker: Stock ticker symbol.
        year: Filing year as a string.
        sec_result: SEC result metadata for this filing.

    Returns:
        A new MarkdownReplEnvironment with an isolated console namespace.
    """
    resolved_path = markdown_path.resolve()
    markdown_text = _read_markdown(resolved_path)
    filing_type = resolved_path.stem
    # Fresh namespace per call: console mutations stay local to this env.
    namespace: dict[str, object] = {
        "ticker": ticker,
        "year": year,
        "filing_type": filing_type,
        "markdown_path": resolved_path,
        "markdown_text": markdown_text,
        "sec_result": sec_result,
    }
    console = InteractiveConsole(locals=namespace)

    return MarkdownReplEnvironment(
        ticker=ticker,
        year=year,
        filing_type=filing_type,
        markdown_path=resolved_path,
        markdown_text=markdown_text,
        namespace=namespace,
        console=console,
        sec_result=sec_result,
    )
52
+
53
+
54
+ if __name__ == "__main__":
55
+ import asyncio
56
+
57
+ from dataloader.pipeline import prepare_sec_filing_envs
58
+
59
+ envs = asyncio.run(
60
+ prepare_sec_filing_envs(
61
+ ticker="AMZN",
62
+ year="2025",
63
+ filing_type="10-K",
64
+ include_amends=True,
65
+ )
66
+ )
67
+ for env in envs:
68
+ print(env.ticker, env.year, env.filing_type, env.markdown_path)