finance_data_llm 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- finance_data_llm-0.1.0/PKG-INFO +235 -0
- finance_data_llm-0.1.0/README.md +220 -0
- finance_data_llm-0.1.0/dataloader/__init__.py +23 -0
- finance_data_llm-0.1.0/dataloader/pipeline.py +132 -0
- finance_data_llm-0.1.0/dataloader/repl_env.py +68 -0
- finance_data_llm-0.1.0/dataloader/text_splitter.py +163 -0
- finance_data_llm-0.1.0/dataloader/vector_store.py +400 -0
- finance_data_llm-0.1.0/earnings_transcripts/__init__.py +0 -0
- finance_data_llm-0.1.0/earnings_transcripts/transcripts.py +346 -0
- finance_data_llm-0.1.0/filings/__init__.py +15 -0
- finance_data_llm-0.1.0/filings/sec_data.py +204 -0
- finance_data_llm-0.1.0/filings/utils.py +277 -0
- finance_data_llm-0.1.0/finance_data/__init__.py +27 -0
- finance_data_llm-0.1.0/finance_data/api.py +87 -0
- finance_data_llm-0.1.0/finance_data/app.py +11 -0
- finance_data_llm-0.1.0/finance_data/cli.py +12 -0
- finance_data_llm-0.1.0/finance_data_llm.egg-info/PKG-INFO +235 -0
- finance_data_llm-0.1.0/finance_data_llm.egg-info/SOURCES.txt +27 -0
- finance_data_llm-0.1.0/finance_data_llm.egg-info/dependency_links.txt +1 -0
- finance_data_llm-0.1.0/finance_data_llm.egg-info/entry_points.txt +2 -0
- finance_data_llm-0.1.0/finance_data_llm.egg-info/requires.txt +8 -0
- finance_data_llm-0.1.0/finance_data_llm.egg-info/top_level.txt +9 -0
- finance_data_llm-0.1.0/mcp_server.py +360 -0
- finance_data_llm-0.1.0/ocr/__init__.py +0 -0
- finance_data_llm-0.1.0/ocr/olmocr_pipeline.py +1501 -0
- finance_data_llm-0.1.0/pyproject.toml +50 -0
- finance_data_llm-0.1.0/server.py +292 -0
- finance_data_llm-0.1.0/settings.py +31 -0
- finance_data_llm-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: finance_data_llm
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fetch SEC filings and earnings-call transcripts, OCR them to markdown, and index them for LLM retrieval
|
|
5
|
+
Requires-Python: <3.14,>=3.13
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: fastapi>=0.115.0
|
|
8
|
+
Requires-Dist: pydantic-settings>=2.6.0
|
|
9
|
+
Requires-Dist: uvicorn[standard]>=0.32.0
|
|
10
|
+
Requires-Dist: loguru>=0.7.3
|
|
11
|
+
Requires-Dist: ratelimit>=2.2.1
|
|
12
|
+
Requires-Dist: playwright>=1.49
|
|
13
|
+
Requires-Dist: yfinance>=1.2.0
|
|
14
|
+
Requires-Dist: numpy>=1.26
|
|
15
|
+
|
|
16
|
+
# SEC-filings-Markdown
|
|
17
|
+
|
|
18
|
+
## Configuration
|
|
19
|
+
|
|
20
|
+
Settings are loaded via Pydantic Settings from environment variables or a `.env` file:
|
|
21
|
+
|
|
22
|
+
| Variable | Description | Default |
|
|
23
|
+
|----------|-------------|---------|
|
|
24
|
+
| `SEC_API_ORGANIZATION` | Organization name for SEC API User-Agent | `Your-Organization` |
|
|
25
|
+
| `SEC_API_EMAIL` | Contact email for SEC API User-Agent | `your-email@example.com` |
|
|
26
|
+
| `OLMOCR_SERVER` | vLLM server URL for olmOCR | `http://localhost:8000/v1` |
|
|
27
|
+
| `OLMOCR_MODEL` | Model name for olmOCR | `allenai/olmOCR-2-7B-1025-FP8` |
|
|
28
|
+
| `OLMOCR_WORKSPACE` | Workspace directory for OCR output | `./localworkspace` |
|
|
29
|
+
| `EARNINGS_TRANSCRIPTS_DIR` | Directory for fetched transcript JSONL files | `earnings_transcripts_data` |
|
|
30
|
+
| `EMBEDDING_SERVER` | OpenAI-compatible embedding API (e.g. vLLM pooling) | `http://127.0.0.1:8888/v1` |
|
|
31
|
+
| `EMBEDDING_MODEL` | Model id passed to the embedding server | `Qwen/Qwen3-Embedding-0.6B` |
|
|
32
|
+
| `CHROMA_PERSIST_DIR` | ChromaDB persistence directory | `./chroma_db` |
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
## MCP server
|
|
36
|
+
|
|
37
|
+
This repository includes an MCP server at `mcp_server.py` that exposes the same operational functions as `server.py` (SEC fetch, OCR, embedding, and search), plus file exploration tools for PDFs, JSONL, markdown, and other artifacts under configured data roots.
|
|
38
|
+
|
|
39
|
+
Run it with the MCP dependency group:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
uv run --group mcp python mcp_server.py
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Key exploration tools exposed to MCP clients:
|
|
46
|
+
|
|
47
|
+
- `list_data_roots_tool`: shows root directories available for browsing.
|
|
48
|
+
- `list_data_files_tool`: glob file listing (for example `**/*.pdf`, `**/*.jsonl`).
|
|
49
|
+
- `read_data_file_tool`: reads text-based files directly and provides metadata/preview for binary files.
|
|
50
|
+
|
|
51
|
+
## Docker
|
|
52
|
+
|
|
53
|
+
### Build
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
docker build -t sec-filings-md .
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
The image now defaults to a smaller footprint by using the CUDA runtime base while still preinstalling Playwright Chromium for scraping.
|
|
60
|
+
If you want to skip Playwright browser installation (to reduce image size further), build with:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
docker build --build-arg INSTALL_PLAYWRIGHT_BROWSER=0 -t sec-filings-md .
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Or via Makefile:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
make docker-build
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Run
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
GPU_DEVICE=${GPU_DEVICE:-3}
|
|
76
|
+
docker run --gpus device=${GPU_DEVICE} \
|
|
77
|
+
-e SEC_API_ORGANIZATION="Your-Organization" \
|
|
78
|
+
-e SEC_API_EMAIL="your-email@example.com" \
|
|
79
|
+
-v ./sec_data:/app/sec_data \
|
|
80
|
+
-v ./localworkspace:/app/localworkspace \
|
|
81
|
+
-p 8081:8081 \
|
|
82
|
+
sec-filings-md
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Or via Makefile (build + run in one step):
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
make docker-start
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Makefile overrides:
|
|
92
|
+
|
|
93
|
+
| Variable | Description | Default |
|
|
94
|
+
|----------|-------------|---------|
|
|
95
|
+
| `IMAGE_NAME` | Docker image name | `sec-filings-md` |
|
|
96
|
+
| `GPU_DEVICE` | GPU device index | `0` |
|
|
97
|
+
| `API_PORT` | Host port for API | `8081` |
|
|
98
|
+
| `SEC_API_ORGANIZATION` | SEC API User-Agent org | `Your-Organization` |
|
|
99
|
+
| `SEC_API_EMAIL` | SEC API contact email | `your-email@example.com` |
|
|
100
|
+
|
|
101
|
+
Example with overrides:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
make docker-start GPU_DEVICE=3 SEC_API_EMAIL="you@example.com"
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
The two volumes persist data across container restarts:
|
|
108
|
+
|
|
109
|
+
| Volume | Container path | Purpose |
|
|
110
|
+
|--------|---------------|---------|
|
|
111
|
+
| `sec_data` | `/app/sec_data` | Downloaded SEC filing PDFs |
|
|
112
|
+
| `localworkspace` | `/app/localworkspace` | OCR workspace and output markdown |
|
|
113
|
+
|
|
114
|
+
Override the workspace path at runtime with `-e OLMOCR_WORKSPACE=/custom/path`.
|
|
115
|
+
|
|
116
|
+
## Installation
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
uv sync
|
|
120
|
+
playwright install chromium
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Install OCR/markdown + embedding stack dependencies when you need those pipelines:
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
uv sync --group ocr-md
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
Package install (for publishing/consuming from PyPI):
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
pip install finance_data_llm
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
Use package functions directly from Python (no server process required):
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
from finance_data import (
|
|
139
|
+
company_name_to_ticker,
|
|
140
|
+
fetch_sec_filings_sync,
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
ticker = company_name_to_ticker("Amazon")
|
|
144
|
+
filings = fetch_sec_filings_sync(ticker=ticker or "AMZN", year="2025")
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
If you do want to run the API, use the packaged console script:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
finance-data-llm-server
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Usage
|
|
154
|
+
|
|
155
|
+
Start vLLM server:
|
|
156
|
+
```bash
|
|
157
|
+
make vllm-olmocr-serve
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Benchmark vLLM with guidellm (start the vLLM server first, then in another terminal):
|
|
161
|
+
```bash
|
|
162
|
+
make guidellm-benchmark
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Fetch SEC filings:
|
|
166
|
+
```bash
|
|
167
|
+
uv run python -m filings.sec_data --ticker AMZN --year 2025
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
Run OCR pipeline:
|
|
171
|
+
```bash
|
|
172
|
+
uv run python ocr/olmocr_pipeline.py --pdf-dir sec_data/AMZN-2025
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## Earnings call transcripts
|
|
176
|
+
|
|
177
|
+
Transcripts are scraped from [discountingcashflows.com](https://discountingcashflows.com) (Playwright + Chromium). Each quarter is saved as one JSONL file under `{EARNINGS_TRANSCRIPTS_DIR}/{TICKER}/{year}/Q{n}.jsonl`.
|
|
178
|
+
|
|
179
|
+
### 1. Fetch transcripts
|
|
180
|
+
|
|
181
|
+
**CLI** (writes files under `earnings_transcripts_data` by default):
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
uv run python -m earnings_transcripts.transcripts AMZN 2025
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
Optional: `--max-concurrency` (default `4`) to limit parallel quarter fetches.
|
|
188
|
+
|
|
189
|
+
**HTTP** (same fetch + persist, with the API running):
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
curl -s -X POST "http://127.0.0.1:8081/earnings_transcripts/for_year" \
|
|
193
|
+
-H "Content-Type: application/json" \
|
|
194
|
+
-d '{"ticker":"AMZN","year":2025}'
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Response body is a JSON array of transcript objects (`ticker`, `year`, `quarter_num`, `date`, `speaker_texts`, …).
|
|
198
|
+
|
|
199
|
+
### 2. Start embedding server and API
|
|
200
|
+
|
|
201
|
+
Transcript chunks are embedded with the same OpenAI-compatible embedding endpoint as SEC filings (`EMBEDDING_SERVER` / `EMBEDDING_MODEL`). In one terminal:
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
make vllm-embd-serve
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
In another:
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
make start-server
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
(Adjust `API_PORT` / `EMBD_PORT` in the `Makefile` or your environment if needed.)
|
|
214
|
+
|
|
215
|
+
### 3. Index transcripts in Chroma
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
curl -s -X POST "http://127.0.0.1:8081/vector_store/embed_transcripts" \
|
|
219
|
+
-H "Content-Type: application/json" \
|
|
220
|
+
-d '{"ticker":"AMZN","year":"2025","force":false}'
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
Use `"force": true` to replace existing vectors for those quarters. Filing types in the index appear as `Q1`–`Q4`.
|
|
224
|
+
|
|
225
|
+
### 4. Search across indexed quarters
|
|
226
|
+
|
|
227
|
+
Search merges hits from all transcript quarters present for that ticker/year:
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
curl -s -X POST "http://127.0.0.1:8081/vector_store/search_transcripts" \
|
|
231
|
+
-H "Content-Type: application/json" \
|
|
232
|
+
-d '{"ticker":"AMZN","year":"2025","query":"AWS revenue growth","top_k":5}'
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
Each result includes `filing_type` (`Q1`, …) so you can see which call the chunk came from.
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
# SEC-filings-Markdown
|
|
2
|
+
|
|
3
|
+
## Configuration
|
|
4
|
+
|
|
5
|
+
Settings are loaded via Pydantic Settings from environment variables or a `.env` file:
|
|
6
|
+
|
|
7
|
+
| Variable | Description | Default |
|
|
8
|
+
|----------|-------------|---------|
|
|
9
|
+
| `SEC_API_ORGANIZATION` | Organization name for SEC API User-Agent | `Your-Organization` |
|
|
10
|
+
| `SEC_API_EMAIL` | Contact email for SEC API User-Agent | `your-email@example.com` |
|
|
11
|
+
| `OLMOCR_SERVER` | vLLM server URL for olmOCR | `http://localhost:8000/v1` |
|
|
12
|
+
| `OLMOCR_MODEL` | Model name for olmOCR | `allenai/olmOCR-2-7B-1025-FP8` |
|
|
13
|
+
| `OLMOCR_WORKSPACE` | Workspace directory for OCR output | `./localworkspace` |
|
|
14
|
+
| `EARNINGS_TRANSCRIPTS_DIR` | Directory for fetched transcript JSONL files | `earnings_transcripts_data` |
|
|
15
|
+
| `EMBEDDING_SERVER` | OpenAI-compatible embedding API (e.g. vLLM pooling) | `http://127.0.0.1:8888/v1` |
|
|
16
|
+
| `EMBEDDING_MODEL` | Model id passed to the embedding server | `Qwen/Qwen3-Embedding-0.6B` |
|
|
17
|
+
| `CHROMA_PERSIST_DIR` | ChromaDB persistence directory | `./chroma_db` |
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
## MCP server
|
|
21
|
+
|
|
22
|
+
This repository includes an MCP server at `mcp_server.py` that exposes the same operational functions as `server.py` (SEC fetch, OCR, embedding, and search), plus file exploration tools for PDFs, JSONL, markdown, and other artifacts under configured data roots.
|
|
23
|
+
|
|
24
|
+
Run it with the MCP dependency group:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
uv run --group mcp python mcp_server.py
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Key exploration tools exposed to MCP clients:
|
|
31
|
+
|
|
32
|
+
- `list_data_roots_tool`: shows root directories available for browsing.
|
|
33
|
+
- `list_data_files_tool`: glob file listing (for example `**/*.pdf`, `**/*.jsonl`).
|
|
34
|
+
- `read_data_file_tool`: reads text-based files directly and provides metadata/preview for binary files.
|
|
35
|
+
|
|
36
|
+
## Docker
|
|
37
|
+
|
|
38
|
+
### Build
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
docker build -t sec-filings-md .
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
The image now defaults to a smaller footprint by using the CUDA runtime base while still preinstalling Playwright Chromium for scraping.
|
|
45
|
+
If you want to skip Playwright browser installation (to reduce image size further), build with:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
docker build --build-arg INSTALL_PLAYWRIGHT_BROWSER=0 -t sec-filings-md .
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Or via Makefile:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
make docker-build
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Run
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
GPU_DEVICE=${GPU_DEVICE:-3}
|
|
61
|
+
docker run --gpus device=${GPU_DEVICE} \
|
|
62
|
+
-e SEC_API_ORGANIZATION="Your-Organization" \
|
|
63
|
+
-e SEC_API_EMAIL="your-email@example.com" \
|
|
64
|
+
-v ./sec_data:/app/sec_data \
|
|
65
|
+
-v ./localworkspace:/app/localworkspace \
|
|
66
|
+
-p 8081:8081 \
|
|
67
|
+
sec-filings-md
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Or via Makefile (build + run in one step):
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
make docker-start
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Makefile overrides:
|
|
77
|
+
|
|
78
|
+
| Variable | Description | Default |
|
|
79
|
+
|----------|-------------|---------|
|
|
80
|
+
| `IMAGE_NAME` | Docker image name | `sec-filings-md` |
|
|
81
|
+
| `GPU_DEVICE` | GPU device index | `0` |
|
|
82
|
+
| `API_PORT` | Host port for API | `8081` |
|
|
83
|
+
| `SEC_API_ORGANIZATION` | SEC API User-Agent org | `Your-Organization` |
|
|
84
|
+
| `SEC_API_EMAIL` | SEC API contact email | `your-email@example.com` |
|
|
85
|
+
|
|
86
|
+
Example with overrides:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
make docker-start GPU_DEVICE=3 SEC_API_EMAIL="you@example.com"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
The two volumes persist data across container restarts:
|
|
93
|
+
|
|
94
|
+
| Volume | Container path | Purpose |
|
|
95
|
+
|--------|---------------|---------|
|
|
96
|
+
| `sec_data` | `/app/sec_data` | Downloaded SEC filing PDFs |
|
|
97
|
+
| `localworkspace` | `/app/localworkspace` | OCR workspace and output markdown |
|
|
98
|
+
|
|
99
|
+
Override the workspace path at runtime with `-e OLMOCR_WORKSPACE=/custom/path`.
|
|
100
|
+
|
|
101
|
+
## Installation
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
uv sync
|
|
105
|
+
playwright install chromium
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Install OCR/markdown + embedding stack dependencies when you need those pipelines:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
uv sync --group ocr-md
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Package install (for publishing/consuming from PyPI):
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
pip install finance_data_llm
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Use package functions directly from Python (no server process required):
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from finance_data import (
|
|
124
|
+
company_name_to_ticker,
|
|
125
|
+
fetch_sec_filings_sync,
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
ticker = company_name_to_ticker("Amazon")
|
|
129
|
+
filings = fetch_sec_filings_sync(ticker=ticker or "AMZN", year="2025")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
If you do want to run the API, use the packaged console script:
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
finance-data-llm-server
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Usage
|
|
139
|
+
|
|
140
|
+
Start vLLM server:
|
|
141
|
+
```bash
|
|
142
|
+
make vllm-olmocr-serve
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Benchmark vLLM with guidellm (start the vLLM server first, then in another terminal):
|
|
146
|
+
```bash
|
|
147
|
+
make guidellm-benchmark
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
Fetch SEC filings:
|
|
151
|
+
```bash
|
|
152
|
+
uv run python -m filings.sec_data --ticker AMZN --year 2025
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
Run OCR pipeline:
|
|
156
|
+
```bash
|
|
157
|
+
uv run python ocr/olmocr_pipeline.py --pdf-dir sec_data/AMZN-2025
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## Earnings call transcripts
|
|
161
|
+
|
|
162
|
+
Transcripts are scraped from [discountingcashflows.com](https://discountingcashflows.com) (Playwright + Chromium). Each quarter is saved as one JSONL file under `{EARNINGS_TRANSCRIPTS_DIR}/{TICKER}/{year}/Q{n}.jsonl`.
|
|
163
|
+
|
|
164
|
+
### 1. Fetch transcripts
|
|
165
|
+
|
|
166
|
+
**CLI** (writes files under `earnings_transcripts_data` by default):
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
uv run python -m earnings_transcripts.transcripts AMZN 2025
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Optional: `--max-concurrency` (default `4`) to limit parallel quarter fetches.
|
|
173
|
+
|
|
174
|
+
**HTTP** (same fetch + persist, with the API running):
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
curl -s -X POST "http://127.0.0.1:8081/earnings_transcripts/for_year" \
|
|
178
|
+
-H "Content-Type: application/json" \
|
|
179
|
+
-d '{"ticker":"AMZN","year":2025}'
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Response body is a JSON array of transcript objects (`ticker`, `year`, `quarter_num`, `date`, `speaker_texts`, …).
|
|
183
|
+
|
|
184
|
+
### 2. Start embedding server and API
|
|
185
|
+
|
|
186
|
+
Transcript chunks are embedded with the same OpenAI-compatible embedding endpoint as SEC filings (`EMBEDDING_SERVER` / `EMBEDDING_MODEL`). In one terminal:
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
make vllm-embd-serve
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
In another:
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
make start-server
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
(Adjust `API_PORT` / `EMBD_PORT` in the `Makefile` or your environment if needed.)
|
|
199
|
+
|
|
200
|
+
### 3. Index transcripts in Chroma
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
curl -s -X POST "http://127.0.0.1:8081/vector_store/embed_transcripts" \
|
|
204
|
+
-H "Content-Type: application/json" \
|
|
205
|
+
-d '{"ticker":"AMZN","year":"2025","force":false}'
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
Use `"force": true` to replace existing vectors for those quarters. Filing types in the index appear as `Q1`–`Q4`.
|
|
209
|
+
|
|
210
|
+
### 4. Search across indexed quarters
|
|
211
|
+
|
|
212
|
+
Search merges hits from all transcript quarters present for that ticker/year:
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
curl -s -X POST "http://127.0.0.1:8081/vector_store/search_transcripts" \
|
|
216
|
+
-H "Content-Type: application/json" \
|
|
217
|
+
-d '{"ticker":"AMZN","year":"2025","query":"AWS revenue growth","top_k":5}'
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
Each result includes `filing_type` (`Q1`, …) so you can see which call the chunk came from.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Dataloader for SEC filings: fetch, OCR, embed, and vector search."""
|
|
2
|
+
|
|
3
|
+
from filings.utils import company_to_ticker
|
|
4
|
+
|
|
5
|
+
from .pipeline import ensure_sec_data, prepare_sec_filing_envs
|
|
6
|
+
from .repl_env import MarkdownReplEnvironment, markdown_to_repl_env
|
|
7
|
+
from .text_splitter import Chunk, chunk_markdown
|
|
8
|
+
from .vector_store import (
|
|
9
|
+
FaissVectorIndex,
|
|
10
|
+
embed_chunks,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"company_to_ticker",
|
|
15
|
+
"ensure_sec_data",
|
|
16
|
+
"prepare_sec_filing_envs",
|
|
17
|
+
"MarkdownReplEnvironment",
|
|
18
|
+
"markdown_to_repl_env",
|
|
19
|
+
"Chunk",
|
|
20
|
+
"FaissVectorIndex",
|
|
21
|
+
"chunk_markdown",
|
|
22
|
+
"embed_chunks",
|
|
23
|
+
]
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Pipeline to fetch SEC filings, run OCR, and prepare REPL environments."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from filings.sec_data import (
|
|
8
|
+
SecResults,
|
|
9
|
+
get_sec_results,
|
|
10
|
+
save_sec_results_as_pdfs,
|
|
11
|
+
)
|
|
12
|
+
from ocr.olmocr_pipeline import get_markdown_path, run_olmo_ocr
|
|
13
|
+
from settings import sec_settings
|
|
14
|
+
|
|
15
|
+
from .repl_env import MarkdownReplEnvironment, markdown_to_repl_env
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _matches_filing_type(sec_result: SecResults, filing_type: str) -> bool:
|
|
19
|
+
"""Return True if sec_result matches the requested filing type."""
|
|
20
|
+
if filing_type == "10-K":
|
|
21
|
+
return sec_result.form_name == "10-K"
|
|
22
|
+
if filing_type == "10-Q":
|
|
23
|
+
return sec_result.form_name.startswith("10-Q")
|
|
24
|
+
return sec_result.form_name == filing_type
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
async def ensure_sec_data(
    ticker: str,
    year: str,
    filing_types: list[str],
    include_amends: bool = True,
) -> tuple[list[SecResults], list[Path]]:
    """
    Make sure the requested SEC filing PDFs exist locally, downloading only
    the missing ones.

    PDFs are stored in sec_data/{ticker}-{year}/ (per filings.sec_data).

    Returns:
        (sec_results matching filing_types, paths to all PDFs)
    """
    all_results = get_sec_results(
        ticker=ticker,
        year=year,
        filing_types=filing_types,
        include_amends=include_amends,
    )
    pdf_dir = Path("sec_data") / f"{ticker}-{year}"

    # Keep only the filings whose form name matches a requested type.
    wanted = [
        result
        for result in all_results
        if any(_matches_filing_type(result, ft) for ft in filing_types)
    ]

    # Partition into PDFs already on disk vs. filings still to download.
    cached: list[Path] = []
    to_fetch: list[SecResults] = []
    for result in wanted:
        candidate = pdf_dir / f"{result.form_name}.pdf"
        if candidate.exists():
            cached.append(candidate)
        else:
            to_fetch.append(result)

    pdf_paths = list(cached)
    if to_fetch:
        pdf_paths += await save_sec_results_as_pdfs(
            sec_results=to_fetch,
            ticker=ticker,
            year=year,
        )

    return wanted, pdf_paths
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
async def prepare_sec_filing_envs(
    ticker: str,
    year: str,
    filing_type: str,
    include_amends: bool = True,
    workspace: str | Path | None = None,
) -> list[MarkdownReplEnvironment]:
    """
    Fetch SEC filings (if needed), run OCR, and return REPL environments.

    Args:
        ticker: Stock ticker symbol (e.g. "GOOG", "AAPL").
        year: Filing year (e.g. "2025").
        filing_type: One of "10-K" or "10-Q".
        include_amends: Include amended filings.
        workspace: olmOCR workspace (default from settings). Markdown is written
            to workspace/markdown/sec_data/{ticker}-{year}/...

    Returns:
        List of MarkdownReplEnvironment, one per filing whose OCR markdown
        exists (e.g. 10-K or 10-Q1..10-Q4).
    """
    ws = str(workspace or sec_settings.olmocr_workspace)
    pdf_dir = f"sec_data/{ticker}-{year}"

    matched, _pdf_paths = await ensure_sec_data(
        ticker=ticker,
        year=year,
        filing_types=[filing_type],
        include_amends=include_amends,
    )
    if not matched:
        return []

    # OCR every PDF in the filing directory; markdown lands under the workspace.
    await run_olmo_ocr(
        pdf_dir=pdf_dir,
        workspace=ws,
    )

    environments: list[MarkdownReplEnvironment] = []
    for filing in matched:
        md_path = Path(get_markdown_path(ws, f"{pdf_dir}/{filing.form_name}.pdf"))
        # OCR may not produce output for every filing; skip the gaps silently.
        if not md_path.exists():
            continue
        environments.append(
            markdown_to_repl_env(
                markdown_path=md_path,
                ticker=ticker,
                year=year,
                sec_result=filing,
            )
        )

    return environments
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from code import InteractiveConsole
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import functools
|
|
6
|
+
|
|
7
|
+
from filings.sec_data import SecResults
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class MarkdownReplEnvironment:
    """A REPL environment seeded with one SEC filing's OCR markdown.

    Bundles the filing's markdown text and metadata with an
    ``InteractiveConsole`` whose namespace pre-binds these same fields, so
    the document can be explored programmatically from the console.
    """

    # Stock ticker symbol, e.g. "AMZN".
    ticker: str
    # Filing year as a string, e.g. "2025".
    year: str
    # Filing form name derived from the markdown file stem, e.g. "10-K".
    filing_type: str
    # Resolved path of the OCR-produced markdown file.
    markdown_path: Path
    # Full markdown text of the filing.
    markdown_text: str
    # Variables pre-bound into the console (ticker, year, markdown_text, ...).
    namespace: dict[str, object]
    # Interactive console whose locals are ``namespace``.
    console: InteractiveConsole
    # The SEC filing metadata this environment was built from.
    sec_result: SecResults
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@functools.lru_cache
def markdown_to_repl_env(
    markdown_path: Path,
    ticker: str,
    year: str,
    sec_result: SecResults,
) -> MarkdownReplEnvironment:
    """Load a filing's markdown file and wrap it in a REPL environment.

    Results are memoized per argument tuple, so repeated calls with the same
    arguments return the SAME environment object (shared namespace/console).
    NOTE(review): ``lru_cache`` requires every argument — including
    ``sec_result`` — to be hashable; confirm SecResults supports hashing.
    """
    path = markdown_path.resolve()
    text = path.read_text(encoding="utf-8")
    # The file stem doubles as the filing type (e.g. "10-K", "10-Q1").
    form = path.stem

    bindings: dict[str, object] = dict(
        ticker=ticker,
        year=year,
        filing_type=form,
        markdown_path=path,
        markdown_text=text,
        sec_result=sec_result,
    )

    return MarkdownReplEnvironment(
        ticker=ticker,
        year=year,
        filing_type=form,
        markdown_path=path,
        markdown_text=text,
        namespace=bindings,
        console=InteractiveConsole(locals=bindings),
        sec_result=sec_result,
    )
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
if __name__ == "__main__":
|
|
55
|
+
import asyncio
|
|
56
|
+
|
|
57
|
+
from dataloader.pipeline import prepare_sec_filing_envs
|
|
58
|
+
|
|
59
|
+
envs = asyncio.run(
|
|
60
|
+
prepare_sec_filing_envs(
|
|
61
|
+
ticker="AMZN",
|
|
62
|
+
year="2025",
|
|
63
|
+
filing_type="10-K",
|
|
64
|
+
include_amends=True,
|
|
65
|
+
)
|
|
66
|
+
)
|
|
67
|
+
for env in envs:
|
|
68
|
+
print(env.ticker, env.year, env.filing_type, env.markdown_path)
|