featureSQL 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featuresql-0.1.0/LICENSE +21 -0
- featuresql-0.1.0/LICENSE.qlib +21 -0
- featuresql-0.1.0/PKG-INFO +279 -0
- featuresql-0.1.0/README.md +258 -0
- featuresql-0.1.0/featureSQL/__init__.py +30 -0
- featuresql-0.1.0/featureSQL/cli.py +248 -0
- featuresql-0.1.0/featureSQL/duck.py +140 -0
- featuresql-0.1.0/featureSQL/dump_bin.py +668 -0
- featuresql-0.1.0/featureSQL/storage.py +220 -0
- featuresql-0.1.0/featureSQL/utils.py +29 -0
- featuresql-0.1.0/featureSQL/yahoo.py +334 -0
- featuresql-0.1.0/featureSQL.egg-info/PKG-INFO +279 -0
- featuresql-0.1.0/featureSQL.egg-info/SOURCES.txt +21 -0
- featuresql-0.1.0/featureSQL.egg-info/dependency_links.txt +1 -0
- featuresql-0.1.0/featureSQL.egg-info/requires.txt +9 -0
- featuresql-0.1.0/featureSQL.egg-info/top_level.txt +1 -0
- featuresql-0.1.0/pyproject.toml +17 -0
- featuresql-0.1.0/setup.cfg +4 -0
- featuresql-0.1.0/setup.py +25 -0
- featuresql-0.1.0/tests/test_cli_query.py +34 -0
- featuresql-0.1.0/tests/test_duck.py +58 -0
- featuresql-0.1.0/tests/test_storage.py +204 -0
- featuresql-0.1.0/tests/test_workflows.py +261 -0
featuresql-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) Microsoft Corporation.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) Microsoft Corporation.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: featureSQL
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Add your description here
|
|
5
|
+
Author:
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
License-File: LICENSE.qlib
|
|
10
|
+
Requires-Dist: build>=1.4.0
|
|
11
|
+
Requires-Dist: duckdb>=1.4.4
|
|
12
|
+
Requires-Dist: fire>=0.7.1
|
|
13
|
+
Requires-Dist: google-auth>=2.48.0
|
|
14
|
+
Requires-Dist: google-cloud-storage>=3.9.0
|
|
15
|
+
Requires-Dist: loguru>=0.7.3
|
|
16
|
+
Requires-Dist: numpy>=2.4.2
|
|
17
|
+
Requires-Dist: pytest>=9.0.2
|
|
18
|
+
Requires-Dist: yahooquery>=2.4.1
|
|
19
|
+
Dynamic: license-file
|
|
20
|
+
Dynamic: requires-python
|
|
21
|
+
|
|
22
|
+
This repository is now packaged as the **featureSQL** Python module. You can
|
|
23
|
+
install it in one of two ways:
|
|
24
|
+
|
|
25
|
+
1. **From PyPI:**
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install featureSQL
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
This makes the `featureSQL` console script available on your PATH and lets
|
|
32
|
+
you `import featureSQL` from any Python program.
|
|
33
|
+
|
|
34
|
+
2. **From source (development mode):**
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
cd /path/to/featureSQL
|
|
38
|
+
pip install -e .
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
This installs the package in editable mode, so local edits are reflected
|
|
42
|
+
immediately without reinstalling. Useful when working on the project.
|
|
43
|
+
|
|
44
|
+
After installation you can still use the original CLI helpers directly by
|
|
45
|
+
importing:
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from featureSQL.cli import Run
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
and invoking `Run().download(...)` or `featureSQL` at the shell. The remainder
|
|
52
|
+
of this document is a how‑to reference that you can include in your own
|
|
53
|
+
projects; it covers the three common workflows you asked about.
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## 1. Download a list of symbols (OCHLVF) to CSV
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# 1. prepare a text file listing tickers, one per line
|
|
61
|
+
cat > .testsymbols.txt <<'EOF'
|
|
62
|
+
AAPL
|
|
63
|
+
AMZN
|
|
64
|
+
GOOG
|
|
65
|
+
TSLA
|
|
66
|
+
EOF
|
|
67
|
+
|
|
68
|
+
# 2. run the collector, writing CSVs into a directory (they’ll land in
|
|
69
|
+
# the `feature-csv` subfolder of whatever you pass to --data_path)
|
|
70
|
+
uv run -m featureSQL.cli download \
|
|
71
|
+
--region US \
|
|
72
|
+
--start 2026-01-01 \
|
|
73
|
+
--end 2026-02-28 \
|
|
74
|
+
--symbols_file ./.testsymbols.txt \
|
|
75
|
+
--data_path ./source \
|
|
76
|
+
--store_type fs # Output to local fs (default). Change to 'gcs' and set data_path to bucket name for GCS.
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
> **GCS setup:** when using `--store_type gcs` you must configure two
|
|
80
|
+
> environment variables before running any commands or tests:
|
|
81
|
+
>
|
|
82
|
+
> ```bash
|
|
83
|
+
> export GCS_SC_JSON='{"type":"service_account", …}' # credentials
|
|
84
|
+
> export GCS_BUCKET_NAME='your-bucket-name' # target bucket
|
|
85
|
+
> ```
|
|
86
|
+
>
|
|
87
|
+
> The unit tests will skip network‑dependent scenarios if `GCS_BUCKET_NAME`
|
|
88
|
+
> is unset, and they mock the client in other cases. However, any manual
|
|
89
|
+
> invocation against a real bucket requires both variables to be set.
|
|
90
|
+
|
|
91
|
+
*Files created*: `/path/to/target/csv/dir/feature-csv/AAPL.csv`,
|
|
92
|
+
`feature-csv/AMZN.csv`, … (i.e. `feature-csv` under the directory you
|
|
93
|
+
passed with `--data_path`). Each CSV contains the usual Open/Close/High/Low/
|
|
94
|
+
Volume/AdjClose data plus `symbol`/`date` columns.
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## 2. Convert a directory of CSVs into a binary dataset
|
|
99
|
+
|
|
100
|
+
This is the “dump all” step: it reads **all** CSV files in `data_path` and builds the calendar, instruments list and feature bins in the standard binary layout.
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
uv run -m featureSQL.dump_bin dump_all \
|
|
104
|
+
--data_path ./source/feature-csv \
|
|
105
|
+
--dump_dir ./source/ \
|
|
106
|
+
--exclude_fields symbol,date \
|
|
107
|
+
--store_type fs # don’t try to treat metadata as floats.
|
|
108
|
+
# (./source/ is /path/to/output/dir/)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
After the command finishes you’ll have a structure like:
|
|
112
|
+
|
|
113
|
+
```
|
|
114
|
+
/path/to/output/dir/
|
|
115
|
+
calendars/day.txt # trading dates
|
|
116
|
+
instruments/all.txt # ticker start/end dates
|
|
117
|
+
features/<ticker>/<field>.day.bin …
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
This directory can be consumed by any tool that understands the same layout – it’s a full **initialised** dataset.
|
|
121
|
+
|
|
122
|
+
> 💡 run `dump_all` only once per collection. To add new data later, use `dump_update` (see next section).
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## 3. Download symbols and produce the binary dataset in one go
|
|
127
|
+
|
|
128
|
+
The built‑in CLI helper (exposed via the `featureSQL` script or as
|
|
129
|
+
`python -m featureSQL.cli`) can drive both phases and even run ad‑hoc
|
|
130
|
+
queries against a binary dataset:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
uv run -m featureSQL.cli download \
|
|
134
|
+
--region US \
|
|
135
|
+
--start 2026-01-01 \
|
|
136
|
+
--end 2026-02-28 \
|
|
137
|
+
--symbols AMZN,GOOG,TSLA \
|
|
138
|
+
--data_path source \
|
|
139
|
+
--out_format bin \
|
|
140
|
+
--store_type fs
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
This will
|
|
144
|
+
|
|
145
|
+
1. fetch OCHLVF data from Yahoo and save raw CSVs under
|
|
146
|
+
`--{data_path}/feature-csv` (or `./source` if you didn’t pass
|
|
147
|
+
`--data_path`),
|
|
148
|
+
2. run the binary dumper. On the **first** invocation the helper uses
|
|
149
|
+
`DumpDataAll` to initialise the dataset; subsequent runs use
|
|
150
|
+
`DumpDataUpdate` to append new days. The target directory is the same as
|
|
151
|
+
`--data_path` and you can still provide `exclude_fields="symbol,date"`.
|
|
152
|
+
|
|
153
|
+
The first run must be preceded by an empty or non‑existent output directory
|
|
154
|
+
(and/or you can manually run `dump_all` as shown above); thereafter the
|
|
155
|
+
same command will **append** new days to the existing bins.
|
|
156
|
+
|
|
157
|
+
## 4. Query the binary dataset using SQL
|
|
158
|
+
|
|
159
|
+
Once you have an initialised dataset you can issue SQL against it. The
|
|
160
|
+
``query`` subcommand lazily loads only the symbols mentioned in the query
|
|
161
|
+
and keeps a small LRU cache in memory.
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
uv run -m featureSQL.cli query \
|
|
165
|
+
--data_path source \
|
|
166
|
+
--max_symbols 100 \
|
|
167
|
+
--max_memory 2000000000 \
|
|
168
|
+
--store_type fs \
|
|
169
|
+
"select date, open, close, high, low, volume, adjclose from AAPL where volume > 1000000"
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Joins work transparently as long as both tables have been dumped:
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
uv run -m featureSQL.cli query --data_path source --store_type fs \
|
|
176
|
+
"select a.open, n.close from AAPL a join NVDA n on a.date = n.date"
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
(The cache flags are optional; omit them to use unlimited resources.)
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
### Notes & tips
|
|
183
|
+
|
|
184
|
+
- **Symbols list:** `--symbols` accepts a comma string, Python list/tuple, or
|
|
185
|
+
`--symbols_file` path. If you provide a file that exists but contains no
|
|
186
|
+
tickers the downloader will now do nothing (previous behaviour fell through
|
|
187
|
+
to downloading the entire US universe).
|
|
188
|
+
- **Reloading tickers:** use `--reload_symbols` to refresh the cached symbol list.
|
|
189
|
+
- **Alternate flows:** to skip the binning step, omit `--out_format` or set it
|
|
190
|
+
to `yahoo` (the default).
|
|
191
|
+
|
|
192
|
+
#### Maintaining binary dataset integrity
|
|
193
|
+
|
|
194
|
+
Because the dumper appends new days based on the existing calendar, you
|
|
195
|
+
must take care when fetching overlapping or out‑of‑order date ranges:
|
|
196
|
+
|
|
197
|
+
1. **Always use a single start date that is at or before any previously
|
|
198
|
+
downloaded data.** For a continuous collection you can simply run:
|
|
199
|
+
```bash
|
|
200
|
+
uv run -m featureSQL.cli download \
|
|
201
|
+
--region US \
|
|
202
|
+
--start 2025-01-01 \
|
|
203
|
+
--end $(date +%Y-%m-%d) \
|
|
204
|
+
--symbols TSLA \
|
|
205
|
+
--data_path source --out_format bin
|
|
206
|
+
```
|
|
207
|
+
The collector will skip existing CSVs and the dumper will append only the
|
|
208
|
+
new days, keeping the existing bins and calendar consistent.
|
|
209
|
+
|
|
210
|
+
2. **If you need to back‑fill a gap or change the start date earlier:**
|
|
211
|
+
delete the old bin files (or the entire `source/features` tree) and run
|
|
212
|
+
with `--out_format bin` again (or manually use
|
|
213
|
+
`uv run -m featureSQL.dump_bin dump_all`) so that `DumpDataAll` recomputes the
|
|
214
|
+
calendar from scratch. The update mode never rewrites the date index,
|
|
215
|
+
so appending older data without rebuilding will cause the printed dates to
|
|
216
|
+
be incorrect.
|
|
217
|
+
|
|
218
|
+
3. **Automate detection if desired.**
|
|
219
|
+
You can extend the downloader to inspect the last date in existing CSVs
|
|
220
|
+
and request only missing days, and/or modify the dumper to warn when the
|
|
221
|
+
incoming data’s maximum date does not exceed the current calendar end.
|
|
222
|
+
|
|
223
|
+
Following these practices prevents “wrong” date offsets from appearing when
|
|
224
|
+
you use `uv run -m featureSQL.cli view …` (or simply `featureSQL view …`), and ensures the binary dataset remains a
|
|
225
|
+
faithful time series.
|
|
226
|
+
|
|
227
|
+
Feel free to copy‑paste these examples into your own docs or scripts!
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
## Viewing the contents of a bin file
|
|
232
|
+
|
|
233
|
+
A new CLI subcommand makes this easy without writing Python. Once you have
|
|
234
|
+
an initialised dataset you can inspect any field file with:
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
uv run -m featureSQL.cli view /path/to/output/dir/features/aapl/open.day.bin --store_type fs
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
By default the command prints the starting date index and the shape of the
|
|
241
|
+
array, followed by each raw float value. If the dataset contains a calendar
|
|
242
|
+
file (``calendars/day.txt``) the subcommand will automatically find it and
|
|
243
|
+
show the corresponding date for each value. You can also supply an explicit
|
|
244
|
+
calendar path:
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
uv run -m featureSQL.cli view path/to/bin/file --calendar_file path/to/calendars/day.txt --store_type fs
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
Internally the helper still uses `numpy.fromfile` so the Python snippet
|
|
251
|
+
below remains available if you prefer to inspect the file manually.
|
|
252
|
+
|
|
253
|
+
```python
|
|
254
|
+
import numpy as np
|
|
255
|
+
# if you have a helper for converting codes to filenames, import it here
|
|
256
|
+
from featureSQL.dump_bin import code_to_fname
|
|
257
|
+
|
|
258
|
+
# point to a particular field file (e.g. open.day.bin) for one symbol
|
|
259
|
+
bin_path = "/path/to/output/dir/features/aapl/open.day.bin"
|
|
260
|
+
arr = np.fromfile(bin_path, dtype="<f")
|
|
261
|
+
|
|
262
|
+
# first value is the date offset, the rest are data values
|
|
263
|
+
date_index = int(arr[0])
|
|
264
|
+
values = arr[1:]
|
|
265
|
+
print(date_index, values.shape)
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
Alternatively, many tools provide helpers once a dataset is loaded; you can
|
|
269
|
+
query the field of a symbol and receive a NumPy array. Use whatever API
|
|
270
|
+
your application or library supplies – the underlying files remain the same.
|
|
271
|
+
|
|
272
|
+
```python
|
|
273
|
+
# pseudo-code using a generic loader
|
|
274
|
+
data = my_loader.load("/path/to/output/dir")
|
|
275
|
+
print(data['AAPL']['open'])
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
If you just want to look at the calendar, it’s a text file under
|
|
279
|
+
`calendars/day.txt` with one date per line.
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
This repository is now packaged as the **featureSQL** Python module. You can
|
|
2
|
+
install it in one of two ways:
|
|
3
|
+
|
|
4
|
+
1. **From PyPI:**
|
|
5
|
+
|
|
6
|
+
```bash
|
|
7
|
+
pip install featureSQL
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
This makes the `featureSQL` console script available on your PATH and lets
|
|
11
|
+
you `import featureSQL` from any Python program.
|
|
12
|
+
|
|
13
|
+
2. **From source (development mode):**
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
cd /path/to/featureSQL
|
|
17
|
+
pip install -e .
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
This installs the package in editable mode, so local edits are reflected
|
|
21
|
+
immediately without reinstalling. Useful when working on the project.
|
|
22
|
+
|
|
23
|
+
After installation you can still use the original CLI helpers directly by
|
|
24
|
+
importing:
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from featureSQL.cli import Run
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
and invoking `Run().download(...)` or `featureSQL` at the shell. The remainder
|
|
31
|
+
of this document is a how‑to reference that you can include in your own
|
|
32
|
+
projects; it covers the three common workflows you asked about.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## 1. Download a list of symbols (OCHLVF) to CSV
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
# 1. prepare a text file listing tickers, one per line
|
|
40
|
+
cat > .testsymbols.txt <<'EOF'
|
|
41
|
+
AAPL
|
|
42
|
+
AMZN
|
|
43
|
+
GOOG
|
|
44
|
+
TSLA
|
|
45
|
+
EOF
|
|
46
|
+
|
|
47
|
+
# 2. run the collector, writing CSVs into a directory (they’ll land in
|
|
48
|
+
# the `feature-csv` subfolder of whatever you pass to --data_path)
|
|
49
|
+
uv run -m featureSQL.cli download \
|
|
50
|
+
--region US \
|
|
51
|
+
--start 2026-01-01 \
|
|
52
|
+
--end 2026-02-28 \
|
|
53
|
+
--symbols_file ./.testsymbols.txt \
|
|
54
|
+
--data_path ./source \
|
|
55
|
+
--store_type fs # Output to local fs (default). Change to 'gcs' and set data_path to bucket name for GCS.
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
> **GCS setup:** when using `--store_type gcs` you must configure two
|
|
59
|
+
> environment variables before running any commands or tests:
|
|
60
|
+
>
|
|
61
|
+
> ```bash
|
|
62
|
+
> export GCS_SC_JSON='{"type":"service_account", …}' # credentials
|
|
63
|
+
> export GCS_BUCKET_NAME='your-bucket-name' # target bucket
|
|
64
|
+
> ```
|
|
65
|
+
>
|
|
66
|
+
> The unit tests will skip network‑dependent scenarios if `GCS_BUCKET_NAME`
|
|
67
|
+
> is unset, and they mock the client in other cases. However, any manual
|
|
68
|
+
> invocation against a real bucket requires both variables to be set.
|
|
69
|
+
|
|
70
|
+
*Files created*: `/path/to/target/csv/dir/feature-csv/AAPL.csv`,
|
|
71
|
+
`feature-csv/AMZN.csv`, … (i.e. `feature-csv` under the directory you
|
|
72
|
+
passed with `--data_path`). Each CSV contains the usual Open/Close/High/Low/
|
|
73
|
+
Volume/AdjClose data plus `symbol`/`date` columns.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## 2. Convert a directory of CSVs into a binary dataset
|
|
78
|
+
|
|
79
|
+
This is the “dump all” step: it reads **all** CSV files in `data_path` and builds the calendar, instruments list and feature bins in the standard binary layout.
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
uv run -m featureSQL.dump_bin dump_all \
|
|
83
|
+
--data_path ./source/feature-csv \
|
|
84
|
+
--dump_dir ./source/ \
|
|
85
|
+
--exclude_fields symbol,date \
|
|
86
|
+
--store_type fs # don’t try to treat metadata as floats.
|
|
87
|
+
# (./source/ is /path/to/output/dir/)
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
After the command finishes you’ll have a structure like:
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
/path/to/output/dir/
|
|
94
|
+
calendars/day.txt # trading dates
|
|
95
|
+
instruments/all.txt # ticker start/end dates
|
|
96
|
+
features/<ticker>/<field>.day.bin …
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
This directory can be consumed by any tool that understands the same layout – it’s a full **initialised** dataset.
|
|
100
|
+
|
|
101
|
+
> 💡 run `dump_all` only once per collection. To add new data later, use `dump_update` (see next section).
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## 3. Download symbols and produce the binary dataset in one go
|
|
106
|
+
|
|
107
|
+
The built‑in CLI helper (exposed via the `featureSQL` script or as
|
|
108
|
+
`python -m featureSQL.cli`) can drive both phases and even run ad‑hoc
|
|
109
|
+
queries against a binary dataset:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
uv run -m featureSQL.cli download \
|
|
113
|
+
--region US \
|
|
114
|
+
--start 2026-01-01 \
|
|
115
|
+
--end 2026-02-28 \
|
|
116
|
+
--symbols AMZN,GOOG,TSLA \
|
|
117
|
+
--data_path source \
|
|
118
|
+
--out_format bin \
|
|
119
|
+
--store_type fs
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
This will
|
|
123
|
+
|
|
124
|
+
1. fetch OCHLVF data from Yahoo and save raw CSVs under
|
|
125
|
+
`--{data_path}/feature-csv` (or `./source` if you didn’t pass
|
|
126
|
+
`--data_path`),
|
|
127
|
+
2. run the binary dumper. On the **first** invocation the helper uses
|
|
128
|
+
`DumpDataAll` to initialise the dataset; subsequent runs use
|
|
129
|
+
`DumpDataUpdate` to append new days. The target directory is the same as
|
|
130
|
+
`--data_path` and you can still provide `exclude_fields="symbol,date"`.
|
|
131
|
+
|
|
132
|
+
The first run must be preceded by an empty or non‑existent output directory
|
|
133
|
+
(and/or you can manually run `dump_all` as shown above); thereafter the
|
|
134
|
+
same command will **append** new days to the existing bins.
|
|
135
|
+
|
|
136
|
+
## 4. Query the binary dataset using SQL
|
|
137
|
+
|
|
138
|
+
Once you have an initialised dataset you can issue SQL against it. The
|
|
139
|
+
``query`` subcommand lazily loads only the symbols mentioned in the query
|
|
140
|
+
and keeps a small LRU cache in memory.
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
uv run -m featureSQL.cli query \
|
|
144
|
+
--data_path source \
|
|
145
|
+
--max_symbols 100 \
|
|
146
|
+
--max_memory 2000000000 \
|
|
147
|
+
--store_type fs \
|
|
148
|
+
"select date, open, close, high, low, volume, adjclose from AAPL where volume > 1000000"
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Joins work transparently as long as both tables have been dumped:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
uv run -m featureSQL.cli query --data_path source --store_type fs \
|
|
155
|
+
"select a.open, n.close from AAPL a join NVDA n on a.date = n.date"
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
(The cache flags are optional; omit them to use unlimited resources.)
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
### Notes & tips
|
|
162
|
+
|
|
163
|
+
- **Symbols list:** `--symbols` accepts a comma string, Python list/tuple, or
|
|
164
|
+
`--symbols_file` path. If you provide a file that exists but contains no
|
|
165
|
+
tickers the downloader will now do nothing (previous behaviour fell through
|
|
166
|
+
to downloading the entire US universe).
|
|
167
|
+
- **Reloading tickers:** use `--reload_symbols` to refresh the cached symbol list.
|
|
168
|
+
- **Alternate flows:** to skip the binning step, omit `--out_format` or set it
|
|
169
|
+
to `yahoo` (the default).
|
|
170
|
+
|
|
171
|
+
#### Maintaining binary dataset integrity
|
|
172
|
+
|
|
173
|
+
Because the dumper appends new days based on the existing calendar, you
|
|
174
|
+
must take care when fetching overlapping or out‑of‑order date ranges:
|
|
175
|
+
|
|
176
|
+
1. **Always use a single start date that is at or before any previously
|
|
177
|
+
downloaded data.** For a continuous collection you can simply run:
|
|
178
|
+
```bash
|
|
179
|
+
uv run -m featureSQL.cli download \
|
|
180
|
+
--region US \
|
|
181
|
+
--start 2025-01-01 \
|
|
182
|
+
--end $(date +%Y-%m-%d) \
|
|
183
|
+
--symbols TSLA \
|
|
184
|
+
--data_path source --out_format bin
|
|
185
|
+
```
|
|
186
|
+
The collector will skip existing CSVs and the dumper will append only the
|
|
187
|
+
new days, keeping the existing bins and calendar consistent.
|
|
188
|
+
|
|
189
|
+
2. **If you need to back‑fill a gap or change the start date earlier:**
|
|
190
|
+
delete the old bin files (or the entire `source/features` tree) and run
|
|
191
|
+
with `--out_format bin` again (or manually use
|
|
192
|
+
`uv run -m featureSQL.dump_bin dump_all`) so that `DumpDataAll` recomputes the
|
|
193
|
+
calendar from scratch. The update mode never rewrites the date index,
|
|
194
|
+
so appending older data without rebuilding will cause the printed dates to
|
|
195
|
+
be incorrect.
|
|
196
|
+
|
|
197
|
+
3. **Automate detection if desired.**
|
|
198
|
+
You can extend the downloader to inspect the last date in existing CSVs
|
|
199
|
+
and request only missing days, and/or modify the dumper to warn when the
|
|
200
|
+
incoming data’s maximum date does not exceed the current calendar end.
|
|
201
|
+
|
|
202
|
+
Following these practices prevents “wrong” date offsets from appearing when
|
|
203
|
+
you use `uv run -m featureSQL.cli view …` (or simply `featureSQL view …`), and ensures the binary dataset remains a
|
|
204
|
+
faithful time series.
|
|
205
|
+
|
|
206
|
+
Feel free to copy‑paste these examples into your own docs or scripts!
|
|
207
|
+
|
|
208
|
+
---
|
|
209
|
+
|
|
210
|
+
## Viewing the contents of a bin file
|
|
211
|
+
|
|
212
|
+
A new CLI subcommand makes this easy without writing Python. Once you have
|
|
213
|
+
an initialised dataset you can inspect any field file with:
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
uv run -m featureSQL.cli view /path/to/output/dir/features/aapl/open.day.bin --store_type fs
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
By default the command prints the starting date index and the shape of the
|
|
220
|
+
array, followed by each raw float value. If the dataset contains a calendar
|
|
221
|
+
file (``calendars/day.txt``) the subcommand will automatically find it and
|
|
222
|
+
show the corresponding date for each value. You can also supply an explicit
|
|
223
|
+
calendar path:
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
uv run -m featureSQL.cli view path/to/bin/file --calendar_file path/to/calendars/day.txt --store_type fs
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
Internally the helper still uses `numpy.fromfile` so the Python snippet
|
|
230
|
+
below remains available if you prefer to inspect the file manually.
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
import numpy as np
|
|
234
|
+
# if you have a helper for converting codes to filenames, import it here
|
|
235
|
+
from featureSQL.dump_bin import code_to_fname
|
|
236
|
+
|
|
237
|
+
# point to a particular field file (e.g. open.day.bin) for one symbol
|
|
238
|
+
bin_path = "/path/to/output/dir/features/aapl/open.day.bin"
|
|
239
|
+
arr = np.fromfile(bin_path, dtype="<f")
|
|
240
|
+
|
|
241
|
+
# first value is the date offset, the rest are data values
|
|
242
|
+
date_index = int(arr[0])
|
|
243
|
+
values = arr[1:]
|
|
244
|
+
print(date_index, values.shape)
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
Alternatively, many tools provide helpers once a dataset is loaded; you can
|
|
248
|
+
query the field of a symbol and receive a NumPy array. Use whatever API
|
|
249
|
+
your application or library supplies – the underlying files remain the same.
|
|
250
|
+
|
|
251
|
+
```python
|
|
252
|
+
# pseudo-code using a generic loader
|
|
253
|
+
data = my_loader.load("/path/to/output/dir")
|
|
254
|
+
print(data['AAPL']['open'])
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
If you just want to look at the calendar, it’s a text file under
|
|
258
|
+
`calendars/day.txt` with one date per line.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Top-level package for featureSQL.
|
|
2
|
+
|
|
3
|
+
This file exposes the public API and maintains the version.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
__version__ = "0.1.0"
|
|
7
|
+
|
|
8
|
+
from .cli import Run # expose for convenience
|
|
9
|
+
from .dump_bin import DumpDataAll, DumpDataUpdate
|
|
10
|
+
from .yahoo import (
|
|
11
|
+
get_calendar_list,
|
|
12
|
+
get_us_stock_symbols,
|
|
13
|
+
get_hs_stock_symbols,
|
|
14
|
+
YahooCollectorUS,
|
|
15
|
+
YahooNormalize,
|
|
16
|
+
)
|
|
17
|
+
from .utils import deco_retry
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"Run",
|
|
21
|
+
"DumpDataAll",
|
|
22
|
+
"DumpDataUpdate",
|
|
23
|
+
"get_calendar_list",
|
|
24
|
+
"get_us_stock_symbols",
|
|
25
|
+
"get_hs_stock_symbols",
|
|
26
|
+
"YahooCollectorUS",
|
|
27
|
+
"YahooNormalize",
|
|
28
|
+
"deco_retry",
|
|
29
|
+
"__version__",
|
|
30
|
+
]
|