featureSQL 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featureSQL/__init__.py +30 -0
- featureSQL/cli.py +248 -0
- featureSQL/duck.py +140 -0
- featureSQL/dump_bin.py +668 -0
- featureSQL/storage.py +220 -0
- featureSQL/utils.py +29 -0
- featureSQL/yahoo.py +334 -0
- featuresql-0.1.0.dist-info/METADATA +279 -0
- featuresql-0.1.0.dist-info/RECORD +13 -0
- featuresql-0.1.0.dist-info/WHEEL +5 -0
- featuresql-0.1.0.dist-info/licenses/LICENSE +21 -0
- featuresql-0.1.0.dist-info/licenses/LICENSE.qlib +21 -0
- featuresql-0.1.0.dist-info/top_level.txt +1 -0
featureSQL/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Top-level package for featureSQL.
|
|
2
|
+
|
|
3
|
+
This file exposes the public API and maintains the version.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
__version__ = "0.1.0"
|
|
7
|
+
|
|
8
|
+
from .cli import Run # expose for convenience
|
|
9
|
+
from .dump_bin import DumpDataAll, DumpDataUpdate
|
|
10
|
+
from .yahoo import (
|
|
11
|
+
get_calendar_list,
|
|
12
|
+
get_us_stock_symbols,
|
|
13
|
+
get_hs_stock_symbols,
|
|
14
|
+
YahooCollectorUS,
|
|
15
|
+
YahooNormalize,
|
|
16
|
+
)
|
|
17
|
+
from .utils import deco_retry
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"Run",
|
|
21
|
+
"DumpDataAll",
|
|
22
|
+
"DumpDataUpdate",
|
|
23
|
+
"get_calendar_list",
|
|
24
|
+
"get_us_stock_symbols",
|
|
25
|
+
"get_hs_stock_symbols",
|
|
26
|
+
"YahooCollectorUS",
|
|
27
|
+
"YahooNormalize",
|
|
28
|
+
"deco_retry",
|
|
29
|
+
"__version__",
|
|
30
|
+
]
|
featureSQL/cli.py
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
"""CLI entrypoint for the package.
|
|
2
|
+
|
|
3
|
+
This module delegates Yahoo logic to ``featureSQL.yahoo`` and binary dumping to
|
|
4
|
+
``featureSQL.dump_bin``. Only the user-facing ``Run`` class and the Fire
|
|
5
|
+
`main` helper remain here.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from loguru import logger
|
|
13
|
+
import fire
|
|
14
|
+
|
|
15
|
+
# import the yahoo helpers/collectors
|
|
16
|
+
from .yahoo import YahooCollectorUS, YahooNormalize, get_us_stock_symbols
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# simple CLI using fire
|
|
20
|
+
|
|
21
|
+
class Run:
    """User-facing CLI commands; dispatched as sub-commands via python-fire."""

    def __init__(self, source_dir="./source"):
        # base directory for downloaded CSV data when the individual commands
        # are not given an explicit ``data_path``
        self.source_dir = source_dir

    def download(
        self,
        region: str = "US",
        start: str = None,
        end: str = None,
        symbols: str = None,
        symbols_file: str = None,
        reload_symbols: bool = False,
        data_path: str = None,
        out_format: str = "csv",
        store_type: str = "fs",
    ):
        """Download quote CSVs and optionally run the binary dumper.

        Parameters
        ----------
        region:
            Only ``"US"`` is supported; any other value raises ``ValueError``.
        start, end:
            Date range forwarded verbatim to the collector's ``download_data``.
        symbols:
            Explicit symbols: a comma-separated string or a list/tuple (fire
            may pass either).  Ignored when ``symbols_file`` is given.
        symbols_file:
            Newline-separated symbol list.  When the file is missing, or when
            ``reload_symbols`` is true, symbols are refetched via
            ``get_us_stock_symbols`` and written back to this path.
        reload_symbols:
            Force refetching the symbol universe even if ``symbols_file`` exists.
        data_path:
            Base directory/prefix for the storage backend; falls back to
            ``self.source_dir``.
        out_format:
            ``"bin"`` or ``"dump"`` additionally produces a binary dump after
            downloading; anything else leaves CSVs only.
        store_type:
            Storage backend selector passed to ``get_storage`` (e.g. ``"fs"``).
        """
        from .storage import get_storage
        store = get_storage(store_type, data_path)

        # determine symbol_list either from explicit symbols or file
        sym_list = None
        if symbols_file:
            path = symbols_file
            if store_type == "fs":
                # local filesystem: read directly via pathlib
                path_obj = Path(symbols_file).expanduser()
                if path_obj.exists():
                    sym_list = [s.strip().upper() for s in path_obj.read_text().splitlines() if s.strip()]
                else:
                    logger.warning(f"symbols_file {path} does not exist")
            else:
                # object-storage backends go through the store abstraction
                if store.exists(path):
                    sym_list = [s.strip().upper() for s in store.read_text(path).splitlines() if s.strip()]
                else:
                    logger.warning(f"symbols_file {path} does not exist in {store_type}")

            # if the file exists but is empty, we still want to treat that as an
            # intentional (albeit odd) request to download nothing rather than
            # blow up and fetch the full universe.
            if reload_symbols or sym_list is None:
                # fetch a fresh symbol universe and persist it back to ``path``
                # (best effort: a write failure only logs a warning).
                # NOTE(review): the write-back also happens when the file was
                # merely missing, not only when reload_symbols is set — confirm
                # this is intended.
                sym_list = get_us_stock_symbols(reload=True, data_path=data_path, store=store)
                try:
                    store.write_text(path, "\n".join(sym_list))
                except Exception:
                    logger.warning(f"could not write symbol file {path}")
        elif symbols:
            # fire may give us a list/tuple, or a comma string
            if isinstance(symbols, (list, tuple)):
                sym_list = [s.strip().upper() for s in symbols if isinstance(s, str) and s.strip()]
            else:
                sym_list = [s.strip().upper() for s in str(symbols).split(",") if s.strip()]

        # if the caller passed a data_path we use that as the base;
        # otherwise fall back to the configured source directory. the
        # storage backend will interpret the base string appropriately (e.g.
        # a bucket name/prefix for GCS).
        base = data_path if data_path is not None else self.source_dir
        csv_dir = store.joinpath(base, "feature-csv")

        if region.upper() == "US":
            collector = YahooCollectorUS(str(csv_dir), symbol_list=sym_list, store=store)
        else:
            raise ValueError("region not supported")
        collector.download_data(start=start, end=end)

        # optionally produce binary dump if requested
        if out_format.lower() in ("bin", "dump"):
            try:
                # import from the package rather than a top-level module
                from .dump_bin import DumpDataUpdate, DumpDataAll

                dump_dir = data_path if data_path is not None else csv_dir
                # decide whether to do a full initial dump or an update; the
                # former is required if the target directory does not yet
                # contain a calendar file.
                cal_file = store.joinpath(dump_dir, "calendars", "day.txt")
                if store.exists(cal_file):
                    dumper = DumpDataUpdate(
                        data_path=str(csv_dir),
                        dump_dir=dump_dir,
                        exclude_fields="symbol,date",
                        store_type=store_type,
                    )
                else:
                    dumper = DumpDataAll(
                        data_path=str(csv_dir),
                        dump_dir=dump_dir,
                        exclude_fields="symbol,date",
                        store_type=store_type,
                    )
                dumper.dump()
            except Exception as e:
                # best effort: a failed dump must not undo a successful download
                logger.warning(f"unable to perform binary dump: {e}")

    def normalize(self, source_dir: str = None, store_type: str = "fs"):
        """Rewrite every ``*.csv`` under ``source_dir`` in place after passing
        it through ``YahooNormalize.normalize_yahoo``."""
        from .storage import get_storage
        store = get_storage(store_type, source_dir or self.source_dir)
        src = source_dir or self.source_dir

        # This uses pandas directly heavily, we will adapt it
        import io
        for csv_path in store.glob(src, "*.csv"):
            if store_type == "fs":
                df = pd.read_csv(csv_path)
            else:
                # non-fs backends hand back raw bytes; wrap them for pandas
                csv_bytes = store.read_bytes(csv_path)
                df = pd.read_csv(io.BytesIO(csv_bytes))

            df2 = YahooNormalize.normalize_yahoo(df)

            if store_type == "fs":
                df2.to_csv(csv_path, index=False)
            else:
                # serialize to a text buffer, then upload through the store
                csv_buffer = io.StringIO()
                df2.to_csv(csv_buffer, index=False)
                store.write_text(csv_path, csv_buffer.getvalue())

    def view(self, bin_file: str, calendar_file: str = None, store_type: str = "fs", data_path: str = None):
        """Print basic information about a binary feature file.

        The bin format starts with a 4-byte date index followed by
        little-endian floats. The date index is an offset into a calendar
        file (one date per line) that lives in the dataset root under
        ``calendars/day.txt``. When a `calendar_file` path is supplied (or if
        the code can automatically locate one by traversing upwards from the
        bin file location) this helper will print the corresponding date for
        each value.
        """
        from .storage import get_storage
        # for gcs, bucket is data_path
        store = get_storage(store_type, data_path)

        if not store.exists(bin_file):
            logger.error(f"file not found: {bin_file}")
            return
        try:
            if store_type == "fs":
                arr = np.fromfile(bin_file, dtype="<f")
            else:
                arr = np.frombuffer(store.read_bytes(bin_file), dtype="<f")
        except Exception as e:
            logger.error(f"unable to read file {bin_file}: {e}")
            return
        if arr.size == 0:
            print(f"{bin_file} is empty")
            return
        # first float is the calendar offset; the remainder are the values
        date_index = int(arr[0])
        values = arr[1:]
        print(f"date index: {date_index}, values shape: {values.shape}")

        dates = None
        # locate calendar if not given
        if calendar_file:
            cal_path = calendar_file
        else:
            if store_type == "fs":
                path = Path(bin_file).expanduser()
                cal_path = path
                # walk at most 5 directory levels upwards looking for
                # calendars/day.txt next to (or above) the bin file
                for _ in range(5):
                    cal_path = cal_path.parent
                    candidate = cal_path.joinpath("calendars/day.txt")
                    if candidate.exists():
                        cal_path = candidate
                        break
                else:
                    # for-else: loop finished without break => nothing found
                    cal_path = None
                if cal_path: cal_path = str(cal_path)
            else:
                # GCS is flat, try removing segments
                parts = bin_file.split("/")
                cal_path = None
                for i in range(len(parts)-1, 0, -1):
                    prefix = "/".join(parts[:i])
                    candidate = f"{prefix}/calendars/day.txt" if prefix else "calendars/day.txt"
                    if store.exists(candidate):
                        cal_path = candidate
                        break

        if cal_path and store.exists(cal_path):
            try:
                dates = [line.strip() for line in store.read_text(cal_path).splitlines() if line.strip()]
            except Exception:
                # unreadable calendar: silently fall back to raw values below
                dates = None
        if dates is not None:
            # print date mapping for each value
            offset = date_index
            for i, val in enumerate(values):
                idx = offset + i
                date_str = dates[idx] if idx < len(dates) else "<out of range>"
                print(f"{date_str}: {val}")
        else:
            print(values)

    def query(
        self,
        sql: str,
        data_path: str = None,
        max_symbols: int = None,
        max_memory: int = None,
        store_type: str = "fs",
    ):
        """Execute an SQL query over the binary dataset using DuckDB.

        The query engine lazily loads symbol directories as needed and uses an
        LRU cache to limit memory/number of symbols. ``sql`` should be a
        valid SQL string referencing symbol names as table names.
        """
        from .duck import DuckQueryService, LRUCache
        from .storage import get_storage

        base = data_path if data_path is not None else self.source_dir
        store = get_storage(store_type, base)

        # per-invocation cache/service; nothing is shared across commands
        cache = LRUCache(max_symbols=max_symbols, max_memory=max_memory)
        svc = DuckQueryService(base, cache=cache, store=store)
        df = svc.execute(sql)
        # pretty print result
        print(df.to_string(index=False))
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def main():
    """Console-script entry point: hand :class:`Run` to python-fire, which
    exposes its public methods (``download``, ``normalize``, ``view``,
    ``query``) as CLI sub-commands."""
    fire.Fire(Run)


# ensure the CLI runs when the module is executed directly
if __name__ == "__main__":
    main()
|
featureSQL/duck.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""Simple SQL service using DuckDB with lazy-loaded symbol data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections import OrderedDict
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
import re
|
|
8
|
+
from typing import Callable, Dict, Optional
|
|
9
|
+
|
|
10
|
+
import duckdb
|
|
11
|
+
import numpy as np
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class LRUCache:
    """Least-recently-used cache of per-symbol DataFrames.

    Entries map ``symbol -> {"df": frame, "mem": approx_bytes}``.  After every
    insertion the oldest entries are dropped until both configured budgets are
    respected:

    * ``max_symbols``: maximum number of distinct symbols to retain
    * ``max_memory``: approximate total bytes of DataFrame memory to keep
    """

    def __init__(
        self,
        max_symbols: Optional[int] = None,
        max_memory: Optional[int] = None,
    ):
        self.max_symbols = max_symbols
        self.max_memory = max_memory
        # insertion order doubles as recency order: oldest first, newest last
        self._cache: "OrderedDict[str, Dict[str, object]]" = OrderedDict()
        self._total_memory = 0

    def get(self, key: str, loader: Callable[[str], pd.DataFrame]) -> pd.DataFrame:
        """Return the frame cached under ``key``, invoking ``loader`` on a miss."""
        entry = self._cache.get(key)
        if entry is not None:
            # cache hit: mark as most recently used and hand back the frame
            self._cache.move_to_end(key)
            return entry["df"]

        frame = loader(key)
        footprint = frame.memory_usage(deep=True).sum()
        self._cache[key] = {"df": frame, "mem": footprint}
        self._total_memory += footprint
        self._evict_if_needed()
        return frame

    def _evict_if_needed(self) -> None:
        """Drop least-recently-used entries until both budgets are satisfied."""
        while self._cache and self._over_budget():
            _, victim = self._cache.popitem(last=False)
            self._total_memory -= victim["mem"]

    def _over_budget(self) -> bool:
        """True when either the symbol-count or the memory budget is exceeded."""
        if self.max_symbols and len(self._cache) > self.max_symbols:
            return True
        return bool(self.max_memory and self._total_memory > self.max_memory)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class DuckQueryService:
    """SQL service using :mod:`duckdb` over symbol bin files.

    Each symbol corresponds to a subdirectory under ``root``; that directory
    contains one ``<field>.day.bin`` file per numeric column. When a query
    references a symbol we lazily read its bins into a DataFrame and register
    it with DuckDB. The :class:`LRUCache` keeps only a limited number of
    symbols in memory.
    """

    # match `FROM foo` or `JOIN foo` so that symbols used in joins are
    # loaded as well. we deliberately avoid more complex SQL parsing;
    # a simple regex is sufficient for the limited syntax we expect.
    SYMBOL_RE = re.compile(r"\b(?:from|join)\s+([A-Za-z0-9_]+)\b", re.IGNORECASE)

    # suffix shared by all per-field bin files
    _BIN_SUFFIX = ".day.bin"

    def __init__(self, root: str, cache: Optional[LRUCache] = None, store=None):
        """Create a query service rooted at ``root``.

        Parameters
        ----------
        root:
            Base directory (or object-store prefix) of the dataset.
        cache:
            Optional LRU cache; a fresh unbounded one is created by default.
        store:
            Storage backend; defaults to the local file-system store.
        """
        from .storage import get_storage
        self.store = store if store else get_storage("fs")
        self.root = str(root)
        self.cache = cache or LRUCache()
        self._conn = duckdb.connect()
        # try to read calendar file (optional); when present it lets us attach
        # a human-readable date column to every loaded symbol frame
        cal_path = self.store.joinpath(self.root, "calendars", "day.txt")
        self._calendar: Optional[list] = None
        if self.store.exists(cal_path):
            self._calendar = [line.strip() for line in self.store.read_text(cal_path).splitlines() if line.strip()]

    def _is_fs_store(self) -> bool:
        """True when the backend is the local file-system store."""
        # plain relative import instead of the previous `__import__(...)` hack
        from .storage import FileSystemStore
        return isinstance(self.store, FileSystemStore)

    def _load_symbol_df(self, symbol: str) -> pd.DataFrame:
        """Read all bin files for ``symbol`` and return a DataFrame.

        Raises ``FileNotFoundError`` when no non-empty bin file is found.
        """
        is_fs = self._is_fs_store()
        symbol_dir = self.store.joinpath(self.root, "features", symbol.lower())

        # normalize for glob: remove any leading slash which confuses
        # object-storage backends. file system paths should be left alone.
        if is_fs:
            glob_path = symbol_dir
        else:
            glob_path = symbol_dir.lstrip("/") if isinstance(symbol_dir, str) else symbol_dir

        cols: Dict[str, np.ndarray] = {}
        for binfile in self.store.glob(glob_path, "*.day.bin"):
            # basename (tolerating backslash separators) minus the trailing
            # `.day.bin` gives the field/column name; stripping only the
            # suffix avoids mangling names that contain the suffix text
            name = str(binfile).replace("\\", "/").rsplit("/", 1)[-1]
            field = name[: -len(self._BIN_SUFFIX)] if name.endswith(self._BIN_SUFFIX) else name

            if is_fs:
                arr = np.fromfile(binfile, dtype="<f")
            else:
                arr = np.frombuffer(self.store.read_bytes(binfile), dtype="<f")

            if arr.size == 0:
                continue
            # first element is date index; convert to calendar date if available
            data = arr[1:]
            if self._calendar is not None:
                # produce a date column once, from the first non-empty field
                cols.setdefault("date", pd.Series([self._calendar[int(arr[0]) + i] for i in range(len(data))]))
            cols[field] = data

        if not cols:
            raise FileNotFoundError(f"symbol directory/files not found or empty: {symbol_dir}")

        return pd.DataFrame(cols)

    def _ensure_symbols(self, sql: str) -> None:
        """Lazily load and register every symbol referenced by ``sql``."""
        symbols = set(self.SYMBOL_RE.findall(sql))
        for sym in symbols:
            # load using lowercase key to keep cache consistent
            df = self.cache.get(sym.lower(), self._load_symbol_df)
            # register table name using original capitalization from query
            self._conn.register(sym, df)

    def execute(self, sql: str) -> pd.DataFrame:
        """Run ``sql`` after loading any referenced symbols.

        The returned DataFrame comes from DuckDB's result set.
        """
        self._ensure_symbols(sql)
        return self._conn.execute(sql).df()
|