featureSQL 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
featureSQL/__init__.py ADDED
@@ -0,0 +1,30 @@
1
"""Top-level package for featureSQL.

This file exposes the public API and maintains the version.
"""

__version__ = "0.1.0"

# Re-export the public surface so callers can write
# ``from featureSQL import Run`` instead of importing submodules directly.
from .cli import Run  # expose for convenience
from .dump_bin import DumpDataAll, DumpDataUpdate
from .yahoo import (
    get_calendar_list,
    get_us_stock_symbols,
    get_hs_stock_symbols,
    YahooCollectorUS,
    YahooNormalize,
)
from .utils import deco_retry

# Names exported by ``from featureSQL import *``; keep in sync with the
# re-exports above.
__all__ = [
    "Run",
    "DumpDataAll",
    "DumpDataUpdate",
    "get_calendar_list",
    "get_us_stock_symbols",
    "get_hs_stock_symbols",
    "YahooCollectorUS",
    "YahooNormalize",
    "deco_retry",
    "__version__",
]
featureSQL/cli.py ADDED
@@ -0,0 +1,248 @@
1
+ """CLI entrypoint for the package.
2
+
3
+ This module delegates Yahoo logic to ``featureSQL.yahoo`` and binary dumping to
4
+ ``featureSQL.dump_bin``. Only the user-facing ``Run`` class and the Fire
5
+ `main` helper remain here.
6
+ """
7
+
8
+ from pathlib import Path
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ from loguru import logger
13
+ import fire
14
+
15
+ # import the yahoo helpers/collectors
16
+ from .yahoo import YahooCollectorUS, YahooNormalize, get_us_stock_symbols
17
+
18
+
19
+ # simple CLI using fire
20
+
21
class Run:
    """User-facing CLI surface; each public method becomes a Fire sub-command."""

    def __init__(self, source_dir="./source"):
        # Base directory used whenever a command does not receive an
        # explicit ``data_path``/``source_dir``.
        self.source_dir = source_dir

    def download(
        self,
        region: str = "US",
        start: str = None,
        end: str = None,
        symbols: str = None,
        symbols_file: str = None,
        reload_symbols: bool = False,
        data_path: str = None,
        out_format: str = "csv",
        store_type: str = "fs",
    ):
        """Download symbol CSVs via the Yahoo collector, optionally dumping to binary.

        The symbol universe is taken from ``symbols_file`` (one symbol per
        line) or an explicit ``symbols`` list / comma-separated string.  If a
        symbols file was requested but could not be read (or
        ``reload_symbols`` is set), the US universe is re-fetched via
        ``get_us_stock_symbols`` and written back to ``symbols_file``.  When
        ``out_format`` is ``"bin"``/``"dump"``, the resulting CSVs are also
        converted via ``dump_bin``.

        NOTE(review): if neither ``symbols_file`` nor ``symbols`` is given,
        ``symbol_list=None`` is passed to the collector — presumably it then
        downloads its full default universe; confirm against the collector.
        """
        from .storage import get_storage
        store = get_storage(store_type, data_path)

        # determine symbol_list either from explicit symbols or file
        sym_list = None
        if symbols_file:
            path = symbols_file
            if store_type == "fs":
                # Local filesystem: honour ``~`` in the path.
                path_obj = Path(symbols_file).expanduser()
                if path_obj.exists():
                    sym_list = [s.strip().upper() for s in path_obj.read_text().splitlines() if s.strip()]
                else:
                    logger.warning(f"symbols_file {path} does not exist")
            else:
                # Object-storage backends resolve paths through the store.
                if store.exists(path):
                    sym_list = [s.strip().upper() for s in store.read_text(path).splitlines() if s.strip()]
                else:
                    logger.warning(f"symbols_file {path} does not exist in {store_type}")

            # if the file exists but is empty, we still want to treat that as an
            # intentional (albeit odd) request to download nothing rather than
            # blow up and fetch the full universe.
            if reload_symbols or sym_list is None:
                # fetch fresh and optionally write back (only if explicitly
                # requested via reload_symbols)
                # NOTE(review): the comment above says write-back happens only
                # on explicit ``reload_symbols``, but the code writes back
                # whenever the universe was re-fetched (including the
                # missing-file fallback) — confirm which is intended.
                sym_list = get_us_stock_symbols(reload=True, data_path=data_path, store=store)
                try:
                    store.write_text(path, "\n".join(sym_list))
                except Exception:
                    # Best-effort write-back; a read-only store is not fatal.
                    logger.warning(f"could not write symbol file {path}")
        elif symbols:
            # fire may give us a list/tuple, or a comma string
            if isinstance(symbols, (list, tuple)):
                sym_list = [s.strip().upper() for s in symbols if isinstance(s, str) and s.strip()]
            else:
                sym_list = [s.strip().upper() for s in str(symbols).split(",") if s.strip()]

        # if the caller passed a data_path we use that as the base;
        # otherwise fall back to the configured source directory. the
        # storage backend will interpret the base string appropriately (e.g.
        # a bucket name/prefix for GCS).
        base = data_path if data_path is not None else self.source_dir
        csv_dir = store.joinpath(base, "feature-csv")

        if region.upper() == "US":
            collector = YahooCollectorUS(str(csv_dir), symbol_list=sym_list, store=store)
        else:
            raise ValueError("region not supported")
        collector.download_data(start=start, end=end)

        # optionally produce binary dump if requested
        if out_format.lower() in ("bin", "dump"):
            try:
                # import from the package rather than a top-level module
                from .dump_bin import DumpDataUpdate, DumpDataAll

                dump_dir = data_path if data_path is not None else csv_dir
                # decide whether to do a full initial dump or an update; the
                # former is required if the target directory does not yet
                # contain a calendar file.
                cal_file = store.joinpath(dump_dir, "calendars", "day.txt")
                if store.exists(cal_file):
                    dumper = DumpDataUpdate(
                        data_path=str(csv_dir),
                        dump_dir=dump_dir,
                        exclude_fields="symbol,date",
                        store_type=store_type,
                    )
                else:
                    dumper = DumpDataAll(
                        data_path=str(csv_dir),
                        dump_dir=dump_dir,
                        exclude_fields="symbol,date",
                        store_type=store_type,
                    )
                dumper.dump()
            except Exception as e:
                # The CSV download already succeeded; a failed dump is
                # reported but does not abort the command.
                logger.warning(f"unable to perform binary dump: {e}")

    def normalize(self, source_dir: str = None, store_type: str = "fs"):
        """Rewrite every ``*.csv`` under ``source_dir`` through ``YahooNormalize``.

        Files are normalized in place: each CSV is read, passed through
        ``YahooNormalize.normalize_yahoo`` and written back to the same path.
        """
        from .storage import get_storage
        store = get_storage(store_type, source_dir or self.source_dir)
        src = source_dir or self.source_dir

        # This uses pandas directly heavily, we will adapt it
        import io
        for csv_path in store.glob(src, "*.csv"):
            if store_type == "fs":
                df = pd.read_csv(csv_path)
            else:
                # Non-fs backends hand back raw bytes; wrap for pandas.
                csv_bytes = store.read_bytes(csv_path)
                df = pd.read_csv(io.BytesIO(csv_bytes))

            df2 = YahooNormalize.normalize_yahoo(df)

            if store_type == "fs":
                df2.to_csv(csv_path, index=False)
            else:
                # Serialise to text in memory, then push through the store.
                csv_buffer = io.StringIO()
                df2.to_csv(csv_buffer, index=False)
                store.write_text(csv_path, csv_buffer.getvalue())

    def view(self, bin_file: str, calendar_file: str = None, store_type: str = "fs", data_path: str = None):
        """Print basic information about a binary feature file.

        The bin format starts with a 4-byte date index followed by
        little-endian floats. The date index is an offset into a calendar
        file (one date per line) that lives in the dataset root under
        ``calendars/day.txt``. When a `calendar_file` path is supplied (or if
        the code can automatically locate one by traversing upwards from the
        bin file location) this helper will print the corresponding date for
        each value.
        """
        from .storage import get_storage
        # for gcs, bucket is data_path
        store = get_storage(store_type, data_path)

        if not store.exists(bin_file):
            logger.error(f"file not found: {bin_file}")
            return
        try:
            if store_type == "fs":
                arr = np.fromfile(bin_file, dtype="<f")
            else:
                arr = np.frombuffer(store.read_bytes(bin_file), dtype="<f")
        except Exception as e:
            logger.error(f"unable to read file {bin_file}: {e}")
            return
        if arr.size == 0:
            print(f"{bin_file} is empty")
            return
        # First float is the calendar offset; the rest are the values.
        date_index = int(arr[0])
        values = arr[1:]
        print(f"date index: {date_index}, values shape: {values.shape}")

        dates = None
        # locate calendar if not given
        if calendar_file:
            cal_path = calendar_file
        else:
            if store_type == "fs":
                path = Path(bin_file).expanduser()
                cal_path = path
                # Walk up to 5 ancestors looking for calendars/day.txt.
                for _ in range(5):
                    cal_path = cal_path.parent
                    candidate = cal_path.joinpath("calendars/day.txt")
                    if candidate.exists():
                        cal_path = candidate
                        break
                else:
                    # for-else: no ancestor had a calendar file
                    cal_path = None
                if cal_path: cal_path = str(cal_path)
            else:
                # GCS is flat, try removing segments
                parts = bin_file.split("/")
                cal_path = None
                for i in range(len(parts)-1, 0, -1):
                    prefix = "/".join(parts[:i])
                    candidate = f"{prefix}/calendars/day.txt" if prefix else "calendars/day.txt"
                    if store.exists(candidate):
                        cal_path = candidate
                        break

        if cal_path and store.exists(cal_path):
            try:
                dates = [line.strip() for line in store.read_text(cal_path).splitlines() if line.strip()]
            except Exception:
                # Unreadable calendar: fall back to printing raw values.
                dates = None
        if dates is not None:
            # print date mapping for each value
            offset = date_index
            for i, val in enumerate(values):
                idx = offset + i
                date_str = dates[idx] if idx < len(dates) else "<out of range>"
                print(f"{date_str}: {val}")
        else:
            print(values)

    def query(
        self,
        sql: str,
        data_path: str = None,
        max_symbols: int = None,
        max_memory: int = None,
        store_type: str = "fs",
    ):
        """Execute an SQL query over the binary dataset using DuckDB.

        The query engine lazily loads symbol directories as needed and uses an
        LRU cache to limit memory/number of symbols. ``sql`` should be a
        valid SQL string referencing symbol names as table names.
        """
        from .duck import DuckQueryService, LRUCache
        from .storage import get_storage

        base = data_path if data_path is not None else self.source_dir
        store = get_storage(store_type, base)

        cache = LRUCache(max_symbols=max_symbols, max_memory=max_memory)
        svc = DuckQueryService(base, cache=cache, store=store)
        df = svc.execute(sql)
        # pretty print result
        print(df.to_string(index=False))
240
+
241
+
242
def main():
    """Console-script entry point: expose :class:`Run` through Google Fire."""
    fire.Fire(Run)


# ensure the CLI runs when the module is executed directly
if __name__ == "__main__":
    main()
featureSQL/duck.py ADDED
@@ -0,0 +1,140 @@
1
+ """Simple SQL service using DuckDB with lazy-loaded symbol data."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections import OrderedDict
6
+ from pathlib import Path
7
+ import re
8
+ from typing import Callable, Dict, Optional
9
+
10
+ import duckdb
11
+ import numpy as np
12
+ import pandas as pd
13
+
14
+
15
class LRUCache:
    """A minimal least-recently-used cache of symbol DataFrames.

    Maps symbol -> (DataFrame, approximate memory) and drops the oldest
    entries whenever a configured limit is exceeded. Two limits are
    supported:

    * ``max_symbols``: maximum number of distinct symbols to retain
    * ``max_memory``: approximate total bytes of DataFrame memory to keep
    """

    def __init__(
        self,
        max_symbols: Optional[int] = None,
        max_memory: Optional[int] = None,
    ):
        self.max_symbols = max_symbols
        self.max_memory = max_memory
        # Insertion order doubles as recency order: front = least recent.
        self._cache: "OrderedDict[str, Dict[str, object]]" = OrderedDict()
        self._total_memory = 0

    def get(self, key: str, loader: Callable[[str], pd.DataFrame]) -> pd.DataFrame:
        """Return the frame for *key*, calling *loader* only on a miss."""
        entry = self._cache.get(key)
        if entry is not None:
            # Cache hit: refresh recency and hand back the stored frame.
            self._cache.move_to_end(key)
            return entry["df"]

        # Miss: load, record the frame together with its memory footprint.
        frame = loader(key)
        footprint = frame.memory_usage(deep=True).sum()
        self._cache[key] = {"df": frame, "mem": footprint}
        self._total_memory += footprint
        self._evict_if_needed()
        return frame

    def _evict_if_needed(self) -> None:
        """Pop least-recently-used entries until both limits are satisfied."""
        def _over_budget() -> bool:
            if self.max_symbols and len(self._cache) > self.max_symbols:
                return True
            return bool(self.max_memory and self._total_memory > self.max_memory)

        while self._cache and _over_budget():
            _, victim = self._cache.popitem(last=False)
            self._total_memory -= victim["mem"]
58
+
59
+
60
class DuckQueryService:
    """SQL service using :mod:`duckdb` over symbol bin files.

    Each symbol corresponds to a subdirectory under ``root``; that directory
    contains one ``<field>.day.bin`` file per numeric column. When a query
    references a symbol we lazily read its bins into a DataFrame and register
    it with DuckDB. The cache above keeps only a limited number of symbols
    in memory.
    """

    # match `FROM foo` or `JOIN foo` so that symbols used in joins are
    # loaded as well. we deliberately avoid more complex SQL parsing;
    # a simple regex is sufficient for the limited syntax we expect.
    SYMBOL_RE = re.compile(r"\b(?:from|join)\s+([A-Za-z0-9_]+)\b", re.IGNORECASE)

    def __init__(self, root: str, cache: Optional[LRUCache] = None, store=None):
        from .storage import get_storage
        self.store = store if store else get_storage("fs")
        self.root = str(root)
        self.cache = cache or LRUCache()
        self._conn = duckdb.connect()
        # try to read calendar file (optional); when present it lets us
        # attach a human-readable date column to each loaded symbol frame.
        cal_path = self.store.joinpath(self.root, "calendars", "day.txt")
        self._calendar: Optional[list] = None
        if self.store.exists(cal_path):
            self._calendar = [line.strip() for line in self.store.read_text(cal_path).splitlines() if line.strip()]

    def _load_symbol_df(self, symbol: str) -> pd.DataFrame:
        """Read all bin files for ``symbol`` and return a DataFrame.

        Raises ``FileNotFoundError`` when no non-empty bin file exists for
        the symbol. Note: we do not pre-check ``exists`` on the directory —
        object stores (e.g. GCS) may report False for a "directory" prefix,
        so the glob below is the single source of truth.
        """
        # Import once per call instead of the previous per-bin-file
        # ``__import__('featureSQL.storage', ...)`` expression.
        from .storage import FileSystemStore

        symbol_dir = self.store.joinpath(self.root, "features", symbol.lower())

        # Hoisted out of the loop: the backend kind never changes mid-load.
        is_fs = isinstance(self.store, FileSystemStore)

        # normalize for glob: remove any leading slash which confuses
        # object-storage backends. file system paths should be left alone.
        if is_fs:
            glob_path = symbol_dir
        else:
            glob_path = symbol_dir.lstrip("/") if isinstance(symbol_dir, str) else symbol_dir

        cols: Dict[str, np.ndarray] = {}
        for binfile in self.store.glob(glob_path, "*.day.bin"):
            # field name = file name minus the ".day.bin" suffix
            field = str(binfile).split("/")[-1].replace(".day.bin", "")

            if is_fs:
                arr = np.fromfile(binfile, dtype="<f")
            else:
                arr = np.frombuffer(self.store.read_bytes(binfile), dtype="<f")

            if arr.size == 0:
                continue
            # first element is date index; convert to calendar date if available
            data = arr[1:]
            if self._calendar is not None:
                # produce a date column if not already present.
                # NOTE(review): indexes past the calendar end would raise
                # IndexError here — assumes bins never outrun the calendar.
                start = int(arr[0])
                cols.setdefault("date", pd.Series([self._calendar[start + i] for i in range(len(data))]))
            cols[field] = data

        if not cols:
            raise FileNotFoundError(f"symbol directory/files not found or empty: {symbol_dir}")

        return pd.DataFrame(cols)

    def _ensure_symbols(self, sql: str) -> None:
        """Load and register every symbol referenced by ``sql``."""
        symbols = set(self.SYMBOL_RE.findall(sql))
        for sym in symbols:
            key = sym.lower()
            # load using lowercase key to keep cache consistent
            df = self.cache.get(key, self._load_symbol_df)
            # register table name using original capitalization from query
            self._conn.register(sym, df)

    def execute(self, sql: str) -> pd.DataFrame:
        """Run ``sql`` after loading any referenced symbols.

        The returned DataFrame comes from DuckDB's result set.
        """
        self._ensure_symbols(sql)
        return self._conn.execute(sql).df()