tradernick-data-provider 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tradernick_data_provider/__init__.py +10 -0
- tradernick_data_provider/_client.py +148 -0
- tradernick_data_provider/_http.py +55 -0
- tradernick_data_provider/_query.py +378 -0
- tradernick_data_provider/binance.py +228 -0
- tradernick_data_provider/btc.py +93 -0
- tradernick_data_provider/erc20.py +406 -0
- tradernick_data_provider/evm.py +87 -0
- tradernick_data_provider/exceptions.py +8 -0
- tradernick_data_provider/jobs.py +72 -0
- tradernick_data_provider/protocols.py +951 -0
- tradernick_data_provider/py.typed +0 -0
- tradernick_data_provider/snapshots.py +116 -0
- tradernick_data_provider/tron.py +88 -0
- tradernick_data_provider/wallets.py +93 -0
- tradernick_data_provider-0.2.0.dist-info/METADATA +114 -0
- tradernick_data_provider-0.2.0.dist-info/RECORD +18 -0
- tradernick_data_provider-0.2.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
from ._client import DataProviderClient
|
|
2
|
+
from .exceptions import DataProviderError, DataProviderHTTPError
|
|
3
|
+
from .wallets import WalletsNamespace
|
|
4
|
+
|
|
5
|
+
# Names re-exported as the package's public API.
__all__ = [
    "DataProviderClient", "DataProviderError",
    "DataProviderHTTPError", "WalletsNamespace",
]
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import io
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from typing import Literal, Optional, Union
|
|
4
|
+
|
|
5
|
+
import httpx
|
|
6
|
+
import polars as pl
|
|
7
|
+
|
|
8
|
+
from ._http import load_parquet_bytes, list_snapshots, delete_snapshot
|
|
9
|
+
from ._query import _to_timestamp
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _to_datetime(date: datetime | str | int) -> datetime:
    """Turn a datetime / string / integer date input into an aware datetime.

    The value is first normalised by ``_to_timestamp`` (which yields
    ``'YYYY-MM-DDTHH:MM:SSZ'``) and then parsed back with an explicit
    ``+00:00`` offset.
    """
    iso_utc = _to_timestamp(date)
    # Swap the trailing 'Z' for a numeric offset before parsing.
    with_offset = iso_utc.replace("Z", "+00:00")
    return datetime.fromisoformat(with_offset)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _cast_time_ms_utc(df: pl.DataFrame) -> pl.DataFrame:
    """Return ``df`` with its ``time`` column cast to ``Datetime('ms', 'UTC')``.

    Frames without a ``time`` column, with a non-datetime ``time`` column,
    or whose ``time`` column is already ms+UTC are returned untouched.
    Snapshots written by DuckDB arrive as μs+UTC while cache reads arrive
    as ms+UTC; unifying the dtype keeps the frames joinable in polars.
    """
    if 'time' not in df.columns:
        return df
    dtype = df.schema['time']
    if not isinstance(dtype, pl.Datetime):
        return df
    if dtype.time_unit == 'ms' and dtype.time_zone == 'UTC':
        return df
    target = pl.Datetime('ms', 'UTC')
    return df.with_columns(pl.col('time').cast(target))
|
|
31
|
+
|
|
32
|
+
from .binance import BinanceNamespace, HyperliquidNamespace
|
|
33
|
+
from .btc import BtcNamespace
|
|
34
|
+
from .evm import EvmNamespace
|
|
35
|
+
from .jobs import JobsNamespace
|
|
36
|
+
from .protocols import CacheNamespace
|
|
37
|
+
from .tron import TronNamespace
|
|
38
|
+
from .wallets import WalletsNamespace
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class DataProviderClient:
    """Async client for a data-provider server.

    Owns a single ``httpx.AsyncClient`` that is shared by every namespace
    attribute (``evm``, ``tron``, ``btc``, ``binance``, ``hyperliquid``,
    ``wallets``, ``cache``, ``jobs``). Use it as an async context manager,
    or call ``close()`` explicitly, to release the underlying session.
    """

    evm: EvmNamespace
    tron: TronNamespace
    btc: BtcNamespace
    binance: BinanceNamespace
    hyperliquid: HyperliquidNamespace
    wallets: WalletsNamespace
    cache: CacheNamespace
    jobs: JobsNamespace

    def __init__(self, url: str, timeout: Optional[float] = 86400):
        """Create the client.

        Args:
            url: Base URL of the server; trailing slashes are stripped.
            timeout: Timeout handed to ``httpx.AsyncClient``. Defaults to
                86400 (the previous hard-coded value); ``None`` disables
                timeouts.
        """
        self._url = url.rstrip("/")
        self._session = httpx.AsyncClient(timeout=timeout)
        self.evm = EvmNamespace(self._session, self._url)
        self.tron = TronNamespace(self._session, self._url)
        self.btc = BtcNamespace(self._session, self._url)
        self.binance = BinanceNamespace(self._session, self._url)
        self.hyperliquid = HyperliquidNamespace(self._session, self._url)
        self.wallets = WalletsNamespace(self._session, self._url)
        self.cache = CacheNamespace(self._session, self._url)
        self.jobs = JobsNamespace(self._session, self._url)

    async def health(self) -> bool:
        """GET ``/health``; return ``True`` on success, raise on an HTTP error status."""
        response = await self._session.get(self._url + "/health")
        response.raise_for_status()
        return True

    async def load_parquet(
        self,
        key: str,
        since: Optional[Union[datetime, str, int]] = None,
        until: Optional[Union[datetime, str, int]] = None,
    ) -> pl.DataFrame:
        """Load a saved snapshot as a polars DataFrame.

        ``time`` is normalized to ``Datetime('ms', UTC)`` so joins with
        transfer-read DataFrames (which the cache layer also returns at
        ms+UTC) don't trip the polars 'datatypes of join keys don't
        match' check. Snapshots saved via DuckDB COPY are stored at
        μs+UTC internally; we cast on read.

        ``since`` / ``until`` are optional inclusive bounds applied
        client-side to the ``timestamp`` column when present, otherwise to
        ``time``; they are ignored when neither column exists.

        For pandas, call ``(await client.load_parquet(key)).to_pandas()``.
        """
        raw = await load_parquet_bytes(self._session, self._url, key)
        df = pl.read_parquet(io.BytesIO(raw))
        df = _cast_time_ms_utc(df)
        if since is not None or until is not None:
            # NOTE(review): only ``time`` is normalized above; a ``timestamp``
            # column is compared as stored — confirm it is tz-aware.
            time_col = "timestamp" if "timestamp" in df.columns else "time"
            if time_col in df.columns:
                if since is not None:
                    df = df.filter(pl.col(time_col) >= _to_datetime(since))
                if until is not None:
                    df = df.filter(pl.col(time_col) <= _to_datetime(until))
        return df

    def scan_parquet(self, key: str, *,
                     since: Optional[Union[datetime, str, int]] = None,
                     until: Optional[Union[datetime, str, int]] = None,
                     engine: Literal['polars', 'duckdb'] = 'duckdb',
                     normalize_addresses: Optional[bool] = None):
        """Lazy-scan a saved snapshot with ``local_*`` filters applied
        server-side. Returns a ``ScanParquetQuery`` builder. Chain
        ``local_*`` filter methods then call a terminal ``as_polars()`` /
        ``as_pandas()`` / ``as_parquet(new_key)``.

        ``engine``:
          - ``'duckdb'`` (default): server mounts the snapshot + wallets
            parquets as DuckDB views and runs the filter as SQL.
            Streams via ``COPY ... TO PARQUET``. Best optimizer for
            large ``IN`` filters; ~3-50× faster than polars on big
            wallet-set queries.
          - ``'polars'``: server uses ``pl.scan_parquet`` and a polars
            lazy filter pipeline. Streams via ``sink_parquet``.

        ``normalize_addresses``: default ``None`` (auto). Set to ``False``
        only when you know the snapshot is canonical and the file lacks
        the metadata flag — auto-detect already handles canonical files.

        Example::

            df = await client.scan_parquet('huge_snapshot') \\
                .local_exclude_sender_categories(['Hot-Wallet','Cold-Wallet']) \\
                .local_involving_entities(['Binance']) \\
                .as_polars()
        """
        # Imported here rather than at module top, as in the original.
        from .snapshots import ScanParquetQuery
        return ScanParquetQuery(
            self._session, self._url, key,
            since=since, until=until,
            engine=engine, normalize_addresses=normalize_addresses,
        )

    async def list_snapshots(self) -> list[str]:
        """List all saved snapshot keys."""
        return await list_snapshots(self._session, self._url)

    async def delete_snapshot(self, key: str) -> None:
        """Delete a saved snapshot."""
        await delete_snapshot(self._session, self._url, key)

    async def close(self) -> None:
        """Close the shared HTTP session."""
        await self._session.aclose()

    async def __aenter__(self) -> "DataProviderClient":
        return self

    async def __aexit__(self, *_) -> None:
        await self.close()
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import io
|
|
2
|
+
|
|
3
|
+
import httpx
|
|
4
|
+
import pyarrow as pa
|
|
5
|
+
import pyarrow.parquet as pq
|
|
6
|
+
|
|
7
|
+
from .exceptions import DataProviderHTTPError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
async def fetch_table(session: httpx.AsyncClient, url: str, body: dict) -> pa.Table | None:
    """POST ``body`` as JSON to ``url`` and decode the reply.

    Returns:
        A pyarrow Table parsed from the parquet response body, or ``None``
        when the server answers with a successful JSON reply whose
        ``saved`` field is truthy.

    Raises:
        DataProviderHTTPError: for any other JSON reply, carrying the
            status code and the reply's ``error`` field (or the whole
            payload stringified when that field is missing).
    """
    reply = await session.post(url, json=body)
    is_json = "application/json" in reply.headers.get("content-type", "")
    if not is_json:
        # Non-JSON replies are parquet bytes unless the status says otherwise.
        reply.raise_for_status()
        buffer = io.BytesIO(reply.content)
        return pq.read_table(buffer)

    payload = reply.json()
    saved = reply.is_success and payload.get("saved")
    if saved:
        return None
    message = payload.get("error", str(payload))
    raise DataProviderHTTPError(reply.status_code, message)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
async def save_parquet(session: httpx.AsyncClient, url: str, body: dict, key: str) -> None:
    """POST the query with ``save_key`` set to ``key``; raise on an HTTP error status."""
    payload = dict(body)
    payload["save_key"] = key
    reply = await session.post(url, json=payload)
    reply.raise_for_status()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
async def load_parquet_bytes(session: httpx.AsyncClient, base_url: str, key: str) -> bytes:
    """Fetch the raw parquet bytes of the snapshot stored under ``key``.

    A JSON reply is always treated as an error and raised as
    ``DataProviderHTTPError``; any other reply is returned as bytes after
    the HTTP status check.
    """
    endpoint = f"{base_url}/snapshots/load"
    reply = await session.post(endpoint, json={"key": key})
    if "application/json" not in reply.headers.get("content-type", ""):
        reply.raise_for_status()
        return reply.content
    payload = reply.json()
    message = payload.get("error", str(payload))
    raise DataProviderHTTPError(reply.status_code, message)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
async def load_parquet(session: httpx.AsyncClient, base_url: str, key: str) -> pa.Table:
    """Fetch the snapshot stored under ``key`` and parse it into a pyarrow Table."""
    payload = await load_parquet_bytes(session, base_url, key)
    buffer = io.BytesIO(payload)
    return pq.read_table(buffer)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
async def delete_snapshot(session: httpx.AsyncClient, base_url: str, key: str) -> None:
    """Ask the server to delete snapshot ``key``; raise on an HTTP error status."""
    endpoint = f"{base_url}/snapshots/delete"
    reply = await session.post(endpoint, json={"key": key})
    reply.raise_for_status()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
async def list_snapshots(session: httpx.AsyncClient, base_url: str) -> list[str]:
    """Return the ``keys`` entry of the server's ``/snapshots/list`` reply."""
    reply = await session.get(f"{base_url}/snapshots/list")
    reply.raise_for_status()
    listing = reply.json()
    return listing["keys"]
|
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import polars as pl
|
|
9
|
+
import pyarrow as pa
|
|
10
|
+
|
|
11
|
+
from ._http import fetch_table
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from typing import Self
|
|
15
|
+
|
|
16
|
+
# Preferred column for ordering result rows; ``as_pandas`` / ``as_polars``
# fall back to ``time`` when this column is absent.
_TIME_COL = "timestamp"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _to_timestamp(date: datetime | str | int) -> str:
|
|
20
|
+
"""Normalise any DateType value to 'YYYY-MM-DDTHH:MM:SSZ' for the server."""
|
|
21
|
+
if isinstance(date, int):
|
|
22
|
+
dt = datetime.fromtimestamp(date / 1000, tz=timezone.utc)
|
|
23
|
+
elif isinstance(date, str):
|
|
24
|
+
if "T" in date:
|
|
25
|
+
dt = datetime.fromisoformat(date.replace("Z", "+00:00"))
|
|
26
|
+
elif len(date) == 10:
|
|
27
|
+
dt = datetime.strptime(date, "%Y-%m-%d").replace(tzinfo=timezone.utc)
|
|
28
|
+
else:
|
|
29
|
+
dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
|
|
30
|
+
elif isinstance(date, datetime):
|
|
31
|
+
dt = date
|
|
32
|
+
else:
|
|
33
|
+
raise ValueError(f"Unsupported date type: {type(date)}")
|
|
34
|
+
if dt.tzinfo is None:
|
|
35
|
+
dt = dt.replace(tzinfo=timezone.utc)
|
|
36
|
+
return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class _LocalFiltersMixin:
|
|
40
|
+
"""24 ``local_*`` methods that accumulate filter steps into
|
|
41
|
+
``self._body['local_filters']``. Used by both transfer queries (where
|
|
42
|
+
filters apply post-fetch on the server) and the new
|
|
43
|
+
``ScanParquetQuery`` (where they apply via a lazy scan on the server).
|
|
44
|
+
|
|
45
|
+
Filter rules:
|
|
46
|
+
- Each call appends one sequential ``df.filter(...)`` step.
|
|
47
|
+
- Within a single call, ``values`` is union-ed (any-of).
|
|
48
|
+
- ``involving_*`` matches sender OR receiver.
|
|
49
|
+
- ``exclude_*`` negates the predicate.
|
|
50
|
+
- Address lookups are case-insensitive on entity/category/label terms;
|
|
51
|
+
EVM ``0x…`` addresses get lowercased per-row in the lazy plan,
|
|
52
|
+
TRON/BTC pass through unchanged.
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
_body: dict # set by the concrete subclass
|
|
56
|
+
|
|
57
|
+
def _add_local_filter(self, op: str, values: list[str]):
|
|
58
|
+
if not isinstance(values, (list, tuple, set)) or not all(isinstance(v, str) for v in values):
|
|
59
|
+
raise TypeError(f"{op}: values must be a list of strings")
|
|
60
|
+
if not values:
|
|
61
|
+
return self
|
|
62
|
+
steps = self._body.setdefault("local_filters", [])
|
|
63
|
+
steps.append({"op": op, "values": list(values)})
|
|
64
|
+
return self
|
|
65
|
+
|
|
66
|
+
# involving (sender OR receiver)
|
|
67
|
+
def local_involving(self, addresses: list[str]): return self._add_local_filter("involving", addresses)
|
|
68
|
+
def local_involving_labels(self, labels: list[str]): return self._add_local_filter("involving_labels", labels)
|
|
69
|
+
def local_involving_categories(self, categories: list[str]): return self._add_local_filter("involving_categories", categories)
|
|
70
|
+
def local_involving_entities(self, entities: list[str]): return self._add_local_filter("involving_entities", entities)
|
|
71
|
+
|
|
72
|
+
# sender
|
|
73
|
+
def local_sender(self, addresses: list[str]): return self._add_local_filter("sender", addresses)
|
|
74
|
+
def local_sender_labels(self, labels: list[str]): return self._add_local_filter("sender_labels", labels)
|
|
75
|
+
def local_sender_categories(self, categories: list[str]): return self._add_local_filter("sender_categories", categories)
|
|
76
|
+
def local_sender_entities(self, entities: list[str]): return self._add_local_filter("sender_entities", entities)
|
|
77
|
+
|
|
78
|
+
# receiver
|
|
79
|
+
def local_receiver(self, addresses: list[str]): return self._add_local_filter("receiver", addresses)
|
|
80
|
+
def local_receiver_labels(self, labels: list[str]): return self._add_local_filter("receiver_labels", labels)
|
|
81
|
+
def local_receiver_categories(self, categories: list[str]): return self._add_local_filter("receiver_categories", categories)
|
|
82
|
+
def local_receiver_entities(self, entities: list[str]): return self._add_local_filter("receiver_entities", entities)
|
|
83
|
+
|
|
84
|
+
# exclude variants
|
|
85
|
+
def local_exclude_involving(self, addresses: list[str]): return self._add_local_filter("exclude_involving", addresses)
|
|
86
|
+
def local_exclude_involving_labels(self, labels: list[str]): return self._add_local_filter("exclude_involving_labels", labels)
|
|
87
|
+
def local_exclude_involving_categories(self, categories: list[str]): return self._add_local_filter("exclude_involving_categories", categories)
|
|
88
|
+
def local_exclude_involving_entities(self, entities: list[str]): return self._add_local_filter("exclude_involving_entities", entities)
|
|
89
|
+
|
|
90
|
+
def local_exclude_sender(self, addresses: list[str]): return self._add_local_filter("exclude_sender", addresses)
|
|
91
|
+
def local_exclude_sender_labels(self, labels: list[str]): return self._add_local_filter("exclude_sender_labels", labels)
|
|
92
|
+
def local_exclude_sender_categories(self, categories: list[str]): return self._add_local_filter("exclude_sender_categories", categories)
|
|
93
|
+
def local_exclude_sender_entities(self, entities: list[str]): return self._add_local_filter("exclude_sender_entities", entities)
|
|
94
|
+
|
|
95
|
+
def local_exclude_receiver(self, addresses: list[str]): return self._add_local_filter("exclude_receiver", addresses)
|
|
96
|
+
def local_exclude_receiver_labels(self, labels: list[str]): return self._add_local_filter("exclude_receiver_labels", labels)
|
|
97
|
+
def local_exclude_receiver_categories(self, categories: list[str]): return self._add_local_filter("exclude_receiver_categories", categories)
|
|
98
|
+
def local_exclude_receiver_entities(self, entities: list[str]): return self._add_local_filter("exclude_receiver_entities", entities)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class BaseQuery(_LocalFiltersMixin):
    """Fluent builder that accumulates a request body and runs it on the server.

    Builder methods write one or more keys into ``self._body`` and return
    ``self``. The terminal methods (``as_pandas``, ``as_polars``,
    ``as_parquet``) call ``self._fetch_table()``, which this class does not
    define; concrete subclasses are expected to provide it.
    """

    def __init__(self, session: httpx.AsyncClient, base_url: str, body: dict):
        self._session = session
        self._base_url = base_url
        self._body = body

    def network(self, n: str | list[str]) -> Self:
        """Select one network (string) or several (list/tuple) to query."""
        # EVM-class endpoints accept a list to fan out per-network. The cache
        # is keyed per-network on the server, so each chain reads/writes its
        # own partition independently. The combined result is concatenated
        # client-side and (when len > 1) automatically tagged with
        # ``with_network`` so rows are distinguishable. TRON/BTC are always
        # single-network — passing a one-element list still works.
        if isinstance(n, (list, tuple)):
            # A tuple used to land under the scalar ``network`` key.
            self._body["networks"] = list(n)
        else:
            self._body["network"] = n
            # Drop a list left by an earlier call; otherwise the stale list
            # would still trigger the multi-network fan-out.
            self._body.pop("networks", None)
        return self

    def with_network(self, enabled: bool = True) -> Self:
        """Set the ``with_network`` flag in the request body."""
        self._body["with_network"] = enabled
        return self

    def include_zero_amounts(self, enabled: bool = True) -> Self:
        """Keep rows where amount == 0 in the result. By default these
        are filtered out — they're typically token-approval-style noise
        that inflates row counts without representing real flow.

        The filter applies right before the response is written or the
        snapshot is saved; the data-provider cache itself is unaffected
        and contains all rows including zero-amount ones, so toggling
        this flag does not invalidate cache."""
        self._body["include_zero_amounts"] = enabled
        return self

    def _auto_with_network(self) -> None:
        """Force ``with_network`` on for multi-network fan-out unless the user
        explicitly opted out via ``with_network(False)``."""
        nets = self._body.get("networks") or []
        if len(nets) > 1 and "with_network" not in self._body:
            self._body["with_network"] = True

    def time_range(self, since: datetime | str | int, until: datetime | str | int) -> Self:
        """Set ``since`` / ``until``, each normalised through ``_to_timestamp``."""
        self._body["since"] = _to_timestamp(since)
        self._body["until"] = _to_timestamp(until)
        return self

    def involving(self, address: str) -> Self:
        self._body["involving"] = address
        return self

    def involving_label(self, label: str) -> Self:
        self._body["involving_label"] = label
        return self

    def involving_category(self, category: str) -> Self:
        self._body["involving_category"] = category
        return self

    def exclude_involving(self, address: str) -> Self:
        self._body["exclude_involving"] = address
        return self

    def exclude_involving_label(self, label: str) -> Self:
        self._body["exclude_involving_label"] = label
        return self

    def exclude_involving_category(self, category: str) -> Self:
        self._body["exclude_involving_category"] = category
        return self

    def wallet_namespace(self, ns: str) -> Self:
        self._body["wallet_namespace"] = ns
        return self

    def with_value(self) -> Self:
        self._body["with_value"] = True
        return self

    def verbose(self) -> Self:
        self._body["verbose"] = True
        return self

    def aggregate(self, group_by: str = "time", period: str = "1h") -> Self:
        """Turn on aggregation and record ``group_by`` and ``period``."""
        self._body["aggregate"] = True
        self._body["group_by"] = group_by
        self._body["period"] = period
        return self

    # ---- local_wallets filters ----------------------------------------------
    # Inherited from _LocalFiltersMixin (defined above) — see its docstring.

    async def as_pandas(self) -> pd.DataFrame:
        """Run the query and return the result as a pandas DataFrame.

        A ``window`` column is renamed to ``time`` when no ``time`` column
        exists, rows are sorted by ``timestamp`` (or ``time``), and frames
        that arrived with ``window`` are indexed by ``time``.
        """
        table = await self._fetch_table()
        df = table.to_pandas()
        # Canonicalize OHLCV / aggregate responses that come back with
        # `window` instead of `time`. Downstream consumers (backtester,
        # chain_analysis) expect `time` uniformly. Track whether this
        # came from `window` so we know to time-index the result —
        # aggregate frames are naturally time-keyed; transfer frames
        # are row-streams where multiple rows share a time.
        came_from_window = "window" in df.columns and "time" not in df.columns
        if came_from_window:
            df = df.rename(columns={"window": "time"})
        sort_col = next(
            (c for c in (_TIME_COL, "time") if c in df.columns), None
        )
        if sort_col:
            df = df.sort_values(sort_col, ignore_index=True)
        # Normalize the time column precision to ms+UTC for consistency
        # with cache reads and with the polars side. Pandas 2.x supports
        # .dt.as_unit('ms'); guard for older pandas.
        if "time" in df.columns and pd.api.types.is_datetime64_any_dtype(df["time"]):
            try:
                df["time"] = df["time"].dt.as_unit("ms")
            except (AttributeError, TypeError):
                # Deliberate best-effort: keep the original precision when
                # the installed pandas cannot convert units.
                pass
        # Aggregate-shaped frames become time-indexed so they're directly
        # usable in time-series workflows (resampling, plotting, the
        # backtester's set_index check).
        if came_from_window and "time" in df.columns:
            df = df.set_index("time")
        return df

    async def as_polars(self) -> pl.DataFrame:
        """Run the query and return the result as a polars DataFrame.

        Applies the same ``window`` → ``time`` rename and sort as
        ``as_pandas`` and casts a datetime ``time`` column to ms+UTC.
        """
        table = await self._fetch_table()
        df = pl.from_arrow(table)
        if "window" in df.columns and "time" not in df.columns:
            df = df.rename({"window": "time"})
        sort_col = next(
            (c for c in (_TIME_COL, "time") if c in df.columns), None
        )
        if sort_col:
            df = df.sort(sort_col)
        # Normalize the time column to ms+UTC so joins with snapshot /
        # cache reads (which are ms+UTC) don't trip polars' precision-
        # mismatch check.
        if "time" in df.columns:
            dt = df.schema["time"]
            if isinstance(dt, pl.Datetime) and (
                dt.time_unit != "ms" or dt.time_zone != "UTC"
            ):
                df = df.with_columns(pl.col("time").cast(pl.Datetime("ms", "UTC")))
        return df

    async def as_parquet(self, key: str) -> None:
        """Save the query result as a named parquet snapshot on the server.

        Three paths, tried in this order:

        1. The query declares ``_PROTOCOL`` and has at least one network:
           POST the body to ``/snapshots/save_multi`` so the server fans
           out and merges; no result bytes come back to the client. A
           single ``network`` string is coerced to a one-element list.
        2. No ``_PROTOCOL`` but more than one network: a shared
           ``save_key`` cannot be used because each per-network worker
           would clobber the same file. Fetch and concatenate client-side,
           write a temporary parquet file, and stream it to
           ``POST /snapshots/save``.
        3. Otherwise: set ``save_key`` in the body for one
           ``_fetch_table()`` call so the server saves the result itself.
        """
        nets = self._body.get("networks") or []
        protocol = getattr(self, "_PROTOCOL", None)
        # Single-network calls land in body['network'] (string). For
        # transfer queries that declare _PROTOCOL we still want the
        # server-side streaming + DuckDB-merge path — coerce to a one-
        # element list and route through /snapshots/save_multi. This
        # avoids the legacy single-network path (which eagerly materializes
        # the full result and OOMs on huge volumes like TRON USDT).
        if not nets and protocol:
            single = self._body.get("network")
            if single:
                nets = [single]
        if nets and protocol:
            # _resolve_path() side-effects min_amount (and similar) into
            # self._body — call it so the body we POST contains every
            # field the per-network reads need.
            if hasattr(self, "_resolve_path"):
                self._resolve_path()
            body = {**self._body, "protocol": protocol,
                    "save_key": key, "networks": nets}
            # Drop the singular-network field so the server doesn't see
            # both forms. Server reads only `networks`.
            body.pop("network", None)
            resp = await self._session.post(
                f"{self._base_url}/snapshots/save_multi",
                json=body,
                timeout=None,
            )
            resp.raise_for_status()
            return
        if len(nets) > 1:
            import os
            import tempfile

            df = await self.as_polars()
            # Fallback path for query types without _PROTOCOL: write to a
            # tempfile, drop the polars DF, stream the file to the server
            # via an async generator (httpx AsyncClient rejects sync file
            # handles; chunked async iteration keeps peak memory low).
            fd, tmp_path = tempfile.mkstemp(suffix='.parquet')
            os.close(fd)
            try:
                df.write_parquet(tmp_path)
                del df

                async def _stream(path, chunk=1024 * 1024):
                    with open(path, 'rb') as fh:
                        while True:
                            buf = fh.read(chunk)
                            if not buf:
                                break
                            yield buf

                size = os.path.getsize(tmp_path)
                resp = await self._session.post(
                    f"{self._base_url}/snapshots/save",
                    content=_stream(tmp_path),
                    headers={
                        "X-Snapshot-Key": key,
                        "Content-Type": "application/octet-stream",
                        "Content-Length": str(size),
                    },
                    timeout=None,
                )
                resp.raise_for_status()
            finally:
                try:
                    os.unlink(tmp_path)
                except FileNotFoundError:
                    pass
            return
        # save_key is set only for this one request so the builder stays
        # reusable afterwards.
        self._body["save_key"] = key
        try:
            await self._fetch_table()
        finally:
            del self._body["save_key"]
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
class CacheableQuery(BaseQuery):
    """Query builder that adds the ``cache`` and ``parallel`` request flags."""

    def cache(self, cache_type: str = "append") -> Self:
        """Turn caching on and record the requested ``cache_type``."""
        self._body.update(cache=True, cache_type=cache_type)
        return self

    def parallel(self) -> Self:
        """Set the ``parallel`` flag in the request body."""
        self._body.update(parallel=True)
        return self
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
class EventQuery(CacheableQuery):
    """Cacheable query bound to an endpoint path, with per-network fan-out."""

    def __init__(self, session: httpx.AsyncClient, base_url: str, path: str, body: dict):
        super().__init__(session, base_url, body)
        self._path = path

    def _resolve_path(self) -> str:
        """Return the endpoint path, swapping a trailing ``/read`` for
        ``/aggregate`` when the body has ``aggregate`` set."""
        if not self._body.get("aggregate"):
            return self._path
        prefix = self._path.rsplit("/read", 1)[0]
        return prefix + "/aggregate"

    async def _fetch_single(self, network: str) -> pa.Table:
        """Fetch one network's result using a per-call copy of the body."""
        # _resolve_path() may mutate self._body, so it runs before the copy.
        endpoint = self._base_url + self._resolve_path()
        payload = dict(self._body)
        payload["network"] = network
        payload.pop("networks", None)
        return await fetch_table(self._session, endpoint, payload)

    async def _fetch_table(self) -> pa.Table:
        """Fetch the result; with ``networks`` set, fetch each network
        concurrently and concatenate the non-empty tables."""
        import asyncio

        targets = self._body.get("networks")
        if not targets:
            endpoint = self._base_url + self._resolve_path()
            return await fetch_table(self._session, endpoint, self._body)

        self._auto_with_network()
        results = await asyncio.gather(*(self._fetch_single(name) for name in targets))
        populated = [tbl for tbl in results if tbl is not None and len(tbl) > 0]
        if populated:
            return pa.concat_tables(populated)
        # Nothing came back with rows: return the first reply as-is.
        return results[0] if results else pa.table({})
|