pymycorr 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pymycorr-0.2.0/.gitignore +40 -0
- pymycorr-0.2.0/LICENSE +21 -0
- pymycorr-0.2.0/PKG-INFO +159 -0
- pymycorr-0.2.0/README.md +120 -0
- pymycorr-0.2.0/pyproject.toml +95 -0
- pymycorr-0.2.0/src/pymycorr/__init__.py +25 -0
- pymycorr-0.2.0/src/pymycorr/_progress.py +174 -0
- pymycorr-0.2.0/src/pymycorr/client.py +593 -0
- pymycorr-0.2.0/src/pymycorr/exceptions.py +56 -0
- pymycorr-0.2.0/src/pymycorr/py.typed +0 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Byte-compiled / optimized
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# Distribution / packaging
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
*.egg-info/
|
|
10
|
+
*.egg
|
|
11
|
+
|
|
12
|
+
# Virtual environments
|
|
13
|
+
.venv/
|
|
14
|
+
venv/
|
|
15
|
+
ENV/
|
|
16
|
+
|
|
17
|
+
# IDE
|
|
18
|
+
.idea/
|
|
19
|
+
.vscode/
|
|
20
|
+
*.swp
|
|
21
|
+
*.swo
|
|
22
|
+
|
|
23
|
+
# Testing
|
|
24
|
+
.pytest_cache/
|
|
25
|
+
.coverage
|
|
26
|
+
htmlcov/
|
|
27
|
+
.mypy_cache/
|
|
28
|
+
.ruff_cache/
|
|
29
|
+
|
|
30
|
+
# Jupyter
|
|
31
|
+
.ipynb_checkpoints/
|
|
32
|
+
|
|
33
|
+
# OS
|
|
34
|
+
.DS_Store
|
|
35
|
+
Thumbs.db
|
|
36
|
+
|
|
37
|
+
# Environment
|
|
38
|
+
.env
|
|
39
|
+
.env.local
|
|
40
|
+
*.pem
|
pymycorr-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Recons Ltd
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pymycorr-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pymycorr
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Python client for fetching table data from the MyCorr API via Apache Arrow
|
|
5
|
+
Project-URL: Homepage, https://github.com/recons-ltd/pymycorr
|
|
6
|
+
Project-URL: Documentation, https://github.com/recons-ltd/pymycorr#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/recons-ltd/pymycorr
|
|
8
|
+
Project-URL: Issues, https://github.com/recons-ltd/pymycorr/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/recons-ltd/pymycorr/blob/main/CHANGELOG.md
|
|
10
|
+
Author-email: Maximilian Schuberth <maxs@recons-ltd.com>
|
|
11
|
+
License: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: api-client,arrow,dataframe,mycorr,pandas,polars
|
|
14
|
+
Classifier: Development Status :: 3 - Alpha
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
18
|
+
Classifier: Operating System :: OS Independent
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering
|
|
25
|
+
Classifier: Typing :: Typed
|
|
26
|
+
Requires-Python: >=3.10
|
|
27
|
+
Requires-Dist: httpx>=0.27.0
|
|
28
|
+
Requires-Dist: pandas>=2.0.0
|
|
29
|
+
Requires-Dist: pyarrow>=14.0.0
|
|
30
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: nest-asyncio>=1.5.0; extra == 'all'
|
|
33
|
+
Requires-Dist: polars>=0.20.0; extra == 'all'
|
|
34
|
+
Provides-Extra: jupyter
|
|
35
|
+
Requires-Dist: nest-asyncio>=1.5.0; extra == 'jupyter'
|
|
36
|
+
Provides-Extra: polars
|
|
37
|
+
Requires-Dist: polars>=0.20.0; extra == 'polars'
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
|
|
40
|
+
# pymycorr
|
|
41
|
+
|
|
42
|
+
[](https://pypi.org/project/pymycorr/)
|
|
43
|
+
[](https://github.com/sambaclab/mycorr-python/actions/workflows/ci.yml)
|
|
44
|
+
[](https://github.com/sambaclab/mycorr-python/blob/main/pyproject.toml)
|
|
45
|
+
[](https://github.com/sambaclab/mycorr-python/blob/main/LICENSE)
|
|
46
|
+
|
|
47
|
+
Python client for fetching table data from the [MyCorr](https://alpha.mycorr.app) API using Apache Arrow for efficient data transfer.
|
|
48
|
+
|
|
49
|
+
> **Note:** The production API is coming soon. Set `MYCORR_API_URL` to your endpoint if you have early access.
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install pymycorr
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Configuration
|
|
58
|
+
|
|
59
|
+
Set your API token and (optionally) the API URL as environment variables:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
export MYCORR_API_TOKEN="your-api-token"
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Or create a `.env` file in your project root:
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
MYCORR_API_TOKEN=your-api-token
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
The client automatically loads from environment variables and `.env` files.
|
|
72
|
+
|
|
73
|
+
## Quick Start
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from pymycorr import MyCorr
|
|
77
|
+
|
|
78
|
+
# Initialize client (loads token from MYCORR_API_TOKEN env var or .env file)
|
|
79
|
+
client = MyCorr()
|
|
80
|
+
|
|
81
|
+
# Fetch as pandas DataFrame
|
|
82
|
+
df = client.get_table("table-id")
|
|
83
|
+
|
|
84
|
+
# Fetch a specific version
|
|
85
|
+
df = client.get_table("table-id", version=2)
|
|
86
|
+
|
|
87
|
+
# Fetch using version alias
|
|
88
|
+
df = client.get_table("table-id", version="stable")
|
|
89
|
+
|
|
90
|
+
# Get table metadata without fetching data
|
|
91
|
+
info = client.get_table_info("table-id")
|
|
92
|
+
print(info["schema"])
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Progress Display
|
|
96
|
+
|
|
97
|
+
Progress is shown automatically in interactive environments (terminals and notebooks). You can control this behavior:
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
# Always show progress
|
|
101
|
+
client = MyCorr(progress=True)
|
|
102
|
+
|
|
103
|
+
# Never show progress
|
|
104
|
+
client = MyCorr(progress=False)
|
|
105
|
+
|
|
106
|
+
# Override per-request
|
|
107
|
+
df = client.get_table("table-id", progress=False)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## API Reference
|
|
111
|
+
|
|
112
|
+
### MyCorr
|
|
113
|
+
|
|
114
|
+
#### `__init__(url=None, token=None, env_file=None, progress="auto")`
|
|
115
|
+
|
|
116
|
+
Initialize the client.
|
|
117
|
+
|
|
118
|
+
- `url`: API base URL (optional). Falls back to `MYCORR_API_URL` env var, then default.
|
|
119
|
+
- `token`: Authentication token (Bearer token from MyCorr UI). Falls back to `MYCORR_API_TOKEN` env var or `.env` file.
|
|
120
|
+
- `env_file`: Path to custom `.env` file (optional).
|
|
121
|
+
- `progress`: Show download progress. `True` always shows, `False` never shows, `"auto"` (default) shows in interactive environments.
|
|
122
|
+
|
|
123
|
+
#### `get_table(table_id, version=None, engine="pandas", progress=None)`
|
|
124
|
+
|
|
125
|
+
Fetch table data as a DataFrame.
|
|
126
|
+
|
|
127
|
+
- `table_id`: Unique identifier for the table.
|
|
128
|
+
- `version`: Version number (int) or alias (str like `"latest"`, `"stable"`).
|
|
129
|
+
- `engine`: `"pandas"` (default) or `"polars"`.
|
|
130
|
+
- `progress`: Override client's progress setting for this request.
|
|
131
|
+
- Returns: pandas or polars DataFrame.
|
|
132
|
+
|
|
133
|
+
#### `get_table_info(table_id, version=None)`
|
|
134
|
+
|
|
135
|
+
Get table schema and metadata.
|
|
136
|
+
|
|
137
|
+
- `table_id`: Unique identifier for the table.
|
|
138
|
+
- `version`: Version number (int) or alias (str).
|
|
139
|
+
- Returns: Dictionary with table metadata including schema.
|
|
140
|
+
|
|
141
|
+
## Exceptions
|
|
142
|
+
|
|
143
|
+
- `TableAPIError`: Base exception for API errors.
|
|
144
|
+
- `TableNotFoundError`: Table not found (404).
|
|
145
|
+
- `QuotaExceededError`: Egress quota exceeded (429).
- `RateLimitError`: Request rate limit exceeded (429).
|
|
146
|
+
- `TableConversionError`: Failed to convert Arrow data to DataFrame.
|
|
147
|
+
- `StreamingError`: Error during data streaming or IPC parsing.
|
|
148
|
+
|
|
149
|
+
## Development
|
|
150
|
+
|
|
151
|
+
See [.env.example](.env.example) for environment variable configuration. For local development:
|
|
152
|
+
|
|
153
|
+
1. Copy `.env.example` to `.env`
|
|
154
|
+
2. Set `MYCORR_API_URL` to your local/dev base URL (no path segments)
|
|
155
|
+
3. SSL verification is automatically disabled for `localhost` and `127.0.0.1` URLs
|
|
156
|
+
|
|
157
|
+
## License
|
|
158
|
+
|
|
159
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
pymycorr-0.2.0/README.md
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# pymycorr
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/pymycorr/)
|
|
4
|
+
[](https://github.com/sambaclab/mycorr-python/actions/workflows/ci.yml)
|
|
5
|
+
[](https://github.com/sambaclab/mycorr-python/blob/main/pyproject.toml)
|
|
6
|
+
[](https://github.com/sambaclab/mycorr-python/blob/main/LICENSE)
|
|
7
|
+
|
|
8
|
+
Python client for fetching table data from the [MyCorr](https://alpha.mycorr.app) API using Apache Arrow for efficient data transfer.
|
|
9
|
+
|
|
10
|
+
> **Note:** The production API is coming soon. Set `MYCORR_API_URL` to your endpoint if you have early access.
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install pymycorr
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Configuration
|
|
19
|
+
|
|
20
|
+
Set your API token and (optionally) the API URL as environment variables:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
export MYCORR_API_TOKEN="your-api-token"
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Or create a `.env` file in your project root:
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
MYCORR_API_TOKEN=your-api-token
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
The client automatically loads from environment variables and `.env` files.
|
|
33
|
+
|
|
34
|
+
## Quick Start
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from pymycorr import MyCorr
|
|
38
|
+
|
|
39
|
+
# Initialize client (loads token from MYCORR_API_TOKEN env var or .env file)
|
|
40
|
+
client = MyCorr()
|
|
41
|
+
|
|
42
|
+
# Fetch as pandas DataFrame
|
|
43
|
+
df = client.get_table("table-id")
|
|
44
|
+
|
|
45
|
+
# Fetch a specific version
|
|
46
|
+
df = client.get_table("table-id", version=2)
|
|
47
|
+
|
|
48
|
+
# Fetch using version alias
|
|
49
|
+
df = client.get_table("table-id", version="stable")
|
|
50
|
+
|
|
51
|
+
# Get table metadata without fetching data
|
|
52
|
+
info = client.get_table_info("table-id")
|
|
53
|
+
print(info["schema"])
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Progress Display
|
|
57
|
+
|
|
58
|
+
Progress is shown automatically in interactive environments (terminals and notebooks). You can control this behavior:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
# Always show progress
|
|
62
|
+
client = MyCorr(progress=True)
|
|
63
|
+
|
|
64
|
+
# Never show progress
|
|
65
|
+
client = MyCorr(progress=False)
|
|
66
|
+
|
|
67
|
+
# Override per-request
|
|
68
|
+
df = client.get_table("table-id", progress=False)
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## API Reference
|
|
72
|
+
|
|
73
|
+
### MyCorr
|
|
74
|
+
|
|
75
|
+
#### `__init__(url=None, token=None, env_file=None, progress="auto")`
|
|
76
|
+
|
|
77
|
+
Initialize the client.
|
|
78
|
+
|
|
79
|
+
- `url`: API base URL (optional). Falls back to `MYCORR_API_URL` env var, then default.
|
|
80
|
+
- `token`: Authentication token (Bearer token from MyCorr UI). Falls back to `MYCORR_API_TOKEN` env var or `.env` file.
|
|
81
|
+
- `env_file`: Path to custom `.env` file (optional).
|
|
82
|
+
- `progress`: Show download progress. `True` always shows, `False` never shows, `"auto"` (default) shows in interactive environments.
|
|
83
|
+
|
|
84
|
+
#### `get_table(table_id, version=None, engine="pandas", progress=None)`
|
|
85
|
+
|
|
86
|
+
Fetch table data as a DataFrame.
|
|
87
|
+
|
|
88
|
+
- `table_id`: Unique identifier for the table.
|
|
89
|
+
- `version`: Version number (int) or alias (str like `"latest"`, `"stable"`).
|
|
90
|
+
- `engine`: `"pandas"` (default) or `"polars"`.
|
|
91
|
+
- `progress`: Override client's progress setting for this request.
|
|
92
|
+
- Returns: pandas or polars DataFrame.
|
|
93
|
+
|
|
94
|
+
#### `get_table_info(table_id, version=None)`
|
|
95
|
+
|
|
96
|
+
Get table schema and metadata.
|
|
97
|
+
|
|
98
|
+
- `table_id`: Unique identifier for the table.
|
|
99
|
+
- `version`: Version number (int) or alias (str).
|
|
100
|
+
- Returns: Dictionary with table metadata including schema.
|
|
101
|
+
|
|
102
|
+
## Exceptions
|
|
103
|
+
|
|
104
|
+
- `TableAPIError`: Base exception for API errors.
|
|
105
|
+
- `TableNotFoundError`: Table not found (404).
|
|
106
|
+
- `QuotaExceededError`: Egress quota exceeded (429).
- `RateLimitError`: Request rate limit exceeded (429).
|
|
107
|
+
- `TableConversionError`: Failed to convert Arrow data to DataFrame.
|
|
108
|
+
- `StreamingError`: Error during data streaming or IPC parsing.
|
|
109
|
+
|
|
110
|
+
## Development
|
|
111
|
+
|
|
112
|
+
See [.env.example](.env.example) for environment variable configuration. For local development:
|
|
113
|
+
|
|
114
|
+
1. Copy `.env.example` to `.env`
|
|
115
|
+
2. Set `MYCORR_API_URL` to your local/dev base URL (no path segments)
|
|
116
|
+
3. SSL verification is automatically disabled for `localhost` and `127.0.0.1` URLs
|
|
117
|
+
|
|
118
|
+
## License
|
|
119
|
+
|
|
120
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pymycorr"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "Python client for fetching table data from the MyCorr API via Apache Arrow"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
authors = [{ name = "Maximilian Schuberth", email = "maxs@recons-ltd.com" }]
|
|
8
|
+
requires-python = ">=3.10"
|
|
9
|
+
keywords = ["mycorr", "arrow", "dataframe", "api-client", "polars", "pandas"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 3 - Alpha",
|
|
12
|
+
"Intended Audience :: Developers",
|
|
13
|
+
"Intended Audience :: Science/Research",
|
|
14
|
+
"License :: OSI Approved :: MIT License",
|
|
15
|
+
"Operating System :: OS Independent",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.10",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Programming Language :: Python :: 3.13",
|
|
21
|
+
"Topic :: Scientific/Engineering",
|
|
22
|
+
"Typing :: Typed",
|
|
23
|
+
]
|
|
24
|
+
dependencies = ["httpx>=0.27.0", "pyarrow>=14.0.0", "python-dotenv>=1.0.0", "pandas>=2.0.0"]
|
|
25
|
+
|
|
26
|
+
[project.optional-dependencies]
|
|
27
|
+
polars = ["polars>=0.20.0"]
|
|
28
|
+
jupyter = ["nest-asyncio>=1.5.0"]
|
|
29
|
+
all = ["polars>=0.20.0", "nest-asyncio>=1.5.0"]
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Homepage = "https://github.com/recons-ltd/pymycorr"
|
|
33
|
+
Documentation = "https://github.com/recons-ltd/pymycorr#readme"
|
|
34
|
+
Repository = "https://github.com/recons-ltd/pymycorr"
|
|
35
|
+
Issues = "https://github.com/recons-ltd/pymycorr/issues"
|
|
36
|
+
Changelog = "https://github.com/recons-ltd/pymycorr/blob/main/CHANGELOG.md"
|
|
37
|
+
|
|
38
|
+
[build-system]
|
|
39
|
+
requires = ["hatchling"]
|
|
40
|
+
build-backend = "hatchling.build"
|
|
41
|
+
|
|
42
|
+
[tool.hatch.build.targets.sdist]
|
|
43
|
+
include = ["/src"]
|
|
44
|
+
|
|
45
|
+
[tool.hatch.build.targets.wheel]
|
|
46
|
+
packages = ["src/pymycorr"]
|
|
47
|
+
|
|
48
|
+
[dependency-groups]
|
|
49
|
+
dev = [
|
|
50
|
+
"pytest>=8.0",
|
|
51
|
+
"pytest-asyncio>=0.23",
|
|
52
|
+
"pytest-cov>=4.1",
|
|
53
|
+
"respx>=0.21.0",
|
|
54
|
+
"mypy>=1.8",
|
|
55
|
+
"ruff>=0.2",
|
|
56
|
+
"pre-commit>=3.6",
|
|
57
|
+
"ipykernel>=6.29",
|
|
58
|
+
"notebook>=7.0",
|
|
59
|
+
"build>=1.0",
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
[tool.ruff]
|
|
63
|
+
target-version = "py310"
|
|
64
|
+
line-length = 100
|
|
65
|
+
src = ["src", "tests"]
|
|
66
|
+
|
|
67
|
+
[tool.ruff.lint]
|
|
68
|
+
select = ["E", "F", "I", "UP", "B", "SIM"]
|
|
69
|
+
|
|
70
|
+
[tool.mypy]
|
|
71
|
+
python_version = "3.10"
|
|
72
|
+
strict = true
|
|
73
|
+
warn_return_any = true
|
|
74
|
+
warn_unused_configs = true
|
|
75
|
+
warn_unused_ignores = false
|
|
76
|
+
|
|
77
|
+
[[tool.mypy.overrides]]
|
|
78
|
+
module = ["pyarrow.*", "httpx.*", "polars.*", "pandas.*", "nest_asyncio.*", "IPython.*"]
|
|
79
|
+
ignore_missing_imports = true
|
|
80
|
+
|
|
81
|
+
[[tool.mypy.overrides]]
|
|
82
|
+
module = ["tests.*"]
|
|
83
|
+
disallow_untyped_decorators = false
|
|
84
|
+
|
|
85
|
+
[tool.pytest.ini_options]
|
|
86
|
+
testpaths = ["tests"]
|
|
87
|
+
asyncio_mode = "auto"
|
|
88
|
+
asyncio_default_fixture_loop_scope = "function"
|
|
89
|
+
|
|
90
|
+
[tool.coverage.run]
|
|
91
|
+
source = ["src/pymycorr"]
|
|
92
|
+
branch = true
|
|
93
|
+
|
|
94
|
+
[tool.coverage.report]
|
|
95
|
+
exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:"]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""PyMyCorr - Python client for fetching table data from the MyCorr API."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import version
|
|
4
|
+
|
|
5
|
+
from pymycorr.client import MyCorr
|
|
6
|
+
from pymycorr.exceptions import (
|
|
7
|
+
QuotaExceededError,
|
|
8
|
+
RateLimitError,
|
|
9
|
+
StreamingError,
|
|
10
|
+
TableAPIError,
|
|
11
|
+
TableConversionError,
|
|
12
|
+
TableNotFoundError,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
__version__ = version("pymycorr")
|
|
16
|
+
__all__ = [
|
|
17
|
+
"MyCorr",
|
|
18
|
+
"QuotaExceededError",
|
|
19
|
+
"RateLimitError",
|
|
20
|
+
"StreamingError",
|
|
21
|
+
"TableAPIError",
|
|
22
|
+
"TableNotFoundError",
|
|
23
|
+
"TableConversionError",
|
|
24
|
+
"__version__",
|
|
25
|
+
]
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Progress display utilities for streaming operations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
import time
|
|
7
|
+
from typing import Any, Literal
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def detect_environment() -> Literal["notebook", "terminal", "non_interactive"]:
    """Classify the current execution environment for progress rendering.

    Returns:
        'notebook' when an IPython kernel (Jupyter) is driving execution,
        'terminal' when stdout is attached to an interactive TTY,
        'non_interactive' otherwise (piped output, CI, etc.).
    """
    # A Jupyter kernel registers an IPKernelApp in the IPython config;
    # plain terminal IPython does not, which distinguishes the two cases.
    try:
        from IPython.core.getipython import get_ipython

        shell = get_ipython()  # type: ignore[no-untyped-call]
        if shell is not None and "IPKernelApp" in shell.config:
            return "notebook"
    except (ImportError, AttributeError):
        pass

    out = sys.stdout
    if hasattr(out, "isatty") and out.isatty():
        return "terminal"
    return "non_interactive"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def format_bytes(num_bytes: int | float) -> str:
    """Render a byte count as a human-readable size string.

    Args:
        num_bytes: Number of bytes (may be fractional or negative).

    Returns:
        A string such as '12.5 MB', using 1024-based units up to TB.
    """
    size = float(num_bytes)
    units = ("B", "KB", "MB", "GB")
    idx = 0
    # Scale down by 1024 until the value fits the current unit,
    # falling through to TB for anything beyond GB.
    while idx < len(units) and abs(size) >= 1024:
        size /= 1024
        idx += 1
    if idx < len(units):
        return f"{size:.1f} {units[idx]}"
    return f"{size:.1f} TB"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def format_elapsed(seconds: float) -> str:
    """Render an elapsed duration as a zero-padded MM:SS string.

    Args:
        seconds: Elapsed time in seconds (fractions are truncated).

    Returns:
        A string such as '01:23'; minutes may exceed 59 (e.g. '75:02').
    """
    total = int(seconds)
    minutes = total // 60
    remainder = total % 60
    return f"{minutes:02d}:{remainder:02d}"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def format_transfer_summary(
    total_bytes: int,
    elapsed_seconds: float,
    uncompressed_bytes: int | None = None,
) -> str:
    """Build the one-line completion summary for a finished transfer.

    Args:
        total_bytes: Compressed bytes transferred over the wire.
        elapsed_seconds: Wall-clock duration of the transfer in seconds.
        uncompressed_bytes: Optional in-memory size after decompression.

    Returns:
        A string like 'Downloaded 4.2 MB → 14.7 MB in 2.9s (1.4 MB/s)'.
    """
    # Guard against a zero/instant transfer so the rate never divides by zero.
    if elapsed_seconds > 0:
        rate = format_bytes(total_bytes / elapsed_seconds)
    else:
        rate = format_bytes(0)

    parts = [format_bytes(total_bytes)]
    if uncompressed_bytes is not None:
        parts.append(format_bytes(uncompressed_bytes))
    size_str = " → ".join(parts)
    return f"Downloaded {size_str} in {elapsed_seconds:.1f}s ({rate}/s)"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class ProgressTracker:
    """Context manager that renders live download progress as text.

    While active, progress updates repeatedly overwrite a single output
    line; after exiting, ``print_summary`` can emit a final summary line.
    """

    def __init__(
        self,
        enabled: bool | Literal["auto"],
        desc: str = "Downloading",
    ) -> None:
        """Set up the tracker.

        Args:
            enabled: True to always render, False to never render,
                'auto' to render only in interactive environments.
            desc: Label prefixed to the live progress line.
        """
        self._enabled = enabled
        self._desc = desc
        self._start_time: float = 0
        self._elapsed: float = 0
        self._total_bytes: int = 0
        self._show_progress: bool = False
        self._last_line_len: int = 0

    def __enter__(self) -> ProgressTracker:
        """Record the start time and decide whether to render output."""
        self._start_time = time.monotonic()
        self._show_progress = self._should_show()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Capture elapsed time and erase the live line (prints no summary)."""
        self._elapsed = time.monotonic() - self._start_time
        if self._show_progress:
            self._clear_line()

    @property
    def total_bytes(self) -> int:
        """Total compressed bytes received so far."""
        return self._total_bytes

    def print_summary(self, uncompressed_bytes: int | None = None) -> None:
        """Emit the final transfer summary if progress was being shown.

        Args:
            uncompressed_bytes: Optional uncompressed (in-memory) size.
        """
        if not self._show_progress or self._total_bytes <= 0:
            return
        text = format_transfer_summary(self._total_bytes, self._elapsed, uncompressed_bytes)
        sys.stdout.write(text + "\n")
        sys.stdout.flush()

    def _should_show(self) -> bool:
        """Resolve the enabled flag against the detected environment."""
        if self._enabled == "auto":
            return detect_environment() != "non_interactive"
        return self._enabled is not False

    def _clear_line(self) -> None:
        """Blank out the previously written progress line, if any."""
        if self._last_line_len > 0:
            blanks = " " * self._last_line_len
            sys.stdout.write("\r" + blanks + "\r")

    def _write_line(self, line: str) -> None:
        """Replace the current progress line, remembering its length."""
        self._clear_line()
        sys.stdout.write(line)
        sys.stdout.flush()
        self._last_line_len = len(line)

    def update(self, chunk_size: int) -> None:
        """Account for a newly received chunk and refresh the display.

        Args:
            chunk_size: Number of bytes just received.
        """
        self._total_bytes += chunk_size
        if not self._show_progress:
            return
        elapsed = time.monotonic() - self._start_time
        rate = self._total_bytes / elapsed if elapsed > 0 else 0
        self._write_line(
            f"\r{self._desc}: {format_bytes(self._total_bytes)} "
            f"[{format_elapsed(elapsed)}, {format_bytes(rate)}/s]"
        )
|
|
@@ -0,0 +1,593 @@
|
|
|
1
|
+
"""MyCorr API client for fetching table data with Arrow format support."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import os
|
|
7
|
+
import queue
|
|
8
|
+
import threading
|
|
9
|
+
from collections.abc import AsyncIterator, Coroutine, Iterator
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast
|
|
12
|
+
from urllib.parse import urlparse
|
|
13
|
+
|
|
14
|
+
import httpx
|
|
15
|
+
import pyarrow as pa
|
|
16
|
+
import pyarrow.ipc as ipc
|
|
17
|
+
from dotenv import load_dotenv
|
|
18
|
+
|
|
19
|
+
from pymycorr._progress import ProgressTracker
|
|
20
|
+
from pymycorr.exceptions import (
|
|
21
|
+
QuotaExceededError,
|
|
22
|
+
RateLimitError,
|
|
23
|
+
StreamingError,
|
|
24
|
+
TableAPIError,
|
|
25
|
+
TableConversionError,
|
|
26
|
+
TableNotFoundError,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
import pandas as pd
|
|
31
|
+
import polars as pl
|
|
32
|
+
|
|
33
|
+
T = TypeVar("T")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class _StreamingBuffer:
|
|
37
|
+
"""Bridges async HTTP chunks to PyArrow's synchronous read() interface."""
|
|
38
|
+
|
|
39
|
+
def __init__(self) -> None:
|
|
40
|
+
self._buffer = bytearray()
|
|
41
|
+
self._chunks: queue.Queue[bytes | None] = queue.Queue()
|
|
42
|
+
self._eof = False
|
|
43
|
+
|
|
44
|
+
def feed(self, chunk: bytes) -> None:
|
|
45
|
+
"""Called by producer with incoming chunks."""
|
|
46
|
+
self._chunks.put(chunk)
|
|
47
|
+
|
|
48
|
+
def close(self) -> None:
|
|
49
|
+
"""Signal end of stream."""
|
|
50
|
+
self._chunks.put(None)
|
|
51
|
+
|
|
52
|
+
def read(self, n: int = -1) -> bytes:
|
|
53
|
+
"""Called by PyArrow - blocks until data available."""
|
|
54
|
+
while not self._eof and (n == -1 or len(self._buffer) < n):
|
|
55
|
+
try:
|
|
56
|
+
chunk = self._chunks.get(timeout=300)
|
|
57
|
+
except queue.Empty:
|
|
58
|
+
break
|
|
59
|
+
if chunk is None:
|
|
60
|
+
self._eof = True
|
|
61
|
+
break
|
|
62
|
+
self._buffer.extend(chunk)
|
|
63
|
+
|
|
64
|
+
if n == -1 or n >= len(self._buffer):
|
|
65
|
+
result = bytes(self._buffer)
|
|
66
|
+
self._buffer.clear()
|
|
67
|
+
else:
|
|
68
|
+
result = bytes(self._buffer[:n])
|
|
69
|
+
del self._buffer[:n]
|
|
70
|
+
return result
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class MyCorr:
|
|
74
|
+
"""Client for fetching table data from API with Arrow format support."""
|
|
75
|
+
|
|
76
|
+
DEFAULT_URL = "https://mycorr.app"
|
|
77
|
+
_default_progress: bool | Literal["auto"]
|
|
78
|
+
|
|
79
|
+
def __init__(
|
|
80
|
+
self,
|
|
81
|
+
url: str | None = None,
|
|
82
|
+
token: str | None = None,
|
|
83
|
+
env_file: str | Path | None = None,
|
|
84
|
+
progress: bool | Literal["auto"] = "auto",
|
|
85
|
+
) -> None:
|
|
86
|
+
"""Initialize the client with authentication token and API URL.
|
|
87
|
+
|
|
88
|
+
Configuration is resolved in the following order:
|
|
89
|
+
1. Explicit parameters (highest priority)
|
|
90
|
+
2. Environment variables (MYCORR_API_URL, MYCORR_API_TOKEN)
|
|
91
|
+
3. Default values (URL only)
|
|
92
|
+
|
|
93
|
+
Args:
|
|
94
|
+
url: API base URL. Falls back to MYCORR_API_URL env var, then default.
|
|
95
|
+
token: Authentication token. Falls back to MYCORR_API_TOKEN env var.
|
|
96
|
+
env_file: Optional path to .env file. If None, auto-discovers .env.
|
|
97
|
+
progress: Show download progress. True always shows, False never shows,
|
|
98
|
+
'auto' (default) shows in notebooks/terminals but not in non-interactive.
|
|
99
|
+
|
|
100
|
+
Raises:
|
|
101
|
+
ValueError: If token cannot be resolved.
|
|
102
|
+
"""
|
|
103
|
+
load_dotenv(dotenv_path=env_file)
|
|
104
|
+
|
|
105
|
+
self.url = (url or os.getenv("MYCORR_API_URL") or self.DEFAULT_URL).rstrip("/")
|
|
106
|
+
self.token = token or os.getenv("MYCORR_API_TOKEN")
|
|
107
|
+
hostname = urlparse(self.url).hostname or ""
|
|
108
|
+
self._verify_ssl = hostname not in ("localhost", "127.0.0.1")
|
|
109
|
+
self._default_progress = progress
|
|
110
|
+
|
|
111
|
+
if not self.token:
|
|
112
|
+
raise ValueError(
|
|
113
|
+
"token is required (pass explicitly or set MYCORR_API_TOKEN environment variable)"
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
def _run_sync(self, coro: Coroutine[Any, Any, T]) -> T:
|
|
117
|
+
"""Run an async coroutine from a sync context.
|
|
118
|
+
|
|
119
|
+
Handles detection of running event loops and applies nest_asyncio
|
|
120
|
+
when needed (e.g., in Jupyter notebooks).
|
|
121
|
+
"""
|
|
122
|
+
try:
|
|
123
|
+
loop = asyncio.get_running_loop()
|
|
124
|
+
except RuntimeError:
|
|
125
|
+
return asyncio.run(coro)
|
|
126
|
+
|
|
127
|
+
if loop.is_running():
|
|
128
|
+
try:
|
|
129
|
+
import nest_asyncio
|
|
130
|
+
|
|
131
|
+
nest_asyncio.apply()
|
|
132
|
+
return loop.run_until_complete(coro)
|
|
133
|
+
except ImportError as e:
|
|
134
|
+
raise RuntimeError(
|
|
135
|
+
"nest_asyncio is required when calling from within an async context "
|
|
136
|
+
"(e.g., Jupyter notebooks). Install with: pip install nest-asyncio"
|
|
137
|
+
) from e
|
|
138
|
+
return loop.run_until_complete(coro)
|
|
139
|
+
|
|
140
|
+
@staticmethod
|
|
141
|
+
def _handle_error_response(status_code: int, body: bytes | str, context: str = "") -> None:
|
|
142
|
+
"""Raise the appropriate exception for an HTTP error response.
|
|
143
|
+
|
|
144
|
+
Args:
|
|
145
|
+
status_code: The HTTP status code.
|
|
146
|
+
body: The raw response body (bytes or str).
|
|
147
|
+
context: Description of the operation for error messages.
|
|
148
|
+
|
|
149
|
+
Raises:
|
|
150
|
+
TableNotFoundError: If the status code is 404.
|
|
151
|
+
QuotaExceededError: If the status code is 429.
|
|
152
|
+
TableAPIError: For all other non-200 status codes.
|
|
153
|
+
"""
|
|
154
|
+
try:
|
|
155
|
+
import json
|
|
156
|
+
|
|
157
|
+
error_body = json.loads(body) if isinstance(body, bytes | str) else {}
|
|
158
|
+
except Exception:
|
|
159
|
+
error_body = {}
|
|
160
|
+
|
|
161
|
+
if status_code == 404:
|
|
162
|
+
msg = error_body.get("message", "Not found")
|
|
163
|
+
raise TableNotFoundError(f"{context}{msg}")
|
|
164
|
+
if status_code == 429:
|
|
165
|
+
if error_body.get("error") == "rate_limit_exceeded":
|
|
166
|
+
raise RateLimitError(error_body)
|
|
167
|
+
raise QuotaExceededError(error_body)
|
|
168
|
+
|
|
169
|
+
msg = error_body.get("message", f"HTTP {status_code}")
|
|
170
|
+
raise TableAPIError(f"{context}{msg}")
|
|
171
|
+
|
|
172
|
+
@staticmethod
|
|
173
|
+
def _build_version_params(
|
|
174
|
+
table_id: str,
|
|
175
|
+
version: int | str | None,
|
|
176
|
+
*,
|
|
177
|
+
include_scope: bool = True,
|
|
178
|
+
) -> dict[str, Any]:
|
|
179
|
+
"""Build query params dict for version specification.
|
|
180
|
+
|
|
181
|
+
Args:
|
|
182
|
+
table_id: The table identifier.
|
|
183
|
+
version: Version number (int), alias (str), or None for 'latest'.
|
|
184
|
+
include_scope: Whether to include scope='read' in params.
|
|
185
|
+
|
|
186
|
+
Returns:
|
|
187
|
+
Dict with table_id, version/version_alias, and optionally scope.
|
|
188
|
+
|
|
189
|
+
Raises:
|
|
190
|
+
TypeError: If version has an invalid type.
|
|
191
|
+
"""
|
|
192
|
+
if version is None:
|
|
193
|
+
version = "latest"
|
|
194
|
+
|
|
195
|
+
params: dict[str, Any] = {"table_id": table_id}
|
|
196
|
+
if include_scope:
|
|
197
|
+
params["scope"] = "read"
|
|
198
|
+
|
|
199
|
+
if isinstance(version, int):
|
|
200
|
+
params["version"] = version
|
|
201
|
+
elif isinstance(version, str):
|
|
202
|
+
params["version_alias"] = version
|
|
203
|
+
else:
|
|
204
|
+
raise TypeError(
|
|
205
|
+
f"Expected 'version' to be int, str, or None, got {type(version).__name__}"
|
|
206
|
+
)
|
|
207
|
+
|
|
208
|
+
return params
|
|
209
|
+
|
|
210
|
+
async def _get_data_stream(
    self,
    table_id: str,
    version: int | str | None = None,
    progress: bool | Literal["auto"] | None = None,
) -> pa.Table:
    """Fetch the full Arrow stream for a table and assemble a single Table.

    Args:
        table_id: Unique identifier for the table.
        version: Version number (int) or version alias (str, e.g., 'latest', 'stable').
        progress: Show download progress. None uses client default.

    Returns:
        PyArrow Table containing the data.

    Raises:
        ValueError: If table_id is empty.
        TableNotFoundError: If the table is not found (404).
        QuotaExceededError: If the egress quota is exceeded (429).
        TableAPIError: For other API errors.
        StreamingError: If IPC stream parsing fails.
    """
    collected = [
        rb
        async for rb in self._stream_record_batches(table_id, version, progress=progress)
    ]
    # An empty stream carries no schema to build from, so fall back to an
    # empty (schema-less) table rather than calling from_batches([]).
    return pa.Table.from_batches(collected) if collected else pa.table({})
|
|
241
|
+
|
|
242
|
+
async def _stream_record_batches(
    self,
    table_id: str,
    version: int | str | None = None,
    progress: bool | Literal["auto"] | None = None,
) -> AsyncIterator[pa.RecordBatch]:
    """Stream Arrow record batches from API asynchronously.

    Yields individual RecordBatch objects once the response has been
    received. Note that the current implementation buffers the complete
    (wire-compressed) response body before parsing, so batches are only
    yielded after the download finishes; memory use is bounded by the
    payload size, not by a single batch.

    Args:
        table_id: Unique identifier for the table.
        version: Version number (int) or version alias (str, e.g., 'latest', 'stable').
        progress: Show download progress. None uses client default.

    Yields:
        PyArrow RecordBatch objects.

    Raises:
        ValueError: If table_id is empty.
        TableNotFoundError: If the table is not found (404).
        QuotaExceededError: If the egress quota is exceeded (429).
        TableAPIError: For other API errors.
        StreamingError: If IPC stream parsing fails or the connection drops.
    """
    if not table_id:
        raise ValueError("Table ID is required")

    params = self._build_version_params(table_id, version)
    # Per-call override falls back to the client-wide default.
    effective_progress: bool | Literal["auto"] = (
        progress if progress is not None else self._default_progress
    )
    # Generous read timeout for large tables; separate, shorter connect timeout.
    timeout = httpx.Timeout(timeout=300.0, connect=30.0)

    try:
        async with (
            httpx.AsyncClient(timeout=timeout, verify=self._verify_ssl) as client,
            client.stream(
                "GET",
                f"{self.url}/data/table/stream",
                headers={
                    "Authorization": f"Bearer {self.token}",
                    "Accept": "application/vnd.apache.arrow.stream",
                },
                params=params,
            ) as response,
        ):
            if response.status_code != 200:
                # Body is not read by client.stream() until requested; read it
                # so _handle_error_response can inspect the error payload.
                await response.aread()
                self._handle_error_response(
                    response.status_code, response.content, "Error fetching table: "
                )

            # Collect all chunks (server now sends single IPC stream)
            chunks: list[bytes] = []
            tracker = ProgressTracker(effective_progress, desc=f"Fetching {table_id}")
            prev_downloaded = 0
            with tracker:
                async for chunk in response.aiter_bytes():
                    # num_bytes_downloaded counts wire (possibly compressed)
                    # bytes; track the delta since the previous chunk.
                    wire_bytes = response.num_bytes_downloaded
                    tracker.update(wire_bytes - prev_downloaded)
                    prev_downloaded = wire_bytes
                    chunks.append(chunk)

            # Parse single IPC stream with PyArrow's native reader
            data = b"".join(chunks)
            try:
                reader = ipc.open_stream(data)
                uncompressed = 0
                for batch in reader:
                    uncompressed += batch.nbytes
                    yield batch
            except (pa.ArrowInvalid, pa.ArrowIOError) as e:
                raise StreamingError(f"Failed to parse IPC stream: {e}") from e
            else:
                # Only print the summary when parsing completed cleanly.
                tracker.print_summary(uncompressed_bytes=uncompressed)

    except httpx.HTTPError as e:
        raise StreamingError(f"Connection error: {e!s}") from e
|
|
322
|
+
|
|
323
|
+
def _iter_record_batches(
    self,
    table_id: str,
    version: int | str | None = None,
    progress: bool | Literal["auto"] | None = None,
) -> Iterator[pa.RecordBatch]:
    """Synchronous iterator over record batches.

    When no event loop is running, a background thread plus queue gives
    true streaming. Inside an already-running loop (e.g. Jupyter), all
    batches are collected first via _run_sync (requires nest-asyncio).

    Args:
        table_id: Unique identifier for the table.
        version: Version number (int) or version alias (str, e.g., 'latest', 'stable').
        progress: Show download progress. None uses client default.

    Yields:
        PyArrow RecordBatch objects.

    Raises:
        ValueError: If table_id is empty.
        TableNotFoundError: If the table is not found (404).
        QuotaExceededError: If the egress quota is exceeded (429).
        TableAPIError: For other API errors.
        StreamingError: If IPC stream parsing fails.
        RuntimeError: If nest_asyncio required but not installed.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No running loop: stream for real via a producer thread.
        yield from self._sync_stream_batches(table_id, version, progress=progress)
        return

    # A loop is already running (Jupyter / async context): we cannot drive
    # it synchronously while streaming, so gather every batch up front.
    async def _collect_all() -> list[pa.RecordBatch]:
        gathered: list[pa.RecordBatch] = []
        async for rb in self._stream_record_batches(table_id, version, progress=progress):
            gathered.append(rb)
        return gathered

    yield from self._run_sync(_collect_all())
|
|
371
|
+
|
|
372
|
+
def _sync_stream_batches(
    self,
    table_id: str,
    version: int | str | None,
    progress: bool | Literal["auto"] | None = None,
) -> Iterator[pa.RecordBatch]:
    """True streaming sync iteration using thread + PyArrow native reader.

    Uses _StreamingBuffer to bridge async HTTP chunks to PyArrow's
    synchronous read() interface, enabling true streaming where batches
    are yielded as data arrives over the network. A daemon thread runs its
    own event loop (asyncio.run) to drive the HTTP download and feeds the
    buffer; the calling thread reads batches out of it.
    """
    buffer = _StreamingBuffer()
    # One-element lists act as cross-thread mailboxes: the producer thread
    # appends, the consumer reads them after join(). No lock is taken here;
    # presumably append/read ordering via join() is the synchronization —
    # TODO(review) confirm _StreamingBuffer/join ordering is sufficient.
    error_holder: list[Exception] = []
    tracker_holder: list[ProgressTracker] = []

    def producer() -> None:
        # Runs in the daemon thread and owns its own event loop.
        async def fetch() -> None:
            params = self._build_version_params(table_id, version)
            effective_progress: bool | Literal["auto"] = (
                progress if progress is not None else self._default_progress
            )
            timeout = httpx.Timeout(timeout=300.0, connect=30.0)

            try:
                async with (
                    httpx.AsyncClient(timeout=timeout, verify=self._verify_ssl) as client,
                    client.stream(
                        "GET",
                        f"{self.url}/data/table/stream",
                        headers={
                            "Authorization": f"Bearer {self.token}",
                            "Accept": "application/vnd.apache.arrow.stream",
                        },
                        params=params,
                    ) as response,
                ):
                    if response.status_code != 200:
                        # Read the body so the error payload is available.
                        await response.aread()
                        self._handle_error_response(
                            response.status_code,
                            response.content,
                            "Error fetching table: ",
                        )

                    tracker = ProgressTracker(effective_progress, desc=f"Fetching {table_id}")
                    # Expose the tracker to the consumer thread so it can
                    # print the final summary after all batches are read.
                    tracker_holder.append(tracker)
                    prev_downloaded = 0
                    with tracker:
                        async for chunk in response.aiter_bytes():
                            # num_bytes_downloaded is cumulative wire bytes;
                            # report only the delta to the tracker.
                            wire_bytes = response.num_bytes_downloaded
                            tracker.update(wire_bytes - prev_downloaded)
                            prev_downloaded = wire_bytes
                            buffer.feed(chunk)
            except Exception as e:
                # Captured for re-raise on the consumer thread; raising here
                # would be swallowed by the thread boundary.
                error_holder.append(e)
            finally:
                # Always close so the consumer's blocking read() unblocks,
                # even on error.
                buffer.close()

        asyncio.run(fetch())

    thread = threading.Thread(target=producer, daemon=True)
    thread.start()

    # PyArrow reads from buffer as data arrives
    uncompressed = 0
    try:
        reader = ipc.open_stream(buffer)
        for batch in reader:
            uncompressed += batch.nbytes
            yield batch
    except (pa.ArrowInvalid, pa.ArrowIOError) as e:
        # A parse failure may be the symptom of an upstream HTTP error that
        # truncated the buffer — prefer reporting the producer's exception.
        thread.join()
        if error_holder:
            raise error_holder[0] from e
        raise StreamingError(f"Failed to parse IPC stream: {e}") from e

    thread.join()

    # Re-raise any error from the producer thread
    if error_holder:
        raise error_holder[0]

    # Print summary with uncompressed size from main thread
    if tracker_holder:
        tracker_holder[0].print_summary(uncompressed_bytes=uncompressed)
|
|
458
|
+
|
|
459
|
+
def _stream_to_dataframes(
    self,
    table_id: str,
    version: int | str | None = None,
    engine: Literal["pandas", "polars"] = "pandas",
    progress: bool | Literal["auto"] | None = None,
) -> Iterator[Any]:
    """Stream data as individual DataFrames per batch.

    Useful for processing large tables in chunks without loading
    everything into memory.

    Args:
        table_id: Unique identifier for the table.
        version: Version number (int) or version alias (str, e.g., 'latest', 'stable').
        engine: Data processing engine ('pandas' or 'polars').
        progress: Show download progress. None uses client default.

    Yields:
        DataFrame in the specified format (pandas or polars).

    Raises:
        ValueError: If engine is not supported.
        TableNotFoundError: If the table is not found (404).
        QuotaExceededError: If the egress quota is exceeded (429).
        TableAPIError: For other API errors.
        StreamingError: If IPC stream parsing fails.
        TableConversionError: If conversion to DataFrame fails.
    """
    if engine not in ("pandas", "polars"):
        raise ValueError(f"Engine must be 'pandas' or 'polars', got '{engine}'")

    for batch in self._iter_record_batches(table_id, version, progress=progress):
        # Keep the try body limited to the conversion itself. Previously the
        # yield sat inside the try, so an exception thrown into the generator
        # at the yield point was mislabeled as a TableConversionError.
        try:
            if engine == "pandas":
                # Convert batch to table first to ensure DataFrame output
                frame = pa.Table.from_batches([batch]).to_pandas()
            else:
                import polars as pl_module

                frame = pl_module.from_arrow(batch)
        except Exception as e:
            raise TableConversionError(
                f"Failed to convert batch to {engine} format: {e!s}"
            ) from e
        yield frame
|
|
505
|
+
|
|
506
|
+
def get_table(
    self,
    table_id: str,
    version: int | str | None = None,
    engine: Literal["pandas", "polars"] = "pandas",
    progress: bool | Literal["auto"] | None = None,
) -> pd.DataFrame | pl.DataFrame:
    """Fetch table data synchronously and convert to DataFrame.

    Args:
        table_id: Unique identifier for the table.
        version: Version number (int) or version alias (str, e.g., 'latest', 'stable').
        engine: Data processing engine ('pandas' or 'polars').
        progress: Show download progress. None uses client default.

    Returns:
        DataFrame in the specified format (pandas or polars).

    Raises:
        TypeError: If version has wrong type.
        ValueError: If engine is not supported.
        TableNotFoundError: If the table is not found (404).
        QuotaExceededError: If the egress quota is exceeded (429).
        TableAPIError: For other API-related errors.
        TableConversionError: If conversion to DataFrame fails.
        StreamingError: If IPC stream parsing fails.
    """
    if engine not in ("pandas", "polars"):
        raise ValueError(f"Engine must be 'pandas' or 'polars', got '{engine}'")

    async def get_dataframe_async() -> pd.DataFrame | pl.DataFrame:
        """Fetch the Arrow table and convert it to the requested engine."""
        arrow_table = await self._get_data_stream(table_id, version, progress=progress)

        try:
            if engine == "polars":
                import polars as pl_module

                return cast("pl.DataFrame", pl_module.from_arrow(arrow_table))
            return cast("pd.DataFrame", arrow_table.to_pandas())
        except Exception as e:
            raise TableConversionError(
                f"Failed to convert table to {engine} format: {e!s}"
            ) from e

    # Bridge the async fetch into this synchronous entry point.
    return self._run_sync(get_dataframe_async())
|
|
553
|
+
|
|
554
|
+
def get_table_info(self, table_id: str, version: int | str | None = None) -> dict[str, Any]:
    """Get table schema and metadata without fetching full data.

    Args:
        table_id: Unique identifier for the table.
        version: Version number (int) or version alias (str, e.g., 'latest', 'stable').

    Returns:
        Dictionary containing table snapshot (version, meta, schema).

    Raises:
        ValueError: If table_id is empty.
        TypeError: If version has an invalid type.
        TableNotFoundError: If the table is not found (404).
        QuotaExceededError: If the egress quota is exceeded (429).
        TableAPIError: For other API-related errors.
    """
    if not table_id:
        raise ValueError("Table ID is required")

    # Reuse the shared builder so this endpoint resolves versions exactly
    # like the streaming endpoint. The previous inline logic sent
    # version_alias='latest' alongside an explicit integer version, which
    # could make the server resolve the alias instead of the number.
    params = self._build_version_params(table_id, version)

    response = httpx.get(
        f"{self.url}/data/tableinfo",
        headers={"Authorization": f"Bearer {self.token}"},
        params=params,
        verify=self._verify_ssl,
    )

    if response.status_code != 200:
        self._handle_error_response(
            response.status_code, response.text, "Error fetching table info: "
        )

    return cast("dict[str, Any]", response.json())
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Custom exceptions for the PyMyCorr client."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TableAPIError(Exception):
    """Root of the exception hierarchy for table API failures.

    All errors raised by the client derive from this class, so callers can
    catch it to handle any API-related failure uniformly.
    """
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TableNotFoundError(TableAPIError):
    """Raised when the requested table does not exist (HTTP 404)."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TableConversionError(TableAPIError):
    """Raised when converting fetched data to a DataFrame fails."""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class StreamingError(TableAPIError):
    """Raised when an error occurs during data streaming.

    Carries ``batches_received``, the count of record batches that were
    successfully delivered before the failure occurred.
    """

    def __init__(self, message: str, batches_received: int = 0) -> None:
        super().__init__(message)
        # How many batches made it through before the stream broke.
        self.batches_received = batches_received
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class RateLimitError(TableAPIError):
    """Raised when the API rate limit is exceeded (HTTP 429, governor).

    Attributes:
        detail: Raw error payload from the server.
        retry_after: Seconds to wait before retrying, parsed from the
            server's ``retry_after_secs`` field, or ``None`` when the field
            is absent or not a number.
    """

    def __init__(self, detail: dict[str, Any]) -> None:
        super().__init__(detail.get("message") or detail.get("error", "rate limit exceeded"))
        self.detail = detail
        # int(None) raises TypeError when the key is missing, matching the
        # original KeyError path: any failure yields retry_after = None.
        raw = detail.get("retry_after_secs")
        parsed: int | None
        try:
            parsed = int(raw)
        except (ValueError, TypeError):
            parsed = None
        self.retry_after = parsed
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class QuotaExceededError(TableAPIError):
    """Raised when the daily egress quota is exceeded (HTTP 429).

    The ``detail`` attribute holds the raw error payload from the server
    (e.g. error code and reset time, when the server supplies them).
    """

    def __init__(self, detail: dict[str, Any]) -> None:
        msg = detail.get("message") or detail.get("error", "egress quota exceeded")
        super().__init__(msg)
        self.detail = detail
|
|
File without changes
|