oasis-data-manager 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/MANIFEST.in +1 -2
- oasis_data_manager-0.2.3/PKG-INFO +410 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/README.md +28 -0
- oasis_data_manager-0.2.3/oasis_data_manager/__init__.py +1 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/filestore/backends/aws.py +7 -1
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/filestore/backends/base.py +9 -7
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/filestore/config.py +1 -0
- oasis_data_manager-0.2.3/oasis_data_manager.egg-info/PKG-INFO +410 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager.egg-info/SOURCES.txt +1 -3
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager.egg-info/requires.txt +14 -2
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager.egg-info/top_level.txt +1 -0
- oasis_data_manager-0.2.3/pyproject.toml +73 -0
- oasis_data_manager-0.2.3/setup.cfg +4 -0
- oasis_data_manager-0.2.2/PKG-INFO +0 -38
- oasis_data_manager-0.2.2/oasis_data_manager/__init__.py +0 -1
- oasis_data_manager-0.2.2/oasis_data_manager.egg-info/PKG-INFO +0 -38
- oasis_data_manager-0.2.2/requirements-package.in +0 -5
- oasis_data_manager-0.2.2/setup.cfg +0 -29
- oasis_data_manager-0.2.2/setup.py +0 -64
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/CHANGELOG.rst +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/LICENSE +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/complex/__init__.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/complex/complex.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/complex/examples.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/config.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/df_reader/__init__.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/df_reader/backends/__init__.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/df_reader/backends/base.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/df_reader/backends/dask.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/df_reader/backends/pandas.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/df_reader/backends/pyarrow.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/df_reader/config.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/df_reader/exceptions.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/df_reader/reader.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/errors/__init__.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/filestore/__init__.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/filestore/backends/__init__.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/filestore/backends/aws_s3.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/filestore/backends/azure.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/filestore/backends/azure_abfs.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/filestore/backends/local.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/filestore/filestore.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/filestore/log.py +0 -0
- {oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager.egg-info/dependency_links.txt +0 -0
|
@@ -0,0 +1,410 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: oasis-data-manager
|
|
3
|
+
Version: 0.2.3
|
|
4
|
+
Author-email: Oasis LMF <support@oasislmf.org>
|
|
5
|
+
License: BSD-3-Clause
|
|
6
|
+
Project-URL: Homepage, https://github.com/OasisLMF/OasisDataManager
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: fastparquet
|
|
10
|
+
Requires-Dist: fsspec>=2023.12.2
|
|
11
|
+
Requires-Dist: pandas
|
|
12
|
+
Requires-Dist: typing_extensions
|
|
13
|
+
Requires-Dist: xxhash
|
|
14
|
+
Provides-Extra: geo
|
|
15
|
+
Requires-Dist: geopandas==0.14.4; extra == "geo"
|
|
16
|
+
Requires-Dist: pyogrio; extra == "geo"
|
|
17
|
+
Provides-Extra: dask
|
|
18
|
+
Requires-Dist: oasis-data-manager[geo]; extra == "dask"
|
|
19
|
+
Requires-Dist: dask>2024.1.1; extra == "dask"
|
|
20
|
+
Requires-Dist: dask-geopandas; extra == "dask"
|
|
21
|
+
Requires-Dist: dask-sql; extra == "dask"
|
|
22
|
+
Requires-Dist: distributed; extra == "dask"
|
|
23
|
+
Provides-Extra: s3
|
|
24
|
+
Requires-Dist: s3fs>=2023.12.2; extra == "s3"
|
|
25
|
+
Provides-Extra: azure
|
|
26
|
+
Requires-Dist: adlfs; extra == "azure"
|
|
27
|
+
Provides-Extra: extra
|
|
28
|
+
Requires-Dist: oasis-data-manager[geo]; extra == "extra"
|
|
29
|
+
Requires-Dist: oasis-data-manager[dask]; extra == "extra"
|
|
30
|
+
Requires-Dist: oasis-data-manager[s3]; extra == "extra"
|
|
31
|
+
Requires-Dist: oasis-data-manager[azure]; extra == "extra"
|
|
32
|
+
|
|
33
|
+
# OasisDataManager
|
|
34
|
+
|
|
35
|
+
A Python library providing unified data access patterns across different storage backends and DataFrame engines, as part of the [OasisLMF](https://github.com/OasisLMF) catastrophe modelling platform.
|
|
36
|
+
|
|
37
|
+
It abstracts:
|
|
38
|
+
- **File I/O** — local filesystem, AWS S3, Azure Blob Storage
|
|
39
|
+
- **DataFrame reading** — pandas, Dask, PyArrow
|
|
40
|
+
- **Data pipelines** — fetch → filter → SQL → transform, composable via a fluent API
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## Installation
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
# Core install (pandas + local storage only)
|
|
48
|
+
pip install oasis-data-manager
|
|
49
|
+
|
|
50
|
+
# Development install
|
|
51
|
+
pip install -e .
|
|
52
|
+
pip install -r requirements.txt
|
|
53
|
+
|
|
54
|
+
# With optional cloud and distributed features (Dask, S3, Azure, PyArrow)
|
|
55
|
+
pip install -e ".[extra]"
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Optional dependencies
|
|
59
|
+
|
|
60
|
+
The package ships several optional extras. Install any combination with `pip install "oasis-data-manager[<extra>,...]"` (the quotes stop shells such as zsh from treating the brackets as a glob pattern).
|
|
61
|
+
|
|
62
|
+
| Extra | What it adds | Key packages |
|
|
63
|
+
|---|---|---|
|
|
64
|
+
| `s3` | AWS S3 storage backend (`AwsS3Storage`) | `s3fs` |
|
|
65
|
+
| `azure` | Azure Blob Storage backend (`AzureABFSStorage`) | `adlfs` |
|
|
66
|
+
| `geo` | Geospatial DataFrame support (GeoDataFrame read/write) | `geopandas`, `pyogrio` |
|
|
67
|
+
| `dask` | Dask reader, distributed execution, and geospatial Dask support | `dask`, `dask-sql`, `distributed`, `dask-geopandas` (includes `geo`) |
|
|
68
|
+
| `extra` | Everything above bundled together | all of the above |
|
|
69
|
+
|
|
70
|
+
**Examples**
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
# S3 support only
|
|
74
|
+
pip install "oasis-data-manager[s3]"
|
|
75
|
+
|
|
76
|
+
# Both cloud backends
|
|
77
|
+
pip install "oasis-data-manager[s3,azure]"
|
|
78
|
+
|
|
79
|
+
# Dask reader (also installs geo)
|
|
80
|
+
pip install "oasis-data-manager[dask]"
|
|
81
|
+
|
|
82
|
+
# Everything
|
|
83
|
+
pip install "oasis-data-manager[extra]"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Quick start
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from oasis_data_manager.df_reader.backends.pandas import OasisPandasReader
|
|
92
|
+
from oasis_data_manager.filestore.backends.local import LocalStorage
|
|
93
|
+
|
|
94
|
+
storage = LocalStorage("/data")
|
|
95
|
+
|
|
96
|
+
# Read a CSV and get a pandas DataFrame
|
|
97
|
+
df = OasisPandasReader("accounts.csv", storage).as_pandas()
|
|
98
|
+
|
|
99
|
+
# Chain filters
|
|
100
|
+
df = (
|
|
101
|
+
OasisPandasReader("accounts.csv", storage)
|
|
102
|
+
.filter([lambda x: x[x["PortNumber"] == "1"]])
|
|
103
|
+
.as_pandas()
|
|
104
|
+
)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
## Storage backends
|
|
110
|
+
|
|
111
|
+
Three backends are provided. All share the same interface.
|
|
112
|
+
|
|
113
|
+
### Local
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from oasis_data_manager.filestore.backends.local import LocalStorage
|
|
117
|
+
|
|
118
|
+
storage = LocalStorage(root_dir="/data")
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### AWS S3
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from oasis_data_manager.filestore.backends.aws import AwsS3Storage
|
|
125
|
+
|
|
126
|
+
storage = AwsS3Storage(
|
|
127
|
+
bucket_name="my-bucket",
|
|
128
|
+
access_key="AKIA...",
|
|
129
|
+
secret_key="...",
|
|
130
|
+
root_dir="models/", # optional sub-path within the bucket
|
|
131
|
+
)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Azure Blob Storage
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from oasis_data_manager.filestore.backends.azure import AzureABFSStorage
|
|
138
|
+
|
|
139
|
+
storage = AzureABFSStorage(
|
|
140
|
+
account_name="myaccount",
|
|
141
|
+
account_key="...",
|
|
142
|
+
azure_container="my-container",
|
|
143
|
+
root_dir="models/", # optional sub-path within the container
|
|
144
|
+
)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Configuration dict pattern
|
|
148
|
+
|
|
149
|
+
Used throughout the OasisLMF platform to configure storage from serialisable dicts:
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
from oasis_data_manager.filestore.config import get_storage_from_config
|
|
153
|
+
|
|
154
|
+
config = {
|
|
155
|
+
"storage_class": "AwsS3Storage",
|
|
156
|
+
"options": {
|
|
157
|
+
"bucket_name": "my-bucket",
|
|
158
|
+
"access_key": "AKIA...",
|
|
159
|
+
"secret_key": "...",
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
storage = get_storage_from_config(config)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### Common storage operations
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
# Open a file (context manager, like built-in open)
|
|
169
|
+
with storage.open("path/to/file.csv") as f:
|
|
170
|
+
data = f.read()
|
|
171
|
+
|
|
172
|
+
# Copy a file to a local temp directory
|
|
173
|
+
local_path = storage.get("remote/file.parquet", "/tmp/")
|
|
174
|
+
|
|
175
|
+
# Upload a file
|
|
176
|
+
storage.put("/tmp/output.csv", "remote/output.csv")
|
|
177
|
+
|
|
178
|
+
# Delete
|
|
179
|
+
storage.delete_file("remote/old.csv")
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## DataFrame readers
|
|
185
|
+
|
|
186
|
+
### Reader backends
|
|
187
|
+
|
|
188
|
+
| Class | Engine | Formats | Filter behaviour |
|
|
189
|
+
|---|---|---|---|
|
|
190
|
+
| `OasisPandasReader` | pandas | CSV, Parquet | In-memory (post-load) |
|
|
191
|
+
| `OasisDaskReader` | Dask | CSV, Parquet | In-memory via dask-sql |
|
|
192
|
+
| `OasisPyarrowReader` | PyArrow | Parquet only | Predicate pushdown (pre-load) |
|
|
193
|
+
|
|
194
|
+
Format-specific subclasses (`OasisPandasReaderCSV`, `OasisDaskReaderParquet`, etc.) are available for pandas and Dask. PyArrow has no format-specific subclasses: only the base `OasisPyarrowReader` is provided, and it reads Parquet only.
|
|
195
|
+
|
|
196
|
+
### Fluent API
|
|
197
|
+
|
|
198
|
+
All readers share the same chainable interface. The actual file read is **lazy** — it happens on the first access to `.df` or when `.as_pandas()` is called.
|
|
199
|
+
|
|
200
|
+
```python
|
|
201
|
+
from oasis_data_manager.df_reader.backends.pandas import OasisPandasReader
|
|
202
|
+
from oasis_data_manager.df_reader.backends.dask import OasisDaskReader
|
|
203
|
+
from oasis_data_manager.filestore.backends.local import LocalStorage
|
|
204
|
+
|
|
205
|
+
storage = LocalStorage("/data")
|
|
206
|
+
|
|
207
|
+
# Pandas — CSV
|
|
208
|
+
df = OasisPandasReader("losses.csv", storage).as_pandas()
|
|
209
|
+
|
|
210
|
+
# Pandas — Parquet (detected automatically from extension)
|
|
211
|
+
df = OasisPandasReader("losses.parquet", storage).as_pandas()
|
|
212
|
+
|
|
213
|
+
# Dask
|
|
214
|
+
df = OasisDaskReader("losses.csv", storage).as_pandas()
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### Filtering
|
|
218
|
+
|
|
219
|
+
Pass a list of callables; each receives the DataFrame and must return a (filtered) DataFrame.
|
|
220
|
+
|
|
221
|
+
```python
|
|
222
|
+
df = (
|
|
223
|
+
OasisPandasReader("locations.csv", storage)
|
|
224
|
+
.filter([
|
|
225
|
+
lambda x: x[x["CountryCode"] == "US"],
|
|
226
|
+
lambda x: x[x["LocNumber"].notna()],
|
|
227
|
+
])
|
|
228
|
+
.as_pandas()
|
|
229
|
+
)
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
`OasisPandasReader` and `OasisDaskReader` apply filters **after** loading the full file into memory. `OasisPyarrowReader` accepts a `filters` kwarg (list of tuples or list of lists) that is pushed down into the Parquet engine before any data is read into memory — use this for large Parquet files where row-group skipping matters.
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
from oasis_data_manager.df_reader.backends.pyarrow import OasisPyarrowReader
|
|
236
|
+
|
|
237
|
+
# AND of conditions — list of tuples
|
|
238
|
+
df = (
|
|
239
|
+
OasisPyarrowReader("losses.parquet", storage)
|
|
240
|
+
.read(filters=[("CountryCode", "==", "US"), ("TIV", ">=", 1_000_000)])
|
|
241
|
+
.as_pandas()
|
|
242
|
+
)
|
|
243
|
+
|
|
244
|
+
# OR of AND-groups — list of lists
|
|
245
|
+
df = (
|
|
246
|
+
OasisPyarrowReader("losses.parquet", storage)
|
|
247
|
+
.read(filters=[[("CountryCode", "==", "US")], [("CountryCode", "==", "GB")]])
|
|
248
|
+
.as_pandas()
|
|
249
|
+
)
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
Supported operators: `==`, `!=`, `<`, `<=`, `>`, `>=`, `in`, `not in`.
|
|
253
|
+
|
|
254
|
+
### SQL (Dask only)
|
|
255
|
+
|
|
256
|
+
Requires `dask-sql`. The reserved table name is `table`.
|
|
257
|
+
|
|
258
|
+
```python
|
|
259
|
+
from oasis_data_manager.df_reader.backends.dask import OasisDaskReader
|
|
260
|
+
|
|
261
|
+
df = (
|
|
262
|
+
OasisDaskReader("locations.csv", storage)
|
|
263
|
+
.sql("SELECT LocNumber, Latitude, Longitude FROM table WHERE CountryCode = 'US'")
|
|
264
|
+
.as_pandas()
|
|
265
|
+
)
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
### Arbitrary queries
|
|
269
|
+
|
|
270
|
+
`.query(fn)` passes the raw DataFrame to any callable and returns the result directly (not a reader).
|
|
271
|
+
|
|
272
|
+
```python
|
|
273
|
+
count = OasisPandasReader("losses.csv", storage).query(lambda df: len(df))
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
### Configuration dict pattern
|
|
277
|
+
|
|
278
|
+
```python
|
|
279
|
+
from oasis_data_manager.df_reader.config import get_df_reader
|
|
280
|
+
|
|
281
|
+
config = {
|
|
282
|
+
"path": "accounts.csv",
|
|
283
|
+
"storage": storage,
|
|
284
|
+
"options": {"dtype": {"LocNumber": str}},
|
|
285
|
+
"engine": "OasisPandasReaderCSV",
|
|
286
|
+
}
|
|
287
|
+
reader = get_df_reader(config)
|
|
288
|
+
df = reader.as_pandas()
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
---
|
|
292
|
+
|
|
293
|
+
## Complex data pipelines
|
|
294
|
+
|
|
295
|
+
`ComplexData` composes storage fetch, SQL filtering, and post-read transformations into a single reusable class.
|
|
296
|
+
|
|
297
|
+
### `FileStoreComplexData` — files not handled by the df_reader (e.g. HDF5)
|
|
298
|
+
|
|
299
|
+
```python
|
|
300
|
+
from oasis_data_manager.complex import FileStoreComplexData, Adjustment
|
|
301
|
+
import h5py
|
|
302
|
+
import pandas as pd
|
|
303
|
+
|
|
304
|
+
class NormaliseAdjustment(Adjustment):
|
|
305
|
+
@classmethod
|
|
306
|
+
def apply(cls, df):
|
|
307
|
+
df["loss"] = df["loss"] / df["loss"].max()
|
|
308
|
+
return df
|
|
309
|
+
|
|
310
|
+
class EventLossData(FileStoreComplexData):
|
|
311
|
+
filename = "event_losses.hdf5"
|
|
312
|
+
sql = "SELECT * FROM table WHERE event_id > 1000"
|
|
313
|
+
adjustments = [NormaliseAdjustment]
|
|
314
|
+
|
|
315
|
+
def to_dataframe(self, result) -> pd.DataFrame:
|
|
316
|
+
f = h5py.File(result)
|
|
317
|
+
return pd.DataFrame({"event_id": list(f["event_id"]), "loss": list(f["loss"])})
|
|
318
|
+
|
|
319
|
+
# Run the pipeline
|
|
320
|
+
df = EventLossData(storage=storage).run().as_pandas()
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
### `RestComplexData` — HTTP endpoints
|
|
324
|
+
|
|
325
|
+
```python
|
|
326
|
+
from oasis_data_manager.complex import RestComplexData
|
|
327
|
+
|
|
328
|
+
class ExposureAPI(RestComplexData):
|
|
329
|
+
url = "https://api.example.com/exposures"
|
|
330
|
+
timeout = 30
|
|
331
|
+
|
|
332
|
+
def get_headers(self):
|
|
333
|
+
return {"Authorization": "Bearer my-token"}
|
|
334
|
+
|
|
335
|
+
def handle_response(self, response):
|
|
336
|
+
return response.json()["data"]
|
|
337
|
+
|
|
338
|
+
df = ExposureAPI().run().as_pandas()
|
|
339
|
+
```
|
|
340
|
+
|
|
341
|
+
---
|
|
342
|
+
|
|
343
|
+
## Exceptions
|
|
344
|
+
|
|
345
|
+
| Exception | Description |
|
|
346
|
+
|---|---|
|
|
347
|
+
| `OasisDataManagerException` | Base exception for this library |
|
|
348
|
+
| `OasisException` | Backward-compatible alias for the above |
|
|
349
|
+
| `MissingInputsException` | Raised when a required input file is not found |
|
|
350
|
+
|
|
351
|
+
```python
|
|
352
|
+
from oasis_data_manager.errors import OasisDataManagerException, MissingInputsException
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
---
|
|
356
|
+
|
|
357
|
+
## Import paths
|
|
358
|
+
|
|
359
|
+
```python
|
|
360
|
+
# Storage backends
|
|
361
|
+
from oasis_data_manager.filestore.backends.local import LocalStorage
|
|
362
|
+
from oasis_data_manager.filestore.backends.aws import AwsS3Storage
|
|
363
|
+
from oasis_data_manager.filestore.backends.azure import AzureABFSStorage
|
|
364
|
+
from oasis_data_manager.filestore.config import get_storage_from_config
|
|
365
|
+
|
|
366
|
+
# DataFrame readers
|
|
367
|
+
from oasis_data_manager.df_reader.backends.pandas import OasisPandasReader, OasisPandasReaderCSV, OasisPandasReaderParquet
|
|
368
|
+
from oasis_data_manager.df_reader.backends.dask import OasisDaskReader, OasisDaskReaderCSV, OasisDaskReaderParquet
|
|
369
|
+
from oasis_data_manager.df_reader.backends.pyarrow import OasisPyarrowReader
|
|
370
|
+
from oasis_data_manager.df_reader.config import get_df_reader
|
|
371
|
+
|
|
372
|
+
# Exceptions
|
|
373
|
+
from oasis_data_manager.errors import OasisDataManagerException, OasisException
|
|
374
|
+
```
|
|
375
|
+
|
|
376
|
+
Deprecated module paths (`filestore/backends/aws_s3.py`, `filestore/backends/azure_abfs.py`) still work but emit a `DeprecationWarning`.
|
|
377
|
+
|
|
378
|
+
---
|
|
379
|
+
|
|
380
|
+
## Development
|
|
381
|
+
|
|
382
|
+
```bash
|
|
383
|
+
# Install dev dependencies
|
|
384
|
+
pip install -e .
|
|
385
|
+
pip install -r requirements.txt
|
|
386
|
+
|
|
387
|
+
# Run tests
|
|
388
|
+
pytest
|
|
389
|
+
|
|
390
|
+
# Skip type checking and import sorting for faster iteration
|
|
391
|
+
pytest --no-header -p no:mypy -p no:isort tests/df_reader/
|
|
392
|
+
|
|
393
|
+
# Cloud integration tests (requires Docker)
|
|
394
|
+
docker compose up -d
|
|
395
|
+
pytest tests/filestorage/test_aws.py tests/filestorage/test_azure.py
|
|
396
|
+
docker compose down
|
|
397
|
+
|
|
398
|
+
# Linting
|
|
399
|
+
flake8 --select F401,F522,F524,F541 --show-source ./
|
|
400
|
+
autopep8 --diff --exit-code --recursive --max-line-length 150 --ignore E402 .
|
|
401
|
+
|
|
402
|
+
# Build
|
|
403
|
+
python -m build  # builds both the sdist and the wheel from pyproject.toml (requires `pip install build`)
|
|
404
|
+
```
|
|
405
|
+
|
|
406
|
+
---
|
|
407
|
+
|
|
408
|
+
## License
|
|
409
|
+
|
|
410
|
+
Part of the [OasisLMF](https://github.com/OasisLMF) platform. See repository for licence details.
|
|
@@ -23,6 +23,34 @@ pip install -r requirements.txt
|
|
|
23
23
|
pip install -e ".[extra]"
|
|
24
24
|
```
|
|
25
25
|
|
|
26
|
+
### Optional dependencies
|
|
27
|
+
|
|
28
|
+
The package ships several optional extras. Install any combination with `pip install "oasis-data-manager[<extra>,...]"` (the quotes stop shells such as zsh from treating the brackets as a glob pattern).
|
|
29
|
+
|
|
30
|
+
| Extra | What it adds | Key packages |
|
|
31
|
+
|---|---|---|
|
|
32
|
+
| `s3` | AWS S3 storage backend (`AwsS3Storage`) | `s3fs` |
|
|
33
|
+
| `azure` | Azure Blob Storage backend (`AzureABFSStorage`) | `adlfs` |
|
|
34
|
+
| `geo` | Geospatial DataFrame support (GeoDataFrame read/write) | `geopandas`, `pyogrio` |
|
|
35
|
+
| `dask` | Dask reader, distributed execution, and geospatial Dask support | `dask`, `dask-sql`, `distributed`, `dask-geopandas` (includes `geo`) |
|
|
36
|
+
| `extra` | Everything above bundled together | all of the above |
|
|
37
|
+
|
|
38
|
+
**Examples**
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# S3 support only
|
|
42
|
+
pip install "oasis-data-manager[s3]"
|
|
43
|
+
|
|
44
|
+
# Both cloud backends
|
|
45
|
+
pip install "oasis-data-manager[s3,azure]"
|
|
46
|
+
|
|
47
|
+
# Dask reader (also installs geo)
|
|
48
|
+
pip install "oasis-data-manager[dask]"
|
|
49
|
+
|
|
50
|
+
# Everything
|
|
51
|
+
pip install "oasis-data-manager[extra]"
|
|
52
|
+
```
|
|
53
|
+
|
|
26
54
|
---
|
|
27
55
|
|
|
28
56
|
## Quick start
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '0.2.3'
|
{oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/filestore/backends/aws.py
RENAMED
|
@@ -19,6 +19,7 @@ class AwsS3Storage(BaseStorage):
|
|
|
19
19
|
access_key: Optional[str] = None,
|
|
20
20
|
secret_key: Optional[str] = None,
|
|
21
21
|
endpoint_url: Optional[str] = None,
|
|
22
|
+
public_bucket: bool = False,
|
|
22
23
|
file_overwrite=True,
|
|
23
24
|
object_parameters: Optional[dict] = None,
|
|
24
25
|
auto_create_bucket=False,
|
|
@@ -92,6 +93,7 @@ class AwsS3Storage(BaseStorage):
|
|
|
92
93
|
self.access_key = access_key
|
|
93
94
|
self.secret_key = secret_key
|
|
94
95
|
self.endpoint_url = endpoint_url
|
|
96
|
+
self.public_bucket = public_bucket
|
|
95
97
|
self.file_overwrite = file_overwrite
|
|
96
98
|
self.object_parameters = object_parameters
|
|
97
99
|
self.auto_create_bucket = auto_create_bucket
|
|
@@ -128,6 +130,7 @@ class AwsS3Storage(BaseStorage):
|
|
|
128
130
|
"access_key": self.access_key,
|
|
129
131
|
"secret_key": self.secret_key,
|
|
130
132
|
"endpoint_url": self.endpoint_url,
|
|
133
|
+
"public_bucket": self.public_bucket,
|
|
131
134
|
"file_overwrite": self.file_overwrite,
|
|
132
135
|
"object_parameters": self.object_parameters,
|
|
133
136
|
"auto_create_bucket": self.auto_create_bucket,
|
|
@@ -163,7 +166,7 @@ class AwsS3Storage(BaseStorage):
|
|
|
163
166
|
if self.reduced_redundancy:
|
|
164
167
|
s3_additional_kwargs["StorageClass"] = "REDUCED_REDUNDANCY"
|
|
165
168
|
|
|
166
|
-
|
|
169
|
+
options = {
|
|
167
170
|
"key": self.access_key,
|
|
168
171
|
"secret": self.secret_key,
|
|
169
172
|
"token": self.security_token,
|
|
@@ -174,6 +177,9 @@ class AwsS3Storage(BaseStorage):
|
|
|
174
177
|
"region_name": self.region_name,
|
|
175
178
|
},
|
|
176
179
|
}
|
|
180
|
+
if self.public_bucket:
|
|
181
|
+
options["anon"] = True
|
|
182
|
+
return options
|
|
177
183
|
|
|
178
184
|
def _strip_signing_parameters(self, url):
|
|
179
185
|
"""Duplicated Unsigned URLs from Django-Storages
|
{oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/filestore/backends/base.py
RENAMED
|
@@ -69,7 +69,7 @@ class BaseStorage(object):
|
|
|
69
69
|
fsspec_filesystem_class: Optional[Type[fsspec.AbstractFileSystem]]
|
|
70
70
|
|
|
71
71
|
def __init__(
|
|
72
|
-
self, root_dir="", cache_dir: Union[str, None] = "/tmp/data-cache", logger=None
|
|
72
|
+
self, root_dir="", cache_dir: Union[str, None] = "/tmp/data-cache", logger=None, **kwargs
|
|
73
73
|
):
|
|
74
74
|
# Use for caching files across multiple runs, set value 'None' or 'False' to disable
|
|
75
75
|
self.cache_root = cache_dir
|
|
@@ -458,9 +458,10 @@ class BaseStorage(object):
|
|
|
458
458
|
def open(self, path, *args, **kwargs):
|
|
459
459
|
if self._is_valid_url(path):
|
|
460
460
|
with tempfile.TemporaryDirectory() as d:
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
461
|
+
local_path = self.get_from_cache(path, no_cache_target=os.path.join(d, "f"))
|
|
462
|
+
if local_path is None:
|
|
463
|
+
raise FileNotFoundError(f"No such file or directory: '{path}'")
|
|
464
|
+
with open(local_path) as f:
|
|
464
465
|
yield f
|
|
465
466
|
else:
|
|
466
467
|
with self.fs.open(path, *args, **kwargs) as f:
|
|
@@ -470,7 +471,8 @@ class BaseStorage(object):
|
|
|
470
471
|
def with_fileno(self, path, mode="rb"):
|
|
471
472
|
with tempfile.TemporaryDirectory() as d:
|
|
472
473
|
target = os.path.join(d, "fileno")
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
474
|
+
local_path = self.get_from_cache(path, no_cache_target=target)
|
|
475
|
+
if local_path is None:
|
|
476
|
+
raise FileNotFoundError(f"No such file or directory: '{path}'")
|
|
477
|
+
with open(local_path, mode) as f:
|
|
476
478
|
yield f
|
{oasis_data_manager-0.2.2 → oasis_data_manager-0.2.3}/oasis_data_manager/filestore/config.py
RENAMED
|
@@ -19,6 +19,7 @@ class S3StorageConfig(BaseStorageConfig):
|
|
|
19
19
|
access_key: NotRequired[str]
|
|
20
20
|
secret_key: NotRequired[str]
|
|
21
21
|
endpoint_url: NotRequired[str]
|
|
22
|
+
public_bucket: NotRequired[bool]
|
|
22
23
|
file_overwrite: NotRequired[bool]
|
|
23
24
|
object_parameters: NotRequired[dict]
|
|
24
25
|
auto_create_bucket: NotRequired[bool]
|