oasis-data-manager 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oasis_data_manager/__init__.py +1 -0
- oasis_data_manager/complex/__init__.py +0 -0
- oasis_data_manager/complex/complex.py +173 -0
- oasis_data_manager/complex/examples.py +40 -0
- oasis_data_manager/config.py +36 -0
- oasis_data_manager/df_reader/__init__.py +0 -0
- oasis_data_manager/df_reader/backends/__init__.py +0 -0
- oasis_data_manager/df_reader/backends/base.py +119 -0
- oasis_data_manager/df_reader/backends/dask.py +191 -0
- oasis_data_manager/df_reader/backends/pandas.py +85 -0
- oasis_data_manager/df_reader/backends/pyarrow.py +65 -0
- oasis_data_manager/df_reader/config.py +60 -0
- oasis_data_manager/df_reader/exceptions.py +6 -0
- oasis_data_manager/df_reader/reader.py +26 -0
- oasis_data_manager/errors/__init__.py +24 -0
- oasis_data_manager/filestore/__init__.py +0 -0
- oasis_data_manager/filestore/backends/__init__.py +0 -0
- oasis_data_manager/filestore/backends/aws.py +305 -0
- oasis_data_manager/filestore/backends/aws_s3.py +15 -0
- oasis_data_manager/filestore/backends/azure.py +139 -0
- oasis_data_manager/filestore/backends/azure_abfs.py +15 -0
- oasis_data_manager/filestore/backends/base.py +478 -0
- oasis_data_manager/filestore/backends/local.py +44 -0
- oasis_data_manager/filestore/config.py +106 -0
- oasis_data_manager/filestore/filestore.py +94 -0
- oasis_data_manager/filestore/log.py +22 -0
- oasis_data_manager-0.2.3.dist-info/METADATA +410 -0
- oasis_data_manager-0.2.3.dist-info/RECORD +30 -0
- oasis_data_manager-0.2.3.dist-info/WHEEL +5 -0
- oasis_data_manager-0.2.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = '0.2.3'
|
|
File without changes
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import pathlib
|
|
3
|
+
from io import BytesIO
|
|
4
|
+
from typing import List, Type
|
|
5
|
+
|
|
6
|
+
import httpcore
|
|
7
|
+
import httpx
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from oasis_data_manager.df_reader.config import clean_config, get_df_reader
|
|
11
|
+
from oasis_data_manager.df_reader.reader import OasisReader
|
|
12
|
+
from oasis_data_manager.filestore.backends.local import LocalStorage
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Adjustment:
    """
    Base class for dataframe adjustments.

    An adjustment is a Pandas-level change made after the data has been
    fetched and, if applicable, filtered by SQL. Subclasses override
    ``apply``; this base implementation is the identity.
    """

    @classmethod
    def apply(cls, df):
        """Return ``df`` exactly as it was passed in."""
        unchanged = df
        return unchanged
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ComplexData:
    """
    Base pipeline for a complex data source.

    ``run`` drives the steps in order: optionally ``fetch`` the raw data,
    wrap it in a df_reader (``to_reader``), apply the SQL returned by
    ``get_sql`` and finally apply ``adjustments`` (``adjust``). Subclasses
    configure the class attributes below and implement ``fetch``.
    """

    # Adjustment classes whose ``apply`` is handed to ``reader.filter`` by ``adjust``.
    adjustments: List[Type[Adjustment]] = []
    # Source file name; preferred over ``url`` when a path is given to the reader.
    filename: str = ""
    # Fallback location given to the reader when ``filename`` is empty.
    url: str = ""
    # Whether ``run`` calls ``fetch``; ``run`` turns it off for csv/parquet filenames.
    fetch_required: bool = True
    # SQL returned by ``get_sql``; applied by ``run`` only when non-empty.
    sql: str = ""

    def __init__(self, storage=None):
        """
        :param storage: storage backend used to open files; a ``LocalStorage()``
            is created when none (or a falsy value) is supplied.
        """
        if not storage:
            storage = LocalStorage()
        self.storage = storage

    def fetch(self):
        """Fetch the raw data. Must be implemented by subclasses."""
        raise NotImplementedError

    def get_sql(self):
        """Return the SQL statement to apply to the reader (``self.sql``)."""
        return self.sql

    def adjust(self, reader) -> OasisReader:
        """
        Hook to apply any adjustments.

        When ``adjustments`` is non-empty, each adjustment's ``apply`` is passed
        to ``reader.filter`` and the resulting reader is returned; otherwise the
        reader is returned unchanged.

        TODO adjustments are filters? Functions run on the readers df i.e
        apply, should be change filter to apply then or is that confusing with pandas?
        """
        if self.adjustments:
            return reader.filter([a.apply for a in self.adjustments])
        return reader

    def to_dataframe(self, result) -> pd.DataFrame:
        """
        Hook to allow conversion of the fetch() to a dataframe, in the case of the
        RestComplexData class for example, the result is json, so can be fed
        directly to a dataframe and wrapped into our df_reader.
        """
        return pd.DataFrame(result)

    def get_df_reader(self, filepath, **kwargs) -> OasisReader:
        """
        Build an ``OasisDaskReader`` for ``filepath`` that uses ``self.storage``.

        :param filepath: path handed to the reader configuration (may be ``None``
            when a dataframe is supplied through ``kwargs``).
        :param kwargs: forwarded to the module-level ``get_df_reader``.
        """
        df_reader_config = clean_config(
            {"filepath": filepath, "engine": "OasisDaskReader"}
        )
        df_reader_config["engine"]["options"]["storage"] = self.storage

        return get_df_reader(df_reader_config, **kwargs)

    def to_reader(self, fetch_result) -> OasisReader:
        """
        Wrap the outcome of ``fetch`` in a df_reader.

        :param fetch_result: the value returned by ``fetch``, or ``None`` when
            nothing was fetched.
        """
        # Compare against None rather than relying on truthiness: an empty but
        # valid result ([] / {}) must still be converted instead of silently
        # falling back to reading the file/url, and a DataFrame result would
        # raise on a bare truth test.
        if fetch_result is not None:
            # When the result has been fetched, apply to_dataframe and pass directly into the df_reader
            df = self.to_dataframe(fetch_result)
            return self.get_df_reader(
                None,  # TODO - None value instead of filename_or_buffer, improve this.
                dataframe=df,
                has_read=True,
            )
        else:
            # Not fetched, let df_reader fetch as normal.
            return self.get_df_reader(self.filename if self.filename else self.url)

    def run(self):
        """
        Execute the pipeline and return the resulting reader.

        Note that this updates ``self.fetch_required`` on the instance when a
        filename is set.
        """
        # CSV and Parquet files are read directly by the df_reader, so fetch() is not needed.
        # Only formats the df_reader can't handle (e.g. custom binary formats) require fetch().
        if self.fetch_required and self.filename:
            extension = pathlib.Path(self.filename).suffix
            self.fetch_required = extension not in [".parquet", ".pq", ".csv"]

        fetch_result = None
        if self.fetch_required:
            fetch_result = self.fetch()

        reader = self.to_reader(fetch_result)

        sql = self.get_sql()
        if hasattr(reader, "sql") and sql:
            reader = reader.sql(sql)

        reader = self.adjust(reader)

        # TODO store file? return for now
        return reader
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class FileStoreComplexData(ComplexData):
    """Complex data whose raw content is a file read through ``self.storage``."""

    def to_dataframe(self, result) -> pd.DataFrame:
        """
        Convert the fetched file content into a dataframe.

        This is only called for file types the df_reader does not handle, so
        the conversion is always custom and must be provided by a subclass.
        """
        raise NotImplementedError

    def fetch(self):
        """Read ``self.filename`` in binary mode and return its bytes as a ``BytesIO``."""
        with self.storage.open(self.filename, "rb") as handle:
            payload = handle.read()
        return BytesIO(payload)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class RestComplexData(ComplexData):
    """
    Complex data fetched over HTTP.

    A POST is issued when ``get_post_json`` returns a non-empty payload,
    otherwise a GET. Only a 200 response is passed to ``handle_response``;
    anything else, or one of ``exceptions``, goes to ``handle_error``.
    """

    # Exception types that ``fetch`` catches and routes to ``handle_error``.
    exceptions = (
        httpx.RequestError,
        httpcore.ReadTimeout,
        httpcore.ConnectTimeout,
        httpcore.ConnectError,
    )

    url: str
    # Timeout value passed to the httpx call.
    timeout: int = 10

    def handle_error(self, exception, **kwargs):
        """Log a warning carrying the exception and uri, then return ``None``."""
        log_extra = {"exception": exception, "uri": self.url}
        logger.warning("Exception in complex data call", extra=log_extra)
        return None

    def handle_response(self, response) -> dict:
        """Return the decoded JSON body of ``response``."""
        return response.json()

    def get_uri(self) -> str:
        """Return the address to request (``self.url``)."""
        return self.url

    def get_headers(self) -> dict:
        """Return the request headers; empty by default."""
        return {}

    def get_post_json(self) -> dict:
        """Return the JSON payload; an empty dict makes ``fetch`` issue a GET."""
        return {}

    def fetch(self):
        """
        Perform the HTTP request.

        Returns whatever ``handle_response`` returns for a 200 response, and
        whatever ``handle_error`` returns otherwise.
        """
        uri = self.get_uri()
        headers = self.get_headers()
        post_json = self.get_post_json()
        timeout = self.timeout

        try:
            common = {"headers": headers, "timeout": timeout}
            if post_json:
                response = httpx.post(uri, json=post_json, **common)
            else:
                response = httpx.get(uri, **common)

            # Guard clause: any status other than 200 is reported as an error.
            if response.status_code != 200:
                return self.handle_error(
                    exception="Unexpected status in complex data call",
                    post_json=post_json,
                )
            return self.handle_response(response)
        except self.exceptions as err:
            return self.handle_error(exception=str(err), post_json=post_json)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
import h5py
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from oasis_data_manager.complex.complex import Adjustment, FileStoreComplexData
|
|
7
|
+
from oasis_data_manager.df_reader.reader import OasisDaskReader
|
|
8
|
+
from oasis_data_manager.filestore.backends.local import LocalStorage
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AddColAdjustment(Adjustment):
    """Example adjustment that writes the constant ``"test"`` into column ``"else"``."""

    @classmethod
    def apply(cls, df):
        """Set ``df["else"]`` to ``"test"`` in place and return the same ``df``."""
        column, value = "else", "test"
        df[column] = value
        return df
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FloodDataExample(FileStoreComplexData):
    """Example complex data definition that reads event data from an HDF5 file."""

    # Note this file needs to be sourced. TODO - is this public? If not is there a public/file we can include?
    filename = "tropical_cyclone_10synth_tracks_150arcsec_rcp26_KNA_2080.hdf5"
    adjustments = [AddColAdjustment]
    sql = "SELECT * FROM table WHERE event_id > 3000"

    def to_dataframe(self, result) -> pd.DataFrame:
        """
        Build a dataframe from the fetched HDF5 content.

        :param result: file name or file-like object accepted by ``h5py.File``.
        :return: dataframe with ``event_id``, ``event_name``, ``date``,
            ``frequency`` and ``orig`` columns, plus the column added by
            ``reset_index``.
        """
        # Open read-only inside a context manager so the HDF5 handle is always
        # closed; every dataset is materialised into a list before it closes.
        with h5py.File(result, "r") as hdf:
            df = pd.DataFrame(list(hdf["event_id"]), columns=["event_id"])
            for m in ["event_name", "date", "frequency", "orig"]:
                df[m] = list(hdf[m])

        df = df.reset_index()

        return df
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
if __name__ == "__main__":
    # Script entry point: run the example definition and print its dataframe.
    reader = OasisDaskReader  # NOTE(review): bound here but not used below
    test_storage = LocalStorage("/tmp")

    # run() returns a reader; as_pandas() is called on it for printing.
    result = FloodDataExample(storage=test_storage).run()
    print(result.as_pandas())
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
|
|
3
|
+
# Short names accepted by load_class, mapped to the dotted path they stand for.
_ALIASES = {
    # Storage backends
    "LocalStorage": "oasis_data_manager.filestore.backends.local.LocalStorage",
    "AwsS3Storage": "oasis_data_manager.filestore.backends.aws.AwsS3Storage",
    "AzureABFSStorage": "oasis_data_manager.filestore.backends.azure.AzureABFSStorage",
    # Reader backends
    "OasisPandasReader": "oasis_data_manager.df_reader.backends.pandas.OasisPandasReader",
    "OasisPandasReaderCSV": "oasis_data_manager.df_reader.backends.pandas.OasisPandasReaderCSV",
    "OasisPandasReaderParquet": "oasis_data_manager.df_reader.backends.pandas.OasisPandasReaderParquet",
    "OasisDaskReader": "oasis_data_manager.df_reader.backends.dask.OasisDaskReader",
    "OasisDaskReaderCSV": "oasis_data_manager.df_reader.backends.dask.OasisDaskReaderCSV",
    "OasisDaskReaderParquet": "oasis_data_manager.df_reader.backends.dask.OasisDaskReaderParquet",
    "OasisPyarrowReader": "oasis_data_manager.df_reader.backends.pyarrow.OasisPyarrowReader",
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ConfigError(Exception):
    """Raised when a configured class path is malformed or names an unsuitable class."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def load_class(path, base=None):
    """
    Import and return the class named by ``path``.

    :param path: a key of ``_ALIASES`` or a dotted ``module.ClassName`` path.
    :param base: optional class the loaded class must be a subclass of.
    :raises ConfigError: when the resolved path contains no ``.`` separator,
        or when ``base`` is given and the loaded class does not extend it.
    """
    # Aliases are expanded first; unknown names are used as given.
    path = _ALIASES.get(path, path)
    module_path, separator, cls_name = path.rpartition(".")
    if not separator:
        raise ConfigError(f"'{path}' is not a valid class path (expected 'module.ClassName' or a known alias)")

    cls = getattr(importlib.import_module(module_path), cls_name)

    # No base requested, or the requirement is met: hand the class back.
    if not base or issubclass(cls, base):
        return cls

    raise ConfigError(f"'{cls.__name__}' does not extend '{base.__name__}'")
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import pathlib
|
|
2
|
+
from typing import Iterable
|
|
3
|
+
|
|
4
|
+
from ...filestore.backends.base import BaseStorage
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class OasisReader:
    """
    Base reader.

    as_pandas(), sql() & filter() can all be chained with self.has_read controlling whether the base
    read (read_csv/read_parquet) needs to be triggered. This is because in the case of spark
    we need to read differently depending on if the intention is to do sql or filter.
    """

    def __init__(
        self,
        filename_or_buffer,
        storage: "BaseStorage",
        *args,
        dataframe=None,
        has_read=False,
        **kwargs,
    ):
        """
        :param filename_or_buffer: path, file-like object, or a falsy value when
            the reader is built from an existing dataframe.
        :param storage: storage backend kept on ``self.storage`` for subclasses.
        :param args: positional arguments forwarded to read_csv/read_parquet.
        :param dataframe: already loaded dataframe, if any.
        :param has_read: whether the underlying read has already happened.
        :param kwargs: keyword arguments forwarded to read_csv/read_parquet.
        :raises RuntimeError: when there is neither a source nor a dataframe
            (and ``has_read`` is false).
        """
        self.filename_or_buffer = filename_or_buffer
        self.storage = storage
        self._df = dataframe
        self.has_read = has_read
        self.reader_args = args
        self.reader_kwargs = kwargs

        if not filename_or_buffer:
            if dataframe is None and not has_read:
                raise RuntimeError(
                    "Reader must be initialised with either a "
                    "filename_or_buffer or by passing a dataframe "
                    "and has_read=True"
                )
            if dataframe is not None:
                # There is no source to read from, so the supplied dataframe is
                # the data. Marking it as read stops _read() from later trying
                # to build a path out of the missing filename.
                self.has_read = True
            self.read_from_dataframe()

        if (
            filename_or_buffer
            and isinstance(self.filename_or_buffer, str)
            and self.filename_or_buffer.lower().endswith(".zip")
        ):
            self.reader_kwargs["compression"] = "zip"

    @property
    def df(self):
        """The dataframe, triggering the underlying read first if needed."""
        self._read()
        return self._df

    @df.setter
    def df(self, other):
        self._df = other

    def read_csv(self, *args, **kwargs):
        """Load csv data into ``self.df``. Must be implemented by subclasses."""
        raise NotImplementedError()

    def read_parquet(self, *args, **kwargs):
        """Load parquet data into ``self.df``. Must be implemented by subclasses."""
        raise NotImplementedError()

    def _read(self):
        """
        Perform the underlying read once, choosing parquet or csv from the path.

        :return: ``self``
        """
        if not self.has_read:
            # File-like objects (and pathlib paths) expose ``name``; plain
            # strings are used as they are.
            source = getattr(self.filename_or_buffer, "name", self.filename_or_buffer)
            try:
                parts = pathlib.Path(source).parts
            except TypeError:
                # Nameless buffers (e.g. BytesIO) carry no path to inspect, so
                # fall through to the csv default below.
                parts = ()

            is_parquet = any(
                part.endswith((".parquet", ".pq")) for part in parts
            )

            # Set has_read before calling read to prevent re-entrant calls (e.g. Dask
            # readers access self.df internally during read). Reset on failure so the
            # read can be retried.
            self.has_read = True
            try:
                if is_parquet:
                    self.read_parquet(*self.reader_args, **self.reader_kwargs)
                else:
                    # assume the file is csv if not parquet
                    self.read_csv(*self.reader_args, **self.reader_kwargs)
            except Exception:
                self.has_read = False
                raise

        return self

    def copy_with_df(self, df):
        """Return a new reader of the same type and source wrapping ``df``."""
        return type(self)(
            self.filename_or_buffer, self.storage, dataframe=df, has_read=self.has_read
        )

    def filter(self, filters):
        """
        Apply one callable, or an iterable of callables, to the dataframe.

        Each callable receives the dataframe produced by the previous one.

        :return: a new reader wrapping the final dataframe.
        """
        self._read()

        df = self._df
        for df_filter in filters if isinstance(filters, Iterable) else [filters]:
            df = df_filter(df)

        return self.copy_with_df(df)

    def sql(self, sql):
        """
        Apply ``sql`` through ``apply_sql`` (which is not defined on this base
        class); a falsy ``sql`` returns ``self`` unchanged.
        """
        if sql:
            self._read()
            return self.apply_sql(sql)
        return self

    def query(self, fn):
        """Return ``fn(self.df)``."""
        return fn(self.df)

    def as_pandas(self):
        """Trigger the read if needed and return the stored dataframe."""
        self._read()
        return self._df

    def read_from_dataframe(self):
        """Hook called by ``__init__`` when no source is given; no-op here."""
        pass
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import pathlib
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
import dask
|
|
6
|
+
import dask_geopandas as dgpd
|
|
7
|
+
from dask import dataframe as dd
|
|
8
|
+
from dask_sql import Context
|
|
9
|
+
from dask_sql.utils import ParsingException
|
|
10
|
+
from distributed import Client
|
|
11
|
+
|
|
12
|
+
from ..exceptions import InvalidSQLException
|
|
13
|
+
from .base import OasisReader
|
|
14
|
+
|
|
15
|
+
# Module-level dask configuration, applied as an import side effect.
# NOTE(review): presumably turns off dask's automatic string-dtype conversion — confirm against the dask config docs.
dask.config.set(
    {"dataframe.convert-string": False}
)  # allows dask sql to support pyarrow
# Logger registered under the df_reader.reader name rather than this module's own name.
logger = logging.getLogger("oasis_data_manager.df_reader.reader")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class OasisDaskReader(OasisReader):
    """
    Reader backed by dask dataframes, with SQL support through a dask_sql
    ``Context`` and an optional distributed ``Client`` for computation.
    """

    # Table name callers use in their SQL to refer to this reader's data.
    sql_table_name = "table"

    def __init__(self, *args, client_address=None, **kwargs):
        """
        :param client_address: address handed to ``distributed.Client``; when
            falsy no client is created.
        :param args: forwarded to ``OasisReader.__init__``.
        :param kwargs: forwarded to ``OasisReader.__init__``.
        """
        if client_address:
            self.client = Client(client_address, set_as_default=False)
        else:
            self.client = None

        self.sql_context = Context()
        self.table_names = [self.sql_table_name]
        self.pre_sql_columns = []

        super().__init__(*args, **kwargs)

    def copy_with_df(self, df):
        """Copy the reader around ``df`` (converted to dask if needed), keeping the client."""
        if not isinstance(df, dd.DataFrame):
            df = dd.from_pandas(df, npartitions=1)
        res = super().copy_with_df(df)
        res.client = self.client
        return res

    def apply_geo(self, shape_filename_path, *args, drop_geo=True, **kwargs):
        """
        Read in a shape file and return the _read file with geo data joined.

        :param shape_filename_path: path of the shape file to read.
        :param drop_geo: when true, drop the shape file's columns and
            ``index_right`` from the joined result.
        :param kwargs: ``geo_lon_col`` / ``geo_lat_col`` override the default
            ``longitude`` / ``latitude`` column names.
        :return: a new reader; it wraps an empty dataframe when the longitude
            or latitude column is missing from the data.
        """
        # TODO: fix this so that it can work with non local files
        # with self.storage.open(self.shape_filename_path) as f:
        #     shape_df = dgpd.read_file(f, npartitions=1)

        shape_df = dgpd.read_file(shape_filename_path, npartitions=1)

        # for situations where the columns in the source data are different.
        lon_col = kwargs.get("geo_lon_col", "longitude")
        lat_col = kwargs.get("geo_lat_col", "latitude")

        df_columns = self.df.columns.tolist()
        if lat_col not in df_columns or lon_col not in df_columns:
            logger.warning("Invalid shape file provided")
            # temp until we decide on handling, i.e don't return full data if it fails.
            return self.copy_with_df(dd.DataFrame.from_dict({}, npartitions=1))

        df = self.df.copy()

        # convert read df to geo
        df["geometry"] = dgpd.points_from_xy(df, lon_col, lat_col)
        df = dgpd.from_dask_dataframe(df)

        # Make sure they're using the same projection reference
        df.crs = shape_df.crs

        # join the datasets, matching `geometry` to points within the shape df
        df = df.sjoin(shape_df, how="inner")

        if drop_geo:
            df = df.drop(shape_df.columns.tolist() + ["index_right"], axis=1)

        return self.copy_with_df(df)

    def apply_sql(self, sql):
        """
        Run ``sql`` against this reader's dataframe.

        :param sql: statement referring to the data as ``sql_table_name``.
        :return: a new reader wrapping the query result, with column names
            mapped back to their original casing where known.
        :raises InvalidSQLException: when dask_sql raises a ParsingException.
        """
        # Function-scope import keeps this block self-contained.
        import re

        try:
            # Initially this was the filename, but some filenames are invalid for the table,
            # is it ok to call it the same name all the time? Mapped to DaskDataTable in case
            # we need to change this.
            self.sql_context.create_table("DaskDataTable", self.df)
            # Replace only whole-word occurrences of the table name. A plain
            # str.replace would also rewrite identifiers that merely contain
            # it (e.g. "loc_table" or "table_id") and corrupt the query.
            formatted_sql = re.sub(
                rf"\b{re.escape(self.sql_table_name)}\b", "DaskDataTable", sql
            )

            # Combine columns from join() tables with current df columns for case restoration
            col_map = {}
            for col in list(self.pre_sql_columns) + list(self.df.columns):
                col_map.setdefault(col.lower(), col)

            # dask expects the columns to be lower case, which won't match some data
            df = self.sql_context.sql(
                formatted_sql,
                config_options={"sql.identifier.case_sensitive": False},
            )
            # which means we then need to map the columns back to the original
            # and allow for any aggregations to be retained
            df.columns = [col_map.get(v.lower(), v) for v in df.columns]

            return self.copy_with_df(df)
        except ParsingException as err:
            # Chain explicitly so the parser's message stays attached.
            raise InvalidSQLException from err

    def join(self, df, table_name):
        """
        Creates a secondary table as a sql table in order to allow joins when apply_sql is called.

        :raises RuntimeError: when ``table_name`` is already registered.
        :return: ``self``
        """
        if table_name in self.table_names:
            raise RuntimeError(
                f"Table name already in use: [{','.join(self.table_names)}]"
            )
        self.pre_sql_columns.extend(df.columns)
        self.sql_context.create_table(table_name, df)
        self.table_names.append(table_name)
        return self

    def read_from_dataframe(self):
        """Convert a supplied non-dask dataframe into a single-partition dask dataframe."""
        if not isinstance(self.df, dd.DataFrame):
            self.df = dd.from_pandas(self.df, npartitions=1)

    def as_pandas(self):
        """Compute the dask dataframe, through the client when one is configured."""
        super().as_pandas()
        if self.client:
            return self.client.compute(self.df).result()
        else:
            return self.df.compute()

    def read_dict(self, data):
        """Replace the dataframe with one built from ``data`` via ``dd.DataFrame.from_dict``."""
        self.df = dd.DataFrame.from_dict(data)

    def read_csv(self, *args, **kwargs):
        """
        Read csv data into ``self.df`` with ``dd.read_csv``.

        The source is reduced to a path/name, resolved to a storage uri and
        read with the storage's fsspec options.
        """
        # remove standard pandas kwargs which will case an issue in dask.
        dask_safe_kwargs = kwargs.copy()
        dask_safe_kwargs.pop("memory_map", None)
        dask_safe_kwargs.pop("low_memory", None)

        filename_or_buffer = self.filename_or_buffer
        # PurePath covers every pathlib flavour (PosixPath and WindowsPath
        # alike), not just PosixPath.
        if isinstance(filename_or_buffer, pathlib.PurePath):
            filename_or_buffer = str(self.filename_or_buffer)

        if isinstance(filename_or_buffer, (io.TextIOWrapper, io.BufferedReader)):
            filename_or_buffer = filename_or_buffer.name

        # django files
        if hasattr(filename_or_buffer, "path"):
            filename_or_buffer = filename_or_buffer.path

        _, uri = self.storage.get_storage_url(filename_or_buffer, encode_params=False)
        self.df = dd.read_csv(
            uri,
            *args,
            **dask_safe_kwargs,
            storage_options=self.storage.get_fsspec_storage_options(),
        )

    def read_parquet(self, *args, **kwargs):
        """
        Read parquet data into ``self.df`` with ``dd.read_parquet``.

        String sources are resolved to a storage uri and read with the
        storage's fsspec options; anything else is passed through as given.
        """
        if isinstance(self.filename_or_buffer, str):
            _, uri = self.storage.get_storage_url(
                self.filename_or_buffer, encode_params=False
            )
            filename = uri
            kwargs["storage_options"] = self.storage.get_fsspec_storage_options()
        else:
            filename = self.filename_or_buffer

        self.df = dd.read_parquet(
            filename,
            *args,
            **kwargs,
        )

        # dask-sql doesn't handle categorical columns, but we need to be careful
        # how we convert them, if an assign is used we will end up stopping
        # the `Predicate pushdown optimization` within dask-sql from applying the
        # sql to the read_parquet filters.
        categories_to_convert = {}
        for col in self.df.select_dtypes(include="category").columns:
            categories_to_convert[col] = self.df[col].dtype.categories.dtype
        self.df = self.df.astype(categories_to_convert)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class OasisDaskReaderCSV(OasisDaskReader):
    """Named variant of ``OasisDaskReader``; adds no behaviour of its own."""
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class OasisDaskReaderParquet(OasisDaskReader):
    """Named variant of ``OasisDaskReader``; adds no behaviour of its own."""
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
import geopandas as gpd
|
|
8
|
+
except ModuleNotFoundError:
|
|
9
|
+
gpd = None
|
|
10
|
+
|
|
11
|
+
from .base import OasisReader
|
|
12
|
+
from ..exceptions import MissingOptionalDependency
|
|
13
|
+
logger = logging.getLogger("oasis_data_manager.df_reader.reader")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class OasisPandasReader(OasisReader):
    """Reader backed by plain pandas dataframes."""

    @staticmethod
    def _pandas_major_version():
        """Return the leading numeric component of ``pd.__version__`` (0 if it is not numeric)."""
        leading = pd.__version__.split(".", 1)[0]
        return int(leading) if leading.isdigit() else 0

    def _read_with(self, read_fn, *args, **kwargs):
        """
        Load ``self.df`` with ``read_fn`` (``pd.read_csv`` / ``pd.read_parquet``).

        String sources that are not http(s) urls are resolved to a storage uri
        and read with the storage's fsspec options; everything else is handed
        to ``read_fn`` as given.
        """
        if isinstance(self.filename_or_buffer, str) and not self.filename_or_buffer.startswith(("http://", "https://")):
            _, uri = self.storage.get_storage_url(
                self.filename_or_buffer, encode_params=False
            )
            self.df = read_fn(
                uri,
                *args,
                **kwargs,
                storage_options=self.storage.get_fsspec_storage_options(),
            )
        else:
            self.df = read_fn(self.filename_or_buffer, *args, **kwargs)

    def read_csv(self, *args, **kwargs):
        """Read csv data into ``self.df``, dropping ``low_memory`` on pandas 3 and later."""
        # Compare the major version as an integer: a string comparison such as
        # "10.0" >= "3" is lexicographic and gives the wrong answer.
        if self._pandas_major_version() >= 3:  # remove unsupported options issue https://github.com/OasisLMF/OasisLMF/issues/1896
            kwargs.pop('low_memory', None)
        self._read_with(pd.read_csv, *args, **kwargs)

    def read_parquet(self, *args, **kwargs):
        """Read parquet data into ``self.df``."""
        self._read_with(pd.read_parquet, *args, **kwargs)

    def apply_geo(self, shape_filename_path, *args, drop_geo=True, **kwargs):
        """
        Read in a shape file and return the _read file with geo data joined.

        :param shape_filename_path: path of the shape file to read.
        :param drop_geo: when true, drop the shape file's columns and
            ``index_right`` from the joined result.
        :param kwargs: ``geo_lon_col`` / ``geo_lat_col`` override the default
            ``longitude`` / ``latitude`` column names.
        :return: a new reader; it wraps an empty dataframe when the longitude
            or latitude column is missing from the data.
        :raises MissingOptionalDependency: when geopandas is not installed.
        """
        # TODO: fix this so that it can work with non local files
        # with self.storage.open(self.shape_filename_path) as f:
        #     shape_df = gpd.read_file(f)

        if gpd is None:
            raise MissingOptionalDependency(
                "Missing optional dependency 'geopandas' for 'apply_geo' method, install package using `pip install oasis-data-manager[extra]`")

        shape_df = gpd.read_file(shape_filename_path)

        # for situations where the columns in the source data are different.
        lon_col = kwargs.get("geo_lon_col", "longitude")
        lat_col = kwargs.get("geo_lat_col", "latitude")

        df_columns = self.df.columns.tolist()
        if lat_col not in df_columns or lon_col not in df_columns:
            logger.warning("Invalid shape file provided")
            # temp until we decide on handling, i.e don't return full data if it fails.
            return self.copy_with_df(pd.DataFrame.from_dict({}))

        # convert read df to geo
        df = gpd.GeoDataFrame(
            self.df, geometry=gpd.points_from_xy(self.df[lon_col], self.df[lat_col])
        )

        # Make sure they're using the same projection reference
        df.crs = shape_df.crs

        # join the datasets, matching `geometry` to points within the shape df
        df = df.sjoin(shape_df, how="inner")

        if drop_geo:
            df = df.drop(shape_df.columns.tolist() + ["index_right"], axis=1)

        return self.copy_with_df(df)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class OasisPandasReaderCSV(OasisPandasReader):
    """Named variant of ``OasisPandasReader``; adds no behaviour of its own."""
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class OasisPandasReaderParquet(OasisPandasReader):
    """Named variant of ``OasisPandasReader``; adds no behaviour of its own."""
|