oasis-data-manager 0.1.0rc1__py2.py3-none-any.whl → 0.1.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oasis_data_manager/__init__.py +1 -1
- oasis_data_manager/df_reader/backends/__init__.py +0 -0
- oasis_data_manager/df_reader/backends/base.py +109 -0
- oasis_data_manager/df_reader/backends/dask.py +197 -0
- oasis_data_manager/df_reader/backends/pandas.py +101 -0
- oasis_data_manager/df_reader/config.py +8 -4
- oasis_data_manager/df_reader/exceptions.py +4 -0
- oasis_data_manager/df_reader/reader.py +15 -387
- oasis_data_manager/filestore/config.py +8 -3
- {oasis_data_manager-0.1.0rc1.dist-info → oasis_data_manager-0.1.1.dist-info}/METADATA +14 -11
- {oasis_data_manager-0.1.0rc1.dist-info → oasis_data_manager-0.1.1.dist-info}/RECORD +14 -10
- {oasis_data_manager-0.1.0rc1.dist-info → oasis_data_manager-0.1.1.dist-info}/LICENSE +0 -0
- {oasis_data_manager-0.1.0rc1.dist-info → oasis_data_manager-0.1.1.dist-info}/WHEEL +0 -0
- {oasis_data_manager-0.1.0rc1.dist-info → oasis_data_manager-0.1.1.dist-info}/top_level.txt +0 -0

oasis_data_manager/__init__.py CHANGED
@@ -1 +1 @@
-__version__ = '0.1.0rc1'
+__version__ = '0.1.1'

oasis_data_manager/df_reader/backends/__init__.py ADDED
File without changes

oasis_data_manager/df_reader/backends/base.py ADDED
@@ -0,0 +1,109 @@
+import pathlib
+from typing import Iterable
+
+from ...filestore.backends.base import BaseStorage
+
+
+class OasisReader:
+    """
+    Base reader.
+
+    as_pandas(), sql() & filter() can all be chained with self.has_read controlling whether the base
+    read (read_csv/read_parquet) needs to be triggered. This is because in the case of spark
+    we need to read differently depending on if the intention is to do sql or filter.
+    """
+
+    def __init__(
+        self,
+        filename_or_buffer,
+        storage: BaseStorage,
+        *args,
+        dataframe=None,
+        has_read=False,
+        **kwargs,
+    ):
+        self.filename_or_buffer = filename_or_buffer
+        self.storage = storage
+        self._df = dataframe
+        self.has_read = has_read
+        self.reader_args = args
+        self.reader_kwargs = kwargs
+
+        if not filename_or_buffer:
+            if dataframe is None and not has_read:
+                raise RuntimeError(
+                    "Reader must be initialised with either a "
+                    "filename_or_buffer or by passing a dataframe "
+                    "and has_read=True"
+                )
+            else:
+                self.read_from_dataframe()
+
+        if (
+            filename_or_buffer
+            and isinstance(self.filename_or_buffer, str)
+            and self.filename_or_buffer.lower().endswith(".zip")
+        ):
+            self.reader_kwargs["compression"] = "zip"
+
+    @property
+    def df(self):
+        self._read()
+        return self._df
+
+    @df.setter
+    def df(self, other):
+        self._df = other
+
+    def read_csv(self, *args, **kwargs):
+        raise NotImplementedError()
+
+    def read_parquet(self, *args, **kwargs):
+        raise NotImplementedError()
+
+    def _read(self):
+        if not self.has_read:
+            if hasattr(self.filename_or_buffer, "name"):
+                extension = pathlib.Path(self.filename_or_buffer.name).suffix
+            else:
+                extension = pathlib.Path(self.filename_or_buffer).suffix
+
+            if extension in [".parquet", ".pq"]:
+                self.has_read = True
+                self.read_parquet(*self.reader_args, **self.reader_kwargs)
+            else:
+                # assume the file is csv if not parquet
+                self.has_read = True
+                self.read_csv(*self.reader_args, **self.reader_kwargs)
+
+        return self
+
+    def copy_with_df(self, df):
+        return type(self)(
+            self.filename_or_buffer, self.storage, dataframe=df, has_read=self.has_read
+        )
+
+    def filter(self, filters):
+        self._read()
+
+        df = self.df
+        for df_filter in filters if isinstance(filters, Iterable) else [filters]:
+            df = df_filter(df)
+
+        return self.copy_with_df(df)
+
+    def sql(self, sql):
+        if sql:
+            self._read()
+            return self.apply_sql(sql)
+        return self
+
+    def query(self, fn):
+        return fn(self.df)
+
+    def as_pandas(self):
+        self._read()
+        return self.df
+
+    def read_from_dataframe(self):
+        pass

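A rough usage sketch of the chaining the docstring describes (a minimal example, assuming a concrete backend such as the pandas reader added below, a local storage root of /data and a hypothetical exposure.csv with a location_id column):

from oasis_data_manager.df_reader.backends.pandas import OasisPandasReader
from oasis_data_manager.filestore.backends.local import LocalStorage

# hypothetical storage root and input file, for illustration only
storage = LocalStorage("/data")
reader = OasisPandasReader("exposure.csv", storage)

# filter() triggers the lazy read (read_csv here, chosen from the file extension)
# and returns a new reader via copy_with_df(), so calls can be chained
filtered = reader.filter(lambda df: df[df["location_id"] > 100])
df = filtered.as_pandas()
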
oasis_data_manager/df_reader/backends/dask.py ADDED
@@ -0,0 +1,197 @@
+import io
+import pathlib
+import logging
+
+import dask
+import dask_geopandas as dgpd
+from dask import dataframe as dd
+from dask_sql import Context
+from dask_sql.utils import ParsingException
+from distributed import Client
+
+from ..exceptions import InvalidSQLException
+from .base import OasisReader
+
+dask.config.set(
+    {"dataframe.convert-string": False}
+) # allows dask sql to support pyarrow
+logger = logging.getLogger("oasis_data_manager.df_reader.reader")
+
+
+class OasisDaskReader(OasisReader):
+    sql_table_name = "table"
+
+    def __init__(self, *args, client_address=None, **kwargs):
+        if client_address:
+            self.client = Client(client_address, set_as_default=False)
+        else:
+            self.client = None
+
+        self.sql_context = Context()
+        self.table_names = [self.sql_table_name]
+        self.pre_sql_columns = []
+
+        super().__init__(*args, **kwargs)
+
+    def copy_with_df(self, df):
+        res = super().copy_with_df(df)
+        res.client = self.client
+        return res
+
+    def apply_geo(self, shape_filename_path, *args, drop_geo=True, **kwargs):
+        """
+        Read in a shape file and return the _read file with geo data joined.
+        """
+        # TODO: fix this so that it can work with non local files
+        # with self.storage.open(self.shape_filename_path) as f:
+        #     shape_df = dgpd.read_file(f, npartitions=1)
+
+        shape_df = dgpd.read_file(shape_filename_path, npartitions=1)
+
+        # for situations where the columns in the source data are different.
+        lon_col = kwargs.get("geo_lon_col", "longitude")
+        lat_col = kwargs.get("geo_lat_col", "latitude")
+
+        df_columns = self.df.columns.tolist()
+        if lat_col not in df_columns or lon_col not in df_columns:
+            logger.warning("Invalid shape file provided")
+            # temp until we decide on handling, i.e don't return full data if it fails.
+            return self.copy_with_df(dd.DataFrame.from_dict({}, npartitions=1))
+
+        df = self.df.copy()
+
+        # convert read df to geo
+        df["geometry"] = dgpd.points_from_xy(df, lon_col, lat_col)
+        df = dgpd.from_dask_dataframe(df)
+
+        # Make sure they're using the same projection reference
+        df.crs = shape_df.crs
+
+        # join the datasets, matching `geometry` to points within the shape df
+        df = df.sjoin(shape_df, how="inner")
+
+        if drop_geo:
+            df = df.drop(shape_df.columns.tolist() + ["index_right"], axis=1)
+
+        return self.copy_with_df(df)
+
+    def apply_sql(self, sql):
+        df = self.df.copy()
+        try:
+            # Initially this was the filename, but some filenames are invalid for the table,
+            # is it ok to call it the same name all the time? Mapped to DaskDataTable in case
+            # we need to change this.
+            self.sql_context.create_table("DaskDataTable", self.df)
+            formatted_sql = sql.replace(self.sql_table_name, "DaskDataTable")
+
+            self.pre_sql_columns.extend(df.columns)
+
+            # dask expects the columns to be lower case, which won't match some data
+            df = self.sql_context.sql(
+                formatted_sql,
+                config_options={"sql.identifier.case_sensitive": False},
+            )
+            # which means we then need to map the columns back to the original
+            # and allow for any aggregations to be retained
+            validated_columns = []
+            for v in df.columns:
+                pre = False
+                for x in self.pre_sql_columns:
+                    if v.lower() == x.lower():
+                        validated_columns.append(x)
+                        pre = True
+
+                if not pre:
+                    validated_columns.append(v)
+            df.columns = validated_columns
+
+            return self.copy_with_df(df)
+        except ParsingException:
+            raise InvalidSQLException
+
+    def join(self, df, table_name):
+        """
+        Creates a secondary table as a sql table in order to allow joins when apply_sql is called.
+        """
+        if table_name in self.table_names:
+            raise RuntimeError(
+                f"Table name already in use: [{','.join(self.table_names)}]"
+            )
+        self.pre_sql_columns.extend(df.columns)
+        self.sql_context.create_table(table_name, df)
+        self.table_names.append(table_name)
+        return self
+
+    def read_from_dataframe(self):
+        if not isinstance(self.df, dd.DataFrame):
+            self.df = dd.from_pandas(self.df, npartitions=1)
+
+    def as_pandas(self):
+        super().as_pandas()
+        if self.client:
+            return self.client.compute(self.df).result()
+        else:
+            return self.df.compute()
+
+    def read_dict(self, data):
+        self.df = dd.DataFrame.from_dict(data)
+
+    def read_csv(self, *args, **kwargs):
+        # remove standard pandas kwargs which will case an issue in dask.
+        dask_safe_kwargs = kwargs.copy()
+        dask_safe_kwargs.pop("memory_map", None)
+        dask_safe_kwargs.pop("low_memory", None)
+
+        filename_or_buffer = self.filename_or_buffer
+        if isinstance(filename_or_buffer, pathlib.PosixPath):
+            filename_or_buffer = str(self.filename_or_buffer)
+
+        if isinstance(filename_or_buffer, io.TextIOWrapper) or isinstance(
+            filename_or_buffer, io.BufferedReader
+        ):
+            filename_or_buffer = filename_or_buffer.name
+
+        # django files
+        if hasattr(filename_or_buffer, "path"):
+            filename_or_buffer = filename_or_buffer.path
+
+        _, uri = self.storage.get_storage_url(filename_or_buffer, encode_params=False)
+        self.df = dd.read_csv(
+            uri,
+            *args,
+            **dask_safe_kwargs,
+            storage_options=self.storage.get_fsspec_storage_options(),
+        )
+
+    def read_parquet(self, *args, **kwargs):
+        if isinstance(self.filename_or_buffer, str):
+            _, uri = self.storage.get_storage_url(
+                self.filename_or_buffer, encode_params=False
+            )
+            filename = uri
+            kwargs["storage_options"] = self.storage.get_fsspec_storage_options()
+        else:
+            filename = self.filename_or_buffer
+
+        self.df = dd.read_parquet(
+            filename,
+            *args,
+            **kwargs,
+        )
+
+        # dask-sql doesn't handle categorical columns, but we need to be careful
+        # how we convert them, if an assign is used we will end up stopping
+        # the `Predicate pushdown optimization` within dask-sql from applying the
+        # sql to the read_parquet filters.
+        categories_to_convert = {}
+        for col in self.df.select_dtypes(include="category").columns:
+            categories_to_convert[col] = self.df[col].dtype.categories.dtype
+        self.df = self.df.astype(categories_to_convert)
+
+
+class OasisDaskReaderCSV(OasisDaskReader):
+    pass
+
+
+class OasisDaskReaderParquet(OasisDaskReader):
+    pass

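A minimal sketch of the dask backend in use (the scheduler address, parquet file and column names are illustrative; sql() rewrites the generic "table" name onto the internal DaskDataTable before handing the query to dask-sql):

from oasis_data_manager.df_reader.backends.dask import OasisDaskReader
from oasis_data_manager.filestore.backends.local import LocalStorage

storage = LocalStorage("/data")

# client_address is optional; without it the result is computed in-process
reader = OasisDaskReader(
    "exposure.parquet", storage, client_address="tcp://127.0.0.1:8786"
)

# "table" is the placeholder table name that apply_sql() maps onto DaskDataTable
result = reader.sql("SELECT location_id, SUM(tiv) AS tiv FROM table GROUP BY location_id")
df = result.as_pandas()  # computed on the distributed client when one is configured
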
oasis_data_manager/df_reader/backends/pandas.py ADDED
@@ -0,0 +1,101 @@
+import logging
+
+import pandas as pd
+
+
+try:
+    import geopandas as gpd
+except ModuleNotFoundError:
+    gpd = None
+
+from .base import OasisReader
+from ..exceptions import MissingOptionalDependency
+logger = logging.getLogger("oasis_data_manager.df_reader.reader")
+
+
+class OasisPandasReader(OasisReader):
+    def read_csv(self, *args, **kwargs):
+        if isinstance(self.filename_or_buffer, str):
+            if self.filename_or_buffer.startswith(
+                "http://"
+            ) or self.filename_or_buffer.startswith("https://"):
+                self.df = pd.read_csv(self.filename_or_buffer, *args, **kwargs)
+            else:
+                _, uri = self.storage.get_storage_url(
+                    self.filename_or_buffer, encode_params=False
+                )
+                self.df = pd.read_csv(
+                    uri,
+                    *args,
+                    **kwargs,
+                    storage_options=self.storage.get_fsspec_storage_options(),
+                )
+        else:
+            self.df = pd.read_csv(self.filename_or_buffer, *args, **kwargs)
+
+    def read_parquet(self, *args, **kwargs):
+        if isinstance(self.filename_or_buffer, str):
+            if self.filename_or_buffer.startswith(
+                "http://"
+            ) or self.filename_or_buffer.startswith("https://"):
+                self.df = pd.read_parquet(self.filename_or_buffer, *args, **kwargs)
+            else:
+                _, uri = self.storage.get_storage_url(
+                    self.filename_or_buffer, encode_params=False
+                )
+                self.df = pd.read_parquet(
+                    uri,
+                    *args,
+                    **kwargs,
+                    storage_options=self.storage.get_fsspec_storage_options(),
+                )
+        else:
+            self.df = pd.read_parquet(self.filename_or_buffer, *args, **kwargs)
+
+    def apply_geo(self, shape_filename_path, *args, drop_geo=True, **kwargs):
+        """
+        Read in a shape file and return the _read file with geo data joined.
+        """
+        # TODO: fix this so that it can work with non local files
+        # with self.storage.open(self.shape_filename_path) as f:
+        #     shape_df = gpd.read_file(f)
+
+        if gpd is None:
+            raise MissingOptionalDependency(
+                "Missing optional dependency 'geopandas' for 'apply_geo' method, install package using `pip install oasis-data-manager[extra]`")
+
+        shape_df = gpd.read_file(shape_filename_path)
+
+        # for situations where the columns in the source data are different.
+        lon_col = kwargs.get("geo_lon_col", "longitude")
+        lat_col = kwargs.get("geo_lat_col", "latitude")
+
+        df_columns = self.df.columns.tolist()
+        if lat_col not in df_columns or lon_col not in df_columns:
+            logger.warning("Invalid shape file provided")
+            # temp until we decide on handling, i.e don't return full data if it fails.
+            return self.copy_with_df(pd.DataFrame.from_dict({}))
+
+        # convert read df to geo
+        df = gpd.GeoDataFrame(
+            self.df, geometry=gpd.points_from_xy(self.df[lon_col], self.df[lat_col])
+        )
+
+        # Make sure they're using the same projection reference
+        df.crs = shape_df.crs
+
+        # join the datasets, matching `geometry` to points within the shape df
+        df = df.sjoin(shape_df, how="inner")
+
+        if drop_geo:
+            df = df.drop(shape_df.columns.tolist() + ["index_right"], axis=1)
+
+        return self.copy_with_df(df)
+
+
+class OasisPandasReaderCSV(OasisPandasReader):
+    pass
+
+
+class OasisPandasReaderParquet(OasisPandasReader):
+    pass

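A sketch of the optional-dependency path around apply_geo (the shape file and column names are illustrative; geopandas is only pulled in by the 'extra' extras set referenced in the error message):

from oasis_data_manager.df_reader.backends.pandas import OasisPandasReader
from oasis_data_manager.df_reader.exceptions import MissingOptionalDependency
from oasis_data_manager.filestore.backends.local import LocalStorage

reader = OasisPandasReader("locations.csv", LocalStorage("/data"))

try:
    # join each row's longitude/latitude point against the polygons in areas.shp
    joined = reader.apply_geo("areas.shp", geo_lon_col="lon", geo_lat_col="lat")
except MissingOptionalDependency:
    # geopandas is not installed - fall back to the plain reader
    joined = reader

print(joined.as_pandas().head())
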
oasis_data_manager/df_reader/config.py CHANGED
@@ -1,9 +1,14 @@
 import json
+import sys
 from copy import deepcopy
 from pathlib import Path
-from typing import Any, Dict, TypedDict, Union
 
-
+if sys.version_info >= (3, 8):
+    from typing import Any, Dict, TypedDict, Union
+    from typing_extensions import NotRequired
+else:
+    from typing import Any, Dict, Union
+    from typing_extensions import NotRequired, TypedDict
 
 from ..config import ConfigError, load_class
 from ..filestore.backends.local import LocalStorage
@@ -67,10 +72,9 @@ def clean_config(config: Union[str, InputReaderConfig]) -> ResolvedReaderConfig:
 
 def get_df_reader(config, *args, **kwargs):
     config = clean_config(config)
-
     cls = load_class(config["engine"]["path"], OasisReader)
-
     storage = config["engine"]["options"].pop("storage", None) or LocalStorage("/")
+
     return cls(
         config["filepath"], storage, *args, **kwargs, **config["engine"]["options"]
     )

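For context, a sketch of how get_df_reader is typically driven from a config mapping (the engine path and filepath are illustrative, and the dict shape assumes clean_config accepts the already-resolved form shown here; when no storage option is supplied the reader falls back to LocalStorage("/")):

from oasis_data_manager.df_reader.config import get_df_reader

config = {
    "filepath": "exposure.csv",  # hypothetical input file
    "engine": {
        "path": "oasis_data_manager.df_reader.backends.pandas.OasisPandasReader",
        "options": {},  # may also carry a "storage" entry holding a BaseStorage instance
    },
}

reader = get_df_reader(config)
df = reader.as_pandas()
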
oasis_data_manager/df_reader/reader.py CHANGED
@@ -1,394 +1,22 @@
+__all__ = [
+    'OasisReader',
+    'OasisPandasReader',
+    'OasisPandasReaderCSV',
+    'OasisPandasReaderParquet',
+    'OasisDaskReader',
+    'OasisDaskReaderCSV',
+    'OasisDaskReaderParquet',
+]
+
 """
 Readers to replace direct usage of pd.read_csv/read_parquet and allows for filters() & sql()
 to be provided.
 """
 
[386 removed lines omitted here: the previous in-module OasisReader, OasisPandasReader and
OasisDaskReader implementations, identical to the code now added under
oasis_data_manager/df_reader/backends/base.py, pandas.py and dask.py above.]
+from .backends.base import OasisReader
+from .backends.pandas import OasisPandasReader, OasisPandasReaderCSV, OasisPandasReaderParquet
 
-
+try:
+    from .backends.dask import OasisDaskReader, OasisDaskReaderCSV, OasisDaskReaderParquet
+except ModuleNotFoundError as e:
     pass

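The net effect on the import surface: existing call sites that import from oasis_data_manager.df_reader.reader keep working, while the dask names only exist when the optional dask stack is installed. A small sketch of defensive usage:

from oasis_data_manager.df_reader.reader import OasisPandasReader  # always available

try:
    from oasis_data_manager.df_reader.reader import OasisDaskReader
except ImportError:
    # dask, dask-sql and dask-geopandas are now optional ('extra') dependencies
    OasisDaskReader = None
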
oasis_data_manager/filestore/config.py CHANGED
@@ -1,8 +1,13 @@
 import json
 import os
-
-
-
+import sys
+
+if sys.version_info >= (3, 8):
+    from typing import Optional, Tuple, TypedDict, Union
+    from typing_extensions import NotRequired
+else:
+    from typing import Optional, Tuple, Union
+    from typing_extensions import NotRequired, TypedDict
 
 from oasis_data_manager.config import ConfigError, load_class
 from oasis_data_manager.filestore.backends.base import BaseStorage

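The same version gate as in df_reader/config.py: on Python 3.8+ TypedDict ships in typing, while NotRequired (and, before 3.8, TypedDict itself) has to come from typing_extensions. A minimal illustration of the pattern in isolation (the StorageConfig name is made up for the example):

import sys

if sys.version_info >= (3, 8):
    from typing import TypedDict
    from typing_extensions import NotRequired
else:
    from typing_extensions import NotRequired, TypedDict


class StorageConfig(TypedDict):  # hypothetical, only to show the gated imports in use
    root: str
    options: NotRequired[dict]
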
{oasis_data_manager-0.1.0rc1.dist-info → oasis_data_manager-0.1.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: oasis-data-manager
-Version: 0.1.0rc1
+Version: 0.1.1
 Summary: UNKNOWN
 Home-page: https://github.com/OasisLMF/OasisDataManager
 Author: Oasis LMF
@@ -15,18 +15,21 @@ Classifier: Programming Language :: Python :: 3.6
 Requires-Python: >=3.6
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: pandas
-Requires-Dist: geopandas
-Requires-Dist: dask
-Requires-Dist: dask-geopandas
-Requires-Dist: dask-sql
-Requires-Dist: distributed
 Requires-Dist: fastparquet
-Requires-Dist: pyogrio
 Requires-Dist: fsspec
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: pandas
+Requires-Dist: typing
+Requires-Dist: typing-extensions
+Provides-Extra: extra
+Requires-Dist: adlfs ; extra == 'extra'
+Requires-Dist: boto3 ; extra == 'extra'
+Requires-Dist: dask ; extra == 'extra'
+Requires-Dist: dask-geopandas ; extra == 'extra'
+Requires-Dist: dask-sql ; extra == 'extra'
+Requires-Dist: distributed ; extra == 'extra'
+Requires-Dist: geopandas ; extra == 'extra'
+Requires-Dist: pyogrio ; extra == 'extra'
+Requires-Dist: s3fs >=2023.9.0 ; extra == 'extra'
 
 UNKNOWN
 

{oasis_data_manager-0.1.0rc1.dist-info → oasis_data_manager-0.1.1.dist-info}/RECORD CHANGED
@@ -1,15 +1,19 @@
-oasis_data_manager/__init__.py,sha256=
+oasis_data_manager/__init__.py,sha256=ls1camlIoMxEZz9gSkZ1OJo-MXqHWwKPtdPbZJmwp7E,22
 oasis_data_manager/config.py,sha256=_qx2Mu5n0Jx3W5SKCiqLr1SPdWLrbFv_B82r6Eosp_k,534
 oasis_data_manager/complex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 oasis_data_manager/complex/complex.py,sha256=8oomV9WyLsa8sz8aMzlwv4naKCGOL3UdSlYQJxUFqCk,5382
 oasis_data_manager/complex/examples.py,sha256=HlwOzJ2SVF9yE7ei9d2HWglUkYApiyQxwm8WiL84wdY,1220
 oasis_data_manager/df_reader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-oasis_data_manager/df_reader/config.py,sha256=
-oasis_data_manager/df_reader/exceptions.py,sha256=
-oasis_data_manager/df_reader/reader.py,sha256=
+oasis_data_manager/df_reader/config.py,sha256=2xwWg5b6dnERUgQYWBjt2W64GdYUhXTtXCgrSPP60Mg,2507
+oasis_data_manager/df_reader/exceptions.py,sha256=9FV8n2eqrkTGpEt47GGs5k0eon2Y-Xz5K3wyc1R9fBs,102
+oasis_data_manager/df_reader/reader.py,sha256=14wuGTBKnIRslDMXsA3QjBiuvzcweRi29nM-V46pmLE,597
+oasis_data_manager/df_reader/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+oasis_data_manager/df_reader/backends/base.py,sha256=cwvHzncinHfwwHW5hIjMQZ6KCVECxE3NixPT5SX7s-M,3057
+oasis_data_manager/df_reader/backends/dask.py,sha256=fE2vEAd5y7haUzi0oEUBE_4D5UL0jkmKyaAEaWnuFC4,6975
+oasis_data_manager/df_reader/backends/pandas.py,sha256=yTfULUunn0JLUqa_P5nazkGF6g6omaUY0litnPFITD4,3656
 oasis_data_manager/errors/__init__.py,sha256=9q_7nk5DNg1-WfQoBM4kw_Us34Y2szNkZwfE5-6_Rg0,687
 oasis_data_manager/filestore/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-oasis_data_manager/filestore/config.py,sha256=
+oasis_data_manager/filestore/config.py,sha256=N0hSfPpRpej7uWGV54MEEH_0vvdpcgo3fZlx6m2muoY,3625
 oasis_data_manager/filestore/filestore.py,sha256=eaQGAer7Q9KM4B3bq9WmZAtjFdj9aRef_E3rI2i0dOk,2615
 oasis_data_manager/filestore/log.py,sha256=8l54LoOJiOG2pr4o93LzMocjH7dHcsOp14JWJ_MrqHQ,693
 oasis_data_manager/filestore/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -17,8 +21,8 @@ oasis_data_manager/filestore/backends/aws_s3.py,sha256=te2rVQl6n4xr4X-xQv68gDNQD
 oasis_data_manager/filestore/backends/azure_abfs.py,sha256=Lo2aBBQsFPJJEb9svm1-W43Gz2g4LCaHlM-9mVXDhzg,5354
 oasis_data_manager/filestore/backends/base.py,sha256=aj__0KsmnVbXTHYSA_qlrIe5pxImSZ14dPMzskdNzJc,12971
 oasis_data_manager/filestore/backends/local.py,sha256=MEX_CvwhsDfv9lvBjc8CdaDXaN53l9onQHmOgKjoJcg,1242
-oasis_data_manager-0.1.
-oasis_data_manager-0.1.
-oasis_data_manager-0.1.
-oasis_data_manager-0.1.
-oasis_data_manager-0.1.
+oasis_data_manager-0.1.1.dist-info/LICENSE,sha256=qr-PXl5mSpeUk-A7RzYcH0dhR93hhgVK8SW9mzco0Ao,1517
+oasis_data_manager-0.1.1.dist-info/METADATA,sha256=UoaRd22Lz2O_UNdRPGCXf-tOBf_TPvEQoFoqVrLFK18,1098
+oasis_data_manager-0.1.1.dist-info/WHEEL,sha256=-G_t0oGuE7UD0DrSpVZnq1hHMBV9DD2XkS5v7XpmTnk,110
+oasis_data_manager-0.1.1.dist-info/top_level.txt,sha256=qMC39T9UvDCPbNJLVtgu8h6f7c4KJYel7SnIpz62wsU,19
+oasis_data_manager-0.1.1.dist-info/RECORD,,

{oasis_data_manager-0.1.0rc1.dist-info → oasis_data_manager-0.1.1.dist-info}/LICENSE
File without changes

{oasis_data_manager-0.1.0rc1.dist-info → oasis_data_manager-0.1.1.dist-info}/WHEEL
File without changes

{oasis_data_manager-0.1.0rc1.dist-info → oasis_data_manager-0.1.1.dist-info}/top_level.txt
File without changes