oasis-data-manager 0.1.0rc1__py2.py3-none-any.whl → 0.1.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
- __version__ = '0.1.0rc1'
+ __version__ = '0.1.1'
File without changes
@@ -0,0 +1,109 @@
+ import pathlib
+ from typing import Iterable
+
+ from ...filestore.backends.base import BaseStorage
+
+
+ class OasisReader:
+     """
+     Base reader.
+
+     as_pandas(), sql() & filter() can all be chained with self.has_read controlling whether the base
+     read (read_csv/read_parquet) needs to be triggered. This is because in the case of spark
+     we need to read differently depending on if the intention is to do sql or filter.
+     """
+
+     def __init__(
+         self,
+         filename_or_buffer,
+         storage: BaseStorage,
+         *args,
+         dataframe=None,
+         has_read=False,
+         **kwargs,
+     ):
+         self.filename_or_buffer = filename_or_buffer
+         self.storage = storage
+         self._df = dataframe
+         self.has_read = has_read
+         self.reader_args = args
+         self.reader_kwargs = kwargs
+
+         if not filename_or_buffer:
+             if dataframe is None and not has_read:
+                 raise RuntimeError(
+                     "Reader must be initialised with either a "
+                     "filename_or_buffer or by passing a dataframe "
+                     "and has_read=True"
+                 )
+             else:
+                 self.read_from_dataframe()
+
+         if (
+             filename_or_buffer
+             and isinstance(self.filename_or_buffer, str)
+             and self.filename_or_buffer.lower().endswith(".zip")
+         ):
+             self.reader_kwargs["compression"] = "zip"
+
+     @property
+     def df(self):
+         self._read()
+         return self._df
+
+     @df.setter
+     def df(self, other):
+         self._df = other
+
+     def read_csv(self, *args, **kwargs):
+         raise NotImplementedError()
+
+     def read_parquet(self, *args, **kwargs):
+         raise NotImplementedError()
+
+     def _read(self):
+         if not self.has_read:
+             if hasattr(self.filename_or_buffer, "name"):
+                 extension = pathlib.Path(self.filename_or_buffer.name).suffix
+             else:
+                 extension = pathlib.Path(self.filename_or_buffer).suffix
+
+             if extension in [".parquet", ".pq"]:
+                 self.has_read = True
+                 self.read_parquet(*self.reader_args, **self.reader_kwargs)
+             else:
+                 # assume the file is csv if not parquet
+                 self.has_read = True
+                 self.read_csv(*self.reader_args, **self.reader_kwargs)
+
+         return self
+
+     def copy_with_df(self, df):
+         return type(self)(
+             self.filename_or_buffer, self.storage, dataframe=df, has_read=self.has_read
+         )
+
+     def filter(self, filters):
+         self._read()
+
+         df = self.df
+         for df_filter in filters if isinstance(filters, Iterable) else [filters]:
+             df = df_filter(df)
+
+         return self.copy_with_df(df)
+
+     def sql(self, sql):
+         if sql:
+             self._read()
+             return self.apply_sql(sql)
+         return self
+
+     def query(self, fn):
+         return fn(self.df)
+
+     def as_pandas(self):
+         self._read()
+         return self.df
+
+     def read_from_dataframe(self):
+         pass
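
Note: the docstring above describes the chaining model for the new base class. The first call to as_pandas(), sql() or filter() triggers the underlying read (read_parquet for ".parquet"/".pq" files, read_csv otherwise), and has_read is carried through copy_with_df() so the file is only read once. A minimal sketch of that flow, assuming the pandas backend added later in this diff and an illustrative "exposure.csv" file with a "loc_id" column (both hypothetical):

    from oasis_data_manager.df_reader.backends.pandas import OasisPandasReader
    from oasis_data_manager.filestore.backends.local import LocalStorage

    # "exposure.csv" and "loc_id" are placeholders for this sketch
    reader = OasisPandasReader("exposure.csv", LocalStorage("/"))

    # filter() accepts a callable (or an iterable of callables) applied to the dataframe;
    # it returns a new reader via copy_with_df(), so further calls can be chained
    filtered = reader.filter(lambda df: df[df["loc_id"] > 100])
    df = filtered.as_pandas()
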
@@ -0,0 +1,197 @@
+ import io
+ import pathlib
+ import logging
+
+ import dask
+ import dask_geopandas as dgpd
+ from dask import dataframe as dd
+ from dask_sql import Context
+ from dask_sql.utils import ParsingException
+ from distributed import Client
+
+ from ..exceptions import InvalidSQLException
+ from .base import OasisReader
+
+ dask.config.set(
+     {"dataframe.convert-string": False}
+ )  # allows dask sql to support pyarrow
+ logger = logging.getLogger("oasis_data_manager.df_reader.reader")
+
+
+ class OasisDaskReader(OasisReader):
+     sql_table_name = "table"
+
+     def __init__(self, *args, client_address=None, **kwargs):
+         if client_address:
+             self.client = Client(client_address, set_as_default=False)
+         else:
+             self.client = None
+
+         self.sql_context = Context()
+         self.table_names = [self.sql_table_name]
+         self.pre_sql_columns = []
+
+         super().__init__(*args, **kwargs)
+
+     def copy_with_df(self, df):
+         res = super().copy_with_df(df)
+         res.client = self.client
+         return res
+
+     def apply_geo(self, shape_filename_path, *args, drop_geo=True, **kwargs):
+         """
+         Read in a shape file and return the _read file with geo data joined.
+         """
+         # TODO: fix this so that it can work with non local files
+         # with self.storage.open(self.shape_filename_path) as f:
+         #     shape_df = dgpd.read_file(f, npartitions=1)
+
+         shape_df = dgpd.read_file(shape_filename_path, npartitions=1)
+
+         # for situations where the columns in the source data are different.
+         lon_col = kwargs.get("geo_lon_col", "longitude")
+         lat_col = kwargs.get("geo_lat_col", "latitude")
+
+         df_columns = self.df.columns.tolist()
+         if lat_col not in df_columns or lon_col not in df_columns:
+             logger.warning("Invalid shape file provided")
+             # temp until we decide on handling, i.e. don't return full data if it fails.
+             return self.copy_with_df(dd.DataFrame.from_dict({}, npartitions=1))
+
+         df = self.df.copy()
+
+         # convert read df to geo
+         df["geometry"] = dgpd.points_from_xy(df, lon_col, lat_col)
+         df = dgpd.from_dask_dataframe(df)
+
+         # Make sure they're using the same projection reference
+         df.crs = shape_df.crs
+
+         # join the datasets, matching `geometry` to points within the shape df
+         df = df.sjoin(shape_df, how="inner")
+
+         if drop_geo:
+             df = df.drop(shape_df.columns.tolist() + ["index_right"], axis=1)
+
+         return self.copy_with_df(df)
+
+     def apply_sql(self, sql):
+         df = self.df.copy()
+         try:
+             # Initially this was the filename, but some filenames are invalid for the table,
+             # is it ok to call it the same name all the time? Mapped to DaskDataTable in case
+             # we need to change this.
+             self.sql_context.create_table("DaskDataTable", self.df)
+             formatted_sql = sql.replace(self.sql_table_name, "DaskDataTable")
+
+             self.pre_sql_columns.extend(df.columns)
+
+             # dask expects the columns to be lower case, which won't match some data
+             df = self.sql_context.sql(
+                 formatted_sql,
+                 config_options={"sql.identifier.case_sensitive": False},
+             )
+             # which means we then need to map the columns back to the original
+             # and allow for any aggregations to be retained
+             validated_columns = []
+             for v in df.columns:
+                 pre = False
+                 for x in self.pre_sql_columns:
+                     if v.lower() == x.lower():
+                         validated_columns.append(x)
+                         pre = True
+
+                 if not pre:
+                     validated_columns.append(v)
+             df.columns = validated_columns
+
+             return self.copy_with_df(df)
+         except ParsingException:
+             raise InvalidSQLException
+
+     def join(self, df, table_name):
+         """
+         Creates a secondary table as a sql table in order to allow joins when apply_sql is called.
+         """
+         if table_name in self.table_names:
+             raise RuntimeError(
+                 f"Table name already in use: [{','.join(self.table_names)}]"
+             )
+         self.pre_sql_columns.extend(df.columns)
+         self.sql_context.create_table(table_name, df)
+         self.table_names.append(table_name)
+         return self
+
+     def read_from_dataframe(self):
+         if not isinstance(self.df, dd.DataFrame):
+             self.df = dd.from_pandas(self.df, npartitions=1)
+
+     def as_pandas(self):
+         super().as_pandas()
+         if self.client:
+             return self.client.compute(self.df).result()
+         else:
+             return self.df.compute()
+
+     def read_dict(self, data):
+         self.df = dd.DataFrame.from_dict(data)
+
+     def read_csv(self, *args, **kwargs):
+         # remove standard pandas kwargs which will cause an issue in dask.
+         dask_safe_kwargs = kwargs.copy()
+         dask_safe_kwargs.pop("memory_map", None)
+         dask_safe_kwargs.pop("low_memory", None)
+
+         filename_or_buffer = self.filename_or_buffer
+         if isinstance(filename_or_buffer, pathlib.PosixPath):
+             filename_or_buffer = str(self.filename_or_buffer)
+
+         if isinstance(filename_or_buffer, io.TextIOWrapper) or isinstance(
+             filename_or_buffer, io.BufferedReader
+         ):
+             filename_or_buffer = filename_or_buffer.name
+
+         # django files
+         if hasattr(filename_or_buffer, "path"):
+             filename_or_buffer = filename_or_buffer.path
+
+         _, uri = self.storage.get_storage_url(filename_or_buffer, encode_params=False)
+         self.df = dd.read_csv(
+             uri,
+             *args,
+             **dask_safe_kwargs,
+             storage_options=self.storage.get_fsspec_storage_options(),
+         )
+
+     def read_parquet(self, *args, **kwargs):
+         if isinstance(self.filename_or_buffer, str):
+             _, uri = self.storage.get_storage_url(
+                 self.filename_or_buffer, encode_params=False
+             )
+             filename = uri
+             kwargs["storage_options"] = self.storage.get_fsspec_storage_options()
+         else:
+             filename = self.filename_or_buffer
+
+         self.df = dd.read_parquet(
+             filename,
+             *args,
+             **kwargs,
+         )
+
+         # dask-sql doesn't handle categorical columns, but we need to be careful
+         # how we convert them, if an assign is used we will end up stopping
+         # the `Predicate pushdown optimization` within dask-sql from applying the
+         # sql to the read_parquet filters.
+         categories_to_convert = {}
+         for col in self.df.select_dtypes(include="category").columns:
+             categories_to_convert[col] = self.df[col].dtype.categories.dtype
+         self.df = self.df.astype(categories_to_convert)
+
+
+ class OasisDaskReaderCSV(OasisDaskReader):
+     pass
+
+
+ class OasisDaskReaderParquet(OasisDaskReader):
+     pass
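
Note: apply_sql() above registers the current dataframe as "DaskDataTable" and substitutes the logical table name ("table") in the query, join() registers additional dataframes under their own names so they can be referenced from the same query, and sql() raises InvalidSQLException when dask-sql cannot parse the statement. A rough usage sketch under those assumptions; the file, table and column names are illustrative and the query is not taken from the package:

    import pandas as pd
    from dask import dataframe as dd

    from oasis_data_manager.df_reader.backends.dask import OasisDaskReader
    from oasis_data_manager.filestore.backends.local import LocalStorage

    # "locations.parquet" and the peril columns are placeholders for this sketch
    reader = OasisDaskReader("locations.parquet", LocalStorage("/"))

    # register a second table that the SQL below can join against
    perils = dd.from_pandas(
        pd.DataFrame({"peril_id": [1, 2], "peril": ["WS", "EQ"]}), npartitions=1
    )
    reader = reader.join(perils, "perils")

    # "table" is rewritten to the internal DaskDataTable name before execution
    result = reader.sql(
        "SELECT perils.peril, table.loc_id FROM table "
        "JOIN perils ON table.peril_id = perils.peril_id"
    )
    df = result.as_pandas()  # computes via the distributed client when client_address was given
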
@@ -0,0 +1,101 @@
+ import logging
+
+ import pandas as pd
+
+
+ try:
+     import geopandas as gpd
+ except ModuleNotFoundError:
+     gpd = None
+
+ from .base import OasisReader
+ from ..exceptions import MissingOptionalDependency
+ logger = logging.getLogger("oasis_data_manager.df_reader.reader")
+
+
+ class OasisPandasReader(OasisReader):
+     def read_csv(self, *args, **kwargs):
+         if isinstance(self.filename_or_buffer, str):
+             if self.filename_or_buffer.startswith(
+                 "http://"
+             ) or self.filename_or_buffer.startswith("https://"):
+                 self.df = pd.read_csv(self.filename_or_buffer, *args, **kwargs)
+             else:
+                 _, uri = self.storage.get_storage_url(
+                     self.filename_or_buffer, encode_params=False
+                 )
+                 self.df = pd.read_csv(
+                     uri,
+                     *args,
+                     **kwargs,
+                     storage_options=self.storage.get_fsspec_storage_options(),
+                 )
+         else:
+             self.df = pd.read_csv(self.filename_or_buffer, *args, **kwargs)
+
+     def read_parquet(self, *args, **kwargs):
+         if isinstance(self.filename_or_buffer, str):
+             if self.filename_or_buffer.startswith(
+                 "http://"
+             ) or self.filename_or_buffer.startswith("https://"):
+                 self.df = pd.read_parquet(self.filename_or_buffer, *args, **kwargs)
+             else:
+                 _, uri = self.storage.get_storage_url(
+                     self.filename_or_buffer, encode_params=False
+                 )
+                 self.df = pd.read_parquet(
+                     uri,
+                     *args,
+                     **kwargs,
+                     storage_options=self.storage.get_fsspec_storage_options(),
+                 )
+         else:
+             self.df = pd.read_parquet(self.filename_or_buffer, *args, **kwargs)
+
+     def apply_geo(self, shape_filename_path, *args, drop_geo=True, **kwargs):
+         """
+         Read in a shape file and return the _read file with geo data joined.
+         """
+         # TODO: fix this so that it can work with non local files
+         # with self.storage.open(self.shape_filename_path) as f:
+         #     shape_df = gpd.read_file(f)
+
+         if gpd is None:
+             raise MissingOptionalDependency(
+                 "Missing optional dependency 'geopandas' for 'apply_geo' method, install package using `pip install oasis-data-manager[extra]`")
+
+         shape_df = gpd.read_file(shape_filename_path)
+
+         # for situations where the columns in the source data are different.
+         lon_col = kwargs.get("geo_lon_col", "longitude")
+         lat_col = kwargs.get("geo_lat_col", "latitude")
+
+         df_columns = self.df.columns.tolist()
+         if lat_col not in df_columns or lon_col not in df_columns:
+             logger.warning("Invalid shape file provided")
+             # temp until we decide on handling, i.e. don't return full data if it fails.
+             return self.copy_with_df(pd.DataFrame.from_dict({}))
+
+         # convert read df to geo
+         df = gpd.GeoDataFrame(
+             self.df, geometry=gpd.points_from_xy(self.df[lon_col], self.df[lat_col])
+         )
+
+         # Make sure they're using the same projection reference
+         df.crs = shape_df.crs
+
+         # join the datasets, matching `geometry` to points within the shape df
+         df = df.sjoin(shape_df, how="inner")
+
+         if drop_geo:
+             df = df.drop(shape_df.columns.tolist() + ["index_right"], axis=1)
+
+         return self.copy_with_df(df)
+
+
+ class OasisPandasReaderCSV(OasisPandasReader):
+     pass
+
+
+ class OasisPandasReaderParquet(OasisPandasReader):
+     pass
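
Note: the pandas backend keeps the same apply_geo() behaviour but now guards the geopandas import, raising MissingOptionalDependency when the optional dependency set is not installed. A hedged sketch of the intended call, assuming hypothetical file names and the default longitude/latitude column names:

    from oasis_data_manager.df_reader.backends.pandas import OasisPandasReader
    from oasis_data_manager.filestore.backends.local import LocalStorage

    # both file names are placeholders; the CSV must contain the configured
    # longitude/latitude columns, otherwise an empty dataframe is returned
    reader = OasisPandasReader("locations.csv", LocalStorage("/"))
    geo = reader.apply_geo(
        "admin_areas.shp",
        geo_lon_col="longitude",  # these are the defaults, shown explicitly
        geo_lat_col="latitude",
        drop_geo=True,            # drop the joined shape columns after the sjoin
    )
    df = geo.as_pandas()
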
@@ -1,9 +1,14 @@
  import json
+ import sys
  from copy import deepcopy
  from pathlib import Path
- from typing import Any, Dict, TypedDict, Union

- from typing_extensions import NotRequired
+ if sys.version_info >= (3, 8):
+     from typing import Any, Dict, TypedDict, Union
+     from typing_extensions import NotRequired
+ else:
+     from typing import Any, Dict, Union
+     from typing_extensions import NotRequired, TypedDict

  from ..config import ConfigError, load_class
  from ..filestore.backends.local import LocalStorage
@@ -67,10 +72,9 @@ def clean_config(config: Union[str, InputReaderConfig]) -> ResolvedReaderConfig:

  def get_df_reader(config, *args, **kwargs):
      config = clean_config(config)
-
      cls = load_class(config["engine"]["path"], OasisReader)
-
      storage = config["engine"]["options"].pop("storage", None) or LocalStorage("/")
+
      return cls(
          config["filepath"], storage, *args, **kwargs, **config["engine"]["options"]
      )
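
Note: get_df_reader() resolves the reader class from the dotted path in config["engine"]["path"], pops an optional "storage" instance out of the engine options (falling back to LocalStorage("/")), and passes everything else to the reader constructor. A minimal sketch of a config this should accept, assuming clean_config() passes a fully specified dict through unchanged and that load_class() takes a module-dotted class path; the filepath is illustrative:

    from oasis_data_manager.df_reader.config import get_df_reader

    config = {
        "filepath": "exposure.csv",  # placeholder path
        "engine": {
            # assumed dotted-path form accepted by load_class()
            "path": "oasis_data_manager.df_reader.backends.pandas.OasisPandasReader",
            # a "storage" entry here would override the LocalStorage("/") default
            "options": {},
        },
    }

    reader = get_df_reader(config)
    df = reader.as_pandas()
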
@@ -1,2 +1,6 @@
  class InvalidSQLException(Exception):
      pass
+
+
+ class MissingOptionalDependency(Exception):
+     pass
@@ -1,394 +1,22 @@
+ __all__ = [
+     'OasisReader',
+     'OasisPandasReader',
+     'OasisPandasReaderCSV',
+     'OasisPandasReaderParquet',
+     'OasisDaskReader',
+     'OasisDaskReaderCSV',
+     'OasisDaskReaderParquet',
+ ]
+
  """
  Readers to replace direct usage of pd.read_csv/read_parquet and allows for filters() & sql()
  to be provided.
  """

- import io
- import logging
- import pathlib
- from typing import Iterable
-
- import dask
- import dask_geopandas as dgpd
- import geopandas as gpd
- import pandas as pd
- from dask import dataframe as dd
- from dask_sql import Context
- from dask_sql.utils import ParsingException
- from distributed import Client
-
- from ..filestore.backends.base import BaseStorage
- from .exceptions import InvalidSQLException
-
- dask.config.set(
-     {"dataframe.convert-string": False}
- )  # allows dask sql to support pyarrow
- logger = logging.getLogger("oasis_data_manager.df_reader.reader")
-
-
- class OasisReader:
-     """
-     Base reader.
-
-     as_pandas(), sql() & filter() can all be chained with self.has_read controlling whether the base
-     read (read_csv/read_parquet) needs to be triggered. This is because in the case of spark
-     we need to read differently depending on if the intention is to do sql or filter.
-     """
-
-     def __init__(
-         self,
-         filename_or_buffer,
-         storage: BaseStorage,
-         *args,
-         dataframe=None,
-         has_read=False,
-         **kwargs,
-     ):
-         self.filename_or_buffer = filename_or_buffer
-         self.storage = storage
-         self._df = dataframe
-         self.has_read = has_read
-         self.reader_args = args
-         self.reader_kwargs = kwargs
-
-         if not filename_or_buffer:
-             if dataframe is None and not has_read:
-                 raise RuntimeError(
-                     "Reader must be initialised with either a "
-                     "filename_or_buffer or by passing a dataframe "
-                     "and has_read=True"
-                 )
-             else:
-                 self.read_from_dataframe()
-
-         if (
-             filename_or_buffer
-             and isinstance(self.filename_or_buffer, str)
-             and self.filename_or_buffer.lower().endswith(".zip")
-         ):
-             self.reader_kwargs["compression"] = "zip"
-
-     @property
-     def df(self):
-         self._read()
-         return self._df
-
-     @df.setter
-     def df(self, other):
-         self._df = other
-
-     def read_csv(self, *args, **kwargs):
-         raise NotImplementedError()
-
-     def read_parquet(self, *args, **kwargs):
-         raise NotImplementedError()
-
-     def _read(self):
-         if not self.has_read:
-             if hasattr(self.filename_or_buffer, "name"):
-                 extension = pathlib.Path(self.filename_or_buffer.name).suffix
-             else:
-                 extension = pathlib.Path(self.filename_or_buffer).suffix
-
-             if extension in [".parquet", ".pq"]:
-                 self.has_read = True
-                 self.read_parquet(*self.reader_args, **self.reader_kwargs)
-             else:
-                 # assume the file is csv if not parquet
-                 self.has_read = True
-                 self.read_csv(*self.reader_args, **self.reader_kwargs)
-
-         return self
-
-     def copy_with_df(self, df):
-         return type(self)(
-             self.filename_or_buffer, self.storage, dataframe=df, has_read=self.has_read
-         )
-
-     def filter(self, filters):
-         self._read()
-
-         df = self.df
-         for df_filter in filters if isinstance(filters, Iterable) else [filters]:
-             df = df_filter(df)
-
-         return self.copy_with_df(df)
-
-     def sql(self, sql):
-         if sql:
-             self._read()
-             return self.apply_sql(sql)
-         return self
-
-     def query(self, fn):
-         return fn(self.df)
-
-     def as_pandas(self):
-         self._read()
-         return self.df
-
-     def read_from_dataframe(self):
-         pass
-
-
- class OasisPandasReader(OasisReader):
-     def read_csv(self, *args, **kwargs):
-         if isinstance(self.filename_or_buffer, str):
-             if self.filename_or_buffer.startswith(
-                 "http://"
-             ) or self.filename_or_buffer.startswith("https://"):
-                 self.df = pd.read_csv(self.filename_or_buffer, *args, **kwargs)
-             else:
-                 _, uri = self.storage.get_storage_url(
-                     self.filename_or_buffer, encode_params=False
-                 )
-                 self.df = pd.read_csv(
-                     uri,
-                     *args,
-                     **kwargs,
-                     storage_options=self.storage.get_fsspec_storage_options(),
-                 )
-         else:
-             self.df = pd.read_csv(self.filename_or_buffer, *args, **kwargs)
-
-     def read_parquet(self, *args, **kwargs):
-         if isinstance(self.filename_or_buffer, str):
-             if self.filename_or_buffer.startswith(
-                 "http://"
-             ) or self.filename_or_buffer.startswith("https://"):
-                 self.df = pd.read_parquet(self.filename_or_buffer, *args, **kwargs)
-             else:
-                 _, uri = self.storage.get_storage_url(
-                     self.filename_or_buffer, encode_params=False
-                 )
-                 self.df = pd.read_parquet(
-                     uri,
-                     *args,
-                     **kwargs,
-                     storage_options=self.storage.get_fsspec_storage_options(),
-                 )
-         else:
-             self.df = pd.read_parquet(self.filename_or_buffer, *args, **kwargs)
-
-     def apply_geo(self, shape_filename_path, *args, drop_geo=True, **kwargs):
-         """
-         Read in a shape file and return the _read file with geo data joined.
-         """
-         # TODO: fix this so that it can work with non local files
-         # with self.storage.open(self.shape_filename_path) as f:
-         #     shape_df = gpd.read_file(f)
-
-         shape_df = gpd.read_file(shape_filename_path)
-
-         # for situations where the columns in the source data are different.
-         lon_col = kwargs.get("geo_lon_col", "longitude")
-         lat_col = kwargs.get("geo_lat_col", "latitude")
-
-         df_columns = self.df.columns.tolist()
-         if lat_col not in df_columns or lon_col not in df_columns:
-             logger.warning("Invalid shape file provided")
-             # temp until we decide on handling, i.e don't return full data if it fails.
-             return self.copy_with_df(pd.DataFrame.from_dict({}))
-
-         # convert read df to geo
-         df = gpd.GeoDataFrame(
-             self.df, geometry=gpd.points_from_xy(self.df[lon_col], self.df[lat_col])
-         )
-
-         # Make sure they're using the same projection reference
-         df.crs = shape_df.crs
-
-         # join the datasets, matching `geometry` to points within the shape df
-         df = df.sjoin(shape_df, how="inner")
-
-         if drop_geo:
-             df = df.drop(shape_df.columns.tolist() + ["index_right"], axis=1)
-
-         return self.copy_with_df(df)
-
-
- class OasisPandasReaderCSV(OasisPandasReader):
-     pass
-
-
- class OasisPandasReaderParquet(OasisPandasReader):
-     pass
-
-
- class OasisDaskReader(OasisReader):
-     sql_table_name = "table"
-
-     def __init__(self, *args, client_address=None, **kwargs):
-         if client_address:
-             self.client = Client(client_address, set_as_default=False)
-         else:
-             self.client = None
-
-         self.sql_context = Context()
-         self.table_names = [self.sql_table_name]
-         self.pre_sql_columns = []
-
-         super().__init__(*args, **kwargs)
-
-     def copy_with_df(self, df):
-         res = super().copy_with_df(df)
-         res.client = self.client
-         return res
-
-     def apply_geo(self, shape_filename_path, *args, drop_geo=True, **kwargs):
-         """
-         Read in a shape file and return the _read file with geo data joined.
-         """
-         # TODO: fix this so that it can work with non local files
-         # with self.storage.open(self.shape_filename_path) as f:
-         #     shape_df = dgpd.read_file(f, npartitions=1)
-
-         shape_df = dgpd.read_file(shape_filename_path, npartitions=1)
-
-         # for situations where the columns in the source data are different.
-         lon_col = kwargs.get("geo_lon_col", "longitude")
-         lat_col = kwargs.get("geo_lat_col", "latitude")
-
-         df_columns = self.df.columns.tolist()
-         if lat_col not in df_columns or lon_col not in df_columns:
-             logger.warning("Invalid shape file provided")
-             # temp until we decide on handling, i.e don't return full data if it fails.
-             return self.copy_with_df(dd.DataFrame.from_dict({}, npartitions=1))
-
-         df = self.df.copy()
-
-         # convert read df to geo
-         df["geometry"] = dgpd.points_from_xy(df, lon_col, lat_col)
-         df = dgpd.from_dask_dataframe(df)
-
-         # Make sure they're using the same projection reference
-         df.crs = shape_df.crs
-
-         # join the datasets, matching `geometry` to points within the shape df
-         df = df.sjoin(shape_df, how="inner")
-
-         if drop_geo:
-             df = df.drop(shape_df.columns.tolist() + ["index_right"], axis=1)
-
-         return self.copy_with_df(df)
-
-     def apply_sql(self, sql):
-         df = self.df.copy()
-         try:
-             # Initially this was the filename, but some filenames are invalid for the table,
-             # is it ok to call it the same name all the time? Mapped to DaskDataTable in case
-             # we need to change this.
-             self.sql_context.create_table("DaskDataTable", self.df)
-             formatted_sql = sql.replace(self.sql_table_name, "DaskDataTable")
-
-             self.pre_sql_columns.extend(df.columns)
-
-             # dask expects the columns to be lower case, which won't match some data
-             df = self.sql_context.sql(
-                 formatted_sql,
-                 config_options={"sql.identifier.case_sensitive": False},
-             )
-             # which means we then need to map the columns back to the original
-             # and allow for any aggregations to be retained
-             validated_columns = []
-             for v in df.columns:
-                 pre = False
-                 for x in self.pre_sql_columns:
-                     if v.lower() == x.lower():
-                         validated_columns.append(x)
-                         pre = True
-
-                 if not pre:
-                     validated_columns.append(v)
-             df.columns = validated_columns
-
-             return self.copy_with_df(df)
-         except ParsingException:
-             raise InvalidSQLException
-
-     def join(self, df, table_name):
-         """
-         Creates a secondary table as a sql table in order to allow joins when apply_sql is called.
-         """
-         if table_name in self.table_names:
-             raise RuntimeError(
-                 f"Table name already in use: [{','.join(self.table_names)}]"
-             )
-         self.pre_sql_columns.extend(df.columns)
-         self.sql_context.create_table(table_name, df)
-         self.table_names.append(table_name)
-         return self
-
-     def read_from_dataframe(self):
-         if not isinstance(self.df, dd.DataFrame):
-             self.df = dd.from_pandas(self.df, npartitions=1)
-
-     def as_pandas(self):
-         super().as_pandas()
-         if self.client:
-             return self.client.compute(self.df).result()
-         else:
-             return self.df.compute()
-
-     def read_dict(self, data):
-         self.df = dd.DataFrame.from_dict(data)
-
-     def read_csv(self, *args, **kwargs):
-         # remove standard pandas kwargs which will case an issue in dask.
-         dask_safe_kwargs = kwargs.copy()
-         dask_safe_kwargs.pop("memory_map", None)
-         dask_safe_kwargs.pop("low_memory", None)
-
-         filename_or_buffer = self.filename_or_buffer
-         if isinstance(filename_or_buffer, pathlib.PosixPath):
-             filename_or_buffer = str(self.filename_or_buffer)
-
-         if isinstance(filename_or_buffer, io.TextIOWrapper) or isinstance(
-             filename_or_buffer, io.BufferedReader
-         ):
-             filename_or_buffer = filename_or_buffer.name
-
-         # django files
-         if hasattr(filename_or_buffer, "path"):
-             filename_or_buffer = filename_or_buffer.path
-
-         _, uri = self.storage.get_storage_url(filename_or_buffer, encode_params=False)
-         self.df = dd.read_csv(
-             uri,
-             *args,
-             **dask_safe_kwargs,
-             storage_options=self.storage.get_fsspec_storage_options(),
-         )
-
-     def read_parquet(self, *args, **kwargs):
-         if isinstance(self.filename_or_buffer, str):
-             _, uri = self.storage.get_storage_url(
-                 self.filename_or_buffer, encode_params=False
-             )
-             filename = uri
-             kwargs["storage_options"] = self.storage.get_fsspec_storage_options()
-         else:
-             filename = self.filename_or_buffer
-
-         self.df = dd.read_parquet(
-             filename,
-             *args,
-             **kwargs,
-         )
-
-         # dask-sql doesn't handle categorical columns, but we need to be careful
-         # how we convert them, if an assign is used we will end up stopping
-         # the `Predicate pushdown optimization` within dask-sql from applying the
-         # sql to the read_parquet filters.
-         categories_to_convert = {}
-         for col in self.df.select_dtypes(include="category").columns:
-             categories_to_convert[col] = self.df[col].dtype.categories.dtype
-         self.df = self.df.astype(categories_to_convert)
-
-
- class OasisDaskReaderCSV(OasisDaskReader):
-     pass
-
+ from .backends.base import OasisReader
+ from .backends.pandas import OasisPandasReader, OasisPandasReaderCSV, OasisPandasReaderParquet

- class OasisDaskReaderParquet(OasisDaskReader):
+ try:
+     from .backends.dask import OasisDaskReader, OasisDaskReaderCSV, OasisDaskReaderParquet
+ except ModuleNotFoundError as e:
      pass
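
Note: the rewritten reader.py makes the dask backend optional: the base and pandas readers are always importable, while the dask readers are only defined when the 'extra' dependency set (pip install oasis-data-manager[extra]) is installed, since the ModuleNotFoundError is swallowed. A small sketch of how calling code might guard for that; the fallback behaviour is illustrative:

    from oasis_data_manager.df_reader import reader

    # always available
    pandas_reader_cls = reader.OasisPandasReader

    # only defined when dask, dask-sql and related extras are installed
    dask_reader_cls = getattr(reader, "OasisDaskReader", None)
    if dask_reader_cls is None:
        print("dask backend unavailable, falling back to the pandas reader")
        dask_reader_cls = pandas_reader_cls
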
@@ -1,8 +1,13 @@
  import json
  import os
- from typing import Optional, Tuple, TypedDict, Union
-
- from typing_extensions import NotRequired
+ import sys
+
+ if sys.version_info >= (3, 8):
+     from typing import Optional, Tuple, TypedDict, Union
+     from typing_extensions import NotRequired
+ else:
+     from typing import Optional, Tuple, Union
+     from typing_extensions import NotRequired, TypedDict

  from oasis_data_manager.config import ConfigError, load_class
  from oasis_data_manager.filestore.backends.base import BaseStorage
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: oasis-data-manager
- Version: 0.1.0rc1
+ Version: 0.1.1
  Summary: UNKNOWN
  Home-page: https://github.com/OasisLMF/OasisDataManager
  Author: Oasis LMF
@@ -15,18 +15,21 @@ Classifier: Programming Language :: Python :: 3.6
  Requires-Python: >=3.6
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: pandas
- Requires-Dist: geopandas
- Requires-Dist: dask
- Requires-Dist: dask-geopandas
- Requires-Dist: dask-sql
- Requires-Dist: distributed
  Requires-Dist: fastparquet
- Requires-Dist: pyogrio
  Requires-Dist: fsspec
- Requires-Dist: boto3
- Requires-Dist: s3fs >=2023.9.0
- Requires-Dist: adlfs
+ Requires-Dist: pandas
+ Requires-Dist: typing
+ Requires-Dist: typing-extensions
+ Provides-Extra: extra
+ Requires-Dist: adlfs ; extra == 'extra'
+ Requires-Dist: boto3 ; extra == 'extra'
+ Requires-Dist: dask ; extra == 'extra'
+ Requires-Dist: dask-geopandas ; extra == 'extra'
+ Requires-Dist: dask-sql ; extra == 'extra'
+ Requires-Dist: distributed ; extra == 'extra'
+ Requires-Dist: geopandas ; extra == 'extra'
+ Requires-Dist: pyogrio ; extra == 'extra'
+ Requires-Dist: s3fs >=2023.9.0 ; extra == 'extra'

  UNKNOWN

@@ -1,15 +1,19 @@
- oasis_data_manager/__init__.py,sha256=6V73sh1J5lwvlLxrQtds_32hB-EbuKlbhZt2XfigNYo,25
+ oasis_data_manager/__init__.py,sha256=ls1camlIoMxEZz9gSkZ1OJo-MXqHWwKPtdPbZJmwp7E,22
  oasis_data_manager/config.py,sha256=_qx2Mu5n0Jx3W5SKCiqLr1SPdWLrbFv_B82r6Eosp_k,534
  oasis_data_manager/complex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  oasis_data_manager/complex/complex.py,sha256=8oomV9WyLsa8sz8aMzlwv4naKCGOL3UdSlYQJxUFqCk,5382
  oasis_data_manager/complex/examples.py,sha256=HlwOzJ2SVF9yE7ei9d2HWglUkYApiyQxwm8WiL84wdY,1220
  oasis_data_manager/df_reader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- oasis_data_manager/df_reader/config.py,sha256=jr8uxd2JAqKpz5YJ_P4kFm8QtuZJj5aCAXC7WvjQt_g,2355
- oasis_data_manager/df_reader/exceptions.py,sha256=P2npfQ4NUCBaoO0AXRhmRJkTK8P_9TwtlPTnLi-BhbA,47
- oasis_data_manager/df_reader/reader.py,sha256=E3QQSiXeCxmsBllYyeimo-alfIhLJIaxzKZbCLTBdDM,13333
+ oasis_data_manager/df_reader/config.py,sha256=2xwWg5b6dnERUgQYWBjt2W64GdYUhXTtXCgrSPP60Mg,2507
+ oasis_data_manager/df_reader/exceptions.py,sha256=9FV8n2eqrkTGpEt47GGs5k0eon2Y-Xz5K3wyc1R9fBs,102
+ oasis_data_manager/df_reader/reader.py,sha256=14wuGTBKnIRslDMXsA3QjBiuvzcweRi29nM-V46pmLE,597
+ oasis_data_manager/df_reader/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ oasis_data_manager/df_reader/backends/base.py,sha256=cwvHzncinHfwwHW5hIjMQZ6KCVECxE3NixPT5SX7s-M,3057
+ oasis_data_manager/df_reader/backends/dask.py,sha256=fE2vEAd5y7haUzi0oEUBE_4D5UL0jkmKyaAEaWnuFC4,6975
+ oasis_data_manager/df_reader/backends/pandas.py,sha256=yTfULUunn0JLUqa_P5nazkGF6g6omaUY0litnPFITD4,3656
  oasis_data_manager/errors/__init__.py,sha256=9q_7nk5DNg1-WfQoBM4kw_Us34Y2szNkZwfE5-6_Rg0,687
  oasis_data_manager/filestore/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- oasis_data_manager/filestore/config.py,sha256=CihInw_F8ZgQDpwavMbmcbvvorMjJl4QAsd1qJUyzpY,3466
+ oasis_data_manager/filestore/config.py,sha256=N0hSfPpRpej7uWGV54MEEH_0vvdpcgo3fZlx6m2muoY,3625
  oasis_data_manager/filestore/filestore.py,sha256=eaQGAer7Q9KM4B3bq9WmZAtjFdj9aRef_E3rI2i0dOk,2615
  oasis_data_manager/filestore/log.py,sha256=8l54LoOJiOG2pr4o93LzMocjH7dHcsOp14JWJ_MrqHQ,693
  oasis_data_manager/filestore/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
21
  oasis_data_manager/filestore/backends/azure_abfs.py,sha256=Lo2aBBQsFPJJEb9svm1-W43Gz2g4LCaHlM-9mVXDhzg,5354
18
22
  oasis_data_manager/filestore/backends/base.py,sha256=aj__0KsmnVbXTHYSA_qlrIe5pxImSZ14dPMzskdNzJc,12971
19
23
  oasis_data_manager/filestore/backends/local.py,sha256=MEX_CvwhsDfv9lvBjc8CdaDXaN53l9onQHmOgKjoJcg,1242
20
- oasis_data_manager-0.1.0rc1.dist-info/LICENSE,sha256=qr-PXl5mSpeUk-A7RzYcH0dhR93hhgVK8SW9mzco0Ao,1517
21
- oasis_data_manager-0.1.0rc1.dist-info/METADATA,sha256=3w2fJ2Mu3gI2r4B1Md5TbPRIDId694tyLAduDFnVi_Q,853
22
- oasis_data_manager-0.1.0rc1.dist-info/WHEEL,sha256=-G_t0oGuE7UD0DrSpVZnq1hHMBV9DD2XkS5v7XpmTnk,110
23
- oasis_data_manager-0.1.0rc1.dist-info/top_level.txt,sha256=qMC39T9UvDCPbNJLVtgu8h6f7c4KJYel7SnIpz62wsU,19
24
- oasis_data_manager-0.1.0rc1.dist-info/RECORD,,
24
+ oasis_data_manager-0.1.1.dist-info/LICENSE,sha256=qr-PXl5mSpeUk-A7RzYcH0dhR93hhgVK8SW9mzco0Ao,1517
25
+ oasis_data_manager-0.1.1.dist-info/METADATA,sha256=UoaRd22Lz2O_UNdRPGCXf-tOBf_TPvEQoFoqVrLFK18,1098
26
+ oasis_data_manager-0.1.1.dist-info/WHEEL,sha256=-G_t0oGuE7UD0DrSpVZnq1hHMBV9DD2XkS5v7XpmTnk,110
27
+ oasis_data_manager-0.1.1.dist-info/top_level.txt,sha256=qMC39T9UvDCPbNJLVtgu8h6f7c4KJYel7SnIpz62wsU,19
28
+ oasis_data_manager-0.1.1.dist-info/RECORD,,