oasis-data-manager 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. oasis_data_manager/__init__.py +1 -0
  2. oasis_data_manager/complex/__init__.py +0 -0
  3. oasis_data_manager/complex/complex.py +173 -0
  4. oasis_data_manager/complex/examples.py +40 -0
  5. oasis_data_manager/config.py +36 -0
  6. oasis_data_manager/df_reader/__init__.py +0 -0
  7. oasis_data_manager/df_reader/backends/__init__.py +0 -0
  8. oasis_data_manager/df_reader/backends/base.py +119 -0
  9. oasis_data_manager/df_reader/backends/dask.py +191 -0
  10. oasis_data_manager/df_reader/backends/pandas.py +85 -0
  11. oasis_data_manager/df_reader/backends/pyarrow.py +65 -0
  12. oasis_data_manager/df_reader/config.py +60 -0
  13. oasis_data_manager/df_reader/exceptions.py +6 -0
  14. oasis_data_manager/df_reader/reader.py +26 -0
  15. oasis_data_manager/errors/__init__.py +24 -0
  16. oasis_data_manager/filestore/__init__.py +0 -0
  17. oasis_data_manager/filestore/backends/__init__.py +0 -0
  18. oasis_data_manager/filestore/backends/aws.py +305 -0
  19. oasis_data_manager/filestore/backends/aws_s3.py +15 -0
  20. oasis_data_manager/filestore/backends/azure.py +139 -0
  21. oasis_data_manager/filestore/backends/azure_abfs.py +15 -0
  22. oasis_data_manager/filestore/backends/base.py +478 -0
  23. oasis_data_manager/filestore/backends/local.py +44 -0
  24. oasis_data_manager/filestore/config.py +106 -0
  25. oasis_data_manager/filestore/filestore.py +94 -0
  26. oasis_data_manager/filestore/log.py +22 -0
  27. oasis_data_manager-0.2.3.dist-info/METADATA +410 -0
  28. oasis_data_manager-0.2.3.dist-info/RECORD +30 -0
  29. oasis_data_manager-0.2.3.dist-info/WHEEL +5 -0
  30. oasis_data_manager-0.2.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1 @@
1
+ __version__ = '0.2.3'
File without changes
@@ -0,0 +1,173 @@
1
+ import logging
2
+ import pathlib
3
+ from io import BytesIO
4
+ from typing import List, Type
5
+
6
+ import httpcore
7
+ import httpx
8
+ import pandas as pd
9
+
10
+ from oasis_data_manager.df_reader.config import clean_config, get_df_reader
11
+ from oasis_data_manager.df_reader.reader import OasisReader
12
+ from oasis_data_manager.filestore.backends.local import LocalStorage
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class Adjustment:
    """
    Base class for a post-processing step applied to a dataframe after the data
    has been fetched and, where applicable, filtered by SQL.

    Subclasses override ``apply``; the implementation here is a no-op.
    """

    @classmethod
    def apply(cls, df):
        """
        Return ``df`` unchanged.
        """
        return df
25
+
26
+
27
class ComplexData:
    """
    Template for turning a data source into an ``OasisReader``.

    ``run()`` drives the process: optionally ``fetch()`` the raw data, wrap it in a
    reader (``to_reader``), apply the SQL returned by ``get_sql()`` and finally apply
    the configured ``adjustments``. Subclasses customise the class attributes below
    and override the hooks they need.
    """

    # Adjustment classes whose ``apply`` is run, in order, by ``adjust``.
    adjustments: List[Type[Adjustment]] = []
    # File to read; takes precedence over ``url`` when no fetch result is available.
    filename: str = ""
    url: str = ""
    # Whether ``run()`` should call ``fetch()``; ``run()`` may switch this off.
    fetch_required: bool = True
    # SQL passed to the reader's ``sql()`` method by ``run()`` (skipped when empty).
    sql: str = ""

    def __init__(self, storage=None):
        """
        Store the storage backend, defaulting to a ``LocalStorage()`` when a falsy
        value is given.
        """
        if not storage:
            storage = LocalStorage()
        self.storage = storage

    def fetch(self):
        """
        Hook returning the raw data; must be implemented by subclasses that need it.
        """
        raise NotImplementedError

    def get_sql(self):
        """
        Return the SQL to apply to the reader (``self.sql`` by default).
        """
        return self.sql

    def adjust(self, reader) -> OasisReader:
        """
        Hook to apply any adjustments.

        TODO adjustments are filters? Functions run on the readers df i.e
        apply, should be change filter to apply then or is that confusing with pandas?
        """
        if self.adjustments:
            return reader.filter([a.apply for a in self.adjustments])
        return reader

    def to_dataframe(self, result) -> pd.DataFrame:
        """
        Hook to allow conversion of the fetch() to a dataframe, in the case of the
        RestComplexData class for example, the result is json, so can be fed
        directly to a dataframe and wrapped into our df_reader.
        """
        return pd.DataFrame(result)

    def get_df_reader(self, filepath, **kwargs) -> OasisReader:
        """
        Build an ``OasisDaskReader`` for ``filepath`` that uses ``self.storage``.

        Extra keyword arguments are forwarded to ``get_df_reader``.
        """
        df_reader_config = clean_config(
            {"filepath": filepath, "engine": "OasisDaskReader"}
        )
        df_reader_config["engine"]["options"]["storage"] = self.storage

        return get_df_reader(df_reader_config, **kwargs)

    def to_reader(self, fetch_result) -> OasisReader:
        """
        Wrap ``fetch_result`` in a reader, or let the reader load the file itself.

        ``None`` means "nothing was fetched". Any other value, including an empty
        list/dict or a dataframe, is a real result and is converted with
        ``to_dataframe``. A truthiness test here would wrongly discard empty results
        and would raise for dataframe/array results.
        """
        if fetch_result is not None:
            # When the result has been fetched, apply to_dataframe and pass directly into the df_reader
            df = self.to_dataframe(fetch_result)
            return self.get_df_reader(
                None,  # TODO - None value instead of filename_or_buffer, improve this.
                dataframe=df,
                has_read=True,
            )

        # Not fetched, let df_reader fetch as normal.
        return self.get_df_reader(self.filename or self.url)

    def run(self):
        """
        Execute the full pipeline and return the resulting reader.

        Note that this updates ``self.fetch_required`` on the instance when a
        filename is set.
        """
        # CSV and Parquet files are read directly by the df_reader, so fetch() is not needed.
        # Only formats the df_reader can't handle (e.g. custom binary formats) require fetch().
        if self.fetch_required and self.filename:
            extension = pathlib.Path(self.filename).suffix
            self.fetch_required = extension not in [".parquet", ".pq", ".csv"]

        fetch_result = None
        if self.fetch_required:
            fetch_result = self.fetch()

        reader = self.to_reader(fetch_result)

        sql = self.get_sql()
        if hasattr(reader, "sql") and sql:
            reader = reader.sql(sql)

        reader = self.adjust(reader)

        # TODO store file? return for now
        return reader
106
+
107
+
108
class FileStoreComplexData(ComplexData):
    """
    Complex data whose raw content is read from ``self.storage``.
    """

    def to_dataframe(self, result) -> pd.DataFrame:
        """
        Convert the fetched content to a dataframe.

        This is only called for file types the df_reader does not handle, so the
        conversion is always custom and must be supplied by the subclass.
        """
        raise NotImplementedError

    def fetch(self):
        """
        Read ``self.filename`` from storage in binary mode and return the content
        wrapped in a ``BytesIO``.
        """
        with self.storage.open(self.filename, "rb") as handle:
            return BytesIO(handle.read())
119
+
120
+
121
class RestComplexData(ComplexData):
    """
    Complex data fetched over HTTP with ``httpx``.

    ``fetch()`` issues a POST when ``get_post_json()`` returns a truthy payload and
    a GET otherwise. Only a 200 response is passed to ``handle_response``; any other
    status, and any exception listed in ``exceptions``, goes to ``handle_error``.
    """

    # Exceptions raised by the request that are routed to ``handle_error``.
    exceptions = (
        httpx.RequestError,
        httpcore.ReadTimeout,
        httpcore.ConnectTimeout,
        httpcore.ConnectError,
    )

    url: str
    # Passed as ``timeout`` to the httpx request.
    timeout: int = 10

    def handle_error(self, exception, **kwargs):
        """
        Log a warning carrying ``exception`` and ``self.url`` and return ``None``.

        Extra keyword arguments are accepted but not used here.
        """
        log_context = {"exception": exception, "uri": self.url}
        logger.warning("Exception in complex data call", extra=log_context)
        return None

    def handle_response(self, response) -> dict:
        """
        Return the decoded JSON body of ``response``.
        """
        return response.json()

    def get_uri(self) -> str:
        """
        Return the URI to request (``self.url`` by default).
        """
        return self.url

    def get_headers(self) -> dict:
        """
        Return the request headers (none by default).
        """
        return {}

    def get_post_json(self) -> dict:
        """
        Return the JSON payload to POST; an empty payload results in a GET.
        """
        return {}

    def fetch(self):
        """
        Perform the request and return the handled response, or the result of
        ``handle_error`` when the request fails or the status is not 200.
        """
        uri = self.get_uri()
        headers = self.get_headers()
        payload = self.get_post_json()
        timeout = self.timeout

        try:
            if payload:
                response = httpx.post(
                    uri, json=payload, headers=headers, timeout=timeout
                )
            else:
                response = httpx.get(uri, headers=headers, timeout=timeout)

            if response.status_code != 200:
                return self.handle_error(
                    exception="Unexpected status in complex data call",
                    post_json=payload,
                )
            return self.handle_response(response)
        except self.exceptions as exc:
            return self.handle_error(exception=str(exc), post_json=payload)
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env python
2
+
3
+ import h5py
4
+ import pandas as pd
5
+
6
+ from oasis_data_manager.complex.complex import Adjustment, FileStoreComplexData
7
+ from oasis_data_manager.df_reader.reader import OasisDaskReader
8
+ from oasis_data_manager.filestore.backends.local import LocalStorage
9
+
10
+
11
class AddColAdjustment(Adjustment):
    """
    Example adjustment that sets a column named ``"else"`` to ``"test"``.
    """

    @classmethod
    def apply(cls, df):
        """
        Assign the constant column on ``df`` and return the same object.
        """
        df["else"] = "test"
        return df
16
+
17
+
18
class FloodDataExample(FileStoreComplexData):
    """
    Example that loads selected datasets from an HDF5 file into a dataframe.
    """

    # Note this file needs to be sourced. TODO - is this public? If not is there a public/file we can include?
    filename = "tropical_cyclone_10synth_tracks_150arcsec_rcp26_KNA_2080.hdf5"
    adjustments = [AddColAdjustment]
    sql = "SELECT * FROM table WHERE event_id > 3000"

    def to_dataframe(self, result) -> pd.DataFrame:
        """
        Build a dataframe from the ``event_id``, ``event_name``, ``date``,
        ``frequency`` and ``orig`` datasets of the HDF5 content in ``result``.

        The HDF5 file is opened in a ``with`` block so its handle is released once
        the values have been copied into lists.
        """
        with h5py.File(result) as hdf:
            df = pd.DataFrame(list(hdf["event_id"]), columns=["event_id"])
            for m in ["event_name", "date", "frequency", "orig"]:
                df[m] = list(hdf[m])

        # reset_index() adds the previous index as a regular column.
        return df.reset_index()
33
+
34
+
35
if __name__ == "__main__":
    # Run the example with a LocalStorage created for "/tmp" (presumably its root
    # directory - confirm against LocalStorage) and print the resulting dataframe.
    test_storage = LocalStorage("/tmp")

    result = FloodDataExample(storage=test_storage).run()
    print(result.as_pandas())
@@ -0,0 +1,36 @@
1
+ import importlib
2
+
3
# Short names accepted by ``load_class`` in place of a full dotted class path.
_ALIASES = {
    # Storage backends
    "LocalStorage": "oasis_data_manager.filestore.backends.local.LocalStorage",
    "AwsS3Storage": "oasis_data_manager.filestore.backends.aws.AwsS3Storage",
    "AzureABFSStorage": "oasis_data_manager.filestore.backends.azure.AzureABFSStorage",
    # Reader backends
    "OasisPandasReader": "oasis_data_manager.df_reader.backends.pandas.OasisPandasReader",
    "OasisPandasReaderCSV": "oasis_data_manager.df_reader.backends.pandas.OasisPandasReaderCSV",
    "OasisPandasReaderParquet": "oasis_data_manager.df_reader.backends.pandas.OasisPandasReaderParquet",
    "OasisDaskReader": "oasis_data_manager.df_reader.backends.dask.OasisDaskReader",
    "OasisDaskReaderCSV": "oasis_data_manager.df_reader.backends.dask.OasisDaskReaderCSV",
    "OasisDaskReaderParquet": "oasis_data_manager.df_reader.backends.dask.OasisDaskReaderParquet",
    "OasisPyarrowReader": "oasis_data_manager.df_reader.backends.pyarrow.OasisPyarrowReader",
}


class ConfigError(Exception):
    """
    Raised when a configured class path is malformed or resolves to an unsuitable object.
    """


def load_class(path, base=None):
    """
    Resolve ``path`` to a class and return it.

    :param path: a known alias (see ``_ALIASES``) or a dotted ``module.ClassName`` path.
    :param base: optional class the resolved class must be a subclass of.

    :raises ConfigError: if ``path`` is not of the form ``module.ClassName``, or if
        ``base`` is given and the resolved object is not a subclass of it (this
        includes objects that are not classes at all).

    Errors from importing the module or looking up the attribute
    (``ImportError`` / ``AttributeError``) propagate unchanged.
    """
    path = _ALIASES.get(path, path)

    module_path, _, cls_name = path.rpartition(".")
    # Both halves must be non-empty: "Name", ".Name" and "module." are all invalid.
    if not module_path or not cls_name:
        raise ConfigError(f"'{path}' is not a valid class path (expected 'module.ClassName' or a known alias)")

    module = importlib.import_module(module_path)
    cls = getattr(module, cls_name)

    # issubclass() raises TypeError for non-classes, so check that first and report
    # the problem as a configuration error instead.
    if base and not (isinstance(cls, type) and issubclass(cls, base)):
        found_name = getattr(cls, "__name__", cls_name)
        raise ConfigError(f"'{found_name}' does not extend '{base.__name__}'")

    return cls
File without changes
File without changes
@@ -0,0 +1,119 @@
1
+ import pathlib
2
+ from typing import Iterable
3
+
4
+ from ...filestore.backends.base import BaseStorage
5
+
6
+
7
class OasisReader:
    """
    Base reader.

    as_pandas(), sql() & filter() can all be chained with self.has_read controlling whether the base
    read (read_csv/read_parquet) needs to be triggered. This is because in the case of spark
    we need to read differently depending on if the intention is to do sql or filter.

    Subclasses provide ``read_csv`` and ``read_parquet`` (which are expected to set
    ``self.df``) and, for ``sql()`` support, an ``apply_sql`` method.
    """

    def __init__(
        self,
        filename_or_buffer,
        storage: BaseStorage,
        *args,
        dataframe=None,
        has_read=False,
        **kwargs,
    ):
        """
        :param filename_or_buffer: path or file-like object to read lazily; may be
            falsy when a dataframe is supplied instead.
        :param storage: storage backend kept on the instance for subclasses to use.
        :param args: positional arguments forwarded to ``read_csv``/``read_parquet``.
        :param dataframe: already-loaded data to wrap.
        :param has_read: ``True`` when no read from ``filename_or_buffer`` is needed.
        :param kwargs: keyword arguments forwarded to ``read_csv``/``read_parquet``.

        :raises RuntimeError: if neither a source nor pre-read data is supplied.
        """
        self.filename_or_buffer = filename_or_buffer
        self.storage = storage
        self._df = dataframe
        self.has_read = has_read
        self.reader_args = args
        self.reader_kwargs = kwargs

        if not filename_or_buffer:
            if dataframe is None and not has_read:
                raise RuntimeError(
                    "Reader must be initialised with either a "
                    "filename_or_buffer or by passing a dataframe "
                    "and has_read=True"
                )
            # Give subclasses the chance to convert the supplied dataframe.
            self.read_from_dataframe()

        if (
            filename_or_buffer
            and isinstance(self.filename_or_buffer, str)
            and self.filename_or_buffer.lower().endswith(".zip")
        ):
            self.reader_kwargs["compression"] = "zip"

    @property
    def df(self):
        """
        The underlying dataframe; triggers the read if it has not happened yet.
        """
        self._read()
        return self._df

    @df.setter
    def df(self, other):
        self._df = other

    def read_csv(self, *args, **kwargs):
        """
        Load CSV data into ``self.df``; implemented by subclasses.
        """
        raise NotImplementedError()

    def read_parquet(self, *args, **kwargs):
        """
        Load parquet data into ``self.df``; implemented by subclasses.
        """
        raise NotImplementedError()

    def _read(self):
        """
        Perform the base read once, choosing parquet or CSV from the source name.

        The source is treated as parquet when any component of its path ends with
        ``.parquet`` or ``.pq``. Anything else is read as CSV, including buffers
        that expose no usable name.
        """
        if not self.has_read:
            source = self.filename_or_buffer
            name = source.name if hasattr(source, "name") else source
            try:
                parts = pathlib.Path(name).parts
            except TypeError:
                # Nameless buffers (e.g. BytesIO) or non-path names (e.g. a file
                # descriptor) carry no extension to inspect; fall back to CSV.
                parts = ()

            is_parquet = any(
                part.endswith((".parquet", ".pq")) for part in parts
            )

            # Set has_read before calling read to prevent re-entrant calls (e.g. Dask
            # readers access self.df internally during read). Reset on failure so the
            # read can be retried.
            self.has_read = True
            try:
                if is_parquet:
                    self.read_parquet(*self.reader_args, **self.reader_kwargs)
                else:
                    # assume the file is csv if not parquet
                    self.read_csv(*self.reader_args, **self.reader_kwargs)
            except Exception:
                self.has_read = False
                raise

        return self

    def copy_with_df(self, df):
        """
        Return a new reader of the same type wrapping ``df``.

        The source, storage and ``has_read`` flag are carried over; the extra
        reader args/kwargs are not.
        """
        return type(self)(
            self.filename_or_buffer, self.storage, dataframe=df, has_read=self.has_read
        )

    def filter(self, filters):
        """
        Apply one callable, or an iterable of callables, to the dataframe in order
        and return a new reader holding the result.
        """
        self._read()

        df = self._df
        for df_filter in filters if isinstance(filters, Iterable) else [filters]:
            df = df_filter(df)

        return self.copy_with_df(df)

    def sql(self, sql):
        """
        Return the result of ``apply_sql(sql)``, or ``self`` when ``sql`` is empty.
        """
        if sql:
            self._read()
            return self.apply_sql(sql)
        return self

    def query(self, fn):
        """
        Return ``fn`` applied to the (read) dataframe.
        """
        return fn(self.df)

    def as_pandas(self):
        """
        Trigger the read if needed and return the underlying dataframe.
        """
        self._read()
        return self._df

    def read_from_dataframe(self):
        """
        Hook called when the reader is built without a source; no-op by default.
        """
        pass
@@ -0,0 +1,191 @@
1
+ import io
2
+ import pathlib
3
+ import logging
4
+
5
+ import dask
6
+ import dask_geopandas as dgpd
7
+ from dask import dataframe as dd
8
+ from dask_sql import Context
9
+ from dask_sql.utils import ParsingException
10
+ from distributed import Client
11
+
12
+ from ..exceptions import InvalidSQLException
13
+ from .base import OasisReader
14
+
15
+ dask.config.set(
16
+ {"dataframe.convert-string": False}
17
+ ) # allows dask sql to support pyarrow
18
+ logger = logging.getLogger("oasis_data_manager.df_reader.reader")
19
+
20
+
21
class OasisDaskReader(OasisReader):
    """
    Reader backed by dask dataframes, with SQL support through a dask-sql ``Context``.
    """

    # Name callers use in their SQL to refer to this reader's data.
    sql_table_name = "table"

    def __init__(self, *args, client_address=None, **kwargs):
        """
        Set up the SQL context and, when ``client_address`` is given, a
        ``distributed.Client`` for that address (not registered as the default
        client). Remaining arguments are passed to ``OasisReader``.
        """
        if client_address:
            self.client = Client(client_address, set_as_default=False)
        else:
            self.client = None

        self.sql_context = Context()
        self.table_names = [self.sql_table_name]
        self.pre_sql_columns = []

        super().__init__(*args, **kwargs)

    def copy_with_df(self, df):
        """
        Return a new reader wrapping ``df`` (converted to a single-partition dask
        dataframe if necessary) that shares this reader's client.
        """
        if not isinstance(df, dd.DataFrame):
            df = dd.from_pandas(df, npartitions=1)
        res = super().copy_with_df(df)
        res.client = self.client
        return res

    def apply_geo(self, shape_filename_path, *args, drop_geo=True, **kwargs):
        """
        Read in a shape file and return the _read file with geo data joined.

        The longitude/latitude column names default to ``longitude``/``latitude``
        and can be overridden with the ``geo_lon_col``/``geo_lat_col`` keyword
        arguments. If either column is missing from the data, a warning is logged
        and a reader over an empty dataframe is returned. With ``drop_geo`` the
        shape file's columns and ``index_right`` are dropped from the result.
        """
        # TODO: fix this so that it can work with non local files
        # with self.storage.open(self.shape_filename_path) as f:
        #     shape_df = dgpd.read_file(f, npartitions=1)

        shape_df = dgpd.read_file(shape_filename_path, npartitions=1)

        # for situations where the columns in the source data are different.
        lon_col = kwargs.get("geo_lon_col", "longitude")
        lat_col = kwargs.get("geo_lat_col", "latitude")

        df_columns = self.df.columns.tolist()
        if lat_col not in df_columns or lon_col not in df_columns:
            logger.warning("Invalid shape file provided")
            # temp until we decide on handling, i.e don't return full data if it fails.
            return self.copy_with_df(dd.DataFrame.from_dict({}, npartitions=1))

        df = self.df.copy()

        # convert read df to geo
        df["geometry"] = dgpd.points_from_xy(df, lon_col, lat_col)
        df = dgpd.from_dask_dataframe(df)

        # Make sure they're using the same projection reference
        df.crs = shape_df.crs

        # join the datasets, matching `geometry` to points within the shape df
        df = df.sjoin(shape_df, how="inner")

        if drop_geo:
            df = df.drop(shape_df.columns.tolist() + ["index_right"], axis=1)

        return self.copy_with_df(df)

    def apply_sql(self, sql):
        """
        Run ``sql`` against this reader's data and return a new reader with the result.

        References to ``sql_table_name`` are rewritten to the internal table name.
        Only whole-word occurrences are rewritten, so identifiers that merely
        contain the name (for example ``table_id`` or ``my_table``) are left intact.

        :raises InvalidSQLException: if dask-sql cannot parse the statement.
        """
        import re  # only needed here; kept local to this method

        try:
            # Initially this was the filename, but some filenames are invalid for the table,
            # is it ok to call it the same name all the time? Mapped to DaskDataTable in case
            # we need to change this.
            self.sql_context.create_table("DaskDataTable", self.df)
            table_ref = re.compile(rf"\b{re.escape(self.sql_table_name)}\b")
            formatted_sql = table_ref.sub("DaskDataTable", sql)

            # Combine columns from join() tables with current df columns for case restoration
            col_map = {}
            for col in list(self.pre_sql_columns) + list(self.df.columns):
                col_map.setdefault(col.lower(), col)

            # dask expects the columns to be lower case, which won't match some data
            df = self.sql_context.sql(
                formatted_sql,
                config_options={"sql.identifier.case_sensitive": False},
            )
            # which means we then need to map the columns back to the original
            # and allow for any aggregations to be retained
            df.columns = [col_map.get(v.lower(), v) for v in df.columns]

            return self.copy_with_df(df)
        except ParsingException as err:
            raise InvalidSQLException from err

    def join(self, df, table_name):
        """
        Creates a secondary table as a sql table in order to allow joins when apply_sql is called.

        :raises RuntimeError: if ``table_name`` is already registered on this reader.
        """
        if table_name in self.table_names:
            raise RuntimeError(
                f"Table name already in use: [{','.join(self.table_names)}]"
            )
        self.pre_sql_columns.extend(df.columns)
        self.sql_context.create_table(table_name, df)
        self.table_names.append(table_name)
        return self

    def read_from_dataframe(self):
        """
        Convert a supplied non-dask dataframe to a single-partition dask dataframe.
        """
        if not isinstance(self.df, dd.DataFrame):
            self.df = dd.from_pandas(self.df, npartitions=1)

    def as_pandas(self):
        """
        Compute the dask dataframe, through the client when one is configured, and
        return the computed result.
        """
        super().as_pandas()
        if self.client:
            return self.client.compute(self.df).result()
        else:
            return self.df.compute()

    def read_dict(self, data):
        """
        Replace the dataframe with one built from ``data``.
        """
        self.df = dd.DataFrame.from_dict(data)

    def read_csv(self, *args, **kwargs):
        """
        Read the source as CSV into a dask dataframe.

        Path objects, open file objects and objects exposing a ``path`` attribute
        are reduced to a path, which is then resolved to a URI by the storage backend.
        """
        # remove standard pandas kwargs which will cause an issue in dask.
        dask_safe_kwargs = kwargs.copy()
        dask_safe_kwargs.pop("memory_map", None)
        dask_safe_kwargs.pop("low_memory", None)

        filename_or_buffer = self.filename_or_buffer
        # PurePath covers every pathlib flavour (PosixPath and WindowsPath alike).
        if isinstance(filename_or_buffer, pathlib.PurePath):
            filename_or_buffer = str(self.filename_or_buffer)

        if isinstance(filename_or_buffer, (io.TextIOWrapper, io.BufferedReader)):
            filename_or_buffer = filename_or_buffer.name

        # django files
        if hasattr(filename_or_buffer, "path"):
            filename_or_buffer = filename_or_buffer.path

        _, uri = self.storage.get_storage_url(filename_or_buffer, encode_params=False)
        self.df = dd.read_csv(
            uri,
            *args,
            **dask_safe_kwargs,
            storage_options=self.storage.get_fsspec_storage_options(),
        )

    def read_parquet(self, *args, **kwargs):
        """
        Read the source as parquet into a dask dataframe.

        String sources are resolved to a URI by the storage backend and read with
        its fsspec options; any other source is handed to dask unchanged.
        """
        if isinstance(self.filename_or_buffer, str):
            _, uri = self.storage.get_storage_url(
                self.filename_or_buffer, encode_params=False
            )
            filename = uri
            kwargs["storage_options"] = self.storage.get_fsspec_storage_options()
        else:
            filename = self.filename_or_buffer

        self.df = dd.read_parquet(
            filename,
            *args,
            **kwargs,
        )

        # dask-sql doesn't handle categorical columns, but we need to be careful
        # how we convert them, if an assign is used we will end up stopping
        # the `Predicate pushdown optimization` within dask-sql from applying the
        # sql to the read_parquet filters.
        categories_to_convert = {}
        for col in self.df.select_dtypes(include="category").columns:
            categories_to_convert[col] = self.df[col].dtype.categories.dtype
        self.df = self.df.astype(categories_to_convert)
184
+
185
+
186
class OasisDaskReaderCSV(OasisDaskReader):
    """
    Named variant of ``OasisDaskReader``; adds no behaviour of its own.
    """
188
+
189
+
190
class OasisDaskReaderParquet(OasisDaskReader):
    """
    Named variant of ``OasisDaskReader``; adds no behaviour of its own.
    """
@@ -0,0 +1,85 @@
1
+ import logging
2
+
3
+ import pandas as pd
4
+
5
+
6
+ try:
7
+ import geopandas as gpd
8
+ except ModuleNotFoundError:
9
+ gpd = None
10
+
11
+ from .base import OasisReader
12
+ from ..exceptions import MissingOptionalDependency
13
+ logger = logging.getLogger("oasis_data_manager.df_reader.reader")
14
+
15
+
16
class OasisPandasReader(OasisReader):
    """
    Reader backed by pandas dataframes.
    """

    def _read_with(self, read_fn, *args, **kwargs):
        """
        Load the source with ``read_fn`` (e.g. ``pd.read_csv``) into ``self.df``.

        String sources that are not http(s) URLs are resolved to a URI by the
        storage backend and read with its fsspec options. Everything else (URLs,
        buffers, path objects) is handed to ``read_fn`` unchanged.
        """
        if isinstance(self.filename_or_buffer, str) and not self.filename_or_buffer.startswith(("http://", "https://")):
            _, uri = self.storage.get_storage_url(
                self.filename_or_buffer, encode_params=False
            )
            self.df = read_fn(
                uri,
                *args,
                **kwargs,
                storage_options=self.storage.get_fsspec_storage_options(),
            )
        else:
            self.df = read_fn(self.filename_or_buffer, *args, **kwargs)

    def read_csv(self, *args, **kwargs):
        """
        Read the source as CSV, dropping ``low_memory`` on pandas 3 and later.
        """
        # Compare the major version numerically: a string comparison would sort
        # "10.0" before "3".
        pandas_major = int(pd.__version__.split(".", 1)[0])
        if pandas_major >= 3:  # remove unsupported options issue https://github.com/OasisLMF/OasisLMF/issues/1896
            kwargs.pop('low_memory', None)
        self._read_with(pd.read_csv, *args, **kwargs)

    def read_parquet(self, *args, **kwargs):
        """
        Read the source as parquet.
        """
        self._read_with(pd.read_parquet, *args, **kwargs)

    def apply_geo(self, shape_filename_path, *args, drop_geo=True, **kwargs):
        """
        Read in a shape file and return the _read file with geo data joined.

        The longitude/latitude column names default to ``longitude``/``latitude``
        and can be overridden with the ``geo_lon_col``/``geo_lat_col`` keyword
        arguments. If either column is missing from the data, a warning is logged
        and a reader over an empty dataframe is returned. With ``drop_geo`` the
        shape file's columns and ``index_right`` are dropped from the result.

        :raises MissingOptionalDependency: if geopandas is not installed.
        """
        # TODO: fix this so that it can work with non local files
        # with self.storage.open(self.shape_filename_path) as f:
        #     shape_df = gpd.read_file(f)

        if gpd is None:
            raise MissingOptionalDependency(
                "Missing optional dependency 'geopandas' for 'apply_geo' method, install package using `pip install oasis-data-manager[extra]`")

        shape_df = gpd.read_file(shape_filename_path)

        # for situations where the columns in the source data are different.
        lon_col = kwargs.get("geo_lon_col", "longitude")
        lat_col = kwargs.get("geo_lat_col", "latitude")

        df_columns = self.df.columns.tolist()
        if lat_col not in df_columns or lon_col not in df_columns:
            logger.warning("Invalid shape file provided")
            # temp until we decide on handling, i.e don't return full data if it fails.
            return self.copy_with_df(pd.DataFrame.from_dict({}))

        # convert read df to geo
        df = gpd.GeoDataFrame(
            self.df, geometry=gpd.points_from_xy(self.df[lon_col], self.df[lat_col])
        )

        # Make sure they're using the same projection reference
        df.crs = shape_df.crs

        # join the datasets, matching `geometry` to points within the shape df
        df = df.sjoin(shape_df, how="inner")

        if drop_geo:
            df = df.drop(shape_df.columns.tolist() + ["index_right"], axis=1)

        return self.copy_with_df(df)
78
+
79
+
80
class OasisPandasReaderCSV(OasisPandasReader):
    """
    Named variant of ``OasisPandasReader``; adds no behaviour of its own.
    """
82
+
83
+
84
class OasisPandasReaderParquet(OasisPandasReader):
    """
    Named variant of ``OasisPandasReader``; adds no behaviour of its own.
    """