PyPI - cog-worker - Versions diffs - 0.3.0__py3-none-any.whl - Mend

cog-worker 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

cog_worker/__init__.py +11 -0
cog_worker/distributed.py +176 -0
cog_worker/manager.py +424 -0
cog_worker/py.typed +0 -0
cog_worker/types.py +56 -0
cog_worker/utils.py +40 -0
cog_worker/worker.py +277 -0
cog_worker-0.3.0.dist-info/LICENSE.txt +21 -0
cog_worker-0.3.0.dist-info/METADATA +188 -0
cog_worker-0.3.0.dist-info/RECORD +12 -0
cog_worker-0.3.0.dist-info/WHEEL +5 -0
cog_worker-0.3.0.dist-info/top_level.txt +1 -0

cog_worker/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""A python module for scalable analysis of Cloud Optimized GeoTIFFs.
+COG Worker is a simple library to help you chunk and run large scale analysis
+on Cloud Optimized GeoTIFFs (COGS).
+"""
+__version__ = "0.3.0"
+from .worker import Worker  # noqa
+from .manager import Manager  # noqa
+from .types import *  # noqa

cog_worker/distributed.py ADDED Viewed

@@ -0,0 +1,176 @@
+"""Distributed processing with Dask.
+The DaskManager class provides an identical interface to the
+:obj:`cog_worker.manager.Manager`, but executes tasks in a
+`Dask cluster <https://distributed.dask.org/>`_, instead of on your local machine.
+Note:
+    cog_worker does not include dask.distributed as a dependency by default.
+    In order to use cog_worker.distributed you must install dask.distributed::
+        pip install dask[distributed]
+Example:
+    Read a COG in chunks and sum the results::
+        from cog_worker.distributed import DaskManager
+        from dask.distributed import Client, LocalCluster
+        def my_analysis(worker):
+            arr = worker.read('example-cog.tif')
+            return arr.sum()
+        cluster = LocalCluster()
+        client = Client(cluster)
+        manager = DaskManager(client)
+        results = manager.chunk_execute(my_analysis)
+        total = sum(results)
+"""
+import logging
+from typing import Any, Iterable, Iterator, Mapping, Tuple, Union
+import dask
+import dask.distributed
+from dask.delayed import Delayed
+from pyproj import Proj
+import cog_worker
+from cog_worker.types import BoundingBox, WorkerFunction
+logger = logging.getLogger(__name__)
+class DaskManager(cog_worker.manager.Manager):
+    """Class for chunking and executing cog_worker functions in a dask cluster.
+    The DaskManager identical to the cog_worker.manager.Manager, except that it
+    executes functions in a Dask cluster instead of locally.
+    """
+    def __init__(
+        self,
+        dask_client: dask.distributed.Client,
+        bounds: BoundingBox = (-180, -85, 180, 85),
+        proj: Union[int, str, Proj] = 3857,
+        scale: float = 10000,
+        buffer: int = 16,
+    ):
+        """Initialize a DaskManager with a dask client.
+        Args:
+            dask_client (dask.distributed.Client): The dask client to use to
+                execute analysis.
+            bounds (BoundingBox): The region to be analyzed.
+            proj (pyproj.Proj, str, int): The projection to analyze in.
+                Generally accepts any proj4 string, WKT projection, or EPSG
+                code. See pyproj.Proj for valid values.
+            scale (float): The pixel size for analysis in the projection's units
+                (usually meters or degrees).
+            buffer (int): When dividing analysis into chunks, the number of
+                additional pixels to read on all sides to avoid edge effects.
+                The ideal buffer size depends on your analysis (e.g. whether you
+                use convolutions or distance functions).
+        """
+        self.client = dask_client
+        super().__init__(bounds, proj, scale, buffer)
+    def execute(
+        self,
+        f: WorkerFunction,
+        f_args: Union[Iterable, None] = None,
+        f_kwargs: Union[Mapping, None] = None,
+        clip: bool = True,
+        compute: bool = True,
+        **kwargs,
+    ) -> Union[Tuple[Any, BoundingBox], Delayed]:
+        """Execute a cog_worker function in the DaskManager's cluster.
+        The execute method is the underlying method for running analysis. By
+        default, it will run the function over the Manager's bounding box in a
+        single chunk.
+        When executing functions, the Manager instantiates a
+        cog_worker.worker.Worker and passes it to the function as its first
+        parameter. The Worker keeps track of the scale, projection, and bounds
+        of its piece of the analysis, which it uses to handle the reading and
+        writing of Cloud Optimized GeoTIFFs.
+        Args:
+            f (:obj:`cog_worker.types.WorkerFunction`): The function to execute. The function will
+                recieve a cog_worker.worker.Worker as its first argument.
+            f_args (list): Additional arguments to pass to the function.
+            f_kwargs (dict): Additional keyword arguments to pass to the
+                function.
+            clip (bool): Whether or not to clip the `buffer` from the completed
+                analysis.
+            compute (bool): Whether or not to compute the chunks immediately.
+            **kwargs: Additional keyword arguments to overload the Manager's
+                properties. (bounds, proj, scale, or buffer)
+        Returns:
+            A tuple containing the return value of the function and the bounding
+            box of the executed analysis in the target projection. Or, if
+            compute is False, a Delayed object.
+        """
+        args = {
+            "f": f,
+            "f_args": f_args,
+            "f_kwargs": f_kwargs,
+            "bounds": self.bounds,
+            "proj": self.proj,
+            "scale": self.scale,
+            "buffer": self.buffer,
+            "clip": clip,
+        }
+        args.update(kwargs)
+        task = dask.delayed(cog_worker.manager._execute)(**args)
+        if compute:
+            future = self.client.compute(task)
+            return future.result()  # type: ignore
+        return task
+    def chunk_execute(
+        self,
+        f: WorkerFunction,
+        f_args: Union[Iterable, None] = None,
+        f_kwargs: Union[Mapping, None] = None,
+        chunksize: int = 512,
+        compute: bool = True,
+    ) -> Union[Iterator[Tuple[Any, BoundingBox]], Iterator[Delayed]]:  # type: ignore
+        """Compute chunks in parallel in the DaskManager's cluster.
+        Chunks will be yielded as they are completed. The order in which they
+        are yielded is not guaranteed.
+        Note:
+            You can estimate the memory requirement of executing a function at a
+            given chunksize as:
+            ``(chunksize + 2*buffer)**2 * number_of_bands_or_arrays * bit_depth``.
+        Args:
+            f (:obj:`cog_worker.types.WorkerFunction`): The function to execute. The function will
+                recieve a cog_worker.worker.Worker as its first argument.
+            f_args (list): Additional arguments to pass to the function.
+            f_kwargs (dict): Additional keyword arguments to pass to the
+                function.
+            chunksize (int): Size of the chunks in pixels (excluding buffer).
+            compute (bool): Whether or not to compute the chunks immediately.
+        Yields:
+            A tuple containing the return value of the function for each chunk
+            and the bounding box of the executed analysis in the target
+            projection. Or, if compute is False, a Delayed object for each chunk.
+        """
+        tasks = [
+            dask.delayed(cog_worker.manager._execute)(f, f_args, f_kwargs, **params)
+            for params in self.chunk_params(chunksize)
+        ]
+        if compute:
+            futures = self.client.compute(tasks)
+            for future, result in dask.distributed.as_completed(futures, with_results=True):
+                future.release()
+                yield result
+        else:
+            return tasks  # type: ignore

cog_worker/manager.py ADDED Viewed

@@ -0,0 +1,424 @@
+"""Previewing, chunking, and executing analysis.
+The Manager class is used to divide an area of analysis into chunks of manageable size,
+and execute functions on each of these chunks.
+When executing functions, the Manager instantiates a :obj:`cog_worker.worker.Worker` and passes
+it to the function as its first parameter. The Worker keeps track of the scale, projection,
+and bounds of its piece of the analysis, which it uses to handle the reading and writing of
+Cloud Optimized GeoTIFFs.
+Example:
+    Use the manager to preview an analysis before executing it::
+        from cog_worker import Manager
+        from rasterio.plot import show
+        def my_analysis(worker):
+            arr = worker.read('example-cog.tif')
+            # calculations ...
+            return arr
+        manager = Manager()
+        arr, bbox = manager.preview(my_analysis)
+        show(arr)
+    Execute the analysis in chunks, saving the results to disk::
+        manager.chunk_save('output.tif', my_analysis)
+"""
+import logging
+import math
+from typing import IO, Any, Iterable, Iterator, Mapping, Tuple, Type, Union
+import morecantile
+import numpy as np
+import rasterio as rio
+import rasterio.transform
+import rasterio.windows
+from pyproj import Proj
+from rasterio.io import DatasetWriter
+import cog_worker.worker
+from .types import BoundingBox, WorkerFunction
+from .utils import _bbox_size, _get_profile
+logger = logging.getLogger(__name__)
+class Manager:
+    """Class for managing scalable analysis of Cloud Optimized GeoTIFFs."""
+    def __init__(
+        self,
+        bounds: BoundingBox = (-180, -85, 180, 85),
+        proj: Union[int, str, Proj] = 3857,
+        scale: float = 10000,
+        buffer: int = 16,
+    ):
+        """Initialize a Manager with a projection, scale, and bounding box for analysis.
+        Args:
+            bounds (BoundingBox): The region to be analyzed as a (west, south,
+                east, north) tuple.
+            proj (pyproj.Proj, str, int): The projection to analyze in.
+                Generally accepts any proj4 string, WKT projection, or EPSG
+                code. See pyproj.Proj for valid values.
+            scale (float): The pixel size for analysis in the projection's units
+                (usually meters or degrees).
+            buffer (int): When dividing analysis into chunks, the number of additional pixels
+                to read on all sides to avoid edge effects. The ideal buffer size depends on
+                your analysis (e.g. whether you use convolutions or distance functions).
+        """
+        self.proj = proj if isinstance(proj, Proj) else Proj(proj, preserve_units=False)
+        self.bounds = bounds
+        self.scale = scale
+        self.buffer = buffer
+        self._proj_bounds = self.proj.transform_bounds(*bounds)
+        self.tms = morecantile.TileMatrixSet.custom(list(self._proj_bounds), self.proj.crs)
+    def execute(
+        self,
+        f: WorkerFunction,
+        f_args: Union[Iterable, None] = None,
+        f_kwargs: Union[Mapping, None] = None,
+        clip: bool = True,
+        **kwargs,
+    ) -> Tuple[Any, BoundingBox]:
+        """Execute a function that takes a cog_worker.worker.Worker as its first parameter.
+        The execute method is the underlying method for running analysis. By default, it
+        will run the function once for the Manager's given scale and bounding box.
+        When executing functions, the Manager instantiates a cog_worker.worker.Worker and passes
+        it to the function as its first parameter. The Worker keeps track of the scale, projection,
+        and bounds of its piece of the analysis, which it uses to handle the reading and writing of
+        Cloud Optimized GeoTIFFs.
+        Args:
+            f (:obj:`cog_worker.types.WorkerFunction`): The function to execute. The function will recieve a
+                cog_worker.worker.Worker as its first argument.
+            f_args (list): Additional arguments to pass to the function.
+            f_kwargs (dict): Additional keyword arguments to pass to the function.
+            clip (bool): Whether or not to clip the buffer from the completed analysis.
+            **kwargs: Additional keyword arguments to overload the Manager's properties.
+                (bounds, proj, scale, or buffer)
+        Returns:
+            A tuple containing the return value of the function and the bounding
+            box of the executed analysis in the target projection.
+        """
+        args = {
+            "bounds": self.bounds,
+            "proj": self.proj,
+            "scale": self.scale,
+            "buffer": self.buffer,
+        }
+        args.update(kwargs)
+        return _execute(f, f_args, f_kwargs, clip, **args)
+    def preview(
+        self,
+        f: WorkerFunction,
+        f_args: Union[Iterable, None] = None,
+        f_kwargs: Union[Mapping, None] = None,
+        bounds: Union[BoundingBox, None] = None,
+        max_size: int = 1024,
+        **kwargs,
+    ) -> Tuple[Any, BoundingBox]:
+        """Preview a function by executing it at a reduced scale.
+        The preview method automatically reduces the scale of analysis to fit within `max_size`.
+        Args:
+            f (WorkerFunction): The function to execute. The function will
+                recieve a cog_worker.worker.Worker as its first argument.
+            f_args (list): Additional arguments to pass to the function.
+            f_kwargs (dict): Additional keyword arguments to pass to the function.
+            bounds (BoundingBox, default: self.bounds): The region to analize.
+            max_size (int): The maximum size (width or height) in pixels to
+                compute, ignoring any buffer (default: 1024px).
+            **kwargs: Additional keyword arguments to overload the Manager's properties.
+                (proj or buffer).
+        Returns:
+            A tuple containing the return value of the function and the bounding
+            box of the executed analysis in the target projection.
+        """
+        bounds = self.bounds if bounds is None else bounds
+        proj = kwargs.pop("proj", self.proj)
+        proj = proj if isinstance(proj, Proj) else Proj(proj, preserve_units=False)
+        proj_bounds = self.proj.transform_bounds(*bounds)
+        width, height = _bbox_size(proj_bounds, self.scale)
+        _size = max(width, height)
+        scale = self.scale * _size / max_size
+        kwargs.update({"proj_bounds": proj_bounds, "proj": proj, "scale": scale})
+        return self.execute(f, f_args, f_kwargs, **kwargs)
+    def tile(
+        self,
+        f: WorkerFunction,
+        f_args: Union[Iterable, None] = None,
+        f_kwargs: Union[Mapping, None] = None,
+        z: int = 0,
+        x: int = 0,
+        y: int = 0,
+        tilesize: int = 256,
+        **kwargs,
+    ) -> Tuple[Any, BoundingBox]:
+        """Execute a function for the scale and bounds of a TMS tile.
+        The tile method supports non-global and non-mercator tiling schemes via
+        Morecantile. To generate standard web tiles, instantiate the Manager
+        with the default parameters.
+        Args:
+            f (:obj:`cog_worker.types.WorkerFunction`): The function to execute. The function will
+                recieve a cog_worker.worker.Worker as its first argument.
+            f_args (list): Additional arguments to pass to the function.
+            f_kwargs (dict): Additional keyword arguments to pass to the function.
+            bounds (BoundingBox): The region to analize (default: self.bounds)
+            max_size (int): The maximum size (width or height) in pixels to compute, ignoring any buffer
+                (default: 1024px). Automatically reduces the scale of analysis to fit within `max_size`.
+            **kwargs: Additional keyword arguments to overload the Manager's properties.
+                (buffer).
+        Returns:
+            A tuple containing the return value of the function and the bounding
+            box of the executed analysis in the target projection.
+        """
+        proj_bounds = self.tms.xy_bounds(x, y, z)  # type: ignore
+        left, bottom, right, top = proj_bounds
+        size = max(right - left, top - bottom)
+        scale = size / tilesize
+        kwargs.update(
+            {
+                "proj_bounds": proj_bounds,
+                "scale": scale,
+            }
+        )
+        return self.execute(f, f_args, f_kwargs, **kwargs)
+    def chunk_execute(
+        self,
+        f: WorkerFunction,
+        f_args: Union[Iterable, None] = None,
+        f_kwargs: Union[Mapping, None] = None,
+        chunksize: int = 512,
+    ) -> Iterator[Tuple[Any, BoundingBox]]:
+        """Return a generator that executes a function on chunks of at most `chunksize` pixels.
+        Note:
+            Manager.chunk_execute computes each chunk sequentially, trading time for reduced memory footprint.
+            To run large scale analysis in parallel using dask, see cog_worker.distributed.
+        Note:
+            You can estimate the memory requirement of executing a function at a given chunksize as
+            ``(chunksize + 2*buffer)**2 * number_of_bands_or_arrays * bit_depth``.
+        Args:
+            f (:obj:`cog_worker.types.WorkerFunction`): The function to execute. The function will recieve a
+                cog_worker.worker.Worker as its first argument.
+            f_args (list): Additional arguments to pass to the function.
+            f_kwargs (dict): Additional keyword arguments to pass to the function.
+            chunksize (int): Size of the chunks in pixels (excluding buffer).
+        Yields:
+            A tuple containing the return value of the function and the bounding
+            box of the executed analysis in the target projection.
+        """
+        for params in self.chunk_params(chunksize):
+            yield self.execute(f, f_args, f_kwargs, **params)
+    def chunk_save(
+        self,
+        dst: Union[str, IO],
+        f: WorkerFunction,
+        f_args: Union[Iterable, None] = None,
+        f_kwargs: Union[Mapping, None] = None,
+        chunksize: int = 512,
+        **kwargs,
+    ):
+        """Execute a function in chunks and write each chunk to disk as it is completed.
+        The chunk_save method is identical to Manager.chunk_execute, except it writes results to ``dst``
+        instead of yielding them. Manager.chunk_save uses the rasterio GeoTiff driver.
+        Note:
+            The function to be executed will recieve a cog_worker.worker.Worker as its first argument and
+            should return a 3-dimensional numpy array of ``chunksize`` (optionally plus the buffer pixels).
+            e.g.::
+                # Read a cog in chunks and write those chunks to 'test.tif'
+                manager.chunk_save('test.tif', lambda worker: worker.read('example-cog-url.tif'))
+        Args:
+            dst (str): The file path to write to.
+            f (:obj:`cog_worker.types.WorkerFunction`): The function to execute.
+                The function will recieve a cog_worker.worker.Worker as its first argument
+                and must return a 3-dimensional numpy array of ``chunksize`` (including or excluding the buffer).
+            f_args (list): Additional arguments to pass to the function.
+            f_kwargs (dict): Additional keyword arguments to pass to the function.
+            chunksize (int): Size of the chunks in pixels (excluding buffer).
+            **kwargs: Additional keyword arguments to pass to rasterio.open.
+        """
+        chunks = self.chunk_execute(f, f_args, f_kwargs, chunksize)
+        arr, bbox = next(chunks)
+        with self._open_writer(dst, arr.shape[0], arr.dtype, **kwargs) as _writer:
+            self._write_chunk(_writer, arr, bbox)
+            for arr, bbox in chunks:
+                self._write_chunk(_writer, arr, bbox)
+    def _open_writer(self, dst: Union[str, IO], count: int, dtype: Type, **kwargs) -> DatasetWriter:
+        """Open a rasterio.DatasetWriter with default profile."""
+        profile = _get_profile(count, self.scale, self._proj_bounds, self.proj, dtype, **kwargs)
+        return rio.open(dst, "w", **profile)  # type: ignore
+    def _write_chunk(
+        self,
+        writer: DatasetWriter,
+        arr: np.ndarray,
+        bbox: BoundingBox,
+    ):
+        """Write a chunk to a rasterio.DatasetWriter."""
+        if len(arr.shape) == 2:
+            arr = arr[np.newaxis]
+        height, width = arr.shape[1:]
+        left, bottom, right, top = bbox
+        rows, cols = rasterio.transform.rowcol(
+            writer.transform,
+            [left],
+            [top],
+            op=round,
+        )
+        window = rasterio.windows.Window(min(cols), min(rows), width, height)
+        writer.write(arr, window=window)
+        if isinstance(arr, np.ma.MaskedArray):
+            mask = np.ma.getmask(arr)
+            if len(mask.shape) == 3:
+                mask = np.any(mask, axis=0)
+            writer.write_mask(~mask, window=window)
+    def chunk_params(self, chunksize: int = 512, **kwargs):
+        """Generate parameters to execute a function in chunks.
+        Generates dicts of keyword arguments that can be passed to Manager.execute to run a function in chunks
+        of size <chunksize>. This may be useful for distributing tasks to workers to execute in parallel. Each dict
+        will contain the projection, scale, bounding box, and buffer. Attributes will be identical except
+        for ``proj_bounds`` which define the area to analyze.
+        Note:
+            ``manager.chunk_execute(f)`` is equivalent to
+            ``(manager.execute(f, **params) for params in manager.chunk_params())``
+        Args:
+            chunksize (int): Size of the chunks in pixels (excluding buffer).
+            **kwargs: optional additional keyword arguments to save to the dict (to eventually pass to Manager.execute)
+                e.g. ``f``, ``f_args``, ``f_kwargs``
+        Yields:
+            Dicts of keyword arguments that can be passed to :obj:`cog_worker.manager.Manager.execute()`.
+        """
+        _args = {
+            "proj": self.proj.srs,
+            "scale": self.scale,
+            "buffer": self.buffer,
+        }
+        _args.update(kwargs)
+        for proj_bounds in self.chunks(chunksize):
+            args = _args.copy()
+            args["proj_bounds"] = proj_bounds
+            yield args
+    def chunks(self, chunksize: int = 512) -> Iterator[BoundingBox]:
+        """Generate bounding boxes for chunks of at most <chunksize> pixels in the managers scale and projection.
+        The chunks method divides the Manager's bounding box into chunks of manageable size.
+        Each chunk will be at most <chunksize> pixels, though the geographic extent of the chunk
+        depends on the Manager's projection and scale.
+        Args:
+            chunksize (int): Size of the chunks in pixels (excluding buffer).
+        Yields:
+            BoundingBox: The bounding box of the chunk in the Manager's projection
+        """
+        xshards, yshards = self._num_chunks(chunksize)
+        for i in range(xshards):
+            for j in range(yshards):
+                bounds = self._chunk_bounds(i, j, chunksize)
+                if np.isfinite(bounds).all():
+                    yield bounds
+    def _chunk_bounds(
+        self,
+        x: int,
+        y: int,
+        chunksize: int,
+    ) -> BoundingBox:
+        """Get the bounding box of a chunk with index <x>,<y>."""
+        left, bottom, right, top = self._proj_bounds
+        _chunksize = chunksize * self.scale
+        l = left + x * _chunksize  # noqa: E741
+        r = min(l + _chunksize, right)
+        t = top - y * _chunksize
+        b = max(t - _chunksize, bottom)
+        return (l, b, r, t)
+    def _num_chunks(
+        self,
+        chunksize: int,
+    ) -> Tuple[int, int]:
+        """Return the number of chunks necessary to cover the Manager's bounding box."""
+        left, bottom, right, top = self._proj_bounds
+        return (
+            math.ceil((right - left) / self.scale / chunksize),
+            math.ceil((top - bottom) / self.scale / chunksize),
+        )
+def _execute(
+    f: WorkerFunction,
+    f_args: Union[Iterable, None] = None,
+    f_kwargs: Union[Mapping, None] = None,
+    clip: bool = True,
+    **kwargs,
+) -> Tuple[Any, BoundingBox]:
+    """Execute a function that takes a cog_worker.worker.Worker as its first parameter.
+    Instantiate a cog_worker.worker.Worker and pass it to the function as its first parameter.
+    Args:
+        f (:obj:`cog_worker.types.WorkerFunction`): The function to execute. The function will recieve a
+            cog_worker.worker.Worker as its first argument.
+        f_args (list): Additional arguments to pass to the function.
+        f_kwargs (dict): Additional keyword arguments to pass to the function.
+        clip (bool): Whether or not to clip the buffer from the completed analysis.
+        **kwargs: Additional keyword arguments to instantiate the cog_worker.worker.Worker
+    Returns:
+        A tuple containing the return value of the function and the bounding
+        box of the executed analysis in the target projection.
+    """
+    worker = cog_worker.worker.Worker(**kwargs)
+    f_args = [] if f_args is None else f_args
+    f_kwargs = {} if f_kwargs is None else f_kwargs
+    arr: np.ndarray = f(worker, *f_args, **f_kwargs)  # type: ignore
+    if clip and isinstance(arr, np.ndarray):
+        arr = worker.clip_buffer(arr)
+    return arr, worker.bounds

cog_worker/py.typed ADDED Viewed

File without changes

cog_worker/types.py ADDED Viewed

@@ -0,0 +1,56 @@
+"""cog_worker type definitions."""
+from typing import Callable, Tuple, Union
+import numpy as np
+import cog_worker.worker
+BoundingBox = Tuple[float, float, float, float]
+"""A ``(west, south, east, north)`` tuple."""
+WorkerFunction = Union[
+    Callable[["cog_worker.worker.Worker"], np.ndarray],
+    Callable,
+]
+"""A function that can receive a cog_worker.worker.Worker as its first parameter.
+Additional arguments and keyword arguments can be passed to the Worker function
+at time of execution with the ``f_args`` and ``f_kwargs`` parameters of
+:obj:`cog_worker.manager.Manager.execute()`
+Example:
+    Read a specific COG and return it as an array::
+        def my_analysis(worker: cog_worker.Worker):
+            arr = worker.read('example-cog.tif')
+            return arr
+    Read a COG at a given url and get the neighborhood mean for a 1km square kernel::
+        from scipy.ndimage import uniform_filter
+        def my_analysis(worker: cog_worker.Worker, source_url: str):
+            arr = worker.read(source_url)
+            kernel_size = 1000/worker.scale  # in map units (meters)
+            return uniform_filter(arr, kernel_size)
+    Read a COG and optionally upload each chunk to an S3 bucket as it is computed::
+        from rasterio import MemoryFile
+        import boto3
+        def my_analysis(worker: cog_worker.Worker, dst_bucket: str):
+            arr = worker.read('example-cog.tif')
+            if dst_bucket:
+                with MemoryFile() as memfile:
+                    fname = f'output_{worker.scale}_{worker.bounds[0]}_{worker.bounds[3]}.tif'
+                    worker.write(arr, memfile)
+                    memfile.seek(0)
+                    boto3.client('s3').upload_fileobj(memfile, dst_bucket, fname)
+            return arr
+"""

cog_worker/utils.py ADDED Viewed

@@ -0,0 +1,40 @@
+"""Utility functions."""
+from typing import Tuple, Type, Union
+import numpy as np
+from pyproj import Proj
+from rasterio import transform
+from .types import BoundingBox
+def _get_profile(
+    count: int, scale: float, proj_bounds: BoundingBox, proj: Proj, dtype: Union[Type, np.dtype], **kwargs
+) -> dict:
+    width, height = _bbox_size(proj_bounds, scale)
+    affine = transform.from_origin(proj_bounds[0], proj_bounds[3], scale, scale)
+    profile = {
+        "driver": "GTiff",
+        "interleave": "pixel",
+        "blockxsize": 512,
+        "blockysize": 512,
+        "tiled": True,
+        "compress": "lzw",
+        "crs": proj.srs,
+        "transform": affine,
+        "dtype": dtype,
+        "width": width,
+        "height": height,
+        "count": count,
+    }
+    profile.update(kwargs)
+    return profile
+def _bbox_size(
+    bounds: BoundingBox,
+    scale: float,
+) -> Tuple[int, int]:
+    left, bottom, right, top = bounds
+    return (round((right - left) / scale), round((top - bottom) / scale))

cog_worker/worker.py ADDED Viewed

@@ -0,0 +1,277 @@
+"""Reading COGs.
+The Worker class keeps track of the region, projection, and scale to
+conduct analysis in.
+When writing cog_worker functions, the main method you will use is
+:obj:`Worker.read()`, which is a wrapper around ``rio_tiler`` to clip,
+reproject and resample the data into the target resolution.
+Example:
+    Read a COG, reprojecting it onto a global 1-deg lat-long grid::
+        from cog_worker import Worker
+        from rasterio.plot import show
+        worker = Worker(bounds=(-180, -90, 180, 90), proj=4326, scale=1.0)
+        arr = worker.read('example-cog-url.tif')
+        show(arr)
+"""
+import logging
+from collections.abc import Sequence
+from typing import Union
+import numpy as np
+import rasterio as rio
+from pyproj import Proj
+from pyproj.enums import TransformDirection
+from rio_tiler.errors import EmptyMosaicError
+from rio_tiler.io import COGReader
+from rio_tiler.models import ImageData
+from rio_tiler.mosaic.reader import mosaic_reader
+from cog_worker.types import BoundingBox
+from cog_worker.utils import _bbox_size, _get_profile
+logger = logging.getLogger(__name__)
+class Worker:
+    """Class for reading Cloud Optimized GeoTIFFs."""
+    def __init__(
+        self,
+        bounds: BoundingBox = (-180, -85, 180, 85),
+        proj_bounds: Union[BoundingBox, None] = None,
+        proj: Union[int, str, Proj] = 3857,
+        scale: float = 10000,
+        buffer: int = 16,
+    ):
+        """Initialize a Worker with a bounding box, scale, and projection.
+        Args:
+            bounds (BoundingBox): The region to be analyzed as a (west, south, east, north) tuple.
+                Ignored when ``proj_bounds`` is provided.
+            proj_bounds (BoundingBox): The region to be analyzed in the Worker's projection.
+                Overrides ``bounds`` when provided.
+            proj (pyproj.Proj, str, int): The projection to analyze in. See
+                ``pyproj.Proj`` for valid values (https://pyproj4.github.io/pyproj/).
+            scale (float): The pixel size for analysis in the projection's units (usually meters or degrees).
+            buffer (int): The number of additional pixels to read on all sides to avoid edge effects.
+                The ideal buffer size depends on your analysis (e.g. whether you plan to use convolutions or
+                distance functions).
+        """
+        self._proj = proj if isinstance(proj, Proj) else Proj(proj, preserve_units=False)
+        if proj_bounds is None:
+            proj_bounds = self._proj.transform_bounds(*bounds)
+        self._bounds = proj_bounds
+        self._scale = scale
+        self._buffer = buffer
+        self._width, self._height = _bbox_size(self._bounds, scale)
+    @property
+    def proj(self) -> Proj:
+        """The projection used for reading."""
+        return self._proj
+    @property
+    def bounds(self) -> BoundingBox:
+        """The the bounding box in projected coordinates."""
+        return self._bounds
+    @property
+    def scale(self) -> float:
+        """The size of pixels in projection units."""
+        return self._scale
+    @property
+    def buffer(self) -> int:
+        """The number of additional pixels to read on all sides of the Worker's bounding box."""
+        return self._buffer
+    @property
+    def width(self) -> int:
+        """The width of the Worker's bounding box in pixels."""
+        return self._width
+    @property
+    def height(self) -> int:
+        """The height of the Worker's bounding box in pixels."""
+        return self._height
+    def xy_bounds(self, buffered: bool = False) -> BoundingBox:
+        """Return the Worker's bounding box in projected coordinates.
+        Args:
+            buffered (bool): Buffer the worker's bounding box
+        """
+        return self._buffer_bbox() if buffered else self.bounds
+    def lnglat_bounds(self, buffered: bool = False) -> BoundingBox:
+        """Return the Worker's bounding box in geographic coordinates.
+        Note:
+            When using a projected coordinate system, the geographic bounding box
+            that covers the Worker's projected extent may be larger
+            than the bounding box used to instantiate the Worker.
+        Args:
+            buffered (bool): Buffer the Worker's bounding box.
+        """
+        pts = max(self.width, self.height) + (buffered * self.buffer * 2) - 1
+        pts = min(pts, 10000)
+        bounds = self.xy_bounds(buffered)
+        return self.proj.transform_bounds(*bounds, pts, direction=TransformDirection.INVERSE)
+    def empty(self, mask: bool = False) -> np.ndarray:
+        """Return a zeroed array covering the Worker's extent including the buffer.
+        Args:
+            mask (bool): Return a Numpy masked array with all pixels masked.
+                Otherwise returns a standard Numpy array filled with zeros.
+        """
+        arr = np.zeros((1, self.height + self.buffer * 2, self.width + self.buffer * 2))
+        if mask:
+            _mask = np.ones((1, self.height + self.buffer * 2, self.width + self.buffer * 2))
+            arr = np.ma.array(arr, mask=_mask)
+        return arr
+    def read(self, src: Union[str, Sequence[str]], masked=True, **kwargs) -> Union[np.ndarray, np.ma.MaskedArray]:
+        """Read a COG, reprojecting and clipping as necessary.
+        The read method uses ``rio_tiler.COGReader`` to takes advantage of the
+        file structure and internal overviews in COGs, minimizing the amount of
+        data that needs to be read and transferred when working at reduced resolutions.
+        In general, any valid GDAL path can be read. This may be a url pointing to a COG, a local
+        GeoTIFF or a GDAL virtual file system path. However, it may be very inefficient to
+        read data sources that are not valid Cloud Optimized GeoTIFFs.
+        If a list of data sources is provided, ``Worker.read`` will use ``rio_tiler.mosaic_reader``
+        to mosaic the sources together.
+        Note:
+            The resampling method used to generate the COG's internal overviews will affect
+            how it appears at reduced resolutions.
+        Args:
+            src (str, list): The data source to read or list of sources to mosiac.
+            masked (bool): Return a Numpy masked array, otherwise ignore dataset mask.
+            **kwargs: Additional keyword arguments to pass to ``rio_tiler.COGReader.part``
+                or ``rio_tiler.mosaic_reader``. See: https://cogeotiff.github.io/rio-tiler/.
+        Returns:
+            A Numpy masked array containing the data for the Worker's bounding box and its
+            buffer.
+        Note:
+            The mask values of a Numpy masked array is the inverse of a GDAL (alpha) mask.
+            A masked value of True corresponds to nodata or an alpha value of 0.
+        """
+        proj_bounds = self._buffer_bbox()
+        width, height = _bbox_size(proj_bounds, self._scale)
+        if isinstance(src, str):
+            img = _read_cog(src, proj_bounds, self._proj.crs, width, height, **kwargs)
+        elif isinstance(src, Sequence):
+            try:
+                img, asset = mosaic_reader(src, _read_cog, proj_bounds, self.proj.crs, width, height, **kwargs)
+            except EmptyMosaicError:
+                return self.empty(mask=True)
+        arr = img.array
+        if not masked:
+            return arr.data
+        return arr
+    def write(self, arr: np.ndarray, dst: str, **kwargs):
+        """Write a Numpy array to a GeoTIFF.
+        The write method will create a GeoTIFF with a profile matching the Worker's properties.
+        Uses ``rasterio.open`` under the hood.
+        Args:
+            arr (numpy.ndarray): The array to write. Must be 2 or 3-dimensional, with a width and
+                height matching the Worker (including or excluding the buffer). If the array
+                includes the Worker's buffer, the buffer will be clipped before writing.
+            dst (str): The file path to write to.
+            **kwargs: Additional keyword arguments to pass to rasterio.open
+                See: https://rasterio.readthedocs.io/en/latest/topics/writing.html
+        """
+        arr = self.clip_buffer(arr)
+        count, height, width = arr.shape
+        profile = _get_profile(count, self.scale, self._bounds, self.proj, arr.dtype, **kwargs)
+        with rio.open(dst, "w", **profile) as writer:
+            writer.write(arr)
+            if isinstance(arr, np.ma.MaskedArray):
+                mask = np.ma.getmask(arr)
+                if len(mask.shape) == 3:  # noqa PLR2004
+                    mask = np.any(mask, axis=0)
+                writer.write_mask(~mask)
+    def clip_buffer(self, arr: np.ndarray) -> np.ndarray:
+        """Clip the buffer pixels from an array if they exist.
+        Args:
+            arr (numpy.ndarray): The array to clip.
+        Returns:
+            The array with buffer pixels removed.
+        Raises:
+            ValueError: If the array's shape does not match the Worker's width and height
+        """
+        if len(arr.shape) == 2:  # noqa PLR2004
+            # single band flat array needs extra axis to have the same number of axis as
+            # an rgb raster
+            arr = arr[np.newaxis]
+        buffer_width = self.width + self.buffer * 2
+        buffer_height = self.height + self.buffer * 2
+        c, h, w = arr.shape
+        if w == self.width and h == self.height:
+            return arr
+        elif w == buffer_width and h == buffer_height:
+            return arr[:, self.buffer : -self.buffer, self.buffer : -self.buffer]
+        else:
+            raise ValueError(
+                f"Array not expected size. Was {w}x{h} expected {self.width}x{self.height} or {buffer_width}x{buffer_height}"  # noqa: E501
+            )
+    def _buffer_bbox(self) -> BoundingBox:
+        """Returns the worker's bounding box extended by the buffered pixels."""
+        l, b, r, t = self._bounds  # noqa: E741
+        _buffer = self.buffer * self.scale
+        return (l - _buffer, b - _buffer, r + _buffer, t + _buffer)
+def _read_cog(
+    asset: str,
+    proj_bounds: BoundingBox,
+    crs: Union[str, int, Proj],
+    width: int,
+    height: int,
+    **kwargs,
+) -> ImageData:
+    """Read part of a COG, warping and resampling to a target shape.
+    Args:
+        asset: The data source opened with ``COGReader``.
+        proj_bounds: The extent to read, expressed in ``crs``.
+        crs: Used both as the CRS of ``proj_bounds`` and as the output CRS.
+        width: Output width in pixels.
+        height: Output height in pixels.
+        **kwargs: Additional keyword arguments forwarded to ``COGReader.part``.
+    Returns:
+        The ``ImageData`` returned by ``COGReader.part``, with the fill value of its
+        masked array re-assigned (see the comment in the body).
+    """
+    with COGReader(asset) as cog:  # type: ignore
+        part = cog.part(proj_bounds, bounds_crs=crs, dst_crs=crs, max_size=None, width=width, height=height, **kwargs)
+        # 2024-07-18
+        # This is a known workaround, not a proper fix.
+        # The default fill_value of a Numpy masked array is 999999 for all integer types, which does not
+        # fit in 8 and 16 bit ints. This is "not an issue" for numpy < 2, since it allows conversion of
+        # out-of-bounds integers, but numpy >= 2 does not allow it, which makes serializing and
+        # deserializing masked arrays a problem in multiprocessing.
+        #
+        # `fill_value` is a property whose setter checks and casts the value if possible.
+        # By setting the fill_value to itself, numpy silently overflows the np.int64(999999) to some
+        # meaningless value that does fit the array's dtype.
+        # This allows serializing and deserializing the masked array without issues in numpy 2.x.
+        # We know this is nonsense, but this has been the numpy way for decades and it is expected that
+        # the fill method is rarely or never used at all.
+        part.array.fill_value = part.array.fill_value
+        return part

cog_worker-0.3.0.dist-info/LICENSE.txt ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2021 Simbiotica. S.L.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

cog_worker-0.3.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,188 @@
+Metadata-Version: 2.1
+Name: cog_worker
+Version: 0.3.0
+Summary: Scalable geospatial analysis on Cloud Optimized GeoTIFFs.
+Author-email: Francis Gassert <francis.gassert@vizzuality.com>
+License: MIT License
+        Copyright (c) 2021 Simbiotica. S.L.
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+Project-URL: Homepage, https://github.com/vizzuality/cog_worker
+Project-URL: Issues, https://github.com/vizzuality/cog_worker/issues
+Keywords: cog,geotiff,raster,gdal,rasterio,dask
+Classifier: Intended Audience :: Information Technology
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: GIS
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE.txt
+Requires-Dist: numpy >=1
+Requires-Dist: pyproj >=3.0.0
+Requires-Dist: rasterio >=1.3
+Requires-Dist: morecantile <6.0.0,>=5.0.0
+Requires-Dist: rio-tiler <7.0.0,>=6.0.0
+Provides-Extra: dev
+Requires-Dist: pre-commit ; extra == 'dev'
+Requires-Dist: bump-my-version ; extra == 'dev'
+Provides-Extra: distributed
+Requires-Dist: dask[distributed] ; extra == 'distributed'
+Provides-Extra: docs
+Requires-Dist: Sphinx ; extra == 'docs'
+Requires-Dist: sphinxcontrib-napoleon ; extra == 'docs'
+Requires-Dist: furo ; extra == 'docs'
+Requires-Dist: nbsphinx ; extra == 'docs'
+Requires-Dist: nbconvert ; extra == 'docs'
+Provides-Extra: test
+Requires-Dist: pytest ; extra == 'test'
+# Cog Worker
+Scalable geospatial analysis on Cloud Optimized GeoTIFFs.
+ - **Documentation**: https://vizzuality.github.io/cog_worker
+ - **PyPI**: https://pypi.org/project/cog-worker
+cog_worker is a simple library to help write scripts to conduct scalable
+analysis of gridded data. It's intended to be useful for moderate- to large-scale
+GIS, remote sensing, and machine learning applications.
+## Installation
+```
+pip install cog_worker
+```
+## Examples
+See `docs/examples` for Jupyter notebook examples
+## Quick start
+0. A simple cog_worker script
+```python
+from rasterio.plot import show
+from cog_worker import Manager
+def my_analysis(worker):
+    arr = worker.read('roads_cog.tif')
+    return arr
+manager = Manager(proj='wgs84', scale=0.083333)
+arr, bbox = manager.preview(my_analysis)
+show(arr)
+```
+1. Define an analysis function that receives a cog_worker.Worker as the first parameter.
+```python
+from cog_worker import Worker, Manager
+import numpy as np
+# Define an analysis function to read and process COG data sources
+def MyAnalysis(worker: Worker) -> np.ndarray:
+    # 1. Read a COG (reprojecting, resampling and clipping as necessary)
+    array: np.ndarray = worker.read('roads_cog.tif')
+    # 2. Work on the array
+    # ...
+    # 3. Return (or post to blob storage etc.)
+    return array
+```
+2. Run your analysis in different scales and projections
+```python
+import rasterio as rio
+# Run your analysis using a cog_worker.Manager which handles chunking
+manager = Manager(
+    proj = 'wgs84',       # any pyproj string
+    scale = 0.083333,  # in projection units (degrees or meters)
+    bounds = (-180, -90, 180, 90),
+    buffer = 128          # buffer pixels when chunking analysis
+)
+# preview analysis
+arr, bbox = manager.preview(MyAnalysis, max_size=1024)
+rio.plot.show(arr)
+# preview analysis chunks
+for bbox in manager.chunks(chunksize=1500):
+    print(bbox)
+# execute analysis chunks sequentially
+for arr, bbox in manager.chunk_execute(MyAnalysis, chunksize=1500):
+    rio.plot.show(arr)
+# generate job execution parameters
+for params in manager.chunk_params(chunksize=1500):
+    print(params)
+```
+3. Write scale-dependent functions
+```python
+import scipy
+def focal_mean(
+    worker: Worker,
+    kernel_radius: float = 1000 # radius in projection units (meters)
+) -> np.ndarray:
+    array: np.ndarray = worker.read('sample-geotiff.tif')
+    # Access the pixel size at worker.scale
+    kernel_size = kernel_radius * 2 / worker.scale
+    array = scipy.ndimage.uniform_filter(array, kernel_size)
+    return array
+```
+4. Chunk your analysis and run it in a dask cluster
+```python
+from cog_worker.distributed import DaskManager
+from dask.distributed import LocalCluster, Client
+# Set up a Manager that connects to a Dask cluster
+cluster = LocalCluster()
+client = Client(cluster)
+distributed_manager = DaskManager(
+    client,
+    proj = 'wgs84',
+    scale = 0.083333,
+    bounds = (-180, -90, 180, 90),
+    buffer = 128
+)
+# Execute in worker pool and save chunks to disk as they complete.
+distributed_manager.chunk_save('output.tif', MyAnalysis, chunksize=2048)
+```

cog_worker-0.3.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,12 @@
+cog_worker/__init__.py,sha256=cMjDemK58ri18Sv9BJOyAz47hpcRH97bcdvFEh-103E,314
+cog_worker/distributed.py,sha256=X3r-1Ef04nlyTZisMhMU1A6xZL6_NsAuCLAFAzjXncI,6900
+cog_worker/manager.py,sha256=YB6BeoU2hqyTHrfO_vXVUQ3bTDMvBihAGG_X6_2O5Oc,17456
+cog_worker/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+cog_worker/types.py,sha256=9UKK4Zie1XMbT_pHttRDC8Zb2kxkoxQ4xdfbJLM2cVo,1781
+cog_worker/utils.py,sha256=jRGUecMR-0vzfiYoLL3uBz0eNRLGYE5py_pxRo76y1E,1018
+cog_worker/worker.py,sha256=ckp0BH_isKKD-nPiwmVLKesvan5Iqs1srQ_ju3tbLN4,11352
+cog_worker-0.3.0.dist-info/LICENSE.txt,sha256=Sd40qFfjMndidtlw_mKQ5TPBLSpk-wSFMtfDM1qTfoA,1073
+cog_worker-0.3.0.dist-info/METADATA,sha256=eBsRiQCCRfvclSGDVTaPYGdlYgqlPBGRSrKH-7I9340,5999
+cog_worker-0.3.0.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+cog_worker-0.3.0.dist-info/top_level.txt,sha256=jUWy8Vkc6yjVPFbIip1rX4W37aFxIAfpiVFs8mbmOr0,11
+cog_worker-0.3.0.dist-info/RECORD,,

cog_worker-0.3.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (71.1.0)
+Root-Is-Purelib: true
+Tag: py3-none-any

cog_worker-0.3.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ cog_worker