libadalina-core 1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- libadalina_core/__init__.py +0 -0
- libadalina_core/examples/example_hospitals_in_provinces.py +46 -0
- libadalina_core/examples/example_population_in_provinces.py +42 -0
- libadalina_core/examples/example_population_served_by_hospitals.py +59 -0
- libadalina_core/readers/__init__.py +0 -0
- libadalina_core/readers/readers.py +18 -0
- libadalina_core/sedona_configuration/__init__.py +0 -0
- libadalina_core/sedona_configuration/jdk_installer.py +22 -0
- libadalina_core/sedona_configuration/sedona_configuration.py +104 -0
- libadalina_core/sedona_utils/__init__.py +0 -0
- libadalina_core/sedona_utils/coordinate_formats.py +22 -0
- libadalina_core/sedona_utils/utils.py +27 -0
- libadalina_core/spatial_join/__init__.py +0 -0
- libadalina_core/spatial_join/query_builder.py +147 -0
- libadalina_core/writers/__init__.py +0 -0
- libadalina_core/writers/writers.py +22 -0
- libadalina_core-1.0.dist-info/METADATA +67 -0
- libadalina_core-1.0.dist-info/RECORD +22 -0
- libadalina_core-1.0.dist-info/WHEEL +5 -0
- libadalina_core-1.0.dist-info/entry_points.txt +2 -0
- libadalina_core-1.0.dist-info/licenses/LICENSE +21 -0
- libadalina_core-1.0.dist-info/top_level.txt +1 -0
|
File without changes
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from libadalina_core.readers.readers import geopackage_to_dataframe
|
|
2
|
+
import pathlib
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from libadalina_core.spatial_join.query_builder import spatial_join, JoinType, spatial_aggregation, AggregationType, \
|
|
6
|
+
AggregationFunction
|
|
7
|
+
|
|
8
|
+
if __name__ == "__main__":
    """Example of how to use libadalina to find hospitals in specific provinces in Italy and aggregate their data."""

    # Make pandas print every column, unwrapped, with long cells cut at 100 characters
    for option_name, option_value in (
        ('display.max_columns', None),
        ('display.width', None),
        ('display.max_colwidth', 100),
    ):
        pd.set_option(option_name, option_value)

    # Sample datasets are looked up three directories above this file, under tests/samples
    samples_dir = pathlib.Path(__file__).parent.parent.parent / "tests" / "samples"

    hospitals_path = str(samples_dir / "healthcare" / "EU_healthcare.gpkg")
    hospitals = geopackage_to_dataframe(hospitals_path, "EU")
    hospitals = hospitals[["hospital_name", "geometry", "city", "cap_beds"]]

    regions_path = str(samples_dir / "regions" / "NUTS_RG_20M_2024_4326.gpkg")
    regions = geopackage_to_dataframe(regions_path, "NUTS_RG_20M_2024_4326.gpkg")
    regions = regions[["LEVL_CODE", "NUTS_NAME", "CNTR_CODE", "geometry"]]

    # select province of Milan and Cremona
    is_level_3 = regions['LEVL_CODE'] == 3
    is_italian = regions['CNTR_CODE'] == "IT"
    is_selected_name = regions['NUTS_NAME'].str.contains('Milano|Cremona', case=False)
    filtered_regions = regions[is_level_3 & is_italian & is_selected_name]

    # join with hospitals table to get hospitals in these provinces
    joined = spatial_join(filtered_regions, hospitals, join_type=JoinType.LEFT)
    # the join renames the geometries to geometry_left / geometry_right to avoid conflicts;
    # restore the province geometry as the main `geometry` column
    result = joined.withColumnRenamed('geometry_left', 'geometry')
    result.show(truncate=False)

    # get the number of hospitals in each province along with the total and average number of beds
    hospital_statistics = [
        AggregationFunction("hospital_name", AggregationType.COUNT, 'hospitals'),
        AggregationFunction("cap_beds", AggregationType.SUM, 'total_beds'),
        AggregationFunction("cap_beds", AggregationType.AVG, 'average_beds'),
    ]
    result = spatial_aggregation(result, aggregate_functions=hospital_statistics)
    result.show(truncate=False)
|
|
46
|
+
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from libadalina_core.readers.readers import geopackage_to_dataframe
|
|
2
|
+
import pathlib
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from libadalina_core.spatial_join.query_builder import spatial_join, JoinType, spatial_aggregation, AggregationType, \
|
|
6
|
+
AggregationFunction
|
|
7
|
+
|
|
8
|
+
if __name__ == "__main__":
    """Example of how to use libadalina to find the total amount of the population living in specific provinces in Italy."""

    # Make pandas print every column, unwrapped, with long cells cut at 100 characters
    for option_name, option_value in (
        ('display.max_columns', None),
        ('display.width', None),
        ('display.max_colwidth', 100),
    ):
        pd.set_option(option_name, option_value)

    # Sample datasets are looked up three directories above this file, under tests/samples
    samples_dir = pathlib.Path(__file__).parent.parent.parent / "tests" / "samples"

    population_path = str(samples_dir / "population-north-italy" / "nord-italia.gpkg")
    population = geopackage_to_dataframe(population_path, "census2021")
    population = population[['T', 'geometry']]

    regions_path = str(samples_dir / "regions" / "NUTS_RG_20M_2024_4326.gpkg")
    regions = geopackage_to_dataframe(regions_path, "NUTS_RG_20M_2024_4326.gpkg")
    regions = regions[["LEVL_CODE", "NUTS_NAME", "CNTR_CODE", "geometry"]]

    # select province of Milan and Cremona
    is_level_3 = regions['LEVL_CODE'] == 3
    is_italian = regions['CNTR_CODE'] == "IT"
    is_selected_name = regions['NUTS_NAME'].str.contains('Milano|Cremona', case=False)
    filtered_regions = regions[is_level_3 & is_italian & is_selected_name]

    # join with population table to get the population of these provinces
    joined = spatial_join(filtered_regions, population, join_type=JoinType.LEFT)
    # the join renames the geometries to geometry_left / geometry_right to avoid conflicts;
    # restore the province geometry as the main `geometry` column
    provinces_with_population = joined.withColumnRenamed('geometry_left', 'geometry')

    # sum column `T`, weighting each value by the share of `geometry_right` covered by the province
    result = spatial_aggregation(
        provinces_with_population,
        aggregate_functions=[
            AggregationFunction("T", AggregationType.SUM, 'population', proportional='geometry_right'),
        ],
    )
    result.show(truncate=False)
|
|
42
|
+
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from libadalina_core.readers.readers import geopackage_to_dataframe
|
|
2
|
+
import pathlib
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from libadalina_core.spatial_join.query_builder import spatial_join, JoinType, spatial_aggregation, AggregationType, \
|
|
6
|
+
AggregationFunction, polygonize
|
|
7
|
+
|
|
8
|
+
if __name__ == "__main__":
    """Example of how to use libadalina to find the amount of population living within 1km from each hospital of a specific province in Italy."""

    # Make pandas print every column, unwrapped, with long cells cut at 100 characters
    for option_name, option_value in (
        ('display.max_columns', None),
        ('display.width', None),
        ('display.max_colwidth', 100),
    ):
        pd.set_option(option_name, option_value)

    # Sample datasets are looked up three directories above this file, under tests/samples
    samples_dir = pathlib.Path(__file__).parent.parent.parent / "tests" / "samples"

    population_path = str(samples_dir / "population-north-italy" / "nord-italia.gpkg")
    population = geopackage_to_dataframe(population_path, "census2021")
    population = population[['T', 'geometry']]

    hospitals_path = str(samples_dir / "healthcare" / "EU_healthcare.gpkg")
    hospitals = geopackage_to_dataframe(hospitals_path, "EU")
    hospitals = hospitals[["hospital_name", "geometry", "city", "cap_beds"]]

    regions_path = str(samples_dir / "regions" / "NUTS_RG_20M_2024_4326.gpkg")
    regions = geopackage_to_dataframe(regions_path, "NUTS_RG_20M_2024_4326.gpkg")
    regions = regions[["LEVL_CODE", "NUTS_NAME", "CNTR_CODE", "geometry"]]

    # select province of Cremona
    is_level_3 = regions['LEVL_CODE'] == 3
    is_italian = regions['CNTR_CODE'] == "IT"
    is_cremona = regions['NUTS_NAME'].str.contains('Cremona', case=False)
    filtered_regions = regions[is_level_3 & is_italian & is_cremona]

    # join with hospitals table to get hospitals in these provinces
    joined = spatial_join(filtered_regions, hospitals, join_type=JoinType.LEFT)
    # the join renames the geometries to geometry_left / geometry_right to avoid conflicts;
    # keep the province shape aside and make the hospital location the main `geometry`
    result = joined.withColumnRenamed('geometry_left', 'geometry_provinces')
    result = result.withColumnRenamed('geometry_right', 'geometry')
    result.show(truncate=False)

    # transform the points representing the hospitals on the map to circle-like shaped polygons with a radius of 1000 meters
    buffered = polygonize(result, 1000)
    result = buffered.withColumnRenamed('geometry', 'original_geometry')
    result = result.withColumnRenamed('polygonized_geometry', 'geometry')

    # join the hospital areas with the population table
    joined = spatial_join(result, population, join_type=JoinType.INNER)
    # the join renames the geometries again: the hospital area becomes the main `geometry`
    areas_with_population = joined.withColumnRenamed('geometry_left', 'geometry')

    # sum column `T`, weighting each value by the share of `geometry_right` covered by the hospital area
    result = spatial_aggregation(
        areas_with_population,
        aggregate_functions=[
            AggregationFunction("T", AggregationType.SUM, 'population',
                                proportional='geometry_right'),
        ],
    )
    result.show(truncate=False)
|
|
File without changes
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import geopandas as gpd
|
|
2
|
+
|
|
3
|
+
from libadalina_core.sedona_utils.coordinate_formats import DEFAULT_EPSG
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def geopackage_to_dataframe(path: str, layer: str) -> gpd.GeoDataFrame:
    """
    Load one layer of a GeoPackage file as a GeoDataFrame.

    The geometry is re-projected to the libadalina default EPSG (`DEFAULT_EPSG`).

    :param path: The path to the GeoPackage file.
    :param layer: The name of the layer to read.
    :return: A GeoDataFrame with the content of the layer, expressed in `DEFAULT_EPSG`.
    """
    layer_data = gpd.read_file(path, layer=layer)
    # without `inplace`, `to_crs` hands back the re-projected frame
    return layer_data.to_crs(epsg=DEFAULT_EPSG.value)
|
|
File without changes
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import jdk
|
|
4
|
+
|
|
5
|
+
def install_jdk_if_needed():
    """
    Make sure the `JAVA_HOME` environment variable is set, installing a JRE if needed.

    When `JAVA_HOME` is already set nothing is installed. Otherwise a JRE of the
    pinned version is looked up under `jdk._JRE_DIR/<version>`; if it is not there
    it is installed with `jdk.install` and a symlink with that name is created so
    that later runs can find it again. In both cases `JAVA_HOME` is set for the
    current process only.
    """
    if 'JAVA_HOME' not in os.environ:
        version = '17'
        # stable, version-named location used to recognise a previous installation
        jre_path = os.path.join(jdk._JRE_DIR, version)

        if os.path.exists(jre_path):
            logging.info('JAVA_HOME not set but JRE already downloaded')
            os.environ['JAVA_HOME'] = jre_path
        else:
            logging.info('JAVA_HOME not set, installing JRE...')
            java_home = jdk.install(version, jre=True)
            os.environ['JAVA_HOME'] = java_home
            try:
                # `os.path.exists` is False for a dangling link, which would make
                # `os.symlink` fail with FileExistsError: drop the stale link first
                if os.path.islink(jre_path):
                    os.remove(jre_path)
                os.symlink(java_home, jre_path)
            except OSError as err:
                # The link is only a shortcut for later runs; JAVA_HOME is already
                # usable, so a failure here (e.g. no symlink permission) is not fatal
                logging.warning(f'Could not create symlink {jre_path} -> {java_home}: {err}')

    logging.info(f'JAVA_HOME set to {os.environ.get("JAVA_HOME")}')
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from pyspark.sql import SparkSession
|
|
2
|
+
from sedona.spark import SedonaContext
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from libadalina_core.sedona_configuration.jdk_installer import install_jdk_if_needed
|
|
5
|
+
|
|
6
|
+
# compatibility with Pandas
|
|
7
|
+
pd.DataFrame.iteritems = pd.DataFrame.items
|
|
8
|
+
|
|
9
|
+
def _get_sedona_master_configuration(master_host: str) -> SparkSession:
    """
    Create a Sedona context on a Spark session attached to the given master.

    The session is named "Adalina", pulls the Sedona and geotools-wrapper packages
    and requests a single executor with one core and 2G of memory.

    :param master_host: The Spark master URL passed to `SparkSession.builder.master`.
    :return: The session returned by `SedonaContext.create`.
    """
    settings = (
        (
            "spark.jars.packages",
            "org.apache.sedona:sedona-spark-3.3_2.12:1.7.1,"
            "org.datasyslab:geotools-wrapper:1.7.1-28.5",
        ),
        (
            "spark.jars.repositories",
            "https://artifacts.unidata.ucar.edu/repository/unidata-all",
        ),
        ("spark.executor.instances", 1),
        ("spark.executor.cores", "1"),
        ("spark.executor.memory", "2G"),
    )

    builder = SparkSession.builder.appName("Adalina").master(master_host)
    for key, value in settings:
        builder = builder.config(key, value)

    return SedonaContext.create(builder.getOrCreate())
|
|
27
|
+
|
|
28
|
+
def _sedona_configuration() -> SparkSession:
    """
    Create a Sedona context with the default libadalina configuration.

    The session is named "Adalina", pulls the Sedona and geotools-wrapper packages
    and gives the driver 20g of memory; no Spark master is set explicitly.

    :return: The session returned by `SedonaContext.create`.
    """
    settings = (
        (
            "spark.jars.packages",
            "org.apache.sedona:sedona-spark-3.3_2.12:1.7.1,"
            "org.datasyslab:geotools-wrapper:1.7.1-28.5",
        ),
        ("spark.driver.memory", "20g"),
        (
            "spark.jars.repositories",
            "https://artifacts.unidata.ucar.edu/repository/unidata-all",
        ),
    )

    builder = SedonaContext.builder().appName("Adalina")
    for key, value in settings:
        builder = builder.config(key, value)

    return SedonaContext.create(builder.getOrCreate())
|
|
45
|
+
|
|
46
|
+
_sedona_context: SparkSession | None = None
|
|
47
|
+
|
|
48
|
+
def init_sedona_context(
    spark_master: str | None = None,
    spark: SparkSession | None = None
):
    """
    Set up the module-wide Sedona context used for spatial data processing.

    The context is chosen as follows:

    1. if `spark_master` is given, a new session connected to that master is created;
    2. otherwise, if `spark` is a `SparkSession`, Sedona is enabled on that session;
    3. otherwise a session with the default configuration is created.

    `spark_master` takes precedence when both arguments are provided.

    Before anything else, a compatible JRE is installed when the `JAVA_HOME`
    environment variable is not set.

    :param spark_master: The Spark master URL to connect to.
    :param spark: An existing SparkSession to build the Sedona context on.

    Example:
        # Initialize with default configuration
        init_sedona_context()

        # Initialize with specific Spark master
        init_sedona_context(spark_master="spark://localhost:7077")

        # Initialize with existing SparkSession
        spark = SparkSession.builder.getOrCreate()
        init_sedona_context(spark=spark)
    """
    global _sedona_context

    install_jdk_if_needed()

    if spark_master is not None:
        context = _get_sedona_master_configuration(spark_master)
    elif isinstance(spark, SparkSession):
        context = SedonaContext.create(spark)
    else:
        context = _sedona_configuration()

    # publish the context only once it has been created successfully
    _sedona_context = context
|
|
89
|
+
|
|
90
|
+
def get_sedona_context() -> SparkSession:
    """
    Return the Sedona context shared by all spatial operations of libadalina.

    When no context has been set up yet, `init_sedona_context` is called without
    arguments so that the default configuration is used.

    :return: The Sedona context as a SparkSession.
    """
    # the module variable is only read here; `init_sedona_context` is what rebinds it
    if _sedona_context is None:
        init_sedona_context()
    return _sedona_context
|
|
File without changes
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
3
|
+
class EPSGFormats(Enum):
    """
    EPSG coordinate reference systems known to libadalina.

    The value of each member is its numeric EPSG code.
    """
    EPSG4326 = 4326  # WGS84
    EPSG32632 = 32632  # UTM zone 32N

    @staticmethod
    def from_code(code: int) -> 'EPSGFormats':
        """
        Look up the member whose value equals the given EPSG code.

        :param code: The numeric EPSG code.
        :return: The matching member.
        :raises ValueError: If no member has this code.
        """
        matching = [epsg_format for epsg_format in EPSGFormats if epsg_format.value == code]
        if matching:
            return matching[0]
        raise ValueError(f"No EPSG format found for code {code}")

"""
Default EPSG format used in libadalina.

All DataFrame are converted upon reading and writing to this format.
"""
DEFAULT_EPSG = EPSGFormats.EPSG4326
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import geopandas as gpd
|
|
3
|
+
import pyspark.sql as ps
|
|
4
|
+
|
|
5
|
+
from libadalina_core.sedona_configuration.sedona_configuration import get_sedona_context
|
|
6
|
+
|
|
7
|
+
def to_spark_dataframe(df: pd.DataFrame | gpd.GeoDataFrame | ps.DataFrame) -> ps.DataFrame:
    """
    Convert a pandas DataFrame or a GeoPandas GeoDataFrame to a Spark DataFrame.
    If the input is already a Spark DataFrame, it will be returned as is.

    This function is useful for converting data to a format suitable for processing with Apache Sedona,
    however, each function of libadalina already converts the input DataFrame to a Spark DataFrame before processing.

    :param df: The DataFrame to convert, which can be a pandas DataFrame, a GeoPandas GeoDataFrame, or a Spark DataFrame.
    :return: A Spark DataFrame.
    :raises TypeError: If `df` is none of the supported DataFrame types.
    """
    if isinstance(df, ps.DataFrame):
        return df  # nothing to do here
    if isinstance(df, (gpd.GeoDataFrame, pd.DataFrame)):
        # The Sedona context is requested only now, so that unsupported input
        # is rejected without starting a Spark session
        return get_sedona_context().createDataFrame(df)
    raise TypeError(f"Unsupported type {type(df)}. Expected pandas, geopandas, or spark DataFrame.")
|
|
File without changes
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
from enum import Enum
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import geopandas as gpd
|
|
6
|
+
import pyspark.sql as ps
|
|
7
|
+
import pyspark.sql.functions as func
|
|
8
|
+
|
|
9
|
+
from libadalina_core.sedona_utils.utils import to_spark_dataframe
|
|
10
|
+
from sedona.sql import ST_Intersects, ST_Area, ST_Intersection, ST_Union, ST_Buffer, ST_GeometryType, ST_Dump
|
|
11
|
+
|
|
12
|
+
DataFrame = pd.DataFrame | gpd.GeoDataFrame | ps.DataFrame
|
|
13
|
+
|
|
14
|
+
def polygonize(df: DataFrame, radius_meters: float) -> ps.DataFrame:
    """
    Add a `polygonized_geometry` column where points and lines are turned into polygons.

    The value of the new column depends on the type of the `geometry` column:

    * null geometries stay null;
    * geometries whose type name contains `Point` are buffered by `radius_meters`;
    * geometries whose type name contains `LineString` become the union of a flat-capped
      left-side buffer and a flat-capped right-side buffer of `radius_meters`;
    * any other geometry is copied unchanged.

    Buffers are computed with the spheroid option of `ST_Buffer` enabled.

    :param df: The input table, with a column named `geometry`.
    :param radius_meters: The buffer distance passed to `ST_Buffer`.
    :return: A Spark DataFrame with all the input columns plus `polygonized_geometry`.
    """
    table = to_spark_dataframe(df)

    # Refer to the column by name: `df` may be a pandas/GeoPandas frame, whose
    # `geometry` attribute is not a Spark column
    geometry = func.col('geometry')
    geometry_type = ST_GeometryType(geometry)
    use_spheroid = func.lit(True)

    left_side = ST_Buffer(geometry, radius_meters, use_spheroid,
                          parameters=func.lit('endcap=flat side=left'))
    right_side = ST_Buffer(geometry, radius_meters, use_spheroid,
                           parameters=func.lit('endcap=flat side=right'))

    polygonized = (func
                   .when(geometry.isNull(), None)
                   .when(geometry_type.like('%Point%'),
                         ST_Buffer(geometry, radius_meters, use_spheroid))
                   .when(geometry_type.like('%LineString%'),
                         ST_Union(left_side, right_side))
                   .otherwise(geometry)
                   .alias('polygonized_geometry'))

    return table.select("*", polygonized)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def explode_multi_geometry(df: DataFrame) -> ps.DataFrame:
    """
    Split multi-part geometries into one row per part.

    For every input row the `geometry` column is expanded as follows:

    * geometries whose type name contains `Multi` produce one output row per
      element returned by `ST_Dump`;
    * any other non-null geometry produces a single output row;
    * rows with a null geometry produce no output row.

    The resulting geometry is stored in a new column named `exploded_geometry`.

    :param df: The input table, with a column named `geometry`.
    :return: A Spark DataFrame with all the input columns plus `exploded_geometry`.
    """
    table = to_spark_dataframe(df)

    # Refer to the column by name: `df` may be a pandas/GeoPandas frame, whose
    # `geometry` attribute is not a Spark column
    geometry = func.col('geometry')

    # Every branch yields an array (or null), so that the generator `explode` can be
    # the outermost expression: Spark does not accept generators nested in CASE WHEN
    parts = (func
             .when(geometry.isNull(), func.lit(None))
             .when(ST_GeometryType(geometry).like('%Multi%'), ST_Dump(geometry))
             .otherwise(func.array(geometry)))

    # `explode` emits no row for a null array, which drops the null geometries
    return table.select("*", func.explode(parts).alias('exploded_geometry'))
|
|
42
|
+
|
|
43
|
+
class AggregationType(Enum):
    """
    Aggregations supported by `spatial_aggregation`.

    The value of each member is the lowercase name of the aggregation, which is
    also what `str()` and `repr()` return.
    """
    COUNT = 'count'
    SUM = 'sum'
    AVG = 'avg'
    MIN = 'min'
    MAX = 'max'

    def to_spark_func(self):
        """
        Return the `pyspark.sql.functions` aggregate implementing this member.

        :return: One of `func.count`, `func.sum`, `func.avg`, `func.min`, `func.max`.
        """
        spark_functions = {
            AggregationType.COUNT: func.count,
            AggregationType.SUM: func.sum,
            AggregationType.AVG: func.avg,
            AggregationType.MIN: func.min,
            AggregationType.MAX: func.max,
        }
        # Default to count if none matched
        return spark_functions.get(self, func.count)

    def __str__(self):
        return self.value

    # repr() shows the bare value as well
    __repr__ = __str__
|
|
68
|
+
|
|
69
|
+
@dataclasses.dataclass
class AggregationFunction:
    """
    Description of one aggregation to be computed by `spatial_aggregation`.
    """
    # name of the column whose values are aggregated
    column: str
    # aggregation applied to the column
    aggregation_type: AggregationType
    # name of the output column; when None, `spatial_aggregation` uses "<aggregation>(<column>)"
    alias: str | None = None
    # name of a geometry column; when set, `spatial_aggregation` multiplies each value by
    # area(intersection of `geometry` and this column) / area(this column) before aggregating
    proportional: str | None = None
|
|
75
|
+
|
|
76
|
+
def spatial_aggregation(table: DataFrame, aggregate_functions: list[AggregationFunction]) -> ps.DataFrame:
    """
    Group the rows of a table by their `geometry` column and aggregate the other columns.

    The result contains, in this order:

    1. the first value of every column that is neither `geometry` nor the target of
       an aggregation;
    2. the aggregations without a `proportional` column;
    3. the aggregations with a `proportional` column, where each value is multiplied
       by the area of the intersection between `geometry` and the proportional
       geometry and divided by the area of the proportional geometry before being
       aggregated.

    Aggregations referring to a column that is not in the table are skipped.

    :param table: The input table, with a column named `geometry`.
    :param aggregate_functions: The aggregations to compute.
    :return: A Spark DataFrame with one row per distinct geometry.
    """
    table = to_spark_dataframe(table)

    available_columns = table.columns
    aggregated_names = {spec.column for spec in aggregate_functions}

    def output_name(spec: AggregationFunction) -> str:
        # explicit alias if given, "<aggregation>(<column>)" otherwise
        if spec.alias is None:
            return f"{spec.aggregation_type.value}({spec.column})"
        return spec.alias

    # from the columns for which is not specified an aggregation function, take the first value
    first_values = [
        func.first(name).alias(name)
        for name in available_columns
        if name != 'geometry' and name not in aggregated_names
    ]

    plain_aggregations = []
    proportional_aggregations = []
    for spec in aggregate_functions:
        if spec.column not in available_columns:
            continue

        spark_aggregate = spec.aggregation_type.to_spark_func()
        values = func.col(spec.column)

        if spec.proportional is None:
            plain_aggregations.append(spark_aggregate(values).alias(output_name(spec)))
            continue

        reference = func.col(spec.proportional)
        overlap_area = ST_Area(ST_Intersection(func.col('geometry'), reference))
        reference_area = ST_Area(reference)
        # multiply first, then divide, as a single left-to-right expression
        weighted_values = values * overlap_area / reference_area
        proportional_aggregations.append(spark_aggregate(weighted_values).alias(output_name(spec)))

    # Group by geometry and aggregate other columns
    return table.groupby(table.geometry).agg(
        *first_values,
        *plain_aggregations,
        *proportional_aggregations,
    )
|
|
111
|
+
|
|
112
|
+
class JoinType(Enum):
    """
    Join strategies accepted by `spatial_join`.

    The value of each member is the lowercase name of the join, which is also
    what `str()` and `repr()` return.
    """
    INNER = 'inner'
    LEFT = 'left'
    RIGHT = 'right'
    FULL = 'full'

    def __str__(self):
        return self.value

    # repr() shows the bare value as well
    __repr__ = __str__
|
|
123
|
+
|
|
124
|
+
def spatial_join(
        left_table: DataFrame,
        right_table: DataFrame,
        join_type: JoinType = JoinType.INNER,
        aggregate: bool = False,
        aggregate_functions: list[AggregationFunction] | None = None
) -> ps.DataFrame:
    """
    Join two tables on the intersection of their geometries.

    The `geometry` column of the left table is renamed to `geometry_left` and the
    one of the right table to `geometry_right`; rows are matched when the two
    geometries satisfy `ST_Intersects`.

    When `aggregate` is True the joined rows are additionally passed to
    `spatial_aggregation`, grouped by the geometry of the left table, which is
    exposed as `geometry` in the aggregated result.

    :param left_table: The left table, with a column named `geometry`.
    :param right_table: The right table, with a column named `geometry`.
    :param join_type: The kind of join to perform.
    :param aggregate: Whether to aggregate the joined rows.
    :param aggregate_functions: The aggregations to compute; required when `aggregate` is True.
    :return: The joined (and possibly aggregated) Spark DataFrame.
    :raises ValueError: If `aggregate` is True and `aggregate_functions` is None.
    """
    # validate the arguments before doing any Spark work
    if aggregate and aggregate_functions is None:
        raise ValueError("aggregate_functions must be provided when aggregate is True")

    left = to_spark_dataframe(left_table).withColumnRenamed('geometry', 'geometry_left')
    right = to_spark_dataframe(right_table).withColumnRenamed('geometry', 'geometry_right')

    result = left.join(
        right,
        on=ST_Intersects(func.col('geometry_left'), func.col('geometry_right')),
        how=join_type.value,
    )

    if aggregate:
        # `spatial_aggregation` groups by a column named `geometry`, which the join
        # has just renamed: restore the left geometry under that name first
        result = spatial_aggregation(
            result.withColumnRenamed('geometry_left', 'geometry'),
            aggregate_functions,
        )

    return result
|
|
147
|
+
|
|
File without changes
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import geopandas as gpd
|
|
3
|
+
import pyspark.sql as ps
|
|
4
|
+
from libadalina_core.sedona_utils.coordinate_formats import DEFAULT_EPSG
|
|
5
|
+
|
|
6
|
+
def dataframe_to_geopackage(df: pd.DataFrame | gpd.GeoDataFrame | ps.DataFrame, path: str):
    """
    Write a DataFrame to a GeoPackage file, in a layer named `dataframe`.

    The geometry of pandas and Spark DataFrames is read from the column named
    `geometry` and is assumed to be in libadalina default EPSG `DEFAULT_EPSG`.
    A GeoDataFrame without a CRS is assumed to be in `DEFAULT_EPSG` as well, while
    a GeoDataFrame with a CRS is converted to `DEFAULT_EPSG` before writing.

    :param df: The DataFrame to write, which can be a pandas DataFrame, a GeoPandas GeoDataFrame, or a Spark DataFrame.
    :param path: The path to the GeoPackage file where the DataFrame will be saved.
    :raises TypeError: If `df` is none of the supported DataFrame types.
    """
    # GeoDataFrame must be tested before pandas.DataFrame, which is its base class
    if isinstance(df, gpd.GeoDataFrame):
        if df.crs is None:
            geo_df = df.set_crs(epsg=DEFAULT_EPSG.value)
        else:
            geo_df = df.to_crs(epsg=DEFAULT_EPSG.value)
    elif isinstance(df, ps.DataFrame):
        geo_df = gpd.GeoDataFrame(df.toPandas(), geometry='geometry', crs=DEFAULT_EPSG.value)
    elif isinstance(df, pd.DataFrame):
        geo_df = gpd.GeoDataFrame(df, geometry='geometry', crs=DEFAULT_EPSG.value)
    else:
        raise TypeError(f"Unsupported type {type(df)}. Expected pandas DataFrame, geopandas GeoDataFrame, or spark DataFrame.")
    geo_df.to_file(path, layer='dataframe', driver="GPKG")
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: libadalina-core
|
|
3
|
+
Version: 1.0
|
|
4
|
+
Summary: A library for spatial joins of geographic data
|
|
5
|
+
Author-email: Marco Casazza <d.marcocasazza@gmail.com>, Alberto Ceselli <alberto.ceselli@unimi.it>, Marco Premoli <marco.premoli@unimi.it>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://gitlab.com/amelia_unimi/libadalina
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Requires-Python: ~=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: apache-sedona[spark]==1.7.1
|
|
16
|
+
Requires-Dist: pyspark==3.3.2
|
|
17
|
+
Requires-Dist: pandas==2.2.3
|
|
18
|
+
Requires-Dist: geopandas==1.0.1
|
|
19
|
+
Requires-Dist: shapely==2.1.1
|
|
20
|
+
Requires-Dist: install-jdk==1.1.0
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest==8.4.1; extra == "dev"
|
|
23
|
+
Requires-Dist: black; extra == "dev"
|
|
24
|
+
Requires-Dist: isort; extra == "dev"
|
|
25
|
+
Requires-Dist: sphinx; extra == "dev"
|
|
26
|
+
Requires-Dist: pydata-sphinx-theme; extra == "dev"
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# libadalina-core
|
|
30
|
+
|
|
31
|
+
A Python library for spatial data processing.
|
|
32
|
+
It makes it easier to work with geospatial data in Python by providing a high-level interface
|
|
33
|
+
to Apache Sedona, a powerful geospatial processing engine, and integrates nicely with other well-known libraries
|
|
34
|
+
such as *geopandas* and *pandas*.
|
|
35
|
+
|
|
36
|
+
## Installation
|
|
37
|
+
|
|
38
|
+
libadalina-core can be installed using pip:
|
|
39
|
+
```
|
|
40
|
+
pip install libadalina-core
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
If the `JAVA_HOME` environment variable is not set, a suitable JRE will be downloaded to `$HOME/.jre` and used automatically.
|
|
44
|
+
Not all JREs are supported: if you encounter issues with your own Java installation, unset `JAVA_HOME` so that the automatically installed version is used.
|
|
45
|
+
|
|
46
|
+
## Usage
|
|
47
|
+
|
|
48
|
+
You can find the documentation and examples at [libadalina-core documentation](https://libadalinacore-6b2a95.gitlab.io/).
|
|
49
|
+
|
|
50
|
+
## Features
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
* Reading and writing geospatial data from various formats
|
|
54
|
+
* Spatial joins between datasets
|
|
55
|
+
* Spatial aggregations
|
|
56
|
+
* Utilities for working with Apache Sedona
|
|
57
|
+
* Configuration helpers for setting up Apache Sedona
|
|
58
|
+
|
|
59
|
+
## Requirements
|
|
60
|
+
|
|
61
|
+
- Python 3.10
|
|
62
|
+
- Dependencies:
|
|
63
|
+
- apache-sedona[spark]
|
|
64
|
+
- pyspark
|
|
65
|
+
- pandas
|
|
66
|
+
- geopandas
|
|
67
|
+
- install-jdk
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
libadalina_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
libadalina_core/examples/example_hospitals_in_provinces.py,sha256=ytDOWOG6Tp0Tmu6jst-9QUmaEeYaxQfAxnm6DtSoVyM,2075
|
|
3
|
+
libadalina_core/examples/example_population_in_provinces.py,sha256=ost2ymqSPZYgMgjnh7f2bvlla8n9b2FmLtjrVU63Vq8,1807
|
|
4
|
+
libadalina_core/examples/example_population_served_by_hospitals.py,sha256=3M1F0GpQ2i6tR4ZjQ2dkm1vBq02OjvwAqZ3FQje7hP0,2833
|
|
5
|
+
libadalina_core/readers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
libadalina_core/readers/readers.py,sha256=PbUThxwu_kBSJMlpjjixvIG46G8i8hPpJIb6HoFOQoU,608
|
|
7
|
+
libadalina_core/sedona_configuration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
libadalina_core/sedona_configuration/jdk_installer.py,sha256=i59G9YiNJ9H0BGpbeGmlTiFT0vP_SfjlIFAJylknR3I,730
|
|
9
|
+
libadalina_core/sedona_configuration/sedona_configuration.py,sha256=qhHqL74CBEXDdol3YP-lklsCkZvuvzPXHgezfyhSiiw,3697
|
|
10
|
+
libadalina_core/sedona_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
libadalina_core/sedona_utils/coordinate_formats.py,sha256=Ye1npHwJT7fLST1zkwyE8867mwXK-77ZsqaSEIL8RQs,571
|
|
12
|
+
libadalina_core/sedona_utils/utils.py,sha256=IYBc3XSpXDxmSdndLj08Nv1SkZEh8nGKg99xWEmgv7I,1227
|
|
13
|
+
libadalina_core/spatial_join/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
libadalina_core/spatial_join/query_builder.py,sha256=G12nXQid0ZQbeLvytFNY78ynLmk5iKIp08kGSJ8Tk5o,5750
|
|
15
|
+
libadalina_core/writers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
+
libadalina_core/writers/writers.py,sha256=kFD-oZbEiirmlNMeccZ5DaYvtXd3LeBafCpGKuXFlwQ,1105
|
|
17
|
+
libadalina_core-1.0.dist-info/licenses/LICENSE,sha256=U4yJabEzK3cseBN2UTwArB1I1p0ExZhl2eLUcCz_pl8,1075
|
|
18
|
+
libadalina_core-1.0.dist-info/METADATA,sha256=afDUocHENbBj2qHhunRdtocaJe-4Q2H_twIKTdoxaJU,2179
|
|
19
|
+
libadalina_core-1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
20
|
+
libadalina_core-1.0.dist-info/entry_points.txt,sha256=tsKAZ3w781QnK-TreW6KdA0H0655k5dW2ayYMjsTxPk,56
|
|
21
|
+
libadalina_core-1.0.dist-info/top_level.txt,sha256=DspVcM_AHdB9K-Za5D4H_LHHHOB918nTuOLCtOpo-xA,16
|
|
22
|
+
libadalina_core-1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 University of Milan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
libadalina_core
|