pyspark-fluvius 0.1.0__py3-none-any.whl

@@ -0,0 +1,71 @@
+ """PySpark custom data sources for Fluvius Energy API.
+ 
+ This package provides PySpark data sources for reading energy and mandate data
+ from the Fluvius Energy API directly into Spark DataFrames.
+ 
+ Example:
+     ```python
+     from pyspark.sql import SparkSession
+     from pyspark_fluvius import register_datasources
+ 
+     spark = SparkSession.builder.getOrCreate()
+     register_datasources()  # Registers the fluvius.* data sources
+ 
+     # Read mandates
+     mandates_df = spark.read.format("fluvius.mandates") \\
+         .option("status", "Approved") \\
+         .load()
+ 
+     # Read energy data
+     energy_df = spark.read.format("fluvius.energy") \\
+         .option("ean", "541234567890123456") \\
+         .option("period_type", "readTime") \\
+         .option("granularity", "daily") \\
+         .option("from_date", "2024-01-01") \\
+         .option("to_date", "2024-01-31") \\
+         .load()
+     ```
+ """
+ 
+ from __future__ import annotations
+ 
+ from .datasources import FluviusEnergyDataSource, FluviusMandatesDataSource
+ from .schemas import ENERGY_SCHEMA, MANDATES_SCHEMA
+ 
+ __version__ = "0.1.0"
+ __all__ = [
+     "FluviusEnergyDataSource",
+     "FluviusMandatesDataSource",
+     "ENERGY_SCHEMA",
+     "MANDATES_SCHEMA",
+     "register_datasources",
+ ]
+ 
+ 
+ def register_datasources() -> None:
+     """Register Fluvius data sources with the active SparkSession.
+ 
+     This function registers both the fluvius.energy and fluvius.mandates data
+     sources with the current SparkSession. Call it after creating your
+     SparkSession.
+ 
+     Example:
+         ```python
+         from pyspark.sql import SparkSession
+         from pyspark_fluvius import register_datasources
+ 
+         spark = SparkSession.builder.getOrCreate()
+         register_datasources()
+ 
+         df = spark.read.format("fluvius.mandates").load()
+         ```
+     """
+     from pyspark.sql import SparkSession
+ 
+     spark = SparkSession.getActiveSession()
+     if spark is None:
+         raise RuntimeError(
+             "No active SparkSession found. "
+             "Create a SparkSession before calling register_datasources()."
+         )
+ 
+     spark.dataSource.register(FluviusEnergyDataSource)
+     spark.dataSource.register(FluviusMandatesDataSource)
@@ -0,0 +1,6 @@
+ """Converters from Pydantic models to Spark Rows."""
+ 
+ from .energy_converter import convert_energy_response
+ from .mandates_converter import convert_mandate
+ 
+ __all__ = ["convert_energy_response", "convert_mandate"]
@@ -0,0 +1,427 @@
+ """Convert energy models to Spark-compatible tuples."""
+ 
+ from __future__ import annotations
+ 
+ from datetime import datetime
+ from typing import TYPE_CHECKING
+ 
+ 
+ def _get_enum_value(value: object) -> str | None:
+     """Extract the string value from an enum, or coerce the value to a string."""
+     if value is None:
+         return None
+     if hasattr(value, "value"):
+         return value.value
+     return str(value)
+ 
+ 
+ if TYPE_CHECKING:
+     from fluvius_energy_api.models.energy import (
+         GetEnergyResponseApiDataResponse,
+         MeasurementDirection,
+         MeasurementTimeSlice,
+         MeasurementValue,
+         MeasurementValueSet,
+         PhysicalMeter,
+         SubHeadpoint,
+     )
+ 
+ # Tuple structure matches ENERGY_SCHEMA field order:
+ # 11 header fields + 4 directions * (4 fields for total + 5 registers * 3 fields) = 11 + 4 * 19 = 87 fields
+ EnergyTuple = tuple[
+     str | None,  # ean
+     str | None,  # energy_type
+     str | None,  # metering_type
+     datetime | None,  # measurement_start
+     datetime | None,  # measurement_end
+     str | None,  # granularity
+     str | None,  # meter_seq_number
+     str | None,  # meter_id
+     str | None,  # subheadpoint_ean
+     str | None,  # subheadpoint_type
+     str | None,  # subheadpoint_seq_number
+     # Offtake: total (4), day (3), night (3), reactive (3), inductive (3), capacitive (3) = 19 fields
+     float | None, str | None, str | None, str | None,  # offtake_total
+     float | None, str | None, str | None,  # offtake_day
+     float | None, str | None, str | None,  # offtake_night
+     float | None, str | None, str | None,  # offtake_reactive
+     float | None, str | None, str | None,  # offtake_inductive
+     float | None, str | None, str | None,  # offtake_capacitive
+     # Injection: 19 fields
+     float | None, str | None, str | None, str | None,  # injection_total
+     float | None, str | None, str | None,  # injection_day
+     float | None, str | None, str | None,  # injection_night
+     float | None, str | None, str | None,  # injection_reactive
+     float | None, str | None, str | None,  # injection_inductive
+     float | None, str | None, str | None,  # injection_capacitive
+     # Production: 19 fields
+     float | None, str | None, str | None, str | None,  # production_total
+     float | None, str | None, str | None,  # production_day
+     float | None, str | None, str | None,  # production_night
+     float | None, str | None, str | None,  # production_reactive
+     float | None, str | None, str | None,  # production_inductive
+     float | None, str | None, str | None,  # production_capacitive
+     # Auxiliary: 19 fields
+     float | None, str | None, str | None, str | None,  # auxiliary_total
+     float | None, str | None, str | None,  # auxiliary_day
+     float | None, str | None, str | None,  # auxiliary_night
+     float | None, str | None, str | None,  # auxiliary_reactive
+     float | None, str | None, str | None,  # auxiliary_inductive
+     float | None, str | None, str | None,  # auxiliary_capacitive
+ ]
+ 
+ 
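Because the 87-slot tuple is purely positional, a quick check against the schema helps when editing this layout. A throwaway sketch (assuming `ENERGY_SCHEMA` is importable from `pyspark_fluvius.schemas`, as in the package `__init__`):

```python
# Throwaway check: EnergyTuple must line up positionally with ENERGY_SCHEMA.
from pyspark_fluvius.schemas import ENERGY_SCHEMA

assert len(ENERGY_SCHEMA.fieldNames()) == 11 + 4 * 19  # 87 columns
print(ENERGY_SCHEMA.fieldNames()[:11])  # the header columns, in tuple order
```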
+ def _extract_measurement_value_with_gas(
+     mv: MeasurementValue | None,
+ ) -> tuple[float | None, str | None, str | None, str | None]:
+     """Extract fields from a MeasurementValue, including the gas conversion factor."""
+     if mv is None:
+         return None, None, None, None
+     return (
+         mv.value,
+         _get_enum_value(mv.unit),
+         _get_enum_value(mv.validation_state),
+         _get_enum_value(mv.gas_conversion_factor),
+     )
+ 
+ 
+ def _extract_measurement_value(
+     mv: MeasurementValue | None,
+ ) -> tuple[float | None, str | None, str | None]:
+     """Extract fields from a MeasurementValue, without the gas conversion factor."""
+     if mv is None:
+         return None, None, None
+     return (
+         mv.value,
+         _get_enum_value(mv.unit),
+         _get_enum_value(mv.validation_state),
+     )
+ 
+ 
+ def _extract_value_set(vs: MeasurementValueSet | None) -> tuple:
+     """Extract all measurement values from a MeasurementValueSet.
+ 
+     Returns 19 fields: total (4) + day (3) + night (3) + reactive (3) + inductive (3) + capacitive (3).
+ 
+     If total is not provided but day and night are, computes total = day + night.
+     """
+     if vs is None:
+         return (None,) * 19
+ 
+     # Extract raw values
+     total_fields = _extract_measurement_value_with_gas(vs.total)
+     day_fields = _extract_measurement_value(vs.day)
+     night_fields = _extract_measurement_value(vs.night)
+ 
+     # Compute total from day + night if total is missing
+     total_value, total_unit, total_validation, total_gas = total_fields
+     day_value, day_unit, day_validation = day_fields
+     night_value, night_unit, night_validation = night_fields
+ 
+     if total_value is None and day_value is not None and night_value is not None:
+         total_value = day_value + night_value
+         total_unit = day_unit  # Use day's unit (should match night's)
+         total_validation = day_validation  # Use day's validation state
+         total_fields = (total_value, total_unit, total_validation, total_gas)
+ 
+     return (
+         *total_fields,
+         *day_fields,
+         *night_fields,
+         *_extract_measurement_value(vs.reactive),
+         *_extract_measurement_value(vs.inductive),
+         *_extract_measurement_value(vs.capacitive),
+     )
+ 
+ 
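To make the total fallback concrete, a tiny illustration with made-up numbers (hypothetical values, not API output):

```python
# Hypothetical register readings as (value, unit, validation_state) triples.
day = (1.2, "kWh", "Valid")
night = (0.8, "kWh", "Valid")

# When the API omits `total`, _extract_value_set computes day + night and
# reuses day's unit and validation state; the gas conversion factor stays None.
total = (day[0] + night[0], day[1], day[2], None)
assert total == (2.0, "kWh", "Valid", None)
```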
+ def _extract_measurements(
+     directions: list[MeasurementDirection] | None,
+ ) -> tuple:
+     """Extract all measurement values from directions.
+ 
+     Returns 76 fields: 4 directions * 19 fields each.
+     """
+     if not directions:
+         return (None,) * 76
+ 
+     # Only the first entry is used; responses typically contain a single
+     # direction, and any additional entries are ignored.
+     direction = directions[0]
+ 
+     return (
+         *_extract_value_set(direction.offtake),
+         *_extract_value_set(direction.injection),
+         *_extract_value_set(direction.production),
+         *_extract_value_set(direction.auxiliary),
+     )
+ 
+ 
+ def _process_time_slice(
+     time_slice: MeasurementTimeSlice,
+     ean: str | None,
+     energy_type: str | None,
+     metering_type: str,
+     granularity: str,
+     meter_seq_number: str | None = None,
+     meter_id: str | None = None,
+     subheadpoint_ean: str | None = None,
+     subheadpoint_type: str | None = None,
+     subheadpoint_seq_number: str | None = None,
+ ) -> EnergyTuple:
+     """Convert a single time slice to a tuple."""
+     measurements = _extract_measurements(time_slice.measurements)
+ 
+     return (
+         ean,
+         energy_type,
+         metering_type,
+         time_slice.start,
+         time_slice.end,
+         granularity,
+         meter_seq_number,
+         meter_id,
+         subheadpoint_ean,
+         subheadpoint_type,
+         subheadpoint_seq_number,
+         *measurements,
+     )
+ 
+ 
+ def _process_energy_list(
+     energy_list: list[MeasurementTimeSlice] | None,
+     ean: str | None,
+     energy_type: str | None,
+     metering_type: str,
+     granularity: str,
+     meter_seq_number: str | None = None,
+     meter_id: str | None = None,
+     subheadpoint_ean: str | None = None,
+     subheadpoint_type: str | None = None,
+     subheadpoint_seq_number: str | None = None,
+ ) -> list[EnergyTuple]:
+     """Process a list of time slices."""
+     if not energy_list:
+         return []
+ 
+     return [
+         _process_time_slice(
+             ts,
+             ean,
+             energy_type,
+             metering_type,
+             granularity,
+             meter_seq_number,
+             meter_id,
+             subheadpoint_ean,
+             subheadpoint_type,
+             subheadpoint_seq_number,
+         )
+         for ts in energy_list
+     ]
+ 
+ 
+ def _process_physical_meter(
+     meter: PhysicalMeter,
+     ean: str | None,
+     energy_type: str | None,
+     metering_type: str,
+ ) -> list[EnergyTuple]:
+     """Process energy data from a physical meter."""
+     results: list[EnergyTuple] = []
+ 
+     results.extend(
+         _process_energy_list(
+             meter.daily_energy,
+             ean,
+             energy_type,
+             metering_type,
+             "daily",
+             meter.seq_number,
+             meter.meter_id,
+         )
+     )
+     results.extend(
+         _process_energy_list(
+             meter.hourly_energy,
+             ean,
+             energy_type,
+             metering_type,
+             "hourly",
+             meter.seq_number,
+             meter.meter_id,
+         )
+     )
+     results.extend(
+         _process_energy_list(
+             meter.quarter_hourly_energy,
+             ean,
+             energy_type,
+             metering_type,
+             "quarter_hourly",
+             meter.seq_number,
+             meter.meter_id,
+         )
+     )
+ 
+     return results
+ 
+ 
+ def _process_subheadpoint(
+     sub: SubHeadpoint,
+     ean: str | None,
+     energy_type: str | None,
+     metering_type: str,
+ ) -> list[EnergyTuple]:
+     """Process energy data from a subheadpoint."""
+     results: list[EnergyTuple] = []
+ 
+     subheadpoint_type = sub.type_discriminator.replace("submetering-", "")
+ 
+     results.extend(
+         _process_energy_list(
+             sub.daily_energy,
+             ean,
+             energy_type,
+             metering_type,
+             "daily",
+             subheadpoint_ean=sub.ean,
+             subheadpoint_type=subheadpoint_type,
+             subheadpoint_seq_number=sub.seq_number,
+         )
+     )
+     results.extend(
+         _process_energy_list(
+             sub.hourly_energy,
+             ean,
+             energy_type,
+             metering_type,
+             "hourly",
+             subheadpoint_ean=sub.ean,
+             subheadpoint_type=subheadpoint_type,
+             subheadpoint_seq_number=sub.seq_number,
+         )
+     )
+     results.extend(
+         _process_energy_list(
+             sub.quarter_hourly_energy,
+             ean,
+             energy_type,
+             metering_type,
+             "quarter_hourly",
+             subheadpoint_ean=sub.ean,
+             subheadpoint_type=subheadpoint_type,
+             subheadpoint_seq_number=sub.seq_number,
+         )
+     )
+ 
+     return results
+ 
+ 
+ def convert_energy_response(response: GetEnergyResponseApiDataResponse) -> list[EnergyTuple]:
+     """Convert an energy API response to a list of tuples for Spark Rows.
+ 
+     This function flattens the nested energy response structure into rows
+     suitable for a Spark DataFrame.
+ 
+     Args:
+         response: The energy API response from fluvius-energy-api.
+ 
+     Returns:
+         A list of tuples matching the ENERGY_SCHEMA field order.
+     """
+     results: list[EnergyTuple] = []
+ 
+     if not response.data or not response.data.headpoint:
+         return results
+ 
+     headpoint = response.data.headpoint
+     ean = headpoint.ean
+     energy_type = _get_enum_value(headpoint.energy_type)
+     metering_type = headpoint.type_discriminator
+ 
+     # Process based on metering type
+     if metering_type == "metering-on-headpoint":
+         # Energy data sits directly on the headpoint
+         results.extend(
+             _process_energy_list(
+                 headpoint.daily_energy,  # type: ignore[attr-defined]
+                 ean,
+                 energy_type,
+                 metering_type,
+                 "daily",
+             )
+         )
+         results.extend(
+             _process_energy_list(
+                 headpoint.hourly_energy,  # type: ignore[attr-defined]
+                 ean,
+                 energy_type,
+                 metering_type,
+                 "hourly",
+             )
+         )
+         results.extend(
+             _process_energy_list(
+                 headpoint.quarter_hourly_energy,  # type: ignore[attr-defined]
+                 ean,
+                 energy_type,
+                 metering_type,
+                 "quarter_hourly",
+             )
+         )
+ 
+         # Process subheadpoints if present
+         sub_headpoints = getattr(headpoint, "sub_headpoints", None)
+         if sub_headpoints:
+             for sub in sub_headpoints:
+                 results.extend(_process_subheadpoint(sub, ean, energy_type, metering_type))
+ 
+     elif metering_type == "metering-on-meter":
+         # Energy data sits on physical meters
+         physical_meters = getattr(headpoint, "physical_meters", None)
+         if physical_meters:
+             for meter in physical_meters:
+                 results.extend(_process_physical_meter(meter, ean, energy_type, metering_type))
+ 
+     elif metering_type == "metering-on-headpoint-and-meter":
+         # Both headpoint-level and meter-level data
+         results.extend(
+             _process_energy_list(
+                 headpoint.daily_energy,  # type: ignore[attr-defined]
+                 ean,
+                 energy_type,
+                 metering_type,
+                 "daily",
+             )
+         )
+         results.extend(
+             _process_energy_list(
+                 headpoint.hourly_energy,  # type: ignore[attr-defined]
+                 ean,
+                 energy_type,
+                 metering_type,
+                 "hourly",
+             )
+         )
+         results.extend(
+             _process_energy_list(
+                 headpoint.quarter_hourly_energy,  # type: ignore[attr-defined]
+                 ean,
+                 energy_type,
+                 metering_type,
+                 "quarter_hourly",
+             )
+         )
+ 
+         # Process physical meters
+         physical_meters = getattr(headpoint, "physical_meters", None)
+         if physical_meters:
+             for meter in physical_meters:
+                 results.extend(_process_physical_meter(meter, ean, energy_type, metering_type))
+ 
+         # Process subheadpoints
+         sub_headpoints = getattr(headpoint, "sub_headpoints", None)
+         if sub_headpoints:
+             for sub in sub_headpoints:
+                 results.extend(_process_subheadpoint(sub, ean, energy_type, metering_type))
+ 
+     return results
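A usage sketch for running the converter outside the data source path (hedged: `response` stands in for an already-fetched `GetEnergyResponseApiDataResponse`; the import paths follow the package layout shown above):

```python
from pyspark.sql import SparkSession
from pyspark_fluvius.converters import convert_energy_response
from pyspark_fluvius.schemas import ENERGY_SCHEMA

spark = SparkSession.builder.getOrCreate()

rows = convert_energy_response(response)  # `response`: hypothetical fetched API response
df = spark.createDataFrame(rows, schema=ENERGY_SCHEMA)  # 87-column DataFrame
```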
@@ -0,0 +1,52 @@
+ """Convert mandate models to Spark-compatible tuples."""
+ 
+ from __future__ import annotations
+ 
+ from datetime import datetime
+ from typing import TYPE_CHECKING
+ 
+ if TYPE_CHECKING:
+     from fluvius_energy_api.models.mandate import Mandate
+ 
+ MandateTuple = tuple[
+     str | None,  # reference_number
+     str | None,  # status
+     str | None,  # ean
+     str | None,  # energy_type
+     datetime | None,  # data_period_from
+     datetime | None,  # data_period_to
+     str | None,  # data_service_type
+     datetime | None,  # mandate_expiration_date
+     str | None,  # renewal_status
+ ]
+ 
+ 
+ def _get_enum_value(value: object) -> str | None:
+     """Extract the string value from an enum, or coerce the value to a string."""
+     if value is None:
+         return None
+     if hasattr(value, "value"):
+         return value.value
+     return str(value)
+ 
+ 
+ def convert_mandate(mandate: Mandate) -> MandateTuple:
+     """Convert a Mandate Pydantic model to a tuple for a Spark Row.
+ 
+     Args:
+         mandate: The Mandate model from fluvius-energy-api.
+ 
+     Returns:
+         A tuple matching the MANDATES_SCHEMA field order.
+     """
+     return (
+         mandate.reference_number,
+         _get_enum_value(mandate.status),
+         mandate.ean,
+         _get_enum_value(mandate.energy_type),
+         mandate.data_period_from,
+         mandate.data_period_to,
+         _get_enum_value(mandate.data_service_type),
+         mandate.mandate_expiration_date,
+         _get_enum_value(mandate.renewal_status),
+     )
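The matching sketch for mandates (hedged: `mandates` is a hypothetical list of already-fetched `Mandate` models):

```python
from pyspark.sql import SparkSession
from pyspark_fluvius.converters import convert_mandate
from pyspark_fluvius.schemas import MANDATES_SCHEMA

spark = SparkSession.builder.getOrCreate()

# `mandates`: hypothetical list of fluvius-energy-api Mandate models.
df = spark.createDataFrame([convert_mandate(m) for m in mandates], schema=MANDATES_SCHEMA)
```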
@@ -0,0 +1,6 @@
+ """Fluvius PySpark data sources."""
+ 
+ from .energy import FluviusEnergyDataSource
+ from .mandates import FluviusMandatesDataSource
+ 
+ __all__ = ["FluviusEnergyDataSource", "FluviusMandatesDataSource"]
@@ -0,0 +1,100 @@
+ """Fluvius Energy data source for PySpark."""
+ 
+ from __future__ import annotations
+ 
+ from pyspark.sql.datasource import DataSource, DataSourceReader
+ from pyspark.sql.types import StructType
+ 
+ from ..readers.energy_reader import FluviusEnergyReader
+ from ..schemas.energy_schema import ENERGY_SCHEMA
+ 
+ 
+ class FluviusEnergyDataSource(DataSource):
+     """PySpark data source for reading Fluvius energy measurements.
+ 
+     This data source allows you to read energy measurement data from the
+     Fluvius Energy API directly into a Spark DataFrame.
+ 
+     Required Options:
+         - ean: GSRN EAN code that identifies the installation
+         - period_type: Type of period ("readTime" or "insertTime")
+ 
+     Optional Options:
+         Credential options (if not using environment variables):
+             - subscription_key: Azure API Management subscription key
+             - client_id: Azure AD application (client) ID
+             - tenant_id: Azure AD tenant ID
+             - scope: OAuth2 scope
+             - data_access_contract_number: Data access contract number
+             - certificate_thumbprint: Certificate thumbprint (for certificate auth)
+             - private_key: Private key in PEM format (for certificate auth)
+             - client_secret: Client secret (for secret auth)
+             - credentials_prefix: Environment variable prefix (default: "FLUVIUS")
+ 
+         Environment options:
+             - environment: "sandbox" (default) or "production"
+ 
+         Filter options:
+             - reference_number: Custom reference number
+             - granularity: Granularity filter (e.g., "daily", "hourly_quarterhourly")
+             - complex_energy_types: Types of complex energy (e.g., "active,reactive")
+             - from_date: Start date (ISO format, e.g., "2024-01-01")
+             - to_date: End date (ISO format, e.g., "2024-01-31")
+ 
+     Example:
+         ```python
+         df = spark.read.format("fluvius.energy") \\
+             .option("ean", "541234567890123456") \\
+             .option("period_type", "readTime") \\
+             .option("granularity", "daily") \\
+             .option("from_date", "2024-01-01") \\
+             .option("to_date", "2024-01-31") \\
+             .load()
+         ```
+ 
+     Schema:
+         The returned DataFrame has the following columns:
+         - ean: EAN code of the installation
+         - energy_type: "E" (electricity) or "G" (gas)
+         - metering_type: Type of metering installation
+         - measurement_start: Start time of the measurement period
+         - measurement_end: End time of the measurement period
+         - granularity: Measurement granularity (daily, hourly, quarter_hourly)
+         - meter_seq_number: Physical meter sequence number (if applicable)
+         - meter_id: Physical meter ID (if applicable)
+         - subheadpoint_ean: Subheadpoint EAN (for submetering)
+         - subheadpoint_type: Type of subheadpoint
+         - subheadpoint_seq_number: Subheadpoint sequence number
+         - offtake_total_value/unit/validation_state/gas_conversion_factor
+         - offtake_day_value/unit/validation_state
+         - offtake_night_value/unit/validation_state
+         - injection_total_value/unit/validation_state/gas_conversion_factor
+         - injection_day_value/unit/validation_state
+         - injection_night_value/unit/validation_state
+         - production_total_value/unit/validation_state/gas_conversion_factor
+         - ...plus the matching reactive/inductive/capacitive registers for each
+           direction and the same register columns for the auxiliary direction
+           (87 columns in total, in EnergyTuple order)
+     """
+ 
+     @classmethod
+     def name(cls) -> str:
+         """Return the short name of this data source."""
+         return "fluvius.energy"
+ 
+     def schema(self) -> StructType:
+         """Return the schema for energy data."""
+         return ENERGY_SCHEMA
+ 
+     def reader(self, schema: StructType) -> DataSourceReader:
+         """Return a reader for energy data.
+ 
+         Args:
+             schema: The schema to use (typically the default ENERGY_SCHEMA).
+ 
+         Returns:
+             A FluviusEnergyReader instance.
+         """
+         return FluviusEnergyReader(schema, self.options)
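For readers new to the PySpark 4 Python data source API this class implements: Spark resolves `format("fluvius.energy")` through `name()`, asks `schema()` for the DataFrame schema, then calls `reader()` and pulls rows from its `read()` method. A minimal, self-contained toy source showing that contract (illustrative only, not this package's reader):

```python
from pyspark.sql import SparkSession
from pyspark.sql.datasource import DataSource, DataSourceReader
from pyspark.sql.types import StructType


class _EchoReader(DataSourceReader):
    def read(self, partition):
        # Yield one tuple per row, matching the declared schema.
        yield ("541234567890123456",)


class EchoDataSource(DataSource):
    @classmethod
    def name(cls) -> str:
        return "echo"

    def schema(self):
        return "ean string"  # a DDL string; a StructType also works

    def reader(self, schema: StructType) -> DataSourceReader:
        return _EchoReader()


spark = SparkSession.builder.getOrCreate()
spark.dataSource.register(EchoDataSource)
spark.read.format("echo").load().show()  # one row: 541234567890123456
```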