libxrk 0.6.0__cp310-cp310-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
libxrk/base.py ADDED
@@ -0,0 +1,337 @@
1
+ # Copyright 2024, Scott Smith. MIT License (see LICENSE).
2
+
3
+ from collections.abc import Sequence
4
+ from dataclasses import dataclass
5
+ import heapq
6
+ from itertools import groupby
7
+ import sys
8
+ import pyarrow as pa
9
+ import pyarrow.compute as pc
10
+ import numpy as np
11
+
12
# We use array and memoryview for efficient operations, but that
# assumes the sizes we expect match the file format. Let's verify a
# few of those assumptions here. Our use of struct is safe since it
# has tighter control over byte order and sizing.
# NOTE: a bare `assert` is stripped when Python runs with -O, which would
# silently skip this platform check; raise explicitly so it always runs.
if sys.byteorder != "little":
    raise RuntimeError("little-endian platform required by the XRK file parser")
17
+
18
+
19
@dataclass(eq=False)
class LogFile:
    """
    Container for parsed XRK/XRZ telemetry data.

    Attributes:
        channels: Dict mapping channel names to PyArrow tables. Each table has
            'timecodes' (int64, ms) and '<channel_name>' columns. Channel metadata
            (units, dec_pts, interpolate) stored in schema.field.metadata with bytes keys.
        laps: PyArrow table with columns: num (int), start_time (int), end_time (int).
            Times are in milliseconds.
        metadata: Dict of session metadata (racer, vehicle, venue, etc.)
        file_name: Original filename or "<bytes>" if loaded from bytes.

    Example:
        >>> log = aim_xrk('file.xrk')
        >>> log.channels['Engine RPM'].to_pandas()  # Single channel
        >>> log.get_channels_as_table().to_pandas()  # All merged
    """

    # One single-channel table per channel name; each table carries its own
    # 'timecodes' column, so channels may have different sample rates.
    channels: dict[str, pa.Table]
    # Lap boundaries (num / start_time / end_time), times in milliseconds.
    laps: pa.Table
    # Free-form session metadata (racer, vehicle, venue, ...).
    metadata: dict[str, str]
    # Source file path, or "<bytes>" when parsed from an in-memory buffer.
    file_name: str
44
+ def get_channels_as_table(self) -> pa.Table:
45
+ """
46
+ Merge all channels into a single PyArrow table with full outer join on timestamps.
47
+
48
+ For channels with interpolate="True" metadata, performs linear interpolation for null values.
49
+ For other channels, fills nulls with the previous non-null value (forward fill).
50
+ After filling, any remaining leading nulls are backward filled with the first available value.
51
+
52
+ Returns:
53
+ A PyArrow table with a 'timecodes' column and one column per channel.
54
+ Missing values are interpolated or forward-filled based on channel metadata.
55
+ Leading nulls are backward filled to ensure no nulls remain.
56
+ Column metadata is preserved.
57
+ """
58
+ if not self.channels:
59
+ # Return an empty table with just timecodes column if no channels
60
+ return pa.table({"timecodes": pa.array([], type=pa.int64())})
61
+
62
+ # Compute union of all channel timecodes using k-way merge (O(N) vs O(N log N) for sort)
63
+ # Each channel's timecodes are already sorted, so we merge and deduplicate in one pass
64
+ timecode_iterators = [
65
+ channel_table.column("timecodes").to_pylist()
66
+ for channel_table in self.channels.values()
67
+ ]
68
+ merged = heapq.merge(*timecode_iterators)
69
+ unique_timecodes = [k for k, _ in groupby(merged)]
70
+ union_timecodes = pa.array(unique_timecodes, type=pa.int64())
71
+
72
+ # Resample all channels to the union timecodes
73
+ resampled = self.resample_to_timecodes(union_timecodes)
74
+
75
+ # Build merged table from resampled channels (simple horizontal concatenation)
76
+ channel_names = sorted(resampled.channels.keys())
77
+
78
+ # Collect metadata for restoration
79
+ channel_metadata = {}
80
+ for name in channel_names:
81
+ field = resampled.channels[name].schema.field(name)
82
+ if field.metadata:
83
+ channel_metadata[name] = field.metadata
84
+
85
+ # Build the result table
86
+ columns_dict = {"timecodes": union_timecodes}
87
+ for name in channel_names:
88
+ columns_dict[name] = resampled.channels[name].column(name)
89
+
90
+ result = pa.table(columns_dict)
91
+
92
+ # Restore schema with metadata
93
+ if channel_metadata:
94
+ new_fields = []
95
+ for field in result.schema:
96
+ if field.name in channel_metadata:
97
+ new_fields.append(field.with_metadata(channel_metadata[field.name]))
98
+ else:
99
+ new_fields.append(field)
100
+ new_schema = pa.schema(new_fields)
101
+ result = result.cast(new_schema)
102
+
103
+ return result
104
+
105
+ def select_channels(self, channel_names: Sequence[str]) -> "LogFile":
106
+ """
107
+ Create a new LogFile with only the specified channels.
108
+
109
+ Args:
110
+ channel_names: Sequence of channel names to include.
111
+
112
+ Returns:
113
+ New LogFile containing only the specified channels.
114
+
115
+ Raises:
116
+ KeyError: If any channel name is not found.
117
+
118
+ Example:
119
+ >>> log = aim_xrk('session.xrk')
120
+ >>> gps_log = log.select_channels(['GPS Latitude', 'GPS Longitude', 'GPS Speed'])
121
+ >>> print(gps_log.channels.keys())
122
+ """
123
+ missing = set(channel_names) - set(self.channels.keys())
124
+ if missing:
125
+ raise KeyError(f"Channels not found: {sorted(missing)}")
126
+
127
+ new_channels = {name: self.channels[name] for name in channel_names}
128
+ return LogFile(
129
+ channels=new_channels,
130
+ laps=self.laps,
131
+ metadata=self.metadata,
132
+ file_name=self.file_name,
133
+ )
134
+
135
+ def filter_by_time_range(
136
+ self,
137
+ start_time: int,
138
+ end_time: int,
139
+ channel_names: Sequence[str] | None = None,
140
+ ) -> "LogFile":
141
+ """
142
+ Filter channels to a time range [start_time, end_time) at native sample rates.
143
+
144
+ Args:
145
+ start_time: Start time in milliseconds (inclusive).
146
+ end_time: End time in milliseconds (exclusive).
147
+ channel_names: Optional sequence of channel names to include. If None, all channels.
148
+
149
+ Returns:
150
+ New LogFile with channels filtered to the time range.
151
+
152
+ Example:
153
+ >>> log = aim_xrk('session.xrk')
154
+ >>> segment = log.filter_by_time_range(60000, 120000)
155
+ >>> print(segment.channels['Engine RPM'].num_rows)
156
+ """
157
+ source = self.select_channels(channel_names) if channel_names is not None else self
158
+
159
+ new_channels = {}
160
+ for name, channel_table in source.channels.items():
161
+ timecodes = channel_table.column("timecodes")
162
+
163
+ # Filter to [start_time, end_time)
164
+ mask = pc.and_(
165
+ pc.greater_equal(timecodes, start_time),
166
+ pc.less(timecodes, end_time),
167
+ )
168
+ new_channels[name] = channel_table.filter(mask)
169
+
170
+ # Filter laps to only those overlapping with the time range
171
+ laps_start = self.laps.column("start_time")
172
+ laps_end = self.laps.column("end_time")
173
+
174
+ # A lap overlaps if: lap_start < end_time AND lap_end > start_time
175
+ laps_mask = pc.and_(
176
+ pc.less(laps_start, end_time),
177
+ pc.greater(laps_end, start_time),
178
+ )
179
+ new_laps = self.laps.filter(laps_mask)
180
+
181
+ return LogFile(
182
+ channels=new_channels,
183
+ laps=new_laps,
184
+ metadata=self.metadata,
185
+ file_name=self.file_name,
186
+ )
187
+
188
+ def filter_by_lap(
189
+ self,
190
+ lap_num: int,
191
+ channel_names: Sequence[str] | None = None,
192
+ ) -> "LogFile":
193
+ """
194
+ Filter channels to a specific lap's time range.
195
+
196
+ Args:
197
+ lap_num: The lap number to filter to.
198
+ channel_names: Optional sequence of channel names to include. If None, all channels.
199
+
200
+ Returns:
201
+ New LogFile with channels filtered to the lap's time range.
202
+
203
+ Raises:
204
+ ValueError: If lap_num is not found in the laps table.
205
+
206
+ Example:
207
+ >>> log = aim_xrk('session.xrk')
208
+ >>> lap5 = log.filter_by_lap(5, ['GPS Speed', 'Engine RPM'])
209
+ >>> df = lap5.get_channels_as_table().to_pandas()
210
+ """
211
+ lap_nums = self.laps.column("num").to_pylist()
212
+ if lap_num not in lap_nums:
213
+ raise ValueError(f"Lap {lap_num} not found. Available laps: {lap_nums}")
214
+
215
+ lap_idx = lap_nums.index(lap_num)
216
+ start_time = self.laps.column("start_time")[lap_idx].as_py()
217
+ end_time = self.laps.column("end_time")[lap_idx].as_py()
218
+
219
+ return self.filter_by_time_range(int(start_time), int(end_time), channel_names)
220
+
221
+ def resample_to_timecodes(
222
+ self,
223
+ timecodes: pa.Array,
224
+ channel_names: Sequence[str] | None = None,
225
+ ) -> "LogFile":
226
+ """
227
+ Resample all channels to a target timebase.
228
+
229
+ For channels with interpolate="True" metadata, performs linear interpolation.
230
+ For other channels, uses forward-fill then backward-fill for leading nulls.
231
+
232
+ Args:
233
+ timecodes: Target timecodes array (int64, milliseconds) to resample to.
234
+ channel_names: Optional sequence of channel names to include. If None, all channels.
235
+
236
+ Returns:
237
+ New LogFile with all channels resampled to the target timecodes.
238
+
239
+ Example:
240
+ >>> log = aim_xrk('session.xrk')
241
+ >>> target = pa.array(range(0, 100000, 100), type=pa.int64())
242
+ >>> resampled = log.resample_to_timecodes(target)
243
+ """
244
+ source = self.select_channels(channel_names) if channel_names is not None else self
245
+
246
+ target_timecodes_np = timecodes.to_numpy()
247
+ new_channels = {}
248
+
249
+ for name, channel_table in source.channels.items():
250
+ field = channel_table.schema.field(name)
251
+ channel_timecodes = channel_table.column("timecodes").to_numpy()
252
+ channel_values = channel_table.column(name).to_numpy(zero_copy_only=False)
253
+
254
+ # Check if we should interpolate
255
+ should_interpolate = False
256
+ if field.metadata:
257
+ interpolate_value = field.metadata.get(b"interpolate", b"").decode("utf-8")
258
+ should_interpolate = interpolate_value == "True"
259
+
260
+ if should_interpolate:
261
+ # Linear interpolation using numpy
262
+ # np.interp handles extrapolation by extending edge values
263
+ resampled_values = np.interp(
264
+ target_timecodes_np,
265
+ channel_timecodes,
266
+ channel_values,
267
+ )
268
+ else:
269
+ # Forward-fill approach using searchsorted
270
+ # Find the index of the largest timecode <= each target timecode
271
+ indices = np.searchsorted(channel_timecodes, target_timecodes_np, side="right") - 1
272
+
273
+ # Handle leading nulls: where target is before first source timecode
274
+ leading_mask = indices < 0
275
+
276
+ # Clamp indices to valid range
277
+ indices = np.clip(indices, 0, len(channel_values) - 1)
278
+
279
+ resampled_values = channel_values[indices]
280
+
281
+ # For leading values (before first source timecode), backward fill
282
+ if np.any(leading_mask):
283
+ resampled_values = resampled_values.copy()
284
+ resampled_values[leading_mask] = channel_values[0]
285
+
286
+ # Build new channel table preserving metadata
287
+ new_table = pa.table(
288
+ {
289
+ "timecodes": timecodes,
290
+ name: pa.array(resampled_values, type=field.type),
291
+ }
292
+ )
293
+
294
+ # Restore metadata
295
+ if field.metadata:
296
+ new_field = new_table.schema.field(name).with_metadata(field.metadata)
297
+ new_schema = pa.schema([new_table.schema.field("timecodes"), new_field])
298
+ new_table = new_table.cast(new_schema)
299
+
300
+ new_channels[name] = new_table
301
+
302
+ return LogFile(
303
+ channels=new_channels,
304
+ laps=self.laps,
305
+ metadata=self.metadata,
306
+ file_name=self.file_name,
307
+ )
308
+
309
+ def resample_to_channel(
310
+ self,
311
+ reference_channel: str,
312
+ channel_names: Sequence[str] | None = None,
313
+ ) -> "LogFile":
314
+ """
315
+ Resample all channels to match a reference channel's timebase.
316
+
317
+ Args:
318
+ reference_channel: Name of the channel whose timecodes will be used.
319
+ channel_names: Optional sequence of channel names to include. If None, all channels.
320
+
321
+ Returns:
322
+ New LogFile with all channels resampled to the reference channel's timecodes.
323
+
324
+ Raises:
325
+ KeyError: If reference_channel is not found.
326
+
327
+ Example:
328
+ >>> log = aim_xrk('session.xrk')
329
+ >>> aligned = log.resample_to_channel('GPS Speed')
330
+ >>> df = aligned.get_channels_as_table().to_pandas()
331
+ """
332
+ if reference_channel not in self.channels:
333
+ raise KeyError(f"Reference channel not found: {reference_channel}")
334
+
335
+ ref_timecodes = self.channels[reference_channel].column("timecodes").combine_chunks()
336
+
337
+ return self.resample_to_timecodes(ref_timecodes, channel_names)