gpxtractor 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. gpxtractor-0.1.0/LICENSE +7 -0
  2. gpxtractor-0.1.0/PKG-INFO +79 -0
  3. gpxtractor-0.1.0/README.md +62 -0
  4. gpxtractor-0.1.0/gpxtractor/__init__.py +12 -0
  5. gpxtractor-0.1.0/gpxtractor/_core.py +265 -0
  6. gpxtractor-0.1.0/gpxtractor/_fit_extraction.py +90 -0
  7. gpxtractor-0.1.0/gpxtractor/_transformation.py +99 -0
  8. gpxtractor-0.1.0/gpxtractor/_utils.py +32 -0
  9. gpxtractor-0.1.0/gpxtractor/_xml_extraction.py +165 -0
  10. gpxtractor-0.1.0/gpxtractor/cli.py +75 -0
  11. gpxtractor-0.1.0/gpxtractor/sql/compute_distance_and_speed.sql +49 -0
  12. gpxtractor-0.1.0/gpxtractor/sql/compute_speed.sql +49 -0
  13. gpxtractor-0.1.0/gpxtractor/sql/haversine_formula.sql +8 -0
  14. gpxtractor-0.1.0/gpxtractor/sql/km_data_query.sql +92 -0
  15. gpxtractor-0.1.0/gpxtractor/sql/lap_data_query.sql +86 -0
  16. gpxtractor-0.1.0/gpxtractor/sql/overall_stats.sql +42 -0
  17. gpxtractor-0.1.0/gpxtractor/sql/preprocess_data.sql +50 -0
  18. gpxtractor-0.1.0/gpxtractor/sql/preprocess_running_data.sql +58 -0
  19. gpxtractor-0.1.0/gpxtractor.egg-info/PKG-INFO +79 -0
  20. gpxtractor-0.1.0/gpxtractor.egg-info/SOURCES.txt +28 -0
  21. gpxtractor-0.1.0/gpxtractor.egg-info/dependency_links.txt +1 -0
  22. gpxtractor-0.1.0/gpxtractor.egg-info/entry_points.txt +2 -0
  23. gpxtractor-0.1.0/gpxtractor.egg-info/requires.txt +8 -0
  24. gpxtractor-0.1.0/gpxtractor.egg-info/top_level.txt +1 -0
  25. gpxtractor-0.1.0/pyproject.toml +33 -0
  26. gpxtractor-0.1.0/setup.cfg +4 -0
  27. gpxtractor-0.1.0/tests/test_core.py +19 -0
  28. gpxtractor-0.1.0/tests/test_fit_extraction.py +9 -0
  29. gpxtractor-0.1.0/tests/test_transformation.py +13 -0
  30. gpxtractor-0.1.0/tests/test_xml_extraction.py +34 -0
@@ -0,0 +1,7 @@
1
+ Copyright (c) 2026 Charles Stapylton-Smith
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,79 @@
1
+ Metadata-Version: 2.4
2
+ Name: gpxtractor
3
+ Version: 0.1.0
4
+ Summary: GPX, TCX and FIT data extraction for Python
5
+ Author-email: Charlie Stapylton <278091496+c-stap@users.noreply.github.com>
6
+ Requires-Python: >=3.13
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: lxml>=6.0.2
10
+ Requires-Dist: numpy>=2.3.1
11
+ Requires-Dist: pyarrow>=20.0.0
12
+ Requires-Dist: pandas>=2.3.1
13
+ Requires-Dist: duckdb>=1.4.4
14
+ Provides-Extra: cli
15
+ Requires-Dist: visidata; extra == "cli"
16
+ Dynamic: license-file
17
+
18
+ # gpxtractor
19
+
20
+ **GPX, TCX and FIT data extraction for Python**
21
+
22
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
23
+
24
+ ## Description
25
+
26
+
27
+ ## Features
28
+
29
+
30
+ ## Installation
31
+
32
+ ```bash
33
+ git clone
34
+ cd gpxtractor
35
+ pip install .
36
+ ```
37
+
38
+ ## Usage Example
39
+ Use the `gpxtractor.extract_data` function that returns a gpxtractor.Activity instance.
40
+
41
+ ```python
42
+ import gpxtractor
43
+
44
+ activity = gpxtractor.extract_data("your-gpx-tcx-or-fit_file.gpx")
45
+
46
+ print(activity.sport) # Output: name of the sport in the file as a string
47
+
48
+ ```
49
+
50
+ The records attribute is a `pandas.DataFrame` holding the records extracted from the file
51
+ with the `gpxtractor.extract_data` function, so the usual `pandas.DataFrame` methods can be applied:
52
+
53
+ ```python
54
+ print(activity.records.head())
55
+ ```
56
+
57
+ Once an instance of an Activity has been created with the `extract_data` function, the method
58
+ `transform_records` can be used to calculate distance and speed if missing from the file, as well as
59
+ elevation incremental difference, gradient and, in the case of running activities, pace.
60
+
61
+ ```python
62
+ activity.transform_records()
63
+ print(activity.records.head())
64
+ ```
65
+
66
+ Once the records have been transformed with `transform_records`, the two
67
+ following methods can be used to calculate aggregated data for kilometre and lap splits.
68
+
69
+ ```python
70
+ activity.compute_km_splits()
71
+ print(activity.km_splits)
72
+
73
+ activity.compute_lap_splits()
74
+ print(activity.lap_splits)
75
+ ```
76
+ Note: `compute_lap_splits` only computes lap splits if the file contains lap data, which is not
77
+ the case for GPX files. Otherwise it leaves the `lap_splits` attribute unchanged.
78
+
79
+
@@ -0,0 +1,62 @@
1
+ # gpxtractor
2
+
3
+ **GPX, TCX and FIT data extraction for Python**
4
+
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+
7
+ ## Description
8
+
9
+
10
+ ## Features
11
+
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ git clone
17
+ cd gpxtractor
18
+ pip install .
19
+ ```
20
+
21
+ ## Usage Example
22
+ Use the `gpxtractor.extract_data` function that returns a gpxtractor.Activity instance.
23
+
24
+ ```python
25
+ import gpxtractor
26
+
27
+ activity = gpxtractor.extract_data("your-gpx-tcx-or-fit_file.gpx")
28
+
29
+ print(activity.sport) # Output: name of the sport in the file as a string
30
+
31
+ ```
32
+
33
+ The records attribute is a `pandas.DataFrame` holding the records extracted from the file
34
+ with the `gpxtractor.extract_data` function, so the usual `pandas.DataFrame` methods can be applied:
35
+
36
+ ```python
37
+ print(activity.records.head())
38
+ ```
39
+
40
+ Once an instance of an Activity has been created with the `extract_data` function, the method
41
+ `transform_records` can be used to calculate distance and speed if missing from the file, as well as
42
+ elevation incremental difference, gradient and, in the case of running activities, pace.
43
+
44
+ ```python
45
+ activity.transform_records()
46
+ print(activity.records.head())
47
+ ```
48
+
49
+ Once the records have been transformed with `transform_records`, the two
50
+ following methods can be used to calculate aggregated data for kilometre and lap splits.
51
+
52
+ ```python
53
+ activity.compute_km_splits()
54
+ print(activity.km_splits)
55
+
56
+ activity.compute_lap_splits()
57
+ print(activity.lap_splits)
58
+ ```
59
+ Note: `compute_lap_splits` only computes lap splits if the file contains lap data, which is not
60
+ the case for GPX files. Otherwise it leaves the `lap_splits` attribute unchanged.
61
+
62
+
@@ -0,0 +1,12 @@
1
+ """
2
+ GPX, TCX and FIT data extraction for Python
3
+ ======================================
4
+
5
+ gpxtractor is a python package to extract data from
6
+ gpx, tcx and fit files and present it in a dataframe.
7
+ """
8
+
9
+ __version__ = "0.1.0"
10
+
11
+
12
+ from gpxtractor._core import Activity, extract_data
@@ -0,0 +1,265 @@
1
+ import gzip
2
+ import pathlib
3
+ from typing import Optional
4
+ from dataclasses import dataclass, field
5
+ import pyarrow as pa
6
+ import pandas as pd
7
+
8
+ import gpxtractor._xml_extraction as xml_ext
9
+ import gpxtractor._fit_extraction as fit_ext
10
+ import gpxtractor._transformation as tr
11
+ import gpxtractor._utils as ut
12
+
13
+
14
@dataclass
class Activity:
    """Stores and manages records and metadata parsed from a gpx, tcx or
    fit file.

    This class is designed to hold structured data and associated metadata
    extracted from a gpx, tcx or fit file, providing methods for accessing
    and transforming the records.

    Parameters
    ----------
    file_type : str
        Can be any of the following: 'GPX', 'TCX' or 'FIT'.
        Corresponds to the type of the file for which the instance of the
        class holds data.

    sport : None or str
        Is the type of sport as extracted from the file in lower case.

    records : pandas.DataFrame
        DataFrame holding the records extracted from the gpx, tcx or fit file.

    Attributes
    ----------
    is_transformed : bool
        initially False, becomes True once either the method
        `transform_records` or `full_transform` is used.

    file_type : str
        Can be any of the following: 'GPX', 'TCX' or 'FIT'.
        Corresponds to the type of the file for which the instance of the
        class holds data.

    sport : None or str
        Is the type of sport as extracted from the file in lower case.

    start_time : None or pandas.Timestamp
        Is None before a transformation method has been called.
        A pandas Timestamp with timezone information indicating the start
        time of the activity.

    elapsed_time : None or int
        Is None before a transformation method has been called.
        An integer indicating the total elapsed time of the activity in
        seconds.

    distance : None or float
        Is None before a transformation method has been called.
        A float indicating the total distance covered during the activity
        in kilometres.

    avg_speed : None or float
        Is None before a transformation method has been called.
        A float indicating the average speed over the activity in kph.

    max_speed : None or float
        Is None before a transformation method has been called.
        A float indicating the maximum speed over the activity.

    avg_pace : None or str
        Is None before a transformation method has been called.
        A string indicating the average pace over the activity in min per km.

    elevation_gain : None or int
        Is None before a transformation method has been called.
        An integer indicating the total elevation gained during the activity
        in meters.

    elevation_loss : None or int
        Is None before a transformation method has been called.
        An integer indicating the total elevation lost during the activity
        in meters.

    avg_heart_rate : None or int
        Is None before a transformation method has been called.
        An integer indicating the average heart rate of the activity in bpm.

    max_heart_rate : None or int
        Is None before a transformation method has been called.
        An integer indicating the maximum heart rate of the activity in bpm.

    avg_cadence : None or int
        Is None before a transformation method has been called.
        An integer indicating the average cadence of the activity in either
        rpm or, in the case of a running activity spm.

    max_cadence : None or int
        Is None before a transformation method has been called.
        An integer indicating the maximum cadence of the activity in either
        rpm or, in the case of a running activity spm.

    records : pandas.DataFrame
        DataFrame holding the records extracted from the gpx, tcx or fit file.
        Records can be transformed with the methods `transform_records` or
        `full_transform`.

    km_splits : None or pandas.DataFrame
        Initially None. DataFrame holding the transformed and aggregated data
        grouped by kilometre splits once the `compute_km_splits` or
        `full_transform` method has been used.

    lap_splits : None or pandas.DataFrame
        Initially None. DataFrame holding the transformed and aggregated data
        grouped by lap splits once the `compute_lap_splits` or
        `full_transform` method has been used. Can only hold data if the file
        has lap data which is not the case for gpx files.

    Notes
    -----
    Any overall statistic that is missing (null) in the computed stats is
    stored as None rather than raising during conversion.
    """

    file_type: str
    sport: Optional[str]
    records: pd.DataFrame
    is_transformed: bool = field(default=False, init=False)
    start_time: Optional[pd.Timestamp] = field(default=None, init=False)
    elapsed_time: Optional[int] = field(default=None, init=False)
    distance: Optional[float] = field(default=None, init=False)
    avg_speed: Optional[float] = field(default=None, init=False)
    max_speed: Optional[float] = field(default=None, init=False)
    avg_pace: Optional[str] = field(default=None, init=False)
    elevation_gain: Optional[int] = field(default=None, init=False)
    elevation_loss: Optional[int] = field(default=None, init=False)
    avg_heart_rate: Optional[int] = field(default=None, init=False)
    max_heart_rate: Optional[int] = field(default=None, init=False)
    avg_cadence: Optional[int] = field(default=None, init=False)
    max_cadence: Optional[int] = field(default=None, init=False)
    km_splits: Optional[pd.DataFrame] = field(default=None, init=False)
    lap_splits: Optional[pd.DataFrame] = field(default=None, init=False)

    def __str__(self):
        """Return a multi-line summary: scalar attributes plus the head of
        each DataFrame attribute (or None when a split table is not set).
        """
        records_str = str(self.records.head())
        km_splits_str = (
            str(self.km_splits.head()) if self.km_splits is not None else None
        )
        lap_splits_str = (
            str(self.lap_splits.head()) if self.lap_splits is not None else None
        )
        return (
            "Activity(\n"
            f" is_transformed: {self.is_transformed}\n"
            f" file_type: {self.file_type}\n"
            f" sport: {self.sport}\n"
            f" start_time: {self.start_time}\n"
            f" elapsed_time: {self.elapsed_time}\n"
            f" distance: {self.distance}\n"
            f" avg_speed: {self.avg_speed}\n"
            f" max_speed: {self.max_speed}\n"
            f" avg_pace: {self.avg_pace}\n"
            f" elevation_gain: {self.elevation_gain}\n"
            f" elevation_loss: {self.elevation_loss}\n"
            f" avg_heart_rate: {self.avg_heart_rate}\n"
            f" max_heart_rate: {self.max_heart_rate}\n"
            f" avg_cadence: {self.avg_cadence}\n"
            f" max_cadence: {self.max_cadence}\n"
            f" records:\n{records_str}\n"
            f" km_splits:\n{km_splits_str}\n"
            f" lap_splits:\n{lap_splits_str}\n"
            ")"
        )

    @staticmethod
    def _optional(value, cast=None):
        """Return None for a missing statistic, else the value passed
        through `cast` (or unchanged when `cast` is None).

        Guards the `int(...)`/`float(...)` conversions, which raise on
        missing values such as `pandas.NA`.
        """
        if pd.isna(value):
            return None
        return value if cast is None else cast(value)

    def _transform_records_to_pyarrow(self):
        """Transform the records and populate the overall statistics.

        On success `self.records` is left as a `pyarrow.Table`; the caller
        is responsible for converting it back to a DataFrame. Does nothing
        when the activity is already transformed.
        """
        if self.is_transformed:
            return
        # Work on locals so that `self.records` stays a DataFrame if the
        # transformation or the stats query raises.
        table = tr.transform_data(pa.Table.from_pandas(self.records), self.sport)
        stats = tr.compute_overall_stats(table)
        self.records = table

        def stat(name, cast=None):
            return self._optional(stats[name].at[0], cast)

        self.start_time = stat("start_time")
        self.elapsed_time = stat("elapsed_time", int)
        self.distance = stat("distance", float)
        self.avg_speed = stat("avg_speed", float)
        self.max_speed = stat("max_speed", float)
        self.avg_pace = stat("avg_pace")
        self.elevation_gain = stat("elevation_gain", int)
        self.elevation_loss = stat("elevation_loss", int)
        self.avg_heart_rate = stat("avg_heart_rate", int)
        self.max_heart_rate = stat("max_heart_rate", int)
        self.avg_cadence = stat("avg_cadence", int)
        self.max_cadence = stat("max_cadence", int)

    def transform_records(self):
        """Transforms the data in the records attributes to calculate distance,
        speed if absent and elevation difference, gradient and, in the case of
        running activities, pace.
        """
        if not self.is_transformed:
            self._transform_records_to_pyarrow()
            self.records = self.records.to_pandas(types_mapper=pd.ArrowDtype)
            self.is_transformed = True

    def compute_lap_splits(self):
        """If there is lap data in the records, updates the lap_splits to a
        DataFrame holding the transformed and aggregated data grouped by lap
        splits. Note: there is no lap data in gpx files.

        Does nothing for GPX activities or before the records have been
        transformed. The records attribute itself is not modified.
        """
        if self.file_type != "GPX" and self.is_transformed:
            self.lap_splits = tr.compute_lap_data(pa.Table.from_pandas(self.records))

    def compute_km_splits(self):
        """Updates km_splits attribute to a DataFrame holding the transformed
        and aggregated data grouped by kilometre splits.

        Does nothing before the records have been transformed. The records
        attribute itself is not modified.
        """
        if self.is_transformed:
            self.km_splits = tr.compute_km_data(pa.Table.from_pandas(self.records))

    def full_transform(self):
        """Transforms data in records, computes km and lap splits.

        When the records were already transformed (for instance through
        `transform_records`), only the splits that are still None are
        computed.
        """
        if self.is_transformed:
            if self.km_splits is None:
                self.compute_km_splits()
            if self.lap_splits is None:
                self.compute_lap_splits()
            return
        self._transform_records_to_pyarrow()
        try:
            self.km_splits = tr.compute_km_data(self.records)
            if self.file_type != "GPX":
                self.lap_splits = tr.compute_lap_data(self.records)
        finally:
            # The records are transformed at this point even if a split query
            # failed, so always hand them back as a DataFrame.
            self.records = self.records.to_pandas(types_mapper=pd.ArrowDtype)
            self.is_transformed = True
224
+
225
+
226
def extract_data(file_path: pathlib.Path) -> Activity:
    """Read a gpx, tcx or fit file and wrap its content in an Activity.

    The extracted records end up in the `records` attribute of the returned
    instance (a pandas.DataFrame) and the sport name in its `sport`
    attribute.

    Parameters
    ----------
    file_path : pathlib.Path
        Path to a file of type .gpx, .tcx or .fit. Can be gzipped.

    Returns
    -------
    gpxtractor.Activity

    Raises
    ------
    ValueError
        if the file type is not gpx, tcx or fit or their gzipped
        equivalent.
    """
    extensions = ut._get_file_extensions(file_path)
    if extensions in (".gpx", ".gpx.gz"):
        sport, records = ut._handle_gzipped_xml_files(
            file_path, extensions, xml_ext.get_sport_from_gpx, xml_ext.extract_gpx
        )
    elif extensions in (".tcx", ".tcx.gz"):
        sport, records = ut._handle_gzipped_xml_files(
            file_path, extensions, xml_ext.get_sport_from_tcx, xml_ext.extract_tcx
        )
    elif extensions == ".fit":
        sport, records = fit_ext.extract_fit(file_path)
    elif extensions == ".fit.gz":
        # The FIT extractor is handed the decompressed binary stream.
        with gzip.open(file_path, "rb") as gz:
            sport, records = fit_ext.extract_fit(gz)
    else:
        raise ValueError("Not a valid file type: Try a GPX, TCX or FIT file")
    return Activity(
        file_type=ut._get_file_type_from_extensions(extensions),
        sport=sport,
        records=records,
    )
@@ -0,0 +1,90 @@
1
+ import pathlib
2
+ import numpy as np
3
+ import pandas as pd
4
+ import fitdecode
5
+
6
+
7
def _convert_fit_coords_to_deg(coord):
    """Scale a coordinate expressed in semicircles (32-bit integer range)
    to degrees.
    """
    degrees_per_semicircle = 180 / 2**31
    return coord * degrees_per_semicircle
10
+
11
+
12
def _generate_frame_from_fit(fit_file: pathlib.Path, selected_frames: list):
    """Yield the data frames of `fit_file` whose name is listed in
    `selected_frames`.

    The file is read with `fitdecode.FitReader` with CRC checking disabled;
    frames that are not data frames are skipped.
    """
    with fitdecode.FitReader(fit_file, check_crc=False) as reader:
        for candidate in reader:
            if candidate.frame_type != fitdecode.FIT_FRAME_DATA:
                continue
            if candidate.name not in selected_frames:
                continue
            yield candidate
20
+
21
+
22
def _extract_str(frame, field_name: str):
    """Return the value of `field_name` in `frame`, or None when the frame
    has no such field (or the stored value is None).
    """
    if not frame.has_field(field_name):
        return None
    return frame.get_value(field_name)
26
+
27
+
28
def _extract_value(frame, field_name: str, datatype):
    """Return the value of `field_name` in `frame` converted with `datatype`.

    When the field is absent or holds None, the placeholder is 0 if
    `datatype` is `int` and NaN for any other datatype.
    """
    placeholder = 0 if datatype is int else np.nan
    if not frame.has_field(field_name):
        return placeholder
    raw = frame.get_value(field_name)
    return placeholder if raw is None else datatype(raw)
33
+
34
+
35
def get_sport_from_fit(fit_content) -> str:
    """Return the "sport" field of the first "session" frame found in
    `fit_content`, or None when there is no such frame.
    """
    first_session = next(_generate_frame_from_fit(fit_content, ["session"]), None)
    if first_session is None:
        return None
    return _extract_str(first_session, "sport")
39
+
40
+
41
def extract_fit(file_path: pathlib.Path) -> tuple[str | None, pd.DataFrame]:
    """Extract the sport and the records of a FIT file.

    Parameters
    ----------
    file_path : pathlib.Path
        Source handed to `_generate_frame_from_fit` (the package also
        passes an opened gzip stream for `.fit.gz` files).

    Returns
    -------
    tuple of (None or str, pandas.DataFrame)
        The "sport" field of the last "session" frame read (None when the
        file has no "session" frame) and one row per "record" frame with the
        columns lap, timestamp, latitude, longitude, distance, speed,
        altitude, heart_rate and cadence.
    """
    # Must be bound up front: a file without any "session" frame would
    # otherwise hit an UnboundLocalError at the return statement.
    sport = None
    lap_number = 1
    laps = []
    times = []
    lats = []
    lons = []
    eles = []
    dists = []
    speeds = []
    hrs = []
    cads = []

    for frame in _generate_frame_from_fit(file_path, ["lap", "record", "session"]):
        if frame.name == "record":
            laps.append(lap_number)
            times.append(_extract_str(frame, "timestamp"))
            lats.append(_extract_value(frame, "position_lat", float))
            lons.append(_extract_value(frame, "position_long", float))
            eles.append(_extract_value(frame, "altitude", float))
            dists.append(_extract_value(frame, "distance", float))
            speeds.append(_extract_value(frame, "speed", float))
            hrs.append(_extract_value(frame, "heart_rate", int))
            cads.append(_extract_value(frame, "cadence", int))
        elif frame.name == "lap":
            # Records read after a "lap" frame are tagged with the next lap.
            lap_number += 1
        elif frame.name == "session":
            sport = _extract_str(frame, "sport")

    laps = np.array(laps, dtype=np.uint16)
    # Positions are stored in semicircles and converted to degrees.
    lats = _convert_fit_coords_to_deg(np.array(lats, dtype=np.float32))
    lons = _convert_fit_coords_to_deg(np.array(lons, dtype=np.float32))
    eles = np.array(eles, dtype=np.float32)
    dists = np.array(dists, dtype=np.float32)
    speeds = np.array(speeds, dtype=np.float32)
    hrs = np.array(hrs, dtype=np.uint8)
    cads = np.array(cads, dtype=np.uint8)

    return sport, pd.DataFrame(
        {
            "lap": laps,
            "timestamp": pd.to_datetime(times),
            "latitude": lats,
            "longitude": lons,
            "distance": dists,
            "speed": speeds,
            "altitude": eles,
            "heart_rate": hrs,
            "cadence": cads,
        }
    )
@@ -0,0 +1,99 @@
1
+ import inspect
2
+ from importlib_resources import files
3
+ import pandas as pd
4
+ import pyarrow as pa
5
+ import duckdb
6
+
7
+
8
def get_var_name(var):
    """Return the name of the first local variable of the calling frame that
    is bound to the very object `var`.

    Raises IndexError when no local of the caller refers to `var`.
    """
    caller_locals = inspect.currentframe().f_back.f_locals
    for candidate, bound_object in caller_locals.items():
        if bound_object is var:
            return candidate
    raise IndexError("list index out of range")
11
+
12
+
13
def is_col_all_null(table: pa.Table, col: str) -> bool:
    """Tell whether every value of column `col` in `table` is null.

    Parameters
    ----------
    table : pyarrow.Table
    col : str
        Name of the column to inspect.

    Returns
    -------
    bool
        True when all the values are null. An aggregation result of None is
        reported as False, so the function always returns a bool.
    """
    # `pyarrow.compute` is a submodule: import it explicitly instead of
    # relying on another package having loaded it as `pa.compute`.
    import pyarrow.compute as pc

    return bool(pc.all(pc.is_null(table.column(col))).as_py())
16
+
17
+
18
def add_empty_col_if_absent(arrow_table: pa.Table, col: str, datatype) -> pa.Table:
    """Return `arrow_table` with a column `col` guaranteed to exist.

    When the column is already part of the schema the table is returned
    untouched; otherwise a column of nulls of type `datatype` is appended.
    """
    if col in arrow_table.schema.names:
        return arrow_table
    filler = pa.nulls(len(arrow_table), type=datatype)
    return arrow_table.append_column(col, filler)
24
+
25
+
26
def query_table(arrow_table: pa.Table, sql_file: str) -> pa.Table:
    """Run the query stored in `sql_file` and return the result as an Arrow
    table.

    The SQL text is read from the `gpxtractor.sql` package and its
    `{table_name}` placeholder is filled with the name of the local variable
    holding `arrow_table`.
    """
    # NOTE(review): the table is referenced in the SQL by variable name only;
    # presumably DuckDB resolves that name to the local `arrow_table` object -
    # keep the parameter name unchanged and confirm before refactoring.
    template = files("gpxtractor.sql").joinpath(sql_file).read_text()
    sql_query = template.format(table_name=get_var_name(arrow_table))
    return duckdb.sql(sql_query).arrow().read_all()
31
+
32
+
33
def compute_distance_and_speed(arrow_table: pa.Table) -> pa.Table:
    """Run `compute_distance_and_speed.sql` against `arrow_table`.

    The SQL contained in `haversine_formula.sql` is executed first.
    """
    haversine_sql = files("gpxtractor.sql").joinpath("haversine_formula.sql").read_text()
    duckdb.sql(haversine_sql)
    return query_table(arrow_table, "compute_distance_and_speed.sql")
39
+
40
+
41
def compute_speed(arrow_table: pa.Table) -> pa.Table:
    """Run `compute_speed.sql` against `arrow_table` and return the result."""
    return query_table(arrow_table, "compute_speed.sql")
44
+
45
+
46
def preprocess_data(arrow_table: pa.Table) -> pa.Table:
    """Run `preprocess_data.sql` against `arrow_table` and return the result."""
    return query_table(arrow_table, "preprocess_data.sql")
49
+
50
+
51
def preprocess_running_data(arrow_table: pa.Table) -> pa.Table:
    """Run `preprocess_running_data.sql` against `arrow_table` and return
    the result.
    """
    return query_table(arrow_table, "preprocess_running_data.sql")
54
+
55
+
56
def transform_data(arrow_table: pa.Table, sport: str) -> pa.Table:
    """Complete and preprocess the records held in `arrow_table`.

    Steps, in order:

    1. every required column missing from the table is added, filled with
       nulls;
    2. when "distance" is absent or entirely null, distance and speed are
       computed; otherwise, when "speed" is absent or entirely null, only
       speed is computed;
    3. the running-specific preprocessing is applied when `sport` equals
       "running", the generic one otherwise.
    """
    required_columns = {
        "timestamp": pa.timestamp("us"),
        "latitude": pa.float32(),
        "longitude": pa.float32(),
        "altitude": pa.float32(),
        "heart_rate": pa.uint8(),
        "cadence": pa.uint8(),
        "lap": pa.uint16(),
    }
    for name, arrow_type in required_columns.items():
        arrow_table = add_empty_col_if_absent(arrow_table, name, arrow_type)

    def lacks_values(column: str) -> bool:
        # Absent from the schema, or present with nothing but nulls.
        if column not in arrow_table.schema.names:
            return True
        return is_col_all_null(arrow_table, column)

    if lacks_values("distance"):
        arrow_table = compute_distance_and_speed(arrow_table)
    elif lacks_values("speed"):
        arrow_table = compute_speed(arrow_table)

    preprocess = preprocess_running_data if sport == "running" else preprocess_data
    return preprocess(arrow_table)
82
+
83
+
84
def compute_km_data(arrow_table: pa.Table) -> pd.DataFrame:
    """Run `km_data_query.sql` against `arrow_table` and return the result
    as an Arrow-backed pandas DataFrame.
    """
    result = query_table(arrow_table, "km_data_query.sql")
    return result.to_pandas(types_mapper=pd.ArrowDtype)
88
+
89
+
90
def compute_lap_data(arrow_table: pa.Table) -> pd.DataFrame:
    """Run `lap_data_query.sql` against `arrow_table` and return the result
    as an Arrow-backed pandas DataFrame.
    """
    result = query_table(arrow_table, "lap_data_query.sql")
    return result.to_pandas(types_mapper=pd.ArrowDtype)
94
+
95
+
96
def compute_overall_stats(arrow_table: pa.Table):
    """Run `overall_stats.sql` against `arrow_table` and return the result
    as an Arrow-backed pandas DataFrame.
    """
    result = query_table(arrow_table, "overall_stats.sql")
    return result.to_pandas(types_mapper=pd.ArrowDtype)
@@ -0,0 +1,32 @@
1
+ import pathlib
2
+ import gzip
3
+
4
+
5
def _get_file_extensions(file_path: pathlib.Path) -> str:
    """Return the lower-cased extension that identifies the file type.

    Only the final suffix is considered, together with the one before it
    when the final suffix is ".gz". Extra dots in the file name therefore do
    not hide the real extension ("run.2024.gpx" gives ".gpx") and the case
    of the extension is ignored ("RIDE.FIT" gives ".fit").

    Parameters
    ----------
    file_path : pathlib.Path
        Path (or path string) of the file.

    Returns
    -------
    str
        For example ".gpx" or ".gpx.gz"; an empty string when the name has
        no suffix.
    """
    suffixes = [suffix.lower() for suffix in pathlib.Path(file_path).suffixes]
    if not suffixes:
        return ""
    if suffixes[-1] == ".gz" and len(suffixes) > 1:
        return suffixes[-2] + ".gz"
    return suffixes[-1]
8
+
9
+
10
def _get_file_type_from_extensions(extensions) -> str:
    """Map a (possibly gzipped) file extension to 'GPX', 'TCX' or 'FIT'.

    Returns None for any other value.
    """
    if extensions in (".gpx", ".gpx.gz"):
        return "GPX"
    if extensions in (".tcx", ".tcx.gz"):
        return "TCX"
    if extensions in (".fit", ".fit.gz"):
        return "FIT"
    return None
18
+
19
+
20
def _handle_gzipped_xml_files(
    file_path: pathlib.Path, extensions, sport_func, extraction_func
):
    """Return `(sport, records)` for an XML based file, gzipped or not.

    `sport_func` always receives a text-mode file object. `extraction_func`
    receives an opened `gzip.GzipFile` when ".gz" is found in `extensions`,
    and `file_path` itself otherwise.
    """
    if ".gz" not in extensions:
        with open(file_path, "r") as plain_file:
            sport = sport_func(plain_file)
        return sport, extraction_func(file_path)

    with gzip.open(file_path, "rt") as text_stream:
        sport = sport_func(text_stream)
    with gzip.GzipFile(file_path, "r") as byte_stream:
        return sport, extraction_func(byte_stream)