gpxtractor 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gpxtractor-0.1.0/LICENSE +7 -0
- gpxtractor-0.1.0/PKG-INFO +79 -0
- gpxtractor-0.1.0/README.md +62 -0
- gpxtractor-0.1.0/gpxtractor/__init__.py +12 -0
- gpxtractor-0.1.0/gpxtractor/_core.py +265 -0
- gpxtractor-0.1.0/gpxtractor/_fit_extraction.py +90 -0
- gpxtractor-0.1.0/gpxtractor/_transformation.py +99 -0
- gpxtractor-0.1.0/gpxtractor/_utils.py +32 -0
- gpxtractor-0.1.0/gpxtractor/_xml_extraction.py +165 -0
- gpxtractor-0.1.0/gpxtractor/cli.py +75 -0
- gpxtractor-0.1.0/gpxtractor/sql/compute_distance_and_speed.sql +49 -0
- gpxtractor-0.1.0/gpxtractor/sql/compute_speed.sql +49 -0
- gpxtractor-0.1.0/gpxtractor/sql/haversine_formula.sql +8 -0
- gpxtractor-0.1.0/gpxtractor/sql/km_data_query.sql +92 -0
- gpxtractor-0.1.0/gpxtractor/sql/lap_data_query.sql +86 -0
- gpxtractor-0.1.0/gpxtractor/sql/overall_stats.sql +42 -0
- gpxtractor-0.1.0/gpxtractor/sql/preprocess_data.sql +50 -0
- gpxtractor-0.1.0/gpxtractor/sql/preprocess_running_data.sql +58 -0
- gpxtractor-0.1.0/gpxtractor.egg-info/PKG-INFO +79 -0
- gpxtractor-0.1.0/gpxtractor.egg-info/SOURCES.txt +28 -0
- gpxtractor-0.1.0/gpxtractor.egg-info/dependency_links.txt +1 -0
- gpxtractor-0.1.0/gpxtractor.egg-info/entry_points.txt +2 -0
- gpxtractor-0.1.0/gpxtractor.egg-info/requires.txt +8 -0
- gpxtractor-0.1.0/gpxtractor.egg-info/top_level.txt +1 -0
- gpxtractor-0.1.0/pyproject.toml +33 -0
- gpxtractor-0.1.0/setup.cfg +4 -0
- gpxtractor-0.1.0/tests/test_core.py +19 -0
- gpxtractor-0.1.0/tests/test_fit_extraction.py +9 -0
- gpxtractor-0.1.0/tests/test_transformation.py +13 -0
- gpxtractor-0.1.0/tests/test_xml_extraction.py +34 -0
gpxtractor-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright (c) 2026 Charles Stapylton-Smith
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gpxtractor
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: GPX, TCX and FIT data extraction for Python
|
|
5
|
+
Author-email: Charlie Stapylton <278091496+c-stap@users.noreply.github.com>
|
|
6
|
+
Requires-Python: >=3.13
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: lxml>=6.0.2
|
|
10
|
+
Requires-Dist: numpy>=2.3.1
|
|
11
|
+
Requires-Dist: pyarrow>=20.0.0
|
|
12
|
+
Requires-Dist: pandas>=2.3.1
|
|
13
|
+
Requires-Dist: duckdb>=1.4.4
|
|
14
|
+
Provides-Extra: cli
|
|
15
|
+
Requires-Dist: visidata; extra == "cli"
|
|
16
|
+
Dynamic: license-file
|
|
17
|
+
|
|
18
|
+
# gpxtractor
|
|
19
|
+
|
|
20
|
+
**GPX, TCX and FIT data extraction for Python**
|
|
21
|
+
|
|
22
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
23
|
+
|
|
24
|
+
## Description
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
## Features
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
git clone
|
|
34
|
+
cd gpxtractor
|
|
35
|
+
pip install .
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Usage Example
|
|
39
|
+
Use the `gpxtractor.extract_data` function that returns a gpxtractor.Activity instance.
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
import gpxtractor
|
|
43
|
+
|
|
44
|
+
activity = gpxtractor.extract_data("your-gpx-tcx-or-fit_file.gpx")
|
|
45
|
+
|
|
46
|
+
print(activity.sport) # Output: name of the sport in the file as a string
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
The records attribute is a `pandas.DataFrame` holding the records extracted from the file
|
|
51
|
+
with the `gpxtractor.extract_data` function. So the usual `pandas.DataFrame` methods can be applied
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
print(activity.records.head())
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Once an instance of an Activity has been created with the `extract_data` function, the method
|
|
58
|
+
`transform_records` can be used to calculate distance and speed if missing from the file as well as
|
|
59
|
+
elevation incremental difference, gradient and in the case of running activities, pace.
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
activity.transform_records()
|
|
63
|
+
print(activity.records.head())
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
And once the records have been transformed with `transform_records`, it is possible to use the 2
|
|
67
|
+
following methods to calculate aggregated data for kilometre and lap splits.
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
activity.compute_km_splits()
|
|
71
|
+
print(activity.km_splits)
|
|
72
|
+
|
|
73
|
+
activity.compute_lap_splits()
|
|
74
|
+
print(activity.lap_splits)
|
|
75
|
+
```
|
|
76
|
+
Note: the `compute_lap_splits` method only computes lap splits if the file contains lap data, which is not
|
|
77
|
+
the case for GPX files. It does not update the `lap_splits` attribute otherwise.
|
|
78
|
+
|
|
79
|
+
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# gpxtractor
|
|
2
|
+
|
|
3
|
+
**GPX, TCX and FIT data extraction for Python**
|
|
4
|
+
|
|
5
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
## Description
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
## Features
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
git clone
|
|
17
|
+
cd gpxtractor
|
|
18
|
+
pip install .
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Usage Example
|
|
22
|
+
Use the `gpxtractor.extract_data` function that returns a gpxtractor.Activity instance.
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
import gpxtractor
|
|
26
|
+
|
|
27
|
+
activity = gpxtractor.extract_data("your-gpx-tcx-or-fit_file.gpx")
|
|
28
|
+
|
|
29
|
+
print(activity.sport) # Output: name of the sport in the file as a string
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
The records attribute is a `pandas.DataFrame` holding the records extracted from the file
|
|
34
|
+
with the `gpxtractor.extract_data` function. So the usual `pandas.DataFrame` methods can be applied
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
print(activity.records.head())
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Once an instance of an Activity has been created with the `extract_data` function, the method
|
|
41
|
+
`transform_records` can be used to calculate distance and speed if missing from the file as well as
|
|
42
|
+
elevation incremental difference, gradient and in the case of running activities, pace.
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
activity.transform_records()
|
|
46
|
+
print(activity.records.head())
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
And once the records have been transformed with `transform_records`, it is possible to use the 2
|
|
50
|
+
following methods to calculate aggregated data for kilometre and lap splits.
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
activity.compute_km_splits()
|
|
54
|
+
print(activity.km_splits)
|
|
55
|
+
|
|
56
|
+
activity.compute_lap_splits()
|
|
57
|
+
print(activity.lap_splits)
|
|
58
|
+
```
|
|
59
|
+
Note: the `compute_lap_splits` method only computes lap splits if the file contains lap data, which is not
|
|
60
|
+
the case for GPX files. It does not update the `lap_splits` attribute otherwise.
|
|
61
|
+
|
|
62
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""
|
|
2
|
+
GPX, TCX and FIT data extraction for Python
|
|
3
|
+
======================================
|
|
4
|
+
|
|
5
|
+
gpxtractor is a python package to extract data from
|
|
6
|
+
gpx, tcx and fit files and present it in a dataframe.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
__version__ = "0.1.0"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
from gpxtractor._core import Activity, extract_data
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
import gzip
|
|
2
|
+
import pathlib
|
|
3
|
+
from typing import Optional
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
import pyarrow as pa
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
import gpxtractor._xml_extraction as xml_ext
|
|
9
|
+
import gpxtractor._fit_extraction as fit_ext
|
|
10
|
+
import gpxtractor._transformation as tr
|
|
11
|
+
import gpxtractor._utils as ut
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class Activity:
    """Stores and manages records and metadata parsed from a gpx, tcx or
    fit file.

    This class is designed to hold structured data and associated metadata
    extracted from a gpx, tcx or fit file, providing methods for accessing
    and transforming the records.

    Parameters
    ----------
    file_type : str
        Can be any of the following: 'GPX', 'TCX' or 'FIT'.
        Corresponds to the type of the file for which the instance of the
        class holds data.

    sport : str
        Is the type of sport as extracted from the file in lower case.

    records : pandas.DataFrame
        DataFrame holding the records extracted from the gpx, tcx or fit file.

    Attributes
    ----------
    is_transformed : bool
        initially False, becomes True once either the method
        `transform_records` or `full_transform` is used.

    file_type : str
        Can be any of the following: 'GPX', 'TCX' or 'FIT'.
        Corresponds to the type of the file for which the instance of the
        class holds data.

    sport : None or str
        Is the type of sport as extracted from the file in lower case.

    start_time : None or pandas.Timestamp
        Is None before a transformation method has been called.
        A pandas Timestamp with timezone information indicating the start
        time of the activity.

    elapsed_time : None or int
        Is None before a transformation method has been called.
        An integer indicating the total elapsed time of the activity in
        seconds.

    distance : None or float
        Is None before a transformation method has been called.
        A float indicating the total distance covered during the activity
        in kilometres.

    avg_speed : None or float
        Is None before a transformation method has been called.
        A float indicating the average speed over the activity in kph.

    max_speed : None or float
        Is None before a transformation method has been called.
        A float indicating the maximum speed over the activity (unit as
        produced by the overall statistics query, presumably kph like
        `avg_speed`).

    avg_pace : None or str
        Is None before a transformation method has been called.
        A string indicating the average pace over the activity in min per km.

    elevation_gain : None or int
        Is None before a transformation method has been called.
        An integer indicating the total elevation gained during the activity
        in meters.

    elevation_loss : None or int
        Is None before a transformation method has been called.
        An integer indicating the total elevation lost during the activity
        in meters.

    avg_heart_rate : None or int
        Is None before a transformation method has been called.
        An integer indicating the average heart rate of the activity in bpm.

    max_heart_rate : None or int
        Is None before a transformation method has been called.
        An integer indicating the maximum heart rate of the activity in bpm.

    avg_cadence : None or int
        Is None before a transformation method has been called.
        An integer indicating the average cadence of the activity in either
        rpm or, in the case of a running activity spm.

    max_cadence : None or int
        Is None before a transformation method has been called.
        An integer indicating the maximum cadence of the activity in either
        rpm or, in the case of a running activity spm.

    records : pandas.DataFrame
        DataFrame holding the records extracted from the gpx, tcx or fit file.
        Records can be transformed with the methods `transform_records` or
        `full_transform`.

    km_splits : None or pandas.DataFrame
        Initially None. DataFrame holding the transformed and aggregated data
        grouped by kilometre splits once the `compute_km_splits` or
        `full_transform` method has been used.

    lap_splits : None or pandas.DataFrame
        Initially None. DataFrame holding the transformed and aggregated data
        grouped by lap splits once the `compute_lap_splits` or
        `full_transform` method has been used. Can only hold data if the file
        has lap data which is not the case for gpx files.

    Notes
    -----
    Numeric statistics that the overall statistics query returns as null
    (for instance heart rate for a file without heart-rate data) are stored
    as None instead of raising during conversion.
    """

    file_type: str
    sport: str
    records: pd.DataFrame
    is_transformed: bool = field(default=False, init=False)
    start_time: Optional[pd.Timestamp] = field(default=None, init=False)
    elapsed_time: Optional[int] = field(default=None, init=False)
    distance: Optional[float] = field(default=None, init=False)
    avg_speed: Optional[float] = field(default=None, init=False)
    max_speed: Optional[float] = field(default=None, init=False)
    avg_pace: Optional[str] = field(default=None, init=False)
    elevation_gain: Optional[int] = field(default=None, init=False)
    elevation_loss: Optional[int] = field(default=None, init=False)
    avg_heart_rate: Optional[int] = field(default=None, init=False)
    max_heart_rate: Optional[int] = field(default=None, init=False)
    avg_cadence: Optional[int] = field(default=None, init=False)
    max_cadence: Optional[int] = field(default=None, init=False)
    km_splits: Optional[pd.DataFrame] = field(default=None, init=False)
    lap_splits: Optional[pd.DataFrame] = field(default=None, init=False)

    def __str__(self):
        """Return a multi-line summary: every scalar attribute followed by
        the first rows of `records`, `km_splits` and `lap_splits`.
        """
        records_str = str(self.records.head())
        km_splits_str = (
            str(self.km_splits.head()) if self.km_splits is not None else None
        )
        lap_splits_str = (
            str(self.lap_splits.head()) if self.lap_splits is not None else None
        )
        return (
            "Activity(\n"
            f" is_transformed: {self.is_transformed}\n"
            f" file_type: {self.file_type}\n"
            f" sport: {self.sport}\n"
            f" start_time: {self.start_time}\n"
            f" elapsed_time: {self.elapsed_time}\n"
            f" distance: {self.distance}\n"
            f" avg_speed: {self.avg_speed}\n"
            f" max_speed: {self.max_speed}\n"
            f" avg_pace: {self.avg_pace}\n"
            f" elevation_gain: {self.elevation_gain}\n"
            f" elevation_loss: {self.elevation_loss}\n"
            f" avg_heart_rate: {self.avg_heart_rate}\n"
            f" max_heart_rate: {self.max_heart_rate}\n"
            f" avg_cadence: {self.avg_cadence}\n"
            f" max_cadence: {self.max_cadence}\n"
            f" records:\n{records_str}\n"
            f" km_splits:\n{km_splits_str}\n"
            f" lap_splits:\n{lap_splits_str}\n"
            ")"
        )

    @staticmethod
    def _cast_or_none(value, cast):
        """Return ``cast(value)``, or None when ``value`` is missing
        (None, ``pandas.NA`` or NaN).
        """
        if pd.isna(value):
            return None
        return cast(value)

    def _transform_records_to_pyarrow(self):
        """Transform the records and fill in the overall statistics.

        Does nothing when the activity is already transformed. On success
        `records` holds the transformed ``pyarrow.Table``; the caller is
        responsible for converting it back to pandas. All computation is
        done on local objects first, so a failure in the transformation or
        in the statistics query leaves `records` untouched.
        """
        if self.is_transformed:
            return
        table = tr.transform_data(pa.Table.from_pandas(self.records), self.sport)
        stats = tr.compute_overall_stats(table)

        def stat(name):
            return stats[name].at[0]

        self.records = table
        self.start_time = stat("start_time")
        self.elapsed_time = self._cast_or_none(stat("elapsed_time"), int)
        self.distance = self._cast_or_none(stat("distance"), float)
        self.avg_speed = self._cast_or_none(stat("avg_speed"), float)
        self.max_speed = self._cast_or_none(stat("max_speed"), float)
        self.avg_pace = stat("avg_pace")
        self.elevation_gain = self._cast_or_none(stat("elevation_gain"), int)
        self.elevation_loss = self._cast_or_none(stat("elevation_loss"), int)
        self.avg_heart_rate = self._cast_or_none(stat("avg_heart_rate"), int)
        self.max_heart_rate = self._cast_or_none(stat("max_heart_rate"), int)
        self.avg_cadence = self._cast_or_none(stat("avg_cadence"), int)
        self.max_cadence = self._cast_or_none(stat("max_cadence"), int)

    def transform_records(self):
        """Transforms the data in the records attributes to calculate distance,
        speed if absent and elevation difference, gradient and, in the case of
        running activities, pace.
        """
        if not self.is_transformed:
            self._transform_records_to_pyarrow()
            self.records = self.records.to_pandas(types_mapper=pd.ArrowDtype)
            self.is_transformed = True

    def compute_lap_splits(self):
        """If there is lap data in the records, updates the lap_splits to a
        DataFrame holding the transformed and aggregated data grouped by lap
        splits. Note: there is no lap data in gpx files.

        Does nothing for GPX files or before the records are transformed.
        """
        if self.file_type != "GPX" and self.is_transformed:
            # Work on a temporary table so `records` stays a DataFrame even
            # if the query raises.
            table = pa.Table.from_pandas(self.records)
            self.lap_splits = tr.compute_lap_data(table)

    def compute_km_splits(self):
        """Updates km_splits attribute to a DataFrame holding the transformed
        and aggregated data grouped by kilometre splits.

        Does nothing before the records are transformed.
        """
        if self.is_transformed:
            # Work on a temporary table so `records` stays a DataFrame even
            # if the query raises.
            table = pa.Table.from_pandas(self.records)
            self.km_splits = tr.compute_km_data(table)

    def full_transform(self):
        """Transforms data in records, computes km and lap splits.

        When the records were already transformed (for example by a prior
        call to `transform_records`) only the splits are computed.
        """
        if self.is_transformed:
            table = pa.Table.from_pandas(self.records)
        else:
            self._transform_records_to_pyarrow()
            table = self.records
            self.records = table.to_pandas(types_mapper=pd.ArrowDtype)
            self.is_transformed = True
        self.km_splits = tr.compute_km_data(table)
        if self.file_type != "GPX":
            self.lap_splits = tr.compute_lap_data(table)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def extract_data(file_path: pathlib.Path) -> Activity:
    """Read a gpx, tcx or fit file (optionally gzipped) into an Activity.

    The returned Activity stores the extracted records as a
    pandas.DataFrame in its `records` attribute and the sport as a
    string in its `sport` attribute.

    Parameters
    ----------
    file_path : pathlib.Path
        Path to a file of type .gpx, .tcx or .fit. Can be gzipped.

    Returns
    -------
    gpxtractor.Activity

    Raises
    ------
    ValueError
        if the file type is not gpx, tcx or fit or their gzipped
        equivalent.
    """
    extensions = ut._get_file_extensions(file_path)

    if extensions in (".gpx", ".gpx.gz"):
        sport, records = ut._handle_gzipped_xml_files(
            file_path, extensions, xml_ext.get_sport_from_gpx, xml_ext.extract_gpx
        )
    elif extensions in (".tcx", ".tcx.gz"):
        sport, records = ut._handle_gzipped_xml_files(
            file_path, extensions, xml_ext.get_sport_from_tcx, xml_ext.extract_tcx
        )
    elif extensions == ".fit":
        sport, records = fit_ext.extract_fit(file_path)
    elif extensions == ".fit.gz":
        # The FIT extractor is handed the decompressed binary stream.
        with gzip.open(file_path, "rb") as gz:
            sport, records = fit_ext.extract_fit(gz)
    else:
        raise ValueError("Not a valid file type: Try a GPX, TCX or FIT file")

    return Activity(
        file_type=ut._get_file_type_from_extensions(extensions),
        sport=sport,
        records=records,
    )
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import pathlib
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import fitdecode
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _convert_fit_coords_to_deg(coord):
    """Convert semicircle 32-bit integer coordinate to degrees.

    The scale maps 2**31 semicircles onto 180 degrees.
    """
    degrees_per_semicircle = 180 / 2**31
    return coord * degrees_per_semicircle
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _generate_frame_from_fit(fit_file: pathlib.Path, selected_frames: list):
    """Yield the data frames of `fit_file` whose name is in `selected_frames`.

    The file is read with ``fitdecode.FitReader`` with ``check_crc=False``;
    frames that are not of type ``FIT_FRAME_DATA`` are skipped.
    """
    with fitdecode.FitReader(fit_file, check_crc=False) as reader:
        for candidate in reader:
            if candidate.frame_type != fitdecode.FIT_FRAME_DATA:
                continue
            if candidate.name not in selected_frames:
                continue
            yield candidate
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _extract_str(frame, field_name: str):
    """Return the raw value of `field_name` from `frame`, or None when the
    field is absent or its value is None.
    """
    if not frame.has_field(field_name):
        return None
    # A present field whose value is None is returned as None as well.
    return frame.get_value(field_name)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _extract_value(frame, field_name: str, datatype):
    """Return the value of `field_name` converted with `datatype`.

    When the field is absent or its value is None, the placeholder is 0 if
    `datatype` is ``int`` and NaN for any other datatype.
    """
    raw = frame.get_value(field_name) if frame.has_field(field_name) else None
    if raw is None:
        return 0 if datatype is int else np.nan
    return datatype(raw)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_sport_from_fit(fit_content) -> str:
    """Return the "sport" field of the first session frame in `fit_content`,
    or None when there is no session frame or the field is missing.
    """
    first_session = next(_generate_frame_from_fit(fit_content, ["session"]), None)
    if first_session is None:
        return None
    return _extract_str(first_session, "sport")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def extract_fit(file_path: pathlib.Path) -> tuple[str | None, pd.DataFrame]:
    """Extract the sport and the per-record data from a FIT file.

    Parameters
    ----------
    file_path : pathlib.Path
        Path to the FIT file. The value is passed straight to the FIT
        reader, and `extract_data` also passes an open binary stream here
        for gzipped files.

    Returns
    -------
    tuple
        ``(sport, records)`` where ``sport`` is the "sport" field of the
        last session frame read (None when the file has no session frame)
        and ``records`` is a DataFrame with the columns lap, timestamp,
        latitude, longitude, distance, speed, altitude, heart_rate and
        cadence.
    """
    # Initialised up front: a file without a session frame would otherwise
    # hit an UnboundLocalError at the return statement.
    sport = None
    lap_number = 1
    laps = []
    times = []
    lats = []
    lons = []
    eles = []
    dists = []
    speeds = []
    hrs = []
    cads = []

    for frame in _generate_frame_from_fit(file_path, ["lap", "record", "session"]):
        if frame.name == "record":
            laps.append(lap_number)
            times.append(_extract_str(frame, "timestamp"))
            lats.append(_extract_value(frame, "position_lat", float))
            lons.append(_extract_value(frame, "position_long", float))
            eles.append(_extract_value(frame, "altitude", float))
            dists.append(_extract_value(frame, "distance", float))
            speeds.append(_extract_value(frame, "speed", float))
            hrs.append(_extract_value(frame, "heart_rate", int))
            cads.append(_extract_value(frame, "cadence", int))
        elif frame.name == "lap":
            # Records read after a lap frame are assigned to the next lap.
            lap_number += 1
        elif frame.name == "session":
            sport = _extract_str(frame, "sport")

    laps = np.array(laps, dtype=np.uint16)
    lats = _convert_fit_coords_to_deg(np.array(lats, dtype=np.float32))
    lons = _convert_fit_coords_to_deg(np.array(lons, dtype=np.float32))
    eles = np.array(eles, dtype=np.float32)
    dists = np.array(dists, dtype=np.float32)
    speeds = np.array(speeds, dtype=np.float32)
    hrs = np.array(hrs, dtype=np.uint8)
    cads = np.array(cads, dtype=np.uint8)

    return sport, pd.DataFrame(
        {
            "lap": laps,
            "timestamp": pd.to_datetime(times),
            "latitude": lats,
            "longitude": lons,
            "distance": dists,
            "speed": speeds,
            "altitude": eles,
            "heart_rate": hrs,
            "cadence": cads,
        }
    )
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import inspect
|
|
2
|
+
from importlib_resources import files
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import pyarrow as pa
|
|
5
|
+
import duckdb
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_var_name(var):
    """Return the name of the first local variable in the caller's frame
    that is bound to the very object `var` (identity, not equality).

    Raises IndexError when no local of the caller refers to `var`.
    """
    caller_locals = inspect.currentframe().f_back.f_locals
    matching_names = []
    for name, value in caller_locals.items():
        if value is var:
            matching_names.append(name)
    return matching_names[0]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def is_col_all_null(table: pa.Table, col: str) -> bool:
    """Return True when every value of column `col` in `table` is null.

    An empty column counts as all-null, so the result is always a real
    bool. Comparing the column's null count with its length also avoids
    depending on the ``pyarrow.compute`` submodule having been imported
    elsewhere.
    """
    column = table.column(col)
    return column.null_count == len(column)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def add_empty_col_if_absent(arrow_table: pa.Table, col: str, datatype) -> pa.Table:
    """Return `arrow_table` with a column named `col`.

    When the column already exists the table is returned unchanged;
    otherwise a column of nulls of type `datatype`, one per row, is
    appended.
    """
    if col in arrow_table.schema.names:
        return arrow_table
    placeholder = pa.nulls(arrow_table.num_rows, type=datatype)
    return arrow_table.append_column(col, placeholder)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def query_table(arrow_table: pa.Table, sql_file: str) -> pa.Table:
    """Run the query stored in `sql_file` against `arrow_table`.

    The SQL text is loaded from the ``gpxtractor.sql`` package resources and
    its ``{table_name}`` placeholder is filled with the name of the local
    variable that holds the table, as reported by `get_var_name`. The result
    is read back in full as a ``pyarrow.Table``.
    """
    resource = files("gpxtractor.sql").joinpath(sql_file)
    table_name = get_var_name(arrow_table)
    query = resource.read_text().format(table_name=table_name)
    # NOTE(review): presumably DuckDB resolves the table name to the local
    # `arrow_table` variable of this frame - keep this call inside this function.
    relation = duckdb.sql(query)
    return relation.arrow().read_all()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def compute_distance_and_speed(arrow_table: pa.Table) -> pa.Table:
    """Run ``compute_distance_and_speed.sql`` against `arrow_table`.

    The statements in ``haversine_formula.sql`` are executed first
    (presumably they define the haversine helper used by the query; the
    SQL itself is not visible here).
    """
    haversine_sql = files("gpxtractor.sql").joinpath("haversine_formula.sql").read_text()
    duckdb.sql(haversine_sql)
    return query_table(arrow_table, "compute_distance_and_speed.sql")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def compute_speed(arrow_table: pa.Table) -> pa.Table:
    """Run ``compute_speed.sql`` against `arrow_table` and return the result."""
    return query_table(arrow_table, "compute_speed.sql")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def preprocess_data(arrow_table: pa.Table) -> pa.Table:
    """Run ``preprocess_data.sql`` against `arrow_table` and return the result."""
    return query_table(arrow_table, "preprocess_data.sql")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def preprocess_running_data(arrow_table: pa.Table) -> pa.Table:
    """Run ``preprocess_running_data.sql`` against `arrow_table` and return
    the result.
    """
    return query_table(arrow_table, "preprocess_running_data.sql")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def transform_data(arrow_table: pa.Table, sport: str) -> pa.Table:
    """Complete and preprocess the records table of an activity.

    Steps, in order:

    1. add a null column for each required column that is absent;
    2. if "distance" is absent or entirely null, compute distance and
       speed; otherwise, if "speed" is absent or entirely null, compute
       speed only;
    3. apply the running-specific preprocessing when `sport` is
       "running", the generic preprocessing otherwise.
    """
    required_columns = {
        "timestamp": pa.timestamp("us"),
        "latitude": pa.float32(),
        "longitude": pa.float32(),
        "altitude": pa.float32(),
        "heart_rate": pa.uint8(),
        "cadence": pa.uint8(),
        "lap": pa.uint16(),
    }
    for name, arrow_type in required_columns.items():
        arrow_table = add_empty_col_if_absent(arrow_table, name, arrow_type)

    def lacks_data(column):
        # Evaluated lazily so the speed check only runs when distance is usable.
        return column not in arrow_table.schema.names or is_col_all_null(
            arrow_table, column
        )

    if lacks_data("distance"):
        arrow_table = compute_distance_and_speed(arrow_table)
    elif lacks_data("speed"):
        arrow_table = compute_speed(arrow_table)

    preprocess = preprocess_running_data if sport == "running" else preprocess_data
    return preprocess(arrow_table)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def compute_km_data(arrow_table: pa.Table) -> pd.DataFrame:
    """Run ``km_data_query.sql`` against `arrow_table` and return the result
    as a DataFrame with Arrow-backed dtypes.
    """
    result = query_table(arrow_table, "km_data_query.sql")
    return result.to_pandas(types_mapper=pd.ArrowDtype)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def compute_lap_data(arrow_table: pa.Table) -> pd.DataFrame:
    """Run ``lap_data_query.sql`` against `arrow_table` and return the result
    as a DataFrame with Arrow-backed dtypes.
    """
    result = query_table(arrow_table, "lap_data_query.sql")
    return result.to_pandas(types_mapper=pd.ArrowDtype)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def compute_overall_stats(arrow_table: pa.Table):
    """Run ``overall_stats.sql`` against `arrow_table` and return the result
    as a DataFrame with Arrow-backed dtypes.
    """
    result = query_table(arrow_table, "overall_stats.sql")
    return result.to_pandas(types_mapper=pd.ArrowDtype)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import pathlib
|
|
2
|
+
import gzip
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def _get_file_extensions(file_path: pathlib.Path) -> str:
    """Return the lower-cased type extension of `file_path`.

    Only the final suffix is kept, plus the one before it when the final
    suffix is ".gz" (e.g. ".gpx", ".tcx.gz"). Other dots in the file name
    are ignored, so "2024.05.01.run.gpx" yields ".gpx" rather than being
    rejected as an unknown type. Returns "" when the name has no suffix.

    Parameters
    ----------
    file_path : pathlib.Path or str
        Path of the file; only its name is inspected.
    """
    suffixes = [suffix.lower() for suffix in pathlib.Path(file_path).suffixes]
    # Keep the suffix preceding ".gz" so gzipped files expose their inner type.
    kept = 2 if suffixes[-1:] == [".gz"] else 1
    return "".join(suffixes[-kept:])
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _get_file_type_from_extensions(extensions) -> str:
    """Map a file extension string to 'GPX', 'TCX' or 'FIT'.

    Both the plain and the ".gz" form of each extension are recognised;
    any other value yields None.
    """
    if extensions in (".gpx", ".gpx.gz"):
        return "GPX"
    if extensions in (".tcx", ".tcx.gz"):
        return "TCX"
    if extensions in (".fit", ".fit.gz"):
        return "FIT"
    return None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _handle_gzipped_xml_files(
    file_path: pathlib.Path, extensions, sport_func, extraction_func
):
    """Return ``(sport, records)`` for an XML file, gzipped or not.

    `sport_func` always receives an open text stream. `extraction_func`
    receives an open binary gzip stream when `extensions` contains ".gz",
    and `file_path` itself otherwise.
    """
    if ".gz" not in extensions:
        with open(file_path, "r") as handle:
            sport = sport_func(handle)
        return sport, extraction_func(file_path)

    # The archive is opened twice: text mode for the sport lookup, then a
    # fresh binary stream for the record extraction.
    with gzip.open(file_path, "rt") as text_stream:
        sport = sport_func(text_stream)
    with gzip.GzipFile(file_path, "r") as binary_stream:
        return sport, extraction_func(binary_stream)
|