upc-datasets 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. upc_datasets-0.2.0/LICENSE +21 -0
  2. upc_datasets-0.2.0/PKG-INFO +214 -0
  3. upc_datasets-0.2.0/README.md +201 -0
  4. upc_datasets-0.2.0/pyproject.toml +30 -0
  5. upc_datasets-0.2.0/setup.cfg +4 -0
  6. upc_datasets-0.2.0/src/pachamix_data/__init__.py +5 -0
  7. upc_datasets-0.2.0/src/pachamix_data/builders/__init__.py +13 -0
  8. upc_datasets-0.2.0/src/pachamix_data/builders/audio_core.py +140 -0
  9. upc_datasets-0.2.0/src/pachamix_data/builders/lyrics_core.py +134 -0
  10. upc_datasets-0.2.0/src/pachamix_data/builders/playlist_events.py +138 -0
  11. upc_datasets-0.2.0/src/pachamix_data/builders/song_graph.py +41 -0
  12. upc_datasets-0.2.0/src/pachamix_data/cli.py +130 -0
  13. upc_datasets-0.2.0/src/pachamix_data/pipeline.py +111 -0
  14. upc_datasets-0.2.0/src/upc_datasets/__init__.py +27 -0
  15. upc_datasets-0.2.0/src/upc_datasets/catalog.py +203 -0
  16. upc_datasets-0.2.0/src/upc_datasets/cli.py +190 -0
  17. upc_datasets-0.2.0/src/upc_datasets.egg-info/PKG-INFO +214 -0
  18. upc_datasets-0.2.0/src/upc_datasets.egg-info/SOURCES.txt +33 -0
  19. upc_datasets-0.2.0/src/upc_datasets.egg-info/dependency_links.txt +1 -0
  20. upc_datasets-0.2.0/src/upc_datasets.egg-info/entry_points.txt +3 -0
  21. upc_datasets-0.2.0/src/upc_datasets.egg-info/requires.txt +5 -0
  22. upc_datasets-0.2.0/src/upc_datasets.egg-info/top_level.txt +2 -0
  23. upc_datasets-0.2.0/tests/test_audio_core_builder.py +79 -0
  24. upc_datasets-0.2.0/tests/test_cli.py +42 -0
  25. upc_datasets-0.2.0/tests/test_course_dataset_pipeline.py +50 -0
  26. upc_datasets-0.2.0/tests/test_course_dataset_pipeline_core_only.py +166 -0
  27. upc_datasets-0.2.0/tests/test_course_dataset_pipeline_playlist2vec.py +42 -0
  28. upc_datasets-0.2.0/tests/test_end_to_end_generation.py +74 -0
  29. upc_datasets-0.2.0/tests/test_lyrics_builder.py +143 -0
  30. upc_datasets-0.2.0/tests/test_lyrics_official_format.py +26 -0
  31. upc_datasets-0.2.0/tests/test_playlist2vec_builder.py +37 -0
  32. upc_datasets-0.2.0/tests/test_playlist_builder.py +46 -0
  33. upc_datasets-0.2.0/tests/test_playlist_directory_builder.py +27 -0
  34. upc_datasets-0.2.0/tests/test_song_graph_builder.py +32 -0
  35. upc_datasets-0.2.0/tests/test_upc_datasets_package.py +59 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Carlos Adrián Alarcón
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,214 @@
1
+ Metadata-Version: 2.4
2
+ Name: upc-datasets
3
+ Version: 0.2.0
4
+ Summary: Student-facing structured dataset toolkit for the UPC big data course.
5
+ Requires-Python: >=3.11
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: polars>=1.0.0
9
+ Requires-Dist: pyarrow>=14.0.0
10
+ Provides-Extra: dev
11
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
12
+ Dynamic: license-file
13
+
14
+ # UPC Datasets
15
+
16
+ This workspace contains a small Python toolkit for generating structured teaching datasets for the `PachaMix` course narrative.
17
+
18
+ The implementation is designed around:
19
+
20
+ - structured tables
21
+ - metadata
22
+ - audio-feature tables
23
+ - lyrics-derived features
24
+ - playlist interactions
25
+ - parquet outputs
26
+
27
+ It intentionally avoids:
28
+
29
+ - raw mp3 processing
30
+ - waveform pipelines
31
+ - dependence on live Spotify audio-feature endpoints
32
+
33
+ ## Data Sources
34
+
35
+ - `FMA` for metadata and audio features
36
+ - `musiXmatch/MSD` for lyrics-derived features
37
+ - `Playlist2vec` table exports for playlist interactions and graph construction
38
+ - optional `Spotify MPD` support when access is already available
39
+
40
+ Official source references:
41
+
42
+ - `FMA`: https://github.com/mdeff/fma
43
+ - `musiXmatch/MSD`: https://millionsongdataset.com/musixmatch/
44
+ - `Playlist2vec`: https://zenodo.org/records/5002584
45
+ - `Spotify MPD`: https://research.atspotify.com/2020/09/the-million-playlist-dataset-remastered
46
+
47
+ The codebase is intentionally focused on `structured data`, not raw media. That means:
48
+
49
+ - no mp3 decoding
50
+ - no spectrogram generation
51
+ - no waveform feature extraction inside the course toolkit
52
+
53
+ Instead, the builders assume the inputs are already in the form of:
54
+
55
+ - csv metadata tables
56
+ - csv feature tables
57
+ - lyric token-count text exports
58
+ - playlist membership tables or json playlist metadata
59
+
60
+ Detailed source notes are documented in [big_data_dataset_generation_plan.md](./big_data_dataset_generation_plan.md).
61
+
62
+ The processed schema reference is documented in [data_dictionary.md](./data_dictionary.md).
63
+
64
+ The student-oriented quickstart is documented in [STUDENT_GUIDE.md](./STUDENT_GUIDE.md).
65
+
66
+ Operational instructions are documented in [runbooks/README.md](./runbooks/README.md).
67
+
68
+ ## Runtime Note
69
+
70
+ `pyspark` was evaluated for large-scale processing, but the current local environment cannot launch Spark because the installed Java runtime is older than the version required by Spark `4.1`. For that reason, the working implementation uses `polars` plus `pyarrow`.
71
+
72
+ ## Quick Start
73
+
74
+ ```bash
75
+ python3 -m venv .venv
76
+ .venv/bin/pip install -e '.[dev]'
77
+ .venv/bin/python -m upc_datasets.cli --help
78
+ .venv/bin/pytest
79
+ ```
80
+
81
+ ## Student Package
82
+
83
+ The distribution name is `upc-datasets`.
84
+
85
+ If you want the shortest student path, read [STUDENT_GUIDE.md](./STUDENT_GUIDE.md).
86
+
87
+ Local development install:
88
+
89
+ ```bash
90
+ pip install -e .
91
+ ```
92
+
93
+ Student install after publishing to PyPI:
94
+
95
+ ```bash
96
+ pip install upc-datasets
97
+ ```
98
+
99
+ Python usage:
100
+
101
+ ```python
102
+ import upc_datasets
103
+
104
+ print(upc_datasets.list_datasets())
105
+ lyrics = upc_datasets.get_dataset_definition("pachamix_lyrics_long")
106
+ print(lyrics["grain"])
107
+ ```
108
+
109
+ CLI usage:
110
+
111
+ ```bash
112
+ upc-datasets list-datasets
113
+ upc-datasets show-dataset pachamix_lyrics_long
114
+ upc-datasets show-dataset pachamix_lyrics_long --format json
115
+ upc-datasets show-data-dictionary
116
+ ```
117
+
118
+ ## One-Command Course Build
119
+
120
+ If your raw data is arranged under `data/raw/` like this:
121
+
122
+ ```text
123
+ data/raw/
124
+ fma/
125
+ tracks.csv
126
+ features.csv
127
+ musixmatch_msd/
128
+ mxm_dataset_train.txt
129
+ mxm_dataset_test.txt
130
+ msd/
131
+ track_metadata.db
132
+ ```
133
+
134
+ then build the core course dataset with:
135
+
136
+ ```bash
137
+ .venv/bin/python -m upc_datasets.cli build-course-dataset \
138
+ --raw-root data/raw \
139
+ --processed-root data/processed
140
+ ```
141
+
142
+ or:
143
+
144
+ ```bash
145
+ make build-course-dataset RAW_ROOT=data/raw PROCESSED_ROOT=data/processed
146
+ ```
147
+
148
+ This always builds:
149
+
150
+ - `data/processed/pachamix_audio_core.parquet`
151
+ - `data/processed/pachamix_lyrics_long.parquet`
152
+
153
+ When `data/raw/msd/track_metadata.db` is present, the lyrics dataset is enriched with MSD metadata columns such as:
154
+
155
+ - `title`
156
+ - `song_id`
157
+ - `release`
158
+ - `artist_id`
159
+ - `artist_mbid`
160
+ - `artist_name`
161
+ - `duration`
162
+ - `artist_familiarity`
163
+ - `artist_hotttnesss`
164
+ - `year`
165
+ - `track_7digitalid`
166
+ - `shs_perf`
167
+ - `shs_work`
168
+
169
+ If you also want recommendation and graph data, add one of these optional behavior sources.
170
+
171
+ `Playlist2vec`:
172
+
173
+ ```text
174
+ data/raw/playlist2vec/
175
+ playlist.csv
176
+ track.csv
177
+ track_playlist1.csv
178
+ ```
179
+
180
+ Official `MPD`:
181
+
182
+ ```text
183
+ data/raw/mpd/
184
+ *.json
185
+ ```
186
+
187
+ When either optional source is present, the same one-command build also writes:
188
+
189
+ - `data/processed/pachamix_playlists/playlist_events.parquet`
190
+ - `data/processed/pachamix_song_graph_edges.parquet`
191
+
192
+ When both are present, the pipeline prefers `playlist2vec/`.
193
+
194
+ ## Example Commands
195
+
196
+ ```bash
197
+ .venv/bin/python -m upc_datasets.cli build-audio-core \
198
+ --tracks-csv data/raw/fma/tracks.csv \
199
+ --features-csv data/raw/fma/features.csv \
200
+ --output-parquet data/processed/pachamix_audio_core.parquet
201
+
202
+ .venv/bin/python -m upc_datasets.cli build-lyrics-core \
203
+ --lyrics-txt data/raw/musixmatch_msd \
204
+ --output-parquet data/processed/pachamix_lyrics_long.parquet \
205
+ --metadata-db data/raw/msd/track_metadata.db
206
+
207
+ .venv/bin/python -m upc_datasets.cli build-playlist-events \
208
+ --mpd-json data/raw/playlist2vec \
209
+ --output-dir data/processed/pachamix_playlists
210
+
211
+ .venv/bin/python -m upc_datasets.cli build-song-graph \
212
+ --playlist-events-parquet data/processed/pachamix_playlists/playlist_events.parquet \
213
+ --output-parquet data/processed/pachamix_song_graph_edges.parquet
214
+ ```
@@ -0,0 +1,201 @@
1
+ # UPC Datasets
2
+
3
+ This workspace contains a small Python toolkit for generating structured teaching datasets for the `PachaMix` course narrative.
4
+
5
+ The implementation is designed around:
6
+
7
+ - structured tables
8
+ - metadata
9
+ - audio-feature tables
10
+ - lyrics-derived features
11
+ - playlist interactions
12
+ - parquet outputs
13
+
14
+ It intentionally avoids:
15
+
16
+ - raw mp3 processing
17
+ - waveform pipelines
18
+ - dependence on live Spotify audio-feature endpoints
19
+
20
+ ## Data Sources
21
+
22
+ - `FMA` for metadata and audio features
23
+ - `musiXmatch/MSD` for lyrics-derived features
24
+ - `Playlist2vec` table exports for playlist interactions and graph construction
25
+ - optional `Spotify MPD` support when access is already available
26
+
27
+ Official source references:
28
+
29
+ - `FMA`: https://github.com/mdeff/fma
30
+ - `musiXmatch/MSD`: https://millionsongdataset.com/musixmatch/
31
+ - `Playlist2vec`: https://zenodo.org/records/5002584
32
+ - `Spotify MPD`: https://research.atspotify.com/2020/09/the-million-playlist-dataset-remastered
33
+
34
+ The codebase is intentionally focused on `structured data`, not raw media. That means:
35
+
36
+ - no mp3 decoding
37
+ - no spectrogram generation
38
+ - no waveform feature extraction inside the course toolkit
39
+
40
+ Instead, the builders assume the inputs are already in the form of:
41
+
42
+ - csv metadata tables
43
+ - csv feature tables
44
+ - lyric token-count text exports
45
+ - playlist membership tables or json playlist metadata
46
+
47
+ Detailed source notes are documented in [big_data_dataset_generation_plan.md](./big_data_dataset_generation_plan.md).
48
+
49
+ The processed schema reference is documented in [data_dictionary.md](./data_dictionary.md).
50
+
51
+ The student-oriented quickstart is documented in [STUDENT_GUIDE.md](./STUDENT_GUIDE.md).
52
+
53
+ Operational instructions are documented in [runbooks/README.md](./runbooks/README.md).
54
+
55
+ ## Runtime Note
56
+
57
+ `pyspark` was evaluated for large-scale processing, but the current local environment cannot launch Spark because the installed Java runtime is older than the version required by Spark `4.1`. For that reason, the working implementation uses `polars` plus `pyarrow`.
58
+
59
+ ## Quick Start
60
+
61
+ ```bash
62
+ python3 -m venv .venv
63
+ .venv/bin/pip install -e '.[dev]'
64
+ .venv/bin/python -m upc_datasets.cli --help
65
+ .venv/bin/pytest
66
+ ```
67
+
68
+ ## Student Package
69
+
70
+ The distribution name is `upc-datasets`.
71
+
72
+ If you want the shortest student path, read [STUDENT_GUIDE.md](./STUDENT_GUIDE.md).
73
+
74
+ Local development install:
75
+
76
+ ```bash
77
+ pip install -e .
78
+ ```
79
+
80
+ Student install after publishing to PyPI:
81
+
82
+ ```bash
83
+ pip install upc-datasets
84
+ ```
85
+
86
+ Python usage:
87
+
88
+ ```python
89
+ import upc_datasets
90
+
91
+ print(upc_datasets.list_datasets())
92
+ lyrics = upc_datasets.get_dataset_definition("pachamix_lyrics_long")
93
+ print(lyrics["grain"])
94
+ ```
95
+
96
+ CLI usage:
97
+
98
+ ```bash
99
+ upc-datasets list-datasets
100
+ upc-datasets show-dataset pachamix_lyrics_long
101
+ upc-datasets show-dataset pachamix_lyrics_long --format json
102
+ upc-datasets show-data-dictionary
103
+ ```
104
+
105
+ ## One-Command Course Build
106
+
107
+ If your raw data is arranged under `data/raw/` like this:
108
+
109
+ ```text
110
+ data/raw/
111
+ fma/
112
+ tracks.csv
113
+ features.csv
114
+ musixmatch_msd/
115
+ mxm_dataset_train.txt
116
+ mxm_dataset_test.txt
117
+ msd/
118
+ track_metadata.db
119
+ ```
120
+
121
+ then build the core course dataset with:
122
+
123
+ ```bash
124
+ .venv/bin/python -m upc_datasets.cli build-course-dataset \
125
+ --raw-root data/raw \
126
+ --processed-root data/processed
127
+ ```
128
+
129
+ or:
130
+
131
+ ```bash
132
+ make build-course-dataset RAW_ROOT=data/raw PROCESSED_ROOT=data/processed
133
+ ```
134
+
135
+ This always builds:
136
+
137
+ - `data/processed/pachamix_audio_core.parquet`
138
+ - `data/processed/pachamix_lyrics_long.parquet`
139
+
140
+ When `data/raw/msd/track_metadata.db` is present, the lyrics dataset is enriched with MSD metadata columns such as:
141
+
142
+ - `title`
143
+ - `song_id`
144
+ - `release`
145
+ - `artist_id`
146
+ - `artist_mbid`
147
+ - `artist_name`
148
+ - `duration`
149
+ - `artist_familiarity`
150
+ - `artist_hotttnesss`
151
+ - `year`
152
+ - `track_7digitalid`
153
+ - `shs_perf`
154
+ - `shs_work`
155
+
156
+ If you also want recommendation and graph data, add one of these optional behavior sources.
157
+
158
+ `Playlist2vec`:
159
+
160
+ ```text
161
+ data/raw/playlist2vec/
162
+ playlist.csv
163
+ track.csv
164
+ track_playlist1.csv
165
+ ```
166
+
167
+ Official `MPD`:
168
+
169
+ ```text
170
+ data/raw/mpd/
171
+ *.json
172
+ ```
173
+
174
+ When either optional source is present, the same one-command build also writes:
175
+
176
+ - `data/processed/pachamix_playlists/playlist_events.parquet`
177
+ - `data/processed/pachamix_song_graph_edges.parquet`
178
+
179
+ When both are present, the pipeline prefers `playlist2vec/`.
180
+
181
+ ## Example Commands
182
+
183
+ ```bash
184
+ .venv/bin/python -m upc_datasets.cli build-audio-core \
185
+ --tracks-csv data/raw/fma/tracks.csv \
186
+ --features-csv data/raw/fma/features.csv \
187
+ --output-parquet data/processed/pachamix_audio_core.parquet
188
+
189
+ .venv/bin/python -m upc_datasets.cli build-lyrics-core \
190
+ --lyrics-txt data/raw/musixmatch_msd \
191
+ --output-parquet data/processed/pachamix_lyrics_long.parquet \
192
+ --metadata-db data/raw/msd/track_metadata.db
193
+
194
+ .venv/bin/python -m upc_datasets.cli build-playlist-events \
195
+ --mpd-json data/raw/playlist2vec \
196
+ --output-dir data/processed/pachamix_playlists
197
+
198
+ .venv/bin/python -m upc_datasets.cli build-song-graph \
199
+ --playlist-events-parquet data/processed/pachamix_playlists/playlist_events.parquet \
200
+ --output-parquet data/processed/pachamix_song_graph_edges.parquet
201
+ ```
@@ -0,0 +1,30 @@
1
+ [project]
2
+ name = "upc-datasets"
3
+ version = "0.2.0"
4
+ description = "Student-facing structured dataset toolkit for the UPC big data course."
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "polars>=1.0.0",
9
+ "pyarrow>=14.0.0",
10
+ ]
11
+
12
+ [project.scripts]
13
+ upc-datasets = "upc_datasets.cli:main"
14
+ pachamix-data = "pachamix_data.cli:main"
15
+
16
+ [project.optional-dependencies]
17
+ dev = [
18
+ "pytest>=8.0.0",
19
+ ]
20
+
21
+ [build-system]
22
+ requires = ["setuptools>=68", "wheel"]
23
+ build-backend = "setuptools.build_meta"
24
+
25
+ [tool.setuptools.packages.find]
26
+ where = ["src"]
27
+
28
+ [tool.pytest.ini_options]
29
+ pythonpath = ["src"]
30
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,5 @@
1
+ """PachaMix structured dataset builders."""
2
+
3
+ __all__ = ["__version__"]
4
+
5
+ __version__ = "0.1.0"
@@ -0,0 +1,13 @@
1
+ """Dataset builders for the PachaMix course toolkit."""
2
+
3
+ from pachamix_data.builders.audio_core import build_audio_core
4
+ from pachamix_data.builders.lyrics_core import build_lyrics_core
5
+ from pachamix_data.builders.playlist_events import build_playlist_events
6
+ from pachamix_data.builders.song_graph import build_song_graph
7
+
8
+ __all__ = [
9
+ "build_audio_core",
10
+ "build_lyrics_core",
11
+ "build_playlist_events",
12
+ "build_song_graph",
13
+ ]
@@ -0,0 +1,140 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Iterable
7
+
8
+ import polars as pl
9
+
10
+
11
# Metadata columns kept from the FMA tracks table, in output order.
# Only the subset actually present after renaming is selected.
PREFERRED_TRACK_COLUMNS = (
    "track_id",
    "title",
    "genre_top",
    "artist_name",
)

# Maps the flattened multi-header FMA column names (joined with "__" by
# _flatten_headers) to the canonical names used in the processed dataset.
TRACK_COLUMN_RENAMES = {
    "track__id": "track_id",
    "track__title": "title",
    "track__genre_top": "genre_top",
    "artist__name": "artist_name",
}
24
+
25
+
26
+ def _normalize_token(token: str) -> str:
27
+ value = token.strip().lower()
28
+ value = re.sub(r"[^a-z0-9]+", "_", value)
29
+ return value.strip("_")
30
+
31
+
32
+ def _flatten_headers(header_rows: list[list[str]]) -> list[str]:
33
+ width = max(len(row) for row in header_rows)
34
+ headers: list[str] = []
35
+ for idx in range(width):
36
+ pieces = []
37
+ for row in header_rows:
38
+ if idx < len(row):
39
+ value = _normalize_token(row[idx])
40
+ if value:
41
+ pieces.append(value)
42
+ header = "__".join(pieces) or f"column_{idx}"
43
+ headers.append(header)
44
+ return headers
45
+
46
+
47
+ def _looks_like_data_row(row: list[str]) -> bool:
48
+ if not row:
49
+ return False
50
+ first_value = row[0].strip()
51
+ return first_value.isdigit()
52
+
53
+
54
def _read_multi_header_csv(path: Path) -> pl.DataFrame:
    """Read a CSV whose column names are spread across several header rows.

    Every row before the first "data-looking" row (leading integer cell,
    per _looks_like_data_row) is treated as a header row; the headers are
    flattened into single column names with _flatten_headers.  All
    remaining non-blank rows become string records, right-padded with
    empty strings to the header width.

    Raises:
        ValueError: if no header row precedes the data.
    """
    with path.open("r", encoding="utf-8", newline="") as handle:
        reader = csv.reader(handle)
        headers: list[list[str]] = []
        records: list[list[str]] = []
        for row in reader:
            # Skip fully blank rows in the header region.
            if not any(cell.strip() for cell in row):
                continue
            if _looks_like_data_row(row):
                # First data row found: keep it and stop collecting headers.
                records.append(row)
                break
            headers.append(row)
        # Consume the remainder of the file, dropping blank rows.
        records.extend(row for row in reader if any(cell.strip() for cell in row))
    if not headers:
        raise ValueError(f"expected at least one header row in {path}")
    columns = _flatten_headers(headers)
    # Pad short rows so every record matches the header width.
    # NOTE(review): rows LONGER than the header are passed through
    # unchanged — confirm polars tolerates such rows for these inputs.
    padded_records = [
        row + [""] * (len(columns) - len(row))
        for row in records
    ]
    return pl.DataFrame(padded_records, schema=columns, orient="row")
75
+
76
+
77
+ def _find_track_id_column(columns: Iterable[str]) -> str | None:
78
+ for column in columns:
79
+ if "track" in column and "id" in column:
80
+ return column
81
+ return None
82
+
83
+
84
def build_audio_core(
    tracks_csv: str | Path,
    features_csv: str | Path,
    output_parquet: str | Path,
) -> pl.DataFrame:
    """Build the audio-core table by joining FMA tracks and features.

    Reads both multi-header CSVs, canonicalizes column names, casts
    ``track_id`` to Int64 and every feature column to Float64, inner-joins
    the two tables on ``track_id``, writes the result to *output_parquet*
    (creating parent directories as needed), and returns the joined frame.

    Raises:
        ValueError: if either input lacks a recognizable track-id column.
    """
    tracks_path = Path(tracks_csv)
    features_path = Path(features_csv)
    output_path = Path(output_parquet)

    tracks = _read_multi_header_csv(tracks_path)
    # Rename only the known FMA flattened headers that are actually present.
    track_renames = {
        source: target
        for source, target in TRACK_COLUMN_RENAMES.items()
        if source in tracks.columns
    }
    if track_renames:
        tracks = tracks.rename(track_renames)
    if "track_id" not in tracks.columns:
        # Fall back to a fuzzy match over any column mentioning track + id.
        track_id_column = _find_track_id_column(tracks.columns)
        if track_id_column is None:
            raise ValueError("tracks input must include a track_id column")
        tracks = tracks.rename({track_id_column: "track_id"})

    # Keep only the preferred metadata columns that survived renaming,
    # in the canonical order defined by PREFERRED_TRACK_COLUMNS.
    selected_track_columns = [
        column for column in PREFERRED_TRACK_COLUMNS if column in tracks.columns
    ]
    tracks = tracks.select(selected_track_columns).with_columns(
        pl.col("track_id").cast(pl.Int64)
    )

    raw_features = _read_multi_header_csv(features_path)
    feature_track_id = _find_track_id_column(raw_features.columns)
    if feature_track_id is None:
        raise ValueError("features input must include a track_id column")

    # Collapse the "__" separators produced by header flattening and
    # canonicalize the id column name.
    feature_renames = {
        column: column.replace("__", "_")
        for column in raw_features.columns
        if column != feature_track_id
    }
    feature_renames[feature_track_id] = "track_id"
    features = raw_features.rename(feature_renames).with_columns(
        pl.col("track_id").cast(pl.Int64)
    )

    # All non-id feature columns are expected to be numeric; cast the CSV
    # strings to floats in one pass.
    numeric_feature_columns = [
        column for column in features.columns if column != "track_id"
    ]
    if numeric_feature_columns:
        features = features.with_columns(
            [pl.col(column).cast(pl.Float64) for column in numeric_feature_columns]
        )

    frame = tracks.join(features, on="track_id", how="inner")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    frame.write_parquet(output_path)
    return frame