upc-datasets 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. upc_datasets-0.2.0/LICENSE +21 -0
  2. upc_datasets-0.2.0/PKG-INFO +214 -0
  3. upc_datasets-0.2.0/README.md +201 -0
  4. upc_datasets-0.2.0/pyproject.toml +30 -0
  5. upc_datasets-0.2.0/setup.cfg +4 -0
  6. upc_datasets-0.2.0/src/pachamix_data/__init__.py +5 -0
  7. upc_datasets-0.2.0/src/pachamix_data/builders/__init__.py +13 -0
  8. upc_datasets-0.2.0/src/pachamix_data/builders/audio_core.py +140 -0
  9. upc_datasets-0.2.0/src/pachamix_data/builders/lyrics_core.py +134 -0
  10. upc_datasets-0.2.0/src/pachamix_data/builders/playlist_events.py +138 -0
  11. upc_datasets-0.2.0/src/pachamix_data/builders/song_graph.py +41 -0
  12. upc_datasets-0.2.0/src/pachamix_data/cli.py +130 -0
  13. upc_datasets-0.2.0/src/pachamix_data/pipeline.py +111 -0
  14. upc_datasets-0.2.0/src/upc_datasets/__init__.py +27 -0
  15. upc_datasets-0.2.0/src/upc_datasets/catalog.py +203 -0
  16. upc_datasets-0.2.0/src/upc_datasets/cli.py +190 -0
  17. upc_datasets-0.2.0/src/upc_datasets.egg-info/PKG-INFO +214 -0
  18. upc_datasets-0.2.0/src/upc_datasets.egg-info/SOURCES.txt +33 -0
  19. upc_datasets-0.2.0/src/upc_datasets.egg-info/dependency_links.txt +1 -0
  20. upc_datasets-0.2.0/src/upc_datasets.egg-info/entry_points.txt +3 -0
  21. upc_datasets-0.2.0/src/upc_datasets.egg-info/requires.txt +5 -0
  22. upc_datasets-0.2.0/src/upc_datasets.egg-info/top_level.txt +2 -0
  23. upc_datasets-0.2.0/tests/test_audio_core_builder.py +79 -0
  24. upc_datasets-0.2.0/tests/test_cli.py +42 -0
  25. upc_datasets-0.2.0/tests/test_course_dataset_pipeline.py +50 -0
  26. upc_datasets-0.2.0/tests/test_course_dataset_pipeline_core_only.py +166 -0
  27. upc_datasets-0.2.0/tests/test_course_dataset_pipeline_playlist2vec.py +42 -0
  28. upc_datasets-0.2.0/tests/test_end_to_end_generation.py +74 -0
  29. upc_datasets-0.2.0/tests/test_lyrics_builder.py +143 -0
  30. upc_datasets-0.2.0/tests/test_lyrics_official_format.py +26 -0
  31. upc_datasets-0.2.0/tests/test_playlist2vec_builder.py +37 -0
  32. upc_datasets-0.2.0/tests/test_playlist_builder.py +46 -0
  33. upc_datasets-0.2.0/tests/test_playlist_directory_builder.py +27 -0
  34. upc_datasets-0.2.0/tests/test_song_graph_builder.py +32 -0
  35. upc_datasets-0.2.0/tests/test_upc_datasets_package.py +59 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Carlos Adrián Alarcón
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,214 @@
1
+ Metadata-Version: 2.4
2
+ Name: upc-datasets
3
+ Version: 0.2.0
4
+ Summary: Student-facing structured dataset toolkit for the UPC big data course.
5
+ Requires-Python: >=3.11
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: polars>=1.0.0
9
+ Requires-Dist: pyarrow>=14.0.0
10
+ Provides-Extra: dev
11
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
12
+ Dynamic: license-file
13
+
14
+ # UPC Datasets
15
+
16
+ This workspace contains a small Python toolkit for generating structured teaching datasets for the `PachaMix` course narrative.
17
+
18
+ The implementation is designed around:
19
+
20
+ - structured tables
21
+ - metadata
22
+ - audio-feature tables
23
+ - lyrics-derived features
24
+ - playlist interactions
25
+ - parquet outputs
26
+
27
+ It intentionally avoids:
28
+
29
+ - raw mp3 processing
30
+ - waveform pipelines
31
+ - dependence on live Spotify audio-feature endpoints
32
+
33
+ ## Data Sources
34
+
35
+ - `FMA` for metadata and audio features
36
+ - `musiXmatch/MSD` for lyrics-derived features
37
+ - `Playlist2vec` table exports for playlist interactions and graph construction
38
+ - optional `Spotify MPD` support when access is already available
39
+
40
+ Official source references:
41
+
42
+ - `FMA`: https://github.com/mdeff/fma
43
+ - `musiXmatch/MSD`: https://millionsongdataset.com/musixmatch/
44
+ - `Playlist2vec`: https://zenodo.org/records/5002584
45
+ - `Spotify MPD`: https://research.atspotify.com/2020/09/the-million-playlist-dataset-remastered
46
+
47
+ The codebase is intentionally focused on `structured data`, not raw media. That means:
48
+
49
+ - no mp3 decoding
50
+ - no spectrogram generation
51
+ - no waveform feature extraction inside the course toolkit
52
+
53
+ Instead, the builders assume the inputs are already in the form of:
54
+
55
+ - csv metadata tables
56
+ - csv feature tables
57
+ - lyric token-count text exports
58
+ - playlist membership tables or json playlist metadata
59
+
60
+ Detailed source notes are documented in [big_data_dataset_generation_plan.md](./big_data_dataset_generation_plan.md).
61
+
62
+ The processed schema reference is documented in [data_dictionary.md](./data_dictionary.md).
63
+
64
+ The student-oriented quickstart is documented in [STUDENT_GUIDE.md](./STUDENT_GUIDE.md).
65
+
66
+ Operational instructions are documented in [runbooks/README.md](./runbooks/README.md).
67
+
68
+ ## Runtime Note
69
+
70
+ `pyspark` was evaluated for large-scale processing, but the current local environment cannot launch Spark because the installed Java runtime is older than the version required by Spark `4.1`. For that reason, the working implementation uses `polars` plus `pyarrow`.
71
+
72
+ ## Quick Start
73
+
74
+ ```bash
75
+ python3 -m venv .venv
76
+ .venv/bin/pip install -e '.[dev]'
77
+ .venv/bin/python -m upc_datasets.cli --help
78
+ .venv/bin/pytest
79
+ ```
80
+
81
+ ## Student Package
82
+
83
+ The distribution name is `upc-datasets`.
84
+
85
+ If you want the shortest student path, read [STUDENT_GUIDE.md](./STUDENT_GUIDE.md).
86
+
87
+ Local development install:
88
+
89
+ ```bash
90
+ pip install -e .
91
+ ```
92
+
93
+ Student install after publishing to PyPI:
94
+
95
+ ```bash
96
+ pip install upc-datasets
97
+ ```
98
+
99
+ Python usage:
100
+
101
+ ```python
102
+ import upc_datasets
103
+
104
+ print(upc_datasets.list_datasets())
105
+ lyrics = upc_datasets.get_dataset_definition("pachamix_lyrics_long")
106
+ print(lyrics["grain"])
107
+ ```
108
+
109
+ CLI usage:
110
+
111
+ ```bash
112
+ upc-datasets list-datasets
113
+ upc-datasets show-dataset pachamix_lyrics_long
114
+ upc-datasets show-dataset pachamix_lyrics_long --format json
115
+ upc-datasets show-data-dictionary
116
+ ```
117
+
118
+ ## One-Command Course Build
119
+
120
+ If your raw data is arranged under `data/raw/` like this:
121
+
122
+ ```text
123
+ data/raw/
124
+ fma/
125
+ tracks.csv
126
+ features.csv
127
+ musixmatch_msd/
128
+ mxm_dataset_train.txt
129
+ mxm_dataset_test.txt
130
+ msd/
131
+ track_metadata.db
132
+ ```
133
+
134
+ then build the core course dataset with:
135
+
136
+ ```bash
137
+ .venv/bin/python -m upc_datasets.cli build-course-dataset \
138
+ --raw-root data/raw \
139
+ --processed-root data/processed
140
+ ```
141
+
142
+ or:
143
+
144
+ ```bash
145
+ make build-course-dataset RAW_ROOT=data/raw PROCESSED_ROOT=data/processed
146
+ ```
147
+
148
+ This always builds:
149
+
150
+ - `data/processed/pachamix_audio_core.parquet`
151
+ - `data/processed/pachamix_lyrics_long.parquet`
152
+
153
+ When `data/raw/msd/track_metadata.db` is present, the lyrics dataset is enriched with MSD metadata columns such as:
154
+
155
+ - `title`
156
+ - `song_id`
157
+ - `release`
158
+ - `artist_id`
159
+ - `artist_mbid`
160
+ - `artist_name`
161
+ - `duration`
162
+ - `artist_familiarity`
163
+ - `artist_hotttnesss`
164
+ - `year`
165
+ - `track_7digitalid`
166
+ - `shs_perf`
167
+ - `shs_work`
168
+
169
+ If you also want recommendation and graph data, add one of these optional behavior sources.
170
+
171
+ `Playlist2vec`:
172
+
173
+ ```text
174
+ data/raw/playlist2vec/
175
+ playlist.csv
176
+ track.csv
177
+ track_playlist1.csv
178
+ ```
179
+
180
+ Official `MPD`:
181
+
182
+ ```text
183
+ data/raw/mpd/
184
+ *.json
185
+ ```
186
+
187
+ When either optional source is present, the same one-command build also writes:
188
+
189
+ - `data/processed/pachamix_playlists/playlist_events.parquet`
190
+ - `data/processed/pachamix_song_graph_edges.parquet`
191
+
192
+ When both are present, the pipeline prefers `playlist2vec/`.
193
+
194
+ ## Example Commands
195
+
196
+ ```bash
197
+ .venv/bin/python -m upc_datasets.cli build-audio-core \
198
+ --tracks-csv data/raw/fma/tracks.csv \
199
+ --features-csv data/raw/fma/features.csv \
200
+ --output-parquet data/processed/pachamix_audio_core.parquet
201
+
202
+ .venv/bin/python -m upc_datasets.cli build-lyrics-core \
203
+ --lyrics-txt data/raw/musixmatch_msd \
204
+ --output-parquet data/processed/pachamix_lyrics_long.parquet \
205
+ --metadata-db data/raw/msd/track_metadata.db
206
+
207
+ .venv/bin/python -m upc_datasets.cli build-playlist-events \
208
+ --mpd-json data/raw/playlist2vec \
209
+ --output-dir data/processed/pachamix_playlists
210
+
211
+ .venv/bin/python -m upc_datasets.cli build-song-graph \
212
+ --playlist-events-parquet data/processed/pachamix_playlists/playlist_events.parquet \
213
+ --output-parquet data/processed/pachamix_song_graph_edges.parquet
214
+ ```
@@ -0,0 +1,201 @@
1
+ # UPC Datasets
2
+
3
+ This workspace contains a small Python toolkit for generating structured teaching datasets for the `PachaMix` course narrative.
4
+
5
+ The implementation is designed around:
6
+
7
+ - structured tables
8
+ - metadata
9
+ - audio-feature tables
10
+ - lyrics-derived features
11
+ - playlist interactions
12
+ - parquet outputs
13
+
14
+ It intentionally avoids:
15
+
16
+ - raw mp3 processing
17
+ - waveform pipelines
18
+ - dependence on live Spotify audio-feature endpoints
19
+
20
+ ## Data Sources
21
+
22
+ - `FMA` for metadata and audio features
23
+ - `musiXmatch/MSD` for lyrics-derived features
24
+ - `Playlist2vec` table exports for playlist interactions and graph construction
25
+ - optional `Spotify MPD` support when access is already available
26
+
27
+ Official source references:
28
+
29
+ - `FMA`: https://github.com/mdeff/fma
30
+ - `musiXmatch/MSD`: https://millionsongdataset.com/musixmatch/
31
+ - `Playlist2vec`: https://zenodo.org/records/5002584
32
+ - `Spotify MPD`: https://research.atspotify.com/2020/09/the-million-playlist-dataset-remastered
33
+
34
+ The codebase is intentionally focused on `structured data`, not raw media. That means:
35
+
36
+ - no mp3 decoding
37
+ - no spectrogram generation
38
+ - no waveform feature extraction inside the course toolkit
39
+
40
+ Instead, the builders assume the inputs are already in the form of:
41
+
42
+ - csv metadata tables
43
+ - csv feature tables
44
+ - lyric token-count text exports
45
+ - playlist membership tables or json playlist metadata
46
+
47
+ Detailed source notes are documented in [big_data_dataset_generation_plan.md](./big_data_dataset_generation_plan.md).
48
+
49
+ The processed schema reference is documented in [data_dictionary.md](./data_dictionary.md).
50
+
51
+ The student-oriented quickstart is documented in [STUDENT_GUIDE.md](./STUDENT_GUIDE.md).
52
+
53
+ Operational instructions are documented in [runbooks/README.md](./runbooks/README.md).
54
+
55
+ ## Runtime Note
56
+
57
+ `pyspark` was evaluated for large-scale processing, but the current local environment cannot launch Spark because the installed Java runtime is older than the version required by Spark `4.1`. For that reason, the working implementation uses `polars` plus `pyarrow`.
58
+
59
+ ## Quick Start
60
+
61
+ ```bash
62
+ python3 -m venv .venv
63
+ .venv/bin/pip install -e '.[dev]'
64
+ .venv/bin/python -m upc_datasets.cli --help
65
+ .venv/bin/pytest
66
+ ```
67
+
68
+ ## Student Package
69
+
70
+ The distribution name is `upc-datasets`.
71
+
72
+ If you want the shortest student path, read [STUDENT_GUIDE.md](./STUDENT_GUIDE.md).
73
+
74
+ Local development install:
75
+
76
+ ```bash
77
+ pip install -e .
78
+ ```
79
+
80
+ Student install after publishing to PyPI:
81
+
82
+ ```bash
83
+ pip install upc-datasets
84
+ ```
85
+
86
+ Python usage:
87
+
88
+ ```python
89
+ import upc_datasets
90
+
91
+ print(upc_datasets.list_datasets())
92
+ lyrics = upc_datasets.get_dataset_definition("pachamix_lyrics_long")
93
+ print(lyrics["grain"])
94
+ ```
95
+
96
+ CLI usage:
97
+
98
+ ```bash
99
+ upc-datasets list-datasets
100
+ upc-datasets show-dataset pachamix_lyrics_long
101
+ upc-datasets show-dataset pachamix_lyrics_long --format json
102
+ upc-datasets show-data-dictionary
103
+ ```
104
+
105
+ ## One-Command Course Build
106
+
107
+ If your raw data is arranged under `data/raw/` like this:
108
+
109
+ ```text
110
+ data/raw/
111
+ fma/
112
+ tracks.csv
113
+ features.csv
114
+ musixmatch_msd/
115
+ mxm_dataset_train.txt
116
+ mxm_dataset_test.txt
117
+ msd/
118
+ track_metadata.db
119
+ ```
120
+
121
+ then build the core course dataset with:
122
+
123
+ ```bash
124
+ .venv/bin/python -m upc_datasets.cli build-course-dataset \
125
+ --raw-root data/raw \
126
+ --processed-root data/processed
127
+ ```
128
+
129
+ or:
130
+
131
+ ```bash
132
+ make build-course-dataset RAW_ROOT=data/raw PROCESSED_ROOT=data/processed
133
+ ```
134
+
135
+ This always builds:
136
+
137
+ - `data/processed/pachamix_audio_core.parquet`
138
+ - `data/processed/pachamix_lyrics_long.parquet`
139
+
140
+ When `data/raw/msd/track_metadata.db` is present, the lyrics dataset is enriched with MSD metadata columns such as:
141
+
142
+ - `title`
143
+ - `song_id`
144
+ - `release`
145
+ - `artist_id`
146
+ - `artist_mbid`
147
+ - `artist_name`
148
+ - `duration`
149
+ - `artist_familiarity`
150
+ - `artist_hotttnesss`
151
+ - `year`
152
+ - `track_7digitalid`
153
+ - `shs_perf`
154
+ - `shs_work`
155
+
156
+ If you also want recommendation and graph data, add one of these optional behavior sources.
157
+
158
+ `Playlist2vec`:
159
+
160
+ ```text
161
+ data/raw/playlist2vec/
162
+ playlist.csv
163
+ track.csv
164
+ track_playlist1.csv
165
+ ```
166
+
167
+ Official `MPD`:
168
+
169
+ ```text
170
+ data/raw/mpd/
171
+ *.json
172
+ ```
173
+
174
+ When either optional source is present, the same one-command build also writes:
175
+
176
+ - `data/processed/pachamix_playlists/playlist_events.parquet`
177
+ - `data/processed/pachamix_song_graph_edges.parquet`
178
+
179
+ When both are present, the pipeline prefers `playlist2vec/`.
180
+
181
+ ## Example Commands
182
+
183
+ ```bash
184
+ .venv/bin/python -m upc_datasets.cli build-audio-core \
185
+ --tracks-csv data/raw/fma/tracks.csv \
186
+ --features-csv data/raw/fma/features.csv \
187
+ --output-parquet data/processed/pachamix_audio_core.parquet
188
+
189
+ .venv/bin/python -m upc_datasets.cli build-lyrics-core \
190
+ --lyrics-txt data/raw/musixmatch_msd \
191
+ --output-parquet data/processed/pachamix_lyrics_long.parquet \
192
+ --metadata-db data/raw/msd/track_metadata.db
193
+
194
+ .venv/bin/python -m upc_datasets.cli build-playlist-events \
195
+ --mpd-json data/raw/playlist2vec \
196
+ --output-dir data/processed/pachamix_playlists
197
+
198
+ .venv/bin/python -m upc_datasets.cli build-song-graph \
199
+ --playlist-events-parquet data/processed/pachamix_playlists/playlist_events.parquet \
200
+ --output-parquet data/processed/pachamix_song_graph_edges.parquet
201
+ ```
@@ -0,0 +1,30 @@
1
+ [project]
2
+ name = "upc-datasets"
3
+ version = "0.2.0"
4
+ description = "Student-facing structured dataset toolkit for the UPC big data course."
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "polars>=1.0.0",
9
+ "pyarrow>=14.0.0",
10
+ ]
11
+
12
+ [project.scripts]
13
+ upc-datasets = "upc_datasets.cli:main"
14
+ pachamix-data = "pachamix_data.cli:main"
15
+
16
+ [project.optional-dependencies]
17
+ dev = [
18
+ "pytest>=8.0.0",
19
+ ]
20
+
21
+ [build-system]
22
+ requires = ["setuptools>=68", "wheel"]
23
+ build-backend = "setuptools.build_meta"
24
+
25
+ [tool.setuptools.packages.find]
26
+ where = ["src"]
27
+
28
+ [tool.pytest.ini_options]
29
+ pythonpath = ["src"]
30
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,5 @@
1
+ """PachaMix structured dataset builders."""
2
+
3
+ __all__ = ["__version__"]
4
+
5
+ __version__ = "0.1.0"
@@ -0,0 +1,13 @@
1
+ """Dataset builders for the PachaMix course toolkit."""
2
+
3
+ from pachamix_data.builders.audio_core import build_audio_core
4
+ from pachamix_data.builders.lyrics_core import build_lyrics_core
5
+ from pachamix_data.builders.playlist_events import build_playlist_events
6
+ from pachamix_data.builders.song_graph import build_song_graph
7
+
8
+ __all__ = [
9
+ "build_audio_core",
10
+ "build_lyrics_core",
11
+ "build_playlist_events",
12
+ "build_song_graph",
13
+ ]
@@ -0,0 +1,140 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Iterable
7
+
8
+ import polars as pl
9
+
10
+
11
# Metadata columns kept from the FMA tracks table, in output order.
# Only the subset actually present after renaming is selected.
PREFERRED_TRACK_COLUMNS = (
    "track_id",
    "title",
    "genre_top",
    "artist_name",
)

# Maps the flattened multi-header FMA column names (joined with "__" by
# _flatten_headers) to the canonical names used in the processed dataset.
TRACK_COLUMN_RENAMES = {
    "track__id": "track_id",
    "track__title": "title",
    "track__genre_top": "genre_top",
    "artist__name": "artist_name",
}
24
+
25
+
26
+ def _normalize_token(token: str) -> str:
27
+ value = token.strip().lower()
28
+ value = re.sub(r"[^a-z0-9]+", "_", value)
29
+ return value.strip("_")
30
+
31
+
32
+ def _flatten_headers(header_rows: list[list[str]]) -> list[str]:
33
+ width = max(len(row) for row in header_rows)
34
+ headers: list[str] = []
35
+ for idx in range(width):
36
+ pieces = []
37
+ for row in header_rows:
38
+ if idx < len(row):
39
+ value = _normalize_token(row[idx])
40
+ if value:
41
+ pieces.append(value)
42
+ header = "__".join(pieces) or f"column_{idx}"
43
+ headers.append(header)
44
+ return headers
45
+
46
+
47
+ def _looks_like_data_row(row: list[str]) -> bool:
48
+ if not row:
49
+ return False
50
+ first_value = row[0].strip()
51
+ return first_value.isdigit()
52
+
53
+
54
def _read_multi_header_csv(path: Path) -> pl.DataFrame:
    """Read a CSV whose column names are spread across several header rows.

    Every row before the first "data-looking" row (leading integer cell,
    per _looks_like_data_row) is treated as a header row; the headers are
    flattened into single column names with _flatten_headers.  All
    remaining non-blank rows become string records, right-padded with
    empty strings to the header width.

    Raises:
        ValueError: if no header row precedes the data.
    """
    with path.open("r", encoding="utf-8", newline="") as handle:
        reader = csv.reader(handle)
        headers: list[list[str]] = []
        records: list[list[str]] = []
        for row in reader:
            # Skip fully blank rows in the header region.
            if not any(cell.strip() for cell in row):
                continue
            if _looks_like_data_row(row):
                # First data row found: keep it and stop collecting headers.
                records.append(row)
                break
            headers.append(row)
        # Consume the remainder of the file, dropping blank rows.
        records.extend(row for row in reader if any(cell.strip() for cell in row))
    if not headers:
        raise ValueError(f"expected at least one header row in {path}")
    columns = _flatten_headers(headers)
    # Pad short rows so every record matches the header width.
    # NOTE(review): rows LONGER than the header are passed through
    # unchanged — confirm polars tolerates such rows for these inputs.
    padded_records = [
        row + [""] * (len(columns) - len(row))
        for row in records
    ]
    return pl.DataFrame(padded_records, schema=columns, orient="row")
75
+
76
+
77
+ def _find_track_id_column(columns: Iterable[str]) -> str | None:
78
+ for column in columns:
79
+ if "track" in column and "id" in column:
80
+ return column
81
+ return None
82
+
83
+
84
def build_audio_core(
    tracks_csv: str | Path,
    features_csv: str | Path,
    output_parquet: str | Path,
) -> pl.DataFrame:
    """Build the audio-core table by joining FMA tracks and features.

    Reads both multi-header CSVs, canonicalizes column names, casts
    ``track_id`` to Int64 and every feature column to Float64, inner-joins
    the two tables on ``track_id``, writes the result to *output_parquet*
    (creating parent directories as needed), and returns the joined frame.

    Raises:
        ValueError: if either input lacks a recognizable track-id column.
    """
    tracks_path = Path(tracks_csv)
    features_path = Path(features_csv)
    output_path = Path(output_parquet)

    tracks = _read_multi_header_csv(tracks_path)
    # Rename only the known FMA flattened headers that are actually present.
    track_renames = {
        source: target
        for source, target in TRACK_COLUMN_RENAMES.items()
        if source in tracks.columns
    }
    if track_renames:
        tracks = tracks.rename(track_renames)
    if "track_id" not in tracks.columns:
        # Fall back to a fuzzy match over any column mentioning track + id.
        track_id_column = _find_track_id_column(tracks.columns)
        if track_id_column is None:
            raise ValueError("tracks input must include a track_id column")
        tracks = tracks.rename({track_id_column: "track_id"})

    # Keep only the preferred metadata columns that survived renaming,
    # in the canonical order defined by PREFERRED_TRACK_COLUMNS.
    selected_track_columns = [
        column for column in PREFERRED_TRACK_COLUMNS if column in tracks.columns
    ]
    tracks = tracks.select(selected_track_columns).with_columns(
        pl.col("track_id").cast(pl.Int64)
    )

    raw_features = _read_multi_header_csv(features_path)
    feature_track_id = _find_track_id_column(raw_features.columns)
    if feature_track_id is None:
        raise ValueError("features input must include a track_id column")

    # Collapse the "__" separators produced by header flattening and
    # canonicalize the id column name.
    feature_renames = {
        column: column.replace("__", "_")
        for column in raw_features.columns
        if column != feature_track_id
    }
    feature_renames[feature_track_id] = "track_id"
    features = raw_features.rename(feature_renames).with_columns(
        pl.col("track_id").cast(pl.Int64)
    )

    # All non-id feature columns are expected to be numeric; cast the CSV
    # strings to floats in one pass.
    numeric_feature_columns = [
        column for column in features.columns if column != "track_id"
    ]
    if numeric_feature_columns:
        features = features.with_columns(
            [pl.col(column).cast(pl.Float64) for column in numeric_feature_columns]
        )

    frame = tracks.join(features, on="track_id", how="inner")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    frame.write_parquet(output_path)
    return frame