spotify-analytics-dataloader 0.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,224 @@ .gitignore
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+ #poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ #pdm.lock
+ #pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ #pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Cursor
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+ # refer to https://docs.cursor.com/context/ignore-files
+ .cursorignore
+ .cursorindexingignore
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
+
+ # Git worktrees
+ .worktrees/
+ .claude/worktrees/
+
+ # Project specific
+ # Vector database and user data
+ data/vectordb/
+ # sqlite databases and wal, shm files - ignore all but keep directory
+ data/*.db*
+ # Exclude user's actual Spotify data, but keep sample data
+ data/spotify_history/*
+ !data/spotify_history/sample_history.json
+ # logs
+ logs/
+ # Internal planning docs (superpowers/Claude Code session artifacts)
+ docs/superpowers/
@@ -0,0 +1,21 @@ LICENSE
+ MIT License
+
+ Copyright (c) 2025 WC Chang
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,25 @@ PKG-INFO
+ Metadata-Version: 2.4
+ Name: spotify-analytics-dataloader
+ Version: 0.1.0
+ Summary: Data loading and transformation utilities for Spotify listening history exports.
+ Project-URL: Homepage, https://github.com/wcnoname5/spotify-ai-analytics
+ Project-URL: Repository, https://github.com/wcnoname5/spotify-ai-analytics
+ Project-URL: Issues, https://github.com/wcnoname5/spotify-ai-analytics/issues
+ Author: WC Chang
+ License: MIT
+ License-File: LICENSE
+ Keywords: dataloader,listening-history,polars,spotify
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Python: >=3.12
+ Requires-Dist: polars>=1.0
+ Requires-Dist: pydantic>=2.0
+ Description-Content-Type: text/markdown
+
+ # spotify-analytics-dataloader
+
+ Data loading and transformation utilities for Spotify listening history exports.
@@ -0,0 +1,3 @@ README.md
+ # spotify-analytics-dataloader
+
+ Data loading and transformation utilities for Spotify listening history exports.
@@ -0,0 +1,34 @@ pyproject.toml
+ [project]
+ name = "spotify-analytics-dataloader"
+ version = "0.1.0"
+ description = "Data loading and transformation utilities for Spotify listening history exports."
+ readme = "README.md"
+ license = { text = "MIT" }
+ license-files = ["LICENSE"]
+ authors = [{ name = "WC Chang" }]
+ requires-python = ">=3.12"
+ keywords = ["spotify", "dataloader", "polars", "listening-history"]
+ classifiers = [
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.12",
+     "Programming Language :: Python :: 3.13",
+     "License :: OSI Approved :: MIT License",
+     "Operating System :: OS Independent",
+     "Intended Audience :: Developers",
+ ]
+ dependencies = [
+     "polars>=1.0",
+     "pydantic>=2.0",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/wcnoname5/spotify-ai-analytics"
+ Repository = "https://github.com/wcnoname5/spotify-ai-analytics"
+ Issues = "https://github.com/wcnoname5/spotify-ai-analytics/issues"
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["spotify_dataloader"]
@@ -0,0 +1,26 @@ spotify_dataloader/__init__.py
+ from .data_loader import SpotifyDataLoader
+ from .analysis_functions import (
+     SummaryStats, query_data, aggregate_table, get_summary,
+     get_top_artists, get_top_tracks, get_monthly_listening_trend,
+     get_weekly_listening_trend, get_raw_df
+ )
+ from .models import Track, JsonTrackRecord, MONTHS, WEEKDAYS
+ from . import analysis_functions
+
+ __all__ = [
+     "SpotifyDataLoader",
+     "Track",
+     "JsonTrackRecord",
+     "MONTHS",
+     "WEEKDAYS",
+     "SummaryStats",
+     "query_data",
+     "aggregate_table",
+     "get_summary",
+     "get_top_artists",
+     "get_top_tracks",
+     "get_monthly_listening_trend",
+     "get_weekly_listening_trend",
+     "get_raw_df",
+     "analysis_functions",
+ ]
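
The exports above support a short end-to-end flow. A minimal usage sketch, assuming a directory of Spotify "Streaming*.json" exports (the path below is a placeholder, and the printed keys come from the SummaryStats definition later in this diff):

import logging
from spotify_dataloader import SpotifyDataLoader, get_summary, get_top_artists

logging.basicConfig(level=logging.INFO)

loader = SpotifyDataLoader(directory="data/spotify_history")  # placeholder path
df = loader.df  # first access triggers loading and preprocessing
stats = get_summary(df)
print(stats["total_records"], "plays,", stats["total_listening_time"], "minutes")
print(get_top_artists(df, k=5))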
@@ -0,0 +1,340 @@ spotify_dataloader/analysis_functions.py
+ import polars as pl
+ import logging
+ from typing import Optional, Dict, TypedDict, Any, List, Union
+ from datetime import date
+
+ logger = logging.getLogger(__name__)
+
+ class SummaryStats(TypedDict):
+     total_records: int
+     total_listening_time: int  # in minutes
+     columns: list[str]
+     date_range: Optional[Dict[str, str]]  # {'start': 'YYYY-MM-DD', 'end': 'YYYY-MM-DD'}
+     unique_tracks: int
+     unique_artists: int
+
+ def query_data(
+     df: pl.DataFrame | pl.LazyFrame,
+     where: Optional[Union[pl.Expr, List[pl.Expr]]] = None,
+     select: Optional[List[str]] = None,
+     limit: Optional[int] = None,
+     sort_by: Optional[str] = None,
+     descending: bool = True
+ ) -> pl.DataFrame | pl.LazyFrame:
+     """
+     Query the Spotify listening history data with filtering, selection, and sorting.
+     Returns a frame of the same type (eager or lazy) as the input.
+     """
+     if df is None:
+         return pl.DataFrame()
+     # LazyFrame has no is_empty(); only short-circuit for eager frames
+     if isinstance(df, pl.DataFrame) and df.is_empty():
+         return df
+
+     if where is not None:
+         if isinstance(where, list):
+             if where:
+                 df = df.filter(pl.all_horizontal(where))
+         else:
+             df = df.filter(where)
+
+     if select is not None:
+         df = df.select(select)
+
+     if sort_by is not None:
+         df = df.sort(sort_by, descending=descending)
+
+     if limit is not None:
+         df = df.head(limit)
+
+     return df
+
+ def aggregate_table(
+     df: pl.DataFrame,
+     group_by: List[str],
+     metrics: Dict[str, Any],
+     where: Optional[Union[pl.Expr, List[pl.Expr]]] = None,
+     sort_by: Optional[str] = None,
+     descending: bool = True,
+     limit: Optional[int] = None,
+ ) -> pl.DataFrame:
+     """
+     Aggregate the data by grouping and applying metrics.
+
+     `metrics` maps a column name to an aggregation spec: a function name
+     ("sum", "mean", "count", "n_unique"), a (function, alias) tuple,
+     or a list of either.
+     """
+     if df is None or df.is_empty():
+         return pl.DataFrame()
+
+     # Apply filters
+     if where is not None:
+         if isinstance(where, list):
+             if where:
+                 df = df.filter(pl.all_horizontal(where))
+         else:
+             df = df.filter(where)
+
+     # Build aggregation expressions
+     agg_exprs_dict = {}
+
+     for col, agg_func_specs in metrics.items():
+         # Normalize to list of specs for uniform processing
+         if not isinstance(agg_func_specs, list):
+             specs = [agg_func_specs]
+         else:
+             specs = agg_func_specs
+
+         for spec in specs:
+             # Handle tuple format: (function, custom_alias)
+             if isinstance(spec, tuple):
+                 func, custom_alias = spec
+             else:
+                 func = spec
+                 custom_alias = None
+
+             # Determine alias name
+             alias_name = custom_alias if custom_alias else f"{col}_{func}"
+
+             # Build aggregation expression
+             if func == "sum":
+                 expr = pl.sum(col).alias(alias_name)
+             elif func == "mean":
+                 expr = pl.mean(col).alias(alias_name)
+             elif func == "count":
+                 # pl.count(col) is deprecated in recent Polars; count via the column expression
+                 expr = pl.col(col).count().alias(alias_name)
+             elif func == "n_unique":
+                 expr = pl.n_unique(col).alias(alias_name)
+             else:
+                 raise ValueError(f"Unsupported aggregation: {func}")
+
+             agg_exprs_dict[alias_name] = expr
+
+     result = df.group_by(group_by).agg(list(agg_exprs_dict.values()))
+
+     if sort_by is not None:
+         result = result.sort(sort_by, descending=descending)
+
+     if limit is not None:
+         result = result.head(limit)
+
+     return result
+
+ def get_summary(
+     df: pl.DataFrame,
+     start_date: Optional[date] = None,
+     end_date: Optional[date] = None
+ ) -> SummaryStats:
+     """
+     Get summary statistics for listening history.
+     """
+     filters = []
+     if start_date:
+         filters.append(pl.col("date") >= start_date)
+     if end_date:
+         filters.append(pl.col("date") <= end_date)
+
+     df_filtered = query_data(df, where=filters)
+
+     if df_filtered is None or df_filtered.is_empty():
+         return {
+             'total_records': 0,
+             'total_listening_time': 0,
+             'columns': list(df.columns) if df is not None else [],
+             'date_range': None,
+             'unique_tracks': 0,
+             'unique_artists': 0
+         }
+
+     # Perform calculations in a single selection for optimal performance
+     metrics = []
+     if 'ms_played' in df_filtered.columns:
+         metrics.append(pl.col('ms_played').sum().dt.total_minutes().alias('total_min'))
+     if 'date' in df_filtered.columns:
+         metrics.extend([
+             pl.col('date').min().alias('start_date'),
+             pl.col('date').max().alias('end_date')
+         ])
+     if 'track_uri' in df_filtered.columns:
+         metrics.append(pl.col('track_uri').n_unique().alias('unique_tracks'))
+     elif 'track' in df_filtered.columns:
+         metrics.append(pl.col('track').n_unique().alias('unique_tracks'))
+
+     if 'artist' in df_filtered.columns:
+         metrics.append(pl.col('artist').n_unique().alias('unique_artists'))
+
+     results = df_filtered.select(metrics).to_dicts()[0]
+
+     return {
+         'total_records': df_filtered.height,
+         'total_listening_time': int(results.get('total_min') or 0),
+         'columns': list(df_filtered.columns),
+         'date_range': {
+             'start': str(results['start_date']),
+             'end': str(results['end_date'])
+         } if results.get('start_date') else None,
+         'unique_tracks': int(results.get('unique_tracks', 0)),
+         'unique_artists': int(results.get('unique_artists', 0))
+     }
+
+ def get_top_artists(
+     df: pl.DataFrame,
+     k: int = 5,
+     start_date: Optional[date] = None,
+     end_date: Optional[date] = None
+ ) -> pl.DataFrame:
+     """
+     Get top k artists by total listening time in minutes.
+     """
+     filters = []
+     if start_date:
+         filters.append(pl.col("date") >= start_date)
+     if end_date:
+         filters.append(pl.col("date") <= end_date)
+
+     result = aggregate_table(
+         df,
+         group_by=["artist"],
+         metrics={
+             "ms_played": ("sum", "total_ms"),
+             "track": [("count", "total_tracks_played"), ("n_unique", "unique_listened_tracks")]
+         },
+         where=filters,
+         sort_by="total_ms",
+         descending=True,
+         limit=k
+     )
+     return result.with_columns(
+         minutes_played=pl.col("total_ms").dt.total_minutes().round(0).cast(pl.Int64),
+         hours_played=pl.col("total_ms").dt.total_hours().round(0).cast(pl.Int64),
+         ratio_uniq_over_total=(pl.col("unique_listened_tracks") / pl.col("total_tracks_played")).round(2),
+     ).drop("total_ms")
+
+ def get_top_tracks(
+     df: pl.DataFrame,
+     k: int = 5,
+     artist: Optional[str] = None,
+     start_date: Optional[date] = None,
+     end_date: Optional[date] = None
+ ) -> pl.DataFrame:
+     """
+     Get top k tracks by play count, optionally restricted to a single artist.
+     """
+     where = []
+     if artist:
+         where.append(pl.col("artist").str.to_lowercase() == artist.lower())
+     if start_date:
+         where.append(pl.col("date") >= start_date)
+     if end_date:
+         where.append(pl.col("date") <= end_date)
+
+     result = aggregate_table(
+         df,
+         group_by=["track", "artist", "album"],
+         metrics={"track": ("count", "play_count"),
+                  "ms_played": ("sum", "total_ms")},
+         where=where,
+         sort_by="play_count",
+         descending=True,
+         limit=k
+     )
+     return result.with_columns(
+         minutes_played=pl.col("total_ms").dt.total_minutes().round(0).cast(pl.Int64)
+     ).drop("total_ms")
+
+ def get_monthly_listening_trend(
+     df: pl.DataFrame,
+     start_date: Optional[date] = None,
+     end_date: Optional[date] = None
+ ) -> pl.DataFrame:
+     """
+     Get monthly listening trend (total listening time per month).
+     """
+     where = []
+     if start_date:
+         where.append(pl.col("date") >= start_date)
+     if end_date:
+         where.append(pl.col("date") <= end_date)
+
+     result = aggregate_table(
+         df,
+         group_by=["year", "month"],
+         metrics={"ms_played": ("sum", "total_ms"),
+                  "track": [
+                      ("count", "total_tracks_played"),
+                      ("n_unique", "unique_listened_tracks")
+                  ]
+                  },
+         where=where,
+     )
+
+     if result.is_empty():
+         return result
+
+     return result.with_columns(
+         total_minutes=pl.col("total_ms").dt.total_minutes().round(0).cast(pl.Int64),
+         total_hours=pl.col("total_ms").dt.total_hours().round(0).cast(pl.Int64),
+         # Rebuild a real date (first day of month) from year + abbreviated month name
+         month_label=pl.format("{}-{}-1", pl.col("year"), pl.col("month"))
+             .str.to_date("%Y-%b-%d")
+     ).sort("month_label")
+
+ def get_weekly_listening_trend(
+     df: pl.DataFrame,
+     start_date: Optional[date] = None,
+     end_date: Optional[date] = None
+ ) -> pl.DataFrame:
+     """
+     Get weekly and time-of-day listening trend,
+     grouped by daytime bucket (Night, Morning, Afternoon, Evening).
+     """
+     where = []
+     if start_date:
+         where.append(pl.col("date") >= start_date)
+     if end_date:
+         where.append(pl.col("date") <= end_date)
+
+     if df is None or df.is_empty():
+         return pl.DataFrame()
+
+     # Apply filters
+     if where:
+         df = df.filter(pl.all_horizontal(where))
+
+     result = df.with_columns(
+         time_range=pl.col("hour").cut(
+             breaks=[5, 11, 17, 23],  # breaks into 0-5, 6-11, 12-17, 18-23
+             labels=["Night", "Morning", "Afternoon", "Evening", "Night"]
+         ),
+         weekday_idx=pl.col("timestamp").dt.weekday()
+     ).group_by(
+         ["weekday", "weekday_idx", "time_range"]
+     ).agg(
+         total_minutes=pl.col("ms_played").sum().dt.total_minutes().round(0).cast(pl.Int64),
+         total_tracks_played=pl.col("track").count(),
+         unique_listened_tracks=pl.col("track").n_unique()
+     ).sort(["weekday_idx", "time_range"])
+
+     return result
+
+ def get_raw_df(df: pl.DataFrame,
+                limit: int,
+                start_date: Optional[date] = None,
+                end_date: Optional[date] = None
+                ) -> pl.DataFrame:
+     """
+     Get raw listening history data with optional filtering and limit.
+     """
+     filters = []
+     if start_date:
+         filters.append(pl.col("date") >= start_date)
+     if end_date:
+         filters.append(pl.col("date") <= end_date)
+
+     return query_data(
+         df,
+         where=filters if filters else None,
+         limit=limit
+     )
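
A few usage sketches for the module above. These are hedged illustrations, not package documentation: the directory path and all filter values are made up, and history_df stands for a frame produced by SpotifyDataLoader.

import polars as pl
from datetime import date
from spotify_dataloader import SpotifyDataLoader, query_data, aggregate_table, get_top_tracks

loader = SpotifyDataLoader(directory="data/spotify_history")  # placeholder path
history_df = loader.df

# query_data: filter with Polars expressions, then project/sort/limit
recent = query_data(
    history_df,
    where=[pl.col("artist") == "Some Artist",               # hypothetical value
           pl.col("ms_played") > pl.duration(seconds=30)],  # ms_played is a Duration column
    select=["timestamp", "track", "album"],
    sort_by="timestamp",
    limit=10,
)

# aggregate_table: metrics accept "func", ("func", alias), or a list of either
per_album = aggregate_table(
    history_df,
    group_by=["album"],
    metrics={
        "ms_played": ("sum", "total_ms"),           # tuple form: custom alias
        "track": ["n_unique", ("count", "plays")],  # list form; "n_unique" aliases to track_n_unique
    },
    sort_by="plays",
    limit=10,
)

# Convenience wrapper: one artist's five most-played tracks in 2024
top = get_top_tracks(history_df, k=5, artist="Some Artist",
                     start_date=date(2024, 1, 1), end_date=date(2024, 12, 31))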
@@ -0,0 +1,218 @@ spotify_dataloader/data_loader.py
+ """
+ Data loader module for Spotify JSON history files.
+ """
+ import logging
+ import polars as pl
+ from pathlib import Path
+ from typing import Any, Optional
+ from pydantic import ValidationError
+ from .models import JsonTrackRecord, Track, MONTHS, WEEKDAYS
+
+ class SpotifyDataLoader:
+     """
+     Loads and processes Spotify listening history from JSON files.
+     The schema of the processed DataFrame matches the `Track` Pydantic model.
+     See `packages/dataloader/spotify_dataloader/models.py` for full field definitions.
+     """
+
+     def __init__(
+         self,
+         directory: Optional[Path] = None,
+         file_pattern: str = "Streaming*.json",
+         strict_validation: bool = False,
+     ):
+         """
+         Initialize the data loader.
+
+         Args:
+             directory: Path to directory containing Spotify JSON files. Must be specified.
+             file_pattern: Glob pattern for files to load (default: "Streaming*.json")
+             strict_validation: If True, raise an error on failed sample validation
+                 instead of only logging a warning.
+         """
+         if directory is None:
+             raise ValueError("directory must be specified; no default is available in package mode")
+         self.data_dir = Path(directory).resolve()
+
+         self.file_pattern = file_pattern
+         self.strict_validation = strict_validation
+
+         # initialize logging pattern
+         self._logger_prefix = (
+             f"{self.__class__.__module__}."
+             f"{self.__class__.__name__}"
+         )
+         # initialize df
+         self._df: pl.DataFrame | None = None
+         self._is_initialized: bool = False
+         # NOTE: Lazy loading - initialize_data() is called on first access via the df property
+
+     def _get_logger(self, method_name: str):
+         return logging.getLogger(f"{self._logger_prefix}.{method_name}")
+
+     # methods to get dataframes
+     @property
+     def df(self) -> Optional[pl.DataFrame]:
+         """Lazy loading: initialize data on first access."""
+         if not self._is_initialized:
+             self.initialize_data()
+             self._is_initialized = True
+         return self._df
+
+     @property
+     def lazy(self) -> pl.LazyFrame:
+         if self.df is None:  # Use the property to trigger lazy initialization
+             raise RuntimeError("Data not loaded")
+         return self._df.lazy()
+
+     def initialize_data(self) -> None:
+         """
+         Process raw JSON data into a structured Polars DataFrame.
+         """
+         logger = self._get_logger('initialize_data')
+         logger.info("Processing raw JSON data into structured DataFrame")
+         df = self._read_json_files(self.data_dir, self.file_pattern)
+         if df.is_empty():
+             self._df = pl.DataFrame()
+         else:
+             self._df = self._preprocess(df)
+
+     def _read_json_files(self, directory: Path, pattern: str = "Streaming*.json") -> pl.DataFrame:
+         """Read JSON files in a directory matching the pattern into a Polars DataFrame."""
+         logger = self._get_logger('_read_json_files')
+         # Use rglob to recursively find files matching the pattern
+         json_files = list(directory.rglob(pattern))
+         logger.info(f"Found {len(json_files)} JSON files matching '{pattern}' in {directory}")
+         if not json_files:
+             logger.warning(f"No JSON files found in {directory}")
+             return pl.DataFrame()
+         else:
+             dfs = []
+             for file in json_files:
+                 try:
+                     # Increase infer_schema_length to handle mixed data types
+                     df = pl.read_json(file, infer_schema_length=10000)
+                     dfs.append(df)
+                     logger.info(f"Loaded {file.name}: {df.height} records")
+                 except Exception as e:
+                     logger.error(f"Failed to load {file.name}: {e}")
+                     continue
+
+             if not dfs:
+                 logger.warning("No valid JSON files could be loaded")
+                 return pl.DataFrame()
+
+             combined_df = pl.concat(dfs, how="diagonal_relaxed")  # diagonal_relaxed tolerates mismatched schemas
+             logger.info(f"Total {combined_df.height} records loaded.")
+             return combined_df
+
+     def _preprocess(self, df: pl.DataFrame) -> pl.DataFrame:
+         """
+         Normalize Spotify history to the standard schema with staged validation.
+         """
+         logger = self._get_logger('_preprocess')
+         initial_count = df.height
+         logger.info(f"Starting preprocessing of {initial_count} raw records")
+
+         # --- Stage 1: Cleanup & Raw Validation ---
+         working_df = df
+         if "ms_played" not in working_df.columns and "msPlayed" in working_df.columns:
+             working_df = working_df.rename({"msPlayed": "ms_played"})
+
+         # Validate raw data sample
+         if not working_df.is_empty():
+             # Filter non-nulls for raw validation sample
+             raw_sample_pool = working_df.filter(pl.col("master_metadata_track_name").is_not_null())
+             self._validate_sample(raw_sample_pool, JsonTrackRecord, sample_size=10)
+
+         # --- Stage 2: Filtering ---
+         logger.info("Filtering records: removing null tracks and zero playtime")
+         working_df = working_df.filter(
+             (pl.col("master_metadata_track_name").is_not_null()) &
+             (pl.col("ms_played") > 0)
+         )
+         filtered_count = working_df.height
+         logger.info(f"Filtered records: {initial_count} -> {filtered_count} (Dropped {initial_count - filtered_count})")
+
+         # --- Stage 3: Transformation ---
+         processed_df = (
+             working_df
+             .select([
+                 pl.col("ts").str.strptime(
+                     pl.Datetime,
+                     format="%+"
+                 ).dt.replace_time_zone(
+                     "UTC"
+                 ).alias("timestamp"),
+                 pl.col("ts").cast(pl.Utf8).alias("ts"),
+                 pl.col("ms_played").cast(pl.Duration("ms")),
+                 pl.col("master_metadata_track_name").alias("track"),
+                 pl.col("master_metadata_album_artist_name").alias("artist"),
+                 pl.col("master_metadata_album_album_name").alias("album"),
+                 pl.col("spotify_track_uri").alias("track_uri"),
+                 pl.col("conn_country"),
+                 pl.col("platform"),
+                 pl.col("reason_start"),
+                 pl.col("reason_end"),
+                 pl.col("shuffle"),
+                 pl.col("skipped")
+             ])
+             .with_columns(
+                 year=pl.col("timestamp").dt.year(),
+                 # Use pl.Enum for month and weekday for:
+                 # 1. Memory Efficiency: stored as integers internally, strings only for display.
+                 # 2. Performance: faster grouping, filtering, and sorting than strings.
+                 # 3. Logical Sorting: ensures 'Jan' < 'Feb' and 'Mon' < 'Tue' instead of alphabetical order.
+                 # 4. Data Integrity: strictly enforces that only values in our constants are allowed.
+                 month=pl.col("timestamp").dt.strftime("%b").cast(pl.Enum(MONTHS)),
+                 weekday=pl.col("timestamp").dt.strftime("%a").cast(pl.Enum(WEEKDAYS)),
+                 hour=pl.col("timestamp").dt.hour(),
+                 date=pl.col("timestamp").dt.date(),
+             )
+         )
+
+         # --- Stage 4: Processed Validation ---
+         if not processed_df.is_empty():
+             self._validate_sample(processed_df, Track, sample_size=10)
+
+         logger.info("Preprocessing complete")
+         return processed_df
+
+     # Validation helper
+     def _validate_sample(self, df: pl.DataFrame, model_class: Any, sample_size: int = 10):
+         """
+         Validate a sample of the data against a Pydantic model.
+
+         Args:
+             df: Polars DataFrame to sample from
+             model_class: Pydantic model class to validate against
+             sample_size: Number of records to sample (default: 10)
+         """
+         logger = self._get_logger('_validate_sample')
+         if df.is_empty():
+             return
+
+         # Take a sample (use head if df is small, otherwise sample)
+         sample_df = df.head(sample_size) if df.height <= sample_size else df.sample(n=sample_size)
+         records = sample_df.to_dicts()
+
+         errors = []
+         for i, record in enumerate(records):
+             try:
+                 model_class.model_validate(record)
+             except ValidationError as e:
+                 # Capture the first few errors for the log
+                 error_details = e.errors()[0]
+                 msg = f"Row {i} | Field: {error_details['loc']} | Error: {error_details['msg']}"
+                 errors.append(msg)
+
+         if errors:
+             err_msg = f"Validation failed for {model_class.__name__} in {len(errors)}/{len(records)} sampled rows:\n" + "\n".join(errors[:5])
+             if self.strict_validation:
+                 logger.error(err_msg)
+                 # Pydantic v2's ValidationError cannot be constructed from a plain message; raise ValueError instead
+                 raise ValueError(err_msg)
+             else:
+                 logger.warning(err_msg)
+         else:
+             logger.info(f"Successfully validated {len(records)} rows against {model_class.__name__}")
@@ -0,0 +1,77 @@ spotify_dataloader/models.py
+ '''
+ Pydantic models for Spotify tracks in the DataLoader.
+ '''
+ from pydantic import BaseModel
+ from datetime import datetime, date, timedelta
+ from typing import Optional, Literal
+
+ # --- Constants for Data Consistency ---
+ # These lists serve as the single source of truth for both Pydantic validation
+ # and Polars categorical/Enum types.
+ MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
+ WEEKDAYS = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"]
+ REASON_START = [
+     "clickrow", "trackdone", "appload", "fwdbtn", "backbtn",
+     "remote", "playbtn", "unknown", "switched-to-audio", "switched-to-video"
+ ]
+ REASON_END = [
+     "trackerror", "trackdone", "endplay", "logout", "fwdbtn",
+     "backbtn", "unexpected-exit", "remote", "unexpected-exit-while-paused",
+     "unknown"
+ ]
+
+ class JsonTrackRecord(BaseModel):
+     '''
+     A model representing the raw JSON structure of a Spotify track record
+     as found in the Spotify listening history data.
+     Input JSON records are expected to match the fields defined below.
+     '''
+     ts: str  # ISO 8601 format
+     platform: Optional[str]  # e.g., "Android"
+     ms_played: int
+     conn_country: Optional[str]  # country code, e.g., "TW"
+     ip_addr: Optional[str]
+     master_metadata_track_name: str
+     master_metadata_album_artist_name: str
+     master_metadata_album_album_name: str
+     spotify_track_uri: str  # required, format: "spotify:track:6KE0cMC0Sa9NJMt8dbmAp8"
+     # ==== audiobook/podcast related fields ====
+     episode_name: Optional[str] = None
+     episode_show_name: Optional[str] = None
+     spotify_episode_uri: Optional[str] = None
+     audiobook_title: Optional[str] = None
+     audiobook_uri: Optional[str] = None
+     audiobook_chapter_uri: Optional[str] = None
+     audiobook_chapter_title: Optional[str] = None
+     # Additional fields tracking playing behavior
+     reason_start: Literal[*REASON_START]
+     reason_end: Literal[*REASON_END]
+     shuffle: bool
+     skipped: bool
+     offline: Optional[bool] = None
+     offline_timestamp: Optional[int] = None  # semantics of this field are unclear
+     incognito_mode: Optional[bool] = None
+
+
+ class Track(BaseModel):
+     '''
+     A model representing a processed Spotify track row in the DataLoader.
+     '''
+     timestamp: datetime  # datetime parsed from `ts` (stored as UTC by the loader)
+     ts: str  # raw string timestamp from JSON
+     ms_played: timedelta  # duration format
+     track: str
+     artist: str
+     album: str
+     track_uri: str
+     conn_country: str
+     platform: str
+     reason_start: str
+     reason_end: str
+     shuffle: bool
+     skipped: bool
+     year: int
+     month: Literal[*MONTHS]
+     weekday: Literal[*WEEKDAYS]
+     hour: int
+     date: date  # yyyy-mm-dd date format
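
As the comment on the constants notes, MONTHS and WEEKDAYS do double duty: Literal[...] fields for Pydantic validation and pl.Enum dtypes in the loader. A tiny illustration of why the Enum matters for sorting (the series values are made up):

import polars as pl
from spotify_dataloader.models import MONTHS

s = pl.Series(["Mar", "Jan", "Feb"], dtype=pl.Enum(MONTHS))
print(s.sort().to_list())  # ['Jan', 'Feb', 'Mar'], calendar order rather than alphabetical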