spotify-analytics-dataloader 0.1.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,224 @@ .gitignore
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+ #poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ #pdm.lock
+ #pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ #pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Cursor
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+ # refer to https://docs.cursor.com/context/ignore-files
+ .cursorignore
+ .cursorindexingignore
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
+
+ # Git worktrees
+ .worktrees/
+ .claude/worktrees/
+
+ # Project specific
+ # Vector database and user data
+ data/vectordb/
+ # sqlite databases and wal, shm files - ignore all but keep directory
+ data/*.db*
+ # Exclude user's actual Spotify data, but keep sample data
+ data/spotify_history/*
+ !data/spotify_history/sample_history.json
+ # logs
+ logs/
+ # Internal planning docs (superpowers/Claude Code session artifacts)
+ docs/superpowers/
@@ -0,0 +1,21 @@ LICENSE
+ MIT License
+
+ Copyright (c) 2025 WC Chang
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,25 @@ PKG-INFO
+ Metadata-Version: 2.4
+ Name: spotify-analytics-dataloader
+ Version: 0.1.0
+ Summary: Data loading and transformation utilities for Spotify listening history exports.
+ Project-URL: Homepage, https://github.com/wcnoname5/spotify-ai-analytics
+ Project-URL: Repository, https://github.com/wcnoname5/spotify-ai-analytics
+ Project-URL: Issues, https://github.com/wcnoname5/spotify-ai-analytics/issues
+ Author: WC Chang
+ License: MIT
+ License-File: LICENSE
+ Keywords: dataloader,listening-history,polars,spotify
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Python: >=3.12
+ Requires-Dist: polars>=1.0
+ Requires-Dist: pydantic>=2.0
+ Description-Content-Type: text/markdown
+
+ # spotify-analytics-dataloader
+
+ Data loading and transformation utilities for Spotify listening history exports.
@@ -0,0 +1,3 @@ README.md
+ # spotify-analytics-dataloader
+
+ Data loading and transformation utilities for Spotify listening history exports.
@@ -0,0 +1,34 @@ pyproject.toml
+ [project]
+ name = "spotify-analytics-dataloader"
+ version = "0.1.0"
+ description = "Data loading and transformation utilities for Spotify listening history exports."
+ readme = "README.md"
+ license = { text = "MIT" }
+ license-files = ["LICENSE"]
+ authors = [{ name = "WC Chang" }]
+ requires-python = ">=3.12"
+ keywords = ["spotify", "dataloader", "polars", "listening-history"]
+ classifiers = [
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.12",
+     "Programming Language :: Python :: 3.13",
+     "License :: OSI Approved :: MIT License",
+     "Operating System :: OS Independent",
+     "Intended Audience :: Developers",
+ ]
+ dependencies = [
+     "polars>=1.0",
+     "pydantic>=2.0",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/wcnoname5/spotify-ai-analytics"
+ Repository = "https://github.com/wcnoname5/spotify-ai-analytics"
+ Issues = "https://github.com/wcnoname5/spotify-ai-analytics/issues"
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["spotify_dataloader"]
@@ -0,0 +1,26 @@ spotify_dataloader/__init__.py
+ from .data_loader import SpotifyDataLoader
+ from .analysis_functions import (
+     SummaryStats, query_data, aggregate_table, get_summary,
+     get_top_artists, get_top_tracks, get_monthly_listening_trend,
+     get_weekly_listening_trend, get_raw_df
+ )
+ from .models import Track, JsonTrackRecord, MONTHS, WEEKDAYS
+ from . import analysis_functions
+
+ __all__ = [
+     "SpotifyDataLoader",
+     "Track",
+     "JsonTrackRecord",
+     "MONTHS",
+     "WEEKDAYS",
+     "SummaryStats",
+     "query_data",
+     "aggregate_table",
+     "get_summary",
+     "get_top_artists",
+     "get_top_tracks",
+     "get_monthly_listening_trend",
+     "get_weekly_listening_trend",
+     "get_raw_df",
+     "analysis_functions",
+ ]
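
The exports above support a short end-to-end flow. A minimal usage sketch, assuming a directory of Spotify "Streaming*.json" exports (the path below is a placeholder, and the printed keys come from the SummaryStats definition later in this diff):

import logging
from spotify_dataloader import SpotifyDataLoader, get_summary, get_top_artists

logging.basicConfig(level=logging.INFO)

loader = SpotifyDataLoader(directory="data/spotify_history")  # placeholder path
df = loader.df  # first access triggers loading and preprocessing
stats = get_summary(df)
print(stats["total_records"], "plays,", stats["total_listening_time"], "minutes")
print(get_top_artists(df, k=5))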
@@ -0,0 +1,340 @@ spotify_dataloader/analysis_functions.py
+ import polars as pl
+ import logging
+ from typing import Optional, Dict, TypedDict, Any, List, Union
+ from datetime import date
+
+ logger = logging.getLogger(__name__)
+
+ class SummaryStats(TypedDict):
+     total_records: int
+     total_listening_time: int  # in minutes
+     columns: list[str]
+     date_range: Optional[Dict[str, str]]  # {'start': 'YYYY-MM-DD', 'end': 'YYYY-MM-DD'}
+     unique_tracks: int
+     unique_artists: int
+
+ def query_data(
+     df: pl.DataFrame | pl.LazyFrame,
+     where: Optional[Union[pl.Expr, List[pl.Expr]]] = None,
+     select: Optional[List[str]] = None,
+     limit: Optional[int] = None,
+     sort_by: Optional[str] = None,
+     descending: bool = True
+ ) -> pl.DataFrame | pl.LazyFrame:
+     """
+     Query the Spotify listening history data with filtering, selection, and sorting.
+     Returns a frame of the same type (eager or lazy) as the input.
+     """
+     if df is None:
+         return pl.DataFrame()
+     # LazyFrame has no is_empty(); only short-circuit for eager frames
+     if isinstance(df, pl.DataFrame) and df.is_empty():
+         return df
+
+     if where is not None:
+         if isinstance(where, list):
+             if where:
+                 df = df.filter(pl.all_horizontal(where))
+         else:
+             df = df.filter(where)
+
+     if select is not None:
+         df = df.select(select)
+
+     if sort_by is not None:
+         df = df.sort(sort_by, descending=descending)
+
+     if limit is not None:
+         df = df.head(limit)
+
+     return df
+
+ def aggregate_table(
+     df: pl.DataFrame,
+     group_by: List[str],
+     metrics: Dict[str, Any],
+     where: Optional[Union[pl.Expr, List[pl.Expr]]] = None,
+     sort_by: Optional[str] = None,
+     descending: bool = True,
+     limit: Optional[int] = None,
+ ) -> pl.DataFrame:
+     """
+     Aggregate the data by grouping and applying metrics.
+
+     `metrics` maps a column name to an aggregation spec: a function name
+     ("sum", "mean", "count", "n_unique"), a (function, alias) tuple,
+     or a list of either.
+     """
+     if df is None or df.is_empty():
+         return pl.DataFrame()
+
+     # Apply filters
+     if where is not None:
+         if isinstance(where, list):
+             if where:
+                 df = df.filter(pl.all_horizontal(where))
+         else:
+             df = df.filter(where)
+
+     # Build aggregation expressions
+     agg_exprs_dict = {}
+
+     for col, agg_func_specs in metrics.items():
+         # Normalize to list of specs for uniform processing
+         if not isinstance(agg_func_specs, list):
+             specs = [agg_func_specs]
+         else:
+             specs = agg_func_specs
+
+         for spec in specs:
+             # Handle tuple format: (function, custom_alias)
+             if isinstance(spec, tuple):
+                 func, custom_alias = spec
+             else:
+                 func = spec
+                 custom_alias = None
+
+             # Determine alias name
+             alias_name = custom_alias if custom_alias else f"{col}_{func}"
+
+             # Build aggregation expression
+             if func == "sum":
+                 expr = pl.sum(col).alias(alias_name)
+             elif func == "mean":
+                 expr = pl.mean(col).alias(alias_name)
+             elif func == "count":
+                 # pl.count(col) is deprecated in recent Polars; count via the column expression
+                 expr = pl.col(col).count().alias(alias_name)
+             elif func == "n_unique":
+                 expr = pl.n_unique(col).alias(alias_name)
+             else:
+                 raise ValueError(f"Unsupported aggregation: {func}")
+
+             agg_exprs_dict[alias_name] = expr
+
+     result = df.group_by(group_by).agg(list(agg_exprs_dict.values()))
+
+     if sort_by is not None:
+         result = result.sort(sort_by, descending=descending)
+
+     if limit is not None:
+         result = result.head(limit)
+
+     return result
+
+ def get_summary(
+     df: pl.DataFrame,
+     start_date: Optional[date] = None,
+     end_date: Optional[date] = None
+ ) -> SummaryStats:
+     """
+     Get summary statistics for listening history.
+     """
+     filters = []
+     if start_date:
+         filters.append(pl.col("date") >= start_date)
+     if end_date:
+         filters.append(pl.col("date") <= end_date)
+
+     df_filtered = query_data(df, where=filters)
+
+     if df_filtered is None or df_filtered.is_empty():
+         return {
+             'total_records': 0,
+             'total_listening_time': 0,
+             'columns': list(df.columns) if df is not None else [],
+             'date_range': None,
+             'unique_tracks': 0,
+             'unique_artists': 0
+         }
+
+     # Perform calculations in a single selection for optimal performance
+     metrics = []
+     if 'ms_played' in df_filtered.columns:
+         metrics.append(pl.col('ms_played').sum().dt.total_minutes().alias('total_min'))
+     if 'date' in df_filtered.columns:
+         metrics.extend([
+             pl.col('date').min().alias('start_date'),
+             pl.col('date').max().alias('end_date')
+         ])
+     if 'track_uri' in df_filtered.columns:
+         metrics.append(pl.col('track_uri').n_unique().alias('unique_tracks'))
+     elif 'track' in df_filtered.columns:
+         metrics.append(pl.col('track').n_unique().alias('unique_tracks'))
+
+     if 'artist' in df_filtered.columns:
+         metrics.append(pl.col('artist').n_unique().alias('unique_artists'))
+
+     results = df_filtered.select(metrics).to_dicts()[0]
+
+     return {
+         'total_records': df_filtered.height,
+         'total_listening_time': int(results.get('total_min') or 0),
+         'columns': list(df_filtered.columns),
+         'date_range': {
+             'start': str(results['start_date']),
+             'end': str(results['end_date'])
+         } if results.get('start_date') else None,
+         'unique_tracks': int(results.get('unique_tracks', 0)),
+         'unique_artists': int(results.get('unique_artists', 0))
+     }
+
+ def get_top_artists(
+     df: pl.DataFrame,
+     k: int = 5,
+     start_date: Optional[date] = None,
+     end_date: Optional[date] = None
+ ) -> pl.DataFrame:
+     """
+     Get top k artists by total listening time in minutes.
+     """
+     filters = []
+     if start_date:
+         filters.append(pl.col("date") >= start_date)
+     if end_date:
+         filters.append(pl.col("date") <= end_date)
+
+     result = aggregate_table(
+         df,
+         group_by=["artist"],
+         metrics={
+             "ms_played": ("sum", "total_ms"),
+             "track": [("count", "total_tracks_played"), ("n_unique", "unique_listened_tracks")]
+         },
+         where=filters,
+         sort_by="total_ms",
+         descending=True,
+         limit=k
+     )
+     return result.with_columns(
+         minutes_played=pl.col("total_ms").dt.total_minutes().round(0).cast(pl.Int64),
+         hours_played=pl.col("total_ms").dt.total_hours().round(0).cast(pl.Int64),
+         ratio_uniq_over_total=(pl.col("unique_listened_tracks") / pl.col("total_tracks_played")).round(2),
+     ).drop("total_ms")
+
+ def get_top_tracks(
+     df: pl.DataFrame,
+     k: int = 5,
+     artist: Optional[str] = None,
+     start_date: Optional[date] = None,
+     end_date: Optional[date] = None
+ ) -> pl.DataFrame:
+     """
+     Get top k tracks by play count, optionally restricted to a single artist.
+     """
+     where = []
+     if artist:
+         where.append(pl.col("artist").str.to_lowercase() == artist.lower())
+     if start_date:
+         where.append(pl.col("date") >= start_date)
+     if end_date:
+         where.append(pl.col("date") <= end_date)
+
+     result = aggregate_table(
+         df,
+         group_by=["track", "artist", "album"],
+         metrics={"track": ("count", "play_count"),
+                  "ms_played": ("sum", "total_ms")},
+         where=where,
+         sort_by="play_count",
+         descending=True,
+         limit=k
+     )
+     return result.with_columns(
+         minutes_played=pl.col("total_ms").dt.total_minutes().round(0).cast(pl.Int64)
+     ).drop("total_ms")
+
+ def get_monthly_listening_trend(
+     df: pl.DataFrame,
+     start_date: Optional[date] = None,
+     end_date: Optional[date] = None
+ ) -> pl.DataFrame:
+     """
+     Get monthly listening trend (total listening time per month).
+     """
+     where = []
+     if start_date:
+         where.append(pl.col("date") >= start_date)
+     if end_date:
+         where.append(pl.col("date") <= end_date)
+
+     result = aggregate_table(
+         df,
+         group_by=["year", "month"],
+         metrics={"ms_played": ("sum", "total_ms"),
+                  "track": [
+                      ("count", "total_tracks_played"),
+                      ("n_unique", "unique_listened_tracks")
+                  ]
+                  },
+         where=where,
+     )
+
+     if result.is_empty():
+         return result
+
+     return result.with_columns(
+         total_minutes=pl.col("total_ms").dt.total_minutes().round(0).cast(pl.Int64),
+         total_hours=pl.col("total_ms").dt.total_hours().round(0).cast(pl.Int64),
+         # Rebuild a real date (first day of month) from year + abbreviated month name
+         month_label=pl.format("{}-{}-1", pl.col("year"), pl.col("month"))
+             .str.to_date("%Y-%b-%d")
+     ).sort("month_label")
+
+ def get_weekly_listening_trend(
+     df: pl.DataFrame,
+     start_date: Optional[date] = None,
+     end_date: Optional[date] = None
+ ) -> pl.DataFrame:
+     """
+     Get weekly and time-of-day listening trend,
+     grouped by daytime bucket (Night, Morning, Afternoon, Evening).
+     """
+     where = []
+     if start_date:
+         where.append(pl.col("date") >= start_date)
+     if end_date:
+         where.append(pl.col("date") <= end_date)
+
+     if df is None or df.is_empty():
+         return pl.DataFrame()
+
+     # Apply filters
+     if where:
+         df = df.filter(pl.all_horizontal(where))
+
+     result = df.with_columns(
+         time_range=pl.col("hour").cut(
+             breaks=[5, 11, 17, 23],  # breaks into 0-5, 6-11, 12-17, 18-23
+             labels=["Night", "Morning", "Afternoon", "Evening", "Night"]
+         ),
+         weekday_idx=pl.col("timestamp").dt.weekday()
+     ).group_by(
+         ["weekday", "weekday_idx", "time_range"]
+     ).agg(
+         total_minutes=pl.col("ms_played").sum().dt.total_minutes().round(0).cast(pl.Int64),
+         total_tracks_played=pl.col("track").count(),
+         unique_listened_tracks=pl.col("track").n_unique()
+     ).sort(["weekday_idx", "time_range"])
+
+     return result
+
+ def get_raw_df(df: pl.DataFrame,
+                limit: int,
+                start_date: Optional[date] = None,
+                end_date: Optional[date] = None
+                ) -> pl.DataFrame:
+     """
+     Get raw listening history data with optional filtering and limit.
+     """
+     filters = []
+     if start_date:
+         filters.append(pl.col("date") >= start_date)
+     if end_date:
+         filters.append(pl.col("date") <= end_date)
+
+     return query_data(
+         df,
+         where=filters if filters else None,
+         limit=limit
+     )
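
A few usage sketches for the module above. These are hedged illustrations, not package documentation: the directory path and all filter values are made up, and history_df stands for a frame produced by SpotifyDataLoader.

import polars as pl
from datetime import date
from spotify_dataloader import SpotifyDataLoader, query_data, aggregate_table, get_top_tracks

loader = SpotifyDataLoader(directory="data/spotify_history")  # placeholder path
history_df = loader.df

# query_data: filter with Polars expressions, then project/sort/limit
recent = query_data(
    history_df,
    where=[pl.col("artist") == "Some Artist",               # hypothetical value
           pl.col("ms_played") > pl.duration(seconds=30)],  # ms_played is a Duration column
    select=["timestamp", "track", "album"],
    sort_by="timestamp",
    limit=10,
)

# aggregate_table: metrics accept "func", ("func", alias), or a list of either
per_album = aggregate_table(
    history_df,
    group_by=["album"],
    metrics={
        "ms_played": ("sum", "total_ms"),           # tuple form: custom alias
        "track": ["n_unique", ("count", "plays")],  # list form; "n_unique" aliases to track_n_unique
    },
    sort_by="plays",
    limit=10,
)

# Convenience wrapper: one artist's five most-played tracks in 2024
top = get_top_tracks(history_df, k=5, artist="Some Artist",
                     start_date=date(2024, 1, 1), end_date=date(2024, 12, 31))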
@@ -0,0 +1,218 @@ spotify_dataloader/data_loader.py
+ """
+ Data loader module for Spotify JSON history files.
+ """
+ import logging
+ import polars as pl
+ from pathlib import Path
+ from typing import Any, Optional
+ from pydantic import ValidationError
+ from .models import JsonTrackRecord, Track, MONTHS, WEEKDAYS
+
+ class SpotifyDataLoader:
+     """
+     Loads and processes Spotify listening history from JSON files.
+     The schema of the processed DataFrame matches the `Track` Pydantic model.
+     See `packages/dataloader/spotify_dataloader/models.py` for full field definitions.
+     """
+
+     def __init__(
+         self,
+         directory: Optional[Path] = None,
+         file_pattern: str = "Streaming*.json",
+         strict_validation: bool = False,
+     ):
+         """
+         Initialize the data loader.
+
+         Args:
+             directory: Path to directory containing Spotify JSON files. Must be specified.
+             file_pattern: Glob pattern for files to load (default: "Streaming*.json")
+             strict_validation: If True, raise an error on failed sample validation
+                 instead of only logging a warning.
+         """
+         if directory is None:
+             raise ValueError("directory must be specified; no default is available in package mode")
+         self.data_dir = Path(directory).resolve()
+
+         self.file_pattern = file_pattern
+         self.strict_validation = strict_validation
+
+         # initialize logging pattern
+         self._logger_prefix = (
+             f"{self.__class__.__module__}."
+             f"{self.__class__.__name__}"
+         )
+         # initialize df
+         self._df: pl.DataFrame | None = None
+         self._is_initialized: bool = False
+         # NOTE: Lazy loading - initialize_data() is called on first access via the df property
+
+     def _get_logger(self, method_name: str):
+         return logging.getLogger(f"{self._logger_prefix}.{method_name}")
+
+     # methods to get dataframes
+     @property
+     def df(self) -> Optional[pl.DataFrame]:
+         """Lazy loading: initialize data on first access."""
+         if not self._is_initialized:
+             self.initialize_data()
+             self._is_initialized = True
+         return self._df
+
+     @property
+     def lazy(self) -> pl.LazyFrame:
+         if self.df is None:  # Use the property to trigger lazy initialization
+             raise RuntimeError("Data not loaded")
+         return self._df.lazy()
+
+     def initialize_data(self) -> None:
+         """
+         Process raw JSON data into a structured Polars DataFrame.
+         """
+         logger = self._get_logger('initialize_data')
+         logger.info("Processing raw JSON data into structured DataFrame")
+         df = self._read_json_files(self.data_dir, self.file_pattern)
+         if df.is_empty():
+             self._df = pl.DataFrame()
+         else:
+             self._df = self._preprocess(df)
+
+     def _read_json_files(self, directory: Path, pattern: str = "Streaming*.json") -> pl.DataFrame:
+         """Read JSON files in a directory matching the pattern into a Polars DataFrame."""
+         logger = self._get_logger('_read_json_files')
+         # Use rglob to recursively find files matching the pattern
+         json_files = list(directory.rglob(pattern))
+         logger.info(f"Found {len(json_files)} JSON files matching '{pattern}' in {directory}")
+         if not json_files:
+             logger.warning(f"No JSON files found in {directory}")
+             return pl.DataFrame()
+         else:
+             dfs = []
+             for file in json_files:
+                 try:
+                     # Increase infer_schema_length to handle mixed data types
+                     df = pl.read_json(file, infer_schema_length=10000)
+                     dfs.append(df)
+                     logger.info(f"Loaded {file.name}: {df.height} records")
+                 except Exception as e:
+                     logger.error(f"Failed to load {file.name}: {e}")
+                     continue
+
+             if not dfs:
+                 logger.warning("No valid JSON files could be loaded")
+                 return pl.DataFrame()
+
+             combined_df = pl.concat(dfs, how="diagonal_relaxed")  # diagonal_relaxed tolerates mismatched schemas
+             logger.info(f"Total {combined_df.height} records loaded.")
+             return combined_df
+
+     def _preprocess(self, df: pl.DataFrame) -> pl.DataFrame:
+         """
+         Normalize Spotify history to the standard schema with staged validation.
+         """
+         logger = self._get_logger('_preprocess')
+         initial_count = df.height
+         logger.info(f"Starting preprocessing of {initial_count} raw records")
+
+         # --- Stage 1: Cleanup & Raw Validation ---
+         working_df = df
+         if "ms_played" not in working_df.columns and "msPlayed" in working_df.columns:
+             working_df = working_df.rename({"msPlayed": "ms_played"})
+
+         # Validate raw data sample
+         if not working_df.is_empty():
+             # Filter non-nulls for raw validation sample
+             raw_sample_pool = working_df.filter(pl.col("master_metadata_track_name").is_not_null())
+             self._validate_sample(raw_sample_pool, JsonTrackRecord, sample_size=10)
+
+         # --- Stage 2: Filtering ---
+         logger.info("Filtering records: removing null tracks and zero playtime")
+         working_df = working_df.filter(
+             (pl.col("master_metadata_track_name").is_not_null()) &
+             (pl.col("ms_played") > 0)
+         )
+         filtered_count = working_df.height
+         logger.info(f"Filtered records: {initial_count} -> {filtered_count} (Dropped {initial_count - filtered_count})")
+
+         # --- Stage 3: Transformation ---
+         processed_df = (
+             working_df
+             .select([
+                 pl.col("ts").str.strptime(
+                     pl.Datetime,
+                     format="%+"
+                 ).dt.replace_time_zone(
+                     "UTC"
+                 ).alias("timestamp"),
+                 pl.col("ts").cast(pl.Utf8).alias("ts"),
+                 pl.col("ms_played").cast(pl.Duration("ms")),
+                 pl.col("master_metadata_track_name").alias("track"),
+                 pl.col("master_metadata_album_artist_name").alias("artist"),
+                 pl.col("master_metadata_album_album_name").alias("album"),
+                 pl.col("spotify_track_uri").alias("track_uri"),
+                 pl.col("conn_country"),
+                 pl.col("platform"),
+                 pl.col("reason_start"),
+                 pl.col("reason_end"),
+                 pl.col("shuffle"),
+                 pl.col("skipped")
+             ])
+             .with_columns(
+                 year=pl.col("timestamp").dt.year(),
+                 # Use pl.Enum for month and weekday for:
+                 # 1. Memory Efficiency: stored as integers internally, strings only for display.
+                 # 2. Performance: faster grouping, filtering, and sorting than strings.
+                 # 3. Logical Sorting: ensures 'Jan' < 'Feb' and 'Mon' < 'Tue' instead of alphabetical order.
+                 # 4. Data Integrity: strictly enforces that only values in our constants are allowed.
+                 month=pl.col("timestamp").dt.strftime("%b").cast(pl.Enum(MONTHS)),
+                 weekday=pl.col("timestamp").dt.strftime("%a").cast(pl.Enum(WEEKDAYS)),
+                 hour=pl.col("timestamp").dt.hour(),
+                 date=pl.col("timestamp").dt.date(),
+             )
+         )
+
+         # --- Stage 4: Processed Validation ---
+         if not processed_df.is_empty():
+             self._validate_sample(processed_df, Track, sample_size=10)
+
+         logger.info("Preprocessing complete")
+         return processed_df
+
+     # Validation helper
+     def _validate_sample(self, df: pl.DataFrame, model_class: Any, sample_size: int = 10):
+         """
+         Validate a sample of the data against a Pydantic model.
+
+         Args:
+             df: Polars DataFrame to sample from
+             model_class: Pydantic model class to validate against
+             sample_size: Number of records to sample (default: 10)
+         """
+         logger = self._get_logger('_validate_sample')
+         if df.is_empty():
+             return
+
+         # Take a sample (use head if df is small, otherwise sample)
+         sample_df = df.head(sample_size) if df.height <= sample_size else df.sample(n=sample_size)
+         records = sample_df.to_dicts()
+
+         errors = []
+         for i, record in enumerate(records):
+             try:
+                 model_class.model_validate(record)
+             except ValidationError as e:
+                 # Capture the first few errors for the log
+                 error_details = e.errors()[0]
+                 msg = f"Row {i} | Field: {error_details['loc']} | Error: {error_details['msg']}"
+                 errors.append(msg)
+
+         if errors:
+             err_msg = f"Validation failed for {model_class.__name__} in {len(errors)}/{len(records)} sampled rows:\n" + "\n".join(errors[:5])
+             if self.strict_validation:
+                 logger.error(err_msg)
+                 # Pydantic v2's ValidationError cannot be constructed from a plain message; raise ValueError instead
+                 raise ValueError(err_msg)
+             else:
+                 logger.warning(err_msg)
+         else:
+             logger.info(f"Successfully validated {len(records)} rows against {model_class.__name__}")
@@ -0,0 +1,77 @@ spotify_dataloader/models.py
+ '''
+ Pydantic models for Spotify tracks in the DataLoader.
+ '''
+ from pydantic import BaseModel
+ from datetime import datetime, date, timedelta
+ from typing import Optional, Literal
+
+ # --- Constants for Data Consistency ---
+ # These lists serve as the single source of truth for both Pydantic validation
+ # and Polars categorical/Enum types.
+ MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
+ WEEKDAYS = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"]
+ REASON_START = [
+     "clickrow", "trackdone", "appload", "fwdbtn", "backbtn",
+     "remote", "playbtn", "unknown", "switched-to-audio", "switched-to-video"
+ ]
+ REASON_END = [
+     "trackerror", "trackdone", "endplay", "logout", "fwdbtn",
+     "backbtn", "unexpected-exit", "remote", "unexpected-exit-while-paused",
+     "unknown"
+ ]
+
+ class JsonTrackRecord(BaseModel):
+     '''
+     A model representing the raw JSON structure of a Spotify track record
+     as found in the Spotify listening history data.
+     Input JSON records are expected to match the fields defined below.
+     '''
+     ts: str  # ISO 8601 format
+     platform: Optional[str]  # e.g., "Android"
+     ms_played: int
+     conn_country: Optional[str]  # country code, e.g., "TW"
+     ip_addr: Optional[str]
+     master_metadata_track_name: str
+     master_metadata_album_artist_name: str
+     master_metadata_album_album_name: str
+     spotify_track_uri: str  # required, format: "spotify:track:6KE0cMC0Sa9NJMt8dbmAp8"
+     # ==== audiobook/podcast related fields ====
+     episode_name: Optional[str] = None
+     episode_show_name: Optional[str] = None
+     spotify_episode_uri: Optional[str] = None
+     audiobook_title: Optional[str] = None
+     audiobook_uri: Optional[str] = None
+     audiobook_chapter_uri: Optional[str] = None
+     audiobook_chapter_title: Optional[str] = None
+     # Additional fields tracking playing behavior
+     reason_start: Literal[*REASON_START]
+     reason_end: Literal[*REASON_END]
+     shuffle: bool
+     skipped: bool
+     offline: Optional[bool] = None
+     offline_timestamp: Optional[int] = None  # semantics of this field are unclear
+     incognito_mode: Optional[bool] = None
+
+
+ class Track(BaseModel):
+     '''
+     A model representing a processed Spotify track row in the DataLoader.
+     '''
+     timestamp: datetime  # datetime parsed from `ts` (stored as UTC by the loader)
+     ts: str  # raw string timestamp from JSON
+     ms_played: timedelta  # duration format
+     track: str
+     artist: str
+     album: str
+     track_uri: str
+     conn_country: str
+     platform: str
+     reason_start: str
+     reason_end: str
+     shuffle: bool
+     skipped: bool
+     year: int
+     month: Literal[*MONTHS]
+     weekday: Literal[*WEEKDAYS]
+     hour: int
+     date: date  # yyyy-mm-dd date format
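
As the comment on the constants notes, MONTHS and WEEKDAYS do double duty: Literal[...] fields for Pydantic validation and pl.Enum dtypes in the loader. A tiny illustration of why the Enum matters for sorting (the series values are made up):

import polars as pl
from spotify_dataloader.models import MONTHS

s = pl.Series(["Mar", "Jan", "Feb"], dtype=pl.Enum(MONTHS))
print(s.sort().to_list())  # ['Jan', 'Feb', 'Mar'], calendar order rather than alphabetical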