media-archivist 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- media_archivist-0.1.0/PKG-INFO +317 -0
- media_archivist-0.1.0/README.md +289 -0
- media_archivist-0.1.0/media_archivist/__init__.py +29 -0
- media_archivist-0.1.0/media_archivist/bandcamp.py +130 -0
- media_archivist-0.1.0/media_archivist/base.py +87 -0
- media_archivist-0.1.0/media_archivist/canon.py +176 -0
- media_archivist-0.1.0/media_archivist/canonicalize.py +543 -0
- media_archivist-0.1.0/media_archivist/cli.py +965 -0
- media_archivist-0.1.0/media_archivist/cli_args.py +271 -0
- media_archivist-0.1.0/media_archivist/discover.py +92 -0
- media_archivist-0.1.0/media_archivist/enrich/__init__.py +12 -0
- media_archivist-0.1.0/media_archivist/enrich/content_type.py +34 -0
- media_archivist-0.1.0/media_archivist/enrich/lyrics.py +26 -0
- media_archivist-0.1.0/media_archivist/enrich/orchestrator.py +106 -0
- media_archivist-0.1.0/media_archivist/enrich/transcripts.py +99 -0
- media_archivist-0.1.0/media_archivist/entities.py +106 -0
- media_archivist-0.1.0/media_archivist/exceptions.py +6 -0
- media_archivist-0.1.0/media_archivist/hub.py +119 -0
- media_archivist-0.1.0/media_archivist/ia.py +97 -0
- media_archivist-0.1.0/media_archivist/index.py +217 -0
- media_archivist-0.1.0/media_archivist/models/__init__.py +46 -0
- media_archivist-0.1.0/media_archivist/models/api.py +57 -0
- media_archivist-0.1.0/media_archivist/models/archive.py +61 -0
- media_archivist-0.1.0/media_archivist/models/canonical.py +64 -0
- media_archivist-0.1.0/media_archivist/models/canonical_record.py +90 -0
- media_archivist-0.1.0/media_archivist/models/dataset_card.py +129 -0
- media_archivist-0.1.0/media_archivist/models/enriched.py +61 -0
- media_archivist-0.1.0/media_archivist/models/raw.py +126 -0
- media_archivist-0.1.0/media_archivist/music.py +217 -0
- media_archivist-0.1.0/media_archivist/progress.py +50 -0
- media_archivist-0.1.0/media_archivist/providers/__init__.py +43 -0
- media_archivist-0.1.0/media_archivist/server/__init__.py +8 -0
- media_archivist-0.1.0/media_archivist/server/app.py +42 -0
- media_archivist-0.1.0/media_archivist/server/routes.py +188 -0
- media_archivist-0.1.0/media_archivist/server/scheduler.py +120 -0
- media_archivist-0.1.0/media_archivist/snapshot.py +45 -0
- media_archivist-0.1.0/media_archivist/soundcloud.py +113 -0
- media_archivist-0.1.0/media_archivist/storage.py +77 -0
- media_archivist-0.1.0/media_archivist/strm.py +80 -0
- media_archivist-0.1.0/media_archivist/sync.py +162 -0
- media_archivist-0.1.0/media_archivist/version.py +8 -0
- media_archivist-0.1.0/media_archivist/views.py +115 -0
- media_archivist-0.1.0/media_archivist/youtube.py +252 -0
- media_archivist-0.1.0/media_archivist.egg-info/PKG-INFO +317 -0
- media_archivist-0.1.0/media_archivist.egg-info/SOURCES.txt +65 -0
- media_archivist-0.1.0/media_archivist.egg-info/dependency_links.txt +1 -0
- media_archivist-0.1.0/media_archivist.egg-info/entry_points.txt +2 -0
- media_archivist-0.1.0/media_archivist.egg-info/requires.txt +22 -0
- media_archivist-0.1.0/media_archivist.egg-info/top_level.txt +1 -0
- media_archivist-0.1.0/pyproject.toml +39 -0
- media_archivist-0.1.0/setup.cfg +4 -0
- media_archivist-0.1.0/test/test_anime_game_providers.py +320 -0
- media_archivist-0.1.0/test/test_canonical.py +149 -0
- media_archivist-0.1.0/test/test_cli_args.py +52 -0
- media_archivist-0.1.0/test/test_disambiguation.py +640 -0
- media_archivist-0.1.0/test/test_discover_sync.py +74 -0
- media_archivist-0.1.0/test/test_entities.py +432 -0
- media_archivist-0.1.0/test/test_index_filters.py +83 -0
- media_archivist-0.1.0/test/test_metal_archives.py +83 -0
- media_archivist-0.1.0/test/test_models.py +98 -0
- media_archivist-0.1.0/test/test_new_providers.py +267 -0
- media_archivist-0.1.0/test/test_server.py +130 -0
- media_archivist-0.1.0/test/test_storage.py +48 -0
- media_archivist-0.1.0/test/test_strm.py +69 -0
- media_archivist-0.1.0/test/test_url_parsing.py +39 -0
- media_archivist-0.1.0/test/test_v06.py +147 -0
- media_archivist-0.1.0/test/test_views_extra.py +92 -0
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: media_archivist
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Media indexer for YouTube, YouTube Music, Internet Archive, Bandcamp and SoundCloud — index streams, download on demand
|
|
5
|
+
Author-email: JarbasAi <jarbasai@mailfence.com>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/JarbasAl/media_archivist
|
|
8
|
+
Requires-Python: >=3.9
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: json_database>=0.3.0
|
|
11
|
+
Requires-Dist: mediavocab>=0.1.0
|
|
12
|
+
Requires-Dist: metadatarr>=0.1.0
|
|
13
|
+
Requires-Dist: internetarchive
|
|
14
|
+
Requires-Dist: requests
|
|
15
|
+
Requires-Dist: pydantic>=2
|
|
16
|
+
Requires-Dist: tqdm
|
|
17
|
+
Provides-Extra: test
|
|
18
|
+
Requires-Dist: pytest; extra == "test"
|
|
19
|
+
Provides-Extra: hub
|
|
20
|
+
Requires-Dist: huggingface_hub; extra == "hub"
|
|
21
|
+
Provides-Extra: server
|
|
22
|
+
Requires-Dist: fastapi; extra == "server"
|
|
23
|
+
Requires-Dist: uvicorn[standard]; extra == "server"
|
|
24
|
+
Provides-Extra: all
|
|
25
|
+
Requires-Dist: huggingface_hub; extra == "all"
|
|
26
|
+
Requires-Dist: fastapi; extra == "all"
|
|
27
|
+
Requires-Dist: uvicorn[standard]; extra == "all"
|
|
28
|
+
|
|
29
|
+
# media_archivist
|
|
30
|
+
|
|
31
|
+
Cross-source media indexer. Builds a local JSON database of stream metadata
|
|
32
|
+
from YouTube, YouTube Music, Internet Archive, Bandcamp and SoundCloud.
|
|
33
|
+
|
|
34
|
+
| Backend | Library | What you can index |
|
|
35
|
+
| --- | --- | --- |
|
|
36
|
+
| **YouTube** | [`tutubo`](https://github.com/OpenJarbas/tutubo) | channels, playlists, videos (no API key) |
|
|
37
|
+
| **YouTube Music** | `tutubo.ytmus` (via `ytmusicapi`) | tracks, albums, artists, playlists |
|
|
38
|
+
| **Internet Archive** | `internetarchive` | items, collections |
|
|
39
|
+
| **Bandcamp** | [`py_bandcamp`](https://github.com/JarbasAl/py_bandcamp) | tracks, albums, artists, tag/search |
|
|
40
|
+
| **SoundCloud** | [`nuvem_de_som`](https://github.com/JarbasAl/nuvem_de_som) | tracks, sets, profiles, search |
|
|
41
|
+
|
|
42
|
+
`media_archivist` is **metadata-only**: it indexes streams; it does not
|
|
43
|
+
download them. Pair it with [`yt-dlp`](https://github.com/yt-dlp/yt-dlp) (or
|
|
44
|
+
SoundCloud's `resolve_stream`, Bandcamp's `track.stream`) for on-demand
|
|
45
|
+
extraction, or use the JSON DB to drive dataset-collection scripts, recommender
|
|
46
|
+
experiments, OVOS skills, etc.
|
|
47
|
+
|
|
48
|
+
Ships as both a Python library and a `media-archivist` CLI.
|
|
49
|
+
|
|
50
|
+
## Install
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install media_archivist # core (YouTube + IA + YT Music)
|
|
54
|
+
pip install media_archivist py_bandcamp     # + Bandcamp backend (0.1.0 declares no [bandcamp] extra)
|
|
55
|
+
pip install media_archivist nuvem_de_som    # + SoundCloud backend (0.1.0 declares no [soundcloud] extra)
|
|
56
|
+
pip install "media_archivist[all]"          # hub + server extras (huggingface_hub, fastapi, uvicorn)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## CLI
|
|
60
|
+
|
|
61
|
+
Every subcommand takes either:
|
|
62
|
+
|
|
63
|
+
- `--db-file PATH` — explicit path to a `.json` file (recommended for datasets
|
|
64
|
+
you want to commit alongside scripts), **or**
|
|
65
|
+
- `--db NAME` — auto-place under XDG at `~/.local/share/media_archivist/<NAME>.json`.
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# Index a channel, a playlist, or individual videos
|
|
69
|
+
media-archivist add --db-file talks.json https://www.youtube.com/@LinusTechTips
|
|
70
|
+
media-archivist add --db-file talks.json --blacklist "#shorts" \
|
|
71
|
+
https://www.youtube.com/playlist?list=PL...
|
|
72
|
+
|
|
73
|
+
# Browse the DB
|
|
74
|
+
media-archivist list --db-file talks.json --limit 20
|
|
75
|
+
media-archivist list --db-file talks.json --grep "review" --json
|
|
76
|
+
media-archivist stats --db-file talks.json
|
|
77
|
+
|
|
78
|
+
# Pair with yt-dlp — index once, download on demand
|
|
79
|
+
media-archivist urls --db-file talks.json --grep "tutorial" | yt-dlp -a -
|
|
80
|
+
|
|
81
|
+
# Drop dead videos / unwanted titles
|
|
82
|
+
media-archivist prune --db-file talks.json --unavailable --blacklist sponsor
|
|
83
|
+
|
|
84
|
+
# Background-monitor a set of URLs (re-syncs every --interval seconds)
|
|
85
|
+
media-archivist monitor --db-file talks.json --interval 600 \
|
|
86
|
+
https://www.youtube.com/@LinusTechTips \
|
|
87
|
+
https://www.youtube.com/@SomeOtherChannel
|
|
88
|
+
|
|
89
|
+
# Internet Archive
|
|
90
|
+
media-archivist add --db-file ia_movies.json --ia classic_cartoons
|
|
91
|
+
media-archivist urls --db-file ia_movies.json | xargs -n1 -P4 wget
|
|
92
|
+
|
|
93
|
+
# YouTube Music — rich track metadata (artist, album, year, duration, explicit)
|
|
94
|
+
media-archivist add --db-file songs.json --music --skip-explicit "lo-fi beats"
|
|
95
|
+
media-archivist add --db-file songs.json --music \
|
|
96
|
+
"https://music.youtube.com/playlist?list=PL..."
|
|
97
|
+
|
|
98
|
+
# Bandcamp — tracks have direct stream URLs in the entry
|
|
99
|
+
media-archivist add --db-file bandcamp.json --bandcamp \
|
|
100
|
+
"https://artistname.bandcamp.com/album/some-album"
|
|
101
|
+
media-archivist add --db-file bandcamp.json --bandcamp "ambient drone"
|
|
102
|
+
|
|
103
|
+
# SoundCloud — search, profile, or set URLs
|
|
104
|
+
media-archivist add --db-file sc.json --soundcloud \
|
|
105
|
+
"https://soundcloud.com/some-artist"
|
|
106
|
+
media-archivist add --db-file sc.json --soundcloud "footwork"
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Pick the backend with `--ia`, `--music`, `--bandcamp`, or `--soundcloud`
|
|
110
|
+
(default: YouTube). Every other subcommand (`list`, `export`, `urls`, `prune`,
|
|
111
|
+
`merge`, `stats`, …) works the same way against any backend's DB.
|
|
112
|
+
|
|
113
|
+
DBs are plain JSON — edit, back up, version-control, share. With `--db NAME` the
|
|
114
|
+
file is managed under XDG via
|
|
115
|
+
[`json_database`](https://github.com/OpenJarbas/json_database).
|
|
116
|
+
|
|
117
|
+
## Building datasets
|
|
118
|
+
|
|
119
|
+
`media_archivist` is metadata-only: it indexes streams; downloads happen on
|
|
120
|
+
demand via `yt-dlp` (or any other tool that reads URLs). The `export`,
|
|
121
|
+
`import`, `merge`, and `stats` subcommands turn the JSON DB into a workable
|
|
122
|
+
dataset.
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
# Build an index of three channels into one explicit file
|
|
126
|
+
media-archivist add --db-file documentaries.json \
|
|
127
|
+
https://www.youtube.com/@FreeDocumentary \
|
|
128
|
+
https://www.youtube.com/@FDSpace \
|
|
129
|
+
https://www.youtube.com/@FreeDocumentaryOcean
|
|
130
|
+
|
|
131
|
+
# Project specific fields → CSV (great for pandas / sklearn)
|
|
132
|
+
media-archivist export --db-file documentaries.json --format csv \
|
|
133
|
+
--fields videoId,title,url,published,tags,description \
|
|
134
|
+
-o documentaries.csv
|
|
135
|
+
|
|
136
|
+
# JSONL is the canonical "one-row-per-line" format for ML pipelines
|
|
137
|
+
media-archivist export --db-file documentaries.json --format jsonl \
|
|
138
|
+
-o documentaries.jsonl
|
|
139
|
+
|
|
140
|
+
# Just URLs (txt) for downstream tools
|
|
141
|
+
media-archivist export --db-file documentaries.json --format txt \
|
|
142
|
+
-o urls.txt
|
|
143
|
+
|
|
144
|
+
# Inspect coverage before training
|
|
145
|
+
media-archivist stats --db-file documentaries.json
|
|
146
|
+
|
|
147
|
+
# Merge per-topic indexes into a master dataset
|
|
148
|
+
media-archivist merge --db-file all_docs.json \
|
|
149
|
+
space.json ocean.json nature.json --overwrite
|
|
150
|
+
|
|
151
|
+
# Round-trip: import an existing JSONL produced elsewhere
|
|
152
|
+
media-archivist import --db-file talks.json talks.jsonl --overwrite
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Output formats
|
|
156
|
+
|
|
157
|
+
| `--format` | Use case |
|
|
158
|
+
| --- | --- |
|
|
159
|
+
| `jsonl` *(default)* | streaming pipelines, HuggingFace `datasets`, `jq` |
|
|
160
|
+
| `json` | small datasets, human inspection |
|
|
161
|
+
| `csv` | pandas, spreadsheets — list/dict fields auto-serialized to JSON strings |
|
|
162
|
+
| `txt` | flat URL list for `yt-dlp -a -` / `wget -i` / `xargs` |
|
|
163
|
+
|
|
164
|
+
Combine with `--fields` to project only what you need, `--grep` to filter by
|
|
165
|
+
title substring, and `--limit N` to cap row count.
|
|
166
|
+
|
|
167
|
+
### Stored fields per video
|
|
168
|
+
|
|
169
|
+
| field | source |
|
|
170
|
+
| --- | --- |
|
|
171
|
+
| `videoId`, `url`, `title`, `thumbnail` | tutubo `Video` |
|
|
172
|
+
| `tags` | union of `Video.keywords` and inferred `Video.tags` |
|
|
173
|
+
| `is_live`, `published`, `views`, `description` | tutubo channel-grid metadata |
|
|
174
|
+
| `playlist` | only set when archived from a playlist |
|
|
175
|
+
|
|
176
|
+
See [`examples/`](./examples) for end-to-end dataset-creation scripts.
|
|
177
|
+
|
|
178
|
+
## YouTube (library)
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
from media_archivist import YoutubeArchivist
|
|
182
|
+
|
|
183
|
+
archivist = YoutubeArchivist(
|
|
184
|
+
db_path="./talks.json", # explicit file (or use db_name="..." for XDG)
|
|
185
|
+
blacklisted_kwords=["#shorts", "trailer"],
|
|
186
|
+
required_kwords=[], # all must appear in the title
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
# Channel — handles /channel/, /c/, /@handle, /user/
|
|
190
|
+
archivist.archive("https://www.youtube.com/@LinusTechTips")
|
|
191
|
+
|
|
192
|
+
# Playlist
|
|
193
|
+
archivist.archive("https://www.youtube.com/playlist?list=PL...")
|
|
194
|
+
|
|
195
|
+
# Single video (watch / youtu.be / shorts URLs)
|
|
196
|
+
archivist.archive("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
|
|
197
|
+
|
|
198
|
+
# All playlists of a channel
|
|
199
|
+
archivist.archive_channel_playlists("https://www.youtube.com/@LinusTechTips")
|
|
200
|
+
|
|
201
|
+
# Drop entries whose videos are no longer reachable
|
|
202
|
+
archivist.remove_unavailable()
|
|
203
|
+
|
|
204
|
+
for entry in archivist.sorted_entries():
|
|
205
|
+
print(entry["title"], entry["url"])
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
> **Note on duration:** tutubo's bare `Channel.videos` / `Playlist.videos`
|
|
209
|
+
> iterators don't expose track length, so `--min-duration` is a no-op for
|
|
210
|
+
> plain channel scrapes. It **does** apply when length is available — i.e.
|
|
211
|
+
> with `--music` (YT Music tracks), `--bandcamp`, `--soundcloud`, `--ia`,
|
|
212
|
+
> and YouTube search-result previews. `published` is a relative string
|
|
213
|
+
> ("2 days ago") rather than a timestamp.
|
|
214
|
+
|
|
215
|
+
### Background monitor
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
from media_archivist import YoutubeMonitor
|
|
219
|
+
|
|
220
|
+
mon = YoutubeMonitor(db_name="my_channels")
|
|
221
|
+
mon.start()
|
|
222
|
+
mon.monitor("https://www.youtube.com/@LinusTechTips") # re-syncs every sync_interval
|
|
223
|
+
mon.sync("https://www.youtube.com/@SomeOtherChannel") # one-shot
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
`YoutubeMonitor.bootstrap_from_url(url)` seeds an empty database from a remote
|
|
227
|
+
JSON dump — handy for distributing pre-built indexes.
|
|
228
|
+
|
|
229
|
+
## YouTube Music (library)
|
|
230
|
+
|
|
231
|
+
```python
|
|
232
|
+
from media_archivist import YoutubeMusicArchivist
|
|
233
|
+
|
|
234
|
+
m = YoutubeMusicArchivist(db_path="./songs.json", skip_explicit=True)
|
|
235
|
+
m.archive_search("lo-fi beats")
|
|
236
|
+
m.archive_playlist("https://music.youtube.com/playlist?list=PL...")
|
|
237
|
+
m.archive_album("MPREb_xxx") # browseId
|
|
238
|
+
m.archive_artist("UCxxx") # channelId
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
Each entry includes `artist`, `album`, `year`, `duration` (seconds), `explicit`,
|
|
242
|
+
`video_type` (`MUSIC_VIDEO_TYPE_ATV` etc.), `audio_only`, `music_video`.
|
|
243
|
+
|
|
244
|
+
## Bandcamp (library)
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
from media_archivist import BandcampArchivist
|
|
248
|
+
|
|
249
|
+
bc = BandcampArchivist(db_path="./bandcamp.json")
|
|
250
|
+
bc.archive("https://artist.bandcamp.com/album/some-album")
|
|
251
|
+
bc.archive_artist("https://artist.bandcamp.com")
|
|
252
|
+
bc.archive_search("ambient drone")
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
Each entry stores `artist`, `album`, `track_number`, `duration` (seconds),
|
|
256
|
+
`thumbnail`, and **`stream`** (a direct audio URL when Bandcamp exposes one).
|
|
257
|
+
|
|
258
|
+
## SoundCloud (library)
|
|
259
|
+
|
|
260
|
+
```python
|
|
261
|
+
from media_archivist import SoundCloudArchivist
|
|
262
|
+
|
|
263
|
+
sc = SoundCloudArchivist(db_path="./sc.json", resolve_streams=True)
|
|
264
|
+
sc.archive("https://soundcloud.com/some-artist") # profile
|
|
265
|
+
sc.archive("https://soundcloud.com/some-artist/sets/some-set") # set
|
|
266
|
+
sc.archive_search("footwork")
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
`resolve_streams=True` calls `nuvem_de_som`'s stream resolver per track and
|
|
270
|
+
stores the resulting MP3/HLS URL under `stream`.
|
|
271
|
+
|
|
272
|
+
## Internet Archive (library)
|
|
273
|
+
|
|
274
|
+
```python
|
|
275
|
+
from media_archivist import IAArchivist
|
|
276
|
+
|
|
277
|
+
ia = IAArchivist(db_path="./ia_movies.json")
|
|
278
|
+
ia.archive("classic_cartoons") # collection or single item id
|
|
279
|
+
ia.archive_item("Popeye_forPresident")
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
Stream URLs are filtered to formats in `IAArchivist.VALID_FORMATS`
|
|
283
|
+
(`MPEG2`, `Ogg Video`, `512Kb MPEG4`, `h.264`).
|
|
284
|
+
|
|
285
|
+
## Filtering helpers
|
|
286
|
+
|
|
287
|
+
All archivists inherit from `JsonArchivist`:
|
|
288
|
+
|
|
289
|
+
- `remove_keyword(kwords)` — drop entries whose title matches any keyword
|
|
290
|
+
- `remove_missing(keys)` — drop entries missing any of the given fields
|
|
291
|
+
- `remove_below_duration(minutes)` — drop entries shorter than N minutes
|
|
292
|
+
- `sorted_entries()` — entries sorted by `upload_ts` (descending)
|
|
293
|
+
|
|
294
|
+
## Metadata providers
|
|
295
|
+
|
|
296
|
+
`media-archivist canonicalize` enriches indexed entries with external IDs
|
|
297
|
+
and structured metadata via the cross-source resolver in
|
|
298
|
+
[`metadatarr`](https://github.com/TigreGotico/metadatarr). The provider
|
|
299
|
+
registry, dispatcher, and ~24 built-in providers (MusicBrainz, Wikidata,
|
|
300
|
+
TMDB, AniList, Jikan, Google Books, LibriVox, Apple Podcasts, *arr family,
|
|
301
|
+
Discogs, Blu-ray.com, DVDCompare, OpenLibrary, Anna's Archive, Bandcamp,
|
|
302
|
+
SoundCloud, YouTube / YouTube Music, Metal Archives, …) all live in
|
|
303
|
+
metadatarr and self-register on import. See
|
|
304
|
+
[`docs/metadatarr.md`](docs/metadatarr.md) for the full table.
|
|
305
|
+
|
|
306
|
+
All resolver providers — including `metal_archives` — live in metadatarr.
|
|
307
|
+
There are no media-archivist-specific resolver providers.
|
|
308
|
+
|
|
309
|
+
The resolver gates providers on three independent axes: `media` (MediaType),
|
|
310
|
+
`modality` (PlaybackModality — AUDIO / VIDEO / TEXT / INTERACTIVE / UNKNOWN),
|
|
311
|
+
and `genre_filter` (genre tag set). Callers constructing `Signals` directly can
|
|
312
|
+
pass `modality=PlaybackModality.AUDIO` to restrict resolution to audio-only
|
|
313
|
+
providers. See [`docs/metadatarr.md`](docs/metadatarr.md#routing) for details.
|
|
314
|
+
|
|
315
|
+
## License
|
|
316
|
+
|
|
317
|
+
Apache-2.0
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
# media_archivist
|
|
2
|
+
|
|
3
|
+
Cross-source media indexer. Builds a local JSON database of stream metadata
|
|
4
|
+
from YouTube, YouTube Music, Internet Archive, Bandcamp and SoundCloud.
|
|
5
|
+
|
|
6
|
+
| Backend | Library | What you can index |
|
|
7
|
+
| --- | --- | --- |
|
|
8
|
+
| **YouTube** | [`tutubo`](https://github.com/OpenJarbas/tutubo) | channels, playlists, videos (no API key) |
|
|
9
|
+
| **YouTube Music** | `tutubo.ytmus` (via `ytmusicapi`) | tracks, albums, artists, playlists |
|
|
10
|
+
| **Internet Archive** | `internetarchive` | items, collections |
|
|
11
|
+
| **Bandcamp** | [`py_bandcamp`](https://github.com/JarbasAl/py_bandcamp) | tracks, albums, artists, tag/search |
|
|
12
|
+
| **SoundCloud** | [`nuvem_de_som`](https://github.com/JarbasAl/nuvem_de_som) | tracks, sets, profiles, search |
|
|
13
|
+
|
|
14
|
+
`media_archivist` is **metadata-only**: it indexes streams; it does not
|
|
15
|
+
download them. Pair it with [`yt-dlp`](https://github.com/yt-dlp/yt-dlp) (or
|
|
16
|
+
SoundCloud's `resolve_stream`, Bandcamp's `track.stream`) for on-demand
|
|
17
|
+
extraction, or use the JSON DB to drive dataset-collection scripts, recommender
|
|
18
|
+
experiments, OVOS skills, etc.
|
|
19
|
+
|
|
20
|
+
Ships as both a Python library and a `media-archivist` CLI.
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install media_archivist # core (YouTube + IA + YT Music)
|
|
26
|
+
pip install media_archivist py_bandcamp     # + Bandcamp backend (0.1.0 declares no [bandcamp] extra)
|
|
27
|
+
pip install media_archivist nuvem_de_som    # + SoundCloud backend (0.1.0 declares no [soundcloud] extra)
|
|
28
|
+
pip install "media_archivist[all]"          # hub + server extras (huggingface_hub, fastapi, uvicorn)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## CLI
|
|
32
|
+
|
|
33
|
+
Every subcommand takes either:
|
|
34
|
+
|
|
35
|
+
- `--db-file PATH` — explicit path to a `.json` file (recommended for datasets
|
|
36
|
+
you want to commit alongside scripts), **or**
|
|
37
|
+
- `--db NAME` — auto-place under XDG at `~/.local/share/media_archivist/<NAME>.json`.
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
# Index a channel, a playlist, or individual videos
|
|
41
|
+
media-archivist add --db-file talks.json https://www.youtube.com/@LinusTechTips
|
|
42
|
+
media-archivist add --db-file talks.json --blacklist "#shorts" \
|
|
43
|
+
https://www.youtube.com/playlist?list=PL...
|
|
44
|
+
|
|
45
|
+
# Browse the DB
|
|
46
|
+
media-archivist list --db-file talks.json --limit 20
|
|
47
|
+
media-archivist list --db-file talks.json --grep "review" --json
|
|
48
|
+
media-archivist stats --db-file talks.json
|
|
49
|
+
|
|
50
|
+
# Pair with yt-dlp — index once, download on demand
|
|
51
|
+
media-archivist urls --db-file talks.json --grep "tutorial" | yt-dlp -a -
|
|
52
|
+
|
|
53
|
+
# Drop dead videos / unwanted titles
|
|
54
|
+
media-archivist prune --db-file talks.json --unavailable --blacklist sponsor
|
|
55
|
+
|
|
56
|
+
# Background-monitor a set of URLs (re-syncs every --interval seconds)
|
|
57
|
+
media-archivist monitor --db-file talks.json --interval 600 \
|
|
58
|
+
https://www.youtube.com/@LinusTechTips \
|
|
59
|
+
https://www.youtube.com/@SomeOtherChannel
|
|
60
|
+
|
|
61
|
+
# Internet Archive
|
|
62
|
+
media-archivist add --db-file ia_movies.json --ia classic_cartoons
|
|
63
|
+
media-archivist urls --db-file ia_movies.json | xargs -n1 -P4 wget
|
|
64
|
+
|
|
65
|
+
# YouTube Music — rich track metadata (artist, album, year, duration, explicit)
|
|
66
|
+
media-archivist add --db-file songs.json --music --skip-explicit "lo-fi beats"
|
|
67
|
+
media-archivist add --db-file songs.json --music \
|
|
68
|
+
"https://music.youtube.com/playlist?list=PL..."
|
|
69
|
+
|
|
70
|
+
# Bandcamp — tracks have direct stream URLs in the entry
|
|
71
|
+
media-archivist add --db-file bandcamp.json --bandcamp \
|
|
72
|
+
"https://artistname.bandcamp.com/album/some-album"
|
|
73
|
+
media-archivist add --db-file bandcamp.json --bandcamp "ambient drone"
|
|
74
|
+
|
|
75
|
+
# SoundCloud — search, profile, or set URLs
|
|
76
|
+
media-archivist add --db-file sc.json --soundcloud \
|
|
77
|
+
"https://soundcloud.com/some-artist"
|
|
78
|
+
media-archivist add --db-file sc.json --soundcloud "footwork"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Pick the backend with `--ia`, `--music`, `--bandcamp`, or `--soundcloud`
|
|
82
|
+
(default: YouTube). Every other subcommand (`list`, `export`, `urls`, `prune`,
|
|
83
|
+
`merge`, `stats`, …) works the same way against any backend's DB.
|
|
84
|
+
|
|
85
|
+
DBs are plain JSON — edit, back up, version-control, share. With `--db NAME` the
|
|
86
|
+
file is managed under XDG via
|
|
87
|
+
[`json_database`](https://github.com/OpenJarbas/json_database).
|
|
88
|
+
|
|
89
|
+
## Building datasets
|
|
90
|
+
|
|
91
|
+
`media_archivist` is metadata-only: it indexes streams; downloads happen on
|
|
92
|
+
demand via `yt-dlp` (or any other tool that reads URLs). The `export`,
|
|
93
|
+
`import`, `merge`, and `stats` subcommands turn the JSON DB into a workable
|
|
94
|
+
dataset.
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
# Build an index of three channels into one explicit file
|
|
98
|
+
media-archivist add --db-file documentaries.json \
|
|
99
|
+
https://www.youtube.com/@FreeDocumentary \
|
|
100
|
+
https://www.youtube.com/@FDSpace \
|
|
101
|
+
https://www.youtube.com/@FreeDocumentaryOcean
|
|
102
|
+
|
|
103
|
+
# Project specific fields → CSV (great for pandas / sklearn)
|
|
104
|
+
media-archivist export --db-file documentaries.json --format csv \
|
|
105
|
+
--fields videoId,title,url,published,tags,description \
|
|
106
|
+
-o documentaries.csv
|
|
107
|
+
|
|
108
|
+
# JSONL is the canonical "one-row-per-line" format for ML pipelines
|
|
109
|
+
media-archivist export --db-file documentaries.json --format jsonl \
|
|
110
|
+
-o documentaries.jsonl
|
|
111
|
+
|
|
112
|
+
# Just URLs (txt) for downstream tools
|
|
113
|
+
media-archivist export --db-file documentaries.json --format txt \
|
|
114
|
+
-o urls.txt
|
|
115
|
+
|
|
116
|
+
# Inspect coverage before training
|
|
117
|
+
media-archivist stats --db-file documentaries.json
|
|
118
|
+
|
|
119
|
+
# Merge per-topic indexes into a master dataset
|
|
120
|
+
media-archivist merge --db-file all_docs.json \
|
|
121
|
+
space.json ocean.json nature.json --overwrite
|
|
122
|
+
|
|
123
|
+
# Round-trip: import an existing JSONL produced elsewhere
|
|
124
|
+
media-archivist import --db-file talks.json talks.jsonl --overwrite
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Output formats
|
|
128
|
+
|
|
129
|
+
| `--format` | Use case |
|
|
130
|
+
| --- | --- |
|
|
131
|
+
| `jsonl` *(default)* | streaming pipelines, HuggingFace `datasets`, `jq` |
|
|
132
|
+
| `json` | small datasets, human inspection |
|
|
133
|
+
| `csv` | pandas, spreadsheets — list/dict fields auto-serialized to JSON strings |
|
|
134
|
+
| `txt` | flat URL list for `yt-dlp -a -` / `wget -i` / `xargs` |
|
|
135
|
+
|
|
136
|
+
Combine with `--fields` to project only what you need, `--grep` to filter by
|
|
137
|
+
title substring, and `--limit N` to cap row count.
|
|
138
|
+
|
|
139
|
+
### Stored fields per video
|
|
140
|
+
|
|
141
|
+
| field | source |
|
|
142
|
+
| --- | --- |
|
|
143
|
+
| `videoId`, `url`, `title`, `thumbnail` | tutubo `Video` |
|
|
144
|
+
| `tags` | union of `Video.keywords` and inferred `Video.tags` |
|
|
145
|
+
| `is_live`, `published`, `views`, `description` | tutubo channel-grid metadata |
|
|
146
|
+
| `playlist` | only set when archived from a playlist |
|
|
147
|
+
|
|
148
|
+
See [`examples/`](./examples) for end-to-end dataset-creation scripts.
|
|
149
|
+
|
|
150
|
+
## YouTube (library)
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
from media_archivist import YoutubeArchivist
|
|
154
|
+
|
|
155
|
+
archivist = YoutubeArchivist(
|
|
156
|
+
db_path="./talks.json", # explicit file (or use db_name="..." for XDG)
|
|
157
|
+
blacklisted_kwords=["#shorts", "trailer"],
|
|
158
|
+
required_kwords=[], # all must appear in the title
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
# Channel — handles /channel/, /c/, /@handle, /user/
|
|
162
|
+
archivist.archive("https://www.youtube.com/@LinusTechTips")
|
|
163
|
+
|
|
164
|
+
# Playlist
|
|
165
|
+
archivist.archive("https://www.youtube.com/playlist?list=PL...")
|
|
166
|
+
|
|
167
|
+
# Single video (watch / youtu.be / shorts URLs)
|
|
168
|
+
archivist.archive("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
|
|
169
|
+
|
|
170
|
+
# All playlists of a channel
|
|
171
|
+
archivist.archive_channel_playlists("https://www.youtube.com/@LinusTechTips")
|
|
172
|
+
|
|
173
|
+
# Drop entries whose videos are no longer reachable
|
|
174
|
+
archivist.remove_unavailable()
|
|
175
|
+
|
|
176
|
+
for entry in archivist.sorted_entries():
|
|
177
|
+
print(entry["title"], entry["url"])
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
> **Note on duration:** tutubo's bare `Channel.videos` / `Playlist.videos`
|
|
181
|
+
> iterators don't expose track length, so `--min-duration` is a no-op for
|
|
182
|
+
> plain channel scrapes. It **does** apply when length is available — i.e.
|
|
183
|
+
> with `--music` (YT Music tracks), `--bandcamp`, `--soundcloud`, `--ia`,
|
|
184
|
+
> and YouTube search-result previews. `published` is a relative string
|
|
185
|
+
> ("2 days ago") rather than a timestamp.
|
|
186
|
+
|
|
187
|
+
### Background monitor
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
from media_archivist import YoutubeMonitor
|
|
191
|
+
|
|
192
|
+
mon = YoutubeMonitor(db_name="my_channels")
|
|
193
|
+
mon.start()
|
|
194
|
+
mon.monitor("https://www.youtube.com/@LinusTechTips") # re-syncs every sync_interval
|
|
195
|
+
mon.sync("https://www.youtube.com/@SomeOtherChannel") # one-shot
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
`YoutubeMonitor.bootstrap_from_url(url)` seeds an empty database from a remote
|
|
199
|
+
JSON dump — handy for distributing pre-built indexes.
|
|
200
|
+
|
|
201
|
+
## YouTube Music (library)
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
from media_archivist import YoutubeMusicArchivist
|
|
205
|
+
|
|
206
|
+
m = YoutubeMusicArchivist(db_path="./songs.json", skip_explicit=True)
|
|
207
|
+
m.archive_search("lo-fi beats")
|
|
208
|
+
m.archive_playlist("https://music.youtube.com/playlist?list=PL...")
|
|
209
|
+
m.archive_album("MPREb_xxx") # browseId
|
|
210
|
+
m.archive_artist("UCxxx") # channelId
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
Each entry includes `artist`, `album`, `year`, `duration` (seconds), `explicit`,
|
|
214
|
+
`video_type` (`MUSIC_VIDEO_TYPE_ATV` etc.), `audio_only`, `music_video`.
|
|
215
|
+
|
|
216
|
+
## Bandcamp (library)
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
from media_archivist import BandcampArchivist
|
|
220
|
+
|
|
221
|
+
bc = BandcampArchivist(db_path="./bandcamp.json")
|
|
222
|
+
bc.archive("https://artist.bandcamp.com/album/some-album")
|
|
223
|
+
bc.archive_artist("https://artist.bandcamp.com")
|
|
224
|
+
bc.archive_search("ambient drone")
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
Each entry stores `artist`, `album`, `track_number`, `duration` (seconds),
|
|
228
|
+
`thumbnail`, and **`stream`** (a direct audio URL when Bandcamp exposes one).
|
|
229
|
+
|
|
230
|
+
## SoundCloud (library)
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
from media_archivist import SoundCloudArchivist
|
|
234
|
+
|
|
235
|
+
sc = SoundCloudArchivist(db_path="./sc.json", resolve_streams=True)
|
|
236
|
+
sc.archive("https://soundcloud.com/some-artist") # profile
|
|
237
|
+
sc.archive("https://soundcloud.com/some-artist/sets/some-set") # set
|
|
238
|
+
sc.archive_search("footwork")
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
`resolve_streams=True` calls `nuvem_de_som`'s stream resolver per track and
|
|
242
|
+
stores the resulting MP3/HLS URL under `stream`.
|
|
243
|
+
|
|
244
|
+
## Internet Archive (library)
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
from media_archivist import IAArchivist
|
|
248
|
+
|
|
249
|
+
ia = IAArchivist(db_path="./ia_movies.json")
|
|
250
|
+
ia.archive("classic_cartoons") # collection or single item id
|
|
251
|
+
ia.archive_item("Popeye_forPresident")
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
Stream URLs are filtered to formats in `IAArchivist.VALID_FORMATS`
|
|
255
|
+
(`MPEG2`, `Ogg Video`, `512Kb MPEG4`, `h.264`).
|
|
256
|
+
|
|
257
|
+
## Filtering helpers
|
|
258
|
+
|
|
259
|
+
All archivists inherit from `JsonArchivist`:
|
|
260
|
+
|
|
261
|
+
- `remove_keyword(kwords)` — drop entries whose title matches any keyword
|
|
262
|
+
- `remove_missing(keys)` — drop entries missing any of the given fields
|
|
263
|
+
- `remove_below_duration(minutes)` — drop entries shorter than N minutes
|
|
264
|
+
- `sorted_entries()` — entries sorted by `upload_ts` (descending)
|
|
265
|
+
|
|
266
|
+
## Metadata providers
|
|
267
|
+
|
|
268
|
+
`media-archivist canonicalize` enriches indexed entries with external IDs
|
|
269
|
+
and structured metadata via the cross-source resolver in
|
|
270
|
+
[`metadatarr`](https://github.com/TigreGotico/metadatarr). The provider
|
|
271
|
+
registry, dispatcher, and ~24 built-in providers (MusicBrainz, Wikidata,
|
|
272
|
+
TMDB, AniList, Jikan, Google Books, LibriVox, Apple Podcasts, *arr family,
|
|
273
|
+
Discogs, Blu-ray.com, DVDCompare, OpenLibrary, Anna's Archive, Bandcamp,
|
|
274
|
+
SoundCloud, YouTube / YouTube Music, Metal Archives, …) all live in
|
|
275
|
+
metadatarr and self-register on import. See
|
|
276
|
+
[`docs/metadatarr.md`](docs/metadatarr.md) for the full table.
|
|
277
|
+
|
|
278
|
+
All resolver providers — including `metal_archives` — live in metadatarr.
|
|
279
|
+
There are no media-archivist-specific resolver providers.
|
|
280
|
+
|
|
281
|
+
The resolver gates providers on three independent axes: `media` (MediaType),
|
|
282
|
+
`modality` (PlaybackModality — AUDIO / VIDEO / TEXT / INTERACTIVE / UNKNOWN),
|
|
283
|
+
and `genre_filter` (genre tag set). Callers constructing `Signals` directly can
|
|
284
|
+
pass `modality=PlaybackModality.AUDIO` to restrict resolution to audio-only
|
|
285
|
+
providers. See [`docs/metadatarr.md`](docs/metadatarr.md#routing) for details.
|
|
286
|
+
|
|
287
|
+
## License
|
|
288
|
+
|
|
289
|
+
Apache-2.0
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from media_archivist.exceptions import MediaArchivistError, VideoUnavailable
|
|
2
|
+
from media_archivist.index import Index
|
|
3
|
+
from media_archivist.ia import IAArchivist
|
|
4
|
+
from media_archivist.music import YoutubeMusicArchivist
|
|
5
|
+
from media_archivist.version import __version__
|
|
6
|
+
from media_archivist.youtube import YoutubeArchivist, YoutubeMonitor
|
|
7
|
+
|
|
8
|
+
# Optional backends — only loaded if their underlying client is installed.
|
|
9
|
+
try:
|
|
10
|
+
from media_archivist.bandcamp import BandcampArchivist # noqa: F401
|
|
11
|
+
except Exception: # pragma: no cover
|
|
12
|
+
BandcampArchivist = None # type: ignore
|
|
13
|
+
try:
|
|
14
|
+
from media_archivist.soundcloud import SoundCloudArchivist # noqa: F401
|
|
15
|
+
except Exception: # pragma: no cover
|
|
16
|
+
SoundCloudArchivist = None # type: ignore
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"YoutubeArchivist",
|
|
20
|
+
"YoutubeMonitor",
|
|
21
|
+
"YoutubeMusicArchivist",
|
|
22
|
+
"IAArchivist",
|
|
23
|
+
"BandcampArchivist",
|
|
24
|
+
"SoundCloudArchivist",
|
|
25
|
+
"Index",
|
|
26
|
+
"MediaArchivistError",
|
|
27
|
+
"VideoUnavailable",
|
|
28
|
+
"__version__",
|
|
29
|
+
]
|