media-archivist 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. media_archivist-0.1.0/PKG-INFO +317 -0
  2. media_archivist-0.1.0/README.md +289 -0
  3. media_archivist-0.1.0/media_archivist/__init__.py +29 -0
  4. media_archivist-0.1.0/media_archivist/bandcamp.py +130 -0
  5. media_archivist-0.1.0/media_archivist/base.py +87 -0
  6. media_archivist-0.1.0/media_archivist/canon.py +176 -0
  7. media_archivist-0.1.0/media_archivist/canonicalize.py +543 -0
  8. media_archivist-0.1.0/media_archivist/cli.py +965 -0
  9. media_archivist-0.1.0/media_archivist/cli_args.py +271 -0
  10. media_archivist-0.1.0/media_archivist/discover.py +92 -0
  11. media_archivist-0.1.0/media_archivist/enrich/__init__.py +12 -0
  12. media_archivist-0.1.0/media_archivist/enrich/content_type.py +34 -0
  13. media_archivist-0.1.0/media_archivist/enrich/lyrics.py +26 -0
  14. media_archivist-0.1.0/media_archivist/enrich/orchestrator.py +106 -0
  15. media_archivist-0.1.0/media_archivist/enrich/transcripts.py +99 -0
  16. media_archivist-0.1.0/media_archivist/entities.py +106 -0
  17. media_archivist-0.1.0/media_archivist/exceptions.py +6 -0
  18. media_archivist-0.1.0/media_archivist/hub.py +119 -0
  19. media_archivist-0.1.0/media_archivist/ia.py +97 -0
  20. media_archivist-0.1.0/media_archivist/index.py +217 -0
  21. media_archivist-0.1.0/media_archivist/models/__init__.py +46 -0
  22. media_archivist-0.1.0/media_archivist/models/api.py +57 -0
  23. media_archivist-0.1.0/media_archivist/models/archive.py +61 -0
  24. media_archivist-0.1.0/media_archivist/models/canonical.py +64 -0
  25. media_archivist-0.1.0/media_archivist/models/canonical_record.py +90 -0
  26. media_archivist-0.1.0/media_archivist/models/dataset_card.py +129 -0
  27. media_archivist-0.1.0/media_archivist/models/enriched.py +61 -0
  28. media_archivist-0.1.0/media_archivist/models/raw.py +126 -0
  29. media_archivist-0.1.0/media_archivist/music.py +217 -0
  30. media_archivist-0.1.0/media_archivist/progress.py +50 -0
  31. media_archivist-0.1.0/media_archivist/providers/__init__.py +43 -0
  32. media_archivist-0.1.0/media_archivist/server/__init__.py +8 -0
  33. media_archivist-0.1.0/media_archivist/server/app.py +42 -0
  34. media_archivist-0.1.0/media_archivist/server/routes.py +188 -0
  35. media_archivist-0.1.0/media_archivist/server/scheduler.py +120 -0
  36. media_archivist-0.1.0/media_archivist/snapshot.py +45 -0
  37. media_archivist-0.1.0/media_archivist/soundcloud.py +113 -0
  38. media_archivist-0.1.0/media_archivist/storage.py +77 -0
  39. media_archivist-0.1.0/media_archivist/strm.py +80 -0
  40. media_archivist-0.1.0/media_archivist/sync.py +162 -0
  41. media_archivist-0.1.0/media_archivist/version.py +8 -0
  42. media_archivist-0.1.0/media_archivist/views.py +115 -0
  43. media_archivist-0.1.0/media_archivist/youtube.py +252 -0
  44. media_archivist-0.1.0/media_archivist.egg-info/PKG-INFO +317 -0
  45. media_archivist-0.1.0/media_archivist.egg-info/SOURCES.txt +65 -0
  46. media_archivist-0.1.0/media_archivist.egg-info/dependency_links.txt +1 -0
  47. media_archivist-0.1.0/media_archivist.egg-info/entry_points.txt +2 -0
  48. media_archivist-0.1.0/media_archivist.egg-info/requires.txt +22 -0
  49. media_archivist-0.1.0/media_archivist.egg-info/top_level.txt +1 -0
  50. media_archivist-0.1.0/pyproject.toml +39 -0
  51. media_archivist-0.1.0/setup.cfg +4 -0
  52. media_archivist-0.1.0/test/test_anime_game_providers.py +320 -0
  53. media_archivist-0.1.0/test/test_canonical.py +149 -0
  54. media_archivist-0.1.0/test/test_cli_args.py +52 -0
  55. media_archivist-0.1.0/test/test_disambiguation.py +640 -0
  56. media_archivist-0.1.0/test/test_discover_sync.py +74 -0
  57. media_archivist-0.1.0/test/test_entities.py +432 -0
  58. media_archivist-0.1.0/test/test_index_filters.py +83 -0
  59. media_archivist-0.1.0/test/test_metal_archives.py +83 -0
  60. media_archivist-0.1.0/test/test_models.py +98 -0
  61. media_archivist-0.1.0/test/test_new_providers.py +267 -0
  62. media_archivist-0.1.0/test/test_server.py +130 -0
  63. media_archivist-0.1.0/test/test_storage.py +48 -0
  64. media_archivist-0.1.0/test/test_strm.py +69 -0
  65. media_archivist-0.1.0/test/test_url_parsing.py +39 -0
  66. media_archivist-0.1.0/test/test_v06.py +147 -0
  67. media_archivist-0.1.0/test/test_views_extra.py +92 -0
@@ -0,0 +1,317 @@
1
+ Metadata-Version: 2.4
2
+ Name: media_archivist
3
+ Version: 0.1.0
4
+ Summary: Media indexer for YouTube, YouTube Music, Internet Archive, Bandcamp and SoundCloud — index streams, download on demand
5
+ Author-email: JarbasAi <jarbasai@mailfence.com>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/JarbasAl/media_archivist
8
+ Requires-Python: >=3.9
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: json_database>=0.3.0
11
+ Requires-Dist: mediavocab>=0.1.0
12
+ Requires-Dist: metadatarr>=0.1.0
13
+ Requires-Dist: internetarchive
14
+ Requires-Dist: requests
15
+ Requires-Dist: pydantic>=2
16
+ Requires-Dist: tqdm
17
+ Provides-Extra: test
18
+ Requires-Dist: pytest; extra == "test"
19
+ Provides-Extra: hub
20
+ Requires-Dist: huggingface_hub; extra == "hub"
21
+ Provides-Extra: server
22
+ Requires-Dist: fastapi; extra == "server"
23
+ Requires-Dist: uvicorn[standard]; extra == "server"
24
+ Provides-Extra: all
25
+ Requires-Dist: huggingface_hub; extra == "all"
26
+ Requires-Dist: fastapi; extra == "all"
27
+ Requires-Dist: uvicorn[standard]; extra == "all"
28
+
29
+ # media_archivist
30
+
31
+ Cross-source media indexer. Builds a local JSON database of stream metadata
32
+ from YouTube, YouTube Music, Internet Archive, Bandcamp and SoundCloud.
33
+
34
+ | Backend | Library | What you can index |
35
+ | --- | --- | --- |
36
+ | **YouTube** | [`tutubo`](https://github.com/OpenJarbas/tutubo) | channels, playlists, videos (no API key) |
37
+ | **YouTube Music** | `tutubo.ytmus` (via `ytmusicapi`) | tracks, albums, artists, playlists |
38
+ | **Internet Archive** | `internetarchive` | items, collections |
39
+ | **Bandcamp** | [`py_bandcamp`](https://github.com/JarbasAl/py_bandcamp) | tracks, albums, artists, tag/search |
40
+ | **SoundCloud** | [`nuvem_de_som`](https://github.com/JarbasAl/nuvem_de_som) | tracks, sets, profiles, search |
41
+
42
+ `media_archivist` is **metadata-only**: it indexes streams; it does not
43
+ download them. Pair it with [`yt-dlp`](https://github.com/yt-dlp/yt-dlp) (or
44
+ SoundCloud's `resolve_stream`, Bandcamp's `track.stream`) for on-demand
45
+ extraction, or use the JSON DB to drive dataset-collection scripts, recommender
46
+ experiments, OVOS skills, etc.
47
+
48
+ Ships as both a Python library and a `media-archivist` CLI.
49
+
50
+ ## Install
51
+
52
+ ```bash
53
+ pip install media_archivist # core (YouTube + IA + YT Music)
54
+ pip install media_archivist py_bandcamp # + Bandcamp backend (py_bandcamp)
55
+ pip install media_archivist nuvem_de_som # + SoundCloud backend (nuvem_de_som)
56
+ pip install "media_archivist[all]" # hub + server extras (huggingface_hub, fastapi, uvicorn)
57
+ ```
58
+
59
+ ## CLI
60
+
61
+ Every subcommand takes either:
62
+
63
+ - `--db-file PATH` — explicit path to a `.json` file (recommended for datasets
64
+ you want to commit alongside scripts), **or**
65
+ - `--db NAME` — auto-place under XDG at `~/.local/share/media_archivist/<NAME>.json`.
66
+
67
+ ```bash
68
+ # Index a channel, a playlist, or individual videos
69
+ media-archivist add --db-file talks.json https://www.youtube.com/@LinusTechTips
70
+ media-archivist add --db-file talks.json --blacklist "#shorts" \
71
+ https://www.youtube.com/playlist?list=PL...
72
+
73
+ # Browse the DB
74
+ media-archivist list --db-file talks.json --limit 20
75
+ media-archivist list --db-file talks.json --grep "review" --json
76
+ media-archivist stats --db-file talks.json
77
+
78
+ # Pair with yt-dlp — index once, download on demand
79
+ media-archivist urls --db-file talks.json --grep "tutorial" | yt-dlp -a -
80
+
81
+ # Drop dead videos / unwanted titles
82
+ media-archivist prune --db-file talks.json --unavailable --blacklist sponsor
83
+
84
+ # Background-monitor a set of URLs (re-syncs every --interval seconds)
85
+ media-archivist monitor --db-file talks.json --interval 600 \
86
+ https://www.youtube.com/@LinusTechTips \
87
+ https://www.youtube.com/@SomeOtherChannel
88
+
89
+ # Internet Archive
90
+ media-archivist add --db-file ia_movies.json --ia classic_cartoons
91
+ media-archivist urls --db-file ia_movies.json | xargs -n1 -P4 wget
92
+
93
+ # YouTube Music — rich track metadata (artist, album, year, duration, explicit)
94
+ media-archivist add --db-file songs.json --music --skip-explicit "lo-fi beats"
95
+ media-archivist add --db-file songs.json --music \
96
+ "https://music.youtube.com/playlist?list=PL..."
97
+
98
+ # Bandcamp — tracks have direct stream URLs in the entry
99
+ media-archivist add --db-file bandcamp.json --bandcamp \
100
+ "https://artistname.bandcamp.com/album/some-album"
101
+ media-archivist add --db-file bandcamp.json --bandcamp "ambient drone"
102
+
103
+ # SoundCloud — search, profile, or set URLs
104
+ media-archivist add --db-file sc.json --soundcloud \
105
+ "https://soundcloud.com/some-artist"
106
+ media-archivist add --db-file sc.json --soundcloud "footwork"
107
+ ```
108
+
109
+ Pick the backend with `--ia`, `--music`, `--bandcamp`, or `--soundcloud`
110
+ (default: YouTube). Every other subcommand (`list`, `export`, `urls`, `prune`,
111
+ `merge`, `stats`, …) works the same way against any backend's DB.
112
+
113
+ DBs are plain JSON — edit, back up, version-control, share. With `--db NAME` the
114
+ file is managed under XDG via
115
+ [`json_database`](https://github.com/OpenJarbas/json_database).
116
+
117
+ ## Building datasets
118
+
119
+ `media_archivist` is metadata-only: it indexes streams; downloads happen on
120
+ demand via `yt-dlp` (or any other tool that reads URLs). The `export`,
121
+ `import`, `merge`, and `stats` subcommands turn the JSON DB into a workable
122
+ dataset.
123
+
124
+ ```bash
125
+ # Build an index of three channels into one explicit file
126
+ media-archivist add --db-file documentaries.json \
127
+ https://www.youtube.com/@FreeDocumentary \
128
+ https://www.youtube.com/@FDSpace \
129
+ https://www.youtube.com/@FreeDocumentaryOcean
130
+
131
+ # Project specific fields → CSV (great for pandas / sklearn)
132
+ media-archivist export --db-file documentaries.json --format csv \
133
+ --fields videoId,title,url,published,tags,description \
134
+ -o documentaries.csv
135
+
136
+ # JSONL is the canonical "one-row-per-line" format for ML pipelines
137
+ media-archivist export --db-file documentaries.json --format jsonl \
138
+ -o documentaries.jsonl
139
+
140
+ # Just URLs (txt) for downstream tools
141
+ media-archivist export --db-file documentaries.json --format txt \
142
+ -o urls.txt
143
+
144
+ # Inspect coverage before training
145
+ media-archivist stats --db-file documentaries.json
146
+
147
+ # Merge per-topic indexes into a master dataset
148
+ media-archivist merge --db-file all_docs.json \
149
+ space.json ocean.json nature.json --overwrite
150
+
151
+ # Round-trip: import an existing JSONL produced elsewhere
152
+ media-archivist import --db-file talks.json talks.jsonl --overwrite
153
+ ```
154
+
155
+ ### Output formats
156
+
157
+ | `--format` | Use case |
158
+ | --- | --- |
159
+ | `jsonl` *(default)* | streaming pipelines, HuggingFace `datasets`, `jq` |
160
+ | `json` | small datasets, human inspection |
161
+ | `csv` | pandas, spreadsheets — list/dict fields auto-serialized to JSON strings |
162
+ | `txt` | flat URL list for `yt-dlp -a -` / `wget -i` / `xargs` |
163
+
164
+ Combine with `--fields` to project only what you need, `--grep` to filter by
165
+ title substring, and `--limit N` to cap row count.
166
+
167
+ ### Stored fields per video
168
+
169
+ | field | source |
170
+ | --- | --- |
171
+ | `videoId`, `url`, `title`, `thumbnail` | tutubo `Video` |
172
+ | `tags` | union of `Video.keywords` and inferred `Video.tags` |
173
+ | `is_live`, `published`, `views`, `description` | tutubo channel-grid metadata |
174
+ | `playlist` | only set when archived from a playlist |
175
+
176
+ See [`examples/`](./examples) for end-to-end dataset-creation scripts.
177
+
178
+ ## YouTube (library)
179
+
180
+ ```python
181
+ from media_archivist import YoutubeArchivist
182
+
183
+ archivist = YoutubeArchivist(
184
+ db_path="./talks.json", # explicit file (or use db_name="..." for XDG)
185
+ blacklisted_kwords=["#shorts", "trailer"],
186
+ required_kwords=[], # all must appear in the title
187
+ )
188
+
189
+ # Channel — handles /channel/, /c/, /@handle, /user/
190
+ archivist.archive("https://www.youtube.com/@LinusTechTips")
191
+
192
+ # Playlist
193
+ archivist.archive("https://www.youtube.com/playlist?list=PL...")
194
+
195
+ # Single video (watch / youtu.be / shorts URLs)
196
+ archivist.archive("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
197
+
198
+ # All playlists of a channel
199
+ archivist.archive_channel_playlists("https://www.youtube.com/@LinusTechTips")
200
+
201
+ # Drop entries whose videos are no longer reachable
202
+ archivist.remove_unavailable()
203
+
204
+ for entry in archivist.sorted_entries():
205
+ print(entry["title"], entry["url"])
206
+ ```
207
+
208
+ > **Note on duration:** tutubo's bare `Channel.videos` / `Playlist.videos`
209
+ > iterators don't expose track length, so `--min-duration` is a no-op for
210
+ > plain channel scrapes. It **does** apply when length is available — i.e.
211
+ > with `--music` (YT Music tracks), `--bandcamp`, `--soundcloud`, `--ia`,
212
+ > and YouTube search-result previews. `published` is a relative string
213
+ > ("2 days ago") rather than a timestamp.
214
+
215
+ ### Background monitor
216
+
217
+ ```python
218
+ from media_archivist import YoutubeMonitor
219
+
220
+ mon = YoutubeMonitor(db_name="my_channels")
221
+ mon.start()
222
+ mon.monitor("https://www.youtube.com/@LinusTechTips") # re-syncs every sync_interval
223
+ mon.sync("https://www.youtube.com/@SomeOtherChannel") # one-shot
224
+ ```
225
+
226
+ `YoutubeMonitor.bootstrap_from_url(url)` seeds an empty database from a remote
227
+ JSON dump — handy for distributing pre-built indexes.
228
+
229
+ ## YouTube Music (library)
230
+
231
+ ```python
232
+ from media_archivist import YoutubeMusicArchivist
233
+
234
+ m = YoutubeMusicArchivist(db_path="./songs.json", skip_explicit=True)
235
+ m.archive_search("lo-fi beats")
236
+ m.archive_playlist("https://music.youtube.com/playlist?list=PL...")
237
+ m.archive_album("MPREb_xxx") # browseId
238
+ m.archive_artist("UCxxx") # channelId
239
+ ```
240
+
241
+ Each entry includes `artist`, `album`, `year`, `duration` (seconds), `explicit`,
242
+ `video_type` (`MUSIC_VIDEO_TYPE_ATV` etc.), `audio_only`, `music_video`.
243
+
244
+ ## Bandcamp (library)
245
+
246
+ ```python
247
+ from media_archivist import BandcampArchivist
248
+
249
+ bc = BandcampArchivist(db_path="./bandcamp.json")
250
+ bc.archive("https://artist.bandcamp.com/album/some-album")
251
+ bc.archive_artist("https://artist.bandcamp.com")
252
+ bc.archive_search("ambient drone")
253
+ ```
254
+
255
+ Each entry stores `artist`, `album`, `track_number`, `duration` (seconds),
256
+ `thumbnail`, and **`stream`** (a direct audio URL when Bandcamp exposes one).
257
+
258
+ ## SoundCloud (library)
259
+
260
+ ```python
261
+ from media_archivist import SoundCloudArchivist
262
+
263
+ sc = SoundCloudArchivist(db_path="./sc.json", resolve_streams=True)
264
+ sc.archive("https://soundcloud.com/some-artist") # profile
265
+ sc.archive("https://soundcloud.com/some-artist/sets/some-set") # set
266
+ sc.archive_search("footwork")
267
+ ```
268
+
269
+ `resolve_streams=True` calls `nuvem_de_som`'s stream resolver per track and
270
+ stores the resulting MP3/HLS URL under `stream`.
271
+
272
+ ## Internet Archive (library)
273
+
274
+ ```python
275
+ from media_archivist import IAArchivist
276
+
277
+ ia = IAArchivist(db_path="./ia_movies.json")
278
+ ia.archive("classic_cartoons") # collection or single item id
279
+ ia.archive_item("Popeye_forPresident")
280
+ ```
281
+
282
+ Stream URLs are filtered to formats in `IAArchivist.VALID_FORMATS`
283
+ (`MPEG2`, `Ogg Video`, `512Kb MPEG4`, `h.264`).
284
+
285
+ ## Filtering helpers
286
+
287
+ All archivists inherit from `JsonArchivist`:
288
+
289
+ - `remove_keyword(kwords)` — drop entries whose title matches any keyword
290
+ - `remove_missing(keys)` — drop entries missing any of the given fields
291
+ - `remove_below_duration(minutes)` — drop entries shorter than N minutes
292
+ - `sorted_entries()` — entries sorted by `upload_ts` (descending)
293
+
294
+ ## Metadata providers
295
+
296
+ `media-archivist canonicalize` enriches indexed entries with external IDs
297
+ and structured metadata via the cross-source resolver in
298
+ [`metadatarr`](https://github.com/TigreGotico/metadatarr). The provider
299
+ registry, dispatcher, and ~24 built-in providers (MusicBrainz, Wikidata,
300
+ TMDB, AniList, Jikan, Google Books, LibriVox, Apple Podcasts, *arr family,
301
+ Discogs, Blu-ray.com, DVDCompare, OpenLibrary, Anna's Archive, Bandcamp,
302
+ SoundCloud, YouTube / YouTube Music, Metal Archives, …) all live in
303
+ metadatarr and self-register on import. See
304
+ [`docs/metadatarr.md`](docs/metadatarr.md) for the full table.
305
+
306
+ All resolver providers — including `metal_archives` — live in metadatarr.
307
+ There are no media-archivist-specific resolver providers.
308
+
309
+ The resolver gates providers on three independent axes: `media` (MediaType),
310
+ `modality` (PlaybackModality — AUDIO / VIDEO / TEXT / INTERACTIVE / UNKNOWN),
311
+ and `genre_filter` (genre tag set). Callers constructing `Signals` directly can
312
+ pass `modality=PlaybackModality.AUDIO` to restrict resolution to audio-only
313
+ providers. See [`docs/metadatarr.md`](docs/metadatarr.md#routing) for details.
314
+
315
+ ## License
316
+
317
+ Apache-2.0
@@ -0,0 +1,289 @@
1
+ # media_archivist
2
+
3
+ Cross-source media indexer. Builds a local JSON database of stream metadata
4
+ from YouTube, YouTube Music, Internet Archive, Bandcamp and SoundCloud.
5
+
6
+ | Backend | Library | What you can index |
7
+ | --- | --- | --- |
8
+ | **YouTube** | [`tutubo`](https://github.com/OpenJarbas/tutubo) | channels, playlists, videos (no API key) |
9
+ | **YouTube Music** | `tutubo.ytmus` (via `ytmusicapi`) | tracks, albums, artists, playlists |
10
+ | **Internet Archive** | `internetarchive` | items, collections |
11
+ | **Bandcamp** | [`py_bandcamp`](https://github.com/JarbasAl/py_bandcamp) | tracks, albums, artists, tag/search |
12
+ | **SoundCloud** | [`nuvem_de_som`](https://github.com/JarbasAl/nuvem_de_som) | tracks, sets, profiles, search |
13
+
14
+ `media_archivist` is **metadata-only**: it indexes streams; it does not
15
+ download them. Pair it with [`yt-dlp`](https://github.com/yt-dlp/yt-dlp) (or
16
+ SoundCloud's `resolve_stream`, Bandcamp's `track.stream`) for on-demand
17
+ extraction, or use the JSON DB to drive dataset-collection scripts, recommender
18
+ experiments, OVOS skills, etc.
19
+
20
+ Ships as both a Python library and a `media-archivist` CLI.
21
+
22
+ ## Install
23
+
24
+ ```bash
25
+ pip install media_archivist # core (YouTube + IA + YT Music)
26
+ pip install media_archivist py_bandcamp # + Bandcamp backend (py_bandcamp)
27
+ pip install media_archivist nuvem_de_som # + SoundCloud backend (nuvem_de_som)
28
+ pip install "media_archivist[all]" # hub + server extras (huggingface_hub, fastapi, uvicorn)
29
+ ```
30
+
31
+ ## CLI
32
+
33
+ Every subcommand takes either:
34
+
35
+ - `--db-file PATH` — explicit path to a `.json` file (recommended for datasets
36
+ you want to commit alongside scripts), **or**
37
+ - `--db NAME` — auto-place under XDG at `~/.local/share/media_archivist/<NAME>.json`.
38
+
39
+ ```bash
40
+ # Index a channel, a playlist, or individual videos
41
+ media-archivist add --db-file talks.json https://www.youtube.com/@LinusTechTips
42
+ media-archivist add --db-file talks.json --blacklist "#shorts" \
43
+ https://www.youtube.com/playlist?list=PL...
44
+
45
+ # Browse the DB
46
+ media-archivist list --db-file talks.json --limit 20
47
+ media-archivist list --db-file talks.json --grep "review" --json
48
+ media-archivist stats --db-file talks.json
49
+
50
+ # Pair with yt-dlp — index once, download on demand
51
+ media-archivist urls --db-file talks.json --grep "tutorial" | yt-dlp -a -
52
+
53
+ # Drop dead videos / unwanted titles
54
+ media-archivist prune --db-file talks.json --unavailable --blacklist sponsor
55
+
56
+ # Background-monitor a set of URLs (re-syncs every --interval seconds)
57
+ media-archivist monitor --db-file talks.json --interval 600 \
58
+ https://www.youtube.com/@LinusTechTips \
59
+ https://www.youtube.com/@SomeOtherChannel
60
+
61
+ # Internet Archive
62
+ media-archivist add --db-file ia_movies.json --ia classic_cartoons
63
+ media-archivist urls --db-file ia_movies.json | xargs -n1 -P4 wget
64
+
65
+ # YouTube Music — rich track metadata (artist, album, year, duration, explicit)
66
+ media-archivist add --db-file songs.json --music --skip-explicit "lo-fi beats"
67
+ media-archivist add --db-file songs.json --music \
68
+ "https://music.youtube.com/playlist?list=PL..."
69
+
70
+ # Bandcamp — tracks have direct stream URLs in the entry
71
+ media-archivist add --db-file bandcamp.json --bandcamp \
72
+ "https://artistname.bandcamp.com/album/some-album"
73
+ media-archivist add --db-file bandcamp.json --bandcamp "ambient drone"
74
+
75
+ # SoundCloud — search, profile, or set URLs
76
+ media-archivist add --db-file sc.json --soundcloud \
77
+ "https://soundcloud.com/some-artist"
78
+ media-archivist add --db-file sc.json --soundcloud "footwork"
79
+ ```
80
+
81
+ Pick the backend with `--ia`, `--music`, `--bandcamp`, or `--soundcloud`
82
+ (default: YouTube). Every other subcommand (`list`, `export`, `urls`, `prune`,
83
+ `merge`, `stats`, …) works the same way against any backend's DB.
84
+
85
+ DBs are plain JSON — edit, back up, version-control, share. With `--db NAME` the
86
+ file is managed under XDG via
87
+ [`json_database`](https://github.com/OpenJarbas/json_database).
88
+
89
+ ## Building datasets
90
+
91
+ `media_archivist` is metadata-only: it indexes streams; downloads happen on
92
+ demand via `yt-dlp` (or any other tool that reads URLs). The `export`,
93
+ `import`, `merge`, and `stats` subcommands turn the JSON DB into a workable
94
+ dataset.
95
+
96
+ ```bash
97
+ # Build an index of three channels into one explicit file
98
+ media-archivist add --db-file documentaries.json \
99
+ https://www.youtube.com/@FreeDocumentary \
100
+ https://www.youtube.com/@FDSpace \
101
+ https://www.youtube.com/@FreeDocumentaryOcean
102
+
103
+ # Project specific fields → CSV (great for pandas / sklearn)
104
+ media-archivist export --db-file documentaries.json --format csv \
105
+ --fields videoId,title,url,published,tags,description \
106
+ -o documentaries.csv
107
+
108
+ # JSONL is the canonical "one-row-per-line" format for ML pipelines
109
+ media-archivist export --db-file documentaries.json --format jsonl \
110
+ -o documentaries.jsonl
111
+
112
+ # Just URLs (txt) for downstream tools
113
+ media-archivist export --db-file documentaries.json --format txt \
114
+ -o urls.txt
115
+
116
+ # Inspect coverage before training
117
+ media-archivist stats --db-file documentaries.json
118
+
119
+ # Merge per-topic indexes into a master dataset
120
+ media-archivist merge --db-file all_docs.json \
121
+ space.json ocean.json nature.json --overwrite
122
+
123
+ # Round-trip: import an existing JSONL produced elsewhere
124
+ media-archivist import --db-file talks.json talks.jsonl --overwrite
125
+ ```
126
+
127
+ ### Output formats
128
+
129
+ | `--format` | Use case |
130
+ | --- | --- |
131
+ | `jsonl` *(default)* | streaming pipelines, HuggingFace `datasets`, `jq` |
132
+ | `json` | small datasets, human inspection |
133
+ | `csv` | pandas, spreadsheets — list/dict fields auto-serialized to JSON strings |
134
+ | `txt` | flat URL list for `yt-dlp -a -` / `wget -i` / `xargs` |
135
+
136
+ Combine with `--fields` to project only what you need, `--grep` to filter by
137
+ title substring, and `--limit N` to cap row count.
138
+
139
+ ### Stored fields per video
140
+
141
+ | field | source |
142
+ | --- | --- |
143
+ | `videoId`, `url`, `title`, `thumbnail` | tutubo `Video` |
144
+ | `tags` | union of `Video.keywords` and inferred `Video.tags` |
145
+ | `is_live`, `published`, `views`, `description` | tutubo channel-grid metadata |
146
+ | `playlist` | only set when archived from a playlist |
147
+
148
+ See [`examples/`](./examples) for end-to-end dataset-creation scripts.
149
+
150
+ ## YouTube (library)
151
+
152
+ ```python
153
+ from media_archivist import YoutubeArchivist
154
+
155
+ archivist = YoutubeArchivist(
156
+ db_path="./talks.json", # explicit file (or use db_name="..." for XDG)
157
+ blacklisted_kwords=["#shorts", "trailer"],
158
+ required_kwords=[], # all must appear in the title
159
+ )
160
+
161
+ # Channel — handles /channel/, /c/, /@handle, /user/
162
+ archivist.archive("https://www.youtube.com/@LinusTechTips")
163
+
164
+ # Playlist
165
+ archivist.archive("https://www.youtube.com/playlist?list=PL...")
166
+
167
+ # Single video (watch / youtu.be / shorts URLs)
168
+ archivist.archive("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
169
+
170
+ # All playlists of a channel
171
+ archivist.archive_channel_playlists("https://www.youtube.com/@LinusTechTips")
172
+
173
+ # Drop entries whose videos are no longer reachable
174
+ archivist.remove_unavailable()
175
+
176
+ for entry in archivist.sorted_entries():
177
+ print(entry["title"], entry["url"])
178
+ ```
179
+
180
+ > **Note on duration:** tutubo's bare `Channel.videos` / `Playlist.videos`
181
+ > iterators don't expose track length, so `--min-duration` is a no-op for
182
+ > plain channel scrapes. It **does** apply when length is available — i.e.
183
+ > with `--music` (YT Music tracks), `--bandcamp`, `--soundcloud`, `--ia`,
184
+ > and YouTube search-result previews. `published` is a relative string
185
+ > ("2 days ago") rather than a timestamp.
186
+
187
+ ### Background monitor
188
+
189
+ ```python
190
+ from media_archivist import YoutubeMonitor
191
+
192
+ mon = YoutubeMonitor(db_name="my_channels")
193
+ mon.start()
194
+ mon.monitor("https://www.youtube.com/@LinusTechTips") # re-syncs every sync_interval
195
+ mon.sync("https://www.youtube.com/@SomeOtherChannel") # one-shot
196
+ ```
197
+
198
+ `YoutubeMonitor.bootstrap_from_url(url)` seeds an empty database from a remote
199
+ JSON dump — handy for distributing pre-built indexes.
200
+
201
+ ## YouTube Music (library)
202
+
203
+ ```python
204
+ from media_archivist import YoutubeMusicArchivist
205
+
206
+ m = YoutubeMusicArchivist(db_path="./songs.json", skip_explicit=True)
207
+ m.archive_search("lo-fi beats")
208
+ m.archive_playlist("https://music.youtube.com/playlist?list=PL...")
209
+ m.archive_album("MPREb_xxx") # browseId
210
+ m.archive_artist("UCxxx") # channelId
211
+ ```
212
+
213
+ Each entry includes `artist`, `album`, `year`, `duration` (seconds), `explicit`,
214
+ `video_type` (`MUSIC_VIDEO_TYPE_ATV` etc.), `audio_only`, `music_video`.
215
+
216
+ ## Bandcamp (library)
217
+
218
+ ```python
219
+ from media_archivist import BandcampArchivist
220
+
221
+ bc = BandcampArchivist(db_path="./bandcamp.json")
222
+ bc.archive("https://artist.bandcamp.com/album/some-album")
223
+ bc.archive_artist("https://artist.bandcamp.com")
224
+ bc.archive_search("ambient drone")
225
+ ```
226
+
227
+ Each entry stores `artist`, `album`, `track_number`, `duration` (seconds),
228
+ `thumbnail`, and **`stream`** (a direct audio URL when Bandcamp exposes one).
229
+
230
+ ## SoundCloud (library)
231
+
232
+ ```python
233
+ from media_archivist import SoundCloudArchivist
234
+
235
+ sc = SoundCloudArchivist(db_path="./sc.json", resolve_streams=True)
236
+ sc.archive("https://soundcloud.com/some-artist") # profile
237
+ sc.archive("https://soundcloud.com/some-artist/sets/some-set") # set
238
+ sc.archive_search("footwork")
239
+ ```
240
+
241
+ `resolve_streams=True` calls `nuvem_de_som`'s stream resolver per track and
242
+ stores the resulting MP3/HLS URL under `stream`.
243
+
244
+ ## Internet Archive (library)
245
+
246
+ ```python
247
+ from media_archivist import IAArchivist
248
+
249
+ ia = IAArchivist(db_path="./ia_movies.json")
250
+ ia.archive("classic_cartoons") # collection or single item id
251
+ ia.archive_item("Popeye_forPresident")
252
+ ```
253
+
254
+ Stream URLs are filtered to formats in `IAArchivist.VALID_FORMATS`
255
+ (`MPEG2`, `Ogg Video`, `512Kb MPEG4`, `h.264`).
256
+
257
+ ## Filtering helpers
258
+
259
+ All archivists inherit from `JsonArchivist`:
260
+
261
+ - `remove_keyword(kwords)` — drop entries whose title matches any keyword
262
+ - `remove_missing(keys)` — drop entries missing any of the given fields
263
+ - `remove_below_duration(minutes)` — drop entries shorter than N minutes
264
+ - `sorted_entries()` — entries sorted by `upload_ts` (descending)
265
+
266
+ ## Metadata providers
267
+
268
+ `media-archivist canonicalize` enriches indexed entries with external IDs
269
+ and structured metadata via the cross-source resolver in
270
+ [`metadatarr`](https://github.com/TigreGotico/metadatarr). The provider
271
+ registry, dispatcher, and ~24 built-in providers (MusicBrainz, Wikidata,
272
+ TMDB, AniList, Jikan, Google Books, LibriVox, Apple Podcasts, *arr family,
273
+ Discogs, Blu-ray.com, DVDCompare, OpenLibrary, Anna's Archive, Bandcamp,
274
+ SoundCloud, YouTube / YouTube Music, Metal Archives, …) all live in
275
+ metadatarr and self-register on import. See
276
+ [`docs/metadatarr.md`](docs/metadatarr.md) for the full table.
277
+
278
+ All resolver providers — including `metal_archives` — live in metadatarr.
279
+ There are no media-archivist-specific resolver providers.
280
+
281
+ The resolver gates providers on three independent axes: `media` (MediaType),
282
+ `modality` (PlaybackModality — AUDIO / VIDEO / TEXT / INTERACTIVE / UNKNOWN),
283
+ and `genre_filter` (genre tag set). Callers constructing `Signals` directly can
284
+ pass `modality=PlaybackModality.AUDIO` to restrict resolution to audio-only
285
+ providers. See [`docs/metadatarr.md`](docs/metadatarr.md#routing) for details.
286
+
287
+ ## License
288
+
289
+ Apache-2.0
@@ -0,0 +1,29 @@
"""Public entry points of the ``media_archivist`` package.

Re-exports the archivist classes, ``Index``, the package exceptions and
``__version__`` so callers can import them from ``media_archivist``
directly.

``BandcampArchivist`` and ``SoundCloudArchivist`` are optional: each name is
always bound, but it is ``None`` when the corresponding backend module could
not be imported.  Callers should check for ``None`` before using them.
"""
import logging

from media_archivist.exceptions import MediaArchivistError, VideoUnavailable
from media_archivist.index import Index
from media_archivist.ia import IAArchivist
from media_archivist.music import YoutubeMusicArchivist
from media_archivist.version import __version__
from media_archivist.youtube import YoutubeArchivist, YoutubeMonitor

_LOG = logging.getLogger(__name__)

# Optional backends — only loaded if their underlying client is installed.
try:
    from media_archivist.bandcamp import BandcampArchivist  # noqa: F401
except ImportError:  # pragma: no cover
    # Expected case: the optional client library is not installed.
    BandcampArchivist = None  # type: ignore
except Exception:  # pragma: no cover
    # Stay best-effort (importing the package must never fail because of an
    # optional backend), but leave a traceback so a real bug in the backend
    # is not mistaken for a missing dependency.
    _LOG.warning("Bandcamp backend failed to load; BandcampArchivist "
                 "is unavailable", exc_info=True)
    BandcampArchivist = None  # type: ignore
try:
    from media_archivist.soundcloud import SoundCloudArchivist  # noqa: F401
except ImportError:  # pragma: no cover
    # Expected case: the optional client library is not installed.
    SoundCloudArchivist = None  # type: ignore
except Exception:  # pragma: no cover
    # Same best-effort policy as for Bandcamp above.
    _LOG.warning("SoundCloud backend failed to load; SoundCloudArchivist "
                 "is unavailable", exc_info=True)
    SoundCloudArchivist = None  # type: ignore

__all__ = [
    "YoutubeArchivist",
    "YoutubeMonitor",
    "YoutubeMusicArchivist",
    "IAArchivist",
    "BandcampArchivist",
    "SoundCloudArchivist",
    "Index",
    "MediaArchivistError",
    "VideoUnavailable",
    "__version__",
]