youtube-to-docs 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+
9
+ jobs:
10
+ build:
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - name: Install uv
17
+ uses: astral-sh/setup-uv@v5
18
+
19
+ - name: Set up Python
20
+ run: uv python install
21
+
22
+ - name: Create virtual environment
23
+ run: uv venv
24
+
25
+ - name: Install dependencies
26
+ run: uv pip install -r requirements.txt
27
+
28
+ - name: Ruff Lint
29
+ run: uv tool run ruff check .
30
+
31
+ - name: Ruff Format
32
+ run: uv tool run ruff format --check .
33
+
34
+ - name: Type Check
35
+ run: uv tool run ty check
36
+
37
+ - name: Run Tests
38
+ run: uv run --with-requirements requirements.txt pytest
@@ -0,0 +1,215 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ youtube-docs.csv
210
+ transcript-files/
211
+ summary-files/
212
+
213
+ implementation_plan.md
214
+
215
+ uv.lock
@@ -0,0 +1,30 @@
1
+ repos:
2
+ - repo: local
3
+ hooks:
4
+ - id: ruff-format
5
+ name: Ruff Format
6
+ entry: uv tool run ruff format
7
+ language: system
8
+ types: [python]
9
+ args: ["."]
10
+
11
+ - id: ruff-check
12
+ name: Ruff Lint
13
+ entry: uv tool run ruff check
14
+ language: system
15
+ types: [python]
16
+ args: ["--fix", "."]
17
+
18
+ - id: ty-check
19
+ name: Type Check (Ty)
20
+ entry: uv tool run ty check
21
+ language: system
22
+ types: [python]
23
+ pass_filenames: false
24
+
25
+ - id: pytest
26
+ name: Run Tests
27
+ entry: uv run --with-requirements requirements.txt pytest
28
+ language: system
29
+ types: [python]
30
+ pass_filenames: false
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 DoIT - Artifical Intelligence
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,23 @@
1
+ Metadata-Version: 2.4
2
+ Name: youtube-to-docs
3
+ Version: 0.0.1
4
+ Summary: Convert YouTube videos to docs/sheets for discoverability
5
+ Requires-Python: >=3.14
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: google-auth>=2.45.0
9
+ Requires-Dist: google-genai>=1.56.0
10
+ Requires-Dist: google-api-python-client>=2.187.0
11
+ Requires-Dist: isodate>=0.7.2
12
+ Requires-Dist: openai>=1.56.0
13
+ Requires-Dist: polars>=1.36.1
14
+ Requires-Dist: requests>=2.32.5
15
+ Requires-Dist: youtube-transcript-api>=1.2.3
16
+ Dynamic: license-file
17
+
18
+ # youtube-to-docs
19
+ Convert YouTube videos to docs/sheets for discoverability.
20
+
21
+
22
+
23
+ *Created with the help of AI. All artifacts have been checked and work as expected.*
@@ -0,0 +1,6 @@
1
+ # youtube-to-docs
2
+ Convert YouTube videos to docs/sheets for discoverability.
3
+
4
+
5
+
6
+ *Created with the help of AI. All artifacts have been checked and work as expected.*
@@ -0,0 +1,102 @@
1
+ # Development Guide
2
+
3
+ ## Prerequisites
4
+
5
+ - Python 3.14 or higher
6
+ - `uv` (recommended) or `pip`
7
+
8
+ ## Installation
9
+
10
+ 1. Install dependencies:
11
+ ```bash
12
+ uv pip install -r requirements.txt
13
+ # OR
14
+ pip install -r requirements.txt
15
+ ```
16
+
17
+ ## Running Tests
18
+
19
+ We use `pytest` for testing.
20
+
21
+ ### Using `uv` (Recommended)
22
+
23
+ To run tests with all dependencies automatically handled:
24
+
25
+ ```bash
26
+ uv run --with-requirements requirements.txt pytest
27
+ ```
28
+
29
+ ## Project Structure
30
+
31
+ - `youtube_to_docs/main.py`: Main application script.
32
+ - `tests/`: Directory containing test files.
33
+ - `requirements.txt`: Python package dependencies.
34
+
35
+ ## Tooling
36
+
37
+ This project uses modern Python tooling for code quality:
38
+
39
+ - **Ruff**: For linting and code formatting.
40
+ - **Ty**: For static type checking.
41
+
42
+ These tools are configured in `pyproject.toml`.
43
+
44
+ ### Running Ruff
45
+
46
+ To check for linting errors:
47
+
48
+ ```bash
49
+ uv tool run ruff check .
50
+ ```
51
+
52
+ To fix fixable linting errors automatically:
53
+
54
+ ```bash
55
+ uv tool run ruff check --fix .
56
+ ```
57
+
58
+ To format the code:
59
+
60
+ ```bash
61
+ uv tool run ruff format .
62
+ ```
63
+
64
+ ### Running Ty (Type Checking)
65
+
66
+ To run type checks:
67
+
68
+ ```bash
69
+ uv tool run ty check
70
+ ```
71
+
72
+ ### Pre-commit
73
+
74
+ To run the pre-commit hooks against all files:
75
+
76
+ ```bash
77
+ uv tool run pre-commit run --all-files
78
+ ```
79
+
80
+
81
+ ## Release to PyPI
82
+
83
+ To publish a new version of the package to PyPI, follow these steps:
84
+
85
+ 1. **Build the package**:
86
+ This will create a `dist/` directory with the distribution files.
87
+ ```bash
88
+ uv tool run --from build pyproject-build
89
+ ```
90
+
91
+ 2. **Upload to PyPI**:
92
+ Use `twine` to upload the distribution files.
93
+
94
+ If your `.pypirc` is already configured with your API key:
95
+ ```bash
96
+ uv tool run twine upload --repository testpypi dist/*
97
+ ```
98
+
99
+ Or, to explicitly use your `PYPI_API_KEY` environment variable (PowerShell syntax shown; in bash use `$PYPI_API_KEY` instead of `$env:PYPI_API_KEY`):
100
+ ```bash
101
+ uv tool run twine upload -u __token__ -p $env:PYPI_API_KEY dist/*
102
+ ```
@@ -0,0 +1,52 @@
1
+ [build-system]
2
+ requires = ["setuptools"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "youtube-to-docs"
7
+ version = "0.0.1"
8
+ description = "Convert YouTube videos to docs/sheets for discoverability"
9
+ readme = "README.md"
10
+ requires-python = ">=3.14"
11
+ dependencies = [
12
+ "google-auth>=2.45.0",
13
+ "google-genai>=1.56.0",
14
+ "google-api-python-client>=2.187.0",
15
+ "isodate>=0.7.2",
16
+ "openai>=1.56.0",
17
+ "polars>=1.36.1",
18
+ "requests>=2.32.5",
19
+ "youtube-transcript-api>=1.2.3",
20
+ ]
21
+
22
+ [tool.setuptools.packages.find]
23
+ include = ["youtube_to_docs*"]
24
+ exclude = ["tests*", "docs*", "summary-files*", "transcript-files*"]
25
+
26
+ [tool.ruff]
27
+ # Match the project's Python version requirement
28
+ target-version = "py314"
29
+ line-length = 88
30
+
31
+ [tool.ruff.lint]
32
+ # Enable Pyflakes (`F`), pycodestyle (`E`, `W`), and isort (`I`)
33
+ select = ["E", "F", "I", "W"]
34
+ ignore = []
35
+
36
+ # Allow fixes for all enabled rules (when `--fix` is provided).
37
+ fixable = ["ALL"]
38
+ unfixable = []
39
+
40
+ [tool.ruff.format]
41
+ # Use double quotes for strings.
42
+ quote-style = "double"
43
+ # Indent with spaces, rather than tabs.
44
+ indent-style = "space"
45
+ # Respect magic trailing commas.
46
+ skip-magic-trailing-comma = false
47
+ # Automatically detect the appropriate line ending.
48
+ line-ending = "auto"
49
+
50
+ [tool.ty.environment]
51
+ # Target Python 3.14 to match project requirements
52
+ python-version = "3.14"
@@ -0,0 +1,9 @@
1
+ google-auth>=2.45.0
2
+ google-genai>=1.56.0
3
+ google-api-python-client>=2.187.0
4
+ isodate>=0.7.2
5
+ openai>=1.56.0
6
+ polars>=1.36.1
7
+ requests>=2.32.5
8
+ youtube-transcript-api>=1.2.3
9
+ pytest
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,196 @@
1
+ import os
2
+ import unittest
3
+ from unittest.mock import MagicMock, patch
4
+
5
+ from youtube_to_docs import main
6
+
7
+
8
class TestYoutubeToDocs(unittest.TestCase):
    """Tests for ``youtube_to_docs.main`` that run against mocked services."""

    def setUp(self):
        # Inject fake credentials so each provider code path finds the
        # environment variables it looks up.
        fake_env = {
            "YOUTUBE_DATA_API_KEY": "fake_youtube_key",
            "GEMINI_API_KEY": "fake_gemini_key",
            "PROJECT_ID": "fake_project_id",
            "AWS_BEARER_TOKEN_BEDROCK": "fake_bedrock_token",
            "AZURE_FOUNDRY_ENDPOINT": "https://fake.openai.azure.com/",
            "AZURE_FOUNDRY_API_KEY": "fake_foundry_key",
        }
        self.env_patcher = patch.dict(os.environ, fake_env)
        self.env_patcher.start()

    def tearDown(self):
        # Restore the environment that was active before setUp ran.
        self.env_patcher.stop()

    @patch("youtube_to_docs.main.build")
    def test_get_youtube_service(self, mock_build):
        """The service is built with the API key from the environment."""
        self.assertIsNotNone(main.get_youtube_service())
        mock_build.assert_called_with("youtube", "v3", developerKey="fake_youtube_key")

    def test_get_youtube_service_no_key(self):
        """Without the API key no service is returned."""
        with patch.dict(os.environ, {}, clear=True):
            self.assertIsNone(main.get_youtube_service())

    def test_resolve_video_ids_single(self):
        """An 11-character ID resolves to itself."""
        self.assertEqual(main.resolve_video_ids("KuPc06JgI_A", None), ["KuPc06JgI_A"])

    def test_resolve_video_ids_list(self):
        """A comma-separated string resolves to the individual IDs."""
        resolved = main.resolve_video_ids("KuPc06JgI_A,GalhDyf3F8g", None)
        self.assertEqual(resolved, ["KuPc06JgI_A", "GalhDyf3F8g"])

    def test_resolve_video_ids_playlist_no_service(self):
        """Expanding a playlist without a service exits the program."""
        with self.assertRaises(SystemExit):
            main.resolve_video_ids("PL8ZxoInteClyHaiReuOHpv6Z4SPrXtYtW", None)

    @patch("youtube_to_docs.main.build")
    def test_resolve_video_ids_playlist(self, mock_build):
        """A playlist ID is expanded into the video IDs of its items."""
        playlist_page = {
            "items": [
                {"contentDetails": {"videoId": "vid1"}},
                {"contentDetails": {"videoId": "vid2"}},
            ]
        }
        fake_service = MagicMock()
        playlist_items = fake_service.playlistItems()
        playlist_items.list.return_value.execute.return_value = playlist_page
        # Returning None from list_next ends the pagination loop.
        playlist_items.list_next.return_value = None

        self.assertEqual(
            main.resolve_video_ids("PL123", fake_service), ["vid1", "vid2"]
        )

    @patch("youtube_to_docs.main.build")
    def test_resolve_video_ids_channel_handle(self, mock_build):
        """A channel handle is resolved through its uploads playlist."""
        channel_page = {
            "items": [{"contentDetails": {"relatedPlaylists": {"uploads": "UU123"}}}]
        }
        uploads_page = {
            "items": [{"contentDetails": {"videoId": "vid_from_channel"}}]
        }
        fake_service = MagicMock()

        # First lookup: the channel, which yields the uploads playlist ID.
        channel_list = fake_service.channels().list
        channel_list.return_value.execute.return_value = channel_page

        # Second lookup: the items of that uploads playlist.
        playlist_items = fake_service.playlistItems()
        playlist_items.list.return_value.execute.return_value = uploads_page
        playlist_items.list_next.return_value = None

        resolved = main.resolve_video_ids("@channel", fake_service)
        self.assertEqual(resolved, ["vid_from_channel"])

    def test_get_video_details_none(self):
        """Without a service only the URL field is populated."""
        expected = ("", "", "", "", "", "", "https://www.youtube.com/watch?v=vid1")
        self.assertEqual(main.get_video_details("vid1", None), expected)

    def test_get_video_details_success(self):
        """Metadata is extracted and the ISO 8601 duration is converted."""
        video_item = {
            "snippet": {
                "title": "Test Video",
                "description": "Desc",
                "publishedAt": "2023-01-01",
                "channelTitle": "Test Channel",
                "tags": ["tag1", "tag2"],
            },
            "contentDetails": {"duration": "PT1M10S"},
        }
        fake_service = MagicMock()
        videos_list = fake_service.videos().list
        videos_list.return_value.execute.return_value = {"items": [video_item]}

        details = main.get_video_details("vid1", fake_service)
        self.assertIsNotNone(details)
        assert details is not None
        self.assertEqual(details[0], "Test Video")
        self.assertEqual(details[5], "0:01:10")  # Duration

    @patch("youtube_to_docs.main.ytt_api")
    def test_fetch_transcript(self, mock_ytt_api):
        """Transcript snippets are joined with single spaces."""
        fetched = mock_ytt_api.fetch.return_value
        fetched.to_raw_data.return_value = [{"text": "Hello"}, {"text": "world"}]

        self.assertEqual(main.fetch_transcript("vid1"), "Hello world")

    @patch("youtube_to_docs.main.ytt_api")
    def test_fetch_transcript_error(self, mock_ytt_api):
        """A failing transcript fetch yields None."""
        mock_ytt_api.fetch.side_effect = Exception("Transcript disabled")
        self.assertIsNone(main.fetch_transcript("vid1"))

    @patch("youtube_to_docs.main.genai.Client")
    def test_generate_summary_gemini(self, mock_client_cls):
        """Gemini models return the text of the generated content."""
        generate = mock_client_cls.return_value.models.generate_content
        generate.return_value.text = "Gemini Summary"

        result = main.generate_summary("gemini-pro", "transcript", "Title", "url")
        self.assertEqual(result, "Gemini Summary")

    @patch("youtube_to_docs.main.requests.post")
    @patch("youtube_to_docs.main.google.auth.default")
    def test_generate_summary_vertex(self, mock_auth, mock_post):
        """Vertex models return the first content block's text."""
        fake_creds = MagicMock()
        fake_creds.token = "fake_token"
        mock_auth.return_value = (fake_creds, "proj")

        http_response = mock_post.return_value
        http_response.status_code = 200
        http_response.json.return_value = {"content": [{"text": "Vertex Summary"}]}

        result = main.generate_summary(
            "vertex-claude-3-5", "transcript", "Title", "url"
        )
        self.assertEqual(result, "Vertex Summary")

    @patch("youtube_to_docs.main.requests.post")
    def test_generate_summary_bedrock(self, mock_post):
        """Bedrock models return the text nested under output.message.content."""
        http_response = mock_post.return_value
        http_response.status_code = 200
        http_response.json.return_value = {
            "output": {"message": {"content": [{"text": "Bedrock Summary"}]}}
        }

        result = main.generate_summary(
            "bedrock-claude-3-5", "transcript", "Title", "url"
        )
        self.assertEqual(result, "Bedrock Summary")

    @patch("youtube_to_docs.main.OpenAI")
    def test_generate_summary_foundry(self, mock_openai):
        """Foundry models return the first choice's message content."""
        create = mock_openai.return_value.chat.completions.create
        create.return_value.choices[0].message.content = "Foundry Summary"

        result = main.generate_summary("foundry-gpt-4", "transcript", "Title", "url")
        self.assertEqual(result, "Foundry Summary")
193
+
194
+
195
+ if __name__ == "__main__":
196
+ unittest.main()
File without changes
@@ -0,0 +1,506 @@
1
+ # /// script
2
+ # requires-python = ">=3.14"
3
+ # dependencies = [
4
+ # "google-auth>=2.45.0",
5
+ # "google-genai>=1.56.0",
6
+ # "google-api-python-client>=2.187.0",
7
+ # "isodate>=0.7.2",
8
+ # "openai>=1.56.0",
9
+ # "polars>=1.36.1",
10
+ # "requests>=2.32.5",
11
+ # "youtube-transcript-api>=1.2.3"
12
+ # ]
13
+ # ///
14
+ #
15
+ # Run as:
16
+ # uv run https://raw.githubusercontent.com/DoIT-Artifical-Intelligence/youtube-to-docs/refs/heads/main/youtube_to_docs/main.py --model gemini-3-flash-preview # noqa
17
+ # To test locally run one of:
18
+ # uv run youtube_to_docs/main.py --model gemini-3-flash-preview
19
+ # uv run youtube_to_docs/main.py --model vertex-claude-haiku-4-5@20251001
20
+ # uv run youtube_to_docs/main.py --model bedrock-claude-haiku-4-5-20251001-v1
21
+ # uv run youtube_to_docs/main.py --model bedrock-nova-2-lite-v1
22
+ # uv run youtube_to_docs/main.py --model bedrock-claude-haiku-4-5-20251001
23
+ # uv run youtube_to_docs/main.py --model foundry-gpt-5-mini
24
+
25
+
26
+ import argparse
27
+ import os
28
+ import re
29
+ import sys
30
+ import time
31
+ from typing import Any, List, Optional, Tuple, cast
32
+
33
+ import google.auth
34
+ import isodate
35
+ import polars as pl
36
+ import requests
37
+ from google import genai
38
+ from google.auth.transport.requests import Request as GoogleAuthRequest
39
+ from google.genai import types
40
+ from googleapiclient.discovery import Resource, build
41
+ from openai import OpenAI
42
+ from youtube_transcript_api import YouTubeTranscriptApi
43
+
44
+ # Global instance for transcript API
45
+ ytt_api = YouTubeTranscriptApi()
46
+
47
+
48
def get_youtube_service() -> Optional[Resource]:
    """Build and return the YouTube Data API v3 service.

    The API key is read from the ``YOUTUBE_DATA_API_KEY`` environment variable.

    Returns:
        The object returned by ``build("youtube", "v3", ...)``, or ``None``
        (after printing a warning) when the environment variable is not set.
    """
    # Guard only the environment lookup so that a KeyError raised inside
    # ``build`` is not misreported as a missing API key.
    try:
        api_key = os.environ["YOUTUBE_DATA_API_KEY"]
    except KeyError:
        print(
            "Warning: YOUTUBE_DATA_API_KEY not found. Playlist and Channel expansion "
            "will fail."
        )
        return None
    return build("youtube", "v3", developerKey=api_key)
59
+
60
+
61
def resolve_video_ids(
    video_id_input: str, youtube_service: Optional[Resource]
) -> List[str]:
    """Resolve the input into a list of video IDs.

    Args:
        video_id_input: One of: a single 11-character video ID, a
            comma-separated list of video IDs, a playlist ID starting with
            ``PL`` or ``UU``, or a channel handle starting with ``@``.
        youtube_service: YouTube Data API service used for playlist and
            channel lookups; may be ``None`` for plain video IDs.

    Returns:
        The resolved video IDs. The list is empty when the input matches none
        of the recognised forms.

    Raises:
        SystemExit: If a playlist or channel handle is given without a
            service, or no channel is found for the handle.
    """
    video_id_input = video_id_input.strip()
    video_ids: List[str] = []

    # Handle Channel Handles (e.g. @channelname)
    if video_id_input.startswith("@"):
        if not youtube_service:
            print("Error: YOUTUBE_DATA_API_KEY is required to resolve channel handles.")
            sys.exit(1)
        service = cast(Any, youtube_service)
        print(f"Resolving channel handle: {video_id_input}...")
        request = service.channels().list(
            part="contentDetails", forHandle=video_id_input
        )
        response = request.execute()
        # ``.get`` treats a response without an "items" key like an empty one.
        if not response.get("items"):
            print(f"Error: No channel found for handle {video_id_input}")
            sys.exit(1)
        # Continue below with the channel's 'uploads' playlist ID.
        video_id_input = response["items"][0]["contentDetails"]["relatedPlaylists"][
            "uploads"
        ]
        print(f"Found uploads playlist: {video_id_input}")

    # Single video (standard ID length is 11)
    if len(video_id_input) == 11 and "," not in video_id_input:
        video_ids = [video_id_input]
    # List of videos: tolerate spaces around commas and stray/trailing commas.
    elif "," in video_id_input:
        video_ids = [
            part.strip() for part in video_id_input.split(",") if part.strip()
        ]
    # Playlist (Standard 'PL' or Uploads 'UU')
    elif video_id_input.startswith(("PL", "UU")):
        if not youtube_service:
            print("Error: YOUTUBE_DATA_API_KEY is required for playlists.")
            sys.exit(1)
        service = cast(Any, youtube_service)
        request = service.playlistItems().list(
            part="contentDetails", playlistId=video_id_input, maxResults=50
        )
        # ``list_next`` returns a falsy value once there are no more pages.
        while request:
            response = request.execute()
            video_ids.extend(
                item["contentDetails"]["videoId"] for item in response.get("items", [])
            )
            request = service.playlistItems().list_next(request, response)
    else:
        print(
            f"Warning: Could not interpret '{video_id_input}' as a video ID, "
            "video ID list, playlist ID, or channel handle."
        )

    return video_ids
112
+
113
+
114
def get_video_details(
    video_id: str, youtube_service: Optional[Resource]
) -> Optional[Tuple[str, str, str, str, str, str, str]]:
    """Look up metadata for one video through the YouTube Data API.

    Args:
        video_id: ID of the video to look up.
        youtube_service: YouTube Data API service, or ``None``.

    Returns:
        A tuple ``(video_title, description, publishedAt, channelTitle, tags,
        video_duration, url)``. Tags are joined with ``", "`` and the duration
        is the string form of the parsed ISO 8601 duration. When no service is
        given, every field except ``url`` is an empty string. ``None`` is
        returned (after printing a warning) when the API reports no items.
    """
    watch_url = f"https://www.youtube.com/watch?v={video_id}"

    # Without a service there is nothing to query; only the URL is known.
    if not youtube_service:
        return ("", "", "", "", "", "", watch_url)

    api = cast(Any, youtube_service)
    result = api.videos().list(part="snippet,contentDetails", id=video_id).execute()

    if not result["items"]:
        print(f"Warning: No details found for video ID {video_id}")
        return None

    meta = result["items"][0]["snippet"]
    title: str = meta["title"]
    summary: str = meta["description"]
    published: str = meta["publishedAt"]
    channel: str = meta["channelTitle"]
    # Videos without tags have no "tags" key, which yields an empty string.
    joined_tags: str = ", ".join(meta.get("tags", []))
    raw_duration: str = result["items"][0]["contentDetails"]["duration"]
    duration: str = str(isodate.parse_duration(raw_duration))

    return (title, summary, published, channel, joined_tags, duration, watch_url)
152
+
153
+
154
def fetch_transcript(video_id: str) -> Optional[str]:
    """Return the transcript text for a video, or ``None`` on failure.

    The transcript is requested in ``en`` / ``en-US`` and the ``text`` of each
    raw entry is joined with single spaces. Any exception is printed and
    turned into a ``None`` result.
    """
    try:
        segments = ytt_api.fetch(video_id, languages=("en", "en-US")).to_raw_data()
        return " ".join(segment["text"] for segment in segments)
    except Exception as exc:
        print(f"Error fetching transcript for {video_id}: {exc}")
        return None
164
+
165
+
166
def generate_summary(
    model_name: str, transcript: str, video_title: str, url: str
) -> str:
    """Generate a summary using the LLM provider selected by ``model_name``.

    The provider is chosen from the prefix of ``model_name``:

    * ``gemini*``: Google GenAI SDK, authenticated with ``GEMINI_API_KEY``.
    * ``vertex*``: GCP Vertex AI ``rawPredict`` endpoint (only ``claude*``
      models are handled), using ``PROJECT_ID`` and the default Google
      credentials.
    * ``bedrock*``: AWS Bedrock ``converse`` endpoint, authenticated with
      ``AWS_BEARER_TOKEN_BEDROCK``.
    * ``foundry*``: OpenAI client pointed at ``AZURE_FOUNDRY_ENDPOINT`` with
      ``AZURE_FOUNDRY_API_KEY``.

    Args:
        model_name: Provider-prefixed model identifier.
        transcript: Transcript text to summarize.
        video_title: Title of the video, included in the prompt.
        url: URL of the video, included in the prompt.

    Returns:
        The summary text. Failures are not raised; the returned string
        describes the error instead. An empty string is returned when
        ``model_name`` matches no known prefix.
    """
    # Generous upper bound so a stalled HTTP connection cannot hang forever.
    request_timeout_seconds = 600

    summary_text = ""
    prompt = (
        f"I have included a transcript for {url} ({video_title})"
        "\n\n"
        "Can you please summarize this?"
        "\n\n"
        f"{transcript}"
    )

    if model_name.startswith("gemini"):
        try:
            GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]
            google_genai_client = genai.Client(api_key=GEMINI_API_KEY)
            response = google_genai_client.models.generate_content(
                model=model_name,
                contents=[
                    types.Content(
                        role="user", parts=[types.Part.from_text(text=prompt)]
                    )
                ],
            )
            summary_text = response.text or ""
        except KeyError:
            print("Error: GEMINI_API_KEY not found")
            summary_text = "Error: GEMINI_API_KEY not found"
        except Exception as e:
            print(f"Gemini API Error: {e}")
            summary_text = f"Error: {e}"

    elif model_name.startswith("vertex"):
        try:
            vertex_project_id = os.environ["PROJECT_ID"]
            vertex_credentials, _ = google.auth.default()
            # Strip only the leading provider prefix, not interior matches.
            actual_model_name = model_name.removeprefix("vertex-")

            # Only Anthropic models are handled; any other Vertex model falls
            # through and yields an empty summary.
            if actual_model_name.startswith("claude"):
                # Freshly loaded credentials have no token yet and are not
                # "expired", so check ``valid`` to make sure a token exists.
                if not vertex_credentials.valid:
                    vertex_credentials.refresh(GoogleAuthRequest())
                access_token = vertex_credentials.token
                endpoint = (
                    "https://us-east5-aiplatform.googleapis.com/v1/"
                    f"projects/{vertex_project_id}/locations/us-east5/"
                    f"publishers/anthropic/models/{actual_model_name}:rawPredict"
                )
                headers = {
                    "Authorization": f"Bearer {access_token}",
                    "Content-Type": "application/json; charset=utf-8",
                }
                payload = {
                    "anthropic_version": "vertex-2023-10-16",
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": 64_000,
                    "stream": False,
                }
                response = requests.post(
                    endpoint,
                    headers=headers,
                    json=payload,
                    timeout=request_timeout_seconds,
                )
                if response.status_code == 200:
                    response_json = response.json()
                    content_blocks = response_json.get("content", [])
                    if (
                        content_blocks
                        and isinstance(content_blocks, list)
                        and "text" in content_blocks[0]
                    ):
                        summary_text = content_blocks[0]["text"]
                    else:
                        summary_text = f"Unexpected response format: {response.text}"
                else:
                    summary_text = (
                        f"Vertex API Error {response.status_code}: {response.text}"
                    )
                    print(summary_text)

        except KeyError:
            print(
                "Error: PROJECT_ID environment variable required for GCPVertex models."
            )
            summary_text = "Error: PROJECT_ID required"
        except Exception as e:
            print(f"Vertex Request Error: {e}")
            summary_text = f"Error: {e}"

    elif model_name.startswith("bedrock"):
        try:
            aws_bearer_token_bedrock = os.environ["AWS_BEARER_TOKEN_BEDROCK"]
            actual_model_name = model_name.removeprefix("bedrock-")
            # Map the short model name onto a US inference profile ID.
            if actual_model_name.startswith("claude"):
                actual_model_name = f"us.anthropic.{actual_model_name}:0"
            elif actual_model_name.startswith("nova"):
                actual_model_name = f"us.amazon.{actual_model_name}:0"

            endpoint = (
                f"https://bedrock-runtime.us-east-1.amazonaws.com/model/"
                f"{actual_model_name}/converse"
            )
            # NOTE(review): the Converse API documents the token limit as
            # ``inferenceConfig.maxTokens``; confirm the top-level
            # ``max_tokens`` key below is honoured.
            response = requests.post(
                endpoint,
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {aws_bearer_token_bedrock}",
                },
                json={
                    "messages": [
                        {
                            "role": "user",
                            "content": [{"text": prompt}],
                        }
                    ],
                    "max_tokens": 64_000,
                },
                timeout=request_timeout_seconds,
            )
            if response.status_code == 200:
                response_json = response.json()
                try:
                    content_blocks = response_json["output"]["message"]["content"]
                    if (
                        content_blocks
                        and isinstance(content_blocks, list)
                        and "text" in content_blocks[0]
                    ):
                        summary_text = content_blocks[0]["text"]
                    else:
                        summary_text = f"Unexpected content format: {response_json}"
                except KeyError:
                    summary_text = f"Unexpected response structure: {response_json}"
            else:
                summary_text = (
                    f"Bedrock API Error {response.status_code}: {response.text}"
                )
                print(summary_text)
        except KeyError:
            print(
                "Error: AWS_BEARER_TOKEN_BEDROCK environment variable required for "
                "AWS Bedrock models."
            )
            summary_text = "Error: AWS_BEARER_TOKEN_BEDROCK required"
        except Exception as e:
            print(f"Bedrock Request Error: {e}")
            summary_text = f"Error: {e}"

    elif model_name.startswith("foundry"):
        try:
            AZURE_FOUNDRY_ENDPOINT = os.environ["AZURE_FOUNDRY_ENDPOINT"]
            AZURE_FOUNDRY_API_KEY = os.environ["AZURE_FOUNDRY_API_KEY"]
            actual_model_name = model_name.removeprefix("foundry-")
            client = OpenAI(
                base_url=AZURE_FOUNDRY_ENDPOINT, api_key=AZURE_FOUNDRY_API_KEY
            )
            completion = client.chat.completions.create(
                model=actual_model_name,
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
            )
            # ``content`` may be None; keep the declared ``str`` return type.
            summary_text = completion.choices[0].message.content or ""
        except KeyError:
            print(
                "Error: AZURE_FOUNDRY_ENDPOINT and AZURE_FOUNDRY_API_KEY "
                "environment variables required."
            )
            summary_text = "Error: Foundry vars required"
        except Exception as e:
            print(f"Foundry Request Error: {e}")
            summary_text = f"Error: {e}"

    return summary_text
339
+
340
+
341
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the youtube-to-docs CLI.

    The help strings below contain explicit newlines. The default argparse
    formatter re-wraps help text and collapses those newlines, so the parser
    uses ``argparse.RawTextHelpFormatter`` to print them as written.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "video_id",
        nargs="?",
        default="KuPc06JgI_A",
        help=(
            "Can be one of: \n"
            "A Video ID e.g. 'KuPc06JgI_A'\n"
            "Playlist ID (starts with PL e.g. 'PL8ZxoInteClyHaiReuOHpv6Z4SPrXtYtW')\n"
            "Channel Handle (starts with @ e.g. '@mga-hgo1740')\n"
            "Comma-separated list of Video IDs. (e.g. 'KuPc06JgI_A,GalhDyf3F8g')"
        ),
    )
    parser.add_argument(
        "-o",
        "--outfile",
        default="youtube-docs.csv",
        help=("Can be one of: \nLocal file path to save the output CSV file."),
    )
    parser.add_argument(
        "-m",
        "--model",
        default=None,
        help=(
            "The LLM to use for summarization. Can be one of: \n"
            "Gemini model (e.g., 'gemini-3-flash-preview')\n"
            "GCP Vertex model (prefixed with 'vertex-'). e.g. "
            "vertex-claude-haiku-4-5@20251001\n"
            "AWS Bedrock model (prefixed with 'bedrock-'). e.g. "
            "bedrock-claude-haiku-4-5-20251001-v1\n"
            "Azure Foundry model (prefixed with 'foundry-'). e.g. "
            "'foundry-gpt-5-mini'\n"
            "Defaults to None."
        ),
    )
    return parser


def _write_text_file(directory: str, filename: str, text: str, kind: str) -> str:
    """Write *text* to *filename* inside *directory* and return its absolute path.

    The write is best effort: an ``OSError`` is reported on stdout and
    swallowed, so the returned path may name a file that was not written.
    *kind* is the human-readable label used in the printed messages.
    """
    full_path = os.path.abspath(os.path.join(directory, filename))
    try:
        with open(full_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"Saved {kind}: {filename}")
    except OSError as e:
        print(f"Error writing {kind}: {e}")
    return full_path


def main() -> None:
    """Run the CLI: collect video details, transcripts and summaries into a CSV.

    For every resolved video ID this fetches the video details and the
    transcript, optionally generates a summary with the model given by
    ``--model``, and appends one row to the output table. Videos without
    details or without a transcript are skipped. When the output file name
    ends with ``.csv``, transcripts and summaries are also saved as individual
    files in ``transcript-files`` and ``summary-files`` next to it.
    """
    args = _build_arg_parser().parse_args()
    video_id_input: str = args.video_id
    outfile: str = args.outfile
    model_name: Optional[str] = args.model

    youtube_service = get_youtube_service()

    video_ids = resolve_video_ids(video_id_input, youtube_service)

    # Setup Output Directories
    transcripts_dir: Optional[str] = None
    summaries_dir: Optional[str] = None
    if outfile.endswith(".csv"):
        output_dir = os.path.dirname(outfile)
        if output_dir:
            # exist_ok makes a separate existence check unnecessary.
            os.makedirs(output_dir, exist_ok=True)
        base_dir = output_dir or "."
        transcripts_dir = os.path.join(base_dir, "transcript-files")
        summaries_dir = os.path.join(base_dir, "summary-files")
        os.makedirs(transcripts_dir, exist_ok=True)
        os.makedirs(summaries_dir, exist_ok=True)

    print(f"Processing {len(video_ids)} videos.")
    print(f"Processing Videos: {video_ids}")
    print(f"Saving to: {outfile}")
    if model_name:
        print(f"Summarizing using model: {model_name}")

    # The summary column name carries the model so runs stay distinguishable.
    summary_column = (
        f"Summary Text {model_name}" if model_name else "Summary Text"
    )

    data: List[dict] = []
    for video_id in video_ids:
        print(f"Processing Video ID: {video_id}")

        # Get Details
        details = get_video_details(video_id, youtube_service)
        if not details:
            # If explicit None returned, skip
            continue

        (
            video_title,
            description,
            publishedAt,
            channelTitle,
            tags,
            video_duration,
            url,
        ) = details
        print(f"Processing Video URL: {url}")

        # Fetch Transcript
        transcript = fetch_transcript(video_id)
        if not transcript:
            continue

        # Replace characters that are not allowed in file names and flatten
        # line breaks so the title can be embedded in a file name.
        safe_title = (
            re.sub(r'[\\/*?:"<>|]', "_", video_title)
            .replace("\n", " ")
            .replace("\r", "")
        )

        # Save Transcript
        transcript_full_path = ""
        if transcripts_dir:
            transcript_full_path = _write_text_file(
                transcripts_dir,
                f"{video_id} - {safe_title}.txt",
                transcript,
                "transcript",
            )

        # Summarize
        summary_text = ""
        summary_full_path = ""
        if model_name:
            print(f"Summarizing using model: {model_name}")
            summary_text = generate_summary(model_name, transcript, video_title, url)

            if summaries_dir and summary_text:
                summary_full_path = _write_text_file(
                    summaries_dir,
                    f"{model_name} - {video_id} - {safe_title} - summary.md",
                    summary_text,
                    "summary",
                )

        print(f"Video Title: {video_title}")
        print(f"Description: {description}")
        print(f"Published At: {publishedAt}")
        print(f"Channel Title: {channelTitle}")
        print(f"Tags: {tags}")
        print(f"Video Duration: {video_duration}")
        print(f"Number of Transcript characters: {len(transcript)}")

        row = {
            "URL": url,
            "Title": video_title,
            "Description": description,
            # NOTE(review): "Data Published" looks like a typo for
            # "Date Published"; kept as is because it is the CSV header that
            # existing consumers may rely on - confirm before renaming.
            "Data Published": publishedAt,
            "Channel": channelTitle,
            "Tags": tags,
            "Duration": video_duration,
            "Transcript characters": len(transcript),
            "Transcript File": transcript_full_path,
            "Summary File": summary_full_path,
            summary_column: summary_text,
        }
        data.append(row)
        # Pause between videos before issuing the next round of requests.
        time.sleep(1)

    if data:
        df = pl.DataFrame(data)
        df.write_csv(outfile)
        print(f"Successfully wrote {len(df)} rows to {outfile}")
    else:
        print("No data gathered.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,23 @@
1
+ Metadata-Version: 2.4
2
+ Name: youtube-to-docs
3
+ Version: 0.0.1
4
+ Summary: Convert YouTube videos to docs/sheets for discoverability
5
+ Requires-Python: >=3.14
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: google-auth>=2.45.0
9
+ Requires-Dist: google-genai>=1.56.0
10
+ Requires-Dist: google-api-python-client>=2.187.0
11
+ Requires-Dist: isodate>=0.7.2
12
+ Requires-Dist: openai>=1.56.0
13
+ Requires-Dist: polars>=1.36.1
14
+ Requires-Dist: requests>=2.32.5
15
+ Requires-Dist: youtube-transcript-api>=1.2.3
16
+ Dynamic: license-file
17
+
18
+ # youtube-to-docs
19
+ Convert YouTube videos to docs/sheets for discoverability.
20
+
21
+
22
+
23
+ *Created with the help of AI. All artifacts have been checked and work as expected.*
@@ -0,0 +1,17 @@
1
+ .gitignore
2
+ .pre-commit-config.yaml
3
+ LICENSE
4
+ README.md
5
+ pyproject.toml
6
+ requirements.txt
7
+ .github/workflows/ci.yml
8
+ docs/development.md
9
+ tests/__init__.py
10
+ tests/test_main.py
11
+ youtube_to_docs/__init__.py
12
+ youtube_to_docs/main.py
13
+ youtube_to_docs.egg-info/PKG-INFO
14
+ youtube_to_docs.egg-info/SOURCES.txt
15
+ youtube_to_docs.egg-info/dependency_links.txt
16
+ youtube_to_docs.egg-info/requires.txt
17
+ youtube_to_docs.egg-info/top_level.txt
@@ -0,0 +1,8 @@
1
+ google-auth>=2.45.0
2
+ google-genai>=1.56.0
3
+ google-api-python-client>=2.187.0
4
+ isodate>=0.7.2
5
+ openai>=1.56.0
6
+ polars>=1.36.1
7
+ requests>=2.32.5
8
+ youtube-transcript-api>=1.2.3
@@ -0,0 +1 @@
1
+ youtube_to_docs