youtube-to-docs 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- youtube_to_docs-0.0.1/.github/workflows/ci.yml +38 -0
- youtube_to_docs-0.0.1/.gitignore +215 -0
- youtube_to_docs-0.0.1/.pre-commit-config.yaml +30 -0
- youtube_to_docs-0.0.1/LICENSE +21 -0
- youtube_to_docs-0.0.1/PKG-INFO +23 -0
- youtube_to_docs-0.0.1/README.md +6 -0
- youtube_to_docs-0.0.1/docs/development.md +102 -0
- youtube_to_docs-0.0.1/pyproject.toml +52 -0
- youtube_to_docs-0.0.1/requirements.txt +9 -0
- youtube_to_docs-0.0.1/setup.cfg +4 -0
- youtube_to_docs-0.0.1/tests/__init__.py +0 -0
- youtube_to_docs-0.0.1/tests/test_main.py +196 -0
- youtube_to_docs-0.0.1/youtube_to_docs/__init__.py +0 -0
- youtube_to_docs-0.0.1/youtube_to_docs/main.py +506 -0
- youtube_to_docs-0.0.1/youtube_to_docs.egg-info/PKG-INFO +23 -0
- youtube_to_docs-0.0.1/youtube_to_docs.egg-info/SOURCES.txt +17 -0
- youtube_to_docs-0.0.1/youtube_to_docs.egg-info/dependency_links.txt +1 -0
- youtube_to_docs-0.0.1/youtube_to_docs.egg-info/requires.txt +8 -0
- youtube_to_docs-0.0.1/youtube_to_docs.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [ main ]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [ main ]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
build:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- name: Install uv
|
|
17
|
+
uses: astral-sh/setup-uv@v5
|
|
18
|
+
|
|
19
|
+
- name: Set up Python
|
|
20
|
+
run: uv python install
|
|
21
|
+
|
|
22
|
+
- name: Create virtual environment
|
|
23
|
+
run: uv venv
|
|
24
|
+
|
|
25
|
+
- name: Install dependencies
|
|
26
|
+
run: uv pip install -r requirements.txt
|
|
27
|
+
|
|
28
|
+
- name: Ruff Lint
|
|
29
|
+
run: uv tool run ruff check .
|
|
30
|
+
|
|
31
|
+
- name: Ruff Format
|
|
32
|
+
run: uv tool run ruff format --check .
|
|
33
|
+
|
|
34
|
+
- name: Type Check
|
|
35
|
+
run: uv tool run ty check
|
|
36
|
+
|
|
37
|
+
- name: Run Tests
|
|
38
|
+
run: uv run --with-requirements requirements.txt pytest
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
#uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
#poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
#pdm.lock
|
|
116
|
+
#pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
#pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# SageMath parsed files
|
|
135
|
+
*.sage.py
|
|
136
|
+
|
|
137
|
+
# Environments
|
|
138
|
+
.env
|
|
139
|
+
.envrc
|
|
140
|
+
.venv
|
|
141
|
+
env/
|
|
142
|
+
venv/
|
|
143
|
+
ENV/
|
|
144
|
+
env.bak/
|
|
145
|
+
venv.bak/
|
|
146
|
+
|
|
147
|
+
# Spyder project settings
|
|
148
|
+
.spyderproject
|
|
149
|
+
.spyproject
|
|
150
|
+
|
|
151
|
+
# Rope project settings
|
|
152
|
+
.ropeproject
|
|
153
|
+
|
|
154
|
+
# mkdocs documentation
|
|
155
|
+
/site
|
|
156
|
+
|
|
157
|
+
# mypy
|
|
158
|
+
.mypy_cache/
|
|
159
|
+
.dmypy.json
|
|
160
|
+
dmypy.json
|
|
161
|
+
|
|
162
|
+
# Pyre type checker
|
|
163
|
+
.pyre/
|
|
164
|
+
|
|
165
|
+
# pytype static type analyzer
|
|
166
|
+
.pytype/
|
|
167
|
+
|
|
168
|
+
# Cython debug symbols
|
|
169
|
+
cython_debug/
|
|
170
|
+
|
|
171
|
+
# PyCharm
|
|
172
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
173
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
174
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
175
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
176
|
+
#.idea/
|
|
177
|
+
|
|
178
|
+
# Abstra
|
|
179
|
+
# Abstra is an AI-powered process automation framework.
|
|
180
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
181
|
+
# Learn more at https://abstra.io/docs
|
|
182
|
+
.abstra/
|
|
183
|
+
|
|
184
|
+
# Visual Studio Code
|
|
185
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
186
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
188
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
189
|
+
# .vscode/
|
|
190
|
+
|
|
191
|
+
# Ruff stuff:
|
|
192
|
+
.ruff_cache/
|
|
193
|
+
|
|
194
|
+
# PyPI configuration file
|
|
195
|
+
.pypirc
|
|
196
|
+
|
|
197
|
+
# Cursor
|
|
198
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
|
199
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
|
200
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
|
201
|
+
.cursorignore
|
|
202
|
+
.cursorindexingignore
|
|
203
|
+
|
|
204
|
+
# Marimo
|
|
205
|
+
marimo/_static/
|
|
206
|
+
marimo/_lsp/
|
|
207
|
+
__marimo__/
|
|
208
|
+
|
|
209
|
+
youtube-docs.csv
|
|
210
|
+
transcript-files/
|
|
211
|
+
summary-files/
|
|
212
|
+
|
|
213
|
+
implementation_plan.md
|
|
214
|
+
|
|
215
|
+
uv.lock
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: local
|
|
3
|
+
hooks:
|
|
4
|
+
- id: ruff-format
|
|
5
|
+
name: Ruff Format
|
|
6
|
+
entry: uv tool run ruff format
|
|
7
|
+
language: system
|
|
8
|
+
types: [python]
|
|
9
|
+
args: ["."]
|
|
10
|
+
|
|
11
|
+
- id: ruff-check
|
|
12
|
+
name: Ruff Lint
|
|
13
|
+
entry: uv tool run ruff check
|
|
14
|
+
language: system
|
|
15
|
+
types: [python]
|
|
16
|
+
args: ["--fix", "."]
|
|
17
|
+
|
|
18
|
+
- id: ty-check
|
|
19
|
+
name: Type Check (Ty)
|
|
20
|
+
entry: uv tool run ty check
|
|
21
|
+
language: system
|
|
22
|
+
types: [python]
|
|
23
|
+
pass_filenames: false
|
|
24
|
+
|
|
25
|
+
- id: pytest
|
|
26
|
+
name: Run Tests
|
|
27
|
+
entry: uv run --with-requirements requirements.txt pytest
|
|
28
|
+
language: system
|
|
29
|
+
types: [python]
|
|
30
|
+
pass_filenames: false
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 DoIT - Artifical Intelligence
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: youtube-to-docs
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Convert YouTube videos to docs/sheets for discoverability
|
|
5
|
+
Requires-Python: >=3.14
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: google-auth>=2.45.0
|
|
9
|
+
Requires-Dist: google-genai>=1.56.0
|
|
10
|
+
Requires-Dist: google-api-python-client>=2.187.0
|
|
11
|
+
Requires-Dist: isodate>=0.7.2
|
|
12
|
+
Requires-Dist: openai>=1.56.0
|
|
13
|
+
Requires-Dist: polars>=1.36.1
|
|
14
|
+
Requires-Dist: requests>=2.32.5
|
|
15
|
+
Requires-Dist: youtube-transcript-api>=1.2.3
|
|
16
|
+
Dynamic: license-file
|
|
17
|
+
|
|
18
|
+
# youtube-to-docs
|
|
19
|
+
Convert YouTube videos to docs/sheets for discoverability.
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
*Created with the help of AI. All artifacts have been checked and work as expected.*
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Development Guide
|
|
2
|
+
|
|
3
|
+
## Prerequisites
|
|
4
|
+
|
|
5
|
+
- Python 3.14 or higher
|
|
6
|
+
- `uv` (recommended) or `pip`
|
|
7
|
+
|
|
8
|
+
## Installation
|
|
9
|
+
|
|
10
|
+
1. Install dependencies:
|
|
11
|
+
```bash
|
|
12
|
+
uv pip install -r requirements.txt
|
|
13
|
+
# OR
|
|
14
|
+
pip install -r requirements.txt
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Running Tests
|
|
18
|
+
|
|
19
|
+
We use `pytest` for testing.
|
|
20
|
+
|
|
21
|
+
### Using `uv` (Recommended)
|
|
22
|
+
|
|
23
|
+
To run tests with all dependencies automatically handled:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
uv run --with-requirements requirements.txt pytest
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Project Structure
|
|
30
|
+
|
|
31
|
+
- `youtube_to_docs/main.py`: Main application script.
|
|
32
|
+
- `tests/`: Directory containing test files.
|
|
33
|
+
- `requirements.txt`: Python package dependencies.
|
|
34
|
+
|
|
35
|
+
## Tooling
|
|
36
|
+
|
|
37
|
+
This project uses modern Python tooling for code quality:
|
|
38
|
+
|
|
39
|
+
- **Ruff**: For linting and code formatting.
|
|
40
|
+
- **Ty**: For static type checking.
|
|
41
|
+
|
|
42
|
+
These tools are configured in `pyproject.toml`.
|
|
43
|
+
|
|
44
|
+
### Running Ruff
|
|
45
|
+
|
|
46
|
+
To check for linting errors:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
uv tool run ruff check .
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
To fix fixable linting errors automatically:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
uv tool run ruff check --fix .
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
To format the code:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
uv tool run ruff format .
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Running Ty (Type Checking)
|
|
65
|
+
|
|
66
|
+
To run type checks:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
uv tool run ty check
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Pre-commit
|
|
73
|
+
|
|
74
|
+
To run the pre-commit hooks on all files:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
uv tool run pre-commit run --all-files
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
## Release to PyPI
|
|
82
|
+
|
|
83
|
+
To publish a new version of the package to PyPI, follow these steps:
|
|
84
|
+
|
|
85
|
+
1. **Build the package**:
|
|
86
|
+
This will create a `dist/` directory with the distribution files.
|
|
87
|
+
```bash
|
|
88
|
+
uv tool run --from build pyproject-build
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
2. **Upload to PyPI**:
|
|
92
|
+
Use `twine` to upload the distribution files.
|
|
93
|
+
|
|
94
|
+
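If your `.pypirc` is already configured with your API key (this example uploads to the `testpypi` repository; omit `--repository testpypi` to publish to PyPI itself):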
|
|
95
|
+
```bash
|
|
96
|
+
uv tool run twine upload --repository testpypi dist/*
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Or, to explicitly use your `PYPI_API_KEY` environment variable (PowerShell syntax shown; in bash/zsh use `$PYPI_API_KEY` instead of `$env:PYPI_API_KEY`):
|
|
100
|
+
```bash
|
|
101
|
+
uv tool run twine upload -u __token__ -p $env:PYPI_API_KEY dist/*
|
|
102
|
+
```
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "youtube-to-docs"
|
|
7
|
+
version = "0.0.1"
|
|
8
|
+
description = "Convert YouTube videos to docs/sheets for discoverability"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.14"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"google-auth>=2.45.0",
|
|
13
|
+
"google-genai>=1.56.0",
|
|
14
|
+
"google-api-python-client>=2.187.0",
|
|
15
|
+
"isodate>=0.7.2",
|
|
16
|
+
"openai>=1.56.0",
|
|
17
|
+
"polars>=1.36.1",
|
|
18
|
+
"requests>=2.32.5",
|
|
19
|
+
"youtube-transcript-api>=1.2.3",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[tool.setuptools.packages.find]
|
|
23
|
+
include = ["youtube_to_docs*"]
|
|
24
|
+
exclude = ["tests*", "docs*", "summary-files*", "transcript-files*"]
|
|
25
|
+
|
|
26
|
+
[tool.ruff]
|
|
27
|
+
# Match the project's Python version requirement
|
|
28
|
+
target-version = "py314"
|
|
29
|
+
line-length = 88
|
|
30
|
+
|
|
31
|
+
[tool.ruff.lint]
|
|
32
|
+
# Enable Pyflakes (`F`), pycodestyle (`E`, `W`), and isort (`I`)
|
|
33
|
+
select = ["E", "F", "I", "W"]
|
|
34
|
+
ignore = []
|
|
35
|
+
|
|
36
|
+
# Allow fix for all enabled rules (when `--fix` is provided).
|
|
37
|
+
fixable = ["ALL"]
|
|
38
|
+
unfixable = []
|
|
39
|
+
|
|
40
|
+
[tool.ruff.format]
|
|
41
|
+
# Use double quotes for strings.
|
|
42
|
+
quote-style = "double"
|
|
43
|
+
# Indent with spaces, rather than tabs.
|
|
44
|
+
indent-style = "space"
|
|
45
|
+
# Respect magic trailing commas.
|
|
46
|
+
skip-magic-trailing-comma = false
|
|
47
|
+
# Automatically detect the appropriate line ending.
|
|
48
|
+
line-ending = "auto"
|
|
49
|
+
|
|
50
|
+
[tool.ty.environment]
|
|
51
|
+
# Target Python 3.14 to match project requirements
|
|
52
|
+
python-version = "3.14"
|
|
File without changes
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import unittest
|
|
3
|
+
from unittest.mock import MagicMock, patch
|
|
4
|
+
|
|
5
|
+
from youtube_to_docs import main
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TestYoutubeToDocs(unittest.TestCase):
    """Unit tests for ``youtube_to_docs.main`` with all external APIs mocked."""

    def setUp(self):
        # Inject fake credentials so the code under test never sees real ones.
        fake_env = {
            "YOUTUBE_DATA_API_KEY": "fake_youtube_key",
            "GEMINI_API_KEY": "fake_gemini_key",
            "PROJECT_ID": "fake_project_id",
            "AWS_BEARER_TOKEN_BEDROCK": "fake_bedrock_token",
            "AZURE_FOUNDRY_ENDPOINT": "https://fake.openai.azure.com/",
            "AZURE_FOUNDRY_API_KEY": "fake_foundry_key",
        }
        self.env_patcher = patch.dict(os.environ, fake_env)
        self.env_patcher.start()

    def tearDown(self):
        # Restore the environment that was in place before setUp.
        self.env_patcher.stop()

    @patch("youtube_to_docs.main.build")
    def test_get_youtube_service(self, mock_build):
        """The service is built with the API key taken from the environment."""
        self.assertIsNotNone(main.get_youtube_service())
        mock_build.assert_called_with("youtube", "v3", developerKey="fake_youtube_key")

    def test_get_youtube_service_no_key(self):
        """Without an API key in the environment no service is returned."""
        with patch.dict(os.environ, {}, clear=True):
            self.assertIsNone(main.get_youtube_service())

    def test_resolve_video_ids_single(self):
        """An 11-character ID resolves to a one-element list."""
        resolved = main.resolve_video_ids("KuPc06JgI_A", None)
        self.assertEqual(resolved, ["KuPc06JgI_A"])

    def test_resolve_video_ids_list(self):
        """A comma-separated string resolves to the individual IDs."""
        resolved = main.resolve_video_ids("KuPc06JgI_A,GalhDyf3F8g", None)
        self.assertEqual(resolved, ["KuPc06JgI_A", "GalhDyf3F8g"])

    def test_resolve_video_ids_playlist_no_service(self):
        """A playlist ID without a YouTube service exits the program."""
        with self.assertRaises(SystemExit):
            main.resolve_video_ids("PL8ZxoInteClyHaiReuOHpv6Z4SPrXtYtW", None)

    @patch("youtube_to_docs.main.build")
    def test_resolve_video_ids_playlist(self, mock_build):
        """A playlist ID is expanded into the video IDs of its items."""
        playlist_page = {
            "items": [
                {"contentDetails": {"videoId": "vid1"}},
                {"contentDetails": {"videoId": "vid2"}},
            ]
        }
        service = MagicMock()
        service.playlistItems().list.return_value.execute.return_value = playlist_page
        # list_next returning None ends the pagination loop after one page.
        service.playlistItems().list_next.return_value = None

        resolved = main.resolve_video_ids("PL123", service)
        self.assertEqual(resolved, ["vid1", "vid2"])

    @patch("youtube_to_docs.main.build")
    def test_resolve_video_ids_channel_handle(self, mock_build):
        """A channel handle is resolved through its uploads playlist."""
        service = MagicMock()

        # First call: the channel lookup yields the uploads playlist ID.
        channel_page = {
            "items": [{"contentDetails": {"relatedPlaylists": {"uploads": "UU123"}}}]
        }
        service.channels().list.return_value.execute.return_value = channel_page

        # Second call: the uploads playlist is expanded like any other playlist.
        uploads_page = {"items": [{"contentDetails": {"videoId": "vid_from_channel"}}]}
        service.playlistItems().list.return_value.execute.return_value = uploads_page
        service.playlistItems().list_next.return_value = None

        resolved = main.resolve_video_ids("@channel", service)
        self.assertEqual(resolved, ["vid_from_channel"])

    def test_get_video_details_none(self):
        """Without a service only the watch URL is filled in."""
        expected = ("", "", "", "", "", "", "https://www.youtube.com/watch?v=vid1")
        self.assertEqual(main.get_video_details("vid1", None), expected)

    def test_get_video_details_success(self):
        """Metadata is read from the API response and the duration is formatted."""
        video_item = {
            "snippet": {
                "title": "Test Video",
                "description": "Desc",
                "publishedAt": "2023-01-01",
                "channelTitle": "Test Channel",
                "tags": ["tag1", "tag2"],
            },
            "contentDetails": {"duration": "PT1M10S"},
        }
        service = MagicMock()
        service.videos().list.return_value.execute.return_value = {
            "items": [video_item]
        }

        details = main.get_video_details("vid1", service)
        self.assertIsNotNone(details)
        assert details is not None  # narrows the Optional for the type checker
        self.assertEqual(details[0], "Test Video")
        self.assertEqual(details[5], "0:01:10")  # Duration

    @patch("youtube_to_docs.main.ytt_api")
    def test_fetch_transcript(self, mock_ytt_api):
        """Transcript snippets are joined with single spaces."""
        mock_ytt_api.fetch.return_value.to_raw_data.return_value = [
            {"text": "Hello"},
            {"text": "world"},
        ]

        self.assertEqual(main.fetch_transcript("vid1"), "Hello world")

    @patch("youtube_to_docs.main.ytt_api")
    def test_fetch_transcript_error(self, mock_ytt_api):
        """A failing transcript fetch yields None instead of raising."""
        mock_ytt_api.fetch.side_effect = Exception("Transcript disabled")
        self.assertIsNone(main.fetch_transcript("vid1"))

    @patch("youtube_to_docs.main.genai.Client")
    def test_generate_summary_gemini(self, mock_client_cls):
        """A ``gemini`` model returns the text of the generate_content response."""
        gemini_reply = MagicMock()
        gemini_reply.text = "Gemini Summary"
        mock_client_cls.return_value.models.generate_content.return_value = gemini_reply

        summary = main.generate_summary("gemini-pro", "transcript", "Title", "url")
        self.assertEqual(summary, "Gemini Summary")

    @patch("youtube_to_docs.main.requests.post")
    @patch("youtube_to_docs.main.google.auth.default")
    def test_generate_summary_vertex(self, mock_auth, mock_post):
        """A ``vertex-`` model returns the text from the mocked HTTP response."""
        credentials = MagicMock()
        credentials.token = "fake_token"
        mock_auth.return_value = (credentials, "proj")

        http_reply = MagicMock()
        http_reply.status_code = 200
        http_reply.json.return_value = {"content": [{"text": "Vertex Summary"}]}
        mock_post.return_value = http_reply

        summary = main.generate_summary(
            "vertex-claude-3-5", "transcript", "Title", "url"
        )
        self.assertEqual(summary, "Vertex Summary")

    @patch("youtube_to_docs.main.requests.post")
    def test_generate_summary_bedrock(self, mock_post):
        """A ``bedrock-`` model returns the text from the mocked HTTP response."""
        http_reply = MagicMock()
        http_reply.status_code = 200
        http_reply.json.return_value = {
            "output": {"message": {"content": [{"text": "Bedrock Summary"}]}}
        }
        mock_post.return_value = http_reply

        summary = main.generate_summary(
            "bedrock-claude-3-5", "transcript", "Title", "url"
        )
        self.assertEqual(summary, "Bedrock Summary")

    @patch("youtube_to_docs.main.OpenAI")
    def test_generate_summary_foundry(self, mock_openai):
        """A ``foundry-`` model returns the first chat completion's content."""
        completion = MagicMock()
        completion.choices[0].message.content = "Foundry Summary"
        mock_openai.return_value.chat.completions.create.return_value = completion

        summary = main.generate_summary("foundry-gpt-4", "transcript", "Title", "url")
        self.assertEqual(summary, "Foundry Summary")
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
if __name__ == "__main__":
|
|
196
|
+
unittest.main()
|
|
File without changes
|
|
@@ -0,0 +1,506 @@
|
|
|
1
|
+
# /// script
|
|
2
|
+
# requires-python = ">=3.14"
|
|
3
|
+
# dependencies = [
|
|
4
|
+
# "google-auth>=2.45.0",
|
|
5
|
+
# "google-genai>=1.56.0",
|
|
6
|
+
# "google-api-python-client>=2.187.0",
|
|
7
|
+
# "isodate>=0.7.2",
|
|
8
|
+
# "openai>=1.56.0",
|
|
9
|
+
# "polars>=1.36.1",
|
|
10
|
+
# "requests>=2.32.5",
|
|
11
|
+
# "youtube-transcript-api>=1.2.3"
|
|
12
|
+
# ]
|
|
13
|
+
# ///
|
|
14
|
+
#
|
|
15
|
+
# Run as:
|
|
16
|
+
# uv run https://raw.githubusercontent.com/DoIT-Artifical-Intelligence/youtube-to-docs/refs/heads/main/youtube_to_docs/main.py --model gemini-3-flash-preview # noqa
|
|
17
|
+
# To test locally run one of:
|
|
18
|
+
# uv run youtube_to_docs/main.py --model gemini-3-flash-preview
|
|
19
|
+
# uv run youtube_to_docs/main.py --model vertex-claude-haiku-4-5@20251001
|
|
20
|
+
# uv run youtube_to_docs/main.py --model bedrock-claude-haiku-4-5-20251001-v1
|
|
21
|
+
# uv run youtube_to_docs/main.py --model bedrock-nova-2-lite-v1
|
|
22
|
+
# uv run youtube_to_docs/main.py --model bedrock-claude-haiku-4-5-20251001
|
|
23
|
+
# uv run youtube_to_docs/main.py --model foundry-gpt-5-mini
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
import argparse
|
|
27
|
+
import os
|
|
28
|
+
import re
|
|
29
|
+
import sys
|
|
30
|
+
import time
|
|
31
|
+
from typing import Any, List, Optional, Tuple, cast
|
|
32
|
+
|
|
33
|
+
import google.auth
|
|
34
|
+
import isodate
|
|
35
|
+
import polars as pl
|
|
36
|
+
import requests
|
|
37
|
+
from google import genai
|
|
38
|
+
from google.auth.transport.requests import Request as GoogleAuthRequest
|
|
39
|
+
from google.genai import types
|
|
40
|
+
from googleapiclient.discovery import Resource, build
|
|
41
|
+
from openai import OpenAI
|
|
42
|
+
from youtube_transcript_api import YouTubeTranscriptApi
|
|
43
|
+
|
|
44
|
+
# Global instance for transcript API
|
|
45
|
+
ytt_api = YouTubeTranscriptApi()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_youtube_service() -> Optional[Resource]:
    """Build the YouTube Data API v3 client from ``YOUTUBE_DATA_API_KEY``.

    Returns:
        The client returned by ``build("youtube", "v3", developerKey=...)``,
        or ``None`` (after printing a warning) when the environment variable
        is not set.
    """
    try:
        api_key = os.environ["YOUTUBE_DATA_API_KEY"]
    except KeyError:
        print(
            "Warning: YOUTUBE_DATA_API_KEY not found. Playlist and Channel expansion "
            "will fail."
        )
        return None
    # Built outside the ``try`` so that a KeyError raised while constructing
    # the client is not misreported as a missing API key.
    return build("youtube", "v3", developerKey=api_key)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def resolve_video_ids(
    video_id_input: str, youtube_service: Optional[Resource]
) -> List[str]:
    """
    Resolves the input (video ID, list, playlist, or channel handle)
    into a list of video IDs.

    Args:
        video_id_input: One of
            * a channel handle starting with ``@`` (resolved to the channel's
              uploads playlist, then handled like any other input),
            * a comma-separated list of video IDs,
            * a single 11-character video ID,
            * a playlist ID starting with ``PL`` or ``UU``.
        youtube_service: YouTube Data API client; required for channel
            handles and playlists, may be ``None`` otherwise.

    Returns:
        The resolved video IDs. Input that matches none of the forms above
        yields an empty list.

    Raises:
        SystemExit: If a handle or playlist is given without a service, or
            if no channel is found for the handle.
    """
    # Handle Channel Handles (e.g. @channelname)
    if video_id_input.startswith("@"):
        if not youtube_service:
            print("Error: YOUTUBE_DATA_API_KEY is required to resolve channel handles.")
            sys.exit(1)
        service = cast(Any, youtube_service)
        print(f"Resolving channel handle: {video_id_input}...")
        request = service.channels().list(
            part="contentDetails", forHandle=video_id_input
        )
        response = request.execute()
        # NOTE(review): the API may leave out "items" entirely when nothing
        # matches; .get() keeps that case on the friendly error path instead
        # of raising KeyError.
        channels = response.get("items")
        if not channels:
            print(f"Error: No channel found for handle {video_id_input}")
            sys.exit(1)
        # Get the 'uploads' playlist ID from the channel details
        video_id_input = channels[0]["contentDetails"]["relatedPlaylists"]["uploads"]
        print(f"Found uploads playlist: {video_id_input}")

    # List of videos: tolerate spaces around commas and stray/trailing commas.
    if "," in video_id_input:
        candidates = (part.strip() for part in video_id_input.split(","))
        return [candidate for candidate in candidates if candidate]

    # Single video (standard ID length is 11)
    if len(video_id_input) == 11:
        return [video_id_input]

    video_ids: List[str] = []

    # Playlist (Standard 'PL' or Uploads 'UU')
    if video_id_input.startswith(("PL", "UU")):
        if not youtube_service:
            print("Error: YOUTUBE_DATA_API_KEY is required for playlists.")
            sys.exit(1)
        service = cast(Any, youtube_service)
        request = service.playlistItems().list(
            part="contentDetails", playlistId=video_id_input, maxResults=50
        )
        # list_next() returns None once there are no further pages.
        while request is not None:
            response = request.execute()
            video_ids.extend(
                item["contentDetails"]["videoId"] for item in response.get("items", [])
            )
            request = service.playlistItems().list_next(request, response)

    return video_ids
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def get_video_details(
    video_id: str, youtube_service: Optional[Resource]
) -> Optional[Tuple[str, str, str, str, str, str, str]]:
    """
    Looks up a video's metadata through the YouTube Data API.

    Returns ``(video_title, description, publishedAt, channelTitle, tags,
    video_duration, url)``. ``tags`` is the tag list joined with ``", "`` and
    ``video_duration`` is the string form of the parsed ISO 8601 duration.
    When no service is supplied, every field except ``url`` is an empty
    string. When the API returns no items for the ID, a warning is printed
    and ``None`` is returned.
    """
    url = f"https://www.youtube.com/watch?v={video_id}"

    # Without an API client only the watch URL can be provided.
    if not youtube_service:
        return "", "", "", "", "", "", url

    api = cast(Any, youtube_service)
    response = api.videos().list(part="snippet,contentDetails", id=video_id).execute()

    if not response["items"]:
        print(f"Warning: No details found for video ID {video_id}")
        return None

    item = response["items"][0]
    snippet = item["snippet"]
    return (
        snippet["title"],
        snippet["description"],
        snippet["publishedAt"],
        snippet["channelTitle"],
        ", ".join(snippet.get("tags", [])),
        str(isodate.parse_duration(item["contentDetails"]["duration"])),
        url,
    )
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def fetch_transcript(video_id: str) -> Optional[str]:
    """Return the transcript of ``video_id`` as one space-joined string.

    The transcript is requested with languages ``("en", "en-US")``. Any
    exception raised while fetching or assembling it is printed and reported
    to the caller as ``None``.
    """
    try:
        fetched = ytt_api.fetch(video_id, languages=("en", "en-US"))
        pieces = (entry["text"] for entry in fetched.to_raw_data())
        return " ".join(pieces)
    except Exception as err:
        print(f"Error fetching transcript for {video_id}: {err}")
        return None
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _first_text_block(content_blocks: Any) -> Optional[str]:
    """Return the ``text`` of the first content block, or None if the shape is unexpected."""
    if (
        content_blocks
        and isinstance(content_blocks, list)
        and "text" in content_blocks[0]
    ):
        return content_blocks[0]["text"]
    return None


def _summarize_with_gemini(model_name: str, prompt: str) -> str:
    """Send ``prompt`` to a Gemini model; requires the GEMINI_API_KEY env var."""
    # Only the env lookup is guarded by KeyError so that an unrelated KeyError
    # raised by the client is not misreported as a missing API key.
    try:
        api_key = os.environ["GEMINI_API_KEY"]
    except KeyError:
        print("Error: GEMINI_API_KEY not found")
        return "Error: GEMINI_API_KEY not found"

    try:
        client = genai.Client(api_key=api_key)
        response = client.models.generate_content(
            model=model_name,
            contents=[
                types.Content(
                    role="user", parts=[types.Part.from_text(text=prompt)]
                )
            ],
        )
        return response.text or ""
    except Exception as e:
        print(f"Gemini API Error: {e}")
        return f"Error: {e}"


def _summarize_with_vertex(model_name: str, prompt: str) -> str:
    """Send ``prompt`` to a Claude model on Vertex AI; requires the PROJECT_ID env var."""
    try:
        project_id = os.environ["PROJECT_ID"]
    except KeyError:
        print(
            "Error: PROJECT_ID environment variable required for GCPVertex models."
        )
        return "Error: PROJECT_ID required"

    try:
        credentials, _ = google.auth.default()
        # Strip only the leading provider prefix, not later occurrences.
        actual_model_name = model_name.removeprefix("vertex-")

        if not actual_model_name.startswith("claude"):
            # Only Anthropic models are wired up for Vertex; say so instead of
            # silently producing an empty summary.
            print(
                f"Warning: Vertex model '{actual_model_name}' is not supported; "
                "no summary generated."
            )
            return ""

        if credentials.expired:
            credentials.refresh(GoogleAuthRequest())
        endpoint = (
            "https://us-east5-aiplatform.googleapis.com/v1/"
            f"projects/{project_id}/locations/us-east5/"
            f"publishers/anthropic/models/{actual_model_name}:rawPredict"
        )
        headers = {
            "Authorization": f"Bearer {credentials.token}",
            "Content-Type": "application/json; charset=utf-8",
        }
        payload = {
            "anthropic_version": "vertex-2023-10-16",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 64_000,
            "stream": False,
        }
        response = requests.post(endpoint, headers=headers, json=payload)
        if response.status_code != 200:
            error_text = f"Vertex API Error {response.status_code}: {response.text}"
            print(error_text)
            return error_text

        text = _first_text_block(response.json().get("content", []))
        if text is None:
            return f"Unexpected response format: {response.text}"
        return text
    except Exception as e:
        print(f"Vertex Request Error: {e}")
        return f"Error: {e}"


def _summarize_with_bedrock(model_name: str, prompt: str) -> str:
    """Send ``prompt`` to the Bedrock converse endpoint; requires AWS_BEARER_TOKEN_BEDROCK."""
    try:
        bearer_token = os.environ["AWS_BEARER_TOKEN_BEDROCK"]
    except KeyError:
        print(
            "Error: AWS_BEARER_TOKEN_BEDROCK environment variable required for "
            "AWS Bedrock models."
        )
        return "Error: AWS_BEARER_TOKEN_BEDROCK required"

    try:
        # Strip only the leading provider prefix, not later occurrences.
        actual_model_name = model_name.removeprefix("bedrock-")
        if actual_model_name.startswith("claude"):
            actual_model_name = f"us.anthropic.{actual_model_name}:0"
        elif actual_model_name.startswith("nova"):
            actual_model_name = f"us.amazon.{actual_model_name}:0"

        endpoint = (
            "https://bedrock-runtime.us-east-1.amazonaws.com/model/"
            f"{actual_model_name}/converse"
        )
        response = requests.post(
            endpoint,
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {bearer_token}",
            },
            json={
                "messages": [
                    {
                        "role": "user",
                        "content": [{"text": prompt}],
                    }
                ],
                # NOTE(review): the Converse API documents the output limit
                # under inferenceConfig.maxTokens; confirm this top-level key
                # is actually honoured before relying on it.
                "max_tokens": 64_000,
            },
        )
        if response.status_code != 200:
            error_text = f"Bedrock API Error {response.status_code}: {response.text}"
            print(error_text)
            return error_text

        response_json = response.json()
        try:
            content_blocks = response_json["output"]["message"]["content"]
        except KeyError:
            return f"Unexpected response structure: {response_json}"

        text = _first_text_block(content_blocks)
        if text is None:
            return f"Unexpected content format: {response_json}"
        return text
    except Exception as e:
        print(f"Bedrock Request Error: {e}")
        return f"Error: {e}"


def _summarize_with_foundry(model_name: str, prompt: str) -> str:
    """Send ``prompt`` to an Azure Foundry deployment through the OpenAI client."""
    try:
        foundry_endpoint = os.environ["AZURE_FOUNDRY_ENDPOINT"]
        foundry_api_key = os.environ["AZURE_FOUNDRY_API_KEY"]
    except KeyError:
        print(
            "Error: AZURE_FOUNDRY_ENDPOINT and AZURE_FOUNDRY_API_KEY "
            "environment variables required."
        )
        return "Error: Foundry vars required"

    try:
        client = OpenAI(base_url=foundry_endpoint, api_key=foundry_api_key)
        completion = client.chat.completions.create(
            # Strip only the leading provider prefix, not later occurrences.
            model=model_name.removeprefix("foundry-"),
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
        )
        # The message content may be None; keep the declared ``str`` return
        # type, matching how the Gemini path handles an empty response.
        return completion.choices[0].message.content or ""
    except Exception as e:
        print(f"Foundry Request Error: {e}")
        return f"Error: {e}"


def generate_summary(
    model_name: str, transcript: str, video_title: str, url: str
) -> str:
    """Generate a summary of ``transcript`` using the specified LLM provider.

    The provider is chosen from the prefix of ``model_name``: ``gemini``,
    ``vertex``, ``bedrock`` or ``foundry``.

    Returns:
        The summary text. On failure (missing environment variable, HTTP
        error, unexpected response shape, or any exception from the provider)
        a human-readable error string is returned instead of raising. An
        empty string is returned when the model is not recognised or the
        provider returned no text.
    """
    prompt = (
        f"I have included a transcript for {url} ({video_title})"
        "\n\n"
        "Can you please summarize this?"
        "\n\n"
        f"{transcript}"
    )

    if model_name.startswith("gemini"):
        return _summarize_with_gemini(model_name, prompt)
    if model_name.startswith("vertex"):
        return _summarize_with_vertex(model_name, prompt)
    if model_name.startswith("bedrock"):
        return _summarize_with_bedrock(model_name, prompt)
    if model_name.startswith("foundry"):
        return _summarize_with_foundry(model_name, prompt)

    print(f"Warning: unrecognized model '{model_name}'; no summary generated.")
    return ""
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the tool."""
    # The help strings contain explicit "\n" line breaks. The default
    # formatter re-wraps help text and loses them, so use the raw formatter.
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "video_id",
        nargs="?",
        default="KuPc06JgI_A",
        help=(
            "Can be one of: \n"
            "A Video ID e.g. 'KuPc06JgI_A'\n"
            "Playlist ID (starts with PL e.g. 'PL8ZxoInteClyHaiReuOHpv6Z4SPrXtYtW')\n"
            "Channel Handle (starts with @ e.g. '@mga-hgo1740')\n"
            "Comma-separated list of Video IDs. (e.g. 'KuPc06JgI_A,GalhDyf3F8g')"
        ),
    )
    parser.add_argument(
        "-o",
        "--outfile",
        default="youtube-docs.csv",
        help=("Can be one of: \nLocal file path to save the output CSV file."),
    )
    parser.add_argument(
        "-m",
        "--model",
        default=None,
        help=(
            "The LLM to use for summarization. Can be one of: \n"
            "Gemini model (e.g., 'gemini-3-flash-preview')\n"
            "GCP Vertex model (prefixed with 'vertex-'). e.g. "
            "vertex-claude-haiku-4-5@20251001\n"
            "AWS Bedrock model (prefixed with 'bedrock-'). e.g. "
            "bedrock-claude-haiku-4-5-20251001-v1\n"
            "Azure Foundry model (prefix with 'foundry-'). e.g. 'foundry-gpt-5-mini'\n"
            "Defaults to None."
        ),
    )
    return parser


def _sanitize_filename_part(text: str) -> str:
    """Replace characters that are unsafe in file names and flatten line breaks."""
    return re.sub(r'[\\/*?:"<>|]', "_", text).replace("\n", " ").replace("\r", "")


def main() -> None:
    """Command-line entry point.

    Resolves the requested video IDs, then for each video fetches its
    metadata and transcript, optionally summarizes the transcript with the
    chosen model, and finally writes one CSV row per processed video to the
    output file. Videos with no details or no transcript are skipped.

    When the output path ends in ``.csv``, transcripts and summaries are also
    saved as individual files in ``transcript-files`` and ``summary-files``
    directories next to the CSV.
    """
    args = _build_arg_parser().parse_args()
    video_id_input: str = args.video_id
    outfile: str = args.outfile
    model_name: Optional[str] = args.model

    youtube_service = get_youtube_service()

    video_ids = resolve_video_ids(video_id_input, youtube_service)

    # Setup Output Directories
    transcripts_dir: Optional[str] = None
    summaries_dir: Optional[str] = None
    if outfile.endswith(".csv"):
        output_dir = os.path.dirname(outfile)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        base_dir = output_dir if output_dir else "."
        transcripts_dir = os.path.join(base_dir, "transcript-files")
        summaries_dir = os.path.join(base_dir, "summary-files")
        os.makedirs(transcripts_dir, exist_ok=True)
        os.makedirs(summaries_dir, exist_ok=True)

    print(f"Processing {len(video_ids)} videos.")
    print(f"Processing Videos: {video_ids}")
    print(f"Saving to: {outfile}")
    if model_name:
        print(f"Summarizing using model: {model_name}")

    data: List[dict] = []
    for video_id in video_ids:
        print(f"Processing Video ID: {video_id}")

        # Get Details
        details = get_video_details(video_id, youtube_service)
        if not details:
            # If explicit None returned, skip
            continue

        (
            video_title,
            description,
            publishedAt,
            channelTitle,
            tags,
            video_duration,
            url,
        ) = details
        print(f"Processing Video URL: {url}")

        # Fetch Transcript
        transcript = fetch_transcript(video_id)
        if not transcript:
            continue

        # Save Transcript
        safe_title = _sanitize_filename_part(video_title)
        transcript_full_path = ""
        if transcripts_dir:
            transcript_filename = f"{video_id} - {safe_title}.txt"
            transcript_full_path = os.path.abspath(
                os.path.join(transcripts_dir, transcript_filename)
            )
            try:
                with open(transcript_full_path, "w", encoding="utf-8") as f:
                    f.write(transcript)
                print(f"Saved transcript: {transcript_filename}")
            except OSError as e:
                print(f"Error writing transcript: {e}")

        # Summarize
        summary_text = ""
        summary_full_path = ""
        if model_name:
            print(f"Summarizing using model: {model_name}")
            summary_text = generate_summary(model_name, transcript, video_title, url)

            if summaries_dir and summary_text:
                # The model name comes from the command line, so sanitize it
                # the same way as the title before using it in a file name.
                safe_model = _sanitize_filename_part(model_name)
                summary_filename = (
                    f"{safe_model} - {video_id} - {safe_title} - summary.md"
                )
                summary_full_path = os.path.abspath(
                    os.path.join(summaries_dir, summary_filename)
                )
                try:
                    with open(summary_full_path, "w", encoding="utf-8") as f:
                        f.write(summary_text)
                    print(f"Saved summary: {summary_filename}")
                except OSError as e:
                    print(f"Error writing summary: {e}")

        print(f"Video Title: {video_title}")
        print(f"Description: {description}")
        print(f"Published At: {publishedAt}")
        print(f"Channel Title: {channelTitle}")
        print(f"Tags: {tags}")
        print(f"Video Duration: {video_duration}")
        print(f"Number of Transcript characters: {len(transcript)}")

        summary_column = f"Summary Text {model_name}" if model_name else "Summary Text"
        row = {
            "URL": url,
            "Title": video_title,
            "Description": description,
            # NOTE(review): "Data Published" looks like a typo for
            # "Date Published"; kept as-is because it is the CSV column name.
            "Data Published": publishedAt,
            "Channel": channelTitle,
            "Tags": tags,
            "Duration": video_duration,
            "Transcript characters": len(transcript),
            "Transcript File": transcript_full_path,
            "Summary File": summary_full_path,
            summary_column: summary_text,
        }
        data.append(row)
        # Pause between videos before issuing the next round of requests.
        time.sleep(1)

    if data:
        df = pl.DataFrame(data)
        df.write_csv(outfile)
        print(f"Successfully wrote {len(df)} rows to {outfile}")
    else:
        print("No data gathered.")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: youtube-to-docs
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Convert YouTube videos to docs/sheets for discoverability
|
|
5
|
+
Requires-Python: >=3.14
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: google-auth>=2.45.0
|
|
9
|
+
Requires-Dist: google-genai>=1.56.0
|
|
10
|
+
Requires-Dist: google-api-python-client>=2.187.0
|
|
11
|
+
Requires-Dist: isodate>=0.7.2
|
|
12
|
+
Requires-Dist: openai>=1.56.0
|
|
13
|
+
Requires-Dist: polars>=1.36.1
|
|
14
|
+
Requires-Dist: requests>=2.32.5
|
|
15
|
+
Requires-Dist: youtube-transcript-api>=1.2.3
|
|
16
|
+
Dynamic: license-file
|
|
17
|
+
|
|
18
|
+
# youtube-to-docs
|
|
19
|
+
Convert YouTube videos to docs/sheets for discoverability.
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
*Created with the help of AI. All artifacts have been checked and work as expected.*
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
.gitignore
|
|
2
|
+
.pre-commit-config.yaml
|
|
3
|
+
LICENSE
|
|
4
|
+
README.md
|
|
5
|
+
pyproject.toml
|
|
6
|
+
requirements.txt
|
|
7
|
+
.github/workflows/ci.yml
|
|
8
|
+
docs/development.md
|
|
9
|
+
tests/__init__.py
|
|
10
|
+
tests/test_main.py
|
|
11
|
+
youtube_to_docs/__init__.py
|
|
12
|
+
youtube_to_docs/main.py
|
|
13
|
+
youtube_to_docs.egg-info/PKG-INFO
|
|
14
|
+
youtube_to_docs.egg-info/SOURCES.txt
|
|
15
|
+
youtube_to_docs.egg-info/dependency_links.txt
|
|
16
|
+
youtube_to_docs.egg-info/requires.txt
|
|
17
|
+
youtube_to_docs.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
youtube_to_docs
|