content-core 1.0.2__tar.gz → 1.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

Files changed (63) hide show
  1. {content_core-1.0.2 → content_core-1.0.3}/PKG-INFO +2 -1
  2. {content_core-1.0.2 → content_core-1.0.3}/pyproject.toml +2 -2
  3. content_core-1.0.3/src/content_core/processors/youtube.py +212 -0
  4. {content_core-1.0.2 → content_core-1.0.3}/uv.lock +12 -1
  5. content_core-1.0.2/src/content_core/processors/youtube.py +0 -195
  6. {content_core-1.0.2 → content_core-1.0.3}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  7. {content_core-1.0.2 → content_core-1.0.3}/.github/workflows/publish.yml +0 -0
  8. {content_core-1.0.2 → content_core-1.0.3}/.gitignore +0 -0
  9. {content_core-1.0.2 → content_core-1.0.3}/.python-version +0 -0
  10. {content_core-1.0.2 → content_core-1.0.3}/CONTRIBUTING.md +0 -0
  11. {content_core-1.0.2 → content_core-1.0.3}/LICENSE +0 -0
  12. {content_core-1.0.2 → content_core-1.0.3}/Makefile +0 -0
  13. {content_core-1.0.2 → content_core-1.0.3}/README.md +0 -0
  14. {content_core-1.0.2 → content_core-1.0.3}/docs/processors.md +0 -0
  15. {content_core-1.0.2 → content_core-1.0.3}/docs/usage.md +0 -0
  16. {content_core-1.0.2 → content_core-1.0.3}/prompts/content/cleanup.jinja +0 -0
  17. {content_core-1.0.2 → content_core-1.0.3}/prompts/content/summarize.jinja +0 -0
  18. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/__init__.py +0 -0
  19. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/cc_config.yaml +0 -0
  20. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/common/__init__.py +0 -0
  21. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/common/exceptions.py +0 -0
  22. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/common/state.py +0 -0
  23. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/common/types.py +0 -0
  24. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/common/utils.py +0 -0
  25. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/config.py +0 -0
  26. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/__init__.py +0 -0
  27. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/cleanup/__init__.py +0 -0
  28. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/cleanup/core.py +0 -0
  29. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/extraction/__init__.py +0 -0
  30. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/extraction/graph.py +0 -0
  31. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/identification/__init__.py +0 -0
  32. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/summary/__init__.py +0 -0
  33. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/summary/core.py +0 -0
  34. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/logging.py +0 -0
  35. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/models.py +0 -0
  36. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/models_config.yaml +0 -0
  37. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/notebooks/run.ipynb +0 -0
  38. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/processors/audio.py +0 -0
  39. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/processors/docling.py +0 -0
  40. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/processors/office.py +0 -0
  41. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/processors/pdf.py +0 -0
  42. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/processors/text.py +0 -0
  43. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/processors/url.py +0 -0
  44. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/processors/video.py +0 -0
  45. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/py.typed +0 -0
  46. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/templated_message.py +0 -0
  47. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/tools/__init__.py +0 -0
  48. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/tools/cleanup.py +0 -0
  49. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/tools/extract.py +0 -0
  50. {content_core-1.0.2 → content_core-1.0.3}/src/content_core/tools/summarize.py +0 -0
  51. {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.docx +0 -0
  52. {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.epub +0 -0
  53. {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.md +0 -0
  54. {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.mp3 +0 -0
  55. {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.mp4 +0 -0
  56. {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.pdf +0 -0
  57. {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.pptx +0 -0
  58. {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.txt +0 -0
  59. {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.xlsx +0 -0
  60. {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file_audio.mp3 +0 -0
  61. {content_core-1.0.2 → content_core-1.0.3}/tests/integration/test_cli.py +0 -0
  62. {content_core-1.0.2 → content_core-1.0.3}/tests/integration/test_extraction.py +0 -0
  63. {content_core-1.0.2 → content_core-1.0.3}/tests/unit/test_docling.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.0.2
3
+ Version: 1.0.3
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -27,6 +27,7 @@ Requires-Dist: python-dotenv>=1.1.0
27
27
  Requires-Dist: python-magic-bin==0.4.14; sys_platform == 'win32'
28
28
  Requires-Dist: python-magic>=0.4.27
29
29
  Requires-Dist: python-pptx>=1.0.2
30
+ Requires-Dist: pytubefix>=9.1.1
30
31
  Requires-Dist: readability-lxml>=0.8.4.1
31
32
  Requires-Dist: validators>=0.34.0
32
33
  Requires-Dist: youtube-transcript-api>=1.0.3
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "content-core"
3
- version = "1.0.2"
3
+ version = "1.0.3"
4
4
  description = "Extract what matters from any media source"
5
5
  readme = "README.md"
6
6
  homepage = "https://github.com/lfnovo/content-core"
@@ -34,7 +34,7 @@ dependencies = [
34
34
  "pillow>=10.4.0",
35
35
  "asciidoc>=10.2.1",
36
36
  "python-magic-bin==0.4.14; sys_platform == 'win32'",
37
-
37
+ "pytubefix>=9.1.1",
38
38
  ]
39
39
 
40
40
  [project.scripts]
@@ -0,0 +1,212 @@
1
+ import re
2
+ import ssl
3
+
4
+ import aiohttp
5
+ from bs4 import BeautifulSoup
6
+ from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
7
+ from youtube_transcript_api.formatters import TextFormatter # type: ignore
8
+
9
+ from content_core.common import ProcessSourceState
10
+ from content_core.common.exceptions import NoTranscriptFound
11
+ from content_core.config import CONFIG
12
+ from content_core.logging import logger
13
+
14
+ ssl._create_default_https_context = ssl._create_unverified_context
15
+
16
+
17
async def get_video_title(video_id):
    """
    Fetch the title of a YouTube video from its public watch page.

    The watch page is downloaded with aiohttp and parsed with BeautifulSoup;
    the title is read from the ``og:title`` meta tag.

    Args:
        video_id (str): The YouTube video ID.

    Returns:
        str | None: The video title, or None if the page could not be
        fetched or has no ``og:title`` content. Failures are logged and
        never raised.
    """
    url = f"https://www.youtube.com/watch?v={video_id}"
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                # Fail on HTTP errors instead of parsing an error page.
                response.raise_for_status()
                html = await response.text()

        # BeautifulSoup is synchronous, so parse once the download is done.
        soup = BeautifulSoup(html, "html.parser")

        # YouTube stores the title in the Open Graph meta tag.
        tag = soup.find("meta", property="og:title")
        title = tag.get("content") if tag is not None else None
        if title is None:
            logger.error(
                f"Failed to get video title: og:title not found for {video_id}"
            )
        return title

    except Exception as e:
        logger.error(f"Failed to get video title: {e}")
        return None
34
+
35
+
36
async def _extract_youtube_id(url):
    """
    Extract the YouTube video ID from a given URL using regular expressions.

    Recognised forms are ``youtu.be/<id>`` short links and ``youtube.com`` /
    ``youtube-nocookie.com`` URLs whose path is ``/watch?v=``,
    ``/watch?...&v=``, ``/embed/``, ``/v/``, ``/shorts/`` or ``/live/``.
    The pattern is searched for anywhere in the string, so extra subdomains
    and trailing query parameters do not prevent a match.

    Args:
        url (str): The YouTube URL from which to extract the video ID.

    Returns:
        str: The extracted YouTube video ID or None if no valid ID is found
        (including when ``url`` is empty or None).
    """
    # Guard first: re.search() raises TypeError on None.
    if not url:
        return None

    youtube_regex = (
        r"(?:https?://)?"  # Optional scheme
        r"(?:www\.)?"  # Optional www.
        r"(?:"
        r"youtu\.be/"  # Shortened URL
        r"|youtube(?:-nocookie)?\.com"  # Main or privacy-enhanced host
        r"(?:"  # Group start
        r"/embed/"  # Embed URL
        r"|/v/"  # Older video URL
        r"|/shorts/"  # Shorts URL
        r"|/live/"  # Live stream URL
        r"|/watch\?v="  # Standard watch URL
        r"|/watch\?.+&v="  # Watch URL with other parameters before v=
        r")"  # Group end
        r")"  # End main group
        r"([\w-]{11})"  # 11 characters (YouTube video ID)
    )

    match = re.search(youtube_regex, url)
    return match.group(1) if match else None
68
+
69
+
70
def _first_preferred_transcript(transcripts, preferred_langs):
    """Return the first transcript matching preferred_langs (in that order), or None."""
    for lang in preferred_langs:
        for transcript in transcripts:
            if transcript.language_code == lang:
                return transcript
    return None


def _fetch_best_transcript(transcripts, preferred_langs):
    """Pick a transcript by tier (manual, generated, translatable) and fetch it; None if none is usable."""
    manual = [
        t for t in transcripts if not t.is_generated and not t.is_translatable
    ]
    generated = [t for t in transcripts if t.is_generated and not t.is_translatable]
    translatable = [t for t in transcripts if t.is_translatable]

    # Tiers 1 and 2: preferred language if available, else the first one.
    for candidates in (manual, generated):
        if not candidates:
            continue
        try:
            chosen = _first_preferred_transcript(candidates, preferred_langs)
            if chosen is None:
                chosen = candidates[0]
            return chosen.fetch()
        except NoTranscriptFound:
            # NOTE(review): NoTranscriptFound is whatever this module imports
            # under that name; confirm it is the exception the transcript
            # API actually raises.
            continue

    # Tier 3: translatable transcripts.
    if translatable:
        try:
            chosen = _first_preferred_transcript(translatable, preferred_langs)
            if chosen is not None:
                return chosen.fetch()
            if preferred_langs:
                # No direct match: translate to the top preferred language.
                return translatable[0].translate(preferred_langs[0]).fetch()
            # No preference given, so there is nothing to translate to.
            return translatable[0].fetch()
        except NoTranscriptFound:
            pass

    return None


async def get_best_transcript(video_id, preferred_langs=("en", "es", "pt")):
    """
    Fetch the most suitable transcript for a YouTube video.

    Selection order:
      1. transcripts that are neither generated nor translatable,
      2. generated transcripts that are not translatable,
      3. translatable transcripts, translated to the first preferred
         language when none is already in a preferred language.
    Within a tier, the order of ``preferred_langs`` decides the winner.

    Args:
        video_id (str): The YouTube video ID.
        preferred_langs (Sequence[str]): Language codes, most preferred first.

    Returns:
        The fetched transcript, or None if no transcript could be obtained.
        Errors are logged and never raised.
    """
    try:
        transcripts = list(YouTubeTranscriptApi.list_transcripts(video_id))
        fetched = _fetch_best_transcript(transcripts, preferred_langs)
    except Exception as e:
        logger.error(f"Failed to get transcript for video {video_id}: {e}")
        return None

    if fetched is None:
        logger.error(
            f"Failed to get transcript for video {video_id}: "
            "No suitable transcript found"
        )
    return fetched
134
+
135
+
136
def extract_transcript_pytubefix(url, languages=("en", "es", "pt")):
    """
    Extract the captions of a YouTube video using pytubefix.

    For each code in ``languages`` (in order) the caption track keyed by that
    code is used if present, otherwise the track keyed ``a.<code>``
    (presumably pytubefix's key for auto-generated tracks; confirm against
    the library). If no preferred language is available, the first caption
    track is used.

    Args:
        url (str): The YouTube video URL.
        languages (Sequence[str]): Language codes, most preferred first.

    Returns:
        tuple: ``(txt_captions, srt_captions)``, or ``(None, None)`` when the
        video has no captions.
    """
    from pytubefix import YouTube

    # Read the property once and reuse it for every lookup below.
    captions = YouTube(url).captions
    # Logged rather than printed so library code does not write to stdout.
    logger.debug(f"Available captions: {captions}")

    if not captions:
        return None, None

    for lang in languages:
        if lang in captions:
            caption = captions[lang]
            break
        auto_key = f"a.{lang}"
        if auto_key in captions:
            caption = captions[auto_key]
            break
    else:  # No preferred language found, use the first available
        caption = captions[next(iter(captions))]

    srt_captions = caption.generate_srt_captions()
    txt_captions = caption.generate_txt_captions()
    return txt_captions, srt_captions
160
+
161
+
162
async def extract_youtube_transcript(state: ProcessSourceState):
    """
    Extract the title and transcript of the YouTube video at ``state.url``.

    The preferred transcript languages are read from
    ``CONFIG["youtube_transcripts"]["preferred_languages"]`` and default to
    ``["en", "es", "pt"]``.

    Args:
        state (ProcessSourceState): Processing state; only ``state.url`` is
            read here.

    Returns:
        dict: ``{"content": <plain-text transcript>, "title": <video title>,
        "metadata": {"video_id": <id>, "transcript": <raw transcript>}}``.

    Raises:
        ValueError: If ``state.url`` is empty, or the engine is unknown.
    """
    import asyncio

    # Explicit check: an `assert` would be stripped when running with -O.
    if not state.url:
        raise ValueError("No URL provided")
    logger.warning(f"Extracting transcript from URL: {state.url}")
    languages = CONFIG.get("youtube_transcripts", {}).get(
        "preferred_languages", ["en", "es", "pt"]
    )

    # quick fix since transcripts api is not working for now
    engine = "pytubefix"
    video_id = await _extract_youtube_id(state.url)

    try:
        title = await get_video_title(video_id)
    except Exception as e:
        logger.critical(f"Failed to get video title for video_id: {video_id}")
        logger.exception(e)
        title = ""

    if engine == "pytubefix":
        # extract_transcript_pytubefix is synchronous; run it in the default
        # executor so it does not block the event loop.
        loop = asyncio.get_running_loop()
        formatted_content, transcript_raw = await loop.run_in_executor(
            None, extract_transcript_pytubefix, state.url, languages
        )
    elif engine == "transcripts-api":
        transcript = await get_best_transcript(video_id, languages)

        logger.debug(f"Found transcript: {transcript}")
        formatter = TextFormatter()

        try:
            formatted_content = formatter.format_transcript(transcript)
        except Exception as e:
            logger.critical(f"Failed to format transcript for video_id: {video_id}")
            logger.exception(e)
            formatted_content = ""

        try:
            transcript_raw = transcript.to_raw_data()
        except Exception as e:
            logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
            logger.exception(e)
            transcript_raw = ""
    else:
        # Without this, an unknown engine would hit unbound result variables.
        raise ValueError(f"Unsupported transcript engine: {engine}")

    return {
        "content": formatted_content,
        "title": title,
        "metadata": {"video_id": video_id, "transcript": transcript_raw},
    }
@@ -410,7 +410,7 @@ wheels = [
410
410
 
411
411
  [[package]]
412
412
  name = "content-core"
413
- version = "1.0.2"
413
+ version = "1.0.3"
414
414
  source = { editable = "." }
415
415
  dependencies = [
416
416
  { name = "ai-prompter" },
@@ -435,6 +435,7 @@ dependencies = [
435
435
  { name = "python-magic" },
436
436
  { name = "python-magic-bin", marker = "(platform_machine != 'aarch64' and platform_system == 'Linux' and sys_platform == 'win32') or (platform_system != 'Darwin' and platform_system != 'Linux' and sys_platform == 'win32')" },
437
437
  { name = "python-pptx" },
438
+ { name = "pytubefix" },
438
439
  { name = "readability-lxml" },
439
440
  { name = "validators" },
440
441
  { name = "youtube-transcript-api" },
@@ -474,6 +475,7 @@ requires-dist = [
474
475
  { name = "python-magic", specifier = ">=0.4.27" },
475
476
  { name = "python-magic-bin", marker = "sys_platform == 'win32'", specifier = "==0.4.14" },
476
477
  { name = "python-pptx", specifier = ">=1.0.2" },
478
+ { name = "pytubefix", specifier = ">=9.1.1" },
477
479
  { name = "readability-lxml", specifier = ">=0.8.4.1" },
478
480
  { name = "validators", specifier = ">=0.34.0" },
479
481
  { name = "youtube-transcript-api", specifier = ">=1.0.3" },
@@ -2940,6 +2942,15 @@ wheels = [
2940
2942
  { url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788 },
2941
2943
  ]
2942
2944
 
2945
+ [[package]]
2946
+ name = "pytubefix"
2947
+ version = "9.1.1"
2948
+ source = { registry = "https://pypi.org/simple" }
2949
+ sdist = { url = "https://files.pythonhosted.org/packages/3c/06/8570fb8fc1296dae7f156e4de57c2b351856e7813873178e1cbb8045eef3/pytubefix-9.1.1.tar.gz", hash = "sha256:68946ab2192d7bb9d8fcc0fe73f634bb0ab0cd33f2c3c718e65c0c4fbdbccbb1", size = 734325 }
2950
+ wheels = [
2951
+ { url = "https://files.pythonhosted.org/packages/5d/fd/80ba35c78cbd007bfdb71d83b64087cca10e671cae4eb77875c952a21734/pytubefix-9.1.1-py3-none-any.whl", hash = "sha256:cc1c9cca936b82fcbf136e4630639417072aa5fdacf54ec0426604ca81c33b77", size = 732005 },
2952
+ ]
2953
+
2943
2954
  [[package]]
2944
2955
  name = "pytz"
2945
2956
  version = "2025.2"
@@ -1,195 +0,0 @@
1
- import asyncio
2
- import re
3
- import ssl
4
-
5
- import aiohttp
6
- from bs4 import BeautifulSoup
7
- from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
8
- from youtube_transcript_api.formatters import TextFormatter # type: ignore
9
-
10
- from content_core.common import ProcessSourceState
11
- from content_core.common.exceptions import NoTranscriptFound
12
- from content_core.config import CONFIG
13
- from content_core.logging import logger
14
-
15
- ssl._create_default_https_context = ssl._create_unverified_context
16
-
17
-
18
async def get_video_title(video_id):
    """
    Fetch the title of a YouTube video from its public watch page.

    The watch page is downloaded with aiohttp and parsed with BeautifulSoup;
    the title is read from the ``og:title`` meta tag.

    Args:
        video_id (str): The YouTube video ID.

    Returns:
        str | None: The video title, or None if the page could not be
        fetched or has no ``og:title`` content. Failures are logged and
        never raised.
    """
    url = f"https://www.youtube.com/watch?v={video_id}"
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                # Fail on HTTP errors instead of parsing an error page.
                response.raise_for_status()
                html = await response.text()

        # BeautifulSoup is synchronous, so parse once the download is done.
        soup = BeautifulSoup(html, "html.parser")

        # YouTube stores the title in the Open Graph meta tag.
        tag = soup.find("meta", property="og:title")
        title = tag.get("content") if tag is not None else None
        if title is None:
            logger.error(
                f"Failed to get video title: og:title not found for {video_id}"
            )
        return title

    except Exception as e:
        logger.error(f"Failed to get video title: {e}")
        return None
35
-
36
-
37
async def _extract_youtube_id(url):
    """
    Extract the YouTube video ID from a given URL using regular expressions.

    Recognised forms are ``youtu.be/<id>`` short links and ``youtube.com`` /
    ``youtube-nocookie.com`` URLs whose path is ``/watch?v=``,
    ``/watch?...&v=``, ``/embed/``, ``/v/``, ``/shorts/`` or ``/live/``.
    The pattern is searched for anywhere in the string, so extra subdomains
    and trailing query parameters do not prevent a match.

    Args:
        url (str): The YouTube URL from which to extract the video ID.

    Returns:
        str: The extracted YouTube video ID or None if no valid ID is found
        (including when ``url`` is empty or None).
    """
    # Guard first: re.search() raises TypeError on None.
    if not url:
        return None

    youtube_regex = (
        r"(?:https?://)?"  # Optional scheme
        r"(?:www\.)?"  # Optional www.
        r"(?:"
        r"youtu\.be/"  # Shortened URL
        r"|youtube(?:-nocookie)?\.com"  # Main or privacy-enhanced host
        r"(?:"  # Group start
        r"/embed/"  # Embed URL
        r"|/v/"  # Older video URL
        r"|/shorts/"  # Shorts URL
        r"|/live/"  # Live stream URL
        r"|/watch\?v="  # Standard watch URL
        r"|/watch\?.+&v="  # Watch URL with other parameters before v=
        r")"  # Group end
        r")"  # End main group
        r"([\w-]{11})"  # 11 characters (YouTube video ID)
    )

    match = re.search(youtube_regex, url)
    return match.group(1) if match else None
69
-
70
-
71
def _first_preferred_transcript(transcripts, preferred_langs):
    """Return the first transcript matching preferred_langs (in that order), or None."""
    for lang in preferred_langs:
        for transcript in transcripts:
            if transcript.language_code == lang:
                return transcript
    return None


def _fetch_best_transcript(transcripts, preferred_langs):
    """Pick a transcript by tier (manual, generated, translatable) and fetch it; None if none is usable."""
    manual = [
        t for t in transcripts if not t.is_generated and not t.is_translatable
    ]
    generated = [t for t in transcripts if t.is_generated and not t.is_translatable]
    translatable = [t for t in transcripts if t.is_translatable]

    # Tiers 1 and 2: preferred language if available, else the first one.
    for candidates in (manual, generated):
        if not candidates:
            continue
        try:
            chosen = _first_preferred_transcript(candidates, preferred_langs)
            if chosen is None:
                chosen = candidates[0]
            return chosen.fetch()
        except NoTranscriptFound:
            # NOTE(review): NoTranscriptFound is whatever this module imports
            # under that name; confirm it is the exception the transcript
            # API actually raises.
            continue

    # Tier 3: translatable transcripts.
    if translatable:
        try:
            chosen = _first_preferred_transcript(translatable, preferred_langs)
            if chosen is not None:
                return chosen.fetch()
            if preferred_langs:
                # No direct match: translate to the top preferred language.
                return translatable[0].translate(preferred_langs[0]).fetch()
            # No preference given, so there is nothing to translate to.
            return translatable[0].fetch()
        except NoTranscriptFound:
            pass

    return None


async def get_best_transcript(video_id, preferred_langs=("en", "es", "pt")):
    """
    Fetch the most suitable transcript for a YouTube video, with retries.

    Selection order:
      1. transcripts that are neither generated nor translatable,
      2. generated transcripts that are not translatable,
      3. translatable transcripts, translated to the first preferred
         language when none is already in a preferred language.
    Within a tier, the order of ``preferred_langs`` decides the winner.

    An exception whose class is named ``ParserError`` triggers a retry after
    a 2 second pause, up to 5 attempts in total. Any other failure is logged
    and ends the call immediately.

    Args:
        video_id (str): The YouTube video ID.
        preferred_langs (Sequence[str]): Language codes, most preferred first.

    Returns:
        The fetched transcript, or None if no transcript could be obtained.
        Errors are logged and never raised.
    """
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            transcripts = list(YouTubeTranscriptApi.list_transcripts(video_id))
            fetched = _fetch_best_transcript(transcripts, preferred_langs)
        except Exception as e:
            # Matched by class name: no ParserError class is imported here.
            if e.__class__.__name__ != "ParserError":
                logger.error(f"Failed to get transcript for video {video_id}: {e}")
                return None
            logger.warning(
                f"ParserError on attempt {attempt}/{max_attempts} for video {video_id}. Retrying..."
            )
            if attempt == max_attempts:
                logger.error(
                    f"Failed to get transcript for video {video_id} after {max_attempts} attempts due to repeated ParserError."
                )
                return None
            await asyncio.sleep(2)
            continue

        if fetched is None:
            logger.error(
                f"Failed to get transcript for video {video_id}: "
                "No suitable transcript found"
            )
        return fetched
    return None
152
-
153
-
154
async def extract_youtube_transcript(state: ProcessSourceState):
    """
    Extract the title and transcript of the YouTube video at ``state.url``.

    The preferred transcript languages are read from
    ``CONFIG["youtube_transcripts"]["preferred_languages"]`` and default to
    ``["en", "es", "pt"]``. If no transcript can be obtained, or formatting
    it fails, the affected fields are returned as empty strings.

    Args:
        state (ProcessSourceState): Processing state; only ``state.url`` is
            read here.

    Returns:
        dict: ``{"content": <plain-text transcript>, "title": <video title>,
        "metadata": {"video_id": <id>, "transcript": <raw transcript>}}``.

    Raises:
        ValueError: If ``state.url`` is empty.
    """
    # Explicit check: an `assert` would be stripped when running with -O.
    if not state.url:
        raise ValueError("No URL provided")
    logger.warning(f"Extracting transcript from URL: {state.url}")
    languages = CONFIG.get("youtube_transcripts", {}).get(
        "preferred_languages", ["en", "es", "pt"]
    )

    video_id = await _extract_youtube_id(state.url)
    transcript = await get_best_transcript(video_id, languages)

    logger.debug(f"Found transcript: {transcript}")
    try:
        title = await get_video_title(video_id)
    except Exception as e:
        logger.critical(f"Failed to get video title for video_id: {video_id}")
        logger.exception(e)
        title = ""

    formatted_content = ""
    transcript_raw = ""
    if transcript is None:
        # Nothing to format; skip the formatter instead of letting it raise
        # on None and log spurious tracebacks.
        logger.critical(f"No transcript available for video_id: {video_id}")
    else:
        try:
            formatted_content = TextFormatter().format_transcript(transcript)
        except Exception as e:
            logger.critical(f"Failed to format transcript for video_id: {video_id}")
            logger.exception(e)
            formatted_content = ""

        try:
            transcript_raw = transcript.to_raw_data()
        except Exception as e:
            logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
            logger.exception(e)
            transcript_raw = ""

    return {
        "content": formatted_content,
        "title": title,
        "metadata": {"video_id": video_id, "transcript": transcript_raw},
    }
File without changes
File without changes
File without changes
File without changes
File without changes