content-core 1.0.2__tar.gz → 1.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

Files changed (63) hide show
  1. {content_core-1.0.2 → content_core-1.0.4}/PKG-INFO +2 -1
  2. {content_core-1.0.2 → content_core-1.0.4}/pyproject.toml +2 -2
  3. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/content/summary/core.py +1 -1
  4. content_core-1.0.4/src/content_core/processors/youtube.py +219 -0
  5. {content_core-1.0.2 → content_core-1.0.4}/uv.lock +12 -1
  6. content_core-1.0.2/src/content_core/processors/youtube.py +0 -195
  7. {content_core-1.0.2 → content_core-1.0.4}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  8. {content_core-1.0.2 → content_core-1.0.4}/.github/workflows/publish.yml +0 -0
  9. {content_core-1.0.2 → content_core-1.0.4}/.gitignore +0 -0
  10. {content_core-1.0.2 → content_core-1.0.4}/.python-version +0 -0
  11. {content_core-1.0.2 → content_core-1.0.4}/CONTRIBUTING.md +0 -0
  12. {content_core-1.0.2 → content_core-1.0.4}/LICENSE +0 -0
  13. {content_core-1.0.2 → content_core-1.0.4}/Makefile +0 -0
  14. {content_core-1.0.2 → content_core-1.0.4}/README.md +0 -0
  15. {content_core-1.0.2 → content_core-1.0.4}/docs/processors.md +0 -0
  16. {content_core-1.0.2 → content_core-1.0.4}/docs/usage.md +0 -0
  17. {content_core-1.0.2 → content_core-1.0.4}/prompts/content/cleanup.jinja +0 -0
  18. {content_core-1.0.2 → content_core-1.0.4}/prompts/content/summarize.jinja +0 -0
  19. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/__init__.py +0 -0
  20. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/cc_config.yaml +0 -0
  21. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/common/__init__.py +0 -0
  22. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/common/exceptions.py +0 -0
  23. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/common/state.py +0 -0
  24. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/common/types.py +0 -0
  25. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/common/utils.py +0 -0
  26. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/config.py +0 -0
  27. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/content/__init__.py +0 -0
  28. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/content/cleanup/__init__.py +0 -0
  29. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/content/cleanup/core.py +0 -0
  30. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/content/extraction/__init__.py +0 -0
  31. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/content/extraction/graph.py +0 -0
  32. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/content/identification/__init__.py +0 -0
  33. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/content/summary/__init__.py +0 -0
  34. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/logging.py +0 -0
  35. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/models.py +0 -0
  36. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/models_config.yaml +0 -0
  37. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/notebooks/run.ipynb +0 -0
  38. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/processors/audio.py +0 -0
  39. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/processors/docling.py +0 -0
  40. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/processors/office.py +0 -0
  41. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/processors/pdf.py +0 -0
  42. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/processors/text.py +0 -0
  43. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/processors/url.py +0 -0
  44. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/processors/video.py +0 -0
  45. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/py.typed +0 -0
  46. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/templated_message.py +0 -0
  47. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/tools/__init__.py +0 -0
  48. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/tools/cleanup.py +0 -0
  49. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/tools/extract.py +0 -0
  50. {content_core-1.0.2 → content_core-1.0.4}/src/content_core/tools/summarize.py +0 -0
  51. {content_core-1.0.2 → content_core-1.0.4}/tests/input_content/file.docx +0 -0
  52. {content_core-1.0.2 → content_core-1.0.4}/tests/input_content/file.epub +0 -0
  53. {content_core-1.0.2 → content_core-1.0.4}/tests/input_content/file.md +0 -0
  54. {content_core-1.0.2 → content_core-1.0.4}/tests/input_content/file.mp3 +0 -0
  55. {content_core-1.0.2 → content_core-1.0.4}/tests/input_content/file.mp4 +0 -0
  56. {content_core-1.0.2 → content_core-1.0.4}/tests/input_content/file.pdf +0 -0
  57. {content_core-1.0.2 → content_core-1.0.4}/tests/input_content/file.pptx +0 -0
  58. {content_core-1.0.2 → content_core-1.0.4}/tests/input_content/file.txt +0 -0
  59. {content_core-1.0.2 → content_core-1.0.4}/tests/input_content/file.xlsx +0 -0
  60. {content_core-1.0.2 → content_core-1.0.4}/tests/input_content/file_audio.mp3 +0 -0
  61. {content_core-1.0.2 → content_core-1.0.4}/tests/integration/test_cli.py +0 -0
  62. {content_core-1.0.2 → content_core-1.0.4}/tests/integration/test_extraction.py +0 -0
  63. {content_core-1.0.2 → content_core-1.0.4}/tests/unit/test_docling.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.0.2
3
+ Version: 1.0.4
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -27,6 +27,7 @@ Requires-Dist: python-dotenv>=1.1.0
27
27
  Requires-Dist: python-magic-bin==0.4.14; sys_platform == 'win32'
28
28
  Requires-Dist: python-magic>=0.4.27
29
29
  Requires-Dist: python-pptx>=1.0.2
30
+ Requires-Dist: pytubefix>=9.1.1
30
31
  Requires-Dist: readability-lxml>=0.8.4.1
31
32
  Requires-Dist: validators>=0.34.0
32
33
  Requires-Dist: youtube-transcript-api>=1.0.3
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "content-core"
3
- version = "1.0.2"
3
+ version = "1.0.4"
4
4
  description = "Extract what matters from any media source"
5
5
  readme = "README.md"
6
6
  homepage = "https://github.com/lfnovo/content-core"
@@ -34,7 +34,7 @@ dependencies = [
34
34
  "pillow>=10.4.0",
35
35
  "asciidoc>=10.2.1",
36
36
  "python-magic-bin==0.4.14; sys_platform == 'win32'",
37
-
37
+ "pytubefix>=9.1.1",
38
38
  ]
39
39
 
40
40
  [project.scripts]
@@ -8,7 +8,7 @@ async def summarize(content: str, context: str) -> str:
8
8
  templated_message_fn = partial(templated_message, model=ModelFactory.get_model('summary_model'))
9
9
  response = await templated_message_fn(
10
10
  TemplatedMessageInput(
11
- user_prompt_template="content/summarize",
11
+ user_prompt_template="prompts/content/summarize",
12
12
  data={"content": content, "context": context},
13
13
  )
14
14
  )
@@ -0,0 +1,219 @@
1
+ import re
2
+ import ssl
3
+
4
+ import aiohttp
5
+ from bs4 import BeautifulSoup
6
+ from content_core.common import ProcessSourceState
7
+ from content_core.common.exceptions import NoTranscriptFound
8
+ from content_core.config import CONFIG
9
+ from content_core.logging import logger
10
+ from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
11
+ from youtube_transcript_api.formatters import TextFormatter # type: ignore
12
+
13
+ ssl._create_default_https_context = ssl._create_unverified_context
14
+
15
+
async def get_video_title(video_id):
    """
    Fetch the title of a YouTube video by scraping its watch page.

    Args:
        video_id (str): The YouTube video ID to look up.

    Returns:
        str | None: The ``content`` attribute of the page's ``og:title`` meta
        tag, or None when no ID was given, the request or parsing fails, or
        the tag is missing.
    """
    if not video_id:
        # Avoid requesting ".../watch?v=None" when no ID could be extracted.
        logger.error("Failed to get video title: no video ID provided")
        return None

    try:
        url = f"https://www.youtube.com/watch?v={video_id}"
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                html = await response.text()

        # BeautifulSoup doesn't support async operations, so parse only after
        # the body has been read.
        soup = BeautifulSoup(html, "html.parser")

        # YouTube stores the title in a meta tag.
        title_tag = soup.find("meta", property="og:title")
        title = title_tag.get("content") if title_tag is not None else None
        if title is None:
            logger.error(
                f"Failed to get video title: og:title meta tag not found "
                f"for video {video_id}"
            )
            return None
        return title

    except Exception as e:
        # Best-effort lookup: any failure is logged and reported as "no title".
        logger.error(f"Failed to get video title: {e}")
        return None
33
+
34
+
async def _extract_youtube_id(url):
    """
    Extract the YouTube video ID from a given URL using regular expressions.

    Recognised forms: ``youtu.be/<id>`` and, on ``youtube.com``, the
    ``/watch?v=``, ``/watch?...&v=``, ``/embed/``, ``/v/``, ``/shorts/`` and
    ``/live/`` paths.

    Args:
        url (str): The YouTube URL from which to extract the video ID.

    Returns:
        str: The extracted YouTube video ID or None if no valid ID is found
        (including when ``url`` is empty or not a string).
    """
    # re.search would raise TypeError on None; treat "no URL" as "no ID".
    if not url or not isinstance(url, str):
        return None

    # Regular expression pattern capturing the YouTube video ID
    youtube_regex = (
        r"(?:https?://)?"  # Optional scheme
        r"(?:www\.)?"  # Optional www.
        r"(?:"
        r"youtu\.be/"  # Shortened URL
        r"|youtube\.com"  # Main URL
        r"(?:"  # Group start
        r"/embed/"  # Embed URL
        r"|/v/"  # Older video URL
        r"|/shorts/"  # Shorts URL
        r"|/live/"  # Live stream URL
        r"|/watch\?v="  # Standard watch URL
        r"|/watch\?.+&v="  # Watch URL with other parameters before v=
        r")"  # Group end
        r")"  # End main group
        r"([\w-]{11})"  # 11 characters (YouTube video ID)
    )

    # Search (not match) so hosts such as m.youtube.com are still accepted.
    match = re.search(youtube_regex, url)

    # Return the video ID if a match is found
    return match.group(1) if match else None
67
+
68
+
async def get_best_transcript(video_id, preferred_langs=None):
    """
    Pick and fetch the most suitable transcript for a YouTube video.

    Candidates are tried in three passes, each honouring the order of
    ``preferred_langs``:

    1. transcripts that are neither auto-generated nor translatable,
    2. auto-generated transcripts that are not translatable,
    3. translatable transcripts; if none is in a preferred language, the
       first one is translated to the first preferred language.

    In passes 1 and 2, when no preferred language matches, the first
    candidate of that pass is used.

    Args:
        video_id (str): The YouTube video ID.
        preferred_langs (list[str] | None): Language codes in order of
            preference. Defaults to ``["en", "es", "pt"]``.

    Returns:
        The result of ``fetch()`` on the chosen transcript, or None when no
        transcript is available or any error occurs (the error is logged).
    """
    # A None default replaces the shared mutable list default.
    if preferred_langs is None:
        preferred_langs = ["en", "es", "pt"]

    try:
        # NOTE(review): list_transcripts is called statically here; confirm it
        # is still offered by the installed youtube-transcript-api release.
        transcripts = list(YouTubeTranscriptApi.list_transcripts(video_id))

        manual = [
            t for t in transcripts if not t.is_generated and not t.is_translatable
        ]
        generated = [
            t for t in transcripts if t.is_generated and not t.is_translatable
        ]
        translatable = [t for t in transcripts if t.is_translatable]

        # Passes 1 and 2: preferred language first, else the first candidate.
        for candidates in (manual, generated):
            if not candidates:
                continue
            try:
                chosen = _pick_transcript(candidates, preferred_langs)
                if chosen is None:
                    chosen = candidates[0]
                return chosen.fetch()
            except NoTranscriptFound:
                # Fall through to the next, less preferred pass.
                continue

        # Pass 3: translatable transcripts.
        if translatable:
            try:
                chosen = _pick_transcript(translatable, preferred_langs)
                if chosen is not None:
                    return chosen.fetch()
                if preferred_langs:
                    translation = translatable[0].translate(preferred_langs[0])
                    return translation.fetch()
                # No preference to translate to: use the transcript as-is
                # instead of failing on preferred_langs[0].
                return translatable[0].fetch()
            except NoTranscriptFound:
                pass

        logger.error(
            f"Failed to get transcript for video {video_id}: "
            "No suitable transcript found"
        )
        return None

    except Exception as e:
        logger.error(f"Failed to get transcript for video {video_id}: {e}")
        return None


def _pick_transcript(candidates, preferred_langs):
    """Return the first candidate whose language_code matches, in preference order, else None."""
    for lang in preferred_langs:
        for transcript in candidates:
            if transcript.language_code == lang:
                return transcript
    return None
133
+
134
+
def extract_transcript_pytubefix(url, languages=None):
    """
    Fetch a video's captions through pytubefix.

    For each language in ``languages`` (in order) the caption track keyed by
    the plain code is tried first, then the one keyed ``a.<code>``. If no
    preferred language is available, the first available track is used.

    Args:
        url (str): URL of the YouTube video.
        languages (list[str] | None): Language codes in order of preference.
            Defaults to ``["en", "es", "pt"]``.

    Returns:
        tuple: ``(txt_captions, srt_captions)`` generated from the selected
        track, or ``(None, None)`` when the video has no captions or caption
        generation fails (the failure is logged).
    """
    # Kept as a function-local import, matching the original placement.
    from pytubefix import YouTube

    # A None default replaces the shared mutable list default.
    if languages is None:
        languages = ["en", "es", "pt"]

    # Read the captions once instead of re-evaluating yt.captions per lookup.
    captions = YouTube(url).captions
    logger.debug(f"Captions: {captions}")

    if not captions:
        return None, None

    # Try to get captions in the preferred languages
    caption = None
    for lang in languages:
        # "a.<lang>" is presumably the auto-generated track — TODO confirm.
        for code in (lang, f"a.{lang}"):
            if code in captions:
                caption = captions[code]
                break
        if caption is not None:
            break

    if caption is None:
        # No preferred language found, use the first available.
        # The entries returned by keys() are used via their .code attribute,
        # exactly as before — TODO confirm against pytubefix's CaptionQuery.
        first_entry = list(captions.keys())[0]
        caption = captions[first_entry.code]

    try:
        srt_captions = caption.generate_srt_captions()
        txt_captions = caption.generate_txt_captions()
        return txt_captions, srt_captions
    except KeyError as e:
        logger.error(f"KeyError while generating captions for {caption}: {e}")
        return None, None
    except Exception as e:
        logger.error(
            f"Unexpected error while generating captions for {caption}: {e}"
        )
        return None, None
167
+
168
+
async def extract_youtube_transcript(state: ProcessSourceState):
    """
    Extract the title and transcript of the YouTube video at ``state.url``.

    Args:
        state (ProcessSourceState): Processing state; only ``state.url`` is
            read here.

    Returns:
        dict: ``content`` (formatted transcript text), ``title`` and
        ``metadata`` (holding ``video_id`` and the raw ``transcript``).

    Raises:
        AssertionError: If ``state.url`` is empty.
    """
    import asyncio

    if not state.url:
        # Explicit raise instead of `assert` so the check survives `python -O`;
        # the exception type is kept for existing callers.
        raise AssertionError("No URL provided")
    logger.warning(f"Extracting transcript from URL: {state.url}")
    languages = CONFIG.get("youtube_transcripts", {}).get(
        "preferred_languages", ["en", "es", "pt"]
    )

    # quick fix since transcripts api is not working for now
    engine = "pytubefix"
    video_id = await _extract_youtube_id(state.url)

    try:
        title = await get_video_title(video_id)
    except Exception as e:
        logger.critical(f"Failed to get video title for video_id: {video_id}")
        logger.exception(e)
        title = ""

    # Defaults so the return below never hits an unbound name if `engine`
    # matches neither branch.
    formatted_content = ""
    transcript_raw = ""

    if engine == "pytubefix":
        # pytubefix is synchronous; run it in the default executor so the
        # event loop is not blocked while it works.
        loop = asyncio.get_running_loop()
        formatted_content, transcript_raw = await loop.run_in_executor(
            None, extract_transcript_pytubefix, state.url, languages
        )
    elif engine == "transcripts-api":
        transcript = await get_best_transcript(video_id, languages)

        logger.debug(f"Found transcript: {transcript}")
        formatter = TextFormatter()

        try:
            formatted_content = formatter.format_transcript(transcript)
        except Exception as e:
            logger.critical(f"Failed to format transcript for video_id: {video_id}")
            logger.exception(e)
            formatted_content = ""

        try:
            transcript_raw = transcript.to_raw_data()
        except Exception as e:
            logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
            logger.exception(e)
            transcript_raw = ""

    return {
        "content": formatted_content,
        "title": title,
        "metadata": {"video_id": video_id, "transcript": transcript_raw},
    }
@@ -410,7 +410,7 @@ wheels = [
410
410
 
411
411
  [[package]]
412
412
  name = "content-core"
413
- version = "1.0.2"
413
+ version = "1.0.4"
414
414
  source = { editable = "." }
415
415
  dependencies = [
416
416
  { name = "ai-prompter" },
@@ -435,6 +435,7 @@ dependencies = [
435
435
  { name = "python-magic" },
436
436
  { name = "python-magic-bin", marker = "(platform_machine != 'aarch64' and platform_system == 'Linux' and sys_platform == 'win32') or (platform_system != 'Darwin' and platform_system != 'Linux' and sys_platform == 'win32')" },
437
437
  { name = "python-pptx" },
438
+ { name = "pytubefix" },
438
439
  { name = "readability-lxml" },
439
440
  { name = "validators" },
440
441
  { name = "youtube-transcript-api" },
@@ -474,6 +475,7 @@ requires-dist = [
474
475
  { name = "python-magic", specifier = ">=0.4.27" },
475
476
  { name = "python-magic-bin", marker = "sys_platform == 'win32'", specifier = "==0.4.14" },
476
477
  { name = "python-pptx", specifier = ">=1.0.2" },
478
+ { name = "pytubefix", specifier = ">=9.1.1" },
477
479
  { name = "readability-lxml", specifier = ">=0.8.4.1" },
478
480
  { name = "validators", specifier = ">=0.34.0" },
479
481
  { name = "youtube-transcript-api", specifier = ">=1.0.3" },
@@ -2940,6 +2942,15 @@ wheels = [
2940
2942
  { url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788 },
2941
2943
  ]
2942
2944
 
2945
+ [[package]]
2946
+ name = "pytubefix"
2947
+ version = "9.1.1"
2948
+ source = { registry = "https://pypi.org/simple" }
2949
+ sdist = { url = "https://files.pythonhosted.org/packages/3c/06/8570fb8fc1296dae7f156e4de57c2b351856e7813873178e1cbb8045eef3/pytubefix-9.1.1.tar.gz", hash = "sha256:68946ab2192d7bb9d8fcc0fe73f634bb0ab0cd33f2c3c718e65c0c4fbdbccbb1", size = 734325 }
2950
+ wheels = [
2951
+ { url = "https://files.pythonhosted.org/packages/5d/fd/80ba35c78cbd007bfdb71d83b64087cca10e671cae4eb77875c952a21734/pytubefix-9.1.1-py3-none-any.whl", hash = "sha256:cc1c9cca936b82fcbf136e4630639417072aa5fdacf54ec0426604ca81c33b77", size = 732005 },
2952
+ ]
2953
+
2943
2954
  [[package]]
2944
2955
  name = "pytz"
2945
2956
  version = "2025.2"
@@ -1,195 +0,0 @@
1
- import asyncio
2
- import re
3
- import ssl
4
-
5
- import aiohttp
6
- from bs4 import BeautifulSoup
7
- from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
8
- from youtube_transcript_api.formatters import TextFormatter # type: ignore
9
-
10
- from content_core.common import ProcessSourceState
11
- from content_core.common.exceptions import NoTranscriptFound
12
- from content_core.config import CONFIG
13
- from content_core.logging import logger
14
-
15
- ssl._create_default_https_context = ssl._create_unverified_context
16
-
17
-
18
- async def get_video_title(video_id):
19
- try:
20
- url = f"https://www.youtube.com/watch?v={video_id}"
21
- async with aiohttp.ClientSession() as session:
22
- async with session.get(url) as response:
23
- html = await response.text()
24
-
25
- # BeautifulSoup doesn't support async operations
26
- soup = BeautifulSoup(html, "html.parser")
27
-
28
- # YouTube stores title in a meta tag
29
- title = soup.find("meta", property="og:title")["content"]
30
- return title
31
-
32
- except Exception as e:
33
- logger.error(f"Failed to get video title: {e}")
34
- return None
35
-
36
-
37
- async def _extract_youtube_id(url):
38
- """
39
- Extract the YouTube video ID from a given URL using regular expressions.
40
-
41
- Args:
42
- url (str): The YouTube URL from which to extract the video ID.
43
-
44
- Returns:
45
- str: The extracted YouTube video ID or None if no valid ID is found.
46
- """
47
- # Define a regular expression pattern to capture the YouTube video ID
48
- youtube_regex = (
49
- r"(?:https?://)?" # Optional scheme
50
- r"(?:www\.)?" # Optional www.
51
- r"(?:"
52
- r"youtu\.be/" # Shortened URL
53
- r"|youtube\.com" # Main URL
54
- r"(?:" # Group start
55
- r"/embed/" # Embed URL
56
- r"|/v/" # Older video URL
57
- r"|/watch\?v=" # Standard watch URL
58
- r"|/watch\?.+&v=" # Other watch URL
59
- r")" # Group end
60
- r")" # End main group
61
- r"([\w-]{11})" # 11 characters (YouTube video ID)
62
- )
63
-
64
- # Search the URL for the pattern
65
- match = re.search(youtube_regex, url)
66
-
67
- # Return the video ID if a match is found
68
- return match.group(1) if match else None
69
-
70
-
71
- async def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
72
- max_attempts = 5
73
- for attempt in range(max_attempts):
74
- try:
75
- transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
76
-
77
- # First try: Manual transcripts in preferred languages
78
- manual_transcripts = []
79
- try:
80
- for transcript in transcript_list:
81
- if not transcript.is_generated and not transcript.is_translatable:
82
- manual_transcripts.append(transcript)
83
-
84
- if manual_transcripts:
85
- # Sort based on preferred language order
86
- for lang in preferred_langs:
87
- for transcript in manual_transcripts:
88
- if transcript.language_code == lang:
89
- return transcript.fetch()
90
- # If no preferred language found, return first manual transcript
91
- return manual_transcripts[0].fetch()
92
- except NoTranscriptFound:
93
- pass
94
-
95
- # Second try: Auto-generated transcripts in preferred languages
96
- generated_transcripts = []
97
- try:
98
- for transcript in transcript_list:
99
- if transcript.is_generated and not transcript.is_translatable:
100
- generated_transcripts.append(transcript)
101
-
102
- if generated_transcripts:
103
- # Sort based on preferred language order
104
- for lang in preferred_langs:
105
- for transcript in generated_transcripts:
106
- if transcript.language_code == lang:
107
- return transcript.fetch()
108
- # If no preferred language found, return first generated transcript
109
- return generated_transcripts[0].fetch()
110
- except NoTranscriptFound:
111
- pass
112
-
113
- # Last try: Translated transcripts in preferred languages
114
- translated_transcripts = []
115
- try:
116
- for transcript in transcript_list:
117
- if transcript.is_translatable:
118
- translated_transcripts.append(transcript)
119
-
120
- if translated_transcripts:
121
- # Sort based on preferred language order
122
- for lang in preferred_langs:
123
- for transcript in translated_transcripts:
124
- if transcript.language_code == lang:
125
- return transcript.fetch()
126
- # If no preferred language found, return translation to first preferred language
127
- translation = translated_transcripts[0].translate(
128
- preferred_langs[0]
129
- )
130
- return translation.fetch()
131
- except NoTranscriptFound:
132
- pass
133
-
134
- raise Exception("No suitable transcript found")
135
-
136
- except Exception as e:
137
- if e.__class__.__name__ == "ParserError":
138
- logger.warning(
139
- f"ParserError on attempt {attempt+1}/5 for video {video_id}. Retrying..."
140
- )
141
- if attempt == max_attempts - 1:
142
- logger.error(
143
- f"Failed to get transcript for video {video_id} after {max_attempts} attempts due to repeated ParserError."
144
- )
145
- return None
146
- await asyncio.sleep(2)
147
- continue
148
- else:
149
- logger.error(f"Failed to get transcript for video {video_id}: {e}")
150
- return None
151
- return None
152
-
153
-
154
- async def extract_youtube_transcript(state: ProcessSourceState):
155
- """
156
- Parse the text file and print its content.
157
- """
158
-
159
- assert state.url, "No URL provided"
160
- logger.warning(f"Extracting transcript from URL: {state.url}")
161
- languages = CONFIG.get("youtube_transcripts", {}).get(
162
- "preferred_languages", ["en", "es", "pt"]
163
- )
164
-
165
- video_id = await _extract_youtube_id(state.url)
166
- transcript = await get_best_transcript(video_id, languages)
167
-
168
- logger.debug(f"Found transcript: {transcript}")
169
- formatter = TextFormatter()
170
- try:
171
- title = await get_video_title(video_id)
172
- except Exception as e:
173
- logger.critical(f"Failed to get video title for video_id: {video_id}")
174
- logger.exception(e)
175
- title = ""
176
-
177
- try:
178
- formatted_content = formatter.format_transcript(transcript)
179
- except Exception as e:
180
- logger.critical(f"Failed to format transcript for video_id: {video_id}")
181
- logger.exception(e)
182
- formatted_content = ""
183
-
184
- try:
185
- transcript_raw = transcript.to_raw_data()
186
- except Exception as e:
187
- logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
188
- logger.exception(e)
189
- transcript_raw = ""
190
-
191
- return {
192
- "content": formatted_content,
193
- "title": title,
194
- "metadata": {"video_id": video_id, "transcript": transcript_raw},
195
- }
File without changes
File without changes
File without changes
File without changes
File without changes