content-core 1.0.2__tar.gz → 1.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- {content_core-1.0.2 → content_core-1.0.3}/PKG-INFO +2 -1
- {content_core-1.0.2 → content_core-1.0.3}/pyproject.toml +2 -2
- content_core-1.0.3/src/content_core/processors/youtube.py +212 -0
- {content_core-1.0.2 → content_core-1.0.3}/uv.lock +12 -1
- content_core-1.0.2/src/content_core/processors/youtube.py +0 -195
- {content_core-1.0.2 → content_core-1.0.3}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/.github/workflows/publish.yml +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/.gitignore +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/.python-version +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/CONTRIBUTING.md +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/LICENSE +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/Makefile +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/README.md +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/docs/processors.md +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/docs/usage.md +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/prompts/content/cleanup.jinja +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/prompts/content/summarize.jinja +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/__init__.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/cc_config.yaml +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/common/__init__.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/common/exceptions.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/common/state.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/common/types.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/common/utils.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/config.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/__init__.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/cleanup/core.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/extraction/graph.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/identification/__init__.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/summary/core.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/logging.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/models.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/models_config.yaml +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/notebooks/run.ipynb +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/processors/audio.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/processors/docling.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/processors/office.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/processors/pdf.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/processors/text.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/processors/url.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/processors/video.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/py.typed +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/templated_message.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/tools/__init__.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/tools/cleanup.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/tools/extract.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/src/content_core/tools/summarize.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.docx +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.epub +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.md +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.mp3 +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.mp4 +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.pdf +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.pptx +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.txt +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file.xlsx +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/tests/input_content/file_audio.mp3 +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/tests/integration/test_cli.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/tests/integration/test_extraction.py +0 -0
- {content_core-1.0.2 → content_core-1.0.3}/tests/unit/test_docling.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.0.3
|
|
4
4
|
Summary: Extract what matters from any media source
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -27,6 +27,7 @@ Requires-Dist: python-dotenv>=1.1.0
|
|
|
27
27
|
Requires-Dist: python-magic-bin==0.4.14; sys_platform == 'win32'
|
|
28
28
|
Requires-Dist: python-magic>=0.4.27
|
|
29
29
|
Requires-Dist: python-pptx>=1.0.2
|
|
30
|
+
Requires-Dist: pytubefix>=9.1.1
|
|
30
31
|
Requires-Dist: readability-lxml>=0.8.4.1
|
|
31
32
|
Requires-Dist: validators>=0.34.0
|
|
32
33
|
Requires-Dist: youtube-transcript-api>=1.0.3
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "content-core"
|
|
3
|
-
version = "1.0.2"
|
|
3
|
+
version = "1.0.3"
|
|
4
4
|
description = "Extract what matters from any media source"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
homepage = "https://github.com/lfnovo/content-core"
|
|
@@ -34,7 +34,7 @@ dependencies = [
|
|
|
34
34
|
"pillow>=10.4.0",
|
|
35
35
|
"asciidoc>=10.2.1",
|
|
36
36
|
"python-magic-bin==0.4.14; sys_platform == 'win32'",
|
|
37
|
-
|
|
37
|
+
"pytubefix>=9.1.1",
|
|
38
38
|
]
|
|
39
39
|
|
|
40
40
|
[project.scripts]
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import ssl
|
|
3
|
+
|
|
4
|
+
import aiohttp
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
|
7
|
+
from youtube_transcript_api.formatters import TextFormatter # type: ignore
|
|
8
|
+
|
|
9
|
+
from content_core.common import ProcessSourceState
|
|
10
|
+
from content_core.common.exceptions import NoTranscriptFound
|
|
11
|
+
from content_core.config import CONFIG
|
|
12
|
+
from content_core.logging import logger
|
|
13
|
+
|
|
14
|
+
ssl._create_default_https_context = ssl._create_unverified_context
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
async def get_video_title(video_id):
|
|
18
|
+
try:
|
|
19
|
+
url = f"https://www.youtube.com/watch?v={video_id}"
|
|
20
|
+
async with aiohttp.ClientSession() as session:
|
|
21
|
+
async with session.get(url) as response:
|
|
22
|
+
html = await response.text()
|
|
23
|
+
|
|
24
|
+
# BeautifulSoup doesn't support async operations
|
|
25
|
+
soup = BeautifulSoup(html, "html.parser")
|
|
26
|
+
|
|
27
|
+
# YouTube stores title in a meta tag
|
|
28
|
+
title = soup.find("meta", property="og:title")["content"]
|
|
29
|
+
return title
|
|
30
|
+
|
|
31
|
+
except Exception as e:
|
|
32
|
+
logger.error(f"Failed to get video title: {e}")
|
|
33
|
+
return None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
async def _extract_youtube_id(url):
|
|
37
|
+
"""
|
|
38
|
+
Extract the YouTube video ID from a given URL using regular expressions.
|
|
39
|
+
|
|
40
|
+
Args:
|
|
41
|
+
url (str): The YouTube URL from which to extract the video ID.
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
str: The extracted YouTube video ID or None if no valid ID is found.
|
|
45
|
+
"""
|
|
46
|
+
# Define a regular expression pattern to capture the YouTube video ID
|
|
47
|
+
youtube_regex = (
|
|
48
|
+
r"(?:https?://)?" # Optional scheme
|
|
49
|
+
r"(?:www\.)?" # Optional www.
|
|
50
|
+
r"(?:"
|
|
51
|
+
r"youtu\.be/" # Shortened URL
|
|
52
|
+
r"|youtube\.com" # Main URL
|
|
53
|
+
r"(?:" # Group start
|
|
54
|
+
r"/embed/" # Embed URL
|
|
55
|
+
r"|/v/" # Older video URL
|
|
56
|
+
r"|/watch\?v=" # Standard watch URL
|
|
57
|
+
r"|/watch\?.+&v=" # Other watch URL
|
|
58
|
+
r")" # Group end
|
|
59
|
+
r")" # End main group
|
|
60
|
+
r"([\w-]{11})" # 11 characters (YouTube video ID)
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
# Search the URL for the pattern
|
|
64
|
+
match = re.search(youtube_regex, url)
|
|
65
|
+
|
|
66
|
+
# Return the video ID if a match is found
|
|
67
|
+
return match.group(1) if match else None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
async def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
|
|
71
|
+
try:
|
|
72
|
+
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
|
|
73
|
+
|
|
74
|
+
# First try: Manual transcripts in preferred languages
|
|
75
|
+
manual_transcripts = []
|
|
76
|
+
try:
|
|
77
|
+
for transcript in transcript_list:
|
|
78
|
+
if not transcript.is_generated and not transcript.is_translatable:
|
|
79
|
+
manual_transcripts.append(transcript)
|
|
80
|
+
|
|
81
|
+
if manual_transcripts:
|
|
82
|
+
# Sort based on preferred language order
|
|
83
|
+
for lang in preferred_langs:
|
|
84
|
+
for transcript in manual_transcripts:
|
|
85
|
+
if transcript.language_code == lang:
|
|
86
|
+
return transcript.fetch()
|
|
87
|
+
# If no preferred language found, return first manual transcript
|
|
88
|
+
return manual_transcripts[0].fetch()
|
|
89
|
+
except NoTranscriptFound:
|
|
90
|
+
pass
|
|
91
|
+
|
|
92
|
+
# Second try: Auto-generated transcripts in preferred languages
|
|
93
|
+
generated_transcripts = []
|
|
94
|
+
try:
|
|
95
|
+
for transcript in transcript_list:
|
|
96
|
+
if transcript.is_generated and not transcript.is_translatable:
|
|
97
|
+
generated_transcripts.append(transcript)
|
|
98
|
+
|
|
99
|
+
if generated_transcripts:
|
|
100
|
+
# Sort based on preferred language order
|
|
101
|
+
for lang in preferred_langs:
|
|
102
|
+
for transcript in generated_transcripts:
|
|
103
|
+
if transcript.language_code == lang:
|
|
104
|
+
return transcript.fetch()
|
|
105
|
+
# If no preferred language found, return first generated transcript
|
|
106
|
+
return generated_transcripts[0].fetch()
|
|
107
|
+
except NoTranscriptFound:
|
|
108
|
+
pass
|
|
109
|
+
|
|
110
|
+
# Last try: Translated transcripts in preferred languages
|
|
111
|
+
translated_transcripts = []
|
|
112
|
+
try:
|
|
113
|
+
for transcript in transcript_list:
|
|
114
|
+
if transcript.is_translatable:
|
|
115
|
+
translated_transcripts.append(transcript)
|
|
116
|
+
|
|
117
|
+
if translated_transcripts:
|
|
118
|
+
# Sort based on preferred language order
|
|
119
|
+
for lang in preferred_langs:
|
|
120
|
+
for transcript in translated_transcripts:
|
|
121
|
+
if transcript.language_code == lang:
|
|
122
|
+
return transcript.fetch()
|
|
123
|
+
# If no preferred language found, return translation to first preferred language
|
|
124
|
+
translation = translated_transcripts[0].translate(preferred_langs[0])
|
|
125
|
+
return translation.fetch()
|
|
126
|
+
except NoTranscriptFound:
|
|
127
|
+
pass
|
|
128
|
+
|
|
129
|
+
raise Exception("No suitable transcript found")
|
|
130
|
+
|
|
131
|
+
except Exception as e:
|
|
132
|
+
logger.error(f"Failed to get transcript for video {video_id}: {e}")
|
|
133
|
+
return None
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def extract_transcript_pytubefix(url, languages=["en", "es", "pt"]):
|
|
137
|
+
from pytubefix import YouTube
|
|
138
|
+
|
|
139
|
+
yt = YouTube(url)
|
|
140
|
+
print(yt.captions)
|
|
141
|
+
|
|
142
|
+
# Try to get captions in the preferred languages
|
|
143
|
+
if yt.captions:
|
|
144
|
+
for lang in languages:
|
|
145
|
+
if lang in yt.captions:
|
|
146
|
+
caption = yt.captions[lang]
|
|
147
|
+
break
|
|
148
|
+
elif f"a.{lang}" in yt.captions:
|
|
149
|
+
caption = yt.captions[f"a.{lang}"]
|
|
150
|
+
break
|
|
151
|
+
else: # No preferred language found, use the first available
|
|
152
|
+
caption_key = next(iter(yt.captions))
|
|
153
|
+
caption = yt.captions[caption_key]
|
|
154
|
+
|
|
155
|
+
srt_captions = caption.generate_srt_captions()
|
|
156
|
+
txt_captions = caption.generate_txt_captions()
|
|
157
|
+
return txt_captions, srt_captions
|
|
158
|
+
|
|
159
|
+
return None, None
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
async def extract_youtube_transcript(state: ProcessSourceState):
|
|
163
|
+
"""
|
|
164
|
+
Parse the text file and print its content.
|
|
165
|
+
"""
|
|
166
|
+
|
|
167
|
+
assert state.url, "No URL provided"
|
|
168
|
+
logger.warning(f"Extracting transcript from URL: {state.url}")
|
|
169
|
+
languages = CONFIG.get("youtube_transcripts", {}).get(
|
|
170
|
+
"preferred_languages", ["en", "es", "pt"]
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
# quick fix since transcripts api is not working for now
|
|
174
|
+
engine = "pytubefix"
|
|
175
|
+
video_id = await _extract_youtube_id(state.url)
|
|
176
|
+
|
|
177
|
+
try:
|
|
178
|
+
title = await get_video_title(video_id)
|
|
179
|
+
except Exception as e:
|
|
180
|
+
logger.critical(f"Failed to get video title for video_id: {video_id}")
|
|
181
|
+
logger.exception(e)
|
|
182
|
+
title = ""
|
|
183
|
+
|
|
184
|
+
if engine == "pytubefix":
|
|
185
|
+
formatted_content, transcript_raw = extract_transcript_pytubefix(
|
|
186
|
+
state.url, languages
|
|
187
|
+
)
|
|
188
|
+
if engine == "transcripts-api":
|
|
189
|
+
transcript = await get_best_transcript(video_id, languages)
|
|
190
|
+
|
|
191
|
+
logger.debug(f"Found transcript: {transcript}")
|
|
192
|
+
formatter = TextFormatter()
|
|
193
|
+
|
|
194
|
+
try:
|
|
195
|
+
formatted_content = formatter.format_transcript(transcript)
|
|
196
|
+
except Exception as e:
|
|
197
|
+
logger.critical(f"Failed to format transcript for video_id: {video_id}")
|
|
198
|
+
logger.exception(e)
|
|
199
|
+
formatted_content = ""
|
|
200
|
+
|
|
201
|
+
try:
|
|
202
|
+
transcript_raw = transcript.to_raw_data()
|
|
203
|
+
except Exception as e:
|
|
204
|
+
logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
|
|
205
|
+
logger.exception(e)
|
|
206
|
+
transcript_raw = ""
|
|
207
|
+
|
|
208
|
+
return {
|
|
209
|
+
"content": formatted_content,
|
|
210
|
+
"title": title,
|
|
211
|
+
"metadata": {"video_id": video_id, "transcript": transcript_raw},
|
|
212
|
+
}
|
|
@@ -410,7 +410,7 @@ wheels = [
|
|
|
410
410
|
|
|
411
411
|
[[package]]
|
|
412
412
|
name = "content-core"
|
|
413
|
-
version = "1.0.2"
|
|
413
|
+
version = "1.0.3"
|
|
414
414
|
source = { editable = "." }
|
|
415
415
|
dependencies = [
|
|
416
416
|
{ name = "ai-prompter" },
|
|
@@ -435,6 +435,7 @@ dependencies = [
|
|
|
435
435
|
{ name = "python-magic" },
|
|
436
436
|
{ name = "python-magic-bin", marker = "(platform_machine != 'aarch64' and platform_system == 'Linux' and sys_platform == 'win32') or (platform_system != 'Darwin' and platform_system != 'Linux' and sys_platform == 'win32')" },
|
|
437
437
|
{ name = "python-pptx" },
|
|
438
|
+
{ name = "pytubefix" },
|
|
438
439
|
{ name = "readability-lxml" },
|
|
439
440
|
{ name = "validators" },
|
|
440
441
|
{ name = "youtube-transcript-api" },
|
|
@@ -474,6 +475,7 @@ requires-dist = [
|
|
|
474
475
|
{ name = "python-magic", specifier = ">=0.4.27" },
|
|
475
476
|
{ name = "python-magic-bin", marker = "sys_platform == 'win32'", specifier = "==0.4.14" },
|
|
476
477
|
{ name = "python-pptx", specifier = ">=1.0.2" },
|
|
478
|
+
{ name = "pytubefix", specifier = ">=9.1.1" },
|
|
477
479
|
{ name = "readability-lxml", specifier = ">=0.8.4.1" },
|
|
478
480
|
{ name = "validators", specifier = ">=0.34.0" },
|
|
479
481
|
{ name = "youtube-transcript-api", specifier = ">=1.0.3" },
|
|
@@ -2940,6 +2942,15 @@ wheels = [
|
|
|
2940
2942
|
{ url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788 },
|
|
2941
2943
|
]
|
|
2942
2944
|
|
|
2945
|
+
[[package]]
|
|
2946
|
+
name = "pytubefix"
|
|
2947
|
+
version = "9.1.1"
|
|
2948
|
+
source = { registry = "https://pypi.org/simple" }
|
|
2949
|
+
sdist = { url = "https://files.pythonhosted.org/packages/3c/06/8570fb8fc1296dae7f156e4de57c2b351856e7813873178e1cbb8045eef3/pytubefix-9.1.1.tar.gz", hash = "sha256:68946ab2192d7bb9d8fcc0fe73f634bb0ab0cd33f2c3c718e65c0c4fbdbccbb1", size = 734325 }
|
|
2950
|
+
wheels = [
|
|
2951
|
+
{ url = "https://files.pythonhosted.org/packages/5d/fd/80ba35c78cbd007bfdb71d83b64087cca10e671cae4eb77875c952a21734/pytubefix-9.1.1-py3-none-any.whl", hash = "sha256:cc1c9cca936b82fcbf136e4630639417072aa5fdacf54ec0426604ca81c33b77", size = 732005 },
|
|
2952
|
+
]
|
|
2953
|
+
|
|
2943
2954
|
[[package]]
|
|
2944
2955
|
name = "pytz"
|
|
2945
2956
|
version = "2025.2"
|
|
@@ -1,195 +0,0 @@
|
|
|
1
|
-
import asyncio
|
|
2
|
-
import re
|
|
3
|
-
import ssl
|
|
4
|
-
|
|
5
|
-
import aiohttp
|
|
6
|
-
from bs4 import BeautifulSoup
|
|
7
|
-
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
|
8
|
-
from youtube_transcript_api.formatters import TextFormatter # type: ignore
|
|
9
|
-
|
|
10
|
-
from content_core.common import ProcessSourceState
|
|
11
|
-
from content_core.common.exceptions import NoTranscriptFound
|
|
12
|
-
from content_core.config import CONFIG
|
|
13
|
-
from content_core.logging import logger
|
|
14
|
-
|
|
15
|
-
ssl._create_default_https_context = ssl._create_unverified_context
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
async def get_video_title(video_id):
|
|
19
|
-
try:
|
|
20
|
-
url = f"https://www.youtube.com/watch?v={video_id}"
|
|
21
|
-
async with aiohttp.ClientSession() as session:
|
|
22
|
-
async with session.get(url) as response:
|
|
23
|
-
html = await response.text()
|
|
24
|
-
|
|
25
|
-
# BeautifulSoup doesn't support async operations
|
|
26
|
-
soup = BeautifulSoup(html, "html.parser")
|
|
27
|
-
|
|
28
|
-
# YouTube stores title in a meta tag
|
|
29
|
-
title = soup.find("meta", property="og:title")["content"]
|
|
30
|
-
return title
|
|
31
|
-
|
|
32
|
-
except Exception as e:
|
|
33
|
-
logger.error(f"Failed to get video title: {e}")
|
|
34
|
-
return None
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
async def _extract_youtube_id(url):
|
|
38
|
-
"""
|
|
39
|
-
Extract the YouTube video ID from a given URL using regular expressions.
|
|
40
|
-
|
|
41
|
-
Args:
|
|
42
|
-
url (str): The YouTube URL from which to extract the video ID.
|
|
43
|
-
|
|
44
|
-
Returns:
|
|
45
|
-
str: The extracted YouTube video ID or None if no valid ID is found.
|
|
46
|
-
"""
|
|
47
|
-
# Define a regular expression pattern to capture the YouTube video ID
|
|
48
|
-
youtube_regex = (
|
|
49
|
-
r"(?:https?://)?" # Optional scheme
|
|
50
|
-
r"(?:www\.)?" # Optional www.
|
|
51
|
-
r"(?:"
|
|
52
|
-
r"youtu\.be/" # Shortened URL
|
|
53
|
-
r"|youtube\.com" # Main URL
|
|
54
|
-
r"(?:" # Group start
|
|
55
|
-
r"/embed/" # Embed URL
|
|
56
|
-
r"|/v/" # Older video URL
|
|
57
|
-
r"|/watch\?v=" # Standard watch URL
|
|
58
|
-
r"|/watch\?.+&v=" # Other watch URL
|
|
59
|
-
r")" # Group end
|
|
60
|
-
r")" # End main group
|
|
61
|
-
r"([\w-]{11})" # 11 characters (YouTube video ID)
|
|
62
|
-
)
|
|
63
|
-
|
|
64
|
-
# Search the URL for the pattern
|
|
65
|
-
match = re.search(youtube_regex, url)
|
|
66
|
-
|
|
67
|
-
# Return the video ID if a match is found
|
|
68
|
-
return match.group(1) if match else None
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
async def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
|
|
72
|
-
max_attempts = 5
|
|
73
|
-
for attempt in range(max_attempts):
|
|
74
|
-
try:
|
|
75
|
-
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
|
|
76
|
-
|
|
77
|
-
# First try: Manual transcripts in preferred languages
|
|
78
|
-
manual_transcripts = []
|
|
79
|
-
try:
|
|
80
|
-
for transcript in transcript_list:
|
|
81
|
-
if not transcript.is_generated and not transcript.is_translatable:
|
|
82
|
-
manual_transcripts.append(transcript)
|
|
83
|
-
|
|
84
|
-
if manual_transcripts:
|
|
85
|
-
# Sort based on preferred language order
|
|
86
|
-
for lang in preferred_langs:
|
|
87
|
-
for transcript in manual_transcripts:
|
|
88
|
-
if transcript.language_code == lang:
|
|
89
|
-
return transcript.fetch()
|
|
90
|
-
# If no preferred language found, return first manual transcript
|
|
91
|
-
return manual_transcripts[0].fetch()
|
|
92
|
-
except NoTranscriptFound:
|
|
93
|
-
pass
|
|
94
|
-
|
|
95
|
-
# Second try: Auto-generated transcripts in preferred languages
|
|
96
|
-
generated_transcripts = []
|
|
97
|
-
try:
|
|
98
|
-
for transcript in transcript_list:
|
|
99
|
-
if transcript.is_generated and not transcript.is_translatable:
|
|
100
|
-
generated_transcripts.append(transcript)
|
|
101
|
-
|
|
102
|
-
if generated_transcripts:
|
|
103
|
-
# Sort based on preferred language order
|
|
104
|
-
for lang in preferred_langs:
|
|
105
|
-
for transcript in generated_transcripts:
|
|
106
|
-
if transcript.language_code == lang:
|
|
107
|
-
return transcript.fetch()
|
|
108
|
-
# If no preferred language found, return first generated transcript
|
|
109
|
-
return generated_transcripts[0].fetch()
|
|
110
|
-
except NoTranscriptFound:
|
|
111
|
-
pass
|
|
112
|
-
|
|
113
|
-
# Last try: Translated transcripts in preferred languages
|
|
114
|
-
translated_transcripts = []
|
|
115
|
-
try:
|
|
116
|
-
for transcript in transcript_list:
|
|
117
|
-
if transcript.is_translatable:
|
|
118
|
-
translated_transcripts.append(transcript)
|
|
119
|
-
|
|
120
|
-
if translated_transcripts:
|
|
121
|
-
# Sort based on preferred language order
|
|
122
|
-
for lang in preferred_langs:
|
|
123
|
-
for transcript in translated_transcripts:
|
|
124
|
-
if transcript.language_code == lang:
|
|
125
|
-
return transcript.fetch()
|
|
126
|
-
# If no preferred language found, return translation to first preferred language
|
|
127
|
-
translation = translated_transcripts[0].translate(
|
|
128
|
-
preferred_langs[0]
|
|
129
|
-
)
|
|
130
|
-
return translation.fetch()
|
|
131
|
-
except NoTranscriptFound:
|
|
132
|
-
pass
|
|
133
|
-
|
|
134
|
-
raise Exception("No suitable transcript found")
|
|
135
|
-
|
|
136
|
-
except Exception as e:
|
|
137
|
-
if e.__class__.__name__ == "ParserError":
|
|
138
|
-
logger.warning(
|
|
139
|
-
f"ParserError on attempt {attempt+1}/5 for video {video_id}. Retrying..."
|
|
140
|
-
)
|
|
141
|
-
if attempt == max_attempts - 1:
|
|
142
|
-
logger.error(
|
|
143
|
-
f"Failed to get transcript for video {video_id} after {max_attempts} attempts due to repeated ParserError."
|
|
144
|
-
)
|
|
145
|
-
return None
|
|
146
|
-
await asyncio.sleep(2)
|
|
147
|
-
continue
|
|
148
|
-
else:
|
|
149
|
-
logger.error(f"Failed to get transcript for video {video_id}: {e}")
|
|
150
|
-
return None
|
|
151
|
-
return None
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
async def extract_youtube_transcript(state: ProcessSourceState):
|
|
155
|
-
"""
|
|
156
|
-
Parse the text file and print its content.
|
|
157
|
-
"""
|
|
158
|
-
|
|
159
|
-
assert state.url, "No URL provided"
|
|
160
|
-
logger.warning(f"Extracting transcript from URL: {state.url}")
|
|
161
|
-
languages = CONFIG.get("youtube_transcripts", {}).get(
|
|
162
|
-
"preferred_languages", ["en", "es", "pt"]
|
|
163
|
-
)
|
|
164
|
-
|
|
165
|
-
video_id = await _extract_youtube_id(state.url)
|
|
166
|
-
transcript = await get_best_transcript(video_id, languages)
|
|
167
|
-
|
|
168
|
-
logger.debug(f"Found transcript: {transcript}")
|
|
169
|
-
formatter = TextFormatter()
|
|
170
|
-
try:
|
|
171
|
-
title = await get_video_title(video_id)
|
|
172
|
-
except Exception as e:
|
|
173
|
-
logger.critical(f"Failed to get video title for video_id: {video_id}")
|
|
174
|
-
logger.exception(e)
|
|
175
|
-
title = ""
|
|
176
|
-
|
|
177
|
-
try:
|
|
178
|
-
formatted_content = formatter.format_transcript(transcript)
|
|
179
|
-
except Exception as e:
|
|
180
|
-
logger.critical(f"Failed to format transcript for video_id: {video_id}")
|
|
181
|
-
logger.exception(e)
|
|
182
|
-
formatted_content = ""
|
|
183
|
-
|
|
184
|
-
try:
|
|
185
|
-
transcript_raw = transcript.to_raw_data()
|
|
186
|
-
except Exception as e:
|
|
187
|
-
logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
|
|
188
|
-
logger.exception(e)
|
|
189
|
-
transcript_raw = ""
|
|
190
|
-
|
|
191
|
-
return {
|
|
192
|
-
"content": formatted_content,
|
|
193
|
-
"title": title,
|
|
194
|
-
"metadata": {"video_id": video_id, "transcript": transcript_raw},
|
|
195
|
-
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{content_core-1.0.2 → content_core-1.0.3}/src/content_core/content/identification/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|