content-core 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core/content/summary/core.py +1 -1
- content_core/processors/youtube.py +120 -96
- {content_core-1.0.2.dist-info → content_core-1.0.4.dist-info}/METADATA +2 -1
- {content_core-1.0.2.dist-info → content_core-1.0.4.dist-info}/RECORD +7 -7
- {content_core-1.0.2.dist-info → content_core-1.0.4.dist-info}/WHEEL +0 -0
- {content_core-1.0.2.dist-info → content_core-1.0.4.dist-info}/entry_points.txt +0 -0
- {content_core-1.0.2.dist-info → content_core-1.0.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -8,7 +8,7 @@ async def summarize(content: str, context: str) -> str:
|
|
|
8
8
|
templated_message_fn = partial(templated_message, model=ModelFactory.get_model('summary_model'))
|
|
9
9
|
response = await templated_message_fn(
|
|
10
10
|
TemplatedMessageInput(
|
|
11
|
-
user_prompt_template="content/summarize",
|
|
11
|
+
user_prompt_template="prompts/content/summarize",
|
|
12
12
|
data={"content": content, "context": context},
|
|
13
13
|
)
|
|
14
14
|
)
|
|
@@ -1,16 +1,14 @@
|
|
|
1
|
-
import asyncio
|
|
2
1
|
import re
|
|
3
2
|
import ssl
|
|
4
3
|
|
|
5
4
|
import aiohttp
|
|
6
5
|
from bs4 import BeautifulSoup
|
|
7
|
-
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
|
8
|
-
from youtube_transcript_api.formatters import TextFormatter # type: ignore
|
|
9
|
-
|
|
10
6
|
from content_core.common import ProcessSourceState
|
|
11
7
|
from content_core.common.exceptions import NoTranscriptFound
|
|
12
8
|
from content_core.config import CONFIG
|
|
13
9
|
from content_core.logging import logger
|
|
10
|
+
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
|
11
|
+
from youtube_transcript_api.formatters import TextFormatter # type: ignore
|
|
14
12
|
|
|
15
13
|
ssl._create_default_https_context = ssl._create_unverified_context
|
|
16
14
|
|
|
@@ -69,86 +67,103 @@ async def _extract_youtube_id(url):
|
|
|
69
67
|
|
|
70
68
|
|
|
71
69
|
async def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
|
|
72
|
-
|
|
73
|
-
|
|
70
|
+
try:
|
|
71
|
+
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
|
|
72
|
+
|
|
73
|
+
# First try: Manual transcripts in preferred languages
|
|
74
|
+
manual_transcripts = []
|
|
74
75
|
try:
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
preferred_langs[0]
|
|
129
|
-
)
|
|
130
|
-
return translation.fetch()
|
|
131
|
-
except NoTranscriptFound:
|
|
132
|
-
pass
|
|
133
|
-
|
|
134
|
-
raise Exception("No suitable transcript found")
|
|
76
|
+
for transcript in transcript_list:
|
|
77
|
+
if not transcript.is_generated and not transcript.is_translatable:
|
|
78
|
+
manual_transcripts.append(transcript)
|
|
79
|
+
|
|
80
|
+
if manual_transcripts:
|
|
81
|
+
# Sort based on preferred language order
|
|
82
|
+
for lang in preferred_langs:
|
|
83
|
+
for transcript in manual_transcripts:
|
|
84
|
+
if transcript.language_code == lang:
|
|
85
|
+
return transcript.fetch()
|
|
86
|
+
# If no preferred language found, return first manual transcript
|
|
87
|
+
return manual_transcripts[0].fetch()
|
|
88
|
+
except NoTranscriptFound:
|
|
89
|
+
pass
|
|
90
|
+
|
|
91
|
+
# Second try: Auto-generated transcripts in preferred languages
|
|
92
|
+
generated_transcripts = []
|
|
93
|
+
try:
|
|
94
|
+
for transcript in transcript_list:
|
|
95
|
+
if transcript.is_generated and not transcript.is_translatable:
|
|
96
|
+
generated_transcripts.append(transcript)
|
|
97
|
+
|
|
98
|
+
if generated_transcripts:
|
|
99
|
+
# Sort based on preferred language order
|
|
100
|
+
for lang in preferred_langs:
|
|
101
|
+
for transcript in generated_transcripts:
|
|
102
|
+
if transcript.language_code == lang:
|
|
103
|
+
return transcript.fetch()
|
|
104
|
+
# If no preferred language found, return first generated transcript
|
|
105
|
+
return generated_transcripts[0].fetch()
|
|
106
|
+
except NoTranscriptFound:
|
|
107
|
+
pass
|
|
108
|
+
|
|
109
|
+
# Last try: Translated transcripts in preferred languages
|
|
110
|
+
translated_transcripts = []
|
|
111
|
+
try:
|
|
112
|
+
for transcript in transcript_list:
|
|
113
|
+
if transcript.is_translatable:
|
|
114
|
+
translated_transcripts.append(transcript)
|
|
115
|
+
|
|
116
|
+
if translated_transcripts:
|
|
117
|
+
# Sort based on preferred language order
|
|
118
|
+
for lang in preferred_langs:
|
|
119
|
+
for transcript in translated_transcripts:
|
|
120
|
+
if transcript.language_code == lang:
|
|
121
|
+
return transcript.fetch()
|
|
122
|
+
# If no preferred language found, return translation to first preferred language
|
|
123
|
+
translation = translated_transcripts[0].translate(preferred_langs[0])
|
|
124
|
+
return translation.fetch()
|
|
125
|
+
except NoTranscriptFound:
|
|
126
|
+
pass
|
|
127
|
+
|
|
128
|
+
raise Exception("No suitable transcript found")
|
|
135
129
|
|
|
130
|
+
except Exception as e:
|
|
131
|
+
logger.error(f"Failed to get transcript for video {video_id}: {e}")
|
|
132
|
+
return None
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def extract_transcript_pytubefix(url, languages=["en", "es", "pt"]):
|
|
136
|
+
from pytubefix import YouTube
|
|
137
|
+
|
|
138
|
+
yt = YouTube(url)
|
|
139
|
+
logger.debug(f"Captions: {yt.captions}")
|
|
140
|
+
|
|
141
|
+
# Try to get captions in the preferred languages
|
|
142
|
+
if yt.captions:
|
|
143
|
+
for lang in languages:
|
|
144
|
+
if lang in yt.captions:
|
|
145
|
+
caption = yt.captions[lang]
|
|
146
|
+
break
|
|
147
|
+
elif f"a.{lang}" in yt.captions:
|
|
148
|
+
caption = yt.captions[f"a.{lang}"]
|
|
149
|
+
break
|
|
150
|
+
else: # No preferred language found, use the first available
|
|
151
|
+
caption_key = list(yt.captions.keys())[0]
|
|
152
|
+
caption = yt.captions[caption_key.code]
|
|
153
|
+
try:
|
|
154
|
+
srt_captions = caption.generate_srt_captions()
|
|
155
|
+
txt_captions = caption.generate_txt_captions()
|
|
156
|
+
return txt_captions, srt_captions
|
|
157
|
+
except KeyError as e:
|
|
158
|
+
logger.error(f"KeyError while generating captions for {caption}: {e}")
|
|
159
|
+
return None, None
|
|
136
160
|
except Exception as e:
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
f"Failed to get transcript for video {video_id} after {max_attempts} attempts due to repeated ParserError."
|
|
144
|
-
)
|
|
145
|
-
return None
|
|
146
|
-
await asyncio.sleep(2)
|
|
147
|
-
continue
|
|
148
|
-
else:
|
|
149
|
-
logger.error(f"Failed to get transcript for video {video_id}: {e}")
|
|
150
|
-
return None
|
|
151
|
-
return None
|
|
161
|
+
logger.error(
|
|
162
|
+
f"Unexpected error while generating captions for {caption}: {e}"
|
|
163
|
+
)
|
|
164
|
+
return None, None
|
|
165
|
+
|
|
166
|
+
return None, None
|
|
152
167
|
|
|
153
168
|
|
|
154
169
|
async def extract_youtube_transcript(state: ProcessSourceState):
|
|
@@ -162,11 +177,10 @@ async def extract_youtube_transcript(state: ProcessSourceState):
|
|
|
162
177
|
"preferred_languages", ["en", "es", "pt"]
|
|
163
178
|
)
|
|
164
179
|
|
|
180
|
+
# quick fix since transcripts api is not working for now
|
|
181
|
+
engine = "pytubefix"
|
|
165
182
|
video_id = await _extract_youtube_id(state.url)
|
|
166
|
-
transcript = await get_best_transcript(video_id, languages)
|
|
167
183
|
|
|
168
|
-
logger.debug(f"Found transcript: {transcript}")
|
|
169
|
-
formatter = TextFormatter()
|
|
170
184
|
try:
|
|
171
185
|
title = await get_video_title(video_id)
|
|
172
186
|
except Exception as e:
|
|
@@ -174,19 +188,29 @@ async def extract_youtube_transcript(state: ProcessSourceState):
|
|
|
174
188
|
logger.exception(e)
|
|
175
189
|
title = ""
|
|
176
190
|
|
|
177
|
-
|
|
178
|
-
formatted_content =
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
191
|
+
if engine == "pytubefix":
|
|
192
|
+
formatted_content, transcript_raw = extract_transcript_pytubefix(
|
|
193
|
+
state.url, languages
|
|
194
|
+
)
|
|
195
|
+
if engine == "transcripts-api":
|
|
196
|
+
transcript = await get_best_transcript(video_id, languages)
|
|
183
197
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
198
|
+
logger.debug(f"Found transcript: {transcript}")
|
|
199
|
+
formatter = TextFormatter()
|
|
200
|
+
|
|
201
|
+
try:
|
|
202
|
+
formatted_content = formatter.format_transcript(transcript)
|
|
203
|
+
except Exception as e:
|
|
204
|
+
logger.critical(f"Failed to format transcript for video_id: {video_id}")
|
|
205
|
+
logger.exception(e)
|
|
206
|
+
formatted_content = ""
|
|
207
|
+
|
|
208
|
+
try:
|
|
209
|
+
transcript_raw = transcript.to_raw_data()
|
|
210
|
+
except Exception as e:
|
|
211
|
+
logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
|
|
212
|
+
logger.exception(e)
|
|
213
|
+
transcript_raw = ""
|
|
190
214
|
|
|
191
215
|
return {
|
|
192
216
|
"content": formatted_content,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.0.4
|
|
4
4
|
Summary: Extract what matters from any media source
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -27,6 +27,7 @@ Requires-Dist: python-dotenv>=1.1.0
|
|
|
27
27
|
Requires-Dist: python-magic-bin==0.4.14; sys_platform == 'win32'
|
|
28
28
|
Requires-Dist: python-magic>=0.4.27
|
|
29
29
|
Requires-Dist: python-pptx>=1.0.2
|
|
30
|
+
Requires-Dist: pytubefix>=9.1.1
|
|
30
31
|
Requires-Dist: readability-lxml>=0.8.4.1
|
|
31
32
|
Requires-Dist: validators>=0.34.0
|
|
32
33
|
Requires-Dist: youtube-transcript-api>=1.0.3
|
|
@@ -18,7 +18,7 @@ content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU
|
|
|
18
18
|
content_core/content/extraction/graph.py,sha256=Nn2iaQc6YJ4Qt8WKTolwUQUNNqUlwpV8YnijESGvnD0,7605
|
|
19
19
|
content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjlozg-zGbMq_s9VYdBjzYU,169
|
|
20
20
|
content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
|
|
21
|
-
content_core/content/summary/core.py,sha256=
|
|
21
|
+
content_core/content/summary/core.py,sha256=kEabpETljzUb-yf0NcVWTOuCtayESo74gGBVDX7YTFs,550
|
|
22
22
|
content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
|
|
23
23
|
content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
|
|
24
24
|
content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMguabM9Pc,2151
|
|
@@ -27,13 +27,13 @@ content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4af
|
|
|
27
27
|
content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
|
|
28
28
|
content_core/processors/url.py,sha256=6WT8Sw2VHiKyhgWXi_jZjKjwnT_QPSPcH4P99RKbjgU,7521
|
|
29
29
|
content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
|
|
30
|
-
content_core/processors/youtube.py,sha256=
|
|
30
|
+
content_core/processors/youtube.py,sha256=MOeZboVfM9_C87L5mnUVvsbQeKoznwJoYn1wP1_hA_U,7869
|
|
31
31
|
content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
|
|
32
32
|
content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
|
|
33
33
|
content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
|
|
34
34
|
content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
|
|
35
|
-
content_core-1.0.
|
|
36
|
-
content_core-1.0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
37
|
-
content_core-1.0.2.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
|
|
38
|
-
content_core-1.0.2.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
|
|
39
|
-
content_core-1.0.2.dist-info/RECORD,,
|
|
35
|
+
content_core-1.0.4.dist-info/METADATA,sha256=SdXexgOV0tc4ArCYWjxrZog4esHJxW0zh8pdnZFqLi8,11908
|
|
36
|
+
content_core-1.0.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
37
|
+
content_core-1.0.4.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
|
|
38
|
+
content_core-1.0.4.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
|
|
39
|
+
content_core-1.0.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|