content-core 1.0.2__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

@@ -8,7 +8,7 @@ async def summarize(content: str, context: str) -> str:
8
8
  templated_message_fn = partial(templated_message, model=ModelFactory.get_model('summary_model'))
9
9
  response = await templated_message_fn(
10
10
  TemplatedMessageInput(
11
- user_prompt_template="content/summarize",
11
+ user_prompt_template="prompts/content/summarize",
12
12
  data={"content": content, "context": context},
13
13
  )
14
14
  )
@@ -1,16 +1,14 @@
1
- import asyncio
2
1
  import re
3
2
  import ssl
4
3
 
5
4
  import aiohttp
6
5
  from bs4 import BeautifulSoup
7
- from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
8
- from youtube_transcript_api.formatters import TextFormatter # type: ignore
9
-
10
6
  from content_core.common import ProcessSourceState
11
7
  from content_core.common.exceptions import NoTranscriptFound
12
8
  from content_core.config import CONFIG
13
9
  from content_core.logging import logger
10
+ from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
11
+ from youtube_transcript_api.formatters import TextFormatter # type: ignore
14
12
 
15
13
  ssl._create_default_https_context = ssl._create_unverified_context
16
14
 
@@ -69,86 +67,103 @@ async def _extract_youtube_id(url):
69
67
 
70
68
 
71
69
  async def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
72
- max_attempts = 5
73
- for attempt in range(max_attempts):
70
+ try:
71
+ transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
72
+
73
+ # First try: Manual transcripts in preferred languages
74
+ manual_transcripts = []
74
75
  try:
75
- transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
76
-
77
- # First try: Manual transcripts in preferred languages
78
- manual_transcripts = []
79
- try:
80
- for transcript in transcript_list:
81
- if not transcript.is_generated and not transcript.is_translatable:
82
- manual_transcripts.append(transcript)
83
-
84
- if manual_transcripts:
85
- # Sort based on preferred language order
86
- for lang in preferred_langs:
87
- for transcript in manual_transcripts:
88
- if transcript.language_code == lang:
89
- return transcript.fetch()
90
- # If no preferred language found, return first manual transcript
91
- return manual_transcripts[0].fetch()
92
- except NoTranscriptFound:
93
- pass
94
-
95
- # Second try: Auto-generated transcripts in preferred languages
96
- generated_transcripts = []
97
- try:
98
- for transcript in transcript_list:
99
- if transcript.is_generated and not transcript.is_translatable:
100
- generated_transcripts.append(transcript)
101
-
102
- if generated_transcripts:
103
- # Sort based on preferred language order
104
- for lang in preferred_langs:
105
- for transcript in generated_transcripts:
106
- if transcript.language_code == lang:
107
- return transcript.fetch()
108
- # If no preferred language found, return first generated transcript
109
- return generated_transcripts[0].fetch()
110
- except NoTranscriptFound:
111
- pass
112
-
113
- # Last try: Translated transcripts in preferred languages
114
- translated_transcripts = []
115
- try:
116
- for transcript in transcript_list:
117
- if transcript.is_translatable:
118
- translated_transcripts.append(transcript)
119
-
120
- if translated_transcripts:
121
- # Sort based on preferred language order
122
- for lang in preferred_langs:
123
- for transcript in translated_transcripts:
124
- if transcript.language_code == lang:
125
- return transcript.fetch()
126
- # If no preferred language found, return translation to first preferred language
127
- translation = translated_transcripts[0].translate(
128
- preferred_langs[0]
129
- )
130
- return translation.fetch()
131
- except NoTranscriptFound:
132
- pass
133
-
134
- raise Exception("No suitable transcript found")
76
+ for transcript in transcript_list:
77
+ if not transcript.is_generated and not transcript.is_translatable:
78
+ manual_transcripts.append(transcript)
79
+
80
+ if manual_transcripts:
81
+ # Sort based on preferred language order
82
+ for lang in preferred_langs:
83
+ for transcript in manual_transcripts:
84
+ if transcript.language_code == lang:
85
+ return transcript.fetch()
86
+ # If no preferred language found, return first manual transcript
87
+ return manual_transcripts[0].fetch()
88
+ except NoTranscriptFound:
89
+ pass
90
+
91
+ # Second try: Auto-generated transcripts in preferred languages
92
+ generated_transcripts = []
93
+ try:
94
+ for transcript in transcript_list:
95
+ if transcript.is_generated and not transcript.is_translatable:
96
+ generated_transcripts.append(transcript)
97
+
98
+ if generated_transcripts:
99
+ # Sort based on preferred language order
100
+ for lang in preferred_langs:
101
+ for transcript in generated_transcripts:
102
+ if transcript.language_code == lang:
103
+ return transcript.fetch()
104
+ # If no preferred language found, return first generated transcript
105
+ return generated_transcripts[0].fetch()
106
+ except NoTranscriptFound:
107
+ pass
108
+
109
+ # Last try: Translated transcripts in preferred languages
110
+ translated_transcripts = []
111
+ try:
112
+ for transcript in transcript_list:
113
+ if transcript.is_translatable:
114
+ translated_transcripts.append(transcript)
115
+
116
+ if translated_transcripts:
117
+ # Sort based on preferred language order
118
+ for lang in preferred_langs:
119
+ for transcript in translated_transcripts:
120
+ if transcript.language_code == lang:
121
+ return transcript.fetch()
122
+ # If no preferred language found, return translation to first preferred language
123
+ translation = translated_transcripts[0].translate(preferred_langs[0])
124
+ return translation.fetch()
125
+ except NoTranscriptFound:
126
+ pass
127
+
128
+ raise Exception("No suitable transcript found")
135
129
 
130
+ except Exception as e:
131
+ logger.error(f"Failed to get transcript for video {video_id}: {e}")
132
+ return None
133
+
134
+
135
+ def extract_transcript_pytubefix(url, languages=["en", "es", "pt"]):
136
+ from pytubefix import YouTube
137
+
138
+ yt = YouTube(url)
139
+ logger.debug(f"Captions: {yt.captions}")
140
+
141
+ # Try to get captions in the preferred languages
142
+ if yt.captions:
143
+ for lang in languages:
144
+ if lang in yt.captions:
145
+ caption = yt.captions[lang]
146
+ break
147
+ elif f"a.{lang}" in yt.captions:
148
+ caption = yt.captions[f"a.{lang}"]
149
+ break
150
+ else: # No preferred language found, use the first available
151
+ caption_key = list(yt.captions.keys())[0]
152
+ caption = yt.captions[caption_key.code]
153
+ try:
154
+ srt_captions = caption.generate_srt_captions()
155
+ txt_captions = caption.generate_txt_captions()
156
+ return txt_captions, srt_captions
157
+ except KeyError as e:
158
+ logger.error(f"KeyError while generating captions for {caption}: {e}")
159
+ return None, None
136
160
  except Exception as e:
137
- if e.__class__.__name__ == "ParserError":
138
- logger.warning(
139
- f"ParserError on attempt {attempt+1}/5 for video {video_id}. Retrying..."
140
- )
141
- if attempt == max_attempts - 1:
142
- logger.error(
143
- f"Failed to get transcript for video {video_id} after {max_attempts} attempts due to repeated ParserError."
144
- )
145
- return None
146
- await asyncio.sleep(2)
147
- continue
148
- else:
149
- logger.error(f"Failed to get transcript for video {video_id}: {e}")
150
- return None
151
- return None
161
+ logger.error(
162
+ f"Unexpected error while generating captions for {caption}: {e}"
163
+ )
164
+ return None, None
165
+
166
+ return None, None
152
167
 
153
168
 
154
169
  async def extract_youtube_transcript(state: ProcessSourceState):
@@ -162,11 +177,10 @@ async def extract_youtube_transcript(state: ProcessSourceState):
162
177
  "preferred_languages", ["en", "es", "pt"]
163
178
  )
164
179
 
180
+ # quick fix since transcripts api is not working for now
181
+ engine = "pytubefix"
165
182
  video_id = await _extract_youtube_id(state.url)
166
- transcript = await get_best_transcript(video_id, languages)
167
183
 
168
- logger.debug(f"Found transcript: {transcript}")
169
- formatter = TextFormatter()
170
184
  try:
171
185
  title = await get_video_title(video_id)
172
186
  except Exception as e:
@@ -174,19 +188,29 @@ async def extract_youtube_transcript(state: ProcessSourceState):
174
188
  logger.exception(e)
175
189
  title = ""
176
190
 
177
- try:
178
- formatted_content = formatter.format_transcript(transcript)
179
- except Exception as e:
180
- logger.critical(f"Failed to format transcript for video_id: {video_id}")
181
- logger.exception(e)
182
- formatted_content = ""
191
+ if engine == "pytubefix":
192
+ formatted_content, transcript_raw = extract_transcript_pytubefix(
193
+ state.url, languages
194
+ )
195
+ if engine == "transcripts-api":
196
+ transcript = await get_best_transcript(video_id, languages)
183
197
 
184
- try:
185
- transcript_raw = transcript.to_raw_data()
186
- except Exception as e:
187
- logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
188
- logger.exception(e)
189
- transcript_raw = ""
198
+ logger.debug(f"Found transcript: {transcript}")
199
+ formatter = TextFormatter()
200
+
201
+ try:
202
+ formatted_content = formatter.format_transcript(transcript)
203
+ except Exception as e:
204
+ logger.critical(f"Failed to format transcript for video_id: {video_id}")
205
+ logger.exception(e)
206
+ formatted_content = ""
207
+
208
+ try:
209
+ transcript_raw = transcript.to_raw_data()
210
+ except Exception as e:
211
+ logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
212
+ logger.exception(e)
213
+ transcript_raw = ""
190
214
 
191
215
  return {
192
216
  "content": formatted_content,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.0.2
3
+ Version: 1.0.4
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -27,6 +27,7 @@ Requires-Dist: python-dotenv>=1.1.0
27
27
  Requires-Dist: python-magic-bin==0.4.14; sys_platform == 'win32'
28
28
  Requires-Dist: python-magic>=0.4.27
29
29
  Requires-Dist: python-pptx>=1.0.2
30
+ Requires-Dist: pytubefix>=9.1.1
30
31
  Requires-Dist: readability-lxml>=0.8.4.1
31
32
  Requires-Dist: validators>=0.34.0
32
33
  Requires-Dist: youtube-transcript-api>=1.0.3
@@ -18,7 +18,7 @@ content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU
18
18
  content_core/content/extraction/graph.py,sha256=Nn2iaQc6YJ4Qt8WKTolwUQUNNqUlwpV8YnijESGvnD0,7605
19
19
  content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjlozg-zGbMq_s9VYdBjzYU,169
20
20
  content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
21
- content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
21
+ content_core/content/summary/core.py,sha256=kEabpETljzUb-yf0NcVWTOuCtayESo74gGBVDX7YTFs,550
22
22
  content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
23
23
  content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
24
24
  content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMguabM9Pc,2151
@@ -27,13 +27,13 @@ content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4af
27
27
  content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
28
28
  content_core/processors/url.py,sha256=6WT8Sw2VHiKyhgWXi_jZjKjwnT_QPSPcH4P99RKbjgU,7521
29
29
  content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
30
- content_core/processors/youtube.py,sha256=pLyNy6ebz80T0e-XnBnj36o22Gm4s_jfjkGCNtKz0so,7254
30
+ content_core/processors/youtube.py,sha256=MOeZboVfM9_C87L5mnUVvsbQeKoznwJoYn1wP1_hA_U,7869
31
31
  content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
32
32
  content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
33
33
  content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
34
34
  content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
35
- content_core-1.0.2.dist-info/METADATA,sha256=Pq6mo1Atip1YKA3mB6q-eMrbkPgkodV1K_TVGtOsNiA,11876
36
- content_core-1.0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
37
- content_core-1.0.2.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
38
- content_core-1.0.2.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
39
- content_core-1.0.2.dist-info/RECORD,,
35
+ content_core-1.0.4.dist-info/METADATA,sha256=SdXexgOV0tc4ArCYWjxrZog4esHJxW0zh8pdnZFqLi8,11908
36
+ content_core-1.0.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
37
+ content_core-1.0.4.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
38
+ content_core-1.0.4.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
39
+ content_core-1.0.4.dist-info/RECORD,,