content-core 1.10.0 (content_core-1.10.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- content_core/__init__.py +216 -0
- content_core/cc_config.yaml +86 -0
- content_core/common/__init__.py +38 -0
- content_core/common/exceptions.py +70 -0
- content_core/common/retry.py +325 -0
- content_core/common/state.py +64 -0
- content_core/common/types.py +15 -0
- content_core/common/utils.py +31 -0
- content_core/config.py +575 -0
- content_core/content/__init__.py +6 -0
- content_core/content/cleanup/__init__.py +5 -0
- content_core/content/cleanup/core.py +15 -0
- content_core/content/extraction/__init__.py +13 -0
- content_core/content/extraction/graph.py +252 -0
- content_core/content/identification/__init__.py +9 -0
- content_core/content/identification/file_detector.py +505 -0
- content_core/content/summary/__init__.py +5 -0
- content_core/content/summary/core.py +15 -0
- content_core/logging.py +15 -0
- content_core/mcp/__init__.py +5 -0
- content_core/mcp/server.py +214 -0
- content_core/models.py +60 -0
- content_core/models_config.yaml +31 -0
- content_core/notebooks/run.ipynb +359 -0
- content_core/notebooks/urls.ipynb +154 -0
- content_core/processors/audio.py +272 -0
- content_core/processors/docling.py +79 -0
- content_core/processors/office.py +331 -0
- content_core/processors/pdf.py +292 -0
- content_core/processors/text.py +36 -0
- content_core/processors/url.py +324 -0
- content_core/processors/video.py +166 -0
- content_core/processors/youtube.py +262 -0
- content_core/py.typed +2 -0
- content_core/templated_message.py +70 -0
- content_core/tools/__init__.py +9 -0
- content_core/tools/cleanup.py +15 -0
- content_core/tools/extract.py +21 -0
- content_core/tools/summarize.py +17 -0
- content_core-1.10.0.dist-info/METADATA +742 -0
- content_core-1.10.0.dist-info/RECORD +44 -0
- content_core-1.10.0.dist-info/WHEEL +4 -0
- content_core-1.10.0.dist-info/entry_points.txt +5 -0
- content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
content_core/processors/youtube.py
ADDED

@@ -0,0 +1,262 @@
import re
import ssl

import aiohttp
from bs4 import BeautifulSoup
from youtube_transcript_api import YouTubeTranscriptApi  # type: ignore
from youtube_transcript_api.formatters import TextFormatter  # type: ignore

from content_core.common import ProcessSourceState
from content_core.common.exceptions import NoTranscriptFound
from content_core.common.retry import retry_youtube
from content_core.config import CONFIG, get_proxy, _redact_proxy_url
from content_core.logging import logger

ssl._create_default_https_context = ssl._create_unverified_context

@retry_youtube()
async def _fetch_video_title(video_id, proxy: str | None = None):
    """Internal function that fetches video title - wrapped with retry logic."""
    url = f"https://www.youtube.com/watch?v={video_id}"
    resolved_proxy = get_proxy(proxy)
    async with aiohttp.ClientSession() as session:
        async with session.get(url, proxy=resolved_proxy) as response:
            html = await response.text()

    # BeautifulSoup doesn't support async operations
    soup = BeautifulSoup(html, "html.parser")

    # YouTube stores title in a meta tag
    title = soup.find("meta", property="og:title")["content"]
    return title


async def get_video_title(video_id, proxy: str | None = None):
    """Get video title from YouTube, with retry logic for transient failures."""
    try:
        return await _fetch_video_title(video_id, proxy)
    except Exception as e:
        logger.error(f"Failed to get video title after retries: {e}")
        return None

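A minimal usage sketch for the title scraper above (the video ID is illustrative; `get_video_title` returns None once `retry_youtube` exhausts its attempts):

```python
import asyncio

from content_core.processors.youtube import get_video_title

async def main():
    # Fetches the watch page over aiohttp and reads the og:title meta tag.
    title = await get_video_title("dQw4w9WgXcQ")
    print(title or "title unavailable")

asyncio.run(main())
```
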
async def _extract_youtube_id(url):
    """
    Extract the YouTube video ID from a given URL using regular expressions.

    Args:
        url (str): The YouTube URL from which to extract the video ID.

    Returns:
        str: The extracted YouTube video ID or None if no valid ID is found.
    """
    # Define a regular expression pattern to capture the YouTube video ID
    youtube_regex = (
        r"(?:https?://)?"  # Optional scheme
        r"(?:www\.)?"  # Optional www.
        r"(?:"
        r"youtu\.be/"  # Shortened URL
        r"|youtube\.com"  # Main URL
        r"(?:"  # Group start
        r"/embed/"  # Embed URL
        r"|/v/"  # Older video URL
        r"|/watch\?v="  # Standard watch URL
        r"|/watch\?.+&v="  # Other watch URL
        r")"  # Group end
        r")"  # End main group
        r"([\w-]{11})"  # 11 characters (YouTube video ID)
    )

    # Search the URL for the pattern
    match = re.search(youtube_regex, url)

    # Return the video ID if a match is found
    return match.group(1) if match else None

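For illustration, a quick check of the ID extraction (the URLs are examples; all three forms match the regex above):

```python
import asyncio

from content_core.processors.youtube import _extract_youtube_id

async def main():
    for u in (
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # standard watch URL
        "https://youtu.be/dQw4w9WgXcQ",                 # shortened URL
        "https://www.youtube.com/embed/dQw4w9WgXcQ",    # embed URL
    ):
        print(await _extract_youtube_id(u))  # prints the same 11-char ID each time

asyncio.run(main())
```
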
@retry_youtube()
async def _fetch_best_transcript(
    video_id, preferred_langs=["en", "es", "pt"], proxy: str | None = None
):
    """Internal function that fetches transcript - wrapped with retry logic."""
    resolved_proxy = get_proxy(proxy)
    proxies = None
    if resolved_proxy:
        proxies = {"http": resolved_proxy, "https": resolved_proxy}
        logger.debug(f"YouTubeTranscriptApi using proxy: {_redact_proxy_url(resolved_proxy)}")

    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies)

    # First try: Manual transcripts in preferred languages
    manual_transcripts = []
    try:
        for transcript in transcript_list:
            if not transcript.is_generated and not transcript.is_translatable:
                manual_transcripts.append(transcript)

        if manual_transcripts:
            # Sort based on preferred language order
            for lang in preferred_langs:
                for transcript in manual_transcripts:
                    if transcript.language_code == lang:
                        return transcript.fetch()
            # If no preferred language found, return first manual transcript
            return manual_transcripts[0].fetch()
    except NoTranscriptFound:
        pass

    # Second try: Auto-generated transcripts in preferred languages
    generated_transcripts = []
    try:
        for transcript in transcript_list:
            if transcript.is_generated and not transcript.is_translatable:
                generated_transcripts.append(transcript)

        if generated_transcripts:
            # Sort based on preferred language order
            for lang in preferred_langs:
                for transcript in generated_transcripts:
                    if transcript.language_code == lang:
                        return transcript.fetch()
            # If no preferred language found, return first generated transcript
            return generated_transcripts[0].fetch()
    except NoTranscriptFound:
        pass

    # Last try: Translated transcripts in preferred languages
    translated_transcripts = []
    try:
        for transcript in transcript_list:
            if transcript.is_translatable:
                translated_transcripts.append(transcript)

        if translated_transcripts:
            # Sort based on preferred language order
            for lang in preferred_langs:
                for transcript in translated_transcripts:
                    if transcript.language_code == lang:
                        return transcript.fetch()
            # If no preferred language found, return translation to first preferred language
            translation = translated_transcripts[0].translate(preferred_langs[0])
            return translation.fetch()
    except NoTranscriptFound:
        pass

    raise NoTranscriptFound("No suitable transcript found for this video")

async def get_best_transcript(
    video_id, preferred_langs=["en", "es", "pt"], proxy: str | None = None
):
    """Get best available transcript with retry logic for transient failures."""
    try:
        return await _fetch_best_transcript(video_id, preferred_langs, proxy)
    except Exception as e:
        logger.error(
            f"Failed to get transcript for video {video_id} after retries: {e}"
        )
        return None

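As a sketch, the tiered fallback can be exercised like this (video ID illustrative; the return value is whatever `fetch()` yields, a snippet sequence in recent youtube-transcript-api versions):

```python
import asyncio

from content_core.processors.youtube import get_best_transcript

async def main():
    # Manual captions win over auto-generated ones, which win over translations;
    # within each tier the preferred_langs order decides.
    transcript = await get_best_transcript("dQw4w9WgXcQ", preferred_langs=["pt", "en"])
    if transcript is None:
        print("no transcript after retries")
    else:
        print(f"{len(transcript)} snippets")

asyncio.run(main())
```
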
@retry_youtube()
def _fetch_transcript_pytubefix(
    url, languages=["en", "es", "pt"], proxy: str | None = None
):
    """Internal function that fetches transcript via pytubefix - wrapped with retry logic."""
    from pytubefix import YouTube

    resolved_proxy = get_proxy(proxy)
    proxies = None
    if resolved_proxy:
        proxies = {"http": resolved_proxy, "https": resolved_proxy}
        logger.debug(f"pytubefix using proxy: {_redact_proxy_url(resolved_proxy)}")

    yt = YouTube(url, proxies=proxies)
    logger.debug(f"Captions: {yt.captions}")

    # Try to get captions in the preferred languages
    if yt.captions:
        for lang in languages:
            if lang in yt.captions:
                caption = yt.captions[lang]
                break
            elif f"a.{lang}" in yt.captions:
                caption = yt.captions[f"a.{lang}"]
                break
        else:  # No preferred language found, use the first available
            caption_key = list(yt.captions.keys())[0]
            caption = yt.captions[caption_key.code]

        srt_captions = caption.generate_srt_captions()
        txt_captions = caption.generate_txt_captions()
        return txt_captions, srt_captions

    return None, None

def extract_transcript_pytubefix(
    url, languages=["en", "es", "pt"], proxy: str | None = None
):
    """Extract transcript via pytubefix with retry logic for transient failures."""
    try:
        return _fetch_transcript_pytubefix(url, languages, proxy)
    except Exception as e:
        logger.error(f"Failed to extract transcript via pytubefix after retries: {e}")
        return None, None

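A usage sketch for the pytubefix path (URL illustrative); note the `a.<lang>` keys, which is how pytubefix labels auto-generated tracks:

```python
from content_core.processors.youtube import extract_transcript_pytubefix

# Returns (plain_text, srt) captions, or (None, None) when nothing is available.
txt, srt = extract_transcript_pytubefix(
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ", languages=["en"]
)
if txt:
    print(txt[:200])
```
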
async def extract_youtube_transcript(state: ProcessSourceState):
    """
    Extract the transcript and title for the YouTube video referenced by state.url.

    Proxy configuration is passed through state.proxy and resolved using get_proxy().
    """

    assert state.url, "No URL provided"
    logger.debug(f"Extracting transcript from URL: {state.url}")
    languages = CONFIG.get("youtube_transcripts", {}).get(
        "preferred_languages", ["en", "es", "pt"]
    )
    proxy = state.proxy

    # quick fix since transcripts api is not working for now
    engine = "pytubefix"
    video_id = await _extract_youtube_id(state.url)

    try:
        title = await get_video_title(video_id, proxy)
    except Exception as e:
        logger.critical(f"Failed to get video title for video_id: {video_id}")
        logger.exception(e)
        title = ""

    if engine == "pytubefix":
        formatted_content, transcript_raw = extract_transcript_pytubefix(
            state.url, languages, proxy
        )
    if engine == "transcripts-api":
        transcript = await get_best_transcript(video_id, languages, proxy)

        logger.debug(f"Found transcript: {transcript}")
        formatter = TextFormatter()

        try:
            formatted_content = formatter.format_transcript(transcript)
        except Exception as e:
            logger.critical(f"Failed to format transcript for video_id: {video_id}")
            logger.exception(e)
            formatted_content = ""

        try:
            transcript_raw = transcript.to_raw_data()
        except Exception as e:
            logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
            logger.exception(e)
            transcript_raw = ""

    return {
        "content": formatted_content,
        "title": title,
        "metadata": {"video_id": video_id, "transcript": transcript_raw},
    }
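End-to-end, the extractor is driven through a ProcessSourceState (a sketch, assuming the state model accepts url as a field, as the assert above implies):

```python
import asyncio

from content_core.common import ProcessSourceState
from content_core.processors.youtube import extract_youtube_transcript

async def main():
    state = ProcessSourceState(url="https://www.youtube.com/watch?v=dQw4w9WgXcQ")
    result = await extract_youtube_transcript(state)
    print(result["title"])
    print(result["metadata"]["video_id"])
    print(result["content"][:200] if result["content"] else "no transcript")

asyncio.run(main())
```
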
content_core/templated_message.py
ADDED

@@ -0,0 +1,70 @@
from typing import Dict, Optional, Union

from ai_prompter import Prompter
from esperanto import LanguageModel
from pydantic import BaseModel, Field

from content_core.common.retry import retry_llm
from content_core.logging import logger
from content_core.models import ModelFactory


class TemplatedMessageInput(BaseModel):
    system_prompt_template: Optional[str] = None
    system_prompt_text: Optional[str] = None
    user_prompt_template: Optional[str] = None
    user_prompt_text: Optional[str] = None
    data: Optional[Union[Dict, BaseModel]] = Field(default_factory=lambda: {})
    config: Dict = Field(
        description="The config for the LLM",
        default={
            "temperature": 0,
            "top_p": 1,
            "max_tokens": 600,
        },
    )


@retry_llm()
async def _execute_llm_call(model: LanguageModel, msgs: list) -> str:
    """Internal function to execute LLM call - wrapped with retry logic."""
    result = await model.achat_complete(msgs)
    return result.content


async def templated_message(
    input: TemplatedMessageInput, model: Optional[LanguageModel] = None
) -> Optional[str]:
    """
    Execute a templated LLM message with retry logic for transient failures.

    Args:
        input: TemplatedMessageInput with prompt templates and data
        model: Optional LanguageModel instance (defaults to factory model)

    Returns:
        Optional[str]: LLM response content, or None if all retries exhausted
    """
    if not model:
        model = ModelFactory.get_model("default_model")

    msgs = []
    if input.system_prompt_template or input.system_prompt_text:
        system_prompt = Prompter(
            prompt_template=input.system_prompt_template,
            template_text=input.system_prompt_text,
        ).render(data=input.data)
        msgs.append({"role": "system", "content": system_prompt})

    if input.user_prompt_template or input.user_prompt_text:
        user_prompt = Prompter(
            prompt_template=input.user_prompt_template,
            template_text=input.user_prompt_text,
        ).render(data=input.data)
        msgs.append({"role": "user", "content": user_prompt})

    try:
        return await _execute_llm_call(model, msgs)
    except Exception as e:
        logger.error(f"LLM call failed after retries: {e}")
        return None
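A hypothetical call (prompt text and template variables are illustrative; this assumes ai_prompter renders the `data` dict into Jinja-style placeholders):

```python
import asyncio

from content_core.templated_message import TemplatedMessageInput, templated_message

async def main():
    msg = TemplatedMessageInput(
        system_prompt_text="You are a concise technical writer.",
        user_prompt_text="Summarize in one sentence: {{ text }}",  # placeholder syntax assumed
        data={"text": "Content Core extracts and cleans content from URLs and files."},
    )
    # Uses ModelFactory's default model; returns None if retry_llm gives up.
    print(await templated_message(msg))

asyncio.run(main())
```
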
content_core/tools/cleanup.py
ADDED

@@ -0,0 +1,15 @@
from langchain_core.tools import tool

from content_core.content_cleanup import cleanup_content
from content_core.common import process_input_content


@tool
async def cleanup_content_tool(content: str) -> str:
    """
    Clean content. Rewrite paragraphs. Fix grammar and spelling.
    Accepts direct text, URLs, or file paths. If a URL or file path is provided,
    the content will be extracted first before cleaning.
    """
    content = await process_input_content(content)
    return await cleanup_content(content)
content_core/tools/extract.py
ADDED

@@ -0,0 +1,21 @@
from typing import Dict

from langchain_core.tools import tool

from content_core.extraction import extract_content


@tool
async def extract_content_tool(file_path_or_url: str) -> Dict:
    """
    Extract title, content and metadata from URLs and files.

    Args:
        file_path_or_url: URL or file path to extract content from.

    Returns:
        Dict: Extracted content and metadata.
    """
    if file_path_or_url.startswith("http"):
        return await extract_content({"url": file_path_or_url})
    return await extract_content({"file_path": file_path_or_url})
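LangChain tools are invoked with a dict of their arguments; a sketch (URL illustrative):

```python
import asyncio

from content_core.tools.extract import extract_content_tool

async def main():
    result = await extract_content_tool.ainvoke(
        {"file_path_or_url": "https://example.com/article"}
    )
    print(result.get("title"))

asyncio.run(main())
```
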
content_core/tools/summarize.py
ADDED

@@ -0,0 +1,17 @@
from typing import Optional

from langchain_core.tools import tool

from content_core.content_summary import summarize
from content_core.common import process_input_content


@tool
async def summarize_content_tool(content: str, context: Optional[str] = None) -> str:
    """
    Summarize content according to instructions provided via context.
    Accepts direct text, URLs, or file paths. If a URL or file path is provided,
    the content will be extracted first before summarizing.
    """
    content = await process_input_content(content)
    return await summarize(content, context or "")
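Taken together, the three tools can be handed to any tool-calling LangChain agent; a minimal sketch (import paths follow the file layout above):

```python
from content_core.tools.cleanup import cleanup_content_tool
from content_core.tools.extract import extract_content_tool
from content_core.tools.summarize import summarize_content_tool

# Any LangChain agent or tool-calling chat model accepts this list directly,
# e.g. model.bind_tools(tools).
tools = [extract_content_tool, cleanup_content_tool, summarize_content_tool]
for t in tools:
    print(t.name, "-", t.description.strip().splitlines()[0])
```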