content-core 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. content_core/__init__.py +216 -0
  2. content_core/cc_config.yaml +86 -0
  3. content_core/common/__init__.py +38 -0
  4. content_core/common/exceptions.py +70 -0
  5. content_core/common/retry.py +325 -0
  6. content_core/common/state.py +64 -0
  7. content_core/common/types.py +15 -0
  8. content_core/common/utils.py +31 -0
  9. content_core/config.py +575 -0
  10. content_core/content/__init__.py +6 -0
  11. content_core/content/cleanup/__init__.py +5 -0
  12. content_core/content/cleanup/core.py +15 -0
  13. content_core/content/extraction/__init__.py +13 -0
  14. content_core/content/extraction/graph.py +252 -0
  15. content_core/content/identification/__init__.py +9 -0
  16. content_core/content/identification/file_detector.py +505 -0
  17. content_core/content/summary/__init__.py +5 -0
  18. content_core/content/summary/core.py +15 -0
  19. content_core/logging.py +15 -0
  20. content_core/mcp/__init__.py +5 -0
  21. content_core/mcp/server.py +214 -0
  22. content_core/models.py +60 -0
  23. content_core/models_config.yaml +31 -0
  24. content_core/notebooks/run.ipynb +359 -0
  25. content_core/notebooks/urls.ipynb +154 -0
  26. content_core/processors/audio.py +272 -0
  27. content_core/processors/docling.py +79 -0
  28. content_core/processors/office.py +331 -0
  29. content_core/processors/pdf.py +292 -0
  30. content_core/processors/text.py +36 -0
  31. content_core/processors/url.py +324 -0
  32. content_core/processors/video.py +166 -0
  33. content_core/processors/youtube.py +262 -0
  34. content_core/py.typed +2 -0
  35. content_core/templated_message.py +70 -0
  36. content_core/tools/__init__.py +9 -0
  37. content_core/tools/cleanup.py +15 -0
  38. content_core/tools/extract.py +21 -0
  39. content_core/tools/summarize.py +17 -0
  40. content_core-1.10.0.dist-info/METADATA +742 -0
  41. content_core-1.10.0.dist-info/RECORD +44 -0
  42. content_core-1.10.0.dist-info/WHEEL +4 -0
  43. content_core-1.10.0.dist-info/entry_points.txt +5 -0
  44. content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,262 @@
1
+ import re
2
+ import ssl
3
+
4
+ import aiohttp
5
+ from bs4 import BeautifulSoup
6
+ from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
7
+ from youtube_transcript_api.formatters import TextFormatter # type: ignore
8
+
9
+ from content_core.common import ProcessSourceState
10
+ from content_core.common.exceptions import NoTranscriptFound
11
+ from content_core.common.retry import retry_youtube
12
+ from content_core.config import CONFIG, get_proxy, _redact_proxy_url
13
+ from content_core.logging import logger
14
+
15
+ ssl._create_default_https_context = ssl._create_unverified_context
16
+
17
+
18
@retry_youtube()
async def _fetch_video_title(video_id, proxy: str | None = None):
    """Fetch a YouTube video's title by scraping the watch page.

    Wrapped with ``retry_youtube`` so transient network failures are retried.

    Args:
        video_id: The 11-character YouTube video ID.
        proxy: Optional proxy URL; resolved through ``get_proxy``.

    Returns:
        str: The video title taken from the ``og:title`` meta tag.

    Raises:
        ValueError: If the page has no usable ``og:title`` meta tag
            (e.g. an unavailable or region-blocked video).
    """
    url = f"https://www.youtube.com/watch?v={video_id}"
    resolved_proxy = get_proxy(proxy)
    async with aiohttp.ClientSession() as session:
        async with session.get(url, proxy=resolved_proxy) as response:
            html = await response.text()

    # BeautifulSoup doesn't support async operations
    soup = BeautifulSoup(html, "html.parser")

    # YouTube stores the title in an Open Graph meta tag.  Guard the lookup:
    # previously a missing tag raised an opaque ``TypeError`` from
    # ``None["content"]`` instead of a descriptive error.
    meta = soup.find("meta", property="og:title")
    if meta is None or not meta.get("content"):
        raise ValueError(f"No og:title meta tag found for video {video_id}")
    return meta["content"]
33
+
34
+
35
async def get_video_title(video_id, proxy: str | None = None):
    """Return the video's title, or ``None`` when every retry attempt fails.

    Delegates to :func:`_fetch_video_title`, which carries the retry logic;
    this wrapper only converts a final failure into a ``None`` result so
    callers never have to handle the exception themselves.
    """
    try:
        title = await _fetch_video_title(video_id, proxy)
    except Exception as e:
        logger.error(f"Failed to get video title after retries: {e}")
        return None
    return title
42
+
43
+
44
async def _extract_youtube_id(url):
    """
    Extract the YouTube video ID from a given URL using regular expressions.

    Supports youtu.be short links and youtube.com /embed/, /v/, /shorts/,
    /live/ and /watch URLs.

    Args:
        url (str): The YouTube URL from which to extract the video ID.

    Returns:
        str: The extracted YouTube video ID or None if no valid ID is found.
    """
    # Define a regular expression pattern to capture the YouTube video ID
    youtube_regex = (
        r"(?:https?://)?"  # Optional scheme
        r"(?:www\.)?"  # Optional www.
        r"(?:"
        r"youtu\.be/"  # Shortened URL
        r"|youtube\.com"  # Main URL
        r"(?:"  # Group start
        r"/embed/"  # Embed URL
        r"|/v/"  # Older video URL
        r"|/shorts/"  # Shorts URL (added: previously unsupported)
        r"|/live/"  # Live-stream URL (added: previously unsupported)
        r"|/watch\?v="  # Standard watch URL
        r"|/watch\?.+&v="  # Other watch URL
        r")"  # Group end
        r")"  # End main group
        r"([\w-]{11})"  # 11 characters (YouTube video ID)
    )

    # Search the URL for the pattern
    match = re.search(youtube_regex, url)

    # Return the video ID if a match is found
    return match.group(1) if match else None
76
+
77
+
78
def _first_in_preferred_language(transcripts, preferred_langs):
    """Return the first transcript matching *preferred_langs* order, or None."""
    for lang in preferred_langs:
        for transcript in transcripts:
            if transcript.language_code == lang:
                return transcript
    return None


@retry_youtube()
async def _fetch_best_transcript(
    video_id, preferred_langs=("en", "es", "pt"), proxy: str | None = None
):
    """Fetch the best available transcript for *video_id*.

    Search order: manually created transcripts first, then auto-generated
    ones, and finally translatable tracks — each scanned in
    ``preferred_langs`` order.  Wrapped with ``retry_youtube`` so transient
    API failures are retried.

    Args:
        video_id: The 11-character YouTube video ID.
        preferred_langs: Language codes in order of preference.  (Default
            changed from a mutable list to an equivalent tuple; call sites
            are unaffected.)
        proxy: Optional proxy URL, resolved through ``get_proxy``.

    Returns:
        The fetched transcript data.

    Raises:
        NoTranscriptFound: If no transcript of any kind is available.
    """
    resolved_proxy = get_proxy(proxy)
    proxies = None
    if resolved_proxy:
        proxies = {"http": resolved_proxy, "https": resolved_proxy}
        logger.debug(
            f"YouTubeTranscriptApi using proxy: {_redact_proxy_url(resolved_proxy)}"
        )

    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies)

    # NOTE(review): the ``not is_translatable`` filters below are kept from
    # the original logic — most YouTube tracks report is_translatable=True,
    # so confirm these filters match the intended selection behavior.

    # First try: manual transcripts in preferred languages.
    try:
        manual = [
            t for t in transcript_list if not t.is_generated and not t.is_translatable
        ]
        if manual:
            best = _first_in_preferred_language(manual, preferred_langs)
            # Fall back to the first manual transcript when no preferred match.
            return (best or manual[0]).fetch()
    except NoTranscriptFound:
        pass

    # Second try: auto-generated transcripts in preferred languages.
    try:
        generated = [
            t for t in transcript_list if t.is_generated and not t.is_translatable
        ]
        if generated:
            best = _first_in_preferred_language(generated, preferred_langs)
            return (best or generated[0]).fetch()
    except NoTranscriptFound:
        pass

    # Last try: translatable transcripts, translating when needed.
    try:
        translatable = [t for t in transcript_list if t.is_translatable]
        if translatable:
            best = _first_in_preferred_language(translatable, preferred_langs)
            if best is not None:
                return best.fetch()
            # No preferred language available: translate the first track
            # into the top preferred language.
            return translatable[0].translate(preferred_langs[0]).fetch()
    except NoTranscriptFound:
        pass

    raise NoTranscriptFound("No suitable transcript found for this video")
147
+
148
+
149
async def get_best_transcript(
    video_id, preferred_langs=["en", "es", "pt"], proxy: str | None = None
):
    """Return the best transcript for *video_id*, or ``None`` on failure.

    Thin wrapper around :func:`_fetch_best_transcript`: retries happen in
    the wrapped call; any exception that survives them is logged here and
    swallowed so callers receive ``None`` instead of an error.
    """
    try:
        transcript = await _fetch_best_transcript(video_id, preferred_langs, proxy)
    except Exception as e:
        logger.error(
            f"Failed to get transcript for video {video_id} after retries: {e}"
        )
        return None
    return transcript
160
+
161
+
162
@retry_youtube()
def _fetch_transcript_pytubefix(
    url, languages=["en", "es", "pt"], proxy: str | None = None
):
    """Internal function that fetches transcript via pytubefix - wrapped with retry logic."""
    # Imported lazily so pytubefix is only required when this engine is used.
    # NOTE(review): ``languages`` is a mutable default argument; harmless here
    # since it is never mutated, but worth cleaning up.
    from pytubefix import YouTube

    resolved_proxy = get_proxy(proxy)
    proxies = None
    if resolved_proxy:
        proxies = {"http": resolved_proxy, "https": resolved_proxy}
        logger.debug(f"pytubefix using proxy: {_redact_proxy_url(resolved_proxy)}")

    yt = YouTube(url, proxies=proxies)
    logger.debug(f"Captions: {yt.captions}")

    # Try to get captions in the preferred languages
    if yt.captions:
        for lang in languages:
            if lang in yt.captions:
                # Caption track keyed by the plain language code.
                caption = yt.captions[lang]
                break
            elif f"a.{lang}" in yt.captions:
                # "a.<lang>" keys are YouTube's auto-generated tracks.
                caption = yt.captions[f"a.{lang}"]
                break
        else:  # No preferred language found, use the first available
            # NOTE(review): ``keys()`` here appears to yield objects with a
            # ``.code`` attribute, while the membership tests above use plain
            # strings — confirm against the pytubefix CaptionQuery API that
            # this branch does not raise AttributeError.
            caption_key = list(yt.captions.keys())[0]
            caption = yt.captions[caption_key.code]

        srt_captions = caption.generate_srt_captions()
        txt_captions = caption.generate_txt_captions()
        # Returns (plain text, SRT) for the selected track.
        return txt_captions, srt_captions

    # Video has no caption tracks at all.
    return None, None
196
+
197
+
198
def extract_transcript_pytubefix(
    url, languages=["en", "es", "pt"], proxy: str | None = None
):
    """Extract a transcript via pytubefix, returning ``(None, None)`` on failure.

    Retries live inside :func:`_fetch_transcript_pytubefix`; this wrapper
    only turns an exhausted-retries exception into an empty result pair.
    """
    try:
        result = _fetch_transcript_pytubefix(url, languages, proxy)
    except Exception as e:
        logger.error(f"Failed to extract transcript via pytubefix after retries: {e}")
        result = (None, None)
    return result
207
+
208
+
209
async def extract_youtube_transcript(state: ProcessSourceState):
    """
    Extract a YouTube video's title and transcript into a result dict.

    Reads ``state.url``, resolves the video ID, fetches the title, and pulls
    the transcript with the selected engine.  Returns a dict with ``content``
    (formatted transcript text), ``title`` and ``metadata``.

    Proxy configuration is passed through state.proxy and resolved using get_proxy().
    """

    assert state.url, "No URL provided"
    logger.debug(f"Extracting transcript from URL: {state.url}")
    # Preferred transcript languages come from config, with a hard-coded fallback.
    languages = CONFIG.get("youtube_transcripts", {}).get(
        "preferred_languages", ["en", "es", "pt"]
    )
    proxy = state.proxy

    # quick fix since transcripts api is not working for now
    engine = "pytubefix"
    video_id = await _extract_youtube_id(state.url)

    try:
        title = await get_video_title(video_id, proxy)
    except Exception as e:
        # get_video_title already swallows errors, but guard anyway so a
        # title failure never aborts transcript extraction.
        logger.critical(f"Failed to get video title for video_id: {video_id}")
        logger.exception(e)
        title = ""

    if engine == "pytubefix":
        formatted_content, transcript_raw = extract_transcript_pytubefix(
            state.url, languages, proxy
        )
    if engine == "transcripts-api":
        # Legacy engine, currently unreachable due to the hard-coded
        # ``engine`` value above; kept for when the transcripts API recovers.
        transcript = await get_best_transcript(video_id, languages, proxy)

        logger.debug(f"Found transcript: {transcript}")
        formatter = TextFormatter()

        try:
            formatted_content = formatter.format_transcript(transcript)
        except Exception as e:
            logger.critical(f"Failed to format transcript for video_id: {video_id}")
            logger.exception(e)
            formatted_content = ""

        try:
            # Raw per-snippet transcript data kept alongside the formatted text.
            transcript_raw = transcript.to_raw_data()
        except Exception as e:
            logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
            logger.exception(e)
            transcript_raw = ""

    return {
        "content": formatted_content,
        "title": title,
        "metadata": {"video_id": video_id, "transcript": transcript_raw},
    }
content_core/py.typed ADDED
@@ -0,0 +1,2 @@
1
+ # This file indicates that the content_core package supports type checking.
2
+ # See PEP 561 for more information: https://www.python.org/dev/peps/pep-0561/
@@ -0,0 +1,70 @@
1
+ from typing import Dict, Optional, Union
2
+
3
+ from ai_prompter import Prompter
4
+ from esperanto import LanguageModel
5
+ from pydantic import BaseModel, Field
6
+
7
+ from content_core.common.retry import retry_llm
8
+ from content_core.logging import logger
9
+ from content_core.models import ModelFactory
10
+
11
+
12
class TemplatedMessageInput(BaseModel):
    """Input payload for ``templated_message``: prompt templates plus LLM config.

    For each of the system and user prompts, either a named template
    (``*_template``) or inline template text (``*_text``) may be supplied;
    the corresponding message is only built when one of the pair is set.
    """

    # Name of a Prompter template for the system message.
    system_prompt_template: Optional[str] = None
    # Inline template text for the system message (alternative to the above).
    system_prompt_text: Optional[str] = None
    # Name of a Prompter template for the user message.
    user_prompt_template: Optional[str] = None
    # Inline template text for the user message.
    user_prompt_text: Optional[str] = None
    # Variables rendered into the templates (dict or pydantic model).
    data: Optional[Union[Dict, BaseModel]] = Field(default_factory=lambda: {})
    # Generation parameters passed to the LLM; pydantic copies the default
    # per instance, so the mutable dict default is safe here.
    config: Dict = Field(
        description="The config for the LLM",
        default={
            "temperature": 0,
            "top_p": 1,
            "max_tokens": 600,
        },
    )
26
+
27
+
28
@retry_llm()
async def _execute_llm_call(model: LanguageModel, msgs: list) -> str:
    """Send *msgs* to *model* and return the completion text.

    Kept as a separate function so the ``retry_llm`` decorator wraps only
    the network call itself, not the prompt-rendering work.
    """
    completion = await model.achat_complete(msgs)
    return completion.content
33
+
34
+
35
async def templated_message(
    input: TemplatedMessageInput, model: Optional[LanguageModel] = None
) -> Optional[str]:
    """
    Execute a templated LLM message with retry logic for transient failures.

    Args:
        input: TemplatedMessageInput with prompt templates and data.
            (Parameter name shadows the builtin but is kept for backward
            compatibility with keyword callers.)
        model: Optional LanguageModel instance (defaults to factory model)

    Returns:
        Optional[str]: LLM response content, or None if all retries exhausted
    """
    model = model or ModelFactory.get_model("default_model")

    def _render(template, text):
        # Render one prompt from either a named template or inline text.
        return Prompter(prompt_template=template, template_text=text).render(
            data=input.data
        )

    msgs = []
    if input.system_prompt_template or input.system_prompt_text:
        msgs.append(
            {
                "role": "system",
                "content": _render(
                    input.system_prompt_template, input.system_prompt_text
                ),
            }
        )

    if input.user_prompt_template or input.user_prompt_text:
        msgs.append(
            {
                "role": "user",
                "content": _render(input.user_prompt_template, input.user_prompt_text),
            }
        )

    try:
        return await _execute_llm_call(model, msgs)
    except Exception as e:
        logger.error(f"LLM call failed after retries: {e}")
        return None
@@ -0,0 +1,9 @@
1
+ from .cleanup import cleanup_content_tool
2
+ from .extract import extract_content_tool
3
+ from .summarize import summarize_content_tool
4
+
5
+ __all__ = [
6
+ "cleanup_content_tool",
7
+ "extract_content_tool",
8
+ "summarize_content_tool",
9
+ ]
@@ -0,0 +1,15 @@
1
+ from langchain_core.tools import tool
2
+
3
+ from content_core.content_cleanup import cleanup_content
4
+ from content_core.common import process_input_content
5
+
6
+
7
@tool
async def cleanup_content_tool(content: str) -> str:
    """
    Clean content. Rewrite paragraphs. Fix grammar and spelling.
    Accepts direct text, URLs, or file paths. If a URL or file path is provided,
    the content will be extracted first before cleaning.
    """
    # The docstring doubles as the LLM-facing tool description, so it is
    # kept verbatim.  Resolve URLs/paths to raw text, then clean that text.
    extracted = await process_input_content(content)
    return await cleanup_content(extracted)
@@ -0,0 +1,21 @@
1
+ from typing import Dict
2
+
3
+ from langchain_core.tools import tool
4
+
5
+ from content_core.extraction import extract_content
6
+
7
+
8
@tool
async def extract_content_tool(file_path_or_url: str) -> Dict:
    """
    Extract title, content and metadata from URLs and Links.

    Args:
        file_path_or_url: URL or file path to extract content from.

    Returns:
        Dict: Extracted content and metadata.
    """
    # Only treat genuine http(s) URLs as URLs; the previous bare
    # ``startswith("http")`` check misrouted local files whose names merely
    # begin with "http" (e.g. "http_notes.txt").
    if file_path_or_url.startswith(("http://", "https://")):
        return await extract_content({"url": file_path_or_url})
    return await extract_content({"file_path": file_path_or_url})
@@ -0,0 +1,17 @@
1
+ from typing import Optional
2
+
3
+ from langchain_core.tools import tool
4
+
5
+ from content_core.content_summary import summarize
6
+ from content_core.common import process_input_content
7
+
8
+
9
@tool
async def summarize_content_tool(content: str, context: Optional[str] = None) -> str:
    """
    Summarize content according to instructions provided via context.
    Accepts direct text, URLs, or file paths. If a URL or file path is provided,
    the content will be extracted first before summarizing.
    """
    # Docstring is the LLM-facing tool description — kept verbatim.
    text = await process_input_content(content)
    instructions = context if context else ""
    return await summarize(text, instructions)