content-core 1.0.3__tar.gz → 1.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- {content_core-1.0.3 → content_core-1.0.4}/PKG-INFO +1 -1
- {content_core-1.0.3 → content_core-1.0.4}/pyproject.toml +1 -1
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/content/summary/core.py +1 -1
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/processors/youtube.py +17 -10
- {content_core-1.0.3 → content_core-1.0.4}/uv.lock +1 -1
- {content_core-1.0.3 → content_core-1.0.4}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/.github/workflows/publish.yml +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/.gitignore +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/.python-version +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/CONTRIBUTING.md +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/LICENSE +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/Makefile +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/README.md +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/docs/processors.md +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/docs/usage.md +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/prompts/content/cleanup.jinja +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/prompts/content/summarize.jinja +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/cc_config.yaml +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/common/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/common/exceptions.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/common/state.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/common/types.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/common/utils.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/config.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/content/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/content/cleanup/core.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/content/extraction/graph.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/content/identification/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/logging.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/models.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/models_config.yaml +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/notebooks/run.ipynb +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/processors/audio.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/processors/docling.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/processors/office.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/processors/pdf.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/processors/text.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/processors/url.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/processors/video.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/py.typed +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/templated_message.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/tools/__init__.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/tools/cleanup.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/tools/extract.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/src/content_core/tools/summarize.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/tests/input_content/file.docx +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/tests/input_content/file.epub +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/tests/input_content/file.md +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/tests/input_content/file.mp3 +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/tests/input_content/file.mp4 +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/tests/input_content/file.pdf +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/tests/input_content/file.pptx +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/tests/input_content/file.txt +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/tests/input_content/file.xlsx +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/tests/input_content/file_audio.mp3 +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/tests/integration/test_cli.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/tests/integration/test_extraction.py +0 -0
- {content_core-1.0.3 → content_core-1.0.4}/tests/unit/test_docling.py +0 -0
|
@@ -8,7 +8,7 @@ async def summarize(content: str, context: str) -> str:
|
|
|
8
8
|
templated_message_fn = partial(templated_message, model=ModelFactory.get_model('summary_model'))
|
|
9
9
|
response = await templated_message_fn(
|
|
10
10
|
TemplatedMessageInput(
|
|
11
|
-
user_prompt_template="content/summarize",
|
|
11
|
+
user_prompt_template="prompts/content/summarize",
|
|
12
12
|
data={"content": content, "context": context},
|
|
13
13
|
)
|
|
14
14
|
)
|
|
@@ -3,13 +3,12 @@ import ssl
|
|
|
3
3
|
|
|
4
4
|
import aiohttp
|
|
5
5
|
from bs4 import BeautifulSoup
|
|
6
|
-
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
|
7
|
-
from youtube_transcript_api.formatters import TextFormatter # type: ignore
|
|
8
|
-
|
|
9
6
|
from content_core.common import ProcessSourceState
|
|
10
7
|
from content_core.common.exceptions import NoTranscriptFound
|
|
11
8
|
from content_core.config import CONFIG
|
|
12
9
|
from content_core.logging import logger
|
|
10
|
+
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
|
11
|
+
from youtube_transcript_api.formatters import TextFormatter # type: ignore
|
|
13
12
|
|
|
14
13
|
ssl._create_default_https_context = ssl._create_unverified_context
|
|
15
14
|
|
|
@@ -137,7 +136,7 @@ def extract_transcript_pytubefix(url, languages=["en", "es", "pt"]):
|
|
|
137
136
|
from pytubefix import YouTube
|
|
138
137
|
|
|
139
138
|
yt = YouTube(url)
|
|
140
|
-
|
|
139
|
+
logger.debug(f"Captions: {yt.captions}")
|
|
141
140
|
|
|
142
141
|
# Try to get captions in the preferred languages
|
|
143
142
|
if yt.captions:
|
|
@@ -149,12 +148,20 @@ def extract_transcript_pytubefix(url, languages=["en", "es", "pt"]):
|
|
|
149
148
|
caption = yt.captions[f"a.{lang}"]
|
|
150
149
|
break
|
|
151
150
|
else: # No preferred language found, use the first available
|
|
152
|
-
caption_key =
|
|
153
|
-
caption = yt.captions[caption_key]
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
151
|
+
caption_key = list(yt.captions.keys())[0]
|
|
152
|
+
caption = yt.captions[caption_key.code]
|
|
153
|
+
try:
|
|
154
|
+
srt_captions = caption.generate_srt_captions()
|
|
155
|
+
txt_captions = caption.generate_txt_captions()
|
|
156
|
+
return txt_captions, srt_captions
|
|
157
|
+
except KeyError as e:
|
|
158
|
+
logger.error(f"KeyError while generating captions for {caption}: {e}")
|
|
159
|
+
return None, None
|
|
160
|
+
except Exception as e:
|
|
161
|
+
logger.error(
|
|
162
|
+
f"Unexpected error while generating captions for {caption}: {e}"
|
|
163
|
+
)
|
|
164
|
+
return None, None
|
|
158
165
|
|
|
159
166
|
return None, None
|
|
160
167
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{content_core-1.0.3 → content_core-1.0.4}/src/content_core/content/identification/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|