content-core 1.0.1__tar.gz → 1.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- {content_core-1.0.1 → content_core-1.0.3}/PKG-INFO +4 -2
- {content_core-1.0.1 → content_core-1.0.3}/pyproject.toml +4 -2
- content_core-1.0.3/src/content_core/processors/youtube.py +212 -0
- {content_core-1.0.1 → content_core-1.0.3}/uv.lock +25 -8
- content_core-1.0.1/src/content_core/processors/youtube.py +0 -195
- {content_core-1.0.1 → content_core-1.0.3}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/.github/workflows/publish.yml +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/.gitignore +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/.python-version +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/CONTRIBUTING.md +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/LICENSE +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/Makefile +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/README.md +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/docs/processors.md +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/docs/usage.md +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/prompts/content/cleanup.jinja +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/prompts/content/summarize.jinja +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/__init__.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/cc_config.yaml +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/common/__init__.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/common/exceptions.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/common/state.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/common/types.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/common/utils.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/config.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/content/__init__.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/content/cleanup/core.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/content/extraction/graph.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/content/identification/__init__.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/content/summary/core.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/logging.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/models.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/models_config.yaml +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/notebooks/run.ipynb +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/processors/audio.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/processors/docling.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/processors/office.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/processors/pdf.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/processors/text.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/processors/url.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/processors/video.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/py.typed +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/templated_message.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/tools/__init__.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/tools/cleanup.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/tools/extract.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/src/content_core/tools/summarize.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/tests/input_content/file.docx +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/tests/input_content/file.epub +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/tests/input_content/file.md +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/tests/input_content/file.mp3 +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/tests/input_content/file.mp4 +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/tests/input_content/file.pdf +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/tests/input_content/file.pptx +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/tests/input_content/file.txt +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/tests/input_content/file.xlsx +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/tests/input_content/file_audio.mp3 +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/tests/integration/test_cli.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/tests/integration/test_extraction.py +0 -0
- {content_core-1.0.1 → content_core-1.0.3}/tests/unit/test_docling.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 1.0.1
|
|
3
|
+
Version: 1.0.3
|
|
4
4
|
Summary: Extract what matters from any media source
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -11,7 +11,7 @@ Requires-Dist: asciidoc>=10.2.1
|
|
|
11
11
|
Requires-Dist: bs4>=0.0.2
|
|
12
12
|
Requires-Dist: dicttoxml>=1.7.16
|
|
13
13
|
Requires-Dist: docling>=2.34.0
|
|
14
|
-
Requires-Dist: esperanto
|
|
14
|
+
Requires-Dist: esperanto>=1.2.0
|
|
15
15
|
Requires-Dist: firecrawl-py>=2.7.0
|
|
16
16
|
Requires-Dist: jinja2>=3.1.6
|
|
17
17
|
Requires-Dist: langdetect>=1.0.9
|
|
@@ -24,8 +24,10 @@ Requires-Dist: pillow>=10.4.0
|
|
|
24
24
|
Requires-Dist: pymupdf>=1.25.5
|
|
25
25
|
Requires-Dist: python-docx>=1.1.2
|
|
26
26
|
Requires-Dist: python-dotenv>=1.1.0
|
|
27
|
+
Requires-Dist: python-magic-bin==0.4.14; sys_platform == 'win32'
|
|
27
28
|
Requires-Dist: python-magic>=0.4.27
|
|
28
29
|
Requires-Dist: python-pptx>=1.0.2
|
|
30
|
+
Requires-Dist: pytubefix>=9.1.1
|
|
29
31
|
Requires-Dist: readability-lxml>=0.8.4.1
|
|
30
32
|
Requires-Dist: validators>=0.34.0
|
|
31
33
|
Requires-Dist: youtube-transcript-api>=1.0.3
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "content-core"
|
|
3
|
-
version = "1.0.1"
|
|
3
|
+
version = "1.0.3"
|
|
4
4
|
description = "Extract what matters from any media source"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
homepage = "https://github.com/lfnovo/content-core"
|
|
@@ -11,7 +11,7 @@ requires-python = ">=3.10"
|
|
|
11
11
|
dependencies = [
|
|
12
12
|
"aiohttp>=3.11",
|
|
13
13
|
"bs4>=0.0.2",
|
|
14
|
-
"esperanto
|
|
14
|
+
"esperanto>=1.2.0",
|
|
15
15
|
"jinja2>=3.1.6",
|
|
16
16
|
"langdetect>=1.0.9",
|
|
17
17
|
"loguru>=0.7.3",
|
|
@@ -33,6 +33,8 @@ dependencies = [
|
|
|
33
33
|
"docling>=2.34.0",
|
|
34
34
|
"pillow>=10.4.0",
|
|
35
35
|
"asciidoc>=10.2.1",
|
|
36
|
+
"python-magic-bin==0.4.14; sys_platform == 'win32'",
|
|
37
|
+
"pytubefix>=9.1.1",
|
|
36
38
|
]
|
|
37
39
|
|
|
38
40
|
[project.scripts]
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import ssl
|
|
3
|
+
|
|
4
|
+
import aiohttp
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
|
7
|
+
from youtube_transcript_api.formatters import TextFormatter # type: ignore
|
|
8
|
+
|
|
9
|
+
from content_core.common import ProcessSourceState
|
|
10
|
+
from content_core.common.exceptions import NoTranscriptFound
|
|
11
|
+
from content_core.config import CONFIG
|
|
12
|
+
from content_core.logging import logger
|
|
13
|
+
|
|
14
|
+
ssl._create_default_https_context = ssl._create_unverified_context
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
async def get_video_title(video_id):
|
|
18
|
+
try:
|
|
19
|
+
url = f"https://www.youtube.com/watch?v={video_id}"
|
|
20
|
+
async with aiohttp.ClientSession() as session:
|
|
21
|
+
async with session.get(url) as response:
|
|
22
|
+
html = await response.text()
|
|
23
|
+
|
|
24
|
+
# BeautifulSoup doesn't support async operations
|
|
25
|
+
soup = BeautifulSoup(html, "html.parser")
|
|
26
|
+
|
|
27
|
+
# YouTube stores title in a meta tag
|
|
28
|
+
title = soup.find("meta", property="og:title")["content"]
|
|
29
|
+
return title
|
|
30
|
+
|
|
31
|
+
except Exception as e:
|
|
32
|
+
logger.error(f"Failed to get video title: {e}")
|
|
33
|
+
return None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
async def _extract_youtube_id(url):
|
|
37
|
+
"""
|
|
38
|
+
Extract the YouTube video ID from a given URL using regular expressions.
|
|
39
|
+
|
|
40
|
+
Args:
|
|
41
|
+
url (str): The YouTube URL from which to extract the video ID.
|
|
42
|
+
|
|
43
|
+
Returns:
|
|
44
|
+
str: The extracted YouTube video ID or None if no valid ID is found.
|
|
45
|
+
"""
|
|
46
|
+
# Define a regular expression pattern to capture the YouTube video ID
|
|
47
|
+
youtube_regex = (
|
|
48
|
+
r"(?:https?://)?" # Optional scheme
|
|
49
|
+
r"(?:www\.)?" # Optional www.
|
|
50
|
+
r"(?:"
|
|
51
|
+
r"youtu\.be/" # Shortened URL
|
|
52
|
+
r"|youtube\.com" # Main URL
|
|
53
|
+
r"(?:" # Group start
|
|
54
|
+
r"/embed/" # Embed URL
|
|
55
|
+
r"|/v/" # Older video URL
|
|
56
|
+
r"|/watch\?v=" # Standard watch URL
|
|
57
|
+
r"|/watch\?.+&v=" # Other watch URL
|
|
58
|
+
r")" # Group end
|
|
59
|
+
r")" # End main group
|
|
60
|
+
r"([\w-]{11})" # 11 characters (YouTube video ID)
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
# Search the URL for the pattern
|
|
64
|
+
match = re.search(youtube_regex, url)
|
|
65
|
+
|
|
66
|
+
# Return the video ID if a match is found
|
|
67
|
+
return match.group(1) if match else None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
async def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
|
|
71
|
+
try:
|
|
72
|
+
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
|
|
73
|
+
|
|
74
|
+
# First try: Manual transcripts in preferred languages
|
|
75
|
+
manual_transcripts = []
|
|
76
|
+
try:
|
|
77
|
+
for transcript in transcript_list:
|
|
78
|
+
if not transcript.is_generated and not transcript.is_translatable:
|
|
79
|
+
manual_transcripts.append(transcript)
|
|
80
|
+
|
|
81
|
+
if manual_transcripts:
|
|
82
|
+
# Sort based on preferred language order
|
|
83
|
+
for lang in preferred_langs:
|
|
84
|
+
for transcript in manual_transcripts:
|
|
85
|
+
if transcript.language_code == lang:
|
|
86
|
+
return transcript.fetch()
|
|
87
|
+
# If no preferred language found, return first manual transcript
|
|
88
|
+
return manual_transcripts[0].fetch()
|
|
89
|
+
except NoTranscriptFound:
|
|
90
|
+
pass
|
|
91
|
+
|
|
92
|
+
# Second try: Auto-generated transcripts in preferred languages
|
|
93
|
+
generated_transcripts = []
|
|
94
|
+
try:
|
|
95
|
+
for transcript in transcript_list:
|
|
96
|
+
if transcript.is_generated and not transcript.is_translatable:
|
|
97
|
+
generated_transcripts.append(transcript)
|
|
98
|
+
|
|
99
|
+
if generated_transcripts:
|
|
100
|
+
# Sort based on preferred language order
|
|
101
|
+
for lang in preferred_langs:
|
|
102
|
+
for transcript in generated_transcripts:
|
|
103
|
+
if transcript.language_code == lang:
|
|
104
|
+
return transcript.fetch()
|
|
105
|
+
# If no preferred language found, return first generated transcript
|
|
106
|
+
return generated_transcripts[0].fetch()
|
|
107
|
+
except NoTranscriptFound:
|
|
108
|
+
pass
|
|
109
|
+
|
|
110
|
+
# Last try: Translated transcripts in preferred languages
|
|
111
|
+
translated_transcripts = []
|
|
112
|
+
try:
|
|
113
|
+
for transcript in transcript_list:
|
|
114
|
+
if transcript.is_translatable:
|
|
115
|
+
translated_transcripts.append(transcript)
|
|
116
|
+
|
|
117
|
+
if translated_transcripts:
|
|
118
|
+
# Sort based on preferred language order
|
|
119
|
+
for lang in preferred_langs:
|
|
120
|
+
for transcript in translated_transcripts:
|
|
121
|
+
if transcript.language_code == lang:
|
|
122
|
+
return transcript.fetch()
|
|
123
|
+
# If no preferred language found, return translation to first preferred language
|
|
124
|
+
translation = translated_transcripts[0].translate(preferred_langs[0])
|
|
125
|
+
return translation.fetch()
|
|
126
|
+
except NoTranscriptFound:
|
|
127
|
+
pass
|
|
128
|
+
|
|
129
|
+
raise Exception("No suitable transcript found")
|
|
130
|
+
|
|
131
|
+
except Exception as e:
|
|
132
|
+
logger.error(f"Failed to get transcript for video {video_id}: {e}")
|
|
133
|
+
return None
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def extract_transcript_pytubefix(url, languages=["en", "es", "pt"]):
|
|
137
|
+
from pytubefix import YouTube
|
|
138
|
+
|
|
139
|
+
yt = YouTube(url)
|
|
140
|
+
print(yt.captions)
|
|
141
|
+
|
|
142
|
+
# Try to get captions in the preferred languages
|
|
143
|
+
if yt.captions:
|
|
144
|
+
for lang in languages:
|
|
145
|
+
if lang in yt.captions:
|
|
146
|
+
caption = yt.captions[lang]
|
|
147
|
+
break
|
|
148
|
+
elif f"a.{lang}" in yt.captions:
|
|
149
|
+
caption = yt.captions[f"a.{lang}"]
|
|
150
|
+
break
|
|
151
|
+
else: # No preferred language found, use the first available
|
|
152
|
+
caption_key = next(iter(yt.captions))
|
|
153
|
+
caption = yt.captions[caption_key]
|
|
154
|
+
|
|
155
|
+
srt_captions = caption.generate_srt_captions()
|
|
156
|
+
txt_captions = caption.generate_txt_captions()
|
|
157
|
+
return txt_captions, srt_captions
|
|
158
|
+
|
|
159
|
+
return None, None
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
async def extract_youtube_transcript(state: ProcessSourceState):
|
|
163
|
+
"""
|
|
164
|
+
Parse the text file and print its content.
|
|
165
|
+
"""
|
|
166
|
+
|
|
167
|
+
assert state.url, "No URL provided"
|
|
168
|
+
logger.warning(f"Extracting transcript from URL: {state.url}")
|
|
169
|
+
languages = CONFIG.get("youtube_transcripts", {}).get(
|
|
170
|
+
"preferred_languages", ["en", "es", "pt"]
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
# quick fix since transcripts api is not working for now
|
|
174
|
+
engine = "pytubefix"
|
|
175
|
+
video_id = await _extract_youtube_id(state.url)
|
|
176
|
+
|
|
177
|
+
try:
|
|
178
|
+
title = await get_video_title(video_id)
|
|
179
|
+
except Exception as e:
|
|
180
|
+
logger.critical(f"Failed to get video title for video_id: {video_id}")
|
|
181
|
+
logger.exception(e)
|
|
182
|
+
title = ""
|
|
183
|
+
|
|
184
|
+
if engine == "pytubefix":
|
|
185
|
+
formatted_content, transcript_raw = extract_transcript_pytubefix(
|
|
186
|
+
state.url, languages
|
|
187
|
+
)
|
|
188
|
+
if engine == "transcripts-api":
|
|
189
|
+
transcript = await get_best_transcript(video_id, languages)
|
|
190
|
+
|
|
191
|
+
logger.debug(f"Found transcript: {transcript}")
|
|
192
|
+
formatter = TextFormatter()
|
|
193
|
+
|
|
194
|
+
try:
|
|
195
|
+
formatted_content = formatter.format_transcript(transcript)
|
|
196
|
+
except Exception as e:
|
|
197
|
+
logger.critical(f"Failed to format transcript for video_id: {video_id}")
|
|
198
|
+
logger.exception(e)
|
|
199
|
+
formatted_content = ""
|
|
200
|
+
|
|
201
|
+
try:
|
|
202
|
+
transcript_raw = transcript.to_raw_data()
|
|
203
|
+
except Exception as e:
|
|
204
|
+
logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
|
|
205
|
+
logger.exception(e)
|
|
206
|
+
transcript_raw = ""
|
|
207
|
+
|
|
208
|
+
return {
|
|
209
|
+
"content": formatted_content,
|
|
210
|
+
"title": title,
|
|
211
|
+
"metadata": {"video_id": video_id, "transcript": transcript_raw},
|
|
212
|
+
}
|
|
@@ -410,7 +410,7 @@ wheels = [
|
|
|
410
410
|
|
|
411
411
|
[[package]]
|
|
412
412
|
name = "content-core"
|
|
413
|
-
version = "1.0.1"
|
|
413
|
+
version = "1.0.3"
|
|
414
414
|
source = { editable = "." }
|
|
415
415
|
dependencies = [
|
|
416
416
|
{ name = "ai-prompter" },
|
|
@@ -419,7 +419,7 @@ dependencies = [
|
|
|
419
419
|
{ name = "bs4" },
|
|
420
420
|
{ name = "dicttoxml" },
|
|
421
421
|
{ name = "docling" },
|
|
422
|
-
{ name = "esperanto", extra = ["openai"] },
|
|
422
|
+
{ name = "esperanto" },
|
|
423
423
|
{ name = "firecrawl-py" },
|
|
424
424
|
{ name = "jinja2" },
|
|
425
425
|
{ name = "langdetect" },
|
|
@@ -433,7 +433,9 @@ dependencies = [
|
|
|
433
433
|
{ name = "python-docx" },
|
|
434
434
|
{ name = "python-dotenv" },
|
|
435
435
|
{ name = "python-magic" },
|
|
436
|
+
{ name = "python-magic-bin", marker = "(platform_machine != 'aarch64' and platform_system == 'Linux' and sys_platform == 'win32') or (platform_system != 'Darwin' and platform_system != 'Linux' and sys_platform == 'win32')" },
|
|
436
437
|
{ name = "python-pptx" },
|
|
438
|
+
{ name = "pytubefix" },
|
|
437
439
|
{ name = "readability-lxml" },
|
|
438
440
|
{ name = "validators" },
|
|
439
441
|
{ name = "youtube-transcript-api" },
|
|
@@ -457,7 +459,7 @@ requires-dist = [
|
|
|
457
459
|
{ name = "bs4", specifier = ">=0.0.2" },
|
|
458
460
|
{ name = "dicttoxml", specifier = ">=1.7.16" },
|
|
459
461
|
{ name = "docling", specifier = ">=2.34.0" },
|
|
460
|
-
{ name = "esperanto",
|
|
462
|
+
{ name = "esperanto", specifier = ">=1.2.0" },
|
|
461
463
|
{ name = "firecrawl-py", specifier = ">=2.7.0" },
|
|
462
464
|
{ name = "jinja2", specifier = ">=3.1.6" },
|
|
463
465
|
{ name = "langdetect", specifier = ">=1.0.9" },
|
|
@@ -471,7 +473,9 @@ requires-dist = [
|
|
|
471
473
|
{ name = "python-docx", specifier = ">=1.1.2" },
|
|
472
474
|
{ name = "python-dotenv", specifier = ">=1.1.0" },
|
|
473
475
|
{ name = "python-magic", specifier = ">=0.4.27" },
|
|
476
|
+
{ name = "python-magic-bin", marker = "sys_platform == 'win32'", specifier = "==0.4.14" },
|
|
474
477
|
{ name = "python-pptx", specifier = ">=1.0.2" },
|
|
478
|
+
{ name = "pytubefix", specifier = ">=9.1.1" },
|
|
475
479
|
{ name = "readability-lxml", specifier = ">=0.8.4.1" },
|
|
476
480
|
{ name = "validators", specifier = ">=0.34.0" },
|
|
477
481
|
{ name = "youtube-transcript-api", specifier = ">=1.0.3" },
|
|
@@ -729,11 +733,6 @@ wheels = [
|
|
|
729
733
|
{ url = "https://files.pythonhosted.org/packages/f9/79/5d74f2b8f9d73da83bfe80a39ff11505a2a285c03a869750db98cd89ddfd/esperanto-1.2.1-py3-none-any.whl", hash = "sha256:2fa41e5e35c847b1fe58395906d8877035f7e55d6429870d897781f7c9f17c42", size = 57680 },
|
|
730
734
|
]
|
|
731
735
|
|
|
732
|
-
[package.optional-dependencies]
|
|
733
|
-
openai = [
|
|
734
|
-
{ name = "openai" },
|
|
735
|
-
]
|
|
736
|
-
|
|
737
736
|
[[package]]
|
|
738
737
|
name = "et-xmlfile"
|
|
739
738
|
version = "2.0.0"
|
|
@@ -2919,6 +2918,15 @@ wheels = [
|
|
|
2919
2918
|
{ url = "https://files.pythonhosted.org/packages/6c/73/9f872cb81fc5c3bb48f7227872c28975f998f3e7c2b1c16e95e6432bbb90/python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3", size = 13840 },
|
|
2920
2919
|
]
|
|
2921
2920
|
|
|
2921
|
+
[[package]]
|
|
2922
|
+
name = "python-magic-bin"
|
|
2923
|
+
version = "0.4.14"
|
|
2924
|
+
source = { registry = "https://pypi.org/simple" }
|
|
2925
|
+
wheels = [
|
|
2926
|
+
{ url = "https://files.pythonhosted.org/packages/5a/5d/10b9ac745d9fd2f7151a2ab901e6bb6983dbd70e87c71111f54859d1ca2e/python_magic_bin-0.4.14-py2.py3-none-win32.whl", hash = "sha256:34a788c03adde7608028203e2dbb208f1f62225ad91518787ae26d603ae68892", size = 397784 },
|
|
2927
|
+
{ url = "https://files.pythonhosted.org/packages/07/c2/094e3d62b906d952537196603a23aec4bcd7c6126bf80eb14e6f9f4be3a2/python_magic_bin-0.4.14-py2.py3-none-win_amd64.whl", hash = "sha256:90be6206ad31071a36065a2fc169c5afb5e0355cbe6030e87641c6c62edc2b69", size = 409299 },
|
|
2928
|
+
]
|
|
2929
|
+
|
|
2922
2930
|
[[package]]
|
|
2923
2931
|
name = "python-pptx"
|
|
2924
2932
|
version = "1.0.2"
|
|
@@ -2934,6 +2942,15 @@ wheels = [
|
|
|
2934
2942
|
{ url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788 },
|
|
2935
2943
|
]
|
|
2936
2944
|
|
|
2945
|
+
[[package]]
|
|
2946
|
+
name = "pytubefix"
|
|
2947
|
+
version = "9.1.1"
|
|
2948
|
+
source = { registry = "https://pypi.org/simple" }
|
|
2949
|
+
sdist = { url = "https://files.pythonhosted.org/packages/3c/06/8570fb8fc1296dae7f156e4de57c2b351856e7813873178e1cbb8045eef3/pytubefix-9.1.1.tar.gz", hash = "sha256:68946ab2192d7bb9d8fcc0fe73f634bb0ab0cd33f2c3c718e65c0c4fbdbccbb1", size = 734325 }
|
|
2950
|
+
wheels = [
|
|
2951
|
+
{ url = "https://files.pythonhosted.org/packages/5d/fd/80ba35c78cbd007bfdb71d83b64087cca10e671cae4eb77875c952a21734/pytubefix-9.1.1-py3-none-any.whl", hash = "sha256:cc1c9cca936b82fcbf136e4630639417072aa5fdacf54ec0426604ca81c33b77", size = 732005 },
|
|
2952
|
+
]
|
|
2953
|
+
|
|
2937
2954
|
[[package]]
|
|
2938
2955
|
name = "pytz"
|
|
2939
2956
|
version = "2025.2"
|
|
@@ -1,195 +0,0 @@
|
|
|
1
|
-
import asyncio
|
|
2
|
-
import re
|
|
3
|
-
import ssl
|
|
4
|
-
|
|
5
|
-
import aiohttp
|
|
6
|
-
from bs4 import BeautifulSoup
|
|
7
|
-
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
|
8
|
-
from youtube_transcript_api.formatters import TextFormatter # type: ignore
|
|
9
|
-
|
|
10
|
-
from content_core.common import ProcessSourceState
|
|
11
|
-
from content_core.common.exceptions import NoTranscriptFound
|
|
12
|
-
from content_core.config import CONFIG
|
|
13
|
-
from content_core.logging import logger
|
|
14
|
-
|
|
15
|
-
ssl._create_default_https_context = ssl._create_unverified_context
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
async def get_video_title(video_id):
|
|
19
|
-
try:
|
|
20
|
-
url = f"https://www.youtube.com/watch?v={video_id}"
|
|
21
|
-
async with aiohttp.ClientSession() as session:
|
|
22
|
-
async with session.get(url) as response:
|
|
23
|
-
html = await response.text()
|
|
24
|
-
|
|
25
|
-
# BeautifulSoup doesn't support async operations
|
|
26
|
-
soup = BeautifulSoup(html, "html.parser")
|
|
27
|
-
|
|
28
|
-
# YouTube stores title in a meta tag
|
|
29
|
-
title = soup.find("meta", property="og:title")["content"]
|
|
30
|
-
return title
|
|
31
|
-
|
|
32
|
-
except Exception as e:
|
|
33
|
-
logger.error(f"Failed to get video title: {e}")
|
|
34
|
-
return None
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
async def _extract_youtube_id(url):
|
|
38
|
-
"""
|
|
39
|
-
Extract the YouTube video ID from a given URL using regular expressions.
|
|
40
|
-
|
|
41
|
-
Args:
|
|
42
|
-
url (str): The YouTube URL from which to extract the video ID.
|
|
43
|
-
|
|
44
|
-
Returns:
|
|
45
|
-
str: The extracted YouTube video ID or None if no valid ID is found.
|
|
46
|
-
"""
|
|
47
|
-
# Define a regular expression pattern to capture the YouTube video ID
|
|
48
|
-
youtube_regex = (
|
|
49
|
-
r"(?:https?://)?" # Optional scheme
|
|
50
|
-
r"(?:www\.)?" # Optional www.
|
|
51
|
-
r"(?:"
|
|
52
|
-
r"youtu\.be/" # Shortened URL
|
|
53
|
-
r"|youtube\.com" # Main URL
|
|
54
|
-
r"(?:" # Group start
|
|
55
|
-
r"/embed/" # Embed URL
|
|
56
|
-
r"|/v/" # Older video URL
|
|
57
|
-
r"|/watch\?v=" # Standard watch URL
|
|
58
|
-
r"|/watch\?.+&v=" # Other watch URL
|
|
59
|
-
r")" # Group end
|
|
60
|
-
r")" # End main group
|
|
61
|
-
r"([\w-]{11})" # 11 characters (YouTube video ID)
|
|
62
|
-
)
|
|
63
|
-
|
|
64
|
-
# Search the URL for the pattern
|
|
65
|
-
match = re.search(youtube_regex, url)
|
|
66
|
-
|
|
67
|
-
# Return the video ID if a match is found
|
|
68
|
-
return match.group(1) if match else None
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
async def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
|
|
72
|
-
max_attempts = 5
|
|
73
|
-
for attempt in range(max_attempts):
|
|
74
|
-
try:
|
|
75
|
-
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
|
|
76
|
-
|
|
77
|
-
# First try: Manual transcripts in preferred languages
|
|
78
|
-
manual_transcripts = []
|
|
79
|
-
try:
|
|
80
|
-
for transcript in transcript_list:
|
|
81
|
-
if not transcript.is_generated and not transcript.is_translatable:
|
|
82
|
-
manual_transcripts.append(transcript)
|
|
83
|
-
|
|
84
|
-
if manual_transcripts:
|
|
85
|
-
# Sort based on preferred language order
|
|
86
|
-
for lang in preferred_langs:
|
|
87
|
-
for transcript in manual_transcripts:
|
|
88
|
-
if transcript.language_code == lang:
|
|
89
|
-
return transcript.fetch()
|
|
90
|
-
# If no preferred language found, return first manual transcript
|
|
91
|
-
return manual_transcripts[0].fetch()
|
|
92
|
-
except NoTranscriptFound:
|
|
93
|
-
pass
|
|
94
|
-
|
|
95
|
-
# Second try: Auto-generated transcripts in preferred languages
|
|
96
|
-
generated_transcripts = []
|
|
97
|
-
try:
|
|
98
|
-
for transcript in transcript_list:
|
|
99
|
-
if transcript.is_generated and not transcript.is_translatable:
|
|
100
|
-
generated_transcripts.append(transcript)
|
|
101
|
-
|
|
102
|
-
if generated_transcripts:
|
|
103
|
-
# Sort based on preferred language order
|
|
104
|
-
for lang in preferred_langs:
|
|
105
|
-
for transcript in generated_transcripts:
|
|
106
|
-
if transcript.language_code == lang:
|
|
107
|
-
return transcript.fetch()
|
|
108
|
-
# If no preferred language found, return first generated transcript
|
|
109
|
-
return generated_transcripts[0].fetch()
|
|
110
|
-
except NoTranscriptFound:
|
|
111
|
-
pass
|
|
112
|
-
|
|
113
|
-
# Last try: Translated transcripts in preferred languages
|
|
114
|
-
translated_transcripts = []
|
|
115
|
-
try:
|
|
116
|
-
for transcript in transcript_list:
|
|
117
|
-
if transcript.is_translatable:
|
|
118
|
-
translated_transcripts.append(transcript)
|
|
119
|
-
|
|
120
|
-
if translated_transcripts:
|
|
121
|
-
# Sort based on preferred language order
|
|
122
|
-
for lang in preferred_langs:
|
|
123
|
-
for transcript in translated_transcripts:
|
|
124
|
-
if transcript.language_code == lang:
|
|
125
|
-
return transcript.fetch()
|
|
126
|
-
# If no preferred language found, return translation to first preferred language
|
|
127
|
-
translation = translated_transcripts[0].translate(
|
|
128
|
-
preferred_langs[0]
|
|
129
|
-
)
|
|
130
|
-
return translation.fetch()
|
|
131
|
-
except NoTranscriptFound:
|
|
132
|
-
pass
|
|
133
|
-
|
|
134
|
-
raise Exception("No suitable transcript found")
|
|
135
|
-
|
|
136
|
-
except Exception as e:
|
|
137
|
-
if e.__class__.__name__ == "ParserError":
|
|
138
|
-
logger.warning(
|
|
139
|
-
f"ParserError on attempt {attempt+1}/5 for video {video_id}. Retrying..."
|
|
140
|
-
)
|
|
141
|
-
if attempt == max_attempts - 1:
|
|
142
|
-
logger.error(
|
|
143
|
-
f"Failed to get transcript for video {video_id} after {max_attempts} attempts due to repeated ParserError."
|
|
144
|
-
)
|
|
145
|
-
return None
|
|
146
|
-
await asyncio.sleep(2)
|
|
147
|
-
continue
|
|
148
|
-
else:
|
|
149
|
-
logger.error(f"Failed to get transcript for video {video_id}: {e}")
|
|
150
|
-
return None
|
|
151
|
-
return None
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
async def extract_youtube_transcript(state: ProcessSourceState):
|
|
155
|
-
"""
|
|
156
|
-
Parse the text file and print its content.
|
|
157
|
-
"""
|
|
158
|
-
|
|
159
|
-
assert state.url, "No URL provided"
|
|
160
|
-
logger.warning(f"Extracting transcript from URL: {state.url}")
|
|
161
|
-
languages = CONFIG.get("youtube_transcripts", {}).get(
|
|
162
|
-
"preferred_languages", ["en", "es", "pt"]
|
|
163
|
-
)
|
|
164
|
-
|
|
165
|
-
video_id = await _extract_youtube_id(state.url)
|
|
166
|
-
transcript = await get_best_transcript(video_id, languages)
|
|
167
|
-
|
|
168
|
-
logger.debug(f"Found transcript: {transcript}")
|
|
169
|
-
formatter = TextFormatter()
|
|
170
|
-
try:
|
|
171
|
-
title = await get_video_title(video_id)
|
|
172
|
-
except Exception as e:
|
|
173
|
-
logger.critical(f"Failed to get video title for video_id: {video_id}")
|
|
174
|
-
logger.exception(e)
|
|
175
|
-
title = ""
|
|
176
|
-
|
|
177
|
-
try:
|
|
178
|
-
formatted_content = formatter.format_transcript(transcript)
|
|
179
|
-
except Exception as e:
|
|
180
|
-
logger.critical(f"Failed to format transcript for video_id: {video_id}")
|
|
181
|
-
logger.exception(e)
|
|
182
|
-
formatted_content = ""
|
|
183
|
-
|
|
184
|
-
try:
|
|
185
|
-
transcript_raw = transcript.to_raw_data()
|
|
186
|
-
except Exception as e:
|
|
187
|
-
logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
|
|
188
|
-
logger.exception(e)
|
|
189
|
-
transcript_raw = ""
|
|
190
|
-
|
|
191
|
-
return {
|
|
192
|
-
"content": formatted_content,
|
|
193
|
-
"title": title,
|
|
194
|
-
"metadata": {"video_id": video_id, "transcript": transcript_raw},
|
|
195
|
-
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{content_core-1.0.1 → content_core-1.0.3}/src/content_core/content/identification/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|