content-core 1.0.0__tar.gz → 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- {content_core-1.0.0 → content_core-1.0.1}/PKG-INFO +1 -1
- {content_core-1.0.0 → content_core-1.0.1}/pyproject.toml +1 -1
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/processors/youtube.py +80 -62
- {content_core-1.0.0 → content_core-1.0.1}/uv.lock +1 -1
- {content_core-1.0.0 → content_core-1.0.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/.github/workflows/publish.yml +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/.gitignore +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/.python-version +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/CONTRIBUTING.md +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/LICENSE +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/Makefile +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/README.md +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/docs/processors.md +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/docs/usage.md +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/prompts/content/cleanup.jinja +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/prompts/content/summarize.jinja +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/__init__.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/cc_config.yaml +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/common/__init__.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/common/exceptions.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/common/state.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/common/types.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/common/utils.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/config.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/content/__init__.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/content/cleanup/core.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/content/extraction/graph.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/content/identification/__init__.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/content/summary/core.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/logging.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/models.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/models_config.yaml +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/notebooks/run.ipynb +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/processors/audio.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/processors/docling.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/processors/office.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/processors/pdf.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/processors/text.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/processors/url.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/processors/video.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/py.typed +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/templated_message.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/tools/__init__.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/tools/cleanup.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/tools/extract.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/src/content_core/tools/summarize.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/tests/input_content/file.docx +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/tests/input_content/file.epub +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/tests/input_content/file.md +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/tests/input_content/file.mp3 +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/tests/input_content/file.mp4 +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/tests/input_content/file.pdf +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/tests/input_content/file.pptx +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/tests/input_content/file.txt +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/tests/input_content/file.xlsx +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/tests/input_content/file_audio.mp3 +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/tests/integration/test_cli.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/tests/integration/test_extraction.py +0 -0
- {content_core-1.0.0 → content_core-1.0.1}/tests/unit/test_docling.py +0 -0
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
import re
|
|
2
3
|
import ssl
|
|
3
4
|
|
|
@@ -68,69 +69,86 @@ async def _extract_youtube_id(url):
|
|
|
68
69
|
|
|
69
70
|
|
|
70
71
|
def _match_preferred_language(candidates, preferred_langs):
    """Return the candidate matching the earliest preferred language.

    Languages are tried in the order given by ``preferred_langs``; for each
    one, the first candidate whose ``language_code`` equals it wins.
    Returns ``None`` when no candidate matches any preferred language.
    """
    for lang in preferred_langs:
        for transcript in candidates:
            if transcript.language_code == lang:
                return transcript
    return None


async def get_best_transcript(video_id, preferred_langs=("en", "es", "pt")):
    """Fetch the most suitable transcript available for a YouTube video.

    Transcripts listed by ``YouTubeTranscriptApi.list_transcripts`` are
    tried in three tiers, each ordered by ``preferred_langs``:

    1. transcripts that are neither generated nor translatable,
    2. generated transcripts that are not translatable,
    3. translatable transcripts; when none is in a preferred language the
       first one is translated to ``preferred_langs[0]``.

    Within tiers 1 and 2, if no candidate is in a preferred language the
    first candidate of the tier is used.

    Args:
        video_id: Identifier passed to ``YouTubeTranscriptApi.list_transcripts``.
        preferred_langs: Language codes in priority order. It is only
            iterated and indexed, so any sequence works.

    Returns:
        Whatever ``fetch()`` returns for the selected transcript, or
        ``None`` when no transcript could be obtained. Failures are logged
        rather than raised.
    """
    max_attempts = 5
    retry_delay_seconds = 2

    for attempt in range(max_attempts):
        try:
            # Materialise once so every tier filters the same snapshot.
            transcripts = list(YouTubeTranscriptApi.list_transcripts(video_id))

            # Tiers 1 and 2 share the same selection rule.
            plain_tiers = (
                [
                    t
                    for t in transcripts
                    if not t.is_generated and not t.is_translatable
                ],
                [
                    t
                    for t in transcripts
                    if t.is_generated and not t.is_translatable
                ],
            )
            for candidates in plain_tiers:
                if not candidates:
                    continue
                try:
                    chosen = _match_preferred_language(candidates, preferred_langs)
                    if chosen is None:
                        # No preferred language in this tier: take the first.
                        chosen = candidates[0]
                    return chosen.fetch()
                except NoTranscriptFound:
                    # This tier could not be fetched; fall through to the next.
                    continue

            # Tier 3: translatable transcripts.
            translatable = [t for t in transcripts if t.is_translatable]
            if translatable:
                try:
                    chosen = _match_preferred_language(translatable, preferred_langs)
                    if chosen is None:
                        chosen = translatable[0]
                        # With no preferences there is no translation target,
                        # so the first transcript is fetched untranslated
                        # instead of failing on preferred_langs[0].
                        if preferred_langs:
                            chosen = chosen.translate(preferred_langs[0])
                    return chosen.fetch()
                except NoTranscriptFound:
                    pass

            # Nothing usable in any tier. Log directly instead of raising a
            # generic Exception just to land in the handler below.
            logger.error(
                f"Failed to get transcript for video {video_id}: No suitable transcript found"
            )
            return None

        except Exception as e:
            # The error is matched by class name because the ParserError
            # class is not among the names this function references.
            # NOTE(review): presumably a transient parsing failure raised by
            # the transcript library - confirm which package defines it.
            if e.__class__.__name__ != "ParserError":
                logger.error(f"Failed to get transcript for video {video_id}: {e}")
                return None

            if attempt == max_attempts - 1:
                logger.error(
                    f"Failed to get transcript for video {video_id} after {max_attempts} attempts due to repeated ParserError."
                )
                return None

            # Only announce a retry when one will actually happen.
            logger.warning(
                f"ParserError on attempt {attempt + 1}/{max_attempts} for video {video_id}. Retrying..."
            )
            await asyncio.sleep(retry_delay_seconds)

    return None
|
|
134
152
|
|
|
135
153
|
|
|
136
154
|
async def extract_youtube_transcript(state: ProcessSourceState):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{content_core-1.0.0 → content_core-1.0.1}/src/content_core/content/identification/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|