content-core 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- content_core/__init__.py +216 -0
- content_core/cc_config.yaml +86 -0
- content_core/common/__init__.py +38 -0
- content_core/common/exceptions.py +70 -0
- content_core/common/retry.py +325 -0
- content_core/common/state.py +64 -0
- content_core/common/types.py +15 -0
- content_core/common/utils.py +31 -0
- content_core/config.py +575 -0
- content_core/content/__init__.py +6 -0
- content_core/content/cleanup/__init__.py +5 -0
- content_core/content/cleanup/core.py +15 -0
- content_core/content/extraction/__init__.py +13 -0
- content_core/content/extraction/graph.py +252 -0
- content_core/content/identification/__init__.py +9 -0
- content_core/content/identification/file_detector.py +505 -0
- content_core/content/summary/__init__.py +5 -0
- content_core/content/summary/core.py +15 -0
- content_core/logging.py +15 -0
- content_core/mcp/__init__.py +5 -0
- content_core/mcp/server.py +214 -0
- content_core/models.py +60 -0
- content_core/models_config.yaml +31 -0
- content_core/notebooks/run.ipynb +359 -0
- content_core/notebooks/urls.ipynb +154 -0
- content_core/processors/audio.py +272 -0
- content_core/processors/docling.py +79 -0
- content_core/processors/office.py +331 -0
- content_core/processors/pdf.py +292 -0
- content_core/processors/text.py +36 -0
- content_core/processors/url.py +324 -0
- content_core/processors/video.py +166 -0
- content_core/processors/youtube.py +262 -0
- content_core/py.typed +2 -0
- content_core/templated_message.py +70 -0
- content_core/tools/__init__.py +9 -0
- content_core/tools/cleanup.py +15 -0
- content_core/tools/extract.py +21 -0
- content_core/tools/summarize.py +17 -0
- content_core-1.10.0.dist-info/METADATA +742 -0
- content_core-1.10.0.dist-info/RECORD +44 -0
- content_core-1.10.0.dist-info/WHEEL +4 -0
- content_core-1.10.0.dist-info/entry_points.txt +5 -0
- content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
content_core/notebooks/urls.ipynb
@@ -0,0 +1,154 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "873a872b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from content_core.content.extraction import extract_content\n",
+    "\n",
+    "async def process_url(url):\n",
+    "    print(\"Processing: \", url)\n",
+    "    print(\"Simple: -------\")\n",
+    "    result = await extract_content(dict(url=url, engine=\"simple\"))\n",
+    "    print(result.title[:100])\n",
+    "    print(result.content[:100])\n",
+    "    print(\"Jina: -------\")\n",
+    "    result = await extract_content(dict(url=url, engine=\"jina\"))\n",
+    "    print(result.title[:100])\n",
+    "    print(result.content[:100])\n",
+    "    print(\"Firecrawl: -------\")\n",
+    "    result = await extract_content(dict(url=url, engine=\"firecrawl\"))\n",
+    "    print(result.title[:100])\n",
+    "    print(result.content[:100])\n",
+    "    print(\"=============================\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "263dc3af",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Processing: https://www.supernovalabs.com.br/\n",
+      "Simple: -------\n",
+      "Readability failed: No content extracted by readability\n",
+      "Supernova Labs | AI Consulting\n",
+      "Supernova Labs | AI Consulting\n",
+      "Jina: -------\n",
+      "Supernova Labs | Elite AI Consulting to help you build the Future\n",
+      "URL Source: https://www.supernovalabs.com.br/\n",
+      "\n",
+      "Markdown Content:\n",
+      "Supernova Labs\n",
+      "\n",
+      "[About](https://www\n",
+      "Firecrawl: -------\n",
+      "Supernova Labs | AI Consulting\n",
+      "# Unleash Your AI Edge. Fast.\n",
+      "\n",
+      "We turn your data, tech and capabilities into impact with lean AI sol\n",
+      "=============================\n",
+      "None\n",
+      "Processing: https://building.nubank.com/fine-tuning-transaction-user-models/\n",
+      "Simple: -------\n",
+      "Fine-Tuning Transaction User Models - Building Nubank\n",
+      "Fine-Tuning Transaction User Models Learn how we combine transaction embeddings with tabular data us\n",
+      "Jina: -------\n",
+      "Fine-Tuning Transaction User Models - Building Nubank\n",
+      "URL Source: https://building.nubank.com/fine-tuning-transaction-user-models/\n",
+      "\n",
+      "Published Time: 2025-0\n",
+      "Firecrawl: -------\n",
+      "Fine-Tuning Transaction User Models - Building Nubank\n",
+      "# Fine-Tuning Transaction User Models\n",
+      "\n",
+      "Learn how we combine transaction embeddings with tabular data\n",
+      "=============================\n",
+      "None\n",
+      "Processing: https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quitting-your-job-5296bbcb703b\n",
+      "Simple: -------\n",
+      "You Can Make Money With AI Without Quitting Your Job | by Nipuna Maduranga | LearnAIforproft.com | M\n",
+      "Most people think they need to quit their job to build a new life. I thought that too. You scroll th\n",
+      "Jina: -------\n",
+      "You Can Make Money With AI Without Quitting Your Job\n",
+      "URL Source: https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quittin\n",
+      "Firecrawl: -------\n",
+      "You Can Make Money With AI Without Quitting Your Job | by Nipuna Maduranga | LearnAIforproft.com | M\n",
+      "[Sitemap](https://medium.com/sitemap/sitemap.xml)\n",
+      "\n",
+      "[Open in app](https://rsci.app.link/?%24canonical\n",
+      "=============================\n",
+      "None\n",
+      "Processing: https://github.com/mirkonasato/pyodconverter\n",
+      "Simple: -------\n",
+      "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
+      "This repository was archived by the owner on Dec 1, 2023. It is now read-only. mirkonasato/pyodconve\n",
+      "Jina: -------\n",
+      "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
+      "URL Source: https://github.com/mirkonasato/pyodconverter\n",
+      "\n",
+      "Markdown Content:\n",
+      "GitHub - mirkonasato/pyo\n",
+      "Firecrawl: -------\n",
+      "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
+      "[Skip to content](https://github.com/mirkonasato/pyodconverter#start-of-content)\n",
+      "\n",
+      "You signed in with\n",
+      "=============================\n",
+      "None\n",
+      "Processing: https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR\n",
+      "Simple: -------\n",
+      "Error processing URL https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR: HTTP error: 500\n",
+      "Error\n",
+      "Failed to extract content: HTTP error: 500\n",
+      "Jina: -------\n",
+      "Ultra-aprendizado: domine habilidades valiosas, seja mais esperto que a competição e dê um impulso n\n",
+      "URL Source: https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-ca\n",
+      "Firecrawl: -------\n",
+      "Amazon.com.br\n",
+      "#### Digite os caracteres que você vê abaixo\n",
+      "\n",
+      "Desculpe pelo inconveniente. Para continuar realizando\n",
+      "=============================\n",
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "urls= [\"https://www.supernovalabs.com.br/\", \"https://building.nubank.com/fine-tuning-transaction-user-models/\", \"https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quitting-your-job-5296bbcb703b\", \"https://github.com/mirkonasato/pyodconverter\", \"https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR\"]\n",
+    "for url in urls:\n",
+    "    result = await process_url(url=url)\n",
+    "    print(result)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
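The notebook above compares the `simple`, `jina`, and `firecrawl` engines by calling `extract_content` on each URL and printing the first 100 characters of the returned `title` and `content`. The same comparison can be run outside Jupyter; the sketch below is a minimal adaptation that assumes only what the cells show: the `extract_content` coroutine, a dict input with `url` and `engine`, and `title`/`content` attributes on the result.

```python
# Minimal sketch of the notebook's engine comparison as a standalone script.
# Assumes only what the notebook shows: extract_content(dict(url=..., engine=...))
# is a coroutine returning an object with .title and .content attributes.
import asyncio

from content_core.content.extraction import extract_content


async def compare_engines(url: str) -> None:
    for engine in ("simple", "jina", "firecrawl"):
        result = await extract_content(dict(url=url, engine=engine))
        print(f"{engine}: {result.title[:100]}")
        print(result.content[:100])


if __name__ == "__main__":
    # Example URL taken from the notebook's own list.
    asyncio.run(compare_engines("https://www.supernovalabs.com.br/"))
```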
content_core/processors/audio.py
@@ -0,0 +1,272 @@
+import asyncio
+import math
+import os
+import tempfile
+import traceback
+from functools import partial
+
+from moviepy import AudioFileClip
+
+from content_core.common import ProcessSourceState
+from content_core.common.retry import retry_audio_transcription
+from content_core.config import get_audio_concurrency, get_proxy
+from content_core.logging import logger
+
+
+async def split_audio(input_file, segment_length_minutes=15, output_prefix=None):
+    """
+    Split an audio file into segments asynchronously.
+    """
+
+    def _split(input_file, segment_length_minutes, output_prefix):
+        # Convert input file to absolute path
+        input_file_abs = os.path.abspath(input_file)
+        output_dir = os.path.dirname(input_file_abs)
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Set up output prefix
+        if output_prefix is None:
+            output_prefix = os.path.splitext(os.path.basename(input_file_abs))[0]
+
+        # Load the audio file
+        audio = AudioFileClip(input_file_abs)
+
+        # Calculate segment length in seconds
+        segment_length_s = segment_length_minutes * 60
+
+        # Calculate number of segments
+        total_segments = math.ceil(audio.duration / segment_length_s)
+        logger.debug(f"Splitting file: {input_file_abs} into {total_segments} segments")
+
+        output_files = []
+
+        # Split the audio into segments
+        for i in range(total_segments):
+            start_time = i * segment_length_s
+            end_time = min((i + 1) * segment_length_s, audio.duration)
+
+            # Extract segment
+            output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
+            output_path = os.path.join(output_dir, output_filename)
+
+            # Export segment
+            extract_audio(input_file_abs, output_path, start_time, end_time)
+
+            output_files.append(output_path)
+
+            logger.debug(
+                f"Exported segment {i + 1}/{total_segments}: {output_filename}"
+            )
+
+        return output_files
+
+    # Run CPU-bound audio processing in thread pool
+    return await asyncio.get_event_loop().run_in_executor(
+        None, partial(_split, input_file, segment_length_minutes, output_prefix)
+    )
+
+
+def extract_audio(
+    input_file: str, output_file: str, start_time: float = None, end_time: float = None
+) -> None:
+    """
+    Extract audio from a video or audio file and save it as an MP3 file.
+    If start_time and end_time are provided, only that segment of audio is extracted.
+
+    Args:
+        input_file (str): Path to the input video or audio file.
+        output_file (str): Path where the output MP3 file will be saved.
+        start_time (float, optional): Start time of the audio segment in seconds. Defaults to None.
+        end_time (float, optional): End time of the audio segment in seconds. Defaults to None.
+    """
+    try:
+        # Load the file as an AudioFileClip
+        audio_clip = AudioFileClip(input_file)
+
+        # If start_time and/or end_time are provided, trim the audio using subclipped
+        if start_time is not None and end_time is not None:
+            audio_clip = audio_clip.subclipped(start_time, end_time)
+        elif start_time is not None:
+            audio_clip = audio_clip.subclipped(start_time)
+        elif end_time is not None:
+            audio_clip = audio_clip.subclipped(0, end_time)
+
+        # Export the audio as MP3
+        audio_clip.write_audiofile(output_file, codec="mp3")
+        audio_clip.close()
+    except Exception as e:
+        logger.error(f"Error extracting audio: {str(e)}")
+        raise
+
+
+@retry_audio_transcription()
+async def _transcribe_segment(audio_file, model):
+    """Internal function to transcribe a single segment - wrapped with retry logic."""
+    return (await model.atranscribe(audio_file)).text
+
+
+async def transcribe_audio_segment(audio_file, model, semaphore):
+    """
+    Transcribe a single audio segment asynchronously with concurrency control and retry logic.
+
+    This function uses a semaphore to limit the number of concurrent transcriptions,
+    preventing API rate limits while allowing parallel processing for improved performance.
+    Includes retry logic for transient API failures.
+
+    Args:
+        audio_file (str): Path to the audio file segment to transcribe
+        model: Speech-to-text model instance with atranscribe() method
+        semaphore (asyncio.Semaphore): Semaphore to control concurrency
+
+    Returns:
+        str: Transcribed text from the audio segment
+
+    Note:
+        Multiple instances of this function can run concurrently, but the semaphore
+        ensures that no more than N transcriptions happen simultaneously, where N
+        is configured via get_audio_concurrency() (default: 3, range: 1-10).
+    """
+    async with semaphore:
+        return await _transcribe_segment(audio_file, model)
+
+
+async def extract_audio_data(data: ProcessSourceState):
+    """
+    Extract and transcribe audio from a file with automatic segmentation and parallel processing.
+
+    This function handles the complete audio processing pipeline:
+    1. Splits long audio files (>10 minutes) into segments
+    2. Transcribes segments in parallel using configurable concurrency
+    3. Joins transcriptions in correct order
+
+    For files longer than 10 minutes, segments are processed concurrently with a
+    configurable concurrency limit to balance performance and API rate limits.
+
+    Args:
+        data (ProcessSourceState): State object containing file_path to audio/video file
+
+    Returns:
+        dict: Dictionary containing:
+            - metadata: Information about processed segments count
+            - content: Complete transcribed text
+
+    Configuration:
+        Concurrency is controlled via:
+        - Environment variable: CCORE_AUDIO_CONCURRENCY (1-10, default: 3)
+        - YAML config: extraction.audio.concurrency
+
+    Raises:
+        Exception: If audio extraction or transcription fails
+    """
+    input_audio_path = data.file_path
+    audio = None
+
+    try:
+        # Use TemporaryDirectory context manager for automatic cleanup
+        with tempfile.TemporaryDirectory() as temp_dir:
+            output_prefix = os.path.splitext(os.path.basename(input_audio_path))[0]
+            output_dir = temp_dir
+
+            # Split audio into segments if longer than 10 minutes
+            audio = AudioFileClip(input_audio_path)
+            duration_s = audio.duration
+            segment_length_s = 10 * 60  # 10 minutes in seconds
+            output_files = []
+
+            if duration_s > segment_length_s:
+                logger.info(
+                    f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments"
+                )
+                for i in range(math.ceil(duration_s / segment_length_s)):
+                    start_time = i * segment_length_s
+                    end_time = min((i + 1) * segment_length_s, audio.duration)
+
+                    # Extract segment
+                    output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
+                    output_path = os.path.join(output_dir, output_filename)
+
+                    extract_audio(input_audio_path, output_path, start_time, end_time)
+
+                    output_files.append(output_path)
+            else:
+                output_files = [input_audio_path]
+
+            # Close audio clip after determining segments
+            if audio:
+                audio.close()
+                audio = None
+
+            # Transcribe audio files in parallel with concurrency limit
+            from content_core.config import CONFIG
+            from content_core.models import ModelFactory
+            from esperanto import AIFactory
+
+            # Determine which model to use based on state parameters
+            if data.audio_provider and data.audio_model:
+                # Custom model provided - create new instance
+                try:
+                    logger.info(
+                        f"Using custom audio model: {data.audio_provider}/{data.audio_model}"
+                    )
+                    # Get timeout from config (same as default model) or use fallback
+                    timeout = CONFIG.get('speech_to_text', {}).get('timeout', 3600)
+                    stt_config = {'timeout': timeout} if timeout else {}
+                    # Add proxy to config if configured
+                    current_proxy = get_proxy(data.proxy)
+                    if current_proxy:
+                        stt_config['proxy'] = current_proxy
+                    speech_to_text_model = AIFactory.create_speech_to_text(
+                        data.audio_provider, data.audio_model, stt_config
+                    )
+                except Exception as e:
+                    logger.error(
+                        f"Failed to create custom audio model '{data.audio_provider}/{data.audio_model}': {e}. "
+                        f"Check that the provider and model are supported by Esperanto. "
+                        f"Falling back to default model."
+                    )
+                    speech_to_text_model = ModelFactory.get_model("speech_to_text")
+            elif data.audio_provider or data.audio_model:
+                # Only one parameter provided - log warning and use default
+                missing = "audio_model" if data.audio_provider else "audio_provider"
+                provided = "audio_provider" if data.audio_provider else "audio_model"
+                logger.warning(
+                    f"{provided} provided without {missing}. "
+                    f"Both audio_provider and audio_model must be specified together. "
+                    f"Falling back to default model."
+                )
+                speech_to_text_model = ModelFactory.get_model("speech_to_text")
+            else:
+                # No custom parameters - use default (backward compatible)
+                speech_to_text_model = ModelFactory.get_model("speech_to_text")
+
+            concurrency = get_audio_concurrency()
+            semaphore = asyncio.Semaphore(concurrency)
+
+            logger.debug(
+                f"Transcribing {len(output_files)} audio segments with concurrency limit of {concurrency}"
+            )
+
+            # Create tasks for parallel transcription
+            transcription_tasks = [
+                transcribe_audio_segment(audio_file, speech_to_text_model, semaphore)
+                for audio_file in output_files
+            ]
+
+            # Execute all transcriptions concurrently (limited by semaphore)
+            transcriptions = await asyncio.gather(*transcription_tasks)
+
+            return {
+                "metadata": {"segments_count": len(output_files)},
+                "content": " ".join(transcriptions),
+            }
+    except Exception as e:
+        logger.error(f"Error processing audio: {str(e)}")
+        logger.error(traceback.format_exc())
+        raise
+    finally:
+        # Ensure audio clip is closed even if an error occurs
+        if audio:
+            try:
+                audio.close()
+            except Exception:
+                pass
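In audio.py, `extract_audio_data` is the entry point: it reads `file_path` (and optionally `audio_provider`/`audio_model`) from a `ProcessSourceState`, splits anything longer than ten minutes with `extract_audio`, and transcribes the segments concurrently under the semaphore sized by `get_audio_concurrency()`. Below is a minimal sketch of driving it directly; it assumes `ProcessSourceState` can be constructed with these keyword arguments (only the import appears in this diff, not its definition), and the provider/model names are illustrative placeholders.

```python
# Minimal sketch, not part of the package: call the audio pipeline directly.
# Assumption: ProcessSourceState accepts file_path / audio_provider / audio_model
# as keyword arguments (its definition is not shown in this diff). Provider and
# model names below are placeholders; both must be given together or the default
# speech-to-text model is used. Concurrency can also be tuned via the
# CCORE_AUDIO_CONCURRENCY environment variable (1-10, default 3) per the docstring.
import asyncio

from content_core.common import ProcessSourceState
from content_core.processors.audio import extract_audio_data

state = ProcessSourceState(
    file_path="meeting_recording.mp3",  # local audio or video file
    audio_provider="openai",            # placeholder provider
    audio_model="whisper-1",            # placeholder model
)

result = asyncio.run(extract_audio_data(state))
print(result["metadata"]["segments_count"])
print(result["content"][:200])
```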
content_core/processors/docling.py
@@ -0,0 +1,79 @@
+"""
+Docling-based document extraction processor.
+"""
+
+from content_core.common.state import ProcessSourceState
+from content_core.config import CONFIG
+
+DOCLING_AVAILABLE = False
+try:
+    from docling.document_converter import DocumentConverter
+    DOCLING_AVAILABLE = True
+except ImportError:
+
+    class DocumentConverter:
+        """Stub when docling is not installed."""
+
+        def __init__(self):
+            raise ImportError(
+                "Docling not installed. Install with: pip install content-core[docling] "
+                "or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
+            )
+
+        def convert(self, source: str):
+            raise ImportError(
+                "Docling not installed. Install with: pip install content-core[docling] "
+                "or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
+            )
+
+# Supported MIME types for Docling extraction
+DOCLING_SUPPORTED = {
+    "application/pdf",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+    "text/markdown",
+    # "text/plain", #docling currently not supporting txt
+    "text/x-markdown",
+    "text/csv",
+    "text/html",
+    "image/png",
+    "image/jpeg",
+    "image/tiff",
+    "image/bmp",
+}
+
+
+async def extract_with_docling(state: ProcessSourceState) -> ProcessSourceState:
+    """
+    Use Docling to parse files, URLs, or content into the desired format.
+    """
+    # Initialize Docling converter
+    converter = DocumentConverter()
+
+    # Determine source: file path, URL, or direct content
+    source = state.file_path or state.url or state.content
+    if not source:
+        raise ValueError("No input provided for Docling extraction.")
+
+    # Convert document
+    result = converter.convert(source)
+    doc = result.document
+
+    # Determine output format (per execution override, metadata, then config)
+    cfg_fmt = (
+        CONFIG.get("extraction", {}).get("docling", {}).get("output_format", "markdown")
+    )
+    fmt = state.output_format or state.metadata.get("docling_format") or cfg_fmt
+    # Record the format used
+    state.metadata["docling_format"] = fmt
+    if fmt == "html":
+        output = doc.export_to_html()
+    elif fmt == "json":
+        output = doc.export_to_json()
+    else:
+        output = doc.export_to_markdown()
+
+    # Update state
+    state.content = output
+    return state
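`extract_with_docling` resolves its source from `file_path`, `url`, or `content` on the state, and its output format from `state.output_format`, then `state.metadata["docling_format"]`, then the `extraction.docling.output_format` config key. It needs the docling extra (`pip install content-core[docling]`); otherwise the stub `DocumentConverter` above raises ImportError. A minimal sketch of converting a local PDF to Markdown follows, again assuming `ProcessSourceState` accepts these fields as keyword arguments; the file path is only an illustration.

```python
# Minimal sketch, assuming ProcessSourceState can be built with file_path and
# output_format keyword arguments; the path below is only an illustration.
import asyncio

from content_core.common.state import ProcessSourceState
from content_core.processors.docling import extract_with_docling

state = ProcessSourceState(file_path="report.pdf", output_format="markdown")
state = asyncio.run(extract_with_docling(state))

print(state.metadata["docling_format"])  # format actually used for export
print(state.content[:200])               # Markdown produced by Docling
```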