content-core 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. content_core/__init__.py +216 -0
  2. content_core/cc_config.yaml +86 -0
  3. content_core/common/__init__.py +38 -0
  4. content_core/common/exceptions.py +70 -0
  5. content_core/common/retry.py +325 -0
  6. content_core/common/state.py +64 -0
  7. content_core/common/types.py +15 -0
  8. content_core/common/utils.py +31 -0
  9. content_core/config.py +575 -0
  10. content_core/content/__init__.py +6 -0
  11. content_core/content/cleanup/__init__.py +5 -0
  12. content_core/content/cleanup/core.py +15 -0
  13. content_core/content/extraction/__init__.py +13 -0
  14. content_core/content/extraction/graph.py +252 -0
  15. content_core/content/identification/__init__.py +9 -0
  16. content_core/content/identification/file_detector.py +505 -0
  17. content_core/content/summary/__init__.py +5 -0
  18. content_core/content/summary/core.py +15 -0
  19. content_core/logging.py +15 -0
  20. content_core/mcp/__init__.py +5 -0
  21. content_core/mcp/server.py +214 -0
  22. content_core/models.py +60 -0
  23. content_core/models_config.yaml +31 -0
  24. content_core/notebooks/run.ipynb +359 -0
  25. content_core/notebooks/urls.ipynb +154 -0
  26. content_core/processors/audio.py +272 -0
  27. content_core/processors/docling.py +79 -0
  28. content_core/processors/office.py +331 -0
  29. content_core/processors/pdf.py +292 -0
  30. content_core/processors/text.py +36 -0
  31. content_core/processors/url.py +324 -0
  32. content_core/processors/video.py +166 -0
  33. content_core/processors/youtube.py +262 -0
  34. content_core/py.typed +2 -0
  35. content_core/templated_message.py +70 -0
  36. content_core/tools/__init__.py +9 -0
  37. content_core/tools/cleanup.py +15 -0
  38. content_core/tools/extract.py +21 -0
  39. content_core/tools/summarize.py +17 -0
  40. content_core-1.10.0.dist-info/METADATA +742 -0
  41. content_core-1.10.0.dist-info/RECORD +44 -0
  42. content_core-1.10.0.dist-info/WHEEL +4 -0
  43. content_core-1.10.0.dist-info/entry_points.txt +5 -0
  44. content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,154 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "873a872b",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from content_core.content.extraction import extract_content\n",
11
+ "\n",
12
+ "async def process_url(url):\n",
13
+ " print(\"Processing: \", url)\n",
14
+ " print(\"Simple: -------\")\n",
15
+ " result = await extract_content(dict(url=url, engine=\"simple\"))\n",
16
+ " print(result.title[:100])\n",
17
+ " print(result.content[:100])\n",
18
+ " print(\"Jina: -------\")\n",
19
+ " result = await extract_content(dict(url=url, engine=\"jina\"))\n",
20
+ " print(result.title[:100])\n",
21
+ " print(result.content[:100])\n",
22
+ " print(\"Firecrawl: -------\")\n",
23
+ " result = await extract_content(dict(url=url, engine=\"firecrawl\"))\n",
24
+ " print(result.title[:100])\n",
25
+ " print(result.content[:100])\n",
26
+ " print(\"=============================\")"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 4,
32
+ "id": "263dc3af",
33
+ "metadata": {},
34
+ "outputs": [
35
+ {
36
+ "name": "stdout",
37
+ "output_type": "stream",
38
+ "text": [
39
+ "Processing: https://www.supernovalabs.com.br/\n",
40
+ "Simple: -------\n",
41
+ "Readability failed: No content extracted by readability\n",
42
+ "Supernova Labs | AI Consulting\n",
43
+ "Supernova Labs | AI Consulting\n",
44
+ "Jina: -------\n",
45
+ "Supernova Labs | Elite AI Consulting to help you build the Future\n",
46
+ "URL Source: https://www.supernovalabs.com.br/\n",
47
+ "\n",
48
+ "Markdown Content:\n",
49
+ "Supernova Labs\n",
50
+ "\n",
51
+ "[About](https://www\n",
52
+ "Firecrawl: -------\n",
53
+ "Supernova Labs | AI Consulting\n",
54
+ "# Unleash Your AI Edge. Fast.\n",
55
+ "\n",
56
+ "We turn your data, tech and capabilities into impact with lean AI sol\n",
57
+ "=============================\n",
58
+ "None\n",
59
+ "Processing: https://building.nubank.com/fine-tuning-transaction-user-models/\n",
60
+ "Simple: -------\n",
61
+ "Fine-Tuning Transaction User Models - Building Nubank\n",
62
+ "Fine-Tuning Transaction User Models Learn how we combine transaction embeddings with tabular data us\n",
63
+ "Jina: -------\n",
64
+ "Fine-Tuning Transaction User Models - Building Nubank\n",
65
+ "URL Source: https://building.nubank.com/fine-tuning-transaction-user-models/\n",
66
+ "\n",
67
+ "Published Time: 2025-0\n",
68
+ "Firecrawl: -------\n",
69
+ "Fine-Tuning Transaction User Models - Building Nubank\n",
70
+ "# Fine-Tuning Transaction User Models\n",
71
+ "\n",
72
+ "Learn how we combine transaction embeddings with tabular data\n",
73
+ "=============================\n",
74
+ "None\n",
75
+ "Processing: https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quitting-your-job-5296bbcb703b\n",
76
+ "Simple: -------\n",
77
+ "You Can Make Money With AI Without Quitting Your Job | by Nipuna Maduranga | LearnAIforproft.com | M\n",
78
+ "Most people think they need to quit their job to build a new life. I thought that too. You scroll th\n",
79
+ "Jina: -------\n",
80
+ "You Can Make Money With AI Without Quitting Your Job\n",
81
+ "URL Source: https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quittin\n",
82
+ "Firecrawl: -------\n",
83
+ "You Can Make Money With AI Without Quitting Your Job | by Nipuna Maduranga | LearnAIforproft.com | M\n",
84
+ "[Sitemap](https://medium.com/sitemap/sitemap.xml)\n",
85
+ "\n",
86
+ "[Open in app](https://rsci.app.link/?%24canonical\n",
87
+ "=============================\n",
88
+ "None\n",
89
+ "Processing: https://github.com/mirkonasato/pyodconverter\n",
90
+ "Simple: -------\n",
91
+ "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
92
+ "This repository was archived by the owner on Dec 1, 2023. It is now read-only. mirkonasato/pyodconve\n",
93
+ "Jina: -------\n",
94
+ "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
95
+ "URL Source: https://github.com/mirkonasato/pyodconverter\n",
96
+ "\n",
97
+ "Markdown Content:\n",
98
+ "GitHub - mirkonasato/pyo\n",
99
+ "Firecrawl: -------\n",
100
+ "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
101
+ "[Skip to content](https://github.com/mirkonasato/pyodconverter#start-of-content)\n",
102
+ "\n",
103
+ "You signed in with\n",
104
+ "=============================\n",
105
+ "None\n",
106
+ "Processing: https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR\n",
107
+ "Simple: -------\n",
108
+ "Error processing URL https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR: HTTP error: 500\n",
109
+ "Error\n",
110
+ "Failed to extract content: HTTP error: 500\n",
111
+ "Jina: -------\n",
112
+ "Ultra-aprendizado: domine habilidades valiosas, seja mais esperto que a competição e dê um impulso n\n",
113
+ "URL Source: https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-ca\n",
114
+ "Firecrawl: -------\n",
115
+ "Amazon.com.br\n",
116
+ "#### Digite os caracteres que você vê abaixo\n",
117
+ "\n",
118
+ "Desculpe pelo inconveniente. Para continuar realizando\n",
119
+ "=============================\n",
120
+ "None\n"
121
+ ]
122
+ }
123
+ ],
124
+ "source": [
125
+ "\n",
126
+ "urls= [\"https://www.supernovalabs.com.br/\", \"https://building.nubank.com/fine-tuning-transaction-user-models/\", \"https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quitting-your-job-5296bbcb703b\", \"https://github.com/mirkonasato/pyodconverter\", \"https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR\"]\n",
127
+ "for url in urls:\n",
128
+ " result = await process_url(url=url)\n",
129
+ " print(result)"
130
+ ]
131
+ }
132
+ ],
133
+ "metadata": {
134
+ "kernelspec": {
135
+ "display_name": ".venv",
136
+ "language": "python",
137
+ "name": "python3"
138
+ },
139
+ "language_info": {
140
+ "codemirror_mode": {
141
+ "name": "ipython",
142
+ "version": 3
143
+ },
144
+ "file_extension": ".py",
145
+ "mimetype": "text/x-python",
146
+ "name": "python",
147
+ "nbconvert_exporter": "python",
148
+ "pygments_lexer": "ipython3",
149
+ "version": "3.10.6"
150
+ }
151
+ },
152
+ "nbformat": 4,
153
+ "nbformat_minor": 5
154
+ }
@@ -0,0 +1,272 @@
1
+ import asyncio
2
+ import math
3
+ import os
4
+ import tempfile
5
+ import traceback
6
+ from functools import partial
7
+
8
+ from moviepy import AudioFileClip
9
+
10
+ from content_core.common import ProcessSourceState
11
+ from content_core.common.retry import retry_audio_transcription
12
+ from content_core.config import get_audio_concurrency, get_proxy
13
+ from content_core.logging import logger
14
+
15
+
16
async def split_audio(input_file, segment_length_minutes=15, output_prefix=None):
    """
    Split an audio file into fixed-length MP3 segments asynchronously.

    Args:
        input_file (str): Path to the input audio file.
        segment_length_minutes (int): Length of each segment in minutes.
            Defaults to 15.
        output_prefix (str, optional): Filename prefix for the segments.
            Defaults to the input file's basename without extension.

    Returns:
        list[str]: Absolute paths of the exported segment files, in order.
    """

    def _split(input_file, segment_length_minutes, output_prefix):
        # Convert input file to absolute path; segments are written next to it
        input_file_abs = os.path.abspath(input_file)
        output_dir = os.path.dirname(input_file_abs)
        os.makedirs(output_dir, exist_ok=True)

        # Default the prefix to the source filename without its extension
        if output_prefix is None:
            output_prefix = os.path.splitext(os.path.basename(input_file_abs))[0]

        # Open the clip only to read its duration; extract_audio() re-opens
        # the source per segment, so the clip is not needed afterwards.
        audio = AudioFileClip(input_file_abs)
        try:
            segment_length_s = segment_length_minutes * 60
            duration = audio.duration
            total_segments = math.ceil(duration / segment_length_s)
        finally:
            # FIX: the clip was previously never closed, leaking the file handle
            audio.close()

        logger.debug(f"Splitting file: {input_file_abs} into {total_segments} segments")

        output_files = []

        # Split the audio into segments
        for i in range(total_segments):
            start_time = i * segment_length_s
            end_time = min((i + 1) * segment_length_s, duration)

            # Extract segment
            output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
            output_path = os.path.join(output_dir, output_filename)

            # Export segment
            extract_audio(input_file_abs, output_path, start_time, end_time)

            output_files.append(output_path)

            logger.debug(
                f"Exported segment {i + 1}/{total_segments}: {output_filename}"
            )

        return output_files

    # Run CPU-bound audio processing in thread pool so the event loop stays free
    return await asyncio.get_event_loop().run_in_executor(
        None, partial(_split, input_file, segment_length_minutes, output_prefix)
    )
67
+
68
+
69
def extract_audio(
    input_file: str, output_file: str, start_time: float = None, end_time: float = None
) -> None:
    """
    Extract audio from a video or audio file and save it as an MP3 file.
    If start_time and end_time are provided, only that segment of audio is extracted.

    Args:
        input_file (str): Path to the input video or audio file.
        output_file (str): Path where the output MP3 file will be saved.
        start_time (float, optional): Start time of the audio segment in seconds. Defaults to None.
        end_time (float, optional): End time of the audio segment in seconds. Defaults to None.

    Raises:
        Exception: Re-raises any error from loading, trimming, or exporting the clip.
    """
    audio_clip = None
    try:
        # Load the file as an AudioFileClip
        audio_clip = AudioFileClip(input_file)

        # If start_time and/or end_time are provided, trim the audio using subclipped
        if start_time is not None and end_time is not None:
            audio_clip = audio_clip.subclipped(start_time, end_time)
        elif start_time is not None:
            audio_clip = audio_clip.subclipped(start_time)
        elif end_time is not None:
            audio_clip = audio_clip.subclipped(0, end_time)

        # Export the audio as MP3
        audio_clip.write_audiofile(output_file, codec="mp3")
    except Exception as e:
        logger.error(f"Error extracting audio: {str(e)}")
        raise
    finally:
        # FIX: previously the clip was closed only on the success path, leaking
        # the underlying reader whenever trimming or export raised.
        if audio_clip is not None:
            audio_clip.close()
100
+
101
+
102
@retry_audio_transcription()
async def _transcribe_segment(audio_file, model):
    """Transcribe a single segment; the decorator retries transient API failures."""
    response = await model.atranscribe(audio_file)
    return response.text
106
+
107
+
108
async def transcribe_audio_segment(audio_file, model, semaphore):
    """
    Transcribe one audio segment under a shared concurrency limit.

    The semaphore caps how many transcriptions run at once: any number of
    these coroutines may be scheduled together, but at most N (as returned
    by get_audio_concurrency(), default 3, range 1-10) hold the semaphore
    and hit the API simultaneously. Retry handling for transient failures
    lives in the wrapped _transcribe_segment call.

    Args:
        audio_file (str): Path of the audio segment to transcribe.
        model: Speech-to-text model exposing an async atranscribe() method.
        semaphore (asyncio.Semaphore): Shared limiter for concurrent calls.

    Returns:
        str: The transcribed text for this segment.
    """
    async with semaphore:
        return await _transcribe_segment(audio_file, model)
131
+
132
+
133
async def extract_audio_data(data: ProcessSourceState):
    """
    Extract and transcribe audio from a file with automatic segmentation and parallel processing.

    This function handles the complete audio processing pipeline:
    1. Splits long audio files (>10 minutes) into segments
    2. Transcribes segments in parallel using configurable concurrency
    3. Joins transcriptions in correct order

    For files longer than 10 minutes, segments are processed concurrently with a
    configurable concurrency limit to balance performance and API rate limits.

    Args:
        data (ProcessSourceState): State object containing file_path to audio/video file
            (optionally audio_provider/audio_model/proxy overrides)

    Returns:
        dict: Dictionary containing:
            - metadata: Information about processed segments count
            - content: Complete transcribed text

    Configuration:
        Concurrency is controlled via:
        - Environment variable: CCORE_AUDIO_CONCURRENCY (1-10, default: 3)
        - YAML config: extraction.audio.concurrency

    Raises:
        Exception: If audio extraction or transcription fails
    """
    input_audio_path = data.file_path
    # Kept outside the try so the finally clause can close it on any error path
    audio = None

    try:
        # Use TemporaryDirectory context manager for automatic cleanup
        with tempfile.TemporaryDirectory() as temp_dir:
            output_prefix = os.path.splitext(os.path.basename(input_audio_path))[0]
            output_dir = temp_dir

            # Split audio into segments if longer than 10 minutes
            # NOTE(review): this duplicates split_audio() above but with a fixed
            # 10-minute segment length; consider unifying — confirm before changing.
            audio = AudioFileClip(input_audio_path)
            duration_s = audio.duration
            segment_length_s = 10 * 60  # 10 minutes in seconds
            output_files = []

            if duration_s > segment_length_s:
                logger.info(
                    f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments"
                )
                for i in range(math.ceil(duration_s / segment_length_s)):
                    start_time = i * segment_length_s
                    end_time = min((i + 1) * segment_length_s, audio.duration)

                    # Extract segment (re-opens the source file internally)
                    output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
                    output_path = os.path.join(output_dir, output_filename)

                    extract_audio(input_audio_path, output_path, start_time, end_time)

                    output_files.append(output_path)
            else:
                # Short file: transcribe the original directly, no temp segments
                output_files = [input_audio_path]

            # Close audio clip after determining segments; only duration was needed
            if audio:
                audio.close()
                audio = None

            # Transcribe audio files in parallel with concurrency limit.
            # Imports are deferred, presumably to avoid import cycles / startup
            # cost when audio is never processed — confirm before hoisting.
            from content_core.config import CONFIG
            from content_core.models import ModelFactory
            from esperanto import AIFactory

            # Determine which model to use based on state parameters
            if data.audio_provider and data.audio_model:
                # Custom model provided - create new instance
                try:
                    logger.info(
                        f"Using custom audio model: {data.audio_provider}/{data.audio_model}"
                    )
                    # Get timeout from config (same as default model) or use fallback
                    timeout = CONFIG.get('speech_to_text', {}).get('timeout', 3600)
                    stt_config = {'timeout': timeout} if timeout else {}
                    # Add proxy to config if configured
                    current_proxy = get_proxy(data.proxy)
                    if current_proxy:
                        stt_config['proxy'] = current_proxy
                    speech_to_text_model = AIFactory.create_speech_to_text(
                        data.audio_provider, data.audio_model, stt_config
                    )
                except Exception as e:
                    # Any failure creating the custom model degrades gracefully
                    # to the default model instead of aborting the pipeline
                    logger.error(
                        f"Failed to create custom audio model '{data.audio_provider}/{data.audio_model}': {e}. "
                        f"Check that the provider and model are supported by Esperanto. "
                        f"Falling back to default model."
                    )
                    speech_to_text_model = ModelFactory.get_model("speech_to_text")
            elif data.audio_provider or data.audio_model:
                # Only one parameter provided - log warning and use default
                missing = "audio_model" if data.audio_provider else "audio_provider"
                provided = "audio_provider" if data.audio_provider else "audio_model"
                logger.warning(
                    f"{provided} provided without {missing}. "
                    f"Both audio_provider and audio_model must be specified together. "
                    f"Falling back to default model."
                )
                speech_to_text_model = ModelFactory.get_model("speech_to_text")
            else:
                # No custom parameters - use default (backward compatible)
                speech_to_text_model = ModelFactory.get_model("speech_to_text")

            concurrency = get_audio_concurrency()
            semaphore = asyncio.Semaphore(concurrency)

            logger.debug(
                f"Transcribing {len(output_files)} audio segments with concurrency limit of {concurrency}"
            )

            # Create tasks for parallel transcription
            transcription_tasks = [
                transcribe_audio_segment(audio_file, speech_to_text_model, semaphore)
                for audio_file in output_files
            ]

            # Execute all transcriptions concurrently (limited by semaphore);
            # gather preserves input order, so segments join back in sequence
            transcriptions = await asyncio.gather(*transcription_tasks)

            return {
                "metadata": {"segments_count": len(output_files)},
                "content": " ".join(transcriptions),
            }
    except Exception as e:
        logger.error(f"Error processing audio: {str(e)}")
        logger.error(traceback.format_exc())
        raise
    finally:
        # Ensure audio clip is closed even if an error occurs
        if audio:
            try:
                audio.close()
            except Exception:
                pass
@@ -0,0 +1,79 @@
1
+ """
2
+ Docling-based document extraction processor.
3
+ """
4
+
5
+ from content_core.common.state import ProcessSourceState
6
+ from content_core.config import CONFIG
7
+
8
+ DOCLING_AVAILABLE = False
9
+ try:
10
+ from docling.document_converter import DocumentConverter
11
+ DOCLING_AVAILABLE = True
12
+ except ImportError:
13
+
14
+ class DocumentConverter:
15
+ """Stub when docling is not installed."""
16
+
17
+ def __init__(self):
18
+ raise ImportError(
19
+ "Docling not installed. Install with: pip install content-core[docling] "
20
+ "or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
21
+ )
22
+
23
+ def convert(self, source: str):
24
+ raise ImportError(
25
+ "Docling not installed. Install with: pip install content-core[docling] "
26
+ "or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
27
+ )
28
+
29
+ # Supported MIME types for Docling extraction
30
+ DOCLING_SUPPORTED = {
31
+ "application/pdf",
32
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
33
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
34
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
35
+ "text/markdown",
36
+ # "text/plain", #docling currently not supporting txt
37
+ "text/x-markdown",
38
+ "text/csv",
39
+ "text/html",
40
+ "image/png",
41
+ "image/jpeg",
42
+ "image/tiff",
43
+ "image/bmp",
44
+ }
45
+
46
+
47
+ async def extract_with_docling(state: ProcessSourceState) -> ProcessSourceState:
48
+ """
49
+ Use Docling to parse files, URLs, or content into the desired format.
50
+ """
51
+ # Initialize Docling converter
52
+ converter = DocumentConverter()
53
+
54
+ # Determine source: file path, URL, or direct content
55
+ source = state.file_path or state.url or state.content
56
+ if not source:
57
+ raise ValueError("No input provided for Docling extraction.")
58
+
59
+ # Convert document
60
+ result = converter.convert(source)
61
+ doc = result.document
62
+
63
+ # Determine output format (per execution override, metadata, then config)
64
+ cfg_fmt = (
65
+ CONFIG.get("extraction", {}).get("docling", {}).get("output_format", "markdown")
66
+ )
67
+ fmt = state.output_format or state.metadata.get("docling_format") or cfg_fmt
68
+ # Record the format used
69
+ state.metadata["docling_format"] = fmt
70
+ if fmt == "html":
71
+ output = doc.export_to_html()
72
+ elif fmt == "json":
73
+ output = doc.export_to_json()
74
+ else:
75
+ output = doc.export_to_markdown()
76
+
77
+ # Update state
78
+ state.content = output
79
+ return state