content-core 1.4.1-py3-none-any.whl → 1.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- content_core/cc_config.yaml +2 -0
- content_core/config.py +71 -0
- content_core/content/identification/file_detector.py +1 -2
- content_core/content/summary/core.py +1 -1
- content_core/notebooks/run.ipynb +0 -2
- content_core/processors/audio.py +114 -51
- content_core/processors/url.py +3 -3
- content_core/templated_message.py +2 -2
- {content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/METADATA +15 -1
- {content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/RECORD +13 -13
- {content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/WHEEL +0 -0
- {content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/entry_points.txt +0 -0
- {content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/licenses/LICENSE +0 -0
content_core/cc_config.yaml
CHANGED
@@ -32,6 +32,8 @@ summary_model:
 extraction:
   document_engine: auto # auto | simple | docling - for files/documents
   url_engine: auto # auto | simple | firecrawl | jina | docling - for URLs
+  audio:
+    concurrency: 3 # Number of concurrent audio transcriptions (1-10)
   docling:
     output_format: markdown # markdown | html | json
   pymupdf:
content_core/config.py
CHANGED
@@ -70,6 +70,61 @@ def get_url_engine():
         return env_engine
     return CONFIG.get("extraction", {}).get("url_engine", "auto")
 
+def get_audio_concurrency():
+    """
+    Get audio concurrency with environment variable override and validation.
+
+    Returns the configured number of concurrent audio transcriptions, with automatic
+    validation and fallback to safe defaults.
+
+    Configuration priority (highest to lowest):
+    1. CCORE_AUDIO_CONCURRENCY environment variable
+    2. extraction.audio.concurrency in YAML config
+    3. Default value: 3
+
+    Returns:
+        int: Number of concurrent transcriptions (1-10)
+
+    Validation:
+    - Values must be integers between 1 and 10 (inclusive)
+    - Invalid values (out of range, non-integer, etc.) automatically fall back to default
+    - A warning is logged when invalid values are detected
+
+    Examples:
+        >>> import os
+        >>> os.environ["CCORE_AUDIO_CONCURRENCY"] = "5"
+        >>> get_audio_concurrency()
+        5
+
+        >>> os.environ["CCORE_AUDIO_CONCURRENCY"] = "20" # Too high
+        >>> get_audio_concurrency() # Falls back to default
+        3
+    """
+    env_concurrency = os.environ.get("CCORE_AUDIO_CONCURRENCY")
+    if env_concurrency:
+        try:
+            concurrency = int(env_concurrency)
+            if concurrency < 1 or concurrency > 10:
+                # Import logger here to avoid circular imports
+                from content_core.logging import logger
+                logger.warning(
+                    f"Invalid CCORE_AUDIO_CONCURRENCY: '{env_concurrency}'. "
+                    f"Must be between 1 and 10. "
+                    f"Using default from config."
+                )
+                return CONFIG.get("extraction", {}).get("audio", {}).get("concurrency", 3)
+            return concurrency
+        except ValueError:
+            # Import logger here to avoid circular imports
+            from content_core.logging import logger
+            logger.warning(
+                f"Invalid CCORE_AUDIO_CONCURRENCY: '{env_concurrency}'. "
+                f"Must be a valid integer. "
+                f"Using default from config."
+            )
+            return CONFIG.get("extraction", {}).get("audio", {}).get("concurrency", 3)
+    return CONFIG.get("extraction", {}).get("audio", {}).get("concurrency", 3)
+
 # Programmatic config overrides: use in notebooks or scripts
 def set_document_engine(engine: str):
     """Override the document extraction engine ('auto', 'simple', or 'docling')."""
@@ -102,3 +157,19 @@ def set_pymupdf_ocr_fallback(enabled: bool):
     extraction = CONFIG.setdefault("extraction", {})
     pymupdf_cfg = extraction.setdefault("pymupdf", {})
     pymupdf_cfg["ocr_fallback"] = enabled
+
+def set_audio_concurrency(concurrency: int):
+    """
+    Override the audio concurrency setting (1-10).
+
+    Args:
+        concurrency (int): Number of concurrent audio transcriptions (1-10)
+
+    Raises:
+        ValueError: If concurrency is not between 1 and 10
+    """
+    if not isinstance(concurrency, int) or concurrency < 1 or concurrency > 10:
+        raise ValueError(f"Audio concurrency must be an integer between 1 and 10, got: {concurrency}")
+    extraction = CONFIG.setdefault("extraction", {})
+    audio_cfg = extraction.setdefault("audio", {})
+    audio_cfg["concurrency"] = concurrency
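For reference, the two additions above combine like this; a minimal sketch with illustrative values, using only the functions and the environment variable introduced in this diff:

```python
import os

from content_core.config import get_audio_concurrency, set_audio_concurrency

# The environment variable takes highest precedence and is re-read on each call
os.environ["CCORE_AUDIO_CONCURRENCY"] = "5"
print(get_audio_concurrency())  # 5

# Programmatic override, used when no valid env var is set
del os.environ["CCORE_AUDIO_CONCURRENCY"]
set_audio_concurrency(8)
print(get_audio_concurrency())  # 8

# The setter validates its input; out-of-range values raise ValueError,
# whereas an out-of-range env var only logs a warning and falls back to the config default
try:
    set_audio_concurrency(20)
except ValueError as exc:
    print(exc)
```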
content_core/content/identification/file_detector.py
CHANGED
@@ -3,10 +3,9 @@ Pure Python file type detection using magic bytes and content analysis.
 Replaces libmagic dependency with a lightweight implementation.
 """
 
-import os
 import zipfile
 from pathlib import Path
-from typing import Dict, Optional
+from typing import Dict, Optional
 
 from content_core.common.exceptions import UnsupportedTypeException
 from content_core.logging import logger
content_core/content/summary/core.py
CHANGED
@@ -8,7 +8,7 @@ async def summarize(content: str, context: str) -> str:
     templated_message_fn = partial(templated_message, model=ModelFactory.get_model('summary_model'))
     response = await templated_message_fn(
         TemplatedMessageInput(
-            user_prompt_template="
+            user_prompt_template="content/summarize",
             data={"content": content, "context": context},
         )
     )
content_core/notebooks/run.ipynb
CHANGED
@@ -63,8 +63,6 @@
 "source": [
  "from content_core.content.extraction import extract_content\n",
  "\n",
- "from content_core.content.cleanup import cleanup_content\n",
- "from content_core.content.summary import summarize\n",
  "\n",
  "\n",
  "yt = await extract_content(dict(url=\"https://www.youtube.com/watch?v=lLprprtHfts\"))\n",
content_core/processors/audio.py
CHANGED
@@ -8,11 +8,9 @@ from functools import partial
 from moviepy import AudioFileClip
 
 from content_core.common import ProcessSourceState
+from content_core.config import get_audio_concurrency
 from content_core.logging import logger
 
-# todo: remove reference to model_manager
-# future: parallelize the transcription process
-
 
 async def split_audio(input_file, segment_length_minutes=15, output_prefix=None):
     """
@@ -98,61 +96,126 @@ def extract_audio(
         raise
 
 
-async def transcribe_audio_segment(audio_file, model):
-    """
-    …
+async def transcribe_audio_segment(audio_file, model, semaphore):
+    """
+    Transcribe a single audio segment asynchronously with concurrency control.
+
+    This function uses a semaphore to limit the number of concurrent transcriptions,
+    preventing API rate limits while allowing parallel processing for improved performance.
+
+    Args:
+        audio_file (str): Path to the audio file segment to transcribe
+        model: Speech-to-text model instance with atranscribe() method
+        semaphore (asyncio.Semaphore): Semaphore to control concurrency
+
+    Returns:
+        str: Transcribed text from the audio segment
+
+    Note:
+        Multiple instances of this function can run concurrently, but the semaphore
+        ensures that no more than N transcriptions happen simultaneously, where N
+        is configured via get_audio_concurrency() (default: 3, range: 1-10).
+    """
+    async with semaphore:
+        return (await model.atranscribe(audio_file)).text
 
 
 async def extract_audio_data(data: ProcessSourceState):
-    …
-    output_dir = temp_dir
-    os.makedirs(output_dir, exist_ok=True)
-    …
+    """
+    Extract and transcribe audio from a file with automatic segmentation and parallel processing.
+
+    This function handles the complete audio processing pipeline:
+    1. Splits long audio files (>10 minutes) into segments
+    2. Transcribes segments in parallel using configurable concurrency
+    3. Joins transcriptions in correct order
+
+    For files longer than 10 minutes, segments are processed concurrently with a
+    configurable concurrency limit to balance performance and API rate limits.
+
+    Args:
+        data (ProcessSourceState): State object containing file_path to audio/video file
+
+    Returns:
+        dict: Dictionary containing:
+            - metadata: Information about processed segments count
+            - content: Complete transcribed text
+
+    Configuration:
+        Concurrency is controlled via:
+        - Environment variable: CCORE_AUDIO_CONCURRENCY (1-10, default: 3)
+        - YAML config: extraction.audio.concurrency
+
+    Raises:
+        Exception: If audio extraction or transcription fails
+    """
+    input_audio_path = data.file_path
+    audio = None
+
+    try:
+        # Use TemporaryDirectory context manager for automatic cleanup
+        with tempfile.TemporaryDirectory() as temp_dir:
+            output_prefix = os.path.splitext(os.path.basename(input_audio_path))[0]
+            output_dir = temp_dir
+
+            # Split audio into segments if longer than 10 minutes
+            audio = AudioFileClip(input_audio_path)
+            duration_s = audio.duration
+            segment_length_s = 10 * 60 # 10 minutes in seconds
+            output_files = []
+
+            if duration_s > segment_length_s:
+                logger.info(
+                    f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments"
+                )
+                for i in range(math.ceil(duration_s / segment_length_s)):
+                    start_time = i * segment_length_s
+                    end_time = min((i + 1) * segment_length_s, audio.duration)
+
+                    # Extract segment
+                    output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
+                    output_path = os.path.join(output_dir, output_filename)
+
+                    extract_audio(input_audio_path, output_path, start_time, end_time)
+
+                    output_files.append(output_path)
+            else:
+                output_files = [input_audio_path]
+
+            # Close audio clip after determining segments
+            if audio:
+                audio.close()
+                audio = None
+
+            # Transcribe audio files in parallel with concurrency limit
+            from content_core.models import ModelFactory
+
+            speech_to_text_model = ModelFactory.get_model("speech_to_text")
+            concurrency = get_audio_concurrency()
+            semaphore = asyncio.Semaphore(concurrency)
+
+            logger.debug(f"Transcribing {len(output_files)} audio segments with concurrency limit of {concurrency}")
+
+            # Create tasks for parallel transcription
+            transcription_tasks = [
+                transcribe_audio_segment(audio_file, speech_to_text_model, semaphore)
+                for audio_file in output_files
+            ]
+
+            # Execute all transcriptions concurrently (limited by semaphore)
+            transcriptions = await asyncio.gather(*transcription_tasks)
+
+            return {
+                "metadata": {"segments_count": len(output_files)},
+                "content": " ".join(transcriptions),
+            }
     except Exception as e:
         logger.error(f"Error processing audio: {str(e)}")
         logger.error(traceback.format_exc())
         raise
+    finally:
+        # Ensure audio clip is closed even if an error occurs
+        if audio:
+            try:
+                audio.close()
+            except Exception:
+                pass
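The heart of the change is the semaphore-bounded fan-out. Stripped of the package specifics, the pattern looks like the sketch below; `model.atranscribe()` returning an object with a `.text` attribute is the only interface assumed, as used in the code above:

```python
import asyncio


async def transcribe_segments(segments, model, limit=3):
    """Transcribe segments concurrently, never more than `limit` at a time."""
    semaphore = asyncio.Semaphore(limit)

    async def transcribe_one(path):
        async with semaphore:  # waits while `limit` transcriptions are already in flight
            return (await model.atranscribe(path)).text

    # gather() returns results in input order, so the joined transcript stays in sequence
    texts = await asyncio.gather(*(transcribe_one(p) for p in segments))
    return " ".join(texts)
```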
content_core/processors/url.py
CHANGED
@@ -147,10 +147,10 @@ async def extract_url_firecrawl(url: str):
         from firecrawl import AsyncFirecrawlApp
 
         app = AsyncFirecrawlApp(api_key=os.environ.get("FIRECRAWL_API_KEY"))
-        scrape_result = await app.
+        scrape_result = await app.scrape(url, formats=["markdown", "html"])
         return {
-            "title": scrape_result.metadata
-            "content": scrape_result.markdown,
+            "title": scrape_result.metadata.title or "",
+            "content": scrape_result.markdown or "",
         }
 
     except Exception as e:
content_core/templated_message.py
CHANGED
@@ -36,14 +36,14 @@ async def templated_message(
             prompt_template=input.system_prompt_template,
             template_text=input.system_prompt_text,
         ).render(data=input.data)
-        msgs.append(
+        msgs.append({"role": "system", "content": system_prompt})
 
     if input.user_prompt_template or input.user_prompt_text:
         user_prompt = Prompter(
             prompt_template=input.user_prompt_template,
             template_text=input.user_prompt_text,
         ).render(data=input.data)
-        msgs.append(
+        msgs.append({"role": "user", "content": user_prompt})
 
     result = await model.achat_complete(msgs)
     return result.content
{content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 1.4.1
+Version: 1.5.0
 Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
@@ -548,6 +548,9 @@ GOOGLE_API_KEY=your-key-here
 # Engine Selection (optional)
 CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
 CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
+
+# Audio Processing (optional)
+CCORE_AUDIO_CONCURRENCY=3 # Number of concurrent audio transcriptions (1-10, default: 3)
 ```
 
 ### Engine Selection via Environment Variables
@@ -556,9 +559,20 @@ For deployment scenarios like MCP servers or Raycast extensions, you can override
 
 - **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
 - **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
+- **`CCORE_AUDIO_CONCURRENCY`**: Number of concurrent audio transcriptions (1-10, default: 3)
 
 These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
 
+### Audio Processing Configuration
+
+Content Core processes long audio files by splitting them into segments and transcribing them in parallel for improved performance. You can control the concurrency level to balance speed with API rate limits:
+
+- **Default**: 3 concurrent transcriptions
+- **Range**: 1-10 concurrent transcriptions
+- **Configuration**: Set via `CCORE_AUDIO_CONCURRENCY` environment variable or `extraction.audio.concurrency` in `cc_config.yaml`
+
+Higher concurrency values can speed up processing of long audio/video files but may hit API rate limits. Lower values are more conservative and suitable for accounts with lower API quotas.
+
 ### Custom Prompt Templates
 
 Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
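A usage sketch of the documented variable follows. The file path and concurrency value are illustrative, and the `file_path` key mirrors the `ProcessSourceState` field used by the audio processor rather than a documented call signature:

```python
import asyncio
import os

from content_core.content.extraction import extract_content

# Allow up to six transcriptions in flight for a long recording (illustrative value)
os.environ["CCORE_AUDIO_CONCURRENCY"] = "6"


async def main():
    # "podcast_episode.mp3" is a placeholder path
    result = await extract_content(dict(file_path="podcast_episode.mp3"))
    print(result)


asyncio.run(main())
```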
{content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
 content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
-content_core/cc_config.yaml,sha256=
-content_core/config.py,sha256=
+content_core/cc_config.yaml,sha256=3Ot5u-YSBx2k3JXWnCP7s7OVBbGpGebBy_CWj3we-u4,1211
+content_core/config.py,sha256=Mao6AZZoiSiX7uZwOGgk759LlV0j6NdfYGgWgX6vhAs,7112
 content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
 content_core/models.py,sha256=Kt6tWdAX87eQ2tL6eTwcHU7_NIRnN4exP4RzV2WrMig,881
 content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
 content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
-content_core/templated_message.py,sha256=
+content_core/templated_message.py,sha256=jsjGqD-zf__pV4P0eo9cffTK2C90-VggL64qNYejFo0,1615
 content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
 content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
 content_core/common/state.py,sha256=K5jsDg4l2GSaoGyFYzdd1GW14vLaAxdxes8vUrPNVkE,1622
@@ -17,27 +17,27 @@ content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnR
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
 content_core/content/extraction/graph.py,sha256=AFi9B_hTuxqdgvogCOk4Xdqoboug7_KXtV0ZHlb8igM,8139
 content_core/content/identification/__init__.py,sha256=DDoCi1r-6Z_pGPPi3X1ZwyRrcRtg-rAiCTK50hnO5Y0,235
-content_core/content/identification/file_detector.py,sha256=
+content_core/content/identification/file_detector.py,sha256=JTfGK28BQg_SGYqLzGVT4OGBfWx8HtEPA-3kfW5o3oE,17153
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
-content_core/content/summary/core.py,sha256=
+content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
 content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
 content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,7021
-content_core/notebooks/run.ipynb,sha256=
+content_core/notebooks/run.ipynb,sha256=8gbFln9WLrli_qWJB8SKQKcSNbAv25DvN5Cu4EAAeBQ,370952
 content_core/notebooks/urls.ipynb,sha256=gSmiSzmbol_Li36w8tpUsy5QgRbrnBx94Ry2zHwMvwY,7107
-content_core/processors/audio.py,sha256=
+content_core/processors/audio.py,sha256=fdR_KcLRG3jSwY3t_eVDoMgUHQQyXmAAlmfETMtomq0,8396
 content_core/processors/docling.py,sha256=lf_NHh255gn4d2EymJYqyH2QiAgQDiJCY3t6Ne7R9rU,2507
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=TTDhfV2INtXumFDjLJFNMRfpbJ_tqwIcSBDzuThKxJI,10617
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=
+content_core/processors/url.py,sha256=RhBOyqfSWFaf8Dhpxlo9xbsF5yuP5FhXfhbvbi4CQPc,7514
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
 content_core/processors/youtube.py,sha256=_qvxI9qTdxu3l1fKLuJARFt8KtZVFJ3JJBLkq1hAAXo,7868
 content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-1.
-content_core-1.
-content_core-1.
-content_core-1.
-content_core-1.
+content_core-1.5.0.dist-info/METADATA,sha256=D3Cuy_zwW7u6jeuDVxYCwSEzJt8yrIjEFi9bJhJPqLQ,21963
+content_core-1.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-1.5.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
+content_core-1.5.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-1.5.0.dist-info/RECORD,,
{content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/WHEEL
File without changes

{content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/entry_points.txt
File without changes

{content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/licenses/LICENSE
File without changes