content-core 0.5.1-py3-none-any.whl → 0.7.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of content-core might be problematic.

--- a/content_core/processors/audio.py
+++ b/content_core/processors/audio.py
@@ -1,13 +1,13 @@
 import asyncio
 import os
+import tempfile
+import math
+import traceback
 from functools import partial
-from math import ceil
-
-from pydub import AudioSegment
+from moviepy import AudioFileClip
 
 from content_core.common import ProcessSourceState
 from content_core.logging import logger
-from content_core.models import ModelFactory
 
 # todo: remove reference to model_manager
 # future: parallelize the transcription process
@@ -29,31 +29,29 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
         output_prefix = os.path.splitext(os.path.basename(input_file_abs))[0]
 
     # Load the audio file
-    audio = AudioSegment.from_file(input_file_abs)
+    audio = AudioFileClip(input_file_abs)
 
-    # Calculate segment length in milliseconds
-    segment_length_ms = segment_length_minutes * 60 * 1000
+    # Calculate segment length in seconds
+    segment_length_s = segment_length_minutes * 60
 
     # Calculate number of segments
-    total_segments = ceil(len(audio) / segment_length_ms)
+    total_segments = math.ceil(audio.duration / segment_length_s)
     logger.debug(f"Splitting file: {input_file_abs} into {total_segments} segments")
 
     output_files = []
 
     # Split the audio into segments
     for i in range(total_segments):
-        start_time = i * segment_length_ms
-        end_time = min((i + 1) * segment_length_ms, len(audio))
+        start_time = i * segment_length_s
+        end_time = min((i + 1) * segment_length_s, audio.duration)
 
         # Extract segment
-        segment = audio[start_time:end_time]
-
-        # Generate output filename
         output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
         output_path = os.path.join(output_dir, output_filename)
 
         # Export segment
-        segment.export(output_path, format="mp3")
+        extract_audio(input_file_abs, output_path, start_time, end_time)
+
         output_files.append(output_path)
 
         logger.debug(f"Exported segment {i+1}/{total_segments}: {output_filename}")
@@ -66,43 +64,87 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
     )
 
 
+def extract_audio(input_file: str, output_file: str, start_time: float = None, end_time: float = None) -> None:
+    """
+    Extract audio from a video or audio file and save it as an MP3 file.
+    If start_time and end_time are provided, only that segment of audio is extracted.
+
+    Args:
+        input_file (str): Path to the input video or audio file.
+        output_file (str): Path where the output MP3 file will be saved.
+        start_time (float, optional): Start time of the audio segment in seconds. Defaults to None.
+        end_time (float, optional): End time of the audio segment in seconds. Defaults to None.
+    """
+    try:
+        # Load the file as an AudioFileClip
+        audio_clip = AudioFileClip(input_file)
+
+        # If start_time and end_time are provided, trim the audio
+        if start_time is not None and end_time is not None:
+            audio_clip = audio_clip.cutout(0, start_time).cutout(end_time - start_time, audio_clip.duration)
+        elif start_time is not None:
+            audio_clip = audio_clip.cutout(0, start_time)
+        elif end_time is not None:
+            audio_clip = audio_clip.cutout(end_time, audio_clip.duration)
+
+        # Export the audio as MP3
+        audio_clip.write_audiofile(output_file, codec='mp3')
+        audio_clip.close()
+    except Exception as e:
+        logger.error(f"Error extracting audio: {str(e)}")
+        raise
+
+
 async def transcribe_audio_segment(audio_file, model):
     """Transcribe a single audio segment asynchronously"""
     return (await model.atranscribe(audio_file)).text
 
 
-async def extract_audio(data: ProcessSourceState):
+async def extract_audio_data(data: ProcessSourceState):
     input_audio_path = data.file_path
-    audio_files = []
 
     try:
-        # Split audio into segments
-        audio_files = await split_audio(input_audio_path)
+        # Create a temporary directory for audio segments
+        temp_dir = tempfile.mkdtemp()
+        output_prefix = os.path.splitext(os.path.basename(input_audio_path))[0]
+        output_dir = temp_dir
+        os.makedirs(output_dir, exist_ok=True)
 
-        # Transcribe all segments concurrently
-        speech_to_text_model = ModelFactory.get_model("speech_to_text")
-        transcribe_tasks = [
-            transcribe_audio_segment(audio_file, speech_to_text_model)
-            for audio_file in audio_files
-        ]
-        transcriptions = await asyncio.gather(*transcribe_tasks)
+        # Split audio into segments if longer than 10 minutes
+        audio = AudioFileClip(input_audio_path)
+        duration_s = audio.duration
+        segment_length_s = 10 * 60 # 10 minutes in seconds
+        output_files = []
+
+        if duration_s > segment_length_s:
+            logger.info(f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments")
+            for i in range(math.ceil(duration_s / segment_length_s)):
+                start_time = i * segment_length_s
+                end_time = min((i + 1) * segment_length_s, audio.duration)
 
-        return {"content": " ".join(transcriptions)}
+                # Extract segment
+                output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
+                output_path = os.path.join(output_dir, output_filename)
 
+                extract_audio(input_audio_path, output_path, start_time, end_time)
+
+                output_files.append(output_path)
+        else:
+            output_files = [input_audio_path]
+
+        # Transcribe audio files
+        from content_core.models import ModelFactory
+        speech_to_text_model = ModelFactory.get_model("speech_to_text")
+        transcriptions = []
+        for audio_file in output_files:
+            transcription = await transcribe_audio_segment(audio_file, speech_to_text_model)
+            transcriptions.append(transcription)
+
+        return {
+            "metadata": {"audio_files": output_files},
+            "content": " ".join(transcriptions)
+        }
     except Exception as e:
-        logger.error(f"Error transcribing audio: {str(e)}")
-        logger.exception(e)
+        logger.error(f"Error processing audio: {str(e)}")
+        logger.error(traceback.format_exc())
         raise
-
-    finally:
-        # Clean up temporary files
-        def _cleanup(files):
-            for file in files:
-                try:
-                    os.remove(file)
-                except OSError as e:
-                    logger.error(f"Error removing temporary file {file}: {str(e)}")
-
-        await asyncio.get_event_loop().run_in_executor(
-            None, partial(_cleanup, audio_files)
-        )
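Side note on the new helper: `extract_audio` above is a plain synchronous function that writes an MP3 via moviepy, so it can be exercised directly. A minimal usage sketch, assuming the module path shown in the RECORD below; the file names are hypothetical examples, not from the package:

    # Hypothetical usage of the new moviepy-based helper (paths are examples only)
    from content_core.processors.audio import extract_audio

    # Write the first 30 seconds of the input's audio track to an MP3 file
    extract_audio("talk.mp4", "talk_intro.mp3", start_time=0.0, end_time=30.0)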
--- a/content_core/processors/url.py
+++ b/content_core/processors/url.py
@@ -1,7 +1,9 @@
 import re
 from urllib.parse import urlparse
+from io import BytesIO
 
 import aiohttp
+import docx
 from bs4 import BeautifulSoup, Comment
 
 from content_core.common import ProcessSourceState
@@ -12,6 +14,49 @@ from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
 # https://github.com/buriy/python-readability
 # also try readability: from readability import Document
 
+DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+
+async def _extract_docx_content(docx_bytes: bytes, url: str):
+    """
+    Extract content from DOCX file bytes.
+    """
+    try:
+        logger.debug(f"Attempting to parse DOCX from URL: {url} with python-docx")
+        doc = docx.Document(BytesIO(docx_bytes))
+        content_parts = [p.text for p in doc.paragraphs if p.text]
+        full_content = "\n\n".join(content_parts)
+
+        # Try to get a title from document properties or first heading
+        title = doc.core_properties.title
+        if not title and doc.paragraphs:
+            # Look for a potential title in the first few paragraphs (e.g., if styled as heading)
+            for p in doc.paragraphs[:5]: # Check first 5 paragraphs
+                if p.style.name.startswith('Heading'):
+                    title = p.text
+                    break
+            if not title: # Fallback to first line if no heading found
+                title = doc.paragraphs[0].text.strip() if doc.paragraphs[0].text.strip() else None
+
+        # If no title found, use filename from URL
+        if not title:
+            title = urlparse(url).path.split('/')[-1]
+
+        logger.info(f"Successfully extracted content from DOCX: {url}, Title: {title}")
+        return {
+            "title": title,
+            "content": full_content,
+            "domain": urlparse(url).netloc,
+            "url": url,
+        }
+    except Exception as e:
+        logger.error(f"Failed to process DOCX content from {url}: {e}")
+        # Fallback or re-raise, depending on desired error handling
+        return {
+            "title": f"Error Processing DOCX: {urlparse(url).path.split('/')[-1]}",
+            "content": f"Failed to extract content from DOCX: {e}",
+            "domain": urlparse(url).netloc,
+            "url": url,
+        }
 
 
 async def url_provider(state: ProcessSourceState):
@@ -54,6 +99,13 @@ async def extract_url_bs4(url: str):
     async with aiohttp.ClientSession() as session:
         async with session.get(url, headers=headers, timeout=10) as response:
             response.raise_for_status()
+            # Check content type for DOCX
+            if response.content_type == DOCX_MIME_TYPE:
+                logger.debug(f"Detected DOCX content type for {url}")
+                docx_bytes = await response.read()
+                return await _extract_docx_content(docx_bytes, url)
+
+            # If not DOCX, proceed as HTML
             html_content = await response.text()
 
     soup = BeautifulSoup(html_content, "html.parser")
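The DOCX branch above combines two standard APIs: aiohttp exposes the parsed MIME type as `response.content_type`, and python-docx's `Document` accepts any file-like object, which is why the bytes are wrapped in `BytesIO`. A standalone sketch of just the parsing step (`parse_docx_bytes` is an illustrative name, not part of the package):

    # Minimal sketch of the python-docx parsing used by _extract_docx_content
    from io import BytesIO
    import docx

    def parse_docx_bytes(docx_bytes: bytes) -> str:
        doc = docx.Document(BytesIO(docx_bytes))
        # Join non-empty paragraphs, as the new url.py code does
        return "\n\n".join(p.text for p in doc.paragraphs if p.text)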
--- a/content_core-0.5.1.dist-info/METADATA
+++ b/content_core-0.7.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 0.5.1
+Version: 0.7.0
 Summary: Extract what matters from any media source
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
@@ -9,16 +9,14 @@ Requires-Dist: ai-prompter>=0.2.3
 Requires-Dist: aiohttp>=3.11
 Requires-Dist: bs4>=0.0.2
 Requires-Dist: dicttoxml>=1.7.16
-Requires-Dist: esperanto>=1.2.0
-Requires-Dist: google-genai>=1.10.0
+Requires-Dist: esperanto[openai]>=1.2.0
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: langgraph>=0.3.29
 Requires-Dist: loguru>=0.7.3
-Requires-Dist: openai>=1.73.0
+Requires-Dist: moviepy>=2.1.2
 Requires-Dist: openpyxl>=3.1.5
 Requires-Dist: pandas>=2.2.3
-Requires-Dist: pydub>=0.25.1
 Requires-Dist: pymupdf>=1.25.5
 Requires-Dist: python-docx>=1.1.2
 Requires-Dist: python-dotenv>=1.1.0
@@ -28,7 +26,7 @@ Requires-Dist: validators>=0.34.0
 Requires-Dist: youtube-transcript-api>=1.0.3
 Provides-Extra: docling
 Requires-Dist: asciidoc; extra == 'docling'
-Requires-Dist: docling[ocr]; extra == 'docling'
+Requires-Dist: docling; extra == 'docling'
 Requires-Dist: pandas; extra == 'docling'
 Requires-Dist: pillow; extra == 'docling'
 Description-Content-Type: text/markdown
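Net effect of the metadata changes: pydub, openai, and google-genai are dropped in favor of moviepy and the esperanto[openai] extra, and the docling extra no longer pulls OCR support. A quick way to confirm the declared dependencies of an installed copy, using only the standard library (this check is illustrative, not part of the package):

    # Inspect the installed wheel's declared dependencies
    from importlib.metadata import requires

    deps = requires("content-core") or []
    print([d for d in deps if d.startswith(("moviepy", "pydub", "openai"))])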
--- a/content_core-0.5.1.dist-info/RECORD
+++ b/content_core-0.7.0.dist-info/RECORD
@@ -14,27 +14,24 @@ content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCr
 content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
 content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
-content_core/content/extraction/graph.py,sha256=Sp9XJ6AoLXA_FUFWhmfTMzOC2gkarp1Qg8MsIScLCok,6213
+content_core/content/extraction/graph.py,sha256=IKu-bV3YG2MigHnYixYYhtrQ-4qgGpETerXBEFn73zU,6304
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
-content_core/notebooks/docling.ipynb,sha256=aTad8NORNd-TUMlbX58DURJ4-QCeplTeTT0vUj301m0,631
-content_core/notebooks/run.ipynb,sha256=lV8n1fx_kgIQHBnk1vR6ChBjMS5luAEuDDljsTBNjrQ,369490
-content_core/processors/audio.py,sha256=jDn0_6F5dLcmz_C-iR80uOqOIAz49ELya2R5JeM15vo,3538
+content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
+content_core/processors/audio.py,sha256=KnwxK85X9qRyVziMhFd103kfHkE8qGB1D4yW5lYO90E,5701
 content_core/processors/docling.py,sha256=wQ8ThAcyrCy-c95QtgplQ9UZtjCZTddLD9y1_CrRtSQ,2111
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=yhAnvIlYKc13iZedwA0ck6h6wd2j6T-Q2NAtMen3hIs,6783
+content_core/processors/url.py,sha256=vmkBVfJ1xpZQzlhRdkO64V1J9xdTBr6nrXY4M74QzEo,9094
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
 content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
-content_core/prompts/content/cleanup.jinja,sha256=elyjbm9O_AeOcxkG-kui5wjBIRiOQCicjm92I4NmoVA,693
-content_core/prompts/content/summarize.jinja,sha256=zLPbomfjA-tQZr-c_rOqvKhd55R8NN3Q2gLyLR1sKso,817
 content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-0.5.1.dist-info/METADATA,sha256=mkvdVcLsiBDGiobgswCVQF8Xkceq5VpIRZspniB61PY,10533
-content_core-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-content_core-0.5.1.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
-content_core-0.5.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
-content_core-0.5.1.dist-info/RECORD,,
+content_core-0.7.0.dist-info/METADATA,sha256=CFTVOA8hnMcofSlIlR-RwcCmvD9Hsa6mxFPjisBMKus,10471
+content_core-0.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-0.7.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
+content_core-0.7.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-0.7.0.dist-info/RECORD,,
--- a/content_core/notebooks/docling.ipynb
+++ /dev/null
@@ -1,27 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from docling.document_converter import DocumentConverter\n",
-    "\n",
-    "\n",
-    "source = \"/Users/luisnovo/dev/projetos/content-core/tests/input_content/file.docx\"\n",
-    "source_url = \"https://arxiv.org/pdf/2408.09869\" # PDF path or URL\n",
-    "converter = DocumentConverter()\n",
-    "result = converter.convert(source)\n",
-    "print(result.document.export_to_markdown())"
-   ]
-  }
- ],
- "metadata": {
-  "language_info": {
-   "name": "python"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
--- a/content_core/prompts/content/cleanup.jinja
+++ /dev/null
@@ -1,16 +0,0 @@
-# GOAL
-
-Adjust the content below to make it clean and readable:
-Remove repeated strings that do not add value to the text.
-
-Remove any content unrelated to the text itself (e.g., metadata, artifacts, or extraction errors).
-
-Format the output as unstructured but clear text.
-
-Do not add extra text, introductions, conclusions, or commentary—only rewrite the provided content as it is.
-
-Do not interpret, analyze, or alter the meaning, intent, or narrative of the text—just reformat it for clarity and readability.
-
-Do not change the text structure, do not write conclusions about it. Your only job is to make it readable.
-
-Keep the text in its original language, regardless of what it is.
--- a/content_core/prompts/content/summarize.jinja
+++ /dev/null
@@ -1,25 +0,0 @@
-You are an AI assistant for a personal study platform.
-
-In this platform, your user collects various articles and content from the Internet for reference and study.
-
-Your role is to summarize the selected content as densely as possible, helping the reader extract maximum value from it without reading the full text.
-Focus solely on the content's value, avoiding unnecessary comments or messages.
-
-The summary should be dense, rich in characters, and designed to create a powerful vector representation.
-If the user provided additional context, follow its instructions. Otherwise, summary the whole content.
-
-Do not return any acknowledgments or greetings—only the summary.
-
-CONTENT:
-
-{{ content }}
-
-{% if context %}
-CONTEXT:
-
-User has provided the aditional context for your task:
-{{context}}
-{% endif%}
-
-
-SUMMARY: