content_core-1.10.0-py3-none-any.whl

This diff represents the contents of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
Files changed (44)
  1. content_core/__init__.py +216 -0
  2. content_core/cc_config.yaml +86 -0
  3. content_core/common/__init__.py +38 -0
  4. content_core/common/exceptions.py +70 -0
  5. content_core/common/retry.py +325 -0
  6. content_core/common/state.py +64 -0
  7. content_core/common/types.py +15 -0
  8. content_core/common/utils.py +31 -0
  9. content_core/config.py +575 -0
  10. content_core/content/__init__.py +6 -0
  11. content_core/content/cleanup/__init__.py +5 -0
  12. content_core/content/cleanup/core.py +15 -0
  13. content_core/content/extraction/__init__.py +13 -0
  14. content_core/content/extraction/graph.py +252 -0
  15. content_core/content/identification/__init__.py +9 -0
  16. content_core/content/identification/file_detector.py +505 -0
  17. content_core/content/summary/__init__.py +5 -0
  18. content_core/content/summary/core.py +15 -0
  19. content_core/logging.py +15 -0
  20. content_core/mcp/__init__.py +5 -0
  21. content_core/mcp/server.py +214 -0
  22. content_core/models.py +60 -0
  23. content_core/models_config.yaml +31 -0
  24. content_core/notebooks/run.ipynb +359 -0
  25. content_core/notebooks/urls.ipynb +154 -0
  26. content_core/processors/audio.py +272 -0
  27. content_core/processors/docling.py +79 -0
  28. content_core/processors/office.py +331 -0
  29. content_core/processors/pdf.py +292 -0
  30. content_core/processors/text.py +36 -0
  31. content_core/processors/url.py +324 -0
  32. content_core/processors/video.py +166 -0
  33. content_core/processors/youtube.py +262 -0
  34. content_core/py.typed +2 -0
  35. content_core/templated_message.py +70 -0
  36. content_core/tools/__init__.py +9 -0
  37. content_core/tools/cleanup.py +15 -0
  38. content_core/tools/extract.py +21 -0
  39. content_core/tools/summarize.py +17 -0
  40. content_core-1.10.0.dist-info/METADATA +742 -0
  41. content_core-1.10.0.dist-info/RECORD +44 -0
  42. content_core-1.10.0.dist-info/WHEEL +4 -0
  43. content_core-1.10.0.dist-info/entry_points.txt +5 -0
  44. content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
content_core/processors/url.py
@@ -0,0 +1,324 @@
+import os
+
+import aiohttp
+from bs4 import BeautifulSoup
+from readability import Document
+
+from content_core.common import ProcessSourceState
+from content_core.common.retry import retry_url_api, retry_url_network
+from content_core.config import get_proxy, get_url_engine
+from content_core.logging import logger
+from content_core.processors.docling import DOCLING_SUPPORTED
+from content_core.processors.office import SUPPORTED_OFFICE_TYPES
+from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
+
+
+@retry_url_network()
+async def _fetch_url_mime_type(url: str, proxy: str | None = None) -> str:
+    """Internal function to fetch URL MIME type - wrapped with retry logic."""
+    resolved_proxy = get_proxy(proxy)
+    async with aiohttp.ClientSession() as session:
+        async with session.head(
+            url, timeout=10, allow_redirects=True, proxy=resolved_proxy
+        ) as resp:
+            mime = resp.headers.get("content-type", "").split(";", 1)[0]
+            logger.debug(f"MIME type for {url}: {mime}")
+            return mime
+
+
+async def url_provider(state: ProcessSourceState):
+    """
+    Identify the provider with retry logic for network requests.
+    """
+    return_dict = {}
+    url = state.url
+    if url:
+        if "youtube.com" in url or "youtu.be" in url:
+            return_dict["identified_type"] = "youtube"
+        else:
+            # remote URL: check content-type to catch PDFs
+            try:
+                mime = await _fetch_url_mime_type(url, state.proxy)
+            except Exception as e:
+                logger.warning(f"HEAD check failed for {url} after retries: {e}")
+                mime = "article"
+            if (
+                mime in DOCLING_SUPPORTED
+                or mime in SUPPORTED_FITZ_TYPES
+                or mime in SUPPORTED_OFFICE_TYPES
+            ):
+                logger.debug(f"Identified type for {url}: {mime}")
+                return_dict["identified_type"] = mime
+            else:
+                logger.debug(f"Identified type for {url}: article")
+                return_dict["identified_type"] = "article"
+    return return_dict
+
+
+@retry_url_network()
+async def _fetch_url_html(url: str, proxy: str | None = None) -> str:
+    """Internal function to fetch URL HTML content - wrapped with retry logic."""
+    resolved_proxy = get_proxy(proxy)
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url, timeout=10, proxy=resolved_proxy) as response:
+            # Raise ClientResponseError so retry logic can inspect status code
+            # (5xx and 429 will be retried, 4xx will not)
+            response.raise_for_status()
+            return await response.text()
+
+
+async def extract_url_bs4(url: str, proxy: str | None = None) -> dict:
+    """
+    Get the title and content of a URL using readability with a fallback to BeautifulSoup.
+    Includes retry logic for network failures.
+
+    Args:
+        url (str): The URL of the webpage to extract content from.
+        proxy (str | None): Optional proxy URL to use for this request.
+
+    Returns:
+        dict: A dictionary containing the 'title' and 'content' of the webpage.
+    """
+    try:
+        # Fetch the webpage content with retry
+        html = await _fetch_url_html(url, proxy)
+
+        # Try extracting with readability
+        try:
+            doc = Document(html)
+            title = doc.title() or "No title found"
+            # Extract content as plain text by parsing the cleaned HTML
+            soup = BeautifulSoup(doc.summary(), "lxml")
+            content = soup.get_text(separator=" ", strip=True)
+            if not content.strip():
+                raise ValueError("No content extracted by readability")
+        except Exception as e:
+            logger.debug(f"Readability failed: {e}")
+            # Fallback to BeautifulSoup
+            soup = BeautifulSoup(html, "lxml")
+            # Extract title
+            title_tag = (
+                soup.find("title")
+                or soup.find("h1")
+                or soup.find("meta", property="og:title")
+            )
+            title = (
+                title_tag.get_text(strip=True) if title_tag else "No title found"
+            )
+            # Extract content from common content tags
+            content_tags = soup.select(
+                'article, .content, .post, main, [role="main"], div[class*="content"], div[class*="article"]'
+            )
+            content = (
+                " ".join(
+                    tag.get_text(separator=" ", strip=True) for tag in content_tags
+                )
+                if content_tags
+                else soup.get_text(separator=" ", strip=True)
+            )
+            content = content.strip() or "No content found"
+
+        return {
+            "title": title,
+            "content": content,
+        }
+
+    except Exception as e:
+        logger.error(f"Error processing URL {url} after retries: {e}")
+        return {
+            "title": "Error",
+            "content": f"Failed to extract content: {str(e)}",
+        }
+
+
+@retry_url_api()
+async def _fetch_url_jina(url: str, headers: dict, proxy: str | None = None) -> str:
+    """Internal function to fetch URL content via Jina - wrapped with retry logic."""
+    resolved_proxy = get_proxy(proxy)
+    async with aiohttp.ClientSession() as session:
+        async with session.get(
+            f"https://r.jina.ai/{url}", headers=headers, proxy=resolved_proxy
+        ) as response:
+            # Raise ClientResponseError so retry logic can inspect status code
+            # (5xx and 429 will be retried, 4xx will not)
+            response.raise_for_status()
+            return await response.text()
+
+
+async def extract_url_jina(url: str, proxy: str | None = None) -> dict:
+    """
+    Get the content of a URL using Jina. Uses Bearer token if JINA_API_KEY is set.
+    Includes retry logic for transient API failures.
+
+    Args:
+        url (str): The URL to extract content from.
+        proxy (str | None): Optional proxy URL to use for this request.
+    """
+    headers = {}
+    api_key = os.environ.get("JINA_API_KEY")
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+
+    try:
+        text = await _fetch_url_jina(url, headers, proxy)
+        if text.startswith("Title:") and "\n" in text:
+            title_end = text.index("\n")
+            title = text[6:title_end].strip()
+            content = text[title_end + 1 :].strip()
+            logger.debug(
+                f"Processed url: {url}, found title: {title}, content: {content[:100]}..."
+            )
+            return {"title": title, "content": content}
+        else:
+            logger.debug(
+                f"Processed url: {url}, does not have Title prefix, returning full content: {text[:100]}..."
+            )
+            return {"content": text}
+    except Exception as e:
+        logger.error(f"Jina extraction failed for {url} after retries: {e}")
+        raise
+
+
+@retry_url_api()
+async def _fetch_url_firecrawl(url: str, proxy: str | None = None) -> dict:
+    """Internal function to fetch URL content via Firecrawl - wrapped with retry logic."""
+    from firecrawl import AsyncFirecrawlApp
+
+    # Note: firecrawl-py does not support client-side proxy configuration
+    # Proxy must be configured on the Firecrawl server side
+    resolved_proxy = get_proxy(proxy)
+    if resolved_proxy:
+        logger.warning(
+            "Proxy is configured but Firecrawl does not support client-side proxy. "
+            "Proxy will NOT be used for this request. Configure proxy on Firecrawl server instead."
+        )
+
+    app = AsyncFirecrawlApp(api_key=os.environ.get("FIRECRAWL_API_KEY"))
+    scrape_result = await app.scrape(url, formats=["markdown", "html"])
+    return {
+        "title": scrape_result.metadata.title or "",
+        "content": scrape_result.markdown or "",
+    }
+
+
+async def extract_url_firecrawl(url: str, proxy: str | None = None) -> dict | None:
+    """
+    Get the content of a URL using Firecrawl.
+    Returns {"title": ..., "content": ...} or None on failure.
+    Includes retry logic for transient API failures.
+
+    Note: Firecrawl does not support client-side proxy configuration.
+    """
+    try:
+        return await _fetch_url_firecrawl(url, proxy)
+    except Exception as e:
+        logger.error(f"Firecrawl extraction failed for {url} after retries: {e}")
+        return None
+
+
+@retry_url_api()
+async def _fetch_url_crawl4ai(url: str, proxy: str | None = None) -> dict:
+    """Internal function to fetch URL content via Crawl4AI - wrapped with retry logic."""
+    try:
+        from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, ProxyConfig
+    except ImportError:
+        raise ImportError(
+            "Crawl4AI is not installed. Install it with: pip install content-core[crawl4ai]"
+        )
+
+    resolved_proxy = get_proxy(proxy)
+
+    # Configure proxy if available
+    run_config = None
+    if resolved_proxy:
+        try:
+            run_config = CrawlerRunConfig(
+                proxy_config=ProxyConfig.from_string(resolved_proxy)
+            )
+            logger.debug(f"Crawl4AI using proxy: {resolved_proxy}")
+        except Exception as e:
+            logger.warning(f"Failed to configure proxy for Crawl4AI: {e}")
+
+    async with AsyncWebCrawler() as crawler:
+        if run_config:
+            result = await crawler.arun(url=url, config=run_config)
+        else:
+            result = await crawler.arun(url=url)
+
+        # Extract title from metadata if available
+        title = ""
+        if hasattr(result, "metadata") and result.metadata:
+            title = result.metadata.get("title", "")
+
+        # Get markdown content
+        content = result.markdown if hasattr(result, "markdown") else ""
+
+        return {
+            "title": title or "No title found",
+            "content": content,
+        }
+
+
+async def extract_url_crawl4ai(url: str, proxy: str | None = None) -> dict | None:
+    """
+    Get the content of a URL using Crawl4AI (local browser automation).
+    Returns {"title": ..., "content": ...} or None on failure.
+    Includes retry logic for transient failures.
+
+    Args:
+        url (str): The URL to extract content from.
+        proxy (str | None): Optional proxy URL to use for this request.
+    """
+    try:
+        return await _fetch_url_crawl4ai(url, proxy)
+    except Exception:
+        return None
+
+
+async def extract_url(state: ProcessSourceState):
+    """
+    Extract content from a URL using the url_engine specified in the state.
+    Supported engines: 'auto', 'simple', 'firecrawl', 'jina', 'crawl4ai'.
+
+    Proxy configuration is passed through state.proxy and resolved using get_proxy().
+    """
+    assert state.url, "No URL provided"
+    url = state.url
+    proxy = state.proxy
+    # Use environment-aware engine selection
+    engine = state.url_engine or get_url_engine()
+    try:
+        if engine == "auto":
+            if os.environ.get("FIRECRAWL_API_KEY"):
+                logger.debug(
+                    "Engine 'auto' selected: using Firecrawl (FIRECRAWL_API_KEY detected)"
+                )
+                return await extract_url_firecrawl(url, proxy)
+            else:
+                try:
+                    logger.debug("Trying to use Jina to extract URL")
+                    return await extract_url_jina(url, proxy)
+                except Exception as e:
+                    logger.error(f"Jina extraction error for URL: {url}: {e}")
+                    # Try Crawl4AI before falling back to BeautifulSoup
+                    logger.debug("Trying to use Crawl4AI to extract URL")
+                    result = await extract_url_crawl4ai(url, proxy)
+                    if result is not None:
+                        return result
+                    logger.debug(
+                        "Crawl4AI failed or not installed, falling back to BeautifulSoup"
+                    )
+                    return await extract_url_bs4(url, proxy)
+        elif engine == "simple":
+            return await extract_url_bs4(url, proxy)
+        elif engine == "firecrawl":
+            return await extract_url_firecrawl(url, proxy)
+        elif engine == "jina":
+            return await extract_url_jina(url, proxy)
+        elif engine == "crawl4ai":
+            return await extract_url_crawl4ai(url, proxy)
+        else:
+            raise ValueError(f"Unknown engine: {engine}")
+    except Exception as e:
+        logger.error(f"URL extraction failed for URL: {url}")
+        logger.exception(e)
+        return None
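
For orientation, here is a minimal usage sketch for the module above (not part of the package diff). It drives extract_url() with the 'simple' engine, which requires no API keys. It assumes ProcessSourceState can be constructed with url and url_engine keyword arguments; that matches the attributes read in extract_url() but the model definition itself is not shown in this hunk.

# Usage sketch, not part of the diff. Assumes ProcessSourceState accepts
# url/url_engine keyword arguments (the model lives in content_core/common/state.py).
import asyncio

from content_core.common import ProcessSourceState
from content_core.processors.url import extract_url

async def main():
    state = ProcessSourceState(url="https://example.com/post", url_engine="simple")
    result = await extract_url(state)  # {"title": ..., "content": ...} or None on failure
    if result:
        print(result["title"])

asyncio.run(main())

With engine 'auto', the same call would prefer Firecrawl when FIRECRAWL_API_KEY is set, then Jina, then Crawl4AI, then the BeautifulSoup fallback, as implemented in extract_url() above.
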
content_core/processors/video.py
@@ -0,0 +1,166 @@
+import asyncio
+import json
+import os
+import subprocess
+from functools import partial
+
+from content_core.common import ProcessSourceState
+from content_core.logging import logger
+
+
+async def extract_audio_from_video(input_file, output_file, stream_index):
+    """
+    Extract the specified audio stream to MP3 format asynchronously
+    """
+
+    def _extract(input_file, output_file, stream_index):
+        try:
+            cmd = [
+                "ffmpeg",
+                "-i",
+                input_file,
+                "-map",
+                f"0:a:{stream_index}",  # Select specific audio stream
+                "-codec:a",
+                "libmp3lame",  # Use MP3 codec
+                "-q:a",
+                "2",  # High quality setting
+                "-y",  # Overwrite output file if exists
+                output_file,
+            ]
+
+            result = subprocess.run(cmd, capture_output=True, text=True)
+            if result.returncode != 0:
+                raise Exception(f"FFmpeg failed: {result.stderr}")
+
+            return True
+
+        except Exception as e:
+            logger.error(f"Error extracting audio: {str(e)}")
+            return False
+
+    return await asyncio.get_event_loop().run_in_executor(
+        None, partial(_extract, input_file, output_file, stream_index)
+    )
+
+
+async def get_audio_streams(input_file):
+    """
+    Analyze video file and return information about all audio streams asynchronously
+    """
+
+    def _analyze(input_file):
+        logger.debug(f"Analyzing video file {input_file} for audio streams")
+        try:
+            cmd = [
+                "ffprobe",
+                "-v",
+                "quiet",
+                "-print_format",
+                "json",
+                "-show_streams",
+                "-select_streams",
+                "a",
+                input_file,
+            ]
+
+            result = subprocess.run(cmd, capture_output=True, text=True)
+            if result.returncode != 0:
+                raise Exception(f"FFprobe failed: {result.stderr}")
+
+            data = json.loads(result.stdout)
+            logger.debug(data)
+            return data.get("streams", [])
+        except Exception as e:
+            logger.error(f"Error analyzing file: {str(e)}")
+            return []
+
+    return await asyncio.get_event_loop().run_in_executor(
+        None, partial(_analyze, input_file)
+    )
+
+
+async def select_best_audio_stream(streams):
+    """
+    Select the best audio stream based on various quality metrics
+    """
+
+    def _select(streams):
+        if not streams:
+            logger.debug("No audio streams found")
+            return None
+        else:
+            logger.debug(f"Found {len(streams)} audio streams")
+
+        # Score each stream based on various factors
+        scored_streams = []
+        for stream in streams:
+            score = 0
+
+            # Prefer higher bit rates
+            bit_rate = stream.get("bit_rate")
+            if bit_rate:
+                score += int(int(bit_rate) / 1000000)  # Convert to Mbps and ensure int
+
+            # Prefer more channels (stereo over mono)
+            channels = stream.get("channels", 0)
+            score += channels * 10
+
+            # Prefer higher sample rates
+            sample_rate = stream.get("sample_rate", "0")
+            score += int(int(sample_rate) / 48000)
+
+            scored_streams.append((score, stream))
+
+        # Return the stream with highest score
+        return max(scored_streams, key=lambda x: x[0])[1]
+
+    return await asyncio.get_event_loop().run_in_executor(
+        None, partial(_select, streams)
+    )
+
+
+async def extract_best_audio_from_video(data: ProcessSourceState):
+    """
+    Main function to extract the best audio stream from a video file asynchronously
+    """
+    input_file = data.file_path
+    assert input_file is not None, "Input file path must be provided"
+
+    def _check_file(path):
+        return os.path.exists(path)
+
+    file_exists = await asyncio.get_event_loop().run_in_executor(
+        None, partial(_check_file, input_file)
+    )
+
+    if not file_exists:
+        logger.critical(f"Input file not found: {input_file}")
+        return False
+
+    base_name = os.path.splitext(input_file)[0]
+    output_file = f"{base_name}_audio.mp3"
+
+    # Get all audio streams
+    streams = await get_audio_streams(input_file)
+    if not streams:
+        logger.debug("No audio streams found in the file")
+        return False
+
+    # Select best stream
+    best_stream = await select_best_audio_stream(streams)
+    if not best_stream:
+        logger.error("Could not determine best audio stream")
+        return False
+
+    # Extract the selected stream
+    stream_index = streams.index(best_stream)
+    success = await extract_audio_from_video(input_file, output_file, stream_index)
+
+    if success:
+        logger.debug(f"Successfully extracted audio to: {output_file}")
+        logger.debug(f"- Channels: {best_stream.get('channels', 'unknown')}")
+        logger.debug(f"- Sample rate: {best_stream.get('sample_rate', 'unknown')} Hz")
+        logger.debug(f"- Bit rate: {best_stream.get('bit_rate', 'unknown')} bits/s")
+
+    return {"file_path": output_file, "identified_type": "audio/mp3"}