content-core 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of content-core might be problematic.

content_core/cc_config.yaml CHANGED
@@ -34,6 +34,10 @@ extraction:
34
34
  url_engine: auto # auto | simple | firecrawl | jina | docling - for URLs
35
35
  docling:
36
36
  output_format: markdown # markdown | html | json
37
+ pymupdf:
38
+ enable_formula_ocr: false # Enable OCR for formula-heavy pages (requires Tesseract)
39
+ formula_threshold: 3 # Minimum formulas per page to trigger OCR
40
+ ocr_fallback: true # Gracefully fallback to standard extraction if OCR fails
37
41
 
38
42
  youtube_transcripts:
39
43
  preferred_languages: ["en", "es", "pt"]
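The new `pymupdf` block is read at extraction time. A minimal sketch of how these values surface in Python, mirroring the lookup the PDF processor performs later in this diff (the defaults shown are the shipped ones):

```python
from content_core.config import CONFIG

# Read the new pymupdf settings with the same defaults the processor uses.
pymupdf_cfg = CONFIG.get("extraction", {}).get("pymupdf", {})
enable_ocr = pymupdf_cfg.get("enable_formula_ocr", False)
formula_threshold = pymupdf_cfg.get("formula_threshold", 3)
ocr_fallback = pymupdf_cfg.get("ocr_fallback", True)
```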
content_core/config.py CHANGED
@@ -14,8 +14,8 @@ def load_config():
14
14
  with open(config_path, "r") as file:
15
15
  return yaml.safe_load(file)
16
16
  except Exception as e:
17
- print(f"Erro ao carregar o arquivo de configuração de {config_path}: {e}")
18
- print("Usando configurações padrão internas.")
17
+ print(f"Error loading configuration file from {config_path}: {e}")
18
+ print("Using internal default settings.")
19
19
 
20
20
  default_config_data = pkgutil.get_data("content_core", "models_config.yaml")
21
21
  if default_config_data:
@@ -47,3 +47,21 @@ def set_docling_output_format(fmt: str):
47
47
  extraction = CONFIG.setdefault("extraction", {})
48
48
  docling_cfg = extraction.setdefault("docling", {})
49
49
  docling_cfg["output_format"] = fmt
50
+
51
+ def set_pymupdf_ocr_enabled(enabled: bool):
52
+ """Enable or disable PyMuPDF OCR for formula-heavy pages."""
53
+ extraction = CONFIG.setdefault("extraction", {})
54
+ pymupdf_cfg = extraction.setdefault("pymupdf", {})
55
+ pymupdf_cfg["enable_formula_ocr"] = enabled
56
+
57
+ def set_pymupdf_formula_threshold(threshold: int):
58
+ """Set the minimum number of formulas per page to trigger OCR."""
59
+ extraction = CONFIG.setdefault("extraction", {})
60
+ pymupdf_cfg = extraction.setdefault("pymupdf", {})
61
+ pymupdf_cfg["formula_threshold"] = threshold
62
+
63
+ def set_pymupdf_ocr_fallback(enabled: bool):
64
+ """Enable or disable fallback to standard extraction when OCR fails."""
65
+ extraction = CONFIG.setdefault("extraction", {})
66
+ pymupdf_cfg = extraction.setdefault("pymupdf", {})
67
+ pymupdf_cfg["ocr_fallback"] = enabled
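Taken together, the new setters allow runtime control of the OCR behaviour. A quick usage sketch (the values below are illustrative only):

```python
from content_core.config import (
    set_pymupdf_ocr_enabled,
    set_pymupdf_formula_threshold,
    set_pymupdf_ocr_fallback,
)

set_pymupdf_ocr_enabled(True)       # turn on OCR for formula-heavy pages
set_pymupdf_formula_threshold(5)    # require at least 5 formula placeholders on a page
set_pymupdf_ocr_fallback(True)      # keep falling back to standard extraction if OCR fails
```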
content_core/mcp/server.py CHANGED
@@ -30,6 +30,7 @@ def suppress_stdout():
30
30
  finally:
31
31
  sys.stdout = original_stdout
32
32
 
33
+
33
34
  # Add parent directory to path to import content_core
34
35
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
35
36
 
@@ -38,38 +39,40 @@ import content_core as cc
38
39
  # Initialize MCP server
39
40
  mcp = FastMCP("Content Core MCP Server")
40
41
 
42
+
41
43
  async def _extract_content_impl(
42
- url: Optional[str] = None,
43
- file_path: Optional[str] = None
44
+ url: Optional[str] = None, file_path: Optional[str] = None
44
45
  ) -> Dict[str, Any]:
45
46
  """
46
- Extract content from a URL or file using Content Core's auto engine.
47
-
47
  + Extract content from a URL or file using Content Core's auto engine. This is useful for processing YouTube transcripts, website content, PDFs, EPUB, and Office files. You can also use it to extract transcripts from audio or video files.
48
+
48
49
  Args:
49
50
  url: Optional URL to extract content from
50
51
  file_path: Optional file path to extract content from
51
-
52
+
52
53
  Returns:
53
54
  JSON object containing extracted content and metadata
54
-
55
+
55
56
  Raises:
56
57
  ValueError: If neither or both url and file_path are provided
57
58
  """
58
59
  # Validate input - exactly one must be provided
59
- if (url is None and file_path is None) or (url is not None and file_path is not None):
60
+ if (url is None and file_path is None) or (
61
+ url is not None and file_path is not None
62
+ ):
60
63
  return {
61
64
  "success": False,
62
65
  "error": "Exactly one of 'url' or 'file_path' must be provided",
63
66
  "source_type": None,
64
67
  "source": None,
65
68
  "content": None,
66
- "metadata": None
69
+ "metadata": None,
67
70
  }
68
-
71
+
69
72
  # Determine source type and validate
70
73
  source_type = "url" if url else "file"
71
74
  source = url if url else file_path
72
-
75
+
73
76
  # Additional validation for file paths
74
77
  if file_path:
75
78
  path = Path(file_path)
@@ -80,9 +83,9 @@ async def _extract_content_impl(
80
83
  "source_type": source_type,
81
84
  "source": source,
82
85
  "content": None,
83
- "metadata": None
86
+ "metadata": None,
84
87
  }
85
-
88
+
86
89
  # Security check - ensure no directory traversal
87
90
  try:
88
91
  # Resolve to absolute path and ensure it's not trying to access sensitive areas
@@ -95,30 +98,30 @@ async def _extract_content_impl(
95
98
  "source_type": source_type,
96
99
  "source": source,
97
100
  "content": None,
98
- "metadata": None
101
+ "metadata": None,
99
102
  }
100
-
103
+
101
104
  # Build extraction request
102
105
  extraction_request = {}
103
106
  if url:
104
107
  extraction_request["url"] = url
105
108
  else:
106
109
  extraction_request["file_path"] = str(Path(file_path).resolve())
107
-
110
+
108
111
  # Track start time
109
112
  start_time = datetime.utcnow()
110
-
113
+
111
114
  try:
112
115
  # Use Content Core's extract_content with auto engine
113
116
  logger.info(f"Extracting content from {source_type}: {source}")
114
-
117
+
115
118
  # Suppress stdout to prevent MoviePy and other libraries from interfering with MCP protocol
116
119
  with suppress_stdout():
117
120
  result = await cc.extract_content(extraction_request)
118
-
121
+
119
122
  # Calculate extraction time
120
123
  extraction_time = (datetime.utcnow() - start_time).total_seconds()
121
-
124
+
122
125
  # Build response - result is a ProcessSourceOutput object
123
126
  response = {
124
127
  "success": True,
@@ -132,13 +135,13 @@ async def _extract_content_impl(
132
135
  "content_length": len(result.content or ""),
133
136
  "identified_type": result.identified_type or "unknown",
134
137
  "identified_provider": result.identified_provider or "",
135
- }
138
+ },
136
139
  }
137
-
140
+
138
141
  # Add metadata from the result
139
142
  if result.metadata:
140
143
  response["metadata"].update(result.metadata)
141
-
144
+
142
145
  # Add specific metadata based on source type
143
146
  if source_type == "url":
144
147
  if result.title:
@@ -152,10 +155,10 @@ async def _extract_content_impl(
152
155
  response["metadata"]["file_path"] = result.file_path
153
156
  response["metadata"]["file_size"] = Path(file_path).stat().st_size
154
157
  response["metadata"]["file_extension"] = Path(file_path).suffix
155
-
158
+
156
159
  logger.info(f"Successfully extracted content from {source_type}: {source}")
157
160
  return response
158
-
161
+
159
162
  except Exception as e:
160
163
  logger.error(f"Error extracting content from {source_type} {source}: {str(e)}")
161
164
  return {
@@ -166,26 +169,25 @@ async def _extract_content_impl(
166
169
  "content": None,
167
170
  "metadata": {
168
171
  "extraction_timestamp": start_time.isoformat() + "Z",
169
- "error_type": type(e).__name__
170
- }
172
+ "error_type": type(e).__name__,
173
+ },
171
174
  }
172
175
 
173
176
 
174
177
  @mcp.tool
175
178
  async def extract_content(
176
- url: Optional[str] = None,
177
- file_path: Optional[str] = None
179
+ url: Optional[str] = None, file_path: Optional[str] = None
178
180
  ) -> Dict[str, Any]:
179
181
  """
180
182
  Extract content from a URL or file using Content Core's auto engine.
181
-
183
+
182
184
  Args:
183
185
  url: Optional URL to extract content from
184
186
  file_path: Optional file path to extract content from
185
-
187
+
186
188
  Returns:
187
189
  JSON object containing extracted content and metadata
188
-
190
+
189
191
  Raises:
190
192
  ValueError: If neither or both url and file_path are provided
191
193
  """
@@ -197,15 +199,16 @@ def main():
197
199
  # Additional MoviePy configuration to suppress all output
198
200
  try:
199
201
  import moviepy.config as mp_config
202
+
200
203
  mp_config.check_and_download_cmd("ffmpeg") # Pre-download to avoid logs later
201
204
  except Exception:
202
205
  pass # Ignore if MoviePy isn't available or configured
203
-
206
+
204
207
  logger.info("Starting Content Core MCP Server")
205
-
208
+
206
209
  # Run with STDIO transport for MCP compatibility
207
210
  mcp.run()
208
211
 
209
212
 
210
213
  if __name__ == "__main__":
211
- main()
214
+ main()
content_core/notebooks/urls.ipynb ADDED
@@ -0,0 +1,154 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "873a872b",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from content_core.content.extraction import extract_content\n",
11
+ "\n",
12
+ "async def process_url(url):\n",
13
+ " print(\"Processing: \", url)\n",
14
+ " print(\"Simple: -------\")\n",
15
+ " result = await extract_content(dict(url=url, engine=\"simple\"))\n",
16
+ " print(result.title[:100])\n",
17
+ " print(result.content[:100])\n",
18
+ " print(\"Jina: -------\")\n",
19
+ " result = await extract_content(dict(url=url, engine=\"jina\"))\n",
20
+ " print(result.title[:100])\n",
21
+ " print(result.content[:100])\n",
22
+ " print(\"Firecrawl: -------\")\n",
23
+ " result = await extract_content(dict(url=url, engine=\"firecrawl\"))\n",
24
+ " print(result.title[:100])\n",
25
+ " print(result.content[:100])\n",
26
+ " print(\"=============================\")"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 4,
32
+ "id": "263dc3af",
33
+ "metadata": {},
34
+ "outputs": [
35
+ {
36
+ "name": "stdout",
37
+ "output_type": "stream",
38
+ "text": [
39
+ "Processing: https://www.supernovalabs.com.br/\n",
40
+ "Simple: -------\n",
41
+ "Readability failed: No content extracted by readability\n",
42
+ "Supernova Labs | AI Consulting\n",
43
+ "Supernova Labs | AI Consulting\n",
44
+ "Jina: -------\n",
45
+ "Supernova Labs | Elite AI Consulting to help you build the Future\n",
46
+ "URL Source: https://www.supernovalabs.com.br/\n",
47
+ "\n",
48
+ "Markdown Content:\n",
49
+ "Supernova Labs\n",
50
+ "\n",
51
+ "[About](https://www\n",
52
+ "Firecrawl: -------\n",
53
+ "Supernova Labs | AI Consulting\n",
54
+ "# Unleash Your AI Edge. Fast.\n",
55
+ "\n",
56
+ "We turn your data, tech and capabilities into impact with lean AI sol\n",
57
+ "=============================\n",
58
+ "None\n",
59
+ "Processing: https://building.nubank.com/fine-tuning-transaction-user-models/\n",
60
+ "Simple: -------\n",
61
+ "Fine-Tuning Transaction User Models - Building Nubank\n",
62
+ "Fine-Tuning Transaction User Models Learn how we combine transaction embeddings with tabular data us\n",
63
+ "Jina: -------\n",
64
+ "Fine-Tuning Transaction User Models - Building Nubank\n",
65
+ "URL Source: https://building.nubank.com/fine-tuning-transaction-user-models/\n",
66
+ "\n",
67
+ "Published Time: 2025-0\n",
68
+ "Firecrawl: -------\n",
69
+ "Fine-Tuning Transaction User Models - Building Nubank\n",
70
+ "# Fine-Tuning Transaction User Models\n",
71
+ "\n",
72
+ "Learn how we combine transaction embeddings with tabular data\n",
73
+ "=============================\n",
74
+ "None\n",
75
+ "Processing: https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quitting-your-job-5296bbcb703b\n",
76
+ "Simple: -------\n",
77
+ "You Can Make Money With AI Without Quitting Your Job | by Nipuna Maduranga | LearnAIforproft.com | M\n",
78
+ "Most people think they need to quit their job to build a new life. I thought that too. You scroll th\n",
79
+ "Jina: -------\n",
80
+ "You Can Make Money With AI Without Quitting Your Job\n",
81
+ "URL Source: https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quittin\n",
82
+ "Firecrawl: -------\n",
83
+ "You Can Make Money With AI Without Quitting Your Job | by Nipuna Maduranga | LearnAIforproft.com | M\n",
84
+ "[Sitemap](https://medium.com/sitemap/sitemap.xml)\n",
85
+ "\n",
86
+ "[Open in app](https://rsci.app.link/?%24canonical\n",
87
+ "=============================\n",
88
+ "None\n",
89
+ "Processing: https://github.com/mirkonasato/pyodconverter\n",
90
+ "Simple: -------\n",
91
+ "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
92
+ "This repository was archived by the owner on Dec 1, 2023. It is now read-only. mirkonasato/pyodconve\n",
93
+ "Jina: -------\n",
94
+ "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
95
+ "URL Source: https://github.com/mirkonasato/pyodconverter\n",
96
+ "\n",
97
+ "Markdown Content:\n",
98
+ "GitHub - mirkonasato/pyo\n",
99
+ "Firecrawl: -------\n",
100
+ "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
101
+ "[Skip to content](https://github.com/mirkonasato/pyodconverter#start-of-content)\n",
102
+ "\n",
103
+ "You signed in with\n",
104
+ "=============================\n",
105
+ "None\n",
106
+ "Processing: https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR\n",
107
+ "Simple: -------\n",
108
+ "Error processing URL https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR: HTTP error: 500\n",
109
+ "Error\n",
110
+ "Failed to extract content: HTTP error: 500\n",
111
+ "Jina: -------\n",
112
+ "Ultra-aprendizado: domine habilidades valiosas, seja mais esperto que a competição e dê um impulso n\n",
113
+ "URL Source: https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-ca\n",
114
+ "Firecrawl: -------\n",
115
+ "Amazon.com.br\n",
116
+ "#### Digite os caracteres que você vê abaixo\n",
117
+ "\n",
118
+ "Desculpe pelo inconveniente. Para continuar realizando\n",
119
+ "=============================\n",
120
+ "None\n"
121
+ ]
122
+ }
123
+ ],
124
+ "source": [
125
+ "\n",
126
+ "urls= [\"https://www.supernovalabs.com.br/\", \"https://building.nubank.com/fine-tuning-transaction-user-models/\", \"https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quitting-your-job-5296bbcb703b\", \"https://github.com/mirkonasato/pyodconverter\", \"https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR\"]\n",
127
+ "for url in urls:\n",
128
+ " result = await process_url(url=url)\n",
129
+ " print(result)"
130
+ ]
131
+ }
132
+ ],
133
+ "metadata": {
134
+ "kernelspec": {
135
+ "display_name": ".venv",
136
+ "language": "python",
137
+ "name": "python3"
138
+ },
139
+ "language_info": {
140
+ "codemirror_mode": {
141
+ "name": "ipython",
142
+ "version": 3
143
+ },
144
+ "file_extension": ".py",
145
+ "mimetype": "text/x-python",
146
+ "name": "python",
147
+ "nbconvert_exporter": "python",
148
+ "pygments_lexer": "ipython3",
149
+ "version": "3.10.6"
150
+ }
151
+ },
152
+ "nbformat": 4,
153
+ "nbformat_minor": 5
154
+ }
content_core/processors/pdf.py CHANGED
@@ -5,20 +5,90 @@ import unicodedata
5
5
  import fitz # type: ignore
6
6
 
7
7
  from content_core.common import ProcessSourceState
8
+ from content_core.config import CONFIG
8
9
  from content_core.logging import logger
9
10
 
10
- # todo: find tables - https://pymupdf.readthedocs.io/en/latest/the-basics.html#extracting-tables-from-a-page
11
- # todo: what else can we do to make the text more readable?
12
- # todo: try to fix encoding for some PDF that is still breaking
13
- # def _extract_text_from_pdf(pdf_path):
14
- # doc = fitz.open(pdf_path)
15
- # text = ""
16
- # logger.debug(f"Found {len(doc)} pages in PDF")
17
- # for page in doc:
18
- # # Use encode/decode if you need to clean up any encoding issues
19
- # text += page.get_text().encode('utf-8').decode('utf-8')
20
- # doc.close()
21
- # return text
11
+ def count_formula_placeholders(text):
12
+ """
13
+ Count the number of formula placeholders in extracted text.
14
+
15
+ Args:
16
+ text (str): Extracted text content
17
+ Returns:
18
+ int: Number of formula placeholders found
19
+ """
20
+ if not text:
21
+ return 0
22
+ return text.count('<!-- formula-not-decoded -->')
23
+
24
+
25
+ def extract_page_with_ocr(page, page_num):
26
+ """
27
+ Extract text from a page using OCR (Tesseract).
28
+
29
+ Args:
30
+ page: PyMuPDF page object
31
+ page_num (int): Page number for logging
32
+ Returns:
33
+ str: OCR-extracted text or None if OCR fails
34
+ """
35
+ try:
36
+ logger.debug(f"Attempting OCR extraction for page {page_num}")
37
+ # Create TextPage using OCR
38
+ textpage = page.get_textpage_ocr()
39
+ if textpage:
40
+ # Extract text from the OCR TextPage
41
+ ocr_text = textpage.extractText()
42
+ logger.debug(f"OCR successful for page {page_num}, extracted {len(ocr_text)} characters")
43
+ return ocr_text
44
+ else:
45
+ logger.warning(f"OCR TextPage creation failed for page {page_num}")
46
+ return None
47
+ except (ImportError, RuntimeError, OSError) as e:
48
+ # Common errors: Tesseract not installed, OCR failure, file access issues
49
+ logger.debug(f"OCR extraction failed for page {page_num}: {e}")
50
+ return None
51
+ except Exception as e:
52
+ # Unexpected errors - log as warning for debugging
53
+ logger.warning(f"Unexpected error during OCR extraction for page {page_num}: {e}")
54
+ return None
55
+
56
+
57
+ def convert_table_to_markdown(table):
58
+ """
59
+ Convert a PyMuPDF table to markdown format.
60
+
61
+ Args:
62
+ table: Table data from PyMuPDF (list of lists)
63
+ Returns:
64
+ str: Markdown-formatted table
65
+ """
66
+ if not table or not table[0]:
67
+ return ""
68
+
69
+ # Build markdown table
70
+ markdown_lines = []
71
+
72
+ # Header row
73
+ header = table[0]
74
+ header_row = "| " + " | ".join(str(cell) if cell else "" for cell in header) + " |"
75
+ markdown_lines.append(header_row)
76
+
77
+ # Separator row
78
+ separator = "|" + "|".join([" --- " for _ in header]) + "|"
79
+ markdown_lines.append(separator)
80
+
81
+ # Data rows
82
+ for row in table[1:]:
83
+ if row: # Skip empty rows
84
+ row_text = "| " + " | ".join(str(cell) if cell else "" for cell in row) + " |"
85
+ markdown_lines.append(row_text)
86
+
87
+ return "\n".join(markdown_lines) + "\n"
88
+
89
+ # Configuration constants
90
+ DEFAULT_FORMULA_THRESHOLD = 3
91
+ DEFAULT_OCR_FALLBACK = True
22
92
 
23
93
  SUPPORTED_FITZ_TYPES = [
24
94
  "application/pdf",
@@ -116,30 +186,84 @@ def clean_pdf_text(text):
116
186
  return text.strip()
117
187
 
118
188
 
119
- async def _extract_text_from_pdf(pdf_path):
120
- doc = fitz.open(pdf_path)
121
- try:
122
- text = ""
123
- logger.debug(f"Found {len(doc)} pages in PDF")
124
- for page in doc:
125
- text += page.get_text()
126
- normalized_text = clean_pdf_text(text)
127
- return normalized_text
128
- finally:
129
- doc.close()
130
189
 
131
190
 
132
191
  async def _extract_text_from_pdf(pdf_path):
133
- """Extract text from PDF asynchronously"""
192
+ """Extract text from PDF asynchronously with table detection"""
134
193
 
135
194
  def _extract():
136
195
  doc = fitz.open(pdf_path)
137
196
  try:
138
- text = ""
197
+ full_text = []
139
198
  logger.debug(f"Found {len(doc)} pages in PDF")
140
- for page in doc:
141
- text += page.get_text()
142
- return clean_pdf_text(text)
199
+
200
+ # Use quality improvement flags for better text extraction
201
+ extraction_flags = (
202
+ fitz.TEXT_PRESERVE_LIGATURES | # Better character rendering
203
+ fitz.TEXT_PRESERVE_WHITESPACE | # Better spacing preservation
204
+ fitz.TEXT_PRESERVE_IMAGES # Better image-text integration
205
+ )
206
+
207
+ # Get OCR configuration
208
+ ocr_config = CONFIG.get('extraction', {}).get('pymupdf', {})
209
+ enable_ocr = ocr_config.get('enable_formula_ocr', False)
210
+ formula_threshold = ocr_config.get('formula_threshold', DEFAULT_FORMULA_THRESHOLD)
211
+ ocr_fallback = ocr_config.get('ocr_fallback', DEFAULT_OCR_FALLBACK)
212
+
213
+ for page_num, page in enumerate(doc):
214
+ # Extract regular text with quality flags
215
+ standard_text = page.get_text(flags=extraction_flags)
216
+
217
+ # Check if we should try OCR for this page
218
+ formula_count = count_formula_placeholders(standard_text)
219
+ use_ocr = (enable_ocr and
220
+ formula_count >= formula_threshold and
221
+ formula_count > 0)
222
+
223
+ if use_ocr:
224
+ logger.debug(f"Page {page_num + 1} has {formula_count} formulas, attempting OCR")
225
+ ocr_text = extract_page_with_ocr(page, page_num + 1)
226
+
227
+ if ocr_text and ocr_fallback:
228
+ # Use OCR text but preserve table extraction from standard text
229
+ page_text = ocr_text
230
+ logger.debug(f"Using OCR text for page {page_num + 1}")
231
+ else:
232
+ # OCR failed, use standard text
233
+ page_text = standard_text
234
+ if not ocr_text:
235
+ logger.debug(f"OCR failed for page {page_num + 1}, using standard extraction")
236
+ else:
237
+ page_text = standard_text
238
+
239
+ # Try to find and extract tables (regardless of OCR)
240
+ try:
241
+ tables = page.find_tables()
242
+ if tables:
243
+ logger.debug(f"Found {len(tables)} table(s) on page {page_num + 1}")
244
+
245
+ # For each table found, convert to markdown and append
246
+ for table_num, table in enumerate(tables):
247
+ # Extract table data
248
+ table_data = table.extract()
249
+ # Validate table has actual content (not just empty rows/cells)
250
+ if table_data and len(table_data) > 0 and any(
251
+ any(str(cell).strip() for cell in row if cell) for row in table_data if row
252
+ ):
253
+ # Add a marker before the table
254
+ page_text += f"\n\n[Table {table_num + 1} from page {page_num + 1}]\n"
255
+ # Convert to markdown
256
+ markdown_table = convert_table_to_markdown(table_data)
257
+ page_text += markdown_table + "\n"
258
+ except Exception as e:
259
+ # If table extraction fails, continue with regular text
260
+ logger.debug(f"Table extraction failed on page {page_num + 1}: {e}")
261
+
262
+ full_text.append(page_text)
263
+
264
+ # Join all pages and clean
265
+ combined_text = "".join(full_text)
266
+ return clean_pdf_text(combined_text)
143
267
  finally:
144
268
  doc.close()
145
269
 
content_core-1.1.0.dist-info/METADATA → content_core-1.2.0.dist-info/METADATA
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.1.0
4
- Summary: Extract what matters from any media source
3
+ Version: 1.2.0
4
+ Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
7
7
  Requires-Python: >=3.10
@@ -10,7 +10,6 @@ Requires-Dist: aiohttp>=3.11
10
10
  Requires-Dist: asciidoc>=10.2.1
11
11
  Requires-Dist: bs4>=0.0.2
12
12
  Requires-Dist: dicttoxml>=1.7.16
13
- Requires-Dist: docling>=2.34.0
14
13
  Requires-Dist: esperanto>=1.2.0
15
14
  Requires-Dist: firecrawl-py>=2.7.0
16
15
  Requires-Dist: jinja2>=3.1.6
@@ -31,6 +30,8 @@ Requires-Dist: pytubefix>=9.1.1
31
30
  Requires-Dist: readability-lxml>=0.8.4.1
32
31
  Requires-Dist: validators>=0.34.0
33
32
  Requires-Dist: youtube-transcript-api>=1.0.3
33
+ Provides-Extra: docling
34
+ Requires-Dist: docling>=2.34.0; extra == 'docling'
34
35
  Provides-Extra: mcp
35
36
  Requires-Dist: fastmcp>=0.5.0; extra == 'mcp'
36
37
  Description-Content-Type: text/markdown
@@ -39,28 +40,70 @@ Description-Content-Type: text/markdown
39
40
 
40
41
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
41
42
 
42
- **Content Core** is a versatile Python library designed to extract and process content from various sources, providing a unified interface for handling text, web pages, and local files.
43
+ **Content Core** is a powerful, AI-powered content extraction and processing platform that transforms any source into clean, structured content. Extract text from websites, transcribe videos, process documents, and generate AI summaries—all through a unified interface with multiple integration options.
43
44
 
44
- ## Overview
45
+ ## 🚀 What You Can Do
45
46
 
46
- > **Note:** As of v0.8, the default extraction engine is `'auto'`. Content Core will automatically select the best extraction method based on your environment and available API keys, with a smart fallback order for both URLs and files. For files/documents, `'auto'` now tries Docling first, then falls back to simple extraction. You can override the engine if needed, but `'auto'` is recommended for most users.
47
+ **Extract content from anywhere:**
48
+ - 📄 **Documents** - PDF, Word, PowerPoint, Excel, Markdown, HTML, EPUB
49
+ - 🎥 **Media** - Videos (MP4, AVI, MOV) with automatic transcription
50
+ - 🎵 **Audio** - MP3, WAV, M4A with speech-to-text conversion
51
+ - 🌐 **Web** - Any URL with intelligent content extraction
52
+ - 🖼️ **Images** - JPG, PNG, TIFF with OCR text recognition
53
+ - 📦 **Archives** - ZIP, TAR, GZ with content analysis
47
54
 
48
- The primary goal of Content Core is to simplify the process of ingesting content from diverse origins. Whether you have raw text, a URL pointing to an article, or a local file like a video or markdown document, Content Core aims to extract the meaningful content for further use.
55
+ **Process with AI:**
56
+ - ✨ **Clean & format** extracted content automatically
57
+ - 📝 **Generate summaries** with customizable styles (bullet points, executive summary, etc.)
58
+ - 🎯 **Context-aware processing** - explain to a child, technical summary, action items
59
+ - 🔄 **Smart engine selection** - automatically chooses the best extraction method
49
60
 
50
- ## Key Features
61
+ ## 🛠️ Multiple Ways to Use
51
62
 
52
- * **Multi-Source Extraction:** Handles content from:
53
- * Direct text strings.
54
- * Web URLs (using robust extraction methods).
55
- * Local files (including automatic transcription for video/audio files and parsing for text-based formats).
56
- * **Intelligent Processing:** Applies appropriate extraction techniques based on the source type. See the [Processors Documentation](./docs/processors.md) for detailed information on how different content types are handled.
57
- * **Smart Engine Selection:** By default, Content Core uses the `'auto'` engine, which:
58
- * For URLs: Uses Firecrawl if `FIRECRAWL_API_KEY` is set, else tries Jina. Jina might fail because of rate limits, which can be fixed by adding `JINA_API_KEY`. If Jina failes, BeautifulSoup is used as a fallback.
59
- * For files: Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
60
- * You can override this by specifying an engine, but `'auto'` is recommended for most users.
61
- * **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
62
- * **MCP Server:** Includes a Model Context Protocol (MCP) server for seamless integration with Claude Desktop and other MCP-compatible applications.
63
- * **Asynchronous:** Built with `asyncio` for efficient I/O operations.
63
+ ### 🖥️ Command Line (Zero Install)
64
+ ```bash
65
+ # Extract content from any source
66
+ uvx --from "content-core" ccore https://example.com
67
+ uvx --from "content-core" ccore document.pdf
68
+
69
+ # Generate AI summaries
70
+ uvx --from "content-core" csum video.mp4 --context "bullet points"
71
+ ```
72
+
73
+ ### 🤖 Claude Desktop Integration
74
+ One-click setup with Model Context Protocol (MCP) - extract content directly in Claude conversations.
75
+
76
+ ### 🔍 Raycast Extension
77
+ Smart auto-detection commands:
78
+ - **Extract Content** - Full interface with format options
79
+ - **Summarize Content** - 9 summary styles available
80
+ - **Quick Extract** - Instant clipboard extraction
81
+
82
+ ### 🖱️ macOS Right-Click Integration
83
+ Right-click any file in Finder → Services → Extract or Summarize content instantly.
84
+
85
+ ### 🐍 Python Library
86
+ ```python
87
+ import content_core as cc
88
+
89
+ # Extract from any source
90
+ result = await cc.extract("https://example.com/article")
91
+ summary = await cc.summarize_content(result, context="explain to a child")
92
+ ```
93
+
94
+ ## ⚡ Key Features
95
+
96
+ * **🎯 Intelligent Auto-Detection:** Automatically selects the best extraction method based on content type and available services
97
+ * **🔧 Smart Engine Selection:**
98
+ * **URLs:** Firecrawl → Jina → BeautifulSoup fallback chain
99
+ * **Documents:** Docling → Enhanced PyMuPDF → Simple extraction fallback
100
+ * **Media:** OpenAI Whisper transcription
101
+ * **Images:** OCR with multiple engine support
102
+ * **📊 Enhanced PDF Processing:** Advanced PyMuPDF engine with quality flags, table detection, and optional OCR for mathematical formulas
103
+ * **🌍 Multiple Integrations:** CLI, Python library, MCP server, Raycast extension, macOS Services
104
+ * **⚡ Zero-Install Options:** Use `uvx` for instant access without installation
105
+ * **🧠 AI-Powered Processing:** LLM integration for content cleaning and summarization
106
+ * **🔄 Asynchronous:** Built with `asyncio` for efficient processing
64
107
 
65
108
  ## Getting Started
66
109
 
@@ -92,6 +135,18 @@ uv sync
92
135
  Content Core provides three CLI commands for extracting, cleaning, and summarizing content:
93
136
  ccore, cclean, and csum. These commands support input from text, URLs, files, or piped data (e.g., via cat file | command).
94
137
 
138
+ **Zero-install usage with uvx:**
139
+ ```bash
140
+ # Extract content
141
+ uvx --from "content-core" ccore https://example.com
142
+
143
+ # Clean content
144
+ uvx --from "content-core" cclean "messy content"
145
+
146
+ # Summarize content
147
+ uvx --from "content-core" csum "long text" --context "bullet points"
148
+ ```
149
+
95
150
  #### ccore - Extract Content
96
151
 
97
152
  Extracts content from text, URLs, or files, with optional formatting.
@@ -232,6 +287,136 @@ Add to your `claude_desktop_config.json`:
232
287
 
233
288
  For detailed setup instructions, configuration options, and usage examples, see our [MCP Documentation](docs/mcp.md).
234
289
 
290
+ ## Enhanced PDF Processing
291
+
292
+ Content Core features an optimized PyMuPDF extraction engine with significant improvements for scientific documents and complex PDFs.
293
+
294
+ ### Key Improvements
295
+
296
+ - **🔬 Mathematical Formula Extraction**: Enhanced quality flags eliminate `<!-- formula-not-decoded -->` placeholders
297
+ - **📊 Automatic Table Detection**: Tables converted to markdown format for LLM consumption
298
+ - **🔧 Quality Text Rendering**: Better ligature, whitespace, and image-text integration
299
+ - **⚡ Optional OCR Enhancement**: Selective OCR for formula-heavy pages (requires Tesseract)
300
+
301
+ ### Configuration for Scientific Documents
302
+
303
+ For documents with heavy mathematical content, enable OCR enhancement:
304
+
305
+ ```yaml
306
+ # In cc_config.yaml
307
+ extraction:
308
+ pymupdf:
309
+ enable_formula_ocr: true # Enable OCR for formula-heavy pages
310
+ formula_threshold: 3 # Min formulas per page to trigger OCR
311
+ ocr_fallback: true # Graceful fallback if OCR fails
312
+ ```
313
+
314
+ ```python
315
+ # Runtime configuration
316
+ from content_core.config import set_pymupdf_ocr_enabled
317
+ set_pymupdf_ocr_enabled(True)
318
+ ```
319
+
320
+ ### Requirements for OCR Enhancement
321
+
322
+ ```bash
323
+ # Install Tesseract OCR (optional, for formula enhancement)
324
+ # macOS
325
+ brew install tesseract
326
+
327
+ # Ubuntu/Debian
328
+ sudo apt-get install tesseract-ocr
329
+ ```
330
+
331
+ **Note**: OCR is optional - you get improved PDF extraction automatically without any additional setup.
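The table detection emits GitHub-style markdown tables inline with the page text. A small sketch calling the new internal helpers from this release directly (the table data below is made up for illustration):

```python
from content_core.processors.pdf import (
    convert_table_to_markdown,
    count_formula_placeholders,
)

# Hypothetical table data, shaped like PyMuPDF's table.extract() output.
table = [["Name", "Score"], ["Alice", "10"], ["Bob", "7"]]
print(convert_table_to_markdown(table))
# | Name | Score |
# | --- | --- |
# | Alice | 10 |
# | Bob | 7 |

# Formula placeholders left by standard extraction are counted the same way
# the processor does when deciding whether to attempt OCR.
count_formula_placeholders("x = y <!-- formula-not-decoded --> z")  # -> 1
```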
332
+
333
+ ## macOS Services Integration
334
+
335
+ Content Core provides powerful right-click integration with macOS Finder, allowing you to extract and summarize content from any file without installation. Choose between clipboard or TextEdit output for maximum flexibility.
336
+
337
+ ### Available Services
338
+
339
+ Create **4 convenient services** for different workflows:
340
+
341
+ - **Extract Content → Clipboard** - Quick copy for immediate pasting
342
+ - **Extract Content → TextEdit** - Review before using
343
+ - **Summarize Content → Clipboard** - Quick summary copying
344
+ - **Summarize Content → TextEdit** - Formatted summary with headers
345
+
346
+ ### Quick Setup
347
+
348
+ 1. **Install uv** (if not already installed):
349
+ ```bash
350
+ curl -LsSf https://astral.sh/uv/install.sh | sh
351
+ ```
352
+
353
+ 2. **Create services manually** using Automator (5 minutes setup)
354
+
355
+ ### Usage
356
+
357
+ **Right-click any supported file** in Finder → **Services** → Choose your option:
358
+
359
+ - **PDFs, Word docs** - Instant text extraction
360
+ - **Videos, audio files** - Automatic transcription
361
+ - **Images** - OCR text recognition
362
+ - **Web content** - Clean text extraction
363
+ - **Multiple files** - Batch processing support
364
+
365
+ ### Features
366
+
367
+ - **Zero-install processing**: Uses `uvx` for isolated execution
368
+ - **Multiple output options**: Clipboard or TextEdit display
369
+ - **System notifications**: Visual feedback on completion
370
+ - **Wide format support**: 20+ file types supported
371
+ - **Batch processing**: Handle multiple files at once
372
+ - **Keyboard shortcuts**: Assignable hotkeys for power users
373
+
374
+ For complete setup instructions with copy-paste scripts, see [macOS Services Documentation](docs/macos.md).
375
+
376
+ ## Raycast Extension
377
+
378
+ Content Core provides a powerful Raycast extension with smart auto-detection that handles both URLs and file paths seamlessly. Extract and summarize content directly from your Raycast interface without switching applications.
379
+
380
+ ### Quick Setup
381
+
382
+ **From Raycast Store** (coming soon):
383
+ 1. Open Raycast and search for "Content Core"
384
+ 2. Install the extension by `luis_novo`
385
+ 3. Configure API keys in preferences
386
+
387
+ **Manual Installation**:
388
+ 1. Download the extension from the repository
389
+ 2. Open Raycast → "Import Extension"
390
+ 3. Select the `raycast-content-core` folder
391
+
392
+ ### Commands
393
+
394
+ **🔍 Extract Content** - Smart URL/file detection with full interface
395
+ - Auto-detects URLs vs file paths in real-time
396
+ - Multiple output formats (Text, JSON, XML)
397
+ - Drag & drop support for files
398
+ - Rich results view with metadata
399
+
400
+ **📝 Summarize Content** - AI-powered summaries with customizable styles
401
+ - 9 different summary styles (bullet points, executive summary, etc.)
402
+ - Auto-detects source type with visual feedback
403
+ - One-click snippet creation and quicklinks
404
+
405
+ **⚡ Quick Extract** - Instant extraction to clipboard
406
+ - Type → Tab → Paste source → Enter
407
+ - No UI, works directly from command bar
408
+ - Perfect for quick workflows
409
+
410
+ ### Features
411
+
412
+ - **Smart Auto-Detection**: Instantly recognizes URLs vs file paths
413
+ - **Zero Installation**: Uses `uvx` for Content Core execution
414
+ - **Rich Integration**: Keyboard shortcuts, clipboard actions, Raycast snippets
415
+ - **All File Types**: Documents, videos, audio, images, archives
416
+ - **Visual Feedback**: Real-time type detection with icons
417
+
418
+ For detailed setup, configuration, and usage examples, see [Raycast Extension Documentation](docs/raycast.md).
419
+
235
420
  ## Using with Langchain
236
421
 
237
422
  For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
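  For illustration only, here is a minimal sketch of wrapping the extraction API as a custom Langchain tool yourself; the bundled tools under `src/content_core/tools` remain the supported route, and the tool name below is hypothetical:

```python
from langchain_core.tools import tool

import content_core as cc


@tool
async def extract_url(url: str) -> str:
    """Extract clean text content from a web page using Content Core."""
    result = await cc.extract_content({"url": url})
    return result.content or ""
```

An agent can then invoke this like any other async Langchain tool.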
content_core-1.1.0.dist-info/RECORD → content_core-1.2.0.dist-info/RECORD
@@ -1,6 +1,6 @@
1
1
  content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
2
- content_core/cc_config.yaml,sha256=gGSPM-oO6GIHyCfDCH-cN72BgPJiRmZMgwPrrLhUmfU,851
3
- content_core/config.py,sha256=vyx0fioR6r0mcZfVdwAFDhFrRNoG0ZNG8RNxIDnhNlo,1802
2
+ content_core/cc_config.yaml,sha256=hjTt5z1Z9b5LShVIqNT3OiAnTAdmr0LB5y8RTyH-fNA,1119
3
+ content_core/config.py,sha256=OBwI58W4Twr00UiYD2mdw_rZDcuXxjBanE0IoA8ox-M,2601
4
4
  content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
5
5
  content_core/models.py,sha256=Kt6tWdAX87eQ2tL6eTwcHU7_NIRnN4exP4RzV2WrMig,881
6
6
  content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
@@ -20,12 +20,13 @@ content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjloz
20
20
  content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
21
21
  content_core/content/summary/core.py,sha256=kEabpETljzUb-yf0NcVWTOuCtayESo74gGBVDX7YTFs,550
22
22
  content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
23
- content_core/mcp/server.py,sha256=m2A63Qle3nJ_Lw46uWkwVvYERtEw84hd7NHAn1rwdAQ,6968
23
+ content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,7021
24
24
  content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
25
+ content_core/notebooks/urls.ipynb,sha256=gSmiSzmbol_Li36w8tpUsy5QgRbrnBx94Ry2zHwMvwY,7107
25
26
  content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
26
27
  content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMguabM9Pc,2151
27
28
  content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
28
- content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
29
+ content_core/processors/pdf.py,sha256=TTDhfV2INtXumFDjLJFNMRfpbJ_tqwIcSBDzuThKxJI,10617
29
30
  content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
30
31
  content_core/processors/url.py,sha256=6WT8Sw2VHiKyhgWXi_jZjKjwnT_QPSPcH4P99RKbjgU,7521
31
32
  content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
@@ -34,8 +35,8 @@ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8j
34
35
  content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
35
36
  content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
36
37
  content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
37
- content_core-1.1.0.dist-info/METADATA,sha256=9-ppXQ7o-s8BCb2lH5xBiaiYBHmOFmXFrWntHuo9G_o,13017
38
- content_core-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
39
- content_core-1.1.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
40
- content_core-1.1.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
41
- content_core-1.1.0.dist-info/RECORD,,
38
+ content_core-1.2.0.dist-info/METADATA,sha256=wAEQSfn6tTd4hQwAZY8sKeB5e7QpHm6qeTz2akFZwWw,18881
39
+ content_core-1.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
40
+ content_core-1.2.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
41
+ content_core-1.2.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
42
+ content_core-1.2.0.dist-info/RECORD,,