content-core 1.1.2__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

content_core/cc_config.yaml CHANGED
@@ -34,6 +34,10 @@ extraction:
   url_engine: auto # auto | simple | firecrawl | jina | docling - for URLs
   docling:
     output_format: markdown # markdown | html | json
+  pymupdf:
+    enable_formula_ocr: false # Enable OCR for formula-heavy pages (requires Tesseract)
+    formula_threshold: 3 # Minimum formulas per page to trigger OCR
+    ocr_fallback: true # Gracefully fall back to standard extraction if OCR fails
 
   youtube_transcripts:
     preferred_languages: ["en", "es", "pt"]
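The new `pymupdf` block is read at extraction time through the package's shared `CONFIG` mapping; a minimal sketch of that lookup, using the key names from this hunk and the defaults the new `processors/pdf.py` code (below) falls back to:

```python
from content_core.config import CONFIG

# Read the new pymupdf section; the fallback values mirror
# DEFAULT_FORMULA_THRESHOLD and DEFAULT_OCR_FALLBACK in processors/pdf.py.
pymupdf_cfg = CONFIG.get("extraction", {}).get("pymupdf", {})
enable_ocr = pymupdf_cfg.get("enable_formula_ocr", False)
formula_threshold = pymupdf_cfg.get("formula_threshold", 3)
ocr_fallback = pymupdf_cfg.get("ocr_fallback", True)
```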
content_core/config.py CHANGED
@@ -14,8 +14,8 @@ def load_config():
         with open(config_path, "r") as file:
             return yaml.safe_load(file)
     except Exception as e:
-        print(f"Erro ao carregar o arquivo de configuração de {config_path}: {e}")
-        print("Usando configurações padrão internas.")
+        print(f"Error loading configuration file from {config_path}: {e}")
+        print("Using internal default settings.")
 
     default_config_data = pkgutil.get_data("content_core", "models_config.yaml")
     if default_config_data:
@@ -47,3 +47,21 @@ def set_docling_output_format(fmt: str):
     extraction = CONFIG.setdefault("extraction", {})
     docling_cfg = extraction.setdefault("docling", {})
     docling_cfg["output_format"] = fmt
+
+def set_pymupdf_ocr_enabled(enabled: bool):
+    """Enable or disable PyMuPDF OCR for formula-heavy pages."""
+    extraction = CONFIG.setdefault("extraction", {})
+    pymupdf_cfg = extraction.setdefault("pymupdf", {})
+    pymupdf_cfg["enable_formula_ocr"] = enabled
+
+def set_pymupdf_formula_threshold(threshold: int):
+    """Set the minimum number of formulas per page to trigger OCR."""
+    extraction = CONFIG.setdefault("extraction", {})
+    pymupdf_cfg = extraction.setdefault("pymupdf", {})
+    pymupdf_cfg["formula_threshold"] = threshold
+
+def set_pymupdf_ocr_fallback(enabled: bool):
+    """Enable or disable fallback to standard extraction when OCR fails."""
+    extraction = CONFIG.setdefault("extraction", {})
+    pymupdf_cfg = extraction.setdefault("pymupdf", {})
+    pymupdf_cfg["ocr_fallback"] = enabled
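Together these setters mirror the three new YAML keys; a minimal runtime sketch (the values shown are illustrative, not the release defaults):

```python
from content_core.config import (
    set_pymupdf_ocr_enabled,
    set_pymupdf_formula_threshold,
    set_pymupdf_ocr_fallback,
)

# Opt in to OCR, require at least 5 formula placeholders on a page
# before OCR runs, and keep the graceful fallback behavior.
set_pymupdf_ocr_enabled(True)
set_pymupdf_formula_threshold(5)
set_pymupdf_ocr_fallback(True)
```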
content_core/notebooks/urls.ipynb ADDED
@@ -0,0 +1,154 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "873a872b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from content_core.content.extraction import extract_content\n",
+    "\n",
+    "async def process_url(url):\n",
+    "    print(\"Processing: \", url)\n",
+    "    print(\"Simple: -------\")\n",
+    "    result = await extract_content(dict(url=url, engine=\"simple\"))\n",
+    "    print(result.title[:100])\n",
+    "    print(result.content[:100])\n",
+    "    print(\"Jina: -------\")\n",
+    "    result = await extract_content(dict(url=url, engine=\"jina\"))\n",
+    "    print(result.title[:100])\n",
+    "    print(result.content[:100])\n",
+    "    print(\"Firecrawl: -------\")\n",
+    "    result = await extract_content(dict(url=url, engine=\"firecrawl\"))\n",
+    "    print(result.title[:100])\n",
+    "    print(result.content[:100])\n",
+    "    print(\"=============================\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "263dc3af",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Processing: https://www.supernovalabs.com.br/\n",
+      "Simple: -------\n",
+      "Readability failed: No content extracted by readability\n",
+      "Supernova Labs | AI Consulting\n",
+      "Supernova Labs | AI Consulting\n",
+      "Jina: -------\n",
+      "Supernova Labs | Elite AI Consulting to help you build the Future\n",
+      "URL Source: https://www.supernovalabs.com.br/\n",
+      "\n",
+      "Markdown Content:\n",
+      "Supernova Labs\n",
+      "\n",
+      "[About](https://www\n",
+      "Firecrawl: -------\n",
+      "Supernova Labs | AI Consulting\n",
+      "# Unleash Your AI Edge. Fast.\n",
+      "\n",
+      "We turn your data, tech and capabilities into impact with lean AI sol\n",
+      "=============================\n",
+      "None\n",
+      "Processing: https://building.nubank.com/fine-tuning-transaction-user-models/\n",
+      "Simple: -------\n",
+      "Fine-Tuning Transaction User Models - Building Nubank\n",
+      "Fine-Tuning Transaction User Models Learn how we combine transaction embeddings with tabular data us\n",
+      "Jina: -------\n",
+      "Fine-Tuning Transaction User Models - Building Nubank\n",
+      "URL Source: https://building.nubank.com/fine-tuning-transaction-user-models/\n",
+      "\n",
+      "Published Time: 2025-0\n",
+      "Firecrawl: -------\n",
+      "Fine-Tuning Transaction User Models - Building Nubank\n",
+      "# Fine-Tuning Transaction User Models\n",
+      "\n",
+      "Learn how we combine transaction embeddings with tabular data\n",
+      "=============================\n",
+      "None\n",
+      "Processing: https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quitting-your-job-5296bbcb703b\n",
+      "Simple: -------\n",
+      "You Can Make Money With AI Without Quitting Your Job | by Nipuna Maduranga | LearnAIforproft.com | M\n",
+      "Most people think they need to quit their job to build a new life. I thought that too. You scroll th\n",
+      "Jina: -------\n",
+      "You Can Make Money With AI Without Quitting Your Job\n",
+      "URL Source: https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quittin\n",
+      "Firecrawl: -------\n",
+      "You Can Make Money With AI Without Quitting Your Job | by Nipuna Maduranga | LearnAIforproft.com | M\n",
+      "[Sitemap](https://medium.com/sitemap/sitemap.xml)\n",
+      "\n",
+      "[Open in app](https://rsci.app.link/?%24canonical\n",
+      "=============================\n",
+      "None\n",
+      "Processing: https://github.com/mirkonasato/pyodconverter\n",
+      "Simple: -------\n",
+      "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
+      "This repository was archived by the owner on Dec 1, 2023. It is now read-only. mirkonasato/pyodconve\n",
+      "Jina: -------\n",
+      "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
+      "URL Source: https://github.com/mirkonasato/pyodconverter\n",
+      "\n",
+      "Markdown Content:\n",
+      "GitHub - mirkonasato/pyo\n",
+      "Firecrawl: -------\n",
+      "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
+      "[Skip to content](https://github.com/mirkonasato/pyodconverter#start-of-content)\n",
+      "\n",
+      "You signed in with\n",
+      "=============================\n",
+      "None\n",
+      "Processing: https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR\n",
+      "Simple: -------\n",
+      "Error processing URL https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR: HTTP error: 500\n",
+      "Error\n",
+      "Failed to extract content: HTTP error: 500\n",
+      "Jina: -------\n",
+      "Ultra-aprendizado: domine habilidades valiosas, seja mais esperto que a competição e dê um impulso n\n",
+      "URL Source: https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-ca\n",
+      "Firecrawl: -------\n",
+      "Amazon.com.br\n",
+      "#### Digite os caracteres que você vê abaixo\n",
+      "\n",
+      "Desculpe pelo inconveniente. Para continuar realizando\n",
+      "=============================\n",
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "urls = [\"https://www.supernovalabs.com.br/\", \"https://building.nubank.com/fine-tuning-transaction-user-models/\", \"https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quitting-your-job-5296bbcb703b\", \"https://github.com/mirkonasato/pyodconverter\", \"https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR\"]\n",
+    "for url in urls:\n",
+    "    result = await process_url(url=url)\n",
+    "    print(result)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
content_core/processors/pdf.py CHANGED
@@ -5,20 +5,90 @@ import unicodedata
 import fitz  # type: ignore
 
 from content_core.common import ProcessSourceState
+from content_core.config import CONFIG
 from content_core.logging import logger
 
-# todo: find tables - https://pymupdf.readthedocs.io/en/latest/the-basics.html#extracting-tables-from-a-page
-# todo: what else can we do to make the text more readable?
-# todo: try to fix encoding for some PDF that is still breaking
-# def _extract_text_from_pdf(pdf_path):
-#     doc = fitz.open(pdf_path)
-#     text = ""
-#     logger.debug(f"Found {len(doc)} pages in PDF")
-#     for page in doc:
-#         # Use encode/decode if you need to clean up any encoding issues
-#         text += page.get_text().encode('utf-8').decode('utf-8')
-#     doc.close()
-#     return text
+def count_formula_placeholders(text):
+    """
+    Count the number of formula placeholders in extracted text.
+
+    Args:
+        text (str): Extracted text content
+    Returns:
+        int: Number of formula placeholders found
+    """
+    if not text:
+        return 0
+    return text.count('<!-- formula-not-decoded -->')
+
+
+def extract_page_with_ocr(page, page_num):
+    """
+    Extract text from a page using OCR (Tesseract).
+
+    Args:
+        page: PyMuPDF page object
+        page_num (int): Page number for logging
+    Returns:
+        str: OCR-extracted text or None if OCR fails
+    """
+    try:
+        logger.debug(f"Attempting OCR extraction for page {page_num}")
+        # Create TextPage using OCR
+        textpage = page.get_textpage_ocr()
+        if textpage:
+            # Extract text from the OCR TextPage
+            ocr_text = textpage.extractText()
+            logger.debug(f"OCR successful for page {page_num}, extracted {len(ocr_text)} characters")
+            return ocr_text
+        else:
+            logger.warning(f"OCR TextPage creation failed for page {page_num}")
+            return None
+    except (ImportError, RuntimeError, OSError) as e:
+        # Common errors: Tesseract not installed, OCR failure, file access issues
+        logger.debug(f"OCR extraction failed for page {page_num}: {e}")
+        return None
+    except Exception as e:
+        # Unexpected errors - log as warning for debugging
+        logger.warning(f"Unexpected error during OCR extraction for page {page_num}: {e}")
+        return None
+
+
+def convert_table_to_markdown(table):
+    """
+    Convert a PyMuPDF table to markdown format.
+
+    Args:
+        table: Table data from PyMuPDF (list of lists)
+    Returns:
+        str: Markdown-formatted table
+    """
+    if not table or not table[0]:
+        return ""
+
+    # Build markdown table
+    markdown_lines = []
+
+    # Header row
+    header = table[0]
+    header_row = "| " + " | ".join(str(cell) if cell else "" for cell in header) + " |"
+    markdown_lines.append(header_row)
+
+    # Separator row
+    separator = "|" + "|".join([" --- " for _ in header]) + "|"
+    markdown_lines.append(separator)
+
+    # Data rows
+    for row in table[1:]:
+        if row:  # Skip empty rows
+            row_text = "| " + " | ".join(str(cell) if cell else "" for cell in row) + " |"
+            markdown_lines.append(row_text)
+
+    return "\n".join(markdown_lines) + "\n"
+
+# Configuration constants
+DEFAULT_FORMULA_THRESHOLD = 3
+DEFAULT_OCR_FALLBACK = True
 
 SUPPORTED_FITZ_TYPES = [
     "application/pdf",
@@ -116,30 +186,84 @@ def clean_pdf_text(text):
     return text.strip()
 
 
-async def _extract_text_from_pdf(pdf_path):
-    doc = fitz.open(pdf_path)
-    try:
-        text = ""
-        logger.debug(f"Found {len(doc)} pages in PDF")
-        for page in doc:
-            text += page.get_text()
-        normalized_text = clean_pdf_text(text)
-        return normalized_text
-    finally:
-        doc.close()
 
 
 async def _extract_text_from_pdf(pdf_path):
-    """Extract text from PDF asynchronously"""
+    """Extract text from PDF asynchronously with table detection"""
 
     def _extract():
        doc = fitz.open(pdf_path)
        try:
-            text = ""
+            full_text = []
            logger.debug(f"Found {len(doc)} pages in PDF")
-            for page in doc:
-                text += page.get_text()
-            return clean_pdf_text(text)
+
+            # Use quality improvement flags for better text extraction
+            extraction_flags = (
+                fitz.TEXT_PRESERVE_LIGATURES |  # Better character rendering
+                fitz.TEXT_PRESERVE_WHITESPACE |  # Better spacing preservation
+                fitz.TEXT_PRESERVE_IMAGES  # Better image-text integration
+            )
+
+            # Get OCR configuration
+            ocr_config = CONFIG.get('extraction', {}).get('pymupdf', {})
+            enable_ocr = ocr_config.get('enable_formula_ocr', False)
+            formula_threshold = ocr_config.get('formula_threshold', DEFAULT_FORMULA_THRESHOLD)
+            ocr_fallback = ocr_config.get('ocr_fallback', DEFAULT_OCR_FALLBACK)
+
+            for page_num, page in enumerate(doc):
+                # Extract regular text with quality flags
+                standard_text = page.get_text(flags=extraction_flags)
+
+                # Check if we should try OCR for this page
+                formula_count = count_formula_placeholders(standard_text)
+                use_ocr = (enable_ocr and
+                           formula_count >= formula_threshold and
+                           formula_count > 0)
+
+                if use_ocr:
+                    logger.debug(f"Page {page_num + 1} has {formula_count} formulas, attempting OCR")
+                    ocr_text = extract_page_with_ocr(page, page_num + 1)
+
+                    if ocr_text and ocr_fallback:
+                        # Use OCR text but preserve table extraction from standard text
+                        page_text = ocr_text
+                        logger.debug(f"Using OCR text for page {page_num + 1}")
+                    else:
+                        # OCR failed, use standard text
+                        page_text = standard_text
+                        if not ocr_text:
+                            logger.debug(f"OCR failed for page {page_num + 1}, using standard extraction")
+                else:
+                    page_text = standard_text
+
+                # Try to find and extract tables (regardless of OCR)
+                try:
+                    tables = page.find_tables()
+                    if tables:
+                        logger.debug(f"Found {len(tables)} table(s) on page {page_num + 1}")
+
+                        # For each table found, convert to markdown and append
+                        for table_num, table in enumerate(tables):
+                            # Extract table data
+                            table_data = table.extract()
+                            # Validate table has actual content (not just empty rows/cells)
+                            if table_data and len(table_data) > 0 and any(
+                                any(str(cell).strip() for cell in row if cell) for row in table_data if row
+                            ):
+                                # Add a marker before the table
+                                page_text += f"\n\n[Table {table_num + 1} from page {page_num + 1}]\n"
+                                # Convert to markdown
+                                markdown_table = convert_table_to_markdown(table_data)
+                                page_text += markdown_table + "\n"
+                except Exception as e:
+                    # If table extraction fails, continue with regular text
+                    logger.debug(f"Table extraction failed on page {page_num + 1}: {e}")
+
+                full_text.append(page_text)
+
+            # Join all pages and clean
+            combined_text = "".join(full_text)
+            return clean_pdf_text(combined_text)
        finally:
            doc.close()
 
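The OCR gate above is driven entirely by counting placeholder markers in the standard extraction; a small self-contained sketch of that decision, using the marker string and default threshold from this file:

```python
# Three formula placeholders on a page meet the default threshold of 3,
# so OCR would be attempted when enable_formula_ocr is true.
page_text = "E = mc^2\n" + "<!-- formula-not-decoded -->\n" * 3

count = page_text.count('<!-- formula-not-decoded -->')  # what count_formula_placeholders does
should_ocr = count >= 3  # DEFAULT_FORMULA_THRESHOLD
print(count, should_ocr)  # 3 True
```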
content_core-1.2.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 1.1.2
+Version: 1.2.0
 Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
@@ -10,7 +10,6 @@ Requires-Dist: aiohttp>=3.11
 Requires-Dist: asciidoc>=10.2.1
 Requires-Dist: bs4>=0.0.2
 Requires-Dist: dicttoxml>=1.7.16
-Requires-Dist: docling>=2.34.0
 Requires-Dist: esperanto>=1.2.0
 Requires-Dist: firecrawl-py>=2.7.0
 Requires-Dist: jinja2>=3.1.6
@@ -31,6 +30,8 @@ Requires-Dist: pytubefix>=9.1.1
 Requires-Dist: readability-lxml>=0.8.4.1
 Requires-Dist: validators>=0.34.0
 Requires-Dist: youtube-transcript-api>=1.0.3
+Provides-Extra: docling
+Requires-Dist: docling>=2.34.0; extra == 'docling'
 Provides-Extra: mcp
 Requires-Dist: fastmcp>=0.5.0; extra == 'mcp'
 Description-Content-Type: text/markdown
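With docling demoted from a hard dependency to an optional extra, installs that relied on it implicitly would presumably need to request it explicitly, e.g. `pip install "content-core[docling]"` (standard extras syntax; the exact command is not part of this diff).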
@@ -39,29 +40,70 @@ Description-Content-Type: text/markdown
 
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 
-**Content Core** is a versatile Python library designed to extract and process content from various sources, providing a unified interface for handling text, web pages, and local files.
+**Content Core** is a powerful, AI-powered content extraction and processing platform that transforms any source into clean, structured content. Extract text from websites, transcribe videos, process documents, and generate AI summaries—all through a unified interface with multiple integration options.
 
-## Overview
+## 🚀 What You Can Do
 
-> **Note:** As of v0.8, the default extraction engine is `'auto'`. Content Core will automatically select the best extraction method based on your environment and available API keys, with a smart fallback order for both URLs and files. For files/documents, `'auto'` now tries Docling first, then falls back to simple extraction. You can override the engine if needed, but `'auto'` is recommended for most users.
+**Extract content from anywhere:**
+- 📄 **Documents** - PDF, Word, PowerPoint, Excel, Markdown, HTML, EPUB
+- 🎥 **Media** - Videos (MP4, AVI, MOV) with automatic transcription
+- 🎵 **Audio** - MP3, WAV, M4A with speech-to-text conversion
+- 🌐 **Web** - Any URL with intelligent content extraction
+- 🖼️ **Images** - JPG, PNG, TIFF with OCR text recognition
+- 📦 **Archives** - ZIP, TAR, GZ with content analysis
 
-The primary goal of Content Core is to simplify the process of ingesting content from diverse origins. Whether you have raw text, a URL pointing to an article, or a local file like a video or markdown document, Content Core aims to extract the meaningful content for further use.
+**Process with AI:**
+- ✨ **Clean & format** extracted content automatically
+- 📝 **Generate summaries** with customizable styles (bullet points, executive summary, etc.)
+- 🎯 **Context-aware processing** - explain to a child, technical summary, action items
+- 🔄 **Smart engine selection** - automatically chooses the best extraction method
 
-## Key Features
+## 🛠️ Multiple Ways to Use
 
-* **Multi-Source Extraction:** Handles content from:
-    * Direct text strings.
-    * Web URLs (using robust extraction methods).
-    * Local files (including automatic transcription for video/audio files and parsing for text-based formats).
-* **Intelligent Processing:** Applies appropriate extraction techniques based on the source type. See the [Processors Documentation](./docs/processors.md) for detailed information on how different content types are handled.
-* **Smart Engine Selection:** By default, Content Core uses the `'auto'` engine, which:
-    * For URLs: Uses Firecrawl if `FIRECRAWL_API_KEY` is set, else tries Jina. Jina might fail because of rate limits, which can be fixed by adding `JINA_API_KEY`. If Jina fails, BeautifulSoup is used as a fallback.
-    * For files: Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
-    * You can override this by specifying an engine, but `'auto'` is recommended for most users.
-* **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
-* **MCP Server:** Includes a Model Context Protocol (MCP) server for seamless integration with Claude Desktop and other MCP-compatible applications.
-* **macOS Services:** Right-click context menu integration for Finder (extract and summarize files directly).
-* **Asynchronous:** Built with `asyncio` for efficient I/O operations.
+### 🖥️ Command Line (Zero Install)
+```bash
+# Extract content from any source
+uvx --from "content-core" ccore https://example.com
+uvx --from "content-core" ccore document.pdf
+
+# Generate AI summaries
+uvx --from "content-core" csum video.mp4 --context "bullet points"
+```
+
+### 🤖 Claude Desktop Integration
+One-click setup with Model Context Protocol (MCP) - extract content directly in Claude conversations.
+
+### 🔍 Raycast Extension
+Smart auto-detection commands:
+- **Extract Content** - Full interface with format options
+- **Summarize Content** - 9 summary styles available
+- **Quick Extract** - Instant clipboard extraction
+
+### 🖱️ macOS Right-Click Integration
+Right-click any file in Finder → Services → Extract or Summarize content instantly.
+
+### 🐍 Python Library
+```python
+import content_core as cc
+
+# Extract from any source
+result = await cc.extract("https://example.com/article")
+summary = await cc.summarize_content(result, context="explain to a child")
+```
+
+## ⚡ Key Features
+
+* **🎯 Intelligent Auto-Detection:** Automatically selects the best extraction method based on content type and available services
+* **🔧 Smart Engine Selection:**
+  * **URLs:** Firecrawl → Jina → BeautifulSoup fallback chain
+  * **Documents:** Docling → Enhanced PyMuPDF → Simple extraction fallback
+  * **Media:** OpenAI Whisper transcription
+  * **Images:** OCR with multiple engine support
+* **📊 Enhanced PDF Processing:** Advanced PyMuPDF engine with quality flags, table detection, and optional OCR for mathematical formulas
+* **🌍 Multiple Integrations:** CLI, Python library, MCP server, Raycast extension, macOS Services
+* **⚡ Zero-Install Options:** Use `uvx` for instant access without installation
+* **🧠 AI-Powered Processing:** LLM integration for content cleaning and summarization
+* **🔄 Asynchronous:** Built with `asyncio` for efficient processing
 
 ## Getting Started
 
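The Python Library snippet in the new README uses bare `await`, which works in notebooks and async REPLs; in a plain script it would presumably be wrapped in an event loop, along these lines:

```python
import asyncio
import content_core as cc

async def main():
    # Same calls as the README snippet, driven by asyncio.run.
    result = await cc.extract("https://example.com/article")
    summary = await cc.summarize_content(result, context="explain to a child")
    print(summary)

asyncio.run(main())
```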
@@ -245,6 +287,49 @@ Add to your `claude_desktop_config.json`:
 
 For detailed setup instructions, configuration options, and usage examples, see our [MCP Documentation](docs/mcp.md).
 
+## Enhanced PDF Processing
+
+Content Core features an optimized PyMuPDF extraction engine with significant improvements for scientific documents and complex PDFs.
+
+### Key Improvements
+
+- **🔬 Mathematical Formula Extraction**: Enhanced quality flags eliminate `<!-- formula-not-decoded -->` placeholders
+- **📊 Automatic Table Detection**: Tables converted to markdown format for LLM consumption
+- **🔧 Quality Text Rendering**: Better ligature, whitespace, and image-text integration
+- **⚡ Optional OCR Enhancement**: Selective OCR for formula-heavy pages (requires Tesseract)
+
+### Configuration for Scientific Documents
+
+For documents with heavy mathematical content, enable OCR enhancement:
+
+```yaml
+# In cc_config.yaml
+extraction:
+  pymupdf:
+    enable_formula_ocr: true   # Enable OCR for formula-heavy pages
+    formula_threshold: 3       # Min formulas per page to trigger OCR
+    ocr_fallback: true         # Graceful fallback if OCR fails
+```
+
+```python
+# Runtime configuration
+from content_core.config import set_pymupdf_ocr_enabled
+set_pymupdf_ocr_enabled(True)
+```
+
+### Requirements for OCR Enhancement
+
+```bash
+# Install Tesseract OCR (optional, for formula enhancement)
+# macOS
+brew install tesseract
+
+# Ubuntu/Debian
+sudo apt-get install tesseract-ocr
+```
+
+**Note**: OCR is optional - you get improved PDF extraction automatically without any additional setup.
+
 ## macOS Services Integration
 
 Content Core provides powerful right-click integration with macOS Finder, allowing you to extract and summarize content from any file without installation. Choose between clipboard or TextEdit output for maximum flexibility.
@@ -288,6 +373,50 @@ Create **4 convenient services** for different workflows:
 
 For complete setup instructions with copy-paste scripts, see [macOS Services Documentation](docs/macos.md).
 
+## Raycast Extension
+
+Content Core provides a powerful Raycast extension with smart auto-detection that handles both URLs and file paths seamlessly. Extract and summarize content directly from your Raycast interface without switching applications.
+
+### Quick Setup
+
+**From Raycast Store** (coming soon):
+1. Open Raycast and search for "Content Core"
+2. Install the extension by `luis_novo`
+3. Configure API keys in preferences
+
+**Manual Installation**:
+1. Download the extension from the repository
+2. Open Raycast → "Import Extension"
+3. Select the `raycast-content-core` folder
+
+### Commands
+
+**🔍 Extract Content** - Smart URL/file detection with full interface
+- Auto-detects URLs vs file paths in real-time
+- Multiple output formats (Text, JSON, XML)
+- Drag & drop support for files
+- Rich results view with metadata
+
+**📝 Summarize Content** - AI-powered summaries with customizable styles
+- 9 different summary styles (bullet points, executive summary, etc.)
+- Auto-detects source type with visual feedback
+- One-click snippet creation and quicklinks
+
+**⚡ Quick Extract** - Instant extraction to clipboard
+- Type → Tab → Paste source → Enter
+- No UI, works directly from command bar
+- Perfect for quick workflows
+
+### Features
+
+- **Smart Auto-Detection**: Instantly recognizes URLs vs file paths
+- **Zero Installation**: Uses `uvx` for Content Core execution
+- **Rich Integration**: Keyboard shortcuts, clipboard actions, Raycast snippets
+- **All File Types**: Documents, videos, audio, images, archives
+- **Visual Feedback**: Real-time type detection with icons
+
+For detailed setup, configuration, and usage examples, see [Raycast Extension Documentation](docs/raycast.md).
+
 ## Using with Langchain
 
 For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
content_core-1.2.0.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
 content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
-content_core/cc_config.yaml,sha256=gGSPM-oO6GIHyCfDCH-cN72BgPJiRmZMgwPrrLhUmfU,851
-content_core/config.py,sha256=vyx0fioR6r0mcZfVdwAFDhFrRNoG0ZNG8RNxIDnhNlo,1802
+content_core/cc_config.yaml,sha256=hjTt5z1Z9b5LShVIqNT3OiAnTAdmr0LB5y8RTyH-fNA,1119
+content_core/config.py,sha256=OBwI58W4Twr00UiYD2mdw_rZDcuXxjBanE0IoA8ox-M,2601
 content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
 content_core/models.py,sha256=Kt6tWdAX87eQ2tL6eTwcHU7_NIRnN4exP4RzV2WrMig,881
 content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
@@ -22,10 +22,11 @@ content_core/content/summary/core.py,sha256=kEabpETljzUb-yf0NcVWTOuCtayESo74gGBV
 content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
 content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,7021
 content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
+content_core/notebooks/urls.ipynb,sha256=gSmiSzmbol_Li36w8tpUsy5QgRbrnBx94Ry2zHwMvwY,7107
 content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
 content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMguabM9Pc,2151
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
-content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
+content_core/processors/pdf.py,sha256=TTDhfV2INtXumFDjLJFNMRfpbJ_tqwIcSBDzuThKxJI,10617
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
 content_core/processors/url.py,sha256=6WT8Sw2VHiKyhgWXi_jZjKjwnT_QPSPcH4P99RKbjgU,7521
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
@@ -34,8 +35,8 @@ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8j
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-1.1.2.dist-info/METADATA,sha256=_0Rg4yeU-05hDB_91dvcMXYKMaKcMcU5C8SpkYhtiRs,15072
-content_core-1.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-content_core-1.1.2.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
-content_core-1.1.2.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
-content_core-1.1.2.dist-info/RECORD,,
+content_core-1.2.0.dist-info/METADATA,sha256=wAEQSfn6tTd4hQwAZY8sKeB5e7QpHm6qeTz2akFZwWw,18881
+content_core-1.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-1.2.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
+content_core-1.2.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-1.2.0.dist-info/RECORD,,