content-core 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of content-core might be problematic.

content_core/cc_config.yaml CHANGED
@@ -34,6 +34,10 @@ extraction:
34
34
  url_engine: auto # auto | simple | firecrawl | jina | docling - for URLs
35
35
  docling:
36
36
  output_format: markdown # markdown | html | json
37
+ pymupdf:
38
+ enable_formula_ocr: false # Enable OCR for formula-heavy pages (requires Tesseract)
39
+ formula_threshold: 3 # Minimum formulas per page to trigger OCR
40
+ ocr_fallback: true # Gracefully fallback to standard extraction if OCR fails
37
41
 
38
42
  youtube_transcripts:
39
43
  preferred_languages: ["en", "es", "pt"]
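The new `pymupdf` block is read at extraction time. A minimal sketch of how these values surface in Python, mirroring the lookup the PDF processor performs later in this diff (the defaults shown are the shipped ones):

```python
from content_core.config import CONFIG

# Read the new pymupdf settings with the same defaults the processor uses.
pymupdf_cfg = CONFIG.get("extraction", {}).get("pymupdf", {})
enable_ocr = pymupdf_cfg.get("enable_formula_ocr", False)
formula_threshold = pymupdf_cfg.get("formula_threshold", 3)
ocr_fallback = pymupdf_cfg.get("ocr_fallback", True)
```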
content_core/config.py CHANGED
@@ -14,8 +14,8 @@ def load_config():
14
14
  with open(config_path, "r") as file:
15
15
  return yaml.safe_load(file)
16
16
  except Exception as e:
17
- print(f"Erro ao carregar o arquivo de configuração de {config_path}: {e}")
18
- print("Usando configurações padrão internas.")
17
+ print(f"Error loading configuration file from {config_path}: {e}")
18
+ print("Using internal default settings.")
19
19
 
20
20
  default_config_data = pkgutil.get_data("content_core", "models_config.yaml")
21
21
  if default_config_data:
@@ -47,3 +47,21 @@ def set_docling_output_format(fmt: str):
47
47
  extraction = CONFIG.setdefault("extraction", {})
48
48
  docling_cfg = extraction.setdefault("docling", {})
49
49
  docling_cfg["output_format"] = fmt
50
+
51
+ def set_pymupdf_ocr_enabled(enabled: bool):
52
+ """Enable or disable PyMuPDF OCR for formula-heavy pages."""
53
+ extraction = CONFIG.setdefault("extraction", {})
54
+ pymupdf_cfg = extraction.setdefault("pymupdf", {})
55
+ pymupdf_cfg["enable_formula_ocr"] = enabled
56
+
57
+ def set_pymupdf_formula_threshold(threshold: int):
58
+ """Set the minimum number of formulas per page to trigger OCR."""
59
+ extraction = CONFIG.setdefault("extraction", {})
60
+ pymupdf_cfg = extraction.setdefault("pymupdf", {})
61
+ pymupdf_cfg["formula_threshold"] = threshold
62
+
63
+ def set_pymupdf_ocr_fallback(enabled: bool):
64
+ """Enable or disable fallback to standard extraction when OCR fails."""
65
+ extraction = CONFIG.setdefault("extraction", {})
66
+ pymupdf_cfg = extraction.setdefault("pymupdf", {})
67
+ pymupdf_cfg["ocr_fallback"] = enabled
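Taken together, the new setters allow runtime control of the OCR behaviour. A quick usage sketch (the values below are illustrative only):

```python
from content_core.config import (
    set_pymupdf_ocr_enabled,
    set_pymupdf_formula_threshold,
    set_pymupdf_ocr_fallback,
)

set_pymupdf_ocr_enabled(True)       # turn on OCR for formula-heavy pages
set_pymupdf_formula_threshold(5)    # require at least 5 formula placeholders on a page
set_pymupdf_ocr_fallback(True)      # keep falling back to standard extraction if OCR fails
```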
content_core/mcp/server.py CHANGED
@@ -30,6 +30,7 @@ def suppress_stdout():
30
30
  finally:
31
31
  sys.stdout = original_stdout
32
32
 
33
+
33
34
  # Add parent directory to path to import content_core
34
35
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
35
36
 
@@ -38,38 +39,40 @@ import content_core as cc
38
39
  # Initialize MCP server
39
40
  mcp = FastMCP("Content Core MCP Server")
40
41
 
42
+
41
43
  async def _extract_content_impl(
42
- url: Optional[str] = None,
43
- file_path: Optional[str] = None
44
+ url: Optional[str] = None, file_path: Optional[str] = None
44
45
  ) -> Dict[str, Any]:
45
46
  """
46
- Extract content from a URL or file using Content Core's auto engine.
47
-
47
  + Extract content from a URL or file using Content Core's auto engine. This is useful for processing YouTube transcripts, website content, PDFs, EPUB, and Office files. You can also use it to extract transcripts from audio or video files.
48
+
48
49
  Args:
49
50
  url: Optional URL to extract content from
50
51
  file_path: Optional file path to extract content from
51
-
52
+
52
53
  Returns:
53
54
  JSON object containing extracted content and metadata
54
-
55
+
55
56
  Raises:
56
57
  ValueError: If neither or both url and file_path are provided
57
58
  """
58
59
  # Validate input - exactly one must be provided
59
- if (url is None and file_path is None) or (url is not None and file_path is not None):
60
+ if (url is None and file_path is None) or (
61
+ url is not None and file_path is not None
62
+ ):
60
63
  return {
61
64
  "success": False,
62
65
  "error": "Exactly one of 'url' or 'file_path' must be provided",
63
66
  "source_type": None,
64
67
  "source": None,
65
68
  "content": None,
66
- "metadata": None
69
+ "metadata": None,
67
70
  }
68
-
71
+
69
72
  # Determine source type and validate
70
73
  source_type = "url" if url else "file"
71
74
  source = url if url else file_path
72
-
75
+
73
76
  # Additional validation for file paths
74
77
  if file_path:
75
78
  path = Path(file_path)
@@ -80,9 +83,9 @@ async def _extract_content_impl(
80
83
  "source_type": source_type,
81
84
  "source": source,
82
85
  "content": None,
83
- "metadata": None
86
+ "metadata": None,
84
87
  }
85
-
88
+
86
89
  # Security check - ensure no directory traversal
87
90
  try:
88
91
  # Resolve to absolute path and ensure it's not trying to access sensitive areas
@@ -95,30 +98,30 @@ async def _extract_content_impl(
95
98
  "source_type": source_type,
96
99
  "source": source,
97
100
  "content": None,
98
- "metadata": None
101
+ "metadata": None,
99
102
  }
100
-
103
+
101
104
  # Build extraction request
102
105
  extraction_request = {}
103
106
  if url:
104
107
  extraction_request["url"] = url
105
108
  else:
106
109
  extraction_request["file_path"] = str(Path(file_path).resolve())
107
-
110
+
108
111
  # Track start time
109
112
  start_time = datetime.utcnow()
110
-
113
+
111
114
  try:
112
115
  # Use Content Core's extract_content with auto engine
113
116
  logger.info(f"Extracting content from {source_type}: {source}")
114
-
117
+
115
118
  # Suppress stdout to prevent MoviePy and other libraries from interfering with MCP protocol
116
119
  with suppress_stdout():
117
120
  result = await cc.extract_content(extraction_request)
118
-
121
+
119
122
  # Calculate extraction time
120
123
  extraction_time = (datetime.utcnow() - start_time).total_seconds()
121
-
124
+
122
125
  # Build response - result is a ProcessSourceOutput object
123
126
  response = {
124
127
  "success": True,
@@ -132,13 +135,13 @@ async def _extract_content_impl(
132
135
  "content_length": len(result.content or ""),
133
136
  "identified_type": result.identified_type or "unknown",
134
137
  "identified_provider": result.identified_provider or "",
135
- }
138
+ },
136
139
  }
137
-
140
+
138
141
  # Add metadata from the result
139
142
  if result.metadata:
140
143
  response["metadata"].update(result.metadata)
141
-
144
+
142
145
  # Add specific metadata based on source type
143
146
  if source_type == "url":
144
147
  if result.title:
@@ -152,10 +155,10 @@ async def _extract_content_impl(
152
155
  response["metadata"]["file_path"] = result.file_path
153
156
  response["metadata"]["file_size"] = Path(file_path).stat().st_size
154
157
  response["metadata"]["file_extension"] = Path(file_path).suffix
155
-
158
+
156
159
  logger.info(f"Successfully extracted content from {source_type}: {source}")
157
160
  return response
158
-
161
+
159
162
  except Exception as e:
160
163
  logger.error(f"Error extracting content from {source_type} {source}: {str(e)}")
161
164
  return {
@@ -166,26 +169,25 @@ async def _extract_content_impl(
166
169
  "content": None,
167
170
  "metadata": {
168
171
  "extraction_timestamp": start_time.isoformat() + "Z",
169
- "error_type": type(e).__name__
170
- }
172
+ "error_type": type(e).__name__,
173
+ },
171
174
  }
172
175
 
173
176
 
174
177
  @mcp.tool
175
178
  async def extract_content(
176
- url: Optional[str] = None,
177
- file_path: Optional[str] = None
179
+ url: Optional[str] = None, file_path: Optional[str] = None
178
180
  ) -> Dict[str, Any]:
179
181
  """
180
182
  Extract content from a URL or file using Content Core's auto engine.
181
-
183
+
182
184
  Args:
183
185
  url: Optional URL to extract content from
184
186
  file_path: Optional file path to extract content from
185
-
187
+
186
188
  Returns:
187
189
  JSON object containing extracted content and metadata
188
-
190
+
189
191
  Raises:
190
192
  ValueError: If neither or both url and file_path are provided
191
193
  """
@@ -197,15 +199,16 @@ def main():
197
199
  # Additional MoviePy configuration to suppress all output
198
200
  try:
199
201
  import moviepy.config as mp_config
202
+
200
203
  mp_config.check_and_download_cmd("ffmpeg") # Pre-download to avoid logs later
201
204
  except Exception:
202
205
  pass # Ignore if MoviePy isn't available or configured
203
-
206
+
204
207
  logger.info("Starting Content Core MCP Server")
205
-
208
+
206
209
  # Run with STDIO transport for MCP compatibility
207
210
  mcp.run()
208
211
 
209
212
 
210
213
  if __name__ == "__main__":
211
- main()
214
+ main()
content_core/notebooks/urls.ipynb ADDED
@@ -0,0 +1,154 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "873a872b",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from content_core.content.extraction import extract_content\n",
11
+ "\n",
12
+ "async def process_url(url):\n",
13
+ " print(\"Processing: \", url)\n",
14
+ " print(\"Simple: -------\")\n",
15
+ " result = await extract_content(dict(url=url, engine=\"simple\"))\n",
16
+ " print(result.title[:100])\n",
17
+ " print(result.content[:100])\n",
18
+ " print(\"Jina: -------\")\n",
19
+ " result = await extract_content(dict(url=url, engine=\"jina\"))\n",
20
+ " print(result.title[:100])\n",
21
+ " print(result.content[:100])\n",
22
+ " print(\"Firecrawl: -------\")\n",
23
+ " result = await extract_content(dict(url=url, engine=\"firecrawl\"))\n",
24
+ " print(result.title[:100])\n",
25
+ " print(result.content[:100])\n",
26
+ " print(\"=============================\")"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 4,
32
+ "id": "263dc3af",
33
+ "metadata": {},
34
+ "outputs": [
35
+ {
36
+ "name": "stdout",
37
+ "output_type": "stream",
38
+ "text": [
39
+ "Processing: https://www.supernovalabs.com.br/\n",
40
+ "Simple: -------\n",
41
+ "Readability failed: No content extracted by readability\n",
42
+ "Supernova Labs | AI Consulting\n",
43
+ "Supernova Labs | AI Consulting\n",
44
+ "Jina: -------\n",
45
+ "Supernova Labs | Elite AI Consulting to help you build the Future\n",
46
+ "URL Source: https://www.supernovalabs.com.br/\n",
47
+ "\n",
48
+ "Markdown Content:\n",
49
+ "Supernova Labs\n",
50
+ "\n",
51
+ "[About](https://www\n",
52
+ "Firecrawl: -------\n",
53
+ "Supernova Labs | AI Consulting\n",
54
+ "# Unleash Your AI Edge. Fast.\n",
55
+ "\n",
56
+ "We turn your data, tech and capabilities into impact with lean AI sol\n",
57
+ "=============================\n",
58
+ "None\n",
59
+ "Processing: https://building.nubank.com/fine-tuning-transaction-user-models/\n",
60
+ "Simple: -------\n",
61
+ "Fine-Tuning Transaction User Models - Building Nubank\n",
62
+ "Fine-Tuning Transaction User Models Learn how we combine transaction embeddings with tabular data us\n",
63
+ "Jina: -------\n",
64
+ "Fine-Tuning Transaction User Models - Building Nubank\n",
65
+ "URL Source: https://building.nubank.com/fine-tuning-transaction-user-models/\n",
66
+ "\n",
67
+ "Published Time: 2025-0\n",
68
+ "Firecrawl: -------\n",
69
+ "Fine-Tuning Transaction User Models - Building Nubank\n",
70
+ "# Fine-Tuning Transaction User Models\n",
71
+ "\n",
72
+ "Learn how we combine transaction embeddings with tabular data\n",
73
+ "=============================\n",
74
+ "None\n",
75
+ "Processing: https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quitting-your-job-5296bbcb703b\n",
76
+ "Simple: -------\n",
77
+ "You Can Make Money With AI Without Quitting Your Job | by Nipuna Maduranga | LearnAIforproft.com | M\n",
78
+ "Most people think they need to quit their job to build a new life. I thought that too. You scroll th\n",
79
+ "Jina: -------\n",
80
+ "You Can Make Money With AI Without Quitting Your Job\n",
81
+ "URL Source: https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quittin\n",
82
+ "Firecrawl: -------\n",
83
+ "You Can Make Money With AI Without Quitting Your Job | by Nipuna Maduranga | LearnAIforproft.com | M\n",
84
+ "[Sitemap](https://medium.com/sitemap/sitemap.xml)\n",
85
+ "\n",
86
+ "[Open in app](https://rsci.app.link/?%24canonical\n",
87
+ "=============================\n",
88
+ "None\n",
89
+ "Processing: https://github.com/mirkonasato/pyodconverter\n",
90
+ "Simple: -------\n",
91
+ "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
92
+ "This repository was archived by the owner on Dec 1, 2023. It is now read-only. mirkonasato/pyodconve\n",
93
+ "Jina: -------\n",
94
+ "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
95
+ "URL Source: https://github.com/mirkonasato/pyodconverter\n",
96
+ "\n",
97
+ "Markdown Content:\n",
98
+ "GitHub - mirkonasato/pyo\n",
99
+ "Firecrawl: -------\n",
100
+ "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
101
+ "[Skip to content](https://github.com/mirkonasato/pyodconverter#start-of-content)\n",
102
+ "\n",
103
+ "You signed in with\n",
104
+ "=============================\n",
105
+ "None\n",
106
+ "Processing: https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR\n",
107
+ "Simple: -------\n",
108
+ "Error processing URL https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR: HTTP error: 500\n",
109
+ "Error\n",
110
+ "Failed to extract content: HTTP error: 500\n",
111
+ "Jina: -------\n",
112
+ "Ultra-aprendizado: domine habilidades valiosas, seja mais esperto que a competição e dê um impulso n\n",
113
+ "URL Source: https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-ca\n",
114
+ "Firecrawl: -------\n",
115
+ "Amazon.com.br\n",
116
+ "#### Digite os caracteres que você vê abaixo\n",
117
+ "\n",
118
+ "Desculpe pelo inconveniente. Para continuar realizando\n",
119
+ "=============================\n",
120
+ "None\n"
121
+ ]
122
+ }
123
+ ],
124
+ "source": [
125
+ "\n",
126
+ "urls= [\"https://www.supernovalabs.com.br/\", \"https://building.nubank.com/fine-tuning-transaction-user-models/\", \"https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quitting-your-job-5296bbcb703b\", \"https://github.com/mirkonasato/pyodconverter\", \"https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR\"]\n",
127
+ "for url in urls:\n",
128
+ " result = await process_url(url=url)\n",
129
+ " print(result)"
130
+ ]
131
+ }
132
+ ],
133
+ "metadata": {
134
+ "kernelspec": {
135
+ "display_name": ".venv",
136
+ "language": "python",
137
+ "name": "python3"
138
+ },
139
+ "language_info": {
140
+ "codemirror_mode": {
141
+ "name": "ipython",
142
+ "version": 3
143
+ },
144
+ "file_extension": ".py",
145
+ "mimetype": "text/x-python",
146
+ "name": "python",
147
+ "nbconvert_exporter": "python",
148
+ "pygments_lexer": "ipython3",
149
+ "version": "3.10.6"
150
+ }
151
+ },
152
+ "nbformat": 4,
153
+ "nbformat_minor": 5
154
+ }
content_core/processors/pdf.py CHANGED
@@ -5,20 +5,90 @@ import unicodedata
5
5
  import fitz # type: ignore
6
6
 
7
7
  from content_core.common import ProcessSourceState
8
+ from content_core.config import CONFIG
8
9
  from content_core.logging import logger
9
10
 
10
- # todo: find tables - https://pymupdf.readthedocs.io/en/latest/the-basics.html#extracting-tables-from-a-page
11
- # todo: what else can we do to make the text more readable?
12
- # todo: try to fix encoding for some PDF that is still breaking
13
- # def _extract_text_from_pdf(pdf_path):
14
- # doc = fitz.open(pdf_path)
15
- # text = ""
16
- # logger.debug(f"Found {len(doc)} pages in PDF")
17
- # for page in doc:
18
- # # Use encode/decode if you need to clean up any encoding issues
19
- # text += page.get_text().encode('utf-8').decode('utf-8')
20
- # doc.close()
21
- # return text
11
+ def count_formula_placeholders(text):
12
+ """
13
+ Count the number of formula placeholders in extracted text.
14
+
15
+ Args:
16
+ text (str): Extracted text content
17
+ Returns:
18
+ int: Number of formula placeholders found
19
+ """
20
+ if not text:
21
+ return 0
22
+ return text.count('<!-- formula-not-decoded -->')
23
+
24
+
25
+ def extract_page_with_ocr(page, page_num):
26
+ """
27
+ Extract text from a page using OCR (Tesseract).
28
+
29
+ Args:
30
+ page: PyMuPDF page object
31
+ page_num (int): Page number for logging
32
+ Returns:
33
+ str: OCR-extracted text or None if OCR fails
34
+ """
35
+ try:
36
+ logger.debug(f"Attempting OCR extraction for page {page_num}")
37
+ # Create TextPage using OCR
38
+ textpage = page.get_textpage_ocr()
39
+ if textpage:
40
+ # Extract text from the OCR TextPage
41
+ ocr_text = textpage.extractText()
42
+ logger.debug(f"OCR successful for page {page_num}, extracted {len(ocr_text)} characters")
43
+ return ocr_text
44
+ else:
45
+ logger.warning(f"OCR TextPage creation failed for page {page_num}")
46
+ return None
47
+ except (ImportError, RuntimeError, OSError) as e:
48
+ # Common errors: Tesseract not installed, OCR failure, file access issues
49
+ logger.debug(f"OCR extraction failed for page {page_num}: {e}")
50
+ return None
51
+ except Exception as e:
52
+ # Unexpected errors - log as warning for debugging
53
+ logger.warning(f"Unexpected error during OCR extraction for page {page_num}: {e}")
54
+ return None
55
+
56
+
57
+ def convert_table_to_markdown(table):
58
+ """
59
+ Convert a PyMuPDF table to markdown format.
60
+
61
+ Args:
62
+ table: Table data from PyMuPDF (list of lists)
63
+ Returns:
64
+ str: Markdown-formatted table
65
+ """
66
+ if not table or not table[0]:
67
+ return ""
68
+
69
+ # Build markdown table
70
+ markdown_lines = []
71
+
72
+ # Header row
73
+ header = table[0]
74
+ header_row = "| " + " | ".join(str(cell) if cell else "" for cell in header) + " |"
75
+ markdown_lines.append(header_row)
76
+
77
+ # Separator row
78
+ separator = "|" + "|".join([" --- " for _ in header]) + "|"
79
+ markdown_lines.append(separator)
80
+
81
+ # Data rows
82
+ for row in table[1:]:
83
+ if row: # Skip empty rows
84
+ row_text = "| " + " | ".join(str(cell) if cell else "" for cell in row) + " |"
85
+ markdown_lines.append(row_text)
86
+
87
+ return "\n".join(markdown_lines) + "\n"
88
+
89
+ # Configuration constants
90
+ DEFAULT_FORMULA_THRESHOLD = 3
91
+ DEFAULT_OCR_FALLBACK = True
22
92
 
23
93
  SUPPORTED_FITZ_TYPES = [
24
94
  "application/pdf",
@@ -116,30 +186,84 @@ def clean_pdf_text(text):
116
186
  return text.strip()
117
187
 
118
188
 
119
- async def _extract_text_from_pdf(pdf_path):
120
- doc = fitz.open(pdf_path)
121
- try:
122
- text = ""
123
- logger.debug(f"Found {len(doc)} pages in PDF")
124
- for page in doc:
125
- text += page.get_text()
126
- normalized_text = clean_pdf_text(text)
127
- return normalized_text
128
- finally:
129
- doc.close()
130
189
 
131
190
 
132
191
  async def _extract_text_from_pdf(pdf_path):
133
- """Extract text from PDF asynchronously"""
192
+ """Extract text from PDF asynchronously with table detection"""
134
193
 
135
194
  def _extract():
136
195
  doc = fitz.open(pdf_path)
137
196
  try:
138
- text = ""
197
+ full_text = []
139
198
  logger.debug(f"Found {len(doc)} pages in PDF")
140
- for page in doc:
141
- text += page.get_text()
142
- return clean_pdf_text(text)
199
+
200
+ # Use quality improvement flags for better text extraction
201
+ extraction_flags = (
202
+ fitz.TEXT_PRESERVE_LIGATURES | # Better character rendering
203
+ fitz.TEXT_PRESERVE_WHITESPACE | # Better spacing preservation
204
+ fitz.TEXT_PRESERVE_IMAGES # Better image-text integration
205
+ )
206
+
207
+ # Get OCR configuration
208
+ ocr_config = CONFIG.get('extraction', {}).get('pymupdf', {})
209
+ enable_ocr = ocr_config.get('enable_formula_ocr', False)
210
+ formula_threshold = ocr_config.get('formula_threshold', DEFAULT_FORMULA_THRESHOLD)
211
+ ocr_fallback = ocr_config.get('ocr_fallback', DEFAULT_OCR_FALLBACK)
212
+
213
+ for page_num, page in enumerate(doc):
214
+ # Extract regular text with quality flags
215
+ standard_text = page.get_text(flags=extraction_flags)
216
+
217
+ # Check if we should try OCR for this page
218
+ formula_count = count_formula_placeholders(standard_text)
219
+ use_ocr = (enable_ocr and
220
+ formula_count >= formula_threshold and
221
+ formula_count > 0)
222
+
223
+ if use_ocr:
224
+ logger.debug(f"Page {page_num + 1} has {formula_count} formulas, attempting OCR")
225
+ ocr_text = extract_page_with_ocr(page, page_num + 1)
226
+
227
+ if ocr_text and ocr_fallback:
228
+ # Use OCR text but preserve table extraction from standard text
229
+ page_text = ocr_text
230
+ logger.debug(f"Using OCR text for page {page_num + 1}")
231
+ else:
232
+ # OCR failed, use standard text
233
+ page_text = standard_text
234
+ if not ocr_text:
235
+ logger.debug(f"OCR failed for page {page_num + 1}, using standard extraction")
236
+ else:
237
+ page_text = standard_text
238
+
239
+ # Try to find and extract tables (regardless of OCR)
240
+ try:
241
+ tables = page.find_tables()
242
+ if tables:
243
+ logger.debug(f"Found {len(tables)} table(s) on page {page_num + 1}")
244
+
245
+ # For each table found, convert to markdown and append
246
+ for table_num, table in enumerate(tables):
247
+ # Extract table data
248
+ table_data = table.extract()
249
+ # Validate table has actual content (not just empty rows/cells)
250
+ if table_data and len(table_data) > 0 and any(
251
+ any(str(cell).strip() for cell in row if cell) for row in table_data if row
252
+ ):
253
+ # Add a marker before the table
254
+ page_text += f"\n\n[Table {table_num + 1} from page {page_num + 1}]\n"
255
+ # Convert to markdown
256
+ markdown_table = convert_table_to_markdown(table_data)
257
+ page_text += markdown_table + "\n"
258
+ except Exception as e:
259
+ # If table extraction fails, continue with regular text
260
+ logger.debug(f"Table extraction failed on page {page_num + 1}: {e}")
261
+
262
+ full_text.append(page_text)
263
+
264
+ # Join all pages and clean
265
+ combined_text = "".join(full_text)
266
+ return clean_pdf_text(combined_text)
143
267
  finally:
144
268
  doc.close()
145
269
 
content_core-1.1.0.dist-info/METADATA → content_core-1.2.0.dist-info/METADATA
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.1.0
4
- Summary: Extract what matters from any media source
3
+ Version: 1.2.0
4
+ Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
7
7
  Requires-Python: >=3.10
@@ -10,7 +10,6 @@ Requires-Dist: aiohttp>=3.11
10
10
  Requires-Dist: asciidoc>=10.2.1
11
11
  Requires-Dist: bs4>=0.0.2
12
12
  Requires-Dist: dicttoxml>=1.7.16
13
- Requires-Dist: docling>=2.34.0
14
13
  Requires-Dist: esperanto>=1.2.0
15
14
  Requires-Dist: firecrawl-py>=2.7.0
16
15
  Requires-Dist: jinja2>=3.1.6
@@ -31,6 +30,8 @@ Requires-Dist: pytubefix>=9.1.1
31
30
  Requires-Dist: readability-lxml>=0.8.4.1
32
31
  Requires-Dist: validators>=0.34.0
33
32
  Requires-Dist: youtube-transcript-api>=1.0.3
33
+ Provides-Extra: docling
34
+ Requires-Dist: docling>=2.34.0; extra == 'docling'
34
35
  Provides-Extra: mcp
35
36
  Requires-Dist: fastmcp>=0.5.0; extra == 'mcp'
36
37
  Description-Content-Type: text/markdown
@@ -39,28 +40,70 @@ Description-Content-Type: text/markdown
39
40
 
40
41
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
41
42
 
42
- **Content Core** is a versatile Python library designed to extract and process content from various sources, providing a unified interface for handling text, web pages, and local files.
43
+ **Content Core** is a powerful, AI-powered content extraction and processing platform that transforms any source into clean, structured content. Extract text from websites, transcribe videos, process documents, and generate AI summaries—all through a unified interface with multiple integration options.
43
44
 
44
- ## Overview
45
+ ## 🚀 What You Can Do
45
46
 
46
- > **Note:** As of v0.8, the default extraction engine is `'auto'`. Content Core will automatically select the best extraction method based on your environment and available API keys, with a smart fallback order for both URLs and files. For files/documents, `'auto'` now tries Docling first, then falls back to simple extraction. You can override the engine if needed, but `'auto'` is recommended for most users.
47
+ **Extract content from anywhere:**
48
+ - 📄 **Documents** - PDF, Word, PowerPoint, Excel, Markdown, HTML, EPUB
49
+ - 🎥 **Media** - Videos (MP4, AVI, MOV) with automatic transcription
50
+ - 🎵 **Audio** - MP3, WAV, M4A with speech-to-text conversion
51
+ - 🌐 **Web** - Any URL with intelligent content extraction
52
+ - 🖼️ **Images** - JPG, PNG, TIFF with OCR text recognition
53
+ - 📦 **Archives** - ZIP, TAR, GZ with content analysis
47
54
 
48
- The primary goal of Content Core is to simplify the process of ingesting content from diverse origins. Whether you have raw text, a URL pointing to an article, or a local file like a video or markdown document, Content Core aims to extract the meaningful content for further use.
55
+ **Process with AI:**
56
+ - ✨ **Clean & format** extracted content automatically
57
+ - 📝 **Generate summaries** with customizable styles (bullet points, executive summary, etc.)
58
+ - 🎯 **Context-aware processing** - explain to a child, technical summary, action items
59
+ - 🔄 **Smart engine selection** - automatically chooses the best extraction method
49
60
 
50
- ## Key Features
61
+ ## 🛠️ Multiple Ways to Use
51
62
 
52
- * **Multi-Source Extraction:** Handles content from:
53
- * Direct text strings.
54
- * Web URLs (using robust extraction methods).
55
- * Local files (including automatic transcription for video/audio files and parsing for text-based formats).
56
- * **Intelligent Processing:** Applies appropriate extraction techniques based on the source type. See the [Processors Documentation](./docs/processors.md) for detailed information on how different content types are handled.
57
- * **Smart Engine Selection:** By default, Content Core uses the `'auto'` engine, which:
58
- * For URLs: Uses Firecrawl if `FIRECRAWL_API_KEY` is set, else tries Jina. Jina might fail because of rate limits, which can be fixed by adding `JINA_API_KEY`. If Jina failes, BeautifulSoup is used as a fallback.
59
- * For files: Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
60
- * You can override this by specifying an engine, but `'auto'` is recommended for most users.
61
- * **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
62
- * **MCP Server:** Includes a Model Context Protocol (MCP) server for seamless integration with Claude Desktop and other MCP-compatible applications.
63
- * **Asynchronous:** Built with `asyncio` for efficient I/O operations.
63
+ ### 🖥️ Command Line (Zero Install)
64
+ ```bash
65
+ # Extract content from any source
66
+ uvx --from "content-core" ccore https://example.com
67
+ uvx --from "content-core" ccore document.pdf
68
+
69
+ # Generate AI summaries
70
+ uvx --from "content-core" csum video.mp4 --context "bullet points"
71
+ ```
72
+
73
+ ### 🤖 Claude Desktop Integration
74
+ One-click setup with Model Context Protocol (MCP) - extract content directly in Claude conversations.
75
+
76
+ ### 🔍 Raycast Extension
77
+ Smart auto-detection commands:
78
+ - **Extract Content** - Full interface with format options
79
+ - **Summarize Content** - 9 summary styles available
80
+ - **Quick Extract** - Instant clipboard extraction
81
+
82
+ ### 🖱️ macOS Right-Click Integration
83
+ Right-click any file in Finder → Services → Extract or Summarize content instantly.
84
+
85
+ ### 🐍 Python Library
86
+ ```python
87
+ import content_core as cc
88
+
89
+ # Extract from any source
90
+ result = await cc.extract("https://example.com/article")
91
+ summary = await cc.summarize_content(result, context="explain to a child")
92
+ ```
93
+
94
+ ## ⚡ Key Features
95
+
96
+ * **🎯 Intelligent Auto-Detection:** Automatically selects the best extraction method based on content type and available services
97
+ * **🔧 Smart Engine Selection:**
98
+ * **URLs:** Firecrawl → Jina → BeautifulSoup fallback chain
99
+ * **Documents:** Docling → Enhanced PyMuPDF → Simple extraction fallback
100
+ * **Media:** OpenAI Whisper transcription
101
+ * **Images:** OCR with multiple engine support
102
+ * **📊 Enhanced PDF Processing:** Advanced PyMuPDF engine with quality flags, table detection, and optional OCR for mathematical formulas
103
+ * **🌍 Multiple Integrations:** CLI, Python library, MCP server, Raycast extension, macOS Services
104
+ * **⚡ Zero-Install Options:** Use `uvx` for instant access without installation
105
+ * **🧠 AI-Powered Processing:** LLM integration for content cleaning and summarization
106
+ * **🔄 Asynchronous:** Built with `asyncio` for efficient processing
64
107
 
65
108
  ## Getting Started
66
109
 
@@ -92,6 +135,18 @@ uv sync
92
135
  Content Core provides three CLI commands for extracting, cleaning, and summarizing content:
93
136
  ccore, cclean, and csum. These commands support input from text, URLs, files, or piped data (e.g., via cat file | command).
94
137
 
138
+ **Zero-install usage with uvx:**
139
+ ```bash
140
+ # Extract content
141
+ uvx --from "content-core" ccore https://example.com
142
+
143
+ # Clean content
144
+ uvx --from "content-core" cclean "messy content"
145
+
146
+ # Summarize content
147
+ uvx --from "content-core" csum "long text" --context "bullet points"
148
+ ```
149
+
95
150
  #### ccore - Extract Content
96
151
 
97
152
  Extracts content from text, URLs, or files, with optional formatting.
@@ -232,6 +287,136 @@ Add to your `claude_desktop_config.json`:
232
287
 
233
288
  For detailed setup instructions, configuration options, and usage examples, see our [MCP Documentation](docs/mcp.md).
234
289
 
290
+ ## Enhanced PDF Processing
291
+
292
+ Content Core features an optimized PyMuPDF extraction engine with significant improvements for scientific documents and complex PDFs.
293
+
294
+ ### Key Improvements
295
+
296
+ - **🔬 Mathematical Formula Extraction**: Enhanced quality flags eliminate `<!-- formula-not-decoded -->` placeholders
297
+ - **📊 Automatic Table Detection**: Tables converted to markdown format for LLM consumption
298
+ - **🔧 Quality Text Rendering**: Better ligature, whitespace, and image-text integration
299
+ - **⚡ Optional OCR Enhancement**: Selective OCR for formula-heavy pages (requires Tesseract)
300
+
301
+ ### Configuration for Scientific Documents
302
+
303
+ For documents with heavy mathematical content, enable OCR enhancement:
304
+
305
+ ```yaml
306
+ # In cc_config.yaml
307
+ extraction:
308
+ pymupdf:
309
+ enable_formula_ocr: true # Enable OCR for formula-heavy pages
310
+ formula_threshold: 3 # Min formulas per page to trigger OCR
311
+ ocr_fallback: true # Graceful fallback if OCR fails
312
+ ```
313
+
314
+ ```python
315
+ # Runtime configuration
316
+ from content_core.config import set_pymupdf_ocr_enabled
317
+ set_pymupdf_ocr_enabled(True)
318
+ ```
319
+
320
+ ### Requirements for OCR Enhancement
321
+
322
+ ```bash
323
+ # Install Tesseract OCR (optional, for formula enhancement)
324
+ # macOS
325
+ brew install tesseract
326
+
327
+ # Ubuntu/Debian
328
+ sudo apt-get install tesseract-ocr
329
+ ```
330
+
331
+ **Note**: OCR is optional - you get improved PDF extraction automatically without any additional setup.
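The table detection emits GitHub-style markdown tables inline with the page text. A small sketch calling the new internal helpers from this release directly (the table data below is made up for illustration):

```python
from content_core.processors.pdf import (
    convert_table_to_markdown,
    count_formula_placeholders,
)

# Hypothetical table data, shaped like PyMuPDF's table.extract() output.
table = [["Name", "Score"], ["Alice", "10"], ["Bob", "7"]]
print(convert_table_to_markdown(table))
# | Name | Score |
# | --- | --- |
# | Alice | 10 |
# | Bob | 7 |

# Formula placeholders left by standard extraction are counted the same way
# the processor does when deciding whether to attempt OCR.
count_formula_placeholders("x = y <!-- formula-not-decoded --> z")  # -> 1
```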
332
+
333
+ ## macOS Services Integration
334
+
335
+ Content Core provides powerful right-click integration with macOS Finder, allowing you to extract and summarize content from any file without installation. Choose between clipboard or TextEdit output for maximum flexibility.
336
+
337
+ ### Available Services
338
+
339
+ Create **4 convenient services** for different workflows:
340
+
341
+ - **Extract Content → Clipboard** - Quick copy for immediate pasting
342
+ - **Extract Content → TextEdit** - Review before using
343
+ - **Summarize Content → Clipboard** - Quick summary copying
344
+ - **Summarize Content → TextEdit** - Formatted summary with headers
345
+
346
+ ### Quick Setup
347
+
348
+ 1. **Install uv** (if not already installed):
349
+ ```bash
350
+ curl -LsSf https://astral.sh/uv/install.sh | sh
351
+ ```
352
+
353
+ 2. **Create services manually** using Automator (5 minutes setup)
354
+
355
+ ### Usage
356
+
357
+ **Right-click any supported file** in Finder → **Services** → Choose your option:
358
+
359
+ - **PDFs, Word docs** - Instant text extraction
360
+ - **Videos, audio files** - Automatic transcription
361
+ - **Images** - OCR text recognition
362
+ - **Web content** - Clean text extraction
363
+ - **Multiple files** - Batch processing support
364
+
365
+ ### Features
366
+
367
+ - **Zero-install processing**: Uses `uvx` for isolated execution
368
+ - **Multiple output options**: Clipboard or TextEdit display
369
+ - **System notifications**: Visual feedback on completion
370
+ - **Wide format support**: 20+ file types supported
371
+ - **Batch processing**: Handle multiple files at once
372
+ - **Keyboard shortcuts**: Assignable hotkeys for power users
373
+
374
+ For complete setup instructions with copy-paste scripts, see [macOS Services Documentation](docs/macos.md).
375
+
376
+ ## Raycast Extension
377
+
378
+ Content Core provides a powerful Raycast extension with smart auto-detection that handles both URLs and file paths seamlessly. Extract and summarize content directly from your Raycast interface without switching applications.
379
+
380
+ ### Quick Setup
381
+
382
+ **From Raycast Store** (coming soon):
383
+ 1. Open Raycast and search for "Content Core"
384
+ 2. Install the extension by `luis_novo`
385
+ 3. Configure API keys in preferences
386
+
387
+ **Manual Installation**:
388
+ 1. Download the extension from the repository
389
+ 2. Open Raycast → "Import Extension"
390
+ 3. Select the `raycast-content-core` folder
391
+
392
+ ### Commands
393
+
394
+ **🔍 Extract Content** - Smart URL/file detection with full interface
395
+ - Auto-detects URLs vs file paths in real-time
396
+ - Multiple output formats (Text, JSON, XML)
397
+ - Drag & drop support for files
398
+ - Rich results view with metadata
399
+
400
+ **📝 Summarize Content** - AI-powered summaries with customizable styles
401
+ - 9 different summary styles (bullet points, executive summary, etc.)
402
+ - Auto-detects source type with visual feedback
403
+ - One-click snippet creation and quicklinks
404
+
405
+ **⚡ Quick Extract** - Instant extraction to clipboard
406
+ - Type → Tab → Paste source → Enter
407
+ - No UI, works directly from command bar
408
+ - Perfect for quick workflows
409
+
410
+ ### Features
411
+
412
+ - **Smart Auto-Detection**: Instantly recognizes URLs vs file paths
413
+ - **Zero Installation**: Uses `uvx` for Content Core execution
414
+ - **Rich Integration**: Keyboard shortcuts, clipboard actions, Raycast snippets
415
+ - **All File Types**: Documents, videos, audio, images, archives
416
+ - **Visual Feedback**: Real-time type detection with icons
417
+
418
+ For detailed setup, configuration, and usage examples, see [Raycast Extension Documentation](docs/raycast.md).
419
+
235
420
  ## Using with Langchain
236
421
 
237
422
  For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
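  For illustration only, here is a minimal sketch of wrapping the extraction API as a custom Langchain tool yourself; the bundled tools under `src/content_core/tools` remain the supported route, and the tool name below is hypothetical:

```python
from langchain_core.tools import tool

import content_core as cc


@tool
async def extract_url(url: str) -> str:
    """Extract clean text content from a web page using Content Core."""
    result = await cc.extract_content({"url": url})
    return result.content or ""
```

An agent can then invoke this like any other async Langchain tool.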
content_core-1.1.0.dist-info/RECORD → content_core-1.2.0.dist-info/RECORD
@@ -1,6 +1,6 @@
1
1
  content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
2
- content_core/cc_config.yaml,sha256=gGSPM-oO6GIHyCfDCH-cN72BgPJiRmZMgwPrrLhUmfU,851
3
- content_core/config.py,sha256=vyx0fioR6r0mcZfVdwAFDhFrRNoG0ZNG8RNxIDnhNlo,1802
2
+ content_core/cc_config.yaml,sha256=hjTt5z1Z9b5LShVIqNT3OiAnTAdmr0LB5y8RTyH-fNA,1119
3
+ content_core/config.py,sha256=OBwI58W4Twr00UiYD2mdw_rZDcuXxjBanE0IoA8ox-M,2601
4
4
  content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
5
5
  content_core/models.py,sha256=Kt6tWdAX87eQ2tL6eTwcHU7_NIRnN4exP4RzV2WrMig,881
6
6
  content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
@@ -20,12 +20,13 @@ content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjloz
20
20
  content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
21
21
  content_core/content/summary/core.py,sha256=kEabpETljzUb-yf0NcVWTOuCtayESo74gGBVDX7YTFs,550
22
22
  content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
23
- content_core/mcp/server.py,sha256=m2A63Qle3nJ_Lw46uWkwVvYERtEw84hd7NHAn1rwdAQ,6968
23
+ content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,7021
24
24
  content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
25
+ content_core/notebooks/urls.ipynb,sha256=gSmiSzmbol_Li36w8tpUsy5QgRbrnBx94Ry2zHwMvwY,7107
25
26
  content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
26
27
  content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMguabM9Pc,2151
27
28
  content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
28
- content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
29
+ content_core/processors/pdf.py,sha256=TTDhfV2INtXumFDjLJFNMRfpbJ_tqwIcSBDzuThKxJI,10617
29
30
  content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
30
31
  content_core/processors/url.py,sha256=6WT8Sw2VHiKyhgWXi_jZjKjwnT_QPSPcH4P99RKbjgU,7521
31
32
  content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
@@ -34,8 +35,8 @@ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8j
34
35
  content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
35
36
  content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
36
37
  content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
37
- content_core-1.1.0.dist-info/METADATA,sha256=9-ppXQ7o-s8BCb2lH5xBiaiYBHmOFmXFrWntHuo9G_o,13017
38
- content_core-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
39
- content_core-1.1.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
40
- content_core-1.1.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
41
- content_core-1.1.0.dist-info/RECORD,,
38
+ content_core-1.2.0.dist-info/METADATA,sha256=wAEQSfn6tTd4hQwAZY8sKeB5e7QpHm6qeTz2akFZwWw,18881
39
+ content_core-1.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
40
+ content_core-1.2.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
41
+ content_core-1.2.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
42
+ content_core-1.2.0.dist-info/RECORD,,