content-core 1.1.2-py3-none-any.whl → 1.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of content-core might be problematic.

content_core/cc_config.yaml CHANGED
@@ -34,6 +34,10 @@ extraction:
   url_engine: auto # auto | simple | firecrawl | jina | docling - for URLs
   docling:
     output_format: markdown # markdown | html | json
+  pymupdf:
+    enable_formula_ocr: false # Enable OCR for formula-heavy pages (requires Tesseract)
+    formula_threshold: 3 # Minimum formulas per page to trigger OCR
+    ocr_fallback: true # Gracefully fallback to standard extraction if OCR fails
 
   youtube_transcripts:
     preferred_languages: ["en", "es", "pt"]
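
The new `pymupdf` block is read at extraction time with safe defaults, so existing config files keep working unchanged. A minimal sketch of the lookup, mirroring the `CONFIG.get(...)` chain used in `content_core/processors/pdf.py` below (assumes content-core 1.2.1 is installed):

```python
# Sketch: reading the new pymupdf settings; absent keys fall back to the
# shipped defaults shown in the diff above.
from content_core.config import CONFIG

pymupdf_cfg = CONFIG.get("extraction", {}).get("pymupdf", {})
print(pymupdf_cfg.get("enable_formula_ocr", False))  # shipped default: false
print(pymupdf_cfg.get("formula_threshold", 3))       # shipped default: 3
print(pymupdf_cfg.get("ocr_fallback", True))         # shipped default: true
```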
content_core/config.py CHANGED
@@ -6,6 +6,10 @@ from dotenv import load_dotenv
 # Load environment variables from .env file
 load_dotenv()
 
+# Allowed engine values for validation
+ALLOWED_DOCUMENT_ENGINES = {"auto", "simple", "docling"}
+ALLOWED_URL_ENGINES = {"auto", "simple", "firecrawl", "jina"}
+
 
 def load_config():
     config_path = os.environ.get("CCORE_CONFIG_PATH") or os.environ.get("CCORE_MODEL_CONFIG_PATH")
@@ -14,8 +18,8 @@ def load_config():
         with open(config_path, "r") as file:
             return yaml.safe_load(file)
     except Exception as e:
-        print(f"Erro ao carregar o arquivo de configuração de {config_path}: {e}")
-        print("Usando configurações padrão internas.")
+        print(f"Error loading configuration file from {config_path}: {e}")
+        print("Using internal default settings.")
 
     default_config_data = pkgutil.get_data("content_core", "models_config.yaml")
     if default_config_data:
@@ -33,6 +37,39 @@ def load_config():
 
 CONFIG = load_config()
 
+# Environment variable engine selectors for MCP/Raycast users
+def get_document_engine():
+    """Get document engine with environment variable override and validation."""
+    env_engine = os.environ.get("CCORE_DOCUMENT_ENGINE")
+    if env_engine:
+        if env_engine not in ALLOWED_DOCUMENT_ENGINES:
+            # Import logger here to avoid circular imports
+            from content_core.logging import logger
+            logger.warning(
+                f"Invalid CCORE_DOCUMENT_ENGINE: '{env_engine}'. "
+                f"Allowed values: {', '.join(sorted(ALLOWED_DOCUMENT_ENGINES))}. "
+                f"Using default from config."
+            )
+            return CONFIG.get("extraction", {}).get("document_engine", "auto")
+        return env_engine
+    return CONFIG.get("extraction", {}).get("document_engine", "auto")
+
+def get_url_engine():
+    """Get URL engine with environment variable override and validation."""
+    env_engine = os.environ.get("CCORE_URL_ENGINE")
+    if env_engine:
+        if env_engine not in ALLOWED_URL_ENGINES:
+            # Import logger here to avoid circular imports
+            from content_core.logging import logger
+            logger.warning(
+                f"Invalid CCORE_URL_ENGINE: '{env_engine}'. "
+                f"Allowed values: {', '.join(sorted(ALLOWED_URL_ENGINES))}. "
+                f"Using default from config."
+            )
+            return CONFIG.get("extraction", {}).get("url_engine", "auto")
+        return env_engine
+    return CONFIG.get("extraction", {}).get("url_engine", "auto")
+
 # Programmatic config overrides: use in notebooks or scripts
 def set_document_engine(engine: str):
     """Override the document extraction engine ('auto', 'simple', or 'docling')."""
@@ -47,3 +84,21 @@ def set_docling_output_format(fmt: str):
     extraction = CONFIG.setdefault("extraction", {})
     docling_cfg = extraction.setdefault("docling", {})
     docling_cfg["output_format"] = fmt
+
+def set_pymupdf_ocr_enabled(enabled: bool):
+    """Enable or disable PyMuPDF OCR for formula-heavy pages."""
+    extraction = CONFIG.setdefault("extraction", {})
+    pymupdf_cfg = extraction.setdefault("pymupdf", {})
+    pymupdf_cfg["enable_formula_ocr"] = enabled
+
+def set_pymupdf_formula_threshold(threshold: int):
+    """Set the minimum number of formulas per page to trigger OCR."""
+    extraction = CONFIG.setdefault("extraction", {})
+    pymupdf_cfg = extraction.setdefault("pymupdf", {})
+    pymupdf_cfg["formula_threshold"] = threshold
+
+def set_pymupdf_ocr_fallback(enabled: bool):
+    """Enable or disable fallback to standard extraction when OCR fails."""
+    extraction = CONFIG.setdefault("extraction", {})
+    pymupdf_cfg = extraction.setdefault("pymupdf", {})
+    pymupdf_cfg["ocr_fallback"] = enabled
content_core/content/extraction/graph.py CHANGED
@@ -12,13 +12,19 @@ from content_core.common import (
     ProcessSourceState,
     UnsupportedTypeException,
 )
-from content_core.config import CONFIG  # type: ignore
+from content_core.config import get_document_engine
 from content_core.logging import logger
 from content_core.processors.audio import extract_audio_data  # type: ignore
-from content_core.processors.docling import (
-    DOCLING_SUPPORTED,  # type: ignore
-    extract_with_docling,
-)
+try:
+    from content_core.processors.docling import (
+        DOCLING_SUPPORTED,  # type: ignore
+        extract_with_docling,
+        DOCLING_AVAILABLE,
+    )
+except ImportError:
+    DOCLING_AVAILABLE = False
+    DOCLING_SUPPORTED = set()
+    extract_with_docling = None
 from content_core.processors.office import (
     SUPPORTED_OFFICE_TYPES,
     extract_office_content,
@@ -126,26 +132,30 @@ async def file_type_router_docling(state: ProcessSourceState) -> str:
     Supports 'auto', 'docling', and 'simple'.
     'auto' tries docling first, then falls back to simple if docling fails.
     """
-    engine = state.document_engine or CONFIG.get("extraction", {}).get("document_engine", "auto")
+    # Use environment-aware engine selection
+    engine = state.document_engine or get_document_engine()
+
     if engine == "auto":
         logger.debug("Using auto engine")
-        # Try docling first; if it fails or is not supported, fallback to simple
-        if state.identified_type in DOCLING_SUPPORTED:
-            try:
-                logger.debug("Trying docling extraction")
-                return "extract_docling"
-            except Exception as e:
-                logger.warning(
-                    f"Docling extraction failed in 'auto' mode, falling back to simple: {e}"
-                )
+        # Check if docling is available AND supports the file type
+        if DOCLING_AVAILABLE and state.identified_type in DOCLING_SUPPORTED:
+            logger.debug("Using docling extraction (auto mode)")
+            return "extract_docling"
         # Fallback to simple
-        logger.debug("Falling back to simple extraction")
+        logger.debug("Falling back to simple extraction (docling unavailable or unsupported)")
         return await file_type_edge(state)
 
-    if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
-        logger.debug("Using docling engine")
-        return "extract_docling"
-    # For 'simple', use the default file type edge
+    if engine == "docling":
+        if not DOCLING_AVAILABLE:
+            raise ImportError("Docling engine requested but docling package not installed. Install with: pip install content-core[docling]")
+        if state.identified_type in DOCLING_SUPPORTED:
+            logger.debug("Using docling engine")
+            return "extract_docling"
+        # If docling doesn't support this file type, fall back to simple
+        logger.debug("Docling doesn't support this file type, using simple engine")
+        return await file_type_edge(state)
+
+    # For 'simple' or any other engine
     logger.debug("Using simple engine")
     return await file_type_edge(state)
 
@@ -168,7 +178,9 @@ workflow.add_node("extract_audio_data", extract_audio_data)
 workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
 workflow.add_node("delete_file", delete_file)
 workflow.add_node("download_remote_file", download_remote_file)
-workflow.add_node("extract_docling", extract_with_docling)
+# Only add docling node if available
+if DOCLING_AVAILABLE:
+    workflow.add_node("extract_docling", extract_with_docling)
 
 # Add edges
 workflow.add_edge(START, "source")
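
The routing above degrades gracefully because the availability flag is computed at import time. A short sketch of how downstream code can observe it, assuming content-core 1.2.1 is installed:

```python
# Without the 'docling' extra, the guarded import in graph.py leaves
# DOCLING_AVAILABLE False, so 'auto' routes to simple extraction and an
# explicit docling request raises ImportError with installation hints.
from content_core.processors.docling import DOCLING_AVAILABLE, DOCLING_SUPPORTED

if DOCLING_AVAILABLE:
    print(sorted(DOCLING_SUPPORTED))  # MIME types routed to extract_docling
else:
    print("docling extra not installed; the router will use the simple engine")
```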
content_core/notebooks/urls.ipynb ADDED
@@ -0,0 +1,154 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "873a872b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from content_core.content.extraction import extract_content\n",
+    "\n",
+    "async def process_url(url):\n",
+    "    print(\"Processing: \", url)\n",
+    "    print(\"Simple: -------\")\n",
+    "    result = await extract_content(dict(url=url, engine=\"simple\"))\n",
+    "    print(result.title[:100])\n",
+    "    print(result.content[:100])\n",
+    "    print(\"Jina: -------\")\n",
+    "    result = await extract_content(dict(url=url, engine=\"jina\"))\n",
+    "    print(result.title[:100])\n",
+    "    print(result.content[:100])\n",
+    "    print(\"Firecrawl: -------\")\n",
+    "    result = await extract_content(dict(url=url, engine=\"firecrawl\"))\n",
+    "    print(result.title[:100])\n",
+    "    print(result.content[:100])\n",
+    "    print(\"=============================\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "263dc3af",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Processing: https://www.supernovalabs.com.br/\n",
+      "Simple: -------\n",
+      "Readability failed: No content extracted by readability\n",
+      "Supernova Labs | AI Consulting\n",
+      "Supernova Labs | AI Consulting\n",
+      "Jina: -------\n",
+      "Supernova Labs | Elite AI Consulting to help you build the Future\n",
+      "URL Source: https://www.supernovalabs.com.br/\n",
+      "\n",
+      "Markdown Content:\n",
+      "Supernova Labs\n",
+      "\n",
+      "[About](https://www\n",
+      "Firecrawl: -------\n",
+      "Supernova Labs | AI Consulting\n",
+      "# Unleash Your AI Edge. Fast.\n",
+      "\n",
+      "We turn your data, tech and capabilities into impact with lean AI sol\n",
+      "=============================\n",
+      "None\n",
+      "Processing: https://building.nubank.com/fine-tuning-transaction-user-models/\n",
+      "Simple: -------\n",
+      "Fine-Tuning Transaction User Models - Building Nubank\n",
+      "Fine-Tuning Transaction User Models Learn how we combine transaction embeddings with tabular data us\n",
+      "Jina: -------\n",
+      "Fine-Tuning Transaction User Models - Building Nubank\n",
+      "URL Source: https://building.nubank.com/fine-tuning-transaction-user-models/\n",
+      "\n",
+      "Published Time: 2025-0\n",
+      "Firecrawl: -------\n",
+      "Fine-Tuning Transaction User Models - Building Nubank\n",
+      "# Fine-Tuning Transaction User Models\n",
+      "\n",
+      "Learn how we combine transaction embeddings with tabular data\n",
+      "=============================\n",
+      "None\n",
+      "Processing: https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quitting-your-job-5296bbcb703b\n",
+      "Simple: -------\n",
+      "You Can Make Money With AI Without Quitting Your Job | by Nipuna Maduranga | LearnAIforproft.com | M\n",
+      "Most people think they need to quit their job to build a new life. I thought that too. You scroll th\n",
+      "Jina: -------\n",
+      "You Can Make Money With AI Without Quitting Your Job\n",
+      "URL Source: https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quittin\n",
+      "Firecrawl: -------\n",
+      "You Can Make Money With AI Without Quitting Your Job | by Nipuna Maduranga | LearnAIforproft.com | M\n",
+      "[Sitemap](https://medium.com/sitemap/sitemap.xml)\n",
+      "\n",
+      "[Open in app](https://rsci.app.link/?%24canonical\n",
+      "=============================\n",
+      "None\n",
+      "Processing: https://github.com/mirkonasato/pyodconverter\n",
+      "Simple: -------\n",
+      "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
+      "This repository was archived by the owner on Dec 1, 2023. It is now read-only. mirkonasato/pyodconve\n",
+      "Jina: -------\n",
+      "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
+      "URL Source: https://github.com/mirkonasato/pyodconverter\n",
+      "\n",
+      "Markdown Content:\n",
+      "GitHub - mirkonasato/pyo\n",
+      "Firecrawl: -------\n",
+      "GitHub - mirkonasato/pyodconverter: Python script to automate document conversions using LibreOffice\n",
+      "[Skip to content](https://github.com/mirkonasato/pyodconverter#start-of-content)\n",
+      "\n",
+      "You signed in with\n",
+      "=============================\n",
+      "None\n",
+      "Processing: https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR\n",
+      "Simple: -------\n",
+      "Error processing URL https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR: HTTP error: 500\n",
+      "Error\n",
+      "Failed to extract content: HTTP error: 500\n",
+      "Jina: -------\n",
+      "Ultra-aprendizado: domine habilidades valiosas, seja mais esperto que a competição e dê um impulso n\n",
+      "URL Source: https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-ca\n",
+      "Firecrawl: -------\n",
+      "Amazon.com.br\n",
+      "#### Digite os caracteres que você vê abaixo\n",
+      "\n",
+      "Desculpe pelo inconveniente. Para continuar realizando\n",
+      "=============================\n",
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "urls= [\"https://www.supernovalabs.com.br/\", \"https://building.nubank.com/fine-tuning-transaction-user-models/\", \"https://medium.com/writing-for-profit-with-ai/you-can-make-money-with-ai-without-quitting-your-job-5296bbcb703b\", \"https://github.com/mirkonasato/pyodconverter\", \"https://www.amazon.com.br/Ultra-aprendizado-habilidades-valiosas-competi%C3%A7%C3%A3o-carreira/dp/6555110058/ref=asc_df_6555110058?tag=googleshopp00-20&hvadid=709857900630&hvpos=&hvnetw=g&hvrand=17798174883330212364&hvpone=&hvptwo=&hvqmt=&hvdev=c&hvdvcmdl=&hvlocint=&hvlocphy=9195894&hvtargid=pla-1148630207439&psc=1&language=pt_BR\"]\n",
+    "for url in urls:\n",
+    "    result = await process_url(url=url)\n",
+    "    print(result)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
content_core/processors/docling.py CHANGED
@@ -2,22 +2,29 @@
 Docling-based document extraction processor.
 """
 
+from content_core.common.state import ProcessSourceState
+from content_core.config import CONFIG
+
+DOCLING_AVAILABLE = False
 try:
     from docling.document_converter import DocumentConverter
+    DOCLING_AVAILABLE = True
 except ImportError:
 
     class DocumentConverter:
         """Stub when docling is not installed."""
 
         def __init__(self):
-            raise ImportError("Docling not installed")
+            raise ImportError(
+                "Docling not installed. Install with: pip install content-core[docling] "
+                "or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
+            )
 
         def convert(self, source: str):
-            raise ImportError("Docling not installed")
-
-
-from content_core.common.state import ProcessSourceState
-from content_core.config import CONFIG
+            raise ImportError(
+                "Docling not installed. Install with: pip install content-core[docling] "
+                "or use CCORE_DOCUMENT_ENGINE=simple to skip docling."
+            )
 
 # Supported MIME types for Docling extraction
 DOCLING_SUPPORTED = {
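
A hedged sketch of the stub behavior when the extra is absent: constructing the placeholder `DocumentConverter` raises an `ImportError` naming both remedies shown in the diff (install the extra, or set `CCORE_DOCUMENT_ENGINE=simple`):

```python
from content_core.processors.docling import DOCLING_AVAILABLE, DocumentConverter

if not DOCLING_AVAILABLE:
    try:
        DocumentConverter()  # stub __init__ raises with installation guidance
    except ImportError as err:
        print(err)
```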
content_core/processors/pdf.py CHANGED
@@ -5,20 +5,90 @@ import unicodedata
 import fitz  # type: ignore
 
 from content_core.common import ProcessSourceState
+from content_core.config import CONFIG
 from content_core.logging import logger
 
-# todo: find tables - https://pymupdf.readthedocs.io/en/latest/the-basics.html#extracting-tables-from-a-page
-# todo: what else can we do to make the text more readable?
-# todo: try to fix encoding for some PDF that is still breaking
-# def _extract_text_from_pdf(pdf_path):
-#     doc = fitz.open(pdf_path)
-#     text = ""
-#     logger.debug(f"Found {len(doc)} pages in PDF")
-#     for page in doc:
-#         # Use encode/decode if you need to clean up any encoding issues
-#         text += page.get_text().encode('utf-8').decode('utf-8')
-#     doc.close()
-#     return text
+def count_formula_placeholders(text):
+    """
+    Count the number of formula placeholders in extracted text.
+
+    Args:
+        text (str): Extracted text content
+    Returns:
+        int: Number of formula placeholders found
+    """
+    if not text:
+        return 0
+    return text.count('<!-- formula-not-decoded -->')
+
+
+def extract_page_with_ocr(page, page_num):
+    """
+    Extract text from a page using OCR (Tesseract).
+
+    Args:
+        page: PyMuPDF page object
+        page_num (int): Page number for logging
+    Returns:
+        str: OCR-extracted text or None if OCR fails
+    """
+    try:
+        logger.debug(f"Attempting OCR extraction for page {page_num}")
+        # Create TextPage using OCR
+        textpage = page.get_textpage_ocr()
+        if textpage:
+            # Extract text from the OCR TextPage
+            ocr_text = textpage.extractText()
+            logger.debug(f"OCR successful for page {page_num}, extracted {len(ocr_text)} characters")
+            return ocr_text
+        else:
+            logger.warning(f"OCR TextPage creation failed for page {page_num}")
+            return None
+    except (ImportError, RuntimeError, OSError) as e:
+        # Common errors: Tesseract not installed, OCR failure, file access issues
+        logger.debug(f"OCR extraction failed for page {page_num}: {e}")
+        return None
+    except Exception as e:
+        # Unexpected errors - log as warning for debugging
+        logger.warning(f"Unexpected error during OCR extraction for page {page_num}: {e}")
+        return None
+
+
+def convert_table_to_markdown(table):
+    """
+    Convert a PyMuPDF table to markdown format.
+
+    Args:
+        table: Table data from PyMuPDF (list of lists)
+    Returns:
+        str: Markdown-formatted table
+    """
+    if not table or not table[0]:
+        return ""
+
+    # Build markdown table
+    markdown_lines = []
+
+    # Header row
+    header = table[0]
+    header_row = "| " + " | ".join(str(cell) if cell else "" for cell in header) + " |"
+    markdown_lines.append(header_row)
+
+    # Separator row
+    separator = "|" + "|".join([" --- " for _ in header]) + "|"
+    markdown_lines.append(separator)
+
+    # Data rows
+    for row in table[1:]:
+        if row:  # Skip empty rows
+            row_text = "| " + " | ".join(str(cell) if cell else "" for cell in row) + " |"
+            markdown_lines.append(row_text)
+
+    return "\n".join(markdown_lines) + "\n"
+
+# Configuration constants
+DEFAULT_FORMULA_THRESHOLD = 3
+DEFAULT_OCR_FALLBACK = True
 
 SUPPORTED_FITZ_TYPES = [
     "application/pdf",
@@ -116,30 +186,84 @@ def clean_pdf_text(text):
     return text.strip()
 
 
-async def _extract_text_from_pdf(pdf_path):
-    doc = fitz.open(pdf_path)
-    try:
-        text = ""
-        logger.debug(f"Found {len(doc)} pages in PDF")
-        for page in doc:
-            text += page.get_text()
-        normalized_text = clean_pdf_text(text)
-        return normalized_text
-    finally:
-        doc.close()
 
 
 async def _extract_text_from_pdf(pdf_path):
-    """Extract text from PDF asynchronously"""
+    """Extract text from PDF asynchronously with table detection"""
 
     def _extract():
         doc = fitz.open(pdf_path)
        try:
-            text = ""
+            full_text = []
             logger.debug(f"Found {len(doc)} pages in PDF")
-            for page in doc:
-                text += page.get_text()
-            return clean_pdf_text(text)
+
+            # Use quality improvement flags for better text extraction
+            extraction_flags = (
+                fitz.TEXT_PRESERVE_LIGATURES |  # Better character rendering
+                fitz.TEXT_PRESERVE_WHITESPACE |  # Better spacing preservation
+                fitz.TEXT_PRESERVE_IMAGES  # Better image-text integration
+            )
+
+            # Get OCR configuration
+            ocr_config = CONFIG.get('extraction', {}).get('pymupdf', {})
+            enable_ocr = ocr_config.get('enable_formula_ocr', False)
+            formula_threshold = ocr_config.get('formula_threshold', DEFAULT_FORMULA_THRESHOLD)
+            ocr_fallback = ocr_config.get('ocr_fallback', DEFAULT_OCR_FALLBACK)
+
+            for page_num, page in enumerate(doc):
+                # Extract regular text with quality flags
+                standard_text = page.get_text(flags=extraction_flags)
+
+                # Check if we should try OCR for this page
+                formula_count = count_formula_placeholders(standard_text)
+                use_ocr = (enable_ocr and
+                           formula_count >= formula_threshold and
+                           formula_count > 0)
+
+                if use_ocr:
+                    logger.debug(f"Page {page_num + 1} has {formula_count} formulas, attempting OCR")
+                    ocr_text = extract_page_with_ocr(page, page_num + 1)
+
+                    if ocr_text and ocr_fallback:
+                        # Use OCR text but preserve table extraction from standard text
+                        page_text = ocr_text
+                        logger.debug(f"Using OCR text for page {page_num + 1}")
+                    else:
+                        # OCR failed, use standard text
+                        page_text = standard_text
+                        if not ocr_text:
+                            logger.debug(f"OCR failed for page {page_num + 1}, using standard extraction")
+                else:
+                    page_text = standard_text
+
+                # Try to find and extract tables (regardless of OCR)
+                try:
+                    tables = page.find_tables()
+                    if tables:
+                        logger.debug(f"Found {len(tables)} table(s) on page {page_num + 1}")
+
+                        # For each table found, convert to markdown and append
+                        for table_num, table in enumerate(tables):
+                            # Extract table data
+                            table_data = table.extract()
+                            # Validate table has actual content (not just empty rows/cells)
+                            if table_data and len(table_data) > 0 and any(
+                                any(str(cell).strip() for cell in row if cell) for row in table_data if row
+                            ):
+                                # Add a marker before the table
+                                page_text += f"\n\n[Table {table_num + 1} from page {page_num + 1}]\n"
+                                # Convert to markdown
+                                markdown_table = convert_table_to_markdown(table_data)
+                                page_text += markdown_table + "\n"
+                except Exception as e:
+                    # If table extraction fails, continue with regular text
+                    logger.debug(f"Table extraction failed on page {page_num + 1}: {e}")
+
+                full_text.append(page_text)
+
+            # Join all pages and clean
+            combined_text = "".join(full_text)
+            return clean_pdf_text(combined_text)
         finally:
             doc.close()
 
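A small illustration of the new table converter, assuming content-core 1.2.1 is installed; the input shape matches what `table.extract()` returns in the loop above (a list of rows, header first):

```python
from content_core.processors.pdf import convert_table_to_markdown

table = [["Metric", "Value"], ["pages", "12"], ["tables", None]]
print(convert_table_to_markdown(table))
# | Metric | Value |
# | --- | --- |
# | pages | 12 |
# | tables |  |      <- falsy cells are rendered as empty strings
```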
content_core/processors/url.py CHANGED
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
 from readability import Document
 
 from content_core.common import ProcessSourceState
-from content_core.config import CONFIG
+from content_core.config import get_url_engine
 from content_core.logging import logger
 from content_core.processors.docling import DOCLING_SUPPORTED
 from content_core.processors.office import SUPPORTED_OFFICE_TYPES
@@ -165,7 +165,8 @@ async def extract_url(state: ProcessSourceState):
     """
     assert state.url, "No URL provided"
     url = state.url
-    engine = state.url_engine or CONFIG.get("extraction", {}).get("url_engine", "auto")
+    # Use environment-aware engine selection
+    engine = state.url_engine or get_url_engine()
     try:
         if engine == "auto":
             if os.environ.get("FIRECRAWL_API_KEY"):
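
On the URL path, precedence is now: an explicit per-call engine wins over `CCORE_URL_ENGINE`, which wins over the `url_engine` value in the config file. A short sketch mirroring the notebook shipped in this release (assumes content-core 1.2.1 and network access):

```python
import asyncio

from content_core.content.extraction import extract_content

async def main():
    # "jina" here overrides any environment or config default
    result = await extract_content(dict(url="https://example.com", engine="jina"))
    print(result.title)

asyncio.run(main())
```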
content_core-1.2.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 1.1.2
+Version: 1.2.1
 Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
@@ -10,7 +10,6 @@ Requires-Dist: aiohttp>=3.11
 Requires-Dist: asciidoc>=10.2.1
 Requires-Dist: bs4>=0.0.2
 Requires-Dist: dicttoxml>=1.7.16
-Requires-Dist: docling>=2.34.0
 Requires-Dist: esperanto>=1.2.0
 Requires-Dist: firecrawl-py>=2.7.0
 Requires-Dist: jinja2>=3.1.6
@@ -31,6 +30,8 @@ Requires-Dist: pytubefix>=9.1.1
 Requires-Dist: readability-lxml>=0.8.4.1
 Requires-Dist: validators>=0.34.0
 Requires-Dist: youtube-transcript-api>=1.0.3
+Provides-Extra: docling
+Requires-Dist: docling>=2.34.0; extra == 'docling'
 Provides-Extra: mcp
 Requires-Dist: fastmcp>=0.5.0; extra == 'mcp'
 Description-Content-Type: text/markdown
@@ -39,29 +40,70 @@ Description-Content-Type: text/markdown
 
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 
-**Content Core** is a versatile Python library designed to extract and process content from various sources, providing a unified interface for handling text, web pages, and local files.
+**Content Core** is a powerful, AI-powered content extraction and processing platform that transforms any source into clean, structured content. Extract text from websites, transcribe videos, process documents, and generate AI summaries—all through a unified interface with multiple integration options.
 
-## Overview
+## 🚀 What You Can Do
 
-> **Note:** As of v0.8, the default extraction engine is `'auto'`. Content Core will automatically select the best extraction method based on your environment and available API keys, with a smart fallback order for both URLs and files. For files/documents, `'auto'` now tries Docling first, then falls back to simple extraction. You can override the engine if needed, but `'auto'` is recommended for most users.
+**Extract content from anywhere:**
+- 📄 **Documents** - PDF, Word, PowerPoint, Excel, Markdown, HTML, EPUB
+- 🎥 **Media** - Videos (MP4, AVI, MOV) with automatic transcription
+- 🎵 **Audio** - MP3, WAV, M4A with speech-to-text conversion
+- 🌐 **Web** - Any URL with intelligent content extraction
+- 🖼️ **Images** - JPG, PNG, TIFF with OCR text recognition
+- 📦 **Archives** - ZIP, TAR, GZ with content analysis
 
-The primary goal of Content Core is to simplify the process of ingesting content from diverse origins. Whether you have raw text, a URL pointing to an article, or a local file like a video or markdown document, Content Core aims to extract the meaningful content for further use.
+**Process with AI:**
+- ✨ **Clean & format** extracted content automatically
+- 📝 **Generate summaries** with customizable styles (bullet points, executive summary, etc.)
+- 🎯 **Context-aware processing** - explain to a child, technical summary, action items
+- 🔄 **Smart engine selection** - automatically chooses the best extraction method
 
-## Key Features
+## 🛠️ Multiple Ways to Use
 
-* **Multi-Source Extraction:** Handles content from:
-    * Direct text strings.
-    * Web URLs (using robust extraction methods).
-    * Local files (including automatic transcription for video/audio files and parsing for text-based formats).
-* **Intelligent Processing:** Applies appropriate extraction techniques based on the source type. See the [Processors Documentation](./docs/processors.md) for detailed information on how different content types are handled.
-* **Smart Engine Selection:** By default, Content Core uses the `'auto'` engine, which:
-    * For URLs: Uses Firecrawl if `FIRECRAWL_API_KEY` is set, else tries Jina. Jina might fail because of rate limits, which can be fixed by adding `JINA_API_KEY`. If Jina failes, BeautifulSoup is used as a fallback.
-    * For files: Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
-    * You can override this by specifying an engine, but `'auto'` is recommended for most users.
-* **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
-* **MCP Server:** Includes a Model Context Protocol (MCP) server for seamless integration with Claude Desktop and other MCP-compatible applications.
-* **macOS Services:** Right-click context menu integration for Finder (extract and summarize files directly).
-* **Asynchronous:** Built with `asyncio` for efficient I/O operations.
+### 🖥️ Command Line (Zero Install)
+```bash
+# Extract content from any source
+uvx --from "content-core" ccore https://example.com
+uvx --from "content-core" ccore document.pdf
+
+# Generate AI summaries
+uvx --from "content-core" csum video.mp4 --context "bullet points"
+```
+
+### 🤖 Claude Desktop Integration
+One-click setup with Model Context Protocol (MCP) - extract content directly in Claude conversations.
+
+### 🔍 Raycast Extension
+Smart auto-detection commands:
+- **Extract Content** - Full interface with format options
+- **Summarize Content** - 9 summary styles available
+- **Quick Extract** - Instant clipboard extraction
+
+### 🖱️ macOS Right-Click Integration
+Right-click any file in Finder → Services → Extract or Summarize content instantly.
+
+### 🐍 Python Library
+```python
+import content_core as cc
+
+# Extract from any source
+result = await cc.extract("https://example.com/article")
+summary = await cc.summarize_content(result, context="explain to a child")
+```
+
+## ⚡ Key Features
+
+* **🎯 Intelligent Auto-Detection:** Automatically selects the best extraction method based on content type and available services
+* **🔧 Smart Engine Selection:**
+  * **URLs:** Firecrawl → Jina → BeautifulSoup fallback chain
+  * **Documents:** Docling → Enhanced PyMuPDF → Simple extraction fallback
+  * **Media:** OpenAI Whisper transcription
+  * **Images:** OCR with multiple engine support
* **📊 Enhanced PDF Processing:** Advanced PyMuPDF engine with quality flags, table detection, and optional OCR for mathematical formulas
+* **🌍 Multiple Integrations:** CLI, Python library, MCP server, Raycast extension, macOS Services
+* **⚡ Zero-Install Options:** Use `uvx` for instant access without installation
+* **🧠 AI-Powered Processing:** LLM integration for content cleaning and summarization
+* **🔄 Asynchronous:** Built with `asyncio` for efficient processing
 
 ## Getting Started
 
@@ -70,11 +112,17 @@ The primary goal of Content Core is to simplify the process of ingesting content
 Install Content Core using `pip`:
 
 ```bash
-# Install the package
+# Basic installation (PyMuPDF + BeautifulSoup/Jina extraction)
 pip install content-core
 
-# Install with MCP server support
+# With enhanced document processing (adds Docling)
+pip install content-core[docling]
+
+# With MCP server support
 pip install content-core[mcp]
+
+# Full installation
+pip install content-core[docling,mcp]
 ```
 
 Alternatively, if you’re developing locally:
@@ -245,6 +293,49 @@ Add to your `claude_desktop_config.json`:
 
 For detailed setup instructions, configuration options, and usage examples, see our [MCP Documentation](docs/mcp.md).
 
+## Enhanced PDF Processing
+
+Content Core features an optimized PyMuPDF extraction engine with significant improvements for scientific documents and complex PDFs.
+
+### Key Improvements
+
+- **🔬 Mathematical Formula Extraction**: Enhanced quality flags eliminate `<!-- formula-not-decoded -->` placeholders
+- **📊 Automatic Table Detection**: Tables converted to markdown format for LLM consumption
+- **🔧 Quality Text Rendering**: Better ligature, whitespace, and image-text integration
+- **⚡ Optional OCR Enhancement**: Selective OCR for formula-heavy pages (requires Tesseract)
+
+### Configuration for Scientific Documents
+
+For documents with heavy mathematical content, enable OCR enhancement:
+
+```yaml
+# In cc_config.yaml
+extraction:
+  pymupdf:
+    enable_formula_ocr: true    # Enable OCR for formula-heavy pages
+    formula_threshold: 3        # Min formulas per page to trigger OCR
+    ocr_fallback: true          # Graceful fallback if OCR fails
+```
+
+```python
+# Runtime configuration
+from content_core.config import set_pymupdf_ocr_enabled
+set_pymupdf_ocr_enabled(True)
+```
+
+### Requirements for OCR Enhancement
+
+```bash
+# Install Tesseract OCR (optional, for formula enhancement)
+# macOS
+brew install tesseract
+
+# Ubuntu/Debian
+sudo apt-get install tesseract-ocr
+```
+
+**Note**: OCR is optional - you get improved PDF extraction automatically without any additional setup.
+
 ## macOS Services Integration
 
 Content Core provides powerful right-click integration with macOS Finder, allowing you to extract and summarize content from any file without installation. Choose between clipboard or TextEdit output for maximum flexibility.
@@ -288,6 +379,50 @@ Create **4 convenient services** for different workflows:
 
 For complete setup instructions with copy-paste scripts, see [macOS Services Documentation](docs/macos.md).
 
+## Raycast Extension
+
+Content Core provides a powerful Raycast extension with smart auto-detection that handles both URLs and file paths seamlessly. Extract and summarize content directly from your Raycast interface without switching applications.
+
+### Quick Setup
+
+**From Raycast Store** (coming soon):
+1. Open Raycast and search for "Content Core"
+2. Install the extension by `luis_novo`
+3. Configure API keys in preferences
+
+**Manual Installation**:
+1. Download the extension from the repository
+2. Open Raycast → "Import Extension"
+3. Select the `raycast-content-core` folder
+
+### Commands
+
+**🔍 Extract Content** - Smart URL/file detection with full interface
+- Auto-detects URLs vs file paths in real-time
+- Multiple output formats (Text, JSON, XML)
+- Drag & drop support for files
+- Rich results view with metadata
+
+**📝 Summarize Content** - AI-powered summaries with customizable styles
+- 9 different summary styles (bullet points, executive summary, etc.)
+- Auto-detects source type with visual feedback
+- One-click snippet creation and quicklinks
+
+**⚡ Quick Extract** - Instant extraction to clipboard
+- Type → Tab → Paste source → Enter
+- No UI, works directly from command bar
+- Perfect for quick workflows
+
+### Features
+
+- **Smart Auto-Detection**: Instantly recognizes URLs vs file paths
+- **Zero Installation**: Uses `uvx` for Content Core execution
+- **Rich Integration**: Keyboard shortcuts, clipboard actions, Raycast snippets
+- **All File Types**: Documents, videos, audio, images, archives
+- **Visual Feedback**: Real-time type detection with icons
+
+For detailed setup, configuration, and usage examples, see [Raycast Extension Documentation](docs/raycast.md).
+
 ## Using with Langchain
 
 For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
@@ -397,8 +532,21 @@ Example `.env`:
 ```plaintext
 OPENAI_API_KEY=your-key-here
 GOOGLE_API_KEY=your-key-here
+
+# Engine Selection (optional)
+CCORE_DOCUMENT_ENGINE=auto    # auto, simple, docling
+CCORE_URL_ENGINE=auto         # auto, simple, firecrawl, jina
 ```
 
+### Engine Selection via Environment Variables
+
+For deployment scenarios like MCP servers or Raycast extensions, you can override the extraction engines using environment variables:
+
+- **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
+- **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
+
+These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
+
 ### Custom Prompt Templates
 
 Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
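
A hedged sketch of a deployment-style override (for example an MCP server or the Raycast extension), assuming the variables are set before extraction runs; invalid values are logged and ignored in favor of the config default:

```python
import os

os.environ["CCORE_DOCUMENT_ENGINE"] = "simple"  # skip docling entirely
os.environ["CCORE_URL_ENGINE"] = "jina"

from content_core.config import get_document_engine, get_url_engine

assert get_document_engine() == "simple"
assert get_url_engine() == "jina"
```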
content_core-1.2.1.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
 content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
-content_core/cc_config.yaml,sha256=gGSPM-oO6GIHyCfDCH-cN72BgPJiRmZMgwPrrLhUmfU,851
-content_core/config.py,sha256=vyx0fioR6r0mcZfVdwAFDhFrRNoG0ZNG8RNxIDnhNlo,1802
+content_core/cc_config.yaml,sha256=hjTt5z1Z9b5LShVIqNT3OiAnTAdmr0LB5y8RTyH-fNA,1119
+content_core/config.py,sha256=3XAsMF3EhDJ6aCpzk1UZG_m3-SFdYe3cHiDPH7eVGwQ,4312
 content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
 content_core/models.py,sha256=Kt6tWdAX87eQ2tL6eTwcHU7_NIRnN4exP4RzV2WrMig,881
 content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
@@ -15,27 +15,28 @@ content_core/content/__init__.py,sha256=7IxfLTUHKyHjoT4MfWM2PX2J3QBeYcuERzE9vFeF
 content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
 content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
-content_core/content/extraction/graph.py,sha256=Nn2iaQc6YJ4Qt8WKTolwUQUNNqUlwpV8YnijESGvnD0,7605
+content_core/content/extraction/graph.py,sha256=sjk6NpzOMOzMbUOM0bqrDSlB3cLQzboviLDNbj48pjY,8074
 content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjlozg-zGbMq_s9VYdBjzYU,169
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=kEabpETljzUb-yf0NcVWTOuCtayESo74gGBVDX7YTFs,550
 content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
 content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,7021
 content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
+content_core/notebooks/urls.ipynb,sha256=gSmiSzmbol_Li36w8tpUsy5QgRbrnBx94Ry2zHwMvwY,7107
 content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
-content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMguabM9Pc,2151
+content_core/processors/docling.py,sha256=lf_NHh255gn4d2EymJYqyH2QiAgQDiJCY3t6Ne7R9rU,2507
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
-content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
+content_core/processors/pdf.py,sha256=TTDhfV2INtXumFDjLJFNMRfpbJ_tqwIcSBDzuThKxJI,10617
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=6WT8Sw2VHiKyhgWXi_jZjKjwnT_QPSPcH4P99RKbjgU,7521
+content_core/processors/url.py,sha256=YoWw2CjZbqSKBi1CpY0Qowu4hfqGVGJjLZEXUjz7wxs,7536
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
 content_core/processors/youtube.py,sha256=MOeZboVfM9_C87L5mnUVvsbQeKoznwJoYn1wP1_hA_U,7869
 content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-1.1.2.dist-info/METADATA,sha256=_0Rg4yeU-05hDB_91dvcMXYKMaKcMcU5C8SpkYhtiRs,15072
-content_core-1.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-content_core-1.1.2.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
-content_core-1.1.2.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
-content_core-1.1.2.dist-info/RECORD,,
+content_core-1.2.1.dist-info/METADATA,sha256=1LpANnMvECxIekt6kKQr0hnZ1ULGaD2xEmhRh_uzTdk,19676
+content_core-1.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-1.2.1.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
+content_core-1.2.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-1.2.1.dist-info/RECORD,,