content-core 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of content-core might be problematic.

--- content_core/common/state.py
+++ content_core/common/state.py
@@ -2,6 +2,9 @@ from typing import Optional
 
 from pydantic import BaseModel, Field
 
+from content_core.common.types import Engine
+from content_core.common.types import Engine
+
 
 class ProcessSourceState(BaseModel):
     file_path: Optional[str] = ""
@@ -13,8 +16,9 @@ class ProcessSourceState(BaseModel):
     identified_provider: Optional[str] = ""
     metadata: Optional[dict] = Field(default_factory=lambda: {})
     content: Optional[str] = ""
-    engine: Optional[str] = Field(
-        default=None, description="Override extraction engine: 'legacy' or 'docling'"
+    engine: Optional[Engine] = Field(
+        default=None,
+        description="Override extraction engine: 'auto', 'simple', 'legacy', 'firecrawl', 'jina', or 'docling'",
     )
     output_format: Optional[str] = Field(
         default=None,
--- /dev/null
+++ content_core/common/types.py
@@ -0,0 +1,21 @@
+from typing import Literal
+import warnings
+
+Engine = Literal[
+    "auto",
+    "simple",
+    "legacy",
+    "firecrawl",
+    "jina",
+    "docling",
+]
+
+DEPRECATED_ENGINES = {"legacy": "simple"}
+
+def warn_if_deprecated_engine(engine: str):
+    if engine in DEPRECATED_ENGINES:
+        warnings.warn(
+            f"Engine '{engine}' is deprecated and will be removed in a future release. Use '{DEPRECATED_ENGINES[engine]}' instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
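For orientation, here is a minimal sketch (not part of the diff) of how the new deprecation helper behaves; it assumes content-core 0.8+ is installed so that `content_core.common.types` is importable.

```python
import warnings

from content_core.common.types import warn_if_deprecated_engine

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    warn_if_deprecated_engine("legacy")   # deprecated alias -> emits DeprecationWarning
    warn_if_deprecated_engine("docling")  # current engine name -> no warning

print(len(caught))                  # 1
print(caught[0].category.__name__)  # DeprecationWarning
```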
--- content_core/content/extraction/graph.py
+++ content_core/content/extraction/graph.py
@@ -2,6 +2,7 @@ import os
 import tempfile
 from typing import Any, Dict, Optional
 from urllib.parse import urlparse
+from content_core.common.types import warn_if_deprecated_engine
 
 import aiohttp
 import magic
@@ -114,14 +115,28 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
     return {"file_path": tmp, "identified_type": mime}
 
 
+
 async def file_type_router_docling(state: ProcessSourceState) -> str:
     """
-    Route to Docling if enabled and supported; otherwise use legacy file type edge.
+    Route to Docling if enabled and supported; otherwise use simple file type edge.
+    Supports 'auto', 'docling', 'simple', and 'legacy' (deprecated, alias for simple).
+    'auto' tries simple first, then falls back to docling if simple fails.
     """
-    # allow per-execution override of engine via state.engine
-    engine = state.engine or CONFIG.get("extraction", {}).get("engine", "legacy")
+    engine = state.engine or CONFIG.get("extraction", {}).get("engine", "auto")
+    warn_if_deprecated_engine(engine)
+    if engine == "auto":
+        # Try docling first; if it fails or is not supported, fallback to simple
+        if state.identified_type in DOCLING_SUPPORTED:
+            try:
+                return "extract_docling"
+            except Exception as e:
+                logger.warning(f"Docling extraction failed in 'auto' mode, falling back to simple: {e}")
+        # Fallback to simple
+        return await file_type_edge(state)
+
     if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
         return "extract_docling"
+    # For 'simple' and 'legacy', use the default file type edge
     return await file_type_edge(state)
 
 
--- content_core/processors/audio.py
+++ content_core/processors/audio.py
@@ -1,9 +1,10 @@
 import asyncio
+import math
 import os
 import tempfile
-import math
 import traceback
 from functools import partial
+
 from moviepy import AudioFileClip
 
 from content_core.common import ProcessSourceState
@@ -64,7 +65,9 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
     )
 
 
-def extract_audio(input_file: str, output_file: str, start_time: float = None, end_time: float = None) -> None:
+def extract_audio(
+    input_file: str, output_file: str, start_time: float = None, end_time: float = None
+) -> None:
     """
     Extract audio from a video or audio file and save it as an MP3 file.
     If start_time and end_time are provided, only that segment of audio is extracted.
@@ -78,17 +81,17 @@ def extract_audio(input_file: str, output_file: str, start_time: float = None, e
     try:
         # Load the file as an AudioFileClip
        audio_clip = AudioFileClip(input_file)
-
-        # If start_time and end_time are provided, trim the audio
+
+        # If start_time and/or end_time are provided, trim the audio using subclipped
         if start_time is not None and end_time is not None:
-            audio_clip = audio_clip.cutout(0, start_time).cutout(end_time - start_time, audio_clip.duration)
+            audio_clip = audio_clip.subclipped(start_time, end_time)
         elif start_time is not None:
-            audio_clip = audio_clip.cutout(0, start_time)
+            audio_clip = audio_clip.subclipped(start_time)
         elif end_time is not None:
-            audio_clip = audio_clip.cutout(end_time, audio_clip.duration)
+            audio_clip = audio_clip.subclipped(0, end_time)
 
         # Export the audio as MP3
-        audio_clip.write_audiofile(output_file, codec='mp3')
+        audio_clip.write_audiofile(output_file, codec="mp3")
         audio_clip.close()
     except Exception as e:
         logger.error(f"Error extracting audio: {str(e)}")
@@ -117,7 +120,9 @@ async def extract_audio_data(data: ProcessSourceState):
         output_files = []
 
         if duration_s > segment_length_s:
-            logger.info(f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments")
+            logger.info(
+                f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments"
+            )
             for i in range(math.ceil(duration_s / segment_length_s)):
                 start_time = i * segment_length_s
                 end_time = min((i + 1) * segment_length_s, audio.duration)
@@ -134,15 +139,18 @@ async def extract_audio_data(data: ProcessSourceState):
 
         # Transcribe audio files
         from content_core.models import ModelFactory
+
         speech_to_text_model = ModelFactory.get_model("speech_to_text")
         transcriptions = []
         for audio_file in output_files:
-            transcription = await transcribe_audio_segment(audio_file, speech_to_text_model)
+            transcription = await transcribe_audio_segment(
+                audio_file, speech_to_text_model
+            )
             transcriptions.append(transcription)
 
         return {
             "metadata": {"audio_files": output_files},
-            "content": " ".join(transcriptions)
+            "content": " ".join(transcriptions),
         }
     except Exception as e:
         logger.error(f"Error processing audio: {str(e)}")
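As a quick reference, a hedged usage sketch (not part of the diff) of the reworked `extract_audio` helper; the file paths are placeholders, and moviepy plus an ffmpeg binary must be available.

```python
from content_core.processors.audio import extract_audio

# Keep only the 30s-90s window of the source and write it as MP3;
# omit start_time/end_time to convert the whole file.
extract_audio("path/to/talk.mp4", "path/to/talk_excerpt.mp3", start_time=30.0, end_time=90.0)
```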
--- content_core/processors/url.py
+++ content_core/processors/url.py
@@ -1,20 +1,21 @@
-import re
-from urllib.parse import urlparse
+import os
 from io import BytesIO
+from urllib.parse import urlparse
 
 import aiohttp
 import docx
-from bs4 import BeautifulSoup, Comment
+from bs4 import BeautifulSoup
+from readability import Document
 
 from content_core.common import ProcessSourceState
+from content_core.common.types import warn_if_deprecated_engine
 from content_core.logging import logger
 from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
 
-# future: better extraction methods
-# https://github.com/buriy/python-readability
-# also try readability: from readability import Document
+DOCX_MIME_TYPE = (
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+)
 
-DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
 
 async def _extract_docx_content(docx_bytes: bytes, url: str):
     """
@@ -25,21 +26,25 @@ async def _extract_docx_content(docx_bytes: bytes, url: str):
         doc = docx.Document(BytesIO(docx_bytes))
         content_parts = [p.text for p in doc.paragraphs if p.text]
         full_content = "\n\n".join(content_parts)
-
+
         # Try to get a title from document properties or first heading
         title = doc.core_properties.title
         if not title and doc.paragraphs:
             # Look for a potential title in the first few paragraphs (e.g., if styled as heading)
-            for p in doc.paragraphs[:5]: # Check first 5 paragraphs
-                if p.style.name.startswith('Heading'):
+            for p in doc.paragraphs[:5]:  # Check first 5 paragraphs
+                if p.style.name.startswith("Heading"):
                     title = p.text
                     break
-            if not title: # Fallback to first line if no heading found
-                title = doc.paragraphs[0].text.strip() if doc.paragraphs[0].text.strip() else None
+            if not title:  # Fallback to first line if no heading found
+                title = (
+                    doc.paragraphs[0].text.strip()
+                    if doc.paragraphs[0].text.strip()
+                    else None
+                )
 
         # If no title found, use filename from URL
         if not title:
-            title = urlparse(url).path.split('/')[-1]
+            title = urlparse(url).path.split("/")[-1]
 
         logger.info(f"Successfully extracted content from DOCX: {url}, Title: {title}")
         return {
@@ -58,6 +63,7 @@ async def _extract_docx_content(docx_bytes: bytes, url: str):
             "url": url,
         }
 
+
 async def url_provider(state: ProcessSourceState):
     """
     Identify the provider
@@ -71,7 +77,9 @@ async def url_provider(state: ProcessSourceState):
         # remote URL: check content-type to catch PDFs
         try:
             async with aiohttp.ClientSession() as session:
-                async with session.head(url, timeout=10, allow_redirects=True) as resp:
+                async with session.head(
+                    url, timeout=10, allow_redirects=True
+                ) as resp:
                     mime = resp.headers.get("content-type", "").split(";", 1)[0]
         except Exception as e:
             logger.debug(f"HEAD check failed for {url}: {e}")
@@ -83,142 +91,82 @@
     return return_dict
 
 
-async def extract_url_bs4(url: str):
-    """
-    Get the title and content of a URL using bs4
+async def extract_url_bs4(url: str) -> dict:
     """
-    try:
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-        }
+    Get the title and content of a URL using readability with a fallback to BeautifulSoup.
 
-        # If URL is actually HTML content
-        if url.startswith("<!DOCTYPE html>") or url.startswith("<html"):
-            html_content = url
-        else:
-            async with aiohttp.ClientSession() as session:
-                async with session.get(url, headers=headers, timeout=10) as response:
-                    response.raise_for_status()
-                    # Check content type for DOCX
-                    if response.content_type == DOCX_MIME_TYPE:
-                        logger.debug(f"Detected DOCX content type for {url}")
-                        docx_bytes = await response.read()
-                        return await _extract_docx_content(docx_bytes, url)
-
-                    # If not DOCX, proceed as HTML
-                    html_content = await response.text()
-
-        soup = BeautifulSoup(html_content, "html.parser")
-
-        # Remove unwanted elements
-        for element in soup.find_all(
-            ["script", "style", "nav", "footer", "iframe", "noscript", "ad"]
-        ):
-            element.decompose()
-
-        # Remove comments
-        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
-            comment.extract()
-
-        # Get title
-        title = None
-        title_tags = [
-            soup.find("meta", property="og:title"),
-            soup.find("meta", property="twitter:title"),
-            soup.find("title"),
-            soup.find("h1"),
-        ]
-
-        for tag in title_tags:
-            if tag:
-                if tag.string:
-                    title = tag.string
-                elif tag.get("content"):
-                    title = tag.get("content")
-                break
-
-        # Clean up title
-        if title:
-            title = " ".join(title.split())
-            title = re.sub(r"\s*\|.*$", "", title)
-            title = re.sub(r"\s*-.*$", "", title)
-
-        # Get content
-        content = []
-
-        # Look for main article content
-        main_content = None
-        content_tags = [
-            soup.find("article"),
-            soup.find("main"),
-            soup.find(class_=re.compile(r"article|post|content|entry|document")),
-            soup.find(id=re.compile(r"article|post|content|entry|main")),
-        ]
-
-        for tag in content_tags:
-            if tag:
-                main_content = tag
-                break
-
-        if not main_content:
-            main_content = soup
-
-        # Process content
-        for element in main_content.find_all(
-            ["p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "div"]
-        ):
-            # Handle code blocks
-            if element.name == "pre" or "highlight" in element.get("class", []):
-                code_text = element.get_text().strip()
-                if code_text:
-                    content.append("\n```\n" + code_text + "\n```\n")
-                continue
-
-            # Handle regular text
-            text = element.get_text().strip()
-            if text:
-                # Skip if text matches common patterns for navigation/footer
-                if re.search(
-                    r"copyright|all rights reserved|privacy policy|terms of use",
-                    text.lower(),
-                ):
-                    continue
-
-                content.append(text)
-
-        # Join content with proper spacing
-        final_content = "\n\n".join(content)
-
-        # Clean up content
-        final_content = re.sub(
-            r"\n\s*\n\s*\n", "\n\n", final_content
-        )  # Remove extra newlines
-        final_content = re.sub(r" +", " ", final_content)  # Normalize whitespace
-        final_content = final_content.strip()
+    Args:
+        url (str): The URL of the webpage to extract content from.
 
-        return {
-            "title": title,
-            "content": final_content,
-            "domain": urlparse(url).netloc
-            if not url.startswith("<!DOCTYPE html>")
-            else None,
-            "url": url if not url.startswith("<!DOCTYPE html>") else None,
-        }
+    Returns:
+        dict: A dictionary containing the 'title' and 'content' of the webpage.
+    """
+    async with aiohttp.ClientSession() as session:
+        try:
+            # Fetch the webpage content
+            async with session.get(url, timeout=10) as response:
+                if response.status != 200:
+                    raise Exception(f"HTTP error: {response.status}")
+                html = await response.text()
+
+            # Try extracting with readability
+            try:
+                doc = Document(html)
+                title = doc.title() or "No title found"
+                # Extract content as plain text by parsing the cleaned HTML
+                soup = BeautifulSoup(doc.summary(), "lxml")
+                content = soup.get_text(separator=" ", strip=True)
+                if not content.strip():
+                    raise ValueError("No content extracted by readability")
+            except Exception as e:
+                print(f"Readability failed: {e}")
+                # Fallback to BeautifulSoup
+                soup = BeautifulSoup(html, "lxml")
+                # Extract title
+                title_tag = (
+                    soup.find("title")
+                    or soup.find("h1")
+                    or soup.find("meta", property="og:title")
+                )
+                title = (
+                    title_tag.get_text(strip=True) if title_tag else "No title found"
+                )
+                # Extract content from common content tags
+                content_tags = soup.select(
+                    'article, .content, .post, main, [role="main"], div[class*="content"], div[class*="article"]'
+                )
+                content = (
+                    " ".join(
+                        tag.get_text(separator=" ", strip=True) for tag in content_tags
+                    )
+                    if content_tags
+                    else soup.get_text(separator=" ", strip=True)
+                )
+                content = content.strip() or "No content found"
 
-    except aiohttp.ClientError as e:
-        logger.error(f"Failed to fetch URL {url}: {e}")
-        return None
-    except Exception as e:
-        logger.error(f"Failed to process content: {e}")
-        return None
+            return {
+                "title": title,
+                "content": content,
+            }
+
+        except Exception as e:
+            print(f"Error processing URL {url}: {e}")
+            return {
+                "title": "Error",
+                "content": f"Failed to extract content: {str(e)}",
+            }
 
 
 async def extract_url_jina(url: str):
     """
-    Get the content of a URL using Jina
+    Get the content of a URL using Jina. Uses Bearer token if JINA_API_KEY is set.
     """
+    headers = {}
+    api_key = os.environ.get("JINA_API_KEY")
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
     async with aiohttp.ClientSession() as session:
-        async with session.get(f"https://r.jina.ai/{url}") as response:
+        async with session.get(f"https://r.jina.ai/{url}", headers=headers) as response:
             text = await response.text()
             if text.startswith("Title:") and "\n" in text:
                 title_end = text.index("\n")
@@ -235,17 +183,65 @@
             return {"content": text}
 
 
+async def extract_url_firecrawl(url: str):
+    """
+    Get the content of a URL using Firecrawl.
+    Returns {"title": ..., "content": ...} or None on failure.
+    """
+    try:
+        from firecrawl import AsyncFirecrawlApp
+
+        app = AsyncFirecrawlApp(api_key=os.environ.get("FIRECRAWL_API_KEY"))
+        scrape_result = await app.scrape_url(url, formats=["markdown", "html"])
+        return {
+            "title": scrape_result.metadata["title"] or scrape_result.title,
+            "content": scrape_result.markdown,
+        }
+
+    except Exception as e:
+        logger.error(f"Firecrawl extraction error for URL: {url}: {e}")
+        return None
+
+
 async def extract_url(state: ProcessSourceState):
+    """
+    Extract content from a URL using the engine specified in the state.
+    Supported engines: 'auto', 'simple', 'legacy' (deprecated), 'firecrawl', 'jina'.
+    """
     assert state.url, "No URL provided"
     url = state.url
+    engine = state.engine or "auto"
+    warn_if_deprecated_engine(engine)
     try:
-        result = await extract_url_bs4(url)
-        if not result or not result.get("content"):
-            logger.debug(
-                f"BS4 extraction failed for url {url}, falling back to Jina extractor"
-            )
-            result = await extract_url_jina(url)
-        return result
+        if engine == "auto":
+            if os.environ.get("FIRECRAWL_API_KEY"):
+                logger.debug(
+                    "Engine 'auto' selected: using Firecrawl (FIRECRAWL_API_KEY detected)"
+                )
+                return await extract_url_firecrawl(url)
+            else:
+                try:
+                    logger.debug("Trying to use Jina to extract URL")
+                    return await extract_url_jina(url)
+                except Exception as e:
+                    logger.error(f"Jina extraction error for URL: {url}: {e}")
+                    logger.debug("Falling back to BeautifulSoup")
+                    return await extract_url_bs4(url)
+        elif engine == "simple" or engine == "legacy":
+            # 'legacy' is deprecated alias for 'simple'
+            return await extract_url_bs4(url)
+        elif engine == "firecrawl":
+            return await extract_url_firecrawl(url)
+        elif engine == "jina":
+            return await extract_url_jina(url)
+        elif engine == "docling":
+            from content_core.processors.docling import extract_with_docling
+
+            state.url = url
+            result_state = await extract_with_docling(state)
+            return {"title": None, "content": result_state.content}
+        else:
+            raise ValueError(f"Unknown engine: {engine}")
     except Exception as e:
         logger.error(f"URL extraction failed for URL: {url}")
         logger.exception(e)
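To make the new dispatch concrete, a hedged sketch (not part of the diff) that forces the readability/BeautifulSoup path; it assumes content-core 0.8+ is installed, and the URL is a placeholder.

```python
import asyncio

from content_core.common import ProcessSourceState
from content_core.processors.url import extract_url


async def main():
    # engine="simple" bypasses Firecrawl/Jina and uses the readability/BeautifulSoup extractor
    state = ProcessSourceState(url="https://www.example.com", engine="simple")
    result = await extract_url(state)
    print(result.get("title"), len(result.get("content", "")))


asyncio.run(main())
```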
--- content_core-0.7.2.dist-info/METADATA
+++ content_core-0.8.1.dist-info/METADATA
@@ -1,15 +1,18 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 0.7.2
+Version: 0.8.1
 Summary: Extract what matters from any media source
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
 Requires-Python: >=3.10
 Requires-Dist: ai-prompter>=0.2.3
 Requires-Dist: aiohttp>=3.11
+Requires-Dist: asciidoc>=10.2.1
 Requires-Dist: bs4>=0.0.2
 Requires-Dist: dicttoxml>=1.7.16
+Requires-Dist: docling>=2.34.0
 Requires-Dist: esperanto[openai]>=1.2.0
+Requires-Dist: firecrawl-py>=2.7.0
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: langgraph>=0.3.29
@@ -17,18 +20,15 @@ Requires-Dist: loguru>=0.7.3
 Requires-Dist: moviepy>=2.1.2
 Requires-Dist: openpyxl>=3.1.5
 Requires-Dist: pandas>=2.2.3
+Requires-Dist: pillow>=10.4.0
 Requires-Dist: pymupdf>=1.25.5
 Requires-Dist: python-docx>=1.1.2
 Requires-Dist: python-dotenv>=1.1.0
 Requires-Dist: python-magic>=0.4.27
 Requires-Dist: python-pptx>=1.0.2
+Requires-Dist: readability-lxml>=0.8.4.1
 Requires-Dist: validators>=0.34.0
 Requires-Dist: youtube-transcript-api>=1.0.3
-Provides-Extra: docling
-Requires-Dist: asciidoc; extra == 'docling'
-Requires-Dist: docling; extra == 'docling'
-Requires-Dist: pandas; extra == 'docling'
-Requires-Dist: pillow; extra == 'docling'
 Description-Content-Type: text/markdown
 
 # Content Core
@@ -39,6 +39,8 @@ Description-Content-Type: text/markdown
 
 ## Overview
 
+> **Note:** As of v0.8, the default extraction engine is `'auto'`. Content Core will automatically select the best extraction method based on your environment and available API keys, with a smart fallback order for both URLs and files. For files/documents, `'auto'` now tries Docling first, then falls back to simple extraction. You can override the engine if needed, but `'auto'` is recommended for most users.
+
 The primary goal of Content Core is to simplify the process of ingesting content from diverse origins. Whether you have raw text, a URL pointing to an article, or a local file like a video or markdown document, Content Core aims to extract the meaningful content for further use.
 
 ## Key Features
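To complement the note above, a hedged example (not part of the diff) of overriding the default `'auto'` engine per call; it assumes `extract_content` is importable from the top-level package as in the README's usage section, and that it forwards an `engine` key to the extraction state (mirroring the `engine` field added to `ProcessSourceState` in this release).

```python
import asyncio

from content_core import extract_content  # import path assumed from the README


async def main():
    # Default: 'auto' picks Firecrawl/Jina/BeautifulSoup based on available API keys
    auto_data = await extract_content({"url": "https://www.example.com"})
    # Assumed override: pin the simple (readability/BeautifulSoup) engine instead
    simple_data = await extract_content({"url": "https://www.example.com", "engine": "simple"})
    print(auto_data, simple_data)


asyncio.run(main())
```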
@@ -48,6 +50,10 @@ The primary goal of Content Core is to simplify the process of ingesting content
   * Web URLs (using robust extraction methods).
   * Local files (including automatic transcription for video/audio files and parsing for text-based formats).
 * **Intelligent Processing:** Applies appropriate extraction techniques based on the source type. See the [Processors Documentation](./docs/processors.md) for detailed information on how different content types are handled.
+* **Smart Engine Selection:** By default, Content Core uses the `'auto'` engine, which:
+  * For URLs: Uses Firecrawl if `FIRECRAWL_API_KEY` is set, else tries Jina. Jina might fail because of rate limits, which can be fixed by adding `JINA_API_KEY`. If Jina fails, BeautifulSoup is used as a fallback.
+  * For files: Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
+  * You can override this by specifying an engine, but `'auto'` is recommended for most users.
 * **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
 * **Asynchronous:** Built with `asyncio` for efficient I/O operations.
 
@@ -60,8 +66,6 @@ Install Content Core using `pip`:
 ```bash
 # Install the package (without Docling)
 pip install content-core
-# Install with Docling support
-pip install content-core[docling]
 ```
 
 Alternatively, if you're developing locally:
@@ -218,15 +222,15 @@
     text_data = await extract_content({"content": "This is my sample text content."})
     print(text_data)
 
-    # Extract from a URL
+    # Extract from a URL (uses 'auto' engine by default)
     url_data = await extract_content({"url": "https://www.example.com"})
     print(url_data)
 
-    # Extract from a local video file (gets transcript)
+    # Extract from a local video file (gets transcript, engine='auto' by default)
     video_data = await extract_content({"file_path": "path/to/your/video.mp4"})
     print(video_data)
 
-    # Extract from a local markdown file
+    # Extract from a local markdown file (engine='auto' by default)
     md_data = await extract_content({"file_path": "path/to/your/document.md"})
     print(md_data)
 
@@ -248,15 +252,11 @@
 
 Content Core supports an optional Docling-based extraction engine for rich document formats (PDF, DOCX, PPTX, XLSX, Markdown, AsciiDoc, HTML, CSV, Images).
 
-### Installation
-
-```bash
-# Install with Docling support
-pip install content-core[docling]
-```
 
 ### Enabling Docling
 
+Docling is now the default engine when parsing documents. If you don't want to use it, you need to set engine to "simple".
+
 #### Via configuration file
 
 In your `cc_config.yaml` or custom config, set:
--- content_core-0.7.2.dist-info/RECORD
+++ content_core-0.8.1.dist-info/RECORD
@@ -8,30 +8,31 @@ content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
 content_core/templated_message.py,sha256=KbI2rcvgGM5oRIcsG68zAZfgNsC97fR16D61683ZSnY,1617
 content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
 content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
-content_core/common/state.py,sha256=2yhb87ZXZIgxAzVH6KKz9_glUzI8rSJ4Iq_0s4sKb7U,1284
+content_core/common/state.py,sha256=pO8Oq71KxznlZ4K5qUVfyLrNsZWd2yMO9bXKmrTIXQo,1427
+content_core/common/types.py,sha256=FpIzYadBvafGI4e1EuwGjjiPuawL1HitxsQOciNjTZo,497
 content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
 content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCrefU,171
 content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
 content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
-content_core/content/extraction/graph.py,sha256=IKu-bV3YG2MigHnYixYYhtrQ-4qgGpETerXBEFn73zU,6304
+content_core/content/extraction/graph.py,sha256=51B_j_hi7SsKh7kKNLFsMmxyR2HVS-mOYfKvDFyuYfw,7001
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
 content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
-content_core/processors/audio.py,sha256=KnwxK85X9qRyVziMhFd103kfHkE8qGB1D4yW5lYO90E,5701
+content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
 content_core/processors/docling.py,sha256=wQ8ThAcyrCy-c95QtgplQ9UZtjCZTddLD9y1_CrRtSQ,2111
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=vmkBVfJ1xpZQzlhRdkO64V1J9xdTBr6nrXY4M74QzEo,9094
+content_core/processors/url.py,sha256=yt-uuzS4N-RAOJ8vo5x-b4bgnrFeTV-3SDIatRTRI3g,9462
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
 content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
 content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-0.7.2.dist-info/METADATA,sha256=oWJhvImNzPAcGuAnkEKIqWozWW383pszrszyYaSLB-s,10471
-content_core-0.7.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-content_core-0.7.2.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
-content_core-0.7.2.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
-content_core-0.7.2.dist-info/RECORD,,
+content_core-0.8.1.dist-info/METADATA,sha256=ZIW6gtawFeFo2uQqWkFH2ctSYIUq5PBrke4gyHQQAWU,11439
+content_core-0.8.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-0.8.1.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
+content_core-0.8.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-0.8.1.dist-info/RECORD,,