content-core 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of content-core might be problematic.
- content_core/common/state.py +6 -2
- content_core/common/types.py +21 -0
- content_core/content/extraction/graph.py +18 -3
- content_core/processors/audio.py +19 -11
- content_core/processors/url.py +143 -147
- {content_core-0.7.2.dist-info → content_core-0.8.1.dist-info}/METADATA +17 -17
- {content_core-0.7.2.dist-info → content_core-0.8.1.dist-info}/RECORD +10 -9
- {content_core-0.7.2.dist-info → content_core-0.8.1.dist-info}/WHEEL +0 -0
- {content_core-0.7.2.dist-info → content_core-0.8.1.dist-info}/entry_points.txt +0 -0
- {content_core-0.7.2.dist-info → content_core-0.8.1.dist-info}/licenses/LICENSE +0 -0
content_core/common/state.py
CHANGED

@@ -2,6 +2,9 @@ from typing import Optional
 
 from pydantic import BaseModel, Field
 
+from content_core.common.types import Engine
+from content_core.common.types import Engine
+
 
 class ProcessSourceState(BaseModel):
     file_path: Optional[str] = ""
@@ -13,8 +16,9 @@ class ProcessSourceState(BaseModel):
     identified_provider: Optional[str] = ""
     metadata: Optional[dict] = Field(default_factory=lambda: {})
     content: Optional[str] = ""
-    engine: Optional[
-        default=None,
+    engine: Optional[Engine] = Field(
+        default=None,
+        description="Override extraction engine: 'auto', 'simple', 'legacy', 'firecrawl', 'jina', or 'docling'",
     )
     output_format: Optional[str] = Field(
         default=None,
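With `engine` now typed as the `Engine` literal, callers can pin an extractor per request and pydantic rejects anything outside the allowed set. A minimal sketch of that behaviour (the URL value and the choice of `'jina'` are illustrative):

```python
from pydantic import ValidationError

from content_core.common import ProcessSourceState

# Pin a specific extraction engine for this request.
state = ProcessSourceState(url="https://example.com/article", engine="jina")
print(state.engine)  # "jina"

# Values outside the Engine literal are rejected at construction time.
try:
    ProcessSourceState(url="https://example.com/article", engine="scrapy")
except ValidationError as exc:
    print(f"rejected: {exc.errors()[0]['msg']}")
```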
content_core/common/types.py
ADDED

@@ -0,0 +1,21 @@
+from typing import Literal
+import warnings
+
+Engine = Literal[
+    "auto",
+    "simple",
+    "legacy",
+    "firecrawl",
+    "jina",
+    "docling",
+]
+
+DEPRECATED_ENGINES = {"legacy": "simple"}
+
+def warn_if_deprecated_engine(engine: str):
+    if engine in DEPRECATED_ENGINES:
+        warnings.warn(
+            f"Engine '{engine}' is deprecated and will be removed in a future release. Use '{DEPRECATED_ENGINES[engine]}' instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
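The new `warn_if_deprecated_engine` helper only warns for engines listed in `DEPRECATED_ENGINES`; everything else passes silently. A small sketch of that behaviour, using only the module added above:

```python
import warnings

from content_core.common.types import warn_if_deprecated_engine

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    warn_if_deprecated_engine("legacy")  # deprecated alias, triggers DeprecationWarning
    warn_if_deprecated_engine("simple")  # current name, stays silent

assert len(caught) == 1
assert issubclass(caught[0].category, DeprecationWarning)
print(caught[0].message)  # suggests using 'simple' instead
```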
content_core/content/extraction/graph.py
CHANGED

@@ -2,6 +2,7 @@ import os
 import tempfile
 from typing import Any, Dict, Optional
 from urllib.parse import urlparse
+from content_core.common.types import warn_if_deprecated_engine
 
 import aiohttp
 import magic
@@ -114,14 +115,28 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
     return {"file_path": tmp, "identified_type": mime}
 
 
+
 async def file_type_router_docling(state: ProcessSourceState) -> str:
     """
-    Route to Docling if enabled and supported; otherwise use
+    Route to Docling if enabled and supported; otherwise use simple file type edge.
+    Supports 'auto', 'docling', 'simple', and 'legacy' (deprecated, alias for simple).
+    'auto' tries simple first, then falls back to docling if simple fails.
     """
-
-
+    engine = state.engine or CONFIG.get("extraction", {}).get("engine", "auto")
+    warn_if_deprecated_engine(engine)
+    if engine == "auto":
+        # Try docling first; if it fails or is not supported, fallback to simple
+        if state.identified_type in DOCLING_SUPPORTED:
+            try:
+                return "extract_docling"
+            except Exception as e:
+                logger.warning(f"Docling extraction failed in 'auto' mode, falling back to simple: {e}")
+        # Fallback to simple
+        return await file_type_edge(state)
+
     if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
         return "extract_docling"
+    # For 'simple' and 'legacy', use the default file type edge
     return await file_type_edge(state)
 
 
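The first added line in `file_type_router_docling` sets the precedence: an explicit `state.engine` wins, then the `extraction.engine` config key, then the `'auto'` default. A standalone sketch of just that resolution order (here `config` stands in for the package's `CONFIG` object):

```python
def resolve_engine(state_engine: str | None, config: dict) -> str:
    # Mirrors: state.engine or CONFIG.get("extraction", {}).get("engine", "auto")
    return state_engine or config.get("extraction", {}).get("engine", "auto")

assert resolve_engine(None, {}) == "auto"
assert resolve_engine(None, {"extraction": {"engine": "docling"}}) == "docling"
assert resolve_engine("simple", {"extraction": {"engine": "docling"}}) == "simple"
```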
content_core/processors/audio.py
CHANGED

@@ -1,9 +1,10 @@
 import asyncio
+import math
 import os
 import tempfile
-import math
 import traceback
 from functools import partial
+
 from moviepy import AudioFileClip
 
 from content_core.common import ProcessSourceState
@@ -64,7 +65,9 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
     )
 
 
-def extract_audio(
+def extract_audio(
+    input_file: str, output_file: str, start_time: float = None, end_time: float = None
+) -> None:
     """
     Extract audio from a video or audio file and save it as an MP3 file.
     If start_time and end_time are provided, only that segment of audio is extracted.
@@ -78,17 +81,17 @@ def extract_audio(input_file: str, output_file: str, start_time: float = None, e
     try:
         # Load the file as an AudioFileClip
         audio_clip = AudioFileClip(input_file)
-
-        # If start_time and end_time are provided, trim the audio
+
+        # If start_time and/or end_time are provided, trim the audio using subclipped
         if start_time is not None and end_time is not None:
-            audio_clip = audio_clip.
+            audio_clip = audio_clip.subclipped(start_time, end_time)
         elif start_time is not None:
-            audio_clip = audio_clip.
+            audio_clip = audio_clip.subclipped(start_time)
         elif end_time is not None:
-            audio_clip = audio_clip.
+            audio_clip = audio_clip.subclipped(0, end_time)
 
         # Export the audio as MP3
-        audio_clip.write_audiofile(output_file, codec=
+        audio_clip.write_audiofile(output_file, codec="mp3")
         audio_clip.close()
     except Exception as e:
         logger.error(f"Error extracting audio: {str(e)}")
@@ -117,7 +120,9 @@ async def extract_audio_data(data: ProcessSourceState):
         output_files = []
 
         if duration_s > segment_length_s:
-            logger.info(
+            logger.info(
+                f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments"
+            )
             for i in range(math.ceil(duration_s / segment_length_s)):
                 start_time = i * segment_length_s
                 end_time = min((i + 1) * segment_length_s, audio.duration)
@@ -134,15 +139,18 @@ async def extract_audio_data(data: ProcessSourceState):
 
         # Transcribe audio files
        from content_core.models import ModelFactory
+
         speech_to_text_model = ModelFactory.get_model("speech_to_text")
         transcriptions = []
         for audio_file in output_files:
-            transcription = await transcribe_audio_segment(
+            transcription = await transcribe_audio_segment(
+                audio_file, speech_to_text_model
+            )
             transcriptions.append(transcription)
 
         return {
             "metadata": {"audio_files": output_files},
-            "content": " ".join(transcriptions)
+            "content": " ".join(transcriptions),
         }
     except Exception as e:
         logger.error(f"Error processing audio: {str(e)}")
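The audio hunks above move to the MoviePy 2.x API, where trimming is done with `subclipped()` and the codec is passed explicitly to `write_audiofile()`. A minimal sketch of the same calls outside the package (file names are placeholders):

```python
from moviepy import AudioFileClip

clip = AudioFileClip("input.mp4")     # reads the audio track of an audio or video file
segment = clip.subclipped(0, 30)      # keep only the first 30 seconds
segment.write_audiofile("segment.mp3", codec="mp3")
clip.close()
```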
content_core/processors/url.py
CHANGED

@@ -1,20 +1,21 @@
-import
-from urllib.parse import urlparse
+import os
 from io import BytesIO
+from urllib.parse import urlparse
 
 import aiohttp
 import docx
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup
+from readability import Document
 
 from content_core.common import ProcessSourceState
+from content_core.common.types import warn_if_deprecated_engine
 from content_core.logging import logger
 from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
 
-
-
-
+DOCX_MIME_TYPE = (
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+)
 
-DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
 
 async def _extract_docx_content(docx_bytes: bytes, url: str):
     """
@@ -25,21 +26,25 @@ async def _extract_docx_content(docx_bytes: bytes, url: str):
     doc = docx.Document(BytesIO(docx_bytes))
     content_parts = [p.text for p in doc.paragraphs if p.text]
     full_content = "\n\n".join(content_parts)
-
+
     # Try to get a title from document properties or first heading
     title = doc.core_properties.title
     if not title and doc.paragraphs:
         # Look for a potential title in the first few paragraphs (e.g., if styled as heading)
-        for p in doc.paragraphs[:5]:
-            if p.style.name.startswith(
+        for p in doc.paragraphs[:5]:  # Check first 5 paragraphs
+            if p.style.name.startswith("Heading"):
                 title = p.text
                 break
-        if not title:
-
+        if not title:  # Fallback to first line if no heading found
+            title = (
+                doc.paragraphs[0].text.strip()
+                if doc.paragraphs[0].text.strip()
+                else None
+            )
 
     # If no title found, use filename from URL
     if not title:
-        title = urlparse(url).path.split(
+        title = urlparse(url).path.split("/")[-1]
 
     logger.info(f"Successfully extracted content from DOCX: {url}, Title: {title}")
     return {
@@ -58,6 +63,7 @@ async def _extract_docx_content(docx_bytes: bytes, url: str):
         "url": url,
     }
 
+
 async def url_provider(state: ProcessSourceState):
     """
     Identify the provider
@@ -71,7 +77,9 @@ async def url_provider(state: ProcessSourceState):
     # remote URL: check content-type to catch PDFs
     try:
         async with aiohttp.ClientSession() as session:
-            async with session.head(
+            async with session.head(
+                url, timeout=10, allow_redirects=True
+            ) as resp:
                 mime = resp.headers.get("content-type", "").split(";", 1)[0]
     except Exception as e:
         logger.debug(f"HEAD check failed for {url}: {e}")
@@ -83,142 +91,82 @@ async def url_provider(state: ProcessSourceState):
     return return_dict
 
 
-async def extract_url_bs4(url: str):
-    """
-    Get the title and content of a URL using bs4
+async def extract_url_bs4(url: str) -> dict:
     """
-
-    headers = {
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-    }
+    Get the title and content of a URL using readability with a fallback to BeautifulSoup.
 
-
-
-        html_content = url
-    else:
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url, headers=headers, timeout=10) as response:
-                response.raise_for_status()
-                # Check content type for DOCX
-                if response.content_type == DOCX_MIME_TYPE:
-                    logger.debug(f"Detected DOCX content type for {url}")
-                    docx_bytes = await response.read()
-                    return await _extract_docx_content(docx_bytes, url)
-
-                # If not DOCX, proceed as HTML
-                html_content = await response.text()
-
-    soup = BeautifulSoup(html_content, "html.parser")
-
-    # Remove unwanted elements
-    for element in soup.find_all(
-        ["script", "style", "nav", "footer", "iframe", "noscript", "ad"]
-    ):
-        element.decompose()
-
-    # Remove comments
-    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
-        comment.extract()
-
-    # Get title
-    title = None
-    title_tags = [
-        soup.find("meta", property="og:title"),
-        soup.find("meta", property="twitter:title"),
-        soup.find("title"),
-        soup.find("h1"),
-    ]
-
-    for tag in title_tags:
-        if tag:
-            if tag.string:
-                title = tag.string
-            elif tag.get("content"):
-                title = tag.get("content")
-            break
-
-    # Clean up title
-    if title:
-        title = " ".join(title.split())
-        title = re.sub(r"\s*\|.*$", "", title)
-        title = re.sub(r"\s*-.*$", "", title)
-
-    # Get content
-    content = []
-
-    # Look for main article content
-    main_content = None
-    content_tags = [
-        soup.find("article"),
-        soup.find("main"),
-        soup.find(class_=re.compile(r"article|post|content|entry|document")),
-        soup.find(id=re.compile(r"article|post|content|entry|main")),
-    ]
-
-    for tag in content_tags:
-        if tag:
-            main_content = tag
-            break
-
-    if not main_content:
-        main_content = soup
-
-    # Process content
-    for element in main_content.find_all(
-        ["p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "div"]
-    ):
-        # Handle code blocks
-        if element.name == "pre" or "highlight" in element.get("class", []):
-            code_text = element.get_text().strip()
-            if code_text:
-                content.append("\n```\n" + code_text + "\n```\n")
-            continue
-
-        # Handle regular text
-        text = element.get_text().strip()
-        if text:
-            # Skip if text matches common patterns for navigation/footer
-            if re.search(
-                r"copyright|all rights reserved|privacy policy|terms of use",
-                text.lower(),
-            ):
-                continue
-
-            content.append(text)
-
-    # Join content with proper spacing
-    final_content = "\n\n".join(content)
-
-    # Clean up content
-    final_content = re.sub(
-        r"\n\s*\n\s*\n", "\n\n", final_content
-    )  # Remove extra newlines
-    final_content = re.sub(r" +", " ", final_content)  # Normalize whitespace
-    final_content = final_content.strip()
+    Args:
+        url (str): The URL of the webpage to extract content from.
 
-
-
-
-
-
-
-
-
+    Returns:
+        dict: A dictionary containing the 'title' and 'content' of the webpage.
+    """
+    async with aiohttp.ClientSession() as session:
+        try:
+            # Fetch the webpage content
+            async with session.get(url, timeout=10) as response:
+                if response.status != 200:
+                    raise Exception(f"HTTP error: {response.status}")
+                html = await response.text()
+
+            # Try extracting with readability
+            try:
+                doc = Document(html)
+                title = doc.title() or "No title found"
+                # Extract content as plain text by parsing the cleaned HTML
+                soup = BeautifulSoup(doc.summary(), "lxml")
+                content = soup.get_text(separator=" ", strip=True)
+                if not content.strip():
+                    raise ValueError("No content extracted by readability")
+            except Exception as e:
+                print(f"Readability failed: {e}")
+                # Fallback to BeautifulSoup
+                soup = BeautifulSoup(html, "lxml")
+                # Extract title
+                title_tag = (
+                    soup.find("title")
+                    or soup.find("h1")
+                    or soup.find("meta", property="og:title")
+                )
+                title = (
+                    title_tag.get_text(strip=True) if title_tag else "No title found"
+                )
+                # Extract content from common content tags
+                content_tags = soup.select(
+                    'article, .content, .post, main, [role="main"], div[class*="content"], div[class*="article"]'
+                )
+                content = (
+                    " ".join(
+                        tag.get_text(separator=" ", strip=True) for tag in content_tags
+                    )
+                    if content_tags
+                    else soup.get_text(separator=" ", strip=True)
+                )
+                content = content.strip() or "No content found"
 
-
-
-
-
-
-
+            return {
+                "title": title,
+                "content": content,
+            }
+
+        except Exception as e:
+            print(f"Error processing URL {url}: {e}")
+            return {
+                "title": "Error",
+                "content": f"Failed to extract content: {str(e)}",
+            }
 
 
 async def extract_url_jina(url: str):
     """
-    Get the content of a URL using Jina
+    Get the content of a URL using Jina. Uses Bearer token if JINA_API_KEY is set.
     """
+    headers = {}
+    api_key = os.environ.get("JINA_API_KEY")
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
     async with aiohttp.ClientSession() as session:
-        async with session.get(f"https://r.jina.ai/{url}") as response:
+        async with session.get(f"https://r.jina.ai/{url}", headers=headers) as response:
             text = await response.text()
             if text.startswith("Title:") and "\n" in text:
                 title_end = text.index("\n")
@@ -235,17 +183,65 @@ async def extract_url_jina(url: str):
             return {"content": text}
 
 
+async def extract_url_firecrawl(url: str):
+    """
+    Get the content of a URL using Firecrawl.
+    Returns {"title": ..., "content": ...} or None on failure.
+    """
+    try:
+        from firecrawl import AsyncFirecrawlApp
+
+        app = AsyncFirecrawlApp(api_key=os.environ.get("FIRECRAWL_API_KEY"))
+        scrape_result = await app.scrape_url(url, formats=["markdown", "html"])
+        return {
+            "title": scrape_result.metadata["title"] or scrape_result.title,
+            "content": scrape_result.markdown,
+        }
+
+    except Exception as e:
+        logger.error(f"Firecrawl extraction error for URL: {url}: {e}")
+        return None
+
+
 async def extract_url(state: ProcessSourceState):
+    """
+    Extract content from a URL using the engine specified in the state.
+    Supported engines: 'auto', 'simple', 'legacy' (deprecated), 'firecrawl', 'jina'.
+    """
     assert state.url, "No URL provided"
     url = state.url
+    engine = state.engine or "auto"
+    warn_if_deprecated_engine(engine)
     try:
-
-
-
-
-
-
-
+        if engine == "auto":
+            if os.environ.get("FIRECRAWL_API_KEY"):
+                logger.debug(
+                    "Engine 'auto' selected: using Firecrawl (FIRECRAWL_API_KEY detected)"
+                )
+                return await extract_url_firecrawl(url)
+            else:
+                try:
+                    logger.debug("Trying to use Jina to extract URL")
+                    return await extract_url_jina(url)
+                except Exception as e:
+                    logger.error(f"Jina extraction error for URL: {url}: {e}")
+                    logger.debug("Falling back to BeautifulSoup")
+                    return await extract_url_bs4(url)
+        elif engine == "simple" or engine == "legacy":
+            # 'legacy' is deprecated alias for 'simple'
+            return await extract_url_bs4(url)
+        elif engine == "firecrawl":
+            return await extract_url_firecrawl(url)
+        elif engine == "jina":
+            return await extract_url_jina(url)
+        elif engine == "docling":
+            from content_core.processors.docling import extract_with_docling
+
+            state.url = url
+            result_state = await extract_with_docling(state)
+            return {"title": None, "content": result_state.content}
+        else:
+            raise ValueError(f"Unknown engine: {engine}")
     except Exception as e:
         logger.error(f"URL extraction failed for URL: {url}")
         logger.exception(e)
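In the package these coroutines run as extraction-graph nodes, but the new dispatch in `extract_url` can be exercised directly; a sketch (the URL is a placeholder, and calling the processor outside the graph is purely illustrative):

```python
import asyncio

from content_core.common import ProcessSourceState
from content_core.processors.url import extract_url

async def main():
    # 'auto': Firecrawl if FIRECRAWL_API_KEY is set, otherwise Jina, otherwise BeautifulSoup.
    auto = await extract_url(ProcessSourceState(url="https://example.com"))

    # Pin the readability/BeautifulSoup path regardless of which API keys are present.
    simple = await extract_url(ProcessSourceState(url="https://example.com", engine="simple"))

    print(auto, simple, sep="\n")

asyncio.run(main())
```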
{content_core-0.7.2.dist-info → content_core-0.8.1.dist-info}/METADATA
CHANGED

@@ -1,15 +1,18 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 0.7.2
+Version: 0.8.1
 Summary: Extract what matters from any media source
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
 Requires-Python: >=3.10
 Requires-Dist: ai-prompter>=0.2.3
 Requires-Dist: aiohttp>=3.11
+Requires-Dist: asciidoc>=10.2.1
 Requires-Dist: bs4>=0.0.2
 Requires-Dist: dicttoxml>=1.7.16
+Requires-Dist: docling>=2.34.0
 Requires-Dist: esperanto[openai]>=1.2.0
+Requires-Dist: firecrawl-py>=2.7.0
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: langgraph>=0.3.29
@@ -17,18 +20,15 @@ Requires-Dist: loguru>=0.7.3
 Requires-Dist: moviepy>=2.1.2
 Requires-Dist: openpyxl>=3.1.5
 Requires-Dist: pandas>=2.2.3
+Requires-Dist: pillow>=10.4.0
 Requires-Dist: pymupdf>=1.25.5
 Requires-Dist: python-docx>=1.1.2
 Requires-Dist: python-dotenv>=1.1.0
 Requires-Dist: python-magic>=0.4.27
 Requires-Dist: python-pptx>=1.0.2
+Requires-Dist: readability-lxml>=0.8.4.1
 Requires-Dist: validators>=0.34.0
 Requires-Dist: youtube-transcript-api>=1.0.3
-Provides-Extra: docling
-Requires-Dist: asciidoc; extra == 'docling'
-Requires-Dist: docling; extra == 'docling'
-Requires-Dist: pandas; extra == 'docling'
-Requires-Dist: pillow; extra == 'docling'
 Description-Content-Type: text/markdown
 
 # Content Core
@@ -39,6 +39,8 @@ Description-Content-Type: text/markdown
 
 ## Overview
 
+> **Note:** As of v0.8, the default extraction engine is `'auto'`. Content Core will automatically select the best extraction method based on your environment and available API keys, with a smart fallback order for both URLs and files. For files/documents, `'auto'` now tries Docling first, then falls back to simple extraction. You can override the engine if needed, but `'auto'` is recommended for most users.
+
 The primary goal of Content Core is to simplify the process of ingesting content from diverse origins. Whether you have raw text, a URL pointing to an article, or a local file like a video or markdown document, Content Core aims to extract the meaningful content for further use.
 
 ## Key Features
@@ -48,6 +50,10 @@ The primary goal of Content Core is to simplify the process of ingesting content
 * Web URLs (using robust extraction methods).
 * Local files (including automatic transcription for video/audio files and parsing for text-based formats).
 * **Intelligent Processing:** Applies appropriate extraction techniques based on the source type. See the [Processors Documentation](./docs/processors.md) for detailed information on how different content types are handled.
+* **Smart Engine Selection:** By default, Content Core uses the `'auto'` engine, which:
+  * For URLs: Uses Firecrawl if `FIRECRAWL_API_KEY` is set, else tries Jina. Jina might fail because of rate limits, which can be fixed by adding `JINA_API_KEY`. If Jina failes, BeautifulSoup is used as a fallback.
+  * For files: Tries Docling extraction first (for robust document parsing), then falls back to simple extraction if needed.
+  * You can override this by specifying an engine, but `'auto'` is recommended for most users.
 * **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
 * **Asynchronous:** Built with `asyncio` for efficient I/O operations.
 
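The override mentioned in the last sub-bullet above maps to the `engine` field on the extraction state; a sketch of passing it through `extract_content` (assuming, as in the usage section further down, that `extract_content` accepts a plain dict and forwards extra keys such as `engine` to `ProcessSourceState`):

```python
from content_core import extract_content  # import path assumed from the README usage

async def extract_with_override():
    # Let 'auto' pick the extractor for a URL (Firecrawl, then Jina, then BeautifulSoup).
    url_data = await extract_content({"url": "https://www.example.com"})

    # Pin Docling for a local document instead of the auto fallback chain.
    doc_data = await extract_content(
        {"file_path": "path/to/your/document.pdf", "engine": "docling"}
    )
    return url_data, doc_data
```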
@@ -60,8 +66,6 @@ Install Content Core using `pip`:
 ```bash
 # Install the package (without Docling)
 pip install content-core
-# Install with Docling support
-pip install content-core[docling]
 ```
 
 Alternatively, if you’re developing locally:
@@ -218,15 +222,15 @@ async def main():
     text_data = await extract_content({"content": "This is my sample text content."})
     print(text_data)
 
-    # Extract from a URL
+    # Extract from a URL (uses 'auto' engine by default)
     url_data = await extract_content({"url": "https://www.example.com"})
     print(url_data)
 
-    # Extract from a local video file (gets transcript)
+    # Extract from a local video file (gets transcript, engine='auto' by default)
     video_data = await extract_content({"file_path": "path/to/your/video.mp4"})
     print(video_data)
 
-    # Extract from a local markdown file
+    # Extract from a local markdown file (engine='auto' by default)
     md_data = await extract_content({"file_path": "path/to/your/document.md"})
     print(md_data)
 
@@ -248,15 +252,11 @@ if __name__ == "__main__":
 
 Content Core supports an optional Docling-based extraction engine for rich document formats (PDF, DOCX, PPTX, XLSX, Markdown, AsciiDoc, HTML, CSV, Images).
 
-### Installation
-
-```bash
-# Install with Docling support
-pip install content-core[docling]
-```
 
 ### Enabling Docling
 
+Docling is not the default engine when parsing documents. If you don't want to use it, you need to set engine to "simple".
+
 #### Via configuration file
 
 In your `cc_config.yaml` or custom config, set:
{content_core-0.7.2.dist-info → content_core-0.8.1.dist-info}/RECORD
CHANGED

@@ -8,30 +8,31 @@ content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
 content_core/templated_message.py,sha256=KbI2rcvgGM5oRIcsG68zAZfgNsC97fR16D61683ZSnY,1617
 content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
 content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
-content_core/common/state.py,sha256=
+content_core/common/state.py,sha256=pO8Oq71KxznlZ4K5qUVfyLrNsZWd2yMO9bXKmrTIXQo,1427
+content_core/common/types.py,sha256=FpIzYadBvafGI4e1EuwGjjiPuawL1HitxsQOciNjTZo,497
 content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
 content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCrefU,171
 content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
 content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
-content_core/content/extraction/graph.py,sha256=
+content_core/content/extraction/graph.py,sha256=51B_j_hi7SsKh7kKNLFsMmxyR2HVS-mOYfKvDFyuYfw,7001
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
 content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
-content_core/processors/audio.py,sha256=
+content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
 content_core/processors/docling.py,sha256=wQ8ThAcyrCy-c95QtgplQ9UZtjCZTddLD9y1_CrRtSQ,2111
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=
+content_core/processors/url.py,sha256=yt-uuzS4N-RAOJ8vo5x-b4bgnrFeTV-3SDIatRTRI3g,9462
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
 content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
 content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-0.
-content_core-0.
-content_core-0.
-content_core-0.
-content_core-0.
+content_core-0.8.1.dist-info/METADATA,sha256=ZIW6gtawFeFo2uQqWkFH2ctSYIUq5PBrke4gyHQQAWU,11439
+content_core-0.8.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-0.8.1.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
+content_core-0.8.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-0.8.1.dist-info/RECORD,,

{content_core-0.7.2.dist-info → content_core-0.8.1.dist-info}/WHEEL
File without changes

{content_core-0.7.2.dist-info → content_core-0.8.1.dist-info}/entry_points.txt
File without changes

{content_core-0.7.2.dist-info → content_core-0.8.1.dist-info}/licenses/LICENSE
File without changes