content-core 0.6.0__tar.gz → 0.7.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- {content_core-0.6.0 → content_core-0.7.2}/PKG-INFO +3 -5
- {content_core-0.6.0 → content_core-0.7.2}/pyproject.toml +4 -5
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/common/state.py +9 -2
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/extraction/graph.py +2 -9
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/url.py +52 -0
- {content_core-0.6.0 → content_core-0.7.2}/uv.lock +1019 -990
- {content_core-0.6.0 → content_core-0.7.2}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/.github/workflows/publish.yml +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/.gitignore +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/.python-version +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/CONTRIBUTING.md +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/LICENSE +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/Makefile +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/README.md +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/docs/processors.md +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/docs/usage.md +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/prompts/content/cleanup.jinja +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/prompts/content/summarize.jinja +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/__init__.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/cc_config.yaml +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/common/__init__.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/common/exceptions.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/common/utils.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/config.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/__init__.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/cleanup/core.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/summary/core.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/logging.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/models.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/models_config.yaml +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/notebooks/run.ipynb +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/audio.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/docling.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/office.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/pdf.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/text.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/video.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/youtube.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/py.typed +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/templated_message.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/tools/__init__.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/tools/cleanup.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/tools/extract.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/src/content_core/tools/summarize.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.docx +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.epub +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.md +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.mp3 +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.mp4 +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.pdf +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.pptx +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.txt +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.xlsx +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file_audio.mp3 +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/tests/integration/test_extraction.py +0 -0
- {content_core-0.6.0 → content_core-0.7.2}/tests/unit/test_docling.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.7.2
|
|
4
4
|
Summary: Extract what matters from any media source
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -9,14 +9,12 @@ Requires-Dist: ai-prompter>=0.2.3
|
|
|
9
9
|
Requires-Dist: aiohttp>=3.11
|
|
10
10
|
Requires-Dist: bs4>=0.0.2
|
|
11
11
|
Requires-Dist: dicttoxml>=1.7.16
|
|
12
|
-
Requires-Dist: esperanto>=1.2.0
|
|
13
|
-
Requires-Dist: google-genai>=1.10.0
|
|
12
|
+
Requires-Dist: esperanto[openai]>=1.2.0
|
|
14
13
|
Requires-Dist: jinja2>=3.1.6
|
|
15
14
|
Requires-Dist: langdetect>=1.0.9
|
|
16
15
|
Requires-Dist: langgraph>=0.3.29
|
|
17
16
|
Requires-Dist: loguru>=0.7.3
|
|
18
17
|
Requires-Dist: moviepy>=2.1.2
|
|
19
|
-
Requires-Dist: openai>=1.73.0
|
|
20
18
|
Requires-Dist: openpyxl>=3.1.5
|
|
21
19
|
Requires-Dist: pandas>=2.2.3
|
|
22
20
|
Requires-Dist: pymupdf>=1.25.5
|
|
@@ -28,7 +26,7 @@ Requires-Dist: validators>=0.34.0
|
|
|
28
26
|
Requires-Dist: youtube-transcript-api>=1.0.3
|
|
29
27
|
Provides-Extra: docling
|
|
30
28
|
Requires-Dist: asciidoc; extra == 'docling'
|
|
31
|
-
Requires-Dist: docling
|
|
29
|
+
Requires-Dist: docling; extra == 'docling'
|
|
32
30
|
Requires-Dist: pandas; extra == 'docling'
|
|
33
31
|
Requires-Dist: pillow; extra == 'docling'
|
|
34
32
|
Description-Content-Type: text/markdown
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "content-core"
|
|
3
|
-
version = "0.6.0"
|
|
3
|
+
version = "0.7.2"
|
|
4
4
|
description = "Extract what matters from any media source"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
homepage = "https://github.com/lfnovo/content-core"
|
|
@@ -11,12 +11,10 @@ requires-python = ">=3.10"
|
|
|
11
11
|
dependencies = [
|
|
12
12
|
"aiohttp>=3.11",
|
|
13
13
|
"bs4>=0.0.2",
|
|
14
|
-
"esperanto>=1.2.0",
|
|
15
|
-
"google-genai>=1.10.0",
|
|
14
|
+
"esperanto[openai]>=1.2.0",
|
|
16
15
|
"jinja2>=3.1.6",
|
|
17
16
|
"langdetect>=1.0.9",
|
|
18
17
|
"loguru>=0.7.3",
|
|
19
|
-
"openai>=1.73.0",
|
|
20
18
|
"openpyxl>=3.1.5",
|
|
21
19
|
"pandas>=2.2.3",
|
|
22
20
|
"pymupdf>=1.25.5",
|
|
@@ -33,7 +31,7 @@ dependencies = [
|
|
|
33
31
|
]
|
|
34
32
|
|
|
35
33
|
[project.optional-dependencies]
|
|
36
|
-
docling = ["docling
|
|
34
|
+
docling = ["docling", "Pillow", "pandas", "asciidoc"]
|
|
37
35
|
|
|
38
36
|
[project.scripts]
|
|
39
37
|
ccore = "content_core:ccore"
|
|
@@ -54,6 +52,7 @@ package-dir = {"content_core" = "src/content_core"}
|
|
|
54
52
|
dev = [
|
|
55
53
|
"ipykernel>=4.0.1",
|
|
56
54
|
"ipywidgets>=4.0.0",
|
|
55
|
+
"openai>=1.78.1",
|
|
57
56
|
"pyperclip>=1.9.0",
|
|
58
57
|
"pytest>=7.2.0",
|
|
59
58
|
"pytest-asyncio>=0.21.0",
|
|
@@ -13,8 +13,13 @@ class ProcessSourceState(BaseModel):
|
|
|
13
13
|
identified_provider: Optional[str] = ""
|
|
14
14
|
metadata: Optional[dict] = Field(default_factory=lambda: {})
|
|
15
15
|
content: Optional[str] = ""
|
|
16
|
-
engine: Optional[str] = Field(
|
|
17
|
-
|
|
16
|
+
engine: Optional[str] = Field(
|
|
17
|
+
default=None, description="Override extraction engine: 'legacy' or 'docling'"
|
|
18
|
+
)
|
|
19
|
+
output_format: Optional[str] = Field(
|
|
20
|
+
default=None,
|
|
21
|
+
description="Override Docling output format: 'markdown', 'html', or 'json'",
|
|
22
|
+
)
|
|
18
23
|
|
|
19
24
|
|
|
20
25
|
class ProcessSourceInput(BaseModel):
|
|
@@ -27,6 +32,8 @@ class ProcessSourceInput(BaseModel):
|
|
|
27
32
|
|
|
28
33
|
class ProcessSourceOutput(BaseModel):
|
|
29
34
|
title: Optional[str] = ""
|
|
35
|
+
file_path: Optional[str] = ""
|
|
36
|
+
url: Optional[str] = ""
|
|
30
37
|
source_type: Optional[str] = ""
|
|
31
38
|
identified_type: Optional[str] = ""
|
|
32
39
|
identified_provider: Optional[str] = ""
|
|
@@ -15,10 +15,8 @@ from content_core.common import (
|
|
|
15
15
|
from content_core.config import CONFIG # type: ignore
|
|
16
16
|
from content_core.logging import logger
|
|
17
17
|
from content_core.processors.audio import extract_audio_data # type: ignore
|
|
18
|
-
from content_core.processors.docling import (
|
|
19
|
-
|
|
20
|
-
extract_with_docling,
|
|
21
|
-
)
|
|
18
|
+
from content_core.processors.docling import DOCLING_SUPPORTED # type: ignore
|
|
19
|
+
from content_core.processors.docling import extract_with_docling
|
|
22
20
|
from content_core.processors.office import (
|
|
23
21
|
SUPPORTED_OFFICE_TYPES,
|
|
24
22
|
extract_office_content,
|
|
@@ -186,8 +184,3 @@ workflow.add_edge("download_remote_file", "file_type")
|
|
|
186
184
|
|
|
187
185
|
# Compile graph
|
|
188
186
|
graph = workflow.compile()
|
|
189
|
-
|
|
190
|
-
# Compile graph
|
|
191
|
-
graph = workflow.compile()
|
|
192
|
-
# Compile graph
|
|
193
|
-
graph = workflow.compile()
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from urllib.parse import urlparse
|
|
3
|
+
from io import BytesIO
|
|
3
4
|
|
|
4
5
|
import aiohttp
|
|
6
|
+
import docx
|
|
5
7
|
from bs4 import BeautifulSoup, Comment
|
|
6
8
|
|
|
7
9
|
from content_core.common import ProcessSourceState
|
|
@@ -12,6 +14,49 @@ from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
|
|
|
12
14
|
# https://github.com/buriy/python-readability
|
|
13
15
|
# also try readability: from readability import Document
|
|
14
16
|
|
|
17
|
+
DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
|
18
|
+
|
|
19
|
+
async def _extract_docx_content(docx_bytes: bytes, url: str):
|
|
20
|
+
"""
|
|
21
|
+
Extract content from DOCX file bytes.
|
|
22
|
+
"""
|
|
23
|
+
try:
|
|
24
|
+
logger.debug(f"Attempting to parse DOCX from URL: {url} with python-docx")
|
|
25
|
+
doc = docx.Document(BytesIO(docx_bytes))
|
|
26
|
+
content_parts = [p.text for p in doc.paragraphs if p.text]
|
|
27
|
+
full_content = "\n\n".join(content_parts)
|
|
28
|
+
|
|
29
|
+
# Try to get a title from document properties or first heading
|
|
30
|
+
title = doc.core_properties.title
|
|
31
|
+
if not title and doc.paragraphs:
|
|
32
|
+
# Look for a potential title in the first few paragraphs (e.g., if styled as heading)
|
|
33
|
+
for p in doc.paragraphs[:5]: # Check first 5 paragraphs
|
|
34
|
+
if p.style.name.startswith('Heading'):
|
|
35
|
+
title = p.text
|
|
36
|
+
break
|
|
37
|
+
if not title: # Fallback to first line if no heading found
|
|
38
|
+
title = doc.paragraphs[0].text.strip() if doc.paragraphs[0].text.strip() else None
|
|
39
|
+
|
|
40
|
+
# If no title found, use filename from URL
|
|
41
|
+
if not title:
|
|
42
|
+
title = urlparse(url).path.split('/')[-1]
|
|
43
|
+
|
|
44
|
+
logger.info(f"Successfully extracted content from DOCX: {url}, Title: {title}")
|
|
45
|
+
return {
|
|
46
|
+
"title": title,
|
|
47
|
+
"content": full_content,
|
|
48
|
+
"domain": urlparse(url).netloc,
|
|
49
|
+
"url": url,
|
|
50
|
+
}
|
|
51
|
+
except Exception as e:
|
|
52
|
+
logger.error(f"Failed to process DOCX content from {url}: {e}")
|
|
53
|
+
# Fallback or re-raise, depending on desired error handling
|
|
54
|
+
return {
|
|
55
|
+
"title": f"Error Processing DOCX: {urlparse(url).path.split('/')[-1]}",
|
|
56
|
+
"content": f"Failed to extract content from DOCX: {e}",
|
|
57
|
+
"domain": urlparse(url).netloc,
|
|
58
|
+
"url": url,
|
|
59
|
+
}
|
|
15
60
|
|
|
16
61
|
async def url_provider(state: ProcessSourceState):
|
|
17
62
|
"""
|
|
@@ -54,6 +99,13 @@ async def extract_url_bs4(url: str):
|
|
|
54
99
|
async with aiohttp.ClientSession() as session:
|
|
55
100
|
async with session.get(url, headers=headers, timeout=10) as response:
|
|
56
101
|
response.raise_for_status()
|
|
102
|
+
# Check content type for DOCX
|
|
103
|
+
if response.content_type == DOCX_MIME_TYPE:
|
|
104
|
+
logger.debug(f"Detected DOCX content type for {url}")
|
|
105
|
+
docx_bytes = await response.read()
|
|
106
|
+
return await _extract_docx_content(docx_bytes, url)
|
|
107
|
+
|
|
108
|
+
# If not DOCX, proceed as HTML
|
|
57
109
|
html_content = await response.text()
|
|
58
110
|
|
|
59
111
|
soup = BeautifulSoup(html_content, "html.parser")
|