content-core 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- {content_core-0.3.0 → content_core-0.4.0}/.windsurfrules +2 -2
- {content_core-0.3.0 → content_core-0.4.0}/PKG-INFO +1 -1
- {content_core-0.3.0 → content_core-0.4.0}/pyproject.toml +2 -1
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/content/extraction/graph.py +22 -1
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/processors/url.py +14 -5
- {content_core-0.3.0 → content_core-0.4.0}/tests/integration/test_extraction.py +10 -0
- {content_core-0.3.0 → content_core-0.4.0}/uv.lock +2 -2
- {content_core-0.3.0 → content_core-0.4.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/.github/workflows/publish.yml +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/.gitignore +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/.python-version +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/CONTRIBUTING.md +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/LICENSE +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/Makefile +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/README.md +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/docs/processors.md +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/docs/usage.md +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/__init__.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/common/__init__.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/common/exceptions.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/common/state.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/common/utils.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/config.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/content/__init__.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/content/cleanup/core.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/content/summary/core.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/logging.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/models.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/models_config.yaml +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/notebooks/run.ipynb +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/processors/audio.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/processors/office.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/processors/pdf.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/processors/text.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/processors/video.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/processors/youtube.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/prompter.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/prompts/content/cleanup.jinja +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/prompts/content/summarize.jinja +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/py.typed +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/templated_message.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/tools/__init__.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/tools/cleanup.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/tools/extract.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/src/content_core/tools/summarize.py +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/tests/input_content/file.docx +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/tests/input_content/file.epub +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/tests/input_content/file.md +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/tests/input_content/file.mp3 +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/tests/input_content/file.mp4 +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/tests/input_content/file.pdf +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/tests/input_content/file.pptx +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/tests/input_content/file.txt +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/tests/input_content/file.xlsx +0 -0
- {content_core-0.3.0 → content_core-0.4.0}/tests/input_content/file_audio.mp3 +0 -0
|
@@ -4,10 +4,10 @@ All documentation (code or readmes) must be in english.
|
|
|
4
4
|
Whenever I ask you to tag and release, make sure to run `make test` as part of the process.
|
|
5
5
|
|
|
6
6
|
The full release process is:
|
|
7
|
-
- Run `make test` to make sure everything is working
|
|
7
|
+
- Run `make test` to make sure everything is working (if we changed any code or import)
|
|
8
8
|
- Update version on pyproject.toml
|
|
9
9
|
- Run `uv sync` to update the lock file
|
|
10
10
|
- Commit all that's needed
|
|
11
|
-
- Merge to main
|
|
11
|
+
- Merge to main (if in a branch)
|
|
12
12
|
- Tag the release
|
|
13
13
|
- Push to GitHub
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "content-core"
|
|
3
|
-
version = "0.3.0"
|
|
3
|
+
version = "0.4.0"
|
|
4
4
|
description = "Extract what matters from any media source"
|
|
5
5
|
readme = "README.md"
|
|
6
|
+
homepage = "https://github.com/lfnovo/content-core"
|
|
6
7
|
authors = [
|
|
7
8
|
{ name = "LUIS NOVO", email = "lfnovo@gmail.com" }
|
|
8
9
|
]
|
|
@@ -21,6 +21,10 @@ from content_core.processors.url import extract_url, url_provider
|
|
|
21
21
|
from content_core.processors.video import extract_best_audio_from_video
|
|
22
22
|
from content_core.processors.youtube import extract_youtube_transcript
|
|
23
23
|
|
|
24
|
+
import aiohttp
|
|
25
|
+
import tempfile
|
|
26
|
+
from urllib.parse import urlparse
|
|
27
|
+
|
|
24
28
|
|
|
25
29
|
async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
|
|
26
30
|
"""
|
|
@@ -91,6 +95,21 @@ async def source_type_router(x: ProcessSourceState) -> Optional[str]:
|
|
|
91
95
|
return x.source_type
|
|
92
96
|
|
|
93
97
|
|
|
98
|
+
async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
|
|
99
|
+
url = state.url
|
|
100
|
+
assert url, "No URL provided"
|
|
101
|
+
async with aiohttp.ClientSession() as session:
|
|
102
|
+
async with session.get(url) as resp:
|
|
103
|
+
resp.raise_for_status()
|
|
104
|
+
mime = resp.headers.get("content-type", "").split(";", 1)[0]
|
|
105
|
+
suffix = os.path.splitext(urlparse(url).path)[1] if urlparse(url).path else ""
|
|
106
|
+
fd, tmp = tempfile.mkstemp(suffix=suffix)
|
|
107
|
+
os.close(fd)
|
|
108
|
+
with open(tmp, "wb") as f:
|
|
109
|
+
f.write(await resp.read())
|
|
110
|
+
return {"file_path": tmp, "identified_type": mime}
|
|
111
|
+
|
|
112
|
+
|
|
94
113
|
# Create workflow
|
|
95
114
|
workflow = StateGraph(
|
|
96
115
|
ProcessSourceState, input=ProcessSourceInput, output=ProcessSourceState
|
|
@@ -108,6 +127,7 @@ workflow.add_node("extract_best_audio_from_video", extract_best_audio_from_video
|
|
|
108
127
|
workflow.add_node("extract_audio", extract_audio)
|
|
109
128
|
workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
|
|
110
129
|
workflow.add_node("delete_file", delete_file)
|
|
130
|
+
workflow.add_node("download_remote_file", download_remote_file)
|
|
111
131
|
|
|
112
132
|
# Add edges
|
|
113
133
|
workflow.add_edge(START, "source")
|
|
@@ -127,7 +147,7 @@ workflow.add_conditional_edges(
|
|
|
127
147
|
workflow.add_conditional_edges(
|
|
128
148
|
"url_provider",
|
|
129
149
|
url_type_router,
|
|
130
|
-
{"article": "extract_url", "youtube": "extract_youtube_transcript"},
|
|
150
|
+
{**{m: "download_remote_file" for m in SUPPORTED_FITZ_TYPES}, "article": "extract_url", "youtube": "extract_youtube_transcript"},
|
|
131
151
|
)
|
|
132
152
|
workflow.add_edge("url_provider", END)
|
|
133
153
|
workflow.add_edge("file_type", END)
|
|
@@ -140,6 +160,7 @@ workflow.add_edge("extract_office_content", "delete_file")
|
|
|
140
160
|
workflow.add_edge("extract_best_audio_from_video", "extract_audio")
|
|
141
161
|
workflow.add_edge("extract_audio", "delete_file")
|
|
142
162
|
workflow.add_edge("delete_file", END)
|
|
163
|
+
workflow.add_edge("download_remote_file", "file_type")
|
|
143
164
|
|
|
144
165
|
# Compile graph
|
|
145
166
|
graph = workflow.compile()
|
|
@@ -6,6 +6,7 @@ from bs4 import BeautifulSoup, Comment
|
|
|
6
6
|
|
|
7
7
|
from content_core.common import ProcessSourceState
|
|
8
8
|
from content_core.logging import logger
|
|
9
|
+
from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
|
|
9
10
|
|
|
10
11
|
# future: better extraction methods
|
|
11
12
|
# https://github.com/buriy/python-readability
|
|
@@ -20,12 +21,20 @@ async def url_provider(state: ProcessSourceState):
|
|
|
20
21
|
url = state.url
|
|
21
22
|
if url:
|
|
22
23
|
if "youtube.com" in url or "youtu.be" in url:
|
|
23
|
-
return_dict["identified_type"] = (
|
|
24
|
-
"youtube" # future: playlists, channels in the future
|
|
25
|
-
)
|
|
24
|
+
return_dict["identified_type"] = "youtube"
|
|
26
25
|
else:
|
|
27
|
-
|
|
28
|
-
|
|
26
|
+
# remote URL: check content-type to catch PDFs
|
|
27
|
+
try:
|
|
28
|
+
async with aiohttp.ClientSession() as session:
|
|
29
|
+
async with session.head(url, timeout=10, allow_redirects=True) as resp:
|
|
30
|
+
mime = resp.headers.get("content-type", "").split(";", 1)[0]
|
|
31
|
+
except Exception as e:
|
|
32
|
+
logger.debug(f"HEAD check failed for {url}: {e}")
|
|
33
|
+
mime = "article"
|
|
34
|
+
if mime in SUPPORTED_FITZ_TYPES:
|
|
35
|
+
return_dict["identified_type"] = mime
|
|
36
|
+
else:
|
|
37
|
+
return_dict["identified_type"] = "article"
|
|
29
38
|
return return_dict
|
|
30
39
|
|
|
31
40
|
|
|
@@ -200,3 +200,13 @@ async def test_extract_content_from_xlsx(fixture_path):
|
|
|
200
200
|
)
|
|
201
201
|
assert result.title is not None # Attempt to extract title/metadata
|
|
202
202
|
assert len(result.content) > 0 # Check that some content was extracted
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
@pytest.mark.asyncio
|
|
206
|
+
async def test_extract_content_from_pdf_url():
|
|
207
|
+
"""Tests extracting content from a remote PDF URL."""
|
|
208
|
+
url = "https://arxiv.org/pdf/2408.09869"
|
|
209
|
+
result = await extract_content({"url": url})
|
|
210
|
+
assert result.source_type == "url"
|
|
211
|
+
assert result.identified_type == "application/pdf"
|
|
212
|
+
assert len(result.content) > 100 # Expect substantial extracted text
|
|
@@ -354,7 +354,7 @@ wheels = [
|
|
|
354
354
|
|
|
355
355
|
[[package]]
|
|
356
356
|
name = "content-core"
|
|
357
|
-
version = "0.3.0"
|
|
357
|
+
version = "0.4.0"
|
|
358
358
|
source = { editable = "." }
|
|
359
359
|
dependencies = [
|
|
360
360
|
{ name = "aiohttp" },
|
|
@@ -1018,7 +1018,7 @@ wheels = [
|
|
|
1018
1018
|
{ url = "https://files.pythonhosted.org/packages/8c/de/8eb6fffecd9c5f129461edcdd7e1ac944f9de15783e3d89c84ed6e0374bc/lxml-5.3.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:aa837e6ee9534de8d63bc4c1249e83882a7ac22bd24523f83fad68e6ffdf41ae", size = 5652903 },
|
|
1019
1019
|
{ url = "https://files.pythonhosted.org/packages/95/79/80f4102a08495c100014593680f3f0f7bd7c1333b13520aed855fc993326/lxml-5.3.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:da4c9223319400b97a2acdfb10926b807e51b69eb7eb80aad4942c0516934858", size = 5491813 },
|
|
1020
1020
|
{ url = "https://files.pythonhosted.org/packages/15/f5/9b1f7edf6565ee31e4300edb1bcc61eaebe50a3cff4053c0206d8dc772f2/lxml-5.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:dc0e9bdb3aa4d1de703a437576007d366b54f52c9897cae1a3716bb44fc1fc85", size = 5227837 },
|
|
1021
|
-
{ url = "https://files.pythonhosted.org/packages/
|
|
1021
|
+
{ url = "https://files.pythonhosted.org/packages/dd/53/a187c4ccfcd5fbfca01e6c96da39499d8b801ab5dcf57717db95d7a968a8/lxml-5.3.2-cp310-cp310-win32.win32.whl", hash = "sha256:dd755a0a78dd0b2c43f972e7b51a43be518ebc130c9f1a7c4480cf08b4385486", size = 3477533 },
|
|
1022
1022
|
{ url = "https://files.pythonhosted.org/packages/f2/2c/397c5a9d76a7a0faf9e5b13143ae1a7e223e71d2197a45da71c21aacb3d4/lxml-5.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:d64ea1686474074b38da13ae218d9fde0d1dc6525266976808f41ac98d9d7980", size = 3805160 },
|
|
1023
1023
|
{ url = "https://files.pythonhosted.org/packages/84/b8/2b727f5a90902f7cc5548349f563b60911ca05f3b92e35dfa751349f265f/lxml-5.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9d61a7d0d208ace43986a92b111e035881c4ed45b1f5b7a270070acae8b0bfb4", size = 8163457 },
|
|
1024
1024
|
{ url = "https://files.pythonhosted.org/packages/91/84/23135b2dc72b3440d68c8f39ace2bb00fe78e3a2255f7c74f7e76f22498e/lxml-5.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:856dfd7eda0b75c29ac80a31a6411ca12209183e866c33faf46e77ace3ce8a79", size = 4433445 },
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|