PyPI - content-core - Versions diffs - 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

content-core 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of content-core might be problematic. Click here for more details.

Files changed (7) hide show

content_core/content/extraction/graph.py CHANGED Viewed

@@ -21,6 +21,10 @@ from content_core.processors.url import extract_url, url_provider
 from content_core.processors.video import extract_best_audio_from_video
 from content_core.processors.youtube import extract_youtube_transcript
+import aiohttp
+import tempfile
+from urllib.parse import urlparse
 async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
     """
@@ -91,6 +95,21 @@ async def source_type_router(x: ProcessSourceState) -> Optional[str]:
     return x.source_type
+async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
+    url = state.url
+    assert url, "No URL provided"
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url) as resp:
+            resp.raise_for_status()
+            mime = resp.headers.get("content-type", "").split(";", 1)[0]
+            suffix = os.path.splitext(urlparse(url).path)[1] if urlparse(url).path else ""
+            fd, tmp = tempfile.mkstemp(suffix=suffix)
+            os.close(fd)
+            with open(tmp, "wb") as f:
+                f.write(await resp.read())
+    return {"file_path": tmp, "identified_type": mime}
 # Create workflow
 workflow = StateGraph(
     ProcessSourceState, input=ProcessSourceInput, output=ProcessSourceState
@@ -108,6 +127,7 @@ workflow.add_node("extract_best_audio_from_video", extract_best_audio_from_video
 workflow.add_node("extract_audio", extract_audio)
 workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
 workflow.add_node("delete_file", delete_file)
+workflow.add_node("download_remote_file", download_remote_file)
 # Add edges
 workflow.add_edge(START, "source")
@@ -127,7 +147,7 @@ workflow.add_conditional_edges(
 workflow.add_conditional_edges(
     "url_provider",
     url_type_router,
-    {"article": "extract_url", "youtube": "extract_youtube_transcript"},
+    {**{m: "download_remote_file" for m in SUPPORTED_FITZ_TYPES}, "article": "extract_url", "youtube": "extract_youtube_transcript"},
 )
 workflow.add_edge("url_provider", END)
 workflow.add_edge("file_type", END)
@@ -140,6 +160,7 @@ workflow.add_edge("extract_office_content", "delete_file")
 workflow.add_edge("extract_best_audio_from_video", "extract_audio")
 workflow.add_edge("extract_audio", "delete_file")
 workflow.add_edge("delete_file", END)
+workflow.add_edge("download_remote_file", "file_type")
 # Compile graph
 graph = workflow.compile()

content_core/processors/url.py CHANGED Viewed

@@ -6,6 +6,7 @@ from bs4 import BeautifulSoup, Comment
 from content_core.common import ProcessSourceState
 from content_core.logging import logger
+from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
 # future: better extraction methods
 # https://github.com/buriy/python-readability
@@ -20,12 +21,20 @@ async def url_provider(state: ProcessSourceState):
     url = state.url
     if url:
         if "youtube.com" in url or "youtu.be" in url:
-            return_dict["identified_type"] = (
-                "youtube"  # future: playlists, channels in the future
-            )
+            return_dict["identified_type"] = "youtube"
         else:
-            return_dict["identified_type"] = "article"
-            # future: article providers in the future
+            # remote URL: check content-type to catch PDFs
+            try:
+                async with aiohttp.ClientSession() as session:
+                    async with session.head(url, timeout=10, allow_redirects=True) as resp:
+                        mime = resp.headers.get("content-type", "").split(";", 1)[0]
+            except Exception as e:
+                logger.debug(f"HEAD check failed for {url}: {e}")
+                mime = "article"
+            if mime in SUPPORTED_FITZ_TYPES:
+                return_dict["identified_type"] = mime
+            else:
+                return_dict["identified_type"] = "article"
     return return_dict

{content_core-0.3.0.dist-info → content_core-0.4.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 0.3.0
+Version: 0.4.0
 Summary: Extract what matters from any media source
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE

{content_core-0.3.0.dist-info → content_core-0.4.0.dist-info}/RECORD RENAMED Viewed

@@ -14,7 +14,7 @@ content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCr
 content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
 content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
-content_core/content/extraction/graph.py,sha256=4-yZDYErUvnPsgoBM3zmpGFg347-cbwJ4_VeyMmAYj4,4635
+content_core/content/extraction/graph.py,sha256=W_mpGcR_Vw6cMh56U-YONzVxFMbhY9aU8rt3Pdta6Bg,5526
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
 content_core/notebooks/run.ipynb,sha256=U_-SXsEmMNiNhFiZXtQeEeSnVn1NF4q9Xd6XOUpcjqg,330371
@@ -22,7 +22,7 @@ content_core/processors/audio.py,sha256=jDn0_6F5dLcmz_C-iR80uOqOIAz49ELya2R5JeM1
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=u2qgGLe9n58RtGXMBf1d31rwMIgyogg7Btn-AEl8KQU,6282
+content_core/processors/url.py,sha256=yhAnvIlYKc13iZedwA0ck6h6wd2j6T-Q2NAtMen3hIs,6783
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
 content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
 content_core/prompts/content/cleanup.jinja,sha256=elyjbm9O_AeOcxkG-kui5wjBIRiOQCicjm92I4NmoVA,693
@@ -31,8 +31,8 @@ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8j
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-0.3.0.dist-info/METADATA,sha256=nBKxYD0J8db7zO9ZVSyyxwl7VxtVGMrRREpt-HPgbBg,9111
-content_core-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-content_core-0.3.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
-content_core-0.3.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
-content_core-0.3.0.dist-info/RECORD,,
+content_core-0.4.0.dist-info/METADATA,sha256=sXLcda5ZXi4ibpBxrIlC_YT3DuJcNiqk_FFR_LgMISQ,9111
+content_core-0.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-0.4.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
+content_core-0.4.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-0.4.0.dist-info/RECORD,,

{content_core-0.3.0.dist-info → content_core-0.4.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{content_core-0.3.0.dist-info → content_core-0.4.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{content_core-0.3.0.dist-info → content_core-0.4.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

content-core 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

Potentially problematic release.

content-core 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl