content-core 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core/content/extraction/graph.py +22 -1
- content_core/processors/url.py +14 -5
- {content_core-0.3.0.dist-info → content_core-0.4.0.dist-info}/METADATA +1 -1
- {content_core-0.3.0.dist-info → content_core-0.4.0.dist-info}/RECORD +7 -7
- {content_core-0.3.0.dist-info → content_core-0.4.0.dist-info}/WHEEL +0 -0
- {content_core-0.3.0.dist-info → content_core-0.4.0.dist-info}/entry_points.txt +0 -0
- {content_core-0.3.0.dist-info → content_core-0.4.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -21,6 +21,10 @@ from content_core.processors.url import extract_url, url_provider
|
|
|
21
21
|
from content_core.processors.video import extract_best_audio_from_video
|
|
22
22
|
from content_core.processors.youtube import extract_youtube_transcript
|
|
23
23
|
|
|
24
|
+
import aiohttp
|
|
25
|
+
import tempfile
|
|
26
|
+
from urllib.parse import urlparse
|
|
27
|
+
|
|
24
28
|
|
|
25
29
|
async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
|
|
26
30
|
"""
|
|
@@ -91,6 +95,21 @@ async def source_type_router(x: ProcessSourceState) -> Optional[str]:
|
|
|
91
95
|
return x.source_type
|
|
92
96
|
|
|
93
97
|
|
|
98
|
+
async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
    """Download ``state.url`` to a temporary file and report its MIME type.

    The response body is streamed to disk in chunks so that large documents
    are not held in memory all at once. The temporary file keeps the
    extension found in the URL path (if any).

    Args:
        state: Processing state; only ``state.url`` is read here.

    Returns:
        A dict with ``file_path`` (path of the downloaded temporary file) and
        ``identified_type`` (the response's Content-Type without parameters,
        stripped and lower-cased; empty string when the header is absent).

    Raises:
        AssertionError: If ``state.url`` is empty.
        aiohttp.ClientError: On connection problems or an HTTP error status
            (raised by ``raise_for_status``).

    Note:
        The temporary file is not removed here on success.
        NOTE(review): presumably a later graph node deletes it - confirm.
    """
    url = state.url
    assert url, "No URL provided"

    # Parse once. splitext("") yields an empty suffix, so a URL without a
    # path needs no special case.
    suffix = os.path.splitext(urlparse(url).path)[1]

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            resp.raise_for_status()
            # Drop parameters such as "; charset=utf-8". Media types are
            # case-insensitive, so normalise before handing the value on.
            mime = resp.headers.get("content-type", "").split(";", 1)[0].strip().lower()

            # Create the file only after the status check so that failed
            # requests leave nothing behind.
            fd, tmp = tempfile.mkstemp(suffix=suffix)
            try:
                # Wrap the descriptor mkstemp returned instead of closing it
                # and reopening the path.
                with os.fdopen(fd, "wb") as f:
                    async for chunk in resp.content.iter_chunked(64 * 1024):
                        f.write(chunk)
            except BaseException:
                # Covers task cancellation as well: never leave a partial
                # download on disk.
                try:
                    os.remove(tmp)
                except OSError:
                    pass
                raise

    return {"file_path": tmp, "identified_type": mime}
|
|
111
|
+
|
|
112
|
+
|
|
94
113
|
# Create workflow
|
|
95
114
|
workflow = StateGraph(
|
|
96
115
|
ProcessSourceState, input=ProcessSourceInput, output=ProcessSourceState
|
|
@@ -108,6 +127,7 @@ workflow.add_node("extract_best_audio_from_video", extract_best_audio_from_video
|
|
|
108
127
|
workflow.add_node("extract_audio", extract_audio)
|
|
109
128
|
workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
|
|
110
129
|
workflow.add_node("delete_file", delete_file)
|
|
130
|
+
workflow.add_node("download_remote_file", download_remote_file)
|
|
111
131
|
|
|
112
132
|
# Add edges
|
|
113
133
|
workflow.add_edge(START, "source")
|
|
@@ -127,7 +147,7 @@ workflow.add_conditional_edges(
|
|
|
127
147
|
workflow.add_conditional_edges(
|
|
128
148
|
"url_provider",
|
|
129
149
|
url_type_router,
|
|
130
|
-
{"article": "extract_url", "youtube": "extract_youtube_transcript"},
|
|
150
|
+
{**{m: "download_remote_file" for m in SUPPORTED_FITZ_TYPES}, "article": "extract_url", "youtube": "extract_youtube_transcript"},
|
|
131
151
|
)
|
|
132
152
|
workflow.add_edge("url_provider", END)
|
|
133
153
|
workflow.add_edge("file_type", END)
|
|
@@ -140,6 +160,7 @@ workflow.add_edge("extract_office_content", "delete_file")
|
|
|
140
160
|
workflow.add_edge("extract_best_audio_from_video", "extract_audio")
|
|
141
161
|
workflow.add_edge("extract_audio", "delete_file")
|
|
142
162
|
workflow.add_edge("delete_file", END)
|
|
163
|
+
workflow.add_edge("download_remote_file", "file_type")
|
|
143
164
|
|
|
144
165
|
# Compile graph
|
|
145
166
|
graph = workflow.compile()
|
content_core/processors/url.py
CHANGED
|
@@ -6,6 +6,7 @@ from bs4 import BeautifulSoup, Comment
|
|
|
6
6
|
|
|
7
7
|
from content_core.common import ProcessSourceState
|
|
8
8
|
from content_core.logging import logger
|
|
9
|
+
from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
|
|
9
10
|
|
|
10
11
|
# future: better extraction methods
|
|
11
12
|
# https://github.com/buriy/python-readability
|
|
@@ -20,12 +21,20 @@ async def url_provider(state: ProcessSourceState):
|
|
|
20
21
|
url = state.url
|
|
21
22
|
if url:
|
|
22
23
|
if "youtube.com" in url or "youtu.be" in url:
|
|
23
|
-
return_dict["identified_type"] = (
|
|
24
|
-
"youtube" # future: playlists, channels in the future
|
|
25
|
-
)
|
|
24
|
+
return_dict["identified_type"] = "youtube"
|
|
26
25
|
else:
|
|
27
|
-
|
|
28
|
-
|
|
26
|
+
# remote URL: check content-type to catch PDFs
|
|
27
|
+
try:
|
|
28
|
+
async with aiohttp.ClientSession() as session:
|
|
29
|
+
async with session.head(url, timeout=10, allow_redirects=True) as resp:
|
|
30
|
+
mime = resp.headers.get("content-type", "").split(";", 1)[0]
|
|
31
|
+
except Exception as e:
|
|
32
|
+
logger.debug(f"HEAD check failed for {url}: {e}")
|
|
33
|
+
mime = "article"
|
|
34
|
+
if mime in SUPPORTED_FITZ_TYPES:
|
|
35
|
+
return_dict["identified_type"] = mime
|
|
36
|
+
else:
|
|
37
|
+
return_dict["identified_type"] = "article"
|
|
29
38
|
return return_dict
|
|
30
39
|
|
|
31
40
|
|
|
@@ -14,7 +14,7 @@ content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCr
|
|
|
14
14
|
content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
|
|
15
15
|
content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
|
|
16
16
|
content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
|
|
17
|
-
content_core/content/extraction/graph.py,sha256=
|
|
17
|
+
content_core/content/extraction/graph.py,sha256=W_mpGcR_Vw6cMh56U-YONzVxFMbhY9aU8rt3Pdta6Bg,5526
|
|
18
18
|
content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
|
|
19
19
|
content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
|
|
20
20
|
content_core/notebooks/run.ipynb,sha256=U_-SXsEmMNiNhFiZXtQeEeSnVn1NF4q9Xd6XOUpcjqg,330371
|
|
@@ -22,7 +22,7 @@ content_core/processors/audio.py,sha256=jDn0_6F5dLcmz_C-iR80uOqOIAz49ELya2R5JeM1
|
|
|
22
22
|
content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
|
|
23
23
|
content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
|
|
24
24
|
content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
|
|
25
|
-
content_core/processors/url.py,sha256=
|
|
25
|
+
content_core/processors/url.py,sha256=yhAnvIlYKc13iZedwA0ck6h6wd2j6T-Q2NAtMen3hIs,6783
|
|
26
26
|
content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
|
|
27
27
|
content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
|
|
28
28
|
content_core/prompts/content/cleanup.jinja,sha256=elyjbm9O_AeOcxkG-kui5wjBIRiOQCicjm92I4NmoVA,693
|
|
@@ -31,8 +31,8 @@ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8j
|
|
|
31
31
|
content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
|
|
32
32
|
content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
|
|
33
33
|
content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
|
|
34
|
-
content_core-0.
|
|
35
|
-
content_core-0.
|
|
36
|
-
content_core-0.
|
|
37
|
-
content_core-0.
|
|
38
|
-
content_core-0.
|
|
34
|
+
content_core-0.4.0.dist-info/METADATA,sha256=sXLcda5ZXi4ibpBxrIlC_YT3DuJcNiqk_FFR_LgMISQ,9111
|
|
35
|
+
content_core-0.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
36
|
+
content_core-0.4.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
|
|
37
|
+
content_core-0.4.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
|
|
38
|
+
content_core-0.4.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|