content-core 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

@@ -21,6 +21,10 @@ from content_core.processors.url import extract_url, url_provider
21
21
  from content_core.processors.video import extract_best_audio_from_video
22
22
  from content_core.processors.youtube import extract_youtube_transcript
23
23
 
24
+ import aiohttp
25
+ import tempfile
26
+ from urllib.parse import urlparse
27
+
24
28
 
25
29
  async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
26
30
  """
@@ -91,6 +95,21 @@ async def source_type_router(x: ProcessSourceState) -> Optional[str]:
91
95
  return x.source_type
92
96
 
93
97
 
98
+ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
99
+ url = state.url
100
+ assert url, "No URL provided"
101
+ async with aiohttp.ClientSession() as session:
102
+ async with session.get(url) as resp:
103
+ resp.raise_for_status()
104
+ mime = resp.headers.get("content-type", "").split(";", 1)[0]
105
+ suffix = os.path.splitext(urlparse(url).path)[1] if urlparse(url).path else ""
106
+ fd, tmp = tempfile.mkstemp(suffix=suffix)
107
+ os.close(fd)
108
+ with open(tmp, "wb") as f:
109
+ f.write(await resp.read())
110
+ return {"file_path": tmp, "identified_type": mime}
111
+
112
+
94
113
  # Create workflow
95
114
  workflow = StateGraph(
96
115
  ProcessSourceState, input=ProcessSourceInput, output=ProcessSourceState
@@ -108,6 +127,7 @@ workflow.add_node("extract_best_audio_from_video", extract_best_audio_from_video
108
127
  workflow.add_node("extract_audio", extract_audio)
109
128
  workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
110
129
  workflow.add_node("delete_file", delete_file)
130
+ workflow.add_node("download_remote_file", download_remote_file)
111
131
 
112
132
  # Add edges
113
133
  workflow.add_edge(START, "source")
@@ -127,7 +147,7 @@ workflow.add_conditional_edges(
127
147
  workflow.add_conditional_edges(
128
148
  "url_provider",
129
149
  url_type_router,
130
- {"article": "extract_url", "youtube": "extract_youtube_transcript"},
150
+ {**{m: "download_remote_file" for m in SUPPORTED_FITZ_TYPES}, "article": "extract_url", "youtube": "extract_youtube_transcript"},
131
151
  )
132
152
  workflow.add_edge("url_provider", END)
133
153
  workflow.add_edge("file_type", END)
@@ -140,6 +160,7 @@ workflow.add_edge("extract_office_content", "delete_file")
140
160
  workflow.add_edge("extract_best_audio_from_video", "extract_audio")
141
161
  workflow.add_edge("extract_audio", "delete_file")
142
162
  workflow.add_edge("delete_file", END)
163
+ workflow.add_edge("download_remote_file", "file_type")
143
164
 
144
165
  # Compile graph
145
166
  graph = workflow.compile()
@@ -6,6 +6,7 @@ from bs4 import BeautifulSoup, Comment
6
6
 
7
7
  from content_core.common import ProcessSourceState
8
8
  from content_core.logging import logger
9
+ from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
9
10
 
10
11
  # future: better extraction methods
11
12
  # https://github.com/buriy/python-readability
@@ -20,12 +21,20 @@ async def url_provider(state: ProcessSourceState):
20
21
  url = state.url
21
22
  if url:
22
23
  if "youtube.com" in url or "youtu.be" in url:
23
- return_dict["identified_type"] = (
24
- "youtube" # future: playlists, channels in the future
25
- )
24
+ return_dict["identified_type"] = "youtube"
26
25
  else:
27
- return_dict["identified_type"] = "article"
28
- # future: article providers in the future
26
+ # remote URL: check content-type to catch PDFs
27
+ try:
28
+ async with aiohttp.ClientSession() as session:
29
+ async with session.head(url, timeout=10, allow_redirects=True) as resp:
30
+ mime = resp.headers.get("content-type", "").split(";", 1)[0]
31
+ except Exception as e:
32
+ logger.debug(f"HEAD check failed for {url}: {e}")
33
+ mime = "article"
34
+ if mime in SUPPORTED_FITZ_TYPES:
35
+ return_dict["identified_type"] = mime
36
+ else:
37
+ return_dict["identified_type"] = "article"
29
38
  return return_dict
30
39
 
31
40
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -14,7 +14,7 @@ content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCr
14
14
  content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
15
15
  content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
16
16
  content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
17
- content_core/content/extraction/graph.py,sha256=4-yZDYErUvnPsgoBM3zmpGFg347-cbwJ4_VeyMmAYj4,4635
17
+ content_core/content/extraction/graph.py,sha256=W_mpGcR_Vw6cMh56U-YONzVxFMbhY9aU8rt3Pdta6Bg,5526
18
18
  content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
19
19
  content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
20
20
  content_core/notebooks/run.ipynb,sha256=U_-SXsEmMNiNhFiZXtQeEeSnVn1NF4q9Xd6XOUpcjqg,330371
@@ -22,7 +22,7 @@ content_core/processors/audio.py,sha256=jDn0_6F5dLcmz_C-iR80uOqOIAz49ELya2R5JeM1
22
22
  content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
23
23
  content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
24
24
  content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
25
- content_core/processors/url.py,sha256=u2qgGLe9n58RtGXMBf1d31rwMIgyogg7Btn-AEl8KQU,6282
25
+ content_core/processors/url.py,sha256=yhAnvIlYKc13iZedwA0ck6h6wd2j6T-Q2NAtMen3hIs,6783
26
26
  content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
27
27
  content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
28
28
  content_core/prompts/content/cleanup.jinja,sha256=elyjbm9O_AeOcxkG-kui5wjBIRiOQCicjm92I4NmoVA,693
@@ -31,8 +31,8 @@ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8j
31
31
  content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
32
32
  content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
33
33
  content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
34
- content_core-0.3.0.dist-info/METADATA,sha256=nBKxYD0J8db7zO9ZVSyyxwl7VxtVGMrRREpt-HPgbBg,9111
35
- content_core-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
36
- content_core-0.3.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
37
- content_core-0.3.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
38
- content_core-0.3.0.dist-info/RECORD,,
34
+ content_core-0.4.0.dist-info/METADATA,sha256=sXLcda5ZXi4ibpBxrIlC_YT3DuJcNiqk_FFR_LgMISQ,9111
35
+ content_core-0.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
36
+ content_core-0.4.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
37
+ content_core-0.4.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
38
+ content_core-0.4.0.dist-info/RECORD,,