content-core 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core/content/extraction/graph.py +19 -16
- content_core/notebooks/run.ipynb +28 -36
- content_core/processors/audio.py +83 -41
- content_core/processors/url.py +52 -0
- {content_core-0.5.1.dist-info → content_core-0.7.0.dist-info}/METADATA +4 -6
- {content_core-0.5.1.dist-info → content_core-0.7.0.dist-info}/RECORD +9 -12
- content_core/notebooks/docling.ipynb +0 -27
- content_core/prompts/content/cleanup.jinja +0 -16
- content_core/prompts/content/summarize.jinja +0 -25
- {content_core-0.5.1.dist-info → content_core-0.7.0.dist-info}/WHEEL +0 -0
- {content_core-0.5.1.dist-info → content_core-0.7.0.dist-info}/entry_points.txt +0 -0
- {content_core-0.5.1.dist-info → content_core-0.7.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
import os
|
|
2
|
+
import tempfile
|
|
2
3
|
from typing import Any, Dict, Optional
|
|
4
|
+
from urllib.parse import urlparse
|
|
3
5
|
|
|
6
|
+
import aiohttp
|
|
4
7
|
import magic
|
|
5
8
|
from langgraph.graph import END, START, StateGraph
|
|
6
9
|
|
|
@@ -9,8 +12,11 @@ from content_core.common import (
|
|
|
9
12
|
ProcessSourceState,
|
|
10
13
|
UnsupportedTypeException,
|
|
11
14
|
)
|
|
15
|
+
from content_core.config import CONFIG # type: ignore
|
|
12
16
|
from content_core.logging import logger
|
|
13
|
-
from content_core.processors.audio import
|
|
17
|
+
from content_core.processors.audio import extract_audio_data # type: ignore
|
|
18
|
+
from content_core.processors.docling import DOCLING_SUPPORTED # type: ignore
|
|
19
|
+
from content_core.processors.docling import extract_with_docling
|
|
14
20
|
from content_core.processors.office import (
|
|
15
21
|
SUPPORTED_OFFICE_TYPES,
|
|
16
22
|
extract_office_content,
|
|
@@ -20,12 +26,6 @@ from content_core.processors.text import extract_txt
|
|
|
20
26
|
from content_core.processors.url import extract_url, url_provider
|
|
21
27
|
from content_core.processors.video import extract_best_audio_from_video
|
|
22
28
|
from content_core.processors.youtube import extract_youtube_transcript
|
|
23
|
-
from content_core.processors.docling import extract_with_docling, DOCLING_SUPPORTED # type: ignore
|
|
24
|
-
|
|
25
|
-
import aiohttp
|
|
26
|
-
import tempfile
|
|
27
|
-
from urllib.parse import urlparse
|
|
28
|
-
from content_core.config import CONFIG # type: ignore
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
|
|
@@ -69,7 +69,7 @@ async def file_type_edge(data: ProcessSourceState) -> str:
|
|
|
69
69
|
elif identified_type.startswith("video"):
|
|
70
70
|
return "extract_best_audio_from_video"
|
|
71
71
|
elif identified_type.startswith("audio"):
|
|
72
|
-
return "
|
|
72
|
+
return "extract_audio_data"
|
|
73
73
|
else:
|
|
74
74
|
raise UnsupportedTypeException(f"Unsupported file type: {data.identified_type}")
|
|
75
75
|
|
|
@@ -104,7 +104,9 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
|
|
|
104
104
|
async with session.get(url) as resp:
|
|
105
105
|
resp.raise_for_status()
|
|
106
106
|
mime = resp.headers.get("content-type", "").split(";", 1)[0]
|
|
107
|
-
suffix =
|
|
107
|
+
suffix = (
|
|
108
|
+
os.path.splitext(urlparse(url).path)[1] if urlparse(url).path else ""
|
|
109
|
+
)
|
|
108
110
|
fd, tmp = tempfile.mkstemp(suffix=suffix)
|
|
109
111
|
os.close(fd)
|
|
110
112
|
with open(tmp, "wb") as f:
|
|
@@ -137,7 +139,7 @@ workflow.add_node("extract_pdf", extract_pdf)
|
|
|
137
139
|
workflow.add_node("extract_url", extract_url)
|
|
138
140
|
workflow.add_node("extract_office_content", extract_office_content)
|
|
139
141
|
workflow.add_node("extract_best_audio_from_video", extract_best_audio_from_video)
|
|
140
|
-
workflow.add_node("
|
|
142
|
+
workflow.add_node("extract_audio_data", extract_audio_data)
|
|
141
143
|
workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
|
|
142
144
|
workflow.add_node("delete_file", delete_file)
|
|
143
145
|
workflow.add_node("download_remote_file", download_remote_file)
|
|
@@ -161,7 +163,11 @@ workflow.add_conditional_edges(
|
|
|
161
163
|
workflow.add_conditional_edges(
|
|
162
164
|
"url_provider",
|
|
163
165
|
url_type_router,
|
|
164
|
-
{
|
|
166
|
+
{
|
|
167
|
+
**{m: "download_remote_file" for m in SUPPORTED_FITZ_TYPES},
|
|
168
|
+
"article": "extract_url",
|
|
169
|
+
"youtube": "extract_youtube_transcript",
|
|
170
|
+
},
|
|
165
171
|
)
|
|
166
172
|
workflow.add_edge("url_provider", END)
|
|
167
173
|
workflow.add_edge("file_type", END)
|
|
@@ -171,13 +177,10 @@ workflow.add_edge("extract_youtube_transcript", END)
|
|
|
171
177
|
|
|
172
178
|
workflow.add_edge("extract_pdf", "delete_file")
|
|
173
179
|
workflow.add_edge("extract_office_content", "delete_file")
|
|
174
|
-
workflow.add_edge("extract_best_audio_from_video", "
|
|
175
|
-
workflow.add_edge("
|
|
180
|
+
workflow.add_edge("extract_best_audio_from_video", "extract_audio_data")
|
|
181
|
+
workflow.add_edge("extract_audio_data", "delete_file")
|
|
176
182
|
workflow.add_edge("delete_file", END)
|
|
177
183
|
workflow.add_edge("download_remote_file", "file_type")
|
|
178
184
|
|
|
179
185
|
# Compile graph
|
|
180
186
|
graph = workflow.compile()
|
|
181
|
-
|
|
182
|
-
# Compile graph
|
|
183
|
-
graph = workflow.compile()
|