content-core 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core/__init__.py +6 -2
- content_core/content/extraction/graph.py +23 -13
- content_core/notebooks/run.ipynb +34 -42
- content_core/processors/audio.py +83 -41
- content_core/templated_message.py +16 -24
- {content_core-0.5.0.dist-info → content_core-0.6.0.dist-info}/METADATA +3 -2
- {content_core-0.5.0.dist-info → content_core-0.6.0.dist-info}/RECORD +10 -14
- content_core/notebooks/docling.ipynb +0 -27
- content_core/prompter.py +0 -159
- content_core/prompts/content/cleanup.jinja +0 -16
- content_core/prompts/content/summarize.jinja +0 -25
- {content_core-0.5.0.dist-info → content_core-0.6.0.dist-info}/WHEEL +0 -0
- {content_core-0.5.0.dist-info → content_core-0.6.0.dist-info}/entry_points.txt +0 -0
- {content_core-0.5.0.dist-info → content_core-0.6.0.dist-info}/licenses/LICENSE +0 -0
content_core/__init__.py
CHANGED
|
@@ -5,9 +5,12 @@ import os
|
|
|
5
5
|
import sys
|
|
6
6
|
from xml.etree import ElementTree as ET
|
|
7
7
|
|
|
8
|
-
from dicttoxml import dicttoxml # type: ignore
|
|
9
8
|
from dotenv import load_dotenv
|
|
10
9
|
|
|
10
|
+
load_dotenv()
|
|
11
|
+
|
|
12
|
+
from dicttoxml import dicttoxml # type: ignore
|
|
13
|
+
|
|
11
14
|
from content_core.common import ProcessSourceInput
|
|
12
15
|
from content_core.content.cleanup import cleanup_content
|
|
13
16
|
from content_core.content.extraction import extract_content
|
|
@@ -18,7 +21,6 @@ from content_core.logging import configure_logging, logger
|
|
|
18
21
|
extract = extract_content
|
|
19
22
|
clean = cleanup_content
|
|
20
23
|
|
|
21
|
-
load_dotenv()
|
|
22
24
|
|
|
23
25
|
# Configure loguru logger using centralized configuration
|
|
24
26
|
configure_logging(debug=False)
|
|
@@ -212,3 +214,5 @@ def csum():
|
|
|
212
214
|
|
|
213
215
|
if __name__ == "__main__":
|
|
214
216
|
ccore()
|
|
217
|
+
if __name__ == "__main__":
|
|
218
|
+
ccore()
|
|
content_core/content/extraction/graph.py
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
import os
|
|
2
|
+
import tempfile
|
|
2
3
|
from typing import Any, Dict, Optional
|
|
4
|
+
from urllib.parse import urlparse
|
|
3
5
|
|
|
6
|
+
import aiohttp
|
|
4
7
|
import magic
|
|
5
8
|
from langgraph.graph import END, START, StateGraph
|
|
6
9
|
|
|
@@ -9,8 +12,13 @@ from content_core.common import (
|
|
|
9
12
|
ProcessSourceState,
|
|
10
13
|
UnsupportedTypeException,
|
|
11
14
|
)
|
|
15
|
+
from content_core.config import CONFIG # type: ignore
|
|
12
16
|
from content_core.logging import logger
|
|
13
|
-
from content_core.processors.audio import
|
|
17
|
+
from content_core.processors.audio import extract_audio_data # type: ignore
|
|
18
|
+
from content_core.processors.docling import (
|
|
19
|
+
DOCLING_SUPPORTED, # type: ignore
|
|
20
|
+
extract_with_docling,
|
|
21
|
+
)
|
|
14
22
|
from content_core.processors.office import (
|
|
15
23
|
SUPPORTED_OFFICE_TYPES,
|
|
16
24
|
extract_office_content,
|
|
@@ -20,12 +28,6 @@ from content_core.processors.text import extract_txt
|
|
|
20
28
|
from content_core.processors.url import extract_url, url_provider
|
|
21
29
|
from content_core.processors.video import extract_best_audio_from_video
|
|
22
30
|
from content_core.processors.youtube import extract_youtube_transcript
|
|
23
|
-
from content_core.processors.docling import extract_with_docling, DOCLING_SUPPORTED # type: ignore
|
|
24
|
-
|
|
25
|
-
import aiohttp
|
|
26
|
-
import tempfile
|
|
27
|
-
from urllib.parse import urlparse
|
|
28
|
-
from content_core.config import CONFIG # type: ignore
|
|
29
31
|
|
|
30
32
|
|
|
31
33
|
async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
|
|
@@ -69,7 +71,7 @@ async def file_type_edge(data: ProcessSourceState) -> str:
|
|
|
69
71
|
elif identified_type.startswith("video"):
|
|
70
72
|
return "extract_best_audio_from_video"
|
|
71
73
|
elif identified_type.startswith("audio"):
|
|
72
|
-
return "
|
|
74
|
+
return "extract_audio_data"
|
|
73
75
|
else:
|
|
74
76
|
raise UnsupportedTypeException(f"Unsupported file type: {data.identified_type}")
|
|
75
77
|
|
|
@@ -104,7 +106,9 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
|
|
|
104
106
|
async with session.get(url) as resp:
|
|
105
107
|
resp.raise_for_status()
|
|
106
108
|
mime = resp.headers.get("content-type", "").split(";", 1)[0]
|
|
107
|
-
suffix =
|
|
109
|
+
suffix = (
|
|
110
|
+
os.path.splitext(urlparse(url).path)[1] if urlparse(url).path else ""
|
|
111
|
+
)
|
|
108
112
|
fd, tmp = tempfile.mkstemp(suffix=suffix)
|
|
109
113
|
os.close(fd)
|
|
110
114
|
with open(tmp, "wb") as f:
|
|
@@ -137,7 +141,7 @@ workflow.add_node("extract_pdf", extract_pdf)
|
|
|
137
141
|
workflow.add_node("extract_url", extract_url)
|
|
138
142
|
workflow.add_node("extract_office_content", extract_office_content)
|
|
139
143
|
workflow.add_node("extract_best_audio_from_video", extract_best_audio_from_video)
|
|
140
|
-
workflow.add_node("
|
|
144
|
+
workflow.add_node("extract_audio_data", extract_audio_data)
|
|
141
145
|
workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
|
|
142
146
|
workflow.add_node("delete_file", delete_file)
|
|
143
147
|
workflow.add_node("download_remote_file", download_remote_file)
|
|
@@ -161,7 +165,11 @@ workflow.add_conditional_edges(
|
|
|
161
165
|
workflow.add_conditional_edges(
|
|
162
166
|
"url_provider",
|
|
163
167
|
url_type_router,
|
|
164
|
-
{
|
|
168
|
+
{
|
|
169
|
+
**{m: "download_remote_file" for m in SUPPORTED_FITZ_TYPES},
|
|
170
|
+
"article": "extract_url",
|
|
171
|
+
"youtube": "extract_youtube_transcript",
|
|
172
|
+
},
|
|
165
173
|
)
|
|
166
174
|
workflow.add_edge("url_provider", END)
|
|
167
175
|
workflow.add_edge("file_type", END)
|
|
@@ -171,8 +179,8 @@ workflow.add_edge("extract_youtube_transcript", END)
|
|
|
171
179
|
|
|
172
180
|
workflow.add_edge("extract_pdf", "delete_file")
|
|
173
181
|
workflow.add_edge("extract_office_content", "delete_file")
|
|
174
|
-
workflow.add_edge("extract_best_audio_from_video", "
|
|
175
|
-
workflow.add_edge("
|
|
182
|
+
workflow.add_edge("extract_best_audio_from_video", "extract_audio_data")
|
|
183
|
+
workflow.add_edge("extract_audio_data", "delete_file")
|
|
176
184
|
workflow.add_edge("delete_file", END)
|
|
177
185
|
workflow.add_edge("download_remote_file", "file_type")
|
|
178
186
|
|
|
@@ -181,3 +189,5 @@ graph = workflow.compile()
|
|
|
181
189
|
|
|
182
190
|
# Compile graph
|
|
183
191
|
graph = workflow.compile()
|
|
192
|
+
# Compile graph
|
|
193
|
+
graph = workflow.compile()
|