content-core 0.5.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

@@ -1,6 +1,9 @@
1
1
  import os
2
+ import tempfile
2
3
  from typing import Any, Dict, Optional
4
+ from urllib.parse import urlparse
3
5
 
6
+ import aiohttp
4
7
  import magic
5
8
  from langgraph.graph import END, START, StateGraph
6
9
 
@@ -9,8 +12,11 @@ from content_core.common import (
9
12
  ProcessSourceState,
10
13
  UnsupportedTypeException,
11
14
  )
15
+ from content_core.config import CONFIG # type: ignore
12
16
  from content_core.logging import logger
13
- from content_core.processors.audio import extract_audio # type: ignore
17
+ from content_core.processors.audio import extract_audio_data # type: ignore
18
+ from content_core.processors.docling import DOCLING_SUPPORTED # type: ignore
19
+ from content_core.processors.docling import extract_with_docling
14
20
  from content_core.processors.office import (
15
21
  SUPPORTED_OFFICE_TYPES,
16
22
  extract_office_content,
@@ -20,12 +26,6 @@ from content_core.processors.text import extract_txt
20
26
  from content_core.processors.url import extract_url, url_provider
21
27
  from content_core.processors.video import extract_best_audio_from_video
22
28
  from content_core.processors.youtube import extract_youtube_transcript
23
- from content_core.processors.docling import extract_with_docling, DOCLING_SUPPORTED # type: ignore
24
-
25
- import aiohttp
26
- import tempfile
27
- from urllib.parse import urlparse
28
- from content_core.config import CONFIG # type: ignore
29
29
 
30
30
 
31
31
  async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
@@ -69,7 +69,7 @@ async def file_type_edge(data: ProcessSourceState) -> str:
69
69
  elif identified_type.startswith("video"):
70
70
  return "extract_best_audio_from_video"
71
71
  elif identified_type.startswith("audio"):
72
- return "extract_audio"
72
+ return "extract_audio_data"
73
73
  else:
74
74
  raise UnsupportedTypeException(f"Unsupported file type: {data.identified_type}")
75
75
 
@@ -104,7 +104,9 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
104
104
  async with session.get(url) as resp:
105
105
  resp.raise_for_status()
106
106
  mime = resp.headers.get("content-type", "").split(";", 1)[0]
107
- suffix = os.path.splitext(urlparse(url).path)[1] if urlparse(url).path else ""
107
+ suffix = (
108
+ os.path.splitext(urlparse(url).path)[1] if urlparse(url).path else ""
109
+ )
108
110
  fd, tmp = tempfile.mkstemp(suffix=suffix)
109
111
  os.close(fd)
110
112
  with open(tmp, "wb") as f:
@@ -137,7 +139,7 @@ workflow.add_node("extract_pdf", extract_pdf)
137
139
  workflow.add_node("extract_url", extract_url)
138
140
  workflow.add_node("extract_office_content", extract_office_content)
139
141
  workflow.add_node("extract_best_audio_from_video", extract_best_audio_from_video)
140
- workflow.add_node("extract_audio", extract_audio)
142
+ workflow.add_node("extract_audio_data", extract_audio_data)
141
143
  workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
142
144
  workflow.add_node("delete_file", delete_file)
143
145
  workflow.add_node("download_remote_file", download_remote_file)
@@ -161,7 +163,11 @@ workflow.add_conditional_edges(
161
163
  workflow.add_conditional_edges(
162
164
  "url_provider",
163
165
  url_type_router,
164
- {**{m: "download_remote_file" for m in SUPPORTED_FITZ_TYPES}, "article": "extract_url", "youtube": "extract_youtube_transcript"},
166
+ {
167
+ **{m: "download_remote_file" for m in SUPPORTED_FITZ_TYPES},
168
+ "article": "extract_url",
169
+ "youtube": "extract_youtube_transcript",
170
+ },
165
171
  )
166
172
  workflow.add_edge("url_provider", END)
167
173
  workflow.add_edge("file_type", END)
@@ -171,13 +177,10 @@ workflow.add_edge("extract_youtube_transcript", END)
171
177
 
172
178
  workflow.add_edge("extract_pdf", "delete_file")
173
179
  workflow.add_edge("extract_office_content", "delete_file")
174
- workflow.add_edge("extract_best_audio_from_video", "extract_audio")
175
- workflow.add_edge("extract_audio", "delete_file")
180
+ workflow.add_edge("extract_best_audio_from_video", "extract_audio_data")
181
+ workflow.add_edge("extract_audio_data", "delete_file")
176
182
  workflow.add_edge("delete_file", END)
177
183
  workflow.add_edge("download_remote_file", "file_type")
178
184
 
179
185
  # Compile graph
180
186
  graph = workflow.compile()
181
-
182
- # Compile graph
183
- graph = workflow.compile()