content-core 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

content_core/__init__.py CHANGED
@@ -5,9 +5,12 @@ import os
5
5
  import sys
6
6
  from xml.etree import ElementTree as ET
7
7
 
8
- from dicttoxml import dicttoxml # type: ignore
9
8
  from dotenv import load_dotenv
10
9
 
10
+ load_dotenv()
11
+
12
+ from dicttoxml import dicttoxml # type: ignore
13
+
11
14
  from content_core.common import ProcessSourceInput
12
15
  from content_core.content.cleanup import cleanup_content
13
16
  from content_core.content.extraction import extract_content
@@ -18,7 +21,6 @@ from content_core.logging import configure_logging, logger
18
21
  extract = extract_content
19
22
  clean = cleanup_content
20
23
 
21
- load_dotenv()
22
24
 
23
25
  # Configure loguru logger using centralized configuration
24
26
  configure_logging(debug=False)
@@ -212,3 +214,5 @@ def csum():
212
214
 
213
215
  if __name__ == "__main__":
214
216
  ccore()
217
+ if __name__ == "__main__":
218
+ ccore()
@@ -1,6 +1,9 @@
1
1
  import os
2
+ import tempfile
2
3
  from typing import Any, Dict, Optional
4
+ from urllib.parse import urlparse
3
5
 
6
+ import aiohttp
4
7
  import magic
5
8
  from langgraph.graph import END, START, StateGraph
6
9
 
@@ -9,8 +12,13 @@ from content_core.common import (
9
12
  ProcessSourceState,
10
13
  UnsupportedTypeException,
11
14
  )
15
+ from content_core.config import CONFIG # type: ignore
12
16
  from content_core.logging import logger
13
- from content_core.processors.audio import extract_audio # type: ignore
17
+ from content_core.processors.audio import extract_audio_data # type: ignore
18
+ from content_core.processors.docling import (
19
+ DOCLING_SUPPORTED, # type: ignore
20
+ extract_with_docling,
21
+ )
14
22
  from content_core.processors.office import (
15
23
  SUPPORTED_OFFICE_TYPES,
16
24
  extract_office_content,
@@ -20,12 +28,6 @@ from content_core.processors.text import extract_txt
20
28
  from content_core.processors.url import extract_url, url_provider
21
29
  from content_core.processors.video import extract_best_audio_from_video
22
30
  from content_core.processors.youtube import extract_youtube_transcript
23
- from content_core.processors.docling import extract_with_docling, DOCLING_SUPPORTED # type: ignore
24
-
25
- import aiohttp
26
- import tempfile
27
- from urllib.parse import urlparse
28
- from content_core.config import CONFIG # type: ignore
29
31
 
30
32
 
31
33
  async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
@@ -69,7 +71,7 @@ async def file_type_edge(data: ProcessSourceState) -> str:
69
71
  elif identified_type.startswith("video"):
70
72
  return "extract_best_audio_from_video"
71
73
  elif identified_type.startswith("audio"):
72
- return "extract_audio"
74
+ return "extract_audio_data"
73
75
  else:
74
76
  raise UnsupportedTypeException(f"Unsupported file type: {data.identified_type}")
75
77
 
@@ -104,7 +106,9 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
104
106
  async with session.get(url) as resp:
105
107
  resp.raise_for_status()
106
108
  mime = resp.headers.get("content-type", "").split(";", 1)[0]
107
- suffix = os.path.splitext(urlparse(url).path)[1] if urlparse(url).path else ""
109
+ suffix = (
110
+ os.path.splitext(urlparse(url).path)[1] if urlparse(url).path else ""
111
+ )
108
112
  fd, tmp = tempfile.mkstemp(suffix=suffix)
109
113
  os.close(fd)
110
114
  with open(tmp, "wb") as f:
@@ -137,7 +141,7 @@ workflow.add_node("extract_pdf", extract_pdf)
137
141
  workflow.add_node("extract_url", extract_url)
138
142
  workflow.add_node("extract_office_content", extract_office_content)
139
143
  workflow.add_node("extract_best_audio_from_video", extract_best_audio_from_video)
140
- workflow.add_node("extract_audio", extract_audio)
144
+ workflow.add_node("extract_audio_data", extract_audio_data)
141
145
  workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
142
146
  workflow.add_node("delete_file", delete_file)
143
147
  workflow.add_node("download_remote_file", download_remote_file)
@@ -161,7 +165,11 @@ workflow.add_conditional_edges(
161
165
  workflow.add_conditional_edges(
162
166
  "url_provider",
163
167
  url_type_router,
164
- {**{m: "download_remote_file" for m in SUPPORTED_FITZ_TYPES}, "article": "extract_url", "youtube": "extract_youtube_transcript"},
168
+ {
169
+ **{m: "download_remote_file" for m in SUPPORTED_FITZ_TYPES},
170
+ "article": "extract_url",
171
+ "youtube": "extract_youtube_transcript",
172
+ },
165
173
  )
166
174
  workflow.add_edge("url_provider", END)
167
175
  workflow.add_edge("file_type", END)
@@ -171,8 +179,8 @@ workflow.add_edge("extract_youtube_transcript", END)
171
179
 
172
180
  workflow.add_edge("extract_pdf", "delete_file")
173
181
  workflow.add_edge("extract_office_content", "delete_file")
174
- workflow.add_edge("extract_best_audio_from_video", "extract_audio")
175
- workflow.add_edge("extract_audio", "delete_file")
182
+ workflow.add_edge("extract_best_audio_from_video", "extract_audio_data")
183
+ workflow.add_edge("extract_audio_data", "delete_file")
176
184
  workflow.add_edge("delete_file", END)
177
185
  workflow.add_edge("download_remote_file", "file_type")
178
186
 
@@ -181,3 +189,5 @@ graph = workflow.compile()
181
189
 
182
190
  # Compile graph
183
191
  graph = workflow.compile()
192
+ # Compile graph
193
+ graph = workflow.compile()