content-core 0.8.1__py3-none-any.whl → 0.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of content-core might be problematic.

content_core/cc_config.yaml

@@ -33,3 +33,7 @@ extraction:
   engine: legacy # change to 'docling' to enable Docling engine
   docling:
     output_format: markdown # markdown | html | json
+
+youtube_transcripts:
+  preferred_languages: ["en", "es", "pt"]
+
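The new youtube_transcripts block is consumed through the package's CONFIG object; a minimal sketch of reading it, mirroring the CONFIG.get call that appears in the processors/youtube.py hunk further down (same ["en", "es", "pt"] fallback):

    from content_core.config import CONFIG

    # Falls back to the packaged defaults when the key is missing.
    languages = CONFIG.get("youtube_transcripts", {}).get(
        "preferred_languages", ["en", "es", "pt"]
    )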
content_core/content/__init__.py

@@ -1,5 +1,6 @@
 from .cleanup import cleanup_content
 from .extraction import extract_content
+from .identification import get_file_type
 from .summary import summarize
 
-__all__ = ["extract_content", "cleanup_content", "summarize"]
+__all__ = ["extract_content", "cleanup_content", "summarize", "get_file_type"]
content_core/content/extraction/graph.py

@@ -2,7 +2,6 @@ import os
 import tempfile
 from typing import Any, Dict, Optional
 from urllib.parse import urlparse
-from content_core.common.types import warn_if_deprecated_engine
 
 import aiohttp
 import magic
@@ -13,11 +12,14 @@ from content_core.common import (
     ProcessSourceState,
     UnsupportedTypeException,
 )
+from content_core.common.types import warn_if_deprecated_engine
 from content_core.config import CONFIG  # type: ignore
 from content_core.logging import logger
 from content_core.processors.audio import extract_audio_data  # type: ignore
-from content_core.processors.docling import DOCLING_SUPPORTED  # type: ignore
-from content_core.processors.docling import extract_with_docling
+from content_core.processors.docling import (
+    DOCLING_SUPPORTED,  # type: ignore
+    extract_with_docling,
+)
 from content_core.processors.office import (
     SUPPORTED_OFFICE_TYPES,
     extract_office_content,
@@ -60,6 +62,7 @@ async def file_type(state: ProcessSourceState) -> Dict[str, Any]:
 async def file_type_edge(data: ProcessSourceState) -> str:
     assert data.identified_type, "Type not identified"
     identified_type = data.identified_type
+    logger.debug(f"Identified type: {identified_type}")
 
     if identified_type == "text/plain":
         return "extract_txt"
@@ -91,16 +94,19 @@ async def delete_file(data: ProcessSourceState) -> Dict[str, Any]:
 
 
 async def url_type_router(x: ProcessSourceState) -> Optional[str]:
+    assert x.identified_type, "Type not identified"
     return x.identified_type
 
 
 async def source_type_router(x: ProcessSourceState) -> Optional[str]:
+    assert x.source_type, "Source type not identified"
     return x.source_type
 
 
 async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
     url = state.url
     assert url, "No URL provided"
+    logger.debug(f"Downloading remote file: {url}")
     async with aiohttp.ClientSession() as session:
         async with session.get(url) as resp:
             resp.raise_for_status()
@@ -115,7 +121,6 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
     return {"file_path": tmp, "identified_type": mime}
 
 
-
 async def file_type_router_docling(state: ProcessSourceState) -> str:
     """
     Route to Docling if enabled and supported; otherwise use simple file type edge.
@@ -125,18 +130,25 @@ async def file_type_router_docling(state: ProcessSourceState) -> str:
     engine = state.engine or CONFIG.get("extraction", {}).get("engine", "auto")
     warn_if_deprecated_engine(engine)
     if engine == "auto":
+        logger.debug("Using auto engine")
         # Try docling first; if it fails or is not supported, fallback to simple
         if state.identified_type in DOCLING_SUPPORTED:
             try:
+                logger.debug("Trying docling extraction")
                 return "extract_docling"
             except Exception as e:
-                logger.warning(f"Docling extraction failed in 'auto' mode, falling back to simple: {e}")
+                logger.warning(
+                    f"Docling extraction failed in 'auto' mode, falling back to simple: {e}"
+                )
         # Fallback to simple
+        logger.debug("Falling back to simple extraction")
        return await file_type_edge(state)
 
     if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
+        logger.debug("Using docling engine")
         return "extract_docling"
     # For 'simple' and 'legacy', use the default file type edge
+    logger.debug("Using simple engine")
     return await file_type_edge(state)
 
 
@@ -179,7 +191,12 @@ workflow.add_conditional_edges(
     "url_provider",
     url_type_router,
     {
-        **{m: "download_remote_file" for m in SUPPORTED_FITZ_TYPES},
+        **{
+            m: "download_remote_file"
+            for m in list(SUPPORTED_FITZ_TYPES)
+            + list(SUPPORTED_OFFICE_TYPES)
+            + list(DOCLING_SUPPORTED)
+        },
         "article": "extract_url",
         "youtube": "extract_youtube_transcript",
     },
@@ -197,5 +214,4 @@ workflow.add_edge("extract_audio_data", "delete_file")
 workflow.add_edge("delete_file", END)
 workflow.add_edge("download_remote_file", "file_type")
 
-# Compile graph
 graph = workflow.compile()
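Net effect of the graph changes: remote URLs whose MIME type is supported by PyMuPDF, Office, or Docling are all downloaded before extraction. A hypothetical standalone sketch of the broadened routing (route_url is not part of the package; the real mapping lives in the conditional-edges dict above):

    def route_url(mime: str) -> str:
        # The union of the three support sets now triggers a download first.
        downloadable = (
            set(SUPPORTED_FITZ_TYPES)
            | set(SUPPORTED_OFFICE_TYPES)
            | set(DOCLING_SUPPORTED)
        )
        if mime in downloadable:
            return "download_remote_file"
        # "article" and "youtube" keep their dedicated extraction paths.
        return {"article": "extract_url", "youtube": "extract_youtube_transcript"}[mime]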
content_core/content/identification/__init__.py

@@ -0,0 +1,8 @@
+import magic
+
+
+async def get_file_type(file_path: str) -> str:
+    """
+    Identify the file using python-magic
+    """
+    return magic.from_file(file_path, mime=True)
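The entire new module is the eight lines above; a minimal usage sketch (assuming python-magic's libmagic bindings are installed, and with "example.pdf" as a placeholder for a real local file):

    import asyncio

    from content_core.content import get_file_type

    # Prints a MIME string such as "application/pdf".
    print(asyncio.run(get_file_type("example.pdf")))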
content_core/processors/docling.py

@@ -26,7 +26,7 @@ DOCLING_SUPPORTED = {
     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
     "application/vnd.openxmlformats-officedocument.presentationml.presentation",
     "text/markdown",
-    "text/plain",
+    # "text/plain", #docling currently not supporting txt
     "text/x-markdown",
     "text/csv",
     "text/html",
content_core/processors/url.py

@@ -1,68 +1,16 @@
 import os
-from io import BytesIO
-from urllib.parse import urlparse
 
 import aiohttp
-import docx
 from bs4 import BeautifulSoup
 from readability import Document
 
 from content_core.common import ProcessSourceState
 from content_core.common.types import warn_if_deprecated_engine
 from content_core.logging import logger
+from content_core.processors.docling import DOCLING_SUPPORTED
+from content_core.processors.office import SUPPORTED_OFFICE_TYPES
 from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
 
-DOCX_MIME_TYPE = (
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-)
-
-
-async def _extract_docx_content(docx_bytes: bytes, url: str):
-    """
-    Extract content from DOCX file bytes.
-    """
-    try:
-        logger.debug(f"Attempting to parse DOCX from URL: {url} with python-docx")
-        doc = docx.Document(BytesIO(docx_bytes))
-        content_parts = [p.text for p in doc.paragraphs if p.text]
-        full_content = "\n\n".join(content_parts)
-
-        # Try to get a title from document properties or first heading
-        title = doc.core_properties.title
-        if not title and doc.paragraphs:
-            # Look for a potential title in the first few paragraphs (e.g., if styled as heading)
-            for p in doc.paragraphs[:5]:  # Check first 5 paragraphs
-                if p.style.name.startswith("Heading"):
-                    title = p.text
-                    break
-            if not title:  # Fallback to first line if no heading found
-                title = (
-                    doc.paragraphs[0].text.strip()
-                    if doc.paragraphs[0].text.strip()
-                    else None
-                )
-
-        # If no title found, use filename from URL
-        if not title:
-            title = urlparse(url).path.split("/")[-1]
-
-        logger.info(f"Successfully extracted content from DOCX: {url}, Title: {title}")
-        return {
-            "title": title,
-            "content": full_content,
-            "domain": urlparse(url).netloc,
-            "url": url,
-        }
-    except Exception as e:
-        logger.error(f"Failed to process DOCX content from {url}: {e}")
-        # Fallback or re-raise, depending on desired error handling
-        return {
-            "title": f"Error Processing DOCX: {urlparse(url).path.split('/')[-1]}",
-            "content": f"Failed to extract content from DOCX: {e}",
-            "domain": urlparse(url).netloc,
-            "url": url,
-        }
-
 
 async def url_provider(state: ProcessSourceState):
     """
@@ -81,12 +29,19 @@ async def url_provider(state: ProcessSourceState):
                 url, timeout=10, allow_redirects=True
             ) as resp:
                 mime = resp.headers.get("content-type", "").split(";", 1)[0]
+                logger.debug(f"MIME type for {url}: {mime}")
         except Exception as e:
-            logger.debug(f"HEAD check failed for {url}: {e}")
+            logger.warning(f"HEAD check failed for {url}: {e}")
             mime = "article"
-        if mime in SUPPORTED_FITZ_TYPES:
+        if (
+            mime in DOCLING_SUPPORTED
+            or mime in SUPPORTED_FITZ_TYPES
+            or mime in SUPPORTED_OFFICE_TYPES
+        ):
+            logger.warning(f"Identified type for {url}: {mime}")
             return_dict["identified_type"] = mime
         else:
+            logger.warning(f"Identified type for {url}: article")
             return_dict["identified_type"] = "article"
     return return_dict
 
content_core/processors/youtube.py

@@ -8,6 +8,7 @@ from youtube_transcript_api.formatters import TextFormatter  # type: ignore
 
 from content_core.common import ProcessSourceState
 from content_core.common.exceptions import NoTranscriptFound
+from content_core.config import CONFIG
 from content_core.logging import logger
 
 ssl._create_default_https_context = ssl._create_unverified_context
@@ -137,10 +138,11 @@ async def extract_youtube_transcript(state: ProcessSourceState):
     Parse the text file and print its content.
     """
 
-    languages = ["en", "es", "pt"]
-    # languages = CONFIG.get("youtube_transcripts", {}).get(
-    #     "preferred_languages", ["en", "es", "pt"]
-    # )
+    assert state.url, "No URL provided"
+    logger.warning(f"Extracting transcript from URL: {state.url}")
+    languages = CONFIG.get("youtube_transcripts", {}).get(
+        "preferred_languages", ["en", "es", "pt"]
+    )
 
     video_id = await _extract_youtube_id(state.url)
     transcript = await get_best_transcript(video_id, languages)
@@ -152,9 +154,24 @@ async def extract_youtube_transcript(state: ProcessSourceState):
     except Exception as e:
         logger.critical(f"Failed to get video title for video_id: {video_id}")
         logger.exception(e)
-        title = None
+        title = ""
+
+    try:
+        formatted_content = formatter.format_transcript(transcript)
+    except Exception as e:
+        logger.critical(f"Failed to format transcript for video_id: {video_id}")
+        logger.exception(e)
+        formatted_content = ""
+
+    try:
+        transcript_raw = transcript.to_raw_data()
+    except Exception as e:
+        logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
+        logger.exception(e)
+        transcript_raw = ""
+
     return {
-        "content": formatter.format_transcript(transcript),
+        "content": formatted_content,
         "title": title,
-        "metadata": {"video_id": video_id, "transcript": transcript.to_raw_data()},
+        "metadata": {"video_id": video_id, "transcript": transcript_raw},
     }
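A minimal sketch of exercising the hardened extractor directly (assuming ProcessSourceState accepts a url keyword, as the state.url assertion above implies; the video URL is a placeholder):

    import asyncio

    from content_core.common import ProcessSourceState
    from content_core.processors.youtube import extract_youtube_transcript

    # Title, formatted transcript, and raw transcript each degrade to ""
    # on failure instead of raising.
    state = ProcessSourceState(url="https://www.youtube.com/watch?v=VIDEO_ID")
    result = asyncio.run(extract_youtube_transcript(state))
    print(result["title"], len(result["content"]))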
content_core-0.8.5.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 0.8.1
+Version: 0.8.5
 Summary: Extract what matters from any media source
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
content_core-0.8.5.dist-info/RECORD

@@ -1,5 +1,5 @@
 content_core/__init__.py,sha256=ANKeslNXOGumwrkjqgRik23e5PdGps2C0FSup8_XH2Y,6515
-content_core/cc_config.yaml,sha256=w66fo5ut6TPaU3o4hkjnroqg2hkr8YuOG3BRtI50j1s,701
+content_core/cc_config.yaml,sha256=tfbnJ4h9DWuJUljJrnz72s6TD24hD5P-uEPA9K_pNVY,767
 content_core/config.py,sha256=-aUsTB6Z3fa_XIWdHNXhMgWkVLWjEW1kfyQXXB_-j54,1632
 content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
 content_core/models.py,sha256=FBV_tV6cmI0F82WfcA6xHag-YMsxI1dIbDGWG-3Eq_Y,935
@@ -11,28 +11,29 @@ content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cI
 content_core/common/state.py,sha256=pO8Oq71KxznlZ4K5qUVfyLrNsZWd2yMO9bXKmrTIXQo,1427
 content_core/common/types.py,sha256=FpIzYadBvafGI4e1EuwGjjiPuawL1HitxsQOciNjTZo,497
 content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
-content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCrefU,171
+content_core/content/__init__.py,sha256=7IxfLTUHKyHjoT4MfWM2PX2J3QBeYcuERzE9vFeFiQM,230
 content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
 content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
-content_core/content/extraction/graph.py,sha256=51B_j_hi7SsKh7kKNLFsMmxyR2HVS-mOYfKvDFyuYfw,7001
+content_core/content/extraction/graph.py,sha256=Z8IqcFQmWLJG44jJ4399mBDQVMH-mYuQQpBDHTBUEe0,7571
+content_core/content/identification/__init__.py,sha256=x4n8JIjDwmPvAopEEEcmZjlozg-zGbMq_s9VYdBjzYU,169
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
 content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
 content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
-content_core/processors/docling.py,sha256=wQ8ThAcyrCy-c95QtgplQ9UZtjCZTddLD9y1_CrRtSQ,2111
+content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMguabM9Pc,2151
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=yt-uuzS4N-RAOJ8vo5x-b4bgnrFeTV-3SDIatRTRI3g,9462
+content_core/processors/url.py,sha256=qdtEIhZpi62zMXbwbCmmh86ySoomscwqxHdFib7QC-M,7898
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
-content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
+content_core/processors/youtube.py,sha256=g_A-rv5bzq2GIuwqMH70YAnDK-4BZqpgQP0IQ3j9zXE,6340
 content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-0.8.1.dist-info/METADATA,sha256=ZIW6gtawFeFo2uQqWkFH2ctSYIUq5PBrke4gyHQQAWU,11439
-content_core-0.8.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-content_core-0.8.1.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
-content_core-0.8.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
-content_core-0.8.1.dist-info/RECORD,,
+content_core-0.8.5.dist-info/METADATA,sha256=rba5vG3Vkm5WRKHfbTDay5xK4JD4kbPNFow9AoTNHDE,11439
+content_core-0.8.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-0.8.5.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
+content_core-0.8.5.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-0.8.5.dist-info/RECORD,,